xref: /xnu-8796.141.3/osfmk/vm/vm_map.c (revision 1b191cb58250d0705d8a51287127505aa4bc0789)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_map.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	Virtual memory mapping module.
64  */
65 
66 #include "vm/vm_map.h"
67 #include <mach/vm_types.h>
68 #include <mach_assert.h>
69 
70 #include <vm/vm_options.h>
71 
72 #include <libkern/OSAtomic.h>
73 
74 #include <mach/kern_return.h>
75 #include <mach/port.h>
76 #include <mach/vm_attributes.h>
77 #include <mach/vm_param.h>
78 #include <mach/vm_behavior.h>
79 #include <mach/vm_statistics.h>
80 #include <mach/memory_object.h>
81 #include <mach/mach_vm.h>
82 #include <machine/cpu_capabilities.h>
83 #include <mach/sdt.h>
84 
85 #include <kern/assert.h>
86 #include <kern/backtrace.h>
87 #include <kern/counter.h>
88 #include <kern/exc_guard.h>
89 #include <kern/kalloc.h>
90 #include <kern/zalloc_internal.h>
91 
92 #include <vm/cpm.h>
93 #include <vm/vm_compressor.h>
94 #include <vm/vm_compressor_pager.h>
95 #include <vm/vm_init.h>
96 #include <vm/vm_fault.h>
97 #include <vm/vm_map_internal.h>
98 #include <vm/vm_object.h>
99 #include <vm/vm_page.h>
100 #include <vm/vm_pageout.h>
101 #include <vm/pmap.h>
102 #include <vm/vm_kern.h>
103 #include <ipc/ipc_port.h>
104 #include <kern/sched_prim.h>
105 #include <kern/misc_protos.h>
106 
107 #include <mach/vm_map_server.h>
108 #include <mach/mach_host_server.h>
109 #include <vm/vm_protos.h>
110 #include <vm/vm_purgeable_internal.h>
111 #include <vm/vm_reclaim_internal.h>
112 
113 #include <vm/vm_protos.h>
114 #include <vm/vm_shared_region.h>
115 #include <vm/vm_map_store.h>
116 
117 #include <san/kasan.h>
118 
119 #include <sys/resource.h>
120 #include <sys/codesign.h>
121 #include <sys/code_signing.h>
122 #include <sys/mman.h>
123 #include <sys/reboot.h>
124 #include <sys/kdebug_triage.h>
125 
126 #include <libkern/section_keywords.h>
127 
128 #if DEVELOPMENT || DEBUG
129 extern int proc_selfcsflags(void);
130 int vm_log_xnu_user_debug = 0;
131 int panic_on_unsigned_execute = 0;
132 int panic_on_mlock_failure = 0;
133 #endif /* DEVELOPMENT || DEBUG */
134 
135 #if MACH_ASSERT
136 int debug4k_filter = 0;
137 char debug4k_proc_name[1024] = "";
138 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
139 int debug4k_panic_on_misaligned_sharing = 0;
140 const char *debug4k_category_name[] = {
141 	"error",        /* 0 */
142 	"life",         /* 1 */
143 	"load",         /* 2 */
144 	"fault",        /* 3 */
145 	"copy",         /* 4 */
146 	"share",        /* 5 */
147 	"adjust",       /* 6 */
148 	"pmap",         /* 7 */
149 	"mementry",     /* 8 */
150 	"iokit",        /* 9 */
151 	"upl",          /* 10 */
152 	"exc",          /* 11 */
153 	"vfs"           /* 12 */
154 };
155 #endif /* MACH_ASSERT */
156 int debug4k_no_cow_copyin = 0;
157 
158 
159 #if __arm64__
160 extern const int fourk_binary_compatibility_unsafe;
161 extern const int fourk_binary_compatibility_allow_wx;
162 #endif /* __arm64__ */
163 extern int proc_selfpid(void);
164 extern char *proc_name_address(void *p);
165 extern char *proc_best_name(struct proc *p);
166 
167 #if VM_MAP_DEBUG_APPLE_PROTECT
168 int vm_map_debug_apple_protect = 0;
169 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
170 #if VM_MAP_DEBUG_FOURK
171 int vm_map_debug_fourk = 0;
172 #endif /* VM_MAP_DEBUG_FOURK */
173 
174 #if DEBUG || DEVELOPMENT
175 static TUNABLE(bool, vm_map_executable_immutable,
176     "vm_map_executable_immutable", true);
177 #else
178 #define vm_map_executable_immutable true
179 #endif
180 
181 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
182 
183 extern u_int32_t random(void);  /* from <libkern/libkern.h> */
184 /* Internal prototypes
185  */
186 
187 typedef struct vm_map_zap {
188 	vm_map_entry_t          vmz_head;
189 	vm_map_entry_t         *vmz_tail;
190 } *vm_map_zap_t;
191 
192 #define VM_MAP_ZAP_DECLARE(zap) \
193 	struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
194 
195 static vm_map_entry_t   vm_map_entry_insert(
196 	vm_map_t                map,
197 	vm_map_entry_t          insp_entry,
198 	vm_map_offset_t         start,
199 	vm_map_offset_t         end,
200 	vm_object_t             object,
201 	vm_object_offset_t      offset,
202 	vm_map_kernel_flags_t   vmk_flags,
203 	boolean_t               needs_copy,
204 	vm_prot_t               cur_protection,
205 	vm_prot_t               max_protection,
206 	vm_inherit_t            inheritance,
207 	boolean_t               clear_map_aligned);
208 
209 static void vm_map_simplify_range(
210 	vm_map_t        map,
211 	vm_map_offset_t start,
212 	vm_map_offset_t end);   /* forward */
213 
214 static boolean_t        vm_map_range_check(
215 	vm_map_t        map,
216 	vm_map_offset_t start,
217 	vm_map_offset_t end,
218 	vm_map_entry_t  *entry);
219 
220 static void vm_map_submap_pmap_clean(
221 	vm_map_t        map,
222 	vm_map_offset_t start,
223 	vm_map_offset_t end,
224 	vm_map_t        sub_map,
225 	vm_map_offset_t offset);
226 
227 static void             vm_map_pmap_enter(
228 	vm_map_t                map,
229 	vm_map_offset_t         addr,
230 	vm_map_offset_t         end_addr,
231 	vm_object_t             object,
232 	vm_object_offset_t      offset,
233 	vm_prot_t               protection);
234 
235 static void             _vm_map_clip_end(
236 	struct vm_map_header    *map_header,
237 	vm_map_entry_t          entry,
238 	vm_map_offset_t         end);
239 
240 static void             _vm_map_clip_start(
241 	struct vm_map_header    *map_header,
242 	vm_map_entry_t          entry,
243 	vm_map_offset_t         start);
244 
245 static kmem_return_t vm_map_delete(
246 	vm_map_t        map,
247 	vm_map_offset_t start,
248 	vm_map_offset_t end,
249 	vmr_flags_t     flags,
250 	kmem_guard_t    guard,
251 	vm_map_zap_t    zap);
252 
253 static void             vm_map_copy_insert(
254 	vm_map_t        map,
255 	vm_map_entry_t  after_where,
256 	vm_map_copy_t   copy);
257 
258 static kern_return_t    vm_map_copy_overwrite_unaligned(
259 	vm_map_t        dst_map,
260 	vm_map_entry_t  entry,
261 	vm_map_copy_t   copy,
262 	vm_map_address_t start,
263 	boolean_t       discard_on_success);
264 
265 static kern_return_t    vm_map_copy_overwrite_aligned(
266 	vm_map_t        dst_map,
267 	vm_map_entry_t  tmp_entry,
268 	vm_map_copy_t   copy,
269 	vm_map_offset_t start,
270 	pmap_t          pmap);
271 
272 static kern_return_t    vm_map_copyin_kernel_buffer(
273 	vm_map_t        src_map,
274 	vm_map_address_t src_addr,
275 	vm_map_size_t   len,
276 	boolean_t       src_destroy,
277 	vm_map_copy_t   *copy_result);  /* OUT */
278 
279 static kern_return_t    vm_map_copyout_kernel_buffer(
280 	vm_map_t        map,
281 	vm_map_address_t *addr, /* IN/OUT */
282 	vm_map_copy_t   copy,
283 	vm_map_size_t   copy_size,
284 	boolean_t       overwrite,
285 	boolean_t       consume_on_success);
286 
287 static void             vm_map_fork_share(
288 	vm_map_t        old_map,
289 	vm_map_entry_t  old_entry,
290 	vm_map_t        new_map);
291 
292 static boolean_t        vm_map_fork_copy(
293 	vm_map_t        old_map,
294 	vm_map_entry_t  *old_entry_p,
295 	vm_map_t        new_map,
296 	int             vm_map_copyin_flags);
297 
298 static kern_return_t    vm_map_wire_nested(
299 	vm_map_t                   map,
300 	vm_map_offset_t            start,
301 	vm_map_offset_t            end,
302 	vm_prot_t                  caller_prot,
303 	vm_tag_t                   tag,
304 	boolean_t                  user_wire,
305 	pmap_t                     map_pmap,
306 	vm_map_offset_t            pmap_addr,
307 	ppnum_t                    *physpage_p);
308 
309 static kern_return_t    vm_map_unwire_nested(
310 	vm_map_t                   map,
311 	vm_map_offset_t            start,
312 	vm_map_offset_t            end,
313 	boolean_t                  user_wire,
314 	pmap_t                     map_pmap,
315 	vm_map_offset_t            pmap_addr);
316 
317 static kern_return_t    vm_map_overwrite_submap_recurse(
318 	vm_map_t                   dst_map,
319 	vm_map_offset_t            dst_addr,
320 	vm_map_size_t              dst_size);
321 
322 static kern_return_t    vm_map_copy_overwrite_nested(
323 	vm_map_t                   dst_map,
324 	vm_map_offset_t            dst_addr,
325 	vm_map_copy_t              copy,
326 	boolean_t                  interruptible,
327 	pmap_t                     pmap,
328 	boolean_t                  discard_on_success);
329 
330 static kern_return_t    vm_map_remap_extract(
331 	vm_map_t                map,
332 	vm_map_offset_t         addr,
333 	vm_map_size_t           size,
334 	boolean_t               copy,
335 	vm_map_copy_t           map_copy,
336 	vm_prot_t               *cur_protection,
337 	vm_prot_t               *max_protection,
338 	vm_inherit_t            inheritance,
339 	vm_map_kernel_flags_t   vmk_flags);
340 
341 static kern_return_t    vm_map_remap_range_allocate(
342 	vm_map_t                map,
343 	vm_map_address_t        *address,
344 	vm_map_size_t           size,
345 	vm_map_offset_t         mask,
346 	vm_map_kernel_flags_t   vmk_flags,
347 	vm_map_entry_t          *map_entry,
348 	vm_map_zap_t            zap_list);
349 
350 static void             vm_map_region_look_for_page(
351 	vm_map_t                   map,
352 	vm_map_offset_t            va,
353 	vm_object_t                object,
354 	vm_object_offset_t         offset,
355 	int                        max_refcnt,
356 	unsigned short             depth,
357 	vm_region_extended_info_t  extended,
358 	mach_msg_type_number_t count);
359 
360 static int              vm_map_region_count_obj_refs(
361 	vm_map_entry_t             entry,
362 	vm_object_t                object);
363 
364 
365 static kern_return_t    vm_map_willneed(
366 	vm_map_t        map,
367 	vm_map_offset_t start,
368 	vm_map_offset_t end);
369 
370 static kern_return_t    vm_map_reuse_pages(
371 	vm_map_t        map,
372 	vm_map_offset_t start,
373 	vm_map_offset_t end);
374 
375 static kern_return_t    vm_map_reusable_pages(
376 	vm_map_t        map,
377 	vm_map_offset_t start,
378 	vm_map_offset_t end);
379 
380 static kern_return_t    vm_map_can_reuse(
381 	vm_map_t        map,
382 	vm_map_offset_t start,
383 	vm_map_offset_t end);
384 
385 static kern_return_t    vm_map_random_address_for_size(
386 	vm_map_t                map,
387 	vm_map_offset_t        *address,
388 	vm_map_size_t           size,
389 	vm_map_kernel_flags_t   vmk_flags);
390 
391 
392 #if CONFIG_MAP_RANGES
393 
394 static vm_map_range_id_t vm_map_user_range_resolve(
395 	vm_map_t                map,
396 	mach_vm_address_t       addr,
397 	mach_vm_address_t       size,
398 	mach_vm_range_t         range);
399 
400 #endif /* CONFIG_MAP_RANGES */
401 #if MACH_ASSERT
402 static kern_return_t    vm_map_pageout(
403 	vm_map_t        map,
404 	vm_map_offset_t start,
405 	vm_map_offset_t end);
406 #endif /* MACH_ASSERT */
407 
408 kern_return_t vm_map_corpse_footprint_collect(
409 	vm_map_t        old_map,
410 	vm_map_entry_t  old_entry,
411 	vm_map_t        new_map);
412 void vm_map_corpse_footprint_collect_done(
413 	vm_map_t        new_map);
414 void vm_map_corpse_footprint_destroy(
415 	vm_map_t        map);
416 kern_return_t vm_map_corpse_footprint_query_page_info(
417 	vm_map_t        map,
418 	vm_map_offset_t va,
419 	int             *disposition_p);
420 void vm_map_footprint_query_page_info(
421 	vm_map_t        map,
422 	vm_map_entry_t  map_entry,
423 	vm_map_offset_t curr_s_offset,
424 	int             *disposition_p);
425 
426 #if CONFIG_MAP_RANGES
427 static void vm_map_range_map_init(void);
428 #endif /* CONFIG_MAP_RANGES */
429 
430 pid_t find_largest_process_vm_map_entries(void);
431 
432 extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code,
433     mach_exception_data_type_t subcode);
434 
435 /*
436  * Macros to copy a vm_map_entry. We must be careful to correctly
437  * manage the wired page count. vm_map_entry_copy() creates a new
438  * map entry to the same memory - the wired count in the new entry
439  * must be set to zero. vm_map_entry_copy_full() creates a new
440  * entry that is identical to the old entry.  This preserves the
441  * wire count; it's used for map splitting and zone changing in
442  * vm_map_copyout.
443  */
444 
445 static inline void
vm_map_entry_copy_csm_assoc(vm_map_t map __unused,vm_map_entry_t new __unused,vm_map_entry_t old __unused)446 vm_map_entry_copy_csm_assoc(
447 	vm_map_t map __unused,
448 	vm_map_entry_t new __unused,
449 	vm_map_entry_t old __unused)
450 {
451 #if CODE_SIGNING_MONITOR
452 	/* when code signing monitor is enabled, we want to reset on copy */
453 	new->csm_associated = FALSE;
454 #else
455 	/* when code signing monitor is not enabled, assert as a sanity check */
456 	assert(new->csm_associated == FALSE);
457 #endif
458 #if DEVELOPMENT || DEBUG
459 	if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
460 		printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] resetting vme_xnu_user_debug\n",
461 		    proc_selfpid(),
462 		    (get_bsdtask_info(current_task())
463 		    ? proc_name_address(get_bsdtask_info(current_task()))
464 		    : "?"),
465 		    __FUNCTION__, __LINE__,
466 		    map, new, new->vme_start, new->vme_end);
467 	}
468 #endif /* DEVELOPMENT || DEBUG */
469 	new->vme_xnu_user_debug = FALSE;
470 }
471 
472 /*
473  * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
474  * But for security reasons on some platforms, we don't want the
475  * new mapping to be "used for jit", so we reset the flag here.
476  */
477 static inline void
vm_map_entry_copy_code_signing(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old __unused)478 vm_map_entry_copy_code_signing(
479 	vm_map_t map,
480 	vm_map_entry_t new,
481 	vm_map_entry_t old __unused)
482 {
483 	if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
484 		assert(new->used_for_jit == old->used_for_jit);
485 	} else {
486 		new->used_for_jit = FALSE;
487 	}
488 }
489 
490 static inline void
vm_map_entry_copy_full(vm_map_entry_t new,vm_map_entry_t old)491 vm_map_entry_copy_full(
492 	vm_map_entry_t new,
493 	vm_map_entry_t old)
494 {
495 #if MAP_ENTRY_CREATION_DEBUG
496 	btref_put(new->vme_creation_bt);
497 	btref_retain(old->vme_creation_bt);
498 #endif
499 #if MAP_ENTRY_INSERTION_DEBUG
500 	btref_put(new->vme_insertion_bt);
501 	btref_retain(old->vme_insertion_bt);
502 #endif
503 	*new = *old;
504 }
505 
506 static inline void
vm_map_entry_copy(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old)507 vm_map_entry_copy(
508 	vm_map_t map,
509 	vm_map_entry_t new,
510 	vm_map_entry_t old)
511 {
512 	vm_map_entry_copy_full(new, old);
513 
514 	new->is_shared = FALSE;
515 	new->needs_wakeup = FALSE;
516 	new->in_transition = FALSE;
517 	new->wired_count = 0;
518 	new->user_wired_count = 0;
519 	new->vme_permanent = FALSE;
520 	vm_map_entry_copy_code_signing(map, new, old);
521 	vm_map_entry_copy_csm_assoc(map, new, old);
522 	if (new->iokit_acct) {
523 		assertf(!new->use_pmap, "old %p new %p\n", old, new);
524 		new->iokit_acct = FALSE;
525 		new->use_pmap = TRUE;
526 	}
527 	new->vme_resilient_codesign = FALSE;
528 	new->vme_resilient_media = FALSE;
529 	new->vme_atomic = FALSE;
530 	new->vme_no_copy_on_read = FALSE;
531 }
532 
533 /*
534  * Normal lock_read_to_write() returns FALSE/0 on failure.
535  * These functions evaluate to zero on success and non-zero value on failure.
536  */
537 __attribute__((always_inline))
538 int
vm_map_lock_read_to_write(vm_map_t map)539 vm_map_lock_read_to_write(vm_map_t map)
540 {
541 	if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
542 		DTRACE_VM(vm_map_lock_upgrade);
543 		return 0;
544 	}
545 	return 1;
546 }
547 
548 __attribute__((always_inline))
549 boolean_t
vm_map_try_lock(vm_map_t map)550 vm_map_try_lock(vm_map_t map)
551 {
552 	if (lck_rw_try_lock_exclusive(&(map)->lock)) {
553 		DTRACE_VM(vm_map_lock_w);
554 		return TRUE;
555 	}
556 	return FALSE;
557 }
558 
559 __attribute__((always_inline))
560 boolean_t
vm_map_try_lock_read(vm_map_t map)561 vm_map_try_lock_read(vm_map_t map)
562 {
563 	if (lck_rw_try_lock_shared(&(map)->lock)) {
564 		DTRACE_VM(vm_map_lock_r);
565 		return TRUE;
566 	}
567 	return FALSE;
568 }
569 
570 /*!
571  * @function kdp_vm_map_is_acquired_exclusive
572  *
573  * @abstract
574  * Checks if vm map is acquired exclusive.
575  *
576  * @discussion
577  * NOT SAFE: To be used only by kernel debugger.
578  *
579  * @param map map to check
580  *
581  * @returns TRUE if the map is acquired exclusively.
582  */
583 boolean_t
kdp_vm_map_is_acquired_exclusive(vm_map_t map)584 kdp_vm_map_is_acquired_exclusive(vm_map_t map)
585 {
586 	return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
587 }
588 
589 /*
590  * Routines to get the page size the caller should
591  * use while inspecting the target address space.
592  * Use the "_safely" variant if the caller is dealing with a user-provided
593  * array whose size depends on the page size, to avoid any overflow or
594  * underflow of a user-allocated buffer.
595  */
596 int
vm_self_region_page_shift_safely(vm_map_t target_map)597 vm_self_region_page_shift_safely(
598 	vm_map_t target_map)
599 {
600 	int effective_page_shift = 0;
601 
602 	if (PAGE_SIZE == (4096)) {
603 		/* x86_64 and 4k watches: always use 4k */
604 		return PAGE_SHIFT;
605 	}
606 	/* did caller provide an explicit page size for this thread to use? */
607 	effective_page_shift = thread_self_region_page_shift();
608 	if (effective_page_shift) {
609 		/* use the explicitly-provided page size */
610 		return effective_page_shift;
611 	}
612 	/* no explicit page size: use the caller's page size... */
613 	effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
614 	if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
615 		/* page size match: safe to use */
616 		return effective_page_shift;
617 	}
618 	/* page size mismatch */
619 	return -1;
620 }
621 int
vm_self_region_page_shift(vm_map_t target_map)622 vm_self_region_page_shift(
623 	vm_map_t target_map)
624 {
625 	int effective_page_shift;
626 
627 	effective_page_shift = vm_self_region_page_shift_safely(target_map);
628 	if (effective_page_shift == -1) {
629 		/* no safe value but OK to guess for caller */
630 		effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
631 		    VM_MAP_PAGE_SHIFT(target_map));
632 	}
633 	return effective_page_shift;
634 }
635 
636 
637 /*
638  *	Decide if we want to allow processes to execute from their data or stack areas.
639  *	override_nx() returns true if we do.  Data/stack execution can be enabled independently
640  *	for 32 and 64 bit processes.  Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
641  *	or allow_stack_exec to enable data execution for that type of data area for that particular
642  *	ABI (or both by or'ing the flags together).  These are initialized in the architecture
643  *	specific pmap files since the default behavior varies according to architecture.  The
644  *	main reason it varies is because of the need to provide binary compatibility with old
645  *	applications that were written before these restrictions came into being.  In the old
646  *	days, an app could execute anything it could read, but this has slowly been tightened
647  *	up over time.  The default behavior is:
648  *
649  *	32-bit PPC apps		may execute from both stack and data areas
650  *	32-bit Intel apps	may exeucte from data areas but not stack
651  *	64-bit PPC/Intel apps	may not execute from either data or stack
652  *
653  *	An application on any architecture may override these defaults by explicitly
654  *	adding PROT_EXEC permission to the page in question with the mprotect(2)
655  *	system call.  This code here just determines what happens when an app tries to
656  *      execute from a page that lacks execute permission.
657  *
658  *	Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
659  *	default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
660  *	a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
661  *	execution from data areas for a particular binary even if the arch normally permits it. As
662  *	a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
663  *	to support some complicated use cases, notably browsers with out-of-process plugins that
664  *	are not all NX-safe.
665  */
666 
667 extern int allow_data_exec, allow_stack_exec;
668 
669 int
override_nx(vm_map_t map,uint32_t user_tag)670 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
671 {
672 	int current_abi;
673 
674 	if (map->pmap == kernel_pmap) {
675 		return FALSE;
676 	}
677 
678 	/*
679 	 * Determine if the app is running in 32 or 64 bit mode.
680 	 */
681 
682 	if (vm_map_is_64bit(map)) {
683 		current_abi = VM_ABI_64;
684 	} else {
685 		current_abi = VM_ABI_32;
686 	}
687 
688 	/*
689 	 * Determine if we should allow the execution based on whether it's a
690 	 * stack or data area and the current architecture.
691 	 */
692 
693 	if (user_tag == VM_MEMORY_STACK) {
694 		return allow_stack_exec & current_abi;
695 	}
696 
697 	return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
698 }
699 
700 
701 /*
702  *	Virtual memory maps provide for the mapping, protection,
703  *	and sharing of virtual memory objects.  In addition,
704  *	this module provides for an efficient virtual copy of
705  *	memory from one map to another.
706  *
707  *	Synchronization is required prior to most operations.
708  *
709  *	Maps consist of an ordered doubly-linked list of simple
710  *	entries; a single hint is used to speed up lookups.
711  *
712  *	Sharing maps have been deleted from this version of Mach.
713  *	All shared objects are now mapped directly into the respective
714  *	maps.  This requires a change in the copy on write strategy;
715  *	the asymmetric (delayed) strategy is used for shared temporary
716  *	objects instead of the symmetric (shadow) strategy.  All maps
717  *	are now "top level" maps (either task map, kernel map or submap
718  *	of the kernel map).
719  *
720  *	Since portions of maps are specified by start/end addreses,
721  *	which may not align with existing map entries, all
722  *	routines merely "clip" entries to these start/end values.
723  *	[That is, an entry is split into two, bordering at a
724  *	start or end value.]  Note that these clippings may not
725  *	always be necessary (as the two resulting entries are then
726  *	not changed); however, the clipping is done for convenience.
727  *	No attempt is currently made to "glue back together" two
728  *	abutting entries.
729  *
730  *	The symmetric (shadow) copy strategy implements virtual copy
731  *	by copying VM object references from one map to
732  *	another, and then marking both regions as copy-on-write.
733  *	It is important to note that only one writeable reference
734  *	to a VM object region exists in any map when this strategy
735  *	is used -- this means that shadow object creation can be
736  *	delayed until a write operation occurs.  The symmetric (delayed)
737  *	strategy allows multiple maps to have writeable references to
738  *	the same region of a vm object, and hence cannot delay creating
739  *	its copy objects.  See vm_object_copy_quickly() in vm_object.c.
740  *	Copying of permanent objects is completely different; see
741  *	vm_object_copy_strategically() in vm_object.c.
742  */
743 
744 ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);
745 
746 #define VM_MAP_ZONE_NAME        "maps"
747 #define VM_MAP_ZFLAGS           (ZC_NOENCRYPT | ZC_VM)
748 
749 #define VM_MAP_ENTRY_ZONE_NAME  "VM map entries"
750 #define VM_MAP_ENTRY_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)
751 
752 #define VM_MAP_HOLES_ZONE_NAME  "VM map holes"
753 #define VM_MAP_HOLES_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)
754 
755 /*
756  * Asserts that a vm_map_copy object is coming from the
757  * vm_map_copy_zone to ensure that it isn't a fake constructed
758  * anywhere else.
759  */
760 void
vm_map_copy_require(struct vm_map_copy * copy)761 vm_map_copy_require(struct vm_map_copy *copy)
762 {
763 	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
764 }
765 
766 /*
767  *	vm_map_require:
768  *
769  *	Ensures that the argument is memory allocated from the genuine
770  *	vm map zone. (See zone_id_require_allow_foreign).
771  */
772 void
vm_map_require(vm_map_t map)773 vm_map_require(vm_map_t map)
774 {
775 	zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
776 }
777 
778 #define VM_MAP_EARLY_COUNT_MAX         16
779 static __startup_data vm_offset_t      map_data;
780 static __startup_data vm_size_t        map_data_size;
781 static __startup_data vm_offset_t      kentry_data;
782 static __startup_data vm_size_t        kentry_data_size;
783 static __startup_data vm_offset_t      map_holes_data;
784 static __startup_data vm_size_t        map_holes_data_size;
785 static __startup_data vm_map_t        *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
786 static __startup_data uint32_t         early_map_count;
787 
788 #if XNU_TARGET_OS_OSX
789 #define         NO_COALESCE_LIMIT  ((1024 * 128) - 1)
790 #else /* XNU_TARGET_OS_OSX */
791 #define         NO_COALESCE_LIMIT  0
792 #endif /* XNU_TARGET_OS_OSX */
793 
794 /* Skip acquiring locks if we're in the midst of a kernel core dump */
795 unsigned int not_in_kdp = 1;
796 
797 unsigned int vm_map_set_cache_attr_count = 0;
798 
799 kern_return_t
vm_map_set_cache_attr(vm_map_t map,vm_map_offset_t va)800 vm_map_set_cache_attr(
801 	vm_map_t        map,
802 	vm_map_offset_t va)
803 {
804 	vm_map_entry_t  map_entry;
805 	vm_object_t     object;
806 	kern_return_t   kr = KERN_SUCCESS;
807 
808 	vm_map_lock_read(map);
809 
810 	if (!vm_map_lookup_entry(map, va, &map_entry) ||
811 	    map_entry->is_sub_map) {
812 		/*
813 		 * that memory is not properly mapped
814 		 */
815 		kr = KERN_INVALID_ARGUMENT;
816 		goto done;
817 	}
818 	object = VME_OBJECT(map_entry);
819 
820 	if (object == VM_OBJECT_NULL) {
821 		/*
822 		 * there should be a VM object here at this point
823 		 */
824 		kr = KERN_INVALID_ARGUMENT;
825 		goto done;
826 	}
827 	vm_object_lock(object);
828 	object->set_cache_attr = TRUE;
829 	vm_object_unlock(object);
830 
831 	vm_map_set_cache_attr_count++;
832 done:
833 	vm_map_unlock_read(map);
834 
835 	return kr;
836 }
837 
838 
839 #if CONFIG_CODE_DECRYPTION
840 /*
841  * vm_map_apple_protected:
842  * This remaps the requested part of the object with an object backed by
843  * the decrypting pager.
844  * crypt_info contains entry points and session data for the crypt module.
845  * The crypt_info block will be copied by vm_map_apple_protected. The data structures
846  * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
847  */
848 kern_return_t
vm_map_apple_protected(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_object_offset_t crypto_backing_offset,struct pager_crypt_info * crypt_info,uint32_t cryptid)849 vm_map_apple_protected(
850 	vm_map_t                map,
851 	vm_map_offset_t         start,
852 	vm_map_offset_t         end,
853 	vm_object_offset_t      crypto_backing_offset,
854 	struct pager_crypt_info *crypt_info,
855 	uint32_t                cryptid)
856 {
857 	boolean_t       map_locked;
858 	kern_return_t   kr;
859 	vm_map_entry_t  map_entry;
860 	struct vm_map_entry tmp_entry;
861 	memory_object_t unprotected_mem_obj;
862 	vm_object_t     protected_object;
863 	vm_map_offset_t map_addr;
864 	vm_map_offset_t start_aligned, end_aligned;
865 	vm_object_offset_t      crypto_start, crypto_end;
866 	boolean_t       cache_pager;
867 
868 	map_locked = FALSE;
869 	unprotected_mem_obj = MEMORY_OBJECT_NULL;
870 
871 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
872 		return KERN_INVALID_ADDRESS;
873 	}
874 	start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
875 	end_aligned = vm_map_round_page(end, PAGE_MASK_64);
876 	start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
877 	end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));
878 
879 #if __arm64__
880 	/*
881 	 * "start" and "end" might be 4K-aligned but not 16K-aligned,
882 	 * so we might have to loop and establish up to 3 mappings:
883 	 *
884 	 * + the first 16K-page, which might overlap with the previous
885 	 *   4K-aligned mapping,
886 	 * + the center,
887 	 * + the last 16K-page, which might overlap with the next
888 	 *   4K-aligned mapping.
889 	 * Each of these mapping might be backed by a vnode pager (if
890 	 * properly page-aligned) or a "fourk_pager", itself backed by a
891 	 * vnode pager (if 4K-aligned but not page-aligned).
892 	 */
893 #endif /* __arm64__ */
894 
895 	map_addr = start_aligned;
896 	for (map_addr = start_aligned;
897 	    map_addr < end;
898 	    map_addr = tmp_entry.vme_end) {
899 		vm_map_lock(map);
900 		map_locked = TRUE;
901 
902 		/* lookup the protected VM object */
903 		if (!vm_map_lookup_entry(map,
904 		    map_addr,
905 		    &map_entry) ||
906 		    map_entry->is_sub_map ||
907 		    VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
908 			/* that memory is not properly mapped */
909 			kr = KERN_INVALID_ARGUMENT;
910 			goto done;
911 		}
912 
913 		/* ensure mapped memory is mapped as executable except
914 		 *  except for model decryption flow */
915 		if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
916 		    !(map_entry->protection & VM_PROT_EXECUTE)) {
917 			kr = KERN_INVALID_ARGUMENT;
918 			goto done;
919 		}
920 
921 		/* get the protected object to be decrypted */
922 		protected_object = VME_OBJECT(map_entry);
923 		if (protected_object == VM_OBJECT_NULL) {
924 			/* there should be a VM object here at this point */
925 			kr = KERN_INVALID_ARGUMENT;
926 			goto done;
927 		}
928 		/* ensure protected object stays alive while map is unlocked */
929 		vm_object_reference(protected_object);
930 
931 		/* limit the map entry to the area we want to cover */
932 		vm_map_clip_start(map, map_entry, start_aligned);
933 		vm_map_clip_end(map, map_entry, end_aligned);
934 
935 		tmp_entry = *map_entry;
936 		map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
937 		vm_map_unlock(map);
938 		map_locked = FALSE;
939 
940 		/*
941 		 * This map entry might be only partially encrypted
942 		 * (if not fully "page-aligned").
943 		 */
944 		crypto_start = 0;
945 		crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
946 		if (tmp_entry.vme_start < start) {
947 			if (tmp_entry.vme_start != start_aligned) {
948 				kr = KERN_INVALID_ADDRESS;
949 			}
950 			crypto_start += (start - tmp_entry.vme_start);
951 		}
952 		if (tmp_entry.vme_end > end) {
953 			if (tmp_entry.vme_end != end_aligned) {
954 				kr = KERN_INVALID_ADDRESS;
955 			}
956 			crypto_end -= (tmp_entry.vme_end - end);
957 		}
958 
959 		/*
960 		 * This "extra backing offset" is needed to get the decryption
961 		 * routine to use the right key.  It adjusts for the possibly
962 		 * relative offset of an interposed "4K" pager...
963 		 */
964 		if (crypto_backing_offset == (vm_object_offset_t) -1) {
965 			crypto_backing_offset = VME_OFFSET(&tmp_entry);
966 		}
967 
968 		cache_pager = TRUE;
969 #if XNU_TARGET_OS_OSX
970 		if (vm_map_is_alien(map)) {
971 			cache_pager = FALSE;
972 		}
973 #endif /* XNU_TARGET_OS_OSX */
974 
975 		/*
976 		 * Lookup (and create if necessary) the protected memory object
977 		 * matching that VM object.
978 		 * If successful, this also grabs a reference on the memory object,
979 		 * to guarantee that it doesn't go away before we get a chance to map
980 		 * it.
981 		 */
982 		unprotected_mem_obj = apple_protect_pager_setup(
983 			protected_object,
984 			VME_OFFSET(&tmp_entry),
985 			crypto_backing_offset,
986 			crypt_info,
987 			crypto_start,
988 			crypto_end,
989 			cache_pager);
990 
991 		/* release extra ref on protected object */
992 		vm_object_deallocate(protected_object);
993 
994 		if (unprotected_mem_obj == NULL) {
995 			kr = KERN_FAILURE;
996 			goto done;
997 		}
998 
999 		/* can overwrite an immutable mapping */
1000 		vm_map_kernel_flags_t vmk_flags = {
1001 			.vmf_fixed = true,
1002 			.vmf_overwrite = true,
1003 			.vmkf_overwrite_immutable = true,
1004 		};
1005 #if __arm64__
1006 		if (tmp_entry.used_for_jit &&
1007 		    (VM_MAP_PAGE_SHIFT(map) != FOURK_PAGE_SHIFT ||
1008 		    PAGE_SHIFT != FOURK_PAGE_SHIFT) &&
1009 		    fourk_binary_compatibility_unsafe &&
1010 		    fourk_binary_compatibility_allow_wx) {
1011 			printf("** FOURK_COMPAT [%d]: "
1012 			    "allowing write+execute at 0x%llx\n",
1013 			    proc_selfpid(), tmp_entry.vme_start);
1014 			vmk_flags.vmkf_map_jit = TRUE;
1015 		}
1016 #endif /* __arm64__ */
1017 
1018 		/* map this memory object in place of the current one */
1019 		map_addr = tmp_entry.vme_start;
1020 		kr = vm_map_enter_mem_object(map,
1021 		    &map_addr,
1022 		    (tmp_entry.vme_end -
1023 		    tmp_entry.vme_start),
1024 		    (mach_vm_offset_t) 0,
1025 		    vmk_flags,
1026 		    (ipc_port_t)(uintptr_t) unprotected_mem_obj,
1027 		    0,
1028 		    TRUE,
1029 		    tmp_entry.protection,
1030 		    tmp_entry.max_protection,
1031 		    tmp_entry.inheritance);
1032 		assertf(kr == KERN_SUCCESS,
1033 		    "kr = 0x%x\n", kr);
1034 		assertf(map_addr == tmp_entry.vme_start,
1035 		    "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
1036 		    (uint64_t)map_addr,
1037 		    (uint64_t) tmp_entry.vme_start,
1038 		    &tmp_entry);
1039 
1040 #if VM_MAP_DEBUG_APPLE_PROTECT
1041 		if (vm_map_debug_apple_protect) {
1042 			printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
1043 			    " backing:[object:%p,offset:0x%llx,"
1044 			    "crypto_backing_offset:0x%llx,"
1045 			    "crypto_start:0x%llx,crypto_end:0x%llx]\n",
1046 			    map,
1047 			    (uint64_t) map_addr,
1048 			    (uint64_t) (map_addr + (tmp_entry.vme_end -
1049 			    tmp_entry.vme_start)),
1050 			    unprotected_mem_obj,
1051 			    protected_object,
1052 			    VME_OFFSET(&tmp_entry),
1053 			    crypto_backing_offset,
1054 			    crypto_start,
1055 			    crypto_end);
1056 		}
1057 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1058 
1059 		/*
1060 		 * Release the reference obtained by
1061 		 * apple_protect_pager_setup().
1062 		 * The mapping (if it succeeded) is now holding a reference on
1063 		 * the memory object.
1064 		 */
1065 		memory_object_deallocate(unprotected_mem_obj);
1066 		unprotected_mem_obj = MEMORY_OBJECT_NULL;
1067 
1068 		/* continue with next map entry */
1069 		crypto_backing_offset += (tmp_entry.vme_end -
1070 		    tmp_entry.vme_start);
1071 		crypto_backing_offset -= crypto_start;
1072 	}
1073 	kr = KERN_SUCCESS;
1074 
1075 done:
1076 	if (map_locked) {
1077 		vm_map_unlock(map);
1078 	}
1079 	return kr;
1080 }
1081 #endif  /* CONFIG_CODE_DECRYPTION */
1082 
1083 
1084 LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
1085 LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
1086 LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);
1087 
1088 #if XNU_TARGET_OS_OSX
1089 int malloc_no_cow = 0;
1090 #else /* XNU_TARGET_OS_OSX */
1091 int malloc_no_cow = 1;
1092 #endif /* XNU_TARGET_OS_OSX */
1093 uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
1094 #if DEBUG
1095 int vm_check_map_sanity = 0;
1096 #endif
1097 
1098 /*
1099  *	vm_map_init:
1100  *
1101  *	Initialize the vm_map module.  Must be called before
1102  *	any other vm_map routines.
1103  *
1104  *	Map and entry structures are allocated from zones -- we must
1105  *	initialize those zones.
1106  *
1107  *	There are three zones of interest:
1108  *
1109  *	vm_map_zone:		used to allocate maps.
1110  *	vm_map_entry_zone:	used to allocate map entries.
1111  *
1112  *	LP32:
1113  *	vm_map_entry_reserved_zone:     fallback zone for kernel map entries
1114  *
1115  *	The kernel allocates map entries from a special zone that is initially
1116  *	"crammed" with memory.  It would be difficult (perhaps impossible) for
1117  *	the kernel to allocate more memory to a entry zone when it became
1118  *	empty since the very act of allocating memory implies the creation
1119  *	of a new entry.
1120  */
1121 __startup_func
1122 void
vm_map_init(void)1123 vm_map_init(void)
1124 {
1125 
1126 #if MACH_ASSERT
1127 	PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1128 	    sizeof(debug4k_filter));
1129 #endif /* MACH_ASSERT */
1130 
1131 	zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1132 	    VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1133 
1134 	/*
1135 	 * Don't quarantine because we always need elements available
1136 	 * Disallow GC on this zone... to aid the GC.
1137 	 */
1138 	zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1139 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1140 	    ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1141 		z->z_elems_rsv = (uint16_t)(32 *
1142 		(ml_early_cpu_max_number() + 1));
1143 	});
1144 
1145 	zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1146 	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1147 	    ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1148 		z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
1149 	});
1150 
1151 	zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1152 	    ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);
1153 
1154 	/*
1155 	 * Add the stolen memory to zones, adjust zone size and stolen counts.
1156 	 */
1157 	zone_cram_early(vm_map_zone, map_data, map_data_size);
1158 	zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1159 	zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1160 	printf("VM boostrap: %d maps, %d entries and %d holes available\n",
1161 	    zone_count_free(vm_map_zone),
1162 	    zone_count_free(vm_map_entry_zone),
1163 	    zone_count_free(vm_map_holes_zone));
1164 
1165 	/*
1166 	 * Since these are covered by zones, remove them from stolen page accounting.
1167 	 */
1168 	VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1169 
1170 #if VM_MAP_DEBUG_APPLE_PROTECT
1171 	PE_parse_boot_argn("vm_map_debug_apple_protect",
1172 	    &vm_map_debug_apple_protect,
1173 	    sizeof(vm_map_debug_apple_protect));
1174 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1175 #if VM_MAP_DEBUG_APPLE_FOURK
1176 	PE_parse_boot_argn("vm_map_debug_fourk",
1177 	    &vm_map_debug_fourk,
1178 	    sizeof(vm_map_debug_fourk));
1179 #endif /* VM_MAP_DEBUG_FOURK */
1180 
1181 	PE_parse_boot_argn("malloc_no_cow",
1182 	    &malloc_no_cow,
1183 	    sizeof(malloc_no_cow));
1184 	if (malloc_no_cow) {
1185 		vm_memory_malloc_no_cow_mask = 0ULL;
1186 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1187 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1188 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1189 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1190 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1191 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1192 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1193 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1194 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1195 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1196 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1197 		PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1198 		    &vm_memory_malloc_no_cow_mask,
1199 		    sizeof(vm_memory_malloc_no_cow_mask));
1200 	}
1201 
1202 #if CONFIG_MAP_RANGES
1203 	vm_map_range_map_init();
1204 #endif /* CONFIG_MAP_RANGES */
1205 
1206 #if DEBUG
1207 	PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1208 	if (vm_check_map_sanity) {
1209 		kprintf("VM sanity checking enabled\n");
1210 	} else {
1211 		kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1212 	}
1213 #endif /* DEBUG */
1214 
1215 #if DEVELOPMENT || DEBUG
1216 	PE_parse_boot_argn("panic_on_unsigned_execute",
1217 	    &panic_on_unsigned_execute,
1218 	    sizeof(panic_on_unsigned_execute));
1219 	PE_parse_boot_argn("panic_on_mlock_failure",
1220 	    &panic_on_mlock_failure,
1221 	    sizeof(panic_on_mlock_failure));
1222 #endif /* DEVELOPMENT || DEBUG */
1223 }
1224 
1225 __startup_func
1226 static void
vm_map_steal_memory(void)1227 vm_map_steal_memory(void)
1228 {
1229 	/*
1230 	 * We need to reserve enough memory to support boostraping VM maps
1231 	 * and the zone subsystem.
1232 	 *
1233 	 * The VM Maps that need to function before zones can support them
1234 	 * are the ones registered with vm_map_will_allocate_early_map(),
1235 	 * which are:
1236 	 * - the kernel map
1237 	 * - the various submaps used by zones (pgz, meta, ...)
1238 	 *
1239 	 * We also need enough entries and holes to support them
1240 	 * until zone_metadata_init() is called, which is when
1241 	 * the zone allocator becomes capable of expanding dynamically.
1242 	 *
1243 	 * We need:
1244 	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
1245 	 * - To allow for 3-4 entries per map, but the kernel map
1246 	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
1247 	 *   to describe the submaps, so double it (and make it 8x too)
1248 	 * - To allow for holes between entries,
1249 	 *   hence needs the same budget as entries
1250 	 */
1251 	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
1252 	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
1253 	    VM_MAP_EARLY_COUNT_MAX);
1254 
1255 	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
1256 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1257 	    8 * VM_MAP_EARLY_COUNT_MAX);
1258 
1259 	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
1260 	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1261 	    8 * VM_MAP_EARLY_COUNT_MAX);
1262 
1263 	/*
1264 	 * Steal a contiguous range of memory so that a simple range check
1265 	 * can validate early addresses being freed/crammed to these
1266 	 * zones
1267 	 */
1268 	map_data       = zone_early_mem_init(map_data_size + kentry_data_size +
1269 	    map_holes_data_size);
1270 	kentry_data    = map_data + map_data_size;
1271 	map_holes_data = kentry_data + kentry_data_size;
1272 }
1273 STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1274 
1275 __startup_func
1276 static void
vm_kernel_boostraped(void)1277 vm_kernel_boostraped(void)
1278 {
1279 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
1280 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
1281 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);
1282 
1283 	printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1284 	    zone_count_free(vm_map_zone),
1285 	    zone_count_free(vm_map_entry_zone),
1286 	    zone_count_free(vm_map_holes_zone));
1287 }
1288 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1289 
1290 void
vm_map_disable_hole_optimization(vm_map_t map)1291 vm_map_disable_hole_optimization(vm_map_t map)
1292 {
1293 	vm_map_entry_t  head_entry, hole_entry, next_hole_entry;
1294 
1295 	if (map->holelistenabled) {
1296 		head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1297 
1298 		while (hole_entry != NULL) {
1299 			next_hole_entry = hole_entry->vme_next;
1300 
1301 			hole_entry->vme_next = NULL;
1302 			hole_entry->vme_prev = NULL;
1303 			zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1304 
1305 			if (next_hole_entry == head_entry) {
1306 				hole_entry = NULL;
1307 			} else {
1308 				hole_entry = next_hole_entry;
1309 			}
1310 		}
1311 
1312 		map->holes_list = NULL;
1313 		map->holelistenabled = FALSE;
1314 
1315 		map->first_free = vm_map_first_entry(map);
1316 		SAVE_HINT_HOLE_WRITE(map, NULL);
1317 	}
1318 }
1319 
1320 boolean_t
vm_kernel_map_is_kernel(vm_map_t map)1321 vm_kernel_map_is_kernel(vm_map_t map)
1322 {
1323 	return map->pmap == kernel_pmap;
1324 }
1325 
1326 /*
1327  *	vm_map_create:
1328  *
1329  *	Creates and returns a new empty VM map with
1330  *	the given physical map structure, and having
1331  *	the given lower and upper address bounds.
1332  */
1333 
1334 extern vm_map_t vm_map_create_external(
1335 	pmap_t                  pmap,
1336 	vm_map_offset_t         min_off,
1337 	vm_map_offset_t         max_off,
1338 	boolean_t               pageable);
1339 
1340 vm_map_t
vm_map_create_external(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,boolean_t pageable)1341 vm_map_create_external(
1342 	pmap_t                  pmap,
1343 	vm_map_offset_t         min,
1344 	vm_map_offset_t         max,
1345 	boolean_t               pageable)
1346 {
1347 	vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1348 
1349 	if (pageable) {
1350 		options |= VM_MAP_CREATE_PAGEABLE;
1351 	}
1352 	return vm_map_create_options(pmap, min, max, options);
1353 }
1354 
1355 __startup_func
1356 void
vm_map_will_allocate_early_map(vm_map_t * owner)1357 vm_map_will_allocate_early_map(vm_map_t *owner)
1358 {
1359 	if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1360 		panic("VM_MAP_EARLY_COUNT_MAX is too low");
1361 	}
1362 
1363 	early_map_owners[early_map_count++] = owner;
1364 }
1365 
1366 __startup_func
1367 void
vm_map_relocate_early_maps(vm_offset_t delta)1368 vm_map_relocate_early_maps(vm_offset_t delta)
1369 {
1370 	for (uint32_t i = 0; i < early_map_count; i++) {
1371 		vm_address_t addr = (vm_address_t)*early_map_owners[i];
1372 
1373 		*early_map_owners[i] = (vm_map_t)(addr + delta);
1374 	}
1375 
1376 	early_map_count = ~0u;
1377 }
1378 
1379 /*
1380  *	Routine:	vm_map_relocate_early_elem
1381  *
1382  *	Purpose:
1383  *		Early zone elements are allocated in a temporary part
1384  *		of the address space.
1385  *
1386  *		Once the zones live in their final place, the early
1387  *		VM maps, map entries and map holes need to be relocated.
1388  *
1389  *		It involves rewriting any vm_map_t, vm_map_entry_t or
1390  *		pointers to vm_map_links. Other pointers to other types
1391  *		are fine.
1392  *
1393  *		Fortunately, pointers to those types are self-contained
1394  *		in those zones, _except_ for pointers to VM maps,
1395  *		which are tracked during early boot and fixed with
1396  *		vm_map_relocate_early_maps().
1397  */
1398 __startup_func
1399 void
vm_map_relocate_early_elem(uint32_t zone_id,vm_offset_t new_addr,vm_offset_t delta)1400 vm_map_relocate_early_elem(
1401 	uint32_t                zone_id,
1402 	vm_offset_t             new_addr,
1403 	vm_offset_t             delta)
1404 {
1405 #define relocate(type_t, field)  ({ \
1406 	typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field;   \
1407 	if (*__field) {                                                        \
1408 	        *__field = (typeof(*__field))((vm_offset_t)*__field + delta);  \
1409 	}                                                                      \
1410 })
1411 
1412 	switch (zone_id) {
1413 	case ZONE_ID_VM_MAP:
1414 	case ZONE_ID_VM_MAP_ENTRY:
1415 	case ZONE_ID_VM_MAP_HOLES:
1416 		break;
1417 
1418 	default:
1419 		panic("Unexpected zone ID %d", zone_id);
1420 	}
1421 
1422 	if (zone_id == ZONE_ID_VM_MAP) {
1423 		relocate(vm_map_t, hdr.links.prev);
1424 		relocate(vm_map_t, hdr.links.next);
1425 		((vm_map_t)new_addr)->pmap = kernel_pmap;
1426 #ifdef VM_MAP_STORE_USE_RB
1427 		relocate(vm_map_t, hdr.rb_head_store.rbh_root);
1428 #endif /* VM_MAP_STORE_USE_RB */
1429 		relocate(vm_map_t, hint);
1430 		relocate(vm_map_t, hole_hint);
1431 		relocate(vm_map_t, first_free);
1432 		return;
1433 	}
1434 
1435 	relocate(struct vm_map_links *, prev);
1436 	relocate(struct vm_map_links *, next);
1437 
1438 	if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
1439 #ifdef VM_MAP_STORE_USE_RB
1440 		relocate(vm_map_entry_t, store.entry.rbe_left);
1441 		relocate(vm_map_entry_t, store.entry.rbe_right);
1442 		relocate(vm_map_entry_t, store.entry.rbe_parent);
1443 #endif /* VM_MAP_STORE_USE_RB */
1444 		if (((vm_map_entry_t)new_addr)->is_sub_map) {
1445 			/* no object to relocate because we haven't made any */
1446 			((vm_map_entry_t)new_addr)->vme_submap +=
1447 			    delta >> VME_SUBMAP_SHIFT;
1448 		}
1449 #if MAP_ENTRY_CREATION_DEBUG
1450 		relocate(vm_map_entry_t, vme_creation_maphdr);
1451 #endif /* MAP_ENTRY_CREATION_DEBUG */
1452 	}
1453 
1454 #undef relocate
1455 }
1456 
1457 vm_map_t
vm_map_create_options(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,vm_map_create_options_t options)1458 vm_map_create_options(
1459 	pmap_t                  pmap,
1460 	vm_map_offset_t         min,
1461 	vm_map_offset_t         max,
1462 	vm_map_create_options_t options)
1463 {
1464 	vm_map_t result;
1465 
1466 #if DEBUG || DEVELOPMENT
1467 	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
1468 		if (early_map_count != ~0u && early_map_count !=
1469 		    zone_count_allocated(vm_map_zone) + 1) {
1470 			panic("allocating %dth early map, owner not known",
1471 			    zone_count_allocated(vm_map_zone) + 1);
1472 		}
1473 		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
1474 			panic("allocating %dth early map for non kernel pmap",
1475 			    early_map_count);
1476 		}
1477 	}
1478 #endif /* DEBUG || DEVELOPMENT */
1479 
1480 	result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);
1481 
1482 	vm_map_store_init(&result->hdr);
1483 	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
1484 	vm_map_set_page_shift(result, PAGE_SHIFT);
1485 
1486 	result->size_limit      = RLIM_INFINITY;        /* default unlimited */
1487 	result->data_limit      = RLIM_INFINITY;        /* default unlimited */
1488 	result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
1489 	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
1490 	result->pmap = pmap;
1491 	result->min_offset = min;
1492 	result->max_offset = max;
1493 	result->first_free = vm_map_to_entry(result);
1494 	result->hint = vm_map_to_entry(result);
1495 
1496 	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
1497 		assert(pmap == kernel_pmap);
1498 		result->never_faults = true;
1499 	}
1500 
1501 	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
1502 	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
1503 		result->has_corpse_footprint = true;
1504 	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
1505 		struct vm_map_links *hole_entry;
1506 
1507 		hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
1508 		hole_entry->start = min;
1509 #if defined(__arm64__)
1510 		hole_entry->end = result->max_offset;
1511 #else
1512 		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1513 #endif
1514 		result->holes_list = result->hole_hint = hole_entry;
1515 		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
1516 		result->holelistenabled = true;
1517 	}
1518 
1519 	vm_map_lock_init(result);
1520 
1521 	return result;
1522 }
1523 
1524 /*
1525  * Adjusts a submap that was made by kmem_suballoc()
1526  * before it knew where it would be mapped,
1527  * so that it has the right min/max offsets.
1528  *
1529  * We do not need to hold any locks:
1530  * only the caller knows about this map,
1531  * and it is not published on any entry yet.
1532  */
1533 static void
vm_map_adjust_offsets(vm_map_t map,vm_map_offset_t min_off,vm_map_offset_t max_off)1534 vm_map_adjust_offsets(
1535 	vm_map_t                map,
1536 	vm_map_offset_t         min_off,
1537 	vm_map_offset_t         max_off)
1538 {
1539 	assert(map->min_offset == 0);
1540 	assert(map->max_offset == max_off - min_off);
1541 	assert(map->hdr.nentries == 0);
1542 	assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1543 
1544 	map->min_offset = min_off;
1545 	map->max_offset = max_off;
1546 
1547 	if (map->holelistenabled) {
1548 		struct vm_map_links *hole = map->holes_list;
1549 
1550 		hole->start = min_off;
1551 #if defined(__arm64__)
1552 		hole->end = max_off;
1553 #else
1554 		hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1555 #endif
1556 	}
1557 }
1558 
1559 
1560 vm_map_size_t
vm_map_adjusted_size(vm_map_t map)1561 vm_map_adjusted_size(vm_map_t map)
1562 {
1563 	const struct vm_reserved_region *regions = NULL;
1564 	size_t num_regions = 0;
1565 	mach_vm_size_t  reserved_size = 0, map_size = 0;
1566 
1567 	if (map == NULL || (map->size == 0)) {
1568 		return 0;
1569 	}
1570 
1571 	map_size = map->size;
1572 
1573 	if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1574 		/*
1575 		 * No special reserved regions or not an exotic map or the task
1576 		 * is terminating and these special regions might have already
1577 		 * been deallocated.
1578 		 */
1579 		return map_size;
1580 	}
1581 
1582 	num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1583 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1584 
1585 	while (num_regions) {
1586 		reserved_size += regions[--num_regions].vmrr_size;
1587 	}
1588 
1589 	/*
1590 	 * There are a few places where the map is being switched out due to
1591 	 * 'termination' without that bit being set (e.g. exec and corpse purging).
1592 	 * In those cases, we could have the map's regions being deallocated on
1593 	 * a core while some accounting process is trying to get the map's size.
1594 	 * So this assert can't be enabled till all those places are uniform in
1595 	 * their use of the 'map->terminated' bit.
1596 	 *
1597 	 * assert(map_size >= reserved_size);
1598 	 */
1599 
1600 	return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1601 }
1602 
1603 /*
1604  *	vm_map_entry_create:	[ internal use only ]
1605  *
1606  *	Allocates a VM map entry for insertion in the
1607  *	given map (or map copy).  No fields are filled.
1608  *
1609  *	The VM entry will be zero initialized, except for:
1610  *	- behavior set to VM_BEHAVIOR_DEFAULT
1611  *	- inheritance set to VM_INHERIT_DEFAULT
1612  */
1613 #define vm_map_entry_create(map)    _vm_map_entry_create(&(map)->hdr)
1614 
1615 #define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)
1616 
1617 static vm_map_entry_t
_vm_map_entry_create(struct vm_map_header * map_header __unused)1618 _vm_map_entry_create(
1619 	struct vm_map_header    *map_header __unused)
1620 {
1621 	vm_map_entry_t entry = NULL;
1622 
1623 	entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);
1624 
1625 	/*
1626 	 * Help the compiler with what we know to be true,
1627 	 * so that the further bitfields inits have good codegen.
1628 	 *
1629 	 * See rdar://87041299
1630 	 */
1631 	__builtin_assume(entry->vme_object_value == 0);
1632 	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
1633 	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);
1634 
1635 	static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
1636 	    "VME_ALIAS_MASK covers tags");
1637 
1638 	static_assert(VM_BEHAVIOR_DEFAULT == 0,
1639 	    "can skip zeroing of the behavior field");
1640 	entry->inheritance = VM_INHERIT_DEFAULT;
1641 
1642 #if MAP_ENTRY_CREATION_DEBUG
1643 	entry->vme_creation_maphdr = map_header;
1644 	entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
1645 	    BTREF_GET_NOWAIT);
1646 #endif
1647 	return entry;
1648 }
1649 
1650 /*
1651  *	vm_map_entry_dispose:	[ internal use only ]
1652  *
1653  *	Inverse of vm_map_entry_create.
1654  *
1655  *      write map lock held so no need to
1656  *	do anything special to insure correctness
1657  *      of the stores
1658  */
1659 static void
vm_map_entry_dispose(vm_map_entry_t entry)1660 vm_map_entry_dispose(
1661 	vm_map_entry_t          entry)
1662 {
1663 #if MAP_ENTRY_CREATION_DEBUG
1664 	btref_put(entry->vme_creation_bt);
1665 #endif
1666 #if MAP_ENTRY_INSERTION_DEBUG
1667 	btref_put(entry->vme_insertion_bt);
1668 #endif
1669 	zfree(vm_map_entry_zone, entry);
1670 }
1671 
1672 #define vm_map_copy_entry_dispose(copy_entry) \
1673 	vm_map_entry_dispose(copy_entry)
1674 
1675 static vm_map_entry_t
vm_map_zap_first_entry(vm_map_zap_t list)1676 vm_map_zap_first_entry(
1677 	vm_map_zap_t            list)
1678 {
1679 	return list->vmz_head;
1680 }
1681 
1682 static vm_map_entry_t
vm_map_zap_last_entry(vm_map_zap_t list)1683 vm_map_zap_last_entry(
1684 	vm_map_zap_t            list)
1685 {
1686 	assert(vm_map_zap_first_entry(list));
1687 	return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
1688 }
1689 
1690 static void
vm_map_zap_append(vm_map_zap_t list,vm_map_entry_t entry)1691 vm_map_zap_append(
1692 	vm_map_zap_t            list,
1693 	vm_map_entry_t          entry)
1694 {
1695 	entry->vme_next = VM_MAP_ENTRY_NULL;
1696 	*list->vmz_tail = entry;
1697 	list->vmz_tail = &entry->vme_next;
1698 }
1699 
1700 static vm_map_entry_t
vm_map_zap_pop(vm_map_zap_t list)1701 vm_map_zap_pop(
1702 	vm_map_zap_t            list)
1703 {
1704 	vm_map_entry_t head = list->vmz_head;
1705 
1706 	if (head != VM_MAP_ENTRY_NULL &&
1707 	    (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1708 		list->vmz_tail = &list->vmz_head;
1709 	}
1710 
1711 	return head;
1712 }
1713 
1714 static void
vm_map_zap_dispose(vm_map_zap_t list)1715 vm_map_zap_dispose(
1716 	vm_map_zap_t            list)
1717 {
1718 	vm_map_entry_t          entry;
1719 
1720 	while ((entry = vm_map_zap_pop(list))) {
1721 		if (entry->is_sub_map) {
1722 			vm_map_deallocate(VME_SUBMAP(entry));
1723 		} else {
1724 			vm_object_deallocate(VME_OBJECT(entry));
1725 		}
1726 
1727 		vm_map_entry_dispose(entry);
1728 	}
1729 }
1730 
1731 #if MACH_ASSERT
1732 static boolean_t first_free_check = FALSE;
1733 boolean_t
first_free_is_valid(vm_map_t map)1734 first_free_is_valid(
1735 	vm_map_t        map)
1736 {
1737 	if (!first_free_check) {
1738 		return TRUE;
1739 	}
1740 
1741 	return first_free_is_valid_store( map );
1742 }
1743 #endif /* MACH_ASSERT */
1744 
1745 
1746 #define vm_map_copy_entry_link(copy, after_where, entry)                \
1747 	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))
1748 
1749 #define vm_map_copy_entry_unlink(copy, entry)                           \
1750 	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1751 
1752 /*
1753  *	vm_map_destroy:
1754  *
1755  *	Actually destroy a map.
1756  */
1757 void
vm_map_destroy(vm_map_t map)1758 vm_map_destroy(
1759 	vm_map_t        map)
1760 {
1761 	/* final cleanup: this is not allowed to fail */
1762 	vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;
1763 
1764 	VM_MAP_ZAP_DECLARE(zap);
1765 
1766 	vm_map_lock(map);
1767 
1768 	map->terminated = true;
1769 	/* clean up regular map entries */
1770 	(void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
1771 	    KMEM_GUARD_NONE, &zap);
1772 	/* clean up leftover special mappings (commpage, GPU carveout, etc...) */
1773 	(void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
1774 	    KMEM_GUARD_NONE, &zap);
1775 
1776 	vm_map_disable_hole_optimization(map);
1777 	vm_map_corpse_footprint_destroy(map);
1778 
1779 	vm_map_unlock(map);
1780 
1781 	vm_map_zap_dispose(&zap);
1782 
1783 	assert(map->hdr.nentries == 0);
1784 
1785 	if (map->pmap) {
1786 		pmap_destroy(map->pmap);
1787 	}
1788 
1789 	lck_rw_destroy(&map->lock, &vm_map_lck_grp);
1790 
1791 	zfree_id(ZONE_ID_VM_MAP, map);
1792 }
1793 
1794 /*
1795  * Returns pid of the task with the largest number of VM map entries.
1796  * Used in the zone-map-exhaustion jetsam path.
1797  */
1798 pid_t
find_largest_process_vm_map_entries(void)1799 find_largest_process_vm_map_entries(void)
1800 {
1801 	pid_t victim_pid = -1;
1802 	int max_vm_map_entries = 0;
1803 	task_t task = TASK_NULL;
1804 	queue_head_t *task_list = &tasks;
1805 
1806 	lck_mtx_lock(&tasks_threads_lock);
1807 	queue_iterate(task_list, task, task_t, tasks) {
1808 		if (task == kernel_task || !task->active) {
1809 			continue;
1810 		}
1811 
1812 		vm_map_t task_map = task->map;
1813 		if (task_map != VM_MAP_NULL) {
1814 			int task_vm_map_entries = task_map->hdr.nentries;
1815 			if (task_vm_map_entries > max_vm_map_entries) {
1816 				max_vm_map_entries = task_vm_map_entries;
1817 				victim_pid = pid_from_task(task);
1818 			}
1819 		}
1820 	}
1821 	lck_mtx_unlock(&tasks_threads_lock);
1822 
1823 	printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
1824 	return victim_pid;
1825 }
1826 
1827 
1828 /*
1829  *	vm_map_lookup_entry:	[ internal use only ]
1830  *
1831  *	Calls into the vm map store layer to find the map
1832  *	entry containing (or immediately preceding) the
1833  *	specified address in the given map; the entry is returned
1834  *	in the "entry" parameter.  The boolean
1835  *	result indicates whether the address is
1836  *	actually contained in the map.
1837  */
1838 boolean_t
vm_map_lookup_entry(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)1839 vm_map_lookup_entry(
1840 	vm_map_t        map,
1841 	vm_map_offset_t address,
1842 	vm_map_entry_t  *entry)         /* OUT */
1843 {
1844 #if CONFIG_KERNEL_TBI
1845 	if (VM_KERNEL_ADDRESS(address)) {
1846 		address = VM_KERNEL_STRIP_UPTR(address);
1847 	}
1848 #endif /* CONFIG_KERNEL_TBI */
1849 #if CONFIG_PROB_GZALLOC
1850 	if (map->pmap == kernel_pmap) {
1851 		assertf(!pgz_owned(address),
1852 		    "it is the responsibility of callers to unguard PGZ addresses");
1853 	}
1854 #endif /* CONFIG_PROB_GZALLOC */
1855 	return vm_map_store_lookup_entry( map, address, entry );
1856 }
1857 
1858 boolean_t
vm_map_lookup_entry_or_next(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)1859 vm_map_lookup_entry_or_next(
1860 	vm_map_t        map,
1861 	vm_map_offset_t address,
1862 	vm_map_entry_t  *entry)         /* OUT */
1863 {
1864 	if (vm_map_lookup_entry(map, address, entry)) {
1865 		return true;
1866 	}
1867 
1868 	*entry = (*entry)->vme_next;
1869 	return false;
1870 }
1871 
1872 #if CONFIG_PROB_GZALLOC
1873 boolean_t
vm_map_lookup_entry_allow_pgz(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)1874 vm_map_lookup_entry_allow_pgz(
1875 	vm_map_t        map,
1876 	vm_map_offset_t address,
1877 	vm_map_entry_t  *entry)         /* OUT */
1878 {
1879 #if CONFIG_KERNEL_TBI
1880 	if (VM_KERNEL_ADDRESS(address)) {
1881 		address = VM_KERNEL_STRIP_UPTR(address);
1882 	}
1883 #endif /* CONFIG_KERNEL_TBI */
1884 	return vm_map_store_lookup_entry( map, address, entry );
1885 }
1886 #endif /* CONFIG_PROB_GZALLOC */
1887 
1888 /*
1889  *	Routine:	vm_map_range_invalid_panic
1890  *	Purpose:
1891  *			Panic on detection of an invalid range id.
1892  */
1893 __abortlike
1894 static void
vm_map_range_invalid_panic(vm_map_t map,vm_map_range_id_t range_id)1895 vm_map_range_invalid_panic(
1896 	vm_map_t                map,
1897 	vm_map_range_id_t       range_id)
1898 {
1899 	panic("invalid range ID (%u) for map %p", range_id, map);
1900 }
1901 
1902 /*
1903  *	Routine:	vm_map_get_range
1904  *	Purpose:
1905  *			Adjust bounds based on security policy.
1906  */
1907 static struct mach_vm_range
vm_map_get_range(vm_map_t map,vm_map_address_t * address,vm_map_kernel_flags_t * vmk_flags,vm_map_size_t size,bool * is_ptr)1908 vm_map_get_range(
1909 	vm_map_t                map,
1910 	vm_map_address_t       *address,
1911 	vm_map_kernel_flags_t  *vmk_flags,
1912 	vm_map_size_t           size,
1913 	bool                   *is_ptr)
1914 {
1915 	struct mach_vm_range effective_range = {};
1916 	vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;
1917 
1918 	if (map == kernel_map) {
1919 		effective_range = kmem_ranges[range_id];
1920 
1921 		if (startup_phase >= STARTUP_SUB_KMEM) {
1922 			/*
1923 			 * Hint provided by caller is zeroed as the range is restricted to a
1924 			 * subset of the entire kernel_map VA, which could put the hint outside
1925 			 * the range, causing vm_map_store_find_space to fail.
1926 			 */
1927 			*address = 0ull;
1928 			/*
1929 			 * Ensure that range_id passed in by the caller is within meaningful
1930 			 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
1931 			 * to fail as the corresponding range is invalid. Range id larger than
1932 			 * KMEM_RANGE_ID_MAX will lead to an OOB access.
1933 			 */
1934 			if ((range_id == KMEM_RANGE_ID_NONE) ||
1935 			    (range_id > KMEM_RANGE_ID_MAX)) {
1936 				vm_map_range_invalid_panic(map, range_id);
1937 			}
1938 
1939 			/*
1940 			 * Pointer ranges use kmem_locate_space to do allocations.
1941 			 *
1942 			 * Non pointer fronts look like [ Small | Large | Permanent ]
1943 			 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
1944 			 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
1945 			 * use the entire range.
1946 			 */
1947 			if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
1948 				*is_ptr = true;
1949 			} else if (size >= KMEM_SMALLMAP_THRESHOLD) {
1950 				effective_range = kmem_large_ranges[range_id];
1951 			}
1952 		}
1953 #if CONFIG_MAP_RANGES
1954 	} else if (map->uses_user_ranges) {
1955 		if (range_id > UMEM_RANGE_ID_MAX) {
1956 			vm_map_range_invalid_panic(map, range_id);
1957 		}
1958 
1959 		effective_range = map->user_range[range_id];
1960 #endif /* CONFIG_MAP_RANGES */
1961 	} else {
1962 		/*
1963 		 * If minimum is 0, bump it up by PAGE_SIZE.  We want to limit
1964 		 * allocations of PAGEZERO to explicit requests since its
1965 		 * normal use is to catch dereferences of NULL and many
1966 		 * applications also treat pointers with a value of 0 as
1967 		 * special and suddenly having address 0 contain useable
1968 		 * memory would tend to confuse those applications.
1969 		 */
1970 		effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
1971 		effective_range.max_address = map->max_offset;
1972 	}
1973 
1974 	return effective_range;
1975 }
1976 
1977 /*
1978  *	Routine:	vm_map_locate_space
1979  *	Purpose:
1980  *		Finds a range in the specified virtual address map,
1981  *		returning the start of that range,
1982  *		as well as the entry right before it.
1983  */
1984 kern_return_t
vm_map_locate_space(vm_map_t map,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_offset_t * start_inout,vm_map_entry_t * entry_out)1985 vm_map_locate_space(
1986 	vm_map_t                map,
1987 	vm_map_size_t           size,
1988 	vm_map_offset_t         mask,
1989 	vm_map_kernel_flags_t   vmk_flags,
1990 	vm_map_offset_t        *start_inout,
1991 	vm_map_entry_t         *entry_out)
1992 {
1993 	struct mach_vm_range effective_range = {};
1994 	vm_map_size_t   guard_offset;
1995 	vm_map_offset_t hint, limit;
1996 	vm_map_entry_t  entry;
1997 	bool            is_kmem_ptr_range = false;
1998 
1999 	/*
2000 	 * Only supported by vm_map_enter() with a fixed address.
2001 	 */
2002 	assert(!vmk_flags.vmkf_beyond_max);
2003 
2004 	if (__improbable(map->wait_for_space)) {
2005 		/*
2006 		 * support for "wait_for_space" is minimal,
2007 		 * its only consumer is the ipc_kernel_copy_map.
2008 		 */
2009 		assert(!map->holelistenabled &&
2010 		    !vmk_flags.vmkf_last_free &&
2011 		    !vmk_flags.vmkf_keep_map_locked &&
2012 		    !vmk_flags.vmkf_map_jit &&
2013 		    !vmk_flags.vmf_random_addr &&
2014 		    *start_inout <= map->min_offset);
2015 	} else if (vmk_flags.vmkf_last_free) {
2016 		assert(!vmk_flags.vmkf_map_jit &&
2017 		    !vmk_flags.vmf_random_addr);
2018 	}
2019 
2020 	if (vmk_flags.vmkf_guard_before) {
2021 		guard_offset = VM_MAP_PAGE_SIZE(map);
2022 		assert(size > guard_offset);
2023 		size -= guard_offset;
2024 	} else {
2025 		assert(size != 0);
2026 		guard_offset = 0;
2027 	}
2028 
2029 	/*
2030 	 * Validate range_id from flags and get associated range
2031 	 */
2032 	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
2033 	    &is_kmem_ptr_range);
2034 
2035 	if (is_kmem_ptr_range) {
2036 		return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
2037 		           vmk_flags.vmkf_last_free, start_inout, entry_out);
2038 	}
2039 
2040 #if XNU_TARGET_OS_OSX
2041 	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2042 		assert(map != kernel_map);
2043 		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2044 	}
2045 #endif /* XNU_TARGET_OS_OSX */
2046 
2047 again:
2048 	if (vmk_flags.vmkf_last_free) {
2049 		hint = *start_inout;
2050 
2051 		if (hint == 0 || hint > effective_range.max_address) {
2052 			hint = effective_range.max_address;
2053 		}
2054 		if (hint <= effective_range.min_address) {
2055 			return KERN_NO_SPACE;
2056 		}
2057 		limit = effective_range.min_address;
2058 	} else {
2059 		hint = *start_inout;
2060 
2061 		if (vmk_flags.vmkf_map_jit) {
2062 			if (map->jit_entry_exists &&
2063 			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
2064 				return KERN_INVALID_ARGUMENT;
2065 			}
2066 			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
2067 				vmk_flags.vmf_random_addr = true;
2068 			}
2069 		}
2070 
2071 		if (vmk_flags.vmf_random_addr) {
2072 			kern_return_t kr;
2073 
2074 			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
2075 			if (kr != KERN_SUCCESS) {
2076 				return kr;
2077 			}
2078 		}
2079 #if XNU_TARGET_OS_OSX
2080 		else if ((hint == 0 || hint == vm_map_min(map)) &&
2081 		    !map->disable_vmentry_reuse &&
2082 		    map->vmmap_high_start != 0) {
2083 			hint = map->vmmap_high_start;
2084 		}
2085 #endif /* XNU_TARGET_OS_OSX */
2086 
2087 		if (hint < effective_range.min_address) {
2088 			hint = effective_range.min_address;
2089 		}
2090 		if (effective_range.max_address <= hint) {
2091 			return KERN_NO_SPACE;
2092 		}
2093 
2094 		limit = effective_range.max_address;
2095 	}
2096 	entry = vm_map_store_find_space(map,
2097 	    hint, limit, vmk_flags.vmkf_last_free,
2098 	    guard_offset, size, mask,
2099 	    start_inout);
2100 
2101 	if (__improbable(entry == NULL)) {
2102 		if (map->wait_for_space &&
2103 		    guard_offset + size <=
2104 		    effective_range.max_address - effective_range.min_address) {
2105 			assert_wait((event_t)map, THREAD_ABORTSAFE);
2106 			vm_map_unlock(map);
2107 			thread_block(THREAD_CONTINUE_NULL);
2108 			vm_map_lock(map);
2109 			goto again;
2110 		}
2111 		return KERN_NO_SPACE;
2112 	}
2113 
2114 	if (entry_out) {
2115 		*entry_out = entry;
2116 	}
2117 	return KERN_SUCCESS;
2118 }
2119 
2120 
2121 /*
2122  *	Routine:	vm_map_find_space
2123  *	Purpose:
2124  *		Allocate a range in the specified virtual address map,
2125  *		returning the entry allocated for that range.
2126  *		Used by kmem_alloc, etc.
2127  *
2128  *		The map must be NOT be locked. It will be returned locked
2129  *		on KERN_SUCCESS, unlocked on failure.
2130  *
2131  *		If an entry is allocated, the object/offset fields
2132  *		are initialized to zero.
2133  */
2134 kern_return_t
vm_map_find_space(vm_map_t map,vm_map_offset_t hint_address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_entry_t * o_entry)2135 vm_map_find_space(
2136 	vm_map_t                map,
2137 	vm_map_offset_t         hint_address,
2138 	vm_map_size_t           size,
2139 	vm_map_offset_t         mask,
2140 	vm_map_kernel_flags_t   vmk_flags,
2141 	vm_map_entry_t          *o_entry)       /* OUT */
2142 {
2143 	vm_map_entry_t          new_entry, entry;
2144 	kern_return_t           kr;
2145 
2146 	if (size == 0) {
2147 		return KERN_INVALID_ARGUMENT;
2148 	}
2149 
2150 	new_entry = vm_map_entry_create(map);
2151 	new_entry->use_pmap = true;
2152 	new_entry->protection = VM_PROT_DEFAULT;
2153 	new_entry->max_protection = VM_PROT_ALL;
2154 
2155 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
2156 		new_entry->map_aligned = true;
2157 	}
2158 	if (vmk_flags.vmf_permanent) {
2159 		new_entry->vme_permanent = true;
2160 	}
2161 
2162 	vm_map_lock(map);
2163 
2164 	kr = vm_map_locate_space(map, size, mask, vmk_flags,
2165 	    &hint_address, &entry);
2166 	if (kr != KERN_SUCCESS) {
2167 		vm_map_unlock(map);
2168 		vm_map_entry_dispose(new_entry);
2169 		return kr;
2170 	}
2171 	new_entry->vme_start = hint_address;
2172 	new_entry->vme_end = hint_address + size;
2173 
2174 	/*
2175 	 *	At this point,
2176 	 *
2177 	 *	- new_entry's "vme_start" and "vme_end" should define
2178 	 *	  the endpoints of the available new range,
2179 	 *
2180 	 *	- and "entry" should refer to the region before
2181 	 *	  the new range,
2182 	 *
2183 	 *	- and the map should still be locked.
2184 	 */
2185 
2186 	assert(page_aligned(new_entry->vme_start));
2187 	assert(page_aligned(new_entry->vme_end));
2188 	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
2189 	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));
2190 
2191 	/*
2192 	 *	Insert the new entry into the list
2193 	 */
2194 
2195 	vm_map_store_entry_link(map, entry, new_entry,
2196 	    VM_MAP_KERNEL_FLAGS_NONE);
2197 	map->size += size;
2198 
2199 	/*
2200 	 *	Update the lookup hint
2201 	 */
2202 	SAVE_HINT_MAP_WRITE(map, new_entry);
2203 
2204 	*o_entry = new_entry;
2205 	return KERN_SUCCESS;
2206 }
2207 
2208 int vm_map_pmap_enter_print = FALSE;
2209 int vm_map_pmap_enter_enable = FALSE;
2210 
2211 /*
2212  *	Routine:	vm_map_pmap_enter [internal only]
2213  *
2214  *	Description:
2215  *		Force pages from the specified object to be entered into
2216  *		the pmap at the specified address if they are present.
2217  *		As soon as a page not found in the object the scan ends.
2218  *
2219  *	Returns:
2220  *		Nothing.
2221  *
2222  *	In/out conditions:
2223  *		The source map should not be locked on entry.
2224  */
2225 __unused static void
vm_map_pmap_enter(vm_map_t map,vm_map_offset_t addr,vm_map_offset_t end_addr,vm_object_t object,vm_object_offset_t offset,vm_prot_t protection)2226 vm_map_pmap_enter(
2227 	vm_map_t                map,
2228 	vm_map_offset_t         addr,
2229 	vm_map_offset_t         end_addr,
2230 	vm_object_t             object,
2231 	vm_object_offset_t      offset,
2232 	vm_prot_t               protection)
2233 {
2234 	int                     type_of_fault;
2235 	kern_return_t           kr;
2236 	struct vm_object_fault_info fault_info = {};
2237 
2238 	if (map->pmap == 0) {
2239 		return;
2240 	}
2241 
2242 	assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);
2243 
2244 	while (addr < end_addr) {
2245 		vm_page_t       m;
2246 
2247 
2248 		/*
2249 		 * TODO:
2250 		 * From vm_map_enter(), we come into this function without the map
2251 		 * lock held or the object lock held.
2252 		 * We haven't taken a reference on the object either.
2253 		 * We should do a proper lookup on the map to make sure
2254 		 * that things are sane before we go locking objects that
2255 		 * could have been deallocated from under us.
2256 		 */
2257 
2258 		vm_object_lock(object);
2259 
2260 		m = vm_page_lookup(object, offset);
2261 
2262 		if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
2263 		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
2264 			vm_object_unlock(object);
2265 			return;
2266 		}
2267 
2268 		if (vm_map_pmap_enter_print) {
2269 			printf("vm_map_pmap_enter:");
2270 			printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
2271 			    map, (unsigned long long)addr, object, (unsigned long long)offset);
2272 		}
2273 		type_of_fault = DBG_CACHE_HIT_FAULT;
2274 		kr = vm_fault_enter(m, map->pmap,
2275 		    addr,
2276 		    PAGE_SIZE, 0,
2277 		    protection, protection,
2278 		    VM_PAGE_WIRED(m),
2279 		    FALSE,                 /* change_wiring */
2280 		    VM_KERN_MEMORY_NONE,                 /* tag - not wiring */
2281 		    &fault_info,
2282 		    NULL,                  /* need_retry */
2283 		    &type_of_fault);
2284 
2285 		vm_object_unlock(object);
2286 
2287 		offset += PAGE_SIZE_64;
2288 		addr += PAGE_SIZE;
2289 	}
2290 }
2291 
2292 #define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
2293 static kern_return_t
vm_map_random_address_for_size(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_kernel_flags_t vmk_flags)2294 vm_map_random_address_for_size(
2295 	vm_map_t                map,
2296 	vm_map_offset_t        *address,
2297 	vm_map_size_t           size,
2298 	vm_map_kernel_flags_t   vmk_flags)
2299 {
2300 	kern_return_t   kr = KERN_SUCCESS;
2301 	int             tries = 0;
2302 	vm_map_offset_t random_addr = 0;
2303 	vm_map_offset_t hole_end;
2304 
2305 	vm_map_entry_t  next_entry = VM_MAP_ENTRY_NULL;
2306 	vm_map_entry_t  prev_entry = VM_MAP_ENTRY_NULL;
2307 	vm_map_size_t   vm_hole_size = 0;
2308 	vm_map_size_t   addr_space_size;
2309 	bool            is_kmem_ptr;
2310 	struct mach_vm_range effective_range;
2311 
2312 	effective_range = vm_map_get_range(map, address, &vmk_flags, size,
2313 	    &is_kmem_ptr);
2314 
2315 	addr_space_size = effective_range.max_address - effective_range.min_address;
2316 	if (size >= addr_space_size) {
2317 		return KERN_NO_SPACE;
2318 	}
2319 	addr_space_size -= size;
2320 
2321 	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));
2322 
2323 	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2324 		if (startup_phase < STARTUP_SUB_ZALLOC) {
2325 			random_addr = (vm_map_offset_t)early_random();
2326 		} else {
2327 			random_addr = (vm_map_offset_t)random();
2328 		}
2329 		random_addr <<= VM_MAP_PAGE_SHIFT(map);
2330 		random_addr = vm_map_trunc_page(
2331 			effective_range.min_address + (random_addr % addr_space_size),
2332 			VM_MAP_PAGE_MASK(map));
2333 
2334 #if CONFIG_PROB_GZALLOC
2335 		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
2336 			continue;
2337 		}
2338 #endif /* CONFIG_PROB_GZALLOC */
2339 
2340 		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
2341 			if (prev_entry == vm_map_to_entry(map)) {
2342 				next_entry = vm_map_first_entry(map);
2343 			} else {
2344 				next_entry = prev_entry->vme_next;
2345 			}
2346 			if (next_entry == vm_map_to_entry(map)) {
2347 				hole_end = vm_map_max(map);
2348 			} else {
2349 				hole_end = next_entry->vme_start;
2350 			}
2351 			vm_hole_size = hole_end - random_addr;
2352 			if (vm_hole_size >= size) {
2353 				*address = random_addr;
2354 				break;
2355 			}
2356 		}
2357 		tries++;
2358 	}
2359 
2360 	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2361 		kr = KERN_NO_SPACE;
2362 	}
2363 	return kr;
2364 }
2365 
2366 static boolean_t
vm_memory_malloc_no_cow(int alias)2367 vm_memory_malloc_no_cow(
2368 	int alias)
2369 {
2370 	uint64_t alias_mask;
2371 
2372 	if (alias > 63) {
2373 		return FALSE;
2374 	}
2375 
2376 	alias_mask = 1ULL << alias;
2377 	if (alias_mask & vm_memory_malloc_no_cow_mask) {
2378 		return TRUE;
2379 	}
2380 	return FALSE;
2381 }
2382 
2383 uint64_t vm_map_enter_RLIMIT_AS_count = 0;
2384 uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2385 /*
2386  *	Routine:	vm_map_enter
2387  *
2388  *	Description:
2389  *		Allocate a range in the specified virtual address map.
2390  *		The resulting range will refer to memory defined by
2391  *		the given memory object and offset into that object.
2392  *
2393  *		Arguments are as defined in the vm_map call.
2394  */
2395 static unsigned int vm_map_enter_restore_successes = 0;
2396 static unsigned int vm_map_enter_restore_failures = 0;
2397 kern_return_t
vm_map_enter(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)2398 vm_map_enter(
2399 	vm_map_t                map,
2400 	vm_map_offset_t         *address,       /* IN/OUT */
2401 	vm_map_size_t           size,
2402 	vm_map_offset_t         mask,
2403 	vm_map_kernel_flags_t   vmk_flags,
2404 	vm_object_t             object,
2405 	vm_object_offset_t      offset,
2406 	boolean_t               needs_copy,
2407 	vm_prot_t               cur_protection,
2408 	vm_prot_t               max_protection,
2409 	vm_inherit_t            inheritance)
2410 {
2411 	vm_map_entry_t          entry, new_entry;
2412 	vm_map_offset_t         start, tmp_start, tmp_offset;
2413 	vm_map_offset_t         end, tmp_end;
2414 	vm_map_offset_t         tmp2_start, tmp2_end;
2415 	vm_map_offset_t         step;
2416 	kern_return_t           result = KERN_SUCCESS;
2417 	bool                    map_locked = FALSE;
2418 	bool                    pmap_empty = TRUE;
2419 	bool                    new_mapping_established = FALSE;
2420 	const bool              keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2421 	const bool              anywhere = !vmk_flags.vmf_fixed;
2422 	const bool              purgable = vmk_flags.vmf_purgeable;
2423 	const bool              overwrite = vmk_flags.vmf_overwrite;
2424 	const bool              no_cache = vmk_flags.vmf_no_cache;
2425 	const bool              is_submap = vmk_flags.vmkf_submap;
2426 	const bool              permanent = vmk_flags.vmf_permanent;
2427 	const bool              no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2428 	const bool              entry_for_jit = vmk_flags.vmkf_map_jit;
2429 	const bool              iokit_acct = vmk_flags.vmkf_iokit_acct;
2430 	const bool              resilient_codesign = vmk_flags.vmf_resilient_codesign;
2431 	const bool              resilient_media = vmk_flags.vmf_resilient_media;
2432 	const bool              entry_for_tpro = vmk_flags.vmf_tpro;
2433 	const unsigned int      superpage_size = vmk_flags.vmf_superpage_size;
2434 	const vm_tag_t          alias = vmk_flags.vm_tag;
2435 	vm_tag_t                user_alias;
2436 	kern_return_t           kr;
2437 	bool                    clear_map_aligned = FALSE;
2438 	vm_map_size_t           chunk_size = 0;
2439 	vm_object_t             caller_object;
2440 	VM_MAP_ZAP_DECLARE(zap_old_list);
2441 	VM_MAP_ZAP_DECLARE(zap_new_list);
2442 
2443 	caller_object = object;
2444 
2445 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2446 
2447 	if (vmk_flags.vmf_4gb_chunk) {
2448 #if defined(__LP64__)
2449 		chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2450 #else /* __LP64__ */
2451 		chunk_size = ANON_CHUNK_SIZE;
2452 #endif /* __LP64__ */
2453 	} else {
2454 		chunk_size = ANON_CHUNK_SIZE;
2455 	}
2456 
2457 
2458 
2459 	if (superpage_size) {
2460 		switch (superpage_size) {
2461 			/*
2462 			 * Note that the current implementation only supports
2463 			 * a single size for superpages, SUPERPAGE_SIZE, per
2464 			 * architecture. As soon as more sizes are supposed
2465 			 * to be supported, SUPERPAGE_SIZE has to be replaced
2466 			 * with a lookup of the size depending on superpage_size.
2467 			 */
2468 #ifdef __x86_64__
2469 		case SUPERPAGE_SIZE_ANY:
2470 			/* handle it like 2 MB and round up to page size */
2471 			size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2472 			OS_FALLTHROUGH;
2473 		case SUPERPAGE_SIZE_2MB:
2474 			break;
2475 #endif
2476 		default:
2477 			return KERN_INVALID_ARGUMENT;
2478 		}
2479 		mask = SUPERPAGE_SIZE - 1;
2480 		if (size & (SUPERPAGE_SIZE - 1)) {
2481 			return KERN_INVALID_ARGUMENT;
2482 		}
2483 		inheritance = VM_INHERIT_NONE;  /* fork() children won't inherit superpages */
2484 	}
2485 
2486 
2487 	if ((cur_protection & VM_PROT_WRITE) &&
2488 	    (cur_protection & VM_PROT_EXECUTE) &&
2489 #if XNU_TARGET_OS_OSX
2490 	    map->pmap != kernel_pmap &&
2491 	    (cs_process_global_enforcement() ||
2492 	    (vmk_flags.vmkf_cs_enforcement_override
2493 	    ? vmk_flags.vmkf_cs_enforcement
2494 	    : (vm_map_cs_enforcement(map)
2495 #if __arm64__
2496 	    || !VM_MAP_IS_EXOTIC(map)
2497 #endif /* __arm64__ */
2498 	    ))) &&
2499 #endif /* XNU_TARGET_OS_OSX */
2500 #if CODE_SIGNING_MONITOR
2501 	    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2502 #endif
2503 	    (VM_MAP_POLICY_WX_FAIL(map) ||
2504 	    VM_MAP_POLICY_WX_STRIP_X(map)) &&
2505 	    !entry_for_jit) {
2506 		boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2507 
2508 		DTRACE_VM3(cs_wx,
2509 		    uint64_t, 0,
2510 		    uint64_t, 0,
2511 		    vm_prot_t, cur_protection);
2512 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2513 		    proc_selfpid(),
2514 		    (get_bsdtask_info(current_task())
2515 		    ? proc_name_address(get_bsdtask_info(current_task()))
2516 		    : "?"),
2517 		    __FUNCTION__,
2518 		    (vm_protect_wx_fail ? "failing" : "turning off execute"));
2519 		cur_protection &= ~VM_PROT_EXECUTE;
2520 		if (vm_protect_wx_fail) {
2521 			return KERN_PROTECTION_FAILURE;
2522 		}
2523 	}
2524 
2525 	/*
2526 	 * If the task has requested executable lockdown,
2527 	 * deny any new executable mapping.
2528 	 */
2529 	if (map->map_disallow_new_exec == TRUE) {
2530 		if (cur_protection & VM_PROT_EXECUTE) {
2531 			return KERN_PROTECTION_FAILURE;
2532 		}
2533 	}
2534 
2535 	if (resilient_codesign) {
2536 		assert(!is_submap);
2537 		int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2538 		if ((cur_protection | max_protection) & reject_prot) {
2539 			return KERN_PROTECTION_FAILURE;
2540 		}
2541 	}
2542 
2543 	if (resilient_media) {
2544 		assert(!is_submap);
2545 //		assert(!needs_copy);
2546 		if (object != VM_OBJECT_NULL &&
2547 		    !object->internal) {
2548 			/*
2549 			 * This mapping is directly backed by an external
2550 			 * memory manager (e.g. a vnode pager for a file):
2551 			 * we would not have any safe place to inject
2552 			 * a zero-filled page if an actual page is not
2553 			 * available, without possibly impacting the actual
2554 			 * contents of the mapped object (e.g. the file),
2555 			 * so we can't provide any media resiliency here.
2556 			 */
2557 			return KERN_INVALID_ARGUMENT;
2558 		}
2559 	}
2560 
2561 	if (is_submap) {
2562 		vm_map_t submap;
2563 		if (purgable) {
2564 			/* submaps can not be purgeable */
2565 			return KERN_INVALID_ARGUMENT;
2566 		}
2567 		if (object == VM_OBJECT_NULL) {
2568 			/* submaps can not be created lazily */
2569 			return KERN_INVALID_ARGUMENT;
2570 		}
2571 		submap = (vm_map_t) object;
2572 		if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
2573 			/* page size mismatch */
2574 			return KERN_INVALID_ARGUMENT;
2575 		}
2576 	}
2577 	if (vmk_flags.vmkf_already) {
2578 		/*
2579 		 * VM_FLAGS_ALREADY says that it's OK if the same mapping
2580 		 * is already present.  For it to be meaningul, the requested
2581 		 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
2582 		 * we shouldn't try and remove what was mapped there first
2583 		 * (!VM_FLAGS_OVERWRITE).
2584 		 */
2585 		if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
2586 			return KERN_INVALID_ARGUMENT;
2587 		}
2588 	}
2589 
2590 	if (size == 0 ||
2591 	    (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
2592 		*address = 0;
2593 		return KERN_INVALID_ARGUMENT;
2594 	}
2595 
2596 	if (map->pmap == kernel_pmap) {
2597 		user_alias = VM_KERN_MEMORY_NONE;
2598 	} else {
2599 		user_alias = alias;
2600 	}
2601 
2602 	if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
2603 		chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
2604 	}
2605 
2606 #define RETURN(value)   { result = value; goto BailOut; }
2607 
2608 	assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
2609 	assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
2610 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
2611 		assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
2612 		assertf(page_aligned(size), "0x%llx", (uint64_t)size);
2613 	}
2614 
2615 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2616 	    !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
2617 		/*
2618 		 * In most cases, the caller rounds the size up to the
2619 		 * map's page size.
2620 		 * If we get a size that is explicitly not map-aligned here,
2621 		 * we'll have to respect the caller's wish and mark the
2622 		 * mapping as "not map-aligned" to avoid tripping the
2623 		 * map alignment checks later.
2624 		 */
2625 		clear_map_aligned = TRUE;
2626 	}
2627 	if (!anywhere &&
2628 	    VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2629 	    !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
2630 		/*
2631 		 * We've been asked to map at a fixed address and that
2632 		 * address is not aligned to the map's specific alignment.
2633 		 * The caller should know what it's doing (i.e. most likely
2634 		 * mapping some fragmented copy map, transferring memory from
2635 		 * a VM map with a different alignment), so clear map_aligned
2636 		 * for this new VM map entry and proceed.
2637 		 */
2638 		clear_map_aligned = TRUE;
2639 	}
2640 
2641 	/*
2642 	 * Only zero-fill objects are allowed to be purgable.
2643 	 * LP64todo - limit purgable objects to 32-bits for now
2644 	 */
2645 	if (purgable &&
2646 	    (offset != 0 ||
2647 	    (object != VM_OBJECT_NULL &&
2648 	    (object->vo_size != size ||
2649 	    object->purgable == VM_PURGABLE_DENY))
2650 #if __LP64__
2651 	    || size > ANON_MAX_SIZE
2652 #endif
2653 	    )) {
2654 		return KERN_INVALID_ARGUMENT;
2655 	}
2656 
2657 	start = *address;
2658 
2659 	if (anywhere) {
2660 		vm_map_lock(map);
2661 		map_locked = TRUE;
2662 
2663 		result = vm_map_locate_space(map, size, mask, vmk_flags,
2664 		    &start, &entry);
2665 		if (result != KERN_SUCCESS) {
2666 			goto BailOut;
2667 		}
2668 
2669 		*address = start;
2670 		end = start + size;
2671 		assert(VM_MAP_PAGE_ALIGNED(*address,
2672 		    VM_MAP_PAGE_MASK(map)));
2673 	} else {
2674 		vm_map_offset_t effective_min_offset, effective_max_offset;
2675 
2676 		effective_min_offset = map->min_offset;
2677 		effective_max_offset = map->max_offset;
2678 
2679 		if (vmk_flags.vmkf_beyond_max) {
2680 			/*
2681 			 * Allow an insertion beyond the map's max offset.
2682 			 */
2683 			effective_max_offset = 0x00000000FFFFF000ULL;
2684 			if (vm_map_is_64bit(map)) {
2685 				effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2686 			}
2687 #if XNU_TARGET_OS_OSX
2688 		} else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2689 			effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2690 #endif /* XNU_TARGET_OS_OSX */
2691 		}
2692 
2693 		if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2694 		    !overwrite &&
2695 		    user_alias == VM_MEMORY_REALLOC) {
2696 			/*
2697 			 * Force realloc() to switch to a new allocation,
2698 			 * to prevent 4k-fragmented virtual ranges.
2699 			 */
2700 //			DEBUG4K_ERROR("no realloc in place");
2701 			return KERN_NO_SPACE;
2702 		}
2703 
2704 		/*
2705 		 *	Verify that:
2706 		 *		the address doesn't itself violate
2707 		 *		the mask requirement.
2708 		 */
2709 
2710 		vm_map_lock(map);
2711 		map_locked = TRUE;
2712 		if ((start & mask) != 0) {
2713 			RETURN(KERN_NO_SPACE);
2714 		}
2715 
2716 #if CONFIG_MAP_RANGES
2717 		if (map->uses_user_ranges) {
2718 			struct mach_vm_range r;
2719 
2720 			vm_map_user_range_resolve(map, start, 1, &r);
2721 			if (r.max_address == 0) {
2722 				RETURN(KERN_INVALID_ADDRESS);
2723 			}
2724 			effective_min_offset = r.min_address;
2725 			effective_max_offset = r.max_address;
2726 		}
2727 #endif /* CONFIG_MAP_RANGES */
2728 
2729 		if ((startup_phase >= STARTUP_SUB_KMEM) && !is_submap &&
2730 		    (map == kernel_map)) {
2731 			mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
2732 			effective_min_offset = r->min_address;
2733 			effective_max_offset = r->max_address;
2734 		}
2735 
2736 		/*
2737 		 *	...	the address is within bounds
2738 		 */
2739 
2740 		end = start + size;
2741 
2742 		if ((start < effective_min_offset) ||
2743 		    (end > effective_max_offset) ||
2744 		    (start >= end)) {
2745 			RETURN(KERN_INVALID_ADDRESS);
2746 		}
2747 
2748 		if (overwrite) {
2749 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
2750 			kern_return_t remove_kr;
2751 
2752 			/*
2753 			 * Fixed mapping and "overwrite" flag: attempt to
2754 			 * remove all existing mappings in the specified
2755 			 * address range, saving them in our "zap_old_list".
2756 			 *
2757 			 * This avoids releasing the VM map lock in
2758 			 * vm_map_entry_delete() and allows atomicity
2759 			 * when we want to replace some mappings with a new one.
2760 			 * It also allows us to restore the old VM mappings if the
2761 			 * new mapping fails.
2762 			 */
2763 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2764 
2765 			if (vmk_flags.vmkf_overwrite_immutable) {
2766 				/* we can overwrite immutable mappings */
2767 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2768 			}
2769 			if (vmk_flags.vmkf_remap_prot_copy) {
2770 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2771 			}
2772 			remove_kr = vm_map_delete(map, start, end, remove_flags,
2773 			    KMEM_GUARD_NONE, &zap_old_list).kmr_return;
2774 			if (remove_kr) {
2775 				/* XXX FBDP restore zap_old_list? */
2776 				RETURN(remove_kr);
2777 			}
2778 		}
2779 
2780 		/*
2781 		 *	...	the starting address isn't allocated
2782 		 */
2783 
2784 		if (vm_map_lookup_entry(map, start, &entry)) {
2785 			if (!(vmk_flags.vmkf_already)) {
2786 				RETURN(KERN_NO_SPACE);
2787 			}
2788 			/*
2789 			 * Check if what's already there is what we want.
2790 			 */
2791 			tmp_start = start;
2792 			tmp_offset = offset;
2793 			if (entry->vme_start < start) {
2794 				tmp_start -= start - entry->vme_start;
2795 				tmp_offset -= start - entry->vme_start;
2796 			}
2797 			for (; entry->vme_start < end;
2798 			    entry = entry->vme_next) {
2799 				/*
2800 				 * Check if the mapping's attributes
2801 				 * match the existing map entry.
2802 				 */
2803 				if (entry == vm_map_to_entry(map) ||
2804 				    entry->vme_start != tmp_start ||
2805 				    entry->is_sub_map != is_submap ||
2806 				    VME_OFFSET(entry) != tmp_offset ||
2807 				    entry->needs_copy != needs_copy ||
2808 				    entry->protection != cur_protection ||
2809 				    entry->max_protection != max_protection ||
2810 				    entry->inheritance != inheritance ||
2811 				    entry->iokit_acct != iokit_acct ||
2812 				    VME_ALIAS(entry) != alias) {
2813 					/* not the same mapping ! */
2814 					RETURN(KERN_NO_SPACE);
2815 				}
2816 				/*
2817 				 * Check if the same object is being mapped.
2818 				 */
2819 				if (is_submap) {
2820 					if (VME_SUBMAP(entry) !=
2821 					    (vm_map_t) object) {
2822 						/* not the same submap */
2823 						RETURN(KERN_NO_SPACE);
2824 					}
2825 				} else {
2826 					if (VME_OBJECT(entry) != object) {
2827 						/* not the same VM object... */
2828 						vm_object_t obj2;
2829 
2830 						obj2 = VME_OBJECT(entry);
2831 						if ((obj2 == VM_OBJECT_NULL ||
2832 						    obj2->internal) &&
2833 						    (object == VM_OBJECT_NULL ||
2834 						    object->internal)) {
2835 							/*
2836 							 * ... but both are
2837 							 * anonymous memory,
2838 							 * so equivalent.
2839 							 */
2840 						} else {
2841 							RETURN(KERN_NO_SPACE);
2842 						}
2843 					}
2844 				}
2845 
2846 				tmp_offset += entry->vme_end - entry->vme_start;
2847 				tmp_start += entry->vme_end - entry->vme_start;
2848 				if (entry->vme_end >= end) {
2849 					/* reached the end of our mapping */
2850 					break;
2851 				}
2852 			}
2853 			/* it all matches:  let's use what's already there ! */
2854 			RETURN(KERN_MEMORY_PRESENT);
2855 		}
2856 
2857 		/*
2858 		 *	...	the next region doesn't overlap the
2859 		 *		end point.
2860 		 */
2861 
2862 		if ((entry->vme_next != vm_map_to_entry(map)) &&
2863 		    (entry->vme_next->vme_start < end)) {
2864 			RETURN(KERN_NO_SPACE);
2865 		}
2866 	}
2867 
2868 	/*
2869 	 *	At this point,
2870 	 *		"start" and "end" should define the endpoints of the
2871 	 *			available new range, and
2872 	 *		"entry" should refer to the region before the new
2873 	 *			range, and
2874 	 *
2875 	 *		the map should be locked.
2876 	 */
2877 
2878 	/*
2879 	 *	See whether we can avoid creating a new entry (and object) by
2880 	 *	extending one of our neighbors.  [So far, we only attempt to
2881 	 *	extend from below.]  Note that we can never extend/join
2882 	 *	purgable objects because they need to remain distinct
2883 	 *	entities in order to implement their "volatile object"
2884 	 *	semantics.
2885 	 */
2886 
2887 	if (purgable ||
2888 	    entry_for_jit ||
2889 	    entry_for_tpro ||
2890 	    vm_memory_malloc_no_cow(user_alias)) {
2891 		if (object == VM_OBJECT_NULL) {
2892 			object = vm_object_allocate(size);
2893 			object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2894 			object->true_share = FALSE;
2895 			if (purgable) {
2896 				task_t owner;
2897 				object->purgable = VM_PURGABLE_NONVOLATILE;
2898 				if (map->pmap == kernel_pmap) {
2899 					/*
2900 					 * Purgeable mappings made in a kernel
2901 					 * map are "owned" by the kernel itself
2902 					 * rather than the current user task
2903 					 * because they're likely to be used by
2904 					 * more than this user task (see
2905 					 * execargs_purgeable_allocate(), for
2906 					 * example).
2907 					 */
2908 					owner = kernel_task;
2909 				} else {
2910 					owner = current_task();
2911 				}
2912 				assert(object->vo_owner == NULL);
2913 				assert(object->resident_page_count == 0);
2914 				assert(object->wired_page_count == 0);
2915 				vm_object_lock(object);
2916 				vm_purgeable_nonvolatile_enqueue(object, owner);
2917 				vm_object_unlock(object);
2918 			}
2919 			offset = (vm_object_offset_t)0;
2920 		}
2921 	} else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
2922 		/* no coalescing if address space uses sub-pages */
2923 	} else if ((is_submap == FALSE) &&
2924 	    (object == VM_OBJECT_NULL) &&
2925 	    (entry != vm_map_to_entry(map)) &&
2926 	    (entry->vme_end == start) &&
2927 	    (!entry->is_shared) &&
2928 	    (!entry->is_sub_map) &&
2929 	    (!entry->in_transition) &&
2930 	    (!entry->needs_wakeup) &&
2931 	    (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
2932 	    (entry->protection == cur_protection) &&
2933 	    (entry->max_protection == max_protection) &&
2934 	    (entry->inheritance == inheritance) &&
2935 	    ((user_alias == VM_MEMORY_REALLOC) ||
2936 	    (VME_ALIAS(entry) == alias)) &&
2937 	    (entry->no_cache == no_cache) &&
2938 	    (entry->vme_permanent == permanent) &&
2939 	    /* no coalescing for immutable executable mappings */
2940 	    !((entry->protection & VM_PROT_EXECUTE) &&
2941 	    entry->vme_permanent) &&
2942 	    (!entry->superpage_size && !superpage_size) &&
2943 	    /*
2944 	     * No coalescing if not map-aligned, to avoid propagating
2945 	     * that condition any further than needed:
2946 	     */
2947 	    (!entry->map_aligned || !clear_map_aligned) &&
2948 	    (!entry->zero_wired_pages) &&
2949 	    (!entry->used_for_jit && !entry_for_jit) &&
2950 #if __arm64e__
2951 	    (!entry->used_for_tpro && !entry_for_tpro) &&
2952 #endif
2953 	    (!entry->csm_associated) &&
2954 	    (entry->iokit_acct == iokit_acct) &&
2955 	    (!entry->vme_resilient_codesign) &&
2956 	    (!entry->vme_resilient_media) &&
2957 	    (!entry->vme_atomic) &&
2958 	    (entry->vme_no_copy_on_read == no_copy_on_read) &&
2959 
2960 	    ((entry->vme_end - entry->vme_start) + size <=
2961 	    (user_alias == VM_MEMORY_REALLOC ?
2962 	    ANON_CHUNK_SIZE :
2963 	    NO_COALESCE_LIMIT)) &&
2964 
2965 	    (entry->wired_count == 0)) {        /* implies user_wired_count == 0 */
2966 		if (vm_object_coalesce(VME_OBJECT(entry),
2967 		    VM_OBJECT_NULL,
2968 		    VME_OFFSET(entry),
2969 		    (vm_object_offset_t) 0,
2970 		    (vm_map_size_t)(entry->vme_end - entry->vme_start),
2971 		    (vm_map_size_t)(end - entry->vme_end))) {
2972 			/*
2973 			 *	Coalesced the two objects - can extend
2974 			 *	the previous map entry to include the
2975 			 *	new range.
2976 			 */
2977 			map->size += (end - entry->vme_end);
2978 			assert(entry->vme_start < end);
2979 			assert(VM_MAP_PAGE_ALIGNED(end,
2980 			    VM_MAP_PAGE_MASK(map)));
2981 			if (__improbable(vm_debug_events)) {
2982 				DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
2983 			}
2984 			entry->vme_end = end;
2985 			if (map->holelistenabled) {
2986 				vm_map_store_update_first_free(map, entry, TRUE);
2987 			} else {
2988 				vm_map_store_update_first_free(map, map->first_free, TRUE);
2989 			}
2990 			new_mapping_established = TRUE;
2991 			RETURN(KERN_SUCCESS);
2992 		}
2993 	}
2994 
2995 	step = superpage_size ? SUPERPAGE_SIZE : (end - start);
2996 	new_entry = NULL;
2997 
2998 	if (vmk_flags.vmkf_submap_adjust) {
2999 		vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
3000 		offset = start;
3001 	}
3002 
3003 	for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3004 		tmp2_end = tmp2_start + step;
3005 		/*
3006 		 *	Create a new entry
3007 		 *
3008 		 * XXX FBDP
3009 		 * The reserved "page zero" in each process's address space can
3010 		 * be arbitrarily large.  Splitting it into separate objects and
3011 		 * therefore different VM map entries serves no purpose and just
3012 		 * slows down operations on the VM map, so let's not split the
3013 		 * allocation into chunks if the max protection is NONE.  That
3014 		 * memory should never be accessible, so it will never get to the
3015 		 * default pager.
3016 		 */
3017 		tmp_start = tmp2_start;
3018 		if (!is_submap &&
3019 		    object == VM_OBJECT_NULL &&
3020 		    size > chunk_size &&
3021 		    max_protection != VM_PROT_NONE &&
3022 		    superpage_size == 0) {
3023 			tmp_end = tmp_start + chunk_size;
3024 		} else {
3025 			tmp_end = tmp2_end;
3026 		}
3027 		do {
3028 			if (!is_submap &&
3029 			    object != VM_OBJECT_NULL &&
3030 			    object->internal &&
3031 			    offset + (tmp_end - tmp_start) > object->vo_size) {
3032 //				printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3033 				DTRACE_VM5(vm_map_enter_overmap,
3034 				    vm_map_t, map,
3035 				    vm_map_address_t, tmp_start,
3036 				    vm_map_address_t, tmp_end,
3037 				    vm_object_offset_t, offset,
3038 				    vm_object_size_t, object->vo_size);
3039 			}
3040 			new_entry = vm_map_entry_insert(map,
3041 			    entry, tmp_start, tmp_end,
3042 			    object, offset, vmk_flags,
3043 			    needs_copy,
3044 			    cur_protection, max_protection,
3045 			    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3046 			    VM_INHERIT_NONE : inheritance),
3047 			    clear_map_aligned);
3048 
3049 			assert((object != kernel_object) || (VM_KERN_MEMORY_NONE != alias));
3050 
3051 			if (resilient_codesign) {
3052 				int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3053 				if (!((cur_protection | max_protection) & reject_prot)) {
3054 					new_entry->vme_resilient_codesign = TRUE;
3055 				}
3056 			}
3057 
3058 			if (resilient_media &&
3059 			    (object == VM_OBJECT_NULL ||
3060 			    object->internal)) {
3061 				new_entry->vme_resilient_media = TRUE;
3062 			}
3063 
3064 			assert(!new_entry->iokit_acct);
3065 			if (!is_submap &&
3066 			    object != VM_OBJECT_NULL &&
3067 			    (object->purgable != VM_PURGABLE_DENY ||
3068 			    object->vo_ledger_tag)) {
3069 				assert(new_entry->use_pmap);
3070 				assert(!new_entry->iokit_acct);
3071 				/*
3072 				 * Turn off pmap accounting since
3073 				 * purgeable (or tagged) objects have their
3074 				 * own ledgers.
3075 				 */
3076 				new_entry->use_pmap = FALSE;
3077 			} else if (!is_submap &&
3078 			    iokit_acct &&
3079 			    object != VM_OBJECT_NULL &&
3080 			    object->internal) {
3081 				/* alternate accounting */
3082 				assert(!new_entry->iokit_acct);
3083 				assert(new_entry->use_pmap);
3084 				new_entry->iokit_acct = TRUE;
3085 				new_entry->use_pmap = FALSE;
3086 				DTRACE_VM4(
3087 					vm_map_iokit_mapped_region,
3088 					vm_map_t, map,
3089 					vm_map_offset_t, new_entry->vme_start,
3090 					vm_map_offset_t, new_entry->vme_end,
3091 					int, VME_ALIAS(new_entry));
3092 				vm_map_iokit_mapped_region(
3093 					map,
3094 					(new_entry->vme_end -
3095 					new_entry->vme_start));
3096 			} else if (!is_submap) {
3097 				assert(!new_entry->iokit_acct);
3098 				assert(new_entry->use_pmap);
3099 			}
3100 
3101 			if (is_submap) {
3102 				vm_map_t        submap;
3103 				boolean_t       submap_is_64bit;
3104 				boolean_t       use_pmap;
3105 
3106 				assert(new_entry->is_sub_map);
3107 				assert(!new_entry->use_pmap);
3108 				assert(!new_entry->iokit_acct);
3109 				submap = (vm_map_t) object;
3110 				submap_is_64bit = vm_map_is_64bit(submap);
3111 				use_pmap = vmk_flags.vmkf_nested_pmap;
3112 #ifndef NO_NESTED_PMAP
3113 				if (use_pmap && submap->pmap == NULL) {
3114 					ledger_t ledger = map->pmap->ledger;
3115 					/* we need a sub pmap to nest... */
3116 					submap->pmap = pmap_create_options(ledger, 0,
3117 					    submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3118 					if (submap->pmap == NULL) {
3119 						/* let's proceed without nesting... */
3120 					}
3121 #if defined(__arm64__)
3122 					else {
3123 						pmap_set_nested(submap->pmap);
3124 					}
3125 #endif
3126 				}
3127 				if (use_pmap && submap->pmap != NULL) {
3128 					if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3129 						DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3130 						kr = KERN_FAILURE;
3131 					} else {
3132 						kr = pmap_nest(map->pmap,
3133 						    submap->pmap,
3134 						    tmp_start,
3135 						    tmp_end - tmp_start);
3136 					}
3137 					if (kr != KERN_SUCCESS) {
3138 						printf("vm_map_enter: "
3139 						    "pmap_nest(0x%llx,0x%llx) "
3140 						    "error 0x%x\n",
3141 						    (long long)tmp_start,
3142 						    (long long)tmp_end,
3143 						    kr);
3144 					} else {
3145 						/* we're now nested ! */
3146 						new_entry->use_pmap = TRUE;
3147 						pmap_empty = FALSE;
3148 					}
3149 				}
3150 #endif /* NO_NESTED_PMAP */
3151 			}
3152 			entry = new_entry;
3153 
3154 			if (superpage_size) {
3155 				vm_page_t pages, m;
3156 				vm_object_t sp_object;
3157 				vm_object_offset_t sp_offset;
3158 
3159 				VME_OFFSET_SET(entry, 0);
3160 
3161 				/* allocate one superpage */
3162 				kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3163 				if (kr != KERN_SUCCESS) {
3164 					/* deallocate whole range... */
3165 					new_mapping_established = TRUE;
3166 					/* ... but only up to "tmp_end" */
3167 					size -= end - tmp_end;
3168 					RETURN(kr);
3169 				}
3170 
3171 				/* create one vm_object per superpage */
3172 				sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3173 				sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3174 				sp_object->phys_contiguous = TRUE;
3175 				sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3176 				VME_OBJECT_SET(entry, sp_object, false, 0);
3177 				assert(entry->use_pmap);
3178 
3179 				/* enter the base pages into the object */
3180 				vm_object_lock(sp_object);
3181 				for (sp_offset = 0;
3182 				    sp_offset < SUPERPAGE_SIZE;
3183 				    sp_offset += PAGE_SIZE) {
3184 					m = pages;
3185 					pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3186 					pages = NEXT_PAGE(m);
3187 					*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3188 					vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3189 				}
3190 				vm_object_unlock(sp_object);
3191 			}
3192 		} while (tmp_end != tmp2_end &&
3193 		    (tmp_start = tmp_end) &&
3194 		    (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3195 		    tmp_end + chunk_size : tmp2_end));
3196 	}
3197 
3198 	new_mapping_established = TRUE;
3199 
3200 BailOut:
3201 	assert(map_locked == TRUE);
3202 
3203 	/*
3204 	 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3205 	 * If we have identified and possibly established the new mapping(s),
3206 	 * make sure we did not go beyond the address space limit.
3207 	 */
3208 	if (result == KERN_SUCCESS) {
3209 		if (map->size_limit != RLIM_INFINITY &&
3210 		    map->size > map->size_limit) {
3211 			/*
3212 			 * Establishing the requested mappings would exceed
3213 			 * the process's RLIMIT_AS limit: fail with
3214 			 * KERN_NO_SPACE.
3215 			 */
3216 			result = KERN_NO_SPACE;
3217 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3218 			    proc_selfpid(),
3219 			    (get_bsdtask_info(current_task())
3220 			    ? proc_name_address(get_bsdtask_info(current_task()))
3221 			    : "?"),
3222 			    __FUNCTION__,
3223 			    (uint64_t) map->size,
3224 			    (uint64_t) map->size_limit);
3225 			DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3226 			    vm_map_size_t, map->size,
3227 			    uint64_t, map->size_limit);
3228 			vm_map_enter_RLIMIT_AS_count++;
3229 		} else if (map->data_limit != RLIM_INFINITY &&
3230 		    map->size > map->data_limit) {
3231 			/*
3232 			 * Establishing the requested mappings would exceed
3233 			 * the process's RLIMIT_DATA limit: fail with
3234 			 * KERN_NO_SPACE.
3235 			 */
3236 			result = KERN_NO_SPACE;
3237 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3238 			    proc_selfpid(),
3239 			    (get_bsdtask_info(current_task())
3240 			    ? proc_name_address(get_bsdtask_info(current_task()))
3241 			    : "?"),
3242 			    __FUNCTION__,
3243 			    (uint64_t) map->size,
3244 			    (uint64_t) map->data_limit);
3245 			DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3246 			    vm_map_size_t, map->size,
3247 			    uint64_t, map->data_limit);
3248 			vm_map_enter_RLIMIT_DATA_count++;
3249 		}
3250 	}
3251 
3252 	if (result == KERN_SUCCESS) {
3253 		vm_prot_t pager_prot;
3254 		memory_object_t pager;
3255 
3256 #if DEBUG
3257 		if (pmap_empty &&
3258 		    !(vmk_flags.vmkf_no_pmap_check)) {
3259 			assert(pmap_is_empty(map->pmap,
3260 			    *address,
3261 			    *address + size));
3262 		}
3263 #endif /* DEBUG */
3264 
3265 		/*
3266 		 * For "named" VM objects, let the pager know that the
3267 		 * memory object is being mapped.  Some pagers need to keep
3268 		 * track of this, to know when they can reclaim the memory
3269 		 * object, for example.
3270 		 * VM calls memory_object_map() for each mapping (specifying
3271 		 * the protection of each mapping) and calls
3272 		 * memory_object_last_unmap() when all the mappings are gone.
3273 		 */
3274 		pager_prot = max_protection;
3275 		if (needs_copy) {
3276 			/*
3277 			 * Copy-On-Write mapping: won't modify
3278 			 * the memory object.
3279 			 */
3280 			pager_prot &= ~VM_PROT_WRITE;
3281 		}
3282 		if (!is_submap &&
3283 		    object != VM_OBJECT_NULL &&
3284 		    object->named &&
3285 		    object->pager != MEMORY_OBJECT_NULL) {
3286 			vm_object_lock(object);
3287 			pager = object->pager;
3288 			if (object->named &&
3289 			    pager != MEMORY_OBJECT_NULL) {
3290 				assert(object->pager_ready);
3291 				vm_object_mapping_wait(object, THREAD_UNINT);
3292 				vm_object_mapping_begin(object);
3293 				vm_object_unlock(object);
3294 
3295 				kr = memory_object_map(pager, pager_prot);
3296 				assert(kr == KERN_SUCCESS);
3297 
3298 				vm_object_lock(object);
3299 				vm_object_mapping_end(object);
3300 			}
3301 			vm_object_unlock(object);
3302 		}
3303 	}
3304 
3305 	assert(map_locked == TRUE);
3306 
3307 	if (new_mapping_established) {
3308 		/*
3309 		 * If we release the map lock for any reason below,
3310 		 * another thread could deallocate our new mapping,
3311 		 * releasing the caller's reference on "caller_object",
3312 		 * which was transferred to the mapping.
3313 		 * If this was the only reference, the object could be
3314 		 * destroyed.
3315 		 *
3316 		 * We need to take an extra reference on "caller_object"
3317 		 * to keep it alive if we need to return the caller's
3318 		 * reference to the caller in case of failure.
3319 		 */
3320 		if (is_submap) {
3321 			vm_map_reference((vm_map_t)caller_object);
3322 		} else {
3323 			vm_object_reference(caller_object);
3324 		}
3325 	}
3326 
3327 	if (!keep_map_locked) {
3328 		vm_map_unlock(map);
3329 		map_locked = FALSE;
3330 		entry = VM_MAP_ENTRY_NULL;
3331 		new_entry = VM_MAP_ENTRY_NULL;
3332 	}
3333 
3334 	/*
3335 	 * We can't hold the map lock if we enter this block.
3336 	 */
3337 
3338 	if (result == KERN_SUCCESS) {
3339 		/*	Wire down the new entry if the user
3340 		 *	requested all new map entries be wired.
3341 		 */
3342 		if ((map->wiring_required) || (superpage_size)) {
3343 			assert(!keep_map_locked);
3344 			pmap_empty = FALSE; /* pmap won't be empty */
3345 			kr = vm_map_wire_kernel(map, start, end,
3346 			    cur_protection, VM_KERN_MEMORY_MLOCK,
3347 			    TRUE);
3348 			result = kr;
3349 		}
3350 
3351 	}
3352 
3353 	if (result != KERN_SUCCESS) {
3354 		if (new_mapping_established) {
3355 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3356 
3357 			/*
3358 			 * We have to get rid of the new mappings since we
3359 			 * won't make them available to the user.
3360 			 * Try and do that atomically, to minimize the risk
3361 			 * that someone else create new mappings that range.
3362 			 */
3363 			if (!map_locked) {
3364 				vm_map_lock(map);
3365 				map_locked = TRUE;
3366 			}
3367 			remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3368 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3369 			if (permanent) {
3370 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3371 			}
3372 			(void) vm_map_delete(map,
3373 			    *address, *address + size,
3374 			    remove_flags,
3375 			    KMEM_GUARD_NONE, &zap_new_list);
3376 		}
3377 
3378 		if (vm_map_zap_first_entry(&zap_old_list)) {
3379 			vm_map_entry_t entry1, entry2;
3380 
3381 			/*
3382 			 * The new mapping failed.  Attempt to restore
3383 			 * the old mappings, saved in the "zap_old_map".
3384 			 */
3385 			if (!map_locked) {
3386 				vm_map_lock(map);
3387 				map_locked = TRUE;
3388 			}
3389 
3390 			/* first check if the coast is still clear */
3391 			start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3392 			end   = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3393 
3394 			if (vm_map_lookup_entry(map, start, &entry1) ||
3395 			    vm_map_lookup_entry(map, end, &entry2) ||
3396 			    entry1 != entry2) {
3397 				/*
3398 				 * Part of that range has already been
3399 				 * re-mapped:  we can't restore the old
3400 				 * mappings...
3401 				 */
3402 				vm_map_enter_restore_failures++;
3403 			} else {
3404 				/*
3405 				 * Transfer the saved map entries from
3406 				 * "zap_old_map" to the original "map",
3407 				 * inserting them all after "entry1".
3408 				 */
3409 				while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3410 					vm_map_size_t entry_size;
3411 
3412 					entry_size = (entry2->vme_end -
3413 					    entry2->vme_start);
3414 					vm_map_store_entry_link(map, entry1, entry2,
3415 					    VM_MAP_KERNEL_FLAGS_NONE);
3416 					map->size += entry_size;
3417 					entry1 = entry2;
3418 				}
3419 				if (map->wiring_required) {
3420 					/*
3421 					 * XXX TODO: we should rewire the
3422 					 * old pages here...
3423 					 */
3424 				}
3425 				vm_map_enter_restore_successes++;
3426 			}
3427 		}
3428 	}
3429 
3430 	/*
3431 	 * The caller is responsible for releasing the lock if it requested to
3432 	 * keep the map locked.
3433 	 */
3434 	if (map_locked && !keep_map_locked) {
3435 		vm_map_unlock(map);
3436 	}
3437 
3438 	vm_map_zap_dispose(&zap_old_list);
3439 	vm_map_zap_dispose(&zap_new_list);
3440 
3441 	if (new_mapping_established) {
3442 		/*
3443 		 * The caller had a reference on "caller_object" and we
3444 		 * transferred that reference to the mapping.
3445 		 * We also took an extra reference on "caller_object" to keep
3446 		 * it alive while the map was unlocked.
3447 		 */
3448 		if (result == KERN_SUCCESS) {
3449 			/*
3450 			 * On success, the caller's reference on the object gets
3451 			 * tranferred to the mapping.
3452 			 * Release our extra reference.
3453 			 */
3454 			if (is_submap) {
3455 				vm_map_deallocate((vm_map_t)caller_object);
3456 			} else {
3457 				vm_object_deallocate(caller_object);
3458 			}
3459 		} else {
3460 			/*
3461 			 * On error, the caller expects to still have a
3462 			 * reference on the object it gave us.
3463 			 * Let's use our extra reference for that.
3464 			 */
3465 		}
3466 	}
3467 
3468 	return result;
3469 
3470 #undef  RETURN
3471 }
3472 
3473 #if __arm64__
3474 extern const struct memory_object_pager_ops fourk_pager_ops;
3475 kern_return_t
vm_map_enter_fourk(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)3476 vm_map_enter_fourk(
3477 	vm_map_t                map,
3478 	vm_map_offset_t         *address,       /* IN/OUT */
3479 	vm_map_size_t           size,
3480 	vm_map_offset_t         mask,
3481 	vm_map_kernel_flags_t   vmk_flags,
3482 	vm_object_t             object,
3483 	vm_object_offset_t      offset,
3484 	boolean_t               needs_copy,
3485 	vm_prot_t               cur_protection,
3486 	vm_prot_t               max_protection,
3487 	vm_inherit_t            inheritance)
3488 {
3489 	vm_map_entry_t          entry, new_entry;
3490 	vm_map_offset_t         start, fourk_start;
3491 	vm_map_offset_t         end, fourk_end;
3492 	vm_map_size_t           fourk_size;
3493 	kern_return_t           result = KERN_SUCCESS;
3494 	boolean_t               map_locked = FALSE;
3495 	boolean_t               pmap_empty = TRUE;
3496 	boolean_t               new_mapping_established = FALSE;
3497 	const bool              keep_map_locked = vmk_flags.vmkf_keep_map_locked;
3498 	const bool              anywhere = !vmk_flags.vmf_fixed;
3499 	const bool              purgable = vmk_flags.vmf_purgeable;
3500 	const bool              overwrite = vmk_flags.vmf_overwrite;
3501 	const bool              is_submap = vmk_flags.vmkf_submap;
3502 	const bool              entry_for_jit = vmk_flags.vmkf_map_jit;
3503 	const unsigned int      superpage_size = vmk_flags.vmf_superpage_size;
3504 	vm_map_offset_t         effective_min_offset, effective_max_offset;
3505 	kern_return_t           kr;
3506 	boolean_t               clear_map_aligned = FALSE;
3507 	memory_object_t         fourk_mem_obj;
3508 	vm_object_t             fourk_object;
3509 	vm_map_offset_t         fourk_pager_offset;
3510 	int                     fourk_pager_index_start, fourk_pager_index_num;
3511 	int                     cur_idx;
3512 	boolean_t               fourk_copy;
3513 	vm_object_t             copy_object;
3514 	vm_object_offset_t      copy_offset;
3515 	VM_MAP_ZAP_DECLARE(zap_list);
3516 
3517 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
3518 		panic("%s:%d", __FUNCTION__, __LINE__);
3519 	}
3520 	fourk_mem_obj = MEMORY_OBJECT_NULL;
3521 	fourk_object = VM_OBJECT_NULL;
3522 
3523 	if (superpage_size) {
3524 		return KERN_NOT_SUPPORTED;
3525 	}
3526 
3527 	if ((cur_protection & VM_PROT_WRITE) &&
3528 	    (cur_protection & VM_PROT_EXECUTE) &&
3529 #if XNU_TARGET_OS_OSX
3530 	    map->pmap != kernel_pmap &&
3531 	    (vm_map_cs_enforcement(map)
3532 #if __arm64__
3533 	    || !VM_MAP_IS_EXOTIC(map)
3534 #endif /* __arm64__ */
3535 	    ) &&
3536 #endif /* XNU_TARGET_OS_OSX */
3537 #if CODE_SIGNING_MONITOR
3538 	    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
3539 #endif
3540 	    !entry_for_jit) {
3541 		DTRACE_VM3(cs_wx,
3542 		    uint64_t, 0,
3543 		    uint64_t, 0,
3544 		    vm_prot_t, cur_protection);
3545 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. "
3546 		    "turning off execute\n",
3547 		    proc_selfpid(),
3548 		    (get_bsdtask_info(current_task())
3549 		    ? proc_name_address(get_bsdtask_info(current_task()))
3550 		    : "?"),
3551 		    __FUNCTION__);
3552 		cur_protection &= ~VM_PROT_EXECUTE;
3553 	}
3554 
3555 	/*
3556 	 * If the task has requested executable lockdown,
3557 	 * deny any new executable mapping.
3558 	 */
3559 	if (map->map_disallow_new_exec == TRUE) {
3560 		if (cur_protection & VM_PROT_EXECUTE) {
3561 			return KERN_PROTECTION_FAILURE;
3562 		}
3563 	}
3564 
3565 	if (is_submap) {
3566 		return KERN_NOT_SUPPORTED;
3567 	}
3568 	if (vmk_flags.vmkf_already) {
3569 		return KERN_NOT_SUPPORTED;
3570 	}
3571 	if (purgable || entry_for_jit) {
3572 		return KERN_NOT_SUPPORTED;
3573 	}
3574 
3575 	effective_min_offset = map->min_offset;
3576 
3577 	if (vmk_flags.vmkf_beyond_max) {
3578 		return KERN_NOT_SUPPORTED;
3579 	} else {
3580 		effective_max_offset = map->max_offset;
3581 	}
3582 
3583 	if (size == 0 ||
3584 	    (offset & FOURK_PAGE_MASK) != 0) {
3585 		*address = 0;
3586 		return KERN_INVALID_ARGUMENT;
3587 	}
3588 
3589 #define RETURN(value)   { result = value; goto BailOut; }
3590 
3591 	assert(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK));
3592 	assert(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK));
3593 
3594 	if (!anywhere && overwrite) {
3595 		return KERN_NOT_SUPPORTED;
3596 	}
3597 
3598 	fourk_start = *address;
3599 	fourk_size = size;
3600 	fourk_end = fourk_start + fourk_size;
3601 
3602 	start = vm_map_trunc_page(*address, VM_MAP_PAGE_MASK(map));
3603 	end = vm_map_round_page(fourk_end, VM_MAP_PAGE_MASK(map));
3604 	size = end - start;
3605 
3606 	if (anywhere) {
3607 		return KERN_NOT_SUPPORTED;
3608 	} else {
3609 		/*
3610 		 *	Verify that:
3611 		 *		the address doesn't itself violate
3612 		 *		the mask requirement.
3613 		 */
3614 
3615 		vm_map_lock(map);
3616 		map_locked = TRUE;
3617 		if ((start & mask) != 0) {
3618 			RETURN(KERN_NO_SPACE);
3619 		}
3620 
3621 		/*
3622 		 *	...	the address is within bounds
3623 		 */
3624 
3625 		end = start + size;
3626 
3627 		if ((start < effective_min_offset) ||
3628 		    (end > effective_max_offset) ||
3629 		    (start >= end)) {
3630 			RETURN(KERN_INVALID_ADDRESS);
3631 		}
3632 
3633 		/*
3634 		 *	...	the starting address isn't allocated
3635 		 */
3636 		if (vm_map_lookup_entry(map, start, &entry)) {
3637 			vm_object_t cur_object, shadow_object;
3638 
3639 			/*
3640 			 * We might already some 4K mappings
3641 			 * in a 16K page here.
3642 			 */
3643 
3644 			if (entry->vme_end - entry->vme_start
3645 			    != SIXTEENK_PAGE_SIZE) {
3646 				RETURN(KERN_NO_SPACE);
3647 			}
3648 			if (entry->is_sub_map) {
3649 				RETURN(KERN_NO_SPACE);
3650 			}
3651 			if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
3652 				RETURN(KERN_NO_SPACE);
3653 			}
3654 
3655 			/* go all the way down the shadow chain */
3656 			cur_object = VME_OBJECT(entry);
3657 			vm_object_lock(cur_object);
3658 			while (cur_object->shadow != VM_OBJECT_NULL) {
3659 				shadow_object = cur_object->shadow;
3660 				vm_object_lock(shadow_object);
3661 				vm_object_unlock(cur_object);
3662 				cur_object = shadow_object;
3663 				shadow_object = VM_OBJECT_NULL;
3664 			}
3665 			if (cur_object->internal ||
3666 			    cur_object->pager == NULL) {
3667 				vm_object_unlock(cur_object);
3668 				RETURN(KERN_NO_SPACE);
3669 			}
3670 			if (cur_object->pager->mo_pager_ops
3671 			    != &fourk_pager_ops) {
3672 				vm_object_unlock(cur_object);
3673 				RETURN(KERN_NO_SPACE);
3674 			}
3675 			fourk_object = cur_object;
3676 			fourk_mem_obj = fourk_object->pager;
3677 
3678 			/* keep the "4K" object alive */
3679 			vm_object_reference_locked(fourk_object);
3680 			memory_object_reference(fourk_mem_obj);
3681 			vm_object_unlock(fourk_object);
3682 
3683 			/* merge permissions */
3684 			entry->protection |= cur_protection;
3685 			entry->max_protection |= max_protection;
3686 
3687 			if ((entry->protection & VM_PROT_WRITE) &&
3688 			    (entry->protection & VM_PROT_ALLEXEC) &&
3689 			    fourk_binary_compatibility_unsafe &&
3690 			    fourk_binary_compatibility_allow_wx) {
3691 				/* write+execute: need to be "jit" */
3692 				entry->used_for_jit = TRUE;
3693 			}
3694 			goto map_in_fourk_pager;
3695 		}
3696 
3697 		/*
3698 		 *	...	the next region doesn't overlap the
3699 		 *		end point.
3700 		 */
3701 
3702 		if ((entry->vme_next != vm_map_to_entry(map)) &&
3703 		    (entry->vme_next->vme_start < end)) {
3704 			RETURN(KERN_NO_SPACE);
3705 		}
3706 	}
3707 
3708 	/*
3709 	 *	At this point,
3710 	 *		"start" and "end" should define the endpoints of the
3711 	 *			available new range, and
3712 	 *		"entry" should refer to the region before the new
3713 	 *			range, and
3714 	 *
3715 	 *		the map should be locked.
3716 	 */
3717 
3718 	/* create a new "4K" pager */
3719 	fourk_mem_obj = fourk_pager_create();
3720 	fourk_object = fourk_pager_to_vm_object(fourk_mem_obj);
3721 	assert(fourk_object);
3722 
3723 	/* keep the "4" object alive */
3724 	vm_object_reference(fourk_object);
3725 
3726 	/* create a "copy" object, to map the "4K" object copy-on-write */
3727 	fourk_copy = TRUE;
3728 	result = vm_object_copy_strategically(fourk_object,
3729 	    0,
3730 	    end - start,
3731 	    &copy_object,
3732 	    &copy_offset,
3733 	    &fourk_copy);
3734 	assert(result == KERN_SUCCESS);
3735 	assert(copy_object != VM_OBJECT_NULL);
3736 	assert(copy_offset == 0);
3737 
3738 	/* map the "4K" pager's copy object */
3739 	new_entry = vm_map_entry_insert(map,
3740 	    entry,
3741 	    vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)),
3742 	    vm_map_round_page(end, VM_MAP_PAGE_MASK(map)),
3743 	    copy_object,
3744 	    0,                      /* offset */
3745 	    vmk_flags,
3746 	    FALSE,                  /* needs_copy */
3747 	    cur_protection, max_protection,
3748 	    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3749 	    VM_INHERIT_NONE : inheritance),
3750 	    clear_map_aligned);
3751 	entry = new_entry;
3752 
3753 #if VM_MAP_DEBUG_FOURK
3754 	if (vm_map_debug_fourk) {
3755 		printf("FOURK_PAGER: map %p [0x%llx:0x%llx] new pager %p\n",
3756 		    map,
3757 		    (uint64_t) entry->vme_start,
3758 		    (uint64_t) entry->vme_end,
3759 		    fourk_mem_obj);
3760 	}
3761 #endif /* VM_MAP_DEBUG_FOURK */
3762 
3763 	new_mapping_established = TRUE;
3764 
3765 map_in_fourk_pager:
3766 	/* "map" the original "object" where it belongs in the "4K" pager */
3767 	fourk_pager_offset = (fourk_start & SIXTEENK_PAGE_MASK);
3768 	fourk_pager_index_start = (int) (fourk_pager_offset / FOURK_PAGE_SIZE);
3769 	if (fourk_size > SIXTEENK_PAGE_SIZE) {
3770 		fourk_pager_index_num = 4;
3771 	} else {
3772 		fourk_pager_index_num = (int) (fourk_size / FOURK_PAGE_SIZE);
3773 	}
3774 	if (fourk_pager_index_start + fourk_pager_index_num > 4) {
3775 		fourk_pager_index_num = 4 - fourk_pager_index_start;
3776 	}
3777 	for (cur_idx = 0;
3778 	    cur_idx < fourk_pager_index_num;
3779 	    cur_idx++) {
3780 		vm_object_t             old_object;
3781 		vm_object_offset_t      old_offset;
3782 
3783 		kr = fourk_pager_populate(fourk_mem_obj,
3784 		    TRUE,                       /* overwrite */
3785 		    fourk_pager_index_start + cur_idx,
3786 		    object,
3787 		    (object
3788 		    ? (offset +
3789 		    (cur_idx * FOURK_PAGE_SIZE))
3790 		    : 0),
3791 		    &old_object,
3792 		    &old_offset);
3793 #if VM_MAP_DEBUG_FOURK
3794 		if (vm_map_debug_fourk) {
3795 			if (old_object == (vm_object_t) -1 &&
3796 			    old_offset == (vm_object_offset_t) -1) {
3797 				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3798 				    "pager [%p:0x%llx] "
3799 				    "populate[%d] "
3800 				    "[object:%p,offset:0x%llx]\n",
3801 				    map,
3802 				    (uint64_t) entry->vme_start,
3803 				    (uint64_t) entry->vme_end,
3804 				    fourk_mem_obj,
3805 				    VME_OFFSET(entry),
3806 				    fourk_pager_index_start + cur_idx,
3807 				    object,
3808 				    (object
3809 				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3810 				    : 0));
3811 			} else {
3812 				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3813 				    "pager [%p:0x%llx] "
3814 				    "populate[%d] [object:%p,offset:0x%llx] "
3815 				    "old [%p:0x%llx]\n",
3816 				    map,
3817 				    (uint64_t) entry->vme_start,
3818 				    (uint64_t) entry->vme_end,
3819 				    fourk_mem_obj,
3820 				    VME_OFFSET(entry),
3821 				    fourk_pager_index_start + cur_idx,
3822 				    object,
3823 				    (object
3824 				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3825 				    : 0),
3826 				    old_object,
3827 				    old_offset);
3828 			}
3829 		}
3830 #endif /* VM_MAP_DEBUG_FOURK */
3831 
3832 		assert(kr == KERN_SUCCESS);
3833 		if (object != old_object &&
3834 		    object != VM_OBJECT_NULL &&
3835 		    object != (vm_object_t) -1) {
3836 			vm_object_reference(object);
3837 		}
3838 		if (object != old_object &&
3839 		    old_object != VM_OBJECT_NULL &&
3840 		    old_object != (vm_object_t) -1) {
3841 			vm_object_deallocate(old_object);
3842 		}
3843 	}
3844 
3845 BailOut:
3846 	assert(map_locked == TRUE);
3847 
3848 	if (result == KERN_SUCCESS) {
3849 		vm_prot_t pager_prot;
3850 		memory_object_t pager;
3851 
3852 #if DEBUG
3853 		if (pmap_empty &&
3854 		    !(vmk_flags.vmkf_no_pmap_check)) {
3855 			assert(pmap_is_empty(map->pmap,
3856 			    *address,
3857 			    *address + size));
3858 		}
3859 #endif /* DEBUG */
3860 
3861 		/*
3862 		 * For "named" VM objects, let the pager know that the
3863 		 * memory object is being mapped.  Some pagers need to keep
3864 		 * track of this, to know when they can reclaim the memory
3865 		 * object, for example.
3866 		 * VM calls memory_object_map() for each mapping (specifying
3867 		 * the protection of each mapping) and calls
3868 		 * memory_object_last_unmap() when all the mappings are gone.
3869 		 */
3870 		pager_prot = max_protection;
3871 		if (needs_copy) {
3872 			/*
3873 			 * Copy-On-Write mapping: won't modify
3874 			 * the memory object.
3875 			 */
3876 			pager_prot &= ~VM_PROT_WRITE;
3877 		}
3878 		if (!is_submap &&
3879 		    object != VM_OBJECT_NULL &&
3880 		    object->named &&
3881 		    object->pager != MEMORY_OBJECT_NULL) {
3882 			vm_object_lock(object);
3883 			pager = object->pager;
3884 			if (object->named &&
3885 			    pager != MEMORY_OBJECT_NULL) {
3886 				assert(object->pager_ready);
3887 				vm_object_mapping_wait(object, THREAD_UNINT);
3888 				vm_object_mapping_begin(object);
3889 				vm_object_unlock(object);
3890 
3891 				kr = memory_object_map(pager, pager_prot);
3892 				assert(kr == KERN_SUCCESS);
3893 
3894 				vm_object_lock(object);
3895 				vm_object_mapping_end(object);
3896 			}
3897 			vm_object_unlock(object);
3898 		}
3899 		if (!is_submap &&
3900 		    fourk_object != VM_OBJECT_NULL &&
3901 		    fourk_object->named &&
3902 		    fourk_object->pager != MEMORY_OBJECT_NULL) {
3903 			vm_object_lock(fourk_object);
3904 			pager = fourk_object->pager;
3905 			if (fourk_object->named &&
3906 			    pager != MEMORY_OBJECT_NULL) {
3907 				assert(fourk_object->pager_ready);
3908 				vm_object_mapping_wait(fourk_object,
3909 				    THREAD_UNINT);
3910 				vm_object_mapping_begin(fourk_object);
3911 				vm_object_unlock(fourk_object);
3912 
3913 				kr = memory_object_map(pager, VM_PROT_READ);
3914 				assert(kr == KERN_SUCCESS);
3915 
3916 				vm_object_lock(fourk_object);
3917 				vm_object_mapping_end(fourk_object);
3918 			}
3919 			vm_object_unlock(fourk_object);
3920 		}
3921 	}
3922 
3923 	if (fourk_object != VM_OBJECT_NULL) {
3924 		vm_object_deallocate(fourk_object);
3925 		fourk_object = VM_OBJECT_NULL;
3926 		memory_object_deallocate(fourk_mem_obj);
3927 		fourk_mem_obj = MEMORY_OBJECT_NULL;
3928 	}
3929 
3930 	assert(map_locked == TRUE);
3931 
3932 	if (!keep_map_locked) {
3933 		vm_map_unlock(map);
3934 		map_locked = FALSE;
3935 	}
3936 
3937 	/*
3938 	 * We can't hold the map lock if we enter this block.
3939 	 */
3940 
3941 	if (result == KERN_SUCCESS) {
3942 		/*	Wire down the new entry if the user
3943 		 *	requested all new map entries be wired.
3944 		 */
3945 		if ((map->wiring_required) || (superpage_size)) {
3946 			assert(!keep_map_locked);
3947 			pmap_empty = FALSE; /* pmap won't be empty */
3948 			kr = vm_map_wire_kernel(map, start, end,
3949 			    new_entry->protection, VM_KERN_MEMORY_MLOCK,
3950 			    TRUE);
3951 			result = kr;
3952 		}
3953 
3954 	}
3955 
3956 	if (result != KERN_SUCCESS) {
3957 		if (new_mapping_established) {
3958 			/*
3959 			 * We have to get rid of the new mappings since we
3960 			 * won't make them available to the user.
3961 			 * Try and do that atomically, to minimize the risk
3962 			 * that someone else create new mappings that range.
3963 			 */
3964 
3965 			if (!map_locked) {
3966 				vm_map_lock(map);
3967 				map_locked = TRUE;
3968 			}
3969 			(void)vm_map_delete(map, *address, *address + size,
3970 			    VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_NO_YIELD,
3971 			    KMEM_GUARD_NONE, &zap_list);
3972 		}
3973 	}
3974 
3975 	/*
3976 	 * The caller is responsible for releasing the lock if it requested to
3977 	 * keep the map locked.
3978 	 */
3979 	if (map_locked && !keep_map_locked) {
3980 		vm_map_unlock(map);
3981 	}
3982 
3983 	vm_map_zap_dispose(&zap_list);
3984 
3985 	return result;
3986 
3987 #undef  RETURN
3988 }
3989 #endif /* __arm64__ */
3990 
3991 /*
3992  * Counters for the prefault optimization.
3993  */
3994 int64_t vm_prefault_nb_pages = 0;
3995 int64_t vm_prefault_nb_bailout = 0;
3996 
3997 static kern_return_t
vm_map_enter_mem_object_helper(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_t offset,boolean_t copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance,upl_page_list_ptr_t page_list,unsigned int page_list_count)3998 vm_map_enter_mem_object_helper(
3999 	vm_map_t                target_map,
4000 	vm_map_offset_t         *address,
4001 	vm_map_size_t           initial_size,
4002 	vm_map_offset_t         mask,
4003 	vm_map_kernel_flags_t   vmk_flags,
4004 	ipc_port_t              port,
4005 	vm_object_offset_t      offset,
4006 	boolean_t               copy,
4007 	vm_prot_t               cur_protection,
4008 	vm_prot_t               max_protection,
4009 	vm_inherit_t            inheritance,
4010 	upl_page_list_ptr_t     page_list,
4011 	unsigned int            page_list_count)
4012 {
4013 	vm_map_address_t        map_addr;
4014 	vm_map_size_t           map_size;
4015 	vm_object_t             object;
4016 	vm_object_size_t        size;
4017 	kern_return_t           result;
4018 	boolean_t               mask_cur_protection, mask_max_protection;
4019 	boolean_t               kernel_prefault, try_prefault = (page_list_count != 0);
4020 	vm_map_offset_t         offset_in_mapping = 0;
4021 #if __arm64__
4022 	boolean_t               fourk = vmk_flags.vmkf_fourk;
4023 #endif /* __arm64__ */
4024 
4025 	if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4026 		/* XXX TODO4K prefaulting depends on page size... */
4027 		try_prefault = FALSE;
4028 	}
4029 
4030 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
4031 	vm_map_kernel_flags_update_range_id(&vmk_flags, target_map);
4032 
4033 	mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4034 	mask_max_protection = max_protection & VM_PROT_IS_MASK;
4035 	cur_protection &= ~VM_PROT_IS_MASK;
4036 	max_protection &= ~VM_PROT_IS_MASK;
4037 
4038 	/*
4039 	 * Check arguments for validity
4040 	 */
4041 	if ((target_map == VM_MAP_NULL) ||
4042 	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4043 	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4044 	    (inheritance > VM_INHERIT_LAST_VALID) ||
4045 	    (try_prefault && (copy || !page_list)) ||
4046 	    initial_size == 0) {
4047 		return KERN_INVALID_ARGUMENT;
4048 	}
4049 
4050 #if __arm64__
4051 	if (cur_protection & VM_PROT_EXECUTE) {
4052 		cur_protection |= VM_PROT_READ;
4053 	}
4054 
4055 	if (fourk && VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4056 		/* no "fourk" if map is using a sub-page page size */
4057 		fourk = FALSE;
4058 	}
4059 	if (fourk) {
4060 		map_addr = vm_map_trunc_page(*address, FOURK_PAGE_MASK);
4061 		map_size = vm_map_round_page(initial_size, FOURK_PAGE_MASK);
4062 	} else
4063 #endif /* __arm64__ */
4064 	{
4065 		map_addr = vm_map_trunc_page(*address,
4066 		    VM_MAP_PAGE_MASK(target_map));
4067 		map_size = vm_map_round_page(initial_size,
4068 		    VM_MAP_PAGE_MASK(target_map));
4069 	}
4070 	if (map_size == 0) {
4071 		return KERN_INVALID_ARGUMENT;
4072 	}
4073 	size = vm_object_round_page(initial_size);
4074 
4075 	/*
4076 	 * Find the vm object (if any) corresponding to this port.
4077 	 */
4078 	if (!IP_VALID(port)) {
4079 		object = VM_OBJECT_NULL;
4080 		offset = 0;
4081 		copy = FALSE;
4082 	} else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4083 		vm_named_entry_t        named_entry;
4084 		vm_object_offset_t      data_offset;
4085 
4086 		named_entry = mach_memory_entry_from_port(port);
4087 
4088 		if (vmk_flags.vmf_return_data_addr ||
4089 		    vmk_flags.vmf_return_4k_data_addr) {
4090 			data_offset = named_entry->data_offset;
4091 			offset += named_entry->data_offset;
4092 		} else {
4093 			data_offset = 0;
4094 		}
4095 
4096 		/* a few checks to make sure user is obeying rules */
4097 		if (mask_max_protection) {
4098 			max_protection &= named_entry->protection;
4099 		}
4100 		if (mask_cur_protection) {
4101 			cur_protection &= named_entry->protection;
4102 		}
4103 		if ((named_entry->protection & max_protection) !=
4104 		    max_protection) {
4105 			return KERN_INVALID_RIGHT;
4106 		}
4107 		if ((named_entry->protection & cur_protection) !=
4108 		    cur_protection) {
4109 			return KERN_INVALID_RIGHT;
4110 		}
4111 		if (offset + size <= offset) {
4112 			/* overflow */
4113 			return KERN_INVALID_ARGUMENT;
4114 		}
4115 		if (named_entry->size < (offset + initial_size)) {
4116 			return KERN_INVALID_ARGUMENT;
4117 		}
4118 
4119 		if (named_entry->is_copy) {
4120 			/* for a vm_map_copy, we can only map it whole */
4121 			if ((size != named_entry->size) &&
4122 			    (vm_map_round_page(size,
4123 			    VM_MAP_PAGE_MASK(target_map)) ==
4124 			    named_entry->size)) {
4125 				/* XXX FBDP use the rounded size... */
4126 				size = vm_map_round_page(
4127 					size,
4128 					VM_MAP_PAGE_MASK(target_map));
4129 			}
4130 		}
4131 
4132 		/* the callers parameter offset is defined to be the */
4133 		/* offset from beginning of named entry offset in object */
4134 		offset = offset + named_entry->offset;
4135 
4136 		if (!VM_MAP_PAGE_ALIGNED(size,
4137 		    VM_MAP_PAGE_MASK(target_map))) {
4138 			/*
4139 			 * Let's not map more than requested;
4140 			 * vm_map_enter() will handle this "not map-aligned"
4141 			 * case.
4142 			 */
4143 			map_size = size;
4144 		}
4145 
4146 		named_entry_lock(named_entry);
4147 		if (named_entry->is_sub_map) {
4148 			vm_map_t                submap;
4149 
4150 			if (vmk_flags.vmf_return_data_addr ||
4151 			    vmk_flags.vmf_return_4k_data_addr) {
4152 				panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4153 			}
4154 
4155 			submap = named_entry->backing.map;
4156 			vm_map_reference(submap);
4157 			named_entry_unlock(named_entry);
4158 
4159 			vmk_flags.vmkf_submap = TRUE;
4160 
4161 			result = vm_map_enter(target_map,
4162 			    &map_addr,
4163 			    map_size,
4164 			    mask,
4165 			    vmk_flags,
4166 			    (vm_object_t)(uintptr_t) submap,
4167 			    offset,
4168 			    copy,
4169 			    cur_protection,
4170 			    max_protection,
4171 			    inheritance);
4172 			if (result != KERN_SUCCESS) {
4173 				vm_map_deallocate(submap);
4174 			} else {
4175 				/*
4176 				 * No need to lock "submap" just to check its
4177 				 * "mapped" flag: that flag is never reset
4178 				 * once it's been set and if we race, we'll
4179 				 * just end up setting it twice, which is OK.
4180 				 */
4181 				if (submap->mapped_in_other_pmaps == FALSE &&
4182 				    vm_map_pmap(submap) != PMAP_NULL &&
4183 				    vm_map_pmap(submap) !=
4184 				    vm_map_pmap(target_map)) {
4185 					/*
4186 					 * This submap is being mapped in a map
4187 					 * that uses a different pmap.
4188 					 * Set its "mapped_in_other_pmaps" flag
4189 					 * to indicate that we now need to
4190 					 * remove mappings from all pmaps rather
4191 					 * than just the submap's pmap.
4192 					 */
4193 					vm_map_lock(submap);
4194 					submap->mapped_in_other_pmaps = TRUE;
4195 					vm_map_unlock(submap);
4196 				}
4197 				*address = map_addr;
4198 			}
4199 			return result;
4200 		} else if (named_entry->is_copy) {
4201 			kern_return_t   kr;
4202 			vm_map_copy_t   copy_map;
4203 			vm_map_entry_t  copy_entry;
4204 			vm_map_offset_t copy_addr;
4205 			vm_map_copy_t   target_copy_map;
4206 			vm_map_offset_t overmap_start, overmap_end;
4207 			vm_map_offset_t trimmed_start;
4208 			vm_map_size_t   target_size;
4209 
4210 			if (!vm_map_kernel_flags_check_vmflags(vmk_flags,
4211 			    (VM_FLAGS_FIXED |
4212 			    VM_FLAGS_ANYWHERE |
4213 			    VM_FLAGS_OVERWRITE |
4214 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4215 			    VM_FLAGS_RETURN_DATA_ADDR))) {
4216 				named_entry_unlock(named_entry);
4217 				return KERN_INVALID_ARGUMENT;
4218 			}
4219 
4220 			copy_map = named_entry->backing.copy;
4221 			assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4222 			if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4223 				/* unsupported type; should not happen */
4224 				printf("vm_map_enter_mem_object: "
4225 				    "memory_entry->backing.copy "
4226 				    "unsupported type 0x%x\n",
4227 				    copy_map->type);
4228 				named_entry_unlock(named_entry);
4229 				return KERN_INVALID_ARGUMENT;
4230 			}
4231 
4232 			if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4233 				DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, offset, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4234 			}
4235 
4236 			if (vmk_flags.vmf_return_data_addr ||
4237 			    vmk_flags.vmf_return_4k_data_addr) {
4238 				offset_in_mapping = offset & VM_MAP_PAGE_MASK(target_map);
4239 				if (vmk_flags.vmf_return_4k_data_addr) {
4240 					offset_in_mapping &= ~((signed)(0xFFF));
4241 				}
4242 			}
4243 
4244 			target_copy_map = VM_MAP_COPY_NULL;
4245 			target_size = copy_map->size;
4246 			overmap_start = 0;
4247 			overmap_end = 0;
4248 			trimmed_start = 0;
4249 			if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4250 				DEBUG4K_ADJUST("adjusting...\n");
4251 				kr = vm_map_copy_adjust_to_target(
4252 					copy_map,
4253 					offset /* includes data_offset */,
4254 					initial_size,
4255 					target_map,
4256 					copy,
4257 					&target_copy_map,
4258 					&overmap_start,
4259 					&overmap_end,
4260 					&trimmed_start);
4261 				if (kr != KERN_SUCCESS) {
4262 					named_entry_unlock(named_entry);
4263 					return kr;
4264 				}
4265 				target_size = target_copy_map->size;
4266 				if (trimmed_start >= data_offset) {
4267 					data_offset = offset & VM_MAP_PAGE_MASK(target_map);
4268 				} else {
4269 					data_offset -= trimmed_start;
4270 				}
4271 			} else {
4272 				/*
4273 				 * Assert that the vm_map_copy is coming from the right
4274 				 * zone and hasn't been forged
4275 				 */
4276 				vm_map_copy_require(copy_map);
4277 				target_copy_map = copy_map;
4278 			}
4279 
4280 			vm_map_kernel_flags_t rsv_flags = vmk_flags;
4281 
4282 			vm_map_kernel_flags_and_vmflags(&rsv_flags,
4283 			    (VM_FLAGS_FIXED |
4284 			    VM_FLAGS_ANYWHERE |
4285 			    VM_FLAGS_OVERWRITE |
4286 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4287 			    VM_FLAGS_RETURN_DATA_ADDR));
4288 
4289 			/* reserve a contiguous range */
4290 			kr = vm_map_enter(target_map,
4291 			    &map_addr,
4292 			    vm_map_round_page(target_size, VM_MAP_PAGE_MASK(target_map)),
4293 			    mask,
4294 			    rsv_flags,
4295 			    VM_OBJECT_NULL,
4296 			    0,
4297 			    FALSE,               /* copy */
4298 			    cur_protection,
4299 			    max_protection,
4300 			    inheritance);
4301 			if (kr != KERN_SUCCESS) {
4302 				DEBUG4K_ERROR("kr 0x%x\n", kr);
4303 				if (target_copy_map != copy_map) {
4304 					vm_map_copy_discard(target_copy_map);
4305 					target_copy_map = VM_MAP_COPY_NULL;
4306 				}
4307 				named_entry_unlock(named_entry);
4308 				return kr;
4309 			}
4310 
4311 			copy_addr = map_addr;
4312 
4313 			for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4314 			    copy_entry != vm_map_copy_to_entry(target_copy_map);
4315 			    copy_entry = copy_entry->vme_next) {
4316 				vm_map_t                copy_submap = VM_MAP_NULL;
4317 				vm_object_t             copy_object = VM_OBJECT_NULL;
4318 				vm_map_size_t           copy_size;
4319 				vm_object_offset_t      copy_offset;
4320 				boolean_t               do_copy = false;
4321 
4322 				if (copy_entry->is_sub_map) {
4323 					copy_submap = VME_SUBMAP(copy_entry);
4324 					copy_object = (vm_object_t)copy_submap;
4325 				} else {
4326 					copy_object = VME_OBJECT(copy_entry);
4327 				}
4328 				copy_offset = VME_OFFSET(copy_entry);
4329 				copy_size = (copy_entry->vme_end -
4330 				    copy_entry->vme_start);
4331 
4332 				/* sanity check */
4333 				if ((copy_addr + copy_size) >
4334 				    (map_addr +
4335 				    overmap_start + overmap_end +
4336 				    named_entry->size /* XXX full size */)) {
4337 					/* over-mapping too much !? */
4338 					kr = KERN_INVALID_ARGUMENT;
4339 					DEBUG4K_ERROR("kr 0x%x\n", kr);
4340 					/* abort */
4341 					break;
4342 				}
4343 
4344 				/* take a reference on the object */
4345 				if (copy_entry->is_sub_map) {
4346 					vm_map_reference(copy_submap);
4347 				} else {
4348 					if (!copy &&
4349 					    copy_object != VM_OBJECT_NULL &&
4350 					    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4351 						/*
4352 						 * We need to resolve our side of this
4353 						 * "symmetric" copy-on-write now; we
4354 						 * need a new object to map and share,
4355 						 * instead of the current one which
4356 						 * might still be shared with the
4357 						 * original mapping.
4358 						 *
4359 						 * Note: A "vm_map_copy_t" does not
4360 						 * have a lock but we're protected by
4361 						 * the named entry's lock here.
4362 						 */
4363 						// assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4364 						VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4365 						assert(copy_object != VME_OBJECT(copy_entry));
4366 						if (!copy_entry->needs_copy &&
4367 						    copy_entry->protection & VM_PROT_WRITE) {
4368 							vm_prot_t prot;
4369 
4370 							prot = copy_entry->protection & ~VM_PROT_WRITE;
4371 							vm_object_pmap_protect(copy_object,
4372 							    copy_offset,
4373 							    copy_size,
4374 							    PMAP_NULL,
4375 							    PAGE_SIZE,
4376 							    0,
4377 							    prot);
4378 						}
4379 						copy_entry->needs_copy = FALSE;
4380 						copy_entry->is_shared = TRUE;
4381 						copy_object = VME_OBJECT(copy_entry);
4382 						copy_offset = VME_OFFSET(copy_entry);
4383 						vm_object_lock(copy_object);
4384 						/* we're about to make a shared mapping of this object */
4385 						copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4386 						copy_object->true_share = TRUE;
4387 						vm_object_unlock(copy_object);
4388 					}
4389 
4390 					if (copy_object != VM_OBJECT_NULL &&
4391 					    copy_object->named &&
4392 					    copy_object->pager != MEMORY_OBJECT_NULL &&
4393 					    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4394 						memory_object_t pager;
4395 						vm_prot_t       pager_prot;
4396 
4397 						/*
4398 						 * For "named" VM objects, let the pager know that the
4399 						 * memory object is being mapped.  Some pagers need to keep
4400 						 * track of this, to know when they can reclaim the memory
4401 						 * object, for example.
4402 						 * VM calls memory_object_map() for each mapping (specifying
4403 						 * the protection of each mapping) and calls
4404 						 * memory_object_last_unmap() when all the mappings are gone.
4405 						 */
4406 						pager_prot = max_protection;
4407 						if (copy) {
4408 							/*
4409 							 * Copy-On-Write mapping: won't modify the
4410 							 * memory object.
4411 							 */
4412 							pager_prot &= ~VM_PROT_WRITE;
4413 						}
4414 						vm_object_lock(copy_object);
4415 						pager = copy_object->pager;
4416 						if (copy_object->named &&
4417 						    pager != MEMORY_OBJECT_NULL &&
4418 						    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4419 							assert(copy_object->pager_ready);
4420 							vm_object_mapping_wait(copy_object, THREAD_UNINT);
4421 							vm_object_mapping_begin(copy_object);
4422 							vm_object_unlock(copy_object);
4423 
4424 							kr = memory_object_map(pager, pager_prot);
4425 							assert(kr == KERN_SUCCESS);
4426 
4427 							vm_object_lock(copy_object);
4428 							vm_object_mapping_end(copy_object);
4429 						}
4430 						vm_object_unlock(copy_object);
4431 					}
4432 
4433 					/*
4434 					 *	Perform the copy if requested
4435 					 */
4436 
4437 					if (copy && copy_object != VM_OBJECT_NULL) {
4438 						vm_object_t             new_object;
4439 						vm_object_offset_t      new_offset;
4440 
4441 						result = vm_object_copy_strategically(copy_object, copy_offset,
4442 						    copy_size,
4443 						    &new_object, &new_offset,
4444 						    &do_copy);
4445 
4446 
4447 						if (result == KERN_MEMORY_RESTART_COPY) {
4448 							boolean_t success;
4449 							boolean_t src_needs_copy;
4450 
4451 							/*
4452 							 * XXX
4453 							 * We currently ignore src_needs_copy.
4454 							 * This really is the issue of how to make
4455 							 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4456 							 * non-kernel users to use. Solution forthcoming.
4457 							 * In the meantime, since we don't allow non-kernel
4458 							 * memory managers to specify symmetric copy,
4459 							 * we won't run into problems here.
4460 							 */
4461 							new_object = copy_object;
4462 							new_offset = copy_offset;
4463 							success = vm_object_copy_quickly(new_object,
4464 							    new_offset,
4465 							    copy_size,
4466 							    &src_needs_copy,
4467 							    &do_copy);
4468 							assert(success);
4469 							result = KERN_SUCCESS;
4470 						}
4471 						if (result != KERN_SUCCESS) {
4472 							kr = result;
4473 							break;
4474 						}
4475 
4476 						copy_object = new_object;
4477 						copy_offset = new_offset;
4478 						/*
4479 						 * No extra object reference for the mapping:
4480 						 * the mapping should be the only thing keeping
4481 						 * this new object alive.
4482 						 */
4483 					} else {
4484 						/*
4485 						 * We already have the right object
4486 						 * to map.
4487 						 */
4488 						copy_object = VME_OBJECT(copy_entry);
4489 						/* take an extra ref for the mapping below */
4490 						vm_object_reference(copy_object);
4491 					}
4492 				}
4493 
4494 				/*
4495 				 * If the caller does not want a specific
4496 				 * tag for this new mapping:  use
4497 				 * the tag of the original mapping.
4498 				 */
4499 				vm_map_kernel_flags_t vmk_remap_flags = {
4500 					.vmkf_submap = copy_entry->is_sub_map,
4501 				};
4502 
4503 				vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
4504 				    vm_map_kernel_flags_vmflags(vmk_flags),
4505 				    vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4506 
4507 				/* over-map the object into destination */
4508 				vmk_remap_flags.vmf_fixed = true;
4509 				vmk_remap_flags.vmf_overwrite = true;
4510 
4511 				if (!copy && !copy_entry->is_sub_map) {
4512 					/*
4513 					 * copy-on-write should have been
4514 					 * resolved at this point, or we would
4515 					 * end up sharing instead of copying.
4516 					 */
4517 					assert(!copy_entry->needs_copy);
4518 				}
4519 #if XNU_TARGET_OS_OSX
4520 				if (copy_entry->used_for_jit) {
4521 					vmk_remap_flags.vmkf_map_jit = TRUE;
4522 				}
4523 #endif /* XNU_TARGET_OS_OSX */
4524 
4525 				kr = vm_map_enter(target_map,
4526 				    &copy_addr,
4527 				    copy_size,
4528 				    (vm_map_offset_t) 0,
4529 				    vmk_remap_flags,
4530 				    copy_object,
4531 				    copy_offset,
4532 				    ((copy_object == NULL)
4533 				    ? FALSE
4534 				    : (copy || copy_entry->needs_copy)),
4535 				    cur_protection,
4536 				    max_protection,
4537 				    inheritance);
4538 				if (kr != KERN_SUCCESS) {
4539 					DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4540 					if (copy_entry->is_sub_map) {
4541 						vm_map_deallocate(copy_submap);
4542 					} else {
4543 						vm_object_deallocate(copy_object);
4544 					}
4545 					/* abort */
4546 					break;
4547 				}
4548 
4549 				/* next mapping */
4550 				copy_addr += copy_size;
4551 			}
4552 
4553 			if (kr == KERN_SUCCESS) {
4554 				if (vmk_flags.vmf_return_data_addr ||
4555 				    vmk_flags.vmf_return_4k_data_addr) {
4556 					*address = map_addr + offset_in_mapping;
4557 				} else {
4558 					*address = map_addr;
4559 				}
4560 				if (overmap_start) {
4561 					*address += overmap_start;
4562 					DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t) offset_in_mapping, (uint64_t)overmap_start, (uint64_t)*address);
4563 				}
4564 			}
4565 			named_entry_unlock(named_entry);
4566 			if (target_copy_map != copy_map) {
4567 				vm_map_copy_discard(target_copy_map);
4568 				target_copy_map = VM_MAP_COPY_NULL;
4569 			}
4570 
4571 			if (kr != KERN_SUCCESS && !vmk_flags.vmf_overwrite) {
4572 				/* deallocate the contiguous range */
4573 				(void) vm_deallocate(target_map,
4574 				    map_addr,
4575 				    map_size);
4576 			}
4577 
4578 			return kr;
4579 		}
4580 
4581 		if (named_entry->is_object) {
4582 			unsigned int    access;
4583 			unsigned int    wimg_mode;
4584 
4585 			/* we are mapping a VM object */
4586 
4587 			access = named_entry->access;
4588 
4589 			if (vmk_flags.vmf_return_data_addr ||
4590 			    vmk_flags.vmf_return_4k_data_addr) {
4591 				offset_in_mapping = offset - VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4592 				if (vmk_flags.vmf_return_4k_data_addr) {
4593 					offset_in_mapping &= ~((signed)(0xFFF));
4594 				}
4595 				offset = VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4596 				map_size = VM_MAP_ROUND_PAGE((offset + offset_in_mapping + initial_size) - offset, VM_MAP_PAGE_MASK(target_map));
4597 			}
4598 
4599 			object = vm_named_entry_to_vm_object(named_entry);
4600 			assert(object != VM_OBJECT_NULL);
4601 			vm_object_lock(object);
4602 			named_entry_unlock(named_entry);
4603 
4604 			vm_object_reference_locked(object);
4605 
4606 			wimg_mode = object->wimg_bits;
4607 			vm_prot_to_wimg(access, &wimg_mode);
4608 			if (object->wimg_bits != wimg_mode) {
4609 				vm_object_change_wimg_mode(object, wimg_mode);
4610 			}
4611 
4612 			vm_object_unlock(object);
4613 		} else {
4614 			panic("invalid VM named entry %p", named_entry);
4615 		}
4616 	} else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4617 		/*
4618 		 * JMM - This is temporary until we unify named entries
4619 		 * and raw memory objects.
4620 		 *
4621 		 * Detected fake ip_kotype for a memory object.  In
4622 		 * this case, the port isn't really a port at all, but
4623 		 * instead is just a raw memory object.
4624 		 */
4625 		if (vmk_flags.vmf_return_data_addr ||
4626 		    vmk_flags.vmf_return_4k_data_addr) {
4627 			panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4628 		}
4629 
4630 		object = memory_object_to_vm_object((memory_object_t)port);
4631 		if (object == VM_OBJECT_NULL) {
4632 			return KERN_INVALID_OBJECT;
4633 		}
4634 		vm_object_reference(object);
4635 
4636 		/* wait for object (if any) to be ready */
4637 		if (object != VM_OBJECT_NULL) {
4638 			if (object == kernel_object) {
4639 				printf("Warning: Attempt to map kernel object"
4640 				    " by a non-private kernel entity\n");
4641 				return KERN_INVALID_OBJECT;
4642 			}
4643 			if (!object->pager_ready) {
4644 				vm_object_lock(object);
4645 
4646 				while (!object->pager_ready) {
4647 					vm_object_wait(object,
4648 					    VM_OBJECT_EVENT_PAGER_READY,
4649 					    THREAD_UNINT);
4650 					vm_object_lock(object);
4651 				}
4652 				vm_object_unlock(object);
4653 			}
4654 		}
4655 	} else {
4656 		return KERN_INVALID_OBJECT;
4657 	}
4658 
4659 	if (object != VM_OBJECT_NULL &&
4660 	    object->named &&
4661 	    object->pager != MEMORY_OBJECT_NULL &&
4662 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4663 		memory_object_t pager;
4664 		vm_prot_t       pager_prot;
4665 		kern_return_t   kr;
4666 
4667 		/*
4668 		 * For "named" VM objects, let the pager know that the
4669 		 * memory object is being mapped.  Some pagers need to keep
4670 		 * track of this, to know when they can reclaim the memory
4671 		 * object, for example.
4672 		 * VM calls memory_object_map() for each mapping (specifying
4673 		 * the protection of each mapping) and calls
4674 		 * memory_object_last_unmap() when all the mappings are gone.
4675 		 */
4676 		pager_prot = max_protection;
4677 		if (copy) {
4678 			/*
4679 			 * Copy-On-Write mapping: won't modify the
4680 			 * memory object.
4681 			 */
4682 			pager_prot &= ~VM_PROT_WRITE;
4683 		}
4684 		vm_object_lock(object);
4685 		pager = object->pager;
4686 		if (object->named &&
4687 		    pager != MEMORY_OBJECT_NULL &&
4688 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4689 			assert(object->pager_ready);
4690 			vm_object_mapping_wait(object, THREAD_UNINT);
4691 			vm_object_mapping_begin(object);
4692 			vm_object_unlock(object);
4693 
4694 			kr = memory_object_map(pager, pager_prot);
4695 			assert(kr == KERN_SUCCESS);
4696 
4697 			vm_object_lock(object);
4698 			vm_object_mapping_end(object);
4699 		}
4700 		vm_object_unlock(object);
4701 	}
4702 
4703 	/*
4704 	 *	Perform the copy if requested
4705 	 */
4706 
4707 	if (copy) {
4708 		vm_object_t             new_object;
4709 		vm_object_offset_t      new_offset;
4710 
4711 		result = vm_object_copy_strategically(object, offset,
4712 		    map_size,
4713 		    &new_object, &new_offset,
4714 		    &copy);
4715 
4716 
4717 		if (result == KERN_MEMORY_RESTART_COPY) {
4718 			boolean_t success;
4719 			boolean_t src_needs_copy;
4720 
4721 			/*
4722 			 * XXX
4723 			 * We currently ignore src_needs_copy.
4724 			 * This really is the issue of how to make
4725 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4726 			 * non-kernel users to use. Solution forthcoming.
4727 			 * In the meantime, since we don't allow non-kernel
4728 			 * memory managers to specify symmetric copy,
4729 			 * we won't run into problems here.
4730 			 */
4731 			new_object = object;
4732 			new_offset = offset;
4733 			success = vm_object_copy_quickly(new_object,
4734 			    new_offset,
4735 			    map_size,
4736 			    &src_needs_copy,
4737 			    &copy);
4738 			assert(success);
4739 			result = KERN_SUCCESS;
4740 		}
4741 		/*
4742 		 *	Throw away the reference to the
4743 		 *	original object, as it won't be mapped.
4744 		 */
4745 
4746 		vm_object_deallocate(object);
4747 
4748 		if (result != KERN_SUCCESS) {
4749 			return result;
4750 		}
4751 
4752 		object = new_object;
4753 		offset = new_offset;
4754 	}
4755 
4756 	/*
4757 	 * If non-kernel users want to try to prefault pages, the mapping and prefault
4758 	 * needs to be atomic.
4759 	 */
4760 	kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4761 	vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4762 
4763 #if __arm64__
4764 	if (fourk) {
4765 		/* map this object in a "4K" pager */
4766 		result = vm_map_enter_fourk(target_map,
4767 		    &map_addr,
4768 		    map_size,
4769 		    (vm_map_offset_t) mask,
4770 		    vmk_flags,
4771 		    object,
4772 		    offset,
4773 		    copy,
4774 		    cur_protection,
4775 		    max_protection,
4776 		    inheritance);
4777 	} else
4778 #endif /* __arm64__ */
4779 	{
4780 		result = vm_map_enter(target_map,
4781 		    &map_addr, map_size,
4782 		    (vm_map_offset_t)mask,
4783 		    vmk_flags,
4784 		    object, offset,
4785 		    copy,
4786 		    cur_protection, max_protection,
4787 		    inheritance);
4788 	}
4789 	if (result != KERN_SUCCESS) {
4790 		vm_object_deallocate(object);
4791 	}
4792 
4793 	/*
4794 	 * Try to prefault, and do not forget to release the vm map lock.
4795 	 */
4796 	if (result == KERN_SUCCESS && try_prefault) {
4797 		mach_vm_address_t va = map_addr;
4798 		kern_return_t kr = KERN_SUCCESS;
4799 		unsigned int i = 0;
4800 		int pmap_options;
4801 
4802 		pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4803 		if (object->internal) {
4804 			pmap_options |= PMAP_OPTIONS_INTERNAL;
4805 		}
4806 
4807 		for (i = 0; i < page_list_count; ++i) {
4808 			if (!UPL_VALID_PAGE(page_list, i)) {
4809 				if (kernel_prefault) {
4810 					assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4811 					result = KERN_MEMORY_ERROR;
4812 					break;
4813 				}
4814 			} else {
4815 				/*
4816 				 * If this function call failed, we should stop
4817 				 * trying to optimize, other calls are likely
4818 				 * going to fail too.
4819 				 *
4820 				 * We are not gonna report an error for such
4821 				 * failure though. That's an optimization, not
4822 				 * something critical.
4823 				 */
4824 				kr = pmap_enter_options(target_map->pmap,
4825 				    va, UPL_PHYS_PAGE(page_list, i),
4826 				    cur_protection, VM_PROT_NONE,
4827 				    0, TRUE, pmap_options, NULL);
4828 				if (kr != KERN_SUCCESS) {
4829 					OSIncrementAtomic64(&vm_prefault_nb_bailout);
4830 					if (kernel_prefault) {
4831 						result = kr;
4832 					}
4833 					break;
4834 				}
4835 				OSIncrementAtomic64(&vm_prefault_nb_pages);
4836 			}
4837 
4838 			/* Next virtual address */
4839 			va += PAGE_SIZE;
4840 		}
4841 		if (vmk_flags.vmkf_keep_map_locked) {
4842 			vm_map_unlock(target_map);
4843 		}
4844 	}
4845 
4846 	if (vmk_flags.vmf_return_data_addr ||
4847 	    vmk_flags.vmf_return_4k_data_addr) {
4848 		*address = map_addr + offset_in_mapping;
4849 	} else {
4850 		*address = map_addr;
4851 	}
4852 	return result;
4853 }
4854 
4855 kern_return_t
vm_map_enter_mem_object(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_t offset,boolean_t copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)4856 vm_map_enter_mem_object(
4857 	vm_map_t                target_map,
4858 	vm_map_offset_t         *address,
4859 	vm_map_size_t           initial_size,
4860 	vm_map_offset_t         mask,
4861 	vm_map_kernel_flags_t   vmk_flags,
4862 	ipc_port_t              port,
4863 	vm_object_offset_t      offset,
4864 	boolean_t               copy,
4865 	vm_prot_t               cur_protection,
4866 	vm_prot_t               max_protection,
4867 	vm_inherit_t            inheritance)
4868 {
4869 	kern_return_t ret;
4870 
4871 	/* range_id is set by vm_map_enter_mem_object_helper */
4872 	ret = vm_map_enter_mem_object_helper(target_map,
4873 	    address,
4874 	    initial_size,
4875 	    mask,
4876 	    vmk_flags,
4877 	    port,
4878 	    offset,
4879 	    copy,
4880 	    cur_protection,
4881 	    max_protection,
4882 	    inheritance,
4883 	    NULL,
4884 	    0);
4885 
4886 #if KASAN
4887 	if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4888 		kasan_notify_address(*address, initial_size);
4889 	}
4890 #endif
4891 
4892 	return ret;
4893 }
4894 
4895 kern_return_t
vm_map_enter_mem_object_prefault(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_t offset,vm_prot_t cur_protection,vm_prot_t max_protection,upl_page_list_ptr_t page_list,unsigned int page_list_count)4896 vm_map_enter_mem_object_prefault(
4897 	vm_map_t                target_map,
4898 	vm_map_offset_t         *address,
4899 	vm_map_size_t           initial_size,
4900 	vm_map_offset_t         mask,
4901 	vm_map_kernel_flags_t   vmk_flags,
4902 	ipc_port_t              port,
4903 	vm_object_offset_t      offset,
4904 	vm_prot_t               cur_protection,
4905 	vm_prot_t               max_protection,
4906 	upl_page_list_ptr_t     page_list,
4907 	unsigned int            page_list_count)
4908 {
4909 	kern_return_t ret;
4910 
4911 	/* range_id is set by vm_map_enter_mem_object_helper */
4912 	ret = vm_map_enter_mem_object_helper(target_map,
4913 	    address,
4914 	    initial_size,
4915 	    mask,
4916 	    vmk_flags,
4917 	    port,
4918 	    offset,
4919 	    FALSE,
4920 	    cur_protection,
4921 	    max_protection,
4922 	    VM_INHERIT_DEFAULT,
4923 	    page_list,
4924 	    page_list_count);
4925 
4926 #if KASAN
4927 	if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4928 		kasan_notify_address(*address, initial_size);
4929 	}
4930 #endif
4931 
4932 	return ret;
4933 }
4934 
4935 
4936 kern_return_t
vm_map_enter_mem_object_control(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,memory_object_control_t control,vm_object_offset_t offset,boolean_t copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)4937 vm_map_enter_mem_object_control(
4938 	vm_map_t                target_map,
4939 	vm_map_offset_t         *address,
4940 	vm_map_size_t           initial_size,
4941 	vm_map_offset_t         mask,
4942 	vm_map_kernel_flags_t   vmk_flags,
4943 	memory_object_control_t control,
4944 	vm_object_offset_t      offset,
4945 	boolean_t               copy,
4946 	vm_prot_t               cur_protection,
4947 	vm_prot_t               max_protection,
4948 	vm_inherit_t            inheritance)
4949 {
4950 	vm_map_address_t        map_addr;
4951 	vm_map_size_t           map_size;
4952 	vm_object_t             object;
4953 	vm_object_size_t        size;
4954 	kern_return_t           result;
4955 	memory_object_t         pager;
4956 	vm_prot_t               pager_prot;
4957 	kern_return_t           kr;
4958 #if __arm64__
4959 	boolean_t               fourk = vmk_flags.vmkf_fourk;
4960 #endif /* __arm64__ */
4961 
4962 	/*
4963 	 * Check arguments for validity
4964 	 */
4965 	if ((target_map == VM_MAP_NULL) ||
4966 	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4967 	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4968 	    (inheritance > VM_INHERIT_LAST_VALID) ||
4969 	    initial_size == 0) {
4970 		return KERN_INVALID_ARGUMENT;
4971 	}
4972 
4973 #if __arm64__
4974 	if (fourk && VM_MAP_PAGE_MASK(target_map) < PAGE_MASK) {
4975 		fourk = FALSE;
4976 	}
4977 
4978 	if (fourk) {
4979 		map_addr = vm_map_trunc_page(*address,
4980 		    FOURK_PAGE_MASK);
4981 		map_size = vm_map_round_page(initial_size,
4982 		    FOURK_PAGE_MASK);
4983 	} else
4984 #endif /* __arm64__ */
4985 	{
4986 		map_addr = vm_map_trunc_page(*address,
4987 		    VM_MAP_PAGE_MASK(target_map));
4988 		map_size = vm_map_round_page(initial_size,
4989 		    VM_MAP_PAGE_MASK(target_map));
4990 	}
4991 	size = vm_object_round_page(initial_size);
4992 
4993 	object = memory_object_control_to_vm_object(control);
4994 
4995 	if (object == VM_OBJECT_NULL) {
4996 		return KERN_INVALID_OBJECT;
4997 	}
4998 
4999 	if (object == kernel_object) {
5000 		printf("Warning: Attempt to map kernel object"
5001 		    " by a non-private kernel entity\n");
5002 		return KERN_INVALID_OBJECT;
5003 	}
5004 
5005 	vm_object_lock(object);
5006 	object->ref_count++;
5007 
5008 	/*
5009 	 * For "named" VM objects, let the pager know that the
5010 	 * memory object is being mapped.  Some pagers need to keep
5011 	 * track of this, to know when they can reclaim the memory
5012 	 * object, for example.
5013 	 * VM calls memory_object_map() for each mapping (specifying
5014 	 * the protection of each mapping) and calls
5015 	 * memory_object_last_unmap() when all the mappings are gone.
5016 	 */
5017 	pager_prot = max_protection;
5018 	if (copy) {
5019 		pager_prot &= ~VM_PROT_WRITE;
5020 	}
5021 	pager = object->pager;
5022 	if (object->named &&
5023 	    pager != MEMORY_OBJECT_NULL &&
5024 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
5025 		assert(object->pager_ready);
5026 		vm_object_mapping_wait(object, THREAD_UNINT);
5027 		vm_object_mapping_begin(object);
5028 		vm_object_unlock(object);
5029 
5030 		kr = memory_object_map(pager, pager_prot);
5031 		assert(kr == KERN_SUCCESS);
5032 
5033 		vm_object_lock(object);
5034 		vm_object_mapping_end(object);
5035 	}
5036 	vm_object_unlock(object);
5037 
5038 	/*
5039 	 *	Perform the copy if requested
5040 	 */
5041 
5042 	if (copy) {
5043 		vm_object_t             new_object;
5044 		vm_object_offset_t      new_offset;
5045 
5046 		result = vm_object_copy_strategically(object, offset, size,
5047 		    &new_object, &new_offset,
5048 		    &copy);
5049 
5050 
5051 		if (result == KERN_MEMORY_RESTART_COPY) {
5052 			boolean_t success;
5053 			boolean_t src_needs_copy;
5054 
5055 			/*
5056 			 * XXX
5057 			 * We currently ignore src_needs_copy.
5058 			 * This really is the issue of how to make
5059 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
5060 			 * non-kernel users to use. Solution forthcoming.
5061 			 * In the meantime, since we don't allow non-kernel
5062 			 * memory managers to specify symmetric copy,
5063 			 * we won't run into problems here.
5064 			 */
5065 			new_object = object;
5066 			new_offset = offset;
5067 			success = vm_object_copy_quickly(new_object,
5068 			    new_offset, size,
5069 			    &src_needs_copy,
5070 			    &copy);
5071 			assert(success);
5072 			result = KERN_SUCCESS;
5073 		}
5074 		/*
5075 		 *	Throw away the reference to the
5076 		 *	original object, as it won't be mapped.
5077 		 */
5078 
5079 		vm_object_deallocate(object);
5080 
5081 		if (result != KERN_SUCCESS) {
5082 			return result;
5083 		}
5084 
5085 		object = new_object;
5086 		offset = new_offset;
5087 	}
5088 
5089 #if __arm64__
5090 	if (fourk) {
5091 		result = vm_map_enter_fourk(target_map,
5092 		    &map_addr,
5093 		    map_size,
5094 		    (vm_map_offset_t)mask,
5095 		    vmk_flags,
5096 		    object, offset,
5097 		    copy,
5098 		    cur_protection, max_protection,
5099 		    inheritance);
5100 	} else
5101 #endif /* __arm64__ */
5102 	{
5103 		result = vm_map_enter(target_map,
5104 		    &map_addr, map_size,
5105 		    (vm_map_offset_t)mask,
5106 		    vmk_flags,
5107 		    object, offset,
5108 		    copy,
5109 		    cur_protection, max_protection,
5110 		    inheritance);
5111 	}
5112 	if (result != KERN_SUCCESS) {
5113 		vm_object_deallocate(object);
5114 	}
5115 	*address = map_addr;
5116 
5117 	return result;
5118 }
5119 
5120 
5121 #if     VM_CPM
5122 
5123 #ifdef MACH_ASSERT
5124 extern pmap_paddr_t     avail_start, avail_end;
5125 #endif
5126 
5127 /*
5128  *	Allocate memory in the specified map, with the caveat that
5129  *	the memory is physically contiguous.  This call may fail
5130  *	if the system can't find sufficient contiguous memory.
5131  *	This call may cause or lead to heart-stopping amounts of
5132  *	paging activity.
5133  *
5134  *	Memory obtained from this call should be freed in the
5135  *	normal way, viz., via vm_deallocate.
5136  */
5137 kern_return_t
vm_map_enter_cpm(vm_map_t map,vm_map_offset_t * addr,vm_map_size_t size,vm_map_kernel_flags_t vmk_flags)5138 vm_map_enter_cpm(
5139 	vm_map_t                map,
5140 	vm_map_offset_t        *addr,
5141 	vm_map_size_t           size,
5142 	vm_map_kernel_flags_t   vmk_flags)
5143 {
5144 	vm_object_t             cpm_obj;
5145 	pmap_t                  pmap;
5146 	vm_page_t               m, pages;
5147 	kern_return_t           kr;
5148 	vm_map_offset_t         va, start, end, offset;
5149 #if     MACH_ASSERT
5150 	vm_map_offset_t         prev_addr = 0;
5151 #endif  /* MACH_ASSERT */
5152 
5153 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
5154 		/* XXX TODO4K do we need to support this? */
5155 		*addr = 0;
5156 		return KERN_NOT_SUPPORTED;
5157 	}
5158 
5159 	if (size == 0) {
5160 		*addr = 0;
5161 		return KERN_SUCCESS;
5162 	}
5163 	if (vmk_flags.vmf_fixed) {
5164 		*addr = vm_map_trunc_page(*addr,
5165 		    VM_MAP_PAGE_MASK(map));
5166 	} else {
5167 		*addr = vm_map_min(map);
5168 	}
5169 	size = vm_map_round_page(size,
5170 	    VM_MAP_PAGE_MASK(map));
5171 
5172 	/*
5173 	 * LP64todo - cpm_allocate should probably allow
5174 	 * allocations of >4GB, but not with the current
5175 	 * algorithm, so just cast down the size for now.
5176 	 */
5177 	if (size > VM_MAX_ADDRESS) {
5178 		return KERN_RESOURCE_SHORTAGE;
5179 	}
5180 	if ((kr = cpm_allocate(CAST_DOWN(vm_size_t, size),
5181 	    &pages, 0, 0, TRUE, flags)) != KERN_SUCCESS) {
5182 		return kr;
5183 	}
5184 
5185 	cpm_obj = vm_object_allocate((vm_object_size_t)size);
5186 	assert(cpm_obj != VM_OBJECT_NULL);
5187 	assert(cpm_obj->internal);
5188 	assert(cpm_obj->vo_size == (vm_object_size_t)size);
5189 	assert(cpm_obj->can_persist == FALSE);
5190 	assert(cpm_obj->pager_created == FALSE);
5191 	assert(cpm_obj->pageout == FALSE);
5192 	assert(cpm_obj->shadow == VM_OBJECT_NULL);
5193 
5194 	/*
5195 	 *	Insert pages into object.
5196 	 */
5197 
5198 	vm_object_lock(cpm_obj);
5199 	for (offset = 0; offset < size; offset += PAGE_SIZE) {
5200 		m = pages;
5201 		pages = NEXT_PAGE(m);
5202 		*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
5203 
5204 		assert(!m->vmp_gobbled);
5205 		assert(!m->vmp_wanted);
5206 		assert(!m->vmp_pageout);
5207 		assert(!m->vmp_tabled);
5208 		assert(VM_PAGE_WIRED(m));
5209 		assert(m->vmp_busy);
5210 		assert(VM_PAGE_GET_PHYS_PAGE(m) >= (avail_start >> PAGE_SHIFT) && VM_PAGE_GET_PHYS_PAGE(m) <= (avail_end >> PAGE_SHIFT));
5211 
5212 		m->vmp_busy = FALSE;
5213 		vm_page_insert(m, cpm_obj, offset);
5214 	}
5215 	assert(cpm_obj->resident_page_count == size / PAGE_SIZE);
5216 	vm_object_unlock(cpm_obj);
5217 
5218 	/*
5219 	 *	Hang onto a reference on the object in case a
5220 	 *	multi-threaded application for some reason decides
5221 	 *	to deallocate the portion of the address space into
5222 	 *	which we will insert this object.
5223 	 *
5224 	 *	Unfortunately, we must insert the object now before
5225 	 *	we can talk to the pmap module about which addresses
5226 	 *	must be wired down.  Hence, the race with a multi-
5227 	 *	threaded app.
5228 	 */
5229 	vm_object_reference(cpm_obj);
5230 
5231 	/*
5232 	 *	Insert object into map.
5233 	 */
5234 
5235 	kr = vm_map_enter(
5236 		map,
5237 		addr,
5238 		size,
5239 		(vm_map_offset_t)0,
5240 		vmk_flags,
5241 		cpm_obj,
5242 		(vm_object_offset_t)0,
5243 		FALSE,
5244 		VM_PROT_ALL,
5245 		VM_PROT_ALL,
5246 		VM_INHERIT_DEFAULT);
5247 
5248 	if (kr != KERN_SUCCESS) {
5249 		/*
5250 		 *	A CPM object doesn't have can_persist set,
5251 		 *	so all we have to do is deallocate it to
5252 		 *	free up these pages.
5253 		 */
5254 		assert(cpm_obj->pager_created == FALSE);
5255 		assert(cpm_obj->can_persist == FALSE);
5256 		assert(cpm_obj->pageout == FALSE);
5257 		assert(cpm_obj->shadow == VM_OBJECT_NULL);
5258 		vm_object_deallocate(cpm_obj); /* kill acquired ref */
5259 		vm_object_deallocate(cpm_obj); /* kill creation ref */
5260 	}
5261 
5262 	/*
5263 	 *	Inform the physical mapping system that the
5264 	 *	range of addresses may not fault, so that
5265 	 *	page tables and such can be locked down as well.
5266 	 */
5267 	start = *addr;
5268 	end = start + size;
5269 	pmap = vm_map_pmap(map);
5270 	pmap_pageable(pmap, start, end, FALSE);
5271 
5272 	/*
5273 	 *	Enter each page into the pmap, to avoid faults.
5274 	 *	Note that this loop could be coded more efficiently,
5275 	 *	if the need arose, rather than looking up each page
5276 	 *	again.
5277 	 */
5278 	for (offset = 0, va = start; offset < size;
5279 	    va += PAGE_SIZE, offset += PAGE_SIZE) {
5280 		int type_of_fault;
5281 
5282 		vm_object_lock(cpm_obj);
5283 		m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5284 		assert(m != VM_PAGE_NULL);
5285 
5286 		vm_page_zero_fill(m);
5287 
5288 		type_of_fault = DBG_ZERO_FILL_FAULT;
5289 
5290 		vm_fault_enter(m, pmap, va,
5291 		    PAGE_SIZE, 0,
5292 		    VM_PROT_ALL, VM_PROT_WRITE,
5293 		    VM_PAGE_WIRED(m),
5294 		    FALSE,                             /* change_wiring */
5295 		    VM_KERN_MEMORY_NONE,                             /* tag - not wiring */
5296 		    FALSE,                             /* cs_bypass */
5297 		    0,                                 /* user_tag */
5298 		    0,                             /* pmap_options */
5299 		    NULL,                              /* need_retry */
5300 		    &type_of_fault);
5301 
5302 		vm_object_unlock(cpm_obj);
5303 	}
5304 
5305 #if     MACH_ASSERT
5306 	/*
5307 	 *	Verify ordering in address space.
5308 	 */
5309 	for (offset = 0; offset < size; offset += PAGE_SIZE) {
5310 		vm_object_lock(cpm_obj);
5311 		m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5312 		vm_object_unlock(cpm_obj);
5313 		if (m == VM_PAGE_NULL) {
5314 			panic("vm_allocate_cpm:  obj %p off 0x%llx no page",
5315 			    cpm_obj, (uint64_t)offset);
5316 		}
5317 		assert(m->vmp_tabled);
5318 		assert(!m->vmp_busy);
5319 		assert(!m->vmp_wanted);
5320 		assert(!m->vmp_fictitious);
5321 		assert(!m->vmp_private);
5322 		assert(!m->vmp_absent);
5323 		assert(!m->vmp_cleaning);
5324 		assert(!m->vmp_laundry);
5325 		assert(!m->vmp_precious);
5326 		assert(!m->vmp_clustered);
5327 		if (offset != 0) {
5328 			if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
5329 				printf("start 0x%llx end 0x%llx va 0x%llx\n",
5330 				    (uint64_t)start, (uint64_t)end, (uint64_t)va);
5331 				printf("obj %p off 0x%llx\n", cpm_obj, (uint64_t)offset);
5332 				printf("m %p prev_address 0x%llx\n", m, (uint64_t)prev_addr);
5333 				panic("vm_allocate_cpm:  pages not contig!");
5334 			}
5335 		}
5336 		prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
5337 	}
5338 #endif  /* MACH_ASSERT */
5339 
5340 	vm_object_deallocate(cpm_obj); /* kill extra ref */
5341 
5342 	return kr;
5343 }
5344 
5345 
5346 #else   /* VM_CPM */
5347 
5348 /*
5349  *	Interface is defined in all cases, but unless the kernel
5350  *	is built explicitly for this option, the interface does
5351  *	nothing.
5352  */
5353 
5354 kern_return_t
vm_map_enter_cpm(__unused vm_map_t map,__unused vm_map_offset_t * addr,__unused vm_map_size_t size,__unused vm_map_kernel_flags_t vmk_flags)5355 vm_map_enter_cpm(
5356 	__unused vm_map_t                map,
5357 	__unused vm_map_offset_t        *addr,
5358 	__unused vm_map_size_t           size,
5359 	__unused vm_map_kernel_flags_t   vmk_flags)
5360 {
5361 	return KERN_FAILURE;
5362 }
5363 #endif /* VM_CPM */
5364 
5365 /* Not used without nested pmaps */
5366 #ifndef NO_NESTED_PMAP
5367 /*
5368  * Clip and unnest a portion of a nested submap mapping.
5369  */
5370 
5371 
5372 static void
vm_map_clip_unnest(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t start_unnest,vm_map_offset_t end_unnest)5373 vm_map_clip_unnest(
5374 	vm_map_t        map,
5375 	vm_map_entry_t  entry,
5376 	vm_map_offset_t start_unnest,
5377 	vm_map_offset_t end_unnest)
5378 {
5379 	vm_map_offset_t old_start_unnest = start_unnest;
5380 	vm_map_offset_t old_end_unnest = end_unnest;
5381 
5382 	assert(entry->is_sub_map);
5383 	assert(VME_SUBMAP(entry) != NULL);
5384 	assert(entry->use_pmap);
5385 
5386 	/*
5387 	 * Query the platform for the optimal unnest range.
5388 	 * DRK: There's some duplication of effort here, since
5389 	 * callers may have adjusted the range to some extent. This
5390 	 * routine was introduced to support 1GiB subtree nesting
5391 	 * for x86 platforms, which can also nest on 2MiB boundaries
5392 	 * depending on size/alignment.
5393 	 */
5394 	if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
5395 		assert(VME_SUBMAP(entry)->is_nested_map);
5396 		assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
5397 		log_unnest_badness(map,
5398 		    old_start_unnest,
5399 		    old_end_unnest,
5400 		    VME_SUBMAP(entry)->is_nested_map,
5401 		    (entry->vme_start +
5402 		    VME_SUBMAP(entry)->lowest_unnestable_start -
5403 		    VME_OFFSET(entry)));
5404 	}
5405 
5406 	if (entry->vme_start > start_unnest ||
5407 	    entry->vme_end < end_unnest) {
5408 		panic("vm_map_clip_unnest(0x%llx,0x%llx): "
5409 		    "bad nested entry: start=0x%llx end=0x%llx\n",
5410 		    (long long)start_unnest, (long long)end_unnest,
5411 		    (long long)entry->vme_start, (long long)entry->vme_end);
5412 	}
5413 
5414 	if (start_unnest > entry->vme_start) {
5415 		_vm_map_clip_start(&map->hdr,
5416 		    entry,
5417 		    start_unnest);
5418 		if (map->holelistenabled) {
5419 			vm_map_store_update_first_free(map, NULL, FALSE);
5420 		} else {
5421 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5422 		}
5423 	}
5424 	if (entry->vme_end > end_unnest) {
5425 		_vm_map_clip_end(&map->hdr,
5426 		    entry,
5427 		    end_unnest);
5428 		if (map->holelistenabled) {
5429 			vm_map_store_update_first_free(map, NULL, FALSE);
5430 		} else {
5431 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5432 		}
5433 	}
5434 
5435 	pmap_unnest(map->pmap,
5436 	    entry->vme_start,
5437 	    entry->vme_end - entry->vme_start);
5438 	if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
5439 		/* clean up parent map/maps */
5440 		vm_map_submap_pmap_clean(
5441 			map, entry->vme_start,
5442 			entry->vme_end,
5443 			VME_SUBMAP(entry),
5444 			VME_OFFSET(entry));
5445 	}
5446 	entry->use_pmap = FALSE;
5447 	if ((map->pmap != kernel_pmap) &&
5448 	    (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
5449 		VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
5450 	}
5451 }
5452 #endif  /* NO_NESTED_PMAP */
5453 
5454 __abortlike
5455 static void
__vm_map_clip_atomic_entry_panic(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t where)5456 __vm_map_clip_atomic_entry_panic(
5457 	vm_map_t        map,
5458 	vm_map_entry_t  entry,
5459 	vm_map_offset_t where)
5460 {
5461 	panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
5462 	    "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
5463 	    (uint64_t)entry->vme_start,
5464 	    (uint64_t)entry->vme_end,
5465 	    (uint64_t)where);
5466 }
5467 
5468 /*
5469  *	vm_map_clip_start:	[ internal use only ]
5470  *
5471  *	Asserts that the given entry begins at or after
5472  *	the specified address; if necessary,
5473  *	it splits the entry into two.
5474  */
5475 void
vm_map_clip_start(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t startaddr)5476 vm_map_clip_start(
5477 	vm_map_t        map,
5478 	vm_map_entry_t  entry,
5479 	vm_map_offset_t startaddr)
5480 {
5481 #ifndef NO_NESTED_PMAP
5482 	if (entry->is_sub_map &&
5483 	    entry->use_pmap &&
5484 	    startaddr >= entry->vme_start) {
5485 		vm_map_offset_t start_unnest, end_unnest;
5486 
5487 		/*
5488 		 * Make sure "startaddr" is no longer in a nested range
5489 		 * before we clip.  Unnest only the minimum range the platform
5490 		 * can handle.
5491 		 * vm_map_clip_unnest may perform additional adjustments to
5492 		 * the unnest range.
5493 		 */
5494 		start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
5495 		end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
5496 		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5497 	}
5498 #endif /* NO_NESTED_PMAP */
5499 	if (startaddr > entry->vme_start) {
5500 		if (!entry->is_sub_map &&
5501 		    VME_OBJECT(entry) &&
5502 		    VME_OBJECT(entry)->phys_contiguous) {
5503 			pmap_remove(map->pmap,
5504 			    (addr64_t)(entry->vme_start),
5505 			    (addr64_t)(entry->vme_end));
5506 		}
5507 		if (entry->vme_atomic) {
5508 			__vm_map_clip_atomic_entry_panic(map, entry, startaddr);
5509 		}
5510 
5511 		DTRACE_VM5(
5512 			vm_map_clip_start,
5513 			vm_map_t, map,
5514 			vm_map_offset_t, entry->vme_start,
5515 			vm_map_offset_t, entry->vme_end,
5516 			vm_map_offset_t, startaddr,
5517 			int, VME_ALIAS(entry));
5518 
5519 		_vm_map_clip_start(&map->hdr, entry, startaddr);
5520 		if (map->holelistenabled) {
5521 			vm_map_store_update_first_free(map, NULL, FALSE);
5522 		} else {
5523 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5524 		}
5525 	}
5526 }
5527 
5528 
5529 #define vm_map_copy_clip_start(copy, entry, startaddr) \
5530 	MACRO_BEGIN \
5531 	if ((startaddr) > (entry)->vme_start) \
5532 	        _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
5533 	MACRO_END
5534 
5535 /*
5536  *	This routine is called only when it is known that
5537  *	the entry must be split.
5538  */
5539 static void
_vm_map_clip_start(struct vm_map_header * map_header,vm_map_entry_t entry,vm_map_offset_t start)5540 _vm_map_clip_start(
5541 	struct vm_map_header    *map_header,
5542 	vm_map_entry_t          entry,
5543 	vm_map_offset_t         start)
5544 {
5545 	vm_map_entry_t  new_entry;
5546 
5547 	/*
5548 	 *	Split off the front portion --
5549 	 *	note that we must insert the new
5550 	 *	entry BEFORE this one, so that
5551 	 *	this entry has the specified starting
5552 	 *	address.
5553 	 */
5554 
5555 	if (entry->map_aligned) {
5556 		assert(VM_MAP_PAGE_ALIGNED(start,
5557 		    VM_MAP_HDR_PAGE_MASK(map_header)));
5558 	}
5559 
5560 	new_entry = _vm_map_entry_create(map_header);
5561 	vm_map_entry_copy_full(new_entry, entry);
5562 
5563 	new_entry->vme_end = start;
5564 	assert(new_entry->vme_start < new_entry->vme_end);
5565 	VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
5566 	if (__improbable(start >= entry->vme_end)) {
5567 		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start);
5568 	}
5569 	assert(start < entry->vme_end);
5570 	entry->vme_start = start;
5571 
5572 	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);
5573 
5574 	if (entry->is_sub_map) {
5575 		vm_map_reference(VME_SUBMAP(new_entry));
5576 	} else {
5577 		vm_object_reference(VME_OBJECT(new_entry));
5578 	}
5579 }
5580 
5581 
5582 /*
5583  *	vm_map_clip_end:	[ internal use only ]
5584  *
5585  *	Asserts that the given entry ends at or before
5586  *	the specified address; if necessary,
5587  *	it splits the entry into two.
5588  */
5589 void
vm_map_clip_end(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t endaddr)5590 vm_map_clip_end(
5591 	vm_map_t        map,
5592 	vm_map_entry_t  entry,
5593 	vm_map_offset_t endaddr)
5594 {
5595 	if (endaddr > entry->vme_end) {
5596 		/*
5597 		 * Within the scope of this clipping, limit "endaddr" to
5598 		 * the end of this map entry...
5599 		 */
5600 		endaddr = entry->vme_end;
5601 	}
5602 #ifndef NO_NESTED_PMAP
5603 	if (entry->is_sub_map && entry->use_pmap) {
5604 		vm_map_offset_t start_unnest, end_unnest;
5605 
5606 		/*
5607 		 * Make sure the range between the start of this entry and
5608 		 * the new "endaddr" is no longer nested before we clip.
5609 		 * Unnest only the minimum range the platform can handle.
5610 		 * vm_map_clip_unnest may perform additional adjustments to
5611 		 * the unnest range.
5612 		 */
5613 		start_unnest = entry->vme_start;
5614 		end_unnest =
5615 		    (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
5616 		    ~(pmap_shared_region_size_min(map->pmap) - 1);
5617 		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5618 	}
5619 #endif /* NO_NESTED_PMAP */
5620 	if (endaddr < entry->vme_end) {
5621 		if (!entry->is_sub_map &&
5622 		    VME_OBJECT(entry) &&
5623 		    VME_OBJECT(entry)->phys_contiguous) {
5624 			pmap_remove(map->pmap,
5625 			    (addr64_t)(entry->vme_start),
5626 			    (addr64_t)(entry->vme_end));
5627 		}
5628 		if (entry->vme_atomic) {
5629 			__vm_map_clip_atomic_entry_panic(map, entry, endaddr);
5630 		}
5631 		DTRACE_VM5(
5632 			vm_map_clip_end,
5633 			vm_map_t, map,
5634 			vm_map_offset_t, entry->vme_start,
5635 			vm_map_offset_t, entry->vme_end,
5636 			vm_map_offset_t, endaddr,
5637 			int, VME_ALIAS(entry));
5638 
5639 		_vm_map_clip_end(&map->hdr, entry, endaddr);
5640 		if (map->holelistenabled) {
5641 			vm_map_store_update_first_free(map, NULL, FALSE);
5642 		} else {
5643 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5644 		}
5645 	}
5646 }
5647 
5648 
5649 #define vm_map_copy_clip_end(copy, entry, endaddr) \
5650 	MACRO_BEGIN \
5651 	if ((endaddr) < (entry)->vme_end) \
5652 	        _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
5653 	MACRO_END
5654 
5655 /*
5656  *	This routine is called only when it is known that
5657  *	the entry must be split.
5658  */
5659 static void
_vm_map_clip_end(struct vm_map_header * map_header,vm_map_entry_t entry,vm_map_offset_t end)5660 _vm_map_clip_end(
5661 	struct vm_map_header    *map_header,
5662 	vm_map_entry_t          entry,
5663 	vm_map_offset_t         end)
5664 {
5665 	vm_map_entry_t  new_entry;
5666 
5667 	/*
5668 	 *	Create a new entry and insert it
5669 	 *	AFTER the specified entry
5670 	 */
5671 
5672 	if (entry->map_aligned) {
5673 		assert(VM_MAP_PAGE_ALIGNED(end,
5674 		    VM_MAP_HDR_PAGE_MASK(map_header)));
5675 	}
5676 
5677 	new_entry = _vm_map_entry_create(map_header);
5678 	vm_map_entry_copy_full(new_entry, entry);
5679 
5680 	if (__improbable(end <= entry->vme_start)) {
5681 		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end);
5682 	}
5683 	assert(entry->vme_start < end);
5684 	new_entry->vme_start = entry->vme_end = end;
5685 	VME_OFFSET_SET(new_entry,
5686 	    VME_OFFSET(new_entry) + (end - entry->vme_start));
5687 	assert(new_entry->vme_start < new_entry->vme_end);
5688 
5689 	_vm_map_store_entry_link(map_header, entry, new_entry);
5690 
5691 	if (entry->is_sub_map) {
5692 		vm_map_reference(VME_SUBMAP(new_entry));
5693 	} else {
5694 		vm_object_reference(VME_OBJECT(new_entry));
5695 	}
5696 }
5697 
5698 
5699 /*
5700  *	VM_MAP_RANGE_CHECK:	[ internal use only ]
5701  *
5702  *	Asserts that the starting and ending region
5703  *	addresses fall within the valid range of the map.
5704  */
5705 #define VM_MAP_RANGE_CHECK(map, start, end)     \
5706 	MACRO_BEGIN                             \
5707 	if (start < vm_map_min(map))            \
5708 	        start = vm_map_min(map);        \
5709 	if (end > vm_map_max(map))              \
5710 	        end = vm_map_max(map);          \
5711 	if (start > end)                        \
5712 	        start = end;                    \
5713 	MACRO_END
5714 
5715 /*
5716  *	vm_map_range_check:	[ internal use only ]
5717  *
5718  *	Check that the region defined by the specified start and
5719  *	end addresses are wholly contained within a single map
5720  *	entry or set of adjacent map entries of the spacified map,
5721  *	i.e. the specified region contains no unmapped space.
5722  *	If any or all of the region is unmapped, FALSE is returned.
5723  *	Otherwise, TRUE is returned and if the output argument 'entry'
5724  *	is not NULL it points to the map entry containing the start
5725  *	of the region.
5726  *
5727  *	The map is locked for reading on entry and is left locked.
5728  */
5729 static boolean_t
vm_map_range_check(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_entry_t * entry)5730 vm_map_range_check(
5731 	vm_map_t                map,
5732 	vm_map_offset_t         start,
5733 	vm_map_offset_t         end,
5734 	vm_map_entry_t          *entry)
5735 {
5736 	vm_map_entry_t          cur;
5737 	vm_map_offset_t         prev;
5738 
5739 	/*
5740 	 *      Basic sanity checks first
5741 	 */
5742 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5743 		return FALSE;
5744 	}
5745 
5746 	/*
5747 	 *      Check first if the region starts within a valid
5748 	 *	mapping for the map.
5749 	 */
5750 	if (!vm_map_lookup_entry(map, start, &cur)) {
5751 		return FALSE;
5752 	}
5753 
5754 	/*
5755 	 *	Optimize for the case that the region is contained
5756 	 *	in a single map entry.
5757 	 */
5758 	if (entry != (vm_map_entry_t *) NULL) {
5759 		*entry = cur;
5760 	}
5761 	if (end <= cur->vme_end) {
5762 		return TRUE;
5763 	}
5764 
5765 	/*
5766 	 *      If the region is not wholly contained within a
5767 	 *      single entry, walk the entries looking for holes.
5768 	 */
5769 	prev = cur->vme_end;
5770 	cur = cur->vme_next;
5771 	while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5772 		if (end <= cur->vme_end) {
5773 			return TRUE;
5774 		}
5775 		prev = cur->vme_end;
5776 		cur = cur->vme_next;
5777 	}
5778 	return FALSE;
5779 }
5780 
5781 /*
5782  *	vm_map_protect:
5783  *
5784  *	Sets the protection of the specified address
5785  *	region in the target map.  If "set_max" is
5786  *	specified, the maximum protection is to be set;
5787  *	otherwise, only the current protection is affected.
5788  */
5789 kern_return_t
vm_map_protect(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t new_prot,boolean_t set_max)5790 vm_map_protect(
5791 	vm_map_t        map,
5792 	vm_map_offset_t start,
5793 	vm_map_offset_t end,
5794 	vm_prot_t       new_prot,
5795 	boolean_t       set_max)
5796 {
5797 	vm_map_entry_t                  current;
5798 	vm_map_offset_t                 prev;
5799 	vm_map_entry_t                  entry;
5800 	vm_prot_t                       new_max;
5801 	int                             pmap_options = 0;
5802 	kern_return_t                   kr;
5803 
5804 	if (vm_map_range_overflows(map, start, end - start)) {
5805 		return KERN_INVALID_ARGUMENT;
5806 	}
5807 
5808 	if (new_prot & VM_PROT_COPY) {
5809 		vm_map_offset_t         new_start;
5810 		vm_prot_t               cur_prot, max_prot;
5811 		vm_map_kernel_flags_t   kflags;
5812 
5813 		/* LP64todo - see below */
5814 		if (start >= map->max_offset) {
5815 			return KERN_INVALID_ADDRESS;
5816 		}
5817 
5818 		if ((new_prot & VM_PROT_ALLEXEC) &&
5819 		    map->pmap != kernel_pmap &&
5820 		    (vm_map_cs_enforcement(map)
5821 #if XNU_TARGET_OS_OSX && __arm64__
5822 		    || !VM_MAP_IS_EXOTIC(map)
5823 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
5824 		    ) &&
5825 		    VM_MAP_POLICY_WX_FAIL(map)) {
5826 			DTRACE_VM3(cs_wx,
5827 			    uint64_t, (uint64_t) start,
5828 			    uint64_t, (uint64_t) end,
5829 			    vm_prot_t, new_prot);
5830 			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5831 			    proc_selfpid(),
5832 			    (get_bsdtask_info(current_task())
5833 			    ? proc_name_address(get_bsdtask_info(current_task()))
5834 			    : "?"),
5835 			    __FUNCTION__, __LINE__,
5836 #if DEVELOPMENT || DEBUG
5837 			    (uint64_t)start,
5838 			    (uint64_t)end,
5839 #else /* DEVELOPMENT || DEBUG */
5840 			    (uint64_t)0,
5841 			    (uint64_t)0,
5842 #endif /* DEVELOPMENT || DEBUG */
5843 			    new_prot);
5844 			return KERN_PROTECTION_FAILURE;
5845 		}
5846 
5847 		/*
5848 		 * Let vm_map_remap_extract() know that it will need to:
5849 		 * + make a copy of the mapping
5850 		 * + add VM_PROT_WRITE to the max protections
5851 		 * + remove any protections that are no longer allowed from the
5852 		 *   max protections (to avoid any WRITE/EXECUTE conflict, for
5853 		 *   example).
5854 		 * Note that "max_prot" is an IN/OUT parameter only for this
5855 		 * specific (VM_PROT_COPY) case.  It's usually an OUT parameter
5856 		 * only.
5857 		 */
5858 		max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
5859 		cur_prot = VM_PROT_NONE;
5860 		kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
5861 		kflags.vmkf_remap_prot_copy = true;
5862 		new_start = start;
5863 		kr = vm_map_remap(map,
5864 		    &new_start,
5865 		    end - start,
5866 		    0, /* mask */
5867 		    kflags,
5868 		    map,
5869 		    start,
5870 		    TRUE, /* copy-on-write remapping! */
5871 		    &cur_prot, /* IN/OUT */
5872 		    &max_prot, /* IN/OUT */
5873 		    VM_INHERIT_DEFAULT);
5874 		if (kr != KERN_SUCCESS) {
5875 			return kr;
5876 		}
5877 		new_prot &= ~VM_PROT_COPY;
5878 	}
5879 
5880 	vm_map_lock(map);
5881 
5882 	/* LP64todo - remove this check when vm_map_commpage64()
5883 	 * no longer has to stuff in a map_entry for the commpage
5884 	 * above the map's max_offset.
5885 	 */
5886 	if (start >= map->max_offset) {
5887 		vm_map_unlock(map);
5888 		return KERN_INVALID_ADDRESS;
5889 	}
5890 
5891 	while (1) {
5892 		/*
5893 		 *      Lookup the entry.  If it doesn't start in a valid
5894 		 *	entry, return an error.
5895 		 */
5896 		if (!vm_map_lookup_entry(map, start, &entry)) {
5897 			vm_map_unlock(map);
5898 			return KERN_INVALID_ADDRESS;
5899 		}
5900 
5901 		if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
5902 			start = SUPERPAGE_ROUND_DOWN(start);
5903 			continue;
5904 		}
5905 		break;
5906 	}
5907 	if (entry->superpage_size) {
5908 		end = SUPERPAGE_ROUND_UP(end);
5909 	}
5910 
5911 	/*
5912 	 *	Make a first pass to check for protection and address
5913 	 *	violations.
5914 	 */
5915 
5916 	current = entry;
5917 	prev = current->vme_start;
5918 	while ((current != vm_map_to_entry(map)) &&
5919 	    (current->vme_start < end)) {
5920 		/*
5921 		 * If there is a hole, return an error.
5922 		 */
5923 		if (current->vme_start != prev) {
5924 			vm_map_unlock(map);
5925 			return KERN_INVALID_ADDRESS;
5926 		}
5927 
5928 		new_max = current->max_protection;
5929 
5930 #if defined(__x86_64__)
5931 		/* Allow max mask to include execute prot bits if this map doesn't enforce CS */
5932 		if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
5933 			new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
5934 		}
5935 #elif CODE_SIGNING_MONITOR
5936 		if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
5937 			new_max |= VM_PROT_EXECUTE;
5938 		}
5939 #endif
5940 		if ((new_prot & new_max) != new_prot) {
5941 			vm_map_unlock(map);
5942 			return KERN_PROTECTION_FAILURE;
5943 		}
5944 
5945 		if (current->used_for_jit &&
5946 		    pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
5947 			vm_map_unlock(map);
5948 			return KERN_PROTECTION_FAILURE;
5949 		}
5950 
5951 #if __arm64e__
5952 		/* Disallow remapping hw assisted TPRO mappings */
5953 		if (current->used_for_tpro) {
5954 			vm_map_unlock(map);
5955 			return KERN_PROTECTION_FAILURE;
5956 		}
5957 #endif /* __arm64e__ */
5958 
5959 
5960 		if ((new_prot & VM_PROT_WRITE) &&
5961 		    (new_prot & VM_PROT_ALLEXEC) &&
5962 #if XNU_TARGET_OS_OSX
5963 		    map->pmap != kernel_pmap &&
5964 		    (vm_map_cs_enforcement(map)
5965 #if __arm64__
5966 		    || !VM_MAP_IS_EXOTIC(map)
5967 #endif /* __arm64__ */
5968 		    ) &&
5969 #endif /* XNU_TARGET_OS_OSX */
5970 #if CODE_SIGNING_MONITOR
5971 		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
5972 #endif
5973 		    !(current->used_for_jit)) {
5974 			DTRACE_VM3(cs_wx,
5975 			    uint64_t, (uint64_t) current->vme_start,
5976 			    uint64_t, (uint64_t) current->vme_end,
5977 			    vm_prot_t, new_prot);
5978 			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5979 			    proc_selfpid(),
5980 			    (get_bsdtask_info(current_task())
5981 			    ? proc_name_address(get_bsdtask_info(current_task()))
5982 			    : "?"),
5983 			    __FUNCTION__, __LINE__,
5984 #if DEVELOPMENT || DEBUG
5985 			    (uint64_t)current->vme_start,
5986 			    (uint64_t)current->vme_end,
5987 #else /* DEVELOPMENT || DEBUG */
5988 			    (uint64_t)0,
5989 			    (uint64_t)0,
5990 #endif /* DEVELOPMENT || DEBUG */
5991 			    new_prot);
5992 			new_prot &= ~VM_PROT_ALLEXEC;
5993 			if (VM_MAP_POLICY_WX_FAIL(map)) {
5994 				vm_map_unlock(map);
5995 				return KERN_PROTECTION_FAILURE;
5996 			}
5997 		}
5998 
5999 		/*
6000 		 * If the task has requested executable lockdown,
6001 		 * deny both:
6002 		 * - adding executable protections OR
6003 		 * - adding write protections to an existing executable mapping.
6004 		 */
6005 		if (map->map_disallow_new_exec == TRUE) {
6006 			if ((new_prot & VM_PROT_ALLEXEC) ||
6007 			    ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
6008 				vm_map_unlock(map);
6009 				return KERN_PROTECTION_FAILURE;
6010 			}
6011 		}
6012 
6013 		prev = current->vme_end;
6014 		current = current->vme_next;
6015 	}
6016 
6017 #if __arm64__
6018 	if (end > prev &&
6019 	    end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
6020 		vm_map_entry_t prev_entry;
6021 
6022 		prev_entry = current->vme_prev;
6023 		if (prev_entry != vm_map_to_entry(map) &&
6024 		    !prev_entry->map_aligned &&
6025 		    (vm_map_round_page(prev_entry->vme_end,
6026 		    VM_MAP_PAGE_MASK(map))
6027 		    == end)) {
6028 			/*
6029 			 * The last entry in our range is not "map-aligned"
6030 			 * but it would have reached all the way to "end"
6031 			 * if it had been map-aligned, so this is not really
6032 			 * a hole in the range and we can proceed.
6033 			 */
6034 			prev = end;
6035 		}
6036 	}
6037 #endif /* __arm64__ */
6038 
6039 	if (end > prev) {
6040 		vm_map_unlock(map);
6041 		return KERN_INVALID_ADDRESS;
6042 	}
6043 
6044 	/*
6045 	 *	Go back and fix up protections.
6046 	 *	Clip to start here if the range starts within
6047 	 *	the entry.
6048 	 */
6049 
6050 	current = entry;
6051 	if (current != vm_map_to_entry(map)) {
6052 		/* clip and unnest if necessary */
6053 		vm_map_clip_start(map, current, start);
6054 	}
6055 
6056 	while ((current != vm_map_to_entry(map)) &&
6057 	    (current->vme_start < end)) {
6058 		vm_prot_t       old_prot;
6059 
6060 		vm_map_clip_end(map, current, end);
6061 
6062 #if DEVELOPMENT || DEBUG
6063 		if (current->csm_associated && vm_log_xnu_user_debug) {
6064 			printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
6065 			    proc_selfpid(),
6066 			    (get_bsdtask_info(current_task())
6067 			    ? proc_name_address(get_bsdtask_info(current_task()))
6068 			    : "?"),
6069 			    __FUNCTION__,
6070 			    (uint64_t)start,
6071 			    (uint64_t)end,
6072 			    new_prot,
6073 			    map, current,
6074 			    current->vme_start,
6075 			    current->vme_end,
6076 			    current->protection,
6077 			    current->max_protection);
6078 		}
6079 #endif /* DEVELOPMENT || DEBUG */
6080 
6081 		if (current->is_sub_map) {
6082 			/* clipping did unnest if needed */
6083 			assert(!current->use_pmap);
6084 		}
6085 
6086 		old_prot = current->protection;
6087 
6088 		if (set_max) {
6089 			current->max_protection = new_prot;
6090 			/* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
6091 			current->protection = (new_prot & old_prot);
6092 		} else {
6093 			current->protection = new_prot;
6094 		}
6095 
6096 #if CODE_SIGNING_MONITOR
6097 		if (!current->vme_xnu_user_debug &&
6098 		    /* a !csm_associated mapping becoming executable */
6099 		    ((!current->csm_associated &&
6100 		    !(old_prot & VM_PROT_EXECUTE) &&
6101 		    (current->protection & VM_PROT_EXECUTE))
6102 		    ||
6103 		    /* a csm_associated mapping becoming writable */
6104 		    (current->csm_associated &&
6105 		    !(old_prot & VM_PROT_WRITE) &&
6106 		    (current->protection & VM_PROT_WRITE)))) {
6107 			/*
6108 			 * This mapping has not already been marked as
6109 			 * "user_debug" and it is either:
6110 			 * 1. not code-signing-monitored and becoming executable
6111 			 * 2. code-signing-monitored and becoming writable,
6112 			 * so inform the CodeSigningMonitor and mark the
6113 			 * mapping as "user_debug" if appropriate.
6114 			 */
6115 			vm_map_kernel_flags_t vmk_flags;
6116 			vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
6117 			/* pretend it's a vm_protect(VM_PROT_COPY)... */
6118 			vmk_flags.vmkf_remap_prot_copy = true;
6119 			kr = vm_map_entry_cs_associate(map, current, vmk_flags);
6120 #if DEVELOPMENT || DEBUG
6121 			if (vm_log_xnu_user_debug) {
6122 				printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
6123 				    proc_selfpid(),
6124 				    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
6125 				    __FUNCTION__, __LINE__,
6126 				    map, current,
6127 				    current->vme_start, current->vme_end,
6128 				    old_prot, current->protection,
6129 				    kr, current->vme_xnu_user_debug);
6130 			}
6131 #endif /* DEVELOPMENT || DEBUG */
6132 		}
6133 #endif /* CODE_SIGNING_MONITOR */
6134 
6135 		/*
6136 		 *	Update physical map if necessary.
6137 		 *	If the request is to turn off write protection,
6138 		 *	we won't do it for real (in pmap). This is because
6139 		 *	it would cause copy-on-write to fail.  We've already
6140 		 *	set, the new protection in the map, so if a
6141 		 *	write-protect fault occurred, it will be fixed up
6142 		 *	properly, COW or not.
6143 		 */
6144 		if (current->protection != old_prot) {
6145 			/* Look one level in we support nested pmaps */
6146 			/* from mapped submaps which are direct entries */
6147 			/* in our map */
6148 
6149 			vm_prot_t prot;
6150 
6151 			prot = current->protection;
6152 			if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
6153 				prot &= ~VM_PROT_WRITE;
6154 			} else {
6155 				assert(!VME_OBJECT(current)->code_signed);
6156 				assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6157 				if (prot & VM_PROT_WRITE) {
6158 					/*
6159 					 * For write requests on the
6160 					 * compressor, we wil ask the
6161 					 * pmap layer to prevent us from
6162 					 * taking a write fault when we
6163 					 * attempt to access the mapping
6164 					 * next.
6165 					 */
6166 					pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
6167 				}
6168 			}
6169 
6170 			if (override_nx(map, VME_ALIAS(current)) && prot) {
6171 				prot |= VM_PROT_EXECUTE;
6172 			}
6173 
6174 #if DEVELOPMENT || DEBUG
6175 			if (!(old_prot & VM_PROT_EXECUTE) &&
6176 			    (prot & VM_PROT_EXECUTE) &&
6177 			    panic_on_unsigned_execute &&
6178 			    (proc_selfcsflags() & CS_KILL)) {
6179 				panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
6180 			}
6181 #endif /* DEVELOPMENT || DEBUG */
6182 
6183 			if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
6184 				if (current->wired_count) {
6185 					panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
6186 					    map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
6187 				}
6188 
6189 				/* If the pmap layer cares about this
6190 				 * protection type, force a fault for
6191 				 * each page so that vm_fault will
6192 				 * repopulate the page with the full
6193 				 * set of protections.
6194 				 */
6195 				/*
6196 				 * TODO: We don't seem to need this,
6197 				 * but this is due to an internal
6198 				 * implementation detail of
6199 				 * pmap_protect.  Do we want to rely
6200 				 * on this?
6201 				 */
6202 				prot = VM_PROT_NONE;
6203 			}
6204 
6205 			if (current->is_sub_map && current->use_pmap) {
6206 				pmap_protect(VME_SUBMAP(current)->pmap,
6207 				    current->vme_start,
6208 				    current->vme_end,
6209 				    prot);
6210 			} else {
6211 				pmap_protect_options(map->pmap,
6212 				    current->vme_start,
6213 				    current->vme_end,
6214 				    prot,
6215 				    pmap_options,
6216 				    NULL);
6217 			}
6218 		}
6219 		current = current->vme_next;
6220 	}
6221 
6222 	current = entry;
6223 	while ((current != vm_map_to_entry(map)) &&
6224 	    (current->vme_start <= end)) {
6225 		vm_map_simplify_entry(map, current);
6226 		current = current->vme_next;
6227 	}
6228 
6229 	vm_map_unlock(map);
6230 	return KERN_SUCCESS;
6231 }
6232 
6233 /*
6234  *	vm_map_inherit:
6235  *
6236  *	Sets the inheritance of the specified address
6237  *	range in the target map.  Inheritance
6238  *	affects how the map will be shared with
6239  *	child maps at the time of vm_map_fork.
6240  */
6241 kern_return_t
vm_map_inherit(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_inherit_t new_inheritance)6242 vm_map_inherit(
6243 	vm_map_t        map,
6244 	vm_map_offset_t start,
6245 	vm_map_offset_t end,
6246 	vm_inherit_t    new_inheritance)
6247 {
6248 	vm_map_entry_t  entry;
6249 	vm_map_entry_t  temp_entry;
6250 
6251 	vm_map_lock(map);
6252 
6253 	VM_MAP_RANGE_CHECK(map, start, end);
6254 
6255 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
6256 		return KERN_INVALID_ADDRESS;
6257 	}
6258 
6259 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
6260 		entry = temp_entry;
6261 	} else {
6262 		temp_entry = temp_entry->vme_next;
6263 		entry = temp_entry;
6264 	}
6265 
6266 	/* first check entire range for submaps which can't support the */
6267 	/* given inheritance. */
6268 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6269 		if (entry->is_sub_map) {
6270 			if (new_inheritance == VM_INHERIT_COPY) {
6271 				vm_map_unlock(map);
6272 				return KERN_INVALID_ARGUMENT;
6273 			}
6274 		}
6275 
6276 		entry = entry->vme_next;
6277 	}
6278 
6279 	entry = temp_entry;
6280 	if (entry != vm_map_to_entry(map)) {
6281 		/* clip and unnest if necessary */
6282 		vm_map_clip_start(map, entry, start);
6283 	}
6284 
6285 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6286 		vm_map_clip_end(map, entry, end);
6287 		if (entry->is_sub_map) {
6288 			/* clip did unnest if needed */
6289 			assert(!entry->use_pmap);
6290 		}
6291 
6292 		entry->inheritance = new_inheritance;
6293 
6294 		entry = entry->vme_next;
6295 	}
6296 
6297 	vm_map_unlock(map);
6298 	return KERN_SUCCESS;
6299 }
6300 
6301 /*
6302  * Update the accounting for the amount of wired memory in this map.  If the user has
6303  * exceeded the defined limits, then we fail.  Wiring on behalf of the kernel never fails.
6304  */
6305 
6306 static kern_return_t
add_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6307 add_wire_counts(
6308 	vm_map_t        map,
6309 	vm_map_entry_t  entry,
6310 	boolean_t       user_wire)
6311 {
6312 	vm_map_size_t   size;
6313 
6314 	if (user_wire) {
6315 		unsigned int total_wire_count =  vm_page_wire_count + vm_lopage_free_count;
6316 
6317 		/*
6318 		 * We're wiring memory at the request of the user.  Check if this is the first time the user is wiring
6319 		 * this map entry.
6320 		 */
6321 
6322 		if (entry->user_wired_count == 0) {
6323 			size = entry->vme_end - entry->vme_start;
6324 
6325 			/*
6326 			 * Since this is the first time the user is wiring this map entry, check to see if we're
6327 			 * exceeding the user wire limits.  There is a per map limit which is the smaller of either
6328 			 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value.  There is also
6329 			 * a system-wide limit on the amount of memory all users can wire.  If the user is over either
6330 			 * limit, then we fail.
6331 			 */
6332 
6333 			if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6334 			    size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6335 				if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6336 #if DEVELOPMENT || DEBUG
6337 					if (panic_on_mlock_failure) {
6338 						panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6339 					}
6340 #endif /* DEVELOPMENT || DEBUG */
6341 					os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6342 				} else {
6343 					os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6344 #if DEVELOPMENT || DEBUG
6345 					if (panic_on_mlock_failure) {
6346 						panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6347 					}
6348 #endif /* DEVELOPMENT || DEBUG */
6349 				}
6350 				return KERN_RESOURCE_SHORTAGE;
6351 			}
6352 
6353 			/*
6354 			 * The first time the user wires an entry, we also increment the wired_count and add this to
6355 			 * the total that has been wired in the map.
6356 			 */
6357 
6358 			if (entry->wired_count >= MAX_WIRE_COUNT) {
6359 				return KERN_FAILURE;
6360 			}
6361 
6362 			entry->wired_count++;
6363 			map->user_wire_size += size;
6364 		}
6365 
6366 		if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6367 			return KERN_FAILURE;
6368 		}
6369 
6370 		entry->user_wired_count++;
6371 	} else {
6372 		/*
6373 		 * The kernel's wiring the memory.  Just bump the count and continue.
6374 		 */
6375 
6376 		if (entry->wired_count >= MAX_WIRE_COUNT) {
6377 			panic("vm_map_wire: too many wirings");
6378 		}
6379 
6380 		entry->wired_count++;
6381 	}
6382 
6383 	return KERN_SUCCESS;
6384 }
6385 
6386 /*
6387  * Update the memory wiring accounting now that the given map entry is being unwired.
6388  */
6389 
6390 static void
subtract_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6391 subtract_wire_counts(
6392 	vm_map_t        map,
6393 	vm_map_entry_t  entry,
6394 	boolean_t       user_wire)
6395 {
6396 	if (user_wire) {
6397 		/*
6398 		 * We're unwiring memory at the request of the user.  See if we're removing the last user wire reference.
6399 		 */
6400 
6401 		if (entry->user_wired_count == 1) {
6402 			/*
6403 			 * We're removing the last user wire reference.  Decrement the wired_count and the total
6404 			 * user wired memory for this map.
6405 			 */
6406 
6407 			assert(entry->wired_count >= 1);
6408 			entry->wired_count--;
6409 			map->user_wire_size -= entry->vme_end - entry->vme_start;
6410 		}
6411 
6412 		assert(entry->user_wired_count >= 1);
6413 		entry->user_wired_count--;
6414 	} else {
6415 		/*
6416 		 * The kernel is unwiring the memory.   Just update the count.
6417 		 */
6418 
6419 		assert(entry->wired_count >= 1);
6420 		entry->wired_count--;
6421 	}
6422 }
6423 
6424 int cs_executable_wire = 0;
6425 
6426 /*
6427  *	vm_map_wire:
6428  *
6429  *	Sets the pageability of the specified address range in the
6430  *	target map as wired.  Regions specified as not pageable require
6431  *	locked-down physical memory and physical page maps.  The
6432  *	access_type variable indicates types of accesses that must not
6433  *	generate page faults.  This is checked against protection of
6434  *	memory being locked-down.
6435  *
6436  *	The map must not be locked, but a reference must remain to the
6437  *	map throughout the call.
6438  */
6439 static kern_return_t
vm_map_wire_nested(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire,pmap_t map_pmap,vm_map_offset_t pmap_addr,ppnum_t * physpage_p)6440 vm_map_wire_nested(
6441 	vm_map_t                map,
6442 	vm_map_offset_t         start,
6443 	vm_map_offset_t         end,
6444 	vm_prot_t               caller_prot,
6445 	vm_tag_t                tag,
6446 	boolean_t               user_wire,
6447 	pmap_t                  map_pmap,
6448 	vm_map_offset_t         pmap_addr,
6449 	ppnum_t                 *physpage_p)
6450 {
6451 	vm_map_entry_t          entry;
6452 	vm_prot_t               access_type;
6453 	struct vm_map_entry     *first_entry, tmp_entry;
6454 	vm_map_t                real_map;
6455 	vm_map_offset_t         s, e;
6456 	kern_return_t           rc;
6457 	boolean_t               need_wakeup;
6458 	boolean_t               main_map = FALSE;
6459 	wait_interrupt_t        interruptible_state;
6460 	thread_t                cur_thread;
6461 	unsigned int            last_timestamp;
6462 	vm_map_size_t           size;
6463 	boolean_t               wire_and_extract;
6464 	vm_prot_t               extra_prots;
6465 
6466 	extra_prots = VM_PROT_COPY;
6467 	extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6468 #if XNU_TARGET_OS_OSX
6469 	if (map->pmap == kernel_pmap ||
6470 	    !vm_map_cs_enforcement(map)) {
6471 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6472 	}
6473 #endif /* XNU_TARGET_OS_OSX */
6474 #if CODE_SIGNING_MONITOR
6475 	if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
6476 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6477 	}
6478 #endif /* CODE_SIGNING_MONITOR */
6479 
6480 	access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6481 
6482 	wire_and_extract = FALSE;
6483 	if (physpage_p != NULL) {
6484 		/*
6485 		 * The caller wants the physical page number of the
6486 		 * wired page.  We return only one physical page number
6487 		 * so this works for only one page at a time.
6488 		 */
6489 		if ((end - start) != PAGE_SIZE) {
6490 			return KERN_INVALID_ARGUMENT;
6491 		}
6492 		wire_and_extract = TRUE;
6493 		*physpage_p = 0;
6494 	}
6495 
6496 	vm_map_lock(map);
6497 	if (map_pmap == NULL) {
6498 		main_map = TRUE;
6499 	}
6500 	last_timestamp = map->timestamp;
6501 
6502 	VM_MAP_RANGE_CHECK(map, start, end);
6503 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6504 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6505 
6506 	if (start == end) {
6507 		/* We wired what the caller asked for, zero pages */
6508 		vm_map_unlock(map);
6509 		return KERN_SUCCESS;
6510 	}
6511 
6512 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
6513 		return KERN_INVALID_ADDRESS;
6514 	}
6515 
6516 	need_wakeup = FALSE;
6517 	cur_thread = current_thread();
6518 
6519 	s = start;
6520 	rc = KERN_SUCCESS;
6521 
6522 	if (vm_map_lookup_entry(map, s, &first_entry)) {
6523 		entry = first_entry;
6524 		/*
6525 		 * vm_map_clip_start will be done later.
6526 		 * We don't want to unnest any nested submaps here !
6527 		 */
6528 	} else {
6529 		/* Start address is not in map */
6530 		rc = KERN_INVALID_ADDRESS;
6531 		goto done;
6532 	}
6533 
6534 	while ((entry != vm_map_to_entry(map)) && (s < end)) {
6535 		/*
6536 		 * At this point, we have wired from "start" to "s".
6537 		 * We still need to wire from "s" to "end".
6538 		 *
6539 		 * "entry" hasn't been clipped, so it could start before "s"
6540 		 * and/or end after "end".
6541 		 */
6542 
6543 		/* "e" is how far we want to wire in this entry */
6544 		e = entry->vme_end;
6545 		if (e > end) {
6546 			e = end;
6547 		}
6548 
6549 		/*
6550 		 * If another thread is wiring/unwiring this entry then
6551 		 * block after informing other thread to wake us up.
6552 		 */
6553 		if (entry->in_transition) {
6554 			wait_result_t wait_result;
6555 
6556 			/*
6557 			 * We have not clipped the entry.  Make sure that
6558 			 * the start address is in range so that the lookup
6559 			 * below will succeed.
6560 			 * "s" is the current starting point: we've already
6561 			 * wired from "start" to "s" and we still have
6562 			 * to wire from "s" to "end".
6563 			 */
6564 
6565 			entry->needs_wakeup = TRUE;
6566 
6567 			/*
6568 			 * wake up anybody waiting on entries that we have
6569 			 * already wired.
6570 			 */
6571 			if (need_wakeup) {
6572 				vm_map_entry_wakeup(map);
6573 				need_wakeup = FALSE;
6574 			}
6575 			/*
6576 			 * User wiring is interruptible
6577 			 */
6578 			wait_result = vm_map_entry_wait(map,
6579 			    (user_wire) ? THREAD_ABORTSAFE :
6580 			    THREAD_UNINT);
6581 			if (user_wire && wait_result == THREAD_INTERRUPTED) {
6582 				/*
6583 				 * undo the wirings we have done so far
6584 				 * We do not clear the needs_wakeup flag,
6585 				 * because we cannot tell if we were the
6586 				 * only one waiting.
6587 				 */
6588 				rc = KERN_FAILURE;
6589 				goto done;
6590 			}
6591 
6592 			/*
6593 			 * Cannot avoid a lookup here. reset timestamp.
6594 			 */
6595 			last_timestamp = map->timestamp;
6596 
6597 			/*
6598 			 * The entry could have been clipped, look it up again.
6599 			 * Worse that can happen is, it may not exist anymore.
6600 			 */
6601 			if (!vm_map_lookup_entry(map, s, &first_entry)) {
6602 				/*
6603 				 * User: undo everything upto the previous
6604 				 * entry.  let vm_map_unwire worry about
6605 				 * checking the validity of the range.
6606 				 */
6607 				rc = KERN_FAILURE;
6608 				goto done;
6609 			}
6610 			entry = first_entry;
6611 			continue;
6612 		}
6613 
6614 		if (entry->is_sub_map) {
6615 			vm_map_offset_t sub_start;
6616 			vm_map_offset_t sub_end;
6617 			vm_map_offset_t local_start;
6618 			vm_map_offset_t local_end;
6619 			pmap_t          pmap;
6620 
6621 			if (wire_and_extract) {
6622 				/*
6623 				 * Wiring would result in copy-on-write
6624 				 * which would not be compatible with
6625 				 * the sharing we have with the original
6626 				 * provider of this memory.
6627 				 */
6628 				rc = KERN_INVALID_ARGUMENT;
6629 				goto done;
6630 			}
6631 
6632 			vm_map_clip_start(map, entry, s);
6633 			vm_map_clip_end(map, entry, end);
6634 
6635 			sub_start = VME_OFFSET(entry);
6636 			sub_end = entry->vme_end;
6637 			sub_end += VME_OFFSET(entry) - entry->vme_start;
6638 
6639 			local_end = entry->vme_end;
6640 			if (map_pmap == NULL) {
6641 				vm_object_t             object;
6642 				vm_object_offset_t      offset;
6643 				vm_prot_t               prot;
6644 				boolean_t               wired;
6645 				vm_map_entry_t          local_entry;
6646 				vm_map_version_t         version;
6647 				vm_map_t                lookup_map;
6648 
6649 				if (entry->use_pmap) {
6650 					pmap = VME_SUBMAP(entry)->pmap;
6651 					/* ppc implementation requires that */
6652 					/* submaps pmap address ranges line */
6653 					/* up with parent map */
6654 #ifdef notdef
6655 					pmap_addr = sub_start;
6656 #endif
6657 					pmap_addr = s;
6658 				} else {
6659 					pmap = map->pmap;
6660 					pmap_addr = s;
6661 				}
6662 
6663 				if (entry->wired_count) {
6664 					if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6665 						goto done;
6666 					}
6667 
6668 					/*
6669 					 * The map was not unlocked:
6670 					 * no need to goto re-lookup.
6671 					 * Just go directly to next entry.
6672 					 */
6673 					entry = entry->vme_next;
6674 					s = entry->vme_start;
6675 					continue;
6676 				}
6677 
6678 				/* call vm_map_lookup_and_lock_object to */
6679 				/* cause any needs copy to be   */
6680 				/* evaluated */
6681 				local_start = entry->vme_start;
6682 				lookup_map = map;
6683 				vm_map_lock_write_to_read(map);
6684 				rc = vm_map_lookup_and_lock_object(
6685 					&lookup_map, local_start,
6686 					(access_type | extra_prots),
6687 					OBJECT_LOCK_EXCLUSIVE,
6688 					&version, &object,
6689 					&offset, &prot, &wired,
6690 					NULL,
6691 					&real_map, NULL);
6692 				if (rc != KERN_SUCCESS) {
6693 					vm_map_unlock_read(lookup_map);
6694 					assert(map_pmap == NULL);
6695 					vm_map_unwire(map, start,
6696 					    s, user_wire);
6697 					return rc;
6698 				}
6699 				vm_object_unlock(object);
6700 				if (real_map != lookup_map) {
6701 					vm_map_unlock(real_map);
6702 				}
6703 				vm_map_unlock_read(lookup_map);
6704 				vm_map_lock(map);
6705 
6706 				/* we unlocked, so must re-lookup */
6707 				if (!vm_map_lookup_entry(map,
6708 				    local_start,
6709 				    &local_entry)) {
6710 					rc = KERN_FAILURE;
6711 					goto done;
6712 				}
6713 
6714 				/*
6715 				 * entry could have been "simplified",
6716 				 * so re-clip
6717 				 */
6718 				entry = local_entry;
6719 				assert(s == local_start);
6720 				vm_map_clip_start(map, entry, s);
6721 				vm_map_clip_end(map, entry, end);
6722 				/* re-compute "e" */
6723 				e = entry->vme_end;
6724 				if (e > end) {
6725 					e = end;
6726 				}
6727 
6728 				/* did we have a change of type? */
6729 				if (!entry->is_sub_map) {
6730 					last_timestamp = map->timestamp;
6731 					continue;
6732 				}
6733 			} else {
6734 				local_start = entry->vme_start;
6735 				pmap = map_pmap;
6736 			}
6737 
6738 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6739 				goto done;
6740 			}
6741 
6742 			entry->in_transition = TRUE;
6743 
6744 			vm_map_unlock(map);
6745 			rc = vm_map_wire_nested(VME_SUBMAP(entry),
6746 			    sub_start, sub_end,
6747 			    caller_prot, tag,
6748 			    user_wire, pmap, pmap_addr,
6749 			    NULL);
6750 			vm_map_lock(map);
6751 
6752 			/*
6753 			 * Find the entry again.  It could have been clipped
6754 			 * after we unlocked the map.
6755 			 */
6756 			if (!vm_map_lookup_entry(map, local_start,
6757 			    &first_entry)) {
6758 				panic("vm_map_wire: re-lookup failed");
6759 			}
6760 			entry = first_entry;
6761 
6762 			assert(local_start == s);
6763 			/* re-compute "e" */
6764 			e = entry->vme_end;
6765 			if (e > end) {
6766 				e = end;
6767 			}
6768 
6769 			last_timestamp = map->timestamp;
6770 			while ((entry != vm_map_to_entry(map)) &&
6771 			    (entry->vme_start < e)) {
6772 				assert(entry->in_transition);
6773 				entry->in_transition = FALSE;
6774 				if (entry->needs_wakeup) {
6775 					entry->needs_wakeup = FALSE;
6776 					need_wakeup = TRUE;
6777 				}
6778 				if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6779 					subtract_wire_counts(map, entry, user_wire);
6780 				}
6781 				entry = entry->vme_next;
6782 			}
6783 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
6784 				goto done;
6785 			}
6786 
6787 			/* no need to relookup again */
6788 			s = entry->vme_start;
6789 			continue;
6790 		}
6791 
6792 		/*
6793 		 * If this entry is already wired then increment
6794 		 * the appropriate wire reference count.
6795 		 */
6796 		if (entry->wired_count) {
6797 			if ((entry->protection & access_type) != access_type) {
6798 				/* found a protection problem */
6799 
6800 				/*
6801 				 * XXX FBDP
6802 				 * We should always return an error
6803 				 * in this case but since we didn't
6804 				 * enforce it before, let's do
6805 				 * it only for the new "wire_and_extract"
6806 				 * code path for now...
6807 				 */
6808 				if (wire_and_extract) {
6809 					rc = KERN_PROTECTION_FAILURE;
6810 					goto done;
6811 				}
6812 			}
6813 
6814 			/*
6815 			 * entry is already wired down, get our reference
6816 			 * after clipping to our range.
6817 			 */
6818 			vm_map_clip_start(map, entry, s);
6819 			vm_map_clip_end(map, entry, end);
6820 
6821 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6822 				goto done;
6823 			}
6824 
6825 			if (wire_and_extract) {
6826 				vm_object_t             object;
6827 				vm_object_offset_t      offset;
6828 				vm_page_t               m;
6829 
6830 				/*
6831 				 * We don't have to "wire" the page again
6832 				 * bit we still have to "extract" its
6833 				 * physical page number, after some sanity
6834 				 * checks.
6835 				 */
6836 				assert((entry->vme_end - entry->vme_start)
6837 				    == PAGE_SIZE);
6838 				assert(!entry->needs_copy);
6839 				assert(!entry->is_sub_map);
6840 				assert(VME_OBJECT(entry));
6841 				if (((entry->vme_end - entry->vme_start)
6842 				    != PAGE_SIZE) ||
6843 				    entry->needs_copy ||
6844 				    entry->is_sub_map ||
6845 				    VME_OBJECT(entry) == VM_OBJECT_NULL) {
6846 					rc = KERN_INVALID_ARGUMENT;
6847 					goto done;
6848 				}
6849 
6850 				object = VME_OBJECT(entry);
6851 				offset = VME_OFFSET(entry);
6852 				/* need exclusive lock to update m->dirty */
6853 				if (entry->protection & VM_PROT_WRITE) {
6854 					vm_object_lock(object);
6855 				} else {
6856 					vm_object_lock_shared(object);
6857 				}
6858 				m = vm_page_lookup(object, offset);
6859 				assert(m != VM_PAGE_NULL);
6860 				assert(VM_PAGE_WIRED(m));
6861 				if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
6862 					*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6863 					if (entry->protection & VM_PROT_WRITE) {
6864 						vm_object_lock_assert_exclusive(
6865 							object);
6866 						m->vmp_dirty = TRUE;
6867 					}
6868 				} else {
6869 					/* not already wired !? */
6870 					*physpage_p = 0;
6871 				}
6872 				vm_object_unlock(object);
6873 			}
6874 
6875 			/* map was not unlocked: no need to relookup */
6876 			entry = entry->vme_next;
6877 			s = entry->vme_start;
6878 			continue;
6879 		}
6880 
6881 		/*
6882 		 * Unwired entry or wire request transmitted via submap
6883 		 */
6884 
6885 		/*
6886 		 * Wiring would copy the pages to the shadow object.
6887 		 * The shadow object would not be code-signed so
6888 		 * attempting to execute code from these copied pages
6889 		 * would trigger a code-signing violation.
6890 		 */
6891 
6892 		if ((entry->protection & VM_PROT_EXECUTE)
6893 #if XNU_TARGET_OS_OSX
6894 		    &&
6895 		    map->pmap != kernel_pmap &&
6896 		    (vm_map_cs_enforcement(map)
6897 #if __arm64__
6898 		    || !VM_MAP_IS_EXOTIC(map)
6899 #endif /* __arm64__ */
6900 		    )
6901 #endif /* XNU_TARGET_OS_OSX */
6902 #if CODE_SIGNING_MONITOR
6903 		    &&
6904 		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
6905 #endif
6906 		    ) {
6907 #if MACH_ASSERT
6908 			printf("pid %d[%s] wiring executable range from "
6909 			    "0x%llx to 0x%llx: rejected to preserve "
6910 			    "code-signing\n",
6911 			    proc_selfpid(),
6912 			    (get_bsdtask_info(current_task())
6913 			    ? proc_name_address(get_bsdtask_info(current_task()))
6914 			    : "?"),
6915 			    (uint64_t) entry->vme_start,
6916 			    (uint64_t) entry->vme_end);
6917 #endif /* MACH_ASSERT */
6918 			DTRACE_VM2(cs_executable_wire,
6919 			    uint64_t, (uint64_t)entry->vme_start,
6920 			    uint64_t, (uint64_t)entry->vme_end);
6921 			cs_executable_wire++;
6922 			rc = KERN_PROTECTION_FAILURE;
6923 			goto done;
6924 		}
6925 
6926 		/*
6927 		 * Perform actions of vm_map_lookup that need the write
6928 		 * lock on the map: create a shadow object for a
6929 		 * copy-on-write region, or an object for a zero-fill
6930 		 * region.
6931 		 */
6932 		size = entry->vme_end - entry->vme_start;
6933 		/*
6934 		 * If wiring a copy-on-write page, we need to copy it now
6935 		 * even if we're only (currently) requesting read access.
6936 		 * This is aggressive, but once it's wired we can't move it.
6937 		 */
6938 		if (entry->needs_copy) {
6939 			if (wire_and_extract) {
6940 				/*
6941 				 * We're supposed to share with the original
6942 				 * provider so should not be "needs_copy"
6943 				 */
6944 				rc = KERN_INVALID_ARGUMENT;
6945 				goto done;
6946 			}
6947 
6948 			VME_OBJECT_SHADOW(entry, size,
6949 			    vm_map_always_shadow(map));
6950 			entry->needs_copy = FALSE;
6951 		} else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6952 			if (wire_and_extract) {
6953 				/*
6954 				 * We're supposed to share with the original
6955 				 * provider so should already have an object.
6956 				 */
6957 				rc = KERN_INVALID_ARGUMENT;
6958 				goto done;
6959 			}
6960 			VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
6961 			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
6962 			assert(entry->use_pmap);
6963 		} else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6964 			if (wire_and_extract) {
6965 				/*
6966 				 * We're supposed to share with the original
6967 				 * provider so should not be COPY_SYMMETRIC.
6968 				 */
6969 				rc = KERN_INVALID_ARGUMENT;
6970 				goto done;
6971 			}
6972 			/*
6973 			 * Force an unrequested "copy-on-write" but only for
6974 			 * the range we're wiring.
6975 			 */
6976 //			printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
6977 			vm_map_clip_start(map, entry, s);
6978 			vm_map_clip_end(map, entry, end);
6979 			/* recompute "size" */
6980 			size = entry->vme_end - entry->vme_start;
6981 			/* make a shadow object */
6982 			vm_object_t orig_object;
6983 			vm_object_offset_t orig_offset;
6984 			orig_object = VME_OBJECT(entry);
6985 			orig_offset = VME_OFFSET(entry);
6986 			VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
6987 			if (VME_OBJECT(entry) != orig_object) {
6988 				/*
6989 				 * This mapping has not been shared (or it would be
6990 				 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
6991 				 * not been copied-on-write (or it would be marked
6992 				 * as "needs_copy" and would have been handled above
6993 				 * and also already write-protected).
6994 				 * We still need to write-protect here to prevent
6995 				 * other threads from modifying these pages while
6996 				 * we're in the process of copying and wiring
6997 				 * the copied pages.
6998 				 * Since the mapping is neither shared nor COWed,
6999 				 * we only need to write-protect the PTEs for this
7000 				 * mapping.
7001 				 */
7002 				vm_object_pmap_protect(orig_object,
7003 				    orig_offset,
7004 				    size,
7005 				    map->pmap,
7006 				    VM_MAP_PAGE_SIZE(map),
7007 				    entry->vme_start,
7008 				    entry->protection & ~VM_PROT_WRITE);
7009 			}
7010 		}
7011 		if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7012 			/*
7013 			 * Make the object COPY_DELAY to get a stable object
7014 			 * to wire.
7015 			 * That should avoid creating long shadow chains while
7016 			 * wiring/unwiring the same range repeatedly.
7017 			 * That also prevents part of the object from being
7018 			 * wired while another part is "needs_copy", which
7019 			 * could result in conflicting rules wrt copy-on-write.
7020 			 */
7021 			vm_object_t object;
7022 
7023 			object = VME_OBJECT(entry);
7024 			vm_object_lock(object);
7025 			if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7026 				assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
7027 				    "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
7028 				    object, (uint64_t)object->vo_size,
7029 				    entry,
7030 				    (uint64_t)entry->vme_start,
7031 				    (uint64_t)entry->vme_end,
7032 				    (uint64_t)VME_OFFSET(entry),
7033 				    (uint64_t)size);
7034 				assertf(object->ref_count == 1,
7035 				    "object %p ref_count %d\n",
7036 				    object, object->ref_count);
7037 				assertf(!entry->needs_copy,
7038 				    "entry %p\n", entry);
7039 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7040 				object->true_share = TRUE;
7041 			}
7042 			vm_object_unlock(object);
7043 		}
7044 
7045 		vm_map_clip_start(map, entry, s);
7046 		vm_map_clip_end(map, entry, end);
7047 
7048 		/* re-compute "e" */
7049 		e = entry->vme_end;
7050 		if (e > end) {
7051 			e = end;
7052 		}
7053 
7054 		/*
7055 		 * Check for holes and protection mismatch.
7056 		 * Holes: Next entry should be contiguous unless this
7057 		 *	  is the end of the region.
7058 		 * Protection: Access requested must be allowed, unless
7059 		 *	wiring is by protection class
7060 		 */
7061 		if ((entry->vme_end < end) &&
7062 		    ((entry->vme_next == vm_map_to_entry(map)) ||
7063 		    (entry->vme_next->vme_start > entry->vme_end))) {
7064 			/* found a hole */
7065 			rc = KERN_INVALID_ADDRESS;
7066 			goto done;
7067 		}
7068 		if ((entry->protection & access_type) != access_type) {
7069 			/* found a protection problem */
7070 			rc = KERN_PROTECTION_FAILURE;
7071 			goto done;
7072 		}
7073 
7074 		assert(entry->wired_count == 0 && entry->user_wired_count == 0);
7075 
7076 		if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
7077 			goto done;
7078 		}
7079 
7080 		entry->in_transition = TRUE;
7081 
7082 		/*
7083 		 * This entry might get split once we unlock the map.
7084 		 * In vm_fault_wire(), we need the current range as
7085 		 * defined by this entry.  In order for this to work
7086 		 * along with a simultaneous clip operation, we make a
7087 		 * temporary copy of this entry and use that for the
7088 		 * wiring.  Note that the underlying objects do not
7089 		 * change during a clip.
7090 		 */
7091 		tmp_entry = *entry;
7092 
7093 		/*
7094 		 * The in_transition state guarentees that the entry
7095 		 * (or entries for this range, if split occured) will be
7096 		 * there when the map lock is acquired for the second time.
7097 		 */
7098 		vm_map_unlock(map);
7099 
7100 		if (!user_wire && cur_thread != THREAD_NULL) {
7101 			interruptible_state = thread_interrupt_level(THREAD_UNINT);
7102 		} else {
7103 			interruptible_state = THREAD_UNINT;
7104 		}
7105 
7106 		if (map_pmap) {
7107 			rc = vm_fault_wire(map,
7108 			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
7109 			    physpage_p);
7110 		} else {
7111 			rc = vm_fault_wire(map,
7112 			    &tmp_entry, caller_prot, tag, map->pmap,
7113 			    tmp_entry.vme_start,
7114 			    physpage_p);
7115 		}
7116 
7117 		if (!user_wire && cur_thread != THREAD_NULL) {
7118 			thread_interrupt_level(interruptible_state);
7119 		}
7120 
7121 		vm_map_lock(map);
7122 
7123 		if (last_timestamp + 1 != map->timestamp) {
7124 			/*
7125 			 * Find the entry again.  It could have been clipped
7126 			 * after we unlocked the map.
7127 			 */
7128 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7129 			    &first_entry)) {
7130 				panic("vm_map_wire: re-lookup failed");
7131 			}
7132 
7133 			entry = first_entry;
7134 		}
7135 
7136 		last_timestamp = map->timestamp;
7137 
7138 		while ((entry != vm_map_to_entry(map)) &&
7139 		    (entry->vme_start < tmp_entry.vme_end)) {
7140 			assert(entry->in_transition);
7141 			entry->in_transition = FALSE;
7142 			if (entry->needs_wakeup) {
7143 				entry->needs_wakeup = FALSE;
7144 				need_wakeup = TRUE;
7145 			}
7146 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
7147 				subtract_wire_counts(map, entry, user_wire);
7148 			}
7149 			entry = entry->vme_next;
7150 		}
7151 
7152 		if (rc != KERN_SUCCESS) {               /* from vm_*_wire */
7153 			goto done;
7154 		}
7155 
7156 		if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
7157 		    (tmp_entry.vme_end != end) &&    /* AND, we are not at the end of the requested range */
7158 		    (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
7159 			/* found a "new" hole */
7160 			s = tmp_entry.vme_end;
7161 			rc = KERN_INVALID_ADDRESS;
7162 			goto done;
7163 		}
7164 
7165 		s = entry->vme_start;
7166 	} /* end while loop through map entries */
7167 
7168 done:
7169 	if (rc == KERN_SUCCESS) {
7170 		/* repair any damage we may have made to the VM map */
7171 		vm_map_simplify_range(map, start, end);
7172 	}
7173 
7174 	vm_map_unlock(map);
7175 
7176 	/*
7177 	 * wake up anybody waiting on entries we wired.
7178 	 */
7179 	if (need_wakeup) {
7180 		vm_map_entry_wakeup(map);
7181 	}
7182 
7183 	if (rc != KERN_SUCCESS) {
7184 		/* undo what has been wired so far */
7185 		vm_map_unwire_nested(map, start, s, user_wire,
7186 		    map_pmap, pmap_addr);
7187 		if (physpage_p) {
7188 			*physpage_p = 0;
7189 		}
7190 	}
7191 
7192 	return rc;
7193 }
7194 
7195 kern_return_t
vm_map_wire_external(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,boolean_t user_wire)7196 vm_map_wire_external(
7197 	vm_map_t                map,
7198 	vm_map_offset_t         start,
7199 	vm_map_offset_t         end,
7200 	vm_prot_t               caller_prot,
7201 	boolean_t               user_wire)
7202 {
7203 	kern_return_t   kret;
7204 
7205 	kret = vm_map_wire_nested(map, start, end, caller_prot, vm_tag_bt(),
7206 	    user_wire, (pmap_t)NULL, 0, NULL);
7207 	return kret;
7208 }
7209 
7210 kern_return_t
vm_map_wire_kernel(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire)7211 vm_map_wire_kernel(
7212 	vm_map_t                map,
7213 	vm_map_offset_t         start,
7214 	vm_map_offset_t         end,
7215 	vm_prot_t               caller_prot,
7216 	vm_tag_t                tag,
7217 	boolean_t               user_wire)
7218 {
7219 	kern_return_t   kret;
7220 
7221 	kret = vm_map_wire_nested(map, start, end, caller_prot, tag,
7222 	    user_wire, (pmap_t)NULL, 0, NULL);
7223 	return kret;
7224 }
7225 
7226 kern_return_t
vm_map_wire_and_extract_external(vm_map_t map,vm_map_offset_t start,vm_prot_t caller_prot,boolean_t user_wire,ppnum_t * physpage_p)7227 vm_map_wire_and_extract_external(
7228 	vm_map_t        map,
7229 	vm_map_offset_t start,
7230 	vm_prot_t       caller_prot,
7231 	boolean_t       user_wire,
7232 	ppnum_t         *physpage_p)
7233 {
7234 	kern_return_t   kret;
7235 
7236 	kret = vm_map_wire_nested(map,
7237 	    start,
7238 	    start + VM_MAP_PAGE_SIZE(map),
7239 	    caller_prot,
7240 	    vm_tag_bt(),
7241 	    user_wire,
7242 	    (pmap_t)NULL,
7243 	    0,
7244 	    physpage_p);
7245 	if (kret != KERN_SUCCESS &&
7246 	    physpage_p != NULL) {
7247 		*physpage_p = 0;
7248 	}
7249 	return kret;
7250 }
7251 
7252 /*
7253  *	vm_map_unwire:
7254  *
7255  *	Sets the pageability of the specified address range in the target
7256  *	as pageable.  Regions specified must have been wired previously.
7257  *
7258  *	The map must not be locked, but a reference must remain to the map
7259  *	throughout the call.
7260  *
7261  *	Kernel will panic on failures.  User unwire ignores holes and
7262  *	unwired and intransition entries to avoid losing memory by leaving
7263  *	it unwired.
7264  */
7265 static kern_return_t
vm_map_unwire_nested(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,boolean_t user_wire,pmap_t map_pmap,vm_map_offset_t pmap_addr)7266 vm_map_unwire_nested(
7267 	vm_map_t                map,
7268 	vm_map_offset_t         start,
7269 	vm_map_offset_t         end,
7270 	boolean_t               user_wire,
7271 	pmap_t                  map_pmap,
7272 	vm_map_offset_t         pmap_addr)
7273 {
7274 	vm_map_entry_t          entry;
7275 	struct vm_map_entry     *first_entry, tmp_entry;
7276 	boolean_t               need_wakeup;
7277 	boolean_t               main_map = FALSE;
7278 	unsigned int            last_timestamp;
7279 
7280 	vm_map_lock(map);
7281 	if (map_pmap == NULL) {
7282 		main_map = TRUE;
7283 	}
7284 	last_timestamp = map->timestamp;
7285 
7286 	VM_MAP_RANGE_CHECK(map, start, end);
7287 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
7288 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
7289 
7290 	if (start == end) {
7291 		/* We unwired what the caller asked for: zero pages */
7292 		vm_map_unlock(map);
7293 		return KERN_SUCCESS;
7294 	}
7295 
7296 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
7297 		return KERN_INVALID_ADDRESS;
7298 	}
7299 
7300 	if (vm_map_lookup_entry(map, start, &first_entry)) {
7301 		entry = first_entry;
7302 		/*
7303 		 * vm_map_clip_start will be done later.
7304 		 * We don't want to unnest any nested sub maps here !
7305 		 */
7306 	} else {
7307 		if (!user_wire) {
7308 			panic("vm_map_unwire: start not found");
7309 		}
7310 		/*	Start address is not in map. */
7311 		vm_map_unlock(map);
7312 		return KERN_INVALID_ADDRESS;
7313 	}
7314 
7315 	if (entry->superpage_size) {
7316 		/* superpages are always wired */
7317 		vm_map_unlock(map);
7318 		return KERN_INVALID_ADDRESS;
7319 	}
7320 
7321 	need_wakeup = FALSE;
7322 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
7323 		if (entry->in_transition) {
7324 			/*
7325 			 * 1)
7326 			 * Another thread is wiring down this entry. Note
7327 			 * that if it is not for the other thread we would
7328 			 * be unwiring an unwired entry.  This is not
7329 			 * permitted.  If we wait, we will be unwiring memory
7330 			 * we did not wire.
7331 			 *
7332 			 * 2)
7333 			 * Another thread is unwiring this entry.  We did not
7334 			 * have a reference to it, because if we did, this
7335 			 * entry will not be getting unwired now.
7336 			 */
7337 			if (!user_wire) {
7338 				/*
7339 				 * XXX FBDP
7340 				 * This could happen:  there could be some
7341 				 * overlapping vslock/vsunlock operations
7342 				 * going on.
7343 				 * We should probably just wait and retry,
7344 				 * but then we have to be careful that this
7345 				 * entry could get "simplified" after
7346 				 * "in_transition" gets unset and before
7347 				 * we re-lookup the entry, so we would
7348 				 * have to re-clip the entry to avoid
7349 				 * re-unwiring what we have already unwired...
7350 				 * See vm_map_wire_nested().
7351 				 *
7352 				 * Or we could just ignore "in_transition"
7353 				 * here and proceed to decement the wired
7354 				 * count(s) on this entry.  That should be fine
7355 				 * as long as "wired_count" doesn't drop all
7356 				 * the way to 0 (and we should panic if THAT
7357 				 * happens).
7358 				 */
7359 				panic("vm_map_unwire: in_transition entry");
7360 			}
7361 
7362 			entry = entry->vme_next;
7363 			continue;
7364 		}
7365 
7366 		if (entry->is_sub_map) {
7367 			vm_map_offset_t sub_start;
7368 			vm_map_offset_t sub_end;
7369 			vm_map_offset_t local_end;
7370 			pmap_t          pmap;
7371 
7372 			vm_map_clip_start(map, entry, start);
7373 			vm_map_clip_end(map, entry, end);
7374 
7375 			sub_start = VME_OFFSET(entry);
7376 			sub_end = entry->vme_end - entry->vme_start;
7377 			sub_end += VME_OFFSET(entry);
7378 			local_end = entry->vme_end;
7379 			if (map_pmap == NULL) {
7380 				if (entry->use_pmap) {
7381 					pmap = VME_SUBMAP(entry)->pmap;
7382 					pmap_addr = sub_start;
7383 				} else {
7384 					pmap = map->pmap;
7385 					pmap_addr = start;
7386 				}
7387 				if (entry->wired_count == 0 ||
7388 				    (user_wire && entry->user_wired_count == 0)) {
7389 					if (!user_wire) {
7390 						panic("vm_map_unwire: entry is unwired");
7391 					}
7392 					entry = entry->vme_next;
7393 					continue;
7394 				}
7395 
7396 				/*
7397 				 * Check for holes
7398 				 * Holes: Next entry should be contiguous unless
7399 				 * this is the end of the region.
7400 				 */
7401 				if (((entry->vme_end < end) &&
7402 				    ((entry->vme_next == vm_map_to_entry(map)) ||
7403 				    (entry->vme_next->vme_start
7404 				    > entry->vme_end)))) {
7405 					if (!user_wire) {
7406 						panic("vm_map_unwire: non-contiguous region");
7407 					}
7408 /*
7409  *                                       entry = entry->vme_next;
7410  *                                       continue;
7411  */
7412 				}
7413 
7414 				subtract_wire_counts(map, entry, user_wire);
7415 
7416 				if (entry->wired_count != 0) {
7417 					entry = entry->vme_next;
7418 					continue;
7419 				}
7420 
7421 				entry->in_transition = TRUE;
7422 				tmp_entry = *entry;/* see comment in vm_map_wire() */
7423 
7424 				/*
7425 				 * We can unlock the map now. The in_transition state
7426 				 * guarantees existance of the entry.
7427 				 */
7428 				vm_map_unlock(map);
7429 				vm_map_unwire_nested(VME_SUBMAP(entry),
7430 				    sub_start, sub_end, user_wire, pmap, pmap_addr);
7431 				vm_map_lock(map);
7432 
7433 				if (last_timestamp + 1 != map->timestamp) {
7434 					/*
7435 					 * Find the entry again.  It could have been
7436 					 * clipped or deleted after we unlocked the map.
7437 					 */
7438 					if (!vm_map_lookup_entry(map,
7439 					    tmp_entry.vme_start,
7440 					    &first_entry)) {
7441 						if (!user_wire) {
7442 							panic("vm_map_unwire: re-lookup failed");
7443 						}
7444 						entry = first_entry->vme_next;
7445 					} else {
7446 						entry = first_entry;
7447 					}
7448 				}
7449 				last_timestamp = map->timestamp;
7450 
7451 				/*
7452 				 * clear transition bit for all constituent entries
7453 				 * that were in the original entry (saved in
7454 				 * tmp_entry).  Also check for waiters.
7455 				 */
7456 				while ((entry != vm_map_to_entry(map)) &&
7457 				    (entry->vme_start < tmp_entry.vme_end)) {
7458 					assert(entry->in_transition);
7459 					entry->in_transition = FALSE;
7460 					if (entry->needs_wakeup) {
7461 						entry->needs_wakeup = FALSE;
7462 						need_wakeup = TRUE;
7463 					}
7464 					entry = entry->vme_next;
7465 				}
7466 				continue;
7467 			} else {
7468 				tmp_entry = *entry;
7469 				vm_map_unlock(map);
7470 				vm_map_unwire_nested(VME_SUBMAP(entry),
7471 				    sub_start, sub_end, user_wire, map_pmap,
7472 				    pmap_addr);
7473 				vm_map_lock(map);
7474 
7475 				if (last_timestamp + 1 != map->timestamp) {
7476 					/*
7477 					 * Find the entry again.  It could have been
7478 					 * clipped or deleted after we unlocked the map.
7479 					 */
7480 					if (!vm_map_lookup_entry(map,
7481 					    tmp_entry.vme_start,
7482 					    &first_entry)) {
7483 						if (!user_wire) {
7484 							panic("vm_map_unwire: re-lookup failed");
7485 						}
7486 						entry = first_entry->vme_next;
7487 					} else {
7488 						entry = first_entry;
7489 					}
7490 				}
7491 				last_timestamp = map->timestamp;
7492 			}
7493 		}
7494 
7495 
7496 		if ((entry->wired_count == 0) ||
7497 		    (user_wire && entry->user_wired_count == 0)) {
7498 			if (!user_wire) {
7499 				panic("vm_map_unwire: entry is unwired");
7500 			}
7501 
7502 			entry = entry->vme_next;
7503 			continue;
7504 		}
7505 
7506 		assert(entry->wired_count > 0 &&
7507 		    (!user_wire || entry->user_wired_count > 0));
7508 
7509 		vm_map_clip_start(map, entry, start);
7510 		vm_map_clip_end(map, entry, end);
7511 
7512 		/*
7513 		 * Check for holes
7514 		 * Holes: Next entry should be contiguous unless
7515 		 *	  this is the end of the region.
7516 		 */
7517 		if (((entry->vme_end < end) &&
7518 		    ((entry->vme_next == vm_map_to_entry(map)) ||
7519 		    (entry->vme_next->vme_start > entry->vme_end)))) {
7520 			if (!user_wire) {
7521 				panic("vm_map_unwire: non-contiguous region");
7522 			}
7523 			entry = entry->vme_next;
7524 			continue;
7525 		}
7526 
7527 		subtract_wire_counts(map, entry, user_wire);
7528 
7529 		if (entry->wired_count != 0) {
7530 			entry = entry->vme_next;
7531 			continue;
7532 		}
7533 
7534 		if (entry->zero_wired_pages) {
7535 			entry->zero_wired_pages = FALSE;
7536 		}
7537 
7538 		entry->in_transition = TRUE;
7539 		tmp_entry = *entry;     /* see comment in vm_map_wire() */
7540 
7541 		/*
7542 		 * We can unlock the map now. The in_transition state
7543 		 * guarantees existance of the entry.
7544 		 */
7545 		vm_map_unlock(map);
7546 		if (map_pmap) {
7547 			vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
7548 			    pmap_addr, tmp_entry.vme_end);
7549 		} else {
7550 			vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
7551 			    tmp_entry.vme_start, tmp_entry.vme_end);
7552 		}
7553 		vm_map_lock(map);
7554 
7555 		if (last_timestamp + 1 != map->timestamp) {
7556 			/*
7557 			 * Find the entry again.  It could have been clipped
7558 			 * or deleted after we unlocked the map.
7559 			 */
7560 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7561 			    &first_entry)) {
7562 				if (!user_wire) {
7563 					panic("vm_map_unwire: re-lookup failed");
7564 				}
7565 				entry = first_entry->vme_next;
7566 			} else {
7567 				entry = first_entry;
7568 			}
7569 		}
7570 		last_timestamp = map->timestamp;
7571 
7572 		/*
7573 		 * clear transition bit for all constituent entries that
7574 		 * were in the original entry (saved in tmp_entry).  Also
7575 		 * check for waiters.
7576 		 */
7577 		while ((entry != vm_map_to_entry(map)) &&
7578 		    (entry->vme_start < tmp_entry.vme_end)) {
7579 			assert(entry->in_transition);
7580 			entry->in_transition = FALSE;
7581 			if (entry->needs_wakeup) {
7582 				entry->needs_wakeup = FALSE;
7583 				need_wakeup = TRUE;
7584 			}
7585 			entry = entry->vme_next;
7586 		}
7587 	}
7588 
7589 	/*
7590 	 * We might have fragmented the address space when we wired this
7591 	 * range of addresses.  Attempt to re-coalesce these VM map entries
7592 	 * with their neighbors now that they're no longer wired.
7593 	 * Under some circumstances, address space fragmentation can
7594 	 * prevent VM object shadow chain collapsing, which can cause
7595 	 * swap space leaks.
7596 	 */
7597 	vm_map_simplify_range(map, start, end);
7598 
7599 	vm_map_unlock(map);
7600 	/*
7601 	 * wake up anybody waiting on entries that we have unwired.
7602 	 */
7603 	if (need_wakeup) {
7604 		vm_map_entry_wakeup(map);
7605 	}
7606 	return KERN_SUCCESS;
7607 }
7608 
7609 kern_return_t
vm_map_unwire(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,boolean_t user_wire)7610 vm_map_unwire(
7611 	vm_map_t                map,
7612 	vm_map_offset_t         start,
7613 	vm_map_offset_t         end,
7614 	boolean_t               user_wire)
7615 {
7616 	return vm_map_unwire_nested(map, start, end,
7617 	           user_wire, (pmap_t)NULL, 0);
7618 }
7619 
7620 
7621 /*
7622  *	vm_map_entry_zap:	[ internal use only ]
7623  *
7624  *	Remove the entry from the target map
7625  *	and put it on a zap list.
7626  */
7627 static void
vm_map_entry_zap(vm_map_t map,vm_map_entry_t entry,vm_map_zap_t zap)7628 vm_map_entry_zap(
7629 	vm_map_t                map,
7630 	vm_map_entry_t          entry,
7631 	vm_map_zap_t            zap)
7632 {
7633 	vm_map_offset_t s, e;
7634 
7635 	s = entry->vme_start;
7636 	e = entry->vme_end;
7637 	assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7638 	assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7639 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7640 		assert(page_aligned(s));
7641 		assert(page_aligned(e));
7642 	}
7643 	if (entry->map_aligned == TRUE) {
7644 		assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7645 		assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7646 	}
7647 	assert(entry->wired_count == 0);
7648 	assert(entry->user_wired_count == 0);
7649 	assert(!entry->vme_permanent);
7650 
7651 	vm_map_store_entry_unlink(map, entry, false);
7652 	map->size -= e - s;
7653 
7654 	vm_map_zap_append(zap, entry);
7655 }
7656 
7657 static void
vm_map_submap_pmap_clean(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_t sub_map,vm_map_offset_t offset)7658 vm_map_submap_pmap_clean(
7659 	vm_map_t        map,
7660 	vm_map_offset_t start,
7661 	vm_map_offset_t end,
7662 	vm_map_t        sub_map,
7663 	vm_map_offset_t offset)
7664 {
7665 	vm_map_offset_t submap_start;
7666 	vm_map_offset_t submap_end;
7667 	vm_map_size_t   remove_size;
7668 	vm_map_entry_t  entry;
7669 
7670 	submap_end = offset + (end - start);
7671 	submap_start = offset;
7672 
7673 	vm_map_lock_read(sub_map);
7674 	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
7675 		remove_size = (entry->vme_end - entry->vme_start);
7676 		if (offset > entry->vme_start) {
7677 			remove_size -= offset - entry->vme_start;
7678 		}
7679 
7680 
7681 		if (submap_end < entry->vme_end) {
7682 			remove_size -=
7683 			    entry->vme_end - submap_end;
7684 		}
7685 		if (entry->is_sub_map) {
7686 			vm_map_submap_pmap_clean(
7687 				sub_map,
7688 				start,
7689 				start + remove_size,
7690 				VME_SUBMAP(entry),
7691 				VME_OFFSET(entry));
7692 		} else {
7693 			if (map->mapped_in_other_pmaps &&
7694 			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7695 			    VME_OBJECT(entry) != NULL) {
7696 				vm_object_pmap_protect_options(
7697 					VME_OBJECT(entry),
7698 					(VME_OFFSET(entry) +
7699 					offset -
7700 					entry->vme_start),
7701 					remove_size,
7702 					PMAP_NULL,
7703 					PAGE_SIZE,
7704 					entry->vme_start,
7705 					VM_PROT_NONE,
7706 					PMAP_OPTIONS_REMOVE);
7707 			} else {
7708 				pmap_remove(map->pmap,
7709 				    (addr64_t)start,
7710 				    (addr64_t)(start + remove_size));
7711 			}
7712 		}
7713 	}
7714 
7715 	entry = entry->vme_next;
7716 
7717 	while ((entry != vm_map_to_entry(sub_map))
7718 	    && (entry->vme_start < submap_end)) {
7719 		remove_size = (entry->vme_end - entry->vme_start);
7720 		if (submap_end < entry->vme_end) {
7721 			remove_size -= entry->vme_end - submap_end;
7722 		}
7723 		if (entry->is_sub_map) {
7724 			vm_map_submap_pmap_clean(
7725 				sub_map,
7726 				(start + entry->vme_start) - offset,
7727 				((start + entry->vme_start) - offset) + remove_size,
7728 				VME_SUBMAP(entry),
7729 				VME_OFFSET(entry));
7730 		} else {
7731 			if (map->mapped_in_other_pmaps &&
7732 			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7733 			    VME_OBJECT(entry) != NULL) {
7734 				vm_object_pmap_protect_options(
7735 					VME_OBJECT(entry),
7736 					VME_OFFSET(entry),
7737 					remove_size,
7738 					PMAP_NULL,
7739 					PAGE_SIZE,
7740 					entry->vme_start,
7741 					VM_PROT_NONE,
7742 					PMAP_OPTIONS_REMOVE);
7743 			} else {
7744 				pmap_remove(map->pmap,
7745 				    (addr64_t)((start + entry->vme_start)
7746 				    - offset),
7747 				    (addr64_t)(((start + entry->vme_start)
7748 				    - offset) + remove_size));
7749 			}
7750 		}
7751 		entry = entry->vme_next;
7752 	}
7753 	vm_map_unlock_read(sub_map);
7754 	return;
7755 }
7756 
7757 /*
7758  *     virt_memory_guard_ast:
7759  *
7760  *     Handle the AST callout for a virtual memory guard.
7761  *	   raise an EXC_GUARD exception and terminate the task
7762  *     if configured to do so.
7763  */
7764 void
virt_memory_guard_ast(thread_t thread,mach_exception_data_type_t code,mach_exception_data_type_t subcode)7765 virt_memory_guard_ast(
7766 	thread_t thread,
7767 	mach_exception_data_type_t code,
7768 	mach_exception_data_type_t subcode)
7769 {
7770 	task_t task = get_threadtask(thread);
7771 	assert(task != kernel_task);
7772 	assert(task == current_task());
7773 	kern_return_t sync_exception_result;
7774 	uint32_t behavior;
7775 
7776 	behavior = task->task_exc_guard;
7777 
7778 	/* Is delivery enabled */
7779 	if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7780 		return;
7781 	}
7782 
7783 	/* If only once, make sure we're that once */
7784 	while (behavior & TASK_EXC_GUARD_VM_ONCE) {
7785 		uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;
7786 
7787 		if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
7788 			break;
7789 		}
7790 		behavior = task->task_exc_guard;
7791 		if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7792 			return;
7793 		}
7794 	}
7795 
7796 	/* Raise exception synchronously and see if handler claimed it */
7797 	sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode);
7798 
7799 	if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7800 		/*
7801 		 * If Synchronous EXC_GUARD delivery was successful then
7802 		 * kill the process and return, else kill the process
7803 		 * and deliver the exception via EXC_CORPSE_NOTIFY.
7804 		 */
7805 		if (sync_exception_result == KERN_SUCCESS) {
7806 			task_bsdtask_kill(current_task());
7807 		} else {
7808 			exit_with_guard_exception(current_proc(), code, subcode);
7809 		}
7810 	} else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
7811 		/*
7812 		 * If the synchronous EXC_GUARD delivery was not successful,
7813 		 * raise a simulated crash.
7814 		 */
7815 		if (sync_exception_result != KERN_SUCCESS) {
7816 			task_violated_guard(code, subcode, NULL, FALSE);
7817 		}
7818 	}
7819 }
7820 
7821 /*
7822  *     vm_map_guard_exception:
7823  *
7824  *     Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
7825  *
7826  *     Right now, we do this when we find nothing mapped, or a
7827  *     gap in the mapping when a user address space deallocate
7828  *     was requested. We report the address of the first gap found.
7829  */
7830 static void
vm_map_guard_exception(vm_map_offset_t gap_start,unsigned reason)7831 vm_map_guard_exception(
7832 	vm_map_offset_t gap_start,
7833 	unsigned reason)
7834 {
7835 	mach_exception_code_t code = 0;
7836 	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7837 	unsigned int target = 0; /* should we pass in pid associated with map? */
7838 	mach_exception_data_type_t subcode = (uint64_t)gap_start;
7839 	boolean_t fatal = FALSE;
7840 
7841 	task_t task = current_task_early();
7842 
7843 	/* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7844 	if (task == NULL || task == kernel_task) {
7845 		return;
7846 	}
7847 
7848 	EXC_GUARD_ENCODE_TYPE(code, guard_type);
7849 	EXC_GUARD_ENCODE_FLAVOR(code, reason);
7850 	EXC_GUARD_ENCODE_TARGET(code, target);
7851 
7852 	if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7853 		fatal = TRUE;
7854 	}
7855 	thread_guard_violation(current_thread(), code, subcode, fatal);
7856 }
7857 
7858 static kern_return_t
vm_map_delete_submap_recurse(vm_map_t submap,vm_map_offset_t submap_start,vm_map_offset_t submap_end)7859 vm_map_delete_submap_recurse(
7860 	vm_map_t submap,
7861 	vm_map_offset_t submap_start,
7862 	vm_map_offset_t submap_end)
7863 {
7864 	vm_map_entry_t submap_entry;
7865 
7866 	/*
7867 	 * Verify that the submap does not contain any "permanent" entries
7868 	 * within the specified range.
7869 	 * We do not care about gaps.
7870 	 */
7871 
7872 	vm_map_lock(submap);
7873 
7874 	if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
7875 		submap_entry = submap_entry->vme_next;
7876 	}
7877 
7878 	for (;
7879 	    submap_entry != vm_map_to_entry(submap) &&
7880 	    submap_entry->vme_start < submap_end;
7881 	    submap_entry = submap_entry->vme_next) {
7882 		if (submap_entry->vme_permanent) {
7883 			/* "permanent" entry -> fail */
7884 			vm_map_unlock(submap);
7885 			return KERN_PROTECTION_FAILURE;
7886 		}
7887 	}
7888 	/* no "permanent" entries in the range -> success */
7889 	vm_map_unlock(submap);
7890 	return KERN_SUCCESS;
7891 }
7892 
7893 __abortlike
7894 static void
__vm_map_delete_misaligned_panic(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)7895 __vm_map_delete_misaligned_panic(
7896 	vm_map_t                map,
7897 	vm_map_offset_t         start,
7898 	vm_map_offset_t         end)
7899 {
7900 	panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
7901 	    map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
7902 }
7903 
7904 __abortlike
7905 static void
__vm_map_delete_failed_panic(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,kern_return_t kr)7906 __vm_map_delete_failed_panic(
7907 	vm_map_t                map,
7908 	vm_map_offset_t         start,
7909 	vm_map_offset_t         end,
7910 	kern_return_t           kr)
7911 {
7912 	panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
7913 	    map, (uint64_t)start, (uint64_t)end, kr);
7914 }
7915 
7916 __abortlike
7917 static void
__vm_map_delete_gap_panic(vm_map_t map,vm_map_offset_t where,vm_map_offset_t start,vm_map_offset_t end)7918 __vm_map_delete_gap_panic(
7919 	vm_map_t                map,
7920 	vm_map_offset_t         where,
7921 	vm_map_offset_t         start,
7922 	vm_map_offset_t         end)
7923 {
7924 	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
7925 	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
7926 }
7927 
7928 __abortlike
7929 static void
__vm_map_delete_permanent_panic(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_entry_t entry)7930 __vm_map_delete_permanent_panic(
7931 	vm_map_t                map,
7932 	vm_map_offset_t         start,
7933 	vm_map_offset_t         end,
7934 	vm_map_entry_t          entry)
7935 {
7936 	panic("vm_map_delete(%p,0x%llx,0x%llx): "
7937 	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
7938 	    map, (uint64_t)start, (uint64_t)end, entry,
7939 	    (uint64_t)entry->vme_start,
7940 	    (uint64_t)entry->vme_end);
7941 }
7942 
7943 __options_decl(vm_map_delete_state_t, uint32_t, {
7944 	VMDS_NONE               = 0x0000,
7945 
7946 	VMDS_FOUND_GAP          = 0x0001,
7947 	VMDS_GAPS_OK            = 0x0002,
7948 
7949 	VMDS_KERNEL_PMAP        = 0x0004,
7950 	VMDS_NEEDS_LOOKUP       = 0x0008,
7951 	VMDS_NEEDS_WAKEUP       = 0x0010,
7952 	VMDS_KERNEL_KMEMPTR     = 0x0020
7953 });
7954 
7955 /*
7956  *	vm_map_delete:	[ internal use only ]
7957  *
7958  *	Deallocates the given address range from the target map.
7959  *	Removes all user wirings. Unwires one kernel wiring if
7960  *	VM_MAP_REMOVE_KUNWIRE is set.  Waits for kernel wirings to go
7961  *	away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set.  Sleeps
7962  *	interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
7963  *
7964  *
7965  *	When the map is a kernel map, then any error in removing mappings
7966  *	will lead to a panic so that clients do not have to repeat the panic
7967  *	code at each call site.  If VM_MAP_REMOVE_INTERRUPTIBLE
7968  *	is also passed, then KERN_ABORTED will not lead to a panic.
7969  *
7970  *	This routine is called with map locked and leaves map locked.
7971  */
7972 static kmem_return_t
vm_map_delete(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard,vm_map_zap_t zap_list)7973 vm_map_delete(
7974 	vm_map_t                map,
7975 	vm_map_offset_t         start,
7976 	vm_map_offset_t         end,
7977 	vmr_flags_t             flags,
7978 	kmem_guard_t            guard,
7979 	vm_map_zap_t            zap_list)
7980 {
7981 	vm_map_entry_t          entry, next;
7982 	int                     interruptible;
7983 	vm_map_offset_t         gap_start = 0;
7984 	vm_map_offset_t         clear_in_transition_end = 0;
7985 	__unused vm_map_offset_t save_start = start;
7986 	__unused vm_map_offset_t save_end = end;
7987 	vm_map_delete_state_t   state = VMDS_NONE;
7988 	kmem_return_t           ret = { };
7989 	vm_map_range_id_t       range_id = 0;
7990 	struct kmem_page_meta  *meta = NULL;
7991 	uint32_t                size_idx, slot_idx;
7992 	struct mach_vm_range    slot;
7993 
7994 	if (vm_map_pmap(map) == kernel_pmap) {
7995 		state |= VMDS_KERNEL_PMAP;
7996 		range_id = kmem_addr_get_range(start, end - start);
7997 		if (kmem_is_ptr_range(range_id)) {
7998 			state |= VMDS_KERNEL_KMEMPTR;
7999 			slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
8000 			    &size_idx, &slot);
8001 		}
8002 	}
8003 
8004 	if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
8005 		state |= VMDS_GAPS_OK;
8006 	}
8007 
8008 	interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
8009 	    THREAD_ABORTSAFE : THREAD_UNINT;
8010 
8011 	if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
8012 	    (start & VM_MAP_PAGE_MASK(map))) {
8013 		__vm_map_delete_misaligned_panic(map, start, end);
8014 	}
8015 
8016 	if ((state & VMDS_GAPS_OK) == 0) {
8017 		/*
8018 		 * If the map isn't terminated then all deletions must have
8019 		 * no gaps, and be within the [min, max) of the map.
8020 		 *
8021 		 * We got here without VM_MAP_RANGE_CHECK() being called,
8022 		 * and hence must validate bounds manually.
8023 		 *
8024 		 * It is worth noting that because vm_deallocate() will
8025 		 * round_page() the deallocation size, it's possible for "end"
8026 		 * to be 0 here due to overflow. We hence must treat it as being
8027 		 * beyond vm_map_max(map).
8028 		 *
8029 		 * Similarly, end < start means some wrap around happend,
8030 		 * which should cause an error or panic.
8031 		 */
8032 		if (end == 0 || end > vm_map_max(map)) {
8033 			state |= VMDS_FOUND_GAP;
8034 			gap_start = vm_map_max(map);
8035 			if (state & VMDS_KERNEL_PMAP) {
8036 				__vm_map_delete_gap_panic(map,
8037 				    gap_start, start, end);
8038 			}
8039 			goto out;
8040 		}
8041 
8042 		if (end < start) {
8043 			if (state & VMDS_KERNEL_PMAP) {
8044 				__vm_map_delete_gap_panic(map,
8045 				    vm_map_max(map), start, end);
8046 			}
8047 			ret.kmr_return = KERN_INVALID_ARGUMENT;
8048 			goto out;
8049 		}
8050 
8051 		if (start < vm_map_min(map)) {
8052 			state |= VMDS_FOUND_GAP;
8053 			gap_start = start;
8054 			if (state & VMDS_KERNEL_PMAP) {
8055 				__vm_map_delete_gap_panic(map,
8056 				    gap_start, start, end);
8057 			}
8058 			goto out;
8059 		}
8060 	} else {
8061 		/*
8062 		 * If the map is terminated, we must accept start/end
8063 		 * being beyond the boundaries of the map as this is
8064 		 * how some of the mappings like commpage mappings
8065 		 * can be destroyed (they're outside of those bounds).
8066 		 *
8067 		 * end < start is still something we can't cope with,
8068 		 * so just bail.
8069 		 */
8070 		if (end < start) {
8071 			goto out;
8072 		}
8073 	}
8074 
8075 
8076 	/*
8077 	 *	Find the start of the region.
8078 	 *
8079 	 *	If in a superpage, extend the range
8080 	 *	to include the start of the mapping.
8081 	 */
8082 	while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8083 		if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8084 			start = SUPERPAGE_ROUND_DOWN(start);
8085 		} else {
8086 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8087 			break;
8088 		}
8089 	}
8090 
8091 	if (entry->superpage_size) {
8092 		end = SUPERPAGE_ROUND_UP(end);
8093 	}
8094 
8095 	/*
8096 	 *	Step through all entries in this region
8097 	 */
8098 	for (vm_map_offset_t s = start; s < end;) {
8099 		/*
8100 		 * At this point, we have deleted all the memory entries
8101 		 * in [start, s) and are proceeding with the [s, end) range.
8102 		 *
8103 		 * This loop might drop the map lock, and it is possible that
8104 		 * some memory was already reallocated within [start, s)
8105 		 * and we don't want to mess with those entries.
8106 		 *
8107 		 * Some of those entries could even have been re-assembled
8108 		 * with an entry after "s" (in vm_map_simplify_entry()), so
8109 		 * we may have to vm_map_clip_start() again.
8110 		 *
8111 		 * When clear_in_transition_end is set, the we had marked
8112 		 * [start, clear_in_transition_end) as "in_transition"
8113 		 * during a previous iteration and we need to clear it.
8114 		 */
8115 
8116 		/*
8117 		 * Step 1: If needed (because we dropped locks),
8118 		 *         lookup the entry again.
8119 		 *
8120 		 *         If we're coming back from unwiring (Step 5),
8121 		 *         we also need to mark the entries as no longer
8122 		 *         in transition after that.
8123 		 */
8124 
8125 		if (state & VMDS_NEEDS_LOOKUP) {
8126 			state &= ~VMDS_NEEDS_LOOKUP;
8127 
8128 			if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8129 				SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8130 			}
8131 
8132 			if (state & VMDS_KERNEL_KMEMPTR) {
8133 				kmem_validate_slot(s, meta, size_idx, slot_idx);
8134 			}
8135 		}
8136 
8137 		if (clear_in_transition_end) {
8138 			for (vm_map_entry_t it = entry;
8139 			    it != vm_map_to_entry(map) &&
8140 			    it->vme_start < clear_in_transition_end;
8141 			    it = it->vme_next) {
8142 				assert(it->in_transition);
8143 				it->in_transition = FALSE;
8144 				if (it->needs_wakeup) {
8145 					it->needs_wakeup = FALSE;
8146 					state |= VMDS_NEEDS_WAKEUP;
8147 				}
8148 			}
8149 
8150 			clear_in_transition_end = 0;
8151 		}
8152 
8153 
8154 		/*
8155 		 * Step 2: Perform various policy checks
8156 		 *         before we do _anything_ to this entry.
8157 		 */
8158 
8159 		if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8160 			if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8161 				/*
8162 				 * Either we found a gap already,
8163 				 * or we are tearing down a map,
8164 				 * keep going.
8165 				 */
8166 			} else if (state & VMDS_KERNEL_PMAP) {
8167 				__vm_map_delete_gap_panic(map, s, start, end);
8168 			} else if (s < end) {
8169 				state |= VMDS_FOUND_GAP;
8170 				gap_start = s;
8171 			}
8172 
8173 			if (entry == vm_map_to_entry(map) ||
8174 			    end <= entry->vme_start) {
8175 				break;
8176 			}
8177 
8178 			s = entry->vme_start;
8179 		}
8180 
8181 		if (state & VMDS_KERNEL_PMAP) {
8182 			/*
8183 			 * In the kernel map and its submaps,
8184 			 * permanent entries never die, even
8185 			 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8186 			 */
8187 			if (entry->vme_permanent) {
8188 				__vm_map_delete_permanent_panic(map, start, end, entry);
8189 			}
8190 
8191 			if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8192 				end = entry->vme_end;
8193 				flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8194 			}
8195 
8196 			/*
8197 			 * In the kernel map and its submaps,
8198 			 * the removal of an atomic/guarded entry is strict.
8199 			 *
8200 			 * An atomic entry is processed only if it was
8201 			 * specifically targeted.
8202 			 *
8203 			 * We might have deleted non-atomic entries before
8204 			 * we reach this this point however...
8205 			 */
8206 			kmem_entry_validate_guard(map, entry,
8207 			    start, end - start, guard);
8208 		}
8209 
8210 		/*
8211 		 * Step 2.1: handle "permanent" and "submap" entries
8212 		 * *before* clipping to avoid triggering some unnecessary
8213 		 * un-nesting of the shared region.
8214 		 */
8215 		if (entry->vme_permanent && entry->is_sub_map) {
8216 //			printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8217 			/*
8218 			 * Un-mapping a "permanent" mapping of a user-space
8219 			 * submap is not allowed unless...
8220 			 */
8221 			if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8222 				/*
8223 				 * a. explicitly requested by the kernel caller.
8224 				 */
8225 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8226 			} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8227 			    developer_mode_state()) {
8228 				/*
8229 				 * b. we're in "developer" mode (for
8230 				 *    breakpoints, dtrace probes, ...).
8231 				 */
8232 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8233 			} else if (map->terminated) {
8234 				/*
8235 				 * c. this is the final address space cleanup.
8236 				 */
8237 //				printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8238 			} else {
8239 				vm_map_offset_t submap_start, submap_end;
8240 				kern_return_t submap_kr;
8241 
8242 				/*
8243 				 * Check if there are any "permanent" mappings
8244 				 * in this range in the submap.
8245 				 */
8246 				if (entry->in_transition) {
8247 					/* can that even happen ? */
8248 					goto in_transition;
8249 				}
8250 				/* compute the clipped range in the submap */
8251 				submap_start = s - entry->vme_start;
8252 				submap_start += VME_OFFSET(entry);
8253 				submap_end = end - entry->vme_start;
8254 				submap_end += VME_OFFSET(entry);
8255 				submap_kr = vm_map_delete_submap_recurse(
8256 					VME_SUBMAP(entry),
8257 					submap_start,
8258 					submap_end);
8259 				if (submap_kr != KERN_SUCCESS) {
8260 					/*
8261 					 * There are some "permanent" mappings
8262 					 * in the submap: we are not allowed
8263 					 * to remove this range.
8264 					 */
8265 					printf("%d[%s] removing permanent submap entry "
8266 					    "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8267 					    proc_selfpid(),
8268 					    (get_bsdtask_info(current_task())
8269 					    ? proc_name_address(get_bsdtask_info(current_task()))
8270 					    : "?"), entry,
8271 					    (uint64_t)entry->vme_start,
8272 					    (uint64_t)entry->vme_end,
8273 					    entry->protection,
8274 					    entry->max_protection);
8275 					DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8276 					    vm_map_entry_t, entry,
8277 					    vm_map_offset_t, entry->vme_start,
8278 					    vm_map_offset_t, entry->vme_end,
8279 					    vm_prot_t, entry->protection,
8280 					    vm_prot_t, entry->max_protection,
8281 					    int, VME_ALIAS(entry));
8282 					ret.kmr_return = KERN_PROTECTION_FAILURE;
8283 					goto out;
8284 				}
8285 				/* no permanent mappings: proceed */
8286 			}
8287 		}
8288 
8289 		/*
8290 		 * Step 3: Perform any clipping needed.
8291 		 *
8292 		 *         After this, "entry" starts at "s", ends before "end"
8293 		 */
8294 
8295 		if (entry->vme_start < s) {
8296 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8297 			    entry->map_aligned &&
8298 			    !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8299 				/*
8300 				 * The entry will no longer be map-aligned
8301 				 * after clipping and the caller said it's OK.
8302 				 */
8303 				entry->map_aligned = FALSE;
8304 			}
8305 			vm_map_clip_start(map, entry, s);
8306 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8307 		}
8308 
8309 		if (end < entry->vme_end) {
8310 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8311 			    entry->map_aligned &&
8312 			    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8313 				/*
8314 				 * The entry will no longer be map-aligned
8315 				 * after clipping and the caller said it's OK.
8316 				 */
8317 				entry->map_aligned = FALSE;
8318 			}
8319 			vm_map_clip_end(map, entry, end);
8320 		}
8321 
8322 		if (entry->vme_permanent && entry->is_sub_map) {
8323 			/*
8324 			 * We already went through step 2.1 which did not deny
8325 			 * the removal of this "permanent" and "is_sub_map"
8326 			 * entry.
8327 			 * Now that we've clipped what we actually want to
8328 			 * delete, undo the "permanent" part to allow the
8329 			 * removal to proceed.
8330 			 */
8331 			DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8332 			    vm_map_entry_t, entry,
8333 			    vm_map_offset_t, entry->vme_start,
8334 			    vm_map_offset_t, entry->vme_end,
8335 			    vm_prot_t, entry->protection,
8336 			    vm_prot_t, entry->max_protection,
8337 			    int, VME_ALIAS(entry));
8338 			entry->vme_permanent = false;
8339 		}
8340 
8341 		assert(s == entry->vme_start);
8342 		assert(entry->vme_end <= end);
8343 
8344 
8345 		/*
8346 		 * Step 4: If the entry is in flux, wait for this to resolve.
8347 		 */
8348 
8349 		if (entry->in_transition) {
8350 			wait_result_t wait_result;
8351 
8352 in_transition:
8353 			/*
8354 			 * Another thread is wiring/unwiring this entry.
8355 			 * Let the other thread know we are waiting.
8356 			 */
8357 
8358 			entry->needs_wakeup = TRUE;
8359 
8360 			/*
8361 			 * wake up anybody waiting on entries that we have
8362 			 * already unwired/deleted.
8363 			 */
8364 			if (state & VMDS_NEEDS_WAKEUP) {
8365 				vm_map_entry_wakeup(map);
8366 				state &= ~VMDS_NEEDS_WAKEUP;
8367 			}
8368 
8369 			wait_result = vm_map_entry_wait(map, interruptible);
8370 
8371 			if (interruptible &&
8372 			    wait_result == THREAD_INTERRUPTED) {
8373 				/*
8374 				 * We do not clear the needs_wakeup flag,
8375 				 * since we cannot tell if we were the only one.
8376 				 */
8377 				ret.kmr_return = KERN_ABORTED;
8378 				return ret;
8379 			}
8380 
8381 			/*
8382 			 * The entry could have been clipped or it
8383 			 * may not exist anymore.  Look it up again.
8384 			 */
8385 			state |= VMDS_NEEDS_LOOKUP;
8386 			continue;
8387 		}
8388 
8389 
8390 		/*
8391 		 * Step 5: Handle wiring
8392 		 */
8393 
8394 		if (entry->wired_count) {
8395 			struct vm_map_entry tmp_entry;
8396 			boolean_t           user_wire;
8397 			unsigned int        last_timestamp;
8398 
8399 			user_wire = entry->user_wired_count > 0;
8400 
8401 			/*
8402 			 *      Remove a kernel wiring if requested
8403 			 */
8404 			if (flags & VM_MAP_REMOVE_KUNWIRE) {
8405 				entry->wired_count--;
8406 			}
8407 
8408 			/*
8409 			 *	Remove all user wirings for proper accounting
8410 			 */
8411 			while (entry->user_wired_count) {
8412 				subtract_wire_counts(map, entry, user_wire);
8413 			}
8414 
8415 			/*
8416 			 * All our DMA I/O operations in IOKit are currently
8417 			 * done by wiring through the map entries of the task
8418 			 * requesting the I/O.
8419 			 *
8420 			 * Because of this, we must always wait for kernel wirings
8421 			 * to go away on the entries before deleting them.
8422 			 *
8423 			 * Any caller who wants to actually remove a kernel wiring
8424 			 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8425 			 * properly remove one wiring instead of blasting through
8426 			 * them all.
8427 			 */
8428 			if (entry->wired_count != 0) {
8429 				assert(map != kernel_map);
8430 				/*
8431 				 * Cannot continue.  Typical case is when
8432 				 * a user thread has physical io pending on
8433 				 * on this page.  Either wait for the
8434 				 * kernel wiring to go away or return an
8435 				 * error.
8436 				 */
8437 				wait_result_t wait_result;
8438 
8439 				entry->needs_wakeup = TRUE;
8440 				wait_result = vm_map_entry_wait(map,
8441 				    interruptible);
8442 
8443 				if (interruptible &&
8444 				    wait_result == THREAD_INTERRUPTED) {
8445 					/*
8446 					 * We do not clear the
8447 					 * needs_wakeup flag, since we
8448 					 * cannot tell if we were the
8449 					 * only one.
8450 					 */
8451 					ret.kmr_return = KERN_ABORTED;
8452 					return ret;
8453 				}
8454 
8455 
8456 				/*
8457 				 * The entry could have been clipped or
8458 				 * it may not exist anymore.  Look it
8459 				 * up again.
8460 				 */
8461 				state |= VMDS_NEEDS_LOOKUP;
8462 				continue;
8463 			}
8464 
8465 			/*
8466 			 * We can unlock the map now.
8467 			 *
8468 			 * The entry might be split once we unlock the map,
8469 			 * but we need the range as defined by this entry
8470 			 * to be stable. So we must make a local copy.
8471 			 *
8472 			 * The underlying objects do not change during clips,
8473 			 * and the in_transition state guarentees existence
8474 			 * of the entry.
8475 			 */
8476 			last_timestamp = map->timestamp;
8477 			entry->in_transition = TRUE;
8478 			tmp_entry = *entry;
8479 			vm_map_unlock(map);
8480 
8481 			if (tmp_entry.is_sub_map) {
8482 				vm_map_t sub_map;
8483 				vm_map_offset_t sub_start, sub_end;
8484 				pmap_t pmap;
8485 				vm_map_offset_t pmap_addr;
8486 
8487 
8488 				sub_map = VME_SUBMAP(&tmp_entry);
8489 				sub_start = VME_OFFSET(&tmp_entry);
8490 				sub_end = sub_start + (tmp_entry.vme_end -
8491 				    tmp_entry.vme_start);
8492 				if (tmp_entry.use_pmap) {
8493 					pmap = sub_map->pmap;
8494 					pmap_addr = tmp_entry.vme_start;
8495 				} else {
8496 					pmap = map->pmap;
8497 					pmap_addr = tmp_entry.vme_start;
8498 				}
8499 				(void) vm_map_unwire_nested(sub_map,
8500 				    sub_start, sub_end,
8501 				    user_wire,
8502 				    pmap, pmap_addr);
8503 			} else {
8504 				vm_map_offset_t entry_end = tmp_entry.vme_end;
8505 				vm_map_offset_t max_end;
8506 
8507 				if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8508 					max_end = end - VM_MAP_PAGE_SIZE(map);
8509 					if (entry_end > max_end) {
8510 						entry_end = max_end;
8511 					}
8512 				}
8513 
8514 				if (tmp_entry.vme_kernel_object) {
8515 					pmap_protect_options(
8516 						map->pmap,
8517 						tmp_entry.vme_start,
8518 						entry_end,
8519 						VM_PROT_NONE,
8520 						PMAP_OPTIONS_REMOVE,
8521 						NULL);
8522 				}
8523 				vm_fault_unwire(map, &tmp_entry,
8524 				    tmp_entry.vme_kernel_object, map->pmap,
8525 				    tmp_entry.vme_start, entry_end);
8526 			}
8527 
8528 			vm_map_lock(map);
8529 
8530 			/*
8531 			 * Unwiring happened, we can now go back to deleting
8532 			 * them (after we clear the in_transition bit for the range).
8533 			 */
8534 			if (last_timestamp + 1 != map->timestamp) {
8535 				state |= VMDS_NEEDS_LOOKUP;
8536 			}
8537 			clear_in_transition_end = tmp_entry.vme_end;
8538 			continue;
8539 		}
8540 
8541 		assert(entry->wired_count == 0);
8542 		assert(entry->user_wired_count == 0);
8543 
8544 
8545 		/*
8546 		 * Step 6: Entry is unwired and ready for us to delete !
8547 		 */
8548 
8549 		if (!entry->vme_permanent) {
8550 			/*
8551 			 * Typical case: the entry really shouldn't be permanent
8552 			 */
8553 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8554 		    (entry->protection & VM_PROT_EXECUTE) &&
8555 		    developer_mode_state()) {
8556 			/*
8557 			 * Allow debuggers to undo executable mappings
8558 			 * when developer mode is on.
8559 			 */
8560 #if 0
8561 			printf("FBDP %d[%s] removing permanent executable entry "
8562 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8563 			    proc_selfpid(),
8564 			    (current_task()->bsd_info
8565 			    ? proc_name_address(current_task()->bsd_info)
8566 			    : "?"), entry,
8567 			    (uint64_t)entry->vme_start,
8568 			    (uint64_t)entry->vme_end,
8569 			    entry->protection,
8570 			    entry->max_protection);
8571 #endif
8572 			entry->vme_permanent = FALSE;
8573 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8574 #if 0
8575 			printf("FBDP %d[%s] removing permanent entry "
8576 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8577 			    proc_selfpid(),
8578 			    (current_task()->bsd_info
8579 			    ? proc_name_address(current_task()->bsd_info)
8580 			    : "?"), entry,
8581 			    (uint64_t)entry->vme_start,
8582 			    (uint64_t)entry->vme_end,
8583 			    entry->protection,
8584 			    entry->max_protection);
8585 #endif
8586 			entry->vme_permanent = FALSE;
8587 #if CODE_SIGNING_MONITOR
8588 		} else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8589 			entry->vme_permanent = FALSE;
8590 
8591 			printf("%d[%s] %s(0x%llx,0x%llx): "
8592 			    "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8593 			    "prot 0x%x/0x%x\n",
8594 			    proc_selfpid(),
8595 			    (get_bsdtask_info(current_task())
8596 			    ? proc_name_address(get_bsdtask_info(current_task()))
8597 			    : "?"),
8598 			    __FUNCTION__,
8599 			    (uint64_t)start,
8600 			    (uint64_t)end,
8601 			    (uint64_t)entry->vme_start,
8602 			    (uint64_t)entry->vme_end,
8603 			    entry->protection,
8604 			    entry->max_protection);
8605 #endif
8606 		} else {
8607 			DTRACE_VM6(vm_map_delete_permanent,
8608 			    vm_map_entry_t, entry,
8609 			    vm_map_offset_t, entry->vme_start,
8610 			    vm_map_offset_t, entry->vme_end,
8611 			    vm_prot_t, entry->protection,
8612 			    vm_prot_t, entry->max_protection,
8613 			    int, VME_ALIAS(entry));
8614 		}
8615 
8616 		if (entry->is_sub_map) {
8617 			assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8618 			    "map %p (%d) entry %p submap %p (%d)\n",
8619 			    map, VM_MAP_PAGE_SHIFT(map), entry,
8620 			    VME_SUBMAP(entry),
8621 			    VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8622 			if (entry->use_pmap) {
8623 #ifndef NO_NESTED_PMAP
8624 				int pmap_flags;
8625 
8626 				if (map->terminated) {
8627 					/*
8628 					 * This is the final cleanup of the
8629 					 * address space being terminated.
8630 					 * No new mappings are expected and
8631 					 * we don't really need to unnest the
8632 					 * shared region (and lose the "global"
8633 					 * pmap mappings, if applicable).
8634 					 *
8635 					 * Tell the pmap layer that we're
8636 					 * "clean" wrt nesting.
8637 					 */
8638 					pmap_flags = PMAP_UNNEST_CLEAN;
8639 				} else {
8640 					/*
8641 					 * We're unmapping part of the nested
8642 					 * shared region, so we can't keep the
8643 					 * nested pmap.
8644 					 */
8645 					pmap_flags = 0;
8646 				}
8647 				pmap_unnest_options(
8648 					map->pmap,
8649 					(addr64_t)entry->vme_start,
8650 					entry->vme_end - entry->vme_start,
8651 					pmap_flags);
8652 #endif  /* NO_NESTED_PMAP */
8653 				if (map->mapped_in_other_pmaps &&
8654 				    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8655 					/* clean up parent map/maps */
8656 					vm_map_submap_pmap_clean(
8657 						map, entry->vme_start,
8658 						entry->vme_end,
8659 						VME_SUBMAP(entry),
8660 						VME_OFFSET(entry));
8661 				}
8662 			} else {
8663 				vm_map_submap_pmap_clean(
8664 					map, entry->vme_start, entry->vme_end,
8665 					VME_SUBMAP(entry),
8666 					VME_OFFSET(entry));
8667 			}
8668 		} else if (entry->vme_kernel_object ||
8669 		    VME_OBJECT(entry) == compressor_object) {
8670 			/*
8671 			 * nothing to do
8672 			 */
8673 		} else if (map->mapped_in_other_pmaps &&
8674 		    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8675 			vm_object_pmap_protect_options(
8676 				VME_OBJECT(entry), VME_OFFSET(entry),
8677 				entry->vme_end - entry->vme_start,
8678 				PMAP_NULL,
8679 				PAGE_SIZE,
8680 				entry->vme_start,
8681 				VM_PROT_NONE,
8682 				PMAP_OPTIONS_REMOVE);
8683 		} else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8684 		    (state & VMDS_KERNEL_PMAP)) {
8685 			/* Remove translations associated
8686 			 * with this range unless the entry
8687 			 * does not have an object, or
8688 			 * it's the kernel map or a descendant
8689 			 * since the platform could potentially
8690 			 * create "backdoor" mappings invisible
8691 			 * to the VM. It is expected that
8692 			 * objectless, non-kernel ranges
8693 			 * do not have such VM invisible
8694 			 * translations.
8695 			 */
8696 			pmap_remove_options(map->pmap,
8697 			    (addr64_t)entry->vme_start,
8698 			    (addr64_t)entry->vme_end,
8699 			    PMAP_OPTIONS_REMOVE);
8700 		}
8701 
8702 #if DEBUG
8703 		/*
8704 		 * All pmap mappings for this map entry must have been
8705 		 * cleared by now.
8706 		 */
8707 		assert(pmap_is_empty(map->pmap,
8708 		    entry->vme_start,
8709 		    entry->vme_end));
8710 #endif /* DEBUG */
8711 
8712 		if (entry->iokit_acct) {
8713 			/* alternate accounting */
8714 			DTRACE_VM4(vm_map_iokit_unmapped_region,
8715 			    vm_map_t, map,
8716 			    vm_map_offset_t, entry->vme_start,
8717 			    vm_map_offset_t, entry->vme_end,
8718 			    int, VME_ALIAS(entry));
8719 			vm_map_iokit_unmapped_region(map,
8720 			    (entry->vme_end -
8721 			    entry->vme_start));
8722 			entry->iokit_acct = FALSE;
8723 			entry->use_pmap = FALSE;
8724 		}
8725 
8726 		/* move "s" forward */
8727 		s    = entry->vme_end;
8728 		next = entry->vme_next;
8729 		if (!entry->map_aligned) {
8730 			vm_map_offset_t rounded_s;
8731 
8732 			/*
8733 			 * Skip artificial gap due to mis-aligned entry
8734 			 * on devices with a page size smaller than the
8735 			 * map's page size (i.e. 16k task on a 4k device).
8736 			 */
8737 			rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8738 			if (next == vm_map_to_entry(map)) {
8739 				s = rounded_s;
8740 			} else if (s < rounded_s) {
8741 				s = MIN(rounded_s, next->vme_start);
8742 			}
8743 		}
8744 		ret.kmr_size += s - entry->vme_start;
8745 
8746 		if (entry->vme_permanent) {
8747 			/*
8748 			 * A permanent entry can not be removed, so leave it
8749 			 * in place but remove all access permissions.
8750 			 */
8751 			if (!entry->csm_associated) {
8752 				printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8753 				    __FUNCTION__, __LINE__,
8754 				    proc_selfpid(),
8755 				    (get_bsdtask_info(current_task())
8756 				    ? proc_name_address(get_bsdtask_info(current_task()))
8757 				    : "?"),
8758 				    map,
8759 				    entry,
8760 				    (uint64_t)entry->vme_start,
8761 				    (uint64_t)entry->vme_end,
8762 				    entry->is_sub_map,
8763 				    entry->protection,
8764 				    entry->max_protection);
8765 			}
8766 			DTRACE_VM6(vm_map_delete_permanent_prot_none,
8767 			    vm_map_entry_t, entry,
8768 			    vm_map_offset_t, entry->vme_start,
8769 			    vm_map_offset_t, entry->vme_end,
8770 			    vm_prot_t, entry->protection,
8771 			    vm_prot_t, entry->max_protection,
8772 			    int, VME_ALIAS(entry));
8773 			entry->protection = VM_PROT_NONE;
8774 			entry->max_protection = VM_PROT_NONE;
8775 		} else {
8776 			vm_map_entry_zap(map, entry, zap_list);
8777 		}
8778 
8779 		entry = next;
8780 		next  = VM_MAP_ENTRY_NULL;
8781 
8782 		if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8783 			unsigned int last_timestamp = map->timestamp++;
8784 
8785 			if (lck_rw_lock_yield_exclusive(&map->lock,
8786 			    LCK_RW_YIELD_ANY_WAITER)) {
8787 				if (last_timestamp != map->timestamp + 1) {
8788 					state |= VMDS_NEEDS_LOOKUP;
8789 				}
8790 			} else {
8791 				/* we didn't yield, undo our change */
8792 				map->timestamp--;
8793 			}
8794 		}
8795 	}
8796 
8797 	if (map->wait_for_space) {
8798 		thread_wakeup((event_t) map);
8799 	}
8800 
8801 	if (state & VMDS_NEEDS_WAKEUP) {
8802 		vm_map_entry_wakeup(map);
8803 	}
8804 
8805 out:
8806 	if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
8807 		__vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8808 	}
8809 
8810 	if (state & VMDS_KERNEL_KMEMPTR) {
8811 		kmem_free_space(start, end, range_id, &slot);
8812 	}
8813 
8814 	if (state & VMDS_FOUND_GAP) {
8815 		DTRACE_VM3(kern_vm_deallocate_gap,
8816 		    vm_map_offset_t, gap_start,
8817 		    vm_map_offset_t, save_start,
8818 		    vm_map_offset_t, save_end);
8819 		if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
8820 			ret.kmr_return = KERN_INVALID_VALUE;
8821 		} else {
8822 			vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
8823 		}
8824 	}
8825 
8826 	return ret;
8827 }
8828 
8829 kmem_return_t
vm_map_remove_and_unlock(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8830 vm_map_remove_and_unlock(
8831 	vm_map_t        map,
8832 	vm_map_offset_t start,
8833 	vm_map_offset_t end,
8834 	vmr_flags_t     flags,
8835 	kmem_guard_t    guard)
8836 {
8837 	kmem_return_t ret;
8838 	VM_MAP_ZAP_DECLARE(zap);
8839 
8840 	ret = vm_map_delete(map, start, end, flags, guard, &zap);
8841 	vm_map_unlock(map);
8842 
8843 	vm_map_zap_dispose(&zap);
8844 
8845 	return ret;
8846 }
8847 
8848 /*
8849  *	vm_map_remove_guard:
8850  *
8851  *	Remove the given address range from the target map.
8852  *	This is the exported form of vm_map_delete.
8853  */
8854 kmem_return_t
vm_map_remove_guard(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8855 vm_map_remove_guard(
8856 	vm_map_t        map,
8857 	vm_map_offset_t start,
8858 	vm_map_offset_t end,
8859 	vmr_flags_t     flags,
8860 	kmem_guard_t    guard)
8861 {
8862 	vm_map_lock(map);
8863 	return vm_map_remove_and_unlock(map, start, end, flags, guard);
8864 }
8865 
8866 /*
8867  *	vm_map_terminate:
8868  *
8869  *	Clean out a task's map.
8870  */
8871 kern_return_t
vm_map_terminate(vm_map_t map)8872 vm_map_terminate(
8873 	vm_map_t        map)
8874 {
8875 	vm_map_lock(map);
8876 	map->terminated = TRUE;
8877 	vm_map_disable_hole_optimization(map);
8878 	(void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
8879 	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
8880 	return KERN_SUCCESS;
8881 }
8882 
8883 /*
8884  *	Routine:	vm_map_copy_allocate
8885  *
8886  *	Description:
8887  *		Allocates and initializes a map copy object.
8888  */
8889 static vm_map_copy_t
vm_map_copy_allocate(uint16_t type)8890 vm_map_copy_allocate(uint16_t type)
8891 {
8892 	vm_map_copy_t new_copy;
8893 
8894 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
8895 	new_copy->type = type;
8896 	if (type == VM_MAP_COPY_ENTRY_LIST) {
8897 		new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
8898 		vm_map_store_init(&new_copy->cpy_hdr);
8899 	}
8900 	return new_copy;
8901 }
8902 
8903 /*
8904  *	Routine:	vm_map_copy_discard
8905  *
8906  *	Description:
8907  *		Dispose of a map copy object (returned by
8908  *		vm_map_copyin).
8909  */
8910 void
vm_map_copy_discard(vm_map_copy_t copy)8911 vm_map_copy_discard(
8912 	vm_map_copy_t   copy)
8913 {
8914 	if (copy == VM_MAP_COPY_NULL) {
8915 		return;
8916 	}
8917 
8918 	/*
8919 	 * Assert that the vm_map_copy is coming from the right
8920 	 * zone and hasn't been forged
8921 	 */
8922 	vm_map_copy_require(copy);
8923 
8924 	switch (copy->type) {
8925 	case VM_MAP_COPY_ENTRY_LIST:
8926 		while (vm_map_copy_first_entry(copy) !=
8927 		    vm_map_copy_to_entry(copy)) {
8928 			vm_map_entry_t  entry = vm_map_copy_first_entry(copy);
8929 
8930 			vm_map_copy_entry_unlink(copy, entry);
8931 			if (entry->is_sub_map) {
8932 				vm_map_deallocate(VME_SUBMAP(entry));
8933 			} else {
8934 				vm_object_deallocate(VME_OBJECT(entry));
8935 			}
8936 			vm_map_copy_entry_dispose(entry);
8937 		}
8938 		break;
8939 	case VM_MAP_COPY_KERNEL_BUFFER:
8940 
8941 		/*
8942 		 * The vm_map_copy_t and possibly the data buffer were
8943 		 * allocated by a single call to kalloc_data(), i.e. the
8944 		 * vm_map_copy_t was not allocated out of the zone.
8945 		 */
8946 		if (copy->size > msg_ool_size_small || copy->offset) {
8947 			panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
8948 			    (long long)copy->size, (long long)copy->offset);
8949 		}
8950 		kfree_data(copy->cpy_kdata, copy->size);
8951 	}
8952 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
8953 }
8954 
8955 #if XNU_PLATFORM_MacOSX
8956 
8957 /*
8958  *	Routine:	vm_map_copy_copy
8959  *
8960  *	Description:
8961  *			Move the information in a map copy object to
8962  *			a new map copy object, leaving the old one
8963  *			empty.
8964  *
8965  *			This is used by kernel routines that need
8966  *			to look at out-of-line data (in copyin form)
8967  *			before deciding whether to return SUCCESS.
8968  *			If the routine returns FAILURE, the original
8969  *			copy object will be deallocated; therefore,
8970  *			these routines must make a copy of the copy
8971  *			object and leave the original empty so that
8972  *			deallocation will not fail.
8973  */
8974 vm_map_copy_t
vm_map_copy_copy(vm_map_copy_t copy)8975 vm_map_copy_copy(
8976 	vm_map_copy_t   copy)
8977 {
8978 	vm_map_copy_t   new_copy;
8979 
8980 	if (copy == VM_MAP_COPY_NULL) {
8981 		return VM_MAP_COPY_NULL;
8982 	}
8983 
8984 	/*
8985 	 * Assert that the vm_map_copy is coming from the right
8986 	 * zone and hasn't been forged
8987 	 */
8988 	vm_map_copy_require(copy);
8989 
8990 	/*
8991 	 * Allocate a new copy object, and copy the information
8992 	 * from the old one into it.
8993 	 */
8994 
8995 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8996 	memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
8997 #if __has_feature(ptrauth_calls)
8998 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
8999 		new_copy->cpy_kdata = copy->cpy_kdata;
9000 	}
9001 #endif
9002 
9003 	if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
9004 		/*
9005 		 * The links in the entry chain must be
9006 		 * changed to point to the new copy object.
9007 		 */
9008 		vm_map_copy_first_entry(copy)->vme_prev
9009 		        = vm_map_copy_to_entry(new_copy);
9010 		vm_map_copy_last_entry(copy)->vme_next
9011 		        = vm_map_copy_to_entry(new_copy);
9012 	}
9013 
9014 	/*
9015 	 * Change the old copy object into one that contains
9016 	 * nothing to be deallocated.
9017 	 */
9018 	bzero(copy, sizeof(struct vm_map_copy));
9019 	copy->type = VM_MAP_COPY_KERNEL_BUFFER;
9020 
9021 	/*
9022 	 * Return the new object.
9023 	 */
9024 	return new_copy;
9025 }
9026 
9027 #endif /* XNU_PLATFORM_MacOSX */
9028 
9029 static boolean_t
vm_map_entry_is_overwritable(vm_map_t dst_map __unused,vm_map_entry_t entry)9030 vm_map_entry_is_overwritable(
9031 	vm_map_t        dst_map __unused,
9032 	vm_map_entry_t  entry)
9033 {
9034 	if (!(entry->protection & VM_PROT_WRITE)) {
9035 		/* can't overwrite if not writable */
9036 		return FALSE;
9037 	}
9038 #if !__x86_64__
9039 	if (entry->used_for_jit &&
9040 	    vm_map_cs_enforcement(dst_map) &&
9041 	    !dst_map->cs_debugged) {
9042 		/*
9043 		 * Can't overwrite a JIT region while cs_enforced
9044 		 * and not cs_debugged.
9045 		 */
9046 		return FALSE;
9047 	}
9048 
9049 #if __arm64e__
9050 	/* Do not allow overwrite HW assisted TPRO entries */
9051 	if (entry->used_for_tpro) {
9052 		return FALSE;
9053 	}
9054 #endif /* __arm64e__ */
9055 
9056 	if (entry->vme_permanent) {
9057 		if (entry->is_sub_map) {
9058 			/*
9059 			 * We can't tell if the submap contains "permanent"
9060 			 * entries within the range targeted by the caller.
9061 			 * The caller will have to check for that with
9062 			 * vm_map_overwrite_submap_recurse() for example.
9063 			 */
9064 		} else {
9065 			/*
9066 			 * Do not allow overwriting of a "permanent"
9067 			 * entry.
9068 			 */
9069 			DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
9070 			    vm_map_entry_t, entry,
9071 			    vm_map_offset_t, entry->vme_start,
9072 			    vm_map_offset_t, entry->vme_end,
9073 			    vm_prot_t, entry->protection,
9074 			    vm_prot_t, entry->max_protection,
9075 			    int, VME_ALIAS(entry));
9076 			return FALSE;
9077 		}
9078 	}
9079 #endif /* !__x86_64__ */
9080 	return TRUE;
9081 }
9082 
9083 static kern_return_t
vm_map_overwrite_submap_recurse(vm_map_t dst_map,vm_map_offset_t dst_addr,vm_map_size_t dst_size)9084 vm_map_overwrite_submap_recurse(
9085 	vm_map_t        dst_map,
9086 	vm_map_offset_t dst_addr,
9087 	vm_map_size_t   dst_size)
9088 {
9089 	vm_map_offset_t dst_end;
9090 	vm_map_entry_t  tmp_entry;
9091 	vm_map_entry_t  entry;
9092 	kern_return_t   result;
9093 	boolean_t       encountered_sub_map = FALSE;
9094 
9095 
9096 
9097 	/*
9098 	 *	Verify that the destination is all writeable
9099 	 *	initially.  We have to trunc the destination
9100 	 *	address and round the copy size or we'll end up
9101 	 *	splitting entries in strange ways.
9102 	 */
9103 
9104 	dst_end = vm_map_round_page(dst_addr + dst_size,
9105 	    VM_MAP_PAGE_MASK(dst_map));
9106 	vm_map_lock(dst_map);
9107 
9108 start_pass_1:
9109 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9110 		vm_map_unlock(dst_map);
9111 		return KERN_INVALID_ADDRESS;
9112 	}
9113 
9114 	vm_map_clip_start(dst_map,
9115 	    tmp_entry,
9116 	    vm_map_trunc_page(dst_addr,
9117 	    VM_MAP_PAGE_MASK(dst_map)));
9118 	if (tmp_entry->is_sub_map) {
9119 		/* clipping did unnest if needed */
9120 		assert(!tmp_entry->use_pmap);
9121 	}
9122 
9123 	for (entry = tmp_entry;;) {
9124 		vm_map_entry_t  next;
9125 
9126 		next = entry->vme_next;
9127 		while (entry->is_sub_map) {
9128 			vm_map_offset_t sub_start;
9129 			vm_map_offset_t sub_end;
9130 			vm_map_offset_t local_end;
9131 
9132 			if (entry->in_transition) {
9133 				/*
9134 				 * Say that we are waiting, and wait for entry.
9135 				 */
9136 				entry->needs_wakeup = TRUE;
9137 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9138 
9139 				goto start_pass_1;
9140 			}
9141 
9142 			encountered_sub_map = TRUE;
9143 			sub_start = VME_OFFSET(entry);
9144 
9145 			if (entry->vme_end < dst_end) {
9146 				sub_end = entry->vme_end;
9147 			} else {
9148 				sub_end = dst_end;
9149 			}
9150 			sub_end -= entry->vme_start;
9151 			sub_end += VME_OFFSET(entry);
9152 			local_end = entry->vme_end;
9153 			vm_map_unlock(dst_map);
9154 
9155 			result = vm_map_overwrite_submap_recurse(
9156 				VME_SUBMAP(entry),
9157 				sub_start,
9158 				sub_end - sub_start);
9159 
9160 			if (result != KERN_SUCCESS) {
9161 				return result;
9162 			}
9163 			if (dst_end <= entry->vme_end) {
9164 				return KERN_SUCCESS;
9165 			}
9166 			vm_map_lock(dst_map);
9167 			if (!vm_map_lookup_entry(dst_map, local_end,
9168 			    &tmp_entry)) {
9169 				vm_map_unlock(dst_map);
9170 				return KERN_INVALID_ADDRESS;
9171 			}
9172 			entry = tmp_entry;
9173 			next = entry->vme_next;
9174 		}
9175 
9176 		if (!(entry->protection & VM_PROT_WRITE)) {
9177 			vm_map_unlock(dst_map);
9178 			return KERN_PROTECTION_FAILURE;
9179 		}
9180 
9181 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9182 			vm_map_unlock(dst_map);
9183 			return KERN_PROTECTION_FAILURE;
9184 		}
9185 
9186 		/*
9187 		 *	If the entry is in transition, we must wait
9188 		 *	for it to exit that state.  Anything could happen
9189 		 *	when we unlock the map, so start over.
9190 		 */
9191 		if (entry->in_transition) {
9192 			/*
9193 			 * Say that we are waiting, and wait for entry.
9194 			 */
9195 			entry->needs_wakeup = TRUE;
9196 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9197 
9198 			goto start_pass_1;
9199 		}
9200 
9201 /*
9202  *		our range is contained completely within this map entry
9203  */
9204 		if (dst_end <= entry->vme_end) {
9205 			vm_map_unlock(dst_map);
9206 			return KERN_SUCCESS;
9207 		}
9208 /*
9209  *		check that range specified is contiguous region
9210  */
9211 		if ((next == vm_map_to_entry(dst_map)) ||
9212 		    (next->vme_start != entry->vme_end)) {
9213 			vm_map_unlock(dst_map);
9214 			return KERN_INVALID_ADDRESS;
9215 		}
9216 
9217 		/*
9218 		 *	Check for permanent objects in the destination.
9219 		 */
9220 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9221 		    ((!VME_OBJECT(entry)->internal) ||
9222 		    (VME_OBJECT(entry)->true_share))) {
9223 			if (encountered_sub_map) {
9224 				vm_map_unlock(dst_map);
9225 				return KERN_FAILURE;
9226 			}
9227 		}
9228 
9229 
9230 		entry = next;
9231 	}/* for */
9232 	vm_map_unlock(dst_map);
9233 	return KERN_SUCCESS;
9234 }
9235 
9236 /*
9237  *	Routine:	vm_map_copy_overwrite
9238  *
9239  *	Description:
9240  *		Copy the memory described by the map copy
9241  *		object (copy; returned by vm_map_copyin) onto
9242  *		the specified destination region (dst_map, dst_addr).
9243  *		The destination must be writeable.
9244  *
9245  *		Unlike vm_map_copyout, this routine actually
9246  *		writes over previously-mapped memory.  If the
9247  *		previous mapping was to a permanent (user-supplied)
9248  *		memory object, it is preserved.
9249  *
9250  *		The attributes (protection and inheritance) of the
9251  *		destination region are preserved.
9252  *
9253  *		If successful, consumes the copy object.
9254  *		Otherwise, the caller is responsible for it.
9255  *
9256  *	Implementation notes:
9257  *		To overwrite aligned temporary virtual memory, it is
9258  *		sufficient to remove the previous mapping and insert
9259  *		the new copy.  This replacement is done either on
9260  *		the whole region (if no permanent virtual memory
9261  *		objects are embedded in the destination region) or
9262  *		in individual map entries.
9263  *
9264  *		To overwrite permanent virtual memory , it is necessary
9265  *		to copy each page, as the external memory management
9266  *		interface currently does not provide any optimizations.
9267  *
9268  *		Unaligned memory also has to be copied.  It is possible
9269  *		to use 'vm_trickery' to copy the aligned data.  This is
9270  *		not done but not hard to implement.
9271  *
9272  *		Once a page of permanent memory has been overwritten,
9273  *		it is impossible to interrupt this function; otherwise,
9274  *		the call would be neither atomic nor location-independent.
9275  *		The kernel-state portion of a user thread must be
9276  *		interruptible.
9277  *
9278  *		It may be expensive to forward all requests that might
9279  *		overwrite permanent memory (vm_write, vm_copy) to
9280  *		uninterruptible kernel threads.  This routine may be
9281  *		called by interruptible threads; however, success is
9282  *		not guaranteed -- if the request cannot be performed
9283  *		atomically and interruptibly, an error indication is
9284  *		returned.
9285  *
9286  *		Callers of this function must call vm_map_copy_require on
9287  *		previously created vm_map_copy_t or pass a newly created
9288  *		one to ensure that it hasn't been forged.
9289  */
9290 
9291 static kern_return_t
vm_map_copy_overwrite_nested(vm_map_t dst_map,vm_map_address_t dst_addr,vm_map_copy_t copy,boolean_t interruptible,pmap_t pmap,boolean_t discard_on_success)9292 vm_map_copy_overwrite_nested(
9293 	vm_map_t                dst_map,
9294 	vm_map_address_t        dst_addr,
9295 	vm_map_copy_t           copy,
9296 	boolean_t               interruptible,
9297 	pmap_t                  pmap,
9298 	boolean_t               discard_on_success)
9299 {
9300 	vm_map_offset_t         dst_end;
9301 	vm_map_entry_t          tmp_entry;
9302 	vm_map_entry_t          entry;
9303 	kern_return_t           kr;
9304 	boolean_t               aligned = TRUE;
9305 	boolean_t               contains_permanent_objects = FALSE;
9306 	boolean_t               encountered_sub_map = FALSE;
9307 	vm_map_offset_t         base_addr;
9308 	vm_map_size_t           copy_size;
9309 	vm_map_size_t           total_size;
9310 	uint16_t                copy_page_shift;
9311 
9312 	/*
9313 	 *	Check for special kernel buffer allocated
9314 	 *	by new_ipc_kmsg_copyin.
9315 	 */
9316 
9317 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9318 		return vm_map_copyout_kernel_buffer(
9319 			dst_map, &dst_addr,
9320 			copy, copy->size, TRUE, discard_on_success);
9321 	}
9322 
9323 	/*
9324 	 *      Only works for entry lists at the moment.  Will
9325 	 *	support page lists later.
9326 	 */
9327 
9328 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9329 
9330 	if (copy->size == 0) {
9331 		if (discard_on_success) {
9332 			vm_map_copy_discard(copy);
9333 		}
9334 		return KERN_SUCCESS;
9335 	}
9336 
9337 	copy_page_shift = copy->cpy_hdr.page_shift;
9338 
9339 	/*
9340 	 *	Verify that the destination is all writeable
9341 	 *	initially.  We have to trunc the destination
9342 	 *	address and round the copy size or we'll end up
9343 	 *	splitting entries in strange ways.
9344 	 */
9345 
9346 	if (!VM_MAP_PAGE_ALIGNED(copy->size,
9347 	    VM_MAP_PAGE_MASK(dst_map)) ||
9348 	    !VM_MAP_PAGE_ALIGNED(copy->offset,
9349 	    VM_MAP_PAGE_MASK(dst_map)) ||
9350 	    !VM_MAP_PAGE_ALIGNED(dst_addr,
9351 	    VM_MAP_PAGE_MASK(dst_map)) ||
9352 	    copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9353 		aligned = FALSE;
9354 		dst_end = vm_map_round_page(dst_addr + copy->size,
9355 		    VM_MAP_PAGE_MASK(dst_map));
9356 	} else {
9357 		dst_end = dst_addr + copy->size;
9358 	}
9359 
9360 	vm_map_lock(dst_map);
9361 
9362 	/* LP64todo - remove this check when vm_map_commpage64()
9363 	 * no longer has to stuff in a map_entry for the commpage
9364 	 * above the map's max_offset.
9365 	 */
9366 	if (dst_addr >= dst_map->max_offset) {
9367 		vm_map_unlock(dst_map);
9368 		return KERN_INVALID_ADDRESS;
9369 	}
9370 
9371 start_pass_1:
9372 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9373 		vm_map_unlock(dst_map);
9374 		return KERN_INVALID_ADDRESS;
9375 	}
9376 	vm_map_clip_start(dst_map,
9377 	    tmp_entry,
9378 	    vm_map_trunc_page(dst_addr,
9379 	    VM_MAP_PAGE_MASK(dst_map)));
9380 	for (entry = tmp_entry;;) {
9381 		vm_map_entry_t  next = entry->vme_next;
9382 
9383 		while (entry->is_sub_map) {
9384 			vm_map_offset_t sub_start;
9385 			vm_map_offset_t sub_end;
9386 			vm_map_offset_t local_end;
9387 
9388 			if (entry->in_transition) {
9389 				/*
9390 				 * Say that we are waiting, and wait for entry.
9391 				 */
9392 				entry->needs_wakeup = TRUE;
9393 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9394 
9395 				goto start_pass_1;
9396 			}
9397 
9398 			local_end = entry->vme_end;
9399 			if (!(entry->needs_copy)) {
9400 				/* if needs_copy we are a COW submap */
9401 				/* in such a case we just replace so */
9402 				/* there is no need for the follow-  */
9403 				/* ing check.                        */
9404 				encountered_sub_map = TRUE;
9405 				sub_start = VME_OFFSET(entry);
9406 
9407 				if (entry->vme_end < dst_end) {
9408 					sub_end = entry->vme_end;
9409 				} else {
9410 					sub_end = dst_end;
9411 				}
9412 				sub_end -= entry->vme_start;
9413 				sub_end += VME_OFFSET(entry);
9414 				vm_map_unlock(dst_map);
9415 
9416 				kr = vm_map_overwrite_submap_recurse(
9417 					VME_SUBMAP(entry),
9418 					sub_start,
9419 					sub_end - sub_start);
9420 				if (kr != KERN_SUCCESS) {
9421 					return kr;
9422 				}
9423 				vm_map_lock(dst_map);
9424 			}
9425 
9426 			if (dst_end <= entry->vme_end) {
9427 				goto start_overwrite;
9428 			}
9429 			if (!vm_map_lookup_entry(dst_map, local_end,
9430 			    &entry)) {
9431 				vm_map_unlock(dst_map);
9432 				return KERN_INVALID_ADDRESS;
9433 			}
9434 			next = entry->vme_next;
9435 		}
9436 
9437 		if (!(entry->protection & VM_PROT_WRITE)) {
9438 			vm_map_unlock(dst_map);
9439 			return KERN_PROTECTION_FAILURE;
9440 		}
9441 
9442 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9443 			vm_map_unlock(dst_map);
9444 			return KERN_PROTECTION_FAILURE;
9445 		}
9446 
9447 		/*
9448 		 *	If the entry is in transition, we must wait
9449 		 *	for it to exit that state.  Anything could happen
9450 		 *	when we unlock the map, so start over.
9451 		 */
9452 		if (entry->in_transition) {
9453 			/*
9454 			 * Say that we are waiting, and wait for entry.
9455 			 */
9456 			entry->needs_wakeup = TRUE;
9457 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9458 
9459 			goto start_pass_1;
9460 		}
9461 
9462 /*
9463  *		our range is contained completely within this map entry
9464  */
9465 		if (dst_end <= entry->vme_end) {
9466 			break;
9467 		}
9468 /*
9469  *		check that range specified is contiguous region
9470  */
9471 		if ((next == vm_map_to_entry(dst_map)) ||
9472 		    (next->vme_start != entry->vme_end)) {
9473 			vm_map_unlock(dst_map);
9474 			return KERN_INVALID_ADDRESS;
9475 		}
9476 
9477 
9478 		/*
9479 		 *	Check for permanent objects in the destination.
9480 		 */
9481 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9482 		    ((!VME_OBJECT(entry)->internal) ||
9483 		    (VME_OBJECT(entry)->true_share))) {
9484 			contains_permanent_objects = TRUE;
9485 		}
9486 
9487 		entry = next;
9488 	}/* for */
9489 
9490 start_overwrite:
9491 	/*
9492 	 *	If there are permanent objects in the destination, then
9493 	 *	the copy cannot be interrupted.
9494 	 */
9495 
9496 	if (interruptible && contains_permanent_objects) {
9497 		vm_map_unlock(dst_map);
9498 		return KERN_FAILURE;   /* XXX */
9499 	}
9500 
9501 	/*
9502 	 *
9503 	 *	Make a second pass, overwriting the data
9504 	 *	At the beginning of each loop iteration,
9505 	 *	the next entry to be overwritten is "tmp_entry"
9506 	 *	(initially, the value returned from the lookup above),
9507 	 *	and the starting address expected in that entry
9508 	 *	is "start".
9509 	 */
9510 
9511 	total_size = copy->size;
9512 	if (encountered_sub_map) {
9513 		copy_size = 0;
9514 		/* re-calculate tmp_entry since we've had the map */
9515 		/* unlocked */
9516 		if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9517 			vm_map_unlock(dst_map);
9518 			return KERN_INVALID_ADDRESS;
9519 		}
9520 	} else {
9521 		copy_size = copy->size;
9522 	}
9523 
9524 	base_addr = dst_addr;
9525 	while (TRUE) {
9526 		/* deconstruct the copy object and do in parts */
9527 		/* only in sub_map, interruptable case */
9528 		vm_map_entry_t  copy_entry;
9529 		vm_map_entry_t  previous_prev = VM_MAP_ENTRY_NULL;
9530 		vm_map_entry_t  next_copy = VM_MAP_ENTRY_NULL;
9531 		int             nentries;
9532 		int             remaining_entries = 0;
9533 		vm_map_offset_t new_offset = 0;
9534 
9535 		for (entry = tmp_entry; copy_size == 0;) {
9536 			vm_map_entry_t  next;
9537 
9538 			next = entry->vme_next;
9539 
9540 			/* tmp_entry and base address are moved along */
9541 			/* each time we encounter a sub-map.  Otherwise */
9542 			/* entry can outpase tmp_entry, and the copy_size */
9543 			/* may reflect the distance between them */
9544 			/* if the current entry is found to be in transition */
9545 			/* we will start over at the beginning or the last */
9546 			/* encounter of a submap as dictated by base_addr */
9547 			/* we will zero copy_size accordingly. */
9548 			if (entry->in_transition) {
9549 				/*
9550 				 * Say that we are waiting, and wait for entry.
9551 				 */
9552 				entry->needs_wakeup = TRUE;
9553 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9554 
9555 				if (!vm_map_lookup_entry(dst_map, base_addr,
9556 				    &tmp_entry)) {
9557 					vm_map_unlock(dst_map);
9558 					return KERN_INVALID_ADDRESS;
9559 				}
9560 				copy_size = 0;
9561 				entry = tmp_entry;
9562 				continue;
9563 			}
9564 			if (entry->is_sub_map) {
9565 				vm_map_offset_t sub_start;
9566 				vm_map_offset_t sub_end;
9567 				vm_map_offset_t local_end;
9568 
9569 				if (entry->needs_copy) {
9570 					/* if this is a COW submap */
9571 					/* just back the range with a */
9572 					/* anonymous entry */
9573 					assert(!entry->vme_permanent);
9574 					if (entry->vme_end < dst_end) {
9575 						sub_end = entry->vme_end;
9576 					} else {
9577 						sub_end = dst_end;
9578 					}
9579 					if (entry->vme_start < base_addr) {
9580 						sub_start = base_addr;
9581 					} else {
9582 						sub_start = entry->vme_start;
9583 					}
9584 					vm_map_clip_end(
9585 						dst_map, entry, sub_end);
9586 					vm_map_clip_start(
9587 						dst_map, entry, sub_start);
9588 					assert(!entry->use_pmap);
9589 					assert(!entry->iokit_acct);
9590 					entry->use_pmap = TRUE;
9591 					vm_map_deallocate(VME_SUBMAP(entry));
9592 					assert(!entry->vme_permanent);
9593 					VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9594 					VME_OFFSET_SET(entry, 0);
9595 					entry->is_shared = FALSE;
9596 					entry->needs_copy = FALSE;
9597 					entry->protection = VM_PROT_DEFAULT;
9598 					entry->max_protection = VM_PROT_ALL;
9599 					entry->wired_count = 0;
9600 					entry->user_wired_count = 0;
9601 					if (entry->inheritance
9602 					    == VM_INHERIT_SHARE) {
9603 						entry->inheritance = VM_INHERIT_COPY;
9604 					}
9605 					continue;
9606 				}
9607 				/* first take care of any non-sub_map */
9608 				/* entries to send */
9609 				if (base_addr < entry->vme_start) {
9610 					/* stuff to send */
9611 					copy_size =
9612 					    entry->vme_start - base_addr;
9613 					break;
9614 				}
9615 				sub_start = VME_OFFSET(entry);
9616 
9617 				if (entry->vme_end < dst_end) {
9618 					sub_end = entry->vme_end;
9619 				} else {
9620 					sub_end = dst_end;
9621 				}
9622 				sub_end -= entry->vme_start;
9623 				sub_end += VME_OFFSET(entry);
9624 				local_end = entry->vme_end;
9625 				vm_map_unlock(dst_map);
9626 				copy_size = sub_end - sub_start;
9627 
9628 				/* adjust the copy object */
9629 				if (total_size > copy_size) {
9630 					vm_map_size_t   local_size = 0;
9631 					vm_map_size_t   entry_size;
9632 
9633 					nentries = 1;
9634 					new_offset = copy->offset;
9635 					copy_entry = vm_map_copy_first_entry(copy);
9636 					while (copy_entry !=
9637 					    vm_map_copy_to_entry(copy)) {
9638 						entry_size = copy_entry->vme_end -
9639 						    copy_entry->vme_start;
9640 						if ((local_size < copy_size) &&
9641 						    ((local_size + entry_size)
9642 						    >= copy_size)) {
9643 							vm_map_copy_clip_end(copy,
9644 							    copy_entry,
9645 							    copy_entry->vme_start +
9646 							    (copy_size - local_size));
9647 							entry_size = copy_entry->vme_end -
9648 							    copy_entry->vme_start;
9649 							local_size += entry_size;
9650 							new_offset += entry_size;
9651 						}
9652 						if (local_size >= copy_size) {
9653 							next_copy = copy_entry->vme_next;
9654 							copy_entry->vme_next =
9655 							    vm_map_copy_to_entry(copy);
9656 							previous_prev =
9657 							    copy->cpy_hdr.links.prev;
9658 							copy->cpy_hdr.links.prev = copy_entry;
9659 							copy->size = copy_size;
9660 							remaining_entries =
9661 							    copy->cpy_hdr.nentries;
9662 							remaining_entries -= nentries;
9663 							copy->cpy_hdr.nentries = nentries;
9664 							break;
9665 						} else {
9666 							local_size += entry_size;
9667 							new_offset += entry_size;
9668 							nentries++;
9669 						}
9670 						copy_entry = copy_entry->vme_next;
9671 					}
9672 				}
9673 
9674 				if ((entry->use_pmap) && (pmap == NULL)) {
9675 					kr = vm_map_copy_overwrite_nested(
9676 						VME_SUBMAP(entry),
9677 						sub_start,
9678 						copy,
9679 						interruptible,
9680 						VME_SUBMAP(entry)->pmap,
9681 						TRUE);
9682 				} else if (pmap != NULL) {
9683 					kr = vm_map_copy_overwrite_nested(
9684 						VME_SUBMAP(entry),
9685 						sub_start,
9686 						copy,
9687 						interruptible, pmap,
9688 						TRUE);
9689 				} else {
9690 					kr = vm_map_copy_overwrite_nested(
9691 						VME_SUBMAP(entry),
9692 						sub_start,
9693 						copy,
9694 						interruptible,
9695 						dst_map->pmap,
9696 						TRUE);
9697 				}
9698 				if (kr != KERN_SUCCESS) {
9699 					if (next_copy != NULL) {
9700 						copy->cpy_hdr.nentries +=
9701 						    remaining_entries;
9702 						copy->cpy_hdr.links.prev->vme_next =
9703 						    next_copy;
9704 						copy->cpy_hdr.links.prev
9705 						        = previous_prev;
9706 						copy->size = total_size;
9707 					}
9708 					return kr;
9709 				}
9710 				if (dst_end <= local_end) {
9711 					return KERN_SUCCESS;
9712 				}
9713 				/* otherwise copy no longer exists, it was */
9714 				/* destroyed after successful copy_overwrite */
9715 				copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9716 				copy->offset = new_offset;
9717 				copy->cpy_hdr.page_shift = copy_page_shift;
9718 
9719 				total_size -= copy_size;
9720 				copy_size = 0;
9721 				/* put back remainder of copy in container */
9722 				if (next_copy != NULL) {
9723 					copy->cpy_hdr.nentries = remaining_entries;
9724 					copy->cpy_hdr.links.next = next_copy;
9725 					copy->cpy_hdr.links.prev = previous_prev;
9726 					copy->size = total_size;
9727 					next_copy->vme_prev =
9728 					    vm_map_copy_to_entry(copy);
9729 					next_copy = NULL;
9730 				}
9731 				base_addr = local_end;
9732 				vm_map_lock(dst_map);
9733 				if (!vm_map_lookup_entry(dst_map,
9734 				    local_end, &tmp_entry)) {
9735 					vm_map_unlock(dst_map);
9736 					return KERN_INVALID_ADDRESS;
9737 				}
9738 				entry = tmp_entry;
9739 				continue;
9740 			}
9741 			if (dst_end <= entry->vme_end) {
9742 				copy_size = dst_end - base_addr;
9743 				break;
9744 			}
9745 
9746 			if ((next == vm_map_to_entry(dst_map)) ||
9747 			    (next->vme_start != entry->vme_end)) {
9748 				vm_map_unlock(dst_map);
9749 				return KERN_INVALID_ADDRESS;
9750 			}
9751 
9752 			entry = next;
9753 		}/* for */
9754 
9755 		next_copy = NULL;
9756 		nentries = 1;
9757 
9758 		/* adjust the copy object */
9759 		if (total_size > copy_size) {
9760 			vm_map_size_t   local_size = 0;
9761 			vm_map_size_t   entry_size;
9762 
9763 			new_offset = copy->offset;
9764 			copy_entry = vm_map_copy_first_entry(copy);
9765 			while (copy_entry != vm_map_copy_to_entry(copy)) {
9766 				entry_size = copy_entry->vme_end -
9767 				    copy_entry->vme_start;
9768 				if ((local_size < copy_size) &&
9769 				    ((local_size + entry_size)
9770 				    >= copy_size)) {
9771 					vm_map_copy_clip_end(copy, copy_entry,
9772 					    copy_entry->vme_start +
9773 					    (copy_size - local_size));
9774 					entry_size = copy_entry->vme_end -
9775 					    copy_entry->vme_start;
9776 					local_size += entry_size;
9777 					new_offset += entry_size;
9778 				}
9779 				if (local_size >= copy_size) {
9780 					next_copy = copy_entry->vme_next;
9781 					copy_entry->vme_next =
9782 					    vm_map_copy_to_entry(copy);
9783 					previous_prev =
9784 					    copy->cpy_hdr.links.prev;
9785 					copy->cpy_hdr.links.prev = copy_entry;
9786 					copy->size = copy_size;
9787 					remaining_entries =
9788 					    copy->cpy_hdr.nentries;
9789 					remaining_entries -= nentries;
9790 					copy->cpy_hdr.nentries = nentries;
9791 					break;
9792 				} else {
9793 					local_size += entry_size;
9794 					new_offset += entry_size;
9795 					nentries++;
9796 				}
9797 				copy_entry = copy_entry->vme_next;
9798 			}
9799 		}
9800 
9801 		if (aligned) {
9802 			pmap_t  local_pmap;
9803 
9804 			if (pmap) {
9805 				local_pmap = pmap;
9806 			} else {
9807 				local_pmap = dst_map->pmap;
9808 			}
9809 
9810 			if ((kr =  vm_map_copy_overwrite_aligned(
9811 				    dst_map, tmp_entry, copy,
9812 				    base_addr, local_pmap)) != KERN_SUCCESS) {
9813 				if (next_copy != NULL) {
9814 					copy->cpy_hdr.nentries +=
9815 					    remaining_entries;
9816 					copy->cpy_hdr.links.prev->vme_next =
9817 					    next_copy;
9818 					copy->cpy_hdr.links.prev =
9819 					    previous_prev;
9820 					copy->size += copy_size;
9821 				}
9822 				return kr;
9823 			}
9824 			vm_map_unlock(dst_map);
9825 		} else {
9826 			/*
9827 			 * Performance gain:
9828 			 *
9829 			 * if the copy and dst address are misaligned but the same
9830 			 * offset within the page we can copy_not_aligned the
9831 			 * misaligned parts and copy aligned the rest.  If they are
9832 			 * aligned but len is unaligned we simply need to copy
9833 			 * the end bit unaligned.  We'll need to split the misaligned
9834 			 * bits of the region in this case !
9835 			 */
9836 			/* ALWAYS UNLOCKS THE dst_map MAP */
9837 			kr = vm_map_copy_overwrite_unaligned(
9838 				dst_map,
9839 				tmp_entry,
9840 				copy,
9841 				base_addr,
9842 				discard_on_success);
9843 			if (kr != KERN_SUCCESS) {
9844 				if (next_copy != NULL) {
9845 					copy->cpy_hdr.nentries +=
9846 					    remaining_entries;
9847 					copy->cpy_hdr.links.prev->vme_next =
9848 					    next_copy;
9849 					copy->cpy_hdr.links.prev =
9850 					    previous_prev;
9851 					copy->size += copy_size;
9852 				}
9853 				return kr;
9854 			}
9855 		}
9856 		total_size -= copy_size;
9857 		if (total_size == 0) {
9858 			break;
9859 		}
9860 		base_addr += copy_size;
9861 		copy_size = 0;
9862 		copy->offset = new_offset;
9863 		if (next_copy != NULL) {
9864 			copy->cpy_hdr.nentries = remaining_entries;
9865 			copy->cpy_hdr.links.next = next_copy;
9866 			copy->cpy_hdr.links.prev = previous_prev;
9867 			next_copy->vme_prev = vm_map_copy_to_entry(copy);
9868 			copy->size = total_size;
9869 		}
9870 		vm_map_lock(dst_map);
9871 		while (TRUE) {
9872 			if (!vm_map_lookup_entry(dst_map,
9873 			    base_addr, &tmp_entry)) {
9874 				vm_map_unlock(dst_map);
9875 				return KERN_INVALID_ADDRESS;
9876 			}
9877 			if (tmp_entry->in_transition) {
9878 				entry->needs_wakeup = TRUE;
9879 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9880 			} else {
9881 				break;
9882 			}
9883 		}
9884 		vm_map_clip_start(dst_map,
9885 		    tmp_entry,
9886 		    vm_map_trunc_page(base_addr,
9887 		    VM_MAP_PAGE_MASK(dst_map)));
9888 
9889 		entry = tmp_entry;
9890 	} /* while */
9891 
9892 	/*
9893 	 *	Throw away the vm_map_copy object
9894 	 */
9895 	if (discard_on_success) {
9896 		vm_map_copy_discard(copy);
9897 	}
9898 
9899 	return KERN_SUCCESS;
9900 }/* vm_map_copy_overwrite */
9901 
9902 kern_return_t
vm_map_copy_overwrite(vm_map_t dst_map,vm_map_offset_t dst_addr,vm_map_copy_t copy,vm_map_size_t copy_size,boolean_t interruptible)9903 vm_map_copy_overwrite(
9904 	vm_map_t        dst_map,
9905 	vm_map_offset_t dst_addr,
9906 	vm_map_copy_t   copy,
9907 	vm_map_size_t   copy_size,
9908 	boolean_t       interruptible)
9909 {
9910 	vm_map_size_t   head_size, tail_size;
9911 	vm_map_copy_t   head_copy, tail_copy;
9912 	vm_map_offset_t head_addr, tail_addr;
9913 	vm_map_entry_t  entry;
9914 	kern_return_t   kr;
9915 	vm_map_offset_t effective_page_mask, effective_page_size;
9916 	uint16_t        copy_page_shift;
9917 
9918 	head_size = 0;
9919 	tail_size = 0;
9920 	head_copy = NULL;
9921 	tail_copy = NULL;
9922 	head_addr = 0;
9923 	tail_addr = 0;
9924 
9925 	/*
9926 	 *	Check for null copy object.
9927 	 */
9928 	if (copy == VM_MAP_COPY_NULL) {
9929 		return KERN_SUCCESS;
9930 	}
9931 
9932 	if (__improbable(vm_map_range_overflows(dst_map, dst_addr, copy_size))) {
9933 		return KERN_INVALID_ADDRESS;
9934 	}
9935 
9936 	/*
9937 	 * Assert that the vm_map_copy is coming from the right
9938 	 * zone and hasn't been forged
9939 	 */
9940 	vm_map_copy_require(copy);
9941 
9942 	if (interruptible ||
9943 	    copy->type != VM_MAP_COPY_ENTRY_LIST) {
9944 		/*
9945 		 * We can't split the "copy" map if we're interruptible
9946 		 * or if we don't have a "copy" map...
9947 		 */
9948 blunt_copy:
9949 		return vm_map_copy_overwrite_nested(dst_map,
9950 		           dst_addr,
9951 		           copy,
9952 		           interruptible,
9953 		           (pmap_t) NULL,
9954 		           TRUE);
9955 	}
9956 
9957 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
9958 	if (copy_page_shift < PAGE_SHIFT ||
9959 	    VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
9960 		goto blunt_copy;
9961 	}
9962 
9963 	if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
9964 		effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
9965 	} else {
9966 		effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
9967 		effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
9968 		    effective_page_mask);
9969 	}
9970 	effective_page_size = effective_page_mask + 1;
9971 
9972 	if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
9973 		/*
9974 		 * Too small to bother with optimizing...
9975 		 */
9976 		goto blunt_copy;
9977 	}
9978 
9979 	if ((dst_addr & effective_page_mask) !=
9980 	    (copy->offset & effective_page_mask)) {
9981 		/*
9982 		 * Incompatible mis-alignment of source and destination...
9983 		 */
9984 		goto blunt_copy;
9985 	}
9986 
9987 	/*
9988 	 * Proper alignment or identical mis-alignment at the beginning.
9989 	 * Let's try and do a small unaligned copy first (if needed)
9990 	 * and then an aligned copy for the rest.
9991 	 */
9992 	if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
9993 		head_addr = dst_addr;
9994 		head_size = (effective_page_size -
9995 		    (copy->offset & effective_page_mask));
9996 		head_size = MIN(head_size, copy_size);
9997 	}
9998 	if (!vm_map_page_aligned(copy->offset + copy_size,
9999 	    effective_page_mask)) {
10000 		/*
10001 		 * Mis-alignment at the end.
10002 		 * Do an aligned copy up to the last page and
10003 		 * then an unaligned copy for the remaining bytes.
10004 		 */
10005 		tail_size = ((copy->offset + copy_size) &
10006 		    effective_page_mask);
10007 		tail_size = MIN(tail_size, copy_size);
10008 		tail_addr = dst_addr + copy_size - tail_size;
10009 		assert(tail_addr >= head_addr + head_size);
10010 	}
10011 	assert(head_size + tail_size <= copy_size);
10012 
10013 	if (head_size + tail_size == copy_size) {
10014 		/*
10015 		 * It's all unaligned, no optimization possible...
10016 		 */
10017 		goto blunt_copy;
10018 	}
10019 
10020 	/*
10021 	 * Can't optimize if there are any submaps in the
10022 	 * destination due to the way we free the "copy" map
10023 	 * progressively in vm_map_copy_overwrite_nested()
10024 	 * in that case.
10025 	 */
10026 	vm_map_lock_read(dst_map);
10027 	if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
10028 		vm_map_unlock_read(dst_map);
10029 		goto blunt_copy;
10030 	}
10031 	for (;
10032 	    (entry != vm_map_to_entry(dst_map) &&
10033 	    entry->vme_start < dst_addr + copy_size);
10034 	    entry = entry->vme_next) {
10035 		if (entry->is_sub_map) {
10036 			vm_map_unlock_read(dst_map);
10037 			goto blunt_copy;
10038 		}
10039 	}
10040 	vm_map_unlock_read(dst_map);
10041 
10042 	if (head_size) {
10043 		/*
10044 		 * Unaligned copy of the first "head_size" bytes, to reach
10045 		 * a page boundary.
10046 		 */
10047 
10048 		/*
10049 		 * Extract "head_copy" out of "copy".
10050 		 */
10051 		head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10052 		head_copy->cpy_hdr.entries_pageable =
10053 		    copy->cpy_hdr.entries_pageable;
10054 		head_copy->cpy_hdr.page_shift = copy_page_shift;
10055 
10056 		entry = vm_map_copy_first_entry(copy);
10057 		if (entry->vme_end < copy->offset + head_size) {
10058 			head_size = entry->vme_end - copy->offset;
10059 		}
10060 
10061 		head_copy->offset = copy->offset;
10062 		head_copy->size = head_size;
10063 		copy->offset += head_size;
10064 		copy->size -= head_size;
10065 		copy_size -= head_size;
10066 		assert(copy_size > 0);
10067 
10068 		vm_map_copy_clip_end(copy, entry, copy->offset);
10069 		vm_map_copy_entry_unlink(copy, entry);
10070 		vm_map_copy_entry_link(head_copy,
10071 		    vm_map_copy_to_entry(head_copy),
10072 		    entry);
10073 
10074 		/*
10075 		 * Do the unaligned copy.
10076 		 */
10077 		kr = vm_map_copy_overwrite_nested(dst_map,
10078 		    head_addr,
10079 		    head_copy,
10080 		    interruptible,
10081 		    (pmap_t) NULL,
10082 		    FALSE);
10083 		if (kr != KERN_SUCCESS) {
10084 			goto done;
10085 		}
10086 	}
10087 
10088 	if (tail_size) {
10089 		/*
10090 		 * Extract "tail_copy" out of "copy".
10091 		 */
10092 		tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10093 		tail_copy->cpy_hdr.entries_pageable =
10094 		    copy->cpy_hdr.entries_pageable;
10095 		tail_copy->cpy_hdr.page_shift = copy_page_shift;
10096 
10097 		tail_copy->offset = copy->offset + copy_size - tail_size;
10098 		tail_copy->size = tail_size;
10099 
10100 		copy->size -= tail_size;
10101 		copy_size -= tail_size;
10102 		assert(copy_size > 0);
10103 
10104 		entry = vm_map_copy_last_entry(copy);
10105 		vm_map_copy_clip_start(copy, entry, tail_copy->offset);
10106 		entry = vm_map_copy_last_entry(copy);
10107 		vm_map_copy_entry_unlink(copy, entry);
10108 		vm_map_copy_entry_link(tail_copy,
10109 		    vm_map_copy_last_entry(tail_copy),
10110 		    entry);
10111 	}
10112 
10113 	/*
10114 	 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
10115 	 * we want to avoid TOCTOU issues w.r.t copy->size but
10116 	 * we don't need to change vm_map_copy_overwrite_nested()
10117 	 * and all other vm_map_copy_overwrite variants.
10118 	 *
10119 	 * So we assign the original copy_size that was passed into
10120 	 * this routine back to copy.
10121 	 *
10122 	 * This use of local 'copy_size' passed into this routine is
10123 	 * to try and protect against TOCTOU attacks where the kernel
10124 	 * has been exploited. We don't expect this to be an issue
10125 	 * during normal system operation.
10126 	 */
10127 	assertf(copy->size == copy_size,
10128 	    "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
10129 	copy->size = copy_size;
10130 
10131 	/*
10132 	 * Copy most (or possibly all) of the data.
10133 	 */
10134 	kr = vm_map_copy_overwrite_nested(dst_map,
10135 	    dst_addr + head_size,
10136 	    copy,
10137 	    interruptible,
10138 	    (pmap_t) NULL,
10139 	    FALSE);
10140 	if (kr != KERN_SUCCESS) {
10141 		goto done;
10142 	}
10143 
10144 	if (tail_size) {
10145 		kr = vm_map_copy_overwrite_nested(dst_map,
10146 		    tail_addr,
10147 		    tail_copy,
10148 		    interruptible,
10149 		    (pmap_t) NULL,
10150 		    FALSE);
10151 	}
10152 
10153 done:
10154 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
10155 	if (kr == KERN_SUCCESS) {
10156 		/*
10157 		 * Discard all the copy maps.
10158 		 */
10159 		if (head_copy) {
10160 			vm_map_copy_discard(head_copy);
10161 			head_copy = NULL;
10162 		}
10163 		vm_map_copy_discard(copy);
10164 		if (tail_copy) {
10165 			vm_map_copy_discard(tail_copy);
10166 			tail_copy = NULL;
10167 		}
10168 	} else {
10169 		/*
10170 		 * Re-assemble the original copy map.
10171 		 */
10172 		if (head_copy) {
10173 			entry = vm_map_copy_first_entry(head_copy);
10174 			vm_map_copy_entry_unlink(head_copy, entry);
10175 			vm_map_copy_entry_link(copy,
10176 			    vm_map_copy_to_entry(copy),
10177 			    entry);
10178 			copy->offset -= head_size;
10179 			copy->size += head_size;
10180 			vm_map_copy_discard(head_copy);
10181 			head_copy = NULL;
10182 		}
10183 		if (tail_copy) {
10184 			entry = vm_map_copy_last_entry(tail_copy);
10185 			vm_map_copy_entry_unlink(tail_copy, entry);
10186 			vm_map_copy_entry_link(copy,
10187 			    vm_map_copy_last_entry(copy),
10188 			    entry);
10189 			copy->size += tail_size;
10190 			vm_map_copy_discard(tail_copy);
10191 			tail_copy = NULL;
10192 		}
10193 	}
10194 	return kr;
10195 }
10196 
10197 
10198 /*
10199  *	Routine: vm_map_copy_overwrite_unaligned	[internal use only]
10200  *
10201  *	Decription:
10202  *	Physically copy unaligned data
10203  *
10204  *	Implementation:
10205  *	Unaligned parts of pages have to be physically copied.  We use
10206  *	a modified form of vm_fault_copy (which understands none-aligned
10207  *	page offsets and sizes) to do the copy.  We attempt to copy as
10208  *	much memory in one go as possibly, however vm_fault_copy copies
10209  *	within 1 memory object so we have to find the smaller of "amount left"
10210  *	"source object data size" and "target object data size".  With
10211  *	unaligned data we don't need to split regions, therefore the source
10212  *	(copy) object should be one map entry, the target range may be split
10213  *	over multiple map entries however.  In any event we are pessimistic
10214  *	about these assumptions.
10215  *
10216  *	Callers of this function must call vm_map_copy_require on
10217  *	previously created vm_map_copy_t or pass a newly created
10218  *	one to ensure that it hasn't been forged.
10219  *
10220  *	Assumptions:
10221  *	dst_map is locked on entry and is return locked on success,
10222  *	unlocked on error.
10223  */
10224 
10225 static kern_return_t
vm_map_copy_overwrite_unaligned(vm_map_t dst_map,vm_map_entry_t entry,vm_map_copy_t copy,vm_map_offset_t start,boolean_t discard_on_success)10226 vm_map_copy_overwrite_unaligned(
10227 	vm_map_t        dst_map,
10228 	vm_map_entry_t  entry,
10229 	vm_map_copy_t   copy,
10230 	vm_map_offset_t start,
10231 	boolean_t       discard_on_success)
10232 {
10233 	vm_map_entry_t          copy_entry;
10234 	vm_map_entry_t          copy_entry_next;
10235 	vm_map_version_t        version;
10236 	vm_object_t             dst_object;
10237 	vm_object_offset_t      dst_offset;
10238 	vm_object_offset_t      src_offset;
10239 	vm_object_offset_t      entry_offset;
10240 	vm_map_offset_t         entry_end;
10241 	vm_map_size_t           src_size,
10242 	    dst_size,
10243 	    copy_size,
10244 	    amount_left;
10245 	kern_return_t           kr = KERN_SUCCESS;
10246 
10247 
10248 	copy_entry = vm_map_copy_first_entry(copy);
10249 
10250 	vm_map_lock_write_to_read(dst_map);
10251 
10252 	src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
10253 	amount_left = copy->size;
10254 /*
10255  *	unaligned so we never clipped this entry, we need the offset into
10256  *	the vm_object not just the data.
10257  */
10258 	while (amount_left > 0) {
10259 		if (entry == vm_map_to_entry(dst_map)) {
10260 			vm_map_unlock_read(dst_map);
10261 			return KERN_INVALID_ADDRESS;
10262 		}
10263 
10264 		/* "start" must be within the current map entry */
10265 		assert((start >= entry->vme_start) && (start < entry->vme_end));
10266 
10267 		/*
10268 		 *	Check protection again
10269 		 */
10270 		if (!(entry->protection & VM_PROT_WRITE)) {
10271 			vm_map_unlock_read(dst_map);
10272 			return KERN_PROTECTION_FAILURE;
10273 		}
10274 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10275 			vm_map_unlock_read(dst_map);
10276 			return KERN_PROTECTION_FAILURE;
10277 		}
10278 
10279 		dst_offset = start - entry->vme_start;
10280 
10281 		dst_size = entry->vme_end - start;
10282 
10283 		src_size = copy_entry->vme_end -
10284 		    (copy_entry->vme_start + src_offset);
10285 
10286 		if (dst_size < src_size) {
10287 /*
10288  *			we can only copy dst_size bytes before
10289  *			we have to get the next destination entry
10290  */
10291 			copy_size = dst_size;
10292 		} else {
10293 /*
10294  *			we can only copy src_size bytes before
10295  *			we have to get the next source copy entry
10296  */
10297 			copy_size = src_size;
10298 		}
10299 
10300 		if (copy_size > amount_left) {
10301 			copy_size = amount_left;
10302 		}
10303 /*
10304  *		Entry needs copy, create a shadow shadow object for
10305  *		Copy on write region.
10306  */
10307 		if (entry->needs_copy) {
10308 			if (vm_map_lock_read_to_write(dst_map)) {
10309 				vm_map_lock_read(dst_map);
10310 				goto RetryLookup;
10311 			}
10312 			VME_OBJECT_SHADOW(entry,
10313 			    (vm_map_size_t)(entry->vme_end
10314 			    - entry->vme_start),
10315 			    vm_map_always_shadow(dst_map));
10316 			entry->needs_copy = FALSE;
10317 			vm_map_lock_write_to_read(dst_map);
10318 		}
10319 		dst_object = VME_OBJECT(entry);
10320 /*
10321  *		unlike with the virtual (aligned) copy we're going
10322  *		to fault on it therefore we need a target object.
10323  */
10324 		if (dst_object == VM_OBJECT_NULL) {
10325 			if (vm_map_lock_read_to_write(dst_map)) {
10326 				vm_map_lock_read(dst_map);
10327 				goto RetryLookup;
10328 			}
10329 			dst_object = vm_object_allocate((vm_map_size_t)
10330 			    entry->vme_end - entry->vme_start);
10331 			VME_OBJECT_SET(entry, dst_object, false, 0);
10332 			VME_OFFSET_SET(entry, 0);
10333 			assert(entry->use_pmap);
10334 			vm_map_lock_write_to_read(dst_map);
10335 		}
10336 /*
10337  *		Take an object reference and unlock map. The "entry" may
10338  *		disappear or change when the map is unlocked.
10339  */
10340 		vm_object_reference(dst_object);
10341 		version.main_timestamp = dst_map->timestamp;
10342 		entry_offset = VME_OFFSET(entry);
10343 		entry_end = entry->vme_end;
10344 		vm_map_unlock_read(dst_map);
10345 /*
10346  *		Copy as much as possible in one pass
10347  */
10348 		kr = vm_fault_copy(
10349 			VME_OBJECT(copy_entry),
10350 			VME_OFFSET(copy_entry) + src_offset,
10351 			&copy_size,
10352 			dst_object,
10353 			entry_offset + dst_offset,
10354 			dst_map,
10355 			&version,
10356 			THREAD_UNINT );
10357 
10358 		start += copy_size;
10359 		src_offset += copy_size;
10360 		amount_left -= copy_size;
10361 /*
10362  *		Release the object reference
10363  */
10364 		vm_object_deallocate(dst_object);
10365 /*
10366  *		If a hard error occurred, return it now
10367  */
10368 		if (kr != KERN_SUCCESS) {
10369 			return kr;
10370 		}
10371 
10372 		if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
10373 		    || amount_left == 0) {
10374 /*
10375  *			all done with this copy entry, dispose.
10376  */
10377 			copy_entry_next = copy_entry->vme_next;
10378 
10379 			if (discard_on_success) {
10380 				vm_map_copy_entry_unlink(copy, copy_entry);
10381 				assert(!copy_entry->is_sub_map);
10382 				vm_object_deallocate(VME_OBJECT(copy_entry));
10383 				vm_map_copy_entry_dispose(copy_entry);
10384 			}
10385 
10386 			if (copy_entry_next == vm_map_copy_to_entry(copy) &&
10387 			    amount_left) {
10388 /*
10389  *				not finished copying but run out of source
10390  */
10391 				return KERN_INVALID_ADDRESS;
10392 			}
10393 
10394 			copy_entry = copy_entry_next;
10395 
10396 			src_offset = 0;
10397 		}
10398 
10399 		if (amount_left == 0) {
10400 			return KERN_SUCCESS;
10401 		}
10402 
10403 		vm_map_lock_read(dst_map);
10404 		if (version.main_timestamp == dst_map->timestamp) {
10405 			if (start == entry_end) {
10406 /*
10407  *				destination region is split.  Use the version
10408  *				information to avoid a lookup in the normal
10409  *				case.
10410  */
10411 				entry = entry->vme_next;
10412 /*
10413  *				should be contiguous. Fail if we encounter
10414  *				a hole in the destination.
10415  */
10416 				if (start != entry->vme_start) {
10417 					vm_map_unlock_read(dst_map);
10418 					return KERN_INVALID_ADDRESS;
10419 				}
10420 			}
10421 		} else {
10422 /*
10423  *			Map version check failed.
10424  *			we must lookup the entry because somebody
10425  *			might have changed the map behind our backs.
10426  */
10427 RetryLookup:
10428 			if (!vm_map_lookup_entry(dst_map, start, &entry)) {
10429 				vm_map_unlock_read(dst_map);
10430 				return KERN_INVALID_ADDRESS;
10431 			}
10432 		}
10433 	}/* while */
10434 
10435 	return KERN_SUCCESS;
10436 }/* vm_map_copy_overwrite_unaligned */
10437 
10438 /*
10439  *	Routine: vm_map_copy_overwrite_aligned	[internal use only]
10440  *
10441  *	Description:
10442  *	Does all the vm_trickery possible for whole pages.
10443  *
10444  *	Implementation:
10445  *
10446  *	If there are no permanent objects in the destination,
10447  *	and the source and destination map entry zones match,
10448  *	and the destination map entry is not shared,
10449  *	then the map entries can be deleted and replaced
10450  *	with those from the copy.  The following code is the
10451  *	basic idea of what to do, but there are lots of annoying
10452  *	little details about getting protection and inheritance
10453  *	right.  Should add protection, inheritance, and sharing checks
10454  *	to the above pass and make sure that no wiring is involved.
10455  *
10456  *	Callers of this function must call vm_map_copy_require on
10457  *	previously created vm_map_copy_t or pass a newly created
10458  *	one to ensure that it hasn't been forged.
10459  */
10460 
10461 int vm_map_copy_overwrite_aligned_src_not_internal = 0;
10462 int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
10463 int vm_map_copy_overwrite_aligned_src_large = 0;
10464 
10465 static kern_return_t
vm_map_copy_overwrite_aligned(vm_map_t dst_map,vm_map_entry_t tmp_entry,vm_map_copy_t copy,vm_map_offset_t start,__unused pmap_t pmap)10466 vm_map_copy_overwrite_aligned(
10467 	vm_map_t        dst_map,
10468 	vm_map_entry_t  tmp_entry,
10469 	vm_map_copy_t   copy,
10470 	vm_map_offset_t start,
10471 	__unused pmap_t pmap)
10472 {
10473 	vm_object_t     object;
10474 	vm_map_entry_t  copy_entry;
10475 	vm_map_size_t   copy_size;
10476 	vm_map_size_t   size;
10477 	vm_map_entry_t  entry;
10478 
10479 	while ((copy_entry = vm_map_copy_first_entry(copy))
10480 	    != vm_map_copy_to_entry(copy)) {
10481 		copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10482 
10483 		entry = tmp_entry;
10484 		if (entry->is_sub_map) {
10485 			/* unnested when clipped earlier */
10486 			assert(!entry->use_pmap);
10487 		}
10488 		if (entry == vm_map_to_entry(dst_map)) {
10489 			vm_map_unlock(dst_map);
10490 			return KERN_INVALID_ADDRESS;
10491 		}
10492 		size = (entry->vme_end - entry->vme_start);
10493 		/*
10494 		 *	Make sure that no holes popped up in the
10495 		 *	address map, and that the protection is
10496 		 *	still valid, in case the map was unlocked
10497 		 *	earlier.
10498 		 */
10499 
10500 		if ((entry->vme_start != start) || ((entry->is_sub_map)
10501 		    && !entry->needs_copy)) {
10502 			vm_map_unlock(dst_map);
10503 			return KERN_INVALID_ADDRESS;
10504 		}
10505 		assert(entry != vm_map_to_entry(dst_map));
10506 
10507 		/*
10508 		 *	Check protection again
10509 		 */
10510 
10511 		if (!(entry->protection & VM_PROT_WRITE)) {
10512 			vm_map_unlock(dst_map);
10513 			return KERN_PROTECTION_FAILURE;
10514 		}
10515 
10516 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10517 			vm_map_unlock(dst_map);
10518 			return KERN_PROTECTION_FAILURE;
10519 		}
10520 
10521 		/*
10522 		 *	Adjust to source size first
10523 		 */
10524 
10525 		if (copy_size < size) {
10526 			if (entry->map_aligned &&
10527 			    !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10528 			    VM_MAP_PAGE_MASK(dst_map))) {
10529 				/* no longer map-aligned */
10530 				entry->map_aligned = FALSE;
10531 			}
10532 			vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10533 			size = copy_size;
10534 		}
10535 
10536 		/*
10537 		 *	Adjust to destination size
10538 		 */
10539 
10540 		if (size < copy_size) {
10541 			vm_map_copy_clip_end(copy, copy_entry,
10542 			    copy_entry->vme_start + size);
10543 			copy_size = size;
10544 		}
10545 
10546 		assert((entry->vme_end - entry->vme_start) == size);
10547 		assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10548 		assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10549 
10550 		/*
10551 		 *	If the destination contains temporary unshared memory,
10552 		 *	we can perform the copy by throwing it away and
10553 		 *	installing the source data.
10554 		 */
10555 
10556 		object = VME_OBJECT(entry);
10557 		if ((!entry->is_shared &&
10558 		    ((object == VM_OBJECT_NULL) ||
10559 		    (object->internal && !object->true_share))) ||
10560 		    entry->needs_copy) {
10561 			vm_object_t     old_object = VME_OBJECT(entry);
10562 			vm_object_offset_t      old_offset = VME_OFFSET(entry);
10563 			vm_object_offset_t      offset;
10564 
10565 			/*
10566 			 * Ensure that the source and destination aren't
10567 			 * identical
10568 			 */
10569 			if (old_object == VME_OBJECT(copy_entry) &&
10570 			    old_offset == VME_OFFSET(copy_entry)) {
10571 				vm_map_copy_entry_unlink(copy, copy_entry);
10572 				vm_map_copy_entry_dispose(copy_entry);
10573 
10574 				if (old_object != VM_OBJECT_NULL) {
10575 					vm_object_deallocate(old_object);
10576 				}
10577 
10578 				start = tmp_entry->vme_end;
10579 				tmp_entry = tmp_entry->vme_next;
10580 				continue;
10581 			}
10582 
10583 #if XNU_TARGET_OS_OSX
10584 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10585 #define __TRADEOFF1_COPY_SIZE (128 * 1024)      /* 128 KB */
10586 			if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10587 			    VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10588 			    copy_size <= __TRADEOFF1_COPY_SIZE) {
10589 				/*
10590 				 * Virtual vs. Physical copy tradeoff #1.
10591 				 *
10592 				 * Copying only a few pages out of a large
10593 				 * object:  do a physical copy instead of
10594 				 * a virtual copy, to avoid possibly keeping
10595 				 * the entire large object alive because of
10596 				 * those few copy-on-write pages.
10597 				 */
10598 				vm_map_copy_overwrite_aligned_src_large++;
10599 				goto slow_copy;
10600 			}
10601 #endif /* XNU_TARGET_OS_OSX */
10602 
10603 			if ((dst_map->pmap != kernel_pmap) &&
10604 			    (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10605 			    (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10606 				vm_object_t new_object, new_shadow;
10607 
10608 				/*
10609 				 * We're about to map something over a mapping
10610 				 * established by malloc()...
10611 				 */
10612 				new_object = VME_OBJECT(copy_entry);
10613 				if (new_object != VM_OBJECT_NULL) {
10614 					vm_object_lock_shared(new_object);
10615 				}
10616 				while (new_object != VM_OBJECT_NULL &&
10617 #if XNU_TARGET_OS_OSX
10618 				    !new_object->true_share &&
10619 				    new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10620 #endif /* XNU_TARGET_OS_OSX */
10621 				    new_object->internal) {
10622 					new_shadow = new_object->shadow;
10623 					if (new_shadow == VM_OBJECT_NULL) {
10624 						break;
10625 					}
10626 					vm_object_lock_shared(new_shadow);
10627 					vm_object_unlock(new_object);
10628 					new_object = new_shadow;
10629 				}
10630 				if (new_object != VM_OBJECT_NULL) {
10631 					if (!new_object->internal) {
10632 						/*
10633 						 * The new mapping is backed
10634 						 * by an external object.  We
10635 						 * don't want malloc'ed memory
10636 						 * to be replaced with such a
10637 						 * non-anonymous mapping, so
10638 						 * let's go off the optimized
10639 						 * path...
10640 						 */
10641 						vm_map_copy_overwrite_aligned_src_not_internal++;
10642 						vm_object_unlock(new_object);
10643 						goto slow_copy;
10644 					}
10645 #if XNU_TARGET_OS_OSX
10646 					if (new_object->true_share ||
10647 					    new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10648 						/*
10649 						 * Same if there's a "true_share"
10650 						 * object in the shadow chain, or
10651 						 * an object with a non-default
10652 						 * (SYMMETRIC) copy strategy.
10653 						 */
10654 						vm_map_copy_overwrite_aligned_src_not_symmetric++;
10655 						vm_object_unlock(new_object);
10656 						goto slow_copy;
10657 					}
10658 #endif /* XNU_TARGET_OS_OSX */
10659 					vm_object_unlock(new_object);
10660 				}
10661 				/*
10662 				 * The new mapping is still backed by
10663 				 * anonymous (internal) memory, so it's
10664 				 * OK to substitute it for the original
10665 				 * malloc() mapping.
10666 				 */
10667 			}
10668 
10669 			if (old_object != VM_OBJECT_NULL) {
10670 				assert(!entry->vme_permanent);
10671 				if (entry->is_sub_map) {
10672 					if (entry->use_pmap) {
10673 #ifndef NO_NESTED_PMAP
10674 						pmap_unnest(dst_map->pmap,
10675 						    (addr64_t)entry->vme_start,
10676 						    entry->vme_end - entry->vme_start);
10677 #endif  /* NO_NESTED_PMAP */
10678 						if (dst_map->mapped_in_other_pmaps) {
10679 							/* clean up parent */
10680 							/* map/maps */
10681 							vm_map_submap_pmap_clean(
10682 								dst_map, entry->vme_start,
10683 								entry->vme_end,
10684 								VME_SUBMAP(entry),
10685 								VME_OFFSET(entry));
10686 						}
10687 					} else {
10688 						vm_map_submap_pmap_clean(
10689 							dst_map, entry->vme_start,
10690 							entry->vme_end,
10691 							VME_SUBMAP(entry),
10692 							VME_OFFSET(entry));
10693 					}
10694 					vm_map_deallocate(VME_SUBMAP(entry));
10695 				} else {
10696 					if (dst_map->mapped_in_other_pmaps) {
10697 						vm_object_pmap_protect_options(
10698 							VME_OBJECT(entry),
10699 							VME_OFFSET(entry),
10700 							entry->vme_end
10701 							- entry->vme_start,
10702 							PMAP_NULL,
10703 							PAGE_SIZE,
10704 							entry->vme_start,
10705 							VM_PROT_NONE,
10706 							PMAP_OPTIONS_REMOVE);
10707 					} else {
10708 						pmap_remove_options(
10709 							dst_map->pmap,
10710 							(addr64_t)(entry->vme_start),
10711 							(addr64_t)(entry->vme_end),
10712 							PMAP_OPTIONS_REMOVE);
10713 					}
10714 					vm_object_deallocate(old_object);
10715 				}
10716 			}
10717 
10718 			if (entry->iokit_acct) {
10719 				/* keep using iokit accounting */
10720 				entry->use_pmap = FALSE;
10721 			} else {
10722 				/* use pmap accounting */
10723 				entry->use_pmap = TRUE;
10724 			}
10725 			assert(!entry->vme_permanent);
10726 			VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
10727 			object = VME_OBJECT(entry);
10728 			entry->needs_copy = copy_entry->needs_copy;
10729 			entry->wired_count = 0;
10730 			entry->user_wired_count = 0;
10731 			offset = VME_OFFSET(copy_entry);
10732 			VME_OFFSET_SET(entry, offset);
10733 
10734 			vm_map_copy_entry_unlink(copy, copy_entry);
10735 			vm_map_copy_entry_dispose(copy_entry);
10736 
10737 			/*
10738 			 * we could try to push pages into the pmap at this point, BUT
10739 			 * this optimization only saved on average 2 us per page if ALL
10740 			 * the pages in the source were currently mapped
10741 			 * and ALL the pages in the dest were touched, if there were fewer
10742 			 * than 2/3 of the pages touched, this optimization actually cost more cycles
10743 			 * it also puts a lot of pressure on the pmap layer w/r to mapping structures
10744 			 */
10745 
10746 			/*
10747 			 *	Set up for the next iteration.  The map
10748 			 *	has not been unlocked, so the next
10749 			 *	address should be at the end of this
10750 			 *	entry, and the next map entry should be
10751 			 *	the one following it.
10752 			 */
10753 
10754 			start = tmp_entry->vme_end;
10755 			tmp_entry = tmp_entry->vme_next;
10756 		} else {
10757 			vm_map_version_t        version;
10758 			vm_object_t             dst_object;
10759 			vm_object_offset_t      dst_offset;
10760 			kern_return_t           r;
10761 
10762 slow_copy:
10763 			if (entry->needs_copy) {
10764 				VME_OBJECT_SHADOW(entry,
10765 				    (entry->vme_end -
10766 				    entry->vme_start),
10767 				    vm_map_always_shadow(dst_map));
10768 				entry->needs_copy = FALSE;
10769 			}
10770 
10771 			dst_object = VME_OBJECT(entry);
10772 			dst_offset = VME_OFFSET(entry);
10773 
10774 			/*
10775 			 *	Take an object reference, and record
10776 			 *	the map version information so that the
10777 			 *	map can be safely unlocked.
10778 			 */
10779 
10780 			if (dst_object == VM_OBJECT_NULL) {
10781 				/*
10782 				 * We would usually have just taken the
10783 				 * optimized path above if the destination
10784 				 * object has not been allocated yet.  But we
10785 				 * now disable that optimization if the copy
10786 				 * entry's object is not backed by anonymous
10787 				 * memory to avoid replacing malloc'ed
10788 				 * (i.e. re-usable) anonymous memory with a
10789 				 * not-so-anonymous mapping.
10790 				 * So we have to handle this case here and
10791 				 * allocate a new VM object for this map entry.
10792 				 */
10793 				dst_object = vm_object_allocate(
10794 					entry->vme_end - entry->vme_start);
10795 				dst_offset = 0;
10796 				VME_OBJECT_SET(entry, dst_object, false, 0);
10797 				VME_OFFSET_SET(entry, dst_offset);
10798 				assert(entry->use_pmap);
10799 			}
10800 
10801 			vm_object_reference(dst_object);
10802 
10803 			/* account for unlock bumping up timestamp */
10804 			version.main_timestamp = dst_map->timestamp + 1;
10805 
10806 			vm_map_unlock(dst_map);
10807 
10808 			/*
10809 			 *	Copy as much as possible in one pass
10810 			 */
10811 
10812 			copy_size = size;
10813 			r = vm_fault_copy(
10814 				VME_OBJECT(copy_entry),
10815 				VME_OFFSET(copy_entry),
10816 				&copy_size,
10817 				dst_object,
10818 				dst_offset,
10819 				dst_map,
10820 				&version,
10821 				THREAD_UNINT );
10822 
10823 			/*
10824 			 *	Release the object reference
10825 			 */
10826 
10827 			vm_object_deallocate(dst_object);
10828 
10829 			/*
10830 			 *	If a hard error occurred, return it now
10831 			 */
10832 
10833 			if (r != KERN_SUCCESS) {
10834 				return r;
10835 			}
10836 
10837 			if (copy_size != 0) {
10838 				/*
10839 				 *	Dispose of the copied region
10840 				 */
10841 
10842 				vm_map_copy_clip_end(copy, copy_entry,
10843 				    copy_entry->vme_start + copy_size);
10844 				vm_map_copy_entry_unlink(copy, copy_entry);
10845 				vm_object_deallocate(VME_OBJECT(copy_entry));
10846 				vm_map_copy_entry_dispose(copy_entry);
10847 			}
10848 
10849 			/*
10850 			 *	Pick up in the destination map where we left off.
10851 			 *
10852 			 *	Use the version information to avoid a lookup
10853 			 *	in the normal case.
10854 			 */
10855 
10856 			start += copy_size;
10857 			vm_map_lock(dst_map);
10858 			if (version.main_timestamp == dst_map->timestamp &&
10859 			    copy_size != 0) {
10860 				/* We can safely use saved tmp_entry value */
10861 
10862 				if (tmp_entry->map_aligned &&
10863 				    !VM_MAP_PAGE_ALIGNED(
10864 					    start,
10865 					    VM_MAP_PAGE_MASK(dst_map))) {
10866 					/* no longer map-aligned */
10867 					tmp_entry->map_aligned = FALSE;
10868 				}
10869 				vm_map_clip_end(dst_map, tmp_entry, start);
10870 				tmp_entry = tmp_entry->vme_next;
10871 			} else {
10872 				/* Must do lookup of tmp_entry */
10873 
10874 				if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
10875 					vm_map_unlock(dst_map);
10876 					return KERN_INVALID_ADDRESS;
10877 				}
10878 				if (tmp_entry->map_aligned &&
10879 				    !VM_MAP_PAGE_ALIGNED(
10880 					    start,
10881 					    VM_MAP_PAGE_MASK(dst_map))) {
10882 					/* no longer map-aligned */
10883 					tmp_entry->map_aligned = FALSE;
10884 				}
10885 				vm_map_clip_start(dst_map, tmp_entry, start);
10886 			}
10887 		}
10888 	}/* while */
10889 
10890 	return KERN_SUCCESS;
10891 }/* vm_map_copy_overwrite_aligned */
10892 
10893 /*
10894  *	Routine: vm_map_copyin_kernel_buffer [internal use only]
10895  *
10896  *	Description:
10897  *		Copy in data to a kernel buffer from space in the
10898  *		source map. The original space may be optionally
10899  *		deallocated.
10900  *
10901  *		If successful, returns a new copy object.
10902  */
10903 static kern_return_t
vm_map_copyin_kernel_buffer(vm_map_t src_map,vm_map_offset_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)10904 vm_map_copyin_kernel_buffer(
10905 	vm_map_t        src_map,
10906 	vm_map_offset_t src_addr,
10907 	vm_map_size_t   len,
10908 	boolean_t       src_destroy,
10909 	vm_map_copy_t   *copy_result)
10910 {
10911 	kern_return_t kr;
10912 	vm_map_copy_t copy;
10913 	void *kdata;
10914 
10915 	if (len > msg_ool_size_small) {
10916 		return KERN_INVALID_ARGUMENT;
10917 	}
10918 
10919 	kdata = kalloc_data(len, Z_WAITOK);
10920 	if (kdata == NULL) {
10921 		return KERN_RESOURCE_SHORTAGE;
10922 	}
10923 	kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
10924 	if (kr != KERN_SUCCESS) {
10925 		kfree_data(kdata, len);
10926 		return kr;
10927 	}
10928 
10929 	copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
10930 	copy->cpy_kdata = kdata;
10931 	copy->size = len;
10932 	copy->offset = 0;
10933 
10934 	if (src_destroy) {
10935 		vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
10936 
10937 		if (src_map == kernel_map) {
10938 			flags |= VM_MAP_REMOVE_KUNWIRE;
10939 		}
10940 
10941 		(void)vm_map_remove_guard(src_map,
10942 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
10943 		    vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
10944 		    flags, KMEM_GUARD_NONE);
10945 	}
10946 
10947 	*copy_result = copy;
10948 	return KERN_SUCCESS;
10949 }
10950 
10951 /*
10952  *	Routine: vm_map_copyout_kernel_buffer	[internal use only]
10953  *
10954  *	Description:
10955  *		Copy out data from a kernel buffer into space in the
10956  *		destination map. The space may be otpionally dynamically
10957  *		allocated.
10958  *
10959  *		If successful, consumes the copy object.
10960  *		Otherwise, the caller is responsible for it.
10961  *
10962  *		Callers of this function must call vm_map_copy_require on
10963  *		previously created vm_map_copy_t or pass a newly created
10964  *		one to ensure that it hasn't been forged.
10965  */
10966 static int vm_map_copyout_kernel_buffer_failures = 0;
10967 static kern_return_t
vm_map_copyout_kernel_buffer(vm_map_t map,vm_map_address_t * addr,vm_map_copy_t copy,vm_map_size_t copy_size,boolean_t overwrite,boolean_t consume_on_success)10968 vm_map_copyout_kernel_buffer(
10969 	vm_map_t                map,
10970 	vm_map_address_t        *addr,  /* IN/OUT */
10971 	vm_map_copy_t           copy,
10972 	vm_map_size_t           copy_size,
10973 	boolean_t               overwrite,
10974 	boolean_t               consume_on_success)
10975 {
10976 	kern_return_t kr = KERN_SUCCESS;
10977 	thread_t thread = current_thread();
10978 
10979 	assert(copy->size == copy_size);
10980 
10981 	/*
10982 	 * check for corrupted vm_map_copy structure
10983 	 */
10984 	if (copy_size > msg_ool_size_small || copy->offset) {
10985 		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
10986 		    (long long)copy->size, (long long)copy->offset);
10987 	}
10988 
10989 	if (!overwrite) {
10990 		/*
10991 		 * Allocate space in the target map for the data
10992 		 */
10993 		vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
10994 
10995 		if (map == kernel_map) {
10996 			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
10997 		}
10998 
10999 		*addr = 0;
11000 		kr = vm_map_enter(map,
11001 		    addr,
11002 		    vm_map_round_page(copy_size,
11003 		    VM_MAP_PAGE_MASK(map)),
11004 		    (vm_map_offset_t) 0,
11005 		    vmk_flags,
11006 		    VM_OBJECT_NULL,
11007 		    (vm_object_offset_t) 0,
11008 		    FALSE,
11009 		    VM_PROT_DEFAULT,
11010 		    VM_PROT_ALL,
11011 		    VM_INHERIT_DEFAULT);
11012 		if (kr != KERN_SUCCESS) {
11013 			return kr;
11014 		}
11015 #if KASAN
11016 		if (map->pmap == kernel_pmap) {
11017 			kasan_notify_address(*addr, copy->size);
11018 		}
11019 #endif
11020 	}
11021 
11022 	/*
11023 	 * Copyout the data from the kernel buffer to the target map.
11024 	 */
11025 	if (thread->map == map) {
11026 		/*
11027 		 * If the target map is the current map, just do
11028 		 * the copy.
11029 		 */
11030 		assert((vm_size_t)copy_size == copy_size);
11031 		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11032 			kr = KERN_INVALID_ADDRESS;
11033 		}
11034 	} else {
11035 		vm_map_t oldmap;
11036 
11037 		/*
11038 		 * If the target map is another map, assume the
11039 		 * target's address space identity for the duration
11040 		 * of the copy.
11041 		 */
11042 		vm_map_reference(map);
11043 		oldmap = vm_map_switch(map);
11044 
11045 		assert((vm_size_t)copy_size == copy_size);
11046 		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11047 			vm_map_copyout_kernel_buffer_failures++;
11048 			kr = KERN_INVALID_ADDRESS;
11049 		}
11050 
11051 		(void) vm_map_switch(oldmap);
11052 		vm_map_deallocate(map);
11053 	}
11054 
11055 	if (kr != KERN_SUCCESS) {
11056 		/* the copy failed, clean up */
11057 		if (!overwrite) {
11058 			/*
11059 			 * Deallocate the space we allocated in the target map.
11060 			 */
11061 			(void) vm_map_remove(map,
11062 			    vm_map_trunc_page(*addr,
11063 			    VM_MAP_PAGE_MASK(map)),
11064 			    vm_map_round_page((*addr +
11065 			    vm_map_round_page(copy_size,
11066 			    VM_MAP_PAGE_MASK(map))),
11067 			    VM_MAP_PAGE_MASK(map)));
11068 			*addr = 0;
11069 		}
11070 	} else {
11071 		/* copy was successful, dicard the copy structure */
11072 		if (consume_on_success) {
11073 			kfree_data(copy->cpy_kdata, copy_size);
11074 			zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11075 		}
11076 	}
11077 
11078 	return kr;
11079 }
11080 
11081 /*
11082  *	Routine:	vm_map_copy_insert      [internal use only]
11083  *
11084  *	Description:
11085  *		Link a copy chain ("copy") into a map at the
11086  *		specified location (after "where").
11087  *
11088  *		Callers of this function must call vm_map_copy_require on
11089  *		previously created vm_map_copy_t or pass a newly created
11090  *		one to ensure that it hasn't been forged.
11091  *	Side effects:
11092  *		The copy chain is destroyed.
11093  */
11094 static void
vm_map_copy_insert(vm_map_t map,vm_map_entry_t after_where,vm_map_copy_t copy)11095 vm_map_copy_insert(
11096 	vm_map_t        map,
11097 	vm_map_entry_t  after_where,
11098 	vm_map_copy_t   copy)
11099 {
11100 	vm_map_entry_t  entry;
11101 
11102 	while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11103 		entry = vm_map_copy_first_entry(copy);
11104 		vm_map_copy_entry_unlink(copy, entry);
11105 		vm_map_store_entry_link(map, after_where, entry,
11106 		    VM_MAP_KERNEL_FLAGS_NONE);
11107 		after_where = entry;
11108 	}
11109 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11110 }
11111 
11112 /*
11113  * Callers of this function must call vm_map_copy_require on
11114  * previously created vm_map_copy_t or pass a newly created
11115  * one to ensure that it hasn't been forged.
11116  */
11117 void
vm_map_copy_remap(vm_map_t map,vm_map_entry_t where,vm_map_copy_t copy,vm_map_offset_t adjustment,vm_prot_t cur_prot,vm_prot_t max_prot,vm_inherit_t inheritance)11118 vm_map_copy_remap(
11119 	vm_map_t        map,
11120 	vm_map_entry_t  where,
11121 	vm_map_copy_t   copy,
11122 	vm_map_offset_t adjustment,
11123 	vm_prot_t       cur_prot,
11124 	vm_prot_t       max_prot,
11125 	vm_inherit_t    inheritance)
11126 {
11127 	vm_map_entry_t  copy_entry, new_entry;
11128 
11129 	for (copy_entry = vm_map_copy_first_entry(copy);
11130 	    copy_entry != vm_map_copy_to_entry(copy);
11131 	    copy_entry = copy_entry->vme_next) {
11132 		/* get a new VM map entry for the map */
11133 		new_entry = vm_map_entry_create(map);
11134 		/* copy the "copy entry" to the new entry */
11135 		vm_map_entry_copy(map, new_entry, copy_entry);
11136 		/* adjust "start" and "end" */
11137 		new_entry->vme_start += adjustment;
11138 		new_entry->vme_end += adjustment;
11139 		/* clear some attributes */
11140 		new_entry->inheritance = inheritance;
11141 		new_entry->protection = cur_prot;
11142 		new_entry->max_protection = max_prot;
11143 		new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11144 		/* take an extra reference on the entry's "object" */
11145 		if (new_entry->is_sub_map) {
11146 			assert(!new_entry->use_pmap); /* not nested */
11147 			vm_map_reference(VME_SUBMAP(new_entry));
11148 		} else {
11149 			vm_object_reference(VME_OBJECT(new_entry));
11150 		}
11151 		/* insert the new entry in the map */
11152 		vm_map_store_entry_link(map, where, new_entry,
11153 		    VM_MAP_KERNEL_FLAGS_NONE);
11154 		/* continue inserting the "copy entries" after the new entry */
11155 		where = new_entry;
11156 	}
11157 }
11158 
11159 
11160 /*
11161  * Returns true if *size matches (or is in the range of) copy->size.
11162  * Upon returning true, the *size field is updated with the actual size of the
11163  * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11164  */
11165 boolean_t
vm_map_copy_validate_size(vm_map_t dst_map,vm_map_copy_t copy,vm_map_size_t * size)11166 vm_map_copy_validate_size(
11167 	vm_map_t                dst_map,
11168 	vm_map_copy_t           copy,
11169 	vm_map_size_t           *size)
11170 {
11171 	if (copy == VM_MAP_COPY_NULL) {
11172 		return FALSE;
11173 	}
11174 
11175 	/*
11176 	 * Assert that the vm_map_copy is coming from the right
11177 	 * zone and hasn't been forged
11178 	 */
11179 	vm_map_copy_require(copy);
11180 
11181 	vm_map_size_t copy_sz = copy->size;
11182 	vm_map_size_t sz = *size;
11183 	switch (copy->type) {
11184 	case VM_MAP_COPY_KERNEL_BUFFER:
11185 		if (sz == copy_sz) {
11186 			return TRUE;
11187 		}
11188 		break;
11189 	case VM_MAP_COPY_ENTRY_LIST:
11190 		/*
11191 		 * potential page-size rounding prevents us from exactly
11192 		 * validating this flavor of vm_map_copy, but we can at least
11193 		 * assert that it's within a range.
11194 		 */
11195 		if (copy_sz >= sz &&
11196 		    copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11197 			*size = copy_sz;
11198 			return TRUE;
11199 		}
11200 		break;
11201 	default:
11202 		break;
11203 	}
11204 	return FALSE;
11205 }
11206 
11207 /*
11208  *	Routine:	vm_map_copyout_size
11209  *
11210  *	Description:
11211  *		Copy out a copy chain ("copy") into newly-allocated
11212  *		space in the destination map. Uses a prevalidated
11213  *		size for the copy object (vm_map_copy_validate_size).
11214  *
11215  *		If successful, consumes the copy object.
11216  *		Otherwise, the caller is responsible for it.
11217  */
11218 kern_return_t
vm_map_copyout_size(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_t copy_size)11219 vm_map_copyout_size(
11220 	vm_map_t                dst_map,
11221 	vm_map_address_t        *dst_addr,      /* OUT */
11222 	vm_map_copy_t           copy,
11223 	vm_map_size_t           copy_size)
11224 {
11225 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11226 	           TRUE,                     /* consume_on_success */
11227 	           VM_PROT_DEFAULT,
11228 	           VM_PROT_ALL,
11229 	           VM_INHERIT_DEFAULT);
11230 }
11231 
11232 /*
11233  *	Routine:	vm_map_copyout
11234  *
11235  *	Description:
11236  *		Copy out a copy chain ("copy") into newly-allocated
11237  *		space in the destination map.
11238  *
11239  *		If successful, consumes the copy object.
11240  *		Otherwise, the caller is responsible for it.
11241  */
11242 kern_return_t
vm_map_copyout(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy)11243 vm_map_copyout(
11244 	vm_map_t                dst_map,
11245 	vm_map_address_t        *dst_addr,      /* OUT */
11246 	vm_map_copy_t           copy)
11247 {
11248 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11249 	           TRUE,                     /* consume_on_success */
11250 	           VM_PROT_DEFAULT,
11251 	           VM_PROT_ALL,
11252 	           VM_INHERIT_DEFAULT);
11253 }
11254 
11255 kern_return_t
vm_map_copyout_internal(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_t copy_size,boolean_t consume_on_success,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)11256 vm_map_copyout_internal(
11257 	vm_map_t                dst_map,
11258 	vm_map_address_t        *dst_addr,      /* OUT */
11259 	vm_map_copy_t           copy,
11260 	vm_map_size_t           copy_size,
11261 	boolean_t               consume_on_success,
11262 	vm_prot_t               cur_protection,
11263 	vm_prot_t               max_protection,
11264 	vm_inherit_t            inheritance)
11265 {
11266 	vm_map_size_t           size;
11267 	vm_map_size_t           adjustment;
11268 	vm_map_offset_t         start;
11269 	vm_object_offset_t      vm_copy_start;
11270 	vm_map_entry_t          last;
11271 	vm_map_entry_t          entry;
11272 	vm_map_copy_t           original_copy;
11273 	kern_return_t           kr;
11274 	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11275 
11276 	/*
11277 	 *	Check for null copy object.
11278 	 */
11279 
11280 	if (copy == VM_MAP_COPY_NULL) {
11281 		*dst_addr = 0;
11282 		return KERN_SUCCESS;
11283 	}
11284 
11285 	/*
11286 	 * Assert that the vm_map_copy is coming from the right
11287 	 * zone and hasn't been forged
11288 	 */
11289 	vm_map_copy_require(copy);
11290 
11291 	if (copy->size != copy_size) {
11292 		*dst_addr = 0;
11293 		return KERN_FAILURE;
11294 	}
11295 
11296 	/*
11297 	 *	Check for special kernel buffer allocated
11298 	 *	by new_ipc_kmsg_copyin.
11299 	 */
11300 
11301 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
11302 		return vm_map_copyout_kernel_buffer(dst_map, dst_addr,
11303 		           copy, copy_size, FALSE,
11304 		           consume_on_success);
11305 	}
11306 
11307 	original_copy = copy;
11308 	if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
11309 		vm_map_copy_t target_copy;
11310 		vm_map_offset_t overmap_start, overmap_end, trimmed_start;
11311 
11312 		target_copy = VM_MAP_COPY_NULL;
11313 		DEBUG4K_ADJUST("adjusting...\n");
11314 		kr = vm_map_copy_adjust_to_target(
11315 			copy,
11316 			0, /* offset */
11317 			copy->size, /* size */
11318 			dst_map,
11319 			TRUE, /* copy */
11320 			&target_copy,
11321 			&overmap_start,
11322 			&overmap_end,
11323 			&trimmed_start);
11324 		if (kr != KERN_SUCCESS) {
11325 			DEBUG4K_COPY("adjust failed 0x%x\n", kr);
11326 			return kr;
11327 		}
11328 		DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
11329 		if (target_copy != copy) {
11330 			copy = target_copy;
11331 		}
11332 		copy_size = copy->size;
11333 	}
11334 
11335 	/*
11336 	 *	Find space for the data
11337 	 */
11338 
11339 	vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
11340 	    VM_MAP_COPY_PAGE_MASK(copy));
11341 	size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
11342 	    VM_MAP_COPY_PAGE_MASK(copy))
11343 	    - vm_copy_start;
11344 
11345 	vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map);
11346 
11347 	vm_map_lock(dst_map);
11348 	kr = vm_map_locate_space(dst_map, size, 0, vmk_flags,
11349 	    &start, &last);
11350 	if (kr != KERN_SUCCESS) {
11351 		vm_map_unlock(dst_map);
11352 		return kr;
11353 	}
11354 
11355 	adjustment = start - vm_copy_start;
11356 	if (!consume_on_success) {
11357 		/*
11358 		 * We're not allowed to consume "copy", so we'll have to
11359 		 * copy its map entries into the destination map below.
11360 		 * No need to re-allocate map entries from the correct
11361 		 * (pageable or not) zone, since we'll get new map entries
11362 		 * during the transfer.
11363 		 * We'll also adjust the map entries's "start" and "end"
11364 		 * during the transfer, to keep "copy"'s entries consistent
11365 		 * with its "offset".
11366 		 */
11367 		goto after_adjustments;
11368 	}
11369 
11370 	/*
11371 	 *	Since we're going to just drop the map
11372 	 *	entries from the copy into the destination
11373 	 *	map, they must come from the same pool.
11374 	 */
11375 
11376 	if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
11377 		/*
11378 		 * Mismatches occur when dealing with the default
11379 		 * pager.
11380 		 */
11381 		vm_map_entry_t  next, new;
11382 
11383 		/*
11384 		 * Find the zone that the copies were allocated from
11385 		 */
11386 
11387 		entry = vm_map_copy_first_entry(copy);
11388 
11389 		/*
11390 		 * Reinitialize the copy so that vm_map_copy_entry_link
11391 		 * will work.
11392 		 */
11393 		vm_map_store_copy_reset(copy, entry);
11394 		copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;
11395 
11396 		/*
11397 		 * Copy each entry.
11398 		 */
11399 		while (entry != vm_map_copy_to_entry(copy)) {
11400 			new = vm_map_copy_entry_create(copy);
11401 			vm_map_entry_copy_full(new, entry);
11402 			new->vme_no_copy_on_read = FALSE;
11403 			assert(!new->iokit_acct);
11404 			if (new->is_sub_map) {
11405 				/* clr address space specifics */
11406 				new->use_pmap = FALSE;
11407 			}
11408 			vm_map_copy_entry_link(copy,
11409 			    vm_map_copy_last_entry(copy),
11410 			    new);
11411 			next = entry->vme_next;
11412 			vm_map_entry_dispose(entry);
11413 			entry = next;
11414 		}
11415 	}
11416 
11417 	/*
11418 	 *	Adjust the addresses in the copy chain, and
11419 	 *	reset the region attributes.
11420 	 */
11421 
11422 	for (entry = vm_map_copy_first_entry(copy);
11423 	    entry != vm_map_copy_to_entry(copy);
11424 	    entry = entry->vme_next) {
11425 		if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
11426 			/*
11427 			 * We're injecting this copy entry into a map that
11428 			 * has the standard page alignment, so clear
11429 			 * "map_aligned" (which might have been inherited
11430 			 * from the original map entry).
11431 			 */
11432 			entry->map_aligned = FALSE;
11433 		}
11434 
11435 		entry->vme_start += adjustment;
11436 		entry->vme_end += adjustment;
11437 
11438 		if (entry->map_aligned) {
11439 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
11440 			    VM_MAP_PAGE_MASK(dst_map)));
11441 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
11442 			    VM_MAP_PAGE_MASK(dst_map)));
11443 		}
11444 
11445 		entry->inheritance = VM_INHERIT_DEFAULT;
11446 		entry->protection = VM_PROT_DEFAULT;
11447 		entry->max_protection = VM_PROT_ALL;
11448 		entry->behavior = VM_BEHAVIOR_DEFAULT;
11449 
11450 		/*
11451 		 * If the entry is now wired,
11452 		 * map the pages into the destination map.
11453 		 */
11454 		if (entry->wired_count != 0) {
11455 			vm_map_offset_t va;
11456 			vm_object_offset_t       offset;
11457 			vm_object_t object;
11458 			vm_prot_t prot;
11459 			int     type_of_fault;
11460 
11461 			/* TODO4K would need to use actual page size */
11462 			assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);
11463 
11464 			object = VME_OBJECT(entry);
11465 			offset = VME_OFFSET(entry);
11466 			va = entry->vme_start;
11467 
11468 			pmap_pageable(dst_map->pmap,
11469 			    entry->vme_start,
11470 			    entry->vme_end,
11471 			    TRUE);
11472 
11473 			while (va < entry->vme_end) {
11474 				vm_page_t       m;
11475 				struct vm_object_fault_info fault_info = {};
11476 
11477 				/*
11478 				 * Look up the page in the object.
11479 				 * Assert that the page will be found in the
11480 				 * top object:
11481 				 * either
11482 				 *	the object was newly created by
11483 				 *	vm_object_copy_slowly, and has
11484 				 *	copies of all of the pages from
11485 				 *	the source object
11486 				 * or
11487 				 *	the object was moved from the old
11488 				 *	map entry; because the old map
11489 				 *	entry was wired, all of the pages
11490 				 *	were in the top-level object.
11491 				 *	(XXX not true if we wire pages for
11492 				 *	 reading)
11493 				 */
11494 				vm_object_lock(object);
11495 
11496 				m = vm_page_lookup(object, offset);
11497 				if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
11498 				    m->vmp_absent) {
11499 					panic("vm_map_copyout: wiring %p", m);
11500 				}
11501 
11502 				prot = entry->protection;
11503 
11504 				if (override_nx(dst_map, VME_ALIAS(entry)) &&
11505 				    prot) {
11506 					prot |= VM_PROT_EXECUTE;
11507 				}
11508 
11509 				type_of_fault = DBG_CACHE_HIT_FAULT;
11510 
11511 				fault_info.user_tag = VME_ALIAS(entry);
11512 				fault_info.pmap_options = 0;
11513 				if (entry->iokit_acct ||
11514 				    (!entry->is_sub_map && !entry->use_pmap)) {
11515 					fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
11516 				}
11517 				if (entry->vme_xnu_user_debug &&
11518 				    !VM_PAGE_OBJECT(m)->code_signed) {
11519 					/*
11520 					 * Modified code-signed executable
11521 					 * region: this page does not belong
11522 					 * to a code-signed VM object, so it
11523 					 * must have been copied and should
11524 					 * therefore be typed XNU_USER_DEBUG
11525 					 * rather than XNU_USER_EXEC.
11526 					 */
11527 					fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
11528 				}
11529 
11530 				vm_fault_enter(m,
11531 				    dst_map->pmap,
11532 				    va,
11533 				    PAGE_SIZE, 0,
11534 				    prot,
11535 				    prot,
11536 				    VM_PAGE_WIRED(m),
11537 				    FALSE,            /* change_wiring */
11538 				    VM_KERN_MEMORY_NONE,            /* tag - not wiring */
11539 				    &fault_info,
11540 				    NULL,             /* need_retry */
11541 				    &type_of_fault);
11542 
11543 				vm_object_unlock(object);
11544 
11545 				offset += PAGE_SIZE_64;
11546 				va += PAGE_SIZE;
11547 			}
11548 		}
11549 	}
11550 
11551 after_adjustments:
11552 
11553 	/*
11554 	 *	Correct the page alignment for the result
11555 	 */
11556 
11557 	*dst_addr = start + (copy->offset - vm_copy_start);
11558 
11559 #if KASAN
11560 	kasan_notify_address(*dst_addr, size);
11561 #endif
11562 
11563 	/*
11564 	 *	Update the hints and the map size
11565 	 */
11566 
11567 	if (consume_on_success) {
11568 		SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
11569 	} else {
11570 		SAVE_HINT_MAP_WRITE(dst_map, last);
11571 	}
11572 
11573 	dst_map->size += size;
11574 
11575 	/*
11576 	 *	Link in the copy
11577 	 */
11578 
11579 	if (consume_on_success) {
11580 		vm_map_copy_insert(dst_map, last, copy);
11581 		if (copy != original_copy) {
11582 			vm_map_copy_discard(original_copy);
11583 			original_copy = VM_MAP_COPY_NULL;
11584 		}
11585 	} else {
11586 		vm_map_copy_remap(dst_map, last, copy, adjustment,
11587 		    cur_protection, max_protection,
11588 		    inheritance);
11589 		if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
11590 			vm_map_copy_discard(copy);
11591 			copy = original_copy;
11592 		}
11593 	}
11594 
11595 
11596 	vm_map_unlock(dst_map);
11597 
11598 	/*
11599 	 * XXX	If wiring_required, call vm_map_pageable
11600 	 */
11601 
11602 	return KERN_SUCCESS;
11603 }
11604 
11605 /*
11606  *	Routine:	vm_map_copyin
11607  *
11608  *	Description:
11609  *		see vm_map_copyin_common.  Exported via Unsupported.exports.
11610  *
11611  */
11612 
11613 #undef vm_map_copyin
11614 
11615 kern_return_t
vm_map_copyin(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)11616 vm_map_copyin(
11617 	vm_map_t                        src_map,
11618 	vm_map_address_t        src_addr,
11619 	vm_map_size_t           len,
11620 	boolean_t                       src_destroy,
11621 	vm_map_copy_t           *copy_result)   /* OUT */
11622 {
11623 	return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11624 	           FALSE, copy_result, FALSE);
11625 }
11626 
11627 /*
11628  *	Routine:	vm_map_copyin_common
11629  *
11630  *	Description:
11631  *		Copy the specified region (src_addr, len) from the
11632  *		source address space (src_map), possibly removing
11633  *		the region from the source address space (src_destroy).
11634  *
11635  *	Returns:
11636  *		A vm_map_copy_t object (copy_result), suitable for
11637  *		insertion into another address space (using vm_map_copyout),
11638  *		copying over another address space region (using
11639  *		vm_map_copy_overwrite).  If the copy is unused, it
11640  *		should be destroyed (using vm_map_copy_discard).
11641  *
11642  *	In/out conditions:
11643  *		The source map should not be locked on entry.
11644  */
11645 
11646 typedef struct submap_map {
11647 	vm_map_t        parent_map;
11648 	vm_map_offset_t base_start;
11649 	vm_map_offset_t base_end;
11650 	vm_map_size_t   base_len;
11651 	struct submap_map *next;
11652 } submap_map_t;
11653 
11654 kern_return_t
vm_map_copyin_common(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t src_destroy,__unused boolean_t src_volatile,vm_map_copy_t * copy_result,boolean_t use_maxprot)11655 vm_map_copyin_common(
11656 	vm_map_t        src_map,
11657 	vm_map_address_t src_addr,
11658 	vm_map_size_t   len,
11659 	boolean_t       src_destroy,
11660 	__unused boolean_t      src_volatile,
11661 	vm_map_copy_t   *copy_result,   /* OUT */
11662 	boolean_t       use_maxprot)
11663 {
11664 	int flags;
11665 
11666 	flags = 0;
11667 	if (src_destroy) {
11668 		flags |= VM_MAP_COPYIN_SRC_DESTROY;
11669 	}
11670 	if (use_maxprot) {
11671 		flags |= VM_MAP_COPYIN_USE_MAXPROT;
11672 	}
11673 	return vm_map_copyin_internal(src_map,
11674 	           src_addr,
11675 	           len,
11676 	           flags,
11677 	           copy_result);
11678 }
11679 kern_return_t
vm_map_copyin_internal(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,int flags,vm_map_copy_t * copy_result)11680 vm_map_copyin_internal(
11681 	vm_map_t        src_map,
11682 	vm_map_address_t src_addr,
11683 	vm_map_size_t   len,
11684 	int             flags,
11685 	vm_map_copy_t   *copy_result)   /* OUT */
11686 {
11687 	vm_map_entry_t  tmp_entry;      /* Result of last map lookup --
11688 	                                 * in multi-level lookup, this
11689 	                                 * entry contains the actual
11690 	                                 * vm_object/offset.
11691 	                                 */
11692 	vm_map_entry_t  new_entry = VM_MAP_ENTRY_NULL;  /* Map entry for copy */
11693 
11694 	vm_map_offset_t src_start;      /* Start of current entry --
11695 	                                 * where copy is taking place now
11696 	                                 */
11697 	vm_map_offset_t src_end;        /* End of entire region to be
11698 	                                 * copied */
11699 	vm_map_offset_t src_base;
11700 	vm_map_t        base_map = src_map;
11701 	boolean_t       map_share = FALSE;
11702 	submap_map_t    *parent_maps = NULL;
11703 
11704 	vm_map_copy_t   copy;           /* Resulting copy */
11705 	vm_map_address_t copy_addr;
11706 	vm_map_size_t   copy_size;
11707 	boolean_t       src_destroy;
11708 	boolean_t       use_maxprot;
11709 	boolean_t       preserve_purgeable;
11710 	boolean_t       entry_was_shared;
11711 	vm_map_entry_t  saved_src_entry;
11712 
11713 	if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
11714 		return KERN_INVALID_ARGUMENT;
11715 	}
11716 
11717 #if CONFIG_KERNEL_TBI
11718 	if (src_map->pmap == kernel_pmap) {
11719 		src_addr = VM_KERNEL_TBI_FILL(src_addr);
11720 	}
11721 #endif /* CONFIG_KERNEL_TBI && KASAN_TBI */
11722 
11723 	src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
11724 	use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
11725 	preserve_purgeable =
11726 	    (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
11727 
11728 	/*
11729 	 *	Check for copies of zero bytes.
11730 	 */
11731 
11732 	if (len == 0) {
11733 		*copy_result = VM_MAP_COPY_NULL;
11734 		return KERN_SUCCESS;
11735 	}
11736 
11737 	/*
11738 	 *	Check that the end address doesn't overflow
11739 	 */
11740 	if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
11741 		return KERN_INVALID_ADDRESS;
11742 	}
11743 	src_end = src_addr + len;
11744 	if (src_end < src_addr) {
11745 		return KERN_INVALID_ADDRESS;
11746 	}
11747 
11748 	/*
11749 	 *	Compute (page aligned) start and end of region
11750 	 */
11751 	src_start = vm_map_trunc_page(src_addr,
11752 	    VM_MAP_PAGE_MASK(src_map));
11753 	src_end = vm_map_round_page(src_end,
11754 	    VM_MAP_PAGE_MASK(src_map));
11755 	if (src_end < src_addr) {
11756 		return KERN_INVALID_ADDRESS;
11757 	}
11758 
11759 	/*
11760 	 * If the copy is sufficiently small, use a kernel buffer instead
11761 	 * of making a virtual copy.  The theory being that the cost of
11762 	 * setting up VM (and taking C-O-W faults) dominates the copy costs
11763 	 * for small regions.
11764 	 */
11765 	if ((len <= msg_ool_size_small) &&
11766 	    !use_maxprot &&
11767 	    !preserve_purgeable &&
11768 	    !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
11769 	    /*
11770 	     * Since the "msg_ool_size_small" threshold was increased and
11771 	     * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
11772 	     * address space limits, we revert to doing a virtual copy if the
11773 	     * copied range goes beyond those limits.  Otherwise, mach_vm_read()
11774 	     * of the commpage would now fail when it used to work.
11775 	     */
11776 	    (src_start >= vm_map_min(src_map) &&
11777 	    src_start < vm_map_max(src_map) &&
11778 	    src_end >= vm_map_min(src_map) &&
11779 	    src_end < vm_map_max(src_map))) {
11780 		return vm_map_copyin_kernel_buffer(src_map, src_addr, len,
11781 		           src_destroy, copy_result);
11782 	}
11783 
11784 	/*
11785 	 *	Allocate a header element for the list.
11786 	 *
11787 	 *	Use the start and end in the header to
11788 	 *	remember the endpoints prior to rounding.
11789 	 */
11790 
11791 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
11792 	copy->cpy_hdr.entries_pageable = TRUE;
11793 	copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
11794 	copy->offset = src_addr;
11795 	copy->size = len;
11796 
11797 	new_entry = vm_map_copy_entry_create(copy);
11798 
11799 #define RETURN(x)                                               \
11800 	MACRO_BEGIN                                             \
11801 	vm_map_unlock(src_map);                                 \
11802 	if(src_map != base_map)                                 \
11803 	        vm_map_deallocate(src_map);                     \
11804 	if (new_entry != VM_MAP_ENTRY_NULL)                     \
11805 	        vm_map_copy_entry_dispose(new_entry);           \
11806 	vm_map_copy_discard(copy);                              \
11807 	{                                                       \
11808 	        submap_map_t	*_ptr;                          \
11809                                                                 \
11810 	        for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
11811 	                parent_maps=parent_maps->next;          \
11812 	                if (_ptr->parent_map != base_map)       \
11813 	                        vm_map_deallocate(_ptr->parent_map);    \
11814 	                kfree_type(submap_map_t, _ptr);         \
11815 	        }                                               \
11816 	}                                                       \
11817 	MACRO_RETURN(x);                                        \
11818 	MACRO_END
11819 
11820 	/*
11821 	 *	Find the beginning of the region.
11822 	 */
11823 
11824 	vm_map_lock(src_map);
11825 
11826 	/*
11827 	 * Lookup the original "src_addr" rather than the truncated
11828 	 * "src_start", in case "src_start" falls in a non-map-aligned
11829 	 * map entry *before* the map entry that contains "src_addr"...
11830 	 */
11831 	if (!vm_map_lookup_entry(src_map, src_addr, &tmp_entry)) {
11832 		RETURN(KERN_INVALID_ADDRESS);
11833 	}
11834 	if (!tmp_entry->is_sub_map) {
11835 		/*
11836 		 * ... but clip to the map-rounded "src_start" rather than
11837 		 * "src_addr" to preserve map-alignment.  We'll adjust the
11838 		 * first copy entry at the end, if needed.
11839 		 */
11840 		vm_map_clip_start(src_map, tmp_entry, src_start);
11841 	}
11842 	if (src_start < tmp_entry->vme_start) {
11843 		/*
11844 		 * Move "src_start" up to the start of the
11845 		 * first map entry to copy.
11846 		 */
11847 		src_start = tmp_entry->vme_start;
11848 	}
11849 	/* set for later submap fix-up */
11850 	copy_addr = src_start;
11851 
11852 	/*
11853 	 *	Go through entries until we get to the end.
11854 	 */
11855 
11856 	while (TRUE) {
11857 		vm_map_entry_t  src_entry = tmp_entry;  /* Top-level entry */
11858 		vm_map_size_t   src_size;               /* Size of source
11859 		                                         * map entry (in both
11860 		                                         * maps)
11861 		                                         */
11862 
11863 		vm_object_t             src_object;     /* Object to copy */
11864 		vm_object_offset_t      src_offset;
11865 
11866 		vm_object_t             new_copy_object;/* vm_object_copy_* result */
11867 
11868 		boolean_t       src_needs_copy;         /* Should source map
11869 		                                         * be made read-only
11870 		                                         * for copy-on-write?
11871 		                                         */
11872 
11873 		boolean_t       new_entry_needs_copy;   /* Will new entry be COW? */
11874 
11875 		boolean_t       was_wired;              /* Was source wired? */
11876 		boolean_t       saved_used_for_jit;     /* Saved used_for_jit. */
11877 #if __arm64e__
11878 		boolean_t       saved_used_for_tpro;    /* Saved used_for_tpro */
11879 #endif
11880 		vm_map_version_t version;               /* Version before locks
11881 		                                         * dropped to make copy
11882 		                                         */
11883 		kern_return_t   result;                 /* Return value from
11884 		                                         * copy_strategically.
11885 		                                         */
11886 		while (tmp_entry->is_sub_map) {
11887 			vm_map_size_t submap_len;
11888 			submap_map_t *ptr;
11889 
11890 			ptr = kalloc_type(submap_map_t, Z_WAITOK);
11891 			ptr->next = parent_maps;
11892 			parent_maps = ptr;
11893 			ptr->parent_map = src_map;
11894 			ptr->base_start = src_start;
11895 			ptr->base_end = src_end;
11896 			submap_len = tmp_entry->vme_end - src_start;
11897 			if (submap_len > (src_end - src_start)) {
11898 				submap_len = src_end - src_start;
11899 			}
11900 			ptr->base_len = submap_len;
11901 
11902 			src_start -= tmp_entry->vme_start;
11903 			src_start += VME_OFFSET(tmp_entry);
11904 			src_end = src_start + submap_len;
11905 			src_map = VME_SUBMAP(tmp_entry);
11906 			vm_map_lock(src_map);
11907 			/* keep an outstanding reference for all maps in */
11908 			/* the parents tree except the base map */
11909 			vm_map_reference(src_map);
11910 			vm_map_unlock(ptr->parent_map);
11911 			if (!vm_map_lookup_entry(
11912 				    src_map, src_start, &tmp_entry)) {
11913 				RETURN(KERN_INVALID_ADDRESS);
11914 			}
11915 			map_share = TRUE;
11916 			if (!tmp_entry->is_sub_map) {
11917 				vm_map_clip_start(src_map, tmp_entry, src_start);
11918 			}
11919 			src_entry = tmp_entry;
11920 		}
11921 		/* we are now in the lowest level submap... */
11922 
11923 		if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
11924 		    (VME_OBJECT(tmp_entry)->phys_contiguous)) {
11925 			/* This is not, supported for now.In future */
11926 			/* we will need to detect the phys_contig   */
11927 			/* condition and then upgrade copy_slowly   */
11928 			/* to do physical copy from the device mem  */
11929 			/* based object. We can piggy-back off of   */
11930 			/* the was wired boolean to set-up the      */
11931 			/* proper handling */
11932 			RETURN(KERN_PROTECTION_FAILURE);
11933 		}
11934 		/*
11935 		 *	Create a new address map entry to hold the result.
11936 		 *	Fill in the fields from the appropriate source entries.
11937 		 *	We must unlock the source map to do this if we need
11938 		 *	to allocate a map entry.
11939 		 */
11940 		if (new_entry == VM_MAP_ENTRY_NULL) {
11941 			version.main_timestamp = src_map->timestamp;
11942 			vm_map_unlock(src_map);
11943 
11944 			new_entry = vm_map_copy_entry_create(copy);
11945 
11946 			vm_map_lock(src_map);
11947 			if ((version.main_timestamp + 1) != src_map->timestamp) {
11948 				if (!vm_map_lookup_entry(src_map, src_start,
11949 				    &tmp_entry)) {
11950 					RETURN(KERN_INVALID_ADDRESS);
11951 				}
11952 				if (!tmp_entry->is_sub_map) {
11953 					vm_map_clip_start(src_map, tmp_entry, src_start);
11954 				}
11955 				continue; /* restart w/ new tmp_entry */
11956 			}
11957 		}
11958 
11959 		/*
11960 		 *	Verify that the region can be read.
11961 		 */
11962 		if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
11963 		    !use_maxprot) ||
11964 		    (src_entry->max_protection & VM_PROT_READ) == 0) {
11965 			RETURN(KERN_PROTECTION_FAILURE);
11966 		}
11967 
11968 		/*
11969 		 *	Clip against the endpoints of the entire region.
11970 		 */
11971 
11972 		vm_map_clip_end(src_map, src_entry, src_end);
11973 
11974 		src_size = src_entry->vme_end - src_start;
11975 		src_object = VME_OBJECT(src_entry);
11976 		src_offset = VME_OFFSET(src_entry);
11977 		was_wired = (src_entry->wired_count != 0);
11978 
11979 		vm_map_entry_copy(src_map, new_entry, src_entry);
11980 		if (new_entry->is_sub_map) {
11981 			/* clr address space specifics */
11982 			new_entry->use_pmap = FALSE;
11983 		} else {
11984 			/*
11985 			 * We're dealing with a copy-on-write operation,
11986 			 * so the resulting mapping should not inherit the
11987 			 * original mapping's accounting settings.
11988 			 * "iokit_acct" should have been cleared in
11989 			 * vm_map_entry_copy().
11990 			 * "use_pmap" should be reset to its default (TRUE)
11991 			 * so that the new mapping gets accounted for in
11992 			 * the task's memory footprint.
11993 			 */
11994 			assert(!new_entry->iokit_acct);
11995 			new_entry->use_pmap = TRUE;
11996 		}
11997 
11998 		/*
11999 		 *	Attempt non-blocking copy-on-write optimizations.
12000 		 */
12001 
12002 		/*
12003 		 * If we are destroying the source, and the object
12004 		 * is internal, we could move the object reference
12005 		 * from the source to the copy.  The copy is
12006 		 * copy-on-write only if the source is.
12007 		 * We make another reference to the object, because
12008 		 * destroying the source entry will deallocate it.
12009 		 *
12010 		 * This memory transfer has to be atomic, (to prevent
12011 		 * the VM object from being shared or copied while
12012 		 * it's being moved here), so we could only do this
12013 		 * if we won't have to unlock the VM map until the
12014 		 * original mapping has been fully removed.
12015 		 */
12016 
12017 RestartCopy:
12018 		if ((src_object == VM_OBJECT_NULL ||
12019 		    (!was_wired && !map_share && !tmp_entry->is_shared
12020 		    && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
12021 		    vm_object_copy_quickly(
12022 			    VME_OBJECT(new_entry),
12023 			    src_offset,
12024 			    src_size,
12025 			    &src_needs_copy,
12026 			    &new_entry_needs_copy)) {
12027 			new_entry->needs_copy = new_entry_needs_copy;
12028 
12029 			/*
12030 			 *	Handle copy-on-write obligations
12031 			 */
12032 
12033 			if (src_needs_copy && !tmp_entry->needs_copy) {
12034 				vm_prot_t prot;
12035 
12036 				prot = src_entry->protection & ~VM_PROT_WRITE;
12037 
12038 				if (override_nx(src_map, VME_ALIAS(src_entry))
12039 				    && prot) {
12040 					prot |= VM_PROT_EXECUTE;
12041 				}
12042 
12043 				vm_object_pmap_protect(
12044 					src_object,
12045 					src_offset,
12046 					src_size,
12047 					(src_entry->is_shared ?
12048 					PMAP_NULL
12049 					: src_map->pmap),
12050 					VM_MAP_PAGE_SIZE(src_map),
12051 					src_entry->vme_start,
12052 					prot);
12053 
12054 				assert(tmp_entry->wired_count == 0);
12055 				tmp_entry->needs_copy = TRUE;
12056 			}
12057 
12058 			/*
12059 			 *	The map has never been unlocked, so it's safe
12060 			 *	to move to the next entry rather than doing
12061 			 *	another lookup.
12062 			 */
12063 
12064 			goto CopySuccessful;
12065 		}
12066 
12067 		entry_was_shared = tmp_entry->is_shared;
12068 
12069 		/*
12070 		 *	Take an object reference, so that we may
12071 		 *	release the map lock(s).
12072 		 */
12073 
12074 		assert(src_object != VM_OBJECT_NULL);
12075 		vm_object_reference(src_object);
12076 
12077 		/*
12078 		 *	Record the timestamp for later verification.
12079 		 *	Unlock the map.
12080 		 */
12081 
12082 		version.main_timestamp = src_map->timestamp;
12083 		vm_map_unlock(src_map); /* Increments timestamp once! */
12084 		saved_src_entry = src_entry;
12085 		tmp_entry = VM_MAP_ENTRY_NULL;
12086 		src_entry = VM_MAP_ENTRY_NULL;
12087 
12088 		/*
12089 		 *	Perform the copy
12090 		 */
12091 
12092 		if (was_wired ||
12093 		    (debug4k_no_cow_copyin &&
12094 		    VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
12095 CopySlowly:
12096 			vm_object_lock(src_object);
12097 			result = vm_object_copy_slowly(
12098 				src_object,
12099 				src_offset,
12100 				src_size,
12101 				THREAD_UNINT,
12102 				&new_copy_object);
12103 			/* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
12104 			saved_used_for_jit = new_entry->used_for_jit;
12105 #if __arm64e__
12106 			saved_used_for_tpro = new_entry->used_for_tpro;
12107 #endif
12108 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12109 			new_entry->used_for_jit = saved_used_for_jit;
12110 #if __arm64e__
12111 			new_entry->used_for_tpro = saved_used_for_tpro;
12112 #endif
12113 			VME_OFFSET_SET(new_entry,
12114 			    src_offset - vm_object_trunc_page(src_offset));
12115 			new_entry->needs_copy = FALSE;
12116 		} else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
12117 		    (entry_was_shared || map_share)) {
12118 			vm_object_t new_object;
12119 
12120 			vm_object_lock_shared(src_object);
12121 			new_object = vm_object_copy_delayed(
12122 				src_object,
12123 				src_offset,
12124 				src_size,
12125 				TRUE);
12126 			if (new_object == VM_OBJECT_NULL) {
12127 				goto CopySlowly;
12128 			}
12129 
12130 			VME_OBJECT_SET(new_entry, new_object, false, 0);
12131 			assert(new_entry->wired_count == 0);
12132 			new_entry->needs_copy = TRUE;
12133 			assert(!new_entry->iokit_acct);
12134 			assert(new_object->purgable == VM_PURGABLE_DENY);
12135 			assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
12136 			result = KERN_SUCCESS;
12137 		} else {
12138 			vm_object_offset_t new_offset;
12139 			new_offset = VME_OFFSET(new_entry);
12140 			result = vm_object_copy_strategically(src_object,
12141 			    src_offset,
12142 			    src_size,
12143 			    &new_copy_object,
12144 			    &new_offset,
12145 			    &new_entry_needs_copy);
12146 			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
12147 			saved_used_for_jit = new_entry->used_for_jit;
12148 #if __arm64e__
12149 			saved_used_for_tpro = new_entry->used_for_tpro;
12150 #endif
12151 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12152 			new_entry->used_for_jit = saved_used_for_jit;
12153 #if __arm64e__
12154 			new_entry->used_for_tpro = saved_used_for_tpro;
12155 #endif
12156 			if (new_offset != VME_OFFSET(new_entry)) {
12157 				VME_OFFSET_SET(new_entry, new_offset);
12158 			}
12159 
12160 			new_entry->needs_copy = new_entry_needs_copy;
12161 		}
12162 
12163 		if (result == KERN_SUCCESS &&
12164 		    ((preserve_purgeable &&
12165 		    src_object->purgable != VM_PURGABLE_DENY) ||
12166 		    new_entry->used_for_jit
12167 #if __arm64e__
12168 		    || new_entry->used_for_tpro
12169 #endif
12170 		    )) {
12171 			/*
12172 			 * Purgeable objects should be COPY_NONE, true share;
12173 			 * this should be propogated to the copy.
12174 			 *
12175 			 * Also force mappings the pmap specially protects to
12176 			 * be COPY_NONE; trying to COW these mappings would
12177 			 * change the effective protections, which could have
12178 			 * side effects if the pmap layer relies on the
12179 			 * specified protections.
12180 			 */
12181 
12182 			vm_object_t     new_object;
12183 
12184 			new_object = VME_OBJECT(new_entry);
12185 			assert(new_object != src_object);
12186 			vm_object_lock(new_object);
12187 			assert(new_object->ref_count == 1);
12188 			assert(new_object->shadow == VM_OBJECT_NULL);
12189 			assert(new_object->copy == VM_OBJECT_NULL);
12190 			assert(new_object->vo_owner == NULL);
12191 
12192 			new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
12193 
12194 			if (preserve_purgeable &&
12195 			    src_object->purgable != VM_PURGABLE_DENY) {
12196 				new_object->true_share = TRUE;
12197 
12198 				/* start as non-volatile with no owner... */
12199 				new_object->purgable = VM_PURGABLE_NONVOLATILE;
12200 				vm_purgeable_nonvolatile_enqueue(new_object, NULL);
12201 				/* ... and move to src_object's purgeable state */
12202 				if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
12203 					int state;
12204 					state = src_object->purgable;
12205 					vm_object_purgable_control(
12206 						new_object,
12207 						VM_PURGABLE_SET_STATE_FROM_KERNEL,
12208 						&state);
12209 				}
12210 				/* no pmap accounting for purgeable objects */
12211 				new_entry->use_pmap = FALSE;
12212 			}
12213 
12214 			vm_object_unlock(new_object);
12215 			new_object = VM_OBJECT_NULL;
12216 		}
12217 
12218 		if (result != KERN_SUCCESS &&
12219 		    result != KERN_MEMORY_RESTART_COPY) {
12220 			vm_map_lock(src_map);
12221 			RETURN(result);
12222 		}
12223 
12224 		/*
12225 		 *	Throw away the extra reference
12226 		 */
12227 
12228 		vm_object_deallocate(src_object);
12229 
12230 		/*
12231 		 *	Verify that the map has not substantially
12232 		 *	changed while the copy was being made.
12233 		 */
12234 
12235 		vm_map_lock(src_map);
12236 
12237 		if ((version.main_timestamp + 1) == src_map->timestamp) {
12238 			/* src_map hasn't changed: src_entry is still valid */
12239 			src_entry = saved_src_entry;
12240 			goto VerificationSuccessful;
12241 		}
12242 
12243 		/*
12244 		 *	Simple version comparison failed.
12245 		 *
12246 		 *	Retry the lookup and verify that the
12247 		 *	same object/offset are still present.
12248 		 *
12249 		 *	[Note: a memory manager that colludes with
12250 		 *	the calling task can detect that we have
12251 		 *	cheated.  While the map was unlocked, the
12252 		 *	mapping could have been changed and restored.]
12253 		 */
12254 
12255 		if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
12256 			if (result != KERN_MEMORY_RESTART_COPY) {
12257 				vm_object_deallocate(VME_OBJECT(new_entry));
12258 				VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
12259 				/* reset accounting state */
12260 				new_entry->iokit_acct = FALSE;
12261 				new_entry->use_pmap = TRUE;
12262 			}
12263 			RETURN(KERN_INVALID_ADDRESS);
12264 		}
12265 
12266 		src_entry = tmp_entry;
12267 		vm_map_clip_start(src_map, src_entry, src_start);
12268 
12269 		if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
12270 		    !use_maxprot) ||
12271 		    ((src_entry->max_protection & VM_PROT_READ) == 0)) {
12272 			goto VerificationFailed;
12273 		}
12274 
12275 		if (src_entry->vme_end < new_entry->vme_end) {
12276 			/*
12277 			 * This entry might have been shortened
12278 			 * (vm_map_clip_end) or been replaced with
12279 			 * an entry that ends closer to "src_start"
12280 			 * than before.
12281 			 * Adjust "new_entry" accordingly; copying
12282 			 * less memory would be correct but we also
12283 			 * redo the copy (see below) if the new entry
12284 			 * no longer points at the same object/offset.
12285 			 */
12286 			assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
12287 			    VM_MAP_COPY_PAGE_MASK(copy)));
12288 			new_entry->vme_end = src_entry->vme_end;
12289 			src_size = new_entry->vme_end - src_start;
12290 		} else if (src_entry->vme_end > new_entry->vme_end) {
12291 			/*
12292 			 * This entry might have been extended
12293 			 * (vm_map_entry_simplify() or coalesce)
12294 			 * or been replaced with an entry that ends farther
12295 			 * from "src_start" than before.
12296 			 *
12297 			 * We've called vm_object_copy_*() only on
12298 			 * the previous <start:end> range, so we can't
12299 			 * just extend new_entry.  We have to re-do
12300 			 * the copy based on the new entry as if it was
12301 			 * pointing at a different object/offset (see
12302 			 * "Verification failed" below).
12303 			 */
12304 		}
12305 
12306 		if ((VME_OBJECT(src_entry) != src_object) ||
12307 		    (VME_OFFSET(src_entry) != src_offset) ||
12308 		    (src_entry->vme_end > new_entry->vme_end)) {
12309 			/*
12310 			 *	Verification failed.
12311 			 *
12312 			 *	Start over with this top-level entry.
12313 			 */
12314 
12315 VerificationFailed:     ;
12316 
12317 			vm_object_deallocate(VME_OBJECT(new_entry));
12318 			tmp_entry = src_entry;
12319 			continue;
12320 		}
12321 
12322 		/*
12323 		 *	Verification succeeded.
12324 		 */
12325 
12326 VerificationSuccessful:;
12327 
12328 		if (result == KERN_MEMORY_RESTART_COPY) {
12329 			goto RestartCopy;
12330 		}
12331 
12332 		/*
12333 		 *	Copy succeeded.
12334 		 */
12335 
12336 CopySuccessful: ;
12337 
12338 		/*
12339 		 *	Link in the new copy entry.
12340 		 */
12341 
12342 		vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
12343 		    new_entry);
12344 
12345 		/*
12346 		 *	Determine whether the entire region
12347 		 *	has been copied.
12348 		 */
12349 		src_base = src_start;
12350 		src_start = new_entry->vme_end;
12351 		new_entry = VM_MAP_ENTRY_NULL;
12352 		while ((src_start >= src_end) && (src_end != 0)) {
12353 			submap_map_t    *ptr;
12354 
12355 			if (src_map == base_map) {
12356 				/* back to the top */
12357 				break;
12358 			}
12359 
12360 			ptr = parent_maps;
12361 			assert(ptr != NULL);
12362 			parent_maps = parent_maps->next;
12363 
12364 			/* fix up the damage we did in that submap */
12365 			vm_map_simplify_range(src_map,
12366 			    src_base,
12367 			    src_end);
12368 
12369 			vm_map_unlock(src_map);
12370 			vm_map_deallocate(src_map);
12371 			vm_map_lock(ptr->parent_map);
12372 			src_map = ptr->parent_map;
12373 			src_base = ptr->base_start;
12374 			src_start = ptr->base_start + ptr->base_len;
12375 			src_end = ptr->base_end;
12376 			if (!vm_map_lookup_entry(src_map,
12377 			    src_start,
12378 			    &tmp_entry) &&
12379 			    (src_end > src_start)) {
12380 				RETURN(KERN_INVALID_ADDRESS);
12381 			}
12382 			kfree_type(submap_map_t, ptr);
12383 			if (parent_maps == NULL) {
12384 				map_share = FALSE;
12385 			}
12386 			src_entry = tmp_entry->vme_prev;
12387 		}
12388 
12389 		if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
12390 		    (src_start >= src_addr + len) &&
12391 		    (src_addr + len != 0)) {
12392 			/*
12393 			 * Stop copying now, even though we haven't reached
12394 			 * "src_end".  We'll adjust the end of the last copy
12395 			 * entry at the end, if needed.
12396 			 *
12397 			 * If src_map's aligment is different from the
12398 			 * system's page-alignment, there could be
12399 			 * extra non-map-aligned map entries between
12400 			 * the original (non-rounded) "src_addr + len"
12401 			 * and the rounded "src_end".
12402 			 * We do not want to copy those map entries since
12403 			 * they're not part of the copied range.
12404 			 */
12405 			break;
12406 		}
12407 
12408 		if ((src_start >= src_end) && (src_end != 0)) {
12409 			break;
12410 		}
12411 
12412 		/*
12413 		 *	Verify that there are no gaps in the region
12414 		 */
12415 
12416 		tmp_entry = src_entry->vme_next;
12417 		if ((tmp_entry->vme_start != src_start) ||
12418 		    (tmp_entry == vm_map_to_entry(src_map))) {
12419 			RETURN(KERN_INVALID_ADDRESS);
12420 		}
12421 	}
12422 
12423 	/*
12424 	 * If the source should be destroyed, do it now, since the
12425 	 * copy was successful.
12426 	 */
12427 	if (src_destroy) {
12428 		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
12429 
12430 		if (src_map == kernel_map) {
12431 			remove_flags |= VM_MAP_REMOVE_KUNWIRE;
12432 		}
12433 		(void)vm_map_remove_and_unlock(src_map,
12434 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
12435 		    src_end,
12436 		    remove_flags,
12437 		    KMEM_GUARD_NONE);
12438 	} else {
12439 		/* fix up the damage we did in the base map */
12440 		vm_map_simplify_range(
12441 			src_map,
12442 			vm_map_trunc_page(src_addr,
12443 			VM_MAP_PAGE_MASK(src_map)),
12444 			vm_map_round_page(src_end,
12445 			VM_MAP_PAGE_MASK(src_map)));
12446 		vm_map_unlock(src_map);
12447 	}
12448 
12449 	tmp_entry = VM_MAP_ENTRY_NULL;
12450 
12451 	if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
12452 	    VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12453 		vm_map_offset_t original_start, original_offset, original_end;
12454 
12455 		assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12456 
12457 		/* adjust alignment of first copy_entry's "vme_start" */
12458 		tmp_entry = vm_map_copy_first_entry(copy);
12459 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12460 			vm_map_offset_t adjustment;
12461 
12462 			original_start = tmp_entry->vme_start;
12463 			original_offset = VME_OFFSET(tmp_entry);
12464 
12465 			/* map-align the start of the first copy entry... */
12466 			adjustment = (tmp_entry->vme_start -
12467 			    vm_map_trunc_page(
12468 				    tmp_entry->vme_start,
12469 				    VM_MAP_PAGE_MASK(src_map)));
12470 			tmp_entry->vme_start -= adjustment;
12471 			VME_OFFSET_SET(tmp_entry,
12472 			    VME_OFFSET(tmp_entry) - adjustment);
12473 			copy_addr -= adjustment;
12474 			assert(tmp_entry->vme_start < tmp_entry->vme_end);
12475 			/* ... adjust for mis-aligned start of copy range */
12476 			adjustment =
12477 			    (vm_map_trunc_page(copy->offset,
12478 			    PAGE_MASK) -
12479 			    vm_map_trunc_page(copy->offset,
12480 			    VM_MAP_PAGE_MASK(src_map)));
12481 			if (adjustment) {
12482 				assert(page_aligned(adjustment));
12483 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12484 				tmp_entry->vme_start += adjustment;
12485 				VME_OFFSET_SET(tmp_entry,
12486 				    (VME_OFFSET(tmp_entry) +
12487 				    adjustment));
12488 				copy_addr += adjustment;
12489 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12490 			}
12491 
12492 			/*
12493 			 * Assert that the adjustments haven't exposed
12494 			 * more than was originally copied...
12495 			 */
12496 			assert(tmp_entry->vme_start >= original_start);
12497 			assert(VME_OFFSET(tmp_entry) >= original_offset);
12498 			/*
12499 			 * ... and that it did not adjust outside of a
12500 			 * a single 16K page.
12501 			 */
12502 			assert(vm_map_trunc_page(tmp_entry->vme_start,
12503 			    VM_MAP_PAGE_MASK(src_map)) ==
12504 			    vm_map_trunc_page(original_start,
12505 			    VM_MAP_PAGE_MASK(src_map)));
12506 		}
12507 
12508 		/* adjust alignment of last copy_entry's "vme_end" */
12509 		tmp_entry = vm_map_copy_last_entry(copy);
12510 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12511 			vm_map_offset_t adjustment;
12512 
12513 			original_end = tmp_entry->vme_end;
12514 
12515 			/* map-align the end of the last copy entry... */
12516 			tmp_entry->vme_end =
12517 			    vm_map_round_page(tmp_entry->vme_end,
12518 			    VM_MAP_PAGE_MASK(src_map));
12519 			/* ... adjust for mis-aligned end of copy range */
12520 			adjustment =
12521 			    (vm_map_round_page((copy->offset +
12522 			    copy->size),
12523 			    VM_MAP_PAGE_MASK(src_map)) -
12524 			    vm_map_round_page((copy->offset +
12525 			    copy->size),
12526 			    PAGE_MASK));
12527 			if (adjustment) {
12528 				assert(page_aligned(adjustment));
12529 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12530 				tmp_entry->vme_end -= adjustment;
12531 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12532 			}
12533 
12534 			/*
12535 			 * Assert that the adjustments haven't exposed
12536 			 * more than was originally copied...
12537 			 */
12538 			assert(tmp_entry->vme_end <= original_end);
12539 			/*
12540 			 * ... and that it did not adjust outside of a
12541 			 * a single 16K page.
12542 			 */
12543 			assert(vm_map_round_page(tmp_entry->vme_end,
12544 			    VM_MAP_PAGE_MASK(src_map)) ==
12545 			    vm_map_round_page(original_end,
12546 			    VM_MAP_PAGE_MASK(src_map)));
12547 		}
12548 	}
12549 
12550 	/* Fix-up start and end points in copy.  This is necessary */
12551 	/* when the various entries in the copy object were picked */
12552 	/* up from different sub-maps */
12553 
12554 	tmp_entry = vm_map_copy_first_entry(copy);
12555 	copy_size = 0; /* compute actual size */
12556 	while (tmp_entry != vm_map_copy_to_entry(copy)) {
12557 		assert(VM_MAP_PAGE_ALIGNED(
12558 			    copy_addr + (tmp_entry->vme_end -
12559 			    tmp_entry->vme_start),
12560 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12561 		assert(VM_MAP_PAGE_ALIGNED(
12562 			    copy_addr,
12563 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12564 
12565 		/*
12566 		 * The copy_entries will be injected directly into the
12567 		 * destination map and might not be "map aligned" there...
12568 		 */
12569 		tmp_entry->map_aligned = FALSE;
12570 
12571 		tmp_entry->vme_end = copy_addr +
12572 		    (tmp_entry->vme_end - tmp_entry->vme_start);
12573 		tmp_entry->vme_start = copy_addr;
12574 		assert(tmp_entry->vme_start < tmp_entry->vme_end);
12575 		copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12576 		copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12577 		tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12578 	}
12579 
12580 	if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12581 	    copy_size < copy->size) {
12582 		/*
12583 		 * The actual size of the VM map copy is smaller than what
12584 		 * was requested by the caller.  This must be because some
12585 		 * PAGE_SIZE-sized pages are missing at the end of the last
12586 		 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12587 		 * The caller might not have been aware of those missing
12588 		 * pages and might not want to be aware of it, which is
12589 		 * fine as long as they don't try to access (and crash on)
12590 		 * those missing pages.
12591 		 * Let's adjust the size of the "copy", to avoid failing
12592 		 * in vm_map_copyout() or vm_map_copy_overwrite().
12593 		 */
12594 		assert(vm_map_round_page(copy_size,
12595 		    VM_MAP_PAGE_MASK(src_map)) ==
12596 		    vm_map_round_page(copy->size,
12597 		    VM_MAP_PAGE_MASK(src_map)));
12598 		copy->size = copy_size;
12599 	}
12600 
12601 	*copy_result = copy;
12602 	return KERN_SUCCESS;
12603 
12604 #undef  RETURN
12605 }
12606 
12607 kern_return_t
vm_map_copy_extract(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t do_copy,vm_map_copy_t * copy_result,vm_prot_t * cur_prot,vm_prot_t * max_prot,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)12608 vm_map_copy_extract(
12609 	vm_map_t                src_map,
12610 	vm_map_address_t        src_addr,
12611 	vm_map_size_t           len,
12612 	boolean_t               do_copy,
12613 	vm_map_copy_t           *copy_result,   /* OUT */
12614 	vm_prot_t               *cur_prot,      /* IN/OUT */
12615 	vm_prot_t               *max_prot,      /* IN/OUT */
12616 	vm_inherit_t            inheritance,
12617 	vm_map_kernel_flags_t   vmk_flags)
12618 {
12619 	vm_map_copy_t   copy;
12620 	kern_return_t   kr;
12621 	vm_prot_t required_cur_prot, required_max_prot;
12622 
12623 	/*
12624 	 *	Check for copies of zero bytes.
12625 	 */
12626 
12627 	if (len == 0) {
12628 		*copy_result = VM_MAP_COPY_NULL;
12629 		return KERN_SUCCESS;
12630 	}
12631 
12632 	/*
12633 	 *	Check that the end address doesn't overflow
12634 	 */
12635 	if (src_addr + len < src_addr) {
12636 		return KERN_INVALID_ADDRESS;
12637 	}
12638 	if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
12639 		return KERN_INVALID_ADDRESS;
12640 	}
12641 
12642 	if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12643 		DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12644 	}
12645 
12646 	required_cur_prot = *cur_prot;
12647 	required_max_prot = *max_prot;
12648 
12649 	/*
12650 	 *	Allocate a header element for the list.
12651 	 *
12652 	 *	Use the start and end in the header to
12653 	 *	remember the endpoints prior to rounding.
12654 	 */
12655 
12656 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12657 	copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
12658 	copy->offset = 0;
12659 	copy->size = len;
12660 
12661 	kr = vm_map_remap_extract(src_map,
12662 	    src_addr,
12663 	    len,
12664 	    do_copy,             /* copy */
12665 	    copy,
12666 	    cur_prot,            /* IN/OUT */
12667 	    max_prot,            /* IN/OUT */
12668 	    inheritance,
12669 	    vmk_flags);
12670 	if (kr != KERN_SUCCESS) {
12671 		vm_map_copy_discard(copy);
12672 		return kr;
12673 	}
12674 	if (required_cur_prot != VM_PROT_NONE) {
12675 		assert((*cur_prot & required_cur_prot) == required_cur_prot);
12676 		assert((*max_prot & required_max_prot) == required_max_prot);
12677 	}
12678 
12679 	*copy_result = copy;
12680 	return KERN_SUCCESS;
12681 }
12682 
12683 static void
vm_map_fork_share(vm_map_t old_map,vm_map_entry_t old_entry,vm_map_t new_map)12684 vm_map_fork_share(
12685 	vm_map_t        old_map,
12686 	vm_map_entry_t  old_entry,
12687 	vm_map_t        new_map)
12688 {
12689 	vm_object_t     object;
12690 	vm_map_entry_t  new_entry;
12691 
12692 	/*
12693 	 *	New sharing code.  New map entry
12694 	 *	references original object.  Internal
12695 	 *	objects use asynchronous copy algorithm for
12696 	 *	future copies.  First make sure we have
12697 	 *	the right object.  If we need a shadow,
12698 	 *	or someone else already has one, then
12699 	 *	make a new shadow and share it.
12700 	 */
12701 
12702 	if (!old_entry->is_sub_map) {
12703 		object = VME_OBJECT(old_entry);
12704 	}
12705 
12706 	if (old_entry->is_sub_map) {
12707 		assert(old_entry->wired_count == 0);
12708 #ifndef NO_NESTED_PMAP
12709 #if !PMAP_FORK_NEST
12710 		if (old_entry->use_pmap) {
12711 			kern_return_t   result;
12712 
12713 			result = pmap_nest(new_map->pmap,
12714 			    (VME_SUBMAP(old_entry))->pmap,
12715 			    (addr64_t)old_entry->vme_start,
12716 			    (uint64_t)(old_entry->vme_end - old_entry->vme_start));
12717 			if (result) {
12718 				panic("vm_map_fork_share: pmap_nest failed!");
12719 			}
12720 		}
12721 #endif /* !PMAP_FORK_NEST */
12722 #endif  /* NO_NESTED_PMAP */
12723 	} else if (object == VM_OBJECT_NULL) {
12724 		object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
12725 		    old_entry->vme_start));
12726 		VME_OFFSET_SET(old_entry, 0);
12727 		VME_OBJECT_SET(old_entry, object, false, 0);
12728 		old_entry->use_pmap = TRUE;
12729 //		assert(!old_entry->needs_copy);
12730 	} else if (object->copy_strategy !=
12731 	    MEMORY_OBJECT_COPY_SYMMETRIC) {
12732 		/*
12733 		 *	We are already using an asymmetric
12734 		 *	copy, and therefore we already have
12735 		 *	the right object.
12736 		 */
12737 
12738 		assert(!old_entry->needs_copy);
12739 	} else if (old_entry->needs_copy ||       /* case 1 */
12740 	    object->shadowed ||                 /* case 2 */
12741 	    (!object->true_share &&             /* case 3 */
12742 	    !old_entry->is_shared &&
12743 	    (object->vo_size >
12744 	    (vm_map_size_t)(old_entry->vme_end -
12745 	    old_entry->vme_start)))) {
12746 		/*
12747 		 *	We need to create a shadow.
12748 		 *	There are three cases here.
12749 		 *	In the first case, we need to
12750 		 *	complete a deferred symmetrical
12751 		 *	copy that we participated in.
12752 		 *	In the second and third cases,
12753 		 *	we need to create the shadow so
12754 		 *	that changes that we make to the
12755 		 *	object do not interfere with
12756 		 *	any symmetrical copies which
12757 		 *	have occured (case 2) or which
12758 		 *	might occur (case 3).
12759 		 *
12760 		 *	The first case is when we had
12761 		 *	deferred shadow object creation
12762 		 *	via the entry->needs_copy mechanism.
12763 		 *	This mechanism only works when
12764 		 *	only one entry points to the source
12765 		 *	object, and we are about to create
12766 		 *	a second entry pointing to the
12767 		 *	same object. The problem is that
12768 		 *	there is no way of mapping from
12769 		 *	an object to the entries pointing
12770 		 *	to it. (Deferred shadow creation
12771 		 *	works with one entry because occurs
12772 		 *	at fault time, and we walk from the
12773 		 *	entry to the object when handling
12774 		 *	the fault.)
12775 		 *
12776 		 *	The second case is when the object
12777 		 *	to be shared has already been copied
12778 		 *	with a symmetric copy, but we point
12779 		 *	directly to the object without
12780 		 *	needs_copy set in our entry. (This
12781 		 *	can happen because different ranges
12782 		 *	of an object can be pointed to by
12783 		 *	different entries. In particular,
12784 		 *	a single entry pointing to an object
12785 		 *	can be split by a call to vm_inherit,
12786 		 *	which, combined with task_create, can
12787 		 *	result in the different entries
12788 		 *	having different needs_copy values.)
12789 		 *	The shadowed flag in the object allows
12790 		 *	us to detect this case. The problem
12791 		 *	with this case is that if this object
12792 		 *	has or will have shadows, then we
12793 		 *	must not perform an asymmetric copy
12794 		 *	of this object, since such a copy
12795 		 *	allows the object to be changed, which
12796 		 *	will break the previous symmetrical
12797 		 *	copies (which rely upon the object
12798 		 *	not changing). In a sense, the shadowed
12799 		 *	flag says "don't change this object".
12800 		 *	We fix this by creating a shadow
12801 		 *	object for this object, and sharing
12802 		 *	that. This works because we are free
12803 		 *	to change the shadow object (and thus
12804 		 *	to use an asymmetric copy strategy);
12805 		 *	this is also semantically correct,
12806 		 *	since this object is temporary, and
12807 		 *	therefore a copy of the object is
12808 		 *	as good as the object itself. (This
12809 		 *	is not true for permanent objects,
12810 		 *	since the pager needs to see changes,
12811 		 *	which won't happen if the changes
12812 		 *	are made to a copy.)
12813 		 *
12814 		 *	The third case is when the object
12815 		 *	to be shared has parts sticking
12816 		 *	outside of the entry we're working
12817 		 *	with, and thus may in the future
12818 		 *	be subject to a symmetrical copy.
12819 		 *	(This is a preemptive version of
12820 		 *	case 2.)
12821 		 */
12822 		VME_OBJECT_SHADOW(old_entry,
12823 		    (vm_map_size_t) (old_entry->vme_end -
12824 		    old_entry->vme_start),
12825 		    vm_map_always_shadow(old_map));
12826 
12827 		/*
12828 		 *	If we're making a shadow for other than
12829 		 *	copy on write reasons, then we have
12830 		 *	to remove write permission.
12831 		 */
12832 
12833 		if (!old_entry->needs_copy &&
12834 		    (old_entry->protection & VM_PROT_WRITE)) {
12835 			vm_prot_t prot;
12836 
12837 			assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));
12838 
12839 			prot = old_entry->protection & ~VM_PROT_WRITE;
12840 
12841 			assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));
12842 
12843 			if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
12844 				prot |= VM_PROT_EXECUTE;
12845 			}
12846 
12847 
12848 			if (old_map->mapped_in_other_pmaps) {
12849 				vm_object_pmap_protect(
12850 					VME_OBJECT(old_entry),
12851 					VME_OFFSET(old_entry),
12852 					(old_entry->vme_end -
12853 					old_entry->vme_start),
12854 					PMAP_NULL,
12855 					PAGE_SIZE,
12856 					old_entry->vme_start,
12857 					prot);
12858 			} else {
12859 				pmap_protect(old_map->pmap,
12860 				    old_entry->vme_start,
12861 				    old_entry->vme_end,
12862 				    prot);
12863 			}
12864 		}
12865 
12866 		old_entry->needs_copy = FALSE;
12867 		object = VME_OBJECT(old_entry);
12868 	}
12869 
12870 
12871 	/*
12872 	 *	If object was using a symmetric copy strategy,
12873 	 *	change its copy strategy to the default
12874 	 *	asymmetric copy strategy, which is copy_delay
12875 	 *	in the non-norma case and copy_call in the
12876 	 *	norma case. Bump the reference count for the
12877 	 *	new entry.
12878 	 */
12879 
12880 	if (old_entry->is_sub_map) {
12881 		vm_map_reference(VME_SUBMAP(old_entry));
12882 	} else {
12883 		vm_object_lock(object);
12884 		vm_object_reference_locked(object);
12885 		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
12886 			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
12887 		}
12888 		vm_object_unlock(object);
12889 	}
12890 
12891 	/*
12892 	 *	Clone the entry, using object ref from above.
12893 	 *	Mark both entries as shared.
12894 	 */
12895 
12896 	new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
12897 	vm_map_entry_copy(old_map, new_entry, old_entry);
12898 	old_entry->is_shared = TRUE;
12899 	new_entry->is_shared = TRUE;
12900 
12901 	/*
12902 	 * We're dealing with a shared mapping, so the resulting mapping
12903 	 * should inherit some of the original mapping's accounting settings.
12904 	 * "iokit_acct" should have been cleared in vm_map_entry_copy().
12905 	 * "use_pmap" should stay the same as before (if it hasn't been reset
12906 	 * to TRUE when we cleared "iokit_acct").
12907 	 */
12908 	assert(!new_entry->iokit_acct);
12909 
12910 	/*
12911 	 *	If old entry's inheritence is VM_INHERIT_NONE,
12912 	 *	the new entry is for corpse fork, remove the
12913 	 *	write permission from the new entry.
12914 	 */
12915 	if (old_entry->inheritance == VM_INHERIT_NONE) {
12916 		new_entry->protection &= ~VM_PROT_WRITE;
12917 		new_entry->max_protection &= ~VM_PROT_WRITE;
12918 	}
12919 
12920 	/*
12921 	 *	Insert the entry into the new map -- we
12922 	 *	know we're inserting at the end of the new
12923 	 *	map.
12924 	 */
12925 
12926 	vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
12927 	    VM_MAP_KERNEL_FLAGS_NONE);
12928 
12929 	/*
12930 	 *	Update the physical map
12931 	 */
12932 
12933 	if (old_entry->is_sub_map) {
12934 		/* Bill Angell pmap support goes here */
12935 	} else {
12936 		pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
12937 		    old_entry->vme_end - old_entry->vme_start,
12938 		    old_entry->vme_start);
12939 	}
12940 }
12941 
12942 static boolean_t
vm_map_fork_copy(vm_map_t old_map,vm_map_entry_t * old_entry_p,vm_map_t new_map,int vm_map_copyin_flags)12943 vm_map_fork_copy(
12944 	vm_map_t        old_map,
12945 	vm_map_entry_t  *old_entry_p,
12946 	vm_map_t        new_map,
12947 	int             vm_map_copyin_flags)
12948 {
12949 	vm_map_entry_t old_entry = *old_entry_p;
12950 	vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
12951 	vm_map_offset_t start = old_entry->vme_start;
12952 	vm_map_copy_t copy;
12953 	vm_map_entry_t last = vm_map_last_entry(new_map);
12954 
12955 	vm_map_unlock(old_map);
12956 	/*
12957 	 *	Use maxprot version of copyin because we
12958 	 *	care about whether this memory can ever
12959 	 *	be accessed, not just whether it's accessible
12960 	 *	right now.
12961 	 */
12962 	vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
12963 	if (vm_map_copyin_internal(old_map, start, entry_size,
12964 	    vm_map_copyin_flags, &copy)
12965 	    != KERN_SUCCESS) {
12966 		/*
12967 		 *	The map might have changed while it
12968 		 *	was unlocked, check it again.  Skip
12969 		 *	any blank space or permanently
12970 		 *	unreadable region.
12971 		 */
12972 		vm_map_lock(old_map);
12973 		if (!vm_map_lookup_entry(old_map, start, &last) ||
12974 		    (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
12975 			last = last->vme_next;
12976 		}
12977 		*old_entry_p = last;
12978 
12979 		/*
12980 		 * XXX	For some error returns, want to
12981 		 * XXX	skip to the next element.  Note
12982 		 *	that INVALID_ADDRESS and
12983 		 *	PROTECTION_FAILURE are handled above.
12984 		 */
12985 
12986 		return FALSE;
12987 	}
12988 
12989 	/*
12990 	 * Assert that the vm_map_copy is coming from the right
12991 	 * zone and hasn't been forged
12992 	 */
12993 	vm_map_copy_require(copy);
12994 
12995 	/*
12996 	 *	Insert the copy into the new map
12997 	 */
12998 	vm_map_copy_insert(new_map, last, copy);
12999 
13000 	/*
13001 	 *	Pick up the traversal at the end of
13002 	 *	the copied region.
13003 	 */
13004 
13005 	vm_map_lock(old_map);
13006 	start += entry_size;
13007 	if (!vm_map_lookup_entry(old_map, start, &last)) {
13008 		last = last->vme_next;
13009 	} else {
13010 		if (last->vme_start == start) {
13011 			/*
13012 			 * No need to clip here and we don't
13013 			 * want to cause any unnecessary
13014 			 * unnesting...
13015 			 */
13016 		} else {
13017 			vm_map_clip_start(old_map, last, start);
13018 		}
13019 	}
13020 	*old_entry_p = last;
13021 
13022 	return TRUE;
13023 }
13024 
13025 #if PMAP_FORK_NEST
13026 #define PMAP_FORK_NEST_DEBUG 0
13027 static inline void
vm_map_fork_unnest(pmap_t new_pmap,vm_map_offset_t pre_nested_start,vm_map_offset_t pre_nested_end,vm_map_offset_t start,vm_map_offset_t end)13028 vm_map_fork_unnest(
13029 	pmap_t new_pmap,
13030 	vm_map_offset_t pre_nested_start,
13031 	vm_map_offset_t pre_nested_end,
13032 	vm_map_offset_t start,
13033 	vm_map_offset_t end)
13034 {
13035 	kern_return_t kr;
13036 	vm_map_offset_t nesting_mask, start_unnest, end_unnest;
13037 
13038 	assertf(pre_nested_start <= pre_nested_end,
13039 	    "pre_nested start 0x%llx end 0x%llx",
13040 	    (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13041 	assertf(start <= end,
13042 	    "start 0x%llx end 0x%llx",
13043 	    (uint64_t) start, (uint64_t)end);
13044 
13045 	if (pre_nested_start == pre_nested_end) {
13046 		/* nothing was pre-nested: done */
13047 		return;
13048 	}
13049 	if (end <= pre_nested_start) {
13050 		/* fully before pre-nested range: done */
13051 		return;
13052 	}
13053 	if (start >= pre_nested_end) {
13054 		/* fully after pre-nested range: done */
13055 		return;
13056 	}
13057 	/* ignore parts of range outside of pre_nested range */
13058 	if (start < pre_nested_start) {
13059 		start = pre_nested_start;
13060 	}
13061 	if (end > pre_nested_end) {
13062 		end = pre_nested_end;
13063 	}
13064 	nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
13065 	start_unnest = start & ~nesting_mask;
13066 	end_unnest = (end + nesting_mask) & ~nesting_mask;
13067 	kr = pmap_unnest(new_pmap,
13068 	    (addr64_t)start_unnest,
13069 	    (uint64_t)(end_unnest - start_unnest));
13070 #if PMAP_FORK_NEST_DEBUG
13071 	printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
13072 #endif /* PMAP_FORK_NEST_DEBUG */
13073 	assertf(kr == KERN_SUCCESS,
13074 	    "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
13075 	    (uint64_t)start, (uint64_t)end, new_pmap,
13076 	    (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
13077 	    kr);
13078 }
13079 #endif /* PMAP_FORK_NEST */
13080 
13081 void
vm_map_inherit_limits(vm_map_t new_map,const struct _vm_map * old_map)13082 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13083 {
13084 	new_map->size_limit = old_map->size_limit;
13085 	new_map->data_limit = old_map->data_limit;
13086 	new_map->user_wire_limit = old_map->user_wire_limit;
13087 	new_map->reserved_regions = old_map->reserved_regions;
13088 }
13089 
13090 /*
13091  *	vm_map_fork:
13092  *
13093  *	Create and return a new map based on the old
13094  *	map, according to the inheritance values on the
13095  *	regions in that map and the options.
13096  *
13097  *	The source map must not be locked.
13098  */
13099 vm_map_t
vm_map_fork(ledger_t ledger,vm_map_t old_map,int options)13100 vm_map_fork(
13101 	ledger_t        ledger,
13102 	vm_map_t        old_map,
13103 	int             options)
13104 {
13105 	pmap_t          new_pmap;
13106 	vm_map_t        new_map;
13107 	vm_map_entry_t  old_entry;
13108 	vm_map_size_t   new_size = 0, entry_size;
13109 	vm_map_entry_t  new_entry;
13110 	boolean_t       src_needs_copy;
13111 	boolean_t       new_entry_needs_copy;
13112 	boolean_t       pmap_is64bit;
13113 	int             vm_map_copyin_flags;
13114 	vm_inherit_t    old_entry_inheritance;
13115 	int             map_create_options;
13116 	kern_return_t   footprint_collect_kr;
13117 
13118 	if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
13119 	    VM_MAP_FORK_PRESERVE_PURGEABLE |
13120 	    VM_MAP_FORK_CORPSE_FOOTPRINT)) {
13121 		/* unsupported option */
13122 		return VM_MAP_NULL;
13123 	}
13124 
13125 	pmap_is64bit =
13126 #if defined(__i386__) || defined(__x86_64__)
13127 	    old_map->pmap->pm_task_map != TASK_MAP_32BIT;
13128 #elif defined(__arm64__)
13129 	    old_map->pmap->is_64bit;
13130 #else
13131 #error Unknown architecture.
13132 #endif
13133 
13134 	unsigned int pmap_flags = 0;
13135 	pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
13136 #if defined(HAS_APPLE_PAC)
13137 	pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
13138 #endif
13139 #if CONFIG_ROSETTA
13140 	pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
13141 #endif
13142 #if PMAP_CREATE_FORCE_4K_PAGES
13143 	if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
13144 	    PAGE_SIZE != FOURK_PAGE_SIZE) {
13145 		pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
13146 	}
13147 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
13148 	new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
13149 	if (new_pmap == NULL) {
13150 		return VM_MAP_NULL;
13151 	}
13152 
13153 	vm_map_reference(old_map);
13154 	vm_map_lock(old_map);
13155 
13156 	map_create_options = 0;
13157 	if (old_map->hdr.entries_pageable) {
13158 		map_create_options |= VM_MAP_CREATE_PAGEABLE;
13159 	}
13160 	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13161 		map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
13162 		footprint_collect_kr = KERN_SUCCESS;
13163 	}
13164 	new_map = vm_map_create_options(new_pmap,
13165 	    old_map->min_offset,
13166 	    old_map->max_offset,
13167 	    map_create_options);
13168 
13169 	/* inherit cs_enforcement */
13170 	vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
13171 
13172 	vm_map_lock(new_map);
13173 	vm_commit_pagezero_status(new_map);
13174 	/* inherit the parent map's page size */
13175 	vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));
13176 
13177 	/* inherit the parent rlimits */
13178 	vm_map_inherit_limits(new_map, old_map);
13179 
13180 #if CONFIG_MAP_RANGES
13181 	/* inherit the parent map's VM ranges */
13182 	vm_map_range_fork(new_map, old_map);
13183 #endif
13184 
13185 #if CODE_SIGNING_MONITOR
13186 	/* Prepare the monitor for the fork */
13187 	csm_fork_prepare(old_map->pmap, new_pmap);
13188 #endif
13189 
13190 #if PMAP_FORK_NEST
13191 	/*
13192 	 * Pre-nest the shared region's pmap.
13193 	 */
13194 	vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
13195 	pmap_fork_nest(old_map->pmap, new_pmap,
13196 	    &pre_nested_start, &pre_nested_end);
13197 #if PMAP_FORK_NEST_DEBUG
13198 	printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13199 #endif /* PMAP_FORK_NEST_DEBUG */
13200 #endif /* PMAP_FORK_NEST */
13201 
13202 	for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
13203 		/*
13204 		 * Abort any corpse collection if the system is shutting down.
13205 		 */
13206 		if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13207 		    get_system_inshutdown()) {
13208 #if PMAP_FORK_NEST
13209 			new_entry = vm_map_last_entry(new_map);
13210 			if (new_entry == vm_map_to_entry(new_map)) {
13211 				/* unnest all that was pre-nested */
13212 				vm_map_fork_unnest(new_pmap,
13213 				    pre_nested_start, pre_nested_end,
13214 				    vm_map_min(new_map), vm_map_max(new_map));
13215 			} else if (new_entry->vme_end < vm_map_max(new_map)) {
13216 				/* unnest hole at the end, if pre-nested */
13217 				vm_map_fork_unnest(new_pmap,
13218 				    pre_nested_start, pre_nested_end,
13219 				    new_entry->vme_end, vm_map_max(new_map));
13220 			}
13221 #endif /* PMAP_FORK_NEST */
13222 			vm_map_corpse_footprint_collect_done(new_map);
13223 			vm_map_unlock(new_map);
13224 			vm_map_unlock(old_map);
13225 			vm_map_deallocate(new_map);
13226 			vm_map_deallocate(old_map);
13227 			printf("Aborting corpse map due to system shutdown\n");
13228 			return VM_MAP_NULL;
13229 		}
13230 
13231 		entry_size = old_entry->vme_end - old_entry->vme_start;
13232 
13233 #if PMAP_FORK_NEST
13234 		/*
13235 		 * Undo any unnecessary pre-nesting.
13236 		 */
13237 		vm_map_offset_t prev_end;
13238 		if (old_entry == vm_map_first_entry(old_map)) {
13239 			prev_end = vm_map_min(old_map);
13240 		} else {
13241 			prev_end = old_entry->vme_prev->vme_end;
13242 		}
13243 		if (prev_end < old_entry->vme_start) {
13244 			/* unnest hole before this entry, if pre-nested */
13245 			vm_map_fork_unnest(new_pmap,
13246 			    pre_nested_start, pre_nested_end,
13247 			    prev_end, old_entry->vme_start);
13248 		}
13249 		if (old_entry->is_sub_map && old_entry->use_pmap) {
13250 			/* keep this entry nested in the child */
13251 #if PMAP_FORK_NEST_DEBUG
13252 			printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
13253 #endif /* PMAP_FORK_NEST_DEBUG */
13254 		} else {
13255 			/* undo nesting for this entry, if pre-nested */
13256 			vm_map_fork_unnest(new_pmap,
13257 			    pre_nested_start, pre_nested_end,
13258 			    old_entry->vme_start, old_entry->vme_end);
13259 		}
13260 #endif /* PMAP_FORK_NEST */
13261 
13262 		old_entry_inheritance = old_entry->inheritance;
13263 		/*
13264 		 * If caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option
13265 		 * share VM_INHERIT_NONE entries that are not backed by a
13266 		 * device pager.
13267 		 */
13268 		if (old_entry_inheritance == VM_INHERIT_NONE &&
13269 		    (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
13270 		    (old_entry->protection & VM_PROT_READ) &&
13271 		    !(!old_entry->is_sub_map &&
13272 		    VME_OBJECT(old_entry) != NULL &&
13273 		    VME_OBJECT(old_entry)->pager != NULL &&
13274 		    is_device_pager_ops(
13275 			    VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
13276 			old_entry_inheritance = VM_INHERIT_SHARE;
13277 		}
13278 
13279 		if (old_entry_inheritance != VM_INHERIT_NONE &&
13280 		    (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13281 		    footprint_collect_kr == KERN_SUCCESS) {
13282 			/*
13283 			 * The corpse won't have old_map->pmap to query
13284 			 * footprint information, so collect that data now
13285 			 * and store it in new_map->vmmap_corpse_footprint
13286 			 * for later autopsy.
13287 			 */
13288 			footprint_collect_kr =
13289 			    vm_map_corpse_footprint_collect(old_map,
13290 			    old_entry,
13291 			    new_map);
13292 		}
13293 
13294 		switch (old_entry_inheritance) {
13295 		case VM_INHERIT_NONE:
13296 			break;
13297 
13298 		case VM_INHERIT_SHARE:
13299 			vm_map_fork_share(old_map, old_entry, new_map);
13300 			new_size += entry_size;
13301 			break;
13302 
13303 		case VM_INHERIT_COPY:
13304 
13305 			/*
13306 			 *	Inline the copy_quickly case;
13307 			 *	upon failure, fall back on call
13308 			 *	to vm_map_fork_copy.
13309 			 */
13310 
13311 			if (old_entry->is_sub_map) {
13312 				break;
13313 			}
13314 			if ((old_entry->wired_count != 0) ||
13315 			    ((VME_OBJECT(old_entry) != NULL) &&
13316 			    (VME_OBJECT(old_entry)->true_share))) {
13317 				goto slow_vm_map_fork_copy;
13318 			}
13319 
13320 			new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
13321 			vm_map_entry_copy(old_map, new_entry, old_entry);
13322 			if (old_entry->vme_permanent) {
13323 				/* inherit "permanent" on fork() */
13324 				new_entry->vme_permanent = TRUE;
13325 			}
13326 
13327 			if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
13328 				new_map->jit_entry_exists = TRUE;
13329 			}
13330 
13331 			if (new_entry->is_sub_map) {
13332 				/* clear address space specifics */
13333 				new_entry->use_pmap = FALSE;
13334 			} else {
13335 				/*
13336 				 * We're dealing with a copy-on-write operation,
13337 				 * so the resulting mapping should not inherit
13338 				 * the original mapping's accounting settings.
13339 				 * "iokit_acct" should have been cleared in
13340 				 * vm_map_entry_copy().
13341 				 * "use_pmap" should be reset to its default
13342 				 * (TRUE) so that the new mapping gets
13343 				 * accounted for in the task's memory footprint.
13344 				 */
13345 				assert(!new_entry->iokit_acct);
13346 				new_entry->use_pmap = TRUE;
13347 			}
13348 
13349 			if (!vm_object_copy_quickly(
13350 				    VME_OBJECT(new_entry),
13351 				    VME_OFFSET(old_entry),
13352 				    (old_entry->vme_end -
13353 				    old_entry->vme_start),
13354 				    &src_needs_copy,
13355 				    &new_entry_needs_copy)) {
13356 				vm_map_entry_dispose(new_entry);
13357 				goto slow_vm_map_fork_copy;
13358 			}
13359 
13360 			/*
13361 			 *	Handle copy-on-write obligations
13362 			 */
13363 
13364 			if (src_needs_copy && !old_entry->needs_copy) {
13365 				vm_prot_t prot;
13366 
13367 				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));
13368 
13369 				prot = old_entry->protection & ~VM_PROT_WRITE;
13370 
13371 				if (override_nx(old_map, VME_ALIAS(old_entry))
13372 				    && prot) {
13373 					prot |= VM_PROT_EXECUTE;
13374 				}
13375 
13376 				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));
13377 
13378 				vm_object_pmap_protect(
13379 					VME_OBJECT(old_entry),
13380 					VME_OFFSET(old_entry),
13381 					(old_entry->vme_end -
13382 					old_entry->vme_start),
13383 					((old_entry->is_shared
13384 					|| old_map->mapped_in_other_pmaps)
13385 					? PMAP_NULL :
13386 					old_map->pmap),
13387 					VM_MAP_PAGE_SIZE(old_map),
13388 					old_entry->vme_start,
13389 					prot);
13390 
13391 				assert(old_entry->wired_count == 0);
13392 				old_entry->needs_copy = TRUE;
13393 			}
13394 			new_entry->needs_copy = new_entry_needs_copy;
13395 
13396 			/*
13397 			 *	Insert the entry at the end
13398 			 *	of the map.
13399 			 */
13400 
13401 			vm_map_store_entry_link(new_map,
13402 			    vm_map_last_entry(new_map),
13403 			    new_entry,
13404 			    VM_MAP_KERNEL_FLAGS_NONE);
13405 			new_size += entry_size;
13406 			break;
13407 
13408 slow_vm_map_fork_copy:
13409 			vm_map_copyin_flags = 0;
13410 			if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
13411 				vm_map_copyin_flags |=
13412 				    VM_MAP_COPYIN_PRESERVE_PURGEABLE;
13413 			}
13414 			if (vm_map_fork_copy(old_map,
13415 			    &old_entry,
13416 			    new_map,
13417 			    vm_map_copyin_flags)) {
13418 				new_size += entry_size;
13419 			}
13420 			continue;
13421 		}
13422 		old_entry = old_entry->vme_next;
13423 	}
13424 
13425 #if PMAP_FORK_NEST
13426 	new_entry = vm_map_last_entry(new_map);
13427 	if (new_entry == vm_map_to_entry(new_map)) {
13428 		/* unnest all that was pre-nested */
13429 		vm_map_fork_unnest(new_pmap,
13430 		    pre_nested_start, pre_nested_end,
13431 		    vm_map_min(new_map), vm_map_max(new_map));
13432 	} else if (new_entry->vme_end < vm_map_max(new_map)) {
13433 		/* unnest hole at the end, if pre-nested */
13434 		vm_map_fork_unnest(new_pmap,
13435 		    pre_nested_start, pre_nested_end,
13436 		    new_entry->vme_end, vm_map_max(new_map));
13437 	}
13438 #endif /* PMAP_FORK_NEST */
13439 
13440 #if defined(__arm64__)
13441 	pmap_insert_commpage(new_map->pmap);
13442 #endif /* __arm64__ */
13443 
13444 	new_map->size = new_size;
13445 
13446 	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13447 		vm_map_corpse_footprint_collect_done(new_map);
13448 	}
13449 
13450 	/* Propagate JIT entitlement for the pmap layer. */
13451 	if (pmap_get_jit_entitled(old_map->pmap)) {
13452 		/* Tell the pmap that it supports JIT. */
13453 		pmap_set_jit_entitled(new_map->pmap);
13454 	}
13455 
13456 	/* Propagate TPRO settings for the pmap layer */
13457 	if (pmap_get_tpro(old_map->pmap)) {
13458 		/* Tell the pmap that it supports TPRO */
13459 		pmap_set_tpro(new_map->pmap);
13460 	}
13461 
13462 	vm_map_unlock(new_map);
13463 	vm_map_unlock(old_map);
13464 	vm_map_deallocate(old_map);
13465 
13466 	return new_map;
13467 }
13468 
13469 /*
13470  * vm_map_exec:
13471  *
13472  *      Setup the "new_map" with the proper execution environment according
13473  *	to the type of executable (platform, 64bit, chroot environment).
13474  *	Map the comm page and shared region, etc...
13475  */
13476 kern_return_t
vm_map_exec(vm_map_t new_map,task_t task,boolean_t is64bit,void * fsroot,cpu_type_t cpu,cpu_subtype_t cpu_subtype,boolean_t reslide,boolean_t is_driverkit,uint32_t rsr_version)13477 vm_map_exec(
13478 	vm_map_t        new_map,
13479 	task_t          task,
13480 	boolean_t       is64bit,
13481 	void            *fsroot,
13482 	cpu_type_t      cpu,
13483 	cpu_subtype_t   cpu_subtype,
13484 	boolean_t       reslide,
13485 	boolean_t       is_driverkit,
13486 	uint32_t        rsr_version)
13487 {
13488 	SHARED_REGION_TRACE_DEBUG(
13489 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13490 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13491 		(void *)VM_KERNEL_ADDRPERM(new_map),
13492 		(void *)VM_KERNEL_ADDRPERM(task),
13493 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13494 		cpu,
13495 		cpu_subtype));
13496 	(void) vm_commpage_enter(new_map, task, is64bit);
13497 
13498 	(void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13499 
13500 	SHARED_REGION_TRACE_DEBUG(
13501 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13502 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13503 		(void *)VM_KERNEL_ADDRPERM(new_map),
13504 		(void *)VM_KERNEL_ADDRPERM(task),
13505 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13506 		cpu,
13507 		cpu_subtype));
13508 
13509 	/*
13510 	 * Some devices have region(s) of memory that shouldn't get allocated by
13511 	 * user processes. The following code creates dummy vm_map_entry_t's for each
13512 	 * of the regions that needs to be reserved to prevent any allocations in
13513 	 * those regions.
13514 	 */
13515 	kern_return_t kr = KERN_FAILURE;
13516 	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
13517 	vmk_flags.vmkf_beyond_max = true;
13518 
13519 	const struct vm_reserved_region *regions = NULL;
13520 	size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
13521 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13522 
13523 	for (size_t i = 0; i < num_regions; ++i) {
13524 		vm_map_offset_t address = regions[i].vmrr_addr;
13525 
13526 		kr = vm_map_enter(
13527 			new_map,
13528 			&address,
13529 			regions[i].vmrr_size,
13530 			(vm_map_offset_t)0,
13531 			vmk_flags,
13532 			VM_OBJECT_NULL,
13533 			(vm_object_offset_t)0,
13534 			FALSE,
13535 			VM_PROT_NONE,
13536 			VM_PROT_NONE,
13537 			VM_INHERIT_COPY);
13538 
13539 		if (kr != KERN_SUCCESS) {
13540 			panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
13541 		}
13542 	}
13543 
13544 	new_map->reserved_regions = (num_regions ? TRUE : FALSE);
13545 
13546 	return KERN_SUCCESS;
13547 }
13548 
13549 uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
13550 uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
13551 uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
13552 uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
13553 uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
13554 uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
13555 uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
13556 uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
13557 uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
13558 uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
13559 uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
13560 uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
13561 uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13562 /*
13563  *	vm_map_lookup_and_lock_object:
13564  *
13565  *	Finds the VM object, offset, and
13566  *	protection for a given virtual address in the
13567  *	specified map, assuming a page fault of the
13568  *	type specified.
13569  *
13570  *	Returns the (object, offset, protection) for
13571  *	this address, whether it is wired down, and whether
13572  *	this map has the only reference to the data in question.
13573  *	In order to later verify this lookup, a "version"
13574  *	is returned.
13575  *	If contended != NULL, *contended will be set to
13576  *	true iff the thread had to spin or block to acquire
13577  *	an exclusive lock.
13578  *
13579  *	The map MUST be locked by the caller and WILL be
13580  *	locked on exit.  In order to guarantee the
13581  *	existence of the returned object, it is returned
13582  *	locked.
13583  *
13584  *	If a lookup is requested with "write protection"
13585  *	specified, the map may be changed to perform virtual
13586  *	copying operations, although the data referenced will
13587  *	remain the same.
13588  */
13589 kern_return_t
vm_map_lookup_and_lock_object(vm_map_t * var_map,vm_map_offset_t vaddr,vm_prot_t fault_type,int object_lock_type,vm_map_version_t * out_version,vm_object_t * object,vm_object_offset_t * offset,vm_prot_t * out_prot,boolean_t * wired,vm_object_fault_info_t fault_info,vm_map_t * real_map,bool * contended)13590 vm_map_lookup_and_lock_object(
13591 	vm_map_t                *var_map,       /* IN/OUT */
13592 	vm_map_offset_t         vaddr,
13593 	vm_prot_t               fault_type,
13594 	int                     object_lock_type,
13595 	vm_map_version_t        *out_version,   /* OUT */
13596 	vm_object_t             *object,        /* OUT */
13597 	vm_object_offset_t      *offset,        /* OUT */
13598 	vm_prot_t               *out_prot,      /* OUT */
13599 	boolean_t               *wired,         /* OUT */
13600 	vm_object_fault_info_t  fault_info,     /* OUT */
13601 	vm_map_t                *real_map,      /* OUT */
13602 	bool                    *contended)     /* OUT */
13603 {
13604 	vm_map_entry_t                  entry;
13605 	vm_map_t                        map = *var_map;
13606 	vm_map_t                        old_map = *var_map;
13607 	vm_map_t                        cow_sub_map_parent = VM_MAP_NULL;
13608 	vm_map_offset_t                 cow_parent_vaddr = 0;
13609 	vm_map_offset_t                 old_start = 0;
13610 	vm_map_offset_t                 old_end = 0;
13611 	vm_prot_t                       prot;
13612 	boolean_t                       mask_protections;
13613 	boolean_t                       force_copy;
13614 	boolean_t                       no_force_copy_if_executable;
13615 	boolean_t                       submap_needed_copy;
13616 	vm_prot_t                       original_fault_type;
13617 	vm_map_size_t                   fault_page_mask;
13618 
13619 	/*
13620 	 * VM_PROT_MASK means that the caller wants us to use "fault_type"
13621 	 * as a mask against the mapping's actual protections, not as an
13622 	 * absolute value.
13623 	 */
13624 	mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
13625 	force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
13626 	no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
13627 	fault_type &= VM_PROT_ALL;
13628 	original_fault_type = fault_type;
13629 	if (contended) {
13630 		*contended = false;
13631 	}
13632 
13633 	*real_map = map;
13634 
13635 	fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
13636 	vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
13637 
13638 RetryLookup:
13639 	fault_type = original_fault_type;
13640 
13641 	/*
13642 	 *	If the map has an interesting hint, try it before calling
13643 	 *	full blown lookup routine.
13644 	 */
13645 	entry = map->hint;
13646 
13647 	if ((entry == vm_map_to_entry(map)) ||
13648 	    (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
13649 		vm_map_entry_t  tmp_entry;
13650 
13651 		/*
13652 		 *	Entry was either not a valid hint, or the vaddr
13653 		 *	was not contained in the entry, so do a full lookup.
13654 		 */
13655 		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
13656 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13657 				vm_map_unlock(cow_sub_map_parent);
13658 			}
13659 			if ((*real_map != map)
13660 			    && (*real_map != cow_sub_map_parent)) {
13661 				vm_map_unlock(*real_map);
13662 			}
13663 			return KERN_INVALID_ADDRESS;
13664 		}
13665 
13666 		entry = tmp_entry;
13667 	}
13668 	if (map == old_map) {
13669 		old_start = entry->vme_start;
13670 		old_end = entry->vme_end;
13671 	}
13672 
13673 	/*
13674 	 *	Handle submaps.  Drop lock on upper map, submap is
13675 	 *	returned locked.
13676 	 */
13677 
13678 	submap_needed_copy = FALSE;
13679 submap_recurse:
13680 	if (entry->is_sub_map) {
13681 		vm_map_offset_t         local_vaddr;
13682 		vm_map_offset_t         end_delta;
13683 		vm_map_offset_t         start_delta;
13684 		vm_map_offset_t         top_entry_saved_start;
13685 		vm_object_offset_t      top_entry_saved_offset;
13686 		vm_map_entry_t          submap_entry, saved_submap_entry;
13687 		vm_object_offset_t      submap_entry_offset;
13688 		vm_object_size_t        submap_entry_size;
13689 		vm_prot_t               subentry_protection;
13690 		vm_prot_t               subentry_max_protection;
13691 		boolean_t               subentry_no_copy_on_read;
13692 		boolean_t               subentry_permanent;
13693 		boolean_t               subentry_csm_associated;
13694 		boolean_t               mapped_needs_copy = FALSE;
13695 		vm_map_version_t        version;
13696 
13697 		assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
13698 		    "map %p (%d) entry %p submap %p (%d)\n",
13699 		    map, VM_MAP_PAGE_SHIFT(map), entry,
13700 		    VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
13701 
13702 		local_vaddr = vaddr;
13703 		top_entry_saved_start = entry->vme_start;
13704 		top_entry_saved_offset = VME_OFFSET(entry);
13705 
13706 		if ((entry->use_pmap &&
13707 		    !((fault_type & VM_PROT_WRITE) ||
13708 		    force_copy))) {
13709 			/* if real_map equals map we unlock below */
13710 			if ((*real_map != map) &&
13711 			    (*real_map != cow_sub_map_parent)) {
13712 				vm_map_unlock(*real_map);
13713 			}
13714 			*real_map = VME_SUBMAP(entry);
13715 		}
13716 
13717 		if (entry->needs_copy &&
13718 		    ((fault_type & VM_PROT_WRITE) ||
13719 		    force_copy)) {
13720 			if (!mapped_needs_copy) {
13721 				if (vm_map_lock_read_to_write(map)) {
13722 					vm_map_lock_read(map);
13723 					*real_map = map;
13724 					goto RetryLookup;
13725 				}
13726 				vm_map_lock_read(VME_SUBMAP(entry));
13727 				*var_map = VME_SUBMAP(entry);
13728 				cow_sub_map_parent = map;
13729 				/* reset base to map before cow object */
13730 				/* this is the map which will accept   */
13731 				/* the new cow object */
13732 				old_start = entry->vme_start;
13733 				old_end = entry->vme_end;
13734 				cow_parent_vaddr = vaddr;
13735 				mapped_needs_copy = TRUE;
13736 			} else {
13737 				vm_map_lock_read(VME_SUBMAP(entry));
13738 				*var_map = VME_SUBMAP(entry);
13739 				if ((cow_sub_map_parent != map) &&
13740 				    (*real_map != map)) {
13741 					vm_map_unlock(map);
13742 				}
13743 			}
13744 		} else {
13745 			if (entry->needs_copy) {
13746 				submap_needed_copy = TRUE;
13747 			}
13748 			vm_map_lock_read(VME_SUBMAP(entry));
13749 			*var_map = VME_SUBMAP(entry);
13750 			/* leave map locked if it is a target */
13751 			/* cow sub_map above otherwise, just  */
13752 			/* follow the maps down to the object */
13753 			/* here we unlock knowing we are not  */
13754 			/* revisiting the map.  */
13755 			if ((*real_map != map) && (map != cow_sub_map_parent)) {
13756 				vm_map_unlock_read(map);
13757 			}
13758 		}
13759 
13760 		entry = NULL;
13761 		map = *var_map;
13762 
13763 		/* calculate the offset in the submap for vaddr */
13764 		local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
13765 		assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
13766 		    "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
13767 		    (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);
13768 
13769 RetrySubMap:
13770 		if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
13771 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13772 				vm_map_unlock(cow_sub_map_parent);
13773 			}
13774 			if ((*real_map != map)
13775 			    && (*real_map != cow_sub_map_parent)) {
13776 				vm_map_unlock(*real_map);
13777 			}
13778 			*real_map = map;
13779 			return KERN_INVALID_ADDRESS;
13780 		}
13781 
13782 		/* find the attenuated shadow of the underlying object */
13783 		/* on our target map */
13784 
13785 		/* in english the submap object may extend beyond the     */
13786 		/* region mapped by the entry or, may only fill a portion */
13787 		/* of it.  For our purposes, we only care if the object   */
13788 		/* doesn't fill.  In this case the area which will        */
13789 		/* ultimately be clipped in the top map will only need    */
13790 		/* to be as big as the portion of the underlying entry    */
13791 		/* which is mapped */
13792 		start_delta = submap_entry->vme_start > top_entry_saved_offset ?
13793 		    submap_entry->vme_start - top_entry_saved_offset : 0;
13794 
13795 		end_delta =
13796 		    (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
13797 		    submap_entry->vme_end ?
13798 		    0 : (top_entry_saved_offset +
13799 		    (old_end - old_start))
13800 		    - submap_entry->vme_end;
13801 
13802 		old_start += start_delta;
13803 		old_end -= end_delta;
13804 
13805 		if (submap_entry->is_sub_map) {
13806 			entry = submap_entry;
13807 			vaddr = local_vaddr;
13808 			goto submap_recurse;
13809 		}
13810 
13811 		if (((fault_type & VM_PROT_WRITE) ||
13812 		    force_copy)
13813 		    && cow_sub_map_parent) {
13814 			vm_object_t     sub_object, copy_object;
13815 			vm_object_offset_t copy_offset;
13816 			vm_map_offset_t local_start;
13817 			vm_map_offset_t local_end;
13818 			boolean_t       object_copied = FALSE;
13819 			vm_object_offset_t object_copied_offset = 0;
13820 			boolean_t       object_copied_needs_copy = FALSE;
13821 			kern_return_t   kr = KERN_SUCCESS;
13822 
13823 			if (vm_map_lock_read_to_write(map)) {
13824 				vm_map_lock_read(map);
13825 				old_start -= start_delta;
13826 				old_end += end_delta;
13827 				goto RetrySubMap;
13828 			}
13829 
13830 
13831 			sub_object = VME_OBJECT(submap_entry);
13832 			if (sub_object == VM_OBJECT_NULL) {
13833 				sub_object =
13834 				    vm_object_allocate(
13835 					(vm_map_size_t)
13836 					(submap_entry->vme_end -
13837 					submap_entry->vme_start));
13838 				VME_OBJECT_SET(submap_entry, sub_object, false, 0);
13839 				VME_OFFSET_SET(submap_entry, 0);
13840 				assert(!submap_entry->is_sub_map);
13841 				assert(submap_entry->use_pmap);
13842 			}
13843 			local_start =  local_vaddr -
13844 			    (cow_parent_vaddr - old_start);
13845 			local_end = local_vaddr +
13846 			    (old_end - cow_parent_vaddr);
13847 			vm_map_clip_start(map, submap_entry, local_start);
13848 			vm_map_clip_end(map, submap_entry, local_end);
13849 			if (submap_entry->is_sub_map) {
13850 				/* unnesting was done when clipping */
13851 				assert(!submap_entry->use_pmap);
13852 			}
13853 
13854 			/* This is the COW case, lets connect */
13855 			/* an entry in our space to the underlying */
13856 			/* object in the submap, bypassing the  */
13857 			/* submap. */
13858 			submap_entry_offset = VME_OFFSET(submap_entry);
13859 			submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
13860 
13861 			if ((submap_entry->wired_count != 0 ||
13862 			    sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
13863 			    (submap_entry->protection & VM_PROT_EXECUTE) &&
13864 			    no_force_copy_if_executable) {
13865 //				printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
13866 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13867 					vm_map_unlock(cow_sub_map_parent);
13868 				}
13869 				if ((*real_map != map)
13870 				    && (*real_map != cow_sub_map_parent)) {
13871 					vm_map_unlock(*real_map);
13872 				}
13873 				*real_map = map;
13874 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
13875 				vm_map_lock_write_to_read(map);
13876 				kr = KERN_PROTECTION_FAILURE;
13877 				DTRACE_VM4(submap_no_copy_executable,
13878 				    vm_map_t, map,
13879 				    vm_object_offset_t, submap_entry_offset,
13880 				    vm_object_size_t, submap_entry_size,
13881 				    int, kr);
13882 				return kr;
13883 			}
13884 
13885 			if (submap_entry->wired_count != 0) {
13886 				vm_object_reference(sub_object);
13887 
13888 				assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
13889 				    "submap_entry %p offset 0x%llx\n",
13890 				    submap_entry, VME_OFFSET(submap_entry));
13891 
13892 				DTRACE_VM6(submap_copy_slowly,
13893 				    vm_map_t, cow_sub_map_parent,
13894 				    vm_map_offset_t, vaddr,
13895 				    vm_map_t, map,
13896 				    vm_object_size_t, submap_entry_size,
13897 				    int, submap_entry->wired_count,
13898 				    int, sub_object->copy_strategy);
13899 
13900 				saved_submap_entry = submap_entry;
13901 				version.main_timestamp = map->timestamp;
13902 				vm_map_unlock(map); /* Increments timestamp by 1 */
13903 				submap_entry = VM_MAP_ENTRY_NULL;
13904 
13905 				vm_object_lock(sub_object);
13906 				kr = vm_object_copy_slowly(sub_object,
13907 				    submap_entry_offset,
13908 				    submap_entry_size,
13909 				    FALSE,
13910 				    &copy_object);
13911 				object_copied = TRUE;
13912 				object_copied_offset = 0;
13913 				/* 4k: account for extra offset in physical page */
13914 				object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
13915 				object_copied_needs_copy = FALSE;
13916 				vm_object_deallocate(sub_object);
13917 
13918 				vm_map_lock(map);
13919 
13920 				if (kr != KERN_SUCCESS &&
13921 				    kr != KERN_MEMORY_RESTART_COPY) {
13922 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13923 						vm_map_unlock(cow_sub_map_parent);
13924 					}
13925 					if ((*real_map != map)
13926 					    && (*real_map != cow_sub_map_parent)) {
13927 						vm_map_unlock(*real_map);
13928 					}
13929 					*real_map = map;
13930 					vm_object_deallocate(copy_object);
13931 					copy_object = VM_OBJECT_NULL;
13932 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
13933 					vm_map_lock_write_to_read(map);
13934 					DTRACE_VM4(submap_copy_error_slowly,
13935 					    vm_object_t, sub_object,
13936 					    vm_object_offset_t, submap_entry_offset,
13937 					    vm_object_size_t, submap_entry_size,
13938 					    int, kr);
13939 					vm_map_lookup_and_lock_object_copy_slowly_error++;
13940 					return kr;
13941 				}
13942 
13943 				if ((kr == KERN_SUCCESS) &&
13944 				    (version.main_timestamp + 1) == map->timestamp) {
13945 					submap_entry = saved_submap_entry;
13946 				} else {
13947 					saved_submap_entry = NULL;
13948 					old_start -= start_delta;
13949 					old_end += end_delta;
13950 					vm_object_deallocate(copy_object);
13951 					copy_object = VM_OBJECT_NULL;
13952 					vm_map_lock_write_to_read(map);
13953 					vm_map_lookup_and_lock_object_copy_slowly_restart++;
13954 					goto RetrySubMap;
13955 				}
13956 				vm_map_lookup_and_lock_object_copy_slowly_count++;
13957 				vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
13958 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
13959 					vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
13960 				}
13961 			} else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
13962 				submap_entry_offset = VME_OFFSET(submap_entry);
13963 				copy_object = VM_OBJECT_NULL;
13964 				object_copied_offset = submap_entry_offset;
13965 				object_copied_needs_copy = FALSE;
13966 				DTRACE_VM6(submap_copy_strategically,
13967 				    vm_map_t, cow_sub_map_parent,
13968 				    vm_map_offset_t, vaddr,
13969 				    vm_map_t, map,
13970 				    vm_object_size_t, submap_entry_size,
13971 				    int, submap_entry->wired_count,
13972 				    int, sub_object->copy_strategy);
13973 				kr = vm_object_copy_strategically(
13974 					sub_object,
13975 					submap_entry_offset,
13976 					submap_entry->vme_end - submap_entry->vme_start,
13977 					&copy_object,
13978 					&object_copied_offset,
13979 					&object_copied_needs_copy);
13980 				if (kr == KERN_MEMORY_RESTART_COPY) {
13981 					old_start -= start_delta;
13982 					old_end += end_delta;
13983 					vm_object_deallocate(copy_object);
13984 					copy_object = VM_OBJECT_NULL;
13985 					vm_map_lock_write_to_read(map);
13986 					vm_map_lookup_and_lock_object_copy_strategically_restart++;
13987 					goto RetrySubMap;
13988 				}
13989 				if (kr != KERN_SUCCESS) {
13990 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13991 						vm_map_unlock(cow_sub_map_parent);
13992 					}
13993 					if ((*real_map != map)
13994 					    && (*real_map != cow_sub_map_parent)) {
13995 						vm_map_unlock(*real_map);
13996 					}
13997 					*real_map = map;
13998 					vm_object_deallocate(copy_object);
13999 					copy_object = VM_OBJECT_NULL;
14000 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
14001 					vm_map_lock_write_to_read(map);
14002 					DTRACE_VM4(submap_copy_error_strategically,
14003 					    vm_object_t, sub_object,
14004 					    vm_object_offset_t, submap_entry_offset,
14005 					    vm_object_size_t, submap_entry_size,
14006 					    int, kr);
14007 					vm_map_lookup_and_lock_object_copy_strategically_error++;
14008 					return kr;
14009 				}
14010 				assert(copy_object != VM_OBJECT_NULL);
14011 				assert(copy_object != sub_object);
14012 				object_copied = TRUE;
14013 				vm_map_lookup_and_lock_object_copy_strategically_count++;
14014 				vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
14015 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
14016 					vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
14017 				}
14018 			} else {
14019 				/* set up shadow object */
14020 				object_copied = FALSE;
14021 				copy_object = sub_object;
14022 				vm_object_lock(sub_object);
14023 				vm_object_reference_locked(sub_object);
14024 				sub_object->shadowed = TRUE;
14025 				vm_object_unlock(sub_object);
14026 
14027 				assert(submap_entry->wired_count == 0);
14028 				submap_entry->needs_copy = TRUE;
14029 
14030 				prot = submap_entry->protection;
14031 				assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
14032 				prot = prot & ~VM_PROT_WRITE;
14033 				assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
14034 
14035 				if (override_nx(old_map,
14036 				    VME_ALIAS(submap_entry))
14037 				    && prot) {
14038 					prot |= VM_PROT_EXECUTE;
14039 				}
14040 
14041 				vm_object_pmap_protect(
14042 					sub_object,
14043 					VME_OFFSET(submap_entry),
14044 					submap_entry->vme_end -
14045 					submap_entry->vme_start,
14046 					(submap_entry->is_shared
14047 					|| map->mapped_in_other_pmaps) ?
14048 					PMAP_NULL : map->pmap,
14049 					VM_MAP_PAGE_SIZE(map),
14050 					submap_entry->vme_start,
14051 					prot);
14052 				vm_map_lookup_and_lock_object_copy_shadow_count++;
14053 				vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
14054 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
14055 					vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
14056 				}
14057 			}
14058 
14059 			/*
14060 			 * Adjust the fault offset to the submap entry.
14061 			 */
14062 			copy_offset = (local_vaddr -
14063 			    submap_entry->vme_start +
14064 			    VME_OFFSET(submap_entry));
14065 
14066 			/* This works diffently than the   */
14067 			/* normal submap case. We go back  */
14068 			/* to the parent of the cow map and*/
14069 			/* clip out the target portion of  */
14070 			/* the sub_map, substituting the   */
14071 			/* new copy object,                */
14072 
14073 			subentry_protection = submap_entry->protection;
14074 			subentry_max_protection = submap_entry->max_protection;
14075 			subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
14076 			subentry_permanent = submap_entry->vme_permanent;
14077 			subentry_csm_associated = submap_entry->csm_associated;
14078 
14079 			vm_map_unlock(map);
14080 			submap_entry = NULL; /* not valid after map unlock */
14081 
14082 			local_start = old_start;
14083 			local_end = old_end;
14084 			map = cow_sub_map_parent;
14085 			*var_map = cow_sub_map_parent;
14086 			vaddr = cow_parent_vaddr;
14087 			cow_sub_map_parent = NULL;
14088 
14089 			if (!vm_map_lookup_entry(map,
14090 			    vaddr, &entry)) {
14091 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14092 					vm_map_unlock(cow_sub_map_parent);
14093 				}
14094 				if ((*real_map != map)
14095 				    && (*real_map != cow_sub_map_parent)) {
14096 					vm_map_unlock(*real_map);
14097 				}
14098 				*real_map = map;
14099 				vm_object_deallocate(
14100 					copy_object);
14101 				copy_object = VM_OBJECT_NULL;
14102 				vm_map_lock_write_to_read(map);
14103 				DTRACE_VM4(submap_lookup_post_unlock,
14104 				    uint64_t, (uint64_t)entry->vme_start,
14105 				    uint64_t, (uint64_t)entry->vme_end,
14106 				    vm_map_offset_t, vaddr,
14107 				    int, object_copied);
14108 				return KERN_INVALID_ADDRESS;
14109 			}
14110 
14111 			/* clip out the portion of space */
14112 			/* mapped by the sub map which   */
14113 			/* corresponds to the underlying */
14114 			/* object */
14115 
14116 			/*
14117 			 * Clip (and unnest) the smallest nested chunk
14118 			 * possible around the faulting address...
14119 			 */
14120 			local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14121 			local_end = local_start + pmap_shared_region_size_min(map->pmap);
14122 			/*
14123 			 * ... but don't go beyond the "old_start" to "old_end"
14124 			 * range, to avoid spanning over another VM region
14125 			 * with a possibly different VM object and/or offset.
14126 			 */
14127 			if (local_start < old_start) {
14128 				local_start = old_start;
14129 			}
14130 			if (local_end > old_end) {
14131 				local_end = old_end;
14132 			}
14133 			/*
14134 			 * Adjust copy_offset to the start of the range.
14135 			 */
14136 			copy_offset -= (vaddr - local_start);
14137 
14138 			vm_map_clip_start(map, entry, local_start);
14139 			vm_map_clip_end(map, entry, local_end);
14140 			if (entry->is_sub_map) {
14141 				/* unnesting was done when clipping */
14142 				assert(!entry->use_pmap);
14143 			}
14144 
14145 			/* substitute copy object for */
14146 			/* shared map entry           */
14147 			vm_map_deallocate(VME_SUBMAP(entry));
14148 			assert(!entry->iokit_acct);
14149 			entry->use_pmap = TRUE;
14150 			VME_OBJECT_SET(entry, copy_object, false, 0);
14151 
14152 			/* propagate the submap entry's protections */
14153 			if (entry->protection != VM_PROT_READ) {
14154 				/*
14155 				 * Someone has already altered the top entry's
14156 				 * protections via vm_protect(VM_PROT_COPY).
14157 				 * Respect these new values and ignore the
14158 				 * submap entry's protections.
14159 				 */
14160 			} else {
14161 				/*
14162 				 * Regular copy-on-write: propagate the submap
14163 				 * entry's protections to the top map entry.
14164 				 */
14165 				entry->protection |= subentry_protection;
14166 			}
14167 			entry->max_protection |= subentry_max_protection;
14168 			/* propagate some attributes from subentry */
14169 			entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14170 			entry->vme_permanent = subentry_permanent;
14171 			entry->csm_associated = subentry_csm_associated;
14172 
14173 			if ((entry->protection & VM_PROT_WRITE) &&
14174 			    (entry->protection & VM_PROT_EXECUTE) &&
14175 #if XNU_TARGET_OS_OSX
14176 			    map->pmap != kernel_pmap &&
14177 			    (vm_map_cs_enforcement(map)
14178 #if __arm64__
14179 			    || !VM_MAP_IS_EXOTIC(map)
14180 #endif /* __arm64__ */
14181 			    ) &&
14182 #endif /* XNU_TARGET_OS_OSX */
14183 #if CODE_SIGNING_MONITOR
14184 			    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
14185 #endif
14186 			    !(entry->used_for_jit) &&
14187 			    VM_MAP_POLICY_WX_STRIP_X(map)) {
14188 				DTRACE_VM3(cs_wx,
14189 				    uint64_t, (uint64_t)entry->vme_start,
14190 				    uint64_t, (uint64_t)entry->vme_end,
14191 				    vm_prot_t, entry->protection);
14192 				printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
14193 				    proc_selfpid(),
14194 				    (get_bsdtask_info(current_task())
14195 				    ? proc_name_address(get_bsdtask_info(current_task()))
14196 				    : "?"),
14197 				    __FUNCTION__, __LINE__,
14198 #if DEVELOPMENT || DEBUG
14199 				    (uint64_t)entry->vme_start,
14200 				    (uint64_t)entry->vme_end,
14201 #else /* DEVELOPMENT || DEBUG */
14202 				    (uint64_t)0,
14203 				    (uint64_t)0,
14204 #endif /* DEVELOPMENT || DEBUG */
14205 				    entry->protection);
14206 				entry->protection &= ~VM_PROT_EXECUTE;
14207 			}
14208 
14209 			if (object_copied) {
14210 				VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14211 				entry->needs_copy = object_copied_needs_copy;
14212 				entry->is_shared = FALSE;
14213 			} else {
14214 				assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14215 				assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14216 				assert(entry->wired_count == 0);
14217 				VME_OFFSET_SET(entry, copy_offset);
14218 				entry->needs_copy = TRUE;
14219 				if (map != old_map) {
14220 					entry->is_shared = TRUE;
14221 				}
14222 			}
14223 			if (entry->inheritance == VM_INHERIT_SHARE) {
14224 				entry->inheritance = VM_INHERIT_COPY;
14225 			}
14226 
14227 			vm_map_lock_write_to_read(map);
14228 		} else {
14229 			if ((cow_sub_map_parent)
14230 			    && (cow_sub_map_parent != *real_map)
14231 			    && (cow_sub_map_parent != map)) {
14232 				vm_map_unlock(cow_sub_map_parent);
14233 			}
14234 			entry = submap_entry;
14235 			vaddr = local_vaddr;
14236 		}
14237 	}
14238 
14239 	/*
14240 	 *	Check whether this task is allowed to have
14241 	 *	this page.
14242 	 */
14243 
14244 	prot = entry->protection;
14245 
14246 	if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14247 		/*
14248 		 * HACK -- if not a stack, then allow execution
14249 		 */
14250 		prot |= VM_PROT_EXECUTE;
14251 	}
14252 
14253 	if (mask_protections) {
14254 		fault_type &= prot;
14255 		if (fault_type == VM_PROT_NONE) {
14256 			goto protection_failure;
14257 		}
14258 	}
14259 	if (((fault_type & prot) != fault_type)
14260 #if __arm64__
14261 	    /* prefetch abort in execute-only page */
14262 	    && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14263 #elif defined(__x86_64__)
14264 	    /* Consider the UEXEC bit when handling an EXECUTE fault */
14265 	    && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14266 #endif
14267 	    ) {
14268 protection_failure:
14269 		if (*real_map != map) {
14270 			vm_map_unlock(*real_map);
14271 		}
14272 		*real_map = map;
14273 
14274 		if ((fault_type & VM_PROT_EXECUTE) && prot) {
14275 			log_stack_execution_failure((addr64_t)vaddr, prot);
14276 		}
14277 
14278 		DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14279 		DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14280 		/*
14281 		 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14282 		 *
14283 		 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14284 		 */
14285 		return KERN_PROTECTION_FAILURE;
14286 	}
14287 
14288 	/*
14289 	 *	If this page is not pageable, we have to get
14290 	 *	it for all possible accesses.
14291 	 */
14292 
14293 	*wired = (entry->wired_count != 0);
14294 	if (*wired) {
14295 		fault_type = prot;
14296 	}
14297 
14298 	/*
14299 	 *	If the entry was copy-on-write, we either ...
14300 	 */
14301 
14302 	if (entry->needs_copy) {
14303 		/*
14304 		 *	If we want to write the page, we may as well
14305 		 *	handle that now since we've got the map locked.
14306 		 *
14307 		 *	If we don't need to write the page, we just
14308 		 *	demote the permissions allowed.
14309 		 */
14310 
14311 		if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14312 			/*
14313 			 *	Make a new object, and place it in the
14314 			 *	object chain.  Note that no new references
14315 			 *	have appeared -- one just moved from the
14316 			 *	map to the new object.
14317 			 */
14318 
14319 			if (vm_map_lock_read_to_write(map)) {
14320 				vm_map_lock_read(map);
14321 				goto RetryLookup;
14322 			}
14323 
14324 			if (VME_OBJECT(entry)->shadowed == FALSE) {
14325 				vm_object_lock(VME_OBJECT(entry));
14326 				VME_OBJECT(entry)->shadowed = TRUE;
14327 				vm_object_unlock(VME_OBJECT(entry));
14328 			}
14329 			VME_OBJECT_SHADOW(entry,
14330 			    (vm_map_size_t) (entry->vme_end -
14331 			    entry->vme_start),
14332 			    vm_map_always_shadow(map));
14333 			entry->needs_copy = FALSE;
14334 
14335 			vm_map_lock_write_to_read(map);
14336 		}
14337 		if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14338 			/*
14339 			 *	We're attempting to read a copy-on-write
14340 			 *	page -- don't allow writes.
14341 			 */
14342 
14343 			prot &= (~VM_PROT_WRITE);
14344 		}
14345 	}
14346 
14347 	if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14348 		/*
14349 		 * We went through a "needs_copy" submap without triggering
14350 		 * a copy, so granting write access to the page would bypass
14351 		 * that submap's "needs_copy".
14352 		 */
14353 		assert(!(fault_type & VM_PROT_WRITE));
14354 		assert(!*wired);
14355 		assert(!force_copy);
14356 		// printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14357 		prot &= ~VM_PROT_WRITE;
14358 	}
14359 
14360 	/*
14361 	 *	Create an object if necessary.
14362 	 */
14363 	if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14364 		if (vm_map_lock_read_to_write(map)) {
14365 			vm_map_lock_read(map);
14366 			goto RetryLookup;
14367 		}
14368 
14369 		VME_OBJECT_SET(entry,
14370 		    vm_object_allocate(
14371 			    (vm_map_size_t)(entry->vme_end -
14372 			    entry->vme_start)), false, 0);
14373 		VME_OFFSET_SET(entry, 0);
14374 		assert(entry->use_pmap);
14375 		vm_map_lock_write_to_read(map);
14376 	}
14377 
14378 	/*
14379 	 *	Return the object/offset from this entry.  If the entry
14380 	 *	was copy-on-write or empty, it has been fixed up.  Also
14381 	 *	return the protection.
14382 	 */
14383 
14384 	*offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14385 	*object = VME_OBJECT(entry);
14386 	*out_prot = prot;
14387 	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14388 
14389 	if (fault_info) {
14390 		fault_info->interruptible = THREAD_UNINT; /* for now... */
14391 		/* ... the caller will change "interruptible" if needed */
14392 		fault_info->cluster_size = 0;
14393 		fault_info->user_tag = VME_ALIAS(entry);
14394 		fault_info->pmap_options = 0;
14395 		if (entry->iokit_acct ||
14396 		    (!entry->is_sub_map && !entry->use_pmap)) {
14397 			fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14398 		}
14399 		fault_info->behavior = entry->behavior;
14400 		fault_info->lo_offset = VME_OFFSET(entry);
14401 		fault_info->hi_offset =
14402 		    (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14403 		fault_info->no_cache  = entry->no_cache;
14404 		fault_info->stealth = FALSE;
14405 		fault_info->io_sync = FALSE;
14406 		if (entry->used_for_jit ||
14407 #if CODE_SIGNING_MONITOR
14408 		    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
14409 #endif
14410 		    entry->vme_resilient_codesign) {
14411 			fault_info->cs_bypass = TRUE;
14412 		} else {
14413 			fault_info->cs_bypass = FALSE;
14414 		}
14415 		fault_info->csm_associated = FALSE;
14416 #if CODE_SIGNING_MONITOR
14417 		if (entry->csm_associated) {
14418 			/*
14419 			 * The pmap layer will validate this page
14420 			 * before allowing it to be executed from.
14421 			 */
14422 			fault_info->csm_associated = TRUE;
14423 		}
14424 #endif
14425 		fault_info->mark_zf_absent = FALSE;
14426 		fault_info->batch_pmap_op = FALSE;
14427 		fault_info->resilient_media = entry->vme_resilient_media;
14428 		fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
14429 		fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14430 		if (entry->translated_allow_execute) {
14431 			fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14432 		}
14433 	}
14434 
14435 	/*
14436 	 *	Lock the object to prevent it from disappearing
14437 	 */
14438 	if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14439 		if (contended == NULL) {
14440 			vm_object_lock(*object);
14441 		} else {
14442 			*contended = vm_object_lock_check_contended(*object);
14443 		}
14444 	} else {
14445 		vm_object_lock_shared(*object);
14446 	}
14447 
14448 	/*
14449 	 *	Save the version number
14450 	 */
14451 
14452 	out_version->main_timestamp = map->timestamp;
14453 
14454 	return KERN_SUCCESS;
14455 }
14456 
14457 
14458 /*
14459  *	vm_map_verify:
14460  *
14461  *	Verifies that the map in question has not changed
14462  *	since the given version. The map has to be locked
14463  *	("shared" mode is fine) before calling this function
14464  *	and it will be returned locked too.
14465  */
14466 boolean_t
vm_map_verify(vm_map_t map,vm_map_version_t * version)14467 vm_map_verify(
14468 	vm_map_t                map,
14469 	vm_map_version_t        *version)       /* REF */
14470 {
14471 	boolean_t       result;
14472 
14473 	vm_map_lock_assert_held(map);
14474 	result = (map->timestamp == version->main_timestamp);
14475 
14476 	return result;
14477 }
14478 
14479 /*
14480  *	TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
14481  *	Goes away after regular vm_region_recurse function migrates to
14482  *	64 bits
14483  *	vm_region_recurse: A form of vm_region which follows the
14484  *	submaps in a target map
14485  *
14486  */
14487 
14488 kern_return_t
vm_map_region_recurse_64(vm_map_t map,vm_map_offset_t * address,vm_map_size_t * size,natural_t * nesting_depth,vm_region_submap_info_64_t submap_info,mach_msg_type_number_t * count)14489 vm_map_region_recurse_64(
14490 	vm_map_t                 map,
14491 	vm_map_offset_t *address,               /* IN/OUT */
14492 	vm_map_size_t           *size,                  /* OUT */
14493 	natural_t               *nesting_depth, /* IN/OUT */
14494 	vm_region_submap_info_64_t      submap_info,    /* IN/OUT */
14495 	mach_msg_type_number_t  *count) /* IN/OUT */
14496 {
14497 	mach_msg_type_number_t  original_count;
14498 	vm_region_extended_info_data_t  extended;
14499 	vm_map_entry_t                  tmp_entry;
14500 	vm_map_offset_t                 user_address;
14501 	unsigned int                    user_max_depth;
14502 
14503 	/*
14504 	 * "curr_entry" is the VM map entry preceding or including the
14505 	 * address we're looking for.
14506 	 * "curr_map" is the map or sub-map containing "curr_entry".
14507 	 * "curr_address" is the equivalent of the top map's "user_address"
14508 	 * in the current map.
14509 	 * "curr_offset" is the cumulated offset of "curr_map" in the
14510 	 * target task's address space.
14511 	 * "curr_depth" is the depth of "curr_map" in the chain of
14512 	 * sub-maps.
14513 	 *
14514 	 * "curr_max_below" and "curr_max_above" limit the range (around
14515 	 * "curr_address") we should take into account in the current (sub)map.
14516 	 * They limit the range to what's visible through the map entries
14517 	 * we've traversed from the top map to the current map.
14518 	 *
14519 	 */
14520 	vm_map_entry_t                  curr_entry;
14521 	vm_map_address_t                curr_address;
14522 	vm_map_offset_t                 curr_offset;
14523 	vm_map_t                        curr_map;
14524 	unsigned int                    curr_depth;
14525 	vm_map_offset_t                 curr_max_below, curr_max_above;
14526 	vm_map_offset_t                 curr_skip;
14527 
14528 	/*
14529 	 * "next_" is the same as "curr_" but for the VM region immediately
14530 	 * after the address we're looking for.  We need to keep track of this
14531 	 * too because we want to return info about that region if the
14532 	 * address we're looking for is not mapped.
14533 	 */
14534 	vm_map_entry_t                  next_entry;
14535 	vm_map_offset_t                 next_offset;
14536 	vm_map_offset_t                 next_address;
14537 	vm_map_t                        next_map;
14538 	unsigned int                    next_depth;
14539 	vm_map_offset_t                 next_max_below, next_max_above;
14540 	vm_map_offset_t                 next_skip;
14541 
14542 	boolean_t                       look_for_pages;
14543 	vm_region_submap_short_info_64_t short_info;
14544 	boolean_t                       do_region_footprint;
14545 	int                             effective_page_size, effective_page_shift;
14546 	boolean_t                       submap_needed_copy;
14547 
14548 	if (map == VM_MAP_NULL) {
14549 		/* no address space to work on */
14550 		return KERN_INVALID_ARGUMENT;
14551 	}
14552 
14553 	effective_page_shift = vm_self_region_page_shift(map);
14554 	effective_page_size = (1 << effective_page_shift);
14555 
14556 	if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
14557 		/*
14558 		 * "info" structure is not big enough and
14559 		 * would overflow
14560 		 */
14561 		return KERN_INVALID_ARGUMENT;
14562 	}
14563 
14564 	do_region_footprint = task_self_region_footprint();
14565 	original_count = *count;
14566 
14567 	if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
14568 		*count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
14569 		look_for_pages = FALSE;
14570 		short_info = (vm_region_submap_short_info_64_t) submap_info;
14571 		submap_info = NULL;
14572 	} else {
14573 		look_for_pages = TRUE;
14574 		*count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
14575 		short_info = NULL;
14576 
14577 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
14578 			*count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
14579 		}
14580 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
14581 			*count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
14582 		}
14583 	}
14584 
14585 	user_address = *address;
14586 	user_max_depth = *nesting_depth;
14587 	submap_needed_copy = FALSE;
14588 
14589 	if (not_in_kdp) {
14590 		vm_map_lock_read(map);
14591 	}
14592 
14593 recurse_again:
14594 	curr_entry = NULL;
14595 	curr_map = map;
14596 	curr_address = user_address;
14597 	curr_offset = 0;
14598 	curr_skip = 0;
14599 	curr_depth = 0;
14600 	curr_max_above = ((vm_map_offset_t) -1) - curr_address;
14601 	curr_max_below = curr_address;
14602 
14603 	next_entry = NULL;
14604 	next_map = NULL;
14605 	next_address = 0;
14606 	next_offset = 0;
14607 	next_skip = 0;
14608 	next_depth = 0;
14609 	next_max_above = (vm_map_offset_t) -1;
14610 	next_max_below = (vm_map_offset_t) -1;
14611 
14612 	for (;;) {
14613 		if (vm_map_lookup_entry(curr_map,
14614 		    curr_address,
14615 		    &tmp_entry)) {
14616 			/* tmp_entry contains the address we're looking for */
14617 			curr_entry = tmp_entry;
14618 		} else {
14619 			vm_map_offset_t skip;
14620 			/*
14621 			 * The address is not mapped.  "tmp_entry" is the
14622 			 * map entry preceding the address.  We want the next
14623 			 * one, if it exists.
14624 			 */
14625 			curr_entry = tmp_entry->vme_next;
14626 
14627 			if (curr_entry == vm_map_to_entry(curr_map) ||
14628 			    (curr_entry->vme_start >=
14629 			    curr_address + curr_max_above)) {
14630 				/* no next entry at this level: stop looking */
14631 				if (not_in_kdp) {
14632 					vm_map_unlock_read(curr_map);
14633 				}
14634 				curr_entry = NULL;
14635 				curr_map = NULL;
14636 				curr_skip = 0;
14637 				curr_offset = 0;
14638 				curr_depth = 0;
14639 				curr_max_above = 0;
14640 				curr_max_below = 0;
14641 				break;
14642 			}
14643 
14644 			/* adjust current address and offset */
14645 			skip = curr_entry->vme_start - curr_address;
14646 			curr_address = curr_entry->vme_start;
14647 			curr_skip += skip;
14648 			curr_offset += skip;
14649 			curr_max_above -= skip;
14650 			curr_max_below = 0;
14651 		}
14652 
14653 		/*
14654 		 * Is the next entry at this level closer to the address (or
14655 		 * deeper in the submap chain) than the one we had
14656 		 * so far ?
14657 		 */
14658 		tmp_entry = curr_entry->vme_next;
14659 		if (tmp_entry == vm_map_to_entry(curr_map)) {
14660 			/* no next entry at this level */
14661 		} else if (tmp_entry->vme_start >=
14662 		    curr_address + curr_max_above) {
14663 			/*
14664 			 * tmp_entry is beyond the scope of what we mapped of
14665 			 * this submap in the upper level: ignore it.
14666 			 */
14667 		} else if ((next_entry == NULL) ||
14668 		    (tmp_entry->vme_start + curr_offset <=
14669 		    next_entry->vme_start + next_offset)) {
14670 			/*
14671 			 * We didn't have a "next_entry" or this one is
14672 			 * closer to the address we're looking for:
14673 			 * use this "tmp_entry" as the new "next_entry".
14674 			 */
14675 			if (next_entry != NULL) {
14676 				/* unlock the last "next_map" */
14677 				if (next_map != curr_map && not_in_kdp) {
14678 					vm_map_unlock_read(next_map);
14679 				}
14680 			}
14681 			next_entry = tmp_entry;
14682 			next_map = curr_map;
14683 			next_depth = curr_depth;
14684 			next_address = next_entry->vme_start;
14685 			next_skip = curr_skip;
14686 			next_skip += (next_address - curr_address);
14687 			next_offset = curr_offset;
14688 			next_offset += (next_address - curr_address);
14689 			next_max_above = MIN(next_max_above, curr_max_above);
14690 			next_max_above = MIN(next_max_above,
14691 			    next_entry->vme_end - next_address);
14692 			next_max_below = MIN(next_max_below, curr_max_below);
14693 			next_max_below = MIN(next_max_below,
14694 			    next_address - next_entry->vme_start);
14695 		}
14696 
14697 		/*
14698 		 * "curr_max_{above,below}" allow us to keep track of the
14699 		 * portion of the submap that is actually mapped at this level:
14700 		 * the rest of that submap is irrelevant to us, since it's not
14701 		 * mapped here.
14702 		 * The relevant portion of the map starts at
14703 		 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
14704 		 */
14705 		curr_max_above = MIN(curr_max_above,
14706 		    curr_entry->vme_end - curr_address);
14707 		curr_max_below = MIN(curr_max_below,
14708 		    curr_address - curr_entry->vme_start);
14709 
14710 		if (!curr_entry->is_sub_map ||
14711 		    curr_depth >= user_max_depth) {
14712 			/*
14713 			 * We hit a leaf map or we reached the maximum depth
14714 			 * we could, so stop looking.  Keep the current map
14715 			 * locked.
14716 			 */
14717 			break;
14718 		}
14719 
14720 		/*
14721 		 * Get down to the next submap level.
14722 		 */
14723 
14724 		if (curr_entry->needs_copy) {
14725 			/* everything below this is effectively copy-on-write */
14726 			submap_needed_copy = TRUE;
14727 		}
14728 
14729 		/*
14730 		 * Lock the next level and unlock the current level,
14731 		 * unless we need to keep it locked to access the "next_entry"
14732 		 * later.
14733 		 */
14734 		if (not_in_kdp) {
14735 			vm_map_lock_read(VME_SUBMAP(curr_entry));
14736 		}
14737 		if (curr_map == next_map) {
14738 			/* keep "next_map" locked in case we need it */
14739 		} else {
14740 			/* release this map */
14741 			if (not_in_kdp) {
14742 				vm_map_unlock_read(curr_map);
14743 			}
14744 		}
14745 
14746 		/*
14747 		 * Adjust the offset.  "curr_entry" maps the submap
14748 		 * at relative address "curr_entry->vme_start" in the
14749 		 * curr_map but skips the first "VME_OFFSET(curr_entry)"
14750 		 * bytes of the submap.
14751 		 * "curr_offset" always represents the offset of a virtual
14752 		 * address in the curr_map relative to the absolute address
14753 		 * space (i.e. the top-level VM map).
14754 		 */
14755 		curr_offset +=
14756 		    (VME_OFFSET(curr_entry) - curr_entry->vme_start);
14757 		curr_address = user_address + curr_offset;
14758 		/* switch to the submap */
14759 		curr_map = VME_SUBMAP(curr_entry);
14760 		curr_depth++;
14761 		curr_entry = NULL;
14762 	}
14763 
14764 // LP64todo: all the current tools are 32bit, obviously never worked for 64b
14765 // so probably should be a real 32b ID vs. ptr.
14766 // Current users just check for equality
14767 
14768 	if (curr_entry == NULL) {
14769 		/* no VM region contains the address... */
14770 
14771 		if (do_region_footprint && /* we want footprint numbers */
14772 		    next_entry == NULL && /* & there are no more regions */
14773 		    /* & we haven't already provided our fake region: */
14774 		    user_address <= vm_map_last_entry(map)->vme_end) {
14775 			ledger_amount_t ledger_resident, ledger_compressed;
14776 
14777 			/*
14778 			 * Add a fake memory region to account for
14779 			 * purgeable and/or ledger-tagged memory that
14780 			 * counts towards this task's memory footprint,
14781 			 * i.e. the resident/compressed pages of non-volatile
14782 			 * objects owned by that task.
14783 			 */
14784 			task_ledgers_footprint(map->pmap->ledger,
14785 			    &ledger_resident,
14786 			    &ledger_compressed);
14787 			if (ledger_resident + ledger_compressed == 0) {
14788 				/* no purgeable memory usage to report */
14789 				return KERN_INVALID_ADDRESS;
14790 			}
14791 			/* fake region to show nonvolatile footprint */
14792 			if (look_for_pages) {
14793 				submap_info->protection = VM_PROT_DEFAULT;
14794 				submap_info->max_protection = VM_PROT_DEFAULT;
14795 				submap_info->inheritance = VM_INHERIT_DEFAULT;
14796 				submap_info->offset = 0;
14797 				submap_info->user_tag = -1;
14798 				submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
14799 				submap_info->pages_shared_now_private = 0;
14800 				submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
14801 				submap_info->pages_dirtied = submap_info->pages_resident;
14802 				submap_info->ref_count = 1;
14803 				submap_info->shadow_depth = 0;
14804 				submap_info->external_pager = 0;
14805 				submap_info->share_mode = SM_PRIVATE;
14806 				if (submap_needed_copy) {
14807 					submap_info->share_mode = SM_COW;
14808 				}
14809 				submap_info->is_submap = 0;
14810 				submap_info->behavior = VM_BEHAVIOR_DEFAULT;
14811 				submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
14812 				submap_info->user_wired_count = 0;
14813 				submap_info->pages_reusable = 0;
14814 			} else {
14815 				short_info->user_tag = -1;
14816 				short_info->offset = 0;
14817 				short_info->protection = VM_PROT_DEFAULT;
14818 				short_info->inheritance = VM_INHERIT_DEFAULT;
14819 				short_info->max_protection = VM_PROT_DEFAULT;
14820 				short_info->behavior = VM_BEHAVIOR_DEFAULT;
14821 				short_info->user_wired_count = 0;
14822 				short_info->is_submap = 0;
14823 				short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
14824 				short_info->external_pager = 0;
14825 				short_info->shadow_depth = 0;
14826 				short_info->share_mode = SM_PRIVATE;
14827 				if (submap_needed_copy) {
14828 					short_info->share_mode = SM_COW;
14829 				}
14830 				short_info->ref_count = 1;
14831 			}
14832 			*nesting_depth = 0;
14833 			*size = (vm_map_size_t) (ledger_resident + ledger_compressed);
14834 //			*address = user_address;
14835 			*address = vm_map_last_entry(map)->vme_end;
14836 			return KERN_SUCCESS;
14837 		}
14838 
14839 		if (next_entry == NULL) {
14840 			/* ... and no VM region follows it either */
14841 			return KERN_INVALID_ADDRESS;
14842 		}
14843 		/* ... gather info about the next VM region */
14844 		curr_entry = next_entry;
14845 		curr_map = next_map;    /* still locked ... */
14846 		curr_address = next_address;
14847 		curr_skip = next_skip;
14848 		curr_offset = next_offset;
14849 		curr_depth = next_depth;
14850 		curr_max_above = next_max_above;
14851 		curr_max_below = next_max_below;
14852 	} else {
14853 		/* we won't need "next_entry" after all */
14854 		if (next_entry != NULL) {
14855 			/* release "next_map" */
14856 			if (next_map != curr_map && not_in_kdp) {
14857 				vm_map_unlock_read(next_map);
14858 			}
14859 		}
14860 	}
14861 	next_entry = NULL;
14862 	next_map = NULL;
14863 	next_offset = 0;
14864 	next_skip = 0;
14865 	next_depth = 0;
14866 	next_max_below = -1;
14867 	next_max_above = -1;
14868 
14869 	if (curr_entry->is_sub_map &&
14870 	    curr_depth < user_max_depth) {
14871 		/*
14872 		 * We're not as deep as we could be:  we must have
14873 		 * gone back up after not finding anything mapped
14874 		 * below the original top-level map entry's.
14875 		 * Let's move "curr_address" forward and recurse again.
14876 		 */
14877 		user_address = curr_address;
14878 		goto recurse_again;
14879 	}
14880 
14881 	*nesting_depth = curr_depth;
14882 	*size = curr_max_above + curr_max_below;
14883 	*address = user_address + curr_skip - curr_max_below;
14884 
14885 	if (look_for_pages) {
14886 		submap_info->user_tag = VME_ALIAS(curr_entry);
14887 		submap_info->offset = VME_OFFSET(curr_entry);
14888 		submap_info->protection = curr_entry->protection;
14889 		submap_info->inheritance = curr_entry->inheritance;
14890 		submap_info->max_protection = curr_entry->max_protection;
14891 		submap_info->behavior = curr_entry->behavior;
14892 		submap_info->user_wired_count = curr_entry->user_wired_count;
14893 		submap_info->is_submap = curr_entry->is_sub_map;
14894 		if (curr_entry->is_sub_map) {
14895 			submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
14896 		} else {
14897 			submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
14898 		}
14899 	} else {
14900 		short_info->user_tag = VME_ALIAS(curr_entry);
14901 		short_info->offset = VME_OFFSET(curr_entry);
14902 		short_info->protection = curr_entry->protection;
14903 		short_info->inheritance = curr_entry->inheritance;
14904 		short_info->max_protection = curr_entry->max_protection;
14905 		short_info->behavior = curr_entry->behavior;
14906 		short_info->user_wired_count = curr_entry->user_wired_count;
14907 		short_info->is_submap = curr_entry->is_sub_map;
14908 		if (curr_entry->is_sub_map) {
14909 			short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
14910 		} else {
14911 			short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
14912 		}
14913 	}
14914 
14915 	extended.pages_resident = 0;
14916 	extended.pages_swapped_out = 0;
14917 	extended.pages_shared_now_private = 0;
14918 	extended.pages_dirtied = 0;
14919 	extended.pages_reusable = 0;
14920 	extended.external_pager = 0;
14921 	extended.shadow_depth = 0;
14922 	extended.share_mode = SM_EMPTY;
14923 	extended.ref_count = 0;
14924 
14925 	if (not_in_kdp) {
14926 		if (!curr_entry->is_sub_map) {
14927 			vm_map_offset_t range_start, range_end;
14928 			range_start = MAX((curr_address - curr_max_below),
14929 			    curr_entry->vme_start);
14930 			range_end = MIN((curr_address + curr_max_above),
14931 			    curr_entry->vme_end);
14932 			vm_map_region_walk(curr_map,
14933 			    range_start,
14934 			    curr_entry,
14935 			    (VME_OFFSET(curr_entry) +
14936 			    (range_start -
14937 			    curr_entry->vme_start)),
14938 			    range_end - range_start,
14939 			    &extended,
14940 			    look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
14941 			if (extended.external_pager &&
14942 			    extended.ref_count == 2 &&
14943 			    extended.share_mode == SM_SHARED) {
14944 				extended.share_mode = SM_PRIVATE;
14945 			}
14946 			if (submap_needed_copy) {
14947 				extended.share_mode = SM_COW;
14948 			}
14949 		} else {
14950 			if (curr_entry->use_pmap) {
14951 				extended.share_mode = SM_TRUESHARED;
14952 			} else {
14953 				extended.share_mode = SM_PRIVATE;
14954 			}
14955 			extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
14956 		}
14957 	}
14958 
14959 	if (look_for_pages) {
14960 		submap_info->pages_resident = extended.pages_resident;
14961 		submap_info->pages_swapped_out = extended.pages_swapped_out;
14962 		submap_info->pages_shared_now_private =
14963 		    extended.pages_shared_now_private;
14964 		submap_info->pages_dirtied = extended.pages_dirtied;
14965 		submap_info->external_pager = extended.external_pager;
14966 		submap_info->shadow_depth = extended.shadow_depth;
14967 		submap_info->share_mode = extended.share_mode;
14968 		submap_info->ref_count = extended.ref_count;
14969 
14970 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
14971 			submap_info->pages_reusable = extended.pages_reusable;
14972 		}
14973 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
14974 			if (curr_entry->is_sub_map) {
14975 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRPERM(VME_SUBMAP(curr_entry));
14976 			} else if (VME_OBJECT(curr_entry)) {
14977 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRPERM(VME_OBJECT(curr_entry));
14978 			} else {
14979 				submap_info->object_id_full = 0ull;
14980 			}
14981 		}
14982 	} else {
14983 		short_info->external_pager = extended.external_pager;
14984 		short_info->shadow_depth = extended.shadow_depth;
14985 		short_info->share_mode = extended.share_mode;
14986 		short_info->ref_count = extended.ref_count;
14987 	}
14988 
14989 	if (not_in_kdp) {
14990 		vm_map_unlock_read(curr_map);
14991 	}
14992 
14993 	return KERN_SUCCESS;
14994 }
14995 
14996 /*
14997  *	vm_region:
14998  *
14999  *	User call to obtain information about a region in
15000  *	a task's address map. Currently, only one flavor is
15001  *	supported.
15002  *
15003  *	XXX The reserved and behavior fields cannot be filled
15004  *	    in until the vm merge from the IK is completed, and
15005  *	    vm_reserve is implemented.
15006  */
15007 
15008 kern_return_t
vm_map_region(vm_map_t map,vm_map_offset_t * address,vm_map_size_t * size,vm_region_flavor_t flavor,vm_region_info_t info,mach_msg_type_number_t * count,mach_port_t * object_name)15009 vm_map_region(
15010 	vm_map_t                 map,
15011 	vm_map_offset_t *address,               /* IN/OUT */
15012 	vm_map_size_t           *size,                  /* OUT */
15013 	vm_region_flavor_t       flavor,                /* IN */
15014 	vm_region_info_t         info,                  /* OUT */
15015 	mach_msg_type_number_t  *count, /* IN/OUT */
15016 	mach_port_t             *object_name)           /* OUT */
15017 {
15018 	vm_map_entry_t          tmp_entry;
15019 	vm_map_entry_t          entry;
15020 	vm_map_offset_t         start;
15021 
15022 	if (map == VM_MAP_NULL) {
15023 		return KERN_INVALID_ARGUMENT;
15024 	}
15025 
15026 	switch (flavor) {
15027 	case VM_REGION_BASIC_INFO:
15028 		/* legacy for old 32-bit objects info */
15029 	{
15030 		vm_region_basic_info_t  basic;
15031 
15032 		if (*count < VM_REGION_BASIC_INFO_COUNT) {
15033 			return KERN_INVALID_ARGUMENT;
15034 		}
15035 
15036 		basic = (vm_region_basic_info_t) info;
15037 		*count = VM_REGION_BASIC_INFO_COUNT;
15038 
15039 		vm_map_lock_read(map);
15040 
15041 		start = *address;
15042 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15043 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15044 				vm_map_unlock_read(map);
15045 				return KERN_INVALID_ADDRESS;
15046 			}
15047 		} else {
15048 			entry = tmp_entry;
15049 		}
15050 
15051 		start = entry->vme_start;
15052 
15053 		basic->offset = (uint32_t)VME_OFFSET(entry);
15054 		basic->protection = entry->protection;
15055 		basic->inheritance = entry->inheritance;
15056 		basic->max_protection = entry->max_protection;
15057 		basic->behavior = entry->behavior;
15058 		basic->user_wired_count = entry->user_wired_count;
15059 		basic->reserved = entry->is_sub_map;
15060 		*address = start;
15061 		*size = (entry->vme_end - start);
15062 
15063 		if (object_name) {
15064 			*object_name = IP_NULL;
15065 		}
15066 		if (entry->is_sub_map) {
15067 			basic->shared = FALSE;
15068 		} else {
15069 			basic->shared = entry->is_shared;
15070 		}
15071 
15072 		vm_map_unlock_read(map);
15073 		return KERN_SUCCESS;
15074 	}
15075 
15076 	case VM_REGION_BASIC_INFO_64:
15077 	{
15078 		vm_region_basic_info_64_t       basic;
15079 
15080 		if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
15081 			return KERN_INVALID_ARGUMENT;
15082 		}
15083 
15084 		basic = (vm_region_basic_info_64_t) info;
15085 		*count = VM_REGION_BASIC_INFO_COUNT_64;
15086 
15087 		vm_map_lock_read(map);
15088 
15089 		start = *address;
15090 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15091 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15092 				vm_map_unlock_read(map);
15093 				return KERN_INVALID_ADDRESS;
15094 			}
15095 		} else {
15096 			entry = tmp_entry;
15097 		}
15098 
15099 		start = entry->vme_start;
15100 
15101 		basic->offset = VME_OFFSET(entry);
15102 		basic->protection = entry->protection;
15103 		basic->inheritance = entry->inheritance;
15104 		basic->max_protection = entry->max_protection;
15105 		basic->behavior = entry->behavior;
15106 		basic->user_wired_count = entry->user_wired_count;
15107 		basic->reserved = entry->is_sub_map;
15108 		*address = start;
15109 		*size = (entry->vme_end - start);
15110 
15111 		if (object_name) {
15112 			*object_name = IP_NULL;
15113 		}
15114 		if (entry->is_sub_map) {
15115 			basic->shared = FALSE;
15116 		} else {
15117 			basic->shared = entry->is_shared;
15118 		}
15119 
15120 		vm_map_unlock_read(map);
15121 		return KERN_SUCCESS;
15122 	}
15123 	case VM_REGION_EXTENDED_INFO:
15124 		if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
15125 			return KERN_INVALID_ARGUMENT;
15126 		}
15127 		OS_FALLTHROUGH;
15128 	case VM_REGION_EXTENDED_INFO__legacy:
15129 		if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
15130 			return KERN_INVALID_ARGUMENT;
15131 		}
15132 
15133 		{
15134 			vm_region_extended_info_t       extended;
15135 			mach_msg_type_number_t original_count;
15136 			int effective_page_size, effective_page_shift;
15137 
15138 			extended = (vm_region_extended_info_t) info;
15139 
15140 			effective_page_shift = vm_self_region_page_shift(map);
15141 			effective_page_size = (1 << effective_page_shift);
15142 
15143 			vm_map_lock_read(map);
15144 
15145 			start = *address;
15146 			if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15147 				if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15148 					vm_map_unlock_read(map);
15149 					return KERN_INVALID_ADDRESS;
15150 				}
15151 			} else {
15152 				entry = tmp_entry;
15153 			}
15154 			start = entry->vme_start;
15155 
15156 			extended->protection = entry->protection;
15157 			extended->user_tag = VME_ALIAS(entry);
15158 			extended->pages_resident = 0;
15159 			extended->pages_swapped_out = 0;
15160 			extended->pages_shared_now_private = 0;
15161 			extended->pages_dirtied = 0;
15162 			extended->external_pager = 0;
15163 			extended->shadow_depth = 0;
15164 
15165 			original_count = *count;
15166 			if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
15167 				*count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
15168 			} else {
15169 				extended->pages_reusable = 0;
15170 				*count = VM_REGION_EXTENDED_INFO_COUNT;
15171 			}
15172 
15173 			vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
15174 
15175 			if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
15176 				extended->share_mode = SM_PRIVATE;
15177 			}
15178 
15179 			if (object_name) {
15180 				*object_name = IP_NULL;
15181 			}
15182 			*address = start;
15183 			*size = (entry->vme_end - start);
15184 
15185 			vm_map_unlock_read(map);
15186 			return KERN_SUCCESS;
15187 		}
15188 	case VM_REGION_TOP_INFO:
15189 	{
15190 		vm_region_top_info_t    top;
15191 
15192 		if (*count < VM_REGION_TOP_INFO_COUNT) {
15193 			return KERN_INVALID_ARGUMENT;
15194 		}
15195 
15196 		top = (vm_region_top_info_t) info;
15197 		*count = VM_REGION_TOP_INFO_COUNT;
15198 
15199 		vm_map_lock_read(map);
15200 
15201 		start = *address;
15202 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15203 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15204 				vm_map_unlock_read(map);
15205 				return KERN_INVALID_ADDRESS;
15206 			}
15207 		} else {
15208 			entry = tmp_entry;
15209 		}
15210 		start = entry->vme_start;
15211 
15212 		top->private_pages_resident = 0;
15213 		top->shared_pages_resident = 0;
15214 
15215 		vm_map_region_top_walk(entry, top);
15216 
15217 		if (object_name) {
15218 			*object_name = IP_NULL;
15219 		}
15220 		*address = start;
15221 		*size = (entry->vme_end - start);
15222 
15223 		vm_map_unlock_read(map);
15224 		return KERN_SUCCESS;
15225 	}
15226 	default:
15227 		return KERN_INVALID_ARGUMENT;
15228 	}
15229 }
15230 
15231 #define OBJ_RESIDENT_COUNT(obj, entry_size)                             \
15232 	MIN((entry_size),                                               \
15233 	    ((obj)->all_reusable ?                                      \
15234 	     (obj)->wired_page_count :                                  \
15235 	     (obj)->resident_page_count - (obj)->reusable_page_count))
15236 
15237 void
vm_map_region_top_walk(vm_map_entry_t entry,vm_region_top_info_t top)15238 vm_map_region_top_walk(
15239 	vm_map_entry_t             entry,
15240 	vm_region_top_info_t       top)
15241 {
15242 	if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
15243 		top->share_mode = SM_EMPTY;
15244 		top->ref_count = 0;
15245 		top->obj_id = 0;
15246 		return;
15247 	}
15248 
15249 	{
15250 		struct  vm_object *obj, *tmp_obj;
15251 		int             ref_count;
15252 		uint32_t        entry_size;
15253 
15254 		entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);
15255 
15256 		obj = VME_OBJECT(entry);
15257 
15258 		vm_object_lock(obj);
15259 
15260 		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15261 			ref_count--;
15262 		}
15263 
15264 		assert(obj->reusable_page_count <= obj->resident_page_count);
15265 		if (obj->shadow) {
15266 			if (ref_count == 1) {
15267 				top->private_pages_resident =
15268 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15269 			} else {
15270 				top->shared_pages_resident =
15271 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15272 			}
15273 			top->ref_count  = ref_count;
15274 			top->share_mode = SM_COW;
15275 
15276 			while ((tmp_obj = obj->shadow)) {
15277 				vm_object_lock(tmp_obj);
15278 				vm_object_unlock(obj);
15279 				obj = tmp_obj;
15280 
15281 				if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15282 					ref_count--;
15283 				}
15284 
15285 				assert(obj->reusable_page_count <= obj->resident_page_count);
15286 				top->shared_pages_resident +=
15287 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15288 				top->ref_count += ref_count - 1;
15289 			}
15290 		} else {
15291 			if (entry->superpage_size) {
15292 				top->share_mode = SM_LARGE_PAGE;
15293 				top->shared_pages_resident = 0;
15294 				top->private_pages_resident = entry_size;
15295 			} else if (entry->needs_copy) {
15296 				top->share_mode = SM_COW;
15297 				top->shared_pages_resident =
15298 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15299 			} else {
15300 				if (ref_count == 1 ||
15301 				    (ref_count == 2 && obj->named)) {
15302 					top->share_mode = SM_PRIVATE;
15303 					top->private_pages_resident =
15304 					    OBJ_RESIDENT_COUNT(obj,
15305 					    entry_size);
15306 				} else {
15307 					top->share_mode = SM_SHARED;
15308 					top->shared_pages_resident =
15309 					    OBJ_RESIDENT_COUNT(obj,
15310 					    entry_size);
15311 				}
15312 			}
15313 			top->ref_count = ref_count;
15314 		}
15315 		/* XXX K64: obj_id will be truncated */
15316 		top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRPERM(obj);
15317 
15318 		vm_object_unlock(obj);
15319 	}
15320 }
15321 
15322 void
vm_map_region_walk(vm_map_t map,vm_map_offset_t va,vm_map_entry_t entry,vm_object_offset_t offset,vm_object_size_t range,vm_region_extended_info_t extended,boolean_t look_for_pages,mach_msg_type_number_t count)15323 vm_map_region_walk(
15324 	vm_map_t                        map,
15325 	vm_map_offset_t                 va,
15326 	vm_map_entry_t                  entry,
15327 	vm_object_offset_t              offset,
15328 	vm_object_size_t                range,
15329 	vm_region_extended_info_t       extended,
15330 	boolean_t                       look_for_pages,
15331 	mach_msg_type_number_t count)
15332 {
15333 	struct vm_object *obj, *tmp_obj;
15334 	vm_map_offset_t       last_offset;
15335 	int               i;
15336 	int               ref_count;
15337 	struct vm_object        *shadow_object;
15338 	unsigned short          shadow_depth;
15339 	boolean_t         do_region_footprint;
15340 	int                     effective_page_size, effective_page_shift;
15341 	vm_map_offset_t         effective_page_mask;
15342 
15343 	do_region_footprint = task_self_region_footprint();
15344 
15345 	if ((entry->is_sub_map) ||
15346 	    (VME_OBJECT(entry) == 0) ||
15347 	    (VME_OBJECT(entry)->phys_contiguous &&
15348 	    !entry->superpage_size)) {
15349 		extended->share_mode = SM_EMPTY;
15350 		extended->ref_count = 0;
15351 		return;
15352 	}
15353 
15354 	if (entry->superpage_size) {
15355 		extended->shadow_depth = 0;
15356 		extended->share_mode = SM_LARGE_PAGE;
15357 		extended->ref_count = 1;
15358 		extended->external_pager = 0;
15359 
15360 		/* TODO4K: Superpage in 4k mode? */
15361 		extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
15362 		extended->shadow_depth = 0;
15363 		return;
15364 	}
15365 
15366 	effective_page_shift = vm_self_region_page_shift(map);
15367 	effective_page_size = (1 << effective_page_shift);
15368 	effective_page_mask = effective_page_size - 1;
15369 
15370 	offset = vm_map_trunc_page(offset, effective_page_mask);
15371 
15372 	obj = VME_OBJECT(entry);
15373 
15374 	vm_object_lock(obj);
15375 
15376 	if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15377 		ref_count--;
15378 	}
15379 
15380 	if (look_for_pages) {
15381 		for (last_offset = offset + range;
15382 		    offset < last_offset;
15383 		    offset += effective_page_size, va += effective_page_size) {
15384 			if (do_region_footprint) {
15385 				int disp;
15386 
15387 				disp = 0;
15388 				if (map->has_corpse_footprint) {
15389 					/*
15390 					 * Query the page info data we saved
15391 					 * while forking the corpse.
15392 					 */
15393 					vm_map_corpse_footprint_query_page_info(
15394 						map,
15395 						va,
15396 						&disp);
15397 				} else {
15398 					/*
15399 					 * Query the pmap.
15400 					 */
15401 					vm_map_footprint_query_page_info(
15402 						map,
15403 						entry,
15404 						va,
15405 						&disp);
15406 				}
15407 				if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
15408 					extended->pages_resident++;
15409 				}
15410 				if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
15411 					extended->pages_reusable++;
15412 				}
15413 				if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
15414 					extended->pages_dirtied++;
15415 				}
15416 				if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
15417 					extended->pages_swapped_out++;
15418 				}
15419 				continue;
15420 			}
15421 
15422 			vm_map_region_look_for_page(map, va, obj,
15423 			    vm_object_trunc_page(offset), ref_count,
15424 			    0, extended, count);
15425 		}
15426 
15427 		if (do_region_footprint) {
15428 			goto collect_object_info;
15429 		}
15430 	} else {
15431 collect_object_info:
15432 		shadow_object = obj->shadow;
15433 		shadow_depth = 0;
15434 
15435 		if (!(obj->internal)) {
15436 			extended->external_pager = 1;
15437 		}
15438 
15439 		if (shadow_object != VM_OBJECT_NULL) {
15440 			vm_object_lock(shadow_object);
15441 			for (;
15442 			    shadow_object != VM_OBJECT_NULL;
15443 			    shadow_depth++) {
15444 				vm_object_t     next_shadow;
15445 
15446 				if (!(shadow_object->internal)) {
15447 					extended->external_pager = 1;
15448 				}
15449 
15450 				next_shadow = shadow_object->shadow;
15451 				if (next_shadow) {
15452 					vm_object_lock(next_shadow);
15453 				}
15454 				vm_object_unlock(shadow_object);
15455 				shadow_object = next_shadow;
15456 			}
15457 		}
15458 		extended->shadow_depth = shadow_depth;
15459 	}
15460 
15461 	if (extended->shadow_depth || entry->needs_copy) {
15462 		extended->share_mode = SM_COW;
15463 	} else {
15464 		if (ref_count == 1) {
15465 			extended->share_mode = SM_PRIVATE;
15466 		} else {
15467 			if (obj->true_share) {
15468 				extended->share_mode = SM_TRUESHARED;
15469 			} else {
15470 				extended->share_mode = SM_SHARED;
15471 			}
15472 		}
15473 	}
15474 	extended->ref_count = ref_count - extended->shadow_depth;
15475 
15476 	for (i = 0; i < extended->shadow_depth; i++) {
15477 		if ((tmp_obj = obj->shadow) == 0) {
15478 			break;
15479 		}
15480 		vm_object_lock(tmp_obj);
15481 		vm_object_unlock(obj);
15482 
15483 		if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) {
15484 			ref_count--;
15485 		}
15486 
15487 		extended->ref_count += ref_count;
15488 		obj = tmp_obj;
15489 	}
15490 	vm_object_unlock(obj);
15491 
15492 	if (extended->share_mode == SM_SHARED) {
15493 		vm_map_entry_t       cur;
15494 		vm_map_entry_t       last;
15495 		int      my_refs;
15496 
15497 		obj = VME_OBJECT(entry);
15498 		last = vm_map_to_entry(map);
15499 		my_refs = 0;
15500 
15501 		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15502 			ref_count--;
15503 		}
15504 		for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
15505 			my_refs += vm_map_region_count_obj_refs(cur, obj);
15506 		}
15507 
15508 		if (my_refs == ref_count) {
15509 			extended->share_mode = SM_PRIVATE_ALIASED;
15510 		} else if (my_refs > 1) {
15511 			extended->share_mode = SM_SHARED_ALIASED;
15512 		}
15513 	}
15514 }
15515 
15516 
15517 /* object is locked on entry and locked on return */
15518 
15519 
15520 static void
vm_map_region_look_for_page(__unused vm_map_t map,__unused vm_map_offset_t va,vm_object_t object,vm_object_offset_t offset,int max_refcnt,unsigned short depth,vm_region_extended_info_t extended,mach_msg_type_number_t count)15521 vm_map_region_look_for_page(
15522 	__unused vm_map_t               map,
15523 	__unused vm_map_offset_t        va,
15524 	vm_object_t                     object,
15525 	vm_object_offset_t              offset,
15526 	int                             max_refcnt,
15527 	unsigned short                  depth,
15528 	vm_region_extended_info_t       extended,
15529 	mach_msg_type_number_t count)
15530 {
15531 	vm_page_t       p;
15532 	vm_object_t     shadow;
15533 	int             ref_count;
15534 	vm_object_t     caller_object;
15535 
15536 	shadow = object->shadow;
15537 	caller_object = object;
15538 
15539 
15540 	while (TRUE) {
15541 		if (!(object->internal)) {
15542 			extended->external_pager = 1;
15543 		}
15544 
15545 		if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
15546 			if (shadow && (max_refcnt == 1)) {
15547 				extended->pages_shared_now_private++;
15548 			}
15549 
15550 			if (!p->vmp_fictitious &&
15551 			    (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
15552 				extended->pages_dirtied++;
15553 			} else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
15554 				if (p->vmp_reusable || object->all_reusable) {
15555 					extended->pages_reusable++;
15556 				}
15557 			}
15558 
15559 			extended->pages_resident++;
15560 
15561 			if (object != caller_object) {
15562 				vm_object_unlock(object);
15563 			}
15564 
15565 			return;
15566 		}
15567 		if (object->internal &&
15568 		    object->alive &&
15569 		    !object->terminating &&
15570 		    object->pager_ready) {
15571 			if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset)
15572 			    == VM_EXTERNAL_STATE_EXISTS) {
15573 				/* the pager has that page */
15574 				extended->pages_swapped_out++;
15575 				if (object != caller_object) {
15576 					vm_object_unlock(object);
15577 				}
15578 				return;
15579 			}
15580 		}
15581 
15582 		if (shadow) {
15583 			vm_object_lock(shadow);
15584 
15585 			if ((ref_count = shadow->ref_count) > 1 && shadow->paging_in_progress) {
15586 				ref_count--;
15587 			}
15588 
15589 			if (++depth > extended->shadow_depth) {
15590 				extended->shadow_depth = depth;
15591 			}
15592 
15593 			if (ref_count > max_refcnt) {
15594 				max_refcnt = ref_count;
15595 			}
15596 
15597 			if (object != caller_object) {
15598 				vm_object_unlock(object);
15599 			}
15600 
15601 			offset = offset + object->vo_shadow_offset;
15602 			object = shadow;
15603 			shadow = object->shadow;
15604 			continue;
15605 		}
15606 		if (object != caller_object) {
15607 			vm_object_unlock(object);
15608 		}
15609 		break;
15610 	}
15611 }
15612 
15613 static int
vm_map_region_count_obj_refs(vm_map_entry_t entry,vm_object_t object)15614 vm_map_region_count_obj_refs(
15615 	vm_map_entry_t    entry,
15616 	vm_object_t       object)
15617 {
15618 	int ref_count;
15619 	vm_object_t chk_obj;
15620 	vm_object_t tmp_obj;
15621 
15622 	if (entry->is_sub_map || VME_OBJECT(entry) == VM_OBJECT_NULL) {
15623 		return 0;
15624 	}
15625 
15626 	ref_count = 0;
15627 	chk_obj = VME_OBJECT(entry);
15628 	vm_object_lock(chk_obj);
15629 
15630 	while (chk_obj) {
15631 		if (chk_obj == object) {
15632 			ref_count++;
15633 		}
15634 		tmp_obj = chk_obj->shadow;
15635 		if (tmp_obj) {
15636 			vm_object_lock(tmp_obj);
15637 		}
15638 		vm_object_unlock(chk_obj);
15639 
15640 		chk_obj = tmp_obj;
15641 	}
15642 
15643 	return ref_count;
15644 }
15645 
15646 
15647 /*
15648  *	Routine:	vm_map_simplify
15649  *
15650  *	Description:
15651  *		Attempt to simplify the map representation in
15652  *		the vicinity of the given starting address.
15653  *	Note:
15654  *		This routine is intended primarily to keep the
15655  *		kernel maps more compact -- they generally don't
15656  *		benefit from the "expand a map entry" technology
15657  *		at allocation time because the adjacent entry
15658  *		is often wired down.
15659  */
15660 void
vm_map_simplify_entry(vm_map_t map,vm_map_entry_t this_entry)15661 vm_map_simplify_entry(
15662 	vm_map_t        map,
15663 	vm_map_entry_t  this_entry)
15664 {
15665 	vm_map_entry_t  prev_entry;
15666 
15667 	prev_entry = this_entry->vme_prev;
15668 
15669 	if ((this_entry != vm_map_to_entry(map)) &&
15670 	    (prev_entry != vm_map_to_entry(map)) &&
15671 
15672 	    (prev_entry->vme_end == this_entry->vme_start) &&
15673 
15674 	    (prev_entry->is_sub_map == this_entry->is_sub_map) &&
15675 	    (prev_entry->vme_object_value == this_entry->vme_object_value) &&
15676 	    (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
15677 	    ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
15678 	    prev_entry->vme_start))
15679 	    == VME_OFFSET(this_entry)) &&
15680 
15681 	    (prev_entry->behavior == this_entry->behavior) &&
15682 	    (prev_entry->needs_copy == this_entry->needs_copy) &&
15683 	    (prev_entry->protection == this_entry->protection) &&
15684 	    (prev_entry->max_protection == this_entry->max_protection) &&
15685 	    (prev_entry->inheritance == this_entry->inheritance) &&
15686 	    (prev_entry->use_pmap == this_entry->use_pmap) &&
15687 	    (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
15688 	    (prev_entry->no_cache == this_entry->no_cache) &&
15689 	    (prev_entry->vme_permanent == this_entry->vme_permanent) &&
15690 	    (prev_entry->map_aligned == this_entry->map_aligned) &&
15691 	    (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
15692 	    (prev_entry->used_for_jit == this_entry->used_for_jit) &&
15693 #if __arm64e__
15694 	    (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
15695 #endif
15696 	    (prev_entry->csm_associated == this_entry->csm_associated) &&
15697 	    (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
15698 	    (prev_entry->iokit_acct == this_entry->iokit_acct) &&
15699 	    (prev_entry->vme_resilient_codesign ==
15700 	    this_entry->vme_resilient_codesign) &&
15701 	    (prev_entry->vme_resilient_media ==
15702 	    this_entry->vme_resilient_media) &&
15703 	    (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
15704 	    (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&
15705 
15706 	    (prev_entry->wired_count == this_entry->wired_count) &&
15707 	    (prev_entry->user_wired_count == this_entry->user_wired_count) &&
15708 
15709 	    ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
15710 	    (prev_entry->in_transition == FALSE) &&
15711 	    (this_entry->in_transition == FALSE) &&
15712 	    (prev_entry->needs_wakeup == FALSE) &&
15713 	    (this_entry->needs_wakeup == FALSE) &&
15714 	    (prev_entry->is_shared == this_entry->is_shared) &&
15715 	    (prev_entry->superpage_size == FALSE) &&
15716 	    (this_entry->superpage_size == FALSE)
15717 	    ) {
15718 		if (prev_entry->vme_permanent) {
15719 			assert(this_entry->vme_permanent);
15720 			prev_entry->vme_permanent = false;
15721 		}
15722 		vm_map_store_entry_unlink(map, prev_entry, true);
15723 		assert(prev_entry->vme_start < this_entry->vme_end);
15724 		if (prev_entry->map_aligned) {
15725 			assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
15726 			    VM_MAP_PAGE_MASK(map)));
15727 		}
15728 		this_entry->vme_start = prev_entry->vme_start;
15729 		VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));
15730 
15731 		if (map->holelistenabled) {
15732 			vm_map_store_update_first_free(map, this_entry, TRUE);
15733 		}
15734 
15735 		if (prev_entry->is_sub_map) {
15736 			vm_map_deallocate(VME_SUBMAP(prev_entry));
15737 		} else {
15738 			vm_object_deallocate(VME_OBJECT(prev_entry));
15739 		}
15740 		vm_map_entry_dispose(prev_entry);
15741 		SAVE_HINT_MAP_WRITE(map, this_entry);
15742 	}
15743 }
15744 
15745 void
vm_map_simplify(vm_map_t map,vm_map_offset_t start)15746 vm_map_simplify(
15747 	vm_map_t        map,
15748 	vm_map_offset_t start)
15749 {
15750 	vm_map_entry_t  this_entry;
15751 
15752 	vm_map_lock(map);
15753 	if (vm_map_lookup_entry(map, start, &this_entry)) {
15754 		vm_map_simplify_entry(map, this_entry);
15755 		vm_map_simplify_entry(map, this_entry->vme_next);
15756 	}
15757 	vm_map_unlock(map);
15758 }
15759 
15760 static void
vm_map_simplify_range(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)15761 vm_map_simplify_range(
15762 	vm_map_t        map,
15763 	vm_map_offset_t start,
15764 	vm_map_offset_t end)
15765 {
15766 	vm_map_entry_t  entry;
15767 
15768 	/*
15769 	 * The map should be locked (for "write") by the caller.
15770 	 */
15771 
15772 	if (start >= end) {
15773 		/* invalid address range */
15774 		return;
15775 	}
15776 
15777 	start = vm_map_trunc_page(start,
15778 	    VM_MAP_PAGE_MASK(map));
15779 	end = vm_map_round_page(end,
15780 	    VM_MAP_PAGE_MASK(map));
15781 
15782 	if (!vm_map_lookup_entry(map, start, &entry)) {
15783 		/* "start" is not mapped and "entry" ends before "start" */
15784 		if (entry == vm_map_to_entry(map)) {
15785 			/* start with first entry in the map */
15786 			entry = vm_map_first_entry(map);
15787 		} else {
15788 			/* start with next entry */
15789 			entry = entry->vme_next;
15790 		}
15791 	}
15792 
15793 	while (entry != vm_map_to_entry(map) &&
15794 	    entry->vme_start <= end) {
15795 		/* try and coalesce "entry" with its previous entry */
15796 		vm_map_simplify_entry(map, entry);
15797 		entry = entry->vme_next;
15798 	}
15799 }
15800 
15801 
15802 /*
15803  *	Routine:	vm_map_machine_attribute
15804  *	Purpose:
15805  *		Provide machine-specific attributes to mappings,
15806  *		such as cachability etc. for machines that provide
15807  *		them.  NUMA architectures and machines with big/strange
15808  *		caches will use this.
15809  *	Note:
15810  *		Responsibilities for locking and checking are handled here,
15811  *		everything else in the pmap module. If any non-volatile
15812  *		information must be kept, the pmap module should handle
15813  *		it itself. [This assumes that attributes do not
15814  *		need to be inherited, which seems ok to me]
15815  */
15816 kern_return_t
vm_map_machine_attribute(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_machine_attribute_t attribute,vm_machine_attribute_val_t * value)15817 vm_map_machine_attribute(
15818 	vm_map_t                        map,
15819 	vm_map_offset_t         start,
15820 	vm_map_offset_t         end,
15821 	vm_machine_attribute_t  attribute,
15822 	vm_machine_attribute_val_t* value)              /* IN/OUT */
15823 {
15824 	kern_return_t   ret;
15825 	vm_map_size_t sync_size;
15826 	vm_map_entry_t entry;
15827 
15828 	if (start < vm_map_min(map) || end > vm_map_max(map)) {
15829 		return KERN_INVALID_ADDRESS;
15830 	}
15831 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
15832 		return KERN_INVALID_ADDRESS;
15833 	}
15834 
15835 	/* Figure how much memory we need to flush (in page increments) */
15836 	sync_size = end - start;
15837 
15838 	vm_map_lock(map);
15839 
15840 	if (attribute != MATTR_CACHE) {
15841 		/* If we don't have to find physical addresses, we */
15842 		/* don't have to do an explicit traversal here.    */
15843 		ret = pmap_attribute(map->pmap, start, end - start,
15844 		    attribute, value);
15845 		vm_map_unlock(map);
15846 		return ret;
15847 	}
15848 
15849 	ret = KERN_SUCCESS;                                                                             /* Assume it all worked */
15850 
15851 	while (sync_size) {
15852 		if (vm_map_lookup_entry(map, start, &entry)) {
15853 			vm_map_size_t   sub_size;
15854 			if ((entry->vme_end - start) > sync_size) {
15855 				sub_size = sync_size;
15856 				sync_size = 0;
15857 			} else {
15858 				sub_size = entry->vme_end - start;
15859 				sync_size -= sub_size;
15860 			}
15861 			if (entry->is_sub_map) {
15862 				vm_map_offset_t sub_start;
15863 				vm_map_offset_t sub_end;
15864 
15865 				sub_start = (start - entry->vme_start)
15866 				    + VME_OFFSET(entry);
15867 				sub_end = sub_start + sub_size;
15868 				vm_map_machine_attribute(
15869 					VME_SUBMAP(entry),
15870 					sub_start,
15871 					sub_end,
15872 					attribute, value);
15873 			} else if (VME_OBJECT(entry)) {
15874 				vm_page_t               m;
15875 				vm_object_t             object;
15876 				vm_object_t             base_object;
15877 				vm_object_t             last_object;
15878 				vm_object_offset_t      offset;
15879 				vm_object_offset_t      base_offset;
15880 				vm_map_size_t           range;
15881 				range = sub_size;
15882 				offset = (start - entry->vme_start)
15883 				    + VME_OFFSET(entry);
15884 				offset = vm_object_trunc_page(offset);
15885 				base_offset = offset;
15886 				object = VME_OBJECT(entry);
15887 				base_object = object;
15888 				last_object = NULL;
15889 
15890 				vm_object_lock(object);
15891 
15892 				while (range) {
15893 					m = vm_page_lookup(
15894 						object, offset);
15895 
15896 					if (m && !m->vmp_fictitious) {
15897 						ret =
15898 						    pmap_attribute_cache_sync(
15899 							VM_PAGE_GET_PHYS_PAGE(m),
15900 							PAGE_SIZE,
15901 							attribute, value);
15902 					} else if (object->shadow) {
15903 						offset = offset + object->vo_shadow_offset;
15904 						last_object = object;
15905 						object = object->shadow;
15906 						vm_object_lock(last_object->shadow);
15907 						vm_object_unlock(last_object);
15908 						continue;
15909 					}
15910 					if (range < PAGE_SIZE) {
15911 						range = 0;
15912 					} else {
15913 						range -= PAGE_SIZE;
15914 					}
15915 
15916 					if (base_object != object) {
15917 						vm_object_unlock(object);
15918 						vm_object_lock(base_object);
15919 						object = base_object;
15920 					}
15921 					/* Bump to the next page */
15922 					base_offset += PAGE_SIZE;
15923 					offset = base_offset;
15924 				}
15925 				vm_object_unlock(object);
15926 			}
15927 			start += sub_size;
15928 		} else {
15929 			vm_map_unlock(map);
15930 			return KERN_FAILURE;
15931 		}
15932 	}
15933 
15934 	vm_map_unlock(map);
15935 
15936 	return ret;
15937 }
15938 
15939 /*
15940  *	vm_map_behavior_set:
15941  *
15942  *	Sets the paging reference behavior of the specified address
15943  *	range in the target map.  Paging reference behavior affects
15944  *	how pagein operations resulting from faults on the map will be
15945  *	clustered.
15946  */
15947 kern_return_t
vm_map_behavior_set(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_behavior_t new_behavior)15948 vm_map_behavior_set(
15949 	vm_map_t        map,
15950 	vm_map_offset_t start,
15951 	vm_map_offset_t end,
15952 	vm_behavior_t   new_behavior)
15953 {
15954 	vm_map_entry_t  entry;
15955 	vm_map_entry_t  temp_entry;
15956 
15957 	if (start > end ||
15958 	    start < vm_map_min(map) ||
15959 	    end > vm_map_max(map)) {
15960 		return KERN_NO_SPACE;
15961 	}
15962 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
15963 		return KERN_INVALID_ADDRESS;
15964 	}
15965 
15966 	switch (new_behavior) {
15967 	/*
15968 	 * This first block of behaviors all set a persistent state on the specified
15969 	 * memory range.  All we have to do here is to record the desired behavior
15970 	 * in the vm_map_entry_t's.
15971 	 */
15972 
15973 	case VM_BEHAVIOR_DEFAULT:
15974 	case VM_BEHAVIOR_RANDOM:
15975 	case VM_BEHAVIOR_SEQUENTIAL:
15976 	case VM_BEHAVIOR_RSEQNTL:
15977 	case VM_BEHAVIOR_ZERO_WIRED_PAGES:
15978 		vm_map_lock(map);
15979 
15980 		/*
15981 		 *	The entire address range must be valid for the map.
15982 		 *      Note that vm_map_range_check() does a
15983 		 *	vm_map_lookup_entry() internally and returns the
15984 		 *	entry containing the start of the address range if
15985 		 *	the entire range is valid.
15986 		 */
15987 		if (vm_map_range_check(map, start, end, &temp_entry)) {
15988 			entry = temp_entry;
15989 			vm_map_clip_start(map, entry, start);
15990 		} else {
15991 			vm_map_unlock(map);
15992 			return KERN_INVALID_ADDRESS;
15993 		}
15994 
15995 		while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
15996 			vm_map_clip_end(map, entry, end);
15997 			if (entry->is_sub_map) {
15998 				assert(!entry->use_pmap);
15999 			}
16000 
16001 			if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16002 				entry->zero_wired_pages = TRUE;
16003 			} else {
16004 				entry->behavior = new_behavior;
16005 			}
16006 			entry = entry->vme_next;
16007 		}
16008 
16009 		vm_map_unlock(map);
16010 		break;
16011 
16012 	/*
16013 	 * The rest of these are different from the above in that they cause
16014 	 * an immediate action to take place as opposed to setting a behavior that
16015 	 * affects future actions.
16016 	 */
16017 
16018 	case VM_BEHAVIOR_WILLNEED:
16019 		return vm_map_willneed(map, start, end);
16020 
16021 	case VM_BEHAVIOR_DONTNEED:
16022 		return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
16023 
16024 	case VM_BEHAVIOR_FREE:
16025 		return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
16026 
16027 	case VM_BEHAVIOR_REUSABLE:
16028 		return vm_map_reusable_pages(map, start, end);
16029 
16030 	case VM_BEHAVIOR_REUSE:
16031 		return vm_map_reuse_pages(map, start, end);
16032 
16033 	case VM_BEHAVIOR_CAN_REUSE:
16034 		return vm_map_can_reuse(map, start, end);
16035 
16036 #if MACH_ASSERT
16037 	case VM_BEHAVIOR_PAGEOUT:
16038 		return vm_map_pageout(map, start, end);
16039 #endif /* MACH_ASSERT */
16040 
16041 	default:
16042 		return KERN_INVALID_ARGUMENT;
16043 	}
16044 
16045 	return KERN_SUCCESS;
16046 }
16047 
16048 
16049 /*
16050  * Internals for madvise(MADV_WILLNEED) system call.
16051  *
16052  * The implementation is to do:-
16053  * a) read-ahead if the mapping corresponds to a mapped regular file
16054  * b) or, fault in the pages (zero-fill, decompress etc) if it's an anonymous mapping
16055  */
16056 
16057 
16058 static kern_return_t
vm_map_willneed(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16059 vm_map_willneed(
16060 	vm_map_t        map,
16061 	vm_map_offset_t start,
16062 	vm_map_offset_t end
16063 	)
16064 {
16065 	vm_map_entry_t                  entry;
16066 	vm_object_t                     object;
16067 	memory_object_t                 pager;
16068 	struct vm_object_fault_info     fault_info = {};
16069 	kern_return_t                   kr;
16070 	vm_object_size_t                len;
16071 	vm_object_offset_t              offset;
16072 
16073 	fault_info.interruptible = THREAD_UNINT;        /* ignored value */
16074 	fault_info.behavior      = VM_BEHAVIOR_SEQUENTIAL;
16075 	fault_info.stealth       = TRUE;
16076 
16077 	/*
16078 	 * The MADV_WILLNEED operation doesn't require any changes to the
16079 	 * vm_map_entry_t's, so the read lock is sufficient.
16080 	 */
16081 
16082 	vm_map_lock_read(map);
16083 
16084 	/*
16085 	 * The madvise semantics require that the address range be fully
16086 	 * allocated with no holes.  Otherwise, we're required to return
16087 	 * an error.
16088 	 */
16089 
16090 	if (!vm_map_range_check(map, start, end, &entry)) {
16091 		vm_map_unlock_read(map);
16092 		return KERN_INVALID_ADDRESS;
16093 	}
16094 
16095 	/*
16096 	 * Examine each vm_map_entry_t in the range.
16097 	 */
16098 	for (; entry != vm_map_to_entry(map) && start < end;) {
16099 		/*
16100 		 * The first time through, the start address could be anywhere
16101 		 * within the vm_map_entry we found.  So adjust the offset to
16102 		 * correspond.  After that, the offset will always be zero to
16103 		 * correspond to the beginning of the current vm_map_entry.
16104 		 */
16105 		offset = (start - entry->vme_start) + VME_OFFSET(entry);
16106 
16107 		/*
16108 		 * Set the length so we don't go beyond the end of the
16109 		 * map_entry or beyond the end of the range we were given.
16110 		 * This range could span also multiple map entries all of which
16111 		 * map different files, so make sure we only do the right amount
16112 		 * of I/O for each object.  Note that it's possible for there
16113 		 * to be multiple map entries all referring to the same object
16114 		 * but with different page permissions, but it's not worth
16115 		 * trying to optimize that case.
16116 		 */
16117 		len = MIN(entry->vme_end - start, end - start);
16118 
16119 		if ((vm_size_t) len != len) {
16120 			/* 32-bit overflow */
16121 			len = (vm_size_t) (0 - PAGE_SIZE);
16122 		}
16123 		fault_info.cluster_size = (vm_size_t) len;
16124 		fault_info.lo_offset    = offset;
16125 		fault_info.hi_offset    = offset + len;
16126 		fault_info.user_tag     = VME_ALIAS(entry);
16127 		fault_info.pmap_options = 0;
16128 		if (entry->iokit_acct ||
16129 		    (!entry->is_sub_map && !entry->use_pmap)) {
16130 			fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
16131 		}
16132 		fault_info.fi_xnu_user_debug = entry->vme_xnu_user_debug;
16133 
16134 		/*
16135 		 * If the entry is a submap OR there's no read permission
16136 		 * to this mapping, then just skip it.
16137 		 */
16138 		if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
16139 			entry = entry->vme_next;
16140 			start = entry->vme_start;
16141 			continue;
16142 		}
16143 
16144 		object = VME_OBJECT(entry);
16145 
16146 		if (object == NULL ||
16147 		    (object && object->internal)) {
16148 			/*
16149 			 * Memory range backed by anonymous memory.
16150 			 */
16151 			vm_size_t region_size = 0, effective_page_size = 0;
16152 			vm_map_offset_t addr = 0, effective_page_mask = 0;
16153 
16154 			region_size = len;
16155 			addr = start;
16156 
16157 			effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
16158 			effective_page_size = effective_page_mask + 1;
16159 
16160 			vm_map_unlock_read(map);
16161 
16162 			while (region_size) {
16163 				vm_pre_fault(
16164 					vm_map_trunc_page(addr, effective_page_mask),
16165 					VM_PROT_READ | VM_PROT_WRITE);
16166 
16167 				region_size -= effective_page_size;
16168 				addr += effective_page_size;
16169 			}
16170 		} else {
16171 			/*
16172 			 * Find the file object backing this map entry.  If there is
16173 			 * none, then we simply ignore the "will need" advice for this
16174 			 * entry and go on to the next one.
16175 			 */
16176 			if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
16177 				entry = entry->vme_next;
16178 				start = entry->vme_start;
16179 				continue;
16180 			}
16181 
16182 			vm_object_paging_begin(object);
16183 			pager = object->pager;
16184 			vm_object_unlock(object);
16185 
16186 			/*
16187 			 * The data_request() could take a long time, so let's
16188 			 * release the map lock to avoid blocking other threads.
16189 			 */
16190 			vm_map_unlock_read(map);
16191 
16192 			/*
16193 			 * Get the data from the object asynchronously.
16194 			 *
16195 			 * Note that memory_object_data_request() places limits on the
16196 			 * amount of I/O it will do.  Regardless of the len we
16197 			 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
16198 			 * silently truncates the len to that size.  This isn't
16199 			 * necessarily bad since madvise shouldn't really be used to
16200 			 * page in unlimited amounts of data.  Other Unix variants
16201 			 * limit the willneed case as well.  If this turns out to be an
16202 			 * issue for developers, then we can always adjust the policy
16203 			 * here and still be backwards compatible since this is all
16204 			 * just "advice".
16205 			 */
16206 			kr = memory_object_data_request(
16207 				pager,
16208 				vm_object_trunc_page(offset) + object->paging_offset,
16209 				0,      /* ignored */
16210 				VM_PROT_READ,
16211 				(memory_object_fault_info_t)&fault_info);
16212 
16213 			vm_object_lock(object);
16214 			vm_object_paging_end(object);
16215 			vm_object_unlock(object);
16216 
16217 			/*
16218 			 * If we couldn't do the I/O for some reason, just give up on
16219 			 * the madvise.  We still return success to the user since
16220 			 * madvise isn't supposed to fail when the advice can't be
16221 			 * taken.
16222 			 */
16223 
16224 			if (kr != KERN_SUCCESS) {
16225 				return KERN_SUCCESS;
16226 			}
16227 		}
16228 
16229 		start += len;
16230 		if (start >= end) {
16231 			/* done */
16232 			return KERN_SUCCESS;
16233 		}
16234 
16235 		/* look up next entry */
16236 		vm_map_lock_read(map);
16237 		if (!vm_map_lookup_entry(map, start, &entry)) {
16238 			/*
16239 			 * There's a new hole in the address range.
16240 			 */
16241 			vm_map_unlock_read(map);
16242 			return KERN_INVALID_ADDRESS;
16243 		}
16244 	}
16245 
16246 	vm_map_unlock_read(map);
16247 	return KERN_SUCCESS;
16248 }
16249 
16250 static boolean_t
vm_map_entry_is_reusable(vm_map_entry_t entry)16251 vm_map_entry_is_reusable(
16252 	vm_map_entry_t entry)
16253 {
16254 	/* Only user map entries */
16255 
16256 	vm_object_t object;
16257 
16258 	if (entry->is_sub_map) {
16259 		return FALSE;
16260 	}
16261 
16262 	switch (VME_ALIAS(entry)) {
16263 	case VM_MEMORY_MALLOC:
16264 	case VM_MEMORY_MALLOC_SMALL:
16265 	case VM_MEMORY_MALLOC_LARGE:
16266 	case VM_MEMORY_REALLOC:
16267 	case VM_MEMORY_MALLOC_TINY:
16268 	case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16269 	case VM_MEMORY_MALLOC_LARGE_REUSED:
16270 		/*
16271 		 * This is a malloc() memory region: check if it's still
16272 		 * in its original state and can be re-used for more
16273 		 * malloc() allocations.
16274 		 */
16275 		break;
16276 	default:
16277 		/*
16278 		 * Not a malloc() memory region: let the caller decide if
16279 		 * it's re-usable.
16280 		 */
16281 		return TRUE;
16282 	}
16283 
16284 	if (/*entry->is_shared ||*/
16285 		entry->is_sub_map ||
16286 		entry->in_transition ||
16287 		entry->protection != VM_PROT_DEFAULT ||
16288 		entry->max_protection != VM_PROT_ALL ||
16289 		entry->inheritance != VM_INHERIT_DEFAULT ||
16290 		entry->no_cache ||
16291 		entry->vme_permanent ||
16292 		entry->superpage_size != FALSE ||
16293 		entry->zero_wired_pages ||
16294 		entry->wired_count != 0 ||
16295 		entry->user_wired_count != 0) {
16296 		return FALSE;
16297 	}
16298 
16299 	object = VME_OBJECT(entry);
16300 	if (object == VM_OBJECT_NULL) {
16301 		return TRUE;
16302 	}
16303 	if (
16304 #if 0
16305 		/*
16306 		 * Let's proceed even if the VM object is potentially
16307 		 * shared.
16308 		 * We check for this later when processing the actual
16309 		 * VM pages, so the contents will be safe if shared.
16310 		 *
16311 		 * But we can still mark this memory region as "reusable" to
16312 		 * acknowledge that the caller did let us know that the memory
16313 		 * could be re-used and should not be penalized for holding
16314 		 * on to it.  This allows its "resident size" to not include
16315 		 * the reusable range.
16316 		 */
16317 		object->ref_count == 1 &&
16318 #endif
16319 		object->wired_page_count == 0 &&
16320 		object->copy == VM_OBJECT_NULL &&
16321 		object->shadow == VM_OBJECT_NULL &&
16322 		object->internal &&
16323 		object->purgable == VM_PURGABLE_DENY &&
16324 		object->wimg_bits == VM_WIMG_USE_DEFAULT &&
16325 		!object->code_signed) {
16326 		return TRUE;
16327 	}
16328 	return FALSE;
16329 }
16330 
16331 static kern_return_t
vm_map_reuse_pages(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16332 vm_map_reuse_pages(
16333 	vm_map_t        map,
16334 	vm_map_offset_t start,
16335 	vm_map_offset_t end)
16336 {
16337 	vm_map_entry_t                  entry;
16338 	vm_object_t                     object;
16339 	vm_object_offset_t              start_offset, end_offset;
16340 
16341 	/*
16342 	 * The MADV_REUSE operation doesn't require any changes to the
16343 	 * vm_map_entry_t's, so the read lock is sufficient.
16344 	 */
16345 
16346 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16347 		/*
16348 		 * XXX TODO4K
16349 		 * need to figure out what reusable means for a
16350 		 * portion of a native page.
16351 		 */
16352 		return KERN_SUCCESS;
16353 	}
16354 
16355 	vm_map_lock_read(map);
16356 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16357 
16358 	/*
16359 	 * The madvise semantics require that the address range be fully
16360 	 * allocated with no holes.  Otherwise, we're required to return
16361 	 * an error.
16362 	 */
16363 
16364 	if (!vm_map_range_check(map, start, end, &entry)) {
16365 		vm_map_unlock_read(map);
16366 		vm_page_stats_reusable.reuse_pages_failure++;
16367 		return KERN_INVALID_ADDRESS;
16368 	}
16369 
16370 	/*
16371 	 * Examine each vm_map_entry_t in the range.
16372 	 */
16373 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16374 	    entry = entry->vme_next) {
16375 		/*
16376 		 * Sanity check on the VM map entry.
16377 		 */
16378 		if (!vm_map_entry_is_reusable(entry)) {
16379 			vm_map_unlock_read(map);
16380 			vm_page_stats_reusable.reuse_pages_failure++;
16381 			return KERN_INVALID_ADDRESS;
16382 		}
16383 
16384 		/*
16385 		 * The first time through, the start address could be anywhere
16386 		 * within the vm_map_entry we found.  So adjust the offset to
16387 		 * correspond.
16388 		 */
16389 		if (entry->vme_start < start) {
16390 			start_offset = start - entry->vme_start;
16391 		} else {
16392 			start_offset = 0;
16393 		}
16394 		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16395 		start_offset += VME_OFFSET(entry);
16396 		end_offset += VME_OFFSET(entry);
16397 
16398 		object = VME_OBJECT(entry);
16399 		if (object != VM_OBJECT_NULL) {
16400 			vm_object_lock(object);
16401 			vm_object_reuse_pages(object, start_offset, end_offset,
16402 			    TRUE);
16403 			vm_object_unlock(object);
16404 		}
16405 
16406 		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
16407 			/*
16408 			 * XXX
16409 			 * We do not hold the VM map exclusively here.
16410 			 * The "alias" field is not that critical, so it's
16411 			 * safe to update it here, as long as it is the only
16412 			 * one that can be modified while holding the VM map
16413 			 * "shared".
16414 			 */
16415 			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
16416 		}
16417 	}
16418 
16419 	vm_map_unlock_read(map);
16420 	vm_page_stats_reusable.reuse_pages_success++;
16421 	return KERN_SUCCESS;
16422 }
16423 
16424 
16425 static kern_return_t
vm_map_reusable_pages(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16426 vm_map_reusable_pages(
16427 	vm_map_t        map,
16428 	vm_map_offset_t start,
16429 	vm_map_offset_t end)
16430 {
16431 	vm_map_entry_t                  entry;
16432 	vm_object_t                     object;
16433 	vm_object_offset_t              start_offset, end_offset;
16434 	vm_map_offset_t                 pmap_offset;
16435 
16436 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16437 		/*
16438 		 * XXX TODO4K
16439 		 * need to figure out what reusable means for a portion
16440 		 * of a native page.
16441 		 */
16442 		return KERN_SUCCESS;
16443 	}
16444 
16445 	/*
16446 	 * The MADV_REUSABLE operation doesn't require any changes to the
16447 	 * vm_map_entry_t's, so the read lock is sufficient.
16448 	 */
16449 
16450 	vm_map_lock_read(map);
16451 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16452 
16453 	/*
16454 	 * The madvise semantics require that the address range be fully
16455 	 * allocated with no holes.  Otherwise, we're required to return
16456 	 * an error.
16457 	 */
16458 
16459 	if (!vm_map_range_check(map, start, end, &entry)) {
16460 		vm_map_unlock_read(map);
16461 		vm_page_stats_reusable.reusable_pages_failure++;
16462 		return KERN_INVALID_ADDRESS;
16463 	}
16464 
16465 	/*
16466 	 * Examine each vm_map_entry_t in the range.
16467 	 */
16468 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16469 	    entry = entry->vme_next) {
16470 		int kill_pages = 0;
16471 		boolean_t reusable_no_write = FALSE;
16472 
16473 		/*
16474 		 * Sanity check on the VM map entry.
16475 		 */
16476 		if (!vm_map_entry_is_reusable(entry)) {
16477 			vm_map_unlock_read(map);
16478 			vm_page_stats_reusable.reusable_pages_failure++;
16479 			return KERN_INVALID_ADDRESS;
16480 		}
16481 
16482 		if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
16483 #if __arm64e__
16484 		    && !entry->used_for_tpro
16485 #endif
16486 		    ) {
16487 			/* not writable: can't discard contents */
16488 			vm_map_unlock_read(map);
16489 			vm_page_stats_reusable.reusable_nonwritable++;
16490 			vm_page_stats_reusable.reusable_pages_failure++;
16491 			return KERN_PROTECTION_FAILURE;
16492 		}
16493 
16494 		/*
16495 		 * The first time through, the start address could be anywhere
16496 		 * within the vm_map_entry we found.  So adjust the offset to
16497 		 * correspond.
16498 		 */
16499 		if (entry->vme_start < start) {
16500 			start_offset = start - entry->vme_start;
16501 			pmap_offset = start;
16502 		} else {
16503 			start_offset = 0;
16504 			pmap_offset = entry->vme_start;
16505 		}
16506 		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16507 		start_offset += VME_OFFSET(entry);
16508 		end_offset += VME_OFFSET(entry);
16509 
16510 		object = VME_OBJECT(entry);
16511 		if (object == VM_OBJECT_NULL) {
16512 			continue;
16513 		}
16514 
16515 		if (entry->protection & VM_PROT_EXECUTE) {
16516 			/*
16517 			 * Executable mappings might be write-protected by
16518 			 * hardware, so do not attempt to write to these pages.
16519 			 */
16520 			reusable_no_write = TRUE;
16521 		}
16522 
16523 		vm_object_lock(object);
16524 		if (((object->ref_count == 1) ||
16525 		    (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
16526 		    object->copy == VM_OBJECT_NULL)) &&
16527 		    object->shadow == VM_OBJECT_NULL &&
16528 		    /*
16529 		     * "iokit_acct" entries are billed for their virtual size
16530 		     * (rather than for their resident pages only), so they
16531 		     * wouldn't benefit from making pages reusable, and it
16532 		     * would be hard to keep track of pages that are both
16533 		     * "iokit_acct" and "reusable" in the pmap stats and
16534 		     * ledgers.
16535 		     */
16536 		    !(entry->iokit_acct ||
16537 		    (!entry->is_sub_map && !entry->use_pmap))) {
16538 			if (object->ref_count != 1) {
16539 				vm_page_stats_reusable.reusable_shared++;
16540 			}
16541 			kill_pages = 1;
16542 		} else {
16543 			kill_pages = -1;
16544 		}
16545 		if (kill_pages != -1) {
16546 			vm_object_deactivate_pages(object,
16547 			    start_offset,
16548 			    end_offset - start_offset,
16549 			    kill_pages,
16550 			    TRUE /*reusable_pages*/,
16551 			    reusable_no_write,
16552 			    map->pmap,
16553 			    pmap_offset);
16554 		} else {
16555 			vm_page_stats_reusable.reusable_pages_shared++;
16556 			DTRACE_VM4(vm_map_reusable_pages_shared,
16557 			    unsigned int, VME_ALIAS(entry),
16558 			    vm_map_t, map,
16559 			    vm_map_entry_t, entry,
16560 			    vm_object_t, object);
16561 		}
16562 		vm_object_unlock(object);
16563 
16564 		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
16565 		    VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
16566 			/*
16567 			 * XXX
16568 			 * We do not hold the VM map exclusively here.
16569 			 * The "alias" field is not that critical, so it's
16570 			 * safe to update it here, as long as it is the only
16571 			 * one that can be modified while holding the VM map
16572 			 * "shared".
16573 			 */
16574 			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
16575 		}
16576 	}
16577 
16578 	vm_map_unlock_read(map);
16579 	vm_page_stats_reusable.reusable_pages_success++;
16580 	return KERN_SUCCESS;
16581 }
16582 
16583 
16584 static kern_return_t
vm_map_can_reuse(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16585 vm_map_can_reuse(
16586 	vm_map_t        map,
16587 	vm_map_offset_t start,
16588 	vm_map_offset_t end)
16589 {
16590 	vm_map_entry_t                  entry;
16591 
16592 	/*
16593 	 * The MADV_REUSABLE operation doesn't require any changes to the
16594 	 * vm_map_entry_t's, so the read lock is sufficient.
16595 	 */
16596 
16597 	vm_map_lock_read(map);
16598 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16599 
16600 	/*
16601 	 * The madvise semantics require that the address range be fully
16602 	 * allocated with no holes.  Otherwise, we're required to return
16603 	 * an error.
16604 	 */
16605 
16606 	if (!vm_map_range_check(map, start, end, &entry)) {
16607 		vm_map_unlock_read(map);
16608 		vm_page_stats_reusable.can_reuse_failure++;
16609 		return KERN_INVALID_ADDRESS;
16610 	}
16611 
16612 	/*
16613 	 * Examine each vm_map_entry_t in the range.
16614 	 */
16615 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16616 	    entry = entry->vme_next) {
16617 		/*
16618 		 * Sanity check on the VM map entry.
16619 		 */
16620 		if (!vm_map_entry_is_reusable(entry)) {
16621 			vm_map_unlock_read(map);
16622 			vm_page_stats_reusable.can_reuse_failure++;
16623 			return KERN_INVALID_ADDRESS;
16624 		}
16625 	}
16626 
16627 	vm_map_unlock_read(map);
16628 	vm_page_stats_reusable.can_reuse_success++;
16629 	return KERN_SUCCESS;
16630 }
16631 
16632 
16633 #if MACH_ASSERT
16634 static kern_return_t
vm_map_pageout(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16635 vm_map_pageout(
16636 	vm_map_t        map,
16637 	vm_map_offset_t start,
16638 	vm_map_offset_t end)
16639 {
16640 	vm_map_entry_t                  entry;
16641 
16642 	/*
16643 	 * The MADV_PAGEOUT operation doesn't require any changes to the
16644 	 * vm_map_entry_t's, so the read lock is sufficient.
16645 	 */
16646 
16647 	vm_map_lock_read(map);
16648 
16649 	/*
16650 	 * The madvise semantics require that the address range be fully
16651 	 * allocated with no holes.  Otherwise, we're required to return
16652 	 * an error.
16653 	 */
16654 
16655 	if (!vm_map_range_check(map, start, end, &entry)) {
16656 		vm_map_unlock_read(map);
16657 		return KERN_INVALID_ADDRESS;
16658 	}
16659 
16660 	/*
16661 	 * Examine each vm_map_entry_t in the range.
16662 	 */
16663 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16664 	    entry = entry->vme_next) {
16665 		vm_object_t     object;
16666 
16667 		/*
16668 		 * Sanity check on the VM map entry.
16669 		 */
16670 		if (entry->is_sub_map) {
16671 			vm_map_t submap;
16672 			vm_map_offset_t submap_start;
16673 			vm_map_offset_t submap_end;
16674 			vm_map_entry_t submap_entry;
16675 
16676 			submap = VME_SUBMAP(entry);
16677 			submap_start = VME_OFFSET(entry);
16678 			submap_end = submap_start + (entry->vme_end -
16679 			    entry->vme_start);
16680 
16681 			vm_map_lock_read(submap);
16682 
16683 			if (!vm_map_range_check(submap,
16684 			    submap_start,
16685 			    submap_end,
16686 			    &submap_entry)) {
16687 				vm_map_unlock_read(submap);
16688 				vm_map_unlock_read(map);
16689 				return KERN_INVALID_ADDRESS;
16690 			}
16691 
16692 			if (submap_entry->is_sub_map) {
16693 				vm_map_unlock_read(submap);
16694 				continue;
16695 			}
16696 
16697 			object = VME_OBJECT(submap_entry);
16698 			if (object == VM_OBJECT_NULL || !object->internal) {
16699 				vm_map_unlock_read(submap);
16700 				continue;
16701 			}
16702 
16703 			vm_object_pageout(object);
16704 
16705 			vm_map_unlock_read(submap);
16706 			submap = VM_MAP_NULL;
16707 			submap_entry = VM_MAP_ENTRY_NULL;
16708 			continue;
16709 		}
16710 
16711 		object = VME_OBJECT(entry);
16712 		if (object == VM_OBJECT_NULL || !object->internal) {
16713 			continue;
16714 		}
16715 
16716 		vm_object_pageout(object);
16717 	}
16718 
16719 	vm_map_unlock_read(map);
16720 	return KERN_SUCCESS;
16721 }
16722 #endif /* MACH_ASSERT */
16723 
16724 
16725 /*
16726  *	Routine:	vm_map_entry_insert
16727  *
16728  *	Description:	This routine inserts a new vm_entry in a locked map.
16729  */
16730 static vm_map_entry_t
vm_map_entry_insert(vm_map_t map,vm_map_entry_t insp_entry,vm_map_offset_t start,vm_map_offset_t end,vm_object_t object,vm_object_offset_t offset,vm_map_kernel_flags_t vmk_flags,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance,boolean_t clear_map_aligned)16731 vm_map_entry_insert(
16732 	vm_map_t                map,
16733 	vm_map_entry_t          insp_entry,
16734 	vm_map_offset_t         start,
16735 	vm_map_offset_t         end,
16736 	vm_object_t             object,
16737 	vm_object_offset_t      offset,
16738 	vm_map_kernel_flags_t   vmk_flags,
16739 	boolean_t               needs_copy,
16740 	vm_prot_t               cur_protection,
16741 	vm_prot_t               max_protection,
16742 	vm_inherit_t            inheritance,
16743 	boolean_t               clear_map_aligned)
16744 {
16745 	vm_map_entry_t  new_entry;
16746 	boolean_t map_aligned = FALSE;
16747 
16748 	assert(insp_entry != (vm_map_entry_t)0);
16749 	vm_map_lock_assert_exclusive(map);
16750 
16751 #if DEVELOPMENT || DEBUG
16752 	vm_object_offset_t      end_offset = 0;
16753 	assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
16754 #endif /* DEVELOPMENT || DEBUG */
16755 
16756 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
16757 		map_aligned = TRUE;
16758 	}
16759 	if (clear_map_aligned &&
16760 	    (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
16761 	    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
16762 		map_aligned = FALSE;
16763 	}
16764 	if (map_aligned) {
16765 		assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
16766 		assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
16767 	} else {
16768 		assert(page_aligned(start));
16769 		assert(page_aligned(end));
16770 	}
16771 	assert(start < end);
16772 
16773 	new_entry = vm_map_entry_create(map);
16774 
16775 	new_entry->vme_start = start;
16776 	new_entry->vme_end = end;
16777 
16778 	if (vmk_flags.vmkf_submap) {
16779 		new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
16780 		VME_SUBMAP_SET(new_entry, (vm_map_t)object);
16781 	} else {
16782 		VME_OBJECT_SET(new_entry, object, false, 0);
16783 	}
16784 	VME_OFFSET_SET(new_entry, offset);
16785 	VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);
16786 
16787 	new_entry->map_aligned = map_aligned;
16788 	new_entry->needs_copy = needs_copy;
16789 	new_entry->inheritance = inheritance;
16790 	new_entry->protection = cur_protection;
16791 	new_entry->max_protection = max_protection;
16792 	/*
16793 	 * submap: "use_pmap" means "nested".
16794 	 * default: false.
16795 	 *
16796 	 * object: "use_pmap" means "use pmap accounting" for footprint.
16797 	 * default: true.
16798 	 */
16799 	new_entry->use_pmap = !vmk_flags.vmkf_submap;
16800 	new_entry->no_cache = vmk_flags.vmf_no_cache;
16801 	new_entry->vme_permanent = vmk_flags.vmf_permanent;
16802 	new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
16803 	new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
16804 	new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);
16805 
16806 	if (vmk_flags.vmkf_map_jit) {
16807 		if (!(map->jit_entry_exists) ||
16808 		    VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
16809 			new_entry->used_for_jit = TRUE;
16810 			map->jit_entry_exists = TRUE;
16811 		}
16812 	}
16813 
16814 	/*
16815 	 *	Insert the new entry into the list.
16816 	 */
16817 
16818 	vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
16819 	map->size += end - start;
16820 
16821 	/*
16822 	 *	Update the free space hint and the lookup hint.
16823 	 */
16824 
16825 	SAVE_HINT_MAP_WRITE(map, new_entry);
16826 	return new_entry;
16827 }
16828 
16829 /*
16830  *	Routine:	vm_map_remap_extract
16831  *
16832  *	Description:	This routine returns a vm_entry list from a map.
16833  */
16834 static kern_return_t
vm_map_remap_extract(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size,boolean_t copy,vm_map_copy_t map_copy,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)16835 vm_map_remap_extract(
16836 	vm_map_t                map,
16837 	vm_map_offset_t         addr,
16838 	vm_map_size_t           size,
16839 	boolean_t               copy,
16840 	vm_map_copy_t           map_copy,
16841 	vm_prot_t               *cur_protection,   /* IN/OUT */
16842 	vm_prot_t               *max_protection,   /* IN/OUT */
16843 	/* What, no behavior? */
16844 	vm_inherit_t            inheritance,
16845 	vm_map_kernel_flags_t   vmk_flags)
16846 {
16847 	struct vm_map_header   *map_header = &map_copy->cpy_hdr;
16848 	kern_return_t           result;
16849 	vm_map_size_t           mapped_size;
16850 	vm_map_size_t           tmp_size;
16851 	vm_map_entry_t          src_entry;     /* result of last map lookup */
16852 	vm_map_entry_t          new_entry;
16853 	vm_object_offset_t      offset;
16854 	vm_map_offset_t         map_address;
16855 	vm_map_offset_t         src_start;     /* start of entry to map */
16856 	vm_map_offset_t         src_end;       /* end of region to be mapped */
16857 	vm_object_t             object;
16858 	vm_map_version_t        version;
16859 	boolean_t               src_needs_copy;
16860 	boolean_t               new_entry_needs_copy;
16861 	vm_map_entry_t          saved_src_entry;
16862 	boolean_t               src_entry_was_wired;
16863 	vm_prot_t               max_prot_for_prot_copy;
16864 	vm_map_offset_t         effective_page_mask;
16865 	bool                    pageable, same_map;
16866 	boolean_t               vm_remap_legacy;
16867 	vm_prot_t               required_cur_prot, required_max_prot;
16868 	vm_object_t             new_copy_object;     /* vm_object_copy_* result */
16869 	boolean_t               saved_used_for_jit;     /* Saved used_for_jit. */
16870 #if __arm64e__
16871 	boolean_t               saved_used_for_tpro;    /* Saved used_for_tpro. */
16872 #endif
16873 
16874 	pageable = vmk_flags.vmkf_copy_pageable;
16875 	same_map = vmk_flags.vmkf_copy_same_map;
16876 
16877 	effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
16878 
16879 	assert(map != VM_MAP_NULL);
16880 	assert(size != 0);
16881 	assert(size == vm_map_round_page(size, effective_page_mask));
16882 	assert(inheritance == VM_INHERIT_NONE ||
16883 	    inheritance == VM_INHERIT_COPY ||
16884 	    inheritance == VM_INHERIT_SHARE);
16885 	assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
16886 	assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
16887 	assert((*cur_protection & *max_protection) == *cur_protection);
16888 
16889 	/*
16890 	 *	Compute start and end of region.
16891 	 */
16892 	src_start = vm_map_trunc_page(addr, effective_page_mask);
16893 	src_end = vm_map_round_page(src_start + size, effective_page_mask);
16894 
16895 	/*
16896 	 *	Initialize map_header.
16897 	 */
16898 	map_header->nentries = 0;
16899 	map_header->entries_pageable = pageable;
16900 //	map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
16901 	map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
16902 	map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
16903 	vm_map_store_init(map_header);
16904 
16905 	if (copy && vmk_flags.vmkf_remap_prot_copy) {
16906 		/*
16907 		 * Special case for vm_map_protect(VM_PROT_COPY):
16908 		 * we want to set the new mappings' max protection to the
16909 		 * specified *max_protection...
16910 		 */
16911 		max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
16912 		/* ... but we want to use the vm_remap() legacy mode */
16913 		*max_protection = VM_PROT_NONE;
16914 		*cur_protection = VM_PROT_NONE;
16915 	} else {
16916 		max_prot_for_prot_copy = VM_PROT_NONE;
16917 	}
16918 
16919 	if (*cur_protection == VM_PROT_NONE &&
16920 	    *max_protection == VM_PROT_NONE) {
16921 		/*
16922 		 * vm_remap() legacy mode:
16923 		 * Extract all memory regions in the specified range and
16924 		 * collect the strictest set of protections allowed on the
16925 		 * entire range, so the caller knows what they can do with
16926 		 * the remapped range.
16927 		 * We start with VM_PROT_ALL and we'll remove the protections
16928 		 * missing from each memory region.
16929 		 */
16930 		vm_remap_legacy = TRUE;
16931 		*cur_protection = VM_PROT_ALL;
16932 		*max_protection = VM_PROT_ALL;
16933 		required_cur_prot = VM_PROT_NONE;
16934 		required_max_prot = VM_PROT_NONE;
16935 	} else {
16936 		/*
16937 		 * vm_remap_new() mode:
16938 		 * Extract all memory regions in the specified range and
16939 		 * ensure that they have at least the protections specified
16940 		 * by the caller via *cur_protection and *max_protection.
16941 		 * The resulting mapping should have these protections.
16942 		 */
16943 		vm_remap_legacy = FALSE;
16944 		if (copy) {
16945 			required_cur_prot = VM_PROT_NONE;
16946 			required_max_prot = VM_PROT_READ;
16947 		} else {
16948 			required_cur_prot = *cur_protection;
16949 			required_max_prot = *max_protection;
16950 		}
16951 	}
16952 
16953 	map_address = 0;
16954 	mapped_size = 0;
16955 	result = KERN_SUCCESS;
16956 
16957 	/*
16958 	 *	The specified source virtual space might correspond to
16959 	 *	multiple map entries, need to loop on them.
16960 	 */
16961 	vm_map_lock(map);
16962 
16963 	if (map->pmap == kernel_pmap) {
16964 		map_copy->is_kernel_range = true;
16965 		map_copy->orig_range = kmem_addr_get_range(addr, size);
16966 #if CONFIG_MAP_RANGES
16967 	} else if (map->uses_user_ranges) {
16968 		map_copy->is_user_range = true;
16969 		map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
16970 #endif /* CONFIG_MAP_RANGES */
16971 	}
16972 
16973 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16974 		/*
16975 		 * This address space uses sub-pages so the range might
16976 		 * not be re-mappable in an address space with larger
16977 		 * pages. Re-assemble any broken-up VM map entries to
16978 		 * improve our chances of making it work.
16979 		 */
16980 		vm_map_simplify_range(map, src_start, src_end);
16981 	}
16982 	while (mapped_size != size) {
16983 		vm_map_size_t   entry_size;
16984 
16985 		/*
16986 		 *	Find the beginning of the region.
16987 		 */
16988 		if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
16989 			result = KERN_INVALID_ADDRESS;
16990 			break;
16991 		}
16992 
16993 		if (src_start < src_entry->vme_start ||
16994 		    (mapped_size && src_start != src_entry->vme_start)) {
16995 			result = KERN_INVALID_ADDRESS;
16996 			break;
16997 		}
16998 
16999 		tmp_size = size - mapped_size;
17000 		if (src_end > src_entry->vme_end) {
17001 			tmp_size -= (src_end - src_entry->vme_end);
17002 		}
17003 
17004 		entry_size = (vm_map_size_t)(src_entry->vme_end -
17005 		    src_entry->vme_start);
17006 
17007 		if (src_entry->is_sub_map &&
17008 		    vmk_flags.vmkf_copy_single_object) {
17009 			vm_map_t submap;
17010 			vm_map_offset_t submap_start;
17011 			vm_map_size_t submap_size;
17012 			boolean_t submap_needs_copy;
17013 
17014 			/*
17015 			 * No check for "required protection" on "src_entry"
17016 			 * because the protections that matter are the ones
17017 			 * on the submap's VM map entry, which will be checked
17018 			 * during the call to vm_map_remap_extract() below.
17019 			 */
17020 			submap_size = src_entry->vme_end - src_start;
17021 			if (submap_size > size) {
17022 				submap_size = size;
17023 			}
17024 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17025 			submap = VME_SUBMAP(src_entry);
17026 			if (copy) {
17027 				/*
17028 				 * The caller wants a copy-on-write re-mapping,
17029 				 * so let's extract from the submap accordingly.
17030 				 */
17031 				submap_needs_copy = TRUE;
17032 			} else if (src_entry->needs_copy) {
17033 				/*
17034 				 * The caller wants a shared re-mapping but the
17035 				 * submap is mapped with "needs_copy", so its
17036 				 * contents can't be shared as is. Extract the
17037 				 * contents of the submap as "copy-on-write".
17038 				 * The re-mapping won't be shared with the
17039 				 * original mapping but this is equivalent to
17040 				 * what happened with the original "remap from
17041 				 * submap" code.
17042 				 * The shared region is mapped "needs_copy", for
17043 				 * example.
17044 				 */
17045 				submap_needs_copy = TRUE;
17046 			} else {
17047 				/*
17048 				 * The caller wants a shared re-mapping and
17049 				 * this mapping can be shared (no "needs_copy"),
17050 				 * so let's extract from the submap accordingly.
17051 				 * Kernel submaps are mapped without
17052 				 * "needs_copy", for example.
17053 				 */
17054 				submap_needs_copy = FALSE;
17055 			}
17056 			vm_map_reference(submap);
17057 			vm_map_unlock(map);
17058 			src_entry = NULL;
17059 			if (vm_remap_legacy) {
17060 				*cur_protection = VM_PROT_NONE;
17061 				*max_protection = VM_PROT_NONE;
17062 			}
17063 
17064 			DTRACE_VM7(remap_submap_recurse,
17065 			    vm_map_t, map,
17066 			    vm_map_offset_t, addr,
17067 			    vm_map_size_t, size,
17068 			    boolean_t, copy,
17069 			    vm_map_offset_t, submap_start,
17070 			    vm_map_size_t, submap_size,
17071 			    boolean_t, submap_needs_copy);
17072 
17073 			result = vm_map_remap_extract(submap,
17074 			    submap_start,
17075 			    submap_size,
17076 			    submap_needs_copy,
17077 			    map_copy,
17078 			    cur_protection,
17079 			    max_protection,
17080 			    inheritance,
17081 			    vmk_flags);
17082 			vm_map_deallocate(submap);
17083 			return result;
17084 		}
17085 
17086 		if (src_entry->is_sub_map) {
17087 			/* protections for submap mapping are irrelevant here */
17088 		} else if (((src_entry->protection & required_cur_prot) !=
17089 		    required_cur_prot) ||
17090 		    ((src_entry->max_protection & required_max_prot) !=
17091 		    required_max_prot)) {
17092 			if (vmk_flags.vmkf_copy_single_object &&
17093 			    mapped_size != 0) {
17094 				/*
17095 				 * Single object extraction.
17096 				 * We can't extract more with the required
17097 				 * protection but we've extracted some, so
17098 				 * stop there and declare success.
17099 				 * The caller should check the size of
17100 				 * the copy entry we've extracted.
17101 				 */
17102 				result = KERN_SUCCESS;
17103 			} else {
17104 				/*
17105 				 * VM range extraction.
17106 				 * Required proctection is not available
17107 				 * for this part of the range: fail.
17108 				 */
17109 				result = KERN_PROTECTION_FAILURE;
17110 			}
17111 			break;
17112 		}
17113 
17114 		if (src_entry->is_sub_map) {
17115 			vm_map_t submap;
17116 			vm_map_offset_t submap_start;
17117 			vm_map_size_t submap_size;
17118 			vm_map_copy_t submap_copy;
17119 			vm_prot_t submap_curprot, submap_maxprot;
17120 			boolean_t submap_needs_copy;
17121 
17122 			/*
17123 			 * No check for "required protection" on "src_entry"
17124 			 * because the protections that matter are the ones
17125 			 * on the submap's VM map entry, which will be checked
17126 			 * during the call to vm_map_copy_extract() below.
17127 			 */
17128 			object = VM_OBJECT_NULL;
17129 			submap_copy = VM_MAP_COPY_NULL;
17130 
17131 			/* find equivalent range in the submap */
17132 			submap = VME_SUBMAP(src_entry);
17133 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17134 			submap_size = tmp_size;
17135 			if (copy) {
17136 				/*
17137 				 * The caller wants a copy-on-write re-mapping,
17138 				 * so let's extract from the submap accordingly.
17139 				 */
17140 				submap_needs_copy = TRUE;
17141 			} else if (src_entry->needs_copy) {
17142 				/*
17143 				 * The caller wants a shared re-mapping but the
17144 				 * submap is mapped with "needs_copy", so its
17145 				 * contents can't be shared as is. Extract the
17146 				 * contents of the submap as "copy-on-write".
17147 				 * The re-mapping won't be shared with the
17148 				 * original mapping but this is equivalent to
17149 				 * what happened with the original "remap from
17150 				 * submap" code.
17151 				 * The shared region is mapped "needs_copy", for
17152 				 * example.
17153 				 */
17154 				submap_needs_copy = TRUE;
17155 			} else {
17156 				/*
17157 				 * The caller wants a shared re-mapping and
17158 				 * this mapping can be shared (no "needs_copy"),
17159 				 * so let's extract from the submap accordingly.
17160 				 * Kernel submaps are mapped without
17161 				 * "needs_copy", for example.
17162 				 */
17163 				submap_needs_copy = FALSE;
17164 			}
17165 			/* extra ref to keep submap alive */
17166 			vm_map_reference(submap);
17167 
17168 			DTRACE_VM7(remap_submap_recurse,
17169 			    vm_map_t, map,
17170 			    vm_map_offset_t, addr,
17171 			    vm_map_size_t, size,
17172 			    boolean_t, copy,
17173 			    vm_map_offset_t, submap_start,
17174 			    vm_map_size_t, submap_size,
17175 			    boolean_t, submap_needs_copy);
17176 
17177 			/*
17178 			 * The map can be safely unlocked since we
17179 			 * already hold a reference on the submap.
17180 			 *
17181 			 * No timestamp since we don't care if the map
17182 			 * gets modified while we're down in the submap.
17183 			 * We'll resume the extraction at src_start + tmp_size
17184 			 * anyway.
17185 			 */
17186 			vm_map_unlock(map);
17187 			src_entry = NULL; /* not valid once map is unlocked */
17188 
17189 			if (vm_remap_legacy) {
17190 				submap_curprot = VM_PROT_NONE;
17191 				submap_maxprot = VM_PROT_NONE;
17192 				if (max_prot_for_prot_copy) {
17193 					submap_maxprot = max_prot_for_prot_copy;
17194 				}
17195 			} else {
17196 				assert(!max_prot_for_prot_copy);
17197 				submap_curprot = *cur_protection;
17198 				submap_maxprot = *max_protection;
17199 			}
17200 			result = vm_map_copy_extract(submap,
17201 			    submap_start,
17202 			    submap_size,
17203 			    submap_needs_copy,
17204 			    &submap_copy,
17205 			    &submap_curprot,
17206 			    &submap_maxprot,
17207 			    inheritance,
17208 			    vmk_flags);
17209 
17210 			/* release extra ref on submap */
17211 			vm_map_deallocate(submap);
17212 			submap = VM_MAP_NULL;
17213 
17214 			if (result != KERN_SUCCESS) {
17215 				vm_map_lock(map);
17216 				break;
17217 			}
17218 
17219 			/* transfer submap_copy entries to map_header */
17220 			while (vm_map_copy_first_entry(submap_copy) !=
17221 			    vm_map_copy_to_entry(submap_copy)) {
17222 				vm_map_entry_t copy_entry;
17223 				vm_map_size_t copy_entry_size;
17224 
17225 				copy_entry = vm_map_copy_first_entry(submap_copy);
17226 
17227 				/*
17228 				 * Prevent kernel_object from being exposed to
17229 				 * user space.
17230 				 */
17231 				if (__improbable(copy_entry->vme_kernel_object)) {
17232 					printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17233 					    proc_selfpid(),
17234 					    (get_bsdtask_info(current_task())
17235 					    ? proc_name_address(get_bsdtask_info(current_task()))
17236 					    : "?"));
17237 					DTRACE_VM(extract_kernel_only);
17238 					result = KERN_INVALID_RIGHT;
17239 					vm_map_copy_discard(submap_copy);
17240 					submap_copy = VM_MAP_COPY_NULL;
17241 					vm_map_lock(map);
17242 					break;
17243 				}
17244 
17245 				vm_map_copy_entry_unlink(submap_copy, copy_entry);
17246 				copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
17247 				copy_entry->vme_start = map_address;
17248 				copy_entry->vme_end = map_address + copy_entry_size;
17249 				map_address += copy_entry_size;
17250 				mapped_size += copy_entry_size;
17251 				src_start += copy_entry_size;
17252 				assert(src_start <= src_end);
17253 				_vm_map_store_entry_link(map_header,
17254 				    map_header->links.prev,
17255 				    copy_entry);
17256 			}
17257 			/* done with submap_copy */
17258 			vm_map_copy_discard(submap_copy);
17259 
17260 			if (vm_remap_legacy) {
17261 				*cur_protection &= submap_curprot;
17262 				*max_protection &= submap_maxprot;
17263 			}
17264 
17265 			/* re-acquire the map lock and continue to next entry */
17266 			vm_map_lock(map);
17267 			continue;
17268 		} else {
17269 			object = VME_OBJECT(src_entry);
17270 
17271 			/*
17272 			 * Prevent kernel_object from being exposed to
17273 			 * user space.
17274 			 */
17275 			if (__improbable(object == kernel_object)) {
17276 				printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17277 				    proc_selfpid(),
17278 				    (get_bsdtask_info(current_task())
17279 				    ? proc_name_address(get_bsdtask_info(current_task()))
17280 				    : "?"));
17281 				DTRACE_VM(extract_kernel_only);
17282 				result = KERN_INVALID_RIGHT;
17283 				break;
17284 			}
17285 
17286 			if (src_entry->iokit_acct) {
17287 				/*
17288 				 * This entry uses "IOKit accounting".
17289 				 */
17290 			} else if (object != VM_OBJECT_NULL &&
17291 			    (object->purgable != VM_PURGABLE_DENY ||
17292 			    object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
17293 				/*
17294 				 * Purgeable objects have their own accounting:
17295 				 * no pmap accounting for them.
17296 				 */
17297 				assertf(!src_entry->use_pmap,
17298 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17299 				    map,
17300 				    src_entry,
17301 				    (uint64_t)src_entry->vme_start,
17302 				    (uint64_t)src_entry->vme_end,
17303 				    src_entry->protection,
17304 				    src_entry->max_protection,
17305 				    VME_ALIAS(src_entry));
17306 			} else {
17307 				/*
17308 				 * Not IOKit or purgeable:
17309 				 * must be accounted by pmap stats.
17310 				 */
17311 				assertf(src_entry->use_pmap,
17312 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17313 				    map,
17314 				    src_entry,
17315 				    (uint64_t)src_entry->vme_start,
17316 				    (uint64_t)src_entry->vme_end,
17317 				    src_entry->protection,
17318 				    src_entry->max_protection,
17319 				    VME_ALIAS(src_entry));
17320 			}
17321 
17322 			if (object == VM_OBJECT_NULL) {
17323 				assert(!src_entry->needs_copy);
17324 				if (src_entry->max_protection == VM_PROT_NONE) {
17325 					assert(src_entry->protection == VM_PROT_NONE);
17326 					/*
17327 					 * No VM object and no permissions:
17328 					 * this must be a reserved range with
17329 					 * nothing to share or copy.
17330 					 * There could also be all sorts of
17331 					 * pmap shenanigans within that reserved
17332 					 * range, so let's just copy the map
17333 					 * entry as is to remap a similar
17334 					 * reserved range.
17335 					 */
17336 					offset = 0; /* no object => no offset */
17337 					goto copy_src_entry;
17338 				}
17339 				object = vm_object_allocate(entry_size);
17340 				VME_OFFSET_SET(src_entry, 0);
17341 				VME_OBJECT_SET(src_entry, object, false, 0);
17342 				assert(src_entry->use_pmap);
17343 				assert(!map->mapped_in_other_pmaps);
17344 			} else if (src_entry->wired_count ||
17345 			    object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17346 				/*
17347 				 * A wired memory region should not have
17348 				 * any pending copy-on-write and needs to
17349 				 * keep pointing at the VM object that
17350 				 * contains the wired pages.
17351 				 * If we're sharing this memory (copy=false),
17352 				 * we'll share this VM object.
17353 				 * If we're copying this memory (copy=true),
17354 				 * we'll call vm_object_copy_slowly() below
17355 				 * and use the new VM object for the remapping.
17356 				 *
17357 				 * Or, we are already using an asymmetric
17358 				 * copy, and therefore we already have
17359 				 * the right object.
17360 				 */
17361 				assert(!src_entry->needs_copy);
17362 			} else if (src_entry->needs_copy || object->shadowed ||
17363 			    (object->internal && !object->true_share &&
17364 			    !src_entry->is_shared &&
17365 			    object->vo_size > entry_size)) {
17366 				VME_OBJECT_SHADOW(src_entry, entry_size,
17367 				    vm_map_always_shadow(map));
17368 				assert(src_entry->use_pmap);
17369 
17370 				if (!src_entry->needs_copy &&
17371 				    (src_entry->protection & VM_PROT_WRITE)) {
17372 					vm_prot_t prot;
17373 
17374 					assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
17375 
17376 					prot = src_entry->protection & ~VM_PROT_WRITE;
17377 
17378 					if (override_nx(map,
17379 					    VME_ALIAS(src_entry))
17380 					    && prot) {
17381 						prot |= VM_PROT_EXECUTE;
17382 					}
17383 
17384 					assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
17385 
17386 					if (map->mapped_in_other_pmaps) {
17387 						vm_object_pmap_protect(
17388 							VME_OBJECT(src_entry),
17389 							VME_OFFSET(src_entry),
17390 							entry_size,
17391 							PMAP_NULL,
17392 							PAGE_SIZE,
17393 							src_entry->vme_start,
17394 							prot);
17395 #if MACH_ASSERT
17396 					} else if (__improbable(map->pmap == PMAP_NULL)) {
17397 						extern boolean_t vm_tests_in_progress;
17398 						assert(vm_tests_in_progress);
17399 						/*
17400 						 * Some VM tests (in vm_tests.c)
17401 						 * sometimes want to use a VM
17402 						 * map without a pmap.
17403 						 * Otherwise, this should never
17404 						 * happen.
17405 						 */
17406 #endif /* MACH_ASSERT */
17407 					} else {
17408 						pmap_protect(vm_map_pmap(map),
17409 						    src_entry->vme_start,
17410 						    src_entry->vme_end,
17411 						    prot);
17412 					}
17413 				}
17414 
17415 				object = VME_OBJECT(src_entry);
17416 				src_entry->needs_copy = FALSE;
17417 			}
17418 
17419 
17420 			vm_object_lock(object);
17421 			vm_object_reference_locked(object); /* object ref. for new entry */
17422 			assert(!src_entry->needs_copy);
17423 			if (object->copy_strategy ==
17424 			    MEMORY_OBJECT_COPY_SYMMETRIC) {
17425 				/*
17426 				 * If we want to share this object (copy==0),
17427 				 * it needs to be COPY_DELAY.
17428 				 * If we want to copy this object (copy==1),
17429 				 * we can't just set "needs_copy" on our side
17430 				 * and expect the other side to do the same
17431 				 * (symmetrically), so we can't let the object
17432 				 * stay COPY_SYMMETRIC.
17433 				 * So we always switch from COPY_SYMMETRIC to
17434 				 * COPY_DELAY.
17435 				 */
17436 				object->copy_strategy =
17437 				    MEMORY_OBJECT_COPY_DELAY;
17438 				object->true_share = TRUE;
17439 			}
17440 			vm_object_unlock(object);
17441 		}
17442 
17443 		offset = (VME_OFFSET(src_entry) +
17444 		    (src_start - src_entry->vme_start));
17445 
17446 copy_src_entry:
17447 		new_entry = _vm_map_entry_create(map_header);
17448 		vm_map_entry_copy(map, new_entry, src_entry);
17449 		if (new_entry->is_sub_map) {
17450 			/* clr address space specifics */
17451 			new_entry->use_pmap = FALSE;
17452 		} else if (copy) {
17453 			/*
17454 			 * We're dealing with a copy-on-write operation,
17455 			 * so the resulting mapping should not inherit the
17456 			 * original mapping's accounting settings.
17457 			 * "use_pmap" should be reset to its default (TRUE)
17458 			 * so that the new mapping gets accounted for in
17459 			 * the task's memory footprint.
17460 			 */
17461 			new_entry->use_pmap = TRUE;
17462 		}
17463 		/* "iokit_acct" was cleared in vm_map_entry_copy() */
17464 		assert(!new_entry->iokit_acct);
17465 
17466 		new_entry->map_aligned = FALSE;
17467 
17468 		new_entry->vme_start = map_address;
17469 		new_entry->vme_end = map_address + tmp_size;
17470 		assert(new_entry->vme_start < new_entry->vme_end);
17471 		if (copy && vmk_flags.vmkf_remap_prot_copy) {
17472 			/* security: keep "permanent" and "csm_associated" */
17473 			new_entry->vme_permanent = src_entry->vme_permanent;
17474 			new_entry->csm_associated = src_entry->csm_associated;
17475 			/*
17476 			 * Remapping for vm_map_protect(VM_PROT_COPY)
17477 			 * to convert a read-only mapping into a
17478 			 * copy-on-write version of itself but
17479 			 * with write access:
17480 			 * keep the original inheritance but let's not
17481 			 * add VM_PROT_WRITE to the max protection yet
17482 			 * since we want to do more security checks against
17483 			 * the target map.
17484 			 */
17485 			new_entry->inheritance = src_entry->inheritance;
17486 			new_entry->protection &= max_prot_for_prot_copy;
17487 		} else {
17488 			new_entry->inheritance = inheritance;
17489 			if (!vm_remap_legacy) {
17490 				new_entry->protection = *cur_protection;
17491 				new_entry->max_protection = *max_protection;
17492 			}
17493 		}
17494 		VME_OFFSET_SET(new_entry, offset);
17495 
17496 		/*
17497 		 * The new region has to be copied now if required.
17498 		 */
17499 RestartCopy:
17500 		if (!copy) {
17501 			if (src_entry->used_for_jit == TRUE) {
17502 				if (same_map) {
17503 				} else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
17504 					/*
17505 					 * Cannot allow an entry describing a JIT
17506 					 * region to be shared across address spaces.
17507 					 */
17508 					result = KERN_INVALID_ARGUMENT;
17509 					vm_object_deallocate(object);
17510 					vm_map_entry_dispose(new_entry);
17511 					new_entry = VM_MAP_ENTRY_NULL;
17512 					break;
17513 				}
17514 			}
17515 
17516 			src_entry->is_shared = TRUE;
17517 			new_entry->is_shared = TRUE;
17518 			if (!(new_entry->is_sub_map)) {
17519 				new_entry->needs_copy = FALSE;
17520 			}
17521 		} else if (src_entry->is_sub_map) {
17522 			/* make this a COW sub_map if not already */
17523 			assert(new_entry->wired_count == 0);
17524 			new_entry->needs_copy = TRUE;
17525 			object = VM_OBJECT_NULL;
17526 		} else if (src_entry->wired_count == 0 &&
17527 		    !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
17528 		    vm_object_copy_quickly(VME_OBJECT(new_entry),
17529 		    VME_OFFSET(new_entry),
17530 		    (new_entry->vme_end -
17531 		    new_entry->vme_start),
17532 		    &src_needs_copy,
17533 		    &new_entry_needs_copy)) {
17534 			new_entry->needs_copy = new_entry_needs_copy;
17535 			new_entry->is_shared = FALSE;
17536 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
17537 
17538 			/*
17539 			 * Handle copy_on_write semantics.
17540 			 */
17541 			if (src_needs_copy && !src_entry->needs_copy) {
17542 				vm_prot_t prot;
17543 
17544 				assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
17545 
17546 				prot = src_entry->protection & ~VM_PROT_WRITE;
17547 
17548 				if (override_nx(map,
17549 				    VME_ALIAS(src_entry))
17550 				    && prot) {
17551 					prot |= VM_PROT_EXECUTE;
17552 				}
17553 
17554 				assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
17555 
17556 				vm_object_pmap_protect(object,
17557 				    offset,
17558 				    entry_size,
17559 				    ((src_entry->is_shared
17560 				    || map->mapped_in_other_pmaps) ?
17561 				    PMAP_NULL : map->pmap),
17562 				    VM_MAP_PAGE_SIZE(map),
17563 				    src_entry->vme_start,
17564 				    prot);
17565 
17566 				assert(src_entry->wired_count == 0);
17567 				src_entry->needs_copy = TRUE;
17568 			}
17569 			/*
17570 			 * Throw away the old object reference of the new entry.
17571 			 */
17572 			vm_object_deallocate(object);
17573 		} else {
17574 			new_entry->is_shared = FALSE;
17575 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
17576 
17577 			src_entry_was_wired = (src_entry->wired_count > 0);
17578 			saved_src_entry = src_entry;
17579 			src_entry = VM_MAP_ENTRY_NULL;
17580 
17581 			/*
17582 			 * The map can be safely unlocked since we
17583 			 * already hold a reference on the object.
17584 			 *
17585 			 * Record the timestamp of the map for later
17586 			 * verification, and unlock the map.
17587 			 */
17588 			version.main_timestamp = map->timestamp;
17589 			vm_map_unlock(map);     /* Increments timestamp once! */
17590 
17591 			/*
17592 			 * Perform the copy.
17593 			 */
17594 			if (src_entry_was_wired > 0 ||
17595 			    (debug4k_no_cow_copyin &&
17596 			    VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
17597 				vm_object_lock(object);
17598 				result = vm_object_copy_slowly(
17599 					object,
17600 					offset,
17601 					(new_entry->vme_end -
17602 					new_entry->vme_start),
17603 					THREAD_UNINT,
17604 					&new_copy_object);
17605 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
17606 				saved_used_for_jit = new_entry->used_for_jit;
17607 #if __arm64e__
17608 				saved_used_for_tpro = new_entry->used_for_tpro;
17609 #endif
17610 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
17611 				new_entry->used_for_jit = saved_used_for_jit;
17612 #if __arm64e__
17613 				new_entry->used_for_tpro = saved_used_for_tpro;
17614 #endif
17615 				VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
17616 				new_entry->needs_copy = FALSE;
17617 			} else {
17618 				vm_object_offset_t new_offset;
17619 
17620 				new_offset = VME_OFFSET(new_entry);
17621 				result = vm_object_copy_strategically(
17622 					object,
17623 					offset,
17624 					(new_entry->vme_end -
17625 					new_entry->vme_start),
17626 					&new_copy_object,
17627 					&new_offset,
17628 					&new_entry_needs_copy);
17629 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
17630 				saved_used_for_jit = new_entry->used_for_jit;
17631 #if __arm64e__
17632 				saved_used_for_tpro = new_entry->used_for_tpro;
17633 #endif
17634 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
17635 				new_entry->used_for_jit = saved_used_for_jit;
17636 #if __arm64e__
17637 				new_entry->used_for_tpro = saved_used_for_tpro;
17638 #endif
17639 				if (new_offset != VME_OFFSET(new_entry)) {
17640 					VME_OFFSET_SET(new_entry, new_offset);
17641 				}
17642 
17643 				new_entry->needs_copy = new_entry_needs_copy;
17644 			}
17645 
17646 			/*
17647 			 * Throw away the old object reference of the new entry.
17648 			 */
17649 			vm_object_deallocate(object);
17650 
17651 			if (result != KERN_SUCCESS &&
17652 			    result != KERN_MEMORY_RESTART_COPY) {
17653 				vm_map_entry_dispose(new_entry);
17654 				vm_map_lock(map);
17655 				break;
17656 			}
17657 
17658 			/*
17659 			 * Verify that the map has not substantially
17660 			 * changed while the copy was being made.
17661 			 */
17662 
17663 			vm_map_lock(map);
17664 			if (version.main_timestamp + 1 != map->timestamp) {
17665 				/*
17666 				 * Simple version comparison failed.
17667 				 *
17668 				 * Retry the lookup and verify that the
17669 				 * same object/offset are still present.
17670 				 */
17671 				saved_src_entry = VM_MAP_ENTRY_NULL;
17672 				vm_object_deallocate(VME_OBJECT(new_entry));
17673 				vm_map_entry_dispose(new_entry);
17674 				if (result == KERN_MEMORY_RESTART_COPY) {
17675 					result = KERN_SUCCESS;
17676 				}
17677 				continue;
17678 			}
17679 			/* map hasn't changed: src_entry is still valid */
17680 			src_entry = saved_src_entry;
17681 			saved_src_entry = VM_MAP_ENTRY_NULL;
17682 
17683 			if (result == KERN_MEMORY_RESTART_COPY) {
17684 				vm_object_reference(object);
17685 				goto RestartCopy;
17686 			}
17687 		}
17688 
17689 		_vm_map_store_entry_link(map_header,
17690 		    map_header->links.prev, new_entry);
17691 
17692 		/* protections for submap mapping are irrelevant here */
17693 		if (vm_remap_legacy && !src_entry->is_sub_map) {
17694 			*cur_protection &= src_entry->protection;
17695 			*max_protection &= src_entry->max_protection;
17696 		}
17697 
17698 		map_address += tmp_size;
17699 		mapped_size += tmp_size;
17700 		src_start += tmp_size;
17701 
17702 		if (vmk_flags.vmkf_copy_single_object) {
17703 			if (mapped_size != size) {
17704 				DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
17705 				    map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
17706 				if (src_entry->vme_next != vm_map_to_entry(map) &&
17707 				    src_entry->vme_next->vme_object_value ==
17708 				    src_entry->vme_object_value) {
17709 					/* XXX TODO4K */
17710 					DEBUG4K_ERROR("could have extended copy to next entry...\n");
17711 				}
17712 			}
17713 			break;
17714 		}
17715 	} /* end while */
17716 
17717 	vm_map_unlock(map);
17718 	if (result != KERN_SUCCESS) {
17719 		/*
17720 		 * Free all allocated elements.
17721 		 */
17722 		for (src_entry = map_header->links.next;
17723 		    src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
17724 		    src_entry = new_entry) {
17725 			new_entry = src_entry->vme_next;
17726 			_vm_map_store_entry_unlink(map_header, src_entry, false);
17727 			if (src_entry->is_sub_map) {
17728 				vm_map_deallocate(VME_SUBMAP(src_entry));
17729 			} else {
17730 				vm_object_deallocate(VME_OBJECT(src_entry));
17731 			}
17732 			vm_map_entry_dispose(src_entry);
17733 		}
17734 	}
17735 	return result;
17736 }
17737 
17738 bool
vm_map_is_exotic(vm_map_t map)17739 vm_map_is_exotic(
17740 	vm_map_t map)
17741 {
17742 	return VM_MAP_IS_EXOTIC(map);
17743 }
17744 
17745 bool
vm_map_is_alien(vm_map_t map)17746 vm_map_is_alien(
17747 	vm_map_t map)
17748 {
17749 	return VM_MAP_IS_ALIEN(map);
17750 }
17751 
17752 #if XNU_TARGET_OS_OSX
17753 void
vm_map_mark_alien(vm_map_t map)17754 vm_map_mark_alien(
17755 	vm_map_t map)
17756 {
17757 	vm_map_lock(map);
17758 	map->is_alien = true;
17759 	vm_map_unlock(map);
17760 }
17761 
17762 void
vm_map_single_jit(vm_map_t map)17763 vm_map_single_jit(
17764 	vm_map_t map)
17765 {
17766 	vm_map_lock(map);
17767 	map->single_jit = true;
17768 	vm_map_unlock(map);
17769 }
17770 #endif /* XNU_TARGET_OS_OSX */
17771 
17772 /*
17773  * Callers of this function must call vm_map_copy_require on
17774  * previously created vm_map_copy_t or pass a newly created
17775  * one to ensure that it hasn't been forged.
17776  */
17777 static kern_return_t
vm_map_copy_to_physcopy(vm_map_copy_t copy_map,vm_map_t target_map)17778 vm_map_copy_to_physcopy(
17779 	vm_map_copy_t   copy_map,
17780 	vm_map_t        target_map)
17781 {
17782 	vm_map_size_t           size;
17783 	vm_map_entry_t          entry;
17784 	vm_map_entry_t          new_entry;
17785 	vm_object_t             new_object;
17786 	unsigned int            pmap_flags;
17787 	pmap_t                  new_pmap;
17788 	vm_map_t                new_map;
17789 	vm_map_address_t        src_start, src_end, src_cur;
17790 	vm_map_address_t        dst_start, dst_end, dst_cur;
17791 	kern_return_t           kr;
17792 	void                    *kbuf;
17793 
17794 	/*
17795 	 * Perform the equivalent of vm_allocate() and memcpy().
17796 	 * Replace the mappings in "copy_map" with the newly allocated mapping.
17797 	 */
17798 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
17799 
17800 	assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
17801 
17802 	/* create a new pmap to map "copy_map" */
17803 	pmap_flags = 0;
17804 	assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
17805 #if PMAP_CREATE_FORCE_4K_PAGES
17806 	pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
17807 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
17808 	pmap_flags |= PMAP_CREATE_64BIT;
17809 	new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
17810 	if (new_pmap == NULL) {
17811 		return KERN_RESOURCE_SHORTAGE;
17812 	}
17813 
17814 	/* allocate new VM object */
17815 	size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
17816 	new_object = vm_object_allocate(size);
17817 	assert(new_object);
17818 
17819 	/* allocate new VM map entry */
17820 	new_entry = vm_map_copy_entry_create(copy_map);
17821 	assert(new_entry);
17822 
17823 	/* finish initializing new VM map entry */
17824 	new_entry->protection = VM_PROT_DEFAULT;
17825 	new_entry->max_protection = VM_PROT_DEFAULT;
17826 	new_entry->use_pmap = TRUE;
17827 
17828 	/* make new VM map entry point to new VM object */
17829 	new_entry->vme_start = 0;
17830 	new_entry->vme_end = size;
17831 	VME_OBJECT_SET(new_entry, new_object, false, 0);
17832 	VME_OFFSET_SET(new_entry, 0);
17833 
17834 	/* create a new pageable VM map to map "copy_map" */
17835 	new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
17836 	    VM_MAP_CREATE_PAGEABLE);
17837 	assert(new_map);
17838 	vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
17839 
17840 	/* map "copy_map" in the new VM map */
17841 	src_start = 0;
17842 	kr = vm_map_copyout_internal(
17843 		new_map,
17844 		&src_start,
17845 		copy_map,
17846 		copy_map->size,
17847 		FALSE, /* consume_on_success */
17848 		VM_PROT_DEFAULT,
17849 		VM_PROT_DEFAULT,
17850 		VM_INHERIT_DEFAULT);
17851 	assert(kr == KERN_SUCCESS);
17852 	src_end = src_start + copy_map->size;
17853 
17854 	/* map "new_object" in the new VM map */
17855 	vm_object_reference(new_object);
17856 	dst_start = 0;
17857 	kr = vm_map_enter(new_map,
17858 	    &dst_start,
17859 	    size,
17860 	    0,               /* mask */
17861 	    VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
17862 	    new_object,
17863 	    0,               /* offset */
17864 	    FALSE,               /* needs copy */
17865 	    VM_PROT_DEFAULT,
17866 	    VM_PROT_DEFAULT,
17867 	    VM_INHERIT_DEFAULT);
17868 	assert(kr == KERN_SUCCESS);
17869 	dst_end = dst_start + size;
17870 
17871 	/* get a kernel buffer */
17872 	kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
17873 
17874 	/* physically copy "copy_map" mappings to new VM object */
17875 	for (src_cur = src_start, dst_cur = dst_start;
17876 	    src_cur < src_end;
17877 	    src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
17878 		vm_size_t bytes;
17879 
17880 		bytes = PAGE_SIZE;
17881 		if (src_cur + PAGE_SIZE > src_end) {
17882 			/* partial copy for last page */
17883 			bytes = src_end - src_cur;
17884 			assert(bytes > 0 && bytes < PAGE_SIZE);
17885 			/* rest of dst page should be zero-filled */
17886 		}
17887 		/* get bytes from src mapping */
17888 		kr = copyinmap(new_map, src_cur, kbuf, bytes);
17889 		if (kr != KERN_SUCCESS) {
17890 			DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
17891 		}
17892 		/* put bytes in dst mapping */
17893 		assert(dst_cur < dst_end);
17894 		assert(dst_cur + bytes <= dst_end);
17895 		kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
17896 		if (kr != KERN_SUCCESS) {
17897 			DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
17898 		}
17899 	}
17900 
17901 	/* free kernel buffer */
17902 	kfree_data(kbuf, PAGE_SIZE);
17903 
17904 	/* destroy new map */
17905 	vm_map_destroy(new_map);
17906 	new_map = VM_MAP_NULL;
17907 
17908 	/* dispose of the old map entries in "copy_map" */
17909 	while (vm_map_copy_first_entry(copy_map) !=
17910 	    vm_map_copy_to_entry(copy_map)) {
17911 		entry = vm_map_copy_first_entry(copy_map);
17912 		vm_map_copy_entry_unlink(copy_map, entry);
17913 		if (entry->is_sub_map) {
17914 			vm_map_deallocate(VME_SUBMAP(entry));
17915 		} else {
17916 			vm_object_deallocate(VME_OBJECT(entry));
17917 		}
17918 		vm_map_copy_entry_dispose(entry);
17919 	}
17920 
17921 	/* change "copy_map"'s page_size to match "target_map" */
17922 	copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
17923 	copy_map->offset = 0;
17924 	copy_map->size = size;
17925 
17926 	/* insert new map entry in "copy_map" */
17927 	assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
17928 	vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
17929 
17930 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
17931 	return KERN_SUCCESS;
17932 }
17933 
17934 void
17935 vm_map_copy_adjust_get_target_copy_map(
17936 	vm_map_copy_t   copy_map,
17937 	vm_map_copy_t   *target_copy_map_p);
17938 void
vm_map_copy_adjust_get_target_copy_map(vm_map_copy_t copy_map,vm_map_copy_t * target_copy_map_p)17939 vm_map_copy_adjust_get_target_copy_map(
17940 	vm_map_copy_t   copy_map,
17941 	vm_map_copy_t   *target_copy_map_p)
17942 {
17943 	vm_map_copy_t   target_copy_map;
17944 	vm_map_entry_t  entry, target_entry;
17945 
17946 	if (*target_copy_map_p != VM_MAP_COPY_NULL) {
17947 		/* the caller already has a "target_copy_map": use it */
17948 		return;
17949 	}
17950 
17951 	/* the caller wants us to create a new copy of "copy_map" */
17952 	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
17953 	target_copy_map = vm_map_copy_allocate(copy_map->type);
17954 	target_copy_map->offset = copy_map->offset;
17955 	target_copy_map->size = copy_map->size;
17956 	target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
17957 	for (entry = vm_map_copy_first_entry(copy_map);
17958 	    entry != vm_map_copy_to_entry(copy_map);
17959 	    entry = entry->vme_next) {
17960 		target_entry = vm_map_copy_entry_create(target_copy_map);
17961 		vm_map_entry_copy_full(target_entry, entry);
17962 		if (target_entry->is_sub_map) {
17963 			vm_map_reference(VME_SUBMAP(target_entry));
17964 		} else {
17965 			vm_object_reference(VME_OBJECT(target_entry));
17966 		}
17967 		vm_map_copy_entry_link(
17968 			target_copy_map,
17969 			vm_map_copy_last_entry(target_copy_map),
17970 			target_entry);
17971 	}
17972 	entry = VM_MAP_ENTRY_NULL;
17973 	*target_copy_map_p = target_copy_map;
17974 }
17975 
17976 /*
17977  * Callers of this function must call vm_map_copy_require on
17978  * previously created vm_map_copy_t or pass a newly created
17979  * one to ensure that it hasn't been forged.
17980  */
17981 static void
vm_map_copy_trim(vm_map_copy_t copy_map,uint16_t new_page_shift,vm_map_offset_t trim_start,vm_map_offset_t trim_end)17982 vm_map_copy_trim(
17983 	vm_map_copy_t   copy_map,
17984 	uint16_t        new_page_shift,
17985 	vm_map_offset_t trim_start,
17986 	vm_map_offset_t trim_end)
17987 {
17988 	uint16_t        copy_page_shift;
17989 	vm_map_entry_t  entry, next_entry;
17990 
17991 	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
17992 	assert(copy_map->cpy_hdr.nentries > 0);
17993 
17994 	trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
17995 	trim_end += vm_map_copy_first_entry(copy_map)->vme_start;
17996 
17997 	/* use the new page_shift to do the clipping */
17998 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
17999 	copy_map->cpy_hdr.page_shift = new_page_shift;
18000 
18001 	for (entry = vm_map_copy_first_entry(copy_map);
18002 	    entry != vm_map_copy_to_entry(copy_map);
18003 	    entry = next_entry) {
18004 		next_entry = entry->vme_next;
18005 		if (entry->vme_end <= trim_start) {
18006 			/* entry fully before trim range: skip */
18007 			continue;
18008 		}
18009 		if (entry->vme_start >= trim_end) {
18010 			/* entry fully after trim range: done */
18011 			break;
18012 		}
18013 		/* clip entry if needed */
18014 		vm_map_copy_clip_start(copy_map, entry, trim_start);
18015 		vm_map_copy_clip_end(copy_map, entry, trim_end);
18016 		/* dispose of entry */
18017 		copy_map->size -= entry->vme_end - entry->vme_start;
18018 		vm_map_copy_entry_unlink(copy_map, entry);
18019 		if (entry->is_sub_map) {
18020 			vm_map_deallocate(VME_SUBMAP(entry));
18021 		} else {
18022 			vm_object_deallocate(VME_OBJECT(entry));
18023 		}
18024 		vm_map_copy_entry_dispose(entry);
18025 		entry = VM_MAP_ENTRY_NULL;
18026 	}
18027 
18028 	/* restore copy_map's original page_shift */
18029 	copy_map->cpy_hdr.page_shift = copy_page_shift;
18030 }
18031 
18032 /*
18033  * Make any necessary adjustments to "copy_map" to allow it to be
18034  * mapped into "target_map".
18035  * If no changes were necessary, "target_copy_map" points to the
18036  * untouched "copy_map".
18037  * If changes are necessary, changes will be made to "target_copy_map".
18038  * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
18039  * copy the original "copy_map" to it before applying the changes.
18040  * The caller should discard "target_copy_map" if it's not the same as
18041  * the original "copy_map".
18042  */
18043 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
18044 kern_return_t
vm_map_copy_adjust_to_target(vm_map_copy_t src_copy_map,vm_map_offset_t offset,vm_map_size_t size,vm_map_t target_map,boolean_t copy,vm_map_copy_t * target_copy_map_p,vm_map_offset_t * overmap_start_p,vm_map_offset_t * overmap_end_p,vm_map_offset_t * trimmed_start_p)18045 vm_map_copy_adjust_to_target(
18046 	vm_map_copy_t           src_copy_map,
18047 	vm_map_offset_t         offset,
18048 	vm_map_size_t           size,
18049 	vm_map_t                target_map,
18050 	boolean_t               copy,
18051 	vm_map_copy_t           *target_copy_map_p,
18052 	vm_map_offset_t         *overmap_start_p,
18053 	vm_map_offset_t         *overmap_end_p,
18054 	vm_map_offset_t         *trimmed_start_p)
18055 {
18056 	vm_map_copy_t           copy_map, target_copy_map;
18057 	vm_map_size_t           target_size;
18058 	vm_map_size_t           src_copy_map_size;
18059 	vm_map_size_t           overmap_start, overmap_end;
18060 	int                     misalignments;
18061 	vm_map_entry_t          entry, target_entry;
18062 	vm_map_offset_t         addr_adjustment;
18063 	vm_map_offset_t         new_start, new_end;
18064 	int                     copy_page_mask, target_page_mask;
18065 	uint16_t                copy_page_shift, target_page_shift;
18066 	vm_map_offset_t         trimmed_end;
18067 
18068 	/*
18069 	 * Assert that the vm_map_copy is coming from the right
18070 	 * zone and hasn't been forged
18071 	 */
18072 	vm_map_copy_require(src_copy_map);
18073 	assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18074 
18075 	/*
18076 	 * Start working with "src_copy_map" but we'll switch
18077 	 * to "target_copy_map" as soon as we start making adjustments.
18078 	 */
18079 	copy_map = src_copy_map;
18080 	src_copy_map_size = src_copy_map->size;
18081 
18082 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18083 	copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
18084 	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18085 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
18086 
18087 	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, *target_copy_map_p);
18088 
18089 	target_copy_map = *target_copy_map_p;
18090 	if (target_copy_map != VM_MAP_COPY_NULL) {
18091 		vm_map_copy_require(target_copy_map);
18092 	}
18093 
18094 	if (offset + size > copy_map->size) {
18095 		DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)offset, (uint64_t)size);
18096 		return KERN_INVALID_ARGUMENT;
18097 	}
18098 
18099 	/* trim the end */
18100 	trimmed_end = 0;
18101 	new_end = VM_MAP_ROUND_PAGE(offset + size, target_page_mask);
18102 	if (new_end < copy_map->size) {
18103 		trimmed_end = src_copy_map_size - new_end;
18104 		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
18105 		/* get "target_copy_map" if needed and adjust it */
18106 		vm_map_copy_adjust_get_target_copy_map(copy_map,
18107 		    &target_copy_map);
18108 		copy_map = target_copy_map;
18109 		vm_map_copy_trim(target_copy_map, target_page_shift,
18110 		    new_end, copy_map->size);
18111 	}
18112 
18113 	/* trim the start */
18114 	new_start = VM_MAP_TRUNC_PAGE(offset, target_page_mask);
18115 	if (new_start != 0) {
18116 		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)0, (uint64_t)new_start);
18117 		/* get "target_copy_map" if needed and adjust it */
18118 		vm_map_copy_adjust_get_target_copy_map(copy_map,
18119 		    &target_copy_map);
18120 		copy_map = target_copy_map;
18121 		vm_map_copy_trim(target_copy_map, target_page_shift,
18122 		    0, new_start);
18123 	}
18124 	*trimmed_start_p = new_start;
18125 
18126 	/* target_size starts with what's left after trimming */
18127 	target_size = copy_map->size;
18128 	assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
18129 	    "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
18130 	    (uint64_t)target_size, (uint64_t)src_copy_map_size,
18131 	    (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);
18132 
18133 	/* check for misalignments but don't adjust yet */
18134 	misalignments = 0;
18135 	overmap_start = 0;
18136 	overmap_end = 0;
18137 	if (copy_page_shift < target_page_shift) {
18138 		/*
18139 		 * Remapping from 4K to 16K: check the VM object alignments
18140 		 * throughout the range.
18141 		 * If the start and end of the range are mis-aligned, we can
18142 		 * over-map to re-align, and adjust the "overmap" start/end
18143 		 * and "target_size" of the range accordingly.
18144 		 * If there is any mis-alignment within the range:
18145 		 *     if "copy":
18146 		 *         we can do immediate-copy instead of copy-on-write,
18147 		 *     else:
18148 		 *         no way to remap and share; fail.
18149 		 */
18150 		for (entry = vm_map_copy_first_entry(copy_map);
18151 		    entry != vm_map_copy_to_entry(copy_map);
18152 		    entry = entry->vme_next) {
18153 			vm_object_offset_t object_offset_start, object_offset_end;
18154 
18155 			object_offset_start = VME_OFFSET(entry);
18156 			object_offset_end = object_offset_start;
18157 			object_offset_end += entry->vme_end - entry->vme_start;
18158 			if (object_offset_start & target_page_mask) {
18159 				if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
18160 					overmap_start++;
18161 				} else {
18162 					misalignments++;
18163 				}
18164 			}
18165 			if (object_offset_end & target_page_mask) {
18166 				if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
18167 					overmap_end++;
18168 				} else {
18169 					misalignments++;
18170 				}
18171 			}
18172 		}
18173 	}
18174 	entry = VM_MAP_ENTRY_NULL;
18175 
18176 	/* decide how to deal with misalignments */
18177 	assert(overmap_start <= 1);
18178 	assert(overmap_end <= 1);
18179 	if (!overmap_start && !overmap_end && !misalignments) {
18180 		/* copy_map is properly aligned for target_map ... */
18181 		if (*trimmed_start_p) {
18182 			/* ... but we trimmed it, so still need to adjust */
18183 		} else {
18184 			/* ... and we didn't trim anything: we're done */
18185 			if (target_copy_map == VM_MAP_COPY_NULL) {
18186 				target_copy_map = copy_map;
18187 			}
18188 			*target_copy_map_p = target_copy_map;
18189 			*overmap_start_p = 0;
18190 			*overmap_end_p = 0;
18191 			DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18192 			return KERN_SUCCESS;
18193 		}
18194 	} else if (misalignments && !copy) {
18195 		/* can't "share" if misaligned */
18196 		DEBUG4K_ADJUST("unsupported sharing\n");
18197 #if MACH_ASSERT
18198 		if (debug4k_panic_on_misaligned_sharing) {
18199 			panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
18200 		}
18201 #endif /* MACH_ASSERT */
18202 		DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
18203 		return KERN_NOT_SUPPORTED;
18204 	} else {
18205 		/* can't virtual-copy if misaligned (but can physical-copy) */
18206 		DEBUG4K_ADJUST("mis-aligned copying\n");
18207 	}
18208 
18209 	/* get a "target_copy_map" if needed and switch to it */
18210 	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
18211 	copy_map = target_copy_map;
18212 
18213 	if (misalignments && copy) {
18214 		vm_map_size_t target_copy_map_size;
18215 
18216 		/*
18217 		 * Can't do copy-on-write with misaligned mappings.
18218 		 * Replace the mappings with a physical copy of the original
18219 		 * mappings' contents.
18220 		 */
18221 		target_copy_map_size = target_copy_map->size;
18222 		kern_return_t kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
18223 		if (kr != KERN_SUCCESS) {
18224 			return kr;
18225 		}
18226 		*target_copy_map_p = target_copy_map;
18227 		*overmap_start_p = 0;
18228 		*overmap_end_p = target_copy_map->size - target_copy_map_size;
18229 		DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18230 		return KERN_SUCCESS;
18231 	}
18232 
18233 	/* apply the adjustments */
18234 	misalignments = 0;
18235 	overmap_start = 0;
18236 	overmap_end = 0;
18237 	/* remove copy_map->offset, so that everything starts at offset 0 */
18238 	addr_adjustment = copy_map->offset;
18239 	/* also remove whatever we trimmed from the start */
18240 	addr_adjustment += *trimmed_start_p;
18241 	for (target_entry = vm_map_copy_first_entry(target_copy_map);
18242 	    target_entry != vm_map_copy_to_entry(target_copy_map);
18243 	    target_entry = target_entry->vme_next) {
18244 		vm_object_offset_t object_offset_start, object_offset_end;
18245 
18246 		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18247 		object_offset_start = VME_OFFSET(target_entry);
18248 		if (object_offset_start & target_page_mask) {
18249 			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18250 			if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
18251 				/*
18252 				 * start of 1st entry is mis-aligned:
18253 				 * re-adjust by over-mapping.
18254 				 */
18255 				overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
18256 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
18257 				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
18258 			} else {
18259 				misalignments++;
18260 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
18261 				assert(copy);
18262 			}
18263 		}
18264 
18265 		if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
18266 			target_size += overmap_start;
18267 		} else {
18268 			target_entry->vme_start += overmap_start;
18269 		}
18270 		target_entry->vme_end += overmap_start;
18271 
18272 		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
18273 		if (object_offset_end & target_page_mask) {
18274 			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18275 			if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
18276 				/*
18277 				 * end of last entry is mis-aligned: re-adjust by over-mapping.
18278 				 */
18279 				overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
18280 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
18281 				target_entry->vme_end += overmap_end;
18282 				target_size += overmap_end;
18283 			} else {
18284 				misalignments++;
18285 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
18286 				assert(copy);
18287 			}
18288 		}
18289 		target_entry->vme_start -= addr_adjustment;
18290 		target_entry->vme_end -= addr_adjustment;
18291 		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18292 	}
18293 
18294 	target_copy_map->size = target_size;
18295 	target_copy_map->offset += overmap_start;
18296 	target_copy_map->offset -= addr_adjustment;
18297 	target_copy_map->cpy_hdr.page_shift = target_page_shift;
18298 
18299 //	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
18300 //	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
18301 	assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
18302 	assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));
18303 
18304 	*target_copy_map_p = target_copy_map;
18305 	*overmap_start_p = overmap_start;
18306 	*overmap_end_p = overmap_end;
18307 
18308 	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18309 	return KERN_SUCCESS;
18310 }
18311 
18312 kern_return_t
vm_map_range_physical_size(vm_map_t map,vm_map_address_t start,mach_vm_size_t size,mach_vm_size_t * phys_size)18313 vm_map_range_physical_size(
18314 	vm_map_t         map,
18315 	vm_map_address_t start,
18316 	mach_vm_size_t   size,
18317 	mach_vm_size_t * phys_size)
18318 {
18319 	kern_return_t   kr;
18320 	vm_map_copy_t   copy_map, target_copy_map;
18321 	vm_map_offset_t adjusted_start, adjusted_end;
18322 	vm_map_size_t   adjusted_size;
18323 	vm_prot_t       cur_prot, max_prot;
18324 	vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
18325 	vm_map_kernel_flags_t vmk_flags;
18326 
18327 	if (size == 0) {
18328 		DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
18329 		*phys_size = 0;
18330 		return KERN_SUCCESS;
18331 	}
18332 
18333 	adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
18334 	adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
18335 	if (__improbable(os_add_overflow(start, size, &end) ||
18336 	    adjusted_end <= adjusted_start)) {
18337 		/* wraparound */
18338 		printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
18339 		*phys_size = 0;
18340 		return KERN_INVALID_ARGUMENT;
18341 	}
18342 	if (__improbable(vm_map_range_overflows(map, start, size))) {
18343 		*phys_size = 0;
18344 		return KERN_INVALID_ADDRESS;
18345 	}
18346 	assert(adjusted_end > adjusted_start);
18347 	adjusted_size = adjusted_end - adjusted_start;
18348 	*phys_size = adjusted_size;
18349 	if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
18350 		return KERN_SUCCESS;
18351 	}
18352 	if (start == 0) {
18353 		adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
18354 		adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
18355 		if (__improbable(adjusted_end <= adjusted_start)) {
18356 			/* wraparound */
18357 			printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
18358 			*phys_size = 0;
18359 			return KERN_INVALID_ARGUMENT;
18360 		}
18361 		assert(adjusted_end > adjusted_start);
18362 		adjusted_size = adjusted_end - adjusted_start;
18363 		*phys_size = adjusted_size;
18364 		return KERN_SUCCESS;
18365 	}
18366 
18367 	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
18368 	vmk_flags.vmkf_copy_pageable = TRUE;
18369 	vmk_flags.vmkf_copy_same_map = TRUE;
18370 	assert(adjusted_size != 0);
18371 	cur_prot = VM_PROT_NONE; /* legacy mode */
18372 	max_prot = VM_PROT_NONE; /* legacy mode */
18373 	kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
18374 	    FALSE /* copy */,
18375 	    &copy_map,
18376 	    &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
18377 	    vmk_flags);
18378 	if (kr != KERN_SUCCESS) {
18379 		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
18380 		//assert(0);
18381 		*phys_size = 0;
18382 		return kr;
18383 	}
18384 	assert(copy_map != VM_MAP_COPY_NULL);
18385 	target_copy_map = copy_map;
18386 	DEBUG4K_ADJUST("adjusting...\n");
18387 	kr = vm_map_copy_adjust_to_target(
18388 		copy_map,
18389 		start - adjusted_start, /* offset */
18390 		size, /* size */
18391 		kernel_map,
18392 		FALSE,                          /* copy */
18393 		&target_copy_map,
18394 		&overmap_start,
18395 		&overmap_end,
18396 		&trimmed_start);
18397 	if (kr == KERN_SUCCESS) {
18398 		if (target_copy_map->size != *phys_size) {
18399 			DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
18400 		}
18401 		*phys_size = target_copy_map->size;
18402 	} else {
18403 		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
18404 		//assert(0);
18405 		*phys_size = 0;
18406 	}
18407 	vm_map_copy_discard(copy_map);
18408 	copy_map = VM_MAP_COPY_NULL;
18409 
18410 	return kr;
18411 }
18412 
18413 
18414 kern_return_t
memory_entry_check_for_adjustment(vm_map_t src_map,ipc_port_t port,vm_map_offset_t * overmap_start,vm_map_offset_t * overmap_end)18415 memory_entry_check_for_adjustment(
18416 	vm_map_t                        src_map,
18417 	ipc_port_t                      port,
18418 	vm_map_offset_t         *overmap_start,
18419 	vm_map_offset_t         *overmap_end)
18420 {
18421 	kern_return_t kr = KERN_SUCCESS;
18422 	vm_map_copy_t copy_map = VM_MAP_COPY_NULL, target_copy_map = VM_MAP_COPY_NULL;
18423 
18424 	assert(port);
18425 	assertf(ip_kotype(port) == IKOT_NAMED_ENTRY, "Port Type expected: %d...received:%d\n", IKOT_NAMED_ENTRY, ip_kotype(port));
18426 
18427 	vm_named_entry_t        named_entry;
18428 
18429 	named_entry = mach_memory_entry_from_port(port);
18430 	named_entry_lock(named_entry);
18431 	copy_map = named_entry->backing.copy;
18432 	target_copy_map = copy_map;
18433 
18434 	if (src_map && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT) {
18435 		vm_map_offset_t trimmed_start;
18436 
18437 		trimmed_start = 0;
18438 		DEBUG4K_ADJUST("adjusting...\n");
18439 		kr = vm_map_copy_adjust_to_target(
18440 			copy_map,
18441 			0, /* offset */
18442 			copy_map->size, /* size */
18443 			src_map,
18444 			FALSE, /* copy */
18445 			&target_copy_map,
18446 			overmap_start,
18447 			overmap_end,
18448 			&trimmed_start);
18449 		assert(trimmed_start == 0);
18450 	}
18451 	named_entry_unlock(named_entry);
18452 
18453 	return kr;
18454 }
18455 
18456 
18457 /*
18458  *	Routine:	vm_remap
18459  *
18460  *			Map portion of a task's address space.
18461  *			Mapped region must not overlap more than
18462  *			one vm memory object. Protections and
18463  *			inheritance attributes remain the same
18464  *			as in the original task and are	out parameters.
18465  *			Source and Target task can be identical
18466  *			Other attributes are identical as for vm_map()
18467  */
18468 kern_return_t
vm_map_remap(vm_map_t target_map,vm_map_address_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_t src_map,vm_map_offset_t memory_address,boolean_t copy,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance)18469 vm_map_remap(
18470 	vm_map_t                target_map,
18471 	vm_map_address_t        *address,
18472 	vm_map_size_t           size,
18473 	vm_map_offset_t         mask,
18474 	vm_map_kernel_flags_t   vmk_flags,
18475 	vm_map_t                src_map,
18476 	vm_map_offset_t         memory_address,
18477 	boolean_t               copy,
18478 	vm_prot_t               *cur_protection, /* IN/OUT */
18479 	vm_prot_t               *max_protection, /* IN/OUT */
18480 	vm_inherit_t            inheritance)
18481 {
18482 	kern_return_t           result;
18483 	vm_map_entry_t          entry;
18484 	vm_map_entry_t          insp_entry = VM_MAP_ENTRY_NULL;
18485 	vm_map_entry_t          new_entry;
18486 	vm_map_copy_t           copy_map;
18487 	vm_map_offset_t         offset_in_mapping;
18488 	vm_map_size_t           target_size = 0;
18489 	vm_map_size_t           src_page_mask, target_page_mask;
18490 	vm_map_offset_t         overmap_start, overmap_end, trimmed_start;
18491 	vm_map_offset_t         initial_memory_address;
18492 	vm_map_size_t           initial_size;
18493 	VM_MAP_ZAP_DECLARE(zap_list);
18494 
18495 	if (target_map == VM_MAP_NULL) {
18496 		return KERN_INVALID_ARGUMENT;
18497 	}
18498 
18499 	if (__improbable(vm_map_range_overflows(src_map, memory_address, size))) {
18500 		return KERN_INVALID_ARGUMENT;
18501 	}
18502 
18503 	initial_memory_address = memory_address;
18504 	initial_size = size;
18505 	src_page_mask = VM_MAP_PAGE_MASK(src_map);
18506 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
18507 
18508 	switch (inheritance) {
18509 	case VM_INHERIT_NONE:
18510 	case VM_INHERIT_COPY:
18511 	case VM_INHERIT_SHARE:
18512 		if (size != 0 && src_map != VM_MAP_NULL) {
18513 			break;
18514 		}
18515 		OS_FALLTHROUGH;
18516 	default:
18517 		return KERN_INVALID_ARGUMENT;
18518 	}
18519 
18520 	if (src_page_mask != target_page_mask) {
18521 		if (copy) {
18522 			DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
18523 		} else {
18524 			DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
18525 		}
18526 	}
18527 
18528 	/*
18529 	 * If the user is requesting that we return the address of the
18530 	 * first byte of the data (rather than the base of the page),
18531 	 * then we use different rounding semantics: specifically,
18532 	 * we assume that (memory_address, size) describes a region
18533 	 * all of whose pages we must cover, rather than a base to be truncated
18534 	 * down and a size to be added to that base.  So we figure out
18535 	 * the highest page that the requested region includes and make
18536 	 * sure that the size will cover it.
18537 	 *
18538 	 * The key example we're worried about it is of the form:
18539 	 *
18540 	 *              memory_address = 0x1ff0, size = 0x20
18541 	 *
18542 	 * With the old semantics, we round down the memory_address to 0x1000
18543 	 * and round up the size to 0x1000, resulting in our covering *only*
18544 	 * page 0x1000.  With the new semantics, we'd realize that the region covers
18545 	 * 0x1ff0-0x2010, and compute a size of 0x2000.  Thus, we cover both page
18546 	 * 0x1000 and page 0x2000 in the region we remap.
18547 	 */
18548 	if (vmk_flags.vmf_return_data_addr) {
18549 		vm_map_offset_t range_start, range_end;
18550 
18551 		range_start = vm_map_trunc_page(memory_address, src_page_mask);
18552 		range_end = vm_map_round_page(memory_address + size, src_page_mask);
18553 		memory_address = range_start;
18554 		size = range_end - range_start;
18555 		offset_in_mapping = initial_memory_address - memory_address;
18556 	} else {
18557 		/*
18558 		 * IMPORTANT:
18559 		 * This legacy code path is broken: for the range mentioned
18560 		 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
18561 		 * two 4k pages, it yields [ memory_address = 0x1000,
18562 		 * size = 0x1000 ], which covers only the first 4k page.
18563 		 * BUT some code unfortunately depends on this bug, so we
18564 		 * can't fix it without breaking something.
18565 		 * New code should get automatically opted in the new
18566 		 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flags.
18567 		 */
18568 		offset_in_mapping = 0;
18569 		memory_address = vm_map_trunc_page(memory_address, src_page_mask);
18570 		size = vm_map_round_page(size, src_page_mask);
18571 		initial_memory_address = memory_address;
18572 		initial_size = size;
18573 	}
18574 
18575 
18576 	if (size == 0) {
18577 		return KERN_INVALID_ARGUMENT;
18578 	}
18579 
18580 	if (vmk_flags.vmf_resilient_media) {
18581 		/* must be copy-on-write to be "media resilient" */
18582 		if (!copy) {
18583 			return KERN_INVALID_ARGUMENT;
18584 		}
18585 	}
18586 
18587 	vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
18588 	vmk_flags.vmkf_copy_same_map = (src_map == target_map);
18589 
18590 	assert(size != 0);
18591 	result = vm_map_copy_extract(src_map,
18592 	    memory_address,
18593 	    size,
18594 	    copy, &copy_map,
18595 	    cur_protection, /* IN/OUT */
18596 	    max_protection, /* IN/OUT */
18597 	    inheritance,
18598 	    vmk_flags);
18599 	if (result != KERN_SUCCESS) {
18600 		return result;
18601 	}
18602 	assert(copy_map != VM_MAP_COPY_NULL);
18603 
18604 	/*
18605 	 * Handle the policy for vm map ranges
18606 	 *
18607 	 * If the maps differ, the target_map policy applies like for vm_map()
18608 	 * For same mapping remaps, we preserve the range.
18609 	 */
18610 	if (vmk_flags.vmkf_copy_same_map) {
18611 		vmk_flags.vmkf_range_id = copy_map->orig_range;
18612 	} else {
18613 		vm_map_kernel_flags_update_range_id(&vmk_flags, target_map);
18614 	}
18615 
18616 	overmap_start = 0;
18617 	overmap_end = 0;
18618 	trimmed_start = 0;
18619 	target_size = size;
18620 	if (src_page_mask != target_page_mask) {
18621 		vm_map_copy_t target_copy_map;
18622 
18623 		target_copy_map = copy_map; /* can modify "copy_map" itself */
18624 		DEBUG4K_ADJUST("adjusting...\n");
18625 		result = vm_map_copy_adjust_to_target(
18626 			copy_map,
18627 			offset_in_mapping, /* offset */
18628 			initial_size,
18629 			target_map,
18630 			copy,
18631 			&target_copy_map,
18632 			&overmap_start,
18633 			&overmap_end,
18634 			&trimmed_start);
18635 		if (result != KERN_SUCCESS) {
18636 			DEBUG4K_COPY("failed to adjust 0x%x\n", result);
18637 			vm_map_copy_discard(copy_map);
18638 			return result;
18639 		}
18640 		if (trimmed_start == 0) {
18641 			/* nothing trimmed: no adjustment needed */
18642 		} else if (trimmed_start >= offset_in_mapping) {
18643 			/* trimmed more than offset_in_mapping: nothing left */
18644 			assert(overmap_start == 0);
18645 			assert(overmap_end == 0);
18646 			offset_in_mapping = 0;
18647 		} else {
18648 			/* trimmed some of offset_in_mapping: adjust */
18649 			assert(overmap_start == 0);
18650 			assert(overmap_end == 0);
18651 			offset_in_mapping -= trimmed_start;
18652 		}
18653 		offset_in_mapping += overmap_start;
18654 		target_size = target_copy_map->size;
18655 	}
18656 
18657 	/*
18658 	 * Allocate/check a range of free virtual address
18659 	 * space for the target
18660 	 */
18661 	*address = vm_map_trunc_page(*address, target_page_mask);
18662 	vm_map_lock(target_map);
18663 	target_size = vm_map_round_page(target_size, target_page_mask);
18664 	result = vm_map_remap_range_allocate(target_map, address,
18665 	    target_size, mask, vmk_flags,
18666 	    &insp_entry, &zap_list);
18667 
18668 	for (entry = vm_map_copy_first_entry(copy_map);
18669 	    entry != vm_map_copy_to_entry(copy_map);
18670 	    entry = new_entry) {
18671 		new_entry = entry->vme_next;
18672 		vm_map_copy_entry_unlink(copy_map, entry);
18673 		if (result == KERN_SUCCESS) {
18674 			if (vmk_flags.vmkf_remap_prot_copy) {
18675 				/*
18676 				 * This vm_map_remap() is for a
18677 				 * vm_protect(VM_PROT_COPY), so the caller
18678 				 * expects to be allowed to add write access
18679 				 * to this new mapping.  This is done by
18680 				 * adding VM_PROT_WRITE to each entry's
18681 				 * max_protection... unless some security
18682 				 * settings disallow it.
18683 				 */
18684 				bool allow_write = false;
18685 				if (entry->vme_permanent) {
18686 					/* immutable mapping... */
18687 					if ((entry->max_protection & VM_PROT_EXECUTE) &&
18688 					    developer_mode_state()) {
18689 						/*
18690 						 * ... but executable and
18691 						 * possibly being debugged,
18692 						 * so let's allow it to become
18693 						 * writable, for breakpoints
18694 						 * and dtrace probes, for
18695 						 * example.
18696 						 */
18697 						allow_write = true;
18698 					} else {
18699 						printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
18700 						    proc_selfpid(),
18701 						    (get_bsdtask_info(current_task())
18702 						    ? proc_name_address(get_bsdtask_info(current_task()))
18703 						    : "?"),
18704 						    (uint64_t)memory_address,
18705 						    (uint64_t)size,
18706 						    entry->protection,
18707 						    entry->max_protection,
18708 						    developer_mode_state());
18709 						DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
18710 						    vm_map_entry_t, entry,
18711 						    vm_map_offset_t, entry->vme_start,
18712 						    vm_map_offset_t, entry->vme_end,
18713 						    vm_prot_t, entry->protection,
18714 						    vm_prot_t, entry->max_protection,
18715 						    int, VME_ALIAS(entry));
18716 					}
18717 				} else {
18718 					allow_write = true;
18719 				}
18720 
18721 				/*
18722 				 * VM_PROT_COPY: allow this mapping to become
18723 				 * writable, unless it was "permanent".
18724 				 */
18725 				if (allow_write) {
18726 					entry->max_protection |= VM_PROT_WRITE;
18727 				}
18728 			}
18729 			if (vmk_flags.vmf_resilient_codesign) {
18730 				/* no codesigning -> read-only access */
18731 				entry->max_protection = VM_PROT_READ;
18732 				entry->protection = VM_PROT_READ;
18733 				entry->vme_resilient_codesign = TRUE;
18734 			}
18735 			entry->vme_start += *address;
18736 			entry->vme_end += *address;
18737 			assert(!entry->map_aligned);
18738 			if (vmk_flags.vmf_resilient_media &&
18739 			    !entry->is_sub_map &&
18740 			    (VME_OBJECT(entry) == VM_OBJECT_NULL ||
18741 			    VME_OBJECT(entry)->internal)) {
18742 				entry->vme_resilient_media = TRUE;
18743 			}
18744 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
18745 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
18746 			assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
18747 			vm_map_store_entry_link(target_map, insp_entry, entry,
18748 			    vmk_flags);
18749 			insp_entry = entry;
18750 		} else {
18751 			if (!entry->is_sub_map) {
18752 				vm_object_deallocate(VME_OBJECT(entry));
18753 			} else {
18754 				vm_map_deallocate(VME_SUBMAP(entry));
18755 			}
18756 			vm_map_copy_entry_dispose(entry);
18757 		}
18758 	}
18759 
18760 	if (vmk_flags.vmf_resilient_codesign) {
18761 		*cur_protection = VM_PROT_READ;
18762 		*max_protection = VM_PROT_READ;
18763 	}
18764 
18765 	if (result == KERN_SUCCESS) {
18766 		target_map->size += target_size;
18767 		SAVE_HINT_MAP_WRITE(target_map, insp_entry);
18768 	}
18769 	vm_map_unlock(target_map);
18770 
18771 	vm_map_zap_dispose(&zap_list);
18772 
18773 	if (result == KERN_SUCCESS && target_map->wiring_required) {
18774 		result = vm_map_wire_kernel(target_map, *address,
18775 		    *address + size, *cur_protection, VM_KERN_MEMORY_MLOCK,
18776 		    TRUE);
18777 	}
18778 
18779 	/*
18780 	 * If requested, return the address of the data pointed to by the
18781 	 * request, rather than the base of the resulting page.
18782 	 */
18783 	if (vmk_flags.vmf_return_data_addr) {
18784 		*address += offset_in_mapping;
18785 	}
18786 
18787 	if (src_page_mask != target_page_mask) {
18788 		DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx  result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)size, copy, target_map, (uint64_t)*address, (uint64_t)offset_in_mapping, result);
18789 	}
18790 	vm_map_copy_discard(copy_map);
18791 	copy_map = VM_MAP_COPY_NULL;
18792 
18793 	return result;
18794 }
18795 
18796 /*
18797  *	Routine:	vm_map_remap_range_allocate
18798  *
18799  *	Description:
18800  *		Allocate a range in the specified virtual address map.
18801  *		returns the address and the map entry just before the allocated
18802  *		range
18803  *
18804  *	Map must be locked.
18805  */
18806 
18807 static kern_return_t
vm_map_remap_range_allocate(vm_map_t map,vm_map_address_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_entry_t * map_entry,vm_map_zap_t zap_list)18808 vm_map_remap_range_allocate(
18809 	vm_map_t                map,
18810 	vm_map_address_t        *address,       /* IN/OUT */
18811 	vm_map_size_t           size,
18812 	vm_map_offset_t         mask,
18813 	vm_map_kernel_flags_t   vmk_flags,
18814 	vm_map_entry_t          *map_entry,     /* OUT */
18815 	vm_map_zap_t            zap_list)
18816 {
18817 	vm_map_entry_t  entry;
18818 	vm_map_offset_t start;
18819 	kern_return_t   kr;
18820 
18821 	start = *address;
18822 
18823 	if (!vmk_flags.vmf_fixed) {
18824 		kr = vm_map_locate_space(map, size, mask, vmk_flags,
18825 		    &start, &entry);
18826 		if (kr != KERN_SUCCESS) {
18827 			return kr;
18828 		}
18829 		*address = start;
18830 	} else {
18831 		vm_map_offset_t effective_min_offset, effective_max_offset;
18832 		vm_map_entry_t  temp_entry;
18833 		vm_map_offset_t end;
18834 
18835 		effective_min_offset = map->min_offset;
18836 		effective_max_offset = map->max_offset;
18837 
18838 		/*
18839 		 *	Verify that:
18840 		 *		the address doesn't itself violate
18841 		 *		the mask requirement.
18842 		 */
18843 
18844 		if ((start & mask) != 0) {
18845 			return KERN_NO_SPACE;
18846 		}
18847 
18848 #if CONFIG_MAP_RANGES
18849 		if (map->uses_user_ranges) {
18850 			struct mach_vm_range r;
18851 
18852 			vm_map_user_range_resolve(map, start, 1, &r);
18853 			if (r.max_address == 0) {
18854 				return KERN_INVALID_ADDRESS;
18855 			}
18856 
18857 			effective_min_offset = r.min_address;
18858 			effective_max_offset = r.max_address;
18859 		}
18860 #endif /* CONFIG_MAP_RANGES */
18861 		if (map == kernel_map) {
18862 			mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
18863 			effective_min_offset = r->min_address;
18864 			effective_min_offset = r->max_address;
18865 		}
18866 
18867 		/*
18868 		 *	...	the address is within bounds
18869 		 */
18870 
18871 		end = start + size;
18872 
18873 		if ((start < effective_min_offset) ||
18874 		    (end > effective_max_offset) ||
18875 		    (start >= end)) {
18876 			return KERN_INVALID_ADDRESS;
18877 		}
18878 
18879 		/*
18880 		 * If we're asked to overwrite whatever was mapped in that
18881 		 * range, first deallocate that range.
18882 		 */
18883 		if (vmk_flags.vmf_overwrite) {
18884 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
18885 
18886 			/*
18887 			 * We use a "zap_list" to avoid having to unlock
18888 			 * the "map" in vm_map_delete(), which would compromise
18889 			 * the atomicity of the "deallocate" and then "remap"
18890 			 * combination.
18891 			 */
18892 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
18893 
18894 			if (vmk_flags.vmkf_overwrite_immutable) {
18895 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
18896 			}
18897 			if (vmk_flags.vmkf_remap_prot_copy) {
18898 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
18899 			}
18900 			kr = vm_map_delete(map, start, end, remove_flags,
18901 			    KMEM_GUARD_NONE, zap_list).kmr_return;
18902 			if (kr != KERN_SUCCESS) {
18903 				/* XXX FBDP restore zap_list? */
18904 				return kr;
18905 			}
18906 		}
18907 
18908 		/*
18909 		 *	...	the starting address isn't allocated
18910 		 */
18911 
18912 		if (vm_map_lookup_entry(map, start, &temp_entry)) {
18913 			return KERN_NO_SPACE;
18914 		}
18915 
18916 		entry = temp_entry;
18917 
18918 		/*
18919 		 *	...	the next region doesn't overlap the
18920 		 *		end point.
18921 		 */
18922 
18923 		if ((entry->vme_next != vm_map_to_entry(map)) &&
18924 		    (entry->vme_next->vme_start < end)) {
18925 			return KERN_NO_SPACE;
18926 		}
18927 	}
18928 	*map_entry = entry;
18929 	return KERN_SUCCESS;
18930 }
18931 
18932 /*
18933  *	vm_map_switch:
18934  *
18935  *	Set the address map for the current thread to the specified map
18936  */
18937 
18938 vm_map_t
vm_map_switch(vm_map_t map)18939 vm_map_switch(
18940 	vm_map_t        map)
18941 {
18942 	thread_t        thread = current_thread();
18943 	vm_map_t        oldmap = thread->map;
18944 
18945 
18946 	/*
18947 	 *	Deactivate the current map and activate the requested map
18948 	 */
18949 	mp_disable_preemption();
18950 	PMAP_SWITCH_USER(thread, map, cpu_number());
18951 	mp_enable_preemption();
18952 	return oldmap;
18953 }
18954 
18955 
18956 /*
18957  *	Routine:	vm_map_write_user
18958  *
18959  *	Description:
18960  *		Copy out data from a kernel space into space in the
18961  *		destination map. The space must already exist in the
18962  *		destination map.
18963  *		NOTE:  This routine should only be called by threads
18964  *		which can block on a page fault. i.e. kernel mode user
18965  *		threads.
18966  *
18967  */
18968 kern_return_t
vm_map_write_user(vm_map_t map,void * src_p,vm_map_address_t dst_addr,vm_size_t size)18969 vm_map_write_user(
18970 	vm_map_t                map,
18971 	void                    *src_p,
18972 	vm_map_address_t        dst_addr,
18973 	vm_size_t               size)
18974 {
18975 	kern_return_t   kr = KERN_SUCCESS;
18976 
18977 	if (__improbable(vm_map_range_overflows(map, dst_addr, size))) {
18978 		return KERN_INVALID_ADDRESS;
18979 	}
18980 
18981 	if (current_map() == map) {
18982 		if (copyout(src_p, dst_addr, size)) {
18983 			kr = KERN_INVALID_ADDRESS;
18984 		}
18985 	} else {
18986 		vm_map_t        oldmap;
18987 
18988 		/* take on the identity of the target map while doing */
18989 		/* the transfer */
18990 
18991 		vm_map_reference(map);
18992 		oldmap = vm_map_switch(map);
18993 		if (copyout(src_p, dst_addr, size)) {
18994 			kr = KERN_INVALID_ADDRESS;
18995 		}
18996 		vm_map_switch(oldmap);
18997 		vm_map_deallocate(map);
18998 	}
18999 	return kr;
19000 }
19001 
19002 /*
19003  *	Routine:	vm_map_read_user
19004  *
19005  *	Description:
19006  *		Copy in data from a user space source map into the
19007  *		kernel map. The space must already exist in the
19008  *		kernel map.
19009  *		NOTE:  This routine should only be called by threads
19010  *		which can block on a page fault. i.e. kernel mode user
19011  *		threads.
19012  *
19013  */
19014 kern_return_t
vm_map_read_user(vm_map_t map,vm_map_address_t src_addr,void * dst_p,vm_size_t size)19015 vm_map_read_user(
19016 	vm_map_t                map,
19017 	vm_map_address_t        src_addr,
19018 	void                    *dst_p,
19019 	vm_size_t               size)
19020 {
19021 	kern_return_t   kr = KERN_SUCCESS;
19022 
19023 	if (__improbable(vm_map_range_overflows(map, src_addr, size))) {
19024 		return KERN_INVALID_ADDRESS;
19025 	}
19026 
19027 	if (current_map() == map) {
19028 		if (copyin(src_addr, dst_p, size)) {
19029 			kr = KERN_INVALID_ADDRESS;
19030 		}
19031 	} else {
19032 		vm_map_t        oldmap;
19033 
19034 		/* take on the identity of the target map while doing */
19035 		/* the transfer */
19036 
19037 		vm_map_reference(map);
19038 		oldmap = vm_map_switch(map);
19039 		if (copyin(src_addr, dst_p, size)) {
19040 			kr = KERN_INVALID_ADDRESS;
19041 		}
19042 		vm_map_switch(oldmap);
19043 		vm_map_deallocate(map);
19044 	}
19045 	return kr;
19046 }
19047 
19048 
19049 /*
19050  *	vm_map_check_protection:
19051  *
19052  *	Assert that the target map allows the specified
19053  *	privilege on the entire address region given.
19054  *	The entire region must be allocated.
19055  */
19056 boolean_t
vm_map_check_protection(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t protection)19057 vm_map_check_protection(vm_map_t map, vm_map_offset_t start,
19058     vm_map_offset_t end, vm_prot_t protection)
19059 {
19060 	vm_map_entry_t entry;
19061 	vm_map_entry_t tmp_entry;
19062 
19063 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
19064 		return FALSE;
19065 	}
19066 
19067 	vm_map_lock(map);
19068 
19069 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
19070 		vm_map_unlock(map);
19071 		return FALSE;
19072 	}
19073 
19074 	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19075 		vm_map_unlock(map);
19076 		return FALSE;
19077 	}
19078 
19079 	entry = tmp_entry;
19080 
19081 	while (start < end) {
19082 		if (entry == vm_map_to_entry(map)) {
19083 			vm_map_unlock(map);
19084 			return FALSE;
19085 		}
19086 
19087 		/*
19088 		 *	No holes allowed!
19089 		 */
19090 
19091 		if (start < entry->vme_start) {
19092 			vm_map_unlock(map);
19093 			return FALSE;
19094 		}
19095 
19096 		/*
19097 		 * Check protection associated with entry.
19098 		 */
19099 
19100 		if ((entry->protection & protection) != protection) {
19101 			vm_map_unlock(map);
19102 			return FALSE;
19103 		}
19104 
19105 		/* go to next entry */
19106 
19107 		start = entry->vme_end;
19108 		entry = entry->vme_next;
19109 	}
19110 	vm_map_unlock(map);
19111 	return TRUE;
19112 }
19113 
19114 kern_return_t
vm_map_purgable_control(vm_map_t map,vm_map_offset_t address,vm_purgable_t control,int * state)19115 vm_map_purgable_control(
19116 	vm_map_t                map,
19117 	vm_map_offset_t         address,
19118 	vm_purgable_t           control,
19119 	int                     *state)
19120 {
19121 	vm_map_entry_t          entry;
19122 	vm_object_t             object;
19123 	kern_return_t           kr;
19124 	boolean_t               was_nonvolatile;
19125 
19126 	/*
19127 	 * Vet all the input parameters and current type and state of the
19128 	 * underlaying object.  Return with an error if anything is amiss.
19129 	 */
19130 	if (map == VM_MAP_NULL) {
19131 		return KERN_INVALID_ARGUMENT;
19132 	}
19133 
19134 	if (control != VM_PURGABLE_SET_STATE &&
19135 	    control != VM_PURGABLE_GET_STATE &&
19136 	    control != VM_PURGABLE_PURGE_ALL &&
19137 	    control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
19138 		return KERN_INVALID_ARGUMENT;
19139 	}
19140 
19141 	if (control == VM_PURGABLE_PURGE_ALL) {
19142 		vm_purgeable_object_purge_all();
19143 		return KERN_SUCCESS;
19144 	}
19145 
19146 	if ((control == VM_PURGABLE_SET_STATE ||
19147 	    control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
19148 	    (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
19149 	    ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
19150 		return KERN_INVALID_ARGUMENT;
19151 	}
19152 
19153 	vm_map_lock_read(map);
19154 
19155 	if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
19156 		/*
19157 		 * Must pass a valid non-submap address.
19158 		 */
19159 		vm_map_unlock_read(map);
19160 		return KERN_INVALID_ADDRESS;
19161 	}
19162 
19163 	if ((entry->protection & VM_PROT_WRITE) == 0 &&
19164 	    control != VM_PURGABLE_GET_STATE) {
19165 		/*
19166 		 * Can't apply purgable controls to something you can't write.
19167 		 */
19168 		vm_map_unlock_read(map);
19169 		return KERN_PROTECTION_FAILURE;
19170 	}
19171 
19172 	object = VME_OBJECT(entry);
19173 	if (object == VM_OBJECT_NULL ||
19174 	    object->purgable == VM_PURGABLE_DENY) {
19175 		/*
19176 		 * Object must already be present and be purgeable.
19177 		 */
19178 		vm_map_unlock_read(map);
19179 		return KERN_INVALID_ARGUMENT;
19180 	}
19181 
19182 	vm_object_lock(object);
19183 
19184 #if 00
19185 	if (VME_OFFSET(entry) != 0 ||
19186 	    entry->vme_end - entry->vme_start != object->vo_size) {
19187 		/*
19188 		 * Can only apply purgable controls to the whole (existing)
19189 		 * object at once.
19190 		 */
19191 		vm_map_unlock_read(map);
19192 		vm_object_unlock(object);
19193 		return KERN_INVALID_ARGUMENT;
19194 	}
19195 #endif
19196 
19197 	assert(!entry->is_sub_map);
19198 	assert(!entry->use_pmap); /* purgeable has its own accounting */
19199 
19200 	vm_map_unlock_read(map);
19201 
19202 	was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);
19203 
19204 	kr = vm_object_purgable_control(object, control, state);
19205 
19206 	if (was_nonvolatile &&
19207 	    object->purgable != VM_PURGABLE_NONVOLATILE &&
19208 	    map->pmap == kernel_pmap) {
19209 #if DEBUG
19210 		object->vo_purgeable_volatilizer = kernel_task;
19211 #endif /* DEBUG */
19212 	}
19213 
19214 	vm_object_unlock(object);
19215 
19216 	return kr;
19217 }
19218 
19219 void
vm_map_footprint_query_page_info(vm_map_t map,vm_map_entry_t map_entry,vm_map_offset_t curr_s_offset,int * disposition_p)19220 vm_map_footprint_query_page_info(
19221 	vm_map_t        map,
19222 	vm_map_entry_t  map_entry,
19223 	vm_map_offset_t curr_s_offset,
19224 	int             *disposition_p)
19225 {
19226 	int             pmap_disp;
19227 	vm_object_t     object = VM_OBJECT_NULL;
19228 	int             disposition;
19229 	int             effective_page_size;
19230 
19231 	vm_map_lock_assert_held(map);
19232 	assert(!map->has_corpse_footprint);
19233 	assert(curr_s_offset >= map_entry->vme_start);
19234 	assert(curr_s_offset < map_entry->vme_end);
19235 
19236 	if (map_entry->is_sub_map) {
19237 		if (!map_entry->use_pmap) {
19238 			/* nested pmap: no footprint */
19239 			*disposition_p = 0;
19240 			return;
19241 		}
19242 	} else {
19243 		object = VME_OBJECT(map_entry);
19244 		if (object == VM_OBJECT_NULL) {
19245 			/* nothing mapped here: no need to ask */
19246 			*disposition_p = 0;
19247 			return;
19248 		}
19249 	}
19250 
19251 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
19252 
19253 	pmap_disp = 0;
19254 
19255 	/*
19256 	 * Query the pmap.
19257 	 */
19258 	pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);
19259 
19260 	/*
19261 	 * Compute this page's disposition.
19262 	 */
19263 	disposition = 0;
19264 
19265 	/* deal with "alternate accounting" first */
19266 	if (!map_entry->is_sub_map &&
19267 	    object->vo_no_footprint) {
19268 		/* does not count in footprint */
19269 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19270 	} else if (!map_entry->is_sub_map &&
19271 	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
19272 	    (object->purgable == VM_PURGABLE_DENY &&
19273 	    object->vo_ledger_tag)) &&
19274 	    VM_OBJECT_OWNER(object) != NULL &&
19275 	    VM_OBJECT_OWNER(object)->map == map) {
19276 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19277 		if ((((curr_s_offset
19278 		    - map_entry->vme_start
19279 		    + VME_OFFSET(map_entry))
19280 		    / effective_page_size) <
19281 		    (object->resident_page_count +
19282 		    vm_compressor_pager_get_count(object->pager)))) {
19283 			/*
19284 			 * Non-volatile purgeable object owned
19285 			 * by this task: report the first
19286 			 * "#resident + #compressed" pages as
19287 			 * "resident" (to show that they
19288 			 * contribute to the footprint) but not
19289 			 * "dirty" (to avoid double-counting
19290 			 * with the fake "non-volatile" region
19291 			 * we'll report at the end of the
19292 			 * address space to account for all
19293 			 * (mapped or not) non-volatile memory
19294 			 * owned by this task.
19295 			 */
19296 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19297 		}
19298 	} else if (!map_entry->is_sub_map &&
19299 	    (object->purgable == VM_PURGABLE_VOLATILE ||
19300 	    object->purgable == VM_PURGABLE_EMPTY) &&
19301 	    VM_OBJECT_OWNER(object) != NULL &&
19302 	    VM_OBJECT_OWNER(object)->map == map) {
19303 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19304 		if ((((curr_s_offset
19305 		    - map_entry->vme_start
19306 		    + VME_OFFSET(map_entry))
19307 		    / effective_page_size) <
19308 		    object->wired_page_count)) {
19309 			/*
19310 			 * Volatile|empty purgeable object owned
19311 			 * by this task: report the first
19312 			 * "#wired" pages as "resident" (to
19313 			 * show that they contribute to the
19314 			 * footprint) but not "dirty" (to avoid
19315 			 * double-counting with the fake
19316 			 * "non-volatile" region we'll report
19317 			 * at the end of the address space to
19318 			 * account for all (mapped or not)
19319 			 * non-volatile memory owned by this
19320 			 * task.
19321 			 */
19322 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19323 		}
19324 	} else if (!map_entry->is_sub_map &&
19325 	    map_entry->iokit_acct &&
19326 	    object->internal &&
19327 	    object->purgable == VM_PURGABLE_DENY) {
19328 		/*
19329 		 * Non-purgeable IOKit memory: phys_footprint
19330 		 * includes the entire virtual mapping.
19331 		 */
19332 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19333 		disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19334 		disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19335 	} else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
19336 	    PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
19337 		/* alternate accounting */
19338 #if __arm64__ && (DEVELOPMENT || DEBUG)
19339 		if (map->pmap->footprint_was_suspended) {
19340 			/*
19341 			 * The assertion below can fail if dyld
19342 			 * suspended footprint accounting
19343 			 * while doing some adjustments to
19344 			 * this page;  the mapping would say
19345 			 * "use pmap accounting" but the page
19346 			 * would be marked "alternate
19347 			 * accounting".
19348 			 */
19349 		} else
19350 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
19351 		{
19352 			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19353 		}
19354 		disposition = 0;
19355 	} else {
19356 		if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
19357 			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19358 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19359 			disposition |= VM_PAGE_QUERY_PAGE_REF;
19360 			if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
19361 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19362 			} else {
19363 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
19364 			}
19365 			if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
19366 				disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
19367 			}
19368 		} else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
19369 			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19370 			disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
19371 		}
19372 	}
19373 
19374 	*disposition_p = disposition;
19375 }
19376 
19377 kern_return_t
vm_map_page_query_internal(vm_map_t target_map,vm_map_offset_t offset,int * disposition,int * ref_count)19378 vm_map_page_query_internal(
19379 	vm_map_t        target_map,
19380 	vm_map_offset_t offset,
19381 	int             *disposition,
19382 	int             *ref_count)
19383 {
19384 	kern_return_t                   kr;
19385 	vm_page_info_basic_data_t       info;
19386 	mach_msg_type_number_t          count;
19387 
19388 	count = VM_PAGE_INFO_BASIC_COUNT;
19389 	kr = vm_map_page_info(target_map,
19390 	    offset,
19391 	    VM_PAGE_INFO_BASIC,
19392 	    (vm_page_info_t) &info,
19393 	    &count);
19394 	if (kr == KERN_SUCCESS) {
19395 		*disposition = info.disposition;
19396 		*ref_count = info.ref_count;
19397 	} else {
19398 		*disposition = 0;
19399 		*ref_count = 0;
19400 	}
19401 
19402 	return kr;
19403 }
19404 
19405 kern_return_t
vm_map_page_info(vm_map_t map,vm_map_offset_t offset,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)19406 vm_map_page_info(
19407 	vm_map_t                map,
19408 	vm_map_offset_t         offset,
19409 	vm_page_info_flavor_t   flavor,
19410 	vm_page_info_t          info,
19411 	mach_msg_type_number_t  *count)
19412 {
19413 	return vm_map_page_range_info_internal(map,
19414 	           offset, /* start of range */
19415 	           (offset + 1), /* this will get rounded in the call to the page boundary */
19416 	           (int)-1, /* effective_page_shift: unspecified */
19417 	           flavor,
19418 	           info,
19419 	           count);
19420 }
19421 
19422 kern_return_t
vm_map_page_range_info_internal(vm_map_t map,vm_map_offset_t start_offset,vm_map_offset_t end_offset,int effective_page_shift,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)19423 vm_map_page_range_info_internal(
19424 	vm_map_t                map,
19425 	vm_map_offset_t         start_offset,
19426 	vm_map_offset_t         end_offset,
19427 	int                     effective_page_shift,
19428 	vm_page_info_flavor_t   flavor,
19429 	vm_page_info_t          info,
19430 	mach_msg_type_number_t  *count)
19431 {
19432 	vm_map_entry_t          map_entry = VM_MAP_ENTRY_NULL;
19433 	vm_object_t             object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
19434 	vm_page_t               m = VM_PAGE_NULL;
19435 	kern_return_t           retval = KERN_SUCCESS;
19436 	int                     disposition = 0;
19437 	int                     ref_count = 0;
19438 	int                     depth = 0, info_idx = 0;
19439 	vm_page_info_basic_t    basic_info = 0;
19440 	vm_map_offset_t         offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
19441 	vm_map_offset_t         start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
19442 	boolean_t               do_region_footprint;
19443 	ledger_amount_t         ledger_resident, ledger_compressed;
19444 	int                     effective_page_size;
19445 	vm_map_offset_t         effective_page_mask;
19446 
19447 	switch (flavor) {
19448 	case VM_PAGE_INFO_BASIC:
19449 		if (*count != VM_PAGE_INFO_BASIC_COUNT) {
19450 			/*
19451 			 * The "vm_page_info_basic_data" structure was not
19452 			 * properly padded, so allow the size to be off by
19453 			 * one to maintain backwards binary compatibility...
19454 			 */
19455 			if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
19456 				return KERN_INVALID_ARGUMENT;
19457 			}
19458 		}
19459 		break;
19460 	default:
19461 		return KERN_INVALID_ARGUMENT;
19462 	}
19463 
19464 	if (effective_page_shift == -1) {
19465 		effective_page_shift = vm_self_region_page_shift_safely(map);
19466 		if (effective_page_shift == -1) {
19467 			return KERN_INVALID_ARGUMENT;
19468 		}
19469 	}
19470 	effective_page_size = (1 << effective_page_shift);
19471 	effective_page_mask = effective_page_size - 1;
19472 
19473 	do_region_footprint = task_self_region_footprint();
19474 	disposition = 0;
19475 	ref_count = 0;
19476 	depth = 0;
19477 	info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
19478 	retval = KERN_SUCCESS;
19479 
19480 	if (__improbable(vm_map_range_overflows(map, start_offset, end_offset - start_offset))) {
19481 		return KERN_INVALID_ADDRESS;
19482 	}
19483 
19484 	offset_in_page = start_offset & effective_page_mask;
19485 	start = vm_map_trunc_page(start_offset, effective_page_mask);
19486 	end = vm_map_round_page(end_offset, effective_page_mask);
19487 
19488 	if (end < start) {
19489 		return KERN_INVALID_ARGUMENT;
19490 	}
19491 
19492 	assert((end - start) <= MAX_PAGE_RANGE_QUERY);
19493 
19494 	vm_map_lock_read(map);
19495 
19496 	task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
19497 
19498 	for (curr_s_offset = start; curr_s_offset < end;) {
19499 		/*
19500 		 * New lookup needs reset of these variables.
19501 		 */
19502 		curr_object = object = VM_OBJECT_NULL;
19503 		offset_in_object = 0;
19504 		ref_count = 0;
19505 		depth = 0;
19506 
19507 		if (do_region_footprint &&
19508 		    curr_s_offset >= vm_map_last_entry(map)->vme_end) {
19509 			/*
19510 			 * Request for "footprint" info about a page beyond
19511 			 * the end of address space: this must be for
19512 			 * the fake region vm_map_region_recurse_64()
19513 			 * reported to account for non-volatile purgeable
19514 			 * memory owned by this task.
19515 			 */
19516 			disposition = 0;
19517 
19518 			if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
19519 			    (unsigned) ledger_compressed) {
19520 				/*
19521 				 * We haven't reported all the "non-volatile
19522 				 * compressed" pages yet, so report this fake
19523 				 * page as "compressed".
19524 				 */
19525 				disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
19526 			} else {
19527 				/*
19528 				 * We've reported all the non-volatile
19529 				 * compressed page but not all the non-volatile
19530 				 * pages , so report this fake page as
19531 				 * "resident dirty".
19532 				 */
19533 				disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19534 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19535 				disposition |= VM_PAGE_QUERY_PAGE_REF;
19536 			}
19537 			switch (flavor) {
19538 			case VM_PAGE_INFO_BASIC:
19539 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19540 				basic_info->disposition = disposition;
19541 				basic_info->ref_count = 1;
19542 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
19543 				basic_info->offset = 0;
19544 				basic_info->depth = 0;
19545 
19546 				info_idx++;
19547 				break;
19548 			}
19549 			curr_s_offset += effective_page_size;
19550 			continue;
19551 		}
19552 
19553 		/*
19554 		 * First, find the map entry covering "curr_s_offset", going down
19555 		 * submaps if necessary.
19556 		 */
19557 		if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
19558 			/* no entry -> no object -> no page */
19559 
19560 			if (curr_s_offset < vm_map_min(map)) {
19561 				/*
19562 				 * Illegal address that falls below map min.
19563 				 */
19564 				curr_e_offset = MIN(end, vm_map_min(map));
19565 			} else if (curr_s_offset >= vm_map_max(map)) {
19566 				/*
19567 				 * Illegal address that falls on/after map max.
19568 				 */
19569 				curr_e_offset = end;
19570 			} else if (map_entry == vm_map_to_entry(map)) {
19571 				/*
19572 				 * Hit a hole.
19573 				 */
19574 				if (map_entry->vme_next == vm_map_to_entry(map)) {
19575 					/*
19576 					 * Empty map.
19577 					 */
19578 					curr_e_offset = MIN(map->max_offset, end);
19579 				} else {
19580 					/*
19581 					 * Hole at start of the map.
19582 					 */
19583 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
19584 				}
19585 			} else {
19586 				if (map_entry->vme_next == vm_map_to_entry(map)) {
19587 					/*
19588 					 * Hole at the end of the map.
19589 					 */
19590 					curr_e_offset = MIN(map->max_offset, end);
19591 				} else {
19592 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
19593 				}
19594 			}
19595 
19596 			assert(curr_e_offset >= curr_s_offset);
19597 
19598 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
19599 
19600 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19601 
19602 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
19603 
19604 			curr_s_offset = curr_e_offset;
19605 
19606 			info_idx += num_pages;
19607 
19608 			continue;
19609 		}
19610 
19611 		/* compute offset from this map entry's start */
19612 		offset_in_object = curr_s_offset - map_entry->vme_start;
19613 
19614 		/* compute offset into this map entry's object (or submap) */
19615 		offset_in_object += VME_OFFSET(map_entry);
19616 
19617 		if (map_entry->is_sub_map) {
19618 			vm_map_t sub_map = VM_MAP_NULL;
19619 			vm_page_info_t submap_info = 0;
19620 			vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
19621 
19622 			range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
19623 
19624 			submap_s_offset = offset_in_object;
19625 			submap_e_offset = submap_s_offset + range_len;
19626 
19627 			sub_map = VME_SUBMAP(map_entry);
19628 
19629 			vm_map_reference(sub_map);
19630 			vm_map_unlock_read(map);
19631 
19632 			submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19633 
19634 			assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
19635 			    "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
19636 
19637 			retval = vm_map_page_range_info_internal(sub_map,
19638 			    submap_s_offset,
19639 			    submap_e_offset,
19640 			    effective_page_shift,
19641 			    VM_PAGE_INFO_BASIC,
19642 			    (vm_page_info_t) submap_info,
19643 			    count);
19644 
19645 			assert(retval == KERN_SUCCESS);
19646 
19647 			vm_map_lock_read(map);
19648 			vm_map_deallocate(sub_map);
19649 
19650 			/* Move the "info" index by the number of pages we inspected.*/
19651 			info_idx += range_len >> effective_page_shift;
19652 
19653 			/* Move our current offset by the size of the range we inspected.*/
19654 			curr_s_offset += range_len;
19655 
19656 			continue;
19657 		}
19658 
19659 		object = VME_OBJECT(map_entry);
19660 
19661 		if (object == VM_OBJECT_NULL) {
19662 			/*
19663 			 * We don't have an object here and, hence,
19664 			 * no pages to inspect. We'll fill up the
19665 			 * info structure appropriately.
19666 			 */
19667 
19668 			curr_e_offset = MIN(map_entry->vme_end, end);
19669 
19670 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
19671 
19672 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19673 
19674 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
19675 
19676 			curr_s_offset = curr_e_offset;
19677 
19678 			info_idx += num_pages;
19679 
19680 			continue;
19681 		}
19682 
19683 		if (do_region_footprint) {
19684 			disposition = 0;
19685 			if (map->has_corpse_footprint) {
19686 				/*
19687 				 * Query the page info data we saved
19688 				 * while forking the corpse.
19689 				 */
19690 				vm_map_corpse_footprint_query_page_info(
19691 					map,
19692 					curr_s_offset,
19693 					&disposition);
19694 			} else {
19695 				/*
19696 				 * Query the live pmap for footprint info
19697 				 * about this page.
19698 				 */
19699 				vm_map_footprint_query_page_info(
19700 					map,
19701 					map_entry,
19702 					curr_s_offset,
19703 					&disposition);
19704 			}
19705 			switch (flavor) {
19706 			case VM_PAGE_INFO_BASIC:
19707 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19708 				basic_info->disposition = disposition;
19709 				basic_info->ref_count = 1;
19710 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
19711 				basic_info->offset = 0;
19712 				basic_info->depth = 0;
19713 
19714 				info_idx++;
19715 				break;
19716 			}
19717 			curr_s_offset += effective_page_size;
19718 			continue;
19719 		}
19720 
19721 		vm_object_reference(object);
19722 		/*
19723 		 * Shared mode -- so we can allow other readers
19724 		 * to grab the lock too.
19725 		 */
19726 		vm_object_lock_shared(object);
19727 
19728 		curr_e_offset = MIN(map_entry->vme_end, end);
19729 
19730 		vm_map_unlock_read(map);
19731 
19732 		map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
19733 
19734 		curr_object = object;
19735 
19736 		for (; curr_s_offset < curr_e_offset;) {
19737 			if (object == curr_object) {
19738 				ref_count = curr_object->ref_count - 1; /* account for our object reference above. */
19739 			} else {
19740 				ref_count = curr_object->ref_count;
19741 			}
19742 
19743 			curr_offset_in_object = offset_in_object;
19744 
19745 			for (;;) {
19746 				m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
19747 
19748 				if (m != VM_PAGE_NULL) {
19749 					disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19750 					break;
19751 				} else {
19752 					if (curr_object->internal &&
19753 					    curr_object->alive &&
19754 					    !curr_object->terminating &&
19755 					    curr_object->pager_ready) {
19756 						if (VM_COMPRESSOR_PAGER_STATE_GET(curr_object, vm_object_trunc_page(curr_offset_in_object))
19757 						    == VM_EXTERNAL_STATE_EXISTS) {
19758 							/* the pager has that page */
19759 							disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
19760 							break;
19761 						}
19762 					}
19763 
19764 					/*
19765 					 * Go down the VM object shadow chain until we find the page
19766 					 * we're looking for.
19767 					 */
19768 
19769 					if (curr_object->shadow != VM_OBJECT_NULL) {
19770 						vm_object_t shadow = VM_OBJECT_NULL;
19771 
19772 						curr_offset_in_object += curr_object->vo_shadow_offset;
19773 						shadow = curr_object->shadow;
19774 
19775 						vm_object_lock_shared(shadow);
19776 						vm_object_unlock(curr_object);
19777 
19778 						curr_object = shadow;
19779 						depth++;
19780 						continue;
19781 					} else {
19782 						break;
19783 					}
19784 				}
19785 			}
19786 
19787 			/* The ref_count is not strictly accurate, it measures the number   */
19788 			/* of entities holding a ref on the object, they may not be mapping */
19789 			/* the object or may not be mapping the section holding the         */
19790 			/* target page but its still a ball park number and though an over- */
19791 			/* count, it picks up the copy-on-write cases                       */
19792 
19793 			/* We could also get a picture of page sharing from pmap_attributes */
19794 			/* but this would under count as only faulted-in mappings would     */
19795 			/* show up.							    */
19796 
19797 			if ((curr_object == object) && curr_object->shadow) {
19798 				disposition |= VM_PAGE_QUERY_PAGE_COPIED;
19799 			}
19800 
19801 			if (!curr_object->internal) {
19802 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
19803 			}
19804 
19805 			if (m != VM_PAGE_NULL) {
19806 				if (m->vmp_fictitious) {
19807 					disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
19808 				} else {
19809 					if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
19810 						disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19811 					}
19812 
19813 					if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
19814 						disposition |= VM_PAGE_QUERY_PAGE_REF;
19815 					}
19816 
19817 					if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
19818 						disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
19819 					}
19820 
19821 					/*
19822 					 * XXX TODO4K:
19823 					 * when this routine deals with 4k
19824 					 * pages, check the appropriate CS bit
19825 					 * here.
19826 					 */
19827 					if (m->vmp_cs_validated) {
19828 						disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
19829 					}
19830 					if (m->vmp_cs_tainted) {
19831 						disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
19832 					}
19833 					if (m->vmp_cs_nx) {
19834 						disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
19835 					}
19836 					if (m->vmp_reusable || curr_object->all_reusable) {
19837 						disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
19838 					}
19839 				}
19840 			}
19841 
19842 			switch (flavor) {
19843 			case VM_PAGE_INFO_BASIC:
19844 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19845 				basic_info->disposition = disposition;
19846 				basic_info->ref_count = ref_count;
19847 				basic_info->object_id = (vm_object_id_t) (uintptr_t)
19848 				    VM_KERNEL_ADDRPERM(curr_object);
19849 				basic_info->offset =
19850 				    (memory_object_offset_t) curr_offset_in_object + offset_in_page;
19851 				basic_info->depth = depth;
19852 
19853 				info_idx++;
19854 				break;
19855 			}
19856 
19857 			disposition = 0;
19858 			offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
19859 
19860 			/*
19861 			 * Move to next offset in the range and in our object.
19862 			 */
19863 			curr_s_offset += effective_page_size;
19864 			offset_in_object += effective_page_size;
19865 			curr_offset_in_object = offset_in_object;
19866 
19867 			if (curr_object != object) {
19868 				vm_object_unlock(curr_object);
19869 
19870 				curr_object = object;
19871 
19872 				vm_object_lock_shared(curr_object);
19873 			} else {
19874 				vm_object_lock_yield_shared(curr_object);
19875 			}
19876 		}
19877 
19878 		vm_object_unlock(curr_object);
19879 		vm_object_deallocate(curr_object);
19880 
19881 		vm_map_lock_read(map);
19882 	}
19883 
19884 	vm_map_unlock_read(map);
19885 	return retval;
19886 }
19887 
19888 /*
19889  *	vm_map_msync
19890  *
19891  *	Synchronises the memory range specified with its backing store
19892  *	image by either flushing or cleaning the contents to the appropriate
19893  *	memory manager engaging in a memory object synchronize dialog with
19894  *	the manager.  The client doesn't return until the manager issues
19895  *	m_o_s_completed message.  MIG Magically converts user task parameter
19896  *	to the task's address map.
19897  *
19898  *	interpretation of sync_flags
19899  *	VM_SYNC_INVALIDATE	- discard pages, only return precious
19900  *				  pages to manager.
19901  *
19902  *	VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
19903  *				- discard pages, write dirty or precious
19904  *				  pages back to memory manager.
19905  *
19906  *	VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
19907  *				- write dirty or precious pages back to
19908  *				  the memory manager.
19909  *
19910  *	VM_SYNC_CONTIGUOUS	- does everything normally, but if there
19911  *				  is a hole in the region, and we would
19912  *				  have returned KERN_SUCCESS, return
19913  *				  KERN_INVALID_ADDRESS instead.
19914  *
19915  *	NOTE
19916  *	The memory object attributes have not yet been implemented, this
19917  *	function will have to deal with the invalidate attribute
19918  *
19919  *	RETURNS
19920  *	KERN_INVALID_TASK		Bad task parameter
19921  *	KERN_INVALID_ARGUMENT		both sync and async were specified.
19922  *	KERN_SUCCESS			The usual.
19923  *	KERN_INVALID_ADDRESS		There was a hole in the region.
19924  */
19925 
19926 kern_return_t
vm_map_msync(vm_map_t map,vm_map_address_t address,vm_map_size_t size,vm_sync_t sync_flags)19927 vm_map_msync(
19928 	vm_map_t                map,
19929 	vm_map_address_t        address,
19930 	vm_map_size_t           size,
19931 	vm_sync_t               sync_flags)
19932 {
19933 	vm_map_entry_t          entry;
19934 	vm_map_size_t           amount_left;
19935 	vm_object_offset_t      offset;
19936 	vm_object_offset_t      start_offset, end_offset;
19937 	boolean_t               do_sync_req;
19938 	boolean_t               had_hole = FALSE;
19939 	vm_map_offset_t         pmap_offset;
19940 
19941 	if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
19942 	    (sync_flags & VM_SYNC_SYNCHRONOUS)) {
19943 		return KERN_INVALID_ARGUMENT;
19944 	}
19945 
19946 	if (__improbable(vm_map_range_overflows(map, address, size))) {
19947 		return KERN_INVALID_ADDRESS;
19948 	}
19949 
19950 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
19951 		DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
19952 	}
19953 
19954 	/*
19955 	 * align address and size on page boundaries
19956 	 */
19957 	size = (vm_map_round_page(address + size,
19958 	    VM_MAP_PAGE_MASK(map)) -
19959 	    vm_map_trunc_page(address,
19960 	    VM_MAP_PAGE_MASK(map)));
19961 	address = vm_map_trunc_page(address,
19962 	    VM_MAP_PAGE_MASK(map));
19963 
19964 	if (map == VM_MAP_NULL) {
19965 		return KERN_INVALID_TASK;
19966 	}
19967 
19968 	if (size == 0) {
19969 		return KERN_SUCCESS;
19970 	}
19971 
19972 	amount_left = size;
19973 
19974 	while (amount_left > 0) {
19975 		vm_object_size_t        flush_size;
19976 		vm_object_t             object;
19977 
19978 		vm_map_lock(map);
19979 		if (!vm_map_lookup_entry(map,
19980 		    address,
19981 		    &entry)) {
19982 			vm_map_size_t   skip;
19983 
19984 			/*
19985 			 * hole in the address map.
19986 			 */
19987 			had_hole = TRUE;
19988 
19989 			if (sync_flags & VM_SYNC_KILLPAGES) {
19990 				/*
19991 				 * For VM_SYNC_KILLPAGES, there should be
19992 				 * no holes in the range, since we couldn't
19993 				 * prevent someone else from allocating in
19994 				 * that hole and we wouldn't want to "kill"
19995 				 * their pages.
19996 				 */
19997 				vm_map_unlock(map);
19998 				break;
19999 			}
20000 
20001 			/*
20002 			 * Check for empty map.
20003 			 */
20004 			if (entry == vm_map_to_entry(map) &&
20005 			    entry->vme_next == entry) {
20006 				vm_map_unlock(map);
20007 				break;
20008 			}
20009 			/*
20010 			 * Check that we don't wrap and that
20011 			 * we have at least one real map entry.
20012 			 */
20013 			if ((map->hdr.nentries == 0) ||
20014 			    (entry->vme_next->vme_start < address)) {
20015 				vm_map_unlock(map);
20016 				break;
20017 			}
20018 			/*
20019 			 * Move up to the next entry if needed
20020 			 */
20021 			skip = (entry->vme_next->vme_start - address);
20022 			if (skip >= amount_left) {
20023 				amount_left = 0;
20024 			} else {
20025 				amount_left -= skip;
20026 			}
20027 			address = entry->vme_next->vme_start;
20028 			vm_map_unlock(map);
20029 			continue;
20030 		}
20031 
20032 		offset = address - entry->vme_start;
20033 		pmap_offset = address;
20034 
20035 		/*
20036 		 * do we have more to flush than is contained in this
20037 		 * entry ?
20038 		 */
20039 		if (amount_left + entry->vme_start + offset > entry->vme_end) {
20040 			flush_size = entry->vme_end -
20041 			    (entry->vme_start + offset);
20042 		} else {
20043 			flush_size = amount_left;
20044 		}
20045 		amount_left -= flush_size;
20046 		address += flush_size;
20047 
20048 		if (entry->is_sub_map == TRUE) {
20049 			vm_map_t        local_map;
20050 			vm_map_offset_t local_offset;
20051 
20052 			local_map = VME_SUBMAP(entry);
20053 			local_offset = VME_OFFSET(entry);
20054 			vm_map_reference(local_map);
20055 			vm_map_unlock(map);
20056 			if (vm_map_msync(
20057 				    local_map,
20058 				    local_offset,
20059 				    flush_size,
20060 				    sync_flags) == KERN_INVALID_ADDRESS) {
20061 				had_hole = TRUE;
20062 			}
20063 			vm_map_deallocate(local_map);
20064 			continue;
20065 		}
20066 		object = VME_OBJECT(entry);
20067 
20068 		/*
20069 		 * We can't sync this object if the object has not been
20070 		 * created yet
20071 		 */
20072 		if (object == VM_OBJECT_NULL) {
20073 			vm_map_unlock(map);
20074 			continue;
20075 		}
20076 		offset += VME_OFFSET(entry);
20077 
20078 		vm_object_lock(object);
20079 
20080 		if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
20081 			int kill_pages = 0;
20082 
20083 			if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20084 				/*
20085 				 * This is a destructive operation and so we
20086 				 * err on the side of limiting the range of
20087 				 * the operation.
20088 				 */
20089 				start_offset = vm_object_round_page(offset);
20090 				end_offset = vm_object_trunc_page(offset + flush_size);
20091 
20092 				if (end_offset <= start_offset) {
20093 					vm_object_unlock(object);
20094 					vm_map_unlock(map);
20095 					continue;
20096 				}
20097 
20098 				pmap_offset += start_offset - offset;
20099 			} else {
20100 				start_offset = offset;
20101 				end_offset = offset + flush_size;
20102 			}
20103 
20104 			if (sync_flags & VM_SYNC_KILLPAGES) {
20105 				if (((object->ref_count == 1) ||
20106 				    ((object->copy_strategy !=
20107 				    MEMORY_OBJECT_COPY_SYMMETRIC) &&
20108 				    (object->copy == VM_OBJECT_NULL))) &&
20109 				    (object->shadow == VM_OBJECT_NULL)) {
20110 					if (object->ref_count != 1) {
20111 						vm_page_stats_reusable.free_shared++;
20112 					}
20113 					kill_pages = 1;
20114 				} else {
20115 					kill_pages = -1;
20116 				}
20117 			}
20118 			if (kill_pages != -1) {
20119 				vm_object_deactivate_pages(
20120 					object,
20121 					start_offset,
20122 					(vm_object_size_t) (end_offset - start_offset),
20123 					kill_pages,
20124 					FALSE, /* reusable_pages */
20125 					FALSE, /* reusable_no_write */
20126 					map->pmap,
20127 					pmap_offset);
20128 			}
20129 			vm_object_unlock(object);
20130 			vm_map_unlock(map);
20131 			continue;
20132 		}
20133 		/*
20134 		 * We can't sync this object if there isn't a pager.
20135 		 * Don't bother to sync internal objects, since there can't
20136 		 * be any "permanent" storage for these objects anyway.
20137 		 */
20138 		if ((object->pager == MEMORY_OBJECT_NULL) ||
20139 		    (object->internal) || (object->private)) {
20140 			vm_object_unlock(object);
20141 			vm_map_unlock(map);
20142 			continue;
20143 		}
20144 		/*
20145 		 * keep reference on the object until syncing is done
20146 		 */
20147 		vm_object_reference_locked(object);
20148 		vm_object_unlock(object);
20149 
20150 		vm_map_unlock(map);
20151 
20152 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20153 			start_offset = vm_object_trunc_page(offset);
20154 			end_offset = vm_object_round_page(offset + flush_size);
20155 		} else {
20156 			start_offset = offset;
20157 			end_offset = offset + flush_size;
20158 		}
20159 
20160 		do_sync_req = vm_object_sync(object,
20161 		    start_offset,
20162 		    (end_offset - start_offset),
20163 		    sync_flags & VM_SYNC_INVALIDATE,
20164 		    ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
20165 		    (sync_flags & VM_SYNC_ASYNCHRONOUS)),
20166 		    sync_flags & VM_SYNC_SYNCHRONOUS);
20167 
20168 		if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
20169 			/*
20170 			 * clear out the clustering and read-ahead hints
20171 			 */
20172 			vm_object_lock(object);
20173 
20174 			object->pages_created = 0;
20175 			object->pages_used = 0;
20176 			object->sequential = 0;
20177 			object->last_alloc = 0;
20178 
20179 			vm_object_unlock(object);
20180 		}
20181 		vm_object_deallocate(object);
20182 	} /* while */
20183 
20184 	/* for proper msync() behaviour */
20185 	if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
20186 		return KERN_INVALID_ADDRESS;
20187 	}
20188 
20189 	return KERN_SUCCESS;
20190 }/* vm_msync */
20191 
20192 void
vm_named_entry_associate_vm_object(vm_named_entry_t named_entry,vm_object_t object,vm_object_offset_t offset,vm_object_size_t size,vm_prot_t prot)20193 vm_named_entry_associate_vm_object(
20194 	vm_named_entry_t        named_entry,
20195 	vm_object_t             object,
20196 	vm_object_offset_t      offset,
20197 	vm_object_size_t        size,
20198 	vm_prot_t               prot)
20199 {
20200 	vm_map_copy_t copy;
20201 	vm_map_entry_t copy_entry;
20202 
20203 	assert(!named_entry->is_sub_map);
20204 	assert(!named_entry->is_copy);
20205 	assert(!named_entry->is_object);
20206 	assert(!named_entry->internal);
20207 	assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
20208 
20209 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
20210 	copy->offset = offset;
20211 	copy->size = size;
20212 	copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
20213 
20214 	copy_entry = vm_map_copy_entry_create(copy);
20215 	copy_entry->protection = prot;
20216 	copy_entry->max_protection = prot;
20217 	copy_entry->use_pmap = TRUE;
20218 	copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
20219 	copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
20220 	VME_OBJECT_SET(copy_entry, object, false, 0);
20221 	VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
20222 	vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
20223 
20224 	named_entry->backing.copy = copy;
20225 	named_entry->is_object = TRUE;
20226 	if (object->internal) {
20227 		named_entry->internal = TRUE;
20228 	}
20229 
20230 	DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
20231 	    named_entry, copy, object, offset, size, prot);
20232 }
20233 
20234 vm_object_t
vm_named_entry_to_vm_object(vm_named_entry_t named_entry)20235 vm_named_entry_to_vm_object(
20236 	vm_named_entry_t named_entry)
20237 {
20238 	vm_map_copy_t   copy;
20239 	vm_map_entry_t  copy_entry;
20240 	vm_object_t     object;
20241 
20242 	assert(!named_entry->is_sub_map);
20243 	assert(!named_entry->is_copy);
20244 	assert(named_entry->is_object);
20245 	copy = named_entry->backing.copy;
20246 	assert(copy != VM_MAP_COPY_NULL);
20247 	/*
20248 	 * Assert that the vm_map_copy is coming from the right
20249 	 * zone and hasn't been forged
20250 	 */
20251 	vm_map_copy_require(copy);
20252 	assert(copy->cpy_hdr.nentries == 1);
20253 	copy_entry = vm_map_copy_first_entry(copy);
20254 	object = VME_OBJECT(copy_entry);
20255 
20256 	DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
20257 
20258 	return object;
20259 }
20260 
20261 /*
20262  *	Routine:	convert_port_entry_to_map
20263  *	Purpose:
20264  *		Convert from a port specifying an entry or a task
20265  *		to a map. Doesn't consume the port ref; produces a map ref,
20266  *		which may be null.  Unlike convert_port_to_map, the
20267  *		port may be task or a named entry backed.
20268  *	Conditions:
20269  *		Nothing locked.
20270  */
20271 
20272 vm_map_t
convert_port_entry_to_map(ipc_port_t port)20273 convert_port_entry_to_map(
20274 	ipc_port_t      port)
20275 {
20276 	vm_map_t map = VM_MAP_NULL;
20277 	vm_named_entry_t named_entry;
20278 
20279 	if (!IP_VALID(port)) {
20280 		return VM_MAP_NULL;
20281 	}
20282 
20283 	if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
20284 		return convert_port_to_map(port);
20285 	}
20286 
20287 	named_entry = mach_memory_entry_from_port(port);
20288 
20289 	if ((named_entry->is_sub_map) &&
20290 	    (named_entry->protection & VM_PROT_WRITE)) {
20291 		map = named_entry->backing.map;
20292 		if (map->pmap != PMAP_NULL) {
20293 			if (map->pmap == kernel_pmap) {
20294 				panic("userspace has access "
20295 				    "to a kernel map %p", map);
20296 			}
20297 			pmap_require(map->pmap);
20298 		}
20299 		vm_map_reference(map);
20300 	}
20301 
20302 	return map;
20303 }
20304 
20305 /*
20306  * Export routines to other components for the things we access locally through
20307  * macros.
20308  */
20309 #undef current_map
20310 vm_map_t
current_map(void)20311 current_map(void)
20312 {
20313 	return current_map_fast();
20314 }
20315 
20316 /*
20317  *	vm_map_reference:
20318  *
20319  *	Takes a reference on the specified map.
20320  */
20321 void
vm_map_reference(vm_map_t map)20322 vm_map_reference(
20323 	vm_map_t        map)
20324 {
20325 	if (__probable(map != VM_MAP_NULL)) {
20326 		vm_map_require(map);
20327 		os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
20328 	}
20329 }
20330 
20331 /*
20332  *	vm_map_deallocate:
20333  *
20334  *	Removes a reference from the specified map,
20335  *	destroying it if no references remain.
20336  *	The map should not be locked.
20337  */
20338 void
vm_map_deallocate(vm_map_t map)20339 vm_map_deallocate(
20340 	vm_map_t        map)
20341 {
20342 	if (__probable(map != VM_MAP_NULL)) {
20343 		vm_map_require(map);
20344 		if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
20345 			vm_map_destroy(map);
20346 		}
20347 	}
20348 }
20349 
20350 void
vm_map_inspect_deallocate(vm_map_inspect_t map)20351 vm_map_inspect_deallocate(
20352 	vm_map_inspect_t      map)
20353 {
20354 	vm_map_deallocate((vm_map_t)map);
20355 }
20356 
20357 void
vm_map_read_deallocate(vm_map_read_t map)20358 vm_map_read_deallocate(
20359 	vm_map_read_t      map)
20360 {
20361 	vm_map_deallocate((vm_map_t)map);
20362 }
20363 
20364 
20365 void
vm_map_disable_NX(vm_map_t map)20366 vm_map_disable_NX(vm_map_t map)
20367 {
20368 	if (map == NULL) {
20369 		return;
20370 	}
20371 	if (map->pmap == NULL) {
20372 		return;
20373 	}
20374 
20375 	pmap_disable_NX(map->pmap);
20376 }
20377 
20378 void
vm_map_disallow_data_exec(vm_map_t map)20379 vm_map_disallow_data_exec(vm_map_t map)
20380 {
20381 	if (map == NULL) {
20382 		return;
20383 	}
20384 
20385 	map->map_disallow_data_exec = TRUE;
20386 }
20387 
20388 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
20389  * more descriptive.
20390  */
20391 void
vm_map_set_32bit(vm_map_t map)20392 vm_map_set_32bit(vm_map_t map)
20393 {
20394 #if defined(__arm64__)
20395 	map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
20396 #else
20397 	map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
20398 #endif
20399 }
20400 
20401 
20402 void
vm_map_set_64bit(vm_map_t map)20403 vm_map_set_64bit(vm_map_t map)
20404 {
20405 #if defined(__arm64__)
20406 	map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
20407 #else
20408 	map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
20409 #endif
20410 }
20411 
20412 /*
20413  * Expand the maximum size of an existing map to the maximum supported.
20414  */
20415 void
vm_map_set_jumbo(vm_map_t map)20416 vm_map_set_jumbo(vm_map_t map)
20417 {
20418 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
20419 	vm_map_set_max_addr(map, ~0);
20420 #else /* arm64 */
20421 	(void) map;
20422 #endif
20423 }
20424 
20425 /*
20426  * This map has a JIT entitlement
20427  */
20428 void
vm_map_set_jit_entitled(vm_map_t map)20429 vm_map_set_jit_entitled(vm_map_t map)
20430 {
20431 #if defined (__arm64__)
20432 	pmap_set_jit_entitled(map->pmap);
20433 #else /* arm64 */
20434 	(void) map;
20435 #endif
20436 }
20437 
20438 /*
20439  * Get status of this maps TPRO flag
20440  */
20441 boolean_t
vm_map_tpro(vm_map_t map)20442 vm_map_tpro(vm_map_t map)
20443 {
20444 #if defined (__arm64e__)
20445 	return pmap_get_tpro(map->pmap);
20446 #else /* arm64e */
20447 	(void) map;
20448 	return false;
20449 #endif
20450 }
20451 
20452 /*
20453  * This map has TPRO enabled
20454  */
20455 void
vm_map_set_tpro(vm_map_t map)20456 vm_map_set_tpro(vm_map_t map)
20457 {
20458 #if defined (__arm64e__)
20459 	pmap_set_tpro(map->pmap);
20460 #else /* arm64e */
20461 	(void) map;
20462 #endif
20463 }
20464 
20465 /*
20466  * Expand the maximum size of an existing map.
20467  */
20468 void
vm_map_set_max_addr(vm_map_t map,vm_map_offset_t new_max_offset)20469 vm_map_set_max_addr(vm_map_t map, vm_map_offset_t new_max_offset)
20470 {
20471 #if defined(__arm64__)
20472 	vm_map_offset_t max_supported_offset;
20473 	vm_map_offset_t old_max_offset;
20474 
20475 	vm_map_lock(map);
20476 
20477 	old_max_offset = map->max_offset;
20478 	max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), ARM_PMAP_MAX_OFFSET_JUMBO);
20479 
20480 	new_max_offset = trunc_page(new_max_offset);
20481 
20482 	/* The address space cannot be shrunk using this routine. */
20483 	if (old_max_offset >= new_max_offset) {
20484 		vm_map_unlock(map);
20485 		return;
20486 	}
20487 
20488 	if (max_supported_offset < new_max_offset) {
20489 		new_max_offset = max_supported_offset;
20490 	}
20491 
20492 	map->max_offset = new_max_offset;
20493 
20494 	if (map->holelistenabled) {
20495 		if (map->holes_list->prev->vme_end == old_max_offset) {
20496 			/*
20497 			 * There is already a hole at the end of the map; simply make it bigger.
20498 			 */
20499 			map->holes_list->prev->vme_end = map->max_offset;
20500 		} else {
20501 			/*
20502 			 * There is no hole at the end, so we need to create a new hole
20503 			 * for the new empty space we're creating.
20504 			 */
20505 			struct vm_map_links *new_hole;
20506 
20507 			new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
20508 			new_hole->start = old_max_offset;
20509 			new_hole->end = map->max_offset;
20510 			new_hole->prev = map->holes_list->prev;
20511 			new_hole->next = (struct vm_map_entry *)map->holes_list;
20512 			map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
20513 			map->holes_list->prev = (struct vm_map_entry *)new_hole;
20514 		}
20515 	}
20516 
20517 	vm_map_unlock(map);
20518 #else
20519 	(void)map;
20520 	(void)new_max_offset;
20521 #endif
20522 }
20523 
20524 vm_map_offset_t
vm_compute_max_offset(boolean_t is64)20525 vm_compute_max_offset(boolean_t is64)
20526 {
20527 #if defined(__arm64__)
20528 	return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
20529 #else
20530 	return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
20531 #endif
20532 }
20533 
20534 void
vm_map_get_max_aslr_slide_section(vm_map_t map __unused,int64_t * max_sections,int64_t * section_size)20535 vm_map_get_max_aslr_slide_section(
20536 	vm_map_t                map __unused,
20537 	int64_t                 *max_sections,
20538 	int64_t                 *section_size)
20539 {
20540 #if defined(__arm64__)
20541 	*max_sections = 3;
20542 	*section_size = ARM_TT_TWIG_SIZE;
20543 #else
20544 	*max_sections = 1;
20545 	*section_size = 0;
20546 #endif
20547 }
20548 
20549 uint64_t
vm_map_get_max_aslr_slide_pages(vm_map_t map)20550 vm_map_get_max_aslr_slide_pages(vm_map_t map)
20551 {
20552 #if defined(__arm64__)
20553 	/* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
20554 	 * limited embedded address space; this is also meant to minimize pmap
20555 	 * memory usage on 16KB page systems.
20556 	 */
20557 	return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
20558 #else
20559 	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
20560 #endif
20561 }
20562 
20563 uint64_t
vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)20564 vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
20565 {
20566 #if defined(__arm64__)
20567 	/* We limit the loader slide to 4MB, in order to ensure at least 8 bits
20568 	 * of independent entropy on 16KB page systems.
20569 	 */
20570 	return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
20571 #else
20572 	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
20573 #endif
20574 }
20575 
20576 boolean_t
vm_map_is_64bit(vm_map_t map)20577 vm_map_is_64bit(
20578 	vm_map_t map)
20579 {
20580 	return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
20581 }
20582 
20583 boolean_t
vm_map_has_hard_pagezero(vm_map_t map,vm_map_offset_t pagezero_size)20584 vm_map_has_hard_pagezero(
20585 	vm_map_t        map,
20586 	vm_map_offset_t pagezero_size)
20587 {
20588 	/*
20589 	 * XXX FBDP
20590 	 * We should lock the VM map (for read) here but we can get away
20591 	 * with it for now because there can't really be any race condition:
20592 	 * the VM map's min_offset is changed only when the VM map is created
20593 	 * and when the zero page is established (when the binary gets loaded),
20594 	 * and this routine gets called only when the task terminates and the
20595 	 * VM map is being torn down, and when a new map is created via
20596 	 * load_machfile()/execve().
20597 	 */
20598 	return map->min_offset >= pagezero_size;
20599 }
20600 
20601 /*
20602  * Raise a VM map's maximun offset.
20603  */
20604 kern_return_t
vm_map_raise_max_offset(vm_map_t map,vm_map_offset_t new_max_offset)20605 vm_map_raise_max_offset(
20606 	vm_map_t        map,
20607 	vm_map_offset_t new_max_offset)
20608 {
20609 	kern_return_t   ret;
20610 
20611 	vm_map_lock(map);
20612 	ret = KERN_INVALID_ADDRESS;
20613 
20614 	if (new_max_offset >= map->max_offset) {
20615 		if (!vm_map_is_64bit(map)) {
20616 			if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
20617 				map->max_offset = new_max_offset;
20618 				ret = KERN_SUCCESS;
20619 			}
20620 		} else {
20621 			if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
20622 				map->max_offset = new_max_offset;
20623 				ret = KERN_SUCCESS;
20624 			}
20625 		}
20626 	}
20627 
20628 	vm_map_unlock(map);
20629 	return ret;
20630 }
20631 
20632 
20633 /*
20634  * Raise a VM map's minimum offset.
20635  * To strictly enforce "page zero" reservation.
20636  */
20637 kern_return_t
vm_map_raise_min_offset(vm_map_t map,vm_map_offset_t new_min_offset)20638 vm_map_raise_min_offset(
20639 	vm_map_t        map,
20640 	vm_map_offset_t new_min_offset)
20641 {
20642 	vm_map_entry_t  first_entry;
20643 
20644 	new_min_offset = vm_map_round_page(new_min_offset,
20645 	    VM_MAP_PAGE_MASK(map));
20646 
20647 	vm_map_lock(map);
20648 
20649 	if (new_min_offset < map->min_offset) {
20650 		/*
20651 		 * Can't move min_offset backwards, as that would expose
20652 		 * a part of the address space that was previously, and for
20653 		 * possibly good reasons, inaccessible.
20654 		 */
20655 		vm_map_unlock(map);
20656 		return KERN_INVALID_ADDRESS;
20657 	}
20658 	if (new_min_offset >= map->max_offset) {
20659 		/* can't go beyond the end of the address space */
20660 		vm_map_unlock(map);
20661 		return KERN_INVALID_ADDRESS;
20662 	}
20663 
20664 	first_entry = vm_map_first_entry(map);
20665 	if (first_entry != vm_map_to_entry(map) &&
20666 	    first_entry->vme_start < new_min_offset) {
20667 		/*
20668 		 * Some memory was already allocated below the new
20669 		 * minimun offset.  It's too late to change it now...
20670 		 */
20671 		vm_map_unlock(map);
20672 		return KERN_NO_SPACE;
20673 	}
20674 
20675 	map->min_offset = new_min_offset;
20676 
20677 	if (map->holelistenabled) {
20678 		assert(map->holes_list);
20679 		map->holes_list->start = new_min_offset;
20680 		assert(new_min_offset < map->holes_list->end);
20681 	}
20682 
20683 	vm_map_unlock(map);
20684 
20685 	return KERN_SUCCESS;
20686 }
20687 
20688 /*
20689  * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
20690  * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
20691  * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
20692  * have to reach over to the BSD data structures.
20693  */
20694 
20695 uint64_t vm_map_set_size_limit_count = 0;
20696 kern_return_t
vm_map_set_size_limit(vm_map_t map,uint64_t new_size_limit)20697 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
20698 {
20699 	kern_return_t kr;
20700 
20701 	vm_map_lock(map);
20702 	if (new_size_limit < map->size) {
20703 		/* new limit should not be lower than its current size */
20704 		DTRACE_VM2(vm_map_set_size_limit_fail,
20705 		    vm_map_size_t, map->size,
20706 		    uint64_t, new_size_limit);
20707 		kr = KERN_FAILURE;
20708 	} else if (new_size_limit == map->size_limit) {
20709 		/* no change */
20710 		kr = KERN_SUCCESS;
20711 	} else {
20712 		/* set new limit */
20713 		DTRACE_VM2(vm_map_set_size_limit,
20714 		    vm_map_size_t, map->size,
20715 		    uint64_t, new_size_limit);
20716 		if (new_size_limit != RLIM_INFINITY) {
20717 			vm_map_set_size_limit_count++;
20718 		}
20719 		map->size_limit = new_size_limit;
20720 		kr = KERN_SUCCESS;
20721 	}
20722 	vm_map_unlock(map);
20723 	return kr;
20724 }
20725 
20726 uint64_t vm_map_set_data_limit_count = 0;
20727 kern_return_t
vm_map_set_data_limit(vm_map_t map,uint64_t new_data_limit)20728 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
20729 {
20730 	kern_return_t kr;
20731 
20732 	vm_map_lock(map);
20733 	if (new_data_limit < map->size) {
20734 		/* new limit should not be lower than its current size */
20735 		DTRACE_VM2(vm_map_set_data_limit_fail,
20736 		    vm_map_size_t, map->size,
20737 		    uint64_t, new_data_limit);
20738 		kr = KERN_FAILURE;
20739 	} else if (new_data_limit == map->data_limit) {
20740 		/* no change */
20741 		kr = KERN_SUCCESS;
20742 	} else {
20743 		/* set new limit */
20744 		DTRACE_VM2(vm_map_set_data_limit,
20745 		    vm_map_size_t, map->size,
20746 		    uint64_t, new_data_limit);
20747 		if (new_data_limit != RLIM_INFINITY) {
20748 			vm_map_set_data_limit_count++;
20749 		}
20750 		map->data_limit = new_data_limit;
20751 		kr = KERN_SUCCESS;
20752 	}
20753 	vm_map_unlock(map);
20754 	return kr;
20755 }
20756 
20757 void
vm_map_set_user_wire_limit(vm_map_t map,vm_size_t limit)20758 vm_map_set_user_wire_limit(vm_map_t     map,
20759     vm_size_t    limit)
20760 {
20761 	vm_map_lock(map);
20762 	map->user_wire_limit = limit;
20763 	vm_map_unlock(map);
20764 }
20765 
20766 
20767 void
vm_map_switch_protect(vm_map_t map,boolean_t val)20768 vm_map_switch_protect(vm_map_t     map,
20769     boolean_t    val)
20770 {
20771 	vm_map_lock(map);
20772 	map->switch_protect = val;
20773 	vm_map_unlock(map);
20774 }
20775 
20776 extern int cs_process_enforcement_enable;
20777 boolean_t
vm_map_cs_enforcement(vm_map_t map)20778 vm_map_cs_enforcement(
20779 	vm_map_t map)
20780 {
20781 	if (cs_process_enforcement_enable) {
20782 		return TRUE;
20783 	}
20784 	return map->cs_enforcement;
20785 }
20786 
20787 kern_return_t
vm_map_cs_wx_enable(__unused vm_map_t map)20788 vm_map_cs_wx_enable(
20789 	__unused vm_map_t map)
20790 {
20791 #if CODE_SIGNING_MONITOR
20792 	kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
20793 	if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
20794 		return KERN_SUCCESS;
20795 	}
20796 	return ret;
20797 #else
20798 	/* The VM manages WX memory entirely on its own */
20799 	return true;
20800 #endif
20801 }
20802 
20803 void
vm_map_cs_debugged_set(vm_map_t map,boolean_t val)20804 vm_map_cs_debugged_set(
20805 	vm_map_t map,
20806 	boolean_t val)
20807 {
20808 	vm_map_lock(map);
20809 	map->cs_debugged = val;
20810 	vm_map_unlock(map);
20811 }
20812 
20813 void
vm_map_cs_enforcement_set(vm_map_t map,boolean_t val)20814 vm_map_cs_enforcement_set(
20815 	vm_map_t map,
20816 	boolean_t val)
20817 {
20818 	vm_map_lock(map);
20819 	map->cs_enforcement = val;
20820 	pmap_set_vm_map_cs_enforced(map->pmap, val);
20821 	vm_map_unlock(map);
20822 }
20823 
20824 /*
20825  * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
20826  * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
20827  * bump both counters.
20828  */
20829 void
vm_map_iokit_mapped_region(vm_map_t map,vm_size_t bytes)20830 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
20831 {
20832 	pmap_t pmap = vm_map_pmap(map);
20833 
20834 	ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
20835 	ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
20836 }
20837 
20838 void
vm_map_iokit_unmapped_region(vm_map_t map,vm_size_t bytes)20839 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
20840 {
20841 	pmap_t pmap = vm_map_pmap(map);
20842 
20843 	ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
20844 	ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
20845 }
20846 
20847 /* Add (generate) code signature for memory range */
20848 #if CONFIG_DYNAMIC_CODE_SIGNING
20849 kern_return_t
vm_map_sign(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)20850 vm_map_sign(vm_map_t map,
20851     vm_map_offset_t start,
20852     vm_map_offset_t end)
20853 {
20854 	vm_map_entry_t entry;
20855 	vm_page_t m;
20856 	vm_object_t object;
20857 
20858 	/*
20859 	 * Vet all the input parameters and current type and state of the
20860 	 * underlaying object.  Return with an error if anything is amiss.
20861 	 */
20862 	if (map == VM_MAP_NULL) {
20863 		return KERN_INVALID_ARGUMENT;
20864 	}
20865 
20866 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
20867 		return KERN_INVALID_ADDRESS;
20868 	}
20869 
20870 	vm_map_lock_read(map);
20871 
20872 	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
20873 		/*
20874 		 * Must pass a valid non-submap address.
20875 		 */
20876 		vm_map_unlock_read(map);
20877 		return KERN_INVALID_ADDRESS;
20878 	}
20879 
20880 	if ((entry->vme_start > start) || (entry->vme_end < end)) {
20881 		/*
20882 		 * Map entry doesn't cover the requested range. Not handling
20883 		 * this situation currently.
20884 		 */
20885 		vm_map_unlock_read(map);
20886 		return KERN_INVALID_ARGUMENT;
20887 	}
20888 
20889 	object = VME_OBJECT(entry);
20890 	if (object == VM_OBJECT_NULL) {
20891 		/*
20892 		 * Object must already be present or we can't sign.
20893 		 */
20894 		vm_map_unlock_read(map);
20895 		return KERN_INVALID_ARGUMENT;
20896 	}
20897 
20898 	vm_object_lock(object);
20899 	vm_map_unlock_read(map);
20900 
20901 	while (start < end) {
20902 		uint32_t refmod;
20903 
20904 		m = vm_page_lookup(object,
20905 		    start - entry->vme_start + VME_OFFSET(entry));
20906 		if (m == VM_PAGE_NULL) {
20907 			/* shoud we try to fault a page here? we can probably
20908 			 * demand it exists and is locked for this request */
20909 			vm_object_unlock(object);
20910 			return KERN_FAILURE;
20911 		}
20912 		/* deal with special page status */
20913 		if (m->vmp_busy ||
20914 		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
20915 			vm_object_unlock(object);
20916 			return KERN_FAILURE;
20917 		}
20918 
20919 		/* Page is OK... now "validate" it */
20920 		/* This is the place where we'll call out to create a code
20921 		 * directory, later */
20922 		/* XXX TODO4K: deal with 4k subpages individually? */
20923 		m->vmp_cs_validated = VMP_CS_ALL_TRUE;
20924 
20925 		/* The page is now "clean" for codesigning purposes. That means
20926 		 * we don't consider it as modified (wpmapped) anymore. But
20927 		 * we'll disconnect the page so we note any future modification
20928 		 * attempts. */
20929 		m->vmp_wpmapped = FALSE;
20930 		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
20931 
20932 		/* Pull the dirty status from the pmap, since we cleared the
20933 		 * wpmapped bit */
20934 		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
20935 			SET_PAGE_DIRTY(m, FALSE);
20936 		}
20937 
20938 		/* On to the next page */
20939 		start += PAGE_SIZE;
20940 	}
20941 	vm_object_unlock(object);
20942 
20943 	return KERN_SUCCESS;
20944 }
20945 #endif
20946 
20947 kern_return_t
vm_map_partial_reap(vm_map_t map,unsigned int * reclaimed_resident,unsigned int * reclaimed_compressed)20948 vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
20949 {
20950 	vm_map_entry_t  entry = VM_MAP_ENTRY_NULL;
20951 	vm_map_entry_t  next_entry;
20952 	kern_return_t   kr = KERN_SUCCESS;
20953 	VM_MAP_ZAP_DECLARE(zap_list);
20954 
20955 	vm_map_lock(map);
20956 
20957 	for (entry = vm_map_first_entry(map);
20958 	    entry != vm_map_to_entry(map);
20959 	    entry = next_entry) {
20960 		next_entry = entry->vme_next;
20961 
20962 		if (!entry->is_sub_map &&
20963 		    VME_OBJECT(entry) &&
20964 		    (VME_OBJECT(entry)->internal == TRUE) &&
20965 		    (VME_OBJECT(entry)->ref_count == 1)) {
20966 			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
20967 			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);
20968 
20969 			(void)vm_map_delete(map, entry->vme_start,
20970 			    entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
20971 			    KMEM_GUARD_NONE, &zap_list);
20972 		}
20973 	}
20974 
20975 	vm_map_unlock(map);
20976 
20977 	vm_map_zap_dispose(&zap_list);
20978 
20979 	return kr;
20980 }
20981 
20982 
20983 #if DEVELOPMENT || DEBUG
20984 
20985 int
vm_map_disconnect_page_mappings(vm_map_t map,boolean_t do_unnest)20986 vm_map_disconnect_page_mappings(
20987 	vm_map_t map,
20988 	boolean_t do_unnest)
20989 {
20990 	vm_map_entry_t entry;
20991 	ledger_amount_t byte_count = 0;
20992 
20993 	if (do_unnest == TRUE) {
20994 #ifndef NO_NESTED_PMAP
20995 		vm_map_lock(map);
20996 
20997 		for (entry = vm_map_first_entry(map);
20998 		    entry != vm_map_to_entry(map);
20999 		    entry = entry->vme_next) {
21000 			if (entry->is_sub_map && entry->use_pmap) {
21001 				/*
21002 				 * Make sure the range between the start of this entry and
21003 				 * the end of this entry is no longer nested, so that
21004 				 * we will only remove mappings from the pmap in use by this
21005 				 * this task
21006 				 */
21007 				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
21008 			}
21009 		}
21010 		vm_map_unlock(map);
21011 #endif
21012 	}
21013 	vm_map_lock_read(map);
21014 
21015 	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);
21016 
21017 	for (entry = vm_map_first_entry(map);
21018 	    entry != vm_map_to_entry(map);
21019 	    entry = entry->vme_next) {
21020 		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
21021 		    (VME_OBJECT(entry)->phys_contiguous))) {
21022 			continue;
21023 		}
21024 		if (entry->is_sub_map) {
21025 			assert(!entry->use_pmap);
21026 		}
21027 
21028 		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
21029 	}
21030 	vm_map_unlock_read(map);
21031 
21032 	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
21033 }
21034 
21035 kern_return_t
vm_map_inject_error(vm_map_t map,vm_map_offset_t vaddr)21036 vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
21037 {
21038 	vm_object_t object = NULL;
21039 	vm_object_offset_t offset;
21040 	vm_prot_t prot;
21041 	boolean_t wired;
21042 	vm_map_version_t version;
21043 	vm_map_t real_map;
21044 	int result = KERN_FAILURE;
21045 
21046 	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
21047 	vm_map_lock(map);
21048 
21049 	result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
21050 	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
21051 	    NULL, &real_map, NULL);
21052 	if (object == NULL) {
21053 		result = KERN_MEMORY_ERROR;
21054 	} else if (object->pager) {
21055 		result = vm_compressor_pager_inject_error(object->pager,
21056 		    offset);
21057 	} else {
21058 		result = KERN_MEMORY_PRESENT;
21059 	}
21060 
21061 	if (object != NULL) {
21062 		vm_object_unlock(object);
21063 	}
21064 
21065 	if (real_map != map) {
21066 		vm_map_unlock(real_map);
21067 	}
21068 	vm_map_unlock(map);
21069 
21070 	return result;
21071 }
21072 
21073 #endif
21074 
21075 
21076 #if CONFIG_FREEZE
21077 
21078 
21079 extern struct freezer_context freezer_context_global;
21080 AbsoluteTime c_freezer_last_yield_ts = 0;
21081 
21082 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
21083 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
21084 
21085 kern_return_t
vm_map_freeze(task_t task,unsigned int * purgeable_count,unsigned int * wired_count,unsigned int * clean_count,unsigned int * dirty_count,unsigned int dirty_budget,unsigned int * shared_count,int * freezer_error_code,boolean_t eval_only)21086 vm_map_freeze(
21087 	task_t       task,
21088 	unsigned int *purgeable_count,
21089 	unsigned int *wired_count,
21090 	unsigned int *clean_count,
21091 	unsigned int *dirty_count,
21092 	unsigned int dirty_budget,
21093 	unsigned int *shared_count,
21094 	int          *freezer_error_code,
21095 	boolean_t    eval_only)
21096 {
21097 	vm_map_entry_t  entry2 = VM_MAP_ENTRY_NULL;
21098 	kern_return_t   kr = KERN_SUCCESS;
21099 	boolean_t       evaluation_phase = TRUE;
21100 	vm_object_t     cur_shared_object = NULL;
21101 	int             cur_shared_obj_ref_cnt = 0;
21102 	unsigned int    dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;
21103 
21104 	*purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;
21105 
21106 	/*
21107 	 * We need the exclusive lock here so that we can
21108 	 * block any page faults or lookups while we are
21109 	 * in the middle of freezing this vm map.
21110 	 */
21111 	vm_map_t map = task->map;
21112 
21113 	vm_map_lock(map);
21114 
21115 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
21116 
21117 	if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
21118 		if (vm_compressor_low_on_space()) {
21119 			*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
21120 		}
21121 
21122 		if (vm_swap_low_on_space()) {
21123 			*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
21124 		}
21125 
21126 		kr = KERN_NO_SPACE;
21127 		goto done;
21128 	}
21129 
21130 	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
21131 		/*
21132 		 * In-memory compressor backing the freezer. No disk.
21133 		 * So no need to do the evaluation phase.
21134 		 */
21135 		evaluation_phase = FALSE;
21136 
21137 		if (eval_only == TRUE) {
21138 			/*
21139 			 * We don't support 'eval_only' mode
21140 			 * in this non-swap config.
21141 			 */
21142 			*freezer_error_code = FREEZER_ERROR_GENERIC;
21143 			kr = KERN_INVALID_ARGUMENT;
21144 			goto done;
21145 		}
21146 
21147 		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
21148 		clock_get_uptime(&c_freezer_last_yield_ts);
21149 	}
21150 again:
21151 
21152 	for (entry2 = vm_map_first_entry(map);
21153 	    entry2 != vm_map_to_entry(map);
21154 	    entry2 = entry2->vme_next) {
21155 		vm_object_t src_object;
21156 
21157 		if (entry2->is_sub_map) {
21158 			continue;
21159 		}
21160 
21161 		src_object = VME_OBJECT(entry2);
21162 		if (!src_object ||
21163 		    src_object->phys_contiguous ||
21164 		    !src_object->internal) {
21165 			continue;
21166 		}
21167 
21168 		/* If eligible, scan the entry, moving eligible pages over to our parent object */
21169 
21170 		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
21171 			/*
21172 			 * We skip purgeable objects during evaluation phase only.
21173 			 * If we decide to freeze this process, we'll explicitly
21174 			 * purge these objects before we go around again with
21175 			 * 'evaluation_phase' set to FALSE.
21176 			 */
21177 
21178 			if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
21179 				/*
21180 				 * We want to purge objects that may not belong to this task but are mapped
21181 				 * in this task alone. Since we already purged this task's purgeable memory
21182 				 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
21183 				 * on this task's purgeable objects. Hence the check for only volatile objects.
21184 				 */
21185 				if (evaluation_phase == FALSE &&
21186 				    (src_object->purgable == VM_PURGABLE_VOLATILE) &&
21187 				    (src_object->ref_count == 1)) {
21188 					vm_object_lock(src_object);
21189 					vm_object_purge(src_object, 0);
21190 					vm_object_unlock(src_object);
21191 				}
21192 				continue;
21193 			}
21194 
21195 			/*
21196 			 * Pages belonging to this object could be swapped to disk.
21197 			 * Make sure it's not a shared object because we could end
21198 			 * up just bringing it back in again.
21199 			 *
21200 			 * We try to optimize somewhat by checking for objects that are mapped
21201 			 * more than once within our own map. But we don't do full searches,
21202 			 * we just look at the entries following our current entry.
21203 			 */
21204 
21205 			if (src_object->ref_count > 1) {
21206 				if (src_object != cur_shared_object) {
21207 					obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
21208 					dirty_shared_count += obj_pages_snapshot;
21209 
21210 					cur_shared_object = src_object;
21211 					cur_shared_obj_ref_cnt = 1;
21212 					continue;
21213 				} else {
21214 					cur_shared_obj_ref_cnt++;
21215 					if (src_object->ref_count == cur_shared_obj_ref_cnt) {
21216 						/*
21217 						 * Fall through to below and treat this object as private.
21218 						 * So deduct its pages from our shared total and add it to the
21219 						 * private total.
21220 						 */
21221 
21222 						dirty_shared_count -= obj_pages_snapshot;
21223 						dirty_private_count += obj_pages_snapshot;
21224 					} else {
21225 						continue;
21226 					}
21227 				}
21228 			}
21229 
21230 
21231 			if (src_object->ref_count == 1) {
21232 				dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
21233 			}
21234 
21235 			if (evaluation_phase == TRUE) {
21236 				continue;
21237 			}
21238 		}
21239 
21240 		uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
21241 		*wired_count += src_object->wired_page_count;
21242 
21243 		if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
21244 			if (vm_compressor_low_on_space()) {
21245 				*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
21246 			}
21247 
21248 			if (vm_swap_low_on_space()) {
21249 				*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
21250 			}
21251 
21252 			kr = KERN_NO_SPACE;
21253 			break;
21254 		}
21255 		if (paged_out_count >= dirty_budget) {
21256 			break;
21257 		}
21258 		dirty_budget -= paged_out_count;
21259 	}
21260 
21261 	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
21262 	if (evaluation_phase) {
21263 		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;
21264 
21265 		if (dirty_shared_count > shared_pages_threshold) {
21266 			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
21267 			kr = KERN_FAILURE;
21268 			goto done;
21269 		}
21270 
21271 		if (dirty_shared_count &&
21272 		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
21273 			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
21274 			kr = KERN_FAILURE;
21275 			goto done;
21276 		}
21277 
21278 		evaluation_phase = FALSE;
21279 		dirty_shared_count = dirty_private_count = 0;
21280 
21281 		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
21282 		clock_get_uptime(&c_freezer_last_yield_ts);
21283 
21284 		if (eval_only) {
21285 			kr = KERN_SUCCESS;
21286 			goto done;
21287 		}
21288 
21289 		vm_purgeable_purge_task_owned(task);
21290 
21291 		goto again;
21292 	} else {
21293 		kr = KERN_SUCCESS;
21294 	}
21295 
21296 done:
21297 	vm_map_unlock(map);
21298 
21299 	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
21300 		vm_object_compressed_freezer_done();
21301 	}
21302 	return kr;
21303 }
21304 
21305 #endif
21306 
21307 /*
21308  * vm_map_entry_should_cow_for_true_share:
21309  *
21310  * Determines if the map entry should be clipped and setup for copy-on-write
21311  * to avoid applying "true_share" to a large VM object when only a subset is
21312  * targeted.
21313  *
21314  * For now, we target only the map entries created for the Objective C
21315  * Garbage Collector, which initially have the following properties:
21316  *	- alias == VM_MEMORY_MALLOC
21317  *      - wired_count == 0
21318  *      - !needs_copy
21319  * and a VM object with:
21320  *      - internal
21321  *      - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
21322  *      - !true_share
21323  *      - vo_size == ANON_CHUNK_SIZE
21324  *
21325  * Only non-kernel map entries.
21326  */
21327 boolean_t
vm_map_entry_should_cow_for_true_share(vm_map_entry_t entry)21328 vm_map_entry_should_cow_for_true_share(
21329 	vm_map_entry_t  entry)
21330 {
21331 	vm_object_t     object;
21332 
21333 	if (entry->is_sub_map) {
21334 		/* entry does not point at a VM object */
21335 		return FALSE;
21336 	}
21337 
21338 	if (entry->needs_copy) {
21339 		/* already set for copy_on_write: done! */
21340 		return FALSE;
21341 	}
21342 
21343 	if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
21344 	    VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
21345 		/* not a malloc heap or Obj-C Garbage Collector heap */
21346 		return FALSE;
21347 	}
21348 
21349 	if (entry->wired_count) {
21350 		/* wired: can't change the map entry... */
21351 		vm_counters.should_cow_but_wired++;
21352 		return FALSE;
21353 	}
21354 
21355 	object = VME_OBJECT(entry);
21356 
21357 	if (object == VM_OBJECT_NULL) {
21358 		/* no object yet... */
21359 		return FALSE;
21360 	}
21361 
21362 	if (!object->internal) {
21363 		/* not an internal object */
21364 		return FALSE;
21365 	}
21366 
21367 	if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
21368 		/* not the default copy strategy */
21369 		return FALSE;
21370 	}
21371 
21372 	if (object->true_share) {
21373 		/* already true_share: too late to avoid it */
21374 		return FALSE;
21375 	}
21376 
21377 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
21378 	    object->vo_size != ANON_CHUNK_SIZE) {
21379 		/* ... not an object created for the ObjC Garbage Collector */
21380 		return FALSE;
21381 	}
21382 
21383 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
21384 	    object->vo_size != 2048 * 4096) {
21385 		/* ... not a "MALLOC_SMALL" heap */
21386 		return FALSE;
21387 	}
21388 
21389 	/*
21390 	 * All the criteria match: we have a large object being targeted for "true_share".
21391 	 * To limit the adverse side-effects linked with "true_share", tell the caller to
21392 	 * try and avoid setting up the entire object for "true_share" by clipping the
21393 	 * targeted range and setting it up for copy-on-write.
21394 	 */
21395 	return TRUE;
21396 }
21397 
21398 uint64_t vm_map_range_overflows_count = 0;
21399 TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE);
21400 bool
vm_map_range_overflows(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size)21401 vm_map_range_overflows(
21402 	vm_map_t map,
21403 	vm_map_offset_t addr,
21404 	vm_map_size_t size)
21405 {
21406 	vm_map_offset_t start, end, sum;
21407 	vm_map_offset_t pgmask;
21408 
21409 	if (size == 0) {
21410 		/* empty range -> no overflow */
21411 		return false;
21412 	}
21413 	pgmask = vm_map_page_mask(map);
21414 	start = vm_map_trunc_page_mask(addr, pgmask);
21415 	end = vm_map_round_page_mask(addr + size, pgmask);
21416 	if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) {
21417 		vm_map_range_overflows_count++;
21418 		if (vm_map_range_overflows_log) {
21419 			printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n",
21420 			    proc_selfpid(),
21421 			    proc_best_name(current_proc()),
21422 			    (uint64_t)addr,
21423 			    (uint64_t)size,
21424 			    (uint64_t)pgmask);
21425 		}
21426 		DTRACE_VM4(vm_map_range_overflows,
21427 		    vm_map_t, map,
21428 		    uint32_t, pgmask,
21429 		    uint64_t, (uint64_t)addr,
21430 		    uint64_t, (uint64_t)size);
21431 		return true;
21432 	}
21433 	return false;
21434 }
21435 
21436 vm_map_offset_t
vm_map_round_page_mask(vm_map_offset_t offset,vm_map_offset_t mask)21437 vm_map_round_page_mask(
21438 	vm_map_offset_t offset,
21439 	vm_map_offset_t mask)
21440 {
21441 	return VM_MAP_ROUND_PAGE(offset, mask);
21442 }
21443 
21444 vm_map_offset_t
vm_map_trunc_page_mask(vm_map_offset_t offset,vm_map_offset_t mask)21445 vm_map_trunc_page_mask(
21446 	vm_map_offset_t offset,
21447 	vm_map_offset_t mask)
21448 {
21449 	return VM_MAP_TRUNC_PAGE(offset, mask);
21450 }
21451 
21452 boolean_t
vm_map_page_aligned(vm_map_offset_t offset,vm_map_offset_t mask)21453 vm_map_page_aligned(
21454 	vm_map_offset_t offset,
21455 	vm_map_offset_t mask)
21456 {
21457 	return ((offset) & mask) == 0;
21458 }
21459 
21460 int
vm_map_page_shift(vm_map_t map)21461 vm_map_page_shift(
21462 	vm_map_t map)
21463 {
21464 	return VM_MAP_PAGE_SHIFT(map);
21465 }
21466 
21467 int
vm_map_page_size(vm_map_t map)21468 vm_map_page_size(
21469 	vm_map_t map)
21470 {
21471 	return VM_MAP_PAGE_SIZE(map);
21472 }
21473 
21474 vm_map_offset_t
vm_map_page_mask(vm_map_t map)21475 vm_map_page_mask(
21476 	vm_map_t map)
21477 {
21478 	return VM_MAP_PAGE_MASK(map);
21479 }
21480 
21481 kern_return_t
vm_map_set_page_shift(vm_map_t map,int pageshift)21482 vm_map_set_page_shift(
21483 	vm_map_t        map,
21484 	int             pageshift)
21485 {
21486 	if (map->hdr.nentries != 0) {
21487 		/* too late to change page size */
21488 		return KERN_FAILURE;
21489 	}
21490 
21491 	map->hdr.page_shift = (uint16_t)pageshift;
21492 
21493 	return KERN_SUCCESS;
21494 }
21495 
21496 kern_return_t
vm_map_query_volatile(vm_map_t map,mach_vm_size_t * volatile_virtual_size_p,mach_vm_size_t * volatile_resident_size_p,mach_vm_size_t * volatile_compressed_size_p,mach_vm_size_t * volatile_pmap_size_p,mach_vm_size_t * volatile_compressed_pmap_size_p)21497 vm_map_query_volatile(
21498 	vm_map_t        map,
21499 	mach_vm_size_t  *volatile_virtual_size_p,
21500 	mach_vm_size_t  *volatile_resident_size_p,
21501 	mach_vm_size_t  *volatile_compressed_size_p,
21502 	mach_vm_size_t  *volatile_pmap_size_p,
21503 	mach_vm_size_t  *volatile_compressed_pmap_size_p)
21504 {
21505 	mach_vm_size_t  volatile_virtual_size;
21506 	mach_vm_size_t  volatile_resident_count;
21507 	mach_vm_size_t  volatile_compressed_count;
21508 	mach_vm_size_t  volatile_pmap_count;
21509 	mach_vm_size_t  volatile_compressed_pmap_count;
21510 	mach_vm_size_t  resident_count;
21511 	vm_map_entry_t  entry;
21512 	vm_object_t     object;
21513 
21514 	/* map should be locked by caller */
21515 
21516 	volatile_virtual_size = 0;
21517 	volatile_resident_count = 0;
21518 	volatile_compressed_count = 0;
21519 	volatile_pmap_count = 0;
21520 	volatile_compressed_pmap_count = 0;
21521 
21522 	for (entry = vm_map_first_entry(map);
21523 	    entry != vm_map_to_entry(map);
21524 	    entry = entry->vme_next) {
21525 		mach_vm_size_t  pmap_resident_bytes, pmap_compressed_bytes;
21526 
21527 		if (entry->is_sub_map) {
21528 			continue;
21529 		}
21530 		if (!(entry->protection & VM_PROT_WRITE)) {
21531 			continue;
21532 		}
21533 		object = VME_OBJECT(entry);
21534 		if (object == VM_OBJECT_NULL) {
21535 			continue;
21536 		}
21537 		if (object->purgable != VM_PURGABLE_VOLATILE &&
21538 		    object->purgable != VM_PURGABLE_EMPTY) {
21539 			continue;
21540 		}
21541 		if (VME_OFFSET(entry)) {
21542 			/*
21543 			 * If the map entry has been split and the object now
21544 			 * appears several times in the VM map, we don't want
21545 			 * to count the object's resident_page_count more than
21546 			 * once.  We count it only for the first one, starting
21547 			 * at offset 0 and ignore the other VM map entries.
21548 			 */
21549 			continue;
21550 		}
21551 		resident_count = object->resident_page_count;
21552 		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
21553 			resident_count = 0;
21554 		} else {
21555 			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
21556 		}
21557 
21558 		volatile_virtual_size += entry->vme_end - entry->vme_start;
21559 		volatile_resident_count += resident_count;
21560 		if (object->pager) {
21561 			volatile_compressed_count +=
21562 			    vm_compressor_pager_get_count(object->pager);
21563 		}
21564 		pmap_compressed_bytes = 0;
21565 		pmap_resident_bytes =
21566 		    pmap_query_resident(map->pmap,
21567 		    entry->vme_start,
21568 		    entry->vme_end,
21569 		    &pmap_compressed_bytes);
21570 		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
21571 		volatile_compressed_pmap_count += (pmap_compressed_bytes
21572 		    / PAGE_SIZE);
21573 	}
21574 
21575 	/* map is still locked on return */
21576 
21577 	*volatile_virtual_size_p = volatile_virtual_size;
21578 	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
21579 	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
21580 	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
21581 	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;
21582 
21583 	return KERN_SUCCESS;
21584 }
21585 
21586 void
vm_map_sizes(vm_map_t map,vm_map_size_t * psize,vm_map_size_t * pfree,vm_map_size_t * plargest_free)21587 vm_map_sizes(vm_map_t map,
21588     vm_map_size_t * psize,
21589     vm_map_size_t * pfree,
21590     vm_map_size_t * plargest_free)
21591 {
21592 	vm_map_entry_t  entry;
21593 	vm_map_offset_t prev;
21594 	vm_map_size_t   free, total_free, largest_free;
21595 	boolean_t       end;
21596 
21597 	if (!map) {
21598 		*psize = *pfree = *plargest_free = 0;
21599 		return;
21600 	}
21601 	total_free = largest_free = 0;
21602 
21603 	vm_map_lock_read(map);
21604 	if (psize) {
21605 		*psize = map->max_offset - map->min_offset;
21606 	}
21607 
21608 	prev = map->min_offset;
21609 	for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
21610 		end = (entry == vm_map_to_entry(map));
21611 
21612 		if (end) {
21613 			free = entry->vme_end   - prev;
21614 		} else {
21615 			free = entry->vme_start - prev;
21616 		}
21617 
21618 		total_free += free;
21619 		if (free > largest_free) {
21620 			largest_free = free;
21621 		}
21622 
21623 		if (end) {
21624 			break;
21625 		}
21626 		prev = entry->vme_end;
21627 	}
21628 	vm_map_unlock_read(map);
21629 	if (pfree) {
21630 		*pfree = total_free;
21631 	}
21632 	if (plargest_free) {
21633 		*plargest_free = largest_free;
21634 	}
21635 }
21636 
21637 #if VM_SCAN_FOR_SHADOW_CHAIN
21638 int vm_map_shadow_max(vm_map_t map);
21639 int
vm_map_shadow_max(vm_map_t map)21640 vm_map_shadow_max(
21641 	vm_map_t map)
21642 {
21643 	int             shadows, shadows_max;
21644 	vm_map_entry_t  entry;
21645 	vm_object_t     object, next_object;
21646 
21647 	if (map == NULL) {
21648 		return 0;
21649 	}
21650 
21651 	shadows_max = 0;
21652 
21653 	vm_map_lock_read(map);
21654 
21655 	for (entry = vm_map_first_entry(map);
21656 	    entry != vm_map_to_entry(map);
21657 	    entry = entry->vme_next) {
21658 		if (entry->is_sub_map) {
21659 			continue;
21660 		}
21661 		object = VME_OBJECT(entry);
21662 		if (object == NULL) {
21663 			continue;
21664 		}
21665 		vm_object_lock_shared(object);
21666 		for (shadows = 0;
21667 		    object->shadow != NULL;
21668 		    shadows++, object = next_object) {
21669 			next_object = object->shadow;
21670 			vm_object_lock_shared(next_object);
21671 			vm_object_unlock(object);
21672 		}
21673 		vm_object_unlock(object);
21674 		if (shadows > shadows_max) {
21675 			shadows_max = shadows;
21676 		}
21677 	}
21678 
21679 	vm_map_unlock_read(map);
21680 
21681 	return shadows_max;
21682 }
21683 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
21684 
21685 void
vm_commit_pagezero_status(vm_map_t lmap)21686 vm_commit_pagezero_status(vm_map_t lmap)
21687 {
21688 	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
21689 }
21690 
21691 #if XNU_TARGET_OS_OSX
21692 void
vm_map_set_high_start(vm_map_t map,vm_map_offset_t high_start)21693 vm_map_set_high_start(
21694 	vm_map_t        map,
21695 	vm_map_offset_t high_start)
21696 {
21697 	map->vmmap_high_start = high_start;
21698 }
21699 #endif /* XNU_TARGET_OS_OSX */
21700 
21701 #if CODE_SIGNING_MONITOR
21702 
21703 kern_return_t
vm_map_entry_cs_associate(vm_map_t map,vm_map_entry_t entry,vm_map_kernel_flags_t vmk_flags)21704 vm_map_entry_cs_associate(
21705 	vm_map_t                map,
21706 	vm_map_entry_t          entry,
21707 	vm_map_kernel_flags_t   vmk_flags)
21708 {
21709 	vm_object_t cs_object, cs_shadow, backing_object;
21710 	vm_object_offset_t cs_offset, backing_offset;
21711 	void *cs_blobs;
21712 	struct vnode *cs_vnode;
21713 	kern_return_t cs_ret;
21714 
21715 	if (map->pmap == NULL ||
21716 	    entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
21717 	    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
21718 	    VME_OBJECT(entry) == VM_OBJECT_NULL) {
21719 		return KERN_SUCCESS;
21720 	}
21721 
21722 	if (!(entry->protection & VM_PROT_EXECUTE)) {
21723 		/*
21724 		 * This memory region is not executable, so the code-signing
21725 		 * monitor would usually not care about it...
21726 		 */
21727 		if (vmk_flags.vmkf_remap_prot_copy &&
21728 		    (entry->max_protection & VM_PROT_EXECUTE)) {
21729 			/*
21730 			 * ... except if the memory region is being remapped
21731 			 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
21732 			 * which is what a debugger or dtrace would be doing
21733 			 * to prepare to modify an executable page to insert
21734 			 * a breakpoint or activate a probe.
21735 			 * In that case, fall through so that we can mark
21736 			 * this region as being "debugged" and no longer
21737 			 * strictly code-signed.
21738 			 */
21739 		} else {
21740 			/*
21741 			 * Really not executable, so no need to tell the
21742 			 * code-signing monitor.
21743 			 */
21744 			return KERN_SUCCESS;
21745 		}
21746 	}
21747 
21748 	vm_map_lock_assert_exclusive(map);
21749 
21750 	if (entry->used_for_jit) {
21751 		cs_ret = csm_associate_jit_region(
21752 			map->pmap,
21753 			entry->vme_start,
21754 			entry->vme_end - entry->vme_start);
21755 		goto done;
21756 	}
21757 
21758 	if (vmk_flags.vmkf_remap_prot_copy) {
21759 		cs_ret = csm_associate_debug_region(
21760 			map->pmap,
21761 			entry->vme_start,
21762 			entry->vme_end - entry->vme_start);
21763 		if (cs_ret == KERN_SUCCESS) {
21764 			entry->vme_xnu_user_debug = TRUE;
21765 		}
21766 #if DEVELOPMENT || DEBUG
21767 		if (vm_log_xnu_user_debug) {
21768 			printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ]  vme_xnu_user_debug=%d cs_ret %d\n",
21769 			    proc_selfpid(),
21770 			    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
21771 			    __FUNCTION__, __LINE__,
21772 			    map, entry,
21773 			    (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
21774 			    entry->vme_xnu_user_debug,
21775 			    cs_ret);
21776 		}
21777 #endif /* DEVELOPMENT || DEBUG */
21778 		goto done;
21779 	}
21780 
21781 	cs_object = VME_OBJECT(entry);
21782 	vm_object_lock_shared(cs_object);
21783 	cs_offset = VME_OFFSET(entry);
21784 
21785 	/* find the VM object backed by the code-signed vnode */
21786 	for (;;) {
21787 		/* go to the bottom of cs_object's shadow chain */
21788 		for (;
21789 		    cs_object->shadow != VM_OBJECT_NULL;
21790 		    cs_object = cs_shadow) {
21791 			cs_shadow = cs_object->shadow;
21792 			cs_offset += cs_object->vo_shadow_offset;
21793 			vm_object_lock_shared(cs_shadow);
21794 			vm_object_unlock(cs_object);
21795 		}
21796 		if (cs_object->internal ||
21797 		    cs_object->pager == MEMORY_OBJECT_NULL) {
21798 			vm_object_unlock(cs_object);
21799 			return KERN_SUCCESS;
21800 		}
21801 
21802 		cs_offset += cs_object->paging_offset;
21803 
21804 		/*
21805 		 * cs_object could be backed by a:
21806 		 *      vnode_pager
21807 		 *	apple_protect_pager
21808 		 *      shared_region_pager
21809 		 *	fourk_pager (multiple backing objects -> fail?)
21810 		 * ask the pager if it has a backing VM object
21811 		 */
21812 		if (!memory_object_backing_object(cs_object->pager,
21813 		    cs_offset,
21814 		    &backing_object,
21815 		    &backing_offset)) {
21816 			/* no backing object: cs_object is it */
21817 			break;
21818 		}
21819 
21820 		/* look down the backing object's shadow chain */
21821 		vm_object_lock_shared(backing_object);
21822 		vm_object_unlock(cs_object);
21823 		cs_object = backing_object;
21824 		cs_offset = backing_offset;
21825 	}
21826 
21827 	cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
21828 	if (cs_vnode == NULL) {
21829 		/* no vnode, no code signatures to associate */
21830 		cs_ret = KERN_SUCCESS;
21831 	} else {
21832 		cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
21833 		    &cs_blobs);
21834 		assert(cs_ret == KERN_SUCCESS);
21835 		cs_ret = cs_associate_blob_with_mapping(map->pmap,
21836 		    entry->vme_start,
21837 		    (entry->vme_end - entry->vme_start),
21838 		    cs_offset,
21839 		    cs_blobs);
21840 	}
21841 	vm_object_unlock(cs_object);
21842 	cs_object = VM_OBJECT_NULL;
21843 
21844 done:
21845 	if (cs_ret == KERN_SUCCESS) {
21846 		DTRACE_VM2(vm_map_entry_cs_associate_success,
21847 		    vm_map_offset_t, entry->vme_start,
21848 		    vm_map_offset_t, entry->vme_end);
21849 		if (vm_map_executable_immutable) {
21850 			/*
21851 			 * Prevent this executable
21852 			 * mapping from being unmapped
21853 			 * or modified.
21854 			 */
21855 			entry->vme_permanent = TRUE;
21856 		}
21857 		/*
21858 		 * pmap says it will validate the
21859 		 * code-signing validity of pages
21860 		 * faulted in via this mapping, so
21861 		 * this map entry should be marked so
21862 		 * that vm_fault() bypasses code-signing
21863 		 * validation for faults coming through
21864 		 * this mapping.
21865 		 */
21866 		entry->csm_associated = TRUE;
21867 	} else if (cs_ret == KERN_NOT_SUPPORTED) {
21868 		/*
21869 		 * pmap won't check the code-signing
21870 		 * validity of pages faulted in via
21871 		 * this mapping, so VM should keep
21872 		 * doing it.
21873 		 */
21874 		DTRACE_VM3(vm_map_entry_cs_associate_off,
21875 		    vm_map_offset_t, entry->vme_start,
21876 		    vm_map_offset_t, entry->vme_end,
21877 		    int, cs_ret);
21878 	} else {
21879 		/*
21880 		 * A real error: do not allow
21881 		 * execution in this mapping.
21882 		 */
21883 		DTRACE_VM3(vm_map_entry_cs_associate_failure,
21884 		    vm_map_offset_t, entry->vme_start,
21885 		    vm_map_offset_t, entry->vme_end,
21886 		    int, cs_ret);
21887 		if (vmk_flags.vmkf_overwrite_immutable) {
21888 			/*
21889 			 * We can get here when we remap an apple_protect pager
21890 			 * on top of an already cs_associated executable mapping
21891 			 * with the same code signatures, so we don't want to
21892 			 * lose VM_PROT_EXECUTE in that case...
21893 			 */
21894 		} else {
21895 			entry->protection &= ~VM_PROT_ALLEXEC;
21896 			entry->max_protection &= ~VM_PROT_ALLEXEC;
21897 		}
21898 	}
21899 
21900 	return cs_ret;
21901 }
21902 
21903 #endif /* CODE_SIGNING_MONITOR */
21904 
21905 /*
21906  * FORKED CORPSE FOOTPRINT
21907  *
21908  * A forked corpse gets a copy of the original VM map but its pmap is mostly
21909  * empty since it never ran and never got to fault in any pages.
21910  * Collecting footprint info (via "sysctl vm.self_region_footprint") for
21911  * a forked corpse would therefore return very little information.
21912  *
21913  * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
21914  * to vm_map_fork() to collect footprint information from the original VM map
21915  * and its pmap, and store it in the forked corpse's VM map.  That information
21916  * is stored in place of the VM map's "hole list" since we'll never need to
21917  * lookup for holes in the corpse's map.
21918  *
21919  * The corpse's footprint info looks like this:
21920  *
21921  * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
21922  * as follows:
21923  *                     +---------------------------------------+
21924  *            header-> | cf_size                               |
21925  *                     +-------------------+-------------------+
21926  *                     | cf_last_region    | cf_last_zeroes    |
21927  *                     +-------------------+-------------------+
21928  *           region1-> | cfr_vaddr                             |
21929  *                     +-------------------+-------------------+
21930  *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
21931  *                     +---------------------------------------+
21932  *                     | d4 | d5 | ...                         |
21933  *                     +---------------------------------------+
21934  *                     | ...                                   |
21935  *                     +-------------------+-------------------+
21936  *                     | dy | dz | na | na | cfr_vaddr...      | <-region2
21937  *                     +-------------------+-------------------+
21938  *                     | cfr_vaddr (ctd)   | cfr_num_pages     |
21939  *                     +---------------------------------------+
21940  *                     | d0 | d1 ...                           |
21941  *                     +---------------------------------------+
21942  *                       ...
21943  *                     +---------------------------------------+
21944  *       last region-> | cfr_vaddr                             |
21945  *                     +---------------------------------------+
21946  *                     + cfr_num_pages     | d0 | d1 | d2 | d3 |
21947  *                     +---------------------------------------+
21948  *                       ...
21949  *                     +---------------------------------------+
21950  *                     | dx | dy | dz | na | na | na | na | na |
21951  *                     +---------------------------------------+
21952  *
21953  * where:
21954  *      cf_size:	total size of the buffer (rounded to page size)
21955  *      cf_last_region:	offset in the buffer of the last "region" sub-header
21956  *	cf_last_zeroes: number of trailing "zero" dispositions at the end
21957  *			of last region
21958  *	cfr_vaddr:	virtual address of the start of the covered "region"
21959  *	cfr_num_pages:	number of pages in the covered "region"
21960  *	d*:		disposition of the page at that virtual address
21961  * Regions in the buffer are word-aligned.
21962  *
21963  * We estimate the size of the buffer based on the number of memory regions
21964  * and the virtual size of the address space.  While copying each memory region
21965  * during vm_map_fork(), we also collect the footprint info for that region
21966  * and store it in the buffer, packing it as much as possible (coalescing
21967  * contiguous memory regions to avoid having too many region headers and
21968  * avoiding long streaks of "zero" page dispositions by splitting footprint
21969  * "regions", so the number of regions in the footprint buffer might not match
21970  * the number of memory regions in the address space.
21971  *
21972  * We also have to copy the original task's "nonvolatile" ledgers since that's
21973  * part of the footprint and will need to be reported to any tool asking for
21974  * the footprint information of the forked corpse.
21975  */
21976 
21977 uint64_t vm_map_corpse_footprint_count = 0;
21978 uint64_t vm_map_corpse_footprint_size_avg = 0;
21979 uint64_t vm_map_corpse_footprint_size_max = 0;
21980 uint64_t vm_map_corpse_footprint_full = 0;
21981 uint64_t vm_map_corpse_footprint_no_buf = 0;
21982 
21983 struct vm_map_corpse_footprint_header {
21984 	vm_size_t       cf_size;        /* allocated buffer size */
21985 	uint32_t        cf_last_region; /* offset of last region in buffer */
21986 	union {
21987 		uint32_t cfu_last_zeroes; /* during creation:
21988 		                           * number of "zero" dispositions at
21989 		                           * end of last region */
21990 		uint32_t cfu_hint_region; /* during lookup:
21991 		                           * offset of last looked up region */
21992 #define cf_last_zeroes cfu.cfu_last_zeroes
21993 #define cf_hint_region cfu.cfu_hint_region
21994 	} cfu;
21995 };
21996 typedef uint8_t cf_disp_t;
21997 struct vm_map_corpse_footprint_region {
21998 	vm_map_offset_t cfr_vaddr;      /* region start virtual address */
21999 	uint32_t        cfr_num_pages;  /* number of pages in this "region" */
22000 	cf_disp_t   cfr_disposition[0]; /* disposition of each page */
22001 } __attribute__((packed));
22002 
22003 static cf_disp_t
vm_page_disposition_to_cf_disp(int disposition)22004 vm_page_disposition_to_cf_disp(
22005 	int disposition)
22006 {
22007 	assert(sizeof(cf_disp_t) == 1);
22008 	/* relocate bits that don't fit in a "uint8_t" */
22009 	if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
22010 		disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
22011 	}
22012 	/* cast gets rid of extra bits */
22013 	return (cf_disp_t) disposition;
22014 }
22015 
22016 static int
vm_page_cf_disp_to_disposition(cf_disp_t cf_disp)22017 vm_page_cf_disp_to_disposition(
22018 	cf_disp_t cf_disp)
22019 {
22020 	int disposition;
22021 
22022 	assert(sizeof(cf_disp_t) == 1);
22023 	disposition = (int) cf_disp;
22024 	/* move relocated bits back in place */
22025 	if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
22026 		disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
22027 		disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
22028 	}
22029 	return disposition;
22030 }
22031 
22032 /*
22033  * vm_map_corpse_footprint_new_region:
22034  *      closes the current footprint "region" and creates a new one
22035  *
22036  * Returns NULL if there's not enough space in the buffer for a new region.
22037  */
22038 static struct vm_map_corpse_footprint_region *
vm_map_corpse_footprint_new_region(struct vm_map_corpse_footprint_header * footprint_header)22039 vm_map_corpse_footprint_new_region(
22040 	struct vm_map_corpse_footprint_header *footprint_header)
22041 {
22042 	uintptr_t       footprint_edge;
22043 	uint32_t        new_region_offset;
22044 	struct vm_map_corpse_footprint_region *footprint_region;
22045 	struct vm_map_corpse_footprint_region *new_footprint_region;
22046 
22047 	footprint_edge = ((uintptr_t)footprint_header +
22048 	    footprint_header->cf_size);
22049 	footprint_region = ((struct vm_map_corpse_footprint_region *)
22050 	    ((char *)footprint_header +
22051 	    footprint_header->cf_last_region));
22052 	assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
22053 	    footprint_edge);
22054 
22055 	/* get rid of trailing zeroes in the last region */
22056 	assert(footprint_region->cfr_num_pages >=
22057 	    footprint_header->cf_last_zeroes);
22058 	footprint_region->cfr_num_pages -=
22059 	    footprint_header->cf_last_zeroes;
22060 	footprint_header->cf_last_zeroes = 0;
22061 
22062 	/* reuse this region if it's now empty */
22063 	if (footprint_region->cfr_num_pages == 0) {
22064 		return footprint_region;
22065 	}
22066 
22067 	/* compute offset of new region */
22068 	new_region_offset = footprint_header->cf_last_region;
22069 	new_region_offset += sizeof(*footprint_region);
22070 	new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
22071 	new_region_offset = roundup(new_region_offset, sizeof(int));
22072 
22073 	/* check if we're going over the edge */
22074 	if (((uintptr_t)footprint_header +
22075 	    new_region_offset +
22076 	    sizeof(*footprint_region)) >=
22077 	    footprint_edge) {
22078 		/* over the edge: no new region */
22079 		return NULL;
22080 	}
22081 
22082 	/* adjust offset of last region in header */
22083 	footprint_header->cf_last_region = new_region_offset;
22084 
22085 	new_footprint_region = (struct vm_map_corpse_footprint_region *)
22086 	    ((char *)footprint_header +
22087 	    footprint_header->cf_last_region);
22088 	new_footprint_region->cfr_vaddr = 0;
22089 	new_footprint_region->cfr_num_pages = 0;
22090 	/* caller needs to initialize new region */
22091 
22092 	return new_footprint_region;
22093 }
22094 
22095 /*
22096  * vm_map_corpse_footprint_collect:
22097  *	collect footprint information for "old_entry" in "old_map" and
22098  *	stores it in "new_map"'s vmmap_footprint_info.
22099  */
22100 kern_return_t
vm_map_corpse_footprint_collect(vm_map_t old_map,vm_map_entry_t old_entry,vm_map_t new_map)22101 vm_map_corpse_footprint_collect(
22102 	vm_map_t        old_map,
22103 	vm_map_entry_t  old_entry,
22104 	vm_map_t        new_map)
22105 {
22106 	vm_map_offset_t va;
22107 	kern_return_t   kr;
22108 	struct vm_map_corpse_footprint_header *footprint_header;
22109 	struct vm_map_corpse_footprint_region *footprint_region;
22110 	struct vm_map_corpse_footprint_region *new_footprint_region;
22111 	cf_disp_t       *next_disp_p;
22112 	uintptr_t       footprint_edge;
22113 	uint32_t        num_pages_tmp;
22114 	int             effective_page_size;
22115 
22116 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
22117 
22118 	va = old_entry->vme_start;
22119 
22120 	vm_map_lock_assert_exclusive(old_map);
22121 	vm_map_lock_assert_exclusive(new_map);
22122 
22123 	assert(new_map->has_corpse_footprint);
22124 	assert(!old_map->has_corpse_footprint);
22125 	if (!new_map->has_corpse_footprint ||
22126 	    old_map->has_corpse_footprint) {
22127 		/*
22128 		 * This can only transfer footprint info from a
22129 		 * map with a live pmap to a map with a corpse footprint.
22130 		 */
22131 		return KERN_NOT_SUPPORTED;
22132 	}
22133 
22134 	if (new_map->vmmap_corpse_footprint == NULL) {
22135 		vm_offset_t     buf;
22136 		vm_size_t       buf_size;
22137 
22138 		buf = 0;
22139 		buf_size = (sizeof(*footprint_header) +
22140 		    (old_map->hdr.nentries
22141 		    *
22142 		    (sizeof(*footprint_region) +
22143 		    +3))            /* potential alignment for each region */
22144 		    +
22145 		    ((old_map->size / effective_page_size)
22146 		    *
22147 		    sizeof(cf_disp_t)));      /* disposition for each page */
22148 //		printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
22149 		buf_size = round_page(buf_size);
22150 
22151 		/* limit buffer to 1 page to validate overflow detection */
22152 //		buf_size = PAGE_SIZE;
22153 
22154 		/* limit size to a somewhat sane amount */
22155 #if XNU_TARGET_OS_OSX
22156 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (8*1024*1024)   /* 8MB */
22157 #else /* XNU_TARGET_OS_OSX */
22158 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (256*1024)      /* 256KB */
22159 #endif /* XNU_TARGET_OS_OSX */
22160 		if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
22161 			buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
22162 		}
22163 
22164 		/*
22165 		 * Allocate the pageable buffer (with a trailing guard page).
22166 		 * It will be zero-filled on demand.
22167 		 */
22168 		kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
22169 		    KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
22170 		    VM_KERN_MEMORY_DIAG);
22171 		if (kr != KERN_SUCCESS) {
22172 			vm_map_corpse_footprint_no_buf++;
22173 			return kr;
22174 		}
22175 
22176 		/* initialize header and 1st region */
22177 		footprint_header = (struct vm_map_corpse_footprint_header *)buf;
22178 		new_map->vmmap_corpse_footprint = footprint_header;
22179 
22180 		footprint_header->cf_size = buf_size;
22181 		footprint_header->cf_last_region =
22182 		    sizeof(*footprint_header);
22183 		footprint_header->cf_last_zeroes = 0;
22184 
22185 		footprint_region = (struct vm_map_corpse_footprint_region *)
22186 		    ((char *)footprint_header +
22187 		    footprint_header->cf_last_region);
22188 		footprint_region->cfr_vaddr = 0;
22189 		footprint_region->cfr_num_pages = 0;
22190 	} else {
22191 		/* retrieve header and last region */
22192 		footprint_header = (struct vm_map_corpse_footprint_header *)
22193 		    new_map->vmmap_corpse_footprint;
22194 		footprint_region = (struct vm_map_corpse_footprint_region *)
22195 		    ((char *)footprint_header +
22196 		    footprint_header->cf_last_region);
22197 	}
22198 	footprint_edge = ((uintptr_t)footprint_header +
22199 	    footprint_header->cf_size);
22200 
22201 	if ((footprint_region->cfr_vaddr +
22202 	    (((vm_map_offset_t)footprint_region->cfr_num_pages) *
22203 	    effective_page_size))
22204 	    != old_entry->vme_start) {
22205 		uint64_t num_pages_delta, num_pages_delta_size;
22206 		uint32_t region_offset_delta_size;
22207 
22208 		/*
22209 		 * Not the next contiguous virtual address:
22210 		 * start a new region or store "zero" dispositions for
22211 		 * the missing pages?
22212 		 */
22213 		/* size of gap in actual page dispositions */
22214 		num_pages_delta = ((old_entry->vme_start -
22215 		    footprint_region->cfr_vaddr) / effective_page_size)
22216 		    - footprint_region->cfr_num_pages;
22217 		num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
22218 		/* size of gap as a new footprint region header */
22219 		region_offset_delta_size =
22220 		    (sizeof(*footprint_region) +
22221 		    roundup(((footprint_region->cfr_num_pages -
22222 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
22223 		    sizeof(int)) -
22224 		    ((footprint_region->cfr_num_pages -
22225 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
22226 //		printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
22227 		if (region_offset_delta_size < num_pages_delta_size ||
22228 		    os_add3_overflow(footprint_region->cfr_num_pages,
22229 		    (uint32_t) num_pages_delta,
22230 		    1,
22231 		    &num_pages_tmp)) {
22232 			/*
22233 			 * Storing data for this gap would take more space
22234 			 * than inserting a new footprint region header:
22235 			 * let's start a new region and save space. If it's a
22236 			 * tie, let's avoid using a new region, since that
22237 			 * would require more region hops to find the right
22238 			 * range during lookups.
22239 			 *
22240 			 * If the current region's cfr_num_pages would overflow
22241 			 * if we added "zero" page dispositions for the gap,
22242 			 * no choice but to start a new region.
22243 			 */
22244 //			printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
22245 			new_footprint_region =
22246 			    vm_map_corpse_footprint_new_region(footprint_header);
22247 			/* check that we're not going over the edge */
22248 			if (new_footprint_region == NULL) {
22249 				goto over_the_edge;
22250 			}
22251 			footprint_region = new_footprint_region;
22252 			/* initialize new region as empty */
22253 			footprint_region->cfr_vaddr = old_entry->vme_start;
22254 			footprint_region->cfr_num_pages = 0;
22255 		} else {
22256 			/*
22257 			 * Store "zero" page dispositions for the missing
22258 			 * pages.
22259 			 */
22260 //			printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
22261 			for (; num_pages_delta > 0; num_pages_delta--) {
22262 				next_disp_p = (cf_disp_t *)
22263 				    ((uintptr_t) footprint_region +
22264 				    sizeof(*footprint_region));
22265 				next_disp_p += footprint_region->cfr_num_pages;
22266 				/* check that we're not going over the edge */
22267 				if ((uintptr_t)next_disp_p >= footprint_edge) {
22268 					goto over_the_edge;
22269 				}
22270 				/* store "zero" disposition for this gap page */
22271 				footprint_region->cfr_num_pages++;
22272 				*next_disp_p = (cf_disp_t) 0;
22273 				footprint_header->cf_last_zeroes++;
22274 			}
22275 		}
22276 	}
22277 
22278 	for (va = old_entry->vme_start;
22279 	    va < old_entry->vme_end;
22280 	    va += effective_page_size) {
22281 		int             disposition;
22282 		cf_disp_t       cf_disp;
22283 
22284 		vm_map_footprint_query_page_info(old_map,
22285 		    old_entry,
22286 		    va,
22287 		    &disposition);
22288 		cf_disp = vm_page_disposition_to_cf_disp(disposition);
22289 
22290 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
22291 
22292 		if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
22293 			/*
22294 			 * Ignore "zero" dispositions at start of
22295 			 * region: just move start of region.
22296 			 */
22297 			footprint_region->cfr_vaddr += effective_page_size;
22298 			continue;
22299 		}
22300 
22301 		/* would region's cfr_num_pages overflow? */
22302 		if (os_add_overflow(footprint_region->cfr_num_pages, 1,
22303 		    &num_pages_tmp)) {
22304 			/* overflow: create a new region */
22305 			new_footprint_region =
22306 			    vm_map_corpse_footprint_new_region(
22307 				footprint_header);
22308 			if (new_footprint_region == NULL) {
22309 				goto over_the_edge;
22310 			}
22311 			footprint_region = new_footprint_region;
22312 			footprint_region->cfr_vaddr = va;
22313 			footprint_region->cfr_num_pages = 0;
22314 		}
22315 
22316 		next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
22317 		    sizeof(*footprint_region));
22318 		next_disp_p += footprint_region->cfr_num_pages;
22319 		/* check that we're not going over the edge */
22320 		if ((uintptr_t)next_disp_p >= footprint_edge) {
22321 			goto over_the_edge;
22322 		}
22323 		/* store this dispostion */
22324 		*next_disp_p = cf_disp;
22325 		footprint_region->cfr_num_pages++;
22326 
22327 		if (cf_disp != 0) {
22328 			/* non-zero disp: break the current zero streak */
22329 			footprint_header->cf_last_zeroes = 0;
22330 			/* done */
22331 			continue;
22332 		}
22333 
22334 		/* zero disp: add to the current streak of zeroes */
22335 		footprint_header->cf_last_zeroes++;
22336 		if ((footprint_header->cf_last_zeroes +
22337 		    roundup(((footprint_region->cfr_num_pages -
22338 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
22339 		    (sizeof(int) - 1),
22340 		    sizeof(int))) <
22341 		    (sizeof(*footprint_header))) {
22342 			/*
22343 			 * There are not enough trailing "zero" dispositions
22344 			 * (+ the extra padding we would need for the previous
22345 			 * region); creating a new region would not save space
22346 			 * at this point, so let's keep this "zero" disposition
22347 			 * in this region and reconsider later.
22348 			 */
22349 			continue;
22350 		}
22351 		/*
22352 		 * Create a new region to avoid having too many consecutive
22353 		 * "zero" dispositions.
22354 		 */
22355 		new_footprint_region =
22356 		    vm_map_corpse_footprint_new_region(footprint_header);
22357 		if (new_footprint_region == NULL) {
22358 			goto over_the_edge;
22359 		}
22360 		footprint_region = new_footprint_region;
22361 		/* initialize the new region as empty ... */
22362 		footprint_region->cfr_num_pages = 0;
22363 		/* ... and skip this "zero" disp */
22364 		footprint_region->cfr_vaddr = va + effective_page_size;
22365 	}
22366 
22367 	return KERN_SUCCESS;
22368 
22369 over_the_edge:
22370 //	printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
22371 	vm_map_corpse_footprint_full++;
22372 	return KERN_RESOURCE_SHORTAGE;
22373 }
22374 
22375 /*
22376  * vm_map_corpse_footprint_collect_done:
22377  *	completes the footprint collection by getting rid of any remaining
22378  *	trailing "zero" dispositions and trimming the unused part of the
22379  *	kernel buffer
22380  */
22381 void
vm_map_corpse_footprint_collect_done(vm_map_t new_map)22382 vm_map_corpse_footprint_collect_done(
22383 	vm_map_t        new_map)
22384 {
22385 	struct vm_map_corpse_footprint_header *footprint_header;
22386 	struct vm_map_corpse_footprint_region *footprint_region;
22387 	vm_size_t       buf_size, actual_size;
22388 	kern_return_t   kr;
22389 
22390 	assert(new_map->has_corpse_footprint);
22391 	if (!new_map->has_corpse_footprint ||
22392 	    new_map->vmmap_corpse_footprint == NULL) {
22393 		return;
22394 	}
22395 
22396 	footprint_header = (struct vm_map_corpse_footprint_header *)
22397 	    new_map->vmmap_corpse_footprint;
22398 	buf_size = footprint_header->cf_size;
22399 
22400 	footprint_region = (struct vm_map_corpse_footprint_region *)
22401 	    ((char *)footprint_header +
22402 	    footprint_header->cf_last_region);
22403 
22404 	/* get rid of trailing zeroes in last region */
22405 	assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
22406 	footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
22407 	footprint_header->cf_last_zeroes = 0;
22408 
22409 	actual_size = (vm_size_t)(footprint_header->cf_last_region +
22410 	    sizeof(*footprint_region) +
22411 	    (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));
22412 
22413 //	printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
22414 	vm_map_corpse_footprint_size_avg =
22415 	    (((vm_map_corpse_footprint_size_avg *
22416 	    vm_map_corpse_footprint_count) +
22417 	    actual_size) /
22418 	    (vm_map_corpse_footprint_count + 1));
22419 	vm_map_corpse_footprint_count++;
22420 	if (actual_size > vm_map_corpse_footprint_size_max) {
22421 		vm_map_corpse_footprint_size_max = actual_size;
22422 	}
22423 
22424 	actual_size = round_page(actual_size);
22425 	if (buf_size > actual_size) {
22426 		kr = vm_deallocate(kernel_map,
22427 		    ((vm_address_t)footprint_header +
22428 		    actual_size +
22429 		    PAGE_SIZE),                 /* trailing guard page */
22430 		    (buf_size - actual_size));
22431 		assertf(kr == KERN_SUCCESS,
22432 		    "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
22433 		    footprint_header,
22434 		    (uint64_t) buf_size,
22435 		    (uint64_t) actual_size,
22436 		    kr);
22437 		kr = vm_protect(kernel_map,
22438 		    ((vm_address_t)footprint_header +
22439 		    actual_size),
22440 		    PAGE_SIZE,
22441 		    FALSE,             /* set_maximum */
22442 		    VM_PROT_NONE);
22443 		assertf(kr == KERN_SUCCESS,
22444 		    "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
22445 		    footprint_header,
22446 		    (uint64_t) buf_size,
22447 		    (uint64_t) actual_size,
22448 		    kr);
22449 	}
22450 
22451 	footprint_header->cf_size = actual_size;
22452 }
22453 
22454 /*
22455  * vm_map_corpse_footprint_query_page_info:
22456  *	retrieves the disposition of the page at virtual address "vaddr"
22457  *	in the forked corpse's VM map
22458  *
22459  * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
22460  */
22461 kern_return_t
vm_map_corpse_footprint_query_page_info(vm_map_t map,vm_map_offset_t va,int * disposition_p)22462 vm_map_corpse_footprint_query_page_info(
22463 	vm_map_t        map,
22464 	vm_map_offset_t va,
22465 	int             *disposition_p)
22466 {
22467 	struct vm_map_corpse_footprint_header *footprint_header;
22468 	struct vm_map_corpse_footprint_region *footprint_region;
22469 	uint32_t        footprint_region_offset;
22470 	vm_map_offset_t region_start, region_end;
22471 	int             disp_idx;
22472 	kern_return_t   kr;
22473 	int             effective_page_size;
22474 	cf_disp_t       cf_disp;
22475 
22476 	if (!map->has_corpse_footprint) {
22477 		*disposition_p = 0;
22478 		kr = KERN_INVALID_ARGUMENT;
22479 		goto done;
22480 	}
22481 
22482 	footprint_header = map->vmmap_corpse_footprint;
22483 	if (footprint_header == NULL) {
22484 		*disposition_p = 0;
22485 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
22486 		kr = KERN_INVALID_ARGUMENT;
22487 		goto done;
22488 	}
22489 
22490 	/* start looking at the hint ("cf_hint_region") */
22491 	footprint_region_offset = footprint_header->cf_hint_region;
22492 
22493 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
22494 
22495 lookup_again:
22496 	if (footprint_region_offset < sizeof(*footprint_header)) {
22497 		/* hint too low: start from 1st region */
22498 		footprint_region_offset = sizeof(*footprint_header);
22499 	}
22500 	if (footprint_region_offset >= footprint_header->cf_last_region) {
22501 		/* hint too high: re-start from 1st region */
22502 		footprint_region_offset = sizeof(*footprint_header);
22503 	}
22504 	footprint_region = (struct vm_map_corpse_footprint_region *)
22505 	    ((char *)footprint_header + footprint_region_offset);
22506 	region_start = footprint_region->cfr_vaddr;
22507 	region_end = (region_start +
22508 	    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
22509 	    effective_page_size));
22510 	if (va < region_start &&
22511 	    footprint_region_offset != sizeof(*footprint_header)) {
22512 		/* our range starts before the hint region */
22513 
22514 		/* reset the hint (in a racy way...) */
22515 		footprint_header->cf_hint_region = sizeof(*footprint_header);
22516 		/* lookup "va" again from 1st region */
22517 		footprint_region_offset = sizeof(*footprint_header);
22518 		goto lookup_again;
22519 	}
22520 
22521 	while (va >= region_end) {
22522 		if (footprint_region_offset >= footprint_header->cf_last_region) {
22523 			break;
22524 		}
22525 		/* skip the region's header */
22526 		footprint_region_offset += sizeof(*footprint_region);
22527 		/* skip the region's page dispositions */
22528 		footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
22529 		/* align to next word boundary */
22530 		footprint_region_offset =
22531 		    roundup(footprint_region_offset,
22532 		    sizeof(int));
22533 		footprint_region = (struct vm_map_corpse_footprint_region *)
22534 		    ((char *)footprint_header + footprint_region_offset);
22535 		region_start = footprint_region->cfr_vaddr;
22536 		region_end = (region_start +
22537 		    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
22538 		    effective_page_size));
22539 	}
22540 	if (va < region_start || va >= region_end) {
22541 		/* page not found */
22542 		*disposition_p = 0;
22543 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
22544 		kr = KERN_SUCCESS;
22545 		goto done;
22546 	}
22547 
22548 	/* "va" found: set the lookup hint for next lookup (in a racy way...) */
22549 	footprint_header->cf_hint_region = footprint_region_offset;
22550 
22551 	/* get page disposition for "va" in this region */
22552 	disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
22553 	cf_disp = footprint_region->cfr_disposition[disp_idx];
22554 	*disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
22555 	kr = KERN_SUCCESS;
22556 done:
22557 //	if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
22558 	/* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
22559 	DTRACE_VM4(footprint_query_page_info,
22560 	    vm_map_t, map,
22561 	    vm_map_offset_t, va,
22562 	    int, *disposition_p,
22563 	    kern_return_t, kr);
22564 
22565 	return kr;
22566 }
22567 
22568 void
vm_map_corpse_footprint_destroy(vm_map_t map)22569 vm_map_corpse_footprint_destroy(
22570 	vm_map_t        map)
22571 {
22572 	if (map->has_corpse_footprint &&
22573 	    map->vmmap_corpse_footprint != 0) {
22574 		struct vm_map_corpse_footprint_header *footprint_header;
22575 		vm_size_t buf_size;
22576 		kern_return_t kr;
22577 
22578 		footprint_header = map->vmmap_corpse_footprint;
22579 		buf_size = footprint_header->cf_size;
22580 		kr = vm_deallocate(kernel_map,
22581 		    (vm_offset_t) map->vmmap_corpse_footprint,
22582 		    ((vm_size_t) buf_size
22583 		    + PAGE_SIZE));                 /* trailing guard page */
22584 		assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
22585 		map->vmmap_corpse_footprint = 0;
22586 		map->has_corpse_footprint = FALSE;
22587 	}
22588 }
22589 
22590 /*
22591  * vm_map_copy_footprint_ledgers:
22592  *	copies any ledger that's relevant to the memory footprint of "old_task"
22593  *	into the forked corpse's task ("new_task")
22594  */
22595 void
vm_map_copy_footprint_ledgers(task_t old_task,task_t new_task)22596 vm_map_copy_footprint_ledgers(
22597 	task_t  old_task,
22598 	task_t  new_task)
22599 {
22600 	vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
22601 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
22602 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
22603 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
22604 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
22605 	vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
22606 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
22607 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
22608 	vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
22609 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
22610 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
22611 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
22612 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
22613 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
22614 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
22615 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
22616 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
22617 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
22618 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
22619 	vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
22620 }
22621 
22622 /*
22623  * vm_map_copy_ledger:
22624  *	copy a single ledger from "old_task" to "new_task"
22625  */
22626 void
vm_map_copy_ledger(task_t old_task,task_t new_task,int ledger_entry)22627 vm_map_copy_ledger(
22628 	task_t  old_task,
22629 	task_t  new_task,
22630 	int     ledger_entry)
22631 {
22632 	ledger_amount_t old_balance, new_balance, delta;
22633 
22634 	assert(new_task->map->has_corpse_footprint);
22635 	if (!new_task->map->has_corpse_footprint) {
22636 		return;
22637 	}
22638 
22639 	/* turn off sanity checks for the ledger we're about to mess with */
22640 	ledger_disable_panic_on_negative(new_task->ledger,
22641 	    ledger_entry);
22642 
22643 	/* adjust "new_task" to match "old_task" */
22644 	ledger_get_balance(old_task->ledger,
22645 	    ledger_entry,
22646 	    &old_balance);
22647 	ledger_get_balance(new_task->ledger,
22648 	    ledger_entry,
22649 	    &new_balance);
22650 	if (new_balance == old_balance) {
22651 		/* new == old: done */
22652 	} else if (new_balance > old_balance) {
22653 		/* new > old ==> new -= new - old */
22654 		delta = new_balance - old_balance;
22655 		ledger_debit(new_task->ledger,
22656 		    ledger_entry,
22657 		    delta);
22658 	} else {
22659 		/* new < old ==> new += old - new */
22660 		delta = old_balance - new_balance;
22661 		ledger_credit(new_task->ledger,
22662 		    ledger_entry,
22663 		    delta);
22664 	}
22665 }
22666 
22667 /*
22668  * vm_map_get_pmap:
22669  * returns the pmap associated with the vm_map
22670  */
22671 pmap_t
vm_map_get_pmap(vm_map_t map)22672 vm_map_get_pmap(vm_map_t map)
22673 {
22674 	return vm_map_pmap(map);
22675 }
22676 
22677 #if CONFIG_MAP_RANGES
22678 static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];
22679 
22680 /*
22681  * vm_map_range_map_init:
22682  *  initializes the VM range ID map to enable index lookup
22683  *  of user VM ranges based on VM tag from userspace.
22684  */
22685 static void
vm_map_range_map_init(void)22686 vm_map_range_map_init(void)
22687 {
22688 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC);
22689 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
22690 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
22691 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
22692 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
22693 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
22694 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
22695 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
22696 }
22697 
22698 /*
22699  * vm_map_range_configure:
22700  *	configures the user vm_map ranges by increasing the maximum VA range of
22701  *  the map and carving out a range at the end of VA space (searching backwards
22702  *  in the newly expanded map).
22703  */
22704 kern_return_t
vm_map_range_configure(vm_map_t map)22705 vm_map_range_configure(vm_map_t map)
22706 {
22707 	vm_map_size_t           addr_space_size;
22708 	vm_map_offset_t         start, end, saved_max, random_addr;
22709 	kern_return_t           kr;
22710 
22711 	/* Should not be applying ranges to kernel map or kernel map submaps */
22712 	assert(map != kernel_map);
22713 	assert(vm_map_pmap(map) != kernel_pmap);
22714 
22715 	/* save the existing max offset */
22716 	vm_map_lock_read(map);
22717 	saved_max = vm_map_max(map);
22718 	vm_map_unlock_read(map);
22719 
22720 	/*
22721 	 * Check that we're not already jumbo'd. If so we cannot guarantee that
22722 	 * we can set up the ranges safely without interfering with the existing
22723 	 * map.
22724 	 */
22725 	if (saved_max > vm_compute_max_offset(vm_map_is_64bit(map))) {
22726 		return KERN_NO_SPACE;
22727 	}
22728 
22729 	/* expand the default VM space to the largest possible address */
22730 	vm_map_set_jumbo(map);
22731 
22732 	vm_map_lock(map);
22733 	addr_space_size = vm_map_max(map) - saved_max;
22734 
22735 	if (addr_space_size <= VM_MAP_USER_RANGE_MAX) {
22736 		vm_map_unlock(map);
22737 		return KERN_NO_SPACE;
22738 	}
22739 
22740 	addr_space_size -= VM_MAP_USER_RANGE_MAX;
22741 	random_addr = (vm_map_offset_t)random();
22742 	random_addr <<= VM_MAP_PAGE_SHIFT(map);
22743 	random_addr %= addr_space_size;
22744 
22745 	/*
22746 	 * round off the start so we begin on a L2 TT boundary and ensure we have
22747 	 * at least a ARM_TT_L2_SIZE sized hole between existing map range and
22748 	 * new range(s).
22749 	 */
22750 	start = vm_map_round_page(saved_max + random_addr + 1, ARM_TT_L2_OFFMASK);
22751 	end = MIN(vm_map_max(map), start + VM_MAP_USER_RANGE_MAX);
22752 	assert(start > saved_max);
22753 	assert(end <= vm_map_max(map));
22754 
22755 	/* default range covers the "normal" heap range */
22756 	map->user_range[UMEM_RANGE_ID_DEFAULT].min_address = vm_map_min(map);
22757 	map->user_range[UMEM_RANGE_ID_DEFAULT].max_address = saved_max;
22758 
22759 	/* heap range covers the new extended range */
22760 	map->user_range[UMEM_RANGE_ID_HEAP].min_address = start;
22761 	map->user_range[UMEM_RANGE_ID_HEAP].max_address = end;
22762 
22763 	vm_map_unlock(map);
22764 
22765 	/*
22766 	 * Poke holes so that ASAN or people listing regions
22767 	 * do not think this space is free.
22768 	 */
22769 
22770 	if (start != saved_max) {
22771 		kr = vm_map_enter(map, &saved_max, start - saved_max,
22772 		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
22773 		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
22774 		assert(kr == KERN_SUCCESS);
22775 	}
22776 
22777 	if (end != vm_map_max(map)) {
22778 		kr = vm_map_enter(map, &end, vm_map_max(map) - end,
22779 		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
22780 		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
22781 		assert(kr == KERN_SUCCESS);
22782 	}
22783 
22784 	vm_map_lock(map);
22785 
22786 	map->uses_user_ranges = true;
22787 
22788 	vm_map_unlock(map);
22789 
22790 	return KERN_SUCCESS;
22791 }
22792 
22793 /*
22794  * vm_map_range_fork:
22795  *	clones the array of ranges from old_map to new_map in support
22796  *  of a VM map fork.
22797  */
22798 void
vm_map_range_fork(vm_map_t new_map,vm_map_t old_map)22799 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
22800 {
22801 	if (!old_map->uses_user_ranges) {
22802 		/* nothing to do */
22803 		return;
22804 	}
22805 
22806 	for (size_t i = 0; i < UMEM_RANGE_COUNT; i++) {
22807 		new_map->user_range[i] = old_map->user_range[i];
22808 	}
22809 
22810 	new_map->uses_user_ranges = true;
22811 }
22812 
22813 /*
22814  * vm_map_get_user_range:
22815  *	copy the VM user range for the given VM map and range ID.
22816  */
22817 kern_return_t
vm_map_get_user_range(vm_map_t map,vm_map_range_id_t range_id,mach_vm_range_t range)22818 vm_map_get_user_range(
22819 	vm_map_t                map,
22820 	vm_map_range_id_t       range_id,
22821 	mach_vm_range_t         range)
22822 {
22823 	if (map == NULL ||
22824 	    !map->uses_user_ranges ||
22825 	    range_id > UMEM_RANGE_ID_MAX ||
22826 	    range == NULL) {
22827 		return KERN_INVALID_ARGUMENT;
22828 	}
22829 
22830 	*range = map->user_range[range_id];
22831 	return KERN_SUCCESS;
22832 }
22833 
22834 static vm_map_range_id_t
vm_map_user_range_resolve(vm_map_t map,mach_vm_address_t addr,mach_vm_size_t size,mach_vm_range_t range)22835 vm_map_user_range_resolve(
22836 	vm_map_t                map,
22837 	mach_vm_address_t       addr,
22838 	mach_vm_size_t          size,
22839 	mach_vm_range_t         range)
22840 {
22841 	vm_map_lock_assert_held(map);
22842 
22843 	for (vm_map_range_id_t i = 0; i < UMEM_RANGE_COUNT; i++) {
22844 		mach_vm_range_t r = &map->user_range[i];
22845 
22846 		if (mach_vm_range_contains(r, addr, size)) {
22847 			if (range) {
22848 				*range = *r;
22849 			}
22850 			return i;
22851 		}
22852 	}
22853 
22854 	if (range) {
22855 		range->min_address = range->max_address = 0;
22856 	}
22857 	return UMEM_RANGE_ID_DEFAULT;
22858 }
22859 
22860 #endif /* CONFIG_MAP_RANGES */
22861 
22862 void
vm_map_kernel_flags_update_range_id(vm_map_kernel_flags_t * vmkf,vm_map_t map)22863 vm_map_kernel_flags_update_range_id(vm_map_kernel_flags_t *vmkf, vm_map_t map)
22864 {
22865 	if (map == kernel_map) {
22866 		if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
22867 			vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
22868 		}
22869 #if CONFIG_MAP_RANGES
22870 	} else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
22871 	    vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT &&
22872 	    bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
22873 		vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
22874 #endif /* CONFIG_MAP_RANGES */
22875 	}
22876 }
22877 
22878 /*
22879  * vm_map_entry_has_device_pager:
22880  * Check if the vm map entry specified by the virtual address has a device pager.
22881  * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
22882  */
22883 boolean_t
vm_map_entry_has_device_pager(vm_map_t map,vm_map_offset_t vaddr)22884 vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
22885 {
22886 	vm_map_entry_t entry;
22887 	vm_object_t object;
22888 	boolean_t result;
22889 
22890 	if (map == NULL) {
22891 		return FALSE;
22892 	}
22893 
22894 	vm_map_lock(map);
22895 	while (TRUE) {
22896 		if (!vm_map_lookup_entry(map, vaddr, &entry)) {
22897 			result = FALSE;
22898 			break;
22899 		}
22900 		if (entry->is_sub_map) {
22901 			// Check the submap
22902 			vm_map_t submap = VME_SUBMAP(entry);
22903 			assert(submap != NULL);
22904 			vm_map_lock(submap);
22905 			vm_map_unlock(map);
22906 			map = submap;
22907 			continue;
22908 		}
22909 		object = VME_OBJECT(entry);
22910 		if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
22911 			result = TRUE;
22912 			break;
22913 		}
22914 		result = FALSE;
22915 		break;
22916 	}
22917 
22918 	vm_map_unlock(map);
22919 	return result;
22920 }
22921 
22922 
22923 #if MACH_ASSERT
22924 
22925 extern int pmap_ledgers_panic;
22926 extern int pmap_ledgers_panic_leeway;
22927 
22928 #define LEDGER_DRIFT(__LEDGER)                    \
22929 	int             __LEDGER##_over;          \
22930 	ledger_amount_t __LEDGER##_over_total;    \
22931 	ledger_amount_t __LEDGER##_over_max;      \
22932 	int             __LEDGER##_under;         \
22933 	ledger_amount_t __LEDGER##_under_total;   \
22934 	ledger_amount_t __LEDGER##_under_max
22935 
22936 struct {
22937 	uint64_t        num_pmaps_checked;
22938 
22939 	LEDGER_DRIFT(phys_footprint);
22940 	LEDGER_DRIFT(internal);
22941 	LEDGER_DRIFT(internal_compressed);
22942 	LEDGER_DRIFT(external);
22943 	LEDGER_DRIFT(reusable);
22944 	LEDGER_DRIFT(iokit_mapped);
22945 	LEDGER_DRIFT(alternate_accounting);
22946 	LEDGER_DRIFT(alternate_accounting_compressed);
22947 	LEDGER_DRIFT(page_table);
22948 	LEDGER_DRIFT(purgeable_volatile);
22949 	LEDGER_DRIFT(purgeable_nonvolatile);
22950 	LEDGER_DRIFT(purgeable_volatile_compressed);
22951 	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
22952 	LEDGER_DRIFT(tagged_nofootprint);
22953 	LEDGER_DRIFT(tagged_footprint);
22954 	LEDGER_DRIFT(tagged_nofootprint_compressed);
22955 	LEDGER_DRIFT(tagged_footprint_compressed);
22956 	LEDGER_DRIFT(network_volatile);
22957 	LEDGER_DRIFT(network_nonvolatile);
22958 	LEDGER_DRIFT(network_volatile_compressed);
22959 	LEDGER_DRIFT(network_nonvolatile_compressed);
22960 	LEDGER_DRIFT(media_nofootprint);
22961 	LEDGER_DRIFT(media_footprint);
22962 	LEDGER_DRIFT(media_nofootprint_compressed);
22963 	LEDGER_DRIFT(media_footprint_compressed);
22964 	LEDGER_DRIFT(graphics_nofootprint);
22965 	LEDGER_DRIFT(graphics_footprint);
22966 	LEDGER_DRIFT(graphics_nofootprint_compressed);
22967 	LEDGER_DRIFT(graphics_footprint_compressed);
22968 	LEDGER_DRIFT(neural_nofootprint);
22969 	LEDGER_DRIFT(neural_footprint);
22970 	LEDGER_DRIFT(neural_nofootprint_compressed);
22971 	LEDGER_DRIFT(neural_footprint_compressed);
22972 } pmap_ledgers_drift;
22973 
22974 void
vm_map_pmap_check_ledgers(pmap_t pmap,ledger_t ledger,int pid,char * procname)22975 vm_map_pmap_check_ledgers(
22976 	pmap_t          pmap,
22977 	ledger_t        ledger,
22978 	int             pid,
22979 	char            *procname)
22980 {
22981 	ledger_amount_t bal;
22982 	boolean_t       do_panic;
22983 
22984 	do_panic = FALSE;
22985 
22986 	pmap_ledgers_drift.num_pmaps_checked++;
22987 
22988 #define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
22989 MACRO_BEGIN                                                             \
22990 	int panic_on_negative = TRUE;                                   \
22991 	ledger_get_balance(ledger,                                      \
22992 	                   task_ledgers.__LEDGER,                       \
22993 	                   &bal);                                       \
22994 	ledger_get_panic_on_negative(ledger,                            \
22995 	                             task_ledgers.__LEDGER,             \
22996 	                             &panic_on_negative);               \
22997 	if (bal != 0) {                                                 \
22998 	        if (panic_on_negative ||                                \
22999 	            (pmap_ledgers_panic &&                              \
23000 	             pmap_ledgers_panic_leeway > 0 &&                   \
23001 	             (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||  \
23002 	              bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
23003 	                do_panic = TRUE;                                \
23004 	        }                                                       \
23005 	        printf("LEDGER BALANCE proc %d (%s) "                   \
23006 	               "\"%s\" = %lld\n",                               \
23007 	               pid, procname, #__LEDGER, bal);                  \
23008 	        if (bal > 0) {                                          \
23009 	                pmap_ledgers_drift.__LEDGER##_over++;           \
23010 	                pmap_ledgers_drift.__LEDGER##_over_total += bal; \
23011 	                if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
23012 	                        pmap_ledgers_drift.__LEDGER##_over_max = bal; \
23013 	                }                                               \
23014 	        } else if (bal < 0) {                                   \
23015 	                pmap_ledgers_drift.__LEDGER##_under++;          \
23016 	                pmap_ledgers_drift.__LEDGER##_under_total += bal; \
23017 	                if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
23018 	                        pmap_ledgers_drift.__LEDGER##_under_max = bal; \
23019 	                }                                               \
23020 	        }                                                       \
23021 	}                                                               \
23022 MACRO_END
23023 
23024 	LEDGER_CHECK_BALANCE(phys_footprint);
23025 	LEDGER_CHECK_BALANCE(internal);
23026 	LEDGER_CHECK_BALANCE(internal_compressed);
23027 	LEDGER_CHECK_BALANCE(external);
23028 	LEDGER_CHECK_BALANCE(reusable);
23029 	LEDGER_CHECK_BALANCE(iokit_mapped);
23030 	LEDGER_CHECK_BALANCE(alternate_accounting);
23031 	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
23032 	LEDGER_CHECK_BALANCE(page_table);
23033 	LEDGER_CHECK_BALANCE(purgeable_volatile);
23034 	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
23035 	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
23036 	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
23037 	LEDGER_CHECK_BALANCE(tagged_nofootprint);
23038 	LEDGER_CHECK_BALANCE(tagged_footprint);
23039 	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
23040 	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
23041 	LEDGER_CHECK_BALANCE(network_volatile);
23042 	LEDGER_CHECK_BALANCE(network_nonvolatile);
23043 	LEDGER_CHECK_BALANCE(network_volatile_compressed);
23044 	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
23045 	LEDGER_CHECK_BALANCE(media_nofootprint);
23046 	LEDGER_CHECK_BALANCE(media_footprint);
23047 	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
23048 	LEDGER_CHECK_BALANCE(media_footprint_compressed);
23049 	LEDGER_CHECK_BALANCE(graphics_nofootprint);
23050 	LEDGER_CHECK_BALANCE(graphics_footprint);
23051 	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
23052 	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
23053 	LEDGER_CHECK_BALANCE(neural_nofootprint);
23054 	LEDGER_CHECK_BALANCE(neural_footprint);
23055 	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
23056 	LEDGER_CHECK_BALANCE(neural_footprint_compressed);
23057 
23058 	if (do_panic) {
23059 		if (pmap_ledgers_panic) {
23060 			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
23061 			    pmap, pid, procname);
23062 		} else {
23063 			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
23064 			    pmap, pid, procname);
23065 		}
23066 	}
23067 }
23068 
23069 void
vm_map_pmap_set_process(vm_map_t map,int pid,char * procname)23070 vm_map_pmap_set_process(
23071 	vm_map_t map,
23072 	int pid,
23073 	char *procname)
23074 {
23075 	pmap_set_process(vm_map_pmap(map), pid, procname);
23076 }
23077 
23078 #endif /* MACH_ASSERT */
23079