xref: /xnu-8796.121.2/osfmk/vm/vm_map.c (revision c54f35ca767986246321eb901baf8f5ff7923f6a)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_map.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	Virtual memory mapping module.
64  */
65 
66 #include "vm/vm_map.h"
67 #include <mach/vm_types.h>
68 #include <mach_assert.h>
69 
70 #include <vm/vm_options.h>
71 
72 #include <libkern/OSAtomic.h>
73 
74 #include <mach/kern_return.h>
75 #include <mach/port.h>
76 #include <mach/vm_attributes.h>
77 #include <mach/vm_param.h>
78 #include <mach/vm_behavior.h>
79 #include <mach/vm_statistics.h>
80 #include <mach/memory_object.h>
81 #include <mach/mach_vm.h>
82 #include <machine/cpu_capabilities.h>
83 #include <mach/sdt.h>
84 
85 #include <kern/assert.h>
86 #include <kern/backtrace.h>
87 #include <kern/counter.h>
88 #include <kern/exc_guard.h>
89 #include <kern/kalloc.h>
90 #include <kern/zalloc_internal.h>
91 
92 #include <vm/cpm.h>
93 #include <vm/vm_compressor.h>
94 #include <vm/vm_compressor_pager.h>
95 #include <vm/vm_init.h>
96 #include <vm/vm_fault.h>
97 #include <vm/vm_map_internal.h>
98 #include <vm/vm_object.h>
99 #include <vm/vm_page.h>
100 #include <vm/vm_pageout.h>
101 #include <vm/pmap.h>
102 #include <vm/vm_kern.h>
103 #include <ipc/ipc_port.h>
104 #include <kern/sched_prim.h>
105 #include <kern/misc_protos.h>
106 
107 #include <mach/vm_map_server.h>
108 #include <mach/mach_host_server.h>
109 #include <vm/vm_protos.h>
110 #include <vm/vm_purgeable_internal.h>
111 #include <vm/vm_reclaim_internal.h>
112 
113 #include <vm/vm_protos.h>
114 #include <vm/vm_shared_region.h>
115 #include <vm/vm_map_store.h>
116 
117 #include <san/kasan.h>
118 
119 #include <sys/resource.h>
120 #include <sys/codesign.h>
121 #include <sys/code_signing.h>
122 #include <sys/mman.h>
123 #include <sys/reboot.h>
124 #include <sys/kdebug_triage.h>
125 
126 #include <libkern/section_keywords.h>
127 
128 #if DEVELOPMENT || DEBUG
129 extern int proc_selfcsflags(void);
130 int vm_log_xnu_user_debug = 0;
131 int panic_on_unsigned_execute = 0;
132 int panic_on_mlock_failure = 0;
133 #endif /* DEVELOPMENT || DEBUG */
134 
135 #if MACH_ASSERT
136 int debug4k_filter = 0;
137 char debug4k_proc_name[1024] = "";
138 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
139 int debug4k_panic_on_misaligned_sharing = 0;
140 const char *debug4k_category_name[] = {
141 	"error",        /* 0 */
142 	"life",         /* 1 */
143 	"load",         /* 2 */
144 	"fault",        /* 3 */
145 	"copy",         /* 4 */
146 	"share",        /* 5 */
147 	"adjust",       /* 6 */
148 	"pmap",         /* 7 */
149 	"mementry",     /* 8 */
150 	"iokit",        /* 9 */
151 	"upl",          /* 10 */
152 	"exc",          /* 11 */
153 	"vfs"           /* 12 */
154 };
155 #endif /* MACH_ASSERT */
156 int debug4k_no_cow_copyin = 0;
157 
158 
159 #if __arm64__
160 extern const int fourk_binary_compatibility_unsafe;
161 extern const int fourk_binary_compatibility_allow_wx;
162 #endif /* __arm64__ */
163 extern int proc_selfpid(void);
164 extern char *proc_name_address(void *p);
165 
166 #if VM_MAP_DEBUG_APPLE_PROTECT
167 int vm_map_debug_apple_protect = 0;
168 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
169 #if VM_MAP_DEBUG_FOURK
170 int vm_map_debug_fourk = 0;
171 #endif /* VM_MAP_DEBUG_FOURK */
172 
173 #if DEBUG || DEVELOPMENT
174 static TUNABLE(bool, vm_map_executable_immutable,
175     "vm_map_executable_immutable", true);
176 #else
177 #define vm_map_executable_immutable true
178 #endif
179 
180 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
181 
182 extern u_int32_t random(void);  /* from <libkern/libkern.h> */
183 /* Internal prototypes
184  */
185 
186 typedef struct vm_map_zap {
187 	vm_map_entry_t          vmz_head;
188 	vm_map_entry_t         *vmz_tail;
189 } *vm_map_zap_t;
190 
191 #define VM_MAP_ZAP_DECLARE(zap) \
192 	struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
193 
194 static vm_map_entry_t   vm_map_entry_insert(
195 	vm_map_t                map,
196 	vm_map_entry_t          insp_entry,
197 	vm_map_offset_t         start,
198 	vm_map_offset_t         end,
199 	vm_object_t             object,
200 	vm_object_offset_t      offset,
201 	vm_map_kernel_flags_t   vmk_flags,
202 	boolean_t               needs_copy,
203 	vm_prot_t               cur_protection,
204 	vm_prot_t               max_protection,
205 	vm_inherit_t            inheritance,
206 	boolean_t               clear_map_aligned);
207 
208 static void vm_map_simplify_range(
209 	vm_map_t        map,
210 	vm_map_offset_t start,
211 	vm_map_offset_t end);   /* forward */
212 
213 static boolean_t        vm_map_range_check(
214 	vm_map_t        map,
215 	vm_map_offset_t start,
216 	vm_map_offset_t end,
217 	vm_map_entry_t  *entry);
218 
219 static void vm_map_submap_pmap_clean(
220 	vm_map_t        map,
221 	vm_map_offset_t start,
222 	vm_map_offset_t end,
223 	vm_map_t        sub_map,
224 	vm_map_offset_t offset);
225 
226 static void             vm_map_pmap_enter(
227 	vm_map_t                map,
228 	vm_map_offset_t         addr,
229 	vm_map_offset_t         end_addr,
230 	vm_object_t             object,
231 	vm_object_offset_t      offset,
232 	vm_prot_t               protection);
233 
234 static void             _vm_map_clip_end(
235 	struct vm_map_header    *map_header,
236 	vm_map_entry_t          entry,
237 	vm_map_offset_t         end);
238 
239 static void             _vm_map_clip_start(
240 	struct vm_map_header    *map_header,
241 	vm_map_entry_t          entry,
242 	vm_map_offset_t         start);
243 
244 static kmem_return_t vm_map_delete(
245 	vm_map_t        map,
246 	vm_map_offset_t start,
247 	vm_map_offset_t end,
248 	vmr_flags_t     flags,
249 	kmem_guard_t    guard,
250 	vm_map_zap_t    zap);
251 
252 static void             vm_map_copy_insert(
253 	vm_map_t        map,
254 	vm_map_entry_t  after_where,
255 	vm_map_copy_t   copy);
256 
257 static kern_return_t    vm_map_copy_overwrite_unaligned(
258 	vm_map_t        dst_map,
259 	vm_map_entry_t  entry,
260 	vm_map_copy_t   copy,
261 	vm_map_address_t start,
262 	boolean_t       discard_on_success);
263 
264 static kern_return_t    vm_map_copy_overwrite_aligned(
265 	vm_map_t        dst_map,
266 	vm_map_entry_t  tmp_entry,
267 	vm_map_copy_t   copy,
268 	vm_map_offset_t start,
269 	pmap_t          pmap);
270 
271 static kern_return_t    vm_map_copyin_kernel_buffer(
272 	vm_map_t        src_map,
273 	vm_map_address_t src_addr,
274 	vm_map_size_t   len,
275 	boolean_t       src_destroy,
276 	vm_map_copy_t   *copy_result);  /* OUT */
277 
278 static kern_return_t    vm_map_copyout_kernel_buffer(
279 	vm_map_t        map,
280 	vm_map_address_t *addr, /* IN/OUT */
281 	vm_map_copy_t   copy,
282 	vm_map_size_t   copy_size,
283 	boolean_t       overwrite,
284 	boolean_t       consume_on_success);
285 
286 static void             vm_map_fork_share(
287 	vm_map_t        old_map,
288 	vm_map_entry_t  old_entry,
289 	vm_map_t        new_map);
290 
291 static boolean_t        vm_map_fork_copy(
292 	vm_map_t        old_map,
293 	vm_map_entry_t  *old_entry_p,
294 	vm_map_t        new_map,
295 	int             vm_map_copyin_flags);
296 
297 static kern_return_t    vm_map_wire_nested(
298 	vm_map_t                   map,
299 	vm_map_offset_t            start,
300 	vm_map_offset_t            end,
301 	vm_prot_t                  caller_prot,
302 	vm_tag_t                   tag,
303 	boolean_t                  user_wire,
304 	pmap_t                     map_pmap,
305 	vm_map_offset_t            pmap_addr,
306 	ppnum_t                    *physpage_p);
307 
308 static kern_return_t    vm_map_unwire_nested(
309 	vm_map_t                   map,
310 	vm_map_offset_t            start,
311 	vm_map_offset_t            end,
312 	boolean_t                  user_wire,
313 	pmap_t                     map_pmap,
314 	vm_map_offset_t            pmap_addr);
315 
316 static kern_return_t    vm_map_overwrite_submap_recurse(
317 	vm_map_t                   dst_map,
318 	vm_map_offset_t            dst_addr,
319 	vm_map_size_t              dst_size);
320 
321 static kern_return_t    vm_map_copy_overwrite_nested(
322 	vm_map_t                   dst_map,
323 	vm_map_offset_t            dst_addr,
324 	vm_map_copy_t              copy,
325 	boolean_t                  interruptible,
326 	pmap_t                     pmap,
327 	boolean_t                  discard_on_success);
328 
329 static kern_return_t    vm_map_remap_extract(
330 	vm_map_t                map,
331 	vm_map_offset_t         addr,
332 	vm_map_size_t           size,
333 	boolean_t               copy,
334 	vm_map_copy_t           map_copy,
335 	vm_prot_t               *cur_protection,
336 	vm_prot_t               *max_protection,
337 	vm_inherit_t            inheritance,
338 	vm_map_kernel_flags_t   vmk_flags);
339 
340 static kern_return_t    vm_map_remap_range_allocate(
341 	vm_map_t                map,
342 	vm_map_address_t        *address,
343 	vm_map_size_t           size,
344 	vm_map_offset_t         mask,
345 	vm_map_kernel_flags_t   vmk_flags,
346 	vm_map_entry_t          *map_entry,
347 	vm_map_zap_t            zap_list);
348 
349 static void             vm_map_region_look_for_page(
350 	vm_map_t                   map,
351 	vm_map_offset_t            va,
352 	vm_object_t                object,
353 	vm_object_offset_t         offset,
354 	int                        max_refcnt,
355 	unsigned short             depth,
356 	vm_region_extended_info_t  extended,
357 	mach_msg_type_number_t count);
358 
359 static int              vm_map_region_count_obj_refs(
360 	vm_map_entry_t             entry,
361 	vm_object_t                object);
362 
363 
364 static kern_return_t    vm_map_willneed(
365 	vm_map_t        map,
366 	vm_map_offset_t start,
367 	vm_map_offset_t end);
368 
369 static kern_return_t    vm_map_reuse_pages(
370 	vm_map_t        map,
371 	vm_map_offset_t start,
372 	vm_map_offset_t end);
373 
374 static kern_return_t    vm_map_reusable_pages(
375 	vm_map_t        map,
376 	vm_map_offset_t start,
377 	vm_map_offset_t end);
378 
379 static kern_return_t    vm_map_can_reuse(
380 	vm_map_t        map,
381 	vm_map_offset_t start,
382 	vm_map_offset_t end);
383 
384 static kern_return_t    vm_map_random_address_for_size(
385 	vm_map_t                map,
386 	vm_map_offset_t        *address,
387 	vm_map_size_t           size,
388 	vm_map_kernel_flags_t   vmk_flags);
389 
390 
391 #if CONFIG_MAP_RANGES
392 
393 static vm_map_range_id_t vm_map_user_range_resolve(
394 	vm_map_t                map,
395 	mach_vm_address_t       addr,
396 	mach_vm_address_t       size,
397 	mach_vm_range_t         range);
398 
399 #endif /* CONFIG_MAP_RANGES */
400 #if MACH_ASSERT
401 static kern_return_t    vm_map_pageout(
402 	vm_map_t        map,
403 	vm_map_offset_t start,
404 	vm_map_offset_t end);
405 #endif /* MACH_ASSERT */
406 
407 kern_return_t vm_map_corpse_footprint_collect(
408 	vm_map_t        old_map,
409 	vm_map_entry_t  old_entry,
410 	vm_map_t        new_map);
411 void vm_map_corpse_footprint_collect_done(
412 	vm_map_t        new_map);
413 void vm_map_corpse_footprint_destroy(
414 	vm_map_t        map);
415 kern_return_t vm_map_corpse_footprint_query_page_info(
416 	vm_map_t        map,
417 	vm_map_offset_t va,
418 	int             *disposition_p);
419 void vm_map_footprint_query_page_info(
420 	vm_map_t        map,
421 	vm_map_entry_t  map_entry,
422 	vm_map_offset_t curr_s_offset,
423 	int             *disposition_p);
424 
425 #if CONFIG_MAP_RANGES
426 static void vm_map_range_map_init(void);
427 #endif /* CONFIG_MAP_RANGES */
428 
429 pid_t find_largest_process_vm_map_entries(void);
430 
431 extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code,
432     mach_exception_data_type_t subcode);
433 
434 /*
435  * Macros to copy a vm_map_entry. We must be careful to correctly
436  * manage the wired page count. vm_map_entry_copy() creates a new
437  * map entry to the same memory - the wired count in the new entry
438  * must be set to zero. vm_map_entry_copy_full() creates a new
439  * entry that is identical to the old entry.  This preserves the
440  * wire count; it's used for map splitting and zone changing in
441  * vm_map_copyout.
442  */
443 
/*
 * Reset per-entry code-signing-monitor and user-debug state on a map
 * entry that was just produced by copying another entry.  The copied
 * association must not carry over to the new mapping.
 */
static inline void
vm_map_entry_copy_csm_assoc(
	vm_map_t map __unused,
	vm_map_entry_t new __unused,
	vm_map_entry_t old __unused)
{
#if CODE_SIGNING_MONITOR
	/* when code signing monitor is enabled, we want to reset on copy */
	new->csm_associated = FALSE;
#else
	/* when code signing monitor is not enabled, assert as a sanity check */
	assert(new->csm_associated == FALSE);
#endif
#if DEVELOPMENT || DEBUG
	/* optionally log when an entry loses its "vme_xnu_user_debug" flag */
	if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
		printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] resetting vme_xnu_user_debug\n",
		    proc_selfpid(),
		    (get_bsdtask_info(current_task())
		    ? proc_name_address(get_bsdtask_info(current_task()))
		    : "?"),
		    __FUNCTION__, __LINE__,
		    map, new, new->vme_start, new->vme_end);
	}
#endif /* DEVELOPMENT || DEBUG */
	/* the debug flag never propagates to the copy */
	new->vme_xnu_user_debug = FALSE;
}
470 
471 /*
472  * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
473  * But for security reasons on some platforms, we don't want the
474  * new mapping to be "used for jit", so we reset the flag here.
475  */
476 static inline void
vm_map_entry_copy_code_signing(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old __unused)477 vm_map_entry_copy_code_signing(
478 	vm_map_t map,
479 	vm_map_entry_t new,
480 	vm_map_entry_t old __unused)
481 {
482 	if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
483 		assert(new->used_for_jit == old->used_for_jit);
484 	} else {
485 		new->used_for_jit = FALSE;
486 	}
487 }
488 
/*
 * vm_map_entry_copy_full:
 *
 * Make "new" a bit-for-bit duplicate of "old" (wired counts included;
 * contrast with vm_map_entry_copy() below).  When backtrace debugging
 * is compiled in, the btref references must be rebalanced *before* the
 * raw structure assignment: drop the refs the destination currently
 * holds and take refs on the ones it is about to inherit from "old".
 */
static inline void
vm_map_entry_copy_full(
	vm_map_entry_t new,
	vm_map_entry_t old)
{
#if MAP_ENTRY_CREATION_DEBUG
	btref_put(new->vme_creation_bt);
	btref_retain(old->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	btref_put(new->vme_insertion_bt);
	btref_retain(old->vme_insertion_bt);
#endif
	/* structure assignment copies every field, including the btrefs */
	*new = *old;
}
504 
/*
 * vm_map_entry_copy:
 *
 * Duplicate "old" into "new" for insertion as a distinct entry mapping
 * the same memory.  The wired page counts must start at zero in the
 * copy, and per-entry state that describes the original mapping's
 * identity (sharing, transition, permanence, code-signing, accounting,
 * resilience flags) must not carry over.
 */
static inline void
vm_map_entry_copy(
	vm_map_t map,
	vm_map_entry_t new,
	vm_map_entry_t old)
{
	/* start from an exact duplicate of the old entry */
	vm_map_entry_copy_full(new, old);

	new->is_shared = FALSE;
	new->needs_wakeup = FALSE;
	new->in_transition = FALSE;
	/* the new entry starts out unwired */
	new->wired_count = 0;
	new->user_wired_count = 0;
	new->vme_permanent = FALSE;
	vm_map_entry_copy_code_signing(map, new, old);
	vm_map_entry_copy_csm_assoc(map, new, old);
	if (new->iokit_acct) {
		/* IOKit accounting does not follow the copy; revert to pmap accounting */
		assertf(!new->use_pmap, "old %p new %p\n", old, new);
		new->iokit_acct = FALSE;
		new->use_pmap = TRUE;
	}
	new->vme_resilient_codesign = FALSE;
	new->vme_resilient_media = FALSE;
	new->vme_atomic = FALSE;
	new->vme_no_copy_on_read = FALSE;
}
531 
532 /*
533  * Normal lock_read_to_write() returns FALSE/0 on failure.
534  * These functions evaluate to zero on success and non-zero value on failure.
535  */
536 __attribute__((always_inline))
537 int
vm_map_lock_read_to_write(vm_map_t map)538 vm_map_lock_read_to_write(vm_map_t map)
539 {
540 	if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
541 		DTRACE_VM(vm_map_lock_upgrade);
542 		return 0;
543 	}
544 	return 1;
545 }
546 
547 __attribute__((always_inline))
548 boolean_t
vm_map_try_lock(vm_map_t map)549 vm_map_try_lock(vm_map_t map)
550 {
551 	if (lck_rw_try_lock_exclusive(&(map)->lock)) {
552 		DTRACE_VM(vm_map_lock_w);
553 		return TRUE;
554 	}
555 	return FALSE;
556 }
557 
558 __attribute__((always_inline))
559 boolean_t
vm_map_try_lock_read(vm_map_t map)560 vm_map_try_lock_read(vm_map_t map)
561 {
562 	if (lck_rw_try_lock_shared(&(map)->lock)) {
563 		DTRACE_VM(vm_map_lock_r);
564 		return TRUE;
565 	}
566 	return FALSE;
567 }
568 
569 /*!
570  * @function kdp_vm_map_is_acquired_exclusive
571  *
572  * @abstract
573  * Checks if vm map is acquired exclusive.
574  *
575  * @discussion
576  * NOT SAFE: To be used only by kernel debugger.
577  *
578  * @param map map to check
579  *
580  * @returns TRUE if the map is acquired exclusively.
581  */
582 boolean_t
kdp_vm_map_is_acquired_exclusive(vm_map_t map)583 kdp_vm_map_is_acquired_exclusive(vm_map_t map)
584 {
585 	return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
586 }
587 
588 /*
589  * Routines to get the page size the caller should
590  * use while inspecting the target address space.
591  * Use the "_safely" variant if the caller is dealing with a user-provided
592  * array whose size depends on the page size, to avoid any overflow or
593  * underflow of a user-allocated buffer.
594  */
595 int
vm_self_region_page_shift_safely(vm_map_t target_map)596 vm_self_region_page_shift_safely(
597 	vm_map_t target_map)
598 {
599 	int effective_page_shift = 0;
600 
601 	if (PAGE_SIZE == (4096)) {
602 		/* x86_64 and 4k watches: always use 4k */
603 		return PAGE_SHIFT;
604 	}
605 	/* did caller provide an explicit page size for this thread to use? */
606 	effective_page_shift = thread_self_region_page_shift();
607 	if (effective_page_shift) {
608 		/* use the explicitly-provided page size */
609 		return effective_page_shift;
610 	}
611 	/* no explicit page size: use the caller's page size... */
612 	effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
613 	if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
614 		/* page size match: safe to use */
615 		return effective_page_shift;
616 	}
617 	/* page size mismatch */
618 	return -1;
619 }
620 int
vm_self_region_page_shift(vm_map_t target_map)621 vm_self_region_page_shift(
622 	vm_map_t target_map)
623 {
624 	int effective_page_shift;
625 
626 	effective_page_shift = vm_self_region_page_shift_safely(target_map);
627 	if (effective_page_shift == -1) {
628 		/* no safe value but OK to guess for caller */
629 		effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
630 		    VM_MAP_PAGE_SHIFT(target_map));
631 	}
632 	return effective_page_shift;
633 }
634 
635 
636 /*
637  *	Decide if we want to allow processes to execute from their data or stack areas.
638  *	override_nx() returns true if we do.  Data/stack execution can be enabled independently
639  *	for 32 and 64 bit processes.  Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
640  *	or allow_stack_exec to enable data execution for that type of data area for that particular
641  *	ABI (or both by or'ing the flags together).  These are initialized in the architecture
642  *	specific pmap files since the default behavior varies according to architecture.  The
643  *	main reason it varies is because of the need to provide binary compatibility with old
644  *	applications that were written before these restrictions came into being.  In the old
645  *	days, an app could execute anything it could read, but this has slowly been tightened
646  *	up over time.  The default behavior is:
647  *
648  *	32-bit PPC apps		may execute from both stack and data areas
 *	32-bit Intel apps	may execute from data areas but not stack
650  *	64-bit PPC/Intel apps	may not execute from either data or stack
651  *
652  *	An application on any architecture may override these defaults by explicitly
653  *	adding PROT_EXEC permission to the page in question with the mprotect(2)
654  *	system call.  This code here just determines what happens when an app tries to
655  *      execute from a page that lacks execute permission.
656  *
657  *	Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
658  *	default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
659  *	a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
660  *	execution from data areas for a particular binary even if the arch normally permits it. As
661  *	a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
662  *	to support some complicated use cases, notably browsers with out-of-process plugins that
663  *	are not all NX-safe.
664  */
665 
666 extern int allow_data_exec, allow_stack_exec;
667 
668 int
override_nx(vm_map_t map,uint32_t user_tag)669 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
670 {
671 	int current_abi;
672 
673 	if (map->pmap == kernel_pmap) {
674 		return FALSE;
675 	}
676 
677 	/*
678 	 * Determine if the app is running in 32 or 64 bit mode.
679 	 */
680 
681 	if (vm_map_is_64bit(map)) {
682 		current_abi = VM_ABI_64;
683 	} else {
684 		current_abi = VM_ABI_32;
685 	}
686 
687 	/*
688 	 * Determine if we should allow the execution based on whether it's a
689 	 * stack or data area and the current architecture.
690 	 */
691 
692 	if (user_tag == VM_MEMORY_STACK) {
693 		return allow_stack_exec & current_abi;
694 	}
695 
696 	return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
697 }
698 
699 
700 /*
701  *	Virtual memory maps provide for the mapping, protection,
702  *	and sharing of virtual memory objects.  In addition,
703  *	this module provides for an efficient virtual copy of
704  *	memory from one map to another.
705  *
706  *	Synchronization is required prior to most operations.
707  *
708  *	Maps consist of an ordered doubly-linked list of simple
709  *	entries; a single hint is used to speed up lookups.
710  *
711  *	Sharing maps have been deleted from this version of Mach.
712  *	All shared objects are now mapped directly into the respective
713  *	maps.  This requires a change in the copy on write strategy;
714  *	the asymmetric (delayed) strategy is used for shared temporary
715  *	objects instead of the symmetric (shadow) strategy.  All maps
716  *	are now "top level" maps (either task map, kernel map or submap
717  *	of the kernel map).
718  *
 *	Since portions of maps are specified by start/end addresses,
720  *	which may not align with existing map entries, all
721  *	routines merely "clip" entries to these start/end values.
722  *	[That is, an entry is split into two, bordering at a
723  *	start or end value.]  Note that these clippings may not
724  *	always be necessary (as the two resulting entries are then
725  *	not changed); however, the clipping is done for convenience.
726  *	No attempt is currently made to "glue back together" two
727  *	abutting entries.
728  *
729  *	The symmetric (shadow) copy strategy implements virtual copy
730  *	by copying VM object references from one map to
731  *	another, and then marking both regions as copy-on-write.
732  *	It is important to note that only one writeable reference
733  *	to a VM object region exists in any map when this strategy
734  *	is used -- this means that shadow object creation can be
735  *	delayed until a write operation occurs.  The symmetric (delayed)
736  *	strategy allows multiple maps to have writeable references to
737  *	the same region of a vm object, and hence cannot delay creating
738  *	its copy objects.  See vm_object_copy_quickly() in vm_object.c.
739  *	Copying of permanent objects is completely different; see
740  *	vm_object_copy_strategically() in vm_object.c.
741  */
742 
743 ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);
744 
745 #define VM_MAP_ZONE_NAME        "maps"
746 #define VM_MAP_ZFLAGS           (ZC_NOENCRYPT | ZC_VM)
747 
748 #define VM_MAP_ENTRY_ZONE_NAME  "VM map entries"
749 #define VM_MAP_ENTRY_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)
750 
751 #define VM_MAP_HOLES_ZONE_NAME  "VM map holes"
752 #define VM_MAP_HOLES_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)
753 
754 /*
755  * Asserts that a vm_map_copy object is coming from the
756  * vm_map_copy_zone to ensure that it isn't a fake constructed
757  * anywhere else.
758  */
void
vm_map_copy_require(struct vm_map_copy *copy)
{
	/* panics unless "copy" really came from the vm_map_copy zone */
	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
}
764 
765 /*
766  *	vm_map_require:
767  *
768  *	Ensures that the argument is memory allocated from the genuine
769  *	vm map zone. (See zone_id_require_allow_foreign).
770  */
void
vm_map_require(vm_map_t map)
{
	/* panics unless "map" really came from the vm_map zone */
	zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
}
776 
777 #define VM_MAP_EARLY_COUNT_MAX         16
778 static __startup_data vm_offset_t      map_data;
779 static __startup_data vm_size_t        map_data_size;
780 static __startup_data vm_offset_t      kentry_data;
781 static __startup_data vm_size_t        kentry_data_size;
782 static __startup_data vm_offset_t      map_holes_data;
783 static __startup_data vm_size_t        map_holes_data_size;
784 static __startup_data vm_map_t        *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
785 static __startup_data uint32_t         early_map_count;
786 
787 #if XNU_TARGET_OS_OSX
788 #define         NO_COALESCE_LIMIT  ((1024 * 128) - 1)
789 #else /* XNU_TARGET_OS_OSX */
790 #define         NO_COALESCE_LIMIT  0
791 #endif /* XNU_TARGET_OS_OSX */
792 
793 /* Skip acquiring locks if we're in the midst of a kernel core dump */
794 unsigned int not_in_kdp = 1;
795 
796 unsigned int vm_map_set_cache_attr_count = 0;
797 
798 kern_return_t
vm_map_set_cache_attr(vm_map_t map,vm_map_offset_t va)799 vm_map_set_cache_attr(
800 	vm_map_t        map,
801 	vm_map_offset_t va)
802 {
803 	vm_map_entry_t  map_entry;
804 	vm_object_t     object;
805 	kern_return_t   kr = KERN_SUCCESS;
806 
807 	vm_map_lock_read(map);
808 
809 	if (!vm_map_lookup_entry(map, va, &map_entry) ||
810 	    map_entry->is_sub_map) {
811 		/*
812 		 * that memory is not properly mapped
813 		 */
814 		kr = KERN_INVALID_ARGUMENT;
815 		goto done;
816 	}
817 	object = VME_OBJECT(map_entry);
818 
819 	if (object == VM_OBJECT_NULL) {
820 		/*
821 		 * there should be a VM object here at this point
822 		 */
823 		kr = KERN_INVALID_ARGUMENT;
824 		goto done;
825 	}
826 	vm_object_lock(object);
827 	object->set_cache_attr = TRUE;
828 	vm_object_unlock(object);
829 
830 	vm_map_set_cache_attr_count++;
831 done:
832 	vm_map_unlock_read(map);
833 
834 	return kr;
835 }
836 
837 
838 #if CONFIG_CODE_DECRYPTION
839 /*
840  * vm_map_apple_protected:
841  * This remaps the requested part of the object with an object backed by
842  * the decrypting pager.
843  * crypt_info contains entry points and session data for the crypt module.
844  * The crypt_info block will be copied by vm_map_apple_protected. The data structures
845  * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
846  */
847 kern_return_t
vm_map_apple_protected(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_object_offset_t crypto_backing_offset,struct pager_crypt_info * crypt_info,uint32_t cryptid)848 vm_map_apple_protected(
849 	vm_map_t                map,
850 	vm_map_offset_t         start,
851 	vm_map_offset_t         end,
852 	vm_object_offset_t      crypto_backing_offset,
853 	struct pager_crypt_info *crypt_info,
854 	uint32_t                cryptid)
855 {
856 	boolean_t       map_locked;
857 	kern_return_t   kr;
858 	vm_map_entry_t  map_entry;
859 	struct vm_map_entry tmp_entry;
860 	memory_object_t unprotected_mem_obj;
861 	vm_object_t     protected_object;
862 	vm_map_offset_t map_addr;
863 	vm_map_offset_t start_aligned, end_aligned;
864 	vm_object_offset_t      crypto_start, crypto_end;
865 	boolean_t       cache_pager;
866 
867 	map_locked = FALSE;
868 	unprotected_mem_obj = MEMORY_OBJECT_NULL;
869 
870 	start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
871 	end_aligned = vm_map_round_page(end, PAGE_MASK_64);
872 	start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
873 	end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));
874 
875 #if __arm64__
876 	/*
877 	 * "start" and "end" might be 4K-aligned but not 16K-aligned,
878 	 * so we might have to loop and establish up to 3 mappings:
879 	 *
880 	 * + the first 16K-page, which might overlap with the previous
881 	 *   4K-aligned mapping,
882 	 * + the center,
883 	 * + the last 16K-page, which might overlap with the next
884 	 *   4K-aligned mapping.
885 	 * Each of these mapping might be backed by a vnode pager (if
886 	 * properly page-aligned) or a "fourk_pager", itself backed by a
887 	 * vnode pager (if 4K-aligned but not page-aligned).
888 	 */
889 #endif /* __arm64__ */
890 
891 	map_addr = start_aligned;
892 	for (map_addr = start_aligned;
893 	    map_addr < end;
894 	    map_addr = tmp_entry.vme_end) {
895 		vm_map_lock(map);
896 		map_locked = TRUE;
897 
898 		/* lookup the protected VM object */
899 		if (!vm_map_lookup_entry(map,
900 		    map_addr,
901 		    &map_entry) ||
902 		    map_entry->is_sub_map ||
903 		    VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
904 			/* that memory is not properly mapped */
905 			kr = KERN_INVALID_ARGUMENT;
906 			goto done;
907 		}
908 
909 		/* ensure mapped memory is mapped as executable except
910 		 *  except for model decryption flow */
911 		if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
912 		    !(map_entry->protection & VM_PROT_EXECUTE)) {
913 			kr = KERN_INVALID_ARGUMENT;
914 			goto done;
915 		}
916 
917 		/* get the protected object to be decrypted */
918 		protected_object = VME_OBJECT(map_entry);
919 		if (protected_object == VM_OBJECT_NULL) {
920 			/* there should be a VM object here at this point */
921 			kr = KERN_INVALID_ARGUMENT;
922 			goto done;
923 		}
924 		/* ensure protected object stays alive while map is unlocked */
925 		vm_object_reference(protected_object);
926 
927 		/* limit the map entry to the area we want to cover */
928 		vm_map_clip_start(map, map_entry, start_aligned);
929 		vm_map_clip_end(map, map_entry, end_aligned);
930 
931 		tmp_entry = *map_entry;
932 		map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
933 		vm_map_unlock(map);
934 		map_locked = FALSE;
935 
936 		/*
937 		 * This map entry might be only partially encrypted
938 		 * (if not fully "page-aligned").
939 		 */
940 		crypto_start = 0;
941 		crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
942 		if (tmp_entry.vme_start < start) {
943 			if (tmp_entry.vme_start != start_aligned) {
944 				kr = KERN_INVALID_ADDRESS;
945 			}
946 			crypto_start += (start - tmp_entry.vme_start);
947 		}
948 		if (tmp_entry.vme_end > end) {
949 			if (tmp_entry.vme_end != end_aligned) {
950 				kr = KERN_INVALID_ADDRESS;
951 			}
952 			crypto_end -= (tmp_entry.vme_end - end);
953 		}
954 
955 		/*
956 		 * This "extra backing offset" is needed to get the decryption
957 		 * routine to use the right key.  It adjusts for the possibly
958 		 * relative offset of an interposed "4K" pager...
959 		 */
960 		if (crypto_backing_offset == (vm_object_offset_t) -1) {
961 			crypto_backing_offset = VME_OFFSET(&tmp_entry);
962 		}
963 
964 		cache_pager = TRUE;
965 #if XNU_TARGET_OS_OSX
966 		if (vm_map_is_alien(map)) {
967 			cache_pager = FALSE;
968 		}
969 #endif /* XNU_TARGET_OS_OSX */
970 
971 		/*
972 		 * Lookup (and create if necessary) the protected memory object
973 		 * matching that VM object.
974 		 * If successful, this also grabs a reference on the memory object,
975 		 * to guarantee that it doesn't go away before we get a chance to map
976 		 * it.
977 		 */
978 		unprotected_mem_obj = apple_protect_pager_setup(
979 			protected_object,
980 			VME_OFFSET(&tmp_entry),
981 			crypto_backing_offset,
982 			crypt_info,
983 			crypto_start,
984 			crypto_end,
985 			cache_pager);
986 
987 		/* release extra ref on protected object */
988 		vm_object_deallocate(protected_object);
989 
990 		if (unprotected_mem_obj == NULL) {
991 			kr = KERN_FAILURE;
992 			goto done;
993 		}
994 
995 		/* can overwrite an immutable mapping */
996 		vm_map_kernel_flags_t vmk_flags = {
997 			.vmf_fixed = true,
998 			.vmf_overwrite = true,
999 			.vmkf_overwrite_immutable = true,
1000 		};
1001 #if __arm64__
1002 		if (tmp_entry.used_for_jit &&
1003 		    (VM_MAP_PAGE_SHIFT(map) != FOURK_PAGE_SHIFT ||
1004 		    PAGE_SHIFT != FOURK_PAGE_SHIFT) &&
1005 		    fourk_binary_compatibility_unsafe &&
1006 		    fourk_binary_compatibility_allow_wx) {
1007 			printf("** FOURK_COMPAT [%d]: "
1008 			    "allowing write+execute at 0x%llx\n",
1009 			    proc_selfpid(), tmp_entry.vme_start);
1010 			vmk_flags.vmkf_map_jit = TRUE;
1011 		}
1012 #endif /* __arm64__ */
1013 
1014 		/* map this memory object in place of the current one */
1015 		map_addr = tmp_entry.vme_start;
1016 		kr = vm_map_enter_mem_object(map,
1017 		    &map_addr,
1018 		    (tmp_entry.vme_end -
1019 		    tmp_entry.vme_start),
1020 		    (mach_vm_offset_t) 0,
1021 		    vmk_flags,
1022 		    (ipc_port_t)(uintptr_t) unprotected_mem_obj,
1023 		    0,
1024 		    TRUE,
1025 		    tmp_entry.protection,
1026 		    tmp_entry.max_protection,
1027 		    tmp_entry.inheritance);
1028 		assertf(kr == KERN_SUCCESS,
1029 		    "kr = 0x%x\n", kr);
1030 		assertf(map_addr == tmp_entry.vme_start,
1031 		    "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
1032 		    (uint64_t)map_addr,
1033 		    (uint64_t) tmp_entry.vme_start,
1034 		    &tmp_entry);
1035 
1036 #if VM_MAP_DEBUG_APPLE_PROTECT
1037 		if (vm_map_debug_apple_protect) {
1038 			printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
1039 			    " backing:[object:%p,offset:0x%llx,"
1040 			    "crypto_backing_offset:0x%llx,"
1041 			    "crypto_start:0x%llx,crypto_end:0x%llx]\n",
1042 			    map,
1043 			    (uint64_t) map_addr,
1044 			    (uint64_t) (map_addr + (tmp_entry.vme_end -
1045 			    tmp_entry.vme_start)),
1046 			    unprotected_mem_obj,
1047 			    protected_object,
1048 			    VME_OFFSET(&tmp_entry),
1049 			    crypto_backing_offset,
1050 			    crypto_start,
1051 			    crypto_end);
1052 		}
1053 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1054 
1055 		/*
1056 		 * Release the reference obtained by
1057 		 * apple_protect_pager_setup().
1058 		 * The mapping (if it succeeded) is now holding a reference on
1059 		 * the memory object.
1060 		 */
1061 		memory_object_deallocate(unprotected_mem_obj);
1062 		unprotected_mem_obj = MEMORY_OBJECT_NULL;
1063 
1064 		/* continue with next map entry */
1065 		crypto_backing_offset += (tmp_entry.vme_end -
1066 		    tmp_entry.vme_start);
1067 		crypto_backing_offset -= crypto_start;
1068 	}
1069 	kr = KERN_SUCCESS;
1070 
1071 done:
1072 	if (map_locked) {
1073 		vm_map_unlock(map);
1074 	}
1075 	return kr;
1076 }
1077 #endif  /* CONFIG_CODE_DECRYPTION */
1078 
1079 
1080 LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
1081 LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
1082 LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);
1083 
1084 #if XNU_TARGET_OS_OSX
1085 int malloc_no_cow = 0;
1086 #else /* XNU_TARGET_OS_OSX */
1087 int malloc_no_cow = 1;
1088 #endif /* XNU_TARGET_OS_OSX */
1089 uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
1090 #if DEBUG
1091 int vm_check_map_sanity = 0;
1092 #endif
1093 
1094 /*
1095  *	vm_map_init:
1096  *
1097  *	Initialize the vm_map module.  Must be called before
1098  *	any other vm_map routines.
1099  *
1100  *	Map and entry structures are allocated from zones -- we must
1101  *	initialize those zones.
1102  *
1103  *	There are three zones of interest:
1104  *
1105  *	vm_map_zone:		used to allocate maps.
1106  *	vm_map_entry_zone:	used to allocate map entries.
1107  *
1108  *	LP32:
1109  *	vm_map_entry_reserved_zone:     fallback zone for kernel map entries
1110  *
1111  *	The kernel allocates map entries from a special zone that is initially
1112  *	"crammed" with memory.  It would be difficult (perhaps impossible) for
1113  *	the kernel to allocate more memory to a entry zone when it became
1114  *	empty since the very act of allocating memory implies the creation
1115  *	of a new entry.
1116  */
__startup_func
void
vm_map_init(void)
{

#if MACH_ASSERT
	/* optional boot-arg filter for debug4k logging (MACH_ASSERT builds only) */
	PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
	    sizeof(debug4k_filter));
#endif /* MACH_ASSERT */

	/* zone backing struct _vm_map allocations */
	zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
	    VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);

	/*
	 * Don't quarantine because we always need elements available
	 * Disallow GC on this zone... to aid the GC.
	 *
	 * Keep a per-CPU reserve of entries: allocating a map entry can
	 * itself require VM work, so the zone must never run dry.
	 */
	zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
	    ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
		z->z_elems_rsv = (uint16_t)(32 *
		(ml_early_cpu_max_number() + 1));
	});

	/* zone for hole-list links; reserve ~16K worth of elements */
	zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
	    ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
		z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
	});

	zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
	    ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);

	/*
	 * Add the stolen memory to zones, adjust zone size and stolen counts.
	 * (The memory was reserved earlier by vm_map_steal_memory().)
	 */
	zone_cram_early(vm_map_zone, map_data, map_data_size);
	zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
	zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
	/* NOTE(review): "boostrap" typo is preserved in the log message on purpose */
	printf("VM boostrap: %d maps, %d entries and %d holes available\n",
	    zone_count_free(vm_map_zone),
	    zone_count_free(vm_map_entry_zone),
	    zone_count_free(vm_map_holes_zone));

	/*
	 * Since these are covered by zones, remove them from stolen page accounting.
	 */
	VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));

#if VM_MAP_DEBUG_APPLE_PROTECT
	PE_parse_boot_argn("vm_map_debug_apple_protect",
	    &vm_map_debug_apple_protect,
	    sizeof(vm_map_debug_apple_protect));
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */
#if VM_MAP_DEBUG_APPLE_FOURK
	PE_parse_boot_argn("vm_map_debug_fourk",
	    &vm_map_debug_fourk,
	    sizeof(vm_map_debug_fourk));
#endif /* VM_MAP_DEBUG_APPLE_FOURK */

	/*
	 * When enabled (default on embedded, see malloc_no_cow definition),
	 * mappings tagged with the VM_MEMORY_MALLOC* tags below are never
	 * copy-on-write shared; the exact tag set can be overridden by the
	 * "vm_memory_malloc_no_cow_mask" boot-arg.
	 */
	PE_parse_boot_argn("malloc_no_cow",
	    &malloc_no_cow,
	    sizeof(malloc_no_cow));
	if (malloc_no_cow) {
		vm_memory_malloc_no_cow_mask = 0ULL;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
//		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
//		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
//		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
		PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
		    &vm_memory_malloc_no_cow_mask,
		    sizeof(vm_memory_malloc_no_cow_mask));
	}

#if CONFIG_MAP_RANGES
	vm_map_range_map_init();
#endif /* CONFIG_MAP_RANGES */

#if DEBUG
	PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
	if (vm_check_map_sanity) {
		kprintf("VM sanity checking enabled\n");
	} else {
		kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
	}
#endif /* DEBUG */

#if DEVELOPMENT || DEBUG
	PE_parse_boot_argn("panic_on_unsigned_execute",
	    &panic_on_unsigned_execute,
	    sizeof(panic_on_unsigned_execute));
	PE_parse_boot_argn("panic_on_mlock_failure",
	    &panic_on_mlock_failure,
	    sizeof(panic_on_mlock_failure));
#endif /* DEVELOPMENT || DEBUG */
}
1220 
__startup_func
static void
vm_map_steal_memory(void)
{
	/*
	 * We need to reserve enough memory to support bootstrapping VM maps
	 * and the zone subsystem.
	 *
	 * The VM Maps that need to function before zones can support them
	 * are the ones registered with vm_map_will_allocate_early_map(),
	 * which are:
	 * - the kernel map
	 * - the various submaps used by zones (pgz, meta, ...)
	 *
	 * We also need enough entries and holes to support them
	 * until zone_metadata_init() is called, which is when
	 * the zone allocator becomes capable of expanding dynamically.
	 *
	 * We need:
	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
	 * - To allow for 3-4 entries per map, but the kernel map
	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
	 *   to describe the submaps, so double it (and make it 8x too)
	 * - To allow for holes between entries,
	 *   hence needs the same budget as entries
	 */
	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
	    VM_MAP_EARLY_COUNT_MAX);

	/* 8x budget: see rationale above */
	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	/*
	 * Steal a contiguous range of memory so that a simple range check
	 * can validate early addresses being freed/crammed to these
	 * zones
	 */
	map_data       = zone_early_mem_init(map_data_size + kentry_data_size +
	    map_holes_data_size);
	kentry_data    = map_data + map_data_size;
	map_holes_data = kentry_data + kentry_data_size;
}
STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1270 
/*
 * Called once the zone allocator is fully up (STARTUP ZALLOC, below):
 * the VM bootstrap is complete, so per-CPU caching can now be enabled
 * on the zones backing map entries, hole links and map copies.
 * (Function name typo is historical; it is referenced by the STARTUP
 * registration below.)
 */
__startup_func
static void
vm_kernel_boostraped(void)
{
	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);

	printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
	    zone_count_free(vm_map_zone),
	    zone_count_free(vm_map_entry_zone),
	    zone_count_free(vm_map_holes_zone));
}
STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1285 
1286 void
vm_map_disable_hole_optimization(vm_map_t map)1287 vm_map_disable_hole_optimization(vm_map_t map)
1288 {
1289 	vm_map_entry_t  head_entry, hole_entry, next_hole_entry;
1290 
1291 	if (map->holelistenabled) {
1292 		head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1293 
1294 		while (hole_entry != NULL) {
1295 			next_hole_entry = hole_entry->vme_next;
1296 
1297 			hole_entry->vme_next = NULL;
1298 			hole_entry->vme_prev = NULL;
1299 			zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1300 
1301 			if (next_hole_entry == head_entry) {
1302 				hole_entry = NULL;
1303 			} else {
1304 				hole_entry = next_hole_entry;
1305 			}
1306 		}
1307 
1308 		map->holes_list = NULL;
1309 		map->holelistenabled = FALSE;
1310 
1311 		map->first_free = vm_map_first_entry(map);
1312 		SAVE_HINT_HOLE_WRITE(map, NULL);
1313 	}
1314 }
1315 
1316 boolean_t
vm_kernel_map_is_kernel(vm_map_t map)1317 vm_kernel_map_is_kernel(vm_map_t map)
1318 {
1319 	return map->pmap == kernel_pmap;
1320 }
1321 
1322 /*
1323  *	vm_map_create:
1324  *
1325  *	Creates and returns a new empty VM map with
1326  *	the given physical map structure, and having
1327  *	the given lower and upper address bounds.
1328  */
1329 
1330 extern vm_map_t vm_map_create_external(
1331 	pmap_t                  pmap,
1332 	vm_map_offset_t         min_off,
1333 	vm_map_offset_t         max_off,
1334 	boolean_t               pageable);
1335 
1336 vm_map_t
vm_map_create_external(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,boolean_t pageable)1337 vm_map_create_external(
1338 	pmap_t                  pmap,
1339 	vm_map_offset_t         min,
1340 	vm_map_offset_t         max,
1341 	boolean_t               pageable)
1342 {
1343 	vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1344 
1345 	if (pageable) {
1346 		options |= VM_MAP_CREATE_PAGEABLE;
1347 	}
1348 	return vm_map_create_options(pmap, min, max, options);
1349 }
1350 
1351 __startup_func
1352 void
vm_map_will_allocate_early_map(vm_map_t * owner)1353 vm_map_will_allocate_early_map(vm_map_t *owner)
1354 {
1355 	if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1356 		panic("VM_MAP_EARLY_COUNT_MAX is too low");
1357 	}
1358 
1359 	early_map_owners[early_map_count++] = owner;
1360 }
1361 
1362 __startup_func
1363 void
vm_map_relocate_early_maps(vm_offset_t delta)1364 vm_map_relocate_early_maps(vm_offset_t delta)
1365 {
1366 	for (uint32_t i = 0; i < early_map_count; i++) {
1367 		vm_address_t addr = (vm_address_t)*early_map_owners[i];
1368 
1369 		*early_map_owners[i] = (vm_map_t)(addr + delta);
1370 	}
1371 
1372 	early_map_count = ~0u;
1373 }
1374 
1375 /*
1376  *	Routine:	vm_map_relocate_early_elem
1377  *
1378  *	Purpose:
1379  *		Early zone elements are allocated in a temporary part
1380  *		of the address space.
1381  *
1382  *		Once the zones live in their final place, the early
1383  *		VM maps, map entries and map holes need to be relocated.
1384  *
1385  *		It involves rewriting any vm_map_t, vm_map_entry_t or
1386  *		pointers to vm_map_links. Other pointers to other types
1387  *		are fine.
1388  *
1389  *		Fortunately, pointers to those types are self-contained
1390  *		in those zones, _except_ for pointers to VM maps,
1391  *		which are tracked during early boot and fixed with
1392  *		vm_map_relocate_early_maps().
1393  */
__startup_func
void
vm_map_relocate_early_elem(
	uint32_t                zone_id,
	vm_offset_t             new_addr,
	vm_offset_t             delta)
{
/*
 * Slide the pointer stored in ((type_t)new_addr)->field by "delta",
 * leaving NULL pointers untouched.
 */
#define relocate(type_t, field)  ({ \
	typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field;   \
	if (*__field) {                                                        \
	        *__field = (typeof(*__field))((vm_offset_t)*__field + delta);  \
	}                                                                      \
})

	/* only the three early-boot VM zones may contain relocatable elements */
	switch (zone_id) {
	case ZONE_ID_VM_MAP:
	case ZONE_ID_VM_MAP_ENTRY:
	case ZONE_ID_VM_MAP_HOLES:
		break;

	default:
		panic("Unexpected zone ID %d", zone_id);
	}

	if (zone_id == ZONE_ID_VM_MAP) {
		relocate(vm_map_t, hdr.links.prev);
		relocate(vm_map_t, hdr.links.next);
		/* early maps are all kernel maps (see vm_map_create_options checks) */
		((vm_map_t)new_addr)->pmap = kernel_pmap;
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_t, hdr.rb_head_store.rbh_root);
#endif /* VM_MAP_STORE_USE_RB */
		relocate(vm_map_t, hint);
		relocate(vm_map_t, hole_hint);
		relocate(vm_map_t, first_free);
		return;
	}

	/* map entries and hole links both start with a vm_map_links */
	relocate(struct vm_map_links *, prev);
	relocate(struct vm_map_links *, next);

	if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_entry_t, store.entry.rbe_left);
		relocate(vm_map_entry_t, store.entry.rbe_right);
		relocate(vm_map_entry_t, store.entry.rbe_parent);
#endif /* VM_MAP_STORE_USE_RB */
		if (((vm_map_entry_t)new_addr)->is_sub_map) {
			/* no object to relocate because we haven't made any */
			/*
			 * submap pointers are stored shifted by
			 * VME_SUBMAP_SHIFT, so the delta must be
			 * shifted the same way before adding.
			 */
			((vm_map_entry_t)new_addr)->vme_submap +=
			    delta >> VME_SUBMAP_SHIFT;
		}
#if MAP_ENTRY_CREATION_DEBUG
		relocate(vm_map_entry_t, vme_creation_maphdr);
#endif /* MAP_ENTRY_CREATION_DEBUG */
	}

#undef relocate
}
1452 
/*
 * Allocate and initialize a new VM map covering [min, max) on top of
 * "pmap".  The returned map has one reference and, unless disabled by
 * "options", an enabled hole list seeded with a single hole spanning
 * the whole range.
 */
vm_map_t
vm_map_create_options(
	pmap_t                  pmap,
	vm_map_offset_t         min,
	vm_map_offset_t         max,
	vm_map_create_options_t options)
{
	vm_map_t result;

#if DEBUG || DEVELOPMENT
	/*
	 * Before zalloc is fully up, every map allocation must have been
	 * announced via vm_map_will_allocate_early_map() and must use the
	 * kernel pmap (early maps are relocated later, see
	 * vm_map_relocate_early_maps()).
	 */
	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
		if (early_map_count != ~0u && early_map_count !=
		    zone_count_allocated(vm_map_zone) + 1) {
			panic("allocating %dth early map, owner not known",
			    zone_count_allocated(vm_map_zone) + 1);
		}
		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
			panic("allocating %dth early map for non kernel pmap",
			    early_map_count);
		}
	}
#endif /* DEBUG || DEVELOPMENT */

	result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);

	vm_map_store_init(&result->hdr);
	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
	vm_map_set_page_shift(result, PAGE_SHIFT);

	result->size_limit      = RLIM_INFINITY;        /* default unlimited */
	result->data_limit      = RLIM_INFINITY;        /* default unlimited */
	result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
	result->pmap = pmap;
	result->min_offset = min;
	result->max_offset = max;
	result->first_free = vm_map_to_entry(result);
	result->hint = vm_map_to_entry(result);

	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
		assert(pmap == kernel_pmap);
		result->never_faults = true;
	}

	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
		result->has_corpse_footprint = true;
	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
		struct vm_map_links *hole_entry;

		/* single hole covering the entire address range, circularly linked */
		hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
		hole_entry->start = min;
#if defined(__arm64__)
		hole_entry->end = result->max_offset;
#else
		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
#endif
		result->holes_list = result->hole_hint = hole_entry;
		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
		result->holelistenabled = true;
	}

	vm_map_lock_init(result);

	return result;
}
1519 
1520 /*
1521  * Adjusts a submap that was made by kmem_suballoc()
1522  * before it knew where it would be mapped,
1523  * so that it has the right min/max offsets.
1524  *
1525  * We do not need to hold any locks:
1526  * only the caller knows about this map,
1527  * and it is not published on any entry yet.
1528  */
1529 static void
vm_map_adjust_offsets(vm_map_t map,vm_map_offset_t min_off,vm_map_offset_t max_off)1530 vm_map_adjust_offsets(
1531 	vm_map_t                map,
1532 	vm_map_offset_t         min_off,
1533 	vm_map_offset_t         max_off)
1534 {
1535 	assert(map->min_offset == 0);
1536 	assert(map->max_offset == max_off - min_off);
1537 	assert(map->hdr.nentries == 0);
1538 	assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1539 
1540 	map->min_offset = min_off;
1541 	map->max_offset = max_off;
1542 
1543 	if (map->holelistenabled) {
1544 		struct vm_map_links *hole = map->holes_list;
1545 
1546 		hole->start = min_off;
1547 #if defined(__arm64__)
1548 		hole->end = max_off;
1549 #else
1550 		hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1551 #endif
1552 	}
1553 }
1554 
1555 
1556 vm_map_size_t
vm_map_adjusted_size(vm_map_t map)1557 vm_map_adjusted_size(vm_map_t map)
1558 {
1559 	const struct vm_reserved_region *regions = NULL;
1560 	size_t num_regions = 0;
1561 	mach_vm_size_t  reserved_size = 0, map_size = 0;
1562 
1563 	if (map == NULL || (map->size == 0)) {
1564 		return 0;
1565 	}
1566 
1567 	map_size = map->size;
1568 
1569 	if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1570 		/*
1571 		 * No special reserved regions or not an exotic map or the task
1572 		 * is terminating and these special regions might have already
1573 		 * been deallocated.
1574 		 */
1575 		return map_size;
1576 	}
1577 
1578 	num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1579 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1580 
1581 	while (num_regions) {
1582 		reserved_size += regions[--num_regions].vmrr_size;
1583 	}
1584 
1585 	/*
1586 	 * There are a few places where the map is being switched out due to
1587 	 * 'termination' without that bit being set (e.g. exec and corpse purging).
1588 	 * In those cases, we could have the map's regions being deallocated on
1589 	 * a core while some accounting process is trying to get the map's size.
1590 	 * So this assert can't be enabled till all those places are uniform in
1591 	 * their use of the 'map->terminated' bit.
1592 	 *
1593 	 * assert(map_size >= reserved_size);
1594 	 */
1595 
1596 	return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1597 }
1598 
1599 /*
1600  *	vm_map_entry_create:	[ internal use only ]
1601  *
1602  *	Allocates a VM map entry for insertion in the
1603  *	given map (or map copy).  No fields are filled.
1604  *
1605  *	The VM entry will be zero initialized, except for:
1606  *	- behavior set to VM_BEHAVIOR_DEFAULT
1607  *	- inheritance set to VM_INHERIT_DEFAULT
1608  */
/* allocate an entry destined for a map's header */
#define vm_map_entry_create(map)    _vm_map_entry_create(&(map)->hdr)

/* allocate an entry destined for a vm_map_copy's header */
#define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)

static vm_map_entry_t
_vm_map_entry_create(
	struct vm_map_header    *map_header __unused)
{
	vm_map_entry_t entry = NULL;

	/* Z_ZERO: the entry comes back fully zeroed */
	entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);

	/*
	 * Help the compiler with what we know to be true,
	 * so that the further bitfields inits have good codegen.
	 *
	 * See rdar://87041299
	 */
	__builtin_assume(entry->vme_object_value == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);

	static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
	    "VME_ALIAS_MASK covers tags");

	/* zeroed entry already has behavior == VM_BEHAVIOR_DEFAULT */
	static_assert(VM_BEHAVIOR_DEFAULT == 0,
	    "can skip zeroing of the behavior field");
	entry->inheritance = VM_INHERIT_DEFAULT;

#if MAP_ENTRY_CREATION_DEBUG
	/* record who created this entry, for debugging */
	entry->vme_creation_maphdr = map_header;
	entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
	    BTREF_GET_NOWAIT);
#endif
	return entry;
}
1645 
1646 /*
1647  *	vm_map_entry_dispose:	[ internal use only ]
1648  *
1649  *	Inverse of vm_map_entry_create.
1650  *
1651  *      write map lock held so no need to
1652  *	do anything special to insure correctness
1653  *      of the stores
1654  */
static void
vm_map_entry_dispose(
	vm_map_entry_t          entry)
{
#if MAP_ENTRY_CREATION_DEBUG
	/* drop the creation backtrace reference taken in _vm_map_entry_create() */
	btref_put(entry->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	btref_put(entry->vme_insertion_bt);
#endif
	zfree(vm_map_entry_zone, entry);
}
1667 
1668 #define vm_map_copy_entry_dispose(copy_entry) \
1669 	vm_map_entry_dispose(copy_entry)
1670 
/* Return (without removing) the first entry of a zap list, or NULL if empty. */
static vm_map_entry_t
vm_map_zap_first_entry(
	vm_map_zap_t            list)
{
	return list->vmz_head;
}
1677 
/*
 * Return the last entry of a zap list; the list must be non-empty.
 * vmz_tail points at the last entry's vme_next field, so the entry
 * itself is recovered with __container_of.
 */
static vm_map_entry_t
vm_map_zap_last_entry(
	vm_map_zap_t            list)
{
	assert(vm_map_zap_first_entry(list));
	return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
}
1685 
/* Append "entry" at the tail of the zap (deferred-free) list. */
static void
vm_map_zap_append(
	vm_map_zap_t            list,
	vm_map_entry_t          entry)
{
	/* terminate the entry, hook it at the tail, advance the tail pointer */
	entry->vme_next = VM_MAP_ENTRY_NULL;
	*list->vmz_tail = entry;
	list->vmz_tail = &entry->vme_next;
}
1695 
1696 static vm_map_entry_t
vm_map_zap_pop(vm_map_zap_t list)1697 vm_map_zap_pop(
1698 	vm_map_zap_t            list)
1699 {
1700 	vm_map_entry_t head = list->vmz_head;
1701 
1702 	if (head != VM_MAP_ENTRY_NULL &&
1703 	    (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1704 		list->vmz_tail = &list->vmz_head;
1705 	}
1706 
1707 	return head;
1708 }
1709 
1710 static void
vm_map_zap_dispose(vm_map_zap_t list)1711 vm_map_zap_dispose(
1712 	vm_map_zap_t            list)
1713 {
1714 	vm_map_entry_t          entry;
1715 
1716 	while ((entry = vm_map_zap_pop(list))) {
1717 		if (entry->is_sub_map) {
1718 			vm_map_deallocate(VME_SUBMAP(entry));
1719 		} else {
1720 			vm_object_deallocate(VME_OBJECT(entry));
1721 		}
1722 
1723 		vm_map_entry_dispose(entry);
1724 	}
1725 }
1726 
#if MACH_ASSERT
static boolean_t first_free_check = FALSE;
/*
 * Validate the map's "first_free" hint against the store layer,
 * but only when the first_free_check debug toggle is enabled;
 * otherwise report success unconditionally.
 */
boolean_t
first_free_is_valid(
	vm_map_t        map)
{
	return first_free_check ? first_free_is_valid_store(map) : TRUE;
}
#endif /* MACH_ASSERT */
1740 
1741 
/* Link/unlink an entry into/from a vm_map_copy's entry list
 * (copy-header variants of the map entry link/unlink operations). */
#define vm_map_copy_entry_link(copy, after_where, entry)                \
	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))

#define vm_map_copy_entry_unlink(copy, entry)                           \
	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1747 
1748 /*
1749  *	vm_map_destroy:
1750  *
1751  *	Actually destroy a map.
1752  */
void
vm_map_destroy(
	vm_map_t        map)
{
	/* final cleanup: this is not allowed to fail */
	vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;

	/* deleted entries are queued here and disposed of after unlocking */
	VM_MAP_ZAP_DECLARE(zap);

	vm_map_lock(map);

	map->terminated = true;
	/* clean up regular map entries */
	(void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
	    KMEM_GUARD_NONE, &zap);
	/* clean up leftover special mappings (commpage, GPU carveout, etc...) */
	(void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
	    KMEM_GUARD_NONE, &zap);

	vm_map_disable_hole_optimization(map);
	vm_map_corpse_footprint_destroy(map);

	vm_map_unlock(map);

	/* drop object/submap references and free entries, lock dropped */
	vm_map_zap_dispose(&zap);

	/* both deletion passes above must have emptied the map */
	assert(map->hdr.nentries == 0);

	if (map->pmap) {
		pmap_destroy(map->pmap);
	}

	lck_rw_destroy(&map->lock, &vm_map_lck_grp);

	zfree_id(ZONE_ID_VM_MAP, map);
}
1789 
1790 /*
1791  * Returns pid of the task with the largest number of VM map entries.
1792  * Used in the zone-map-exhaustion jetsam path.
1793  */
1794 pid_t
find_largest_process_vm_map_entries(void)1795 find_largest_process_vm_map_entries(void)
1796 {
1797 	pid_t victim_pid = -1;
1798 	int max_vm_map_entries = 0;
1799 	task_t task = TASK_NULL;
1800 	queue_head_t *task_list = &tasks;
1801 
1802 	lck_mtx_lock(&tasks_threads_lock);
1803 	queue_iterate(task_list, task, task_t, tasks) {
1804 		if (task == kernel_task || !task->active) {
1805 			continue;
1806 		}
1807 
1808 		vm_map_t task_map = task->map;
1809 		if (task_map != VM_MAP_NULL) {
1810 			int task_vm_map_entries = task_map->hdr.nentries;
1811 			if (task_vm_map_entries > max_vm_map_entries) {
1812 				max_vm_map_entries = task_vm_map_entries;
1813 				victim_pid = pid_from_task(task);
1814 			}
1815 		}
1816 	}
1817 	lck_mtx_unlock(&tasks_threads_lock);
1818 
1819 	printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
1820 	return victim_pid;
1821 }
1822 
1823 
1824 /*
1825  *	vm_map_lookup_entry:	[ internal use only ]
1826  *
1827  *	Calls into the vm map store layer to find the map
1828  *	entry containing (or immediately preceding) the
1829  *	specified address in the given map; the entry is returned
1830  *	in the "entry" parameter.  The boolean
1831  *	result indicates whether the address is
1832  *	actually contained in the map.
1833  */
boolean_t
vm_map_lookup_entry(
	vm_map_t        map,
	vm_map_offset_t address,
	vm_map_entry_t  *entry)         /* OUT */
{
#if CONFIG_KERNEL_TBI
	/* strip any top-byte tag so the lookup sees the canonical address */
	if (VM_KERNEL_ADDRESS(address)) {
		address = VM_KERNEL_STRIP_UPTR(address);
	}
#endif /* CONFIG_KERNEL_TBI */
#if CONFIG_PROB_GZALLOC
	/* PGZ-guarded addresses must be translated by the caller first;
	 * use vm_map_lookup_entry_allow_pgz() to bypass this check */
	if (map->pmap == kernel_pmap) {
		assertf(!pgz_owned(address),
		    "it is the responsibility of callers to unguard PGZ addresses");
	}
#endif /* CONFIG_PROB_GZALLOC */
	return vm_map_store_lookup_entry( map, address, entry );
}
1853 
1854 boolean_t
vm_map_lookup_entry_or_next(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)1855 vm_map_lookup_entry_or_next(
1856 	vm_map_t        map,
1857 	vm_map_offset_t address,
1858 	vm_map_entry_t  *entry)         /* OUT */
1859 {
1860 	if (vm_map_lookup_entry(map, address, entry)) {
1861 		return true;
1862 	}
1863 
1864 	*entry = (*entry)->vme_next;
1865 	return false;
1866 }
1867 
#if CONFIG_PROB_GZALLOC
/*
 *	vm_map_lookup_entry_allow_pgz:	[ internal use only ]
 *
 *	Variant of vm_map_lookup_entry() that omits the PGZ
 *	ownership assertion, for callers that may legitimately
 *	look up PGZ-guarded addresses.
 */
boolean_t
vm_map_lookup_entry_allow_pgz(
	vm_map_t        map,
	vm_map_offset_t address,
	vm_map_entry_t  *entry)         /* OUT */
{
#if CONFIG_KERNEL_TBI
	/* Strip TBI tag bits so the lookup uses the canonical address. */
	if (VM_KERNEL_ADDRESS(address)) {
		address = VM_KERNEL_STRIP_UPTR(address);
	}
#endif /* CONFIG_KERNEL_TBI */
	return vm_map_store_lookup_entry( map, address, entry );
}
#endif /* CONFIG_PROB_GZALLOC */
1883 
1884 /*
1885  *	Routine:	vm_map_range_invalid_panic
1886  *	Purpose:
1887  *			Panic on detection of an invalid range id.
1888  */
1889 __abortlike
1890 static void
vm_map_range_invalid_panic(vm_map_t map,vm_map_range_id_t range_id)1891 vm_map_range_invalid_panic(
1892 	vm_map_t                map,
1893 	vm_map_range_id_t       range_id)
1894 {
1895 	panic("invalid range ID (%u) for map %p", range_id, map);
1896 }
1897 
1898 /*
1899  *	Routine:	vm_map_get_range
1900  *	Purpose:
1901  *			Adjust bounds based on security policy.
1902  */
1903 static struct mach_vm_range
vm_map_get_range(vm_map_t map,vm_map_address_t * address,vm_map_kernel_flags_t * vmk_flags,vm_map_size_t size,bool * is_ptr)1904 vm_map_get_range(
1905 	vm_map_t                map,
1906 	vm_map_address_t       *address,
1907 	vm_map_kernel_flags_t  *vmk_flags,
1908 	vm_map_size_t           size,
1909 	bool                   *is_ptr)
1910 {
1911 	struct mach_vm_range effective_range = {};
1912 	vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;
1913 
1914 	if (map == kernel_map) {
1915 		effective_range = kmem_ranges[range_id];
1916 
1917 		if (startup_phase >= STARTUP_SUB_KMEM) {
1918 			/*
1919 			 * Hint provided by caller is zeroed as the range is restricted to a
1920 			 * subset of the entire kernel_map VA, which could put the hint outside
1921 			 * the range, causing vm_map_store_find_space to fail.
1922 			 */
1923 			*address = 0ull;
1924 			/*
1925 			 * Ensure that range_id passed in by the caller is within meaningful
1926 			 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
1927 			 * to fail as the corresponding range is invalid. Range id larger than
1928 			 * KMEM_RANGE_ID_MAX will lead to an OOB access.
1929 			 */
1930 			if ((range_id == KMEM_RANGE_ID_NONE) ||
1931 			    (range_id > KMEM_RANGE_ID_MAX)) {
1932 				vm_map_range_invalid_panic(map, range_id);
1933 			}
1934 
1935 			/*
1936 			 * Pointer ranges use kmem_locate_space to do allocations.
1937 			 *
1938 			 * Non pointer fronts look like [ Small | Large | Permanent ]
1939 			 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
1940 			 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
1941 			 * use the entire range.
1942 			 */
1943 			if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
1944 				*is_ptr = true;
1945 			} else if (size >= KMEM_SMALLMAP_THRESHOLD) {
1946 				effective_range = kmem_large_ranges[range_id];
1947 			}
1948 		}
1949 #if CONFIG_MAP_RANGES
1950 	} else if (map->uses_user_ranges) {
1951 		if (range_id > UMEM_RANGE_ID_MAX) {
1952 			vm_map_range_invalid_panic(map, range_id);
1953 		}
1954 
1955 		effective_range = map->user_range[range_id];
1956 #endif /* CONFIG_MAP_RANGES */
1957 	} else {
1958 		/*
1959 		 * If minimum is 0, bump it up by PAGE_SIZE.  We want to limit
1960 		 * allocations of PAGEZERO to explicit requests since its
1961 		 * normal use is to catch dereferences of NULL and many
1962 		 * applications also treat pointers with a value of 0 as
1963 		 * special and suddenly having address 0 contain useable
1964 		 * memory would tend to confuse those applications.
1965 		 */
1966 		effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
1967 		effective_range.max_address = map->max_offset;
1968 	}
1969 
1970 	return effective_range;
1971 }
1972 
1973 /*
1974  *	Routine:	vm_map_locate_space
1975  *	Purpose:
1976  *		Finds a range in the specified virtual address map,
1977  *		returning the start of that range,
1978  *		as well as the entry right before it.
1979  */
1980 kern_return_t
vm_map_locate_space(vm_map_t map,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_offset_t * start_inout,vm_map_entry_t * entry_out)1981 vm_map_locate_space(
1982 	vm_map_t                map,
1983 	vm_map_size_t           size,
1984 	vm_map_offset_t         mask,
1985 	vm_map_kernel_flags_t   vmk_flags,
1986 	vm_map_offset_t        *start_inout,
1987 	vm_map_entry_t         *entry_out)
1988 {
1989 	struct mach_vm_range effective_range = {};
1990 	vm_map_size_t   guard_offset;
1991 	vm_map_offset_t hint, limit;
1992 	vm_map_entry_t  entry;
1993 	bool            is_kmem_ptr_range = false;
1994 
1995 	/*
1996 	 * Only supported by vm_map_enter() with a fixed address.
1997 	 */
1998 	assert(!vmk_flags.vmkf_beyond_max);
1999 
2000 	if (__improbable(map->wait_for_space)) {
2001 		/*
2002 		 * support for "wait_for_space" is minimal,
2003 		 * its only consumer is the ipc_kernel_copy_map.
2004 		 */
2005 		assert(!map->holelistenabled &&
2006 		    !vmk_flags.vmkf_last_free &&
2007 		    !vmk_flags.vmkf_keep_map_locked &&
2008 		    !vmk_flags.vmkf_map_jit &&
2009 		    !vmk_flags.vmf_random_addr &&
2010 		    *start_inout <= map->min_offset);
2011 	} else if (vmk_flags.vmkf_last_free) {
2012 		assert(!vmk_flags.vmkf_map_jit &&
2013 		    !vmk_flags.vmf_random_addr);
2014 	}
2015 
2016 	if (vmk_flags.vmkf_guard_before) {
2017 		guard_offset = VM_MAP_PAGE_SIZE(map);
2018 		assert(size > guard_offset);
2019 		size -= guard_offset;
2020 	} else {
2021 		assert(size != 0);
2022 		guard_offset = 0;
2023 	}
2024 
2025 	/*
2026 	 * Validate range_id from flags and get associated range
2027 	 */
2028 	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
2029 	    &is_kmem_ptr_range);
2030 
2031 	if (is_kmem_ptr_range) {
2032 		return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
2033 		           vmk_flags.vmkf_last_free, start_inout, entry_out);
2034 	}
2035 
2036 #if XNU_TARGET_OS_OSX
2037 	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2038 		assert(map != kernel_map);
2039 		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2040 	}
2041 #endif /* XNU_TARGET_OS_OSX */
2042 
2043 again:
2044 	if (vmk_flags.vmkf_last_free) {
2045 		hint = *start_inout;
2046 
2047 		if (hint == 0 || hint > effective_range.max_address) {
2048 			hint = effective_range.max_address;
2049 		}
2050 		if (hint <= effective_range.min_address) {
2051 			return KERN_NO_SPACE;
2052 		}
2053 		limit = effective_range.min_address;
2054 	} else {
2055 		hint = *start_inout;
2056 
2057 		if (vmk_flags.vmkf_map_jit) {
2058 			if (map->jit_entry_exists &&
2059 			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
2060 				return KERN_INVALID_ARGUMENT;
2061 			}
2062 			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
2063 				vmk_flags.vmf_random_addr = true;
2064 			}
2065 		}
2066 
2067 		if (vmk_flags.vmf_random_addr) {
2068 			kern_return_t kr;
2069 
2070 			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
2071 			if (kr != KERN_SUCCESS) {
2072 				return kr;
2073 			}
2074 		}
2075 #if XNU_TARGET_OS_OSX
2076 		else if ((hint == 0 || hint == vm_map_min(map)) &&
2077 		    !map->disable_vmentry_reuse &&
2078 		    map->vmmap_high_start != 0) {
2079 			hint = map->vmmap_high_start;
2080 		}
2081 #endif /* XNU_TARGET_OS_OSX */
2082 
2083 		if (hint < effective_range.min_address) {
2084 			hint = effective_range.min_address;
2085 		}
2086 		if (effective_range.max_address <= hint) {
2087 			return KERN_NO_SPACE;
2088 		}
2089 
2090 		limit = effective_range.max_address;
2091 	}
2092 	entry = vm_map_store_find_space(map,
2093 	    hint, limit, vmk_flags.vmkf_last_free,
2094 	    guard_offset, size, mask,
2095 	    start_inout);
2096 
2097 	if (__improbable(entry == NULL)) {
2098 		if (map->wait_for_space &&
2099 		    guard_offset + size <=
2100 		    effective_range.max_address - effective_range.min_address) {
2101 			assert_wait((event_t)map, THREAD_ABORTSAFE);
2102 			vm_map_unlock(map);
2103 			thread_block(THREAD_CONTINUE_NULL);
2104 			vm_map_lock(map);
2105 			goto again;
2106 		}
2107 		return KERN_NO_SPACE;
2108 	}
2109 
2110 	if (entry_out) {
2111 		*entry_out = entry;
2112 	}
2113 	return KERN_SUCCESS;
2114 }
2115 
2116 
2117 /*
2118  *	Routine:	vm_map_find_space
2119  *	Purpose:
2120  *		Allocate a range in the specified virtual address map,
2121  *		returning the entry allocated for that range.
2122  *		Used by kmem_alloc, etc.
2123  *
2124  *		The map must be NOT be locked. It will be returned locked
2125  *		on KERN_SUCCESS, unlocked on failure.
2126  *
2127  *		If an entry is allocated, the object/offset fields
2128  *		are initialized to zero.
2129  */
2130 kern_return_t
vm_map_find_space(vm_map_t map,vm_map_offset_t hint_address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_entry_t * o_entry)2131 vm_map_find_space(
2132 	vm_map_t                map,
2133 	vm_map_offset_t         hint_address,
2134 	vm_map_size_t           size,
2135 	vm_map_offset_t         mask,
2136 	vm_map_kernel_flags_t   vmk_flags,
2137 	vm_map_entry_t          *o_entry)       /* OUT */
2138 {
2139 	vm_map_entry_t          new_entry, entry;
2140 	kern_return_t           kr;
2141 
2142 	if (size == 0) {
2143 		return KERN_INVALID_ARGUMENT;
2144 	}
2145 
2146 	new_entry = vm_map_entry_create(map);
2147 	new_entry->use_pmap = true;
2148 	new_entry->protection = VM_PROT_DEFAULT;
2149 	new_entry->max_protection = VM_PROT_ALL;
2150 
2151 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
2152 		new_entry->map_aligned = true;
2153 	}
2154 	if (vmk_flags.vmf_permanent) {
2155 		new_entry->vme_permanent = true;
2156 	}
2157 
2158 	vm_map_lock(map);
2159 
2160 	kr = vm_map_locate_space(map, size, mask, vmk_flags,
2161 	    &hint_address, &entry);
2162 	if (kr != KERN_SUCCESS) {
2163 		vm_map_unlock(map);
2164 		vm_map_entry_dispose(new_entry);
2165 		return kr;
2166 	}
2167 	new_entry->vme_start = hint_address;
2168 	new_entry->vme_end = hint_address + size;
2169 
2170 	/*
2171 	 *	At this point,
2172 	 *
2173 	 *	- new_entry's "vme_start" and "vme_end" should define
2174 	 *	  the endpoints of the available new range,
2175 	 *
2176 	 *	- and "entry" should refer to the region before
2177 	 *	  the new range,
2178 	 *
2179 	 *	- and the map should still be locked.
2180 	 */
2181 
2182 	assert(page_aligned(new_entry->vme_start));
2183 	assert(page_aligned(new_entry->vme_end));
2184 	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
2185 	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));
2186 
2187 	/*
2188 	 *	Insert the new entry into the list
2189 	 */
2190 
2191 	vm_map_store_entry_link(map, entry, new_entry,
2192 	    VM_MAP_KERNEL_FLAGS_NONE);
2193 	map->size += size;
2194 
2195 	/*
2196 	 *	Update the lookup hint
2197 	 */
2198 	SAVE_HINT_MAP_WRITE(map, new_entry);
2199 
2200 	*o_entry = new_entry;
2201 	return KERN_SUCCESS;
2202 }
2203 
/* Debug flag: when set, vm_map_pmap_enter() logs each page it enters. */
int vm_map_pmap_enter_print = FALSE;
/*
 * NOTE(review): not referenced in this chunk; presumably gates
 * pre-faulting at call sites — confirm where it is read.
 */
int vm_map_pmap_enter_enable = FALSE;
2206 
2207 /*
2208  *	Routine:	vm_map_pmap_enter [internal only]
2209  *
2210  *	Description:
2211  *		Force pages from the specified object to be entered into
2212  *		the pmap at the specified address if they are present.
2213  *		As soon as a page not found in the object the scan ends.
2214  *
2215  *	Returns:
2216  *		Nothing.
2217  *
2218  *	In/out conditions:
2219  *		The source map should not be locked on entry.
2220  */
2221 __unused static void
vm_map_pmap_enter(vm_map_t map,vm_map_offset_t addr,vm_map_offset_t end_addr,vm_object_t object,vm_object_offset_t offset,vm_prot_t protection)2222 vm_map_pmap_enter(
2223 	vm_map_t                map,
2224 	vm_map_offset_t         addr,
2225 	vm_map_offset_t         end_addr,
2226 	vm_object_t             object,
2227 	vm_object_offset_t      offset,
2228 	vm_prot_t               protection)
2229 {
2230 	int                     type_of_fault;
2231 	kern_return_t           kr;
2232 	struct vm_object_fault_info fault_info = {};
2233 
2234 	if (map->pmap == 0) {
2235 		return;
2236 	}
2237 
2238 	assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);
2239 
2240 	while (addr < end_addr) {
2241 		vm_page_t       m;
2242 
2243 
2244 		/*
2245 		 * TODO:
2246 		 * From vm_map_enter(), we come into this function without the map
2247 		 * lock held or the object lock held.
2248 		 * We haven't taken a reference on the object either.
2249 		 * We should do a proper lookup on the map to make sure
2250 		 * that things are sane before we go locking objects that
2251 		 * could have been deallocated from under us.
2252 		 */
2253 
2254 		vm_object_lock(object);
2255 
2256 		m = vm_page_lookup(object, offset);
2257 
2258 		if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
2259 		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
2260 			vm_object_unlock(object);
2261 			return;
2262 		}
2263 
2264 		if (vm_map_pmap_enter_print) {
2265 			printf("vm_map_pmap_enter:");
2266 			printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
2267 			    map, (unsigned long long)addr, object, (unsigned long long)offset);
2268 		}
2269 		type_of_fault = DBG_CACHE_HIT_FAULT;
2270 		kr = vm_fault_enter(m, map->pmap,
2271 		    addr,
2272 		    PAGE_SIZE, 0,
2273 		    protection, protection,
2274 		    VM_PAGE_WIRED(m),
2275 		    FALSE,                 /* change_wiring */
2276 		    VM_KERN_MEMORY_NONE,                 /* tag - not wiring */
2277 		    &fault_info,
2278 		    NULL,                  /* need_retry */
2279 		    &type_of_fault);
2280 
2281 		vm_object_unlock(object);
2282 
2283 		offset += PAGE_SIZE_64;
2284 		addr += PAGE_SIZE;
2285 	}
2286 }
2287 
#define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
/*
 *	Routine:	vm_map_random_address_for_size
 *	Purpose:
 *		Pick a random, map-page-aligned start address within the
 *		map's effective range (per vm_map_get_range) where a hole
 *		of at least "size" bytes exists.  Retries up to
 *		MAX_TRIES_TO_GET_RANDOM_ADDRESS times.
 *
 *	The map lock is expected to be held by the caller (the lookup
 *	walks map entries).
 *
 *	Out:	*address receives the chosen start address on success.
 *	Returns KERN_SUCCESS, or KERN_NO_SPACE when the size doesn't
 *	fit the range or no suitable hole was found within the retry
 *	budget.
 */
static kern_return_t
vm_map_random_address_for_size(
	vm_map_t                map,
	vm_map_offset_t        *address,
	vm_map_size_t           size,
	vm_map_kernel_flags_t   vmk_flags)
{
	kern_return_t   kr = KERN_SUCCESS;
	int             tries = 0;
	vm_map_offset_t random_addr = 0;
	vm_map_offset_t hole_end;

	vm_map_entry_t  next_entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t  prev_entry = VM_MAP_ENTRY_NULL;
	vm_map_size_t   vm_hole_size = 0;
	vm_map_size_t   addr_space_size;
	bool            is_kmem_ptr;
	struct mach_vm_range effective_range;

	effective_range = vm_map_get_range(map, address, &vmk_flags, size,
	    &is_kmem_ptr);

	/* reject requests at least as large as the whole range */
	addr_space_size = effective_range.max_address - effective_range.min_address;
	if (size >= addr_space_size) {
		return KERN_NO_SPACE;
	}
	/* shrink the candidate window so start+size stays in range */
	addr_space_size -= size;

	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));

	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		/* early_random() is the only entropy source pre-zalloc */
		if (startup_phase < STARTUP_SUB_ZALLOC) {
			random_addr = (vm_map_offset_t)early_random();
		} else {
			random_addr = (vm_map_offset_t)random();
		}
		/* scale up by the page size, then fold into the window */
		random_addr <<= VM_MAP_PAGE_SHIFT(map);
		random_addr = vm_map_trunc_page(
			effective_range.min_address + (random_addr % addr_space_size),
			VM_MAP_PAGE_MASK(map));

#if CONFIG_PROB_GZALLOC
		/*
		 * Skip PGZ-owned addresses; note this retry does not
		 * consume one of the "tries".
		 */
		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
			continue;
		}
#endif /* CONFIG_PROB_GZALLOC */

		/* only usable if the address falls in a hole big enough */
		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
			if (prev_entry == vm_map_to_entry(map)) {
				next_entry = vm_map_first_entry(map);
			} else {
				next_entry = prev_entry->vme_next;
			}
			if (next_entry == vm_map_to_entry(map)) {
				hole_end = vm_map_max(map);
			} else {
				hole_end = next_entry->vme_start;
			}
			vm_hole_size = hole_end - random_addr;
			if (vm_hole_size >= size) {
				*address = random_addr;
				break;
			}
		}
		tries++;
	}

	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		kr = KERN_NO_SPACE;
	}
	return kr;
}
2361 
2362 static boolean_t
vm_memory_malloc_no_cow(int alias)2363 vm_memory_malloc_no_cow(
2364 	int alias)
2365 {
2366 	uint64_t alias_mask;
2367 
2368 	if (alias > 63) {
2369 		return FALSE;
2370 	}
2371 
2372 	alias_mask = 1ULL << alias;
2373 	if (alias_mask & vm_memory_malloc_no_cow_mask) {
2374 		return TRUE;
2375 	}
2376 	return FALSE;
2377 }
2378 
/*
 * NOTE(review): counters named after RLIMIT_AS / RLIMIT_DATA hits in
 * vm_map_enter(); they are not incremented within this chunk — confirm
 * the increment sites before relying on these as telemetry.
 */
uint64_t vm_map_enter_RLIMIT_AS_count = 0;
uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2381 /*
2382  *	Routine:	vm_map_enter
2383  *
2384  *	Description:
2385  *		Allocate a range in the specified virtual address map.
2386  *		The resulting range will refer to memory defined by
2387  *		the given memory object and offset into that object.
2388  *
2389  *		Arguments are as defined in the vm_map call.
2390  */
/*
 * NOTE(review): counters for vm_map_enter()'s restore-on-failure path
 * (re-installing mappings zapped by an "overwrite" request); the
 * updates are not visible in this chunk — confirm at the restore site.
 */
static unsigned int vm_map_enter_restore_successes = 0;
static unsigned int vm_map_enter_restore_failures = 0;
2393 kern_return_t
vm_map_enter(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)2394 vm_map_enter(
2395 	vm_map_t                map,
2396 	vm_map_offset_t         *address,       /* IN/OUT */
2397 	vm_map_size_t           size,
2398 	vm_map_offset_t         mask,
2399 	vm_map_kernel_flags_t   vmk_flags,
2400 	vm_object_t             object,
2401 	vm_object_offset_t      offset,
2402 	boolean_t               needs_copy,
2403 	vm_prot_t               cur_protection,
2404 	vm_prot_t               max_protection,
2405 	vm_inherit_t            inheritance)
2406 {
2407 	vm_map_entry_t          entry, new_entry;
2408 	vm_map_offset_t         start, tmp_start, tmp_offset;
2409 	vm_map_offset_t         end, tmp_end;
2410 	vm_map_offset_t         tmp2_start, tmp2_end;
2411 	vm_map_offset_t         step;
2412 	kern_return_t           result = KERN_SUCCESS;
2413 	bool                    map_locked = FALSE;
2414 	bool                    pmap_empty = TRUE;
2415 	bool                    new_mapping_established = FALSE;
2416 	const bool              keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2417 	const bool              anywhere = !vmk_flags.vmf_fixed;
2418 	const bool              purgable = vmk_flags.vmf_purgeable;
2419 	const bool              overwrite = vmk_flags.vmf_overwrite;
2420 	const bool              no_cache = vmk_flags.vmf_no_cache;
2421 	const bool              is_submap = vmk_flags.vmkf_submap;
2422 	const bool              permanent = vmk_flags.vmf_permanent;
2423 	const bool              no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2424 	const bool              entry_for_jit = vmk_flags.vmkf_map_jit;
2425 	const bool              iokit_acct = vmk_flags.vmkf_iokit_acct;
2426 	const bool              resilient_codesign = vmk_flags.vmf_resilient_codesign;
2427 	const bool              resilient_media = vmk_flags.vmf_resilient_media;
2428 	const bool              entry_for_tpro = vmk_flags.vmf_tpro;
2429 	const unsigned int      superpage_size = vmk_flags.vmf_superpage_size;
2430 	const vm_tag_t          alias = vmk_flags.vm_tag;
2431 	vm_tag_t                user_alias;
2432 	kern_return_t           kr;
2433 	bool                    clear_map_aligned = FALSE;
2434 	vm_map_size_t           chunk_size = 0;
2435 	vm_object_t             caller_object;
2436 	VM_MAP_ZAP_DECLARE(zap_old_list);
2437 	VM_MAP_ZAP_DECLARE(zap_new_list);
2438 
2439 	caller_object = object;
2440 
2441 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2442 
2443 	if (vmk_flags.vmf_4gb_chunk) {
2444 #if defined(__LP64__)
2445 		chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2446 #else /* __LP64__ */
2447 		chunk_size = ANON_CHUNK_SIZE;
2448 #endif /* __LP64__ */
2449 	} else {
2450 		chunk_size = ANON_CHUNK_SIZE;
2451 	}
2452 
2453 
2454 
2455 	if (superpage_size) {
2456 		switch (superpage_size) {
2457 			/*
2458 			 * Note that the current implementation only supports
2459 			 * a single size for superpages, SUPERPAGE_SIZE, per
2460 			 * architecture. As soon as more sizes are supposed
2461 			 * to be supported, SUPERPAGE_SIZE has to be replaced
2462 			 * with a lookup of the size depending on superpage_size.
2463 			 */
2464 #ifdef __x86_64__
2465 		case SUPERPAGE_SIZE_ANY:
2466 			/* handle it like 2 MB and round up to page size */
2467 			size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2468 			OS_FALLTHROUGH;
2469 		case SUPERPAGE_SIZE_2MB:
2470 			break;
2471 #endif
2472 		default:
2473 			return KERN_INVALID_ARGUMENT;
2474 		}
2475 		mask = SUPERPAGE_SIZE - 1;
2476 		if (size & (SUPERPAGE_SIZE - 1)) {
2477 			return KERN_INVALID_ARGUMENT;
2478 		}
2479 		inheritance = VM_INHERIT_NONE;  /* fork() children won't inherit superpages */
2480 	}
2481 
2482 
2483 	if ((cur_protection & VM_PROT_WRITE) &&
2484 	    (cur_protection & VM_PROT_EXECUTE) &&
2485 #if XNU_TARGET_OS_OSX
2486 	    map->pmap != kernel_pmap &&
2487 	    (cs_process_global_enforcement() ||
2488 	    (vmk_flags.vmkf_cs_enforcement_override
2489 	    ? vmk_flags.vmkf_cs_enforcement
2490 	    : (vm_map_cs_enforcement(map)
2491 #if __arm64__
2492 	    || !VM_MAP_IS_EXOTIC(map)
2493 #endif /* __arm64__ */
2494 	    ))) &&
2495 #endif /* XNU_TARGET_OS_OSX */
2496 #if CODE_SIGNING_MONITOR
2497 	    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2498 #endif
2499 	    (VM_MAP_POLICY_WX_FAIL(map) ||
2500 	    VM_MAP_POLICY_WX_STRIP_X(map)) &&
2501 	    !entry_for_jit) {
2502 		boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2503 
2504 		DTRACE_VM3(cs_wx,
2505 		    uint64_t, 0,
2506 		    uint64_t, 0,
2507 		    vm_prot_t, cur_protection);
2508 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2509 		    proc_selfpid(),
2510 		    (get_bsdtask_info(current_task())
2511 		    ? proc_name_address(get_bsdtask_info(current_task()))
2512 		    : "?"),
2513 		    __FUNCTION__,
2514 		    (vm_protect_wx_fail ? "failing" : "turning off execute"));
2515 		cur_protection &= ~VM_PROT_EXECUTE;
2516 		if (vm_protect_wx_fail) {
2517 			return KERN_PROTECTION_FAILURE;
2518 		}
2519 	}
2520 
2521 	/*
2522 	 * If the task has requested executable lockdown,
2523 	 * deny any new executable mapping.
2524 	 */
2525 	if (map->map_disallow_new_exec == TRUE) {
2526 		if (cur_protection & VM_PROT_EXECUTE) {
2527 			return KERN_PROTECTION_FAILURE;
2528 		}
2529 	}
2530 
2531 	if (resilient_codesign) {
2532 		assert(!is_submap);
2533 		int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2534 		if ((cur_protection | max_protection) & reject_prot) {
2535 			return KERN_PROTECTION_FAILURE;
2536 		}
2537 	}
2538 
2539 	if (resilient_media) {
2540 		assert(!is_submap);
2541 //		assert(!needs_copy);
2542 		if (object != VM_OBJECT_NULL &&
2543 		    !object->internal) {
2544 			/*
2545 			 * This mapping is directly backed by an external
2546 			 * memory manager (e.g. a vnode pager for a file):
2547 			 * we would not have any safe place to inject
2548 			 * a zero-filled page if an actual page is not
2549 			 * available, without possibly impacting the actual
2550 			 * contents of the mapped object (e.g. the file),
2551 			 * so we can't provide any media resiliency here.
2552 			 */
2553 			return KERN_INVALID_ARGUMENT;
2554 		}
2555 	}
2556 
2557 	if (is_submap) {
2558 		vm_map_t submap;
2559 		if (purgable) {
2560 			/* submaps can not be purgeable */
2561 			return KERN_INVALID_ARGUMENT;
2562 		}
2563 		if (object == VM_OBJECT_NULL) {
2564 			/* submaps can not be created lazily */
2565 			return KERN_INVALID_ARGUMENT;
2566 		}
2567 		submap = (vm_map_t) object;
2568 		if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
2569 			/* page size mismatch */
2570 			return KERN_INVALID_ARGUMENT;
2571 		}
2572 	}
2573 	if (vmk_flags.vmkf_already) {
2574 		/*
2575 		 * VM_FLAGS_ALREADY says that it's OK if the same mapping
2576 		 * is already present.  For it to be meaningul, the requested
2577 		 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
2578 		 * we shouldn't try and remove what was mapped there first
2579 		 * (!VM_FLAGS_OVERWRITE).
2580 		 */
2581 		if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
2582 			return KERN_INVALID_ARGUMENT;
2583 		}
2584 	}
2585 
2586 	if (size == 0 ||
2587 	    (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
2588 		*address = 0;
2589 		return KERN_INVALID_ARGUMENT;
2590 	}
2591 
2592 	if (map->pmap == kernel_pmap) {
2593 		user_alias = VM_KERN_MEMORY_NONE;
2594 	} else {
2595 		user_alias = alias;
2596 	}
2597 
2598 	if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
2599 		chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
2600 	}
2601 
2602 #define RETURN(value)   { result = value; goto BailOut; }
2603 
2604 	assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
2605 	assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
2606 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
2607 		assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
2608 		assertf(page_aligned(size), "0x%llx", (uint64_t)size);
2609 	}
2610 
2611 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2612 	    !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
2613 		/*
2614 		 * In most cases, the caller rounds the size up to the
2615 		 * map's page size.
2616 		 * If we get a size that is explicitly not map-aligned here,
2617 		 * we'll have to respect the caller's wish and mark the
2618 		 * mapping as "not map-aligned" to avoid tripping the
2619 		 * map alignment checks later.
2620 		 */
2621 		clear_map_aligned = TRUE;
2622 	}
2623 	if (!anywhere &&
2624 	    VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2625 	    !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
2626 		/*
2627 		 * We've been asked to map at a fixed address and that
2628 		 * address is not aligned to the map's specific alignment.
2629 		 * The caller should know what it's doing (i.e. most likely
2630 		 * mapping some fragmented copy map, transferring memory from
2631 		 * a VM map with a different alignment), so clear map_aligned
2632 		 * for this new VM map entry and proceed.
2633 		 */
2634 		clear_map_aligned = TRUE;
2635 	}
2636 
2637 	/*
2638 	 * Only zero-fill objects are allowed to be purgable.
2639 	 * LP64todo - limit purgable objects to 32-bits for now
2640 	 */
2641 	if (purgable &&
2642 	    (offset != 0 ||
2643 	    (object != VM_OBJECT_NULL &&
2644 	    (object->vo_size != size ||
2645 	    object->purgable == VM_PURGABLE_DENY))
2646 #if __LP64__
2647 	    || size > ANON_MAX_SIZE
2648 #endif
2649 	    )) {
2650 		return KERN_INVALID_ARGUMENT;
2651 	}
2652 
2653 	start = *address;
2654 
2655 	if (anywhere) {
2656 		vm_map_lock(map);
2657 		map_locked = TRUE;
2658 
2659 		result = vm_map_locate_space(map, size, mask, vmk_flags,
2660 		    &start, &entry);
2661 		if (result != KERN_SUCCESS) {
2662 			goto BailOut;
2663 		}
2664 
2665 		*address = start;
2666 		end = start + size;
2667 		assert(VM_MAP_PAGE_ALIGNED(*address,
2668 		    VM_MAP_PAGE_MASK(map)));
2669 	} else {
2670 		vm_map_offset_t effective_min_offset, effective_max_offset;
2671 
2672 		effective_min_offset = map->min_offset;
2673 		effective_max_offset = map->max_offset;
2674 
2675 		if (vmk_flags.vmkf_beyond_max) {
2676 			/*
2677 			 * Allow an insertion beyond the map's max offset.
2678 			 */
2679 			effective_max_offset = 0x00000000FFFFF000ULL;
2680 			if (vm_map_is_64bit(map)) {
2681 				effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2682 			}
2683 #if XNU_TARGET_OS_OSX
2684 		} else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2685 			effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2686 #endif /* XNU_TARGET_OS_OSX */
2687 		}
2688 
2689 		if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2690 		    !overwrite &&
2691 		    user_alias == VM_MEMORY_REALLOC) {
2692 			/*
2693 			 * Force realloc() to switch to a new allocation,
2694 			 * to prevent 4k-fragmented virtual ranges.
2695 			 */
2696 //			DEBUG4K_ERROR("no realloc in place");
2697 			return KERN_NO_SPACE;
2698 		}
2699 
2700 		/*
2701 		 *	Verify that:
2702 		 *		the address doesn't itself violate
2703 		 *		the mask requirement.
2704 		 */
2705 
2706 		vm_map_lock(map);
2707 		map_locked = TRUE;
2708 		if ((start & mask) != 0) {
2709 			RETURN(KERN_NO_SPACE);
2710 		}
2711 
2712 #if CONFIG_MAP_RANGES
2713 		if (map->uses_user_ranges) {
2714 			struct mach_vm_range r;
2715 
2716 			vm_map_user_range_resolve(map, start, 1, &r);
2717 			if (r.max_address == 0) {
2718 				RETURN(KERN_INVALID_ADDRESS);
2719 			}
2720 			effective_min_offset = r.min_address;
2721 			effective_max_offset = r.max_address;
2722 		}
2723 #endif /* CONFIG_MAP_RANGES */
2724 
2725 		if ((startup_phase >= STARTUP_SUB_KMEM) && !is_submap &&
2726 		    (map == kernel_map)) {
2727 			mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
2728 			effective_min_offset = r->min_address;
2729 			effective_max_offset = r->max_address;
2730 		}
2731 
2732 		/*
2733 		 *	...	the address is within bounds
2734 		 */
2735 
2736 		end = start + size;
2737 
2738 		if ((start < effective_min_offset) ||
2739 		    (end > effective_max_offset) ||
2740 		    (start >= end)) {
2741 			RETURN(KERN_INVALID_ADDRESS);
2742 		}
2743 
2744 		if (overwrite) {
2745 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
2746 			kern_return_t remove_kr;
2747 
2748 			/*
2749 			 * Fixed mapping and "overwrite" flag: attempt to
2750 			 * remove all existing mappings in the specified
2751 			 * address range, saving them in our "zap_old_list".
2752 			 *
2753 			 * This avoids releasing the VM map lock in
2754 			 * vm_map_entry_delete() and allows atomicity
2755 			 * when we want to replace some mappings with a new one.
2756 			 * It also allows us to restore the old VM mappings if the
2757 			 * new mapping fails.
2758 			 */
2759 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2760 
2761 			if (vmk_flags.vmkf_overwrite_immutable) {
2762 				/* we can overwrite immutable mappings */
2763 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2764 			}
2765 			if (vmk_flags.vmkf_remap_prot_copy) {
2766 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2767 			}
2768 			remove_kr = vm_map_delete(map, start, end, remove_flags,
2769 			    KMEM_GUARD_NONE, &zap_old_list).kmr_return;
2770 			if (remove_kr) {
2771 				/* XXX FBDP restore zap_old_list? */
2772 				RETURN(remove_kr);
2773 			}
2774 		}
2775 
2776 		/*
2777 		 *	...	the starting address isn't allocated
2778 		 */
2779 
2780 		if (vm_map_lookup_entry(map, start, &entry)) {
2781 			if (!(vmk_flags.vmkf_already)) {
2782 				RETURN(KERN_NO_SPACE);
2783 			}
2784 			/*
2785 			 * Check if what's already there is what we want.
2786 			 */
2787 			tmp_start = start;
2788 			tmp_offset = offset;
2789 			if (entry->vme_start < start) {
2790 				tmp_start -= start - entry->vme_start;
2791 				tmp_offset -= start - entry->vme_start;
2792 			}
2793 			for (; entry->vme_start < end;
2794 			    entry = entry->vme_next) {
2795 				/*
2796 				 * Check if the mapping's attributes
2797 				 * match the existing map entry.
2798 				 */
2799 				if (entry == vm_map_to_entry(map) ||
2800 				    entry->vme_start != tmp_start ||
2801 				    entry->is_sub_map != is_submap ||
2802 				    VME_OFFSET(entry) != tmp_offset ||
2803 				    entry->needs_copy != needs_copy ||
2804 				    entry->protection != cur_protection ||
2805 				    entry->max_protection != max_protection ||
2806 				    entry->inheritance != inheritance ||
2807 				    entry->iokit_acct != iokit_acct ||
2808 				    VME_ALIAS(entry) != alias) {
2809 					/* not the same mapping ! */
2810 					RETURN(KERN_NO_SPACE);
2811 				}
2812 				/*
2813 				 * Check if the same object is being mapped.
2814 				 */
2815 				if (is_submap) {
2816 					if (VME_SUBMAP(entry) !=
2817 					    (vm_map_t) object) {
2818 						/* not the same submap */
2819 						RETURN(KERN_NO_SPACE);
2820 					}
2821 				} else {
2822 					if (VME_OBJECT(entry) != object) {
2823 						/* not the same VM object... */
2824 						vm_object_t obj2;
2825 
2826 						obj2 = VME_OBJECT(entry);
2827 						if ((obj2 == VM_OBJECT_NULL ||
2828 						    obj2->internal) &&
2829 						    (object == VM_OBJECT_NULL ||
2830 						    object->internal)) {
2831 							/*
2832 							 * ... but both are
2833 							 * anonymous memory,
2834 							 * so equivalent.
2835 							 */
2836 						} else {
2837 							RETURN(KERN_NO_SPACE);
2838 						}
2839 					}
2840 				}
2841 
2842 				tmp_offset += entry->vme_end - entry->vme_start;
2843 				tmp_start += entry->vme_end - entry->vme_start;
2844 				if (entry->vme_end >= end) {
2845 					/* reached the end of our mapping */
2846 					break;
2847 				}
2848 			}
2849 			/* it all matches:  let's use what's already there ! */
2850 			RETURN(KERN_MEMORY_PRESENT);
2851 		}
2852 
2853 		/*
2854 		 *	...	the next region doesn't overlap the
2855 		 *		end point.
2856 		 */
2857 
2858 		if ((entry->vme_next != vm_map_to_entry(map)) &&
2859 		    (entry->vme_next->vme_start < end)) {
2860 			RETURN(KERN_NO_SPACE);
2861 		}
2862 	}
2863 
2864 	/*
2865 	 *	At this point,
2866 	 *		"start" and "end" should define the endpoints of the
2867 	 *			available new range, and
2868 	 *		"entry" should refer to the region before the new
2869 	 *			range, and
2870 	 *
2871 	 *		the map should be locked.
2872 	 */
2873 
2874 	/*
2875 	 *	See whether we can avoid creating a new entry (and object) by
2876 	 *	extending one of our neighbors.  [So far, we only attempt to
2877 	 *	extend from below.]  Note that we can never extend/join
2878 	 *	purgable objects because they need to remain distinct
2879 	 *	entities in order to implement their "volatile object"
2880 	 *	semantics.
2881 	 */
2882 
2883 	if (purgable ||
2884 	    entry_for_jit ||
2885 	    entry_for_tpro ||
2886 	    vm_memory_malloc_no_cow(user_alias)) {
2887 		if (object == VM_OBJECT_NULL) {
2888 			object = vm_object_allocate(size);
2889 			object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2890 			object->true_share = FALSE;
2891 			if (purgable) {
2892 				task_t owner;
2893 				object->purgable = VM_PURGABLE_NONVOLATILE;
2894 				if (map->pmap == kernel_pmap) {
2895 					/*
2896 					 * Purgeable mappings made in a kernel
2897 					 * map are "owned" by the kernel itself
2898 					 * rather than the current user task
2899 					 * because they're likely to be used by
2900 					 * more than this user task (see
2901 					 * execargs_purgeable_allocate(), for
2902 					 * example).
2903 					 */
2904 					owner = kernel_task;
2905 				} else {
2906 					owner = current_task();
2907 				}
2908 				assert(object->vo_owner == NULL);
2909 				assert(object->resident_page_count == 0);
2910 				assert(object->wired_page_count == 0);
2911 				vm_object_lock(object);
2912 				vm_purgeable_nonvolatile_enqueue(object, owner);
2913 				vm_object_unlock(object);
2914 			}
2915 			offset = (vm_object_offset_t)0;
2916 		}
2917 	} else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
2918 		/* no coalescing if address space uses sub-pages */
2919 	} else if ((is_submap == FALSE) &&
2920 	    (object == VM_OBJECT_NULL) &&
2921 	    (entry != vm_map_to_entry(map)) &&
2922 	    (entry->vme_end == start) &&
2923 	    (!entry->is_shared) &&
2924 	    (!entry->is_sub_map) &&
2925 	    (!entry->in_transition) &&
2926 	    (!entry->needs_wakeup) &&
2927 	    (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
2928 	    (entry->protection == cur_protection) &&
2929 	    (entry->max_protection == max_protection) &&
2930 	    (entry->inheritance == inheritance) &&
2931 	    ((user_alias == VM_MEMORY_REALLOC) ||
2932 	    (VME_ALIAS(entry) == alias)) &&
2933 	    (entry->no_cache == no_cache) &&
2934 	    (entry->vme_permanent == permanent) &&
2935 	    /* no coalescing for immutable executable mappings */
2936 	    !((entry->protection & VM_PROT_EXECUTE) &&
2937 	    entry->vme_permanent) &&
2938 	    (!entry->superpage_size && !superpage_size) &&
2939 	    /*
2940 	     * No coalescing if not map-aligned, to avoid propagating
2941 	     * that condition any further than needed:
2942 	     */
2943 	    (!entry->map_aligned || !clear_map_aligned) &&
2944 	    (!entry->zero_wired_pages) &&
2945 	    (!entry->used_for_jit && !entry_for_jit) &&
2946 #if __arm64e__
2947 	    (!entry->used_for_tpro && !entry_for_tpro) &&
2948 #endif
2949 	    (!entry->csm_associated) &&
2950 	    (entry->iokit_acct == iokit_acct) &&
2951 	    (!entry->vme_resilient_codesign) &&
2952 	    (!entry->vme_resilient_media) &&
2953 	    (!entry->vme_atomic) &&
2954 	    (entry->vme_no_copy_on_read == no_copy_on_read) &&
2955 
2956 	    ((entry->vme_end - entry->vme_start) + size <=
2957 	    (user_alias == VM_MEMORY_REALLOC ?
2958 	    ANON_CHUNK_SIZE :
2959 	    NO_COALESCE_LIMIT)) &&
2960 
2961 	    (entry->wired_count == 0)) {        /* implies user_wired_count == 0 */
2962 		if (vm_object_coalesce(VME_OBJECT(entry),
2963 		    VM_OBJECT_NULL,
2964 		    VME_OFFSET(entry),
2965 		    (vm_object_offset_t) 0,
2966 		    (vm_map_size_t)(entry->vme_end - entry->vme_start),
2967 		    (vm_map_size_t)(end - entry->vme_end))) {
2968 			/*
2969 			 *	Coalesced the two objects - can extend
2970 			 *	the previous map entry to include the
2971 			 *	new range.
2972 			 */
2973 			map->size += (end - entry->vme_end);
2974 			assert(entry->vme_start < end);
2975 			assert(VM_MAP_PAGE_ALIGNED(end,
2976 			    VM_MAP_PAGE_MASK(map)));
2977 			if (__improbable(vm_debug_events)) {
2978 				DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
2979 			}
2980 			entry->vme_end = end;
2981 			if (map->holelistenabled) {
2982 				vm_map_store_update_first_free(map, entry, TRUE);
2983 			} else {
2984 				vm_map_store_update_first_free(map, map->first_free, TRUE);
2985 			}
2986 			new_mapping_established = TRUE;
2987 			RETURN(KERN_SUCCESS);
2988 		}
2989 	}
2990 
2991 	step = superpage_size ? SUPERPAGE_SIZE : (end - start);
2992 	new_entry = NULL;
2993 
2994 	if (vmk_flags.vmkf_submap_adjust) {
2995 		vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
2996 		offset = start;
2997 	}
2998 
2999 	for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3000 		tmp2_end = tmp2_start + step;
3001 		/*
3002 		 *	Create a new entry
3003 		 *
3004 		 * XXX FBDP
3005 		 * The reserved "page zero" in each process's address space can
3006 		 * be arbitrarily large.  Splitting it into separate objects and
3007 		 * therefore different VM map entries serves no purpose and just
3008 		 * slows down operations on the VM map, so let's not split the
3009 		 * allocation into chunks if the max protection is NONE.  That
3010 		 * memory should never be accessible, so it will never get to the
3011 		 * default pager.
3012 		 */
3013 		tmp_start = tmp2_start;
3014 		if (!is_submap &&
3015 		    object == VM_OBJECT_NULL &&
3016 		    size > chunk_size &&
3017 		    max_protection != VM_PROT_NONE &&
3018 		    superpage_size == 0) {
3019 			tmp_end = tmp_start + chunk_size;
3020 		} else {
3021 			tmp_end = tmp2_end;
3022 		}
3023 		do {
3024 			if (!is_submap &&
3025 			    object != VM_OBJECT_NULL &&
3026 			    object->internal &&
3027 			    offset + (tmp_end - tmp_start) > object->vo_size) {
3028 //				printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3029 				DTRACE_VM5(vm_map_enter_overmap,
3030 				    vm_map_t, map,
3031 				    vm_map_address_t, tmp_start,
3032 				    vm_map_address_t, tmp_end,
3033 				    vm_object_offset_t, offset,
3034 				    vm_object_size_t, object->vo_size);
3035 			}
3036 			new_entry = vm_map_entry_insert(map,
3037 			    entry, tmp_start, tmp_end,
3038 			    object, offset, vmk_flags,
3039 			    needs_copy,
3040 			    cur_protection, max_protection,
3041 			    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3042 			    VM_INHERIT_NONE : inheritance),
3043 			    clear_map_aligned);
3044 
3045 			assert((object != kernel_object) || (VM_KERN_MEMORY_NONE != alias));
3046 
3047 			if (resilient_codesign) {
3048 				int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3049 				if (!((cur_protection | max_protection) & reject_prot)) {
3050 					new_entry->vme_resilient_codesign = TRUE;
3051 				}
3052 			}
3053 
3054 			if (resilient_media &&
3055 			    (object == VM_OBJECT_NULL ||
3056 			    object->internal)) {
3057 				new_entry->vme_resilient_media = TRUE;
3058 			}
3059 
3060 			assert(!new_entry->iokit_acct);
3061 			if (!is_submap &&
3062 			    object != VM_OBJECT_NULL &&
3063 			    (object->purgable != VM_PURGABLE_DENY ||
3064 			    object->vo_ledger_tag)) {
3065 				assert(new_entry->use_pmap);
3066 				assert(!new_entry->iokit_acct);
3067 				/*
3068 				 * Turn off pmap accounting since
3069 				 * purgeable (or tagged) objects have their
3070 				 * own ledgers.
3071 				 */
3072 				new_entry->use_pmap = FALSE;
3073 			} else if (!is_submap &&
3074 			    iokit_acct &&
3075 			    object != VM_OBJECT_NULL &&
3076 			    object->internal) {
3077 				/* alternate accounting */
3078 				assert(!new_entry->iokit_acct);
3079 				assert(new_entry->use_pmap);
3080 				new_entry->iokit_acct = TRUE;
3081 				new_entry->use_pmap = FALSE;
3082 				DTRACE_VM4(
3083 					vm_map_iokit_mapped_region,
3084 					vm_map_t, map,
3085 					vm_map_offset_t, new_entry->vme_start,
3086 					vm_map_offset_t, new_entry->vme_end,
3087 					int, VME_ALIAS(new_entry));
3088 				vm_map_iokit_mapped_region(
3089 					map,
3090 					(new_entry->vme_end -
3091 					new_entry->vme_start));
3092 			} else if (!is_submap) {
3093 				assert(!new_entry->iokit_acct);
3094 				assert(new_entry->use_pmap);
3095 			}
3096 
3097 			if (is_submap) {
3098 				vm_map_t        submap;
3099 				boolean_t       submap_is_64bit;
3100 				boolean_t       use_pmap;
3101 
3102 				assert(new_entry->is_sub_map);
3103 				assert(!new_entry->use_pmap);
3104 				assert(!new_entry->iokit_acct);
3105 				submap = (vm_map_t) object;
3106 				submap_is_64bit = vm_map_is_64bit(submap);
3107 				use_pmap = vmk_flags.vmkf_nested_pmap;
3108 #ifndef NO_NESTED_PMAP
3109 				if (use_pmap && submap->pmap == NULL) {
3110 					ledger_t ledger = map->pmap->ledger;
3111 					/* we need a sub pmap to nest... */
3112 					submap->pmap = pmap_create_options(ledger, 0,
3113 					    submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3114 					if (submap->pmap == NULL) {
3115 						/* let's proceed without nesting... */
3116 					}
3117 #if defined(__arm64__)
3118 					else {
3119 						pmap_set_nested(submap->pmap);
3120 					}
3121 #endif
3122 				}
3123 				if (use_pmap && submap->pmap != NULL) {
3124 					if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3125 						DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3126 						kr = KERN_FAILURE;
3127 					} else {
3128 						kr = pmap_nest(map->pmap,
3129 						    submap->pmap,
3130 						    tmp_start,
3131 						    tmp_end - tmp_start);
3132 					}
3133 					if (kr != KERN_SUCCESS) {
3134 						printf("vm_map_enter: "
3135 						    "pmap_nest(0x%llx,0x%llx) "
3136 						    "error 0x%x\n",
3137 						    (long long)tmp_start,
3138 						    (long long)tmp_end,
3139 						    kr);
3140 					} else {
3141 						/* we're now nested ! */
3142 						new_entry->use_pmap = TRUE;
3143 						pmap_empty = FALSE;
3144 					}
3145 				}
3146 #endif /* NO_NESTED_PMAP */
3147 			}
3148 			entry = new_entry;
3149 
3150 			if (superpage_size) {
3151 				vm_page_t pages, m;
3152 				vm_object_t sp_object;
3153 				vm_object_offset_t sp_offset;
3154 
3155 				VME_OFFSET_SET(entry, 0);
3156 
3157 				/* allocate one superpage */
3158 				kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3159 				if (kr != KERN_SUCCESS) {
3160 					/* deallocate whole range... */
3161 					new_mapping_established = TRUE;
3162 					/* ... but only up to "tmp_end" */
3163 					size -= end - tmp_end;
3164 					RETURN(kr);
3165 				}
3166 
3167 				/* create one vm_object per superpage */
3168 				sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3169 				sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3170 				sp_object->phys_contiguous = TRUE;
3171 				sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3172 				VME_OBJECT_SET(entry, sp_object, false, 0);
3173 				assert(entry->use_pmap);
3174 
3175 				/* enter the base pages into the object */
3176 				vm_object_lock(sp_object);
3177 				for (sp_offset = 0;
3178 				    sp_offset < SUPERPAGE_SIZE;
3179 				    sp_offset += PAGE_SIZE) {
3180 					m = pages;
3181 					pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3182 					pages = NEXT_PAGE(m);
3183 					*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3184 					vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3185 				}
3186 				vm_object_unlock(sp_object);
3187 			}
3188 		} while (tmp_end != tmp2_end &&
3189 		    (tmp_start = tmp_end) &&
3190 		    (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3191 		    tmp_end + chunk_size : tmp2_end));
3192 	}
3193 
3194 	new_mapping_established = TRUE;
3195 
3196 BailOut:
3197 	assert(map_locked == TRUE);
3198 
3199 	/*
3200 	 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3201 	 * If we have identified and possibly established the new mapping(s),
3202 	 * make sure we did not go beyond the address space limit.
3203 	 */
3204 	if (result == KERN_SUCCESS) {
3205 		if (map->size_limit != RLIM_INFINITY &&
3206 		    map->size > map->size_limit) {
3207 			/*
3208 			 * Establishing the requested mappings would exceed
3209 			 * the process's RLIMIT_AS limit: fail with
3210 			 * KERN_NO_SPACE.
3211 			 */
3212 			result = KERN_NO_SPACE;
3213 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3214 			    proc_selfpid(),
3215 			    (get_bsdtask_info(current_task())
3216 			    ? proc_name_address(get_bsdtask_info(current_task()))
3217 			    : "?"),
3218 			    __FUNCTION__,
3219 			    (uint64_t) map->size,
3220 			    (uint64_t) map->size_limit);
3221 			DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3222 			    vm_map_size_t, map->size,
3223 			    uint64_t, map->size_limit);
3224 			vm_map_enter_RLIMIT_AS_count++;
3225 		} else if (map->data_limit != RLIM_INFINITY &&
3226 		    map->size > map->data_limit) {
3227 			/*
3228 			 * Establishing the requested mappings would exceed
3229 			 * the process's RLIMIT_DATA limit: fail with
3230 			 * KERN_NO_SPACE.
3231 			 */
3232 			result = KERN_NO_SPACE;
3233 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3234 			    proc_selfpid(),
3235 			    (get_bsdtask_info(current_task())
3236 			    ? proc_name_address(get_bsdtask_info(current_task()))
3237 			    : "?"),
3238 			    __FUNCTION__,
3239 			    (uint64_t) map->size,
3240 			    (uint64_t) map->data_limit);
3241 			DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3242 			    vm_map_size_t, map->size,
3243 			    uint64_t, map->data_limit);
3244 			vm_map_enter_RLIMIT_DATA_count++;
3245 		}
3246 	}
3247 
3248 	if (result == KERN_SUCCESS) {
3249 		vm_prot_t pager_prot;
3250 		memory_object_t pager;
3251 
3252 #if DEBUG
3253 		if (pmap_empty &&
3254 		    !(vmk_flags.vmkf_no_pmap_check)) {
3255 			assert(pmap_is_empty(map->pmap,
3256 			    *address,
3257 			    *address + size));
3258 		}
3259 #endif /* DEBUG */
3260 
3261 		/*
3262 		 * For "named" VM objects, let the pager know that the
3263 		 * memory object is being mapped.  Some pagers need to keep
3264 		 * track of this, to know when they can reclaim the memory
3265 		 * object, for example.
3266 		 * VM calls memory_object_map() for each mapping (specifying
3267 		 * the protection of each mapping) and calls
3268 		 * memory_object_last_unmap() when all the mappings are gone.
3269 		 */
3270 		pager_prot = max_protection;
3271 		if (needs_copy) {
3272 			/*
3273 			 * Copy-On-Write mapping: won't modify
3274 			 * the memory object.
3275 			 */
3276 			pager_prot &= ~VM_PROT_WRITE;
3277 		}
3278 		if (!is_submap &&
3279 		    object != VM_OBJECT_NULL &&
3280 		    object->named &&
3281 		    object->pager != MEMORY_OBJECT_NULL) {
3282 			vm_object_lock(object);
3283 			pager = object->pager;
3284 			if (object->named &&
3285 			    pager != MEMORY_OBJECT_NULL) {
3286 				assert(object->pager_ready);
3287 				vm_object_mapping_wait(object, THREAD_UNINT);
3288 				vm_object_mapping_begin(object);
3289 				vm_object_unlock(object);
3290 
3291 				kr = memory_object_map(pager, pager_prot);
3292 				assert(kr == KERN_SUCCESS);
3293 
3294 				vm_object_lock(object);
3295 				vm_object_mapping_end(object);
3296 			}
3297 			vm_object_unlock(object);
3298 		}
3299 	}
3300 
3301 	assert(map_locked == TRUE);
3302 
3303 	if (new_mapping_established) {
3304 		/*
3305 		 * If we release the map lock for any reason below,
3306 		 * another thread could deallocate our new mapping,
3307 		 * releasing the caller's reference on "caller_object",
3308 		 * which was transferred to the mapping.
3309 		 * If this was the only reference, the object could be
3310 		 * destroyed.
3311 		 *
3312 		 * We need to take an extra reference on "caller_object"
3313 		 * to keep it alive if we need to return the caller's
3314 		 * reference to the caller in case of failure.
3315 		 */
3316 		if (is_submap) {
3317 			vm_map_reference((vm_map_t)caller_object);
3318 		} else {
3319 			vm_object_reference(caller_object);
3320 		}
3321 	}
3322 
3323 	if (!keep_map_locked) {
3324 		vm_map_unlock(map);
3325 		map_locked = FALSE;
3326 		entry = VM_MAP_ENTRY_NULL;
3327 		new_entry = VM_MAP_ENTRY_NULL;
3328 	}
3329 
3330 	/*
3331 	 * We can't hold the map lock if we enter this block.
3332 	 */
3333 
3334 	if (result == KERN_SUCCESS) {
3335 		/*	Wire down the new entry if the user
3336 		 *	requested all new map entries be wired.
3337 		 */
3338 		if ((map->wiring_required) || (superpage_size)) {
3339 			assert(!keep_map_locked);
3340 			pmap_empty = FALSE; /* pmap won't be empty */
3341 			kr = vm_map_wire_kernel(map, start, end,
3342 			    cur_protection, VM_KERN_MEMORY_MLOCK,
3343 			    TRUE);
3344 			result = kr;
3345 		}
3346 
3347 	}
3348 
3349 	if (result != KERN_SUCCESS) {
3350 		if (new_mapping_established) {
3351 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3352 
3353 			/*
3354 			 * We have to get rid of the new mappings since we
3355 			 * won't make them available to the user.
3356 			 * Try and do that atomically, to minimize the risk
3357 			 * that someone else create new mappings that range.
3358 			 */
3359 			if (!map_locked) {
3360 				vm_map_lock(map);
3361 				map_locked = TRUE;
3362 			}
3363 			remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3364 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3365 			if (permanent) {
3366 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3367 			}
3368 			(void) vm_map_delete(map,
3369 			    *address, *address + size,
3370 			    remove_flags,
3371 			    KMEM_GUARD_NONE, &zap_new_list);
3372 		}
3373 
3374 		if (vm_map_zap_first_entry(&zap_old_list)) {
3375 			vm_map_entry_t entry1, entry2;
3376 
3377 			/*
3378 			 * The new mapping failed.  Attempt to restore
3379 			 * the old mappings, saved in the "zap_old_map".
3380 			 */
3381 			if (!map_locked) {
3382 				vm_map_lock(map);
3383 				map_locked = TRUE;
3384 			}
3385 
3386 			/* first check if the coast is still clear */
3387 			start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3388 			end   = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3389 
3390 			if (vm_map_lookup_entry(map, start, &entry1) ||
3391 			    vm_map_lookup_entry(map, end, &entry2) ||
3392 			    entry1 != entry2) {
3393 				/*
3394 				 * Part of that range has already been
3395 				 * re-mapped:  we can't restore the old
3396 				 * mappings...
3397 				 */
3398 				vm_map_enter_restore_failures++;
3399 			} else {
3400 				/*
3401 				 * Transfer the saved map entries from
3402 				 * "zap_old_map" to the original "map",
3403 				 * inserting them all after "entry1".
3404 				 */
3405 				while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3406 					vm_map_size_t entry_size;
3407 
3408 					entry_size = (entry2->vme_end -
3409 					    entry2->vme_start);
3410 					vm_map_store_entry_link(map, entry1, entry2,
3411 					    VM_MAP_KERNEL_FLAGS_NONE);
3412 					map->size += entry_size;
3413 					entry1 = entry2;
3414 				}
3415 				if (map->wiring_required) {
3416 					/*
3417 					 * XXX TODO: we should rewire the
3418 					 * old pages here...
3419 					 */
3420 				}
3421 				vm_map_enter_restore_successes++;
3422 			}
3423 		}
3424 	}
3425 
3426 	/*
3427 	 * The caller is responsible for releasing the lock if it requested to
3428 	 * keep the map locked.
3429 	 */
3430 	if (map_locked && !keep_map_locked) {
3431 		vm_map_unlock(map);
3432 	}
3433 
3434 	vm_map_zap_dispose(&zap_old_list);
3435 	vm_map_zap_dispose(&zap_new_list);
3436 
3437 	if (new_mapping_established) {
3438 		/*
3439 		 * The caller had a reference on "caller_object" and we
3440 		 * transferred that reference to the mapping.
3441 		 * We also took an extra reference on "caller_object" to keep
3442 		 * it alive while the map was unlocked.
3443 		 */
3444 		if (result == KERN_SUCCESS) {
3445 			/*
3446 			 * On success, the caller's reference on the object gets
3447 			 * tranferred to the mapping.
3448 			 * Release our extra reference.
3449 			 */
3450 			if (is_submap) {
3451 				vm_map_deallocate((vm_map_t)caller_object);
3452 			} else {
3453 				vm_object_deallocate(caller_object);
3454 			}
3455 		} else {
3456 			/*
3457 			 * On error, the caller expects to still have a
3458 			 * reference on the object it gave us.
3459 			 * Let's use our extra reference for that.
3460 			 */
3461 		}
3462 	}
3463 
3464 	return result;
3465 
3466 #undef  RETURN
3467 }
3468 
3469 #if __arm64__
3470 extern const struct memory_object_pager_ops fourk_pager_ops;
3471 kern_return_t
vm_map_enter_fourk(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)3472 vm_map_enter_fourk(
3473 	vm_map_t                map,
3474 	vm_map_offset_t         *address,       /* IN/OUT */
3475 	vm_map_size_t           size,
3476 	vm_map_offset_t         mask,
3477 	vm_map_kernel_flags_t   vmk_flags,
3478 	vm_object_t             object,
3479 	vm_object_offset_t      offset,
3480 	boolean_t               needs_copy,
3481 	vm_prot_t               cur_protection,
3482 	vm_prot_t               max_protection,
3483 	vm_inherit_t            inheritance)
3484 {
3485 	vm_map_entry_t          entry, new_entry;
3486 	vm_map_offset_t         start, fourk_start;
3487 	vm_map_offset_t         end, fourk_end;
3488 	vm_map_size_t           fourk_size;
3489 	kern_return_t           result = KERN_SUCCESS;
3490 	boolean_t               map_locked = FALSE;
3491 	boolean_t               pmap_empty = TRUE;
3492 	boolean_t               new_mapping_established = FALSE;
3493 	const bool              keep_map_locked = vmk_flags.vmkf_keep_map_locked;
3494 	const bool              anywhere = !vmk_flags.vmf_fixed;
3495 	const bool              purgable = vmk_flags.vmf_purgeable;
3496 	const bool              overwrite = vmk_flags.vmf_overwrite;
3497 	const bool              is_submap = vmk_flags.vmkf_submap;
3498 	const bool              entry_for_jit = vmk_flags.vmkf_map_jit;
3499 	const unsigned int      superpage_size = vmk_flags.vmf_superpage_size;
3500 	vm_map_offset_t         effective_min_offset, effective_max_offset;
3501 	kern_return_t           kr;
3502 	boolean_t               clear_map_aligned = FALSE;
3503 	memory_object_t         fourk_mem_obj;
3504 	vm_object_t             fourk_object;
3505 	vm_map_offset_t         fourk_pager_offset;
3506 	int                     fourk_pager_index_start, fourk_pager_index_num;
3507 	int                     cur_idx;
3508 	boolean_t               fourk_copy;
3509 	vm_object_t             copy_object;
3510 	vm_object_offset_t      copy_offset;
3511 	VM_MAP_ZAP_DECLARE(zap_list);
3512 
3513 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
3514 		panic("%s:%d", __FUNCTION__, __LINE__);
3515 	}
3516 	fourk_mem_obj = MEMORY_OBJECT_NULL;
3517 	fourk_object = VM_OBJECT_NULL;
3518 
3519 	if (superpage_size) {
3520 		return KERN_NOT_SUPPORTED;
3521 	}
3522 
3523 	if ((cur_protection & VM_PROT_WRITE) &&
3524 	    (cur_protection & VM_PROT_EXECUTE) &&
3525 #if XNU_TARGET_OS_OSX
3526 	    map->pmap != kernel_pmap &&
3527 	    (vm_map_cs_enforcement(map)
3528 #if __arm64__
3529 	    || !VM_MAP_IS_EXOTIC(map)
3530 #endif /* __arm64__ */
3531 	    ) &&
3532 #endif /* XNU_TARGET_OS_OSX */
3533 #if CODE_SIGNING_MONITOR
3534 	    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
3535 #endif
3536 	    !entry_for_jit) {
3537 		DTRACE_VM3(cs_wx,
3538 		    uint64_t, 0,
3539 		    uint64_t, 0,
3540 		    vm_prot_t, cur_protection);
3541 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. "
3542 		    "turning off execute\n",
3543 		    proc_selfpid(),
3544 		    (get_bsdtask_info(current_task())
3545 		    ? proc_name_address(get_bsdtask_info(current_task()))
3546 		    : "?"),
3547 		    __FUNCTION__);
3548 		cur_protection &= ~VM_PROT_EXECUTE;
3549 	}
3550 
3551 	/*
3552 	 * If the task has requested executable lockdown,
3553 	 * deny any new executable mapping.
3554 	 */
3555 	if (map->map_disallow_new_exec == TRUE) {
3556 		if (cur_protection & VM_PROT_EXECUTE) {
3557 			return KERN_PROTECTION_FAILURE;
3558 		}
3559 	}
3560 
3561 	if (is_submap) {
3562 		return KERN_NOT_SUPPORTED;
3563 	}
3564 	if (vmk_flags.vmkf_already) {
3565 		return KERN_NOT_SUPPORTED;
3566 	}
3567 	if (purgable || entry_for_jit) {
3568 		return KERN_NOT_SUPPORTED;
3569 	}
3570 
3571 	effective_min_offset = map->min_offset;
3572 
3573 	if (vmk_flags.vmkf_beyond_max) {
3574 		return KERN_NOT_SUPPORTED;
3575 	} else {
3576 		effective_max_offset = map->max_offset;
3577 	}
3578 
3579 	if (size == 0 ||
3580 	    (offset & FOURK_PAGE_MASK) != 0) {
3581 		*address = 0;
3582 		return KERN_INVALID_ARGUMENT;
3583 	}
3584 
3585 #define RETURN(value)   { result = value; goto BailOut; }
3586 
3587 	assert(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK));
3588 	assert(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK));
3589 
3590 	if (!anywhere && overwrite) {
3591 		return KERN_NOT_SUPPORTED;
3592 	}
3593 
3594 	fourk_start = *address;
3595 	fourk_size = size;
3596 	fourk_end = fourk_start + fourk_size;
3597 
3598 	start = vm_map_trunc_page(*address, VM_MAP_PAGE_MASK(map));
3599 	end = vm_map_round_page(fourk_end, VM_MAP_PAGE_MASK(map));
3600 	size = end - start;
3601 
3602 	if (anywhere) {
3603 		return KERN_NOT_SUPPORTED;
3604 	} else {
3605 		/*
3606 		 *	Verify that:
3607 		 *		the address doesn't itself violate
3608 		 *		the mask requirement.
3609 		 */
3610 
3611 		vm_map_lock(map);
3612 		map_locked = TRUE;
3613 		if ((start & mask) != 0) {
3614 			RETURN(KERN_NO_SPACE);
3615 		}
3616 
3617 		/*
3618 		 *	...	the address is within bounds
3619 		 */
3620 
3621 		end = start + size;
3622 
3623 		if ((start < effective_min_offset) ||
3624 		    (end > effective_max_offset) ||
3625 		    (start >= end)) {
3626 			RETURN(KERN_INVALID_ADDRESS);
3627 		}
3628 
3629 		/*
3630 		 *	...	the starting address isn't allocated
3631 		 */
3632 		if (vm_map_lookup_entry(map, start, &entry)) {
3633 			vm_object_t cur_object, shadow_object;
3634 
3635 			/*
3636 			 * We might already some 4K mappings
3637 			 * in a 16K page here.
3638 			 */
3639 
3640 			if (entry->vme_end - entry->vme_start
3641 			    != SIXTEENK_PAGE_SIZE) {
3642 				RETURN(KERN_NO_SPACE);
3643 			}
3644 			if (entry->is_sub_map) {
3645 				RETURN(KERN_NO_SPACE);
3646 			}
3647 			if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
3648 				RETURN(KERN_NO_SPACE);
3649 			}
3650 
3651 			/* go all the way down the shadow chain */
3652 			cur_object = VME_OBJECT(entry);
3653 			vm_object_lock(cur_object);
3654 			while (cur_object->shadow != VM_OBJECT_NULL) {
3655 				shadow_object = cur_object->shadow;
3656 				vm_object_lock(shadow_object);
3657 				vm_object_unlock(cur_object);
3658 				cur_object = shadow_object;
3659 				shadow_object = VM_OBJECT_NULL;
3660 			}
3661 			if (cur_object->internal ||
3662 			    cur_object->pager == NULL) {
3663 				vm_object_unlock(cur_object);
3664 				RETURN(KERN_NO_SPACE);
3665 			}
3666 			if (cur_object->pager->mo_pager_ops
3667 			    != &fourk_pager_ops) {
3668 				vm_object_unlock(cur_object);
3669 				RETURN(KERN_NO_SPACE);
3670 			}
3671 			fourk_object = cur_object;
3672 			fourk_mem_obj = fourk_object->pager;
3673 
3674 			/* keep the "4K" object alive */
3675 			vm_object_reference_locked(fourk_object);
3676 			memory_object_reference(fourk_mem_obj);
3677 			vm_object_unlock(fourk_object);
3678 
3679 			/* merge permissions */
3680 			entry->protection |= cur_protection;
3681 			entry->max_protection |= max_protection;
3682 
3683 			if ((entry->protection & VM_PROT_WRITE) &&
3684 			    (entry->protection & VM_PROT_ALLEXEC) &&
3685 			    fourk_binary_compatibility_unsafe &&
3686 			    fourk_binary_compatibility_allow_wx) {
3687 				/* write+execute: need to be "jit" */
3688 				entry->used_for_jit = TRUE;
3689 			}
3690 			goto map_in_fourk_pager;
3691 		}
3692 
3693 		/*
3694 		 *	...	the next region doesn't overlap the
3695 		 *		end point.
3696 		 */
3697 
3698 		if ((entry->vme_next != vm_map_to_entry(map)) &&
3699 		    (entry->vme_next->vme_start < end)) {
3700 			RETURN(KERN_NO_SPACE);
3701 		}
3702 	}
3703 
3704 	/*
3705 	 *	At this point,
3706 	 *		"start" and "end" should define the endpoints of the
3707 	 *			available new range, and
3708 	 *		"entry" should refer to the region before the new
3709 	 *			range, and
3710 	 *
3711 	 *		the map should be locked.
3712 	 */
3713 
3714 	/* create a new "4K" pager */
3715 	fourk_mem_obj = fourk_pager_create();
3716 	fourk_object = fourk_pager_to_vm_object(fourk_mem_obj);
3717 	assert(fourk_object);
3718 
3719 	/* keep the "4" object alive */
3720 	vm_object_reference(fourk_object);
3721 
3722 	/* create a "copy" object, to map the "4K" object copy-on-write */
3723 	fourk_copy = TRUE;
3724 	result = vm_object_copy_strategically(fourk_object,
3725 	    0,
3726 	    end - start,
3727 	    &copy_object,
3728 	    &copy_offset,
3729 	    &fourk_copy);
3730 	assert(result == KERN_SUCCESS);
3731 	assert(copy_object != VM_OBJECT_NULL);
3732 	assert(copy_offset == 0);
3733 
3734 	/* map the "4K" pager's copy object */
3735 	new_entry = vm_map_entry_insert(map,
3736 	    entry,
3737 	    vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)),
3738 	    vm_map_round_page(end, VM_MAP_PAGE_MASK(map)),
3739 	    copy_object,
3740 	    0,                      /* offset */
3741 	    vmk_flags,
3742 	    FALSE,                  /* needs_copy */
3743 	    cur_protection, max_protection,
3744 	    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3745 	    VM_INHERIT_NONE : inheritance),
3746 	    clear_map_aligned);
3747 	entry = new_entry;
3748 
3749 #if VM_MAP_DEBUG_FOURK
3750 	if (vm_map_debug_fourk) {
3751 		printf("FOURK_PAGER: map %p [0x%llx:0x%llx] new pager %p\n",
3752 		    map,
3753 		    (uint64_t) entry->vme_start,
3754 		    (uint64_t) entry->vme_end,
3755 		    fourk_mem_obj);
3756 	}
3757 #endif /* VM_MAP_DEBUG_FOURK */
3758 
3759 	new_mapping_established = TRUE;
3760 
3761 map_in_fourk_pager:
3762 	/* "map" the original "object" where it belongs in the "4K" pager */
3763 	fourk_pager_offset = (fourk_start & SIXTEENK_PAGE_MASK);
3764 	fourk_pager_index_start = (int) (fourk_pager_offset / FOURK_PAGE_SIZE);
3765 	if (fourk_size > SIXTEENK_PAGE_SIZE) {
3766 		fourk_pager_index_num = 4;
3767 	} else {
3768 		fourk_pager_index_num = (int) (fourk_size / FOURK_PAGE_SIZE);
3769 	}
3770 	if (fourk_pager_index_start + fourk_pager_index_num > 4) {
3771 		fourk_pager_index_num = 4 - fourk_pager_index_start;
3772 	}
3773 	for (cur_idx = 0;
3774 	    cur_idx < fourk_pager_index_num;
3775 	    cur_idx++) {
3776 		vm_object_t             old_object;
3777 		vm_object_offset_t      old_offset;
3778 
3779 		kr = fourk_pager_populate(fourk_mem_obj,
3780 		    TRUE,                       /* overwrite */
3781 		    fourk_pager_index_start + cur_idx,
3782 		    object,
3783 		    (object
3784 		    ? (offset +
3785 		    (cur_idx * FOURK_PAGE_SIZE))
3786 		    : 0),
3787 		    &old_object,
3788 		    &old_offset);
3789 #if VM_MAP_DEBUG_FOURK
3790 		if (vm_map_debug_fourk) {
3791 			if (old_object == (vm_object_t) -1 &&
3792 			    old_offset == (vm_object_offset_t) -1) {
3793 				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3794 				    "pager [%p:0x%llx] "
3795 				    "populate[%d] "
3796 				    "[object:%p,offset:0x%llx]\n",
3797 				    map,
3798 				    (uint64_t) entry->vme_start,
3799 				    (uint64_t) entry->vme_end,
3800 				    fourk_mem_obj,
3801 				    VME_OFFSET(entry),
3802 				    fourk_pager_index_start + cur_idx,
3803 				    object,
3804 				    (object
3805 				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3806 				    : 0));
3807 			} else {
3808 				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3809 				    "pager [%p:0x%llx] "
3810 				    "populate[%d] [object:%p,offset:0x%llx] "
3811 				    "old [%p:0x%llx]\n",
3812 				    map,
3813 				    (uint64_t) entry->vme_start,
3814 				    (uint64_t) entry->vme_end,
3815 				    fourk_mem_obj,
3816 				    VME_OFFSET(entry),
3817 				    fourk_pager_index_start + cur_idx,
3818 				    object,
3819 				    (object
3820 				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3821 				    : 0),
3822 				    old_object,
3823 				    old_offset);
3824 			}
3825 		}
3826 #endif /* VM_MAP_DEBUG_FOURK */
3827 
3828 		assert(kr == KERN_SUCCESS);
3829 		if (object != old_object &&
3830 		    object != VM_OBJECT_NULL &&
3831 		    object != (vm_object_t) -1) {
3832 			vm_object_reference(object);
3833 		}
3834 		if (object != old_object &&
3835 		    old_object != VM_OBJECT_NULL &&
3836 		    old_object != (vm_object_t) -1) {
3837 			vm_object_deallocate(old_object);
3838 		}
3839 	}
3840 
3841 BailOut:
3842 	assert(map_locked == TRUE);
3843 
3844 	if (result == KERN_SUCCESS) {
3845 		vm_prot_t pager_prot;
3846 		memory_object_t pager;
3847 
3848 #if DEBUG
3849 		if (pmap_empty &&
3850 		    !(vmk_flags.vmkf_no_pmap_check)) {
3851 			assert(pmap_is_empty(map->pmap,
3852 			    *address,
3853 			    *address + size));
3854 		}
3855 #endif /* DEBUG */
3856 
3857 		/*
3858 		 * For "named" VM objects, let the pager know that the
3859 		 * memory object is being mapped.  Some pagers need to keep
3860 		 * track of this, to know when they can reclaim the memory
3861 		 * object, for example.
3862 		 * VM calls memory_object_map() for each mapping (specifying
3863 		 * the protection of each mapping) and calls
3864 		 * memory_object_last_unmap() when all the mappings are gone.
3865 		 */
3866 		pager_prot = max_protection;
3867 		if (needs_copy) {
3868 			/*
3869 			 * Copy-On-Write mapping: won't modify
3870 			 * the memory object.
3871 			 */
3872 			pager_prot &= ~VM_PROT_WRITE;
3873 		}
3874 		if (!is_submap &&
3875 		    object != VM_OBJECT_NULL &&
3876 		    object->named &&
3877 		    object->pager != MEMORY_OBJECT_NULL) {
3878 			vm_object_lock(object);
3879 			pager = object->pager;
3880 			if (object->named &&
3881 			    pager != MEMORY_OBJECT_NULL) {
3882 				assert(object->pager_ready);
3883 				vm_object_mapping_wait(object, THREAD_UNINT);
3884 				vm_object_mapping_begin(object);
3885 				vm_object_unlock(object);
3886 
3887 				kr = memory_object_map(pager, pager_prot);
3888 				assert(kr == KERN_SUCCESS);
3889 
3890 				vm_object_lock(object);
3891 				vm_object_mapping_end(object);
3892 			}
3893 			vm_object_unlock(object);
3894 		}
3895 		if (!is_submap &&
3896 		    fourk_object != VM_OBJECT_NULL &&
3897 		    fourk_object->named &&
3898 		    fourk_object->pager != MEMORY_OBJECT_NULL) {
3899 			vm_object_lock(fourk_object);
3900 			pager = fourk_object->pager;
3901 			if (fourk_object->named &&
3902 			    pager != MEMORY_OBJECT_NULL) {
3903 				assert(fourk_object->pager_ready);
3904 				vm_object_mapping_wait(fourk_object,
3905 				    THREAD_UNINT);
3906 				vm_object_mapping_begin(fourk_object);
3907 				vm_object_unlock(fourk_object);
3908 
3909 				kr = memory_object_map(pager, VM_PROT_READ);
3910 				assert(kr == KERN_SUCCESS);
3911 
3912 				vm_object_lock(fourk_object);
3913 				vm_object_mapping_end(fourk_object);
3914 			}
3915 			vm_object_unlock(fourk_object);
3916 		}
3917 	}
3918 
3919 	if (fourk_object != VM_OBJECT_NULL) {
3920 		vm_object_deallocate(fourk_object);
3921 		fourk_object = VM_OBJECT_NULL;
3922 		memory_object_deallocate(fourk_mem_obj);
3923 		fourk_mem_obj = MEMORY_OBJECT_NULL;
3924 	}
3925 
3926 	assert(map_locked == TRUE);
3927 
3928 	if (!keep_map_locked) {
3929 		vm_map_unlock(map);
3930 		map_locked = FALSE;
3931 	}
3932 
3933 	/*
3934 	 * We can't hold the map lock if we enter this block.
3935 	 */
3936 
3937 	if (result == KERN_SUCCESS) {
3938 		/*	Wire down the new entry if the user
3939 		 *	requested all new map entries be wired.
3940 		 */
3941 		if ((map->wiring_required) || (superpage_size)) {
3942 			assert(!keep_map_locked);
3943 			pmap_empty = FALSE; /* pmap won't be empty */
3944 			kr = vm_map_wire_kernel(map, start, end,
3945 			    new_entry->protection, VM_KERN_MEMORY_MLOCK,
3946 			    TRUE);
3947 			result = kr;
3948 		}
3949 
3950 	}
3951 
3952 	if (result != KERN_SUCCESS) {
3953 		if (new_mapping_established) {
3954 			/*
3955 			 * We have to get rid of the new mappings since we
3956 			 * won't make them available to the user.
3957 			 * Try and do that atomically, to minimize the risk
3958 			 * that someone else create new mappings that range.
3959 			 */
3960 
3961 			if (!map_locked) {
3962 				vm_map_lock(map);
3963 				map_locked = TRUE;
3964 			}
3965 			(void)vm_map_delete(map, *address, *address + size,
3966 			    VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_NO_YIELD,
3967 			    KMEM_GUARD_NONE, &zap_list);
3968 		}
3969 	}
3970 
3971 	/*
3972 	 * The caller is responsible for releasing the lock if it requested to
3973 	 * keep the map locked.
3974 	 */
3975 	if (map_locked && !keep_map_locked) {
3976 		vm_map_unlock(map);
3977 	}
3978 
3979 	vm_map_zap_dispose(&zap_list);
3980 
3981 	return result;
3982 
3983 #undef  RETURN
3984 }
3985 #endif /* __arm64__ */
3986 
3987 /*
3988  * Counters for the prefault optimization.
3989  */
3990 int64_t vm_prefault_nb_pages = 0;       /* pages handled by the prefault path -- NOTE(review): presumed from name; counter is updated outside this chunk */
3991 int64_t vm_prefault_nb_bailout = 0;     /* prefault attempts abandoned early -- NOTE(review): presumed from name; verify against the prefault loop */
3992 
3993 static kern_return_t
3994 vm_map_enter_mem_object_helper(
3995 	vm_map_t                target_map,
3996 	vm_map_offset_t         *address,
3997 	vm_map_size_t           initial_size,
3998 	vm_map_offset_t         mask,
3999 	vm_map_kernel_flags_t   vmk_flags,
4000 	ipc_port_t              port,
4001 	vm_object_offset_t      offset,
4002 	boolean_t               copy,
4003 	vm_prot_t               cur_protection,
4004 	vm_prot_t               max_protection,
4005 	vm_inherit_t            inheritance,
4006 	upl_page_list_ptr_t     page_list,
4007 	unsigned int            page_list_count)
4008 {
4009 	vm_map_address_t        map_addr;
4010 	vm_map_size_t           map_size;
4011 	vm_object_t             object;
4012 	vm_object_size_t        size;
4013 	kern_return_t           result;
4014 	boolean_t               mask_cur_protection, mask_max_protection;
4015 	boolean_t               kernel_prefault, try_prefault = (page_list_count != 0);
4016 	vm_map_offset_t         offset_in_mapping = 0;
4017 #if __arm64__
4018 	boolean_t               fourk = vmk_flags.vmkf_fourk;
4019 #endif /* __arm64__ */
4020 
4021 	if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4022 		/* XXX TODO4K prefaulting depends on page size... */
4023 		try_prefault = FALSE;
4024 	}
4025 
4026 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
4027 	vm_map_kernel_flags_update_range_id(&vmk_flags, target_map);
4028 
4029 	mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4030 	mask_max_protection = max_protection & VM_PROT_IS_MASK;
4031 	cur_protection &= ~VM_PROT_IS_MASK;
4032 	max_protection &= ~VM_PROT_IS_MASK;
4033 
4034 	/*
4035 	 * Check arguments for validity
4036 	 */
4037 	if ((target_map == VM_MAP_NULL) ||
4038 	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4039 	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4040 	    (inheritance > VM_INHERIT_LAST_VALID) ||
4041 	    (try_prefault && (copy || !page_list)) ||
4042 	    initial_size == 0) {
4043 		return KERN_INVALID_ARGUMENT;
4044 	}
4045 
4046 #if __arm64__
4047 	if (cur_protection & VM_PROT_EXECUTE) {
4048 		cur_protection |= VM_PROT_READ;
4049 	}
4050 
4051 	if (fourk && VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4052 		/* no "fourk" if map is using a sub-page page size */
4053 		fourk = FALSE;
4054 	}
4055 	if (fourk) {
4056 		map_addr = vm_map_trunc_page(*address, FOURK_PAGE_MASK);
4057 		map_size = vm_map_round_page(initial_size, FOURK_PAGE_MASK);
4058 	} else
4059 #endif /* __arm64__ */
4060 	{
4061 		map_addr = vm_map_trunc_page(*address,
4062 		    VM_MAP_PAGE_MASK(target_map));
4063 		map_size = vm_map_round_page(initial_size,
4064 		    VM_MAP_PAGE_MASK(target_map));
4065 	}
4066 	if (map_size == 0) {
4067 		return KERN_INVALID_ARGUMENT;
4068 	}
4069 	size = vm_object_round_page(initial_size);
4070 
4071 	/*
4072 	 * Find the vm object (if any) corresponding to this port.
4073 	 */
4074 	if (!IP_VALID(port)) {
4075 		object = VM_OBJECT_NULL;
4076 		offset = 0;
4077 		copy = FALSE;
4078 	} else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4079 		vm_named_entry_t        named_entry;
4080 		vm_object_offset_t      data_offset;
4081 
4082 		named_entry = mach_memory_entry_from_port(port);
4083 
4084 		if (vmk_flags.vmf_return_data_addr ||
4085 		    vmk_flags.vmf_return_4k_data_addr) {
4086 			data_offset = named_entry->data_offset;
4087 			offset += named_entry->data_offset;
4088 		} else {
4089 			data_offset = 0;
4090 		}
4091 
4092 		/* a few checks to make sure user is obeying rules */
4093 		if (mask_max_protection) {
4094 			max_protection &= named_entry->protection;
4095 		}
4096 		if (mask_cur_protection) {
4097 			cur_protection &= named_entry->protection;
4098 		}
4099 		if ((named_entry->protection & max_protection) !=
4100 		    max_protection) {
4101 			return KERN_INVALID_RIGHT;
4102 		}
4103 		if ((named_entry->protection & cur_protection) !=
4104 		    cur_protection) {
4105 			return KERN_INVALID_RIGHT;
4106 		}
4107 		if (offset + size <= offset) {
4108 			/* overflow */
4109 			return KERN_INVALID_ARGUMENT;
4110 		}
4111 		if (named_entry->size < (offset + initial_size)) {
4112 			return KERN_INVALID_ARGUMENT;
4113 		}
4114 
4115 		if (named_entry->is_copy) {
4116 			/* for a vm_map_copy, we can only map it whole */
4117 			if ((size != named_entry->size) &&
4118 			    (vm_map_round_page(size,
4119 			    VM_MAP_PAGE_MASK(target_map)) ==
4120 			    named_entry->size)) {
4121 				/* XXX FBDP use the rounded size... */
4122 				size = vm_map_round_page(
4123 					size,
4124 					VM_MAP_PAGE_MASK(target_map));
4125 			}
4126 		}
4127 
4128 		/* the caller's parameter "offset" is defined to be the */
4129 		/* offset from beginning of named entry offset in object */
4130 		offset = offset + named_entry->offset;
4131 
4132 		if (!VM_MAP_PAGE_ALIGNED(size,
4133 		    VM_MAP_PAGE_MASK(target_map))) {
4134 			/*
4135 			 * Let's not map more than requested;
4136 			 * vm_map_enter() will handle this "not map-aligned"
4137 			 * case.
4138 			 */
4139 			map_size = size;
4140 		}
4141 
4142 		named_entry_lock(named_entry);
4143 		if (named_entry->is_sub_map) {
4144 			vm_map_t                submap;
4145 
4146 			if (vmk_flags.vmf_return_data_addr ||
4147 			    vmk_flags.vmf_return_4k_data_addr) {
4148 				panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4149 			}
4150 
4151 			submap = named_entry->backing.map;
4152 			vm_map_reference(submap);
4153 			named_entry_unlock(named_entry);
4154 
4155 			vmk_flags.vmkf_submap = TRUE;
4156 
4157 			result = vm_map_enter(target_map,
4158 			    &map_addr,
4159 			    map_size,
4160 			    mask,
4161 			    vmk_flags,
4162 			    (vm_object_t)(uintptr_t) submap,
4163 			    offset,
4164 			    copy,
4165 			    cur_protection,
4166 			    max_protection,
4167 			    inheritance);
4168 			if (result != KERN_SUCCESS) {
4169 				vm_map_deallocate(submap);
4170 			} else {
4171 				/*
4172 				 * No need to lock "submap" just to check its
4173 				 * "mapped" flag: that flag is never reset
4174 				 * once it's been set and if we race, we'll
4175 				 * just end up setting it twice, which is OK.
4176 				 */
4177 				if (submap->mapped_in_other_pmaps == FALSE &&
4178 				    vm_map_pmap(submap) != PMAP_NULL &&
4179 				    vm_map_pmap(submap) !=
4180 				    vm_map_pmap(target_map)) {
4181 					/*
4182 					 * This submap is being mapped in a map
4183 					 * that uses a different pmap.
4184 					 * Set its "mapped_in_other_pmaps" flag
4185 					 * to indicate that we now need to
4186 					 * remove mappings from all pmaps rather
4187 					 * than just the submap's pmap.
4188 					 */
4189 					vm_map_lock(submap);
4190 					submap->mapped_in_other_pmaps = TRUE;
4191 					vm_map_unlock(submap);
4192 				}
4193 				*address = map_addr;
4194 			}
4195 			return result;
4196 		} else if (named_entry->is_copy) {
4197 			kern_return_t   kr;
4198 			vm_map_copy_t   copy_map;
4199 			vm_map_entry_t  copy_entry;
4200 			vm_map_offset_t copy_addr;
4201 			vm_map_copy_t   target_copy_map;
4202 			vm_map_offset_t overmap_start, overmap_end;
4203 			vm_map_offset_t trimmed_start;
4204 			vm_map_size_t   target_size;
4205 
4206 			if (!vm_map_kernel_flags_check_vmflags(vmk_flags,
4207 			    (VM_FLAGS_FIXED |
4208 			    VM_FLAGS_ANYWHERE |
4209 			    VM_FLAGS_OVERWRITE |
4210 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4211 			    VM_FLAGS_RETURN_DATA_ADDR))) {
4212 				named_entry_unlock(named_entry);
4213 				return KERN_INVALID_ARGUMENT;
4214 			}
4215 
4216 			copy_map = named_entry->backing.copy;
4217 			assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4218 			if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4219 				/* unsupported type; should not happen */
4220 				printf("vm_map_enter_mem_object: "
4221 				    "memory_entry->backing.copy "
4222 				    "unsupported type 0x%x\n",
4223 				    copy_map->type);
4224 				named_entry_unlock(named_entry);
4225 				return KERN_INVALID_ARGUMENT;
4226 			}
4227 
4228 			if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4229 				DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, offset, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4230 			}
4231 
4232 			if (vmk_flags.vmf_return_data_addr ||
4233 			    vmk_flags.vmf_return_4k_data_addr) {
4234 				offset_in_mapping = offset & VM_MAP_PAGE_MASK(target_map);
4235 				if (vmk_flags.vmf_return_4k_data_addr) {
4236 					offset_in_mapping &= ~((signed)(0xFFF));
4237 				}
4238 			}
4239 
4240 			target_copy_map = VM_MAP_COPY_NULL;
4241 			target_size = copy_map->size;
4242 			overmap_start = 0;
4243 			overmap_end = 0;
4244 			trimmed_start = 0;
4245 			if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4246 				DEBUG4K_ADJUST("adjusting...\n");
4247 				kr = vm_map_copy_adjust_to_target(
4248 					copy_map,
4249 					offset /* includes data_offset */,
4250 					initial_size,
4251 					target_map,
4252 					copy,
4253 					&target_copy_map,
4254 					&overmap_start,
4255 					&overmap_end,
4256 					&trimmed_start);
4257 				if (kr != KERN_SUCCESS) {
4258 					named_entry_unlock(named_entry);
4259 					return kr;
4260 				}
4261 				target_size = target_copy_map->size;
4262 				if (trimmed_start >= data_offset) {
4263 					data_offset = offset & VM_MAP_PAGE_MASK(target_map);
4264 				} else {
4265 					data_offset -= trimmed_start;
4266 				}
4267 			} else {
4268 				/*
4269 				 * Assert that the vm_map_copy is coming from the right
4270 				 * zone and hasn't been forged
4271 				 */
4272 				vm_map_copy_require(copy_map);
4273 				target_copy_map = copy_map;
4274 			}
4275 
4276 			vm_map_kernel_flags_t rsv_flags = vmk_flags;
4277 
4278 			vm_map_kernel_flags_and_vmflags(&rsv_flags,
4279 			    (VM_FLAGS_FIXED |
4280 			    VM_FLAGS_ANYWHERE |
4281 			    VM_FLAGS_OVERWRITE |
4282 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4283 			    VM_FLAGS_RETURN_DATA_ADDR));
4284 
4285 			/* reserve a contiguous range */
4286 			kr = vm_map_enter(target_map,
4287 			    &map_addr,
4288 			    vm_map_round_page(target_size, VM_MAP_PAGE_MASK(target_map)),
4289 			    mask,
4290 			    rsv_flags,
4291 			    VM_OBJECT_NULL,
4292 			    0,
4293 			    FALSE,               /* copy */
4294 			    cur_protection,
4295 			    max_protection,
4296 			    inheritance);
4297 			if (kr != KERN_SUCCESS) {
4298 				DEBUG4K_ERROR("kr 0x%x\n", kr);
4299 				if (target_copy_map != copy_map) {
4300 					vm_map_copy_discard(target_copy_map);
4301 					target_copy_map = VM_MAP_COPY_NULL;
4302 				}
4303 				named_entry_unlock(named_entry);
4304 				return kr;
4305 			}
4306 
4307 			copy_addr = map_addr;
4308 
4309 			for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4310 			    copy_entry != vm_map_copy_to_entry(target_copy_map);
4311 			    copy_entry = copy_entry->vme_next) {
4312 				vm_map_t                copy_submap = VM_MAP_NULL;
4313 				vm_object_t             copy_object = VM_OBJECT_NULL;
4314 				vm_map_size_t           copy_size;
4315 				vm_object_offset_t      copy_offset;
4316 				boolean_t               do_copy = false;
4317 
4318 				if (copy_entry->is_sub_map) {
4319 					copy_submap = VME_SUBMAP(copy_entry);
4320 					copy_object = (vm_object_t)copy_submap;
4321 				} else {
4322 					copy_object = VME_OBJECT(copy_entry);
4323 				}
4324 				copy_offset = VME_OFFSET(copy_entry);
4325 				copy_size = (copy_entry->vme_end -
4326 				    copy_entry->vme_start);
4327 
4328 				/* sanity check */
4329 				if ((copy_addr + copy_size) >
4330 				    (map_addr +
4331 				    overmap_start + overmap_end +
4332 				    named_entry->size /* XXX full size */)) {
4333 					/* over-mapping too much !? */
4334 					kr = KERN_INVALID_ARGUMENT;
4335 					DEBUG4K_ERROR("kr 0x%x\n", kr);
4336 					/* abort */
4337 					break;
4338 				}
4339 
4340 				/* take a reference on the object */
4341 				if (copy_entry->is_sub_map) {
4342 					vm_map_reference(copy_submap);
4343 				} else {
4344 					if (!copy &&
4345 					    copy_object != VM_OBJECT_NULL &&
4346 					    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4347 						/*
4348 						 * We need to resolve our side of this
4349 						 * "symmetric" copy-on-write now; we
4350 						 * need a new object to map and share,
4351 						 * instead of the current one which
4352 						 * might still be shared with the
4353 						 * original mapping.
4354 						 *
4355 						 * Note: A "vm_map_copy_t" does not
4356 						 * have a lock but we're protected by
4357 						 * the named entry's lock here.
4358 						 */
4359 						// assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4360 						VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4361 						assert(copy_object != VME_OBJECT(copy_entry));
4362 						if (!copy_entry->needs_copy &&
4363 						    copy_entry->protection & VM_PROT_WRITE) {
4364 							vm_prot_t prot;
4365 
4366 							prot = copy_entry->protection & ~VM_PROT_WRITE;
4367 							vm_object_pmap_protect(copy_object,
4368 							    copy_offset,
4369 							    copy_size,
4370 							    PMAP_NULL,
4371 							    PAGE_SIZE,
4372 							    0,
4373 							    prot);
4374 						}
4375 						copy_entry->needs_copy = FALSE;
4376 						copy_entry->is_shared = TRUE;
4377 						copy_object = VME_OBJECT(copy_entry);
4378 						copy_offset = VME_OFFSET(copy_entry);
4379 						vm_object_lock(copy_object);
4380 						/* we're about to make a shared mapping of this object */
4381 						copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4382 						copy_object->true_share = TRUE;
4383 						vm_object_unlock(copy_object);
4384 					}
4385 
4386 					if (copy_object != VM_OBJECT_NULL &&
4387 					    copy_object->named &&
4388 					    copy_object->pager != MEMORY_OBJECT_NULL &&
4389 					    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4390 						memory_object_t pager;
4391 						vm_prot_t       pager_prot;
4392 
4393 						/*
4394 						 * For "named" VM objects, let the pager know that the
4395 						 * memory object is being mapped.  Some pagers need to keep
4396 						 * track of this, to know when they can reclaim the memory
4397 						 * object, for example.
4398 						 * VM calls memory_object_map() for each mapping (specifying
4399 						 * the protection of each mapping) and calls
4400 						 * memory_object_last_unmap() when all the mappings are gone.
4401 						 */
4402 						pager_prot = max_protection;
4403 						if (copy) {
4404 							/*
4405 							 * Copy-On-Write mapping: won't modify the
4406 							 * memory object.
4407 							 */
4408 							pager_prot &= ~VM_PROT_WRITE;
4409 						}
4410 						vm_object_lock(copy_object);
4411 						pager = copy_object->pager;
4412 						if (copy_object->named &&
4413 						    pager != MEMORY_OBJECT_NULL &&
4414 						    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4415 							assert(copy_object->pager_ready);
4416 							vm_object_mapping_wait(copy_object, THREAD_UNINT);
4417 							vm_object_mapping_begin(copy_object);
4418 							vm_object_unlock(copy_object);
4419 
4420 							kr = memory_object_map(pager, pager_prot);
4421 							assert(kr == KERN_SUCCESS);
4422 
4423 							vm_object_lock(copy_object);
4424 							vm_object_mapping_end(copy_object);
4425 						}
4426 						vm_object_unlock(copy_object);
4427 					}
4428 
4429 					/*
4430 					 *	Perform the copy if requested
4431 					 */
4432 
4433 					if (copy && copy_object != VM_OBJECT_NULL) {
4434 						vm_object_t             new_object;
4435 						vm_object_offset_t      new_offset;
4436 
4437 						result = vm_object_copy_strategically(copy_object, copy_offset,
4438 						    copy_size,
4439 						    &new_object, &new_offset,
4440 						    &do_copy);
4441 
4442 
4443 						if (result == KERN_MEMORY_RESTART_COPY) {
4444 							boolean_t success;
4445 							boolean_t src_needs_copy;
4446 
4447 							/*
4448 							 * XXX
4449 							 * We currently ignore src_needs_copy.
4450 							 * This really is the issue of how to make
4451 							 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4452 							 * non-kernel users to use. Solution forthcoming.
4453 							 * In the meantime, since we don't allow non-kernel
4454 							 * memory managers to specify symmetric copy,
4455 							 * we won't run into problems here.
4456 							 */
4457 							new_object = copy_object;
4458 							new_offset = copy_offset;
4459 							success = vm_object_copy_quickly(new_object,
4460 							    new_offset,
4461 							    copy_size,
4462 							    &src_needs_copy,
4463 							    &do_copy);
4464 							assert(success);
4465 							result = KERN_SUCCESS;
4466 						}
4467 						if (result != KERN_SUCCESS) {
4468 							kr = result;
4469 							break;
4470 						}
4471 
4472 						copy_object = new_object;
4473 						copy_offset = new_offset;
4474 						/*
4475 						 * No extra object reference for the mapping:
4476 						 * the mapping should be the only thing keeping
4477 						 * this new object alive.
4478 						 */
4479 					} else {
4480 						/*
4481 						 * We already have the right object
4482 						 * to map.
4483 						 */
4484 						copy_object = VME_OBJECT(copy_entry);
4485 						/* take an extra ref for the mapping below */
4486 						vm_object_reference(copy_object);
4487 					}
4488 				}
4489 
4490 				/*
4491 				 * If the caller does not want a specific
4492 				 * tag for this new mapping:  use
4493 				 * the tag of the original mapping.
4494 				 */
4495 				vm_map_kernel_flags_t vmk_remap_flags = {
4496 					.vmkf_submap = copy_entry->is_sub_map,
4497 				};
4498 
4499 				vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
4500 				    vm_map_kernel_flags_vmflags(vmk_flags),
4501 				    vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4502 
4503 				/* over-map the object into destination */
4504 				vmk_remap_flags.vmf_fixed = true;
4505 				vmk_remap_flags.vmf_overwrite = true;
4506 
4507 				if (!copy && !copy_entry->is_sub_map) {
4508 					/*
4509 					 * copy-on-write should have been
4510 					 * resolved at this point, or we would
4511 					 * end up sharing instead of copying.
4512 					 */
4513 					assert(!copy_entry->needs_copy);
4514 				}
4515 #if XNU_TARGET_OS_OSX
4516 				if (copy_entry->used_for_jit) {
4517 					vmk_remap_flags.vmkf_map_jit = TRUE;
4518 				}
4519 #endif /* XNU_TARGET_OS_OSX */
4520 
4521 				kr = vm_map_enter(target_map,
4522 				    &copy_addr,
4523 				    copy_size,
4524 				    (vm_map_offset_t) 0,
4525 				    vmk_remap_flags,
4526 				    copy_object,
4527 				    copy_offset,
4528 				    ((copy_object == NULL)
4529 				    ? FALSE
4530 				    : (copy || copy_entry->needs_copy)),
4531 				    cur_protection,
4532 				    max_protection,
4533 				    inheritance);
4534 				if (kr != KERN_SUCCESS) {
4535 					DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4536 					if (copy_entry->is_sub_map) {
4537 						vm_map_deallocate(copy_submap);
4538 					} else {
4539 						vm_object_deallocate(copy_object);
4540 					}
4541 					/* abort */
4542 					break;
4543 				}
4544 
4545 				/* next mapping */
4546 				copy_addr += copy_size;
4547 			}
4548 
4549 			if (kr == KERN_SUCCESS) {
4550 				if (vmk_flags.vmf_return_data_addr ||
4551 				    vmk_flags.vmf_return_4k_data_addr) {
4552 					*address = map_addr + offset_in_mapping;
4553 				} else {
4554 					*address = map_addr;
4555 				}
4556 				if (overmap_start) {
4557 					*address += overmap_start;
4558 					DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t) offset_in_mapping, (uint64_t)overmap_start, (uint64_t)*address);
4559 				}
4560 			}
4561 			named_entry_unlock(named_entry);
4562 			if (target_copy_map != copy_map) {
4563 				vm_map_copy_discard(target_copy_map);
4564 				target_copy_map = VM_MAP_COPY_NULL;
4565 			}
4566 
4567 			if (kr != KERN_SUCCESS && !vmk_flags.vmf_overwrite) {
4568 				/* deallocate the contiguous range */
4569 				(void) vm_deallocate(target_map,
4570 				    map_addr,
4571 				    map_size);
4572 			}
4573 
4574 			return kr;
4575 		}
4576 
4577 		if (named_entry->is_object) {
4578 			unsigned int    access;
4579 			unsigned int    wimg_mode;
4580 
4581 			/* we are mapping a VM object */
4582 
4583 			access = named_entry->access;
4584 
4585 			if (vmk_flags.vmf_return_data_addr ||
4586 			    vmk_flags.vmf_return_4k_data_addr) {
4587 				offset_in_mapping = offset - VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4588 				if (vmk_flags.vmf_return_4k_data_addr) {
4589 					offset_in_mapping &= ~((signed)(0xFFF));
4590 				}
4591 				offset = VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4592 				map_size = VM_MAP_ROUND_PAGE((offset + offset_in_mapping + initial_size) - offset, VM_MAP_PAGE_MASK(target_map));
4593 			}
4594 
4595 			object = vm_named_entry_to_vm_object(named_entry);
4596 			assert(object != VM_OBJECT_NULL);
4597 			vm_object_lock(object);
4598 			named_entry_unlock(named_entry);
4599 
4600 			vm_object_reference_locked(object);
4601 
4602 			wimg_mode = object->wimg_bits;
4603 			vm_prot_to_wimg(access, &wimg_mode);
4604 			if (object->wimg_bits != wimg_mode) {
4605 				vm_object_change_wimg_mode(object, wimg_mode);
4606 			}
4607 
4608 			vm_object_unlock(object);
4609 		} else {
4610 			panic("invalid VM named entry %p", named_entry);
4611 		}
4612 	} else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4613 		/*
4614 		 * JMM - This is temporary until we unify named entries
4615 		 * and raw memory objects.
4616 		 *
4617 		 * Detected fake ip_kotype for a memory object.  In
4618 		 * this case, the port isn't really a port at all, but
4619 		 * instead is just a raw memory object.
4620 		 */
4621 		if (vmk_flags.vmf_return_data_addr ||
4622 		    vmk_flags.vmf_return_4k_data_addr) {
4623 			panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4624 		}
4625 
4626 		object = memory_object_to_vm_object((memory_object_t)port);
4627 		if (object == VM_OBJECT_NULL) {
4628 			return KERN_INVALID_OBJECT;
4629 		}
4630 		vm_object_reference(object);
4631 
4632 		/* wait for object (if any) to be ready */
4633 		if (object != VM_OBJECT_NULL) {
4634 			if (object == kernel_object) {
4635 				printf("Warning: Attempt to map kernel object"
4636 				    " by a non-private kernel entity\n");
4637 				return KERN_INVALID_OBJECT;
4638 			}
4639 			if (!object->pager_ready) {
4640 				vm_object_lock(object);
4641 
4642 				while (!object->pager_ready) {
4643 					vm_object_wait(object,
4644 					    VM_OBJECT_EVENT_PAGER_READY,
4645 					    THREAD_UNINT);
4646 					vm_object_lock(object);
4647 				}
4648 				vm_object_unlock(object);
4649 			}
4650 		}
4651 	} else {
4652 		return KERN_INVALID_OBJECT;
4653 	}
4654 
4655 	if (object != VM_OBJECT_NULL &&
4656 	    object->named &&
4657 	    object->pager != MEMORY_OBJECT_NULL &&
4658 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4659 		memory_object_t pager;
4660 		vm_prot_t       pager_prot;
4661 		kern_return_t   kr;
4662 
4663 		/*
4664 		 * For "named" VM objects, let the pager know that the
4665 		 * memory object is being mapped.  Some pagers need to keep
4666 		 * track of this, to know when they can reclaim the memory
4667 		 * object, for example.
4668 		 * VM calls memory_object_map() for each mapping (specifying
4669 		 * the protection of each mapping) and calls
4670 		 * memory_object_last_unmap() when all the mappings are gone.
4671 		 */
4672 		pager_prot = max_protection;
4673 		if (copy) {
4674 			/*
4675 			 * Copy-On-Write mapping: won't modify the
4676 			 * memory object.
4677 			 */
4678 			pager_prot &= ~VM_PROT_WRITE;
4679 		}
4680 		vm_object_lock(object);
4681 		pager = object->pager;
4682 		if (object->named &&
4683 		    pager != MEMORY_OBJECT_NULL &&
4684 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4685 			assert(object->pager_ready);
4686 			vm_object_mapping_wait(object, THREAD_UNINT);
4687 			vm_object_mapping_begin(object);
4688 			vm_object_unlock(object);
4689 
4690 			kr = memory_object_map(pager, pager_prot);
4691 			assert(kr == KERN_SUCCESS);
4692 
4693 			vm_object_lock(object);
4694 			vm_object_mapping_end(object);
4695 		}
4696 		vm_object_unlock(object);
4697 	}
4698 
4699 	/*
4700 	 *	Perform the copy if requested
4701 	 */
4702 
4703 	if (copy) {
4704 		vm_object_t             new_object;
4705 		vm_object_offset_t      new_offset;
4706 
4707 		result = vm_object_copy_strategically(object, offset,
4708 		    map_size,
4709 		    &new_object, &new_offset,
4710 		    &copy);
4711 
4712 
4713 		if (result == KERN_MEMORY_RESTART_COPY) {
4714 			boolean_t success;
4715 			boolean_t src_needs_copy;
4716 
4717 			/*
4718 			 * XXX
4719 			 * We currently ignore src_needs_copy.
4720 			 * This really is the issue of how to make
4721 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4722 			 * non-kernel users to use. Solution forthcoming.
4723 			 * In the meantime, since we don't allow non-kernel
4724 			 * memory managers to specify symmetric copy,
4725 			 * we won't run into problems here.
4726 			 */
4727 			new_object = object;
4728 			new_offset = offset;
4729 			success = vm_object_copy_quickly(new_object,
4730 			    new_offset,
4731 			    map_size,
4732 			    &src_needs_copy,
4733 			    &copy);
4734 			assert(success);
4735 			result = KERN_SUCCESS;
4736 		}
4737 		/*
4738 		 *	Throw away the reference to the
4739 		 *	original object, as it won't be mapped.
4740 		 */
4741 
4742 		vm_object_deallocate(object);
4743 
4744 		if (result != KERN_SUCCESS) {
4745 			return result;
4746 		}
4747 
4748 		object = new_object;
4749 		offset = new_offset;
4750 	}
4751 
4752 	/*
4753 	 * If non-kernel users want to try to prefault pages, the mapping and prefault
4754 	 * needs to be atomic.
4755 	 */
4756 	kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4757 	vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4758 
4759 #if __arm64__
4760 	if (fourk) {
4761 		/* map this object in a "4K" pager */
4762 		result = vm_map_enter_fourk(target_map,
4763 		    &map_addr,
4764 		    map_size,
4765 		    (vm_map_offset_t) mask,
4766 		    vmk_flags,
4767 		    object,
4768 		    offset,
4769 		    copy,
4770 		    cur_protection,
4771 		    max_protection,
4772 		    inheritance);
4773 	} else
4774 #endif /* __arm64__ */
4775 	{
4776 		result = vm_map_enter(target_map,
4777 		    &map_addr, map_size,
4778 		    (vm_map_offset_t)mask,
4779 		    vmk_flags,
4780 		    object, offset,
4781 		    copy,
4782 		    cur_protection, max_protection,
4783 		    inheritance);
4784 	}
4785 	if (result != KERN_SUCCESS) {
4786 		vm_object_deallocate(object);
4787 	}
4788 
4789 	/*
4790 	 * Try to prefault, and do not forget to release the vm map lock.
4791 	 */
4792 	if (result == KERN_SUCCESS && try_prefault) {
4793 		mach_vm_address_t va = map_addr;
4794 		kern_return_t kr = KERN_SUCCESS;
4795 		unsigned int i = 0;
4796 		int pmap_options;
4797 
4798 		pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4799 		if (object->internal) {
4800 			pmap_options |= PMAP_OPTIONS_INTERNAL;
4801 		}
4802 
4803 		for (i = 0; i < page_list_count; ++i) {
4804 			if (!UPL_VALID_PAGE(page_list, i)) {
4805 				if (kernel_prefault) {
4806 					assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4807 					result = KERN_MEMORY_ERROR;
4808 					break;
4809 				}
4810 			} else {
4811 				/*
4812 				 * If this function call failed, we should stop
4813 				 * trying to optimize, other calls are likely
4814 				 * going to fail too.
4815 				 *
4816 				 * We are not gonna report an error for such
4817 				 * failure though. That's an optimization, not
4818 				 * something critical.
4819 				 */
4820 				kr = pmap_enter_options(target_map->pmap,
4821 				    va, UPL_PHYS_PAGE(page_list, i),
4822 				    cur_protection, VM_PROT_NONE,
4823 				    0, TRUE, pmap_options, NULL);
4824 				if (kr != KERN_SUCCESS) {
4825 					OSIncrementAtomic64(&vm_prefault_nb_bailout);
4826 					if (kernel_prefault) {
4827 						result = kr;
4828 					}
4829 					break;
4830 				}
4831 				OSIncrementAtomic64(&vm_prefault_nb_pages);
4832 			}
4833 
4834 			/* Next virtual address */
4835 			va += PAGE_SIZE;
4836 		}
4837 		if (vmk_flags.vmkf_keep_map_locked) {
4838 			vm_map_unlock(target_map);
4839 		}
4840 	}
4841 
4842 	if (vmk_flags.vmf_return_data_addr ||
4843 	    vmk_flags.vmf_return_4k_data_addr) {
4844 		*address = map_addr + offset_in_mapping;
4845 	} else {
4846 		*address = map_addr;
4847 	}
4848 	return result;
4849 }
4850 
4851 kern_return_t
vm_map_enter_mem_object(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_t offset,boolean_t copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)4852 vm_map_enter_mem_object(
4853 	vm_map_t                target_map,
4854 	vm_map_offset_t         *address,
4855 	vm_map_size_t           initial_size,
4856 	vm_map_offset_t         mask,
4857 	vm_map_kernel_flags_t   vmk_flags,
4858 	ipc_port_t              port,
4859 	vm_object_offset_t      offset,
4860 	boolean_t               copy,
4861 	vm_prot_t               cur_protection,
4862 	vm_prot_t               max_protection,
4863 	vm_inherit_t            inheritance)
4864 {
4865 	kern_return_t ret;
4866 
4867 	/* range_id is set by vm_map_enter_mem_object_helper */
4868 	ret = vm_map_enter_mem_object_helper(target_map,
4869 	    address,
4870 	    initial_size,
4871 	    mask,
4872 	    vmk_flags,
4873 	    port,
4874 	    offset,
4875 	    copy,
4876 	    cur_protection,
4877 	    max_protection,
4878 	    inheritance,
4879 	    NULL,
4880 	    0);
4881 
4882 #if KASAN
4883 	if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4884 		kasan_notify_address(*address, initial_size);
4885 	}
4886 #endif
4887 
4888 	return ret;
4889 }
4890 
4891 kern_return_t
vm_map_enter_mem_object_prefault(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_t offset,vm_prot_t cur_protection,vm_prot_t max_protection,upl_page_list_ptr_t page_list,unsigned int page_list_count)4892 vm_map_enter_mem_object_prefault(
4893 	vm_map_t                target_map,
4894 	vm_map_offset_t         *address,
4895 	vm_map_size_t           initial_size,
4896 	vm_map_offset_t         mask,
4897 	vm_map_kernel_flags_t   vmk_flags,
4898 	ipc_port_t              port,
4899 	vm_object_offset_t      offset,
4900 	vm_prot_t               cur_protection,
4901 	vm_prot_t               max_protection,
4902 	upl_page_list_ptr_t     page_list,
4903 	unsigned int            page_list_count)
4904 {
4905 	kern_return_t ret;
4906 
4907 	/* range_id is set by vm_map_enter_mem_object_helper */
4908 	ret = vm_map_enter_mem_object_helper(target_map,
4909 	    address,
4910 	    initial_size,
4911 	    mask,
4912 	    vmk_flags,
4913 	    port,
4914 	    offset,
4915 	    FALSE,
4916 	    cur_protection,
4917 	    max_protection,
4918 	    VM_INHERIT_DEFAULT,
4919 	    page_list,
4920 	    page_list_count);
4921 
4922 #if KASAN
4923 	if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4924 		kasan_notify_address(*address, initial_size);
4925 	}
4926 #endif
4927 
4928 	return ret;
4929 }
4930 
4931 
/*
 * vm_map_enter_mem_object_control:
 *
 * Map the VM object associated with the memory object control "control"
 * into "target_map", optionally as a copy ("copy"), with the requested
 * protections and inheritance.  On success, *address is set to the
 * chosen mapping address.  The pager of a "named" object is notified of
 * the new mapping via memory_object_map().
 */
4932 kern_return_t
vm_map_enter_mem_object_control(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,memory_object_control_t control,vm_object_offset_t offset,boolean_t copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)4933 vm_map_enter_mem_object_control(
4934 	vm_map_t                target_map,
4935 	vm_map_offset_t         *address,
4936 	vm_map_size_t           initial_size,
4937 	vm_map_offset_t         mask,
4938 	vm_map_kernel_flags_t   vmk_flags,
4939 	memory_object_control_t control,
4940 	vm_object_offset_t      offset,
4941 	boolean_t               copy,
4942 	vm_prot_t               cur_protection,
4943 	vm_prot_t               max_protection,
4944 	vm_inherit_t            inheritance)
4945 {
4946 	vm_map_address_t        map_addr;
4947 	vm_map_size_t           map_size;
4948 	vm_object_t             object;
4949 	vm_object_size_t        size;
4950 	kern_return_t           result;
4951 	memory_object_t         pager;
4952 	vm_prot_t               pager_prot;
4953 	kern_return_t           kr;
4954 #if __arm64__
4955 	boolean_t               fourk = vmk_flags.vmkf_fourk;
4956 #endif /* __arm64__ */
4957 
4958 	/*
4959 	 * Check arguments for validity
4960 	 */
4961 	if ((target_map == VM_MAP_NULL) ||
4962 	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4963 	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4964 	    (inheritance > VM_INHERIT_LAST_VALID) ||
4965 	    initial_size == 0) {
4966 		return KERN_INVALID_ARGUMENT;
4967 	}
4968 
4969 #if __arm64__
	/* "4K" mappings only make sense when the map's page size is >= 4K */
4970 	if (fourk && VM_MAP_PAGE_MASK(target_map) < PAGE_MASK) {
4971 		fourk = FALSE;
4972 	}
4973 
4974 	if (fourk) {
4975 		map_addr = vm_map_trunc_page(*address,
4976 		    FOURK_PAGE_MASK);
4977 		map_size = vm_map_round_page(initial_size,
4978 		    FOURK_PAGE_MASK);
4979 	} else
4980 #endif /* __arm64__ */
4981 	{
4982 		map_addr = vm_map_trunc_page(*address,
4983 		    VM_MAP_PAGE_MASK(target_map));
4984 		map_size = vm_map_round_page(initial_size,
4985 		    VM_MAP_PAGE_MASK(target_map));
4986 	}
4987 	size = vm_object_round_page(initial_size);
4988 
4989 	object = memory_object_control_to_vm_object(control);
4990 
4991 	if (object == VM_OBJECT_NULL) {
4992 		return KERN_INVALID_OBJECT;
4993 	}
4994 
4995 	if (object == kernel_object) {
4996 		printf("Warning: Attempt to map kernel object"
4997 		    " by a non-private kernel entity\n");
4998 		return KERN_INVALID_OBJECT;
4999 	}
5000 
5001 	vm_object_lock(object);
	/*
	 * Take a reference on the object while holding its lock
	 * (direct ref_count bump rather than vm_object_reference()).
	 */
5002 	object->ref_count++;
5003 
5004 	/*
5005 	 * For "named" VM objects, let the pager know that the
5006 	 * memory object is being mapped.  Some pagers need to keep
5007 	 * track of this, to know when they can reclaim the memory
5008 	 * object, for example.
5009 	 * VM calls memory_object_map() for each mapping (specifying
5010 	 * the protection of each mapping) and calls
5011 	 * memory_object_last_unmap() when all the mappings are gone.
5012 	 */
5013 	pager_prot = max_protection;
5014 	if (copy) {
		/* copy-on-write mapping: the memory object won't be modified */
5015 		pager_prot &= ~VM_PROT_WRITE;
5016 	}
5017 	pager = object->pager;
5018 	if (object->named &&
5019 	    pager != MEMORY_OBJECT_NULL &&
5020 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
5021 		assert(object->pager_ready);
		/* serialize with other mappers, then drop the lock to call out */
5022 		vm_object_mapping_wait(object, THREAD_UNINT);
5023 		vm_object_mapping_begin(object);
5024 		vm_object_unlock(object);
5025 
5026 		kr = memory_object_map(pager, pager_prot);
5027 		assert(kr == KERN_SUCCESS);
5028 
5029 		vm_object_lock(object);
5030 		vm_object_mapping_end(object);
5031 	}
5032 	vm_object_unlock(object);
5033 
5034 	/*
5035 	 *	Perform the copy if requested
5036 	 */
5037 
5038 	if (copy) {
5039 		vm_object_t             new_object;
5040 		vm_object_offset_t      new_offset;
5041 
5042 		result = vm_object_copy_strategically(object, offset, size,
5043 		    &new_object, &new_offset,
5044 		    &copy);
5045 
5046 
5047 		if (result == KERN_MEMORY_RESTART_COPY) {
5048 			boolean_t success;
5049 			boolean_t src_needs_copy;
5050 
5051 			/*
5052 			 * XXX
5053 			 * We currently ignore src_needs_copy.
5054 			 * This really is the issue of how to make
5055 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
5056 			 * non-kernel users to use. Solution forthcoming.
5057 			 * In the meantime, since we don't allow non-kernel
5058 			 * memory managers to specify symmetric copy,
5059 			 * we won't run into problems here.
5060 			 */
5061 			new_object = object;
5062 			new_offset = offset;
5063 			success = vm_object_copy_quickly(new_object,
5064 			    new_offset, size,
5065 			    &src_needs_copy,
5066 			    &copy);
5067 			assert(success);
5068 			result = KERN_SUCCESS;
5069 		}
5070 		/*
5071 		 *	Throw away the reference to the
5072 		 *	original object, as it won't be mapped.
5073 		 */
5074 
5075 		vm_object_deallocate(object);
5076 
5077 		if (result != KERN_SUCCESS) {
5078 			return result;
5079 		}
5080 
		/* from here on, map the copy instead of the original */
5081 		object = new_object;
5082 		offset = new_offset;
5083 	}
5084 
	/*
	 * Enter the (possibly copied) object into the target map,
	 * via the 4K-page path on arm64 when requested.
	 */
5085 #if __arm64__
5086 	if (fourk) {
5087 		result = vm_map_enter_fourk(target_map,
5088 		    &map_addr,
5089 		    map_size,
5090 		    (vm_map_offset_t)mask,
5091 		    vmk_flags,
5092 		    object, offset,
5093 		    copy,
5094 		    cur_protection, max_protection,
5095 		    inheritance);
5096 	} else
5097 #endif /* __arm64__ */
5098 	{
5099 		result = vm_map_enter(target_map,
5100 		    &map_addr, map_size,
5101 		    (vm_map_offset_t)mask,
5102 		    vmk_flags,
5103 		    object, offset,
5104 		    copy,
5105 		    cur_protection, max_protection,
5106 		    inheritance);
5107 	}
5108 	if (result != KERN_SUCCESS) {
		/* drop the reference taken above since the mapping failed */
5109 		vm_object_deallocate(object);
5110 	}
5111 	*address = map_addr;
5112 
5113 	return result;
5114 }
5115 
5116 
5117 #if     VM_CPM
5118 
5119 #ifdef MACH_ASSERT
5120 extern pmap_paddr_t     avail_start, avail_end;
5121 #endif
5122 
5123 /*
5124  *	Allocate memory in the specified map, with the caveat that
5125  *	the memory is physically contiguous.  This call may fail
5126  *	if the system can't find sufficient contiguous memory.
5127  *	This call may cause or lead to heart-stopping amounts of
5128  *	paging activity.
5129  *
5130  *	Memory obtained from this call should be freed in the
5131  *	normal way, viz., via vm_deallocate.
5132  */
/*
 * vm_map_enter_cpm:
 *
 * Allocate physically contiguous, wired memory and map it into "map"
 * at "*addr" (or the map minimum when not VM_FLAGS_FIXED).  Pages are
 * inserted into a fresh internal object, entered into the pmap up
 * front so the range never faults, and zero-filled.
 */
5133 kern_return_t
vm_map_enter_cpm(vm_map_t map,vm_map_offset_t * addr,vm_map_size_t size,vm_map_kernel_flags_t vmk_flags)5134 vm_map_enter_cpm(
5135 	vm_map_t                map,
5136 	vm_map_offset_t        *addr,
5137 	vm_map_size_t           size,
5138 	vm_map_kernel_flags_t   vmk_flags)
5139 {
5140 	vm_object_t             cpm_obj;
5141 	pmap_t                  pmap;
5142 	vm_page_t               m, pages;
5143 	kern_return_t           kr;
5144 	vm_map_offset_t         va, start, end, offset;
5145 #if     MACH_ASSERT
5146 	vm_map_offset_t         prev_addr = 0;
5147 #endif  /* MACH_ASSERT */
5148 
5149 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
5150 		/* XXX TODO4K do we need to support this? */
5151 		*addr = 0;
5152 		return KERN_NOT_SUPPORTED;
5153 	}
5154 
5155 	if (size == 0) {
5156 		*addr = 0;
5157 		return KERN_SUCCESS;
5158 	}
5159 	if (vmk_flags.vmf_fixed) {
5160 		*addr = vm_map_trunc_page(*addr,
5161 		    VM_MAP_PAGE_MASK(map));
5162 	} else {
5163 		*addr = vm_map_min(map);
5164 	}
5165 	size = vm_map_round_page(size,
5166 	    VM_MAP_PAGE_MASK(map));
5167 
5168 	/*
5169 	 * LP64todo - cpm_allocate should probably allow
5170 	 * allocations of >4GB, but not with the current
5171 	 * algorithm, so just cast down the size for now.
5172 	 */
5173 	if (size > VM_MAX_ADDRESS) {
5174 		return KERN_RESOURCE_SHORTAGE;
5175 	}
	/*
	 * NOTE(review): "flags" is not declared anywhere in this function
	 * (the parameter is "vmk_flags"), so this VM_CPM-only code looks
	 * stale and would not compile as-is — presumably it should be
	 * derived from vmk_flags.  Verify when building with VM_CPM.
	 */
5176 	if ((kr = cpm_allocate(CAST_DOWN(vm_size_t, size),
5177 	    &pages, 0, 0, TRUE, flags)) != KERN_SUCCESS) {
5178 		return kr;
5179 	}
5180 
5181 	cpm_obj = vm_object_allocate((vm_object_size_t)size);
5182 	assert(cpm_obj != VM_OBJECT_NULL);
5183 	assert(cpm_obj->internal);
5184 	assert(cpm_obj->vo_size == (vm_object_size_t)size);
5185 	assert(cpm_obj->can_persist == FALSE);
5186 	assert(cpm_obj->pager_created == FALSE);
5187 	assert(cpm_obj->pageout == FALSE);
5188 	assert(cpm_obj->shadow == VM_OBJECT_NULL);
5189 
5190 	/*
5191 	 *	Insert pages into object.
5192 	 */
5193 
5194 	vm_object_lock(cpm_obj);
5195 	for (offset = 0; offset < size; offset += PAGE_SIZE) {
		/* detach the next page from the singly-linked allocation list */
5196 		m = pages;
5197 		pages = NEXT_PAGE(m);
5198 		*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
5199 
5200 		assert(!m->vmp_gobbled);
5201 		assert(!m->vmp_wanted);
5202 		assert(!m->vmp_pageout);
5203 		assert(!m->vmp_tabled);
5204 		assert(VM_PAGE_WIRED(m));
5205 		assert(m->vmp_busy);
5206 		assert(VM_PAGE_GET_PHYS_PAGE(m) >= (avail_start >> PAGE_SHIFT) && VM_PAGE_GET_PHYS_PAGE(m) <= (avail_end >> PAGE_SHIFT));
5207 
5208 		m->vmp_busy = FALSE;
5209 		vm_page_insert(m, cpm_obj, offset);
5210 	}
5211 	assert(cpm_obj->resident_page_count == size / PAGE_SIZE);
5212 	vm_object_unlock(cpm_obj);
5213 
5214 	/*
5215 	 *	Hang onto a reference on the object in case a
5216 	 *	multi-threaded application for some reason decides
5217 	 *	to deallocate the portion of the address space into
5218 	 *	which we will insert this object.
5219 	 *
5220 	 *	Unfortunately, we must insert the object now before
5221 	 *	we can talk to the pmap module about which addresses
5222 	 *	must be wired down.  Hence, the race with a multi-
5223 	 *	threaded app.
5224 	 */
5225 	vm_object_reference(cpm_obj);
5226 
5227 	/*
5228 	 *	Insert object into map.
5229 	 */
5230 
5231 	kr = vm_map_enter(
5232 		map,
5233 		addr,
5234 		size,
5235 		(vm_map_offset_t)0,
5236 		vmk_flags,
5237 		cpm_obj,
5238 		(vm_object_offset_t)0,
5239 		FALSE,
5240 		VM_PROT_ALL,
5241 		VM_PROT_ALL,
5242 		VM_INHERIT_DEFAULT);
5243 
5244 	if (kr != KERN_SUCCESS) {
5245 		/*
5246 		 *	A CPM object doesn't have can_persist set,
5247 		 *	so all we have to do is deallocate it to
5248 		 *	free up these pages.
5249 		 */
5250 		assert(cpm_obj->pager_created == FALSE);
5251 		assert(cpm_obj->can_persist == FALSE);
5252 		assert(cpm_obj->pageout == FALSE);
5253 		assert(cpm_obj->shadow == VM_OBJECT_NULL);
5254 		vm_object_deallocate(cpm_obj); /* kill acquired ref */
5255 		vm_object_deallocate(cpm_obj); /* kill creation ref */
	/*
	 * NOTE(review): no early return here — on failure control falls
	 * through to the pmap_pageable()/fault loops below, which keep
	 * using "cpm_obj" and "*addr" after both references were dropped.
	 * An early "return kr;" looks intended; verify against a VM_CPM
	 * build.
	 */
5256 	}
5257 
5258 	/*
5259 	 *	Inform the physical mapping system that the
5260 	 *	range of addresses may not fault, so that
5261 	 *	page tables and such can be locked down as well.
5262 	 */
5263 	start = *addr;
5264 	end = start + size;
5265 	pmap = vm_map_pmap(map);
5266 	pmap_pageable(pmap, start, end, FALSE);
5267 
5268 	/*
5269 	 *	Enter each page into the pmap, to avoid faults.
5270 	 *	Note that this loop could be coded more efficiently,
5271 	 *	if the need arose, rather than looking up each page
5272 	 *	again.
5273 	 */
5274 	for (offset = 0, va = start; offset < size;
5275 	    va += PAGE_SIZE, offset += PAGE_SIZE) {
5276 		int type_of_fault;
5277 
5278 		vm_object_lock(cpm_obj);
5279 		m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5280 		assert(m != VM_PAGE_NULL);
5281 
5282 		vm_page_zero_fill(m);
5283 
5284 		type_of_fault = DBG_ZERO_FILL_FAULT;
5285 
5286 		vm_fault_enter(m, pmap, va,
5287 		    PAGE_SIZE, 0,
5288 		    VM_PROT_ALL, VM_PROT_WRITE,
5289 		    VM_PAGE_WIRED(m),
5290 		    FALSE,                             /* change_wiring */
5291 		    VM_KERN_MEMORY_NONE,                             /* tag - not wiring */
5292 		    FALSE,                             /* cs_bypass */
5293 		    0,                                 /* user_tag */
5294 		    0,                             /* pmap_options */
5295 		    NULL,                              /* need_retry */
5296 		    &type_of_fault);
5297 
5298 		vm_object_unlock(cpm_obj);
5299 	}
5300 
5301 #if     MACH_ASSERT
5302 	/*
5303 	 *	Verify ordering in address space.
5304 	 */
5305 	for (offset = 0; offset < size; offset += PAGE_SIZE) {
5306 		vm_object_lock(cpm_obj);
5307 		m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5308 		vm_object_unlock(cpm_obj);
5309 		if (m == VM_PAGE_NULL) {
5310 			panic("vm_allocate_cpm:  obj %p off 0x%llx no page",
5311 			    cpm_obj, (uint64_t)offset);
5312 		}
5313 		assert(m->vmp_tabled);
5314 		assert(!m->vmp_busy);
5315 		assert(!m->vmp_wanted);
5316 		assert(!m->vmp_fictitious);
5317 		assert(!m->vmp_private);
5318 		assert(!m->vmp_absent);
5319 		assert(!m->vmp_cleaning);
5320 		assert(!m->vmp_laundry);
5321 		assert(!m->vmp_precious);
5322 		assert(!m->vmp_clustered);
5323 		if (offset != 0) {
			/* physical pages must be consecutive: contiguity check */
5324 			if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
5325 				printf("start 0x%llx end 0x%llx va 0x%llx\n",
5326 				    (uint64_t)start, (uint64_t)end, (uint64_t)va);
5327 				printf("obj %p off 0x%llx\n", cpm_obj, (uint64_t)offset);
5328 				printf("m %p prev_address 0x%llx\n", m, (uint64_t)prev_addr);
5329 				panic("vm_allocate_cpm:  pages not contig!");
5330 			}
5331 		}
5332 		prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
5333 	}
5334 #endif  /* MACH_ASSERT */
5335 
5336 	vm_object_deallocate(cpm_obj); /* kill extra ref */
5337 
5338 	return kr;
5339 }
5340 
5341 
5342 #else   /* VM_CPM */
5343 
5344 /*
5345  *	Interface is defined in all cases, but unless the kernel
5346  *	is built explicitly for this option, the interface does
5347  *	nothing.
5348  */
5349 
5350 kern_return_t
vm_map_enter_cpm(__unused vm_map_t map,__unused vm_map_offset_t * addr,__unused vm_map_size_t size,__unused vm_map_kernel_flags_t vmk_flags)5351 vm_map_enter_cpm(
5352 	__unused vm_map_t                map,
5353 	__unused vm_map_offset_t        *addr,
5354 	__unused vm_map_size_t           size,
5355 	__unused vm_map_kernel_flags_t   vmk_flags)
5356 {
5357 	return KERN_FAILURE;
5358 }
5359 #endif /* VM_CPM */
5360 
5361 /* Not used without nested pmaps */
5362 #ifndef NO_NESTED_PMAP
5363 /*
5364  * Clip and unnest a portion of a nested submap mapping.
5365  */
5366 
5367 
/*
 * vm_map_clip_unnest:
 *
 * Un-nest the [start_unnest, end_unnest) portion of a nested submap
 * mapping: clip "entry" down to that range, remove the shared (nested)
 * pmap translations for it, and mark the entry as no longer using the
 * submap's pmap.  NOTE(review): callers are assumed to hold the map
 * lock for writing — confirm at call sites.
 */
5368 static void
vm_map_clip_unnest(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t start_unnest,vm_map_offset_t end_unnest)5369 vm_map_clip_unnest(
5370 	vm_map_t        map,
5371 	vm_map_entry_t  entry,
5372 	vm_map_offset_t start_unnest,
5373 	vm_map_offset_t end_unnest)
5374 {
	/* keep the caller's original range for diagnostics below */
5375 	vm_map_offset_t old_start_unnest = start_unnest;
5376 	vm_map_offset_t old_end_unnest = end_unnest;
5377 
5378 	assert(entry->is_sub_map);
5379 	assert(VME_SUBMAP(entry) != NULL);
5380 	assert(entry->use_pmap);
5381 
5382 	/*
5383 	 * Query the platform for the optimal unnest range.
5384 	 * DRK: There's some duplication of effort here, since
5385 	 * callers may have adjusted the range to some extent. This
5386 	 * routine was introduced to support 1GiB subtree nesting
5387 	 * for x86 platforms, which can also nest on 2MiB boundaries
5388 	 * depending on size/alignment.
5389 	 */
5390 	if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
5391 		assert(VME_SUBMAP(entry)->is_nested_map);
5392 		assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
5393 		log_unnest_badness(map,
5394 		    old_start_unnest,
5395 		    old_end_unnest,
5396 		    VME_SUBMAP(entry)->is_nested_map,
5397 		    (entry->vme_start +
5398 		    VME_SUBMAP(entry)->lowest_unnestable_start -
5399 		    VME_OFFSET(entry)));
5400 	}
5401 
	/* the (possibly adjusted) unnest range must lie within the entry */
5402 	if (entry->vme_start > start_unnest ||
5403 	    entry->vme_end < end_unnest) {
5404 		panic("vm_map_clip_unnest(0x%llx,0x%llx): "
5405 		    "bad nested entry: start=0x%llx end=0x%llx\n",
5406 		    (long long)start_unnest, (long long)end_unnest,
5407 		    (long long)entry->vme_start, (long long)entry->vme_end);
5408 	}
5409 
	/* clip the entry down to exactly the unnest range */
5410 	if (start_unnest > entry->vme_start) {
5411 		_vm_map_clip_start(&map->hdr,
5412 		    entry,
5413 		    start_unnest);
5414 		if (map->holelistenabled) {
5415 			vm_map_store_update_first_free(map, NULL, FALSE);
5416 		} else {
5417 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5418 		}
5419 	}
5420 	if (entry->vme_end > end_unnest) {
5421 		_vm_map_clip_end(&map->hdr,
5422 		    entry,
5423 		    end_unnest);
5424 		if (map->holelistenabled) {
5425 			vm_map_store_update_first_free(map, NULL, FALSE);
5426 		} else {
5427 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5428 		}
5429 	}
5430 
	/* detach the shared page-table translations for this range */
5431 	pmap_unnest(map->pmap,
5432 	    entry->vme_start,
5433 	    entry->vme_end - entry->vme_start);
5434 	if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
5435 		/* clean up parent map/maps */
5436 		vm_map_submap_pmap_clean(
5437 			map, entry->vme_start,
5438 			entry->vme_end,
5439 			VME_SUBMAP(entry),
5440 			VME_OFFSET(entry));
5441 	}
	/* entry no longer shares the submap's pmap */
5442 	entry->use_pmap = FALSE;
5443 	if ((map->pmap != kernel_pmap) &&
5444 	    (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
5445 		VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
5446 	}
5447 }
5448 #endif  /* NO_NESTED_PMAP */
5449 
/*
 * Abort-path helper: report a fatal attempt to clip (split) a VM map
 * entry marked atomic (vme_atomic), which must never be subdivided.
 * Does not return.
 */
5450 __abortlike
5451 static void
__vm_map_clip_atomic_entry_panic(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t where)5452 __vm_map_clip_atomic_entry_panic(
5453 	vm_map_t        map,
5454 	vm_map_entry_t  entry,
5455 	vm_map_offset_t where)
5456 {
5457 	panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
5458 	    "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
5459 	    (uint64_t)entry->vme_start,
5460 	    (uint64_t)entry->vme_end,
5461 	    (uint64_t)where);
5462 }
5463 
5464 /*
5465  *	vm_map_clip_start:	[ internal use only ]
5466  *
5467  *	Asserts that the given entry begins at or after
5468  *	the specified address; if necessary,
5469  *	it splits the entry into two.
5470  */
5471 void
vm_map_clip_start(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t startaddr)5472 vm_map_clip_start(
5473 	vm_map_t        map,
5474 	vm_map_entry_t  entry,
5475 	vm_map_offset_t startaddr)
5476 {
5477 #ifndef NO_NESTED_PMAP
5478 	if (entry->is_sub_map &&
5479 	    entry->use_pmap &&
5480 	    startaddr >= entry->vme_start) {
5481 		vm_map_offset_t start_unnest, end_unnest;
5482 
5483 		/*
5484 		 * Make sure "startaddr" is no longer in a nested range
5485 		 * before we clip.  Unnest only the minimum range the platform
5486 		 * can handle.
5487 		 * vm_map_clip_unnest may perform additional adjustments to
5488 		 * the unnest range.
5489 		 */
		/* round down/up to the platform's minimum shared-region granule */
5490 		start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
5491 		end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
5492 		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5493 	}
5494 #endif /* NO_NESTED_PMAP */
5495 	if (startaddr > entry->vme_start) {
5496 		if (!entry->is_sub_map &&
5497 		    VME_OBJECT(entry) &&
5498 		    VME_OBJECT(entry)->phys_contiguous) {
			/*
			 * Physically contiguous objects: drop the pmap
			 * mappings for the entire entry before splitting it.
			 * NOTE(review): presumably re-established on fault —
			 * confirm.
			 */
5499 			pmap_remove(map->pmap,
5500 			    (addr64_t)(entry->vme_start),
5501 			    (addr64_t)(entry->vme_end));
5502 		}
5503 		if (entry->vme_atomic) {
			/* atomic entries must never be split: fatal */
5504 			__vm_map_clip_atomic_entry_panic(map, entry, startaddr);
5505 		}
5506 
5507 		DTRACE_VM5(
5508 			vm_map_clip_start,
5509 			vm_map_t, map,
5510 			vm_map_offset_t, entry->vme_start,
5511 			vm_map_offset_t, entry->vme_end,
5512 			vm_map_offset_t, startaddr,
5513 			int, VME_ALIAS(entry));
5514 
5515 		_vm_map_clip_start(&map->hdr, entry, startaddr);
		/* keep the map's first-free hint / hole list coherent */
5516 		if (map->holelistenabled) {
5517 			vm_map_store_update_first_free(map, NULL, FALSE);
5518 		} else {
5519 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5520 		}
5521 	}
5522 }
5523 
5524 
/*
 * vm_map_copy_clip_start: clip "entry" in a vm_map_copy's entry list so
 * it starts at "startaddr"; no-op when it already starts at or after it.
 */
5525 #define vm_map_copy_clip_start(copy, entry, startaddr) \
5526 	MACRO_BEGIN \
5527 	if ((startaddr) > (entry)->vme_start) \
5528 	        _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
5529 	MACRO_END
5530 
5531 /*
5532  *	This routine is called only when it is known that
5533  *	the entry must be split.
5534  */
5535 static void
_vm_map_clip_start(struct vm_map_header * map_header,vm_map_entry_t entry,vm_map_offset_t start)5536 _vm_map_clip_start(
5537 	struct vm_map_header    *map_header,
5538 	vm_map_entry_t          entry,
5539 	vm_map_offset_t         start)
5540 {
5541 	vm_map_entry_t  new_entry;
5542 
5543 	/*
5544 	 *	Split off the front portion --
5545 	 *	note that we must insert the new
5546 	 *	entry BEFORE this one, so that
5547 	 *	this entry has the specified starting
5548 	 *	address.
5549 	 */
5550 
5551 	if (entry->map_aligned) {
		/* split points must respect the map's page alignment */
5552 		assert(VM_MAP_PAGE_ALIGNED(start,
5553 		    VM_MAP_HDR_PAGE_MASK(map_header)));
5554 	}
5555 
	/* clone the entry; the clone becomes the [vme_start, start) front */
5556 	new_entry = _vm_map_entry_create(map_header);
5557 	vm_map_entry_copy_full(new_entry, entry);
5558 
5559 	new_entry->vme_end = start;
5560 	assert(new_entry->vme_start < new_entry->vme_end);
	/* advance the original's object offset to match its new start */
5561 	VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
5562 	assert(start < entry->vme_end);
5563 	entry->vme_start = start;
5564 
5565 	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);
5566 
	/* both entries now reference the same submap/object: add a ref */
5567 	if (entry->is_sub_map) {
5568 		vm_map_reference(VME_SUBMAP(new_entry));
5569 	} else {
5570 		vm_object_reference(VME_OBJECT(new_entry));
5571 	}
5572 }
5573 
5574 
5575 /*
5576  *	vm_map_clip_end:	[ internal use only ]
5577  *
5578  *	Asserts that the given entry ends at or before
5579  *	the specified address; if necessary,
5580  *	it splits the entry into two.
5581  */
5582 void
vm_map_clip_end(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t endaddr)5583 vm_map_clip_end(
5584 	vm_map_t        map,
5585 	vm_map_entry_t  entry,
5586 	vm_map_offset_t endaddr)
5587 {
5588 	if (endaddr > entry->vme_end) {
5589 		/*
5590 		 * Within the scope of this clipping, limit "endaddr" to
5591 		 * the end of this map entry...
5592 		 */
5593 		endaddr = entry->vme_end;
5594 	}
5595 #ifndef NO_NESTED_PMAP
5596 	if (entry->is_sub_map && entry->use_pmap) {
5597 		vm_map_offset_t start_unnest, end_unnest;
5598 
5599 		/*
5600 		 * Make sure the range between the start of this entry and
5601 		 * the new "endaddr" is no longer nested before we clip.
5602 		 * Unnest only the minimum range the platform can handle.
5603 		 * vm_map_clip_unnest may perform additional adjustments to
5604 		 * the unnest range.
5605 		 */
5606 		start_unnest = entry->vme_start;
		/* round endaddr up to the platform's shared-region granule */
5607 		end_unnest =
5608 		    (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
5609 		    ~(pmap_shared_region_size_min(map->pmap) - 1);
5610 		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5611 	}
5612 #endif /* NO_NESTED_PMAP */
5613 	if (endaddr < entry->vme_end) {
5614 		if (!entry->is_sub_map &&
5615 		    VME_OBJECT(entry) &&
5616 		    VME_OBJECT(entry)->phys_contiguous) {
			/*
			 * Physically contiguous objects: drop pmap mappings
			 * for the whole entry before splitting it (mirrors
			 * vm_map_clip_start()).
			 */
5617 			pmap_remove(map->pmap,
5618 			    (addr64_t)(entry->vme_start),
5619 			    (addr64_t)(entry->vme_end));
5620 		}
5621 		if (entry->vme_atomic) {
			/* atomic entries must never be split: fatal */
5622 			__vm_map_clip_atomic_entry_panic(map, entry, endaddr);
5623 		}
5624 		DTRACE_VM5(
5625 			vm_map_clip_end,
5626 			vm_map_t, map,
5627 			vm_map_offset_t, entry->vme_start,
5628 			vm_map_offset_t, entry->vme_end,
5629 			vm_map_offset_t, endaddr,
5630 			int, VME_ALIAS(entry));
5631 
5632 		_vm_map_clip_end(&map->hdr, entry, endaddr);
		/* keep the map's first-free hint / hole list coherent */
5633 		if (map->holelistenabled) {
5634 			vm_map_store_update_first_free(map, NULL, FALSE);
5635 		} else {
5636 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5637 		}
5638 	}
5639 }
5640 
5641 
/*
 * vm_map_copy_clip_end: clip "entry" in a vm_map_copy's entry list so
 * it ends at "endaddr"; no-op when it already ends at or before it.
 */
5642 #define vm_map_copy_clip_end(copy, entry, endaddr) \
5643 	MACRO_BEGIN \
5644 	if ((endaddr) < (entry)->vme_end) \
5645 	        _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
5646 	MACRO_END
5647 
5648 /*
5649  *	This routine is called only when it is known that
5650  *	the entry must be split.
5651  */
5652 static void
_vm_map_clip_end(struct vm_map_header * map_header,vm_map_entry_t entry,vm_map_offset_t end)5653 _vm_map_clip_end(
5654 	struct vm_map_header    *map_header,
5655 	vm_map_entry_t          entry,
5656 	vm_map_offset_t         end)
5657 {
5658 	vm_map_entry_t  new_entry;
5659 
5660 	/*
5661 	 *	Create a new entry and insert it
5662 	 *	AFTER the specified entry
5663 	 */
5664 
5665 	if (entry->map_aligned) {
		/* split points must respect the map's page alignment */
5666 		assert(VM_MAP_PAGE_ALIGNED(end,
5667 		    VM_MAP_HDR_PAGE_MASK(map_header)));
5668 	}
5669 
	/* clone the entry; the clone becomes the [end, vme_end) tail */
5670 	new_entry = _vm_map_entry_create(map_header);
5671 	vm_map_entry_copy_full(new_entry, entry);
5672 
5673 	assert(entry->vme_start < end);
5674 	new_entry->vme_start = entry->vme_end = end;
	/* advance the tail's object offset past the retained front portion */
5675 	VME_OFFSET_SET(new_entry,
5676 	    VME_OFFSET(new_entry) + (end - entry->vme_start));
5677 	assert(new_entry->vme_start < new_entry->vme_end);
5678 
5679 	_vm_map_store_entry_link(map_header, entry, new_entry);
5680 
	/* both entries now reference the same submap/object: add a ref */
5681 	if (entry->is_sub_map) {
5682 		vm_map_reference(VME_SUBMAP(new_entry));
5683 	} else {
5684 		vm_object_reference(VME_OBJECT(new_entry));
5685 	}
5686 }
5687 
5688 
5689 /*
5690  *	VM_MAP_RANGE_CHECK:	[ internal use only ]
5691  *
5692  *	Asserts that the starting and ending region
5693  *	addresses fall within the valid range of the map.
5694  */
/*
 * NOTE: this macro clamps "start" and "end" IN PLACE: both arguments
 * must be modifiable lvalues, and callers observe the adjusted values.
 * An inverted range collapses to the empty range [end, end).
 */
#define VM_MAP_RANGE_CHECK(map, start, end)     \
	MACRO_BEGIN                             \
	if (start < vm_map_min(map))            \
	        start = vm_map_min(map);        \
	if (end > vm_map_max(map))              \
	        end = vm_map_max(map);          \
	if (start > end)                        \
	        start = end;                    \
	MACRO_END
5704 
5705 /*
5706  *	vm_map_range_check:	[ internal use only ]
5707  *
5708  *	Check that the region defined by the specified start and
5709  *	end addresses are wholly contained within a single map
 *	entry or set of adjacent map entries of the specified map,
5711  *	i.e. the specified region contains no unmapped space.
5712  *	If any or all of the region is unmapped, FALSE is returned.
5713  *	Otherwise, TRUE is returned and if the output argument 'entry'
5714  *	is not NULL it points to the map entry containing the start
5715  *	of the region.
5716  *
5717  *	The map is locked for reading on entry and is left locked.
5718  */
5719 static boolean_t
vm_map_range_check(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_entry_t * entry)5720 vm_map_range_check(
5721 	vm_map_t                map,
5722 	vm_map_offset_t         start,
5723 	vm_map_offset_t         end,
5724 	vm_map_entry_t          *entry)
5725 {
5726 	vm_map_entry_t          cur;
5727 	vm_map_offset_t         prev;
5728 
5729 	/*
5730 	 *      Basic sanity checks first
5731 	 */
5732 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5733 		return FALSE;
5734 	}
5735 
5736 	/*
5737 	 *      Check first if the region starts within a valid
5738 	 *	mapping for the map.
5739 	 */
5740 	if (!vm_map_lookup_entry(map, start, &cur)) {
5741 		return FALSE;
5742 	}
5743 
5744 	/*
5745 	 *	Optimize for the case that the region is contained
5746 	 *	in a single map entry.
5747 	 */
5748 	if (entry != (vm_map_entry_t *) NULL) {
5749 		*entry = cur;
5750 	}
5751 	if (end <= cur->vme_end) {
5752 		return TRUE;
5753 	}
5754 
5755 	/*
5756 	 *      If the region is not wholly contained within a
5757 	 *      single entry, walk the entries looking for holes.
5758 	 */
5759 	prev = cur->vme_end;
5760 	cur = cur->vme_next;
5761 	while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5762 		if (end <= cur->vme_end) {
5763 			return TRUE;
5764 		}
5765 		prev = cur->vme_end;
5766 		cur = cur->vme_next;
5767 	}
5768 	return FALSE;
5769 }
5770 
5771 /*
5772  *	vm_map_protect:
5773  *
5774  *	Sets the protection of the specified address
5775  *	region in the target map.  If "set_max" is
5776  *	specified, the maximum protection is to be set;
5777  *	otherwise, only the current protection is affected.
5778  */
kern_return_t
vm_map_protect(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_prot_t       new_prot,
	boolean_t       set_max)
{
	vm_map_entry_t                  current;
	vm_map_offset_t                 prev;
	vm_map_entry_t                  entry;
	vm_prot_t                       new_max;
	int                             pmap_options = 0;
	kern_return_t                   kr;

	/*
	 * VM_PROT_COPY: first make a private (copy-on-write) copy of the
	 * range by remapping it onto itself, then apply the remaining
	 * protection bits to that copy below.
	 */
	if (new_prot & VM_PROT_COPY) {
		vm_map_offset_t         new_start;
		vm_prot_t               cur_prot, max_prot;
		vm_map_kernel_flags_t   kflags;

		/* LP64todo - see below */
		if (start >= map->max_offset) {
			return KERN_INVALID_ADDRESS;
		}

		/* W^X: refuse write+execute outright when policy says fail */
		if ((new_prot & VM_PROT_ALLEXEC) &&
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if XNU_TARGET_OS_OSX && __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* XNU_TARGET_OS_OSX && __arm64__ */
		    ) &&
		    VM_MAP_POLICY_WX_FAIL(map)) {
			DTRACE_VM3(cs_wx,
			    uint64_t, (uint64_t) start,
			    uint64_t, (uint64_t) end,
			    vm_prot_t, new_prot);
			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__, __LINE__,
#if DEVELOPMENT || DEBUG
			    (uint64_t)start,
			    (uint64_t)end,
#else /* DEVELOPMENT || DEBUG */
			    (uint64_t)0,
			    (uint64_t)0,
#endif /* DEVELOPMENT || DEBUG */
			    new_prot);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * Let vm_map_remap_extract() know that it will need to:
		 * + make a copy of the mapping
		 * + add VM_PROT_WRITE to the max protections
		 * + remove any protections that are no longer allowed from the
		 *   max protections (to avoid any WRITE/EXECUTE conflict, for
		 *   example).
		 * Note that "max_prot" is an IN/OUT parameter only for this
		 * specific (VM_PROT_COPY) case.  It's usually an OUT parameter
		 * only.
		 */
		max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
		cur_prot = VM_PROT_NONE;
		kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
		kflags.vmkf_remap_prot_copy = true;
		new_start = start;
		kr = vm_map_remap(map,
		    &new_start,
		    end - start,
		    0, /* mask */
		    kflags,
		    map,
		    start,
		    TRUE, /* copy-on-write remapping! */
		    &cur_prot, /* IN/OUT */
		    &max_prot, /* IN/OUT */
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		new_prot &= ~VM_PROT_COPY;
	}

	vm_map_lock(map);

	/* LP64todo - remove this check when vm_map_commpage64()
	 * no longer has to stuff in a map_entry for the commpage
	 * above the map's max_offset.
	 */
	if (start >= map->max_offset) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	while (1) {
		/*
		 *      Lookup the entry.  If it doesn't start in a valid
		 *	entry, return an error.
		 */
		if (!vm_map_lookup_entry(map, start, &entry)) {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
			start = SUPERPAGE_ROUND_DOWN(start);
			continue;
		}
		break;
	}
	/* superpage entries can only be handled whole */
	if (entry->superpage_size) {
		end = SUPERPAGE_ROUND_UP(end);
	}

	/*
	 *	Make a first pass to check for protection and address
	 *	violations.
	 */

	current = entry;
	prev = current->vme_start;
	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start < end)) {
		/*
		 * If there is a hole, return an error.
		 */
		if (current->vme_start != prev) {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		new_max = current->max_protection;

#if defined(__x86_64__)
		/* Allow max mask to include execute prot bits if this map doesn't enforce CS */
		if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
			new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
		}
#elif CODE_SIGNING_MONITOR
		if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
			new_max |= VM_PROT_EXECUTE;
		}
#endif
		/* the requested bits must all fit within the (adjusted) max */
		if ((new_prot & new_max) != new_prot) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}

		if (current->used_for_jit &&
		    pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}

#if __arm64e__
		/* Disallow remapping hw assisted TPRO mappings */
		if (current->used_for_tpro) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}
#endif /* __arm64e__ */


		/* write+execute request on a non-JIT entry: log, then strip exec */
		if ((new_prot & VM_PROT_WRITE) &&
		    (new_prot & VM_PROT_ALLEXEC) &&
#if XNU_TARGET_OS_OSX
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
		    ) &&
#endif /* XNU_TARGET_OS_OSX */
#if CODE_SIGNING_MONITOR
		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
#endif
		    !(current->used_for_jit)) {
			DTRACE_VM3(cs_wx,
			    uint64_t, (uint64_t) current->vme_start,
			    uint64_t, (uint64_t) current->vme_end,
			    vm_prot_t, new_prot);
			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__, __LINE__,
#if DEVELOPMENT || DEBUG
			    (uint64_t)current->vme_start,
			    (uint64_t)current->vme_end,
#else /* DEVELOPMENT || DEBUG */
			    (uint64_t)0,
			    (uint64_t)0,
#endif /* DEVELOPMENT || DEBUG */
			    new_prot);
			/* drop the execute bits rather than granting W+X... */
			new_prot &= ~VM_PROT_ALLEXEC;
			/* ...unless policy demands outright failure */
			if (VM_MAP_POLICY_WX_FAIL(map)) {
				vm_map_unlock(map);
				return KERN_PROTECTION_FAILURE;
			}
		}

		/*
		 * If the task has requested executable lockdown,
		 * deny both:
		 * - adding executable protections OR
		 * - adding write protections to an existing executable mapping.
		 */
		if (map->map_disallow_new_exec == TRUE) {
			if ((new_prot & VM_PROT_ALLEXEC) ||
			    ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
				vm_map_unlock(map);
				return KERN_PROTECTION_FAILURE;
			}
		}

		prev = current->vme_end;
		current = current->vme_next;
	}

#if __arm64__
	if (end > prev &&
	    end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
		vm_map_entry_t prev_entry;

		prev_entry = current->vme_prev;
		if (prev_entry != vm_map_to_entry(map) &&
		    !prev_entry->map_aligned &&
		    (vm_map_round_page(prev_entry->vme_end,
		    VM_MAP_PAGE_MASK(map))
		    == end)) {
			/*
			 * The last entry in our range is not "map-aligned"
			 * but it would have reached all the way to "end"
			 * if it had been map-aligned, so this is not really
			 * a hole in the range and we can proceed.
			 */
			prev = end;
		}
	}
#endif /* __arm64__ */

	/* the walk above must have covered the entire [start, end) range */
	if (end > prev) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 *	Go back and fix up protections.
	 *	Clip to start here if the range starts within
	 *	the entry.
	 */

	current = entry;
	if (current != vm_map_to_entry(map)) {
		/* clip and unnest if necessary */
		vm_map_clip_start(map, current, start);
	}

	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start < end)) {
		vm_prot_t       old_prot;

		vm_map_clip_end(map, current, end);

#if DEVELOPMENT || DEBUG
		if (current->csm_associated && vm_log_xnu_user_debug) {
			printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__,
			    (uint64_t)start,
			    (uint64_t)end,
			    new_prot,
			    map, current,
			    current->vme_start,
			    current->vme_end,
			    current->protection,
			    current->max_protection);
		}
#endif /* DEVELOPMENT || DEBUG */

		if (current->is_sub_map) {
			/* clipping did unnest if needed */
			assert(!current->use_pmap);
		}

		old_prot = current->protection;

		if (set_max) {
			current->max_protection = new_prot;
			/* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
			current->protection = (new_prot & old_prot);
		} else {
			current->protection = new_prot;
		}

#if CODE_SIGNING_MONITOR
		if (!current->vme_xnu_user_debug &&
		    /* a !csm_associated mapping becoming executable */
		    ((!current->csm_associated &&
		    !(old_prot & VM_PROT_EXECUTE) &&
		    (current->protection & VM_PROT_EXECUTE))
		    ||
		    /* a csm_associated mapping becoming writable */
		    (current->csm_associated &&
		    !(old_prot & VM_PROT_WRITE) &&
		    (current->protection & VM_PROT_WRITE)))) {
			/*
			 * This mapping has not already been marked as
			 * "user_debug" and it is either:
			 * 1. not code-signing-monitored and becoming executable
			 * 2. code-signing-monitored and becoming writable,
			 * so inform the CodeSigningMonitor and mark the
			 * mapping as "user_debug" if appropriate.
			 */
			vm_map_kernel_flags_t vmk_flags;
			vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
			/* pretend it's a vm_protect(VM_PROT_COPY)... */
			vmk_flags.vmkf_remap_prot_copy = true;
			kr = vm_map_entry_cs_associate(map, current, vmk_flags);
#if DEVELOPMENT || DEBUG
			if (vm_log_xnu_user_debug) {
				printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
				    proc_selfpid(),
				    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
				    __FUNCTION__, __LINE__,
				    map, current,
				    current->vme_start, current->vme_end,
				    old_prot, current->protection,
				    kr, current->vme_xnu_user_debug);
			}
#endif /* DEVELOPMENT || DEBUG */
		}
#endif /* CODE_SIGNING_MONITOR */

		/*
		 *	Update physical map if necessary.
		 *	If the request is to turn off write protection,
		 *	we won't do it for real (in pmap). This is because
		 *	it would cause copy-on-write to fail.  We've already
		 *	set the new protection in the map, so if a
		 *	write-protect fault occurred, it will be fixed up
		 *	properly, COW or not.
		 */
		if (current->protection != old_prot) {
			/* Look one level in: we support nested pmaps */
			/* from mapped submaps which are direct entries */
			/* in our map */

			vm_prot_t prot;

			prot = current->protection;
			if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
				/* keep write off in pmap so COW faults still work */
				prot &= ~VM_PROT_WRITE;
			} else {
				assert(!VME_OBJECT(current)->code_signed);
				assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
				if (prot & VM_PROT_WRITE) {
					/*
					 * For write requests on the
					 * compressor, we will ask the
					 * pmap layer to prevent us from
					 * taking a write fault when we
					 * attempt to access the mapping
					 * next.
					 */
					pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
				}
			}

			if (override_nx(map, VME_ALIAS(current)) && prot) {
				prot |= VM_PROT_EXECUTE;
			}

#if DEVELOPMENT || DEBUG
			if (!(old_prot & VM_PROT_EXECUTE) &&
			    (prot & VM_PROT_EXECUTE) &&
			    panic_on_unsigned_execute &&
			    (proc_selfcsflags() & CS_KILL)) {
				panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
			}
#endif /* DEVELOPMENT || DEBUG */

			if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
				if (current->wired_count) {
					panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
					    map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
				}

				/* If the pmap layer cares about this
				 * protection type, force a fault for
				 * each page so that vm_fault will
				 * repopulate the page with the full
				 * set of protections.
				 */
				/*
				 * TODO: We don't seem to need this,
				 * but this is due to an internal
				 * implementation detail of
				 * pmap_protect.  Do we want to rely
				 * on this?
				 */
				prot = VM_PROT_NONE;
			}

			if (current->is_sub_map && current->use_pmap) {
				pmap_protect(VME_SUBMAP(current)->pmap,
				    current->vme_start,
				    current->vme_end,
				    prot);
			} else {
				pmap_protect_options(map->pmap,
				    current->vme_start,
				    current->vme_end,
				    prot,
				    pmap_options,
				    NULL);
			}
		}
		current = current->vme_next;
	}

	/* final pass: re-coalesce entries that are now identical again */
	current = entry;
	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start <= end)) {
		vm_map_simplify_entry(map, current);
		current = current->vme_next;
	}

	vm_map_unlock(map);
	return KERN_SUCCESS;
}
6218 
6219 /*
6220  *	vm_map_inherit:
6221  *
6222  *	Sets the inheritance of the specified address
6223  *	range in the target map.  Inheritance
6224  *	affects how the map will be shared with
6225  *	child maps at the time of vm_map_fork.
6226  */
6227 kern_return_t
vm_map_inherit(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_inherit_t new_inheritance)6228 vm_map_inherit(
6229 	vm_map_t        map,
6230 	vm_map_offset_t start,
6231 	vm_map_offset_t end,
6232 	vm_inherit_t    new_inheritance)
6233 {
6234 	vm_map_entry_t  entry;
6235 	vm_map_entry_t  temp_entry;
6236 
6237 	vm_map_lock(map);
6238 
6239 	VM_MAP_RANGE_CHECK(map, start, end);
6240 
6241 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
6242 		entry = temp_entry;
6243 	} else {
6244 		temp_entry = temp_entry->vme_next;
6245 		entry = temp_entry;
6246 	}
6247 
6248 	/* first check entire range for submaps which can't support the */
6249 	/* given inheritance. */
6250 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6251 		if (entry->is_sub_map) {
6252 			if (new_inheritance == VM_INHERIT_COPY) {
6253 				vm_map_unlock(map);
6254 				return KERN_INVALID_ARGUMENT;
6255 			}
6256 		}
6257 
6258 		entry = entry->vme_next;
6259 	}
6260 
6261 	entry = temp_entry;
6262 	if (entry != vm_map_to_entry(map)) {
6263 		/* clip and unnest if necessary */
6264 		vm_map_clip_start(map, entry, start);
6265 	}
6266 
6267 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6268 		vm_map_clip_end(map, entry, end);
6269 		if (entry->is_sub_map) {
6270 			/* clip did unnest if needed */
6271 			assert(!entry->use_pmap);
6272 		}
6273 
6274 		entry->inheritance = new_inheritance;
6275 
6276 		entry = entry->vme_next;
6277 	}
6278 
6279 	vm_map_unlock(map);
6280 	return KERN_SUCCESS;
6281 }
6282 
6283 /*
6284  * Update the accounting for the amount of wired memory in this map.  If the user has
6285  * exceeded the defined limits, then we fail.  Wiring on behalf of the kernel never fails.
6286  */
6287 
static kern_return_t
add_wire_counts(
	vm_map_t        map,
	vm_map_entry_t  entry,
	boolean_t       user_wire)
{
	vm_map_size_t   size;

	if (user_wire) {
		unsigned int total_wire_count =  vm_page_wire_count + vm_lopage_free_count;

		/*
		 * We're wiring memory at the request of the user.  Check if this is the first time the user is wiring
		 * this map entry.
		 */

		if (entry->user_wired_count == 0) {
			size = entry->vme_end - entry->vme_start;

			/*
			 * Since this is the first time the user is wiring this map entry, check to see if we're
			 * exceeding the user wire limits.  There is a per map limit which is the smaller of either
			 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value.  There is also
			 * a system-wide limit on the amount of memory all users can wire.  If the user is over either
			 * limit, then we fail.
			 */

			if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
			    size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
				/* distinguish which limit was hit, for telemetry */
				if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
#if DEVELOPMENT || DEBUG
					if (panic_on_mlock_failure) {
						panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
					}
#endif /* DEVELOPMENT || DEBUG */
					os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
				} else {
					os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
#if DEVELOPMENT || DEBUG
					if (panic_on_mlock_failure) {
						panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
					}
#endif /* DEVELOPMENT || DEBUG */
				}
				return KERN_RESOURCE_SHORTAGE;
			}

			/*
			 * The first time the user wires an entry, we also increment the wired_count and add this to
			 * the total that has been wired in the map.
			 */

			/* would overflow the wiring counter: fail, don't panic (user request) */
			if (entry->wired_count >= MAX_WIRE_COUNT) {
				return KERN_FAILURE;
			}

			entry->wired_count++;
			map->user_wire_size += size;
		}

		if (entry->user_wired_count >= MAX_WIRE_COUNT) {
			return KERN_FAILURE;
		}

		entry->user_wired_count++;
	} else {
		/*
		 * The kernel's wiring the memory.  Just bump the count and continue.
		 */

		/* kernel overflow is a programming error, hence the panic */
		if (entry->wired_count >= MAX_WIRE_COUNT) {
			panic("vm_map_wire: too many wirings");
		}

		entry->wired_count++;
	}

	return KERN_SUCCESS;
}
6367 
6368 /*
6369  * Update the memory wiring accounting now that the given map entry is being unwired.
6370  */
6371 
6372 static void
subtract_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6373 subtract_wire_counts(
6374 	vm_map_t        map,
6375 	vm_map_entry_t  entry,
6376 	boolean_t       user_wire)
6377 {
6378 	if (user_wire) {
6379 		/*
6380 		 * We're unwiring memory at the request of the user.  See if we're removing the last user wire reference.
6381 		 */
6382 
6383 		if (entry->user_wired_count == 1) {
6384 			/*
6385 			 * We're removing the last user wire reference.  Decrement the wired_count and the total
6386 			 * user wired memory for this map.
6387 			 */
6388 
6389 			assert(entry->wired_count >= 1);
6390 			entry->wired_count--;
6391 			map->user_wire_size -= entry->vme_end - entry->vme_start;
6392 		}
6393 
6394 		assert(entry->user_wired_count >= 1);
6395 		entry->user_wired_count--;
6396 	} else {
6397 		/*
6398 		 * The kernel is unwiring the memory.   Just update the count.
6399 		 */
6400 
6401 		assert(entry->wired_count >= 1);
6402 		entry->wired_count--;
6403 	}
6404 }
6405 
/* NOTE(review): presumably counts wire attempts on executable mappings for
 * code-signing diagnostics — confirm at the use sites in vm_map_wire_nested. */
int cs_executable_wire = 0;
6407 
6408 /*
6409  *	vm_map_wire:
6410  *
6411  *	Sets the pageability of the specified address range in the
6412  *	target map as wired.  Regions specified as not pageable require
6413  *	locked-down physical memory and physical page maps.  The
6414  *	access_type variable indicates types of accesses that must not
6415  *	generate page faults.  This is checked against protection of
6416  *	memory being locked-down.
6417  *
6418  *	The map must not be locked, but a reference must remain to the
6419  *	map throughout the call.
6420  */
6421 static kern_return_t
vm_map_wire_nested(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire,pmap_t map_pmap,vm_map_offset_t pmap_addr,ppnum_t * physpage_p)6422 vm_map_wire_nested(
6423 	vm_map_t                map,
6424 	vm_map_offset_t         start,
6425 	vm_map_offset_t         end,
6426 	vm_prot_t               caller_prot,
6427 	vm_tag_t                tag,
6428 	boolean_t               user_wire,
6429 	pmap_t                  map_pmap,
6430 	vm_map_offset_t         pmap_addr,
6431 	ppnum_t                 *physpage_p)
6432 {
6433 	vm_map_entry_t          entry;
6434 	vm_prot_t               access_type;
6435 	struct vm_map_entry     *first_entry, tmp_entry;
6436 	vm_map_t                real_map;
6437 	vm_map_offset_t         s, e;
6438 	kern_return_t           rc;
6439 	boolean_t               need_wakeup;
6440 	boolean_t               main_map = FALSE;
6441 	wait_interrupt_t        interruptible_state;
6442 	thread_t                cur_thread;
6443 	unsigned int            last_timestamp;
6444 	vm_map_size_t           size;
6445 	boolean_t               wire_and_extract;
6446 	vm_prot_t               extra_prots;
6447 
6448 	extra_prots = VM_PROT_COPY;
6449 	extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6450 #if XNU_TARGET_OS_OSX
6451 	if (map->pmap == kernel_pmap ||
6452 	    !vm_map_cs_enforcement(map)) {
6453 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6454 	}
6455 #endif /* XNU_TARGET_OS_OSX */
6456 #if CODE_SIGNING_MONITOR
6457 	if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
6458 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6459 	}
6460 #endif /* CODE_SIGNING_MONITOR */
6461 
6462 	access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6463 
6464 	wire_and_extract = FALSE;
6465 	if (physpage_p != NULL) {
6466 		/*
6467 		 * The caller wants the physical page number of the
6468 		 * wired page.  We return only one physical page number
6469 		 * so this works for only one page at a time.
6470 		 */
6471 		if ((end - start) != PAGE_SIZE) {
6472 			return KERN_INVALID_ARGUMENT;
6473 		}
6474 		wire_and_extract = TRUE;
6475 		*physpage_p = 0;
6476 	}
6477 
6478 	vm_map_lock(map);
6479 	if (map_pmap == NULL) {
6480 		main_map = TRUE;
6481 	}
6482 	last_timestamp = map->timestamp;
6483 
6484 	VM_MAP_RANGE_CHECK(map, start, end);
6485 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6486 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6487 
6488 	if (start == end) {
6489 		/* We wired what the caller asked for, zero pages */
6490 		vm_map_unlock(map);
6491 		return KERN_SUCCESS;
6492 	}
6493 
6494 	need_wakeup = FALSE;
6495 	cur_thread = current_thread();
6496 
6497 	s = start;
6498 	rc = KERN_SUCCESS;
6499 
6500 	if (vm_map_lookup_entry(map, s, &first_entry)) {
6501 		entry = first_entry;
6502 		/*
6503 		 * vm_map_clip_start will be done later.
6504 		 * We don't want to unnest any nested submaps here !
6505 		 */
6506 	} else {
6507 		/* Start address is not in map */
6508 		rc = KERN_INVALID_ADDRESS;
6509 		goto done;
6510 	}
6511 
6512 	while ((entry != vm_map_to_entry(map)) && (s < end)) {
6513 		/*
6514 		 * At this point, we have wired from "start" to "s".
6515 		 * We still need to wire from "s" to "end".
6516 		 *
6517 		 * "entry" hasn't been clipped, so it could start before "s"
6518 		 * and/or end after "end".
6519 		 */
6520 
6521 		/* "e" is how far we want to wire in this entry */
6522 		e = entry->vme_end;
6523 		if (e > end) {
6524 			e = end;
6525 		}
6526 
6527 		/*
6528 		 * If another thread is wiring/unwiring this entry then
6529 		 * block after informing other thread to wake us up.
6530 		 */
6531 		if (entry->in_transition) {
6532 			wait_result_t wait_result;
6533 
6534 			/*
6535 			 * We have not clipped the entry.  Make sure that
6536 			 * the start address is in range so that the lookup
6537 			 * below will succeed.
6538 			 * "s" is the current starting point: we've already
6539 			 * wired from "start" to "s" and we still have
6540 			 * to wire from "s" to "end".
6541 			 */
6542 
6543 			entry->needs_wakeup = TRUE;
6544 
6545 			/*
6546 			 * wake up anybody waiting on entries that we have
6547 			 * already wired.
6548 			 */
6549 			if (need_wakeup) {
6550 				vm_map_entry_wakeup(map);
6551 				need_wakeup = FALSE;
6552 			}
6553 			/*
6554 			 * User wiring is interruptible
6555 			 */
6556 			wait_result = vm_map_entry_wait(map,
6557 			    (user_wire) ? THREAD_ABORTSAFE :
6558 			    THREAD_UNINT);
6559 			if (user_wire && wait_result == THREAD_INTERRUPTED) {
6560 				/*
6561 				 * undo the wirings we have done so far
6562 				 * We do not clear the needs_wakeup flag,
6563 				 * because we cannot tell if we were the
6564 				 * only one waiting.
6565 				 */
6566 				rc = KERN_FAILURE;
6567 				goto done;
6568 			}
6569 
6570 			/*
6571 			 * Cannot avoid a lookup here. reset timestamp.
6572 			 */
6573 			last_timestamp = map->timestamp;
6574 
6575 			/*
6576 			 * The entry could have been clipped, look it up again.
6577 			 * Worse that can happen is, it may not exist anymore.
6578 			 */
6579 			if (!vm_map_lookup_entry(map, s, &first_entry)) {
6580 				/*
6581 				 * User: undo everything upto the previous
6582 				 * entry.  let vm_map_unwire worry about
6583 				 * checking the validity of the range.
6584 				 */
6585 				rc = KERN_FAILURE;
6586 				goto done;
6587 			}
6588 			entry = first_entry;
6589 			continue;
6590 		}
6591 
6592 		if (entry->is_sub_map) {
6593 			vm_map_offset_t sub_start;
6594 			vm_map_offset_t sub_end;
6595 			vm_map_offset_t local_start;
6596 			vm_map_offset_t local_end;
6597 			pmap_t          pmap;
6598 
6599 			if (wire_and_extract) {
6600 				/*
6601 				 * Wiring would result in copy-on-write
6602 				 * which would not be compatible with
6603 				 * the sharing we have with the original
6604 				 * provider of this memory.
6605 				 */
6606 				rc = KERN_INVALID_ARGUMENT;
6607 				goto done;
6608 			}
6609 
6610 			vm_map_clip_start(map, entry, s);
6611 			vm_map_clip_end(map, entry, end);
6612 
6613 			sub_start = VME_OFFSET(entry);
6614 			sub_end = entry->vme_end;
6615 			sub_end += VME_OFFSET(entry) - entry->vme_start;
6616 
6617 			local_end = entry->vme_end;
6618 			if (map_pmap == NULL) {
6619 				vm_object_t             object;
6620 				vm_object_offset_t      offset;
6621 				vm_prot_t               prot;
6622 				boolean_t               wired;
6623 				vm_map_entry_t          local_entry;
6624 				vm_map_version_t         version;
6625 				vm_map_t                lookup_map;
6626 
6627 				if (entry->use_pmap) {
6628 					pmap = VME_SUBMAP(entry)->pmap;
6629 					/* ppc implementation requires that */
6630 					/* submaps pmap address ranges line */
6631 					/* up with parent map */
6632 #ifdef notdef
6633 					pmap_addr = sub_start;
6634 #endif
6635 					pmap_addr = s;
6636 				} else {
6637 					pmap = map->pmap;
6638 					pmap_addr = s;
6639 				}
6640 
6641 				if (entry->wired_count) {
6642 					if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6643 						goto done;
6644 					}
6645 
6646 					/*
6647 					 * The map was not unlocked:
6648 					 * no need to goto re-lookup.
6649 					 * Just go directly to next entry.
6650 					 */
6651 					entry = entry->vme_next;
6652 					s = entry->vme_start;
6653 					continue;
6654 				}
6655 
6656 				/* call vm_map_lookup_and_lock_object to */
6657 				/* cause any needs copy to be   */
6658 				/* evaluated */
6659 				local_start = entry->vme_start;
6660 				lookup_map = map;
6661 				vm_map_lock_write_to_read(map);
6662 				rc = vm_map_lookup_and_lock_object(
6663 					&lookup_map, local_start,
6664 					(access_type | extra_prots),
6665 					OBJECT_LOCK_EXCLUSIVE,
6666 					&version, &object,
6667 					&offset, &prot, &wired,
6668 					NULL,
6669 					&real_map, NULL);
6670 				if (rc != KERN_SUCCESS) {
6671 					vm_map_unlock_read(lookup_map);
6672 					assert(map_pmap == NULL);
6673 					vm_map_unwire(map, start,
6674 					    s, user_wire);
6675 					return rc;
6676 				}
6677 				vm_object_unlock(object);
6678 				if (real_map != lookup_map) {
6679 					vm_map_unlock(real_map);
6680 				}
6681 				vm_map_unlock_read(lookup_map);
6682 				vm_map_lock(map);
6683 
6684 				/* we unlocked, so must re-lookup */
6685 				if (!vm_map_lookup_entry(map,
6686 				    local_start,
6687 				    &local_entry)) {
6688 					rc = KERN_FAILURE;
6689 					goto done;
6690 				}
6691 
6692 				/*
6693 				 * entry could have been "simplified",
6694 				 * so re-clip
6695 				 */
6696 				entry = local_entry;
6697 				assert(s == local_start);
6698 				vm_map_clip_start(map, entry, s);
6699 				vm_map_clip_end(map, entry, end);
6700 				/* re-compute "e" */
6701 				e = entry->vme_end;
6702 				if (e > end) {
6703 					e = end;
6704 				}
6705 
6706 				/* did we have a change of type? */
6707 				if (!entry->is_sub_map) {
6708 					last_timestamp = map->timestamp;
6709 					continue;
6710 				}
6711 			} else {
6712 				local_start = entry->vme_start;
6713 				pmap = map_pmap;
6714 			}
6715 
6716 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6717 				goto done;
6718 			}
6719 
6720 			entry->in_transition = TRUE;
6721 
6722 			vm_map_unlock(map);
6723 			rc = vm_map_wire_nested(VME_SUBMAP(entry),
6724 			    sub_start, sub_end,
6725 			    caller_prot, tag,
6726 			    user_wire, pmap, pmap_addr,
6727 			    NULL);
6728 			vm_map_lock(map);
6729 
6730 			/*
6731 			 * Find the entry again.  It could have been clipped
6732 			 * after we unlocked the map.
6733 			 */
6734 			if (!vm_map_lookup_entry(map, local_start,
6735 			    &first_entry)) {
6736 				panic("vm_map_wire: re-lookup failed");
6737 			}
6738 			entry = first_entry;
6739 
6740 			assert(local_start == s);
6741 			/* re-compute "e" */
6742 			e = entry->vme_end;
6743 			if (e > end) {
6744 				e = end;
6745 			}
6746 
6747 			last_timestamp = map->timestamp;
6748 			while ((entry != vm_map_to_entry(map)) &&
6749 			    (entry->vme_start < e)) {
6750 				assert(entry->in_transition);
6751 				entry->in_transition = FALSE;
6752 				if (entry->needs_wakeup) {
6753 					entry->needs_wakeup = FALSE;
6754 					need_wakeup = TRUE;
6755 				}
6756 				if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6757 					subtract_wire_counts(map, entry, user_wire);
6758 				}
6759 				entry = entry->vme_next;
6760 			}
6761 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
6762 				goto done;
6763 			}
6764 
6765 			/* no need to relookup again */
6766 			s = entry->vme_start;
6767 			continue;
6768 		}
6769 
6770 		/*
6771 		 * If this entry is already wired then increment
6772 		 * the appropriate wire reference count.
6773 		 */
6774 		if (entry->wired_count) {
6775 			if ((entry->protection & access_type) != access_type) {
6776 				/* found a protection problem */
6777 
6778 				/*
6779 				 * XXX FBDP
6780 				 * We should always return an error
6781 				 * in this case but since we didn't
6782 				 * enforce it before, let's do
6783 				 * it only for the new "wire_and_extract"
6784 				 * code path for now...
6785 				 */
6786 				if (wire_and_extract) {
6787 					rc = KERN_PROTECTION_FAILURE;
6788 					goto done;
6789 				}
6790 			}
6791 
6792 			/*
6793 			 * entry is already wired down, get our reference
6794 			 * after clipping to our range.
6795 			 */
6796 			vm_map_clip_start(map, entry, s);
6797 			vm_map_clip_end(map, entry, end);
6798 
6799 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6800 				goto done;
6801 			}
6802 
6803 			if (wire_and_extract) {
6804 				vm_object_t             object;
6805 				vm_object_offset_t      offset;
6806 				vm_page_t               m;
6807 
6808 				/*
6809 				 * We don't have to "wire" the page again
6810 				 * bit we still have to "extract" its
6811 				 * physical page number, after some sanity
6812 				 * checks.
6813 				 */
6814 				assert((entry->vme_end - entry->vme_start)
6815 				    == PAGE_SIZE);
6816 				assert(!entry->needs_copy);
6817 				assert(!entry->is_sub_map);
6818 				assert(VME_OBJECT(entry));
6819 				if (((entry->vme_end - entry->vme_start)
6820 				    != PAGE_SIZE) ||
6821 				    entry->needs_copy ||
6822 				    entry->is_sub_map ||
6823 				    VME_OBJECT(entry) == VM_OBJECT_NULL) {
6824 					rc = KERN_INVALID_ARGUMENT;
6825 					goto done;
6826 				}
6827 
6828 				object = VME_OBJECT(entry);
6829 				offset = VME_OFFSET(entry);
6830 				/* need exclusive lock to update m->dirty */
6831 				if (entry->protection & VM_PROT_WRITE) {
6832 					vm_object_lock(object);
6833 				} else {
6834 					vm_object_lock_shared(object);
6835 				}
6836 				m = vm_page_lookup(object, offset);
6837 				assert(m != VM_PAGE_NULL);
6838 				assert(VM_PAGE_WIRED(m));
6839 				if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
6840 					*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6841 					if (entry->protection & VM_PROT_WRITE) {
6842 						vm_object_lock_assert_exclusive(
6843 							object);
6844 						m->vmp_dirty = TRUE;
6845 					}
6846 				} else {
6847 					/* not already wired !? */
6848 					*physpage_p = 0;
6849 				}
6850 				vm_object_unlock(object);
6851 			}
6852 
6853 			/* map was not unlocked: no need to relookup */
6854 			entry = entry->vme_next;
6855 			s = entry->vme_start;
6856 			continue;
6857 		}
6858 
6859 		/*
6860 		 * Unwired entry or wire request transmitted via submap
6861 		 */
6862 
6863 		/*
6864 		 * Wiring would copy the pages to the shadow object.
6865 		 * The shadow object would not be code-signed so
6866 		 * attempting to execute code from these copied pages
6867 		 * would trigger a code-signing violation.
6868 		 */
6869 
6870 		if ((entry->protection & VM_PROT_EXECUTE)
6871 #if XNU_TARGET_OS_OSX
6872 		    &&
6873 		    map->pmap != kernel_pmap &&
6874 		    (vm_map_cs_enforcement(map)
6875 #if __arm64__
6876 		    || !VM_MAP_IS_EXOTIC(map)
6877 #endif /* __arm64__ */
6878 		    )
6879 #endif /* XNU_TARGET_OS_OSX */
6880 #if CODE_SIGNING_MONITOR
6881 		    &&
6882 		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
6883 #endif
6884 		    ) {
6885 #if MACH_ASSERT
6886 			printf("pid %d[%s] wiring executable range from "
6887 			    "0x%llx to 0x%llx: rejected to preserve "
6888 			    "code-signing\n",
6889 			    proc_selfpid(),
6890 			    (get_bsdtask_info(current_task())
6891 			    ? proc_name_address(get_bsdtask_info(current_task()))
6892 			    : "?"),
6893 			    (uint64_t) entry->vme_start,
6894 			    (uint64_t) entry->vme_end);
6895 #endif /* MACH_ASSERT */
6896 			DTRACE_VM2(cs_executable_wire,
6897 			    uint64_t, (uint64_t)entry->vme_start,
6898 			    uint64_t, (uint64_t)entry->vme_end);
6899 			cs_executable_wire++;
6900 			rc = KERN_PROTECTION_FAILURE;
6901 			goto done;
6902 		}
6903 
6904 		/*
6905 		 * Perform actions of vm_map_lookup that need the write
6906 		 * lock on the map: create a shadow object for a
6907 		 * copy-on-write region, or an object for a zero-fill
6908 		 * region.
6909 		 */
6910 		size = entry->vme_end - entry->vme_start;
6911 		/*
6912 		 * If wiring a copy-on-write page, we need to copy it now
6913 		 * even if we're only (currently) requesting read access.
6914 		 * This is aggressive, but once it's wired we can't move it.
6915 		 */
6916 		if (entry->needs_copy) {
6917 			if (wire_and_extract) {
6918 				/*
6919 				 * We're supposed to share with the original
6920 				 * provider so should not be "needs_copy"
6921 				 */
6922 				rc = KERN_INVALID_ARGUMENT;
6923 				goto done;
6924 			}
6925 
6926 			VME_OBJECT_SHADOW(entry, size,
6927 			    vm_map_always_shadow(map));
6928 			entry->needs_copy = FALSE;
6929 		} else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6930 			if (wire_and_extract) {
6931 				/*
6932 				 * We're supposed to share with the original
6933 				 * provider so should already have an object.
6934 				 */
6935 				rc = KERN_INVALID_ARGUMENT;
6936 				goto done;
6937 			}
6938 			VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
6939 			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
6940 			assert(entry->use_pmap);
6941 		} else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6942 			if (wire_and_extract) {
6943 				/*
6944 				 * We're supposed to share with the original
6945 				 * provider so should not be COPY_SYMMETRIC.
6946 				 */
6947 				rc = KERN_INVALID_ARGUMENT;
6948 				goto done;
6949 			}
6950 			/*
6951 			 * Force an unrequested "copy-on-write" but only for
6952 			 * the range we're wiring.
6953 			 */
6954 //			printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
6955 			vm_map_clip_start(map, entry, s);
6956 			vm_map_clip_end(map, entry, end);
6957 			/* recompute "size" */
6958 			size = entry->vme_end - entry->vme_start;
6959 			/* make a shadow object */
6960 			vm_object_t orig_object;
6961 			vm_object_offset_t orig_offset;
6962 			orig_object = VME_OBJECT(entry);
6963 			orig_offset = VME_OFFSET(entry);
6964 			VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
6965 			if (VME_OBJECT(entry) != orig_object) {
6966 				/*
6967 				 * This mapping has not been shared (or it would be
6968 				 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
6969 				 * not been copied-on-write (or it would be marked
6970 				 * as "needs_copy" and would have been handled above
6971 				 * and also already write-protected).
6972 				 * We still need to write-protect here to prevent
6973 				 * other threads from modifying these pages while
6974 				 * we're in the process of copying and wiring
6975 				 * the copied pages.
6976 				 * Since the mapping is neither shared nor COWed,
6977 				 * we only need to write-protect the PTEs for this
6978 				 * mapping.
6979 				 */
6980 				vm_object_pmap_protect(orig_object,
6981 				    orig_offset,
6982 				    size,
6983 				    map->pmap,
6984 				    VM_MAP_PAGE_SIZE(map),
6985 				    entry->vme_start,
6986 				    entry->protection & ~VM_PROT_WRITE);
6987 			}
6988 		}
6989 		if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6990 			/*
6991 			 * Make the object COPY_DELAY to get a stable object
6992 			 * to wire.
6993 			 * That should avoid creating long shadow chains while
6994 			 * wiring/unwiring the same range repeatedly.
6995 			 * That also prevents part of the object from being
6996 			 * wired while another part is "needs_copy", which
6997 			 * could result in conflicting rules wrt copy-on-write.
6998 			 */
6999 			vm_object_t object;
7000 
7001 			object = VME_OBJECT(entry);
7002 			vm_object_lock(object);
7003 			if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7004 				assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
7005 				    "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
7006 				    object, (uint64_t)object->vo_size,
7007 				    entry,
7008 				    (uint64_t)entry->vme_start,
7009 				    (uint64_t)entry->vme_end,
7010 				    (uint64_t)VME_OFFSET(entry),
7011 				    (uint64_t)size);
7012 				assertf(object->ref_count == 1,
7013 				    "object %p ref_count %d\n",
7014 				    object, object->ref_count);
7015 				assertf(!entry->needs_copy,
7016 				    "entry %p\n", entry);
7017 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7018 				object->true_share = TRUE;
7019 			}
7020 			vm_object_unlock(object);
7021 		}
7022 
7023 		vm_map_clip_start(map, entry, s);
7024 		vm_map_clip_end(map, entry, end);
7025 
7026 		/* re-compute "e" */
7027 		e = entry->vme_end;
7028 		if (e > end) {
7029 			e = end;
7030 		}
7031 
7032 		/*
7033 		 * Check for holes and protection mismatch.
7034 		 * Holes: Next entry should be contiguous unless this
7035 		 *	  is the end of the region.
7036 		 * Protection: Access requested must be allowed, unless
7037 		 *	wiring is by protection class
7038 		 */
7039 		if ((entry->vme_end < end) &&
7040 		    ((entry->vme_next == vm_map_to_entry(map)) ||
7041 		    (entry->vme_next->vme_start > entry->vme_end))) {
7042 			/* found a hole */
7043 			rc = KERN_INVALID_ADDRESS;
7044 			goto done;
7045 		}
7046 		if ((entry->protection & access_type) != access_type) {
7047 			/* found a protection problem */
7048 			rc = KERN_PROTECTION_FAILURE;
7049 			goto done;
7050 		}
7051 
7052 		assert(entry->wired_count == 0 && entry->user_wired_count == 0);
7053 
7054 		if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
7055 			goto done;
7056 		}
7057 
7058 		entry->in_transition = TRUE;
7059 
7060 		/*
7061 		 * This entry might get split once we unlock the map.
7062 		 * In vm_fault_wire(), we need the current range as
7063 		 * defined by this entry.  In order for this to work
7064 		 * along with a simultaneous clip operation, we make a
7065 		 * temporary copy of this entry and use that for the
7066 		 * wiring.  Note that the underlying objects do not
7067 		 * change during a clip.
7068 		 */
7069 		tmp_entry = *entry;
7070 
7071 		/*
7072 		 * The in_transition state guarentees that the entry
7073 		 * (or entries for this range, if split occured) will be
7074 		 * there when the map lock is acquired for the second time.
7075 		 */
7076 		vm_map_unlock(map);
7077 
7078 		if (!user_wire && cur_thread != THREAD_NULL) {
7079 			interruptible_state = thread_interrupt_level(THREAD_UNINT);
7080 		} else {
7081 			interruptible_state = THREAD_UNINT;
7082 		}
7083 
7084 		if (map_pmap) {
7085 			rc = vm_fault_wire(map,
7086 			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
7087 			    physpage_p);
7088 		} else {
7089 			rc = vm_fault_wire(map,
7090 			    &tmp_entry, caller_prot, tag, map->pmap,
7091 			    tmp_entry.vme_start,
7092 			    physpage_p);
7093 		}
7094 
7095 		if (!user_wire && cur_thread != THREAD_NULL) {
7096 			thread_interrupt_level(interruptible_state);
7097 		}
7098 
7099 		vm_map_lock(map);
7100 
7101 		if (last_timestamp + 1 != map->timestamp) {
7102 			/*
7103 			 * Find the entry again.  It could have been clipped
7104 			 * after we unlocked the map.
7105 			 */
7106 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7107 			    &first_entry)) {
7108 				panic("vm_map_wire: re-lookup failed");
7109 			}
7110 
7111 			entry = first_entry;
7112 		}
7113 
7114 		last_timestamp = map->timestamp;
7115 
7116 		while ((entry != vm_map_to_entry(map)) &&
7117 		    (entry->vme_start < tmp_entry.vme_end)) {
7118 			assert(entry->in_transition);
7119 			entry->in_transition = FALSE;
7120 			if (entry->needs_wakeup) {
7121 				entry->needs_wakeup = FALSE;
7122 				need_wakeup = TRUE;
7123 			}
7124 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
7125 				subtract_wire_counts(map, entry, user_wire);
7126 			}
7127 			entry = entry->vme_next;
7128 		}
7129 
7130 		if (rc != KERN_SUCCESS) {               /* from vm_*_wire */
7131 			goto done;
7132 		}
7133 
7134 		if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
7135 		    (tmp_entry.vme_end != end) &&    /* AND, we are not at the end of the requested range */
7136 		    (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
7137 			/* found a "new" hole */
7138 			s = tmp_entry.vme_end;
7139 			rc = KERN_INVALID_ADDRESS;
7140 			goto done;
7141 		}
7142 
7143 		s = entry->vme_start;
7144 	} /* end while loop through map entries */
7145 
7146 done:
7147 	if (rc == KERN_SUCCESS) {
7148 		/* repair any damage we may have made to the VM map */
7149 		vm_map_simplify_range(map, start, end);
7150 	}
7151 
7152 	vm_map_unlock(map);
7153 
7154 	/*
7155 	 * wake up anybody waiting on entries we wired.
7156 	 */
7157 	if (need_wakeup) {
7158 		vm_map_entry_wakeup(map);
7159 	}
7160 
7161 	if (rc != KERN_SUCCESS) {
7162 		/* undo what has been wired so far */
7163 		vm_map_unwire_nested(map, start, s, user_wire,
7164 		    map_pmap, pmap_addr);
7165 		if (physpage_p) {
7166 			*physpage_p = 0;
7167 		}
7168 	}
7169 
7170 	return rc;
7171 }
7172 
7173 kern_return_t
vm_map_wire_external(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,boolean_t user_wire)7174 vm_map_wire_external(
7175 	vm_map_t                map,
7176 	vm_map_offset_t         start,
7177 	vm_map_offset_t         end,
7178 	vm_prot_t               caller_prot,
7179 	boolean_t               user_wire)
7180 {
7181 	kern_return_t   kret;
7182 
7183 	kret = vm_map_wire_nested(map, start, end, caller_prot, vm_tag_bt(),
7184 	    user_wire, (pmap_t)NULL, 0, NULL);
7185 	return kret;
7186 }
7187 
7188 kern_return_t
vm_map_wire_kernel(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire)7189 vm_map_wire_kernel(
7190 	vm_map_t                map,
7191 	vm_map_offset_t         start,
7192 	vm_map_offset_t         end,
7193 	vm_prot_t               caller_prot,
7194 	vm_tag_t                tag,
7195 	boolean_t               user_wire)
7196 {
7197 	kern_return_t   kret;
7198 
7199 	kret = vm_map_wire_nested(map, start, end, caller_prot, tag,
7200 	    user_wire, (pmap_t)NULL, 0, NULL);
7201 	return kret;
7202 }
7203 
7204 kern_return_t
vm_map_wire_and_extract_external(vm_map_t map,vm_map_offset_t start,vm_prot_t caller_prot,boolean_t user_wire,ppnum_t * physpage_p)7205 vm_map_wire_and_extract_external(
7206 	vm_map_t        map,
7207 	vm_map_offset_t start,
7208 	vm_prot_t       caller_prot,
7209 	boolean_t       user_wire,
7210 	ppnum_t         *physpage_p)
7211 {
7212 	kern_return_t   kret;
7213 
7214 	kret = vm_map_wire_nested(map,
7215 	    start,
7216 	    start + VM_MAP_PAGE_SIZE(map),
7217 	    caller_prot,
7218 	    vm_tag_bt(),
7219 	    user_wire,
7220 	    (pmap_t)NULL,
7221 	    0,
7222 	    physpage_p);
7223 	if (kret != KERN_SUCCESS &&
7224 	    physpage_p != NULL) {
7225 		*physpage_p = 0;
7226 	}
7227 	return kret;
7228 }
7229 
/*
 *	vm_map_unwire:
 *
 *	Sets the pageability of the specified address range in the target
 *	map to pageable.  Regions specified must have been wired previously.
 *
 *	The map must not be locked, but a reference must remain to the map
 *	throughout the call.
 *
 *	Kernel will panic on failures.  User unwire ignores holes and
 *	unwired and in-transition entries to avoid losing memory by leaving
 *	it unwired.
 */
static kern_return_t
vm_map_unwire_nested(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	boolean_t               user_wire,
	pmap_t                  map_pmap,
	vm_map_offset_t         pmap_addr)
{
	/*
	 * Unwire [start, end) in "map".  When "map_pmap" is non-NULL this is
	 * a nested call from a parent map and physical unwiring goes through
	 * that pmap starting at "pmap_addr"; otherwise the map's own pmap is
	 * used.  "user_wire" selects the user wiring accounting and the
	 * tolerant (non-panicking) error behavior.
	 *
	 * Locking protocol: the map lock is dropped around the calls that
	 * touch lower layers (recursive unwire of submaps, vm_fault_unwire).
	 * Entries are marked "in_transition" before unlocking so they cannot
	 * disappear; the map timestamp is used afterwards to detect whether
	 * a re-lookup/re-clip is needed.
	 */
	vm_map_entry_t          entry;
	struct vm_map_entry     *first_entry, tmp_entry;
	boolean_t               need_wakeup;
	boolean_t               main_map = FALSE; /* set when not nested; not otherwise consulted here */
	unsigned int            last_timestamp;

	vm_map_lock(map);
	if (map_pmap == NULL) {
		main_map = TRUE;
	}
	/* snapshot the timestamp to detect map changes across unlock windows */
	last_timestamp = map->timestamp;

	VM_MAP_RANGE_CHECK(map, start, end);
	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));

	if (start == end) {
		/* We unwired what the caller asked for: zero pages */
		vm_map_unlock(map);
		return KERN_SUCCESS;
	}

	if (vm_map_lookup_entry(map, start, &first_entry)) {
		entry = first_entry;
		/*
		 * vm_map_clip_start will be done later.
		 * We don't want to unnest any nested sub maps here !
		 */
	} else {
		if (!user_wire) {
			panic("vm_map_unwire: start not found");
		}
		/*	Start address is not in map. */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	if (entry->superpage_size) {
		/* superpages are always wired */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	need_wakeup = FALSE;
	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
		if (entry->in_transition) {
			/*
			 * 1)
			 * Another thread is wiring down this entry. Note
			 * that if it is not for the other thread we would
			 * be unwiring an unwired entry.  This is not
			 * permitted.  If we wait, we will be unwiring memory
			 * we did not wire.
			 *
			 * 2)
			 * Another thread is unwiring this entry.  We did not
			 * have a reference to it, because if we did, this
			 * entry will not be getting unwired now.
			 */
			if (!user_wire) {
				/*
				 * XXX FBDP
				 * This could happen:  there could be some
				 * overlapping vslock/vsunlock operations
				 * going on.
				 * We should probably just wait and retry,
				 * but then we have to be careful that this
				 * entry could get "simplified" after
				 * "in_transition" gets unset and before
				 * we re-lookup the entry, so we would
				 * have to re-clip the entry to avoid
				 * re-unwiring what we have already unwired...
				 * See vm_map_wire_nested().
				 *
				 * Or we could just ignore "in_transition"
				 * here and proceed to decrement the wired
				 * count(s) on this entry.  That should be fine
				 * as long as "wired_count" doesn't drop all
				 * the way to 0 (and we should panic if THAT
				 * happens).
				 */
				panic("vm_map_unwire: in_transition entry");
			}

			/* user unwire: skip in-transition entries (see header comment) */
			entry = entry->vme_next;
			continue;
		}

		if (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;
			pmap_t          pmap;

			vm_map_clip_start(map, entry, start);
			vm_map_clip_end(map, entry, end);

			/*
			 * Translate this (clipped) entry's range into the
			 * submap's address space: [VME_OFFSET, VME_OFFSET + size).
			 */
			sub_start = VME_OFFSET(entry);
			sub_end = entry->vme_end - entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			if (map_pmap == NULL) {
				/* top-level call: pick the pmap the wiring was done in */
				if (entry->use_pmap) {
					pmap = VME_SUBMAP(entry)->pmap;
					pmap_addr = sub_start;
				} else {
					pmap = map->pmap;
					pmap_addr = start;
				}
				if (entry->wired_count == 0 ||
				    (user_wire && entry->user_wired_count == 0)) {
					if (!user_wire) {
						panic("vm_map_unwire: entry is unwired");
					}
					entry = entry->vme_next;
					continue;
				}

				/*
				 * Check for holes
				 * Holes: Next entry should be contiguous unless
				 * this is the end of the region.
				 */
				if (((entry->vme_end < end) &&
				    ((entry->vme_next == vm_map_to_entry(map)) ||
				    (entry->vme_next->vme_start
				    > entry->vme_end)))) {
					if (!user_wire) {
						panic("vm_map_unwire: non-contiguous region");
					}
/*
 *                                       entry = entry->vme_next;
 *                                       continue;
 */
				}

				subtract_wire_counts(map, entry, user_wire);

				if (entry->wired_count != 0) {
					/* still wired by someone else: nothing to undo physically */
					entry = entry->vme_next;
					continue;
				}

				entry->in_transition = TRUE;
				tmp_entry = *entry;/* see comment in vm_map_wire() */

				/*
				 * We can unlock the map now. The in_transition state
				 * guarantees existence of the entry.
				 */
				vm_map_unlock(map);
				vm_map_unwire_nested(VME_SUBMAP(entry),
				    sub_start, sub_end, user_wire, pmap, pmap_addr);
				vm_map_lock(map);

				if (last_timestamp + 1 != map->timestamp) {
					/*
					 * Find the entry again.  It could have been
					 * clipped or deleted after we unlocked the map.
					 */
					if (!vm_map_lookup_entry(map,
					    tmp_entry.vme_start,
					    &first_entry)) {
						if (!user_wire) {
							panic("vm_map_unwire: re-lookup failed");
						}
						entry = first_entry->vme_next;
					} else {
						entry = first_entry;
					}
				}
				last_timestamp = map->timestamp;

				/*
				 * clear transition bit for all constituent entries
				 * that were in the original entry (saved in
				 * tmp_entry).  Also check for waiters.
				 */
				while ((entry != vm_map_to_entry(map)) &&
				    (entry->vme_start < tmp_entry.vme_end)) {
					assert(entry->in_transition);
					entry->in_transition = FALSE;
					if (entry->needs_wakeup) {
						entry->needs_wakeup = FALSE;
						need_wakeup = TRUE;
					}
					entry = entry->vme_next;
				}
				continue;
			} else {
				/*
				 * Nested call: recurse into the submap using the
				 * pmap/pmap_addr handed down by our caller.
				 * NOTE(review): this path falls through to the
				 * wired-count handling below rather than
				 * "continue"-ing — preserved as-is.
				 */
				tmp_entry = *entry;
				vm_map_unlock(map);
				vm_map_unwire_nested(VME_SUBMAP(entry),
				    sub_start, sub_end, user_wire, map_pmap,
				    pmap_addr);
				vm_map_lock(map);

				if (last_timestamp + 1 != map->timestamp) {
					/*
					 * Find the entry again.  It could have been
					 * clipped or deleted after we unlocked the map.
					 */
					if (!vm_map_lookup_entry(map,
					    tmp_entry.vme_start,
					    &first_entry)) {
						if (!user_wire) {
							panic("vm_map_unwire: re-lookup failed");
						}
						entry = first_entry->vme_next;
					} else {
						entry = first_entry;
					}
				}
				last_timestamp = map->timestamp;
			}
		}


		if ((entry->wired_count == 0) ||
		    (user_wire && entry->user_wired_count == 0)) {
			if (!user_wire) {
				panic("vm_map_unwire: entry is unwired");
			}

			entry = entry->vme_next;
			continue;
		}

		assert(entry->wired_count > 0 &&
		    (!user_wire || entry->user_wired_count > 0));

		vm_map_clip_start(map, entry, start);
		vm_map_clip_end(map, entry, end);

		/*
		 * Check for holes
		 * Holes: Next entry should be contiguous unless
		 *	  this is the end of the region.
		 */
		if (((entry->vme_end < end) &&
		    ((entry->vme_next == vm_map_to_entry(map)) ||
		    (entry->vme_next->vme_start > entry->vme_end)))) {
			if (!user_wire) {
				panic("vm_map_unwire: non-contiguous region");
			}
			entry = entry->vme_next;
			continue;
		}

		subtract_wire_counts(map, entry, user_wire);

		if (entry->wired_count != 0) {
			/* wire count still positive: only accounting changed */
			entry = entry->vme_next;
			continue;
		}

		if (entry->zero_wired_pages) {
			entry->zero_wired_pages = FALSE;
		}

		entry->in_transition = TRUE;
		tmp_entry = *entry;     /* see comment in vm_map_wire() */

		/*
		 * We can unlock the map now. The in_transition state
		 * guarantees existence of the entry.
		 */
		vm_map_unlock(map);
		if (map_pmap) {
			vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
			    pmap_addr, tmp_entry.vme_end);
		} else {
			vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
			    tmp_entry.vme_start, tmp_entry.vme_end);
		}
		vm_map_lock(map);

		if (last_timestamp + 1 != map->timestamp) {
			/*
			 * Find the entry again.  It could have been clipped
			 * or deleted after we unlocked the map.
			 */
			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
			    &first_entry)) {
				if (!user_wire) {
					panic("vm_map_unwire: re-lookup failed");
				}
				entry = first_entry->vme_next;
			} else {
				entry = first_entry;
			}
		}
		last_timestamp = map->timestamp;

		/*
		 * clear transition bit for all constituent entries that
		 * were in the original entry (saved in tmp_entry).  Also
		 * check for waiters.
		 */
		while ((entry != vm_map_to_entry(map)) &&
		    (entry->vme_start < tmp_entry.vme_end)) {
			assert(entry->in_transition);
			entry->in_transition = FALSE;
			if (entry->needs_wakeup) {
				entry->needs_wakeup = FALSE;
				need_wakeup = TRUE;
			}
			entry = entry->vme_next;
		}
	}

	/*
	 * We might have fragmented the address space when we wired this
	 * range of addresses.  Attempt to re-coalesce these VM map entries
	 * with their neighbors now that they're no longer wired.
	 * Under some circumstances, address space fragmentation can
	 * prevent VM object shadow chain collapsing, which can cause
	 * swap space leaks.
	 */
	vm_map_simplify_range(map, start, end);

	vm_map_unlock(map);
	/*
	 * wake up anybody waiting on entries that we have unwired.
	 */
	if (need_wakeup) {
		vm_map_entry_wakeup(map);
	}
	return KERN_SUCCESS;
}
7582 
7583 kern_return_t
vm_map_unwire(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,boolean_t user_wire)7584 vm_map_unwire(
7585 	vm_map_t                map,
7586 	vm_map_offset_t         start,
7587 	vm_map_offset_t         end,
7588 	boolean_t               user_wire)
7589 {
7590 	return vm_map_unwire_nested(map, start, end,
7591 	           user_wire, (pmap_t)NULL, 0);
7592 }
7593 
7594 
7595 /*
7596  *	vm_map_entry_zap:	[ internal use only ]
7597  *
7598  *	Remove the entry from the target map
7599  *	and put it on a zap list.
7600  */
7601 static void
vm_map_entry_zap(vm_map_t map,vm_map_entry_t entry,vm_map_zap_t zap)7602 vm_map_entry_zap(
7603 	vm_map_t                map,
7604 	vm_map_entry_t          entry,
7605 	vm_map_zap_t            zap)
7606 {
7607 	vm_map_offset_t s, e;
7608 
7609 	s = entry->vme_start;
7610 	e = entry->vme_end;
7611 	assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7612 	assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7613 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7614 		assert(page_aligned(s));
7615 		assert(page_aligned(e));
7616 	}
7617 	if (entry->map_aligned == TRUE) {
7618 		assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7619 		assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7620 	}
7621 	assert(entry->wired_count == 0);
7622 	assert(entry->user_wired_count == 0);
7623 	assert(!entry->vme_permanent);
7624 
7625 	vm_map_store_entry_unlink(map, entry, false);
7626 	map->size -= e - s;
7627 
7628 	vm_map_zap_append(zap, entry);
7629 }
7630 
/*
 *	vm_map_submap_pmap_clean:
 *
 *	Clean up the physical mappings backing the [start, end) range
 *	of "map" where that range is mapped through "sub_map" starting
 *	at "offset".  Recurses into nested submaps; for each leaf entry
 *	it either protects the backing object's pages to VM_PROT_NONE
 *	(when "map" may be mapped in other pmaps) or removes the
 *	translations from map->pmap directly.
 *
 *	Takes "sub_map" read-locked for the duration of the scan.
 */
static void
vm_map_submap_pmap_clean(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_t        sub_map,
	vm_map_offset_t offset)
{
	vm_map_offset_t submap_start;
	vm_map_offset_t submap_end;
	vm_map_size_t   remove_size;
	vm_map_entry_t  entry;

	/* translate [start, end) in "map" to the range in "sub_map" */
	submap_end = offset + (end - start);
	submap_start = offset;

	vm_map_lock_read(sub_map);
	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
		/* clip to the portion of the entry inside the range */
		remove_size = (entry->vme_end - entry->vme_start);
		if (offset > entry->vme_start) {
			remove_size -= offset - entry->vme_start;
		}


		if (submap_end < entry->vme_end) {
			remove_size -=
			    entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			/* nested submap: recurse one level deeper */
			vm_map_submap_pmap_clean(
				sub_map,
				start,
				start + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				/*
				 * "map" may be mapped in other pmaps:
				 * remove the pages through the backing
				 * object so all pmaps get cleaned.
				 */
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					(VME_OFFSET(entry) +
					offset -
					entry->vme_start),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				/* only our pmap: remove translations directly */
				pmap_remove(map->pmap,
				    (addr64_t)start,
				    (addr64_t)(start + remove_size));
			}
		}
	}

	/*
	 * On a lookup miss, "entry" was set to the entry preceding
	 * "offset" (possibly the map header), so vme_next is the first
	 * entry that can overlap the range in either case.
	 */
	entry = entry->vme_next;

	while ((entry != vm_map_to_entry(sub_map))
	    && (entry->vme_start < submap_end)) {
		/* clip the tail of the entry to the end of the range */
		remove_size = (entry->vme_end - entry->vme_start);
		if (submap_end < entry->vme_end) {
			remove_size -= entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			vm_map_submap_pmap_clean(
				sub_map,
				(start + entry->vme_start) - offset,
				((start + entry->vme_start) - offset) + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					VME_OFFSET(entry),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				pmap_remove(map->pmap,
				    (addr64_t)((start + entry->vme_start)
				    - offset),
				    (addr64_t)(((start + entry->vme_start)
				    - offset) + remove_size));
			}
		}
		entry = entry->vme_next;
	}
	vm_map_unlock_read(sub_map);
	return;
}
7730 
/*
 *     virt_memory_guard_ast:
 *
 *     Handle the AST callout for a virtual memory guard:
 *     raise an EXC_GUARD exception and terminate the task
 *     if configured to do so.
 */
void
virt_memory_guard_ast(
	thread_t thread,
	mach_exception_data_type_t code,
	mach_exception_data_type_t subcode)
{
	task_t task = get_threadtask(thread);
	assert(task != kernel_task);
	assert(task == current_task());
	kern_return_t sync_exception_result;
	uint32_t behavior;

	behavior = task->task_exc_guard;

	/* Is delivery enabled */
	if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
		return;
	}

	/* If only once, make sure we're that once */
	while (behavior & TASK_EXC_GUARD_VM_ONCE) {
		uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;

		/* CAS so only one thread gets to clear the DELIVER bit */
		if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
			break;
		}
		/* lost the race: re-read and re-check that delivery is still on */
		behavior = task->task_exc_guard;
		if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
			return;
		}
	}

	/* Raise exception synchronously and see if handler claimed it */
	sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode);

	if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
		/*
		 * If Synchronous EXC_GUARD delivery was successful then
		 * kill the process and return, else kill the process
		 * and deliver the exception via EXC_CORPSE_NOTIFY.
		 */
		if (sync_exception_result == KERN_SUCCESS) {
			task_bsdtask_kill(current_task());
		} else {
			exit_with_guard_exception(current_proc(), code, subcode);
		}
	} else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
		/*
		 * If the synchronous EXC_GUARD delivery was not successful,
		 * raise a simulated crash.
		 */
		if (sync_exception_result != KERN_SUCCESS) {
			task_violated_guard(code, subcode, NULL, FALSE);
		}
	}
}
7794 
7795 /*
7796  *     vm_map_guard_exception:
7797  *
7798  *     Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
7799  *
7800  *     Right now, we do this when we find nothing mapped, or a
7801  *     gap in the mapping when a user address space deallocate
7802  *     was requested. We report the address of the first gap found.
7803  */
7804 static void
vm_map_guard_exception(vm_map_offset_t gap_start,unsigned reason)7805 vm_map_guard_exception(
7806 	vm_map_offset_t gap_start,
7807 	unsigned reason)
7808 {
7809 	mach_exception_code_t code = 0;
7810 	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7811 	unsigned int target = 0; /* should we pass in pid associated with map? */
7812 	mach_exception_data_type_t subcode = (uint64_t)gap_start;
7813 	boolean_t fatal = FALSE;
7814 
7815 	task_t task = current_task_early();
7816 
7817 	/* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7818 	if (task == NULL || task == kernel_task) {
7819 		return;
7820 	}
7821 
7822 	EXC_GUARD_ENCODE_TYPE(code, guard_type);
7823 	EXC_GUARD_ENCODE_FLAVOR(code, reason);
7824 	EXC_GUARD_ENCODE_TARGET(code, target);
7825 
7826 	if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7827 		fatal = TRUE;
7828 	}
7829 	thread_guard_violation(current_thread(), code, subcode, fatal);
7830 }
7831 
7832 static kern_return_t
vm_map_delete_submap_recurse(vm_map_t submap,vm_map_offset_t submap_start,vm_map_offset_t submap_end)7833 vm_map_delete_submap_recurse(
7834 	vm_map_t submap,
7835 	vm_map_offset_t submap_start,
7836 	vm_map_offset_t submap_end)
7837 {
7838 	vm_map_entry_t submap_entry;
7839 
7840 	/*
7841 	 * Verify that the submap does not contain any "permanent" entries
7842 	 * within the specified range.
7843 	 * We do not care about gaps.
7844 	 */
7845 
7846 	vm_map_lock(submap);
7847 
7848 	if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
7849 		submap_entry = submap_entry->vme_next;
7850 	}
7851 
7852 	for (;
7853 	    submap_entry != vm_map_to_entry(submap) &&
7854 	    submap_entry->vme_start < submap_end;
7855 	    submap_entry = submap_entry->vme_next) {
7856 		if (submap_entry->vme_permanent) {
7857 			/* "permanent" entry -> fail */
7858 			vm_map_unlock(submap);
7859 			return KERN_PROTECTION_FAILURE;
7860 		}
7861 	}
7862 	/* no "permanent" entries in the range -> success */
7863 	vm_map_unlock(submap);
7864 	return KERN_SUCCESS;
7865 }
7866 
/*
 * Panic helper: vm_map_delete() was given a start address that is not
 * aligned to the map's page size.
 */
__abortlike
static void
__vm_map_delete_misaligned_panic(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
	    map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
}
7877 
/*
 * Panic helper: a vm_map_delete() that must not fail (kernel map)
 * returned an unexpected error "kr".
 */
__abortlike
static void
__vm_map_delete_failed_panic(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	kern_return_t           kr)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
	    map, (uint64_t)start, (uint64_t)end, kr);
}
7889 
/*
 * Panic helper: vm_map_delete() on the kernel pmap found no map entry
 * at address "where" inside the [start, end) range being deleted.
 */
__abortlike
static void
__vm_map_delete_gap_panic(
	vm_map_t                map,
	vm_map_offset_t         where,
	vm_map_offset_t         start,
	vm_map_offset_t         end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
}
7901 
/*
 * Panic helper: vm_map_delete() attempted to remove a "permanent"
 * entry from a kernel map, which is never allowed.
 */
__abortlike
static void
__vm_map_delete_permanent_panic(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_map_entry_t          entry)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): "
	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
	    map, (uint64_t)start, (uint64_t)end, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end);
}
7916 
/*
 * State bits carried across iterations of the vm_map_delete() loop.
 */
__options_decl(vm_map_delete_state_t, uint32_t, {
	VMDS_NONE               = 0x0000,

	VMDS_FOUND_GAP          = 0x0001, /* a hole was found in the range */
	VMDS_GAPS_OK            = 0x0002, /* map terminated/unreferenced: gaps allowed */

	VMDS_KERNEL_PMAP        = 0x0004, /* map uses the kernel pmap */
	VMDS_NEEDS_LOOKUP       = 0x0008, /* map lock was dropped: re-lookup entry */
	VMDS_NEEDS_WAKEUP       = 0x0010, /* threads are waiting on entries we touched */
	VMDS_KERNEL_KMEMPTR     = 0x0020  /* range belongs to a kmem pointer range */
});
7928 
7929 /*
7930  *	vm_map_delete:	[ internal use only ]
7931  *
7932  *	Deallocates the given address range from the target map.
7933  *	Removes all user wirings. Unwires one kernel wiring if
7934  *	VM_MAP_REMOVE_KUNWIRE is set.  Waits for kernel wirings to go
7935  *	away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set.  Sleeps
7936  *	interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
7937  *
7938  *
7939  *	When the map is a kernel map, then any error in removing mappings
7940  *	will lead to a panic so that clients do not have to repeat the panic
7941  *	code at each call site.  If VM_MAP_REMOVE_INTERRUPTIBLE
7942  *	is also passed, then KERN_ABORTED will not lead to a panic.
7943  *
7944  *	This routine is called with map locked and leaves map locked.
7945  */
7946 static kmem_return_t
vm_map_delete(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard,vm_map_zap_t zap_list)7947 vm_map_delete(
7948 	vm_map_t                map,
7949 	vm_map_offset_t         start,
7950 	vm_map_offset_t         end,
7951 	vmr_flags_t             flags,
7952 	kmem_guard_t            guard,
7953 	vm_map_zap_t            zap_list)
7954 {
7955 	vm_map_entry_t          entry, next;
7956 	int                     interruptible;
7957 	vm_map_offset_t         gap_start = 0;
7958 	vm_map_offset_t         clear_in_transition_end = 0;
7959 	__unused vm_map_offset_t save_start = start;
7960 	__unused vm_map_offset_t save_end = end;
7961 	vm_map_delete_state_t   state = VMDS_NONE;
7962 	kmem_return_t           ret = { };
7963 	vm_map_range_id_t       range_id = 0;
7964 	struct kmem_page_meta  *meta = NULL;
7965 	uint32_t                size_idx, slot_idx;
7966 	struct mach_vm_range    slot;
7967 
7968 	if (vm_map_pmap(map) == kernel_pmap) {
7969 		state |= VMDS_KERNEL_PMAP;
7970 		range_id = kmem_addr_get_range(start, end - start);
7971 		if (kmem_is_ptr_range(range_id)) {
7972 			state |= VMDS_KERNEL_KMEMPTR;
7973 			slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
7974 			    &size_idx, &slot);
7975 		}
7976 	}
7977 
7978 	if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
7979 		state |= VMDS_GAPS_OK;
7980 	}
7981 
7982 	interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
7983 	    THREAD_ABORTSAFE : THREAD_UNINT;
7984 
7985 	if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
7986 	    (start & VM_MAP_PAGE_MASK(map))) {
7987 		__vm_map_delete_misaligned_panic(map, start, end);
7988 	}
7989 
7990 	if ((state & VMDS_GAPS_OK) == 0) {
7991 		/*
7992 		 * If the map isn't terminated then all deletions must have
7993 		 * no gaps, and be within the [min, max) of the map.
7994 		 *
7995 		 * We got here without VM_MAP_RANGE_CHECK() being called,
7996 		 * and hence must validate bounds manually.
7997 		 *
7998 		 * It is worth noting that because vm_deallocate() will
7999 		 * round_page() the deallocation size, it's possible for "end"
8000 		 * to be 0 here due to overflow. We hence must treat it as being
8001 		 * beyond vm_map_max(map).
8002 		 *
8003 		 * Similarly, end < start means some wrap around happend,
8004 		 * which should cause an error or panic.
8005 		 */
8006 		if (end == 0 || end > vm_map_max(map)) {
8007 			state |= VMDS_FOUND_GAP;
8008 			gap_start = vm_map_max(map);
8009 			if (state & VMDS_KERNEL_PMAP) {
8010 				__vm_map_delete_gap_panic(map,
8011 				    gap_start, start, end);
8012 			}
8013 			goto out;
8014 		}
8015 
8016 		if (end < start) {
8017 			if (state & VMDS_KERNEL_PMAP) {
8018 				__vm_map_delete_gap_panic(map,
8019 				    vm_map_max(map), start, end);
8020 			}
8021 			ret.kmr_return = KERN_INVALID_ARGUMENT;
8022 			goto out;
8023 		}
8024 
8025 		if (start < vm_map_min(map)) {
8026 			state |= VMDS_FOUND_GAP;
8027 			gap_start = start;
8028 			if (state & VMDS_KERNEL_PMAP) {
8029 				__vm_map_delete_gap_panic(map,
8030 				    gap_start, start, end);
8031 			}
8032 			goto out;
8033 		}
8034 	} else {
8035 		/*
8036 		 * If the map is terminated, we must accept start/end
8037 		 * being beyond the boundaries of the map as this is
8038 		 * how some of the mappings like commpage mappings
8039 		 * can be destroyed (they're outside of those bounds).
8040 		 *
8041 		 * end < start is still something we can't cope with,
8042 		 * so just bail.
8043 		 */
8044 		if (end < start) {
8045 			goto out;
8046 		}
8047 	}
8048 
8049 
8050 	/*
8051 	 *	Find the start of the region.
8052 	 *
8053 	 *	If in a superpage, extend the range
8054 	 *	to include the start of the mapping.
8055 	 */
8056 	while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8057 		if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8058 			start = SUPERPAGE_ROUND_DOWN(start);
8059 		} else {
8060 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8061 			break;
8062 		}
8063 	}
8064 
8065 	if (entry->superpage_size) {
8066 		end = SUPERPAGE_ROUND_UP(end);
8067 	}
8068 
8069 	/*
8070 	 *	Step through all entries in this region
8071 	 */
8072 	for (vm_map_offset_t s = start; s < end;) {
8073 		/*
8074 		 * At this point, we have deleted all the memory entries
8075 		 * in [start, s) and are proceeding with the [s, end) range.
8076 		 *
8077 		 * This loop might drop the map lock, and it is possible that
8078 		 * some memory was already reallocated within [start, s)
8079 		 * and we don't want to mess with those entries.
8080 		 *
8081 		 * Some of those entries could even have been re-assembled
8082 		 * with an entry after "s" (in vm_map_simplify_entry()), so
8083 		 * we may have to vm_map_clip_start() again.
8084 		 *
8085 		 * When clear_in_transition_end is set, the we had marked
8086 		 * [start, clear_in_transition_end) as "in_transition"
8087 		 * during a previous iteration and we need to clear it.
8088 		 */
8089 
8090 		/*
8091 		 * Step 1: If needed (because we dropped locks),
8092 		 *         lookup the entry again.
8093 		 *
8094 		 *         If we're coming back from unwiring (Step 5),
8095 		 *         we also need to mark the entries as no longer
8096 		 *         in transition after that.
8097 		 */
8098 
8099 		if (state & VMDS_NEEDS_LOOKUP) {
8100 			state &= ~VMDS_NEEDS_LOOKUP;
8101 
8102 			if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8103 				SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8104 			}
8105 
8106 			if (state & VMDS_KERNEL_KMEMPTR) {
8107 				kmem_validate_slot(s, meta, size_idx, slot_idx);
8108 			}
8109 		}
8110 
8111 		if (clear_in_transition_end) {
8112 			for (vm_map_entry_t it = entry;
8113 			    it != vm_map_to_entry(map) &&
8114 			    it->vme_start < clear_in_transition_end;
8115 			    it = it->vme_next) {
8116 				assert(it->in_transition);
8117 				it->in_transition = FALSE;
8118 				if (it->needs_wakeup) {
8119 					it->needs_wakeup = FALSE;
8120 					state |= VMDS_NEEDS_WAKEUP;
8121 				}
8122 			}
8123 
8124 			clear_in_transition_end = 0;
8125 		}
8126 
8127 
8128 		/*
8129 		 * Step 2: Perform various policy checks
8130 		 *         before we do _anything_ to this entry.
8131 		 */
8132 
8133 		if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8134 			if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8135 				/*
8136 				 * Either we found a gap already,
8137 				 * or we are tearing down a map,
8138 				 * keep going.
8139 				 */
8140 			} else if (state & VMDS_KERNEL_PMAP) {
8141 				__vm_map_delete_gap_panic(map, s, start, end);
8142 			} else if (s < end) {
8143 				state |= VMDS_FOUND_GAP;
8144 				gap_start = s;
8145 			}
8146 
8147 			if (entry == vm_map_to_entry(map) ||
8148 			    end <= entry->vme_start) {
8149 				break;
8150 			}
8151 
8152 			s = entry->vme_start;
8153 		}
8154 
8155 		if (state & VMDS_KERNEL_PMAP) {
8156 			/*
8157 			 * In the kernel map and its submaps,
8158 			 * permanent entries never die, even
8159 			 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8160 			 */
8161 			if (entry->vme_permanent) {
8162 				__vm_map_delete_permanent_panic(map, start, end, entry);
8163 			}
8164 
8165 			if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8166 				end = entry->vme_end;
8167 				flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8168 			}
8169 
8170 			/*
8171 			 * In the kernel map and its submaps,
8172 			 * the removal of an atomic/guarded entry is strict.
8173 			 *
8174 			 * An atomic entry is processed only if it was
8175 			 * specifically targeted.
8176 			 *
8177 			 * We might have deleted non-atomic entries before
8178 			 * we reach this this point however...
8179 			 */
8180 			kmem_entry_validate_guard(map, entry,
8181 			    start, end - start, guard);
8182 		}
8183 
8184 		/*
8185 		 * Step 2.1: handle "permanent" and "submap" entries
8186 		 * *before* clipping to avoid triggering some unnecessary
8187 		 * un-nesting of the shared region.
8188 		 */
8189 		if (entry->vme_permanent && entry->is_sub_map) {
8190 //			printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8191 			/*
8192 			 * Un-mapping a "permanent" mapping of a user-space
8193 			 * submap is not allowed unless...
8194 			 */
8195 			if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8196 				/*
8197 				 * a. explicitly requested by the kernel caller.
8198 				 */
8199 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8200 			} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8201 			    developer_mode_state()) {
8202 				/*
8203 				 * b. we're in "developer" mode (for
8204 				 *    breakpoints, dtrace probes, ...).
8205 				 */
8206 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8207 			} else if (map->terminated) {
8208 				/*
8209 				 * c. this is the final address space cleanup.
8210 				 */
8211 //				printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8212 			} else {
8213 				vm_map_offset_t submap_start, submap_end;
8214 				kern_return_t submap_kr;
8215 
8216 				/*
8217 				 * Check if there are any "permanent" mappings
8218 				 * in this range in the submap.
8219 				 */
8220 				if (entry->in_transition) {
8221 					/* can that even happen ? */
8222 					goto in_transition;
8223 				}
8224 				/* compute the clipped range in the submap */
8225 				submap_start = s - entry->vme_start;
8226 				submap_start += VME_OFFSET(entry);
8227 				submap_end = end - entry->vme_start;
8228 				submap_end += VME_OFFSET(entry);
8229 				submap_kr = vm_map_delete_submap_recurse(
8230 					VME_SUBMAP(entry),
8231 					submap_start,
8232 					submap_end);
8233 				if (submap_kr != KERN_SUCCESS) {
8234 					/*
8235 					 * There are some "permanent" mappings
8236 					 * in the submap: we are not allowed
8237 					 * to remove this range.
8238 					 */
8239 					printf("%d[%s] removing permanent submap entry "
8240 					    "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8241 					    proc_selfpid(),
8242 					    (get_bsdtask_info(current_task())
8243 					    ? proc_name_address(get_bsdtask_info(current_task()))
8244 					    : "?"), entry,
8245 					    (uint64_t)entry->vme_start,
8246 					    (uint64_t)entry->vme_end,
8247 					    entry->protection,
8248 					    entry->max_protection);
8249 					DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8250 					    vm_map_entry_t, entry,
8251 					    vm_map_offset_t, entry->vme_start,
8252 					    vm_map_offset_t, entry->vme_end,
8253 					    vm_prot_t, entry->protection,
8254 					    vm_prot_t, entry->max_protection,
8255 					    int, VME_ALIAS(entry));
8256 					ret.kmr_return = KERN_PROTECTION_FAILURE;
8257 					goto out;
8258 				}
8259 				/* no permanent mappings: proceed */
8260 			}
8261 		}
8262 
8263 		/*
8264 		 * Step 3: Perform any clipping needed.
8265 		 *
8266 		 *         After this, "entry" starts at "s", ends before "end"
8267 		 */
8268 
8269 		if (entry->vme_start < s) {
8270 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8271 			    entry->map_aligned &&
8272 			    !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8273 				/*
8274 				 * The entry will no longer be map-aligned
8275 				 * after clipping and the caller said it's OK.
8276 				 */
8277 				entry->map_aligned = FALSE;
8278 			}
8279 			vm_map_clip_start(map, entry, s);
8280 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8281 		}
8282 
8283 		if (end < entry->vme_end) {
8284 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8285 			    entry->map_aligned &&
8286 			    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8287 				/*
8288 				 * The entry will no longer be map-aligned
8289 				 * after clipping and the caller said it's OK.
8290 				 */
8291 				entry->map_aligned = FALSE;
8292 			}
8293 			vm_map_clip_end(map, entry, end);
8294 		}
8295 
8296 		if (entry->vme_permanent && entry->is_sub_map) {
8297 			/*
8298 			 * We already went through step 2.1 which did not deny
8299 			 * the removal of this "permanent" and "is_sub_map"
8300 			 * entry.
8301 			 * Now that we've clipped what we actually want to
8302 			 * delete, undo the "permanent" part to allow the
8303 			 * removal to proceed.
8304 			 */
8305 			DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8306 			    vm_map_entry_t, entry,
8307 			    vm_map_offset_t, entry->vme_start,
8308 			    vm_map_offset_t, entry->vme_end,
8309 			    vm_prot_t, entry->protection,
8310 			    vm_prot_t, entry->max_protection,
8311 			    int, VME_ALIAS(entry));
8312 			entry->vme_permanent = false;
8313 		}
8314 
8315 		assert(s == entry->vme_start);
8316 		assert(entry->vme_end <= end);
8317 
8318 
8319 		/*
8320 		 * Step 4: If the entry is in flux, wait for this to resolve.
8321 		 */
8322 
8323 		if (entry->in_transition) {
8324 			wait_result_t wait_result;
8325 
8326 in_transition:
8327 			/*
8328 			 * Another thread is wiring/unwiring this entry.
8329 			 * Let the other thread know we are waiting.
8330 			 */
8331 
8332 			entry->needs_wakeup = TRUE;
8333 
8334 			/*
8335 			 * wake up anybody waiting on entries that we have
8336 			 * already unwired/deleted.
8337 			 */
8338 			if (state & VMDS_NEEDS_WAKEUP) {
8339 				vm_map_entry_wakeup(map);
8340 				state &= ~VMDS_NEEDS_WAKEUP;
8341 			}
8342 
8343 			wait_result = vm_map_entry_wait(map, interruptible);
8344 
8345 			if (interruptible &&
8346 			    wait_result == THREAD_INTERRUPTED) {
8347 				/*
8348 				 * We do not clear the needs_wakeup flag,
8349 				 * since we cannot tell if we were the only one.
8350 				 */
8351 				ret.kmr_return = KERN_ABORTED;
8352 				return ret;
8353 			}
8354 
8355 			/*
8356 			 * The entry could have been clipped or it
8357 			 * may not exist anymore.  Look it up again.
8358 			 */
8359 			state |= VMDS_NEEDS_LOOKUP;
8360 			continue;
8361 		}
8362 
8363 
8364 		/*
8365 		 * Step 5: Handle wiring
8366 		 */
8367 
8368 		if (entry->wired_count) {
8369 			struct vm_map_entry tmp_entry;
8370 			boolean_t           user_wire;
8371 			unsigned int        last_timestamp;
8372 
8373 			user_wire = entry->user_wired_count > 0;
8374 
8375 			/*
8376 			 *      Remove a kernel wiring if requested
8377 			 */
8378 			if (flags & VM_MAP_REMOVE_KUNWIRE) {
8379 				entry->wired_count--;
8380 			}
8381 
8382 			/*
8383 			 *	Remove all user wirings for proper accounting
8384 			 */
8385 			while (entry->user_wired_count) {
8386 				subtract_wire_counts(map, entry, user_wire);
8387 			}
8388 
8389 			/*
8390 			 * All our DMA I/O operations in IOKit are currently
8391 			 * done by wiring through the map entries of the task
8392 			 * requesting the I/O.
8393 			 *
8394 			 * Because of this, we must always wait for kernel wirings
8395 			 * to go away on the entries before deleting them.
8396 			 *
8397 			 * Any caller who wants to actually remove a kernel wiring
8398 			 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8399 			 * properly remove one wiring instead of blasting through
8400 			 * them all.
8401 			 */
8402 			if (entry->wired_count != 0) {
8403 				assert(map != kernel_map);
8404 				/*
8405 				 * Cannot continue.  Typical case is when
8406 				 * a user thread has physical io pending on
8407 				 * on this page.  Either wait for the
8408 				 * kernel wiring to go away or return an
8409 				 * error.
8410 				 */
8411 				wait_result_t wait_result;
8412 
8413 				entry->needs_wakeup = TRUE;
8414 				wait_result = vm_map_entry_wait(map,
8415 				    interruptible);
8416 
8417 				if (interruptible &&
8418 				    wait_result == THREAD_INTERRUPTED) {
8419 					/*
8420 					 * We do not clear the
8421 					 * needs_wakeup flag, since we
8422 					 * cannot tell if we were the
8423 					 * only one.
8424 					 */
8425 					ret.kmr_return = KERN_ABORTED;
8426 					return ret;
8427 				}
8428 
8429 
8430 				/*
8431 				 * The entry could have been clipped or
8432 				 * it may not exist anymore.  Look it
8433 				 * up again.
8434 				 */
8435 				state |= VMDS_NEEDS_LOOKUP;
8436 				continue;
8437 			}
8438 
8439 			/*
8440 			 * We can unlock the map now.
8441 			 *
8442 			 * The entry might be split once we unlock the map,
8443 			 * but we need the range as defined by this entry
8444 			 * to be stable. So we must make a local copy.
8445 			 *
8446 			 * The underlying objects do not change during clips,
8447 			 * and the in_transition state guarentees existence
8448 			 * of the entry.
8449 			 */
8450 			last_timestamp = map->timestamp;
8451 			entry->in_transition = TRUE;
8452 			tmp_entry = *entry;
8453 			vm_map_unlock(map);
8454 
8455 			if (tmp_entry.is_sub_map) {
8456 				vm_map_t sub_map;
8457 				vm_map_offset_t sub_start, sub_end;
8458 				pmap_t pmap;
8459 				vm_map_offset_t pmap_addr;
8460 
8461 
8462 				sub_map = VME_SUBMAP(&tmp_entry);
8463 				sub_start = VME_OFFSET(&tmp_entry);
8464 				sub_end = sub_start + (tmp_entry.vme_end -
8465 				    tmp_entry.vme_start);
8466 				if (tmp_entry.use_pmap) {
8467 					pmap = sub_map->pmap;
8468 					pmap_addr = tmp_entry.vme_start;
8469 				} else {
8470 					pmap = map->pmap;
8471 					pmap_addr = tmp_entry.vme_start;
8472 				}
8473 				(void) vm_map_unwire_nested(sub_map,
8474 				    sub_start, sub_end,
8475 				    user_wire,
8476 				    pmap, pmap_addr);
8477 			} else {
8478 				vm_map_offset_t entry_end = tmp_entry.vme_end;
8479 				vm_map_offset_t max_end;
8480 
8481 				if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8482 					max_end = end - VM_MAP_PAGE_SIZE(map);
8483 					if (entry_end > max_end) {
8484 						entry_end = max_end;
8485 					}
8486 				}
8487 
8488 				if (tmp_entry.vme_kernel_object) {
8489 					pmap_protect_options(
8490 						map->pmap,
8491 						tmp_entry.vme_start,
8492 						entry_end,
8493 						VM_PROT_NONE,
8494 						PMAP_OPTIONS_REMOVE,
8495 						NULL);
8496 				}
8497 				vm_fault_unwire(map, &tmp_entry,
8498 				    tmp_entry.vme_kernel_object, map->pmap,
8499 				    tmp_entry.vme_start, entry_end);
8500 			}
8501 
8502 			vm_map_lock(map);
8503 
8504 			/*
8505 			 * Unwiring happened, we can now go back to deleting
8506 			 * them (after we clear the in_transition bit for the range).
8507 			 */
8508 			if (last_timestamp + 1 != map->timestamp) {
8509 				state |= VMDS_NEEDS_LOOKUP;
8510 			}
8511 			clear_in_transition_end = tmp_entry.vme_end;
8512 			continue;
8513 		}
8514 
8515 		assert(entry->wired_count == 0);
8516 		assert(entry->user_wired_count == 0);
8517 
8518 
8519 		/*
8520 		 * Step 6: Entry is unwired and ready for us to delete !
8521 		 */
8522 
8523 		if (!entry->vme_permanent) {
8524 			/*
8525 			 * Typical case: the entry really shouldn't be permanent
8526 			 */
8527 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8528 		    (entry->protection & VM_PROT_EXECUTE) &&
8529 		    developer_mode_state()) {
8530 			/*
8531 			 * Allow debuggers to undo executable mappings
8532 			 * when developer mode is on.
8533 			 */
8534 #if 0
8535 			printf("FBDP %d[%s] removing permanent executable entry "
8536 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8537 			    proc_selfpid(),
8538 			    (current_task()->bsd_info
8539 			    ? proc_name_address(current_task()->bsd_info)
8540 			    : "?"), entry,
8541 			    (uint64_t)entry->vme_start,
8542 			    (uint64_t)entry->vme_end,
8543 			    entry->protection,
8544 			    entry->max_protection);
8545 #endif
8546 			entry->vme_permanent = FALSE;
8547 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8548 #if 0
8549 			printf("FBDP %d[%s] removing permanent entry "
8550 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8551 			    proc_selfpid(),
8552 			    (current_task()->bsd_info
8553 			    ? proc_name_address(current_task()->bsd_info)
8554 			    : "?"), entry,
8555 			    (uint64_t)entry->vme_start,
8556 			    (uint64_t)entry->vme_end,
8557 			    entry->protection,
8558 			    entry->max_protection);
8559 #endif
8560 			entry->vme_permanent = FALSE;
8561 #if CODE_SIGNING_MONITOR
8562 		} else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8563 			entry->vme_permanent = FALSE;
8564 
8565 			printf("%d[%s] %s(0x%llx,0x%llx): "
8566 			    "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8567 			    "prot 0x%x/0x%x\n",
8568 			    proc_selfpid(),
8569 			    (get_bsdtask_info(current_task())
8570 			    ? proc_name_address(get_bsdtask_info(current_task()))
8571 			    : "?"),
8572 			    __FUNCTION__,
8573 			    (uint64_t)start,
8574 			    (uint64_t)end,
8575 			    (uint64_t)entry->vme_start,
8576 			    (uint64_t)entry->vme_end,
8577 			    entry->protection,
8578 			    entry->max_protection);
8579 #endif
8580 		} else {
8581 			DTRACE_VM6(vm_map_delete_permanent,
8582 			    vm_map_entry_t, entry,
8583 			    vm_map_offset_t, entry->vme_start,
8584 			    vm_map_offset_t, entry->vme_end,
8585 			    vm_prot_t, entry->protection,
8586 			    vm_prot_t, entry->max_protection,
8587 			    int, VME_ALIAS(entry));
8588 		}
8589 
8590 		if (entry->is_sub_map) {
8591 			assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8592 			    "map %p (%d) entry %p submap %p (%d)\n",
8593 			    map, VM_MAP_PAGE_SHIFT(map), entry,
8594 			    VME_SUBMAP(entry),
8595 			    VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8596 			if (entry->use_pmap) {
8597 #ifndef NO_NESTED_PMAP
8598 				int pmap_flags;
8599 
8600 				if (map->terminated) {
8601 					/*
8602 					 * This is the final cleanup of the
8603 					 * address space being terminated.
8604 					 * No new mappings are expected and
8605 					 * we don't really need to unnest the
8606 					 * shared region (and lose the "global"
8607 					 * pmap mappings, if applicable).
8608 					 *
8609 					 * Tell the pmap layer that we're
8610 					 * "clean" wrt nesting.
8611 					 */
8612 					pmap_flags = PMAP_UNNEST_CLEAN;
8613 				} else {
8614 					/*
8615 					 * We're unmapping part of the nested
8616 					 * shared region, so we can't keep the
8617 					 * nested pmap.
8618 					 */
8619 					pmap_flags = 0;
8620 				}
8621 				pmap_unnest_options(
8622 					map->pmap,
8623 					(addr64_t)entry->vme_start,
8624 					entry->vme_end - entry->vme_start,
8625 					pmap_flags);
8626 #endif  /* NO_NESTED_PMAP */
8627 				if (map->mapped_in_other_pmaps &&
8628 				    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8629 					/* clean up parent map/maps */
8630 					vm_map_submap_pmap_clean(
8631 						map, entry->vme_start,
8632 						entry->vme_end,
8633 						VME_SUBMAP(entry),
8634 						VME_OFFSET(entry));
8635 				}
8636 			} else {
8637 				vm_map_submap_pmap_clean(
8638 					map, entry->vme_start, entry->vme_end,
8639 					VME_SUBMAP(entry),
8640 					VME_OFFSET(entry));
8641 			}
8642 		} else if (entry->vme_kernel_object ||
8643 		    VME_OBJECT(entry) == compressor_object) {
8644 			/*
8645 			 * nothing to do
8646 			 */
8647 		} else if (map->mapped_in_other_pmaps &&
8648 		    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8649 			vm_object_pmap_protect_options(
8650 				VME_OBJECT(entry), VME_OFFSET(entry),
8651 				entry->vme_end - entry->vme_start,
8652 				PMAP_NULL,
8653 				PAGE_SIZE,
8654 				entry->vme_start,
8655 				VM_PROT_NONE,
8656 				PMAP_OPTIONS_REMOVE);
8657 		} else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8658 		    (state & VMDS_KERNEL_PMAP)) {
8659 			/* Remove translations associated
8660 			 * with this range unless the entry
8661 			 * does not have an object, or
8662 			 * it's the kernel map or a descendant
8663 			 * since the platform could potentially
8664 			 * create "backdoor" mappings invisible
8665 			 * to the VM. It is expected that
8666 			 * objectless, non-kernel ranges
8667 			 * do not have such VM invisible
8668 			 * translations.
8669 			 */
8670 			pmap_remove_options(map->pmap,
8671 			    (addr64_t)entry->vme_start,
8672 			    (addr64_t)entry->vme_end,
8673 			    PMAP_OPTIONS_REMOVE);
8674 		}
8675 
8676 #if DEBUG
8677 		/*
8678 		 * All pmap mappings for this map entry must have been
8679 		 * cleared by now.
8680 		 */
8681 		assert(pmap_is_empty(map->pmap,
8682 		    entry->vme_start,
8683 		    entry->vme_end));
8684 #endif /* DEBUG */
8685 
8686 		if (entry->iokit_acct) {
8687 			/* alternate accounting */
8688 			DTRACE_VM4(vm_map_iokit_unmapped_region,
8689 			    vm_map_t, map,
8690 			    vm_map_offset_t, entry->vme_start,
8691 			    vm_map_offset_t, entry->vme_end,
8692 			    int, VME_ALIAS(entry));
8693 			vm_map_iokit_unmapped_region(map,
8694 			    (entry->vme_end -
8695 			    entry->vme_start));
8696 			entry->iokit_acct = FALSE;
8697 			entry->use_pmap = FALSE;
8698 		}
8699 
8700 		/* move "s" forward */
8701 		s    = entry->vme_end;
8702 		next = entry->vme_next;
8703 		if (!entry->map_aligned) {
8704 			vm_map_offset_t rounded_s;
8705 
8706 			/*
8707 			 * Skip artificial gap due to mis-aligned entry
8708 			 * on devices with a page size smaller than the
8709 			 * map's page size (i.e. 16k task on a 4k device).
8710 			 */
8711 			rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8712 			if (next == vm_map_to_entry(map)) {
8713 				s = rounded_s;
8714 			} else if (s < rounded_s) {
8715 				s = MIN(rounded_s, next->vme_start);
8716 			}
8717 		}
8718 		ret.kmr_size += s - entry->vme_start;
8719 
8720 		if (entry->vme_permanent) {
8721 			/*
8722 			 * A permanent entry can not be removed, so leave it
8723 			 * in place but remove all access permissions.
8724 			 */
8725 			if (!entry->csm_associated) {
8726 				printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8727 				    __FUNCTION__, __LINE__,
8728 				    proc_selfpid(),
8729 				    (get_bsdtask_info(current_task())
8730 				    ? proc_name_address(get_bsdtask_info(current_task()))
8731 				    : "?"),
8732 				    map,
8733 				    entry,
8734 				    (uint64_t)entry->vme_start,
8735 				    (uint64_t)entry->vme_end,
8736 				    entry->is_sub_map,
8737 				    entry->protection,
8738 				    entry->max_protection);
8739 			}
8740 			DTRACE_VM6(vm_map_delete_permanent_prot_none,
8741 			    vm_map_entry_t, entry,
8742 			    vm_map_offset_t, entry->vme_start,
8743 			    vm_map_offset_t, entry->vme_end,
8744 			    vm_prot_t, entry->protection,
8745 			    vm_prot_t, entry->max_protection,
8746 			    int, VME_ALIAS(entry));
8747 			entry->protection = VM_PROT_NONE;
8748 			entry->max_protection = VM_PROT_NONE;
8749 		} else {
8750 			vm_map_entry_zap(map, entry, zap_list);
8751 		}
8752 
8753 		entry = next;
8754 		next  = VM_MAP_ENTRY_NULL;
8755 
8756 		if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8757 			unsigned int last_timestamp = map->timestamp++;
8758 
8759 			if (lck_rw_lock_yield_exclusive(&map->lock,
8760 			    LCK_RW_YIELD_ANY_WAITER)) {
8761 				if (last_timestamp != map->timestamp + 1) {
8762 					state |= VMDS_NEEDS_LOOKUP;
8763 				}
8764 			} else {
8765 				/* we didn't yield, undo our change */
8766 				map->timestamp--;
8767 			}
8768 		}
8769 	}
8770 
8771 	if (map->wait_for_space) {
8772 		thread_wakeup((event_t) map);
8773 	}
8774 
8775 	if (state & VMDS_NEEDS_WAKEUP) {
8776 		vm_map_entry_wakeup(map);
8777 	}
8778 
8779 out:
8780 	if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
8781 		__vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8782 	}
8783 
8784 	if (state & VMDS_KERNEL_KMEMPTR) {
8785 		kmem_free_space(start, end, range_id, &slot);
8786 	}
8787 
8788 	if (state & VMDS_FOUND_GAP) {
8789 		DTRACE_VM3(kern_vm_deallocate_gap,
8790 		    vm_map_offset_t, gap_start,
8791 		    vm_map_offset_t, save_start,
8792 		    vm_map_offset_t, save_end);
8793 		if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
8794 			ret.kmr_return = KERN_INVALID_VALUE;
8795 		} else {
8796 			vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
8797 		}
8798 	}
8799 
8800 	return ret;
8801 }
8802 
8803 kmem_return_t
vm_map_remove_and_unlock(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8804 vm_map_remove_and_unlock(
8805 	vm_map_t        map,
8806 	vm_map_offset_t start,
8807 	vm_map_offset_t end,
8808 	vmr_flags_t     flags,
8809 	kmem_guard_t    guard)
8810 {
8811 	kmem_return_t ret;
8812 	VM_MAP_ZAP_DECLARE(zap);
8813 
8814 	ret = vm_map_delete(map, start, end, flags, guard, &zap);
8815 	vm_map_unlock(map);
8816 
8817 	vm_map_zap_dispose(&zap);
8818 
8819 	return ret;
8820 }
8821 
8822 /*
8823  *	vm_map_remove_guard:
8824  *
8825  *	Remove the given address range from the target map.
8826  *	This is the exported form of vm_map_delete.
8827  */
8828 kmem_return_t
vm_map_remove_guard(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8829 vm_map_remove_guard(
8830 	vm_map_t        map,
8831 	vm_map_offset_t start,
8832 	vm_map_offset_t end,
8833 	vmr_flags_t     flags,
8834 	kmem_guard_t    guard)
8835 {
8836 	vm_map_lock(map);
8837 	return vm_map_remove_and_unlock(map, start, end, flags, guard);
8838 }
8839 
8840 /*
8841  *	vm_map_terminate:
8842  *
8843  *	Clean out a task's map.
8844  */
8845 kern_return_t
vm_map_terminate(vm_map_t map)8846 vm_map_terminate(
8847 	vm_map_t        map)
8848 {
8849 	vm_map_lock(map);
8850 	map->terminated = TRUE;
8851 	vm_map_disable_hole_optimization(map);
8852 	(void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
8853 	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
8854 	return KERN_SUCCESS;
8855 }
8856 
8857 /*
8858  *	Routine:	vm_map_copy_allocate
8859  *
8860  *	Description:
8861  *		Allocates and initializes a map copy object.
8862  */
8863 static vm_map_copy_t
vm_map_copy_allocate(uint16_t type)8864 vm_map_copy_allocate(uint16_t type)
8865 {
8866 	vm_map_copy_t new_copy;
8867 
8868 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
8869 	new_copy->type = type;
8870 	if (type == VM_MAP_COPY_ENTRY_LIST) {
8871 		new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
8872 		vm_map_store_init(&new_copy->cpy_hdr);
8873 	}
8874 	return new_copy;
8875 }
8876 
8877 /*
8878  *	Routine:	vm_map_copy_discard
8879  *
8880  *	Description:
8881  *		Dispose of a map copy object (returned by
8882  *		vm_map_copyin).
8883  */
void
vm_map_copy_discard(
	vm_map_copy_t   copy)
{
	/* discarding a NULL copy object is a no-op */
	if (copy == VM_MAP_COPY_NULL) {
		return;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	switch (copy->type) {
	case VM_MAP_COPY_ENTRY_LIST:
		/*
		 * Unlink each entry from the copy's list, drop the
		 * reference it holds on its submap or VM object, then
		 * dispose of the entry itself.
		 */
		while (vm_map_copy_first_entry(copy) !=
		    vm_map_copy_to_entry(copy)) {
			vm_map_entry_t  entry = vm_map_copy_first_entry(copy);

			vm_map_copy_entry_unlink(copy, entry);
			if (entry->is_sub_map) {
				vm_map_deallocate(VME_SUBMAP(entry));
			} else {
				vm_object_deallocate(VME_OBJECT(entry));
			}
			vm_map_copy_entry_dispose(entry);
		}
		break;
	case VM_MAP_COPY_KERNEL_BUFFER:

		/*
		 * The vm_map_copy_t and possibly the data buffer were
		 * allocated by a single call to kalloc_data(), i.e. the
		 * vm_map_copy_t was not allocated out of the zone.
		 */
		if (copy->size > msg_ool_size_small || copy->offset) {
			/* kernel-buffer copies must be small and offset-free */
			panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
			    (long long)copy->size, (long long)copy->offset);
		}
		kfree_data(copy->cpy_kdata, copy->size);
	}
	/* finally release the vm_map_copy_t itself back to its zone */
	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
}
8928 
8929 #if XNU_PLATFORM_MacOSX
8930 
8931 /*
8932  *	Routine:	vm_map_copy_copy
8933  *
8934  *	Description:
8935  *			Move the information in a map copy object to
8936  *			a new map copy object, leaving the old one
8937  *			empty.
8938  *
8939  *			This is used by kernel routines that need
8940  *			to look at out-of-line data (in copyin form)
8941  *			before deciding whether to return SUCCESS.
8942  *			If the routine returns FAILURE, the original
8943  *			copy object will be deallocated; therefore,
8944  *			these routines must make a copy of the copy
8945  *			object and leave the original empty so that
8946  *			deallocation will not fail.
8947  */
vm_map_copy_t
vm_map_copy_copy(
	vm_map_copy_t   copy)
{
	vm_map_copy_t   new_copy;

	if (copy == VM_MAP_COPY_NULL) {
		return VM_MAP_COPY_NULL;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/*
	 * Allocate a new copy object, and copy the information
	 * from the old one into it.
	 */

	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
#if __has_feature(ptrauth_calls)
	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
		/*
		 * With pointer authentication, the raw memcpy above copied
		 * a cpy_kdata signature tied to the old object's storage;
		 * store through the field so it is re-signed for new_copy.
		 */
		new_copy->cpy_kdata = copy->cpy_kdata;
	}
#endif

	if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
		/*
		 * The links in the entry chain must be
		 * changed to point to the new copy object.
		 */
		vm_map_copy_first_entry(copy)->vme_prev
		        = vm_map_copy_to_entry(new_copy);
		vm_map_copy_last_entry(copy)->vme_next
		        = vm_map_copy_to_entry(new_copy);
	}

	/*
	 * Change the old copy object into one that contains
	 * nothing to be deallocated.
	 * (A zero-size KERNEL_BUFFER copy is safe to discard:
	 * see vm_map_copy_discard().)
	 */
	bzero(copy, sizeof(struct vm_map_copy));
	copy->type = VM_MAP_COPY_KERNEL_BUFFER;

	/*
	 * Return the new object.
	 */
	return new_copy;
}
9000 
9001 #endif /* XNU_PLATFORM_MacOSX */
9002 
9003 static boolean_t
vm_map_entry_is_overwritable(vm_map_t dst_map __unused,vm_map_entry_t entry)9004 vm_map_entry_is_overwritable(
9005 	vm_map_t        dst_map __unused,
9006 	vm_map_entry_t  entry)
9007 {
9008 	if (!(entry->protection & VM_PROT_WRITE)) {
9009 		/* can't overwrite if not writable */
9010 		return FALSE;
9011 	}
9012 #if !__x86_64__
9013 	if (entry->used_for_jit &&
9014 	    vm_map_cs_enforcement(dst_map) &&
9015 	    !dst_map->cs_debugged) {
9016 		/*
9017 		 * Can't overwrite a JIT region while cs_enforced
9018 		 * and not cs_debugged.
9019 		 */
9020 		return FALSE;
9021 	}
9022 
9023 #if __arm64e__
9024 	/* Do not allow overwrite HW assisted TPRO entries */
9025 	if (entry->used_for_tpro) {
9026 		return FALSE;
9027 	}
9028 #endif /* __arm64e__ */
9029 
9030 	if (entry->vme_permanent) {
9031 		if (entry->is_sub_map) {
9032 			/*
9033 			 * We can't tell if the submap contains "permanent"
9034 			 * entries within the range targeted by the caller.
9035 			 * The caller will have to check for that with
9036 			 * vm_map_overwrite_submap_recurse() for example.
9037 			 */
9038 		} else {
9039 			/*
9040 			 * Do not allow overwriting of a "permanent"
9041 			 * entry.
9042 			 */
9043 			DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
9044 			    vm_map_entry_t, entry,
9045 			    vm_map_offset_t, entry->vme_start,
9046 			    vm_map_offset_t, entry->vme_end,
9047 			    vm_prot_t, entry->protection,
9048 			    vm_prot_t, entry->max_protection,
9049 			    int, VME_ALIAS(entry));
9050 			return FALSE;
9051 		}
9052 	}
9053 #endif /* !__x86_64__ */
9054 	return TRUE;
9055 }
9056 
/*
 *	Routine:	vm_map_overwrite_submap_recurse
 *
 *	Description:
 *		Verify that the range [dst_addr, dst_addr + dst_size) of
 *		"dst_map" can be overwritten: every entry covering the range
 *		must be writable and overwritable, the range must be
 *		contiguous, and submaps are checked recursively.  Waits for
 *		in-transition entries (restarting the scan afterwards).
 *		Returns KERN_SUCCESS, KERN_INVALID_ADDRESS,
 *		KERN_PROTECTION_FAILURE or KERN_FAILURE.
 *		Takes and releases the dst_map lock internally.
 */
static kern_return_t
vm_map_overwrite_submap_recurse(
	vm_map_t        dst_map,
	vm_map_offset_t dst_addr,
	vm_map_size_t   dst_size)
{
	vm_map_offset_t dst_end;
	vm_map_entry_t  tmp_entry;
	vm_map_entry_t  entry;
	kern_return_t   result;
	boolean_t       encountered_sub_map = FALSE;



	/*
	 *	Verify that the destination is all writeable
	 *	initially.  We have to trunc the destination
	 *	address and round the copy size or we'll end up
	 *	splitting entries in strange ways.
	 */

	dst_end = vm_map_round_page(dst_addr + dst_size,
	    VM_MAP_PAGE_MASK(dst_map));
	vm_map_lock(dst_map);

start_pass_1:
	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
		vm_map_unlock(dst_map);
		return KERN_INVALID_ADDRESS;
	}

	vm_map_clip_start(dst_map,
	    tmp_entry,
	    vm_map_trunc_page(dst_addr,
	    VM_MAP_PAGE_MASK(dst_map)));
	if (tmp_entry->is_sub_map) {
		/* clipping did unnest if needed */
		assert(!tmp_entry->use_pmap);
	}

	/* walk the entries covering [dst_addr, dst_end) */
	for (entry = tmp_entry;;) {
		vm_map_entry_t  next;

		next = entry->vme_next;
		while (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;

			if (entry->in_transition) {
				/*
				 * Say that we are waiting, and wait for entry.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);

				goto start_pass_1;
			}

			encountered_sub_map = TRUE;
			sub_start = VME_OFFSET(entry);

			/*
			 * Compute the end of the overlap between this
			 * submap entry and the requested range, expressed
			 * in submap coordinates.
			 */
			if (entry->vme_end < dst_end) {
				sub_end = entry->vme_end;
			} else {
				sub_end = dst_end;
			}
			sub_end -= entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			/* drop the lock across the recursive check */
			vm_map_unlock(dst_map);

			result = vm_map_overwrite_submap_recurse(
				VME_SUBMAP(entry),
				sub_start,
				sub_end - sub_start);

			if (result != KERN_SUCCESS) {
				return result;
			}
			if (dst_end <= entry->vme_end) {
				/* range fully covered by this submap entry */
				return KERN_SUCCESS;
			}
			/*
			 * Re-take the lock and re-lookup where the submap
			 * entry ended: the map may have changed while
			 * unlocked.
			 */
			vm_map_lock(dst_map);
			if (!vm_map_lookup_entry(dst_map, local_end,
			    &tmp_entry)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}
			entry = tmp_entry;
			next = entry->vme_next;
		}

		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/* JIT/TPRO/permanent entries etc. may not be overwritten */
		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 *	If the entry is in transition, we must wait
		 *	for it to exit that state.  Anything could happen
		 *	when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto start_pass_1;
		}

/*
 *		our range is contained completely within this map entry
 */
		if (dst_end <= entry->vme_end) {
			vm_map_unlock(dst_map);
			return KERN_SUCCESS;
		}
/*
 *		check that range specified is contiguous region
 */
		if ((next == vm_map_to_entry(dst_map)) ||
		    (next->vme_start != entry->vme_end)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/*
		 *	Check for permanent objects in the destination.
		 *	Only an error if we also crossed a submap on the
		 *	way here.
		 */
		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
		    ((!VME_OBJECT(entry)->internal) ||
		    (VME_OBJECT(entry)->true_share))) {
			if (encountered_sub_map) {
				vm_map_unlock(dst_map);
				return KERN_FAILURE;
			}
		}


		entry = next;
	}/* for */
	/* NOTREACHED: the loop only exits via return statements above */
	vm_map_unlock(dst_map);
	return KERN_SUCCESS;
}
9209 
9210 /*
9211  *	Routine:	vm_map_copy_overwrite
9212  *
9213  *	Description:
9214  *		Copy the memory described by the map copy
9215  *		object (copy; returned by vm_map_copyin) onto
9216  *		the specified destination region (dst_map, dst_addr).
9217  *		The destination must be writeable.
9218  *
9219  *		Unlike vm_map_copyout, this routine actually
9220  *		writes over previously-mapped memory.  If the
9221  *		previous mapping was to a permanent (user-supplied)
9222  *		memory object, it is preserved.
9223  *
9224  *		The attributes (protection and inheritance) of the
9225  *		destination region are preserved.
9226  *
9227  *		If successful, consumes the copy object.
9228  *		Otherwise, the caller is responsible for it.
9229  *
9230  *	Implementation notes:
9231  *		To overwrite aligned temporary virtual memory, it is
9232  *		sufficient to remove the previous mapping and insert
9233  *		the new copy.  This replacement is done either on
9234  *		the whole region (if no permanent virtual memory
9235  *		objects are embedded in the destination region) or
9236  *		in individual map entries.
9237  *
 *		To overwrite permanent virtual memory, it is necessary
9239  *		to copy each page, as the external memory management
9240  *		interface currently does not provide any optimizations.
9241  *
9242  *		Unaligned memory also has to be copied.  It is possible
9243  *		to use 'vm_trickery' to copy the aligned data.  This is
9244  *		not done but not hard to implement.
9245  *
9246  *		Once a page of permanent memory has been overwritten,
9247  *		it is impossible to interrupt this function; otherwise,
9248  *		the call would be neither atomic nor location-independent.
9249  *		The kernel-state portion of a user thread must be
9250  *		interruptible.
9251  *
9252  *		It may be expensive to forward all requests that might
9253  *		overwrite permanent memory (vm_write, vm_copy) to
9254  *		uninterruptible kernel threads.  This routine may be
9255  *		called by interruptible threads; however, success is
9256  *		not guaranteed -- if the request cannot be performed
9257  *		atomically and interruptibly, an error indication is
9258  *		returned.
9259  *
9260  *		Callers of this function must call vm_map_copy_require on
9261  *		previously created vm_map_copy_t or pass a newly created
9262  *		one to ensure that it hasn't been forged.
9263  */
9264 
9265 static kern_return_t
vm_map_copy_overwrite_nested(vm_map_t dst_map,vm_map_address_t dst_addr,vm_map_copy_t copy,boolean_t interruptible,pmap_t pmap,boolean_t discard_on_success)9266 vm_map_copy_overwrite_nested(
9267 	vm_map_t                dst_map,
9268 	vm_map_address_t        dst_addr,
9269 	vm_map_copy_t           copy,
9270 	boolean_t               interruptible,
9271 	pmap_t                  pmap,
9272 	boolean_t               discard_on_success)
9273 {
9274 	vm_map_offset_t         dst_end;
9275 	vm_map_entry_t          tmp_entry;
9276 	vm_map_entry_t          entry;
9277 	kern_return_t           kr;
9278 	boolean_t               aligned = TRUE;
9279 	boolean_t               contains_permanent_objects = FALSE;
9280 	boolean_t               encountered_sub_map = FALSE;
9281 	vm_map_offset_t         base_addr;
9282 	vm_map_size_t           copy_size;
9283 	vm_map_size_t           total_size;
9284 	uint16_t                copy_page_shift;
9285 
9286 	/*
9287 	 *	Check for special kernel buffer allocated
9288 	 *	by new_ipc_kmsg_copyin.
9289 	 */
9290 
9291 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9292 		return vm_map_copyout_kernel_buffer(
9293 			dst_map, &dst_addr,
9294 			copy, copy->size, TRUE, discard_on_success);
9295 	}
9296 
9297 	/*
9298 	 *      Only works for entry lists at the moment.  Will
9299 	 *	support page lists later.
9300 	 */
9301 
9302 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9303 
9304 	if (copy->size == 0) {
9305 		if (discard_on_success) {
9306 			vm_map_copy_discard(copy);
9307 		}
9308 		return KERN_SUCCESS;
9309 	}
9310 
9311 	copy_page_shift = copy->cpy_hdr.page_shift;
9312 
9313 	/*
9314 	 *	Verify that the destination is all writeable
9315 	 *	initially.  We have to trunc the destination
9316 	 *	address and round the copy size or we'll end up
9317 	 *	splitting entries in strange ways.
9318 	 */
9319 
9320 	if (!VM_MAP_PAGE_ALIGNED(copy->size,
9321 	    VM_MAP_PAGE_MASK(dst_map)) ||
9322 	    !VM_MAP_PAGE_ALIGNED(copy->offset,
9323 	    VM_MAP_PAGE_MASK(dst_map)) ||
9324 	    !VM_MAP_PAGE_ALIGNED(dst_addr,
9325 	    VM_MAP_PAGE_MASK(dst_map)) ||
9326 	    copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9327 		aligned = FALSE;
9328 		dst_end = vm_map_round_page(dst_addr + copy->size,
9329 		    VM_MAP_PAGE_MASK(dst_map));
9330 	} else {
9331 		dst_end = dst_addr + copy->size;
9332 	}
9333 
9334 	vm_map_lock(dst_map);
9335 
9336 	/* LP64todo - remove this check when vm_map_commpage64()
9337 	 * no longer has to stuff in a map_entry for the commpage
9338 	 * above the map's max_offset.
9339 	 */
9340 	if (dst_addr >= dst_map->max_offset) {
9341 		vm_map_unlock(dst_map);
9342 		return KERN_INVALID_ADDRESS;
9343 	}
9344 
9345 start_pass_1:
9346 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9347 		vm_map_unlock(dst_map);
9348 		return KERN_INVALID_ADDRESS;
9349 	}
9350 	vm_map_clip_start(dst_map,
9351 	    tmp_entry,
9352 	    vm_map_trunc_page(dst_addr,
9353 	    VM_MAP_PAGE_MASK(dst_map)));
9354 	for (entry = tmp_entry;;) {
9355 		vm_map_entry_t  next = entry->vme_next;
9356 
9357 		while (entry->is_sub_map) {
9358 			vm_map_offset_t sub_start;
9359 			vm_map_offset_t sub_end;
9360 			vm_map_offset_t local_end;
9361 
9362 			if (entry->in_transition) {
9363 				/*
9364 				 * Say that we are waiting, and wait for entry.
9365 				 */
9366 				entry->needs_wakeup = TRUE;
9367 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9368 
9369 				goto start_pass_1;
9370 			}
9371 
9372 			local_end = entry->vme_end;
9373 			if (!(entry->needs_copy)) {
9374 				/* if needs_copy we are a COW submap */
9375 				/* in such a case we just replace so */
9376 				/* there is no need for the follow-  */
9377 				/* ing check.                        */
9378 				encountered_sub_map = TRUE;
9379 				sub_start = VME_OFFSET(entry);
9380 
9381 				if (entry->vme_end < dst_end) {
9382 					sub_end = entry->vme_end;
9383 				} else {
9384 					sub_end = dst_end;
9385 				}
9386 				sub_end -= entry->vme_start;
9387 				sub_end += VME_OFFSET(entry);
9388 				vm_map_unlock(dst_map);
9389 
9390 				kr = vm_map_overwrite_submap_recurse(
9391 					VME_SUBMAP(entry),
9392 					sub_start,
9393 					sub_end - sub_start);
9394 				if (kr != KERN_SUCCESS) {
9395 					return kr;
9396 				}
9397 				vm_map_lock(dst_map);
9398 			}
9399 
9400 			if (dst_end <= entry->vme_end) {
9401 				goto start_overwrite;
9402 			}
9403 			if (!vm_map_lookup_entry(dst_map, local_end,
9404 			    &entry)) {
9405 				vm_map_unlock(dst_map);
9406 				return KERN_INVALID_ADDRESS;
9407 			}
9408 			next = entry->vme_next;
9409 		}
9410 
9411 		if (!(entry->protection & VM_PROT_WRITE)) {
9412 			vm_map_unlock(dst_map);
9413 			return KERN_PROTECTION_FAILURE;
9414 		}
9415 
9416 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9417 			vm_map_unlock(dst_map);
9418 			return KERN_PROTECTION_FAILURE;
9419 		}
9420 
9421 		/*
9422 		 *	If the entry is in transition, we must wait
9423 		 *	for it to exit that state.  Anything could happen
9424 		 *	when we unlock the map, so start over.
9425 		 */
9426 		if (entry->in_transition) {
9427 			/*
9428 			 * Say that we are waiting, and wait for entry.
9429 			 */
9430 			entry->needs_wakeup = TRUE;
9431 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9432 
9433 			goto start_pass_1;
9434 		}
9435 
9436 /*
9437  *		our range is contained completely within this map entry
9438  */
9439 		if (dst_end <= entry->vme_end) {
9440 			break;
9441 		}
9442 /*
9443  *		check that range specified is contiguous region
9444  */
9445 		if ((next == vm_map_to_entry(dst_map)) ||
9446 		    (next->vme_start != entry->vme_end)) {
9447 			vm_map_unlock(dst_map);
9448 			return KERN_INVALID_ADDRESS;
9449 		}
9450 
9451 
9452 		/*
9453 		 *	Check for permanent objects in the destination.
9454 		 */
9455 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9456 		    ((!VME_OBJECT(entry)->internal) ||
9457 		    (VME_OBJECT(entry)->true_share))) {
9458 			contains_permanent_objects = TRUE;
9459 		}
9460 
9461 		entry = next;
9462 	}/* for */
9463 
9464 start_overwrite:
9465 	/*
9466 	 *	If there are permanent objects in the destination, then
9467 	 *	the copy cannot be interrupted.
9468 	 */
9469 
9470 	if (interruptible && contains_permanent_objects) {
9471 		vm_map_unlock(dst_map);
9472 		return KERN_FAILURE;   /* XXX */
9473 	}
9474 
9475 	/*
9476 	 *
9477 	 *	Make a second pass, overwriting the data
9478 	 *	At the beginning of each loop iteration,
9479 	 *	the next entry to be overwritten is "tmp_entry"
9480 	 *	(initially, the value returned from the lookup above),
9481 	 *	and the starting address expected in that entry
9482 	 *	is "start".
9483 	 */
9484 
9485 	total_size = copy->size;
9486 	if (encountered_sub_map) {
9487 		copy_size = 0;
9488 		/* re-calculate tmp_entry since we've had the map */
9489 		/* unlocked */
9490 		if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9491 			vm_map_unlock(dst_map);
9492 			return KERN_INVALID_ADDRESS;
9493 		}
9494 	} else {
9495 		copy_size = copy->size;
9496 	}
9497 
9498 	base_addr = dst_addr;
9499 	while (TRUE) {
9500 		/* deconstruct the copy object and do in parts */
		/* only in sub_map, interruptible case */
9502 		vm_map_entry_t  copy_entry;
9503 		vm_map_entry_t  previous_prev = VM_MAP_ENTRY_NULL;
9504 		vm_map_entry_t  next_copy = VM_MAP_ENTRY_NULL;
9505 		int             nentries;
9506 		int             remaining_entries = 0;
9507 		vm_map_offset_t new_offset = 0;
9508 
9509 		for (entry = tmp_entry; copy_size == 0;) {
9510 			vm_map_entry_t  next;
9511 
9512 			next = entry->vme_next;
9513 
9514 			/* tmp_entry and base address are moved along */
9515 			/* each time we encounter a sub-map.  Otherwise */
			/* entry can outpace tmp_entry, and the copy_size */
9517 			/* may reflect the distance between them */
9518 			/* if the current entry is found to be in transition */
9519 			/* we will start over at the beginning or the last */
9520 			/* encounter of a submap as dictated by base_addr */
9521 			/* we will zero copy_size accordingly. */
9522 			if (entry->in_transition) {
9523 				/*
9524 				 * Say that we are waiting, and wait for entry.
9525 				 */
9526 				entry->needs_wakeup = TRUE;
9527 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9528 
9529 				if (!vm_map_lookup_entry(dst_map, base_addr,
9530 				    &tmp_entry)) {
9531 					vm_map_unlock(dst_map);
9532 					return KERN_INVALID_ADDRESS;
9533 				}
9534 				copy_size = 0;
9535 				entry = tmp_entry;
9536 				continue;
9537 			}
9538 			if (entry->is_sub_map) {
9539 				vm_map_offset_t sub_start;
9540 				vm_map_offset_t sub_end;
9541 				vm_map_offset_t local_end;
9542 
9543 				if (entry->needs_copy) {
9544 					/* if this is a COW submap */
9545 					/* just back the range with a */
9546 					/* anonymous entry */
9547 					assert(!entry->vme_permanent);
9548 					if (entry->vme_end < dst_end) {
9549 						sub_end = entry->vme_end;
9550 					} else {
9551 						sub_end = dst_end;
9552 					}
9553 					if (entry->vme_start < base_addr) {
9554 						sub_start = base_addr;
9555 					} else {
9556 						sub_start = entry->vme_start;
9557 					}
9558 					vm_map_clip_end(
9559 						dst_map, entry, sub_end);
9560 					vm_map_clip_start(
9561 						dst_map, entry, sub_start);
9562 					assert(!entry->use_pmap);
9563 					assert(!entry->iokit_acct);
9564 					entry->use_pmap = TRUE;
9565 					vm_map_deallocate(VME_SUBMAP(entry));
9566 					assert(!entry->vme_permanent);
9567 					VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9568 					VME_OFFSET_SET(entry, 0);
9569 					entry->is_shared = FALSE;
9570 					entry->needs_copy = FALSE;
9571 					entry->protection = VM_PROT_DEFAULT;
9572 					entry->max_protection = VM_PROT_ALL;
9573 					entry->wired_count = 0;
9574 					entry->user_wired_count = 0;
9575 					if (entry->inheritance
9576 					    == VM_INHERIT_SHARE) {
9577 						entry->inheritance = VM_INHERIT_COPY;
9578 					}
9579 					continue;
9580 				}
9581 				/* first take care of any non-sub_map */
9582 				/* entries to send */
9583 				if (base_addr < entry->vme_start) {
9584 					/* stuff to send */
9585 					copy_size =
9586 					    entry->vme_start - base_addr;
9587 					break;
9588 				}
9589 				sub_start = VME_OFFSET(entry);
9590 
9591 				if (entry->vme_end < dst_end) {
9592 					sub_end = entry->vme_end;
9593 				} else {
9594 					sub_end = dst_end;
9595 				}
9596 				sub_end -= entry->vme_start;
9597 				sub_end += VME_OFFSET(entry);
9598 				local_end = entry->vme_end;
9599 				vm_map_unlock(dst_map);
9600 				copy_size = sub_end - sub_start;
9601 
9602 				/* adjust the copy object */
9603 				if (total_size > copy_size) {
9604 					vm_map_size_t   local_size = 0;
9605 					vm_map_size_t   entry_size;
9606 
9607 					nentries = 1;
9608 					new_offset = copy->offset;
9609 					copy_entry = vm_map_copy_first_entry(copy);
9610 					while (copy_entry !=
9611 					    vm_map_copy_to_entry(copy)) {
9612 						entry_size = copy_entry->vme_end -
9613 						    copy_entry->vme_start;
9614 						if ((local_size < copy_size) &&
9615 						    ((local_size + entry_size)
9616 						    >= copy_size)) {
9617 							vm_map_copy_clip_end(copy,
9618 							    copy_entry,
9619 							    copy_entry->vme_start +
9620 							    (copy_size - local_size));
9621 							entry_size = copy_entry->vme_end -
9622 							    copy_entry->vme_start;
9623 							local_size += entry_size;
9624 							new_offset += entry_size;
9625 						}
9626 						if (local_size >= copy_size) {
9627 							next_copy = copy_entry->vme_next;
9628 							copy_entry->vme_next =
9629 							    vm_map_copy_to_entry(copy);
9630 							previous_prev =
9631 							    copy->cpy_hdr.links.prev;
9632 							copy->cpy_hdr.links.prev = copy_entry;
9633 							copy->size = copy_size;
9634 							remaining_entries =
9635 							    copy->cpy_hdr.nentries;
9636 							remaining_entries -= nentries;
9637 							copy->cpy_hdr.nentries = nentries;
9638 							break;
9639 						} else {
9640 							local_size += entry_size;
9641 							new_offset += entry_size;
9642 							nentries++;
9643 						}
9644 						copy_entry = copy_entry->vme_next;
9645 					}
9646 				}
9647 
9648 				if ((entry->use_pmap) && (pmap == NULL)) {
9649 					kr = vm_map_copy_overwrite_nested(
9650 						VME_SUBMAP(entry),
9651 						sub_start,
9652 						copy,
9653 						interruptible,
9654 						VME_SUBMAP(entry)->pmap,
9655 						TRUE);
9656 				} else if (pmap != NULL) {
9657 					kr = vm_map_copy_overwrite_nested(
9658 						VME_SUBMAP(entry),
9659 						sub_start,
9660 						copy,
9661 						interruptible, pmap,
9662 						TRUE);
9663 				} else {
9664 					kr = vm_map_copy_overwrite_nested(
9665 						VME_SUBMAP(entry),
9666 						sub_start,
9667 						copy,
9668 						interruptible,
9669 						dst_map->pmap,
9670 						TRUE);
9671 				}
9672 				if (kr != KERN_SUCCESS) {
9673 					if (next_copy != NULL) {
9674 						copy->cpy_hdr.nentries +=
9675 						    remaining_entries;
9676 						copy->cpy_hdr.links.prev->vme_next =
9677 						    next_copy;
9678 						copy->cpy_hdr.links.prev
9679 						        = previous_prev;
9680 						copy->size = total_size;
9681 					}
9682 					return kr;
9683 				}
9684 				if (dst_end <= local_end) {
9685 					return KERN_SUCCESS;
9686 				}
9687 				/* otherwise copy no longer exists, it was */
9688 				/* destroyed after successful copy_overwrite */
9689 				copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9690 				copy->offset = new_offset;
9691 				copy->cpy_hdr.page_shift = copy_page_shift;
9692 
9693 				total_size -= copy_size;
9694 				copy_size = 0;
9695 				/* put back remainder of copy in container */
9696 				if (next_copy != NULL) {
9697 					copy->cpy_hdr.nentries = remaining_entries;
9698 					copy->cpy_hdr.links.next = next_copy;
9699 					copy->cpy_hdr.links.prev = previous_prev;
9700 					copy->size = total_size;
9701 					next_copy->vme_prev =
9702 					    vm_map_copy_to_entry(copy);
9703 					next_copy = NULL;
9704 				}
9705 				base_addr = local_end;
9706 				vm_map_lock(dst_map);
9707 				if (!vm_map_lookup_entry(dst_map,
9708 				    local_end, &tmp_entry)) {
9709 					vm_map_unlock(dst_map);
9710 					return KERN_INVALID_ADDRESS;
9711 				}
9712 				entry = tmp_entry;
9713 				continue;
9714 			}
9715 			if (dst_end <= entry->vme_end) {
9716 				copy_size = dst_end - base_addr;
9717 				break;
9718 			}
9719 
9720 			if ((next == vm_map_to_entry(dst_map)) ||
9721 			    (next->vme_start != entry->vme_end)) {
9722 				vm_map_unlock(dst_map);
9723 				return KERN_INVALID_ADDRESS;
9724 			}
9725 
9726 			entry = next;
9727 		}/* for */
9728 
9729 		next_copy = NULL;
9730 		nentries = 1;
9731 
9732 		/* adjust the copy object */
9733 		if (total_size > copy_size) {
9734 			vm_map_size_t   local_size = 0;
9735 			vm_map_size_t   entry_size;
9736 
9737 			new_offset = copy->offset;
9738 			copy_entry = vm_map_copy_first_entry(copy);
9739 			while (copy_entry != vm_map_copy_to_entry(copy)) {
9740 				entry_size = copy_entry->vme_end -
9741 				    copy_entry->vme_start;
9742 				if ((local_size < copy_size) &&
9743 				    ((local_size + entry_size)
9744 				    >= copy_size)) {
9745 					vm_map_copy_clip_end(copy, copy_entry,
9746 					    copy_entry->vme_start +
9747 					    (copy_size - local_size));
9748 					entry_size = copy_entry->vme_end -
9749 					    copy_entry->vme_start;
9750 					local_size += entry_size;
9751 					new_offset += entry_size;
9752 				}
9753 				if (local_size >= copy_size) {
9754 					next_copy = copy_entry->vme_next;
9755 					copy_entry->vme_next =
9756 					    vm_map_copy_to_entry(copy);
9757 					previous_prev =
9758 					    copy->cpy_hdr.links.prev;
9759 					copy->cpy_hdr.links.prev = copy_entry;
9760 					copy->size = copy_size;
9761 					remaining_entries =
9762 					    copy->cpy_hdr.nentries;
9763 					remaining_entries -= nentries;
9764 					copy->cpy_hdr.nentries = nentries;
9765 					break;
9766 				} else {
9767 					local_size += entry_size;
9768 					new_offset += entry_size;
9769 					nentries++;
9770 				}
9771 				copy_entry = copy_entry->vme_next;
9772 			}
9773 		}
9774 
9775 		if (aligned) {
9776 			pmap_t  local_pmap;
9777 
9778 			if (pmap) {
9779 				local_pmap = pmap;
9780 			} else {
9781 				local_pmap = dst_map->pmap;
9782 			}
9783 
9784 			if ((kr =  vm_map_copy_overwrite_aligned(
9785 				    dst_map, tmp_entry, copy,
9786 				    base_addr, local_pmap)) != KERN_SUCCESS) {
9787 				if (next_copy != NULL) {
9788 					copy->cpy_hdr.nentries +=
9789 					    remaining_entries;
9790 					copy->cpy_hdr.links.prev->vme_next =
9791 					    next_copy;
9792 					copy->cpy_hdr.links.prev =
9793 					    previous_prev;
9794 					copy->size += copy_size;
9795 				}
9796 				return kr;
9797 			}
9798 			vm_map_unlock(dst_map);
9799 		} else {
9800 			/*
9801 			 * Performance gain:
9802 			 *
9803 			 * if the copy and dst address are misaligned but the same
9804 			 * offset within the page we can copy_not_aligned the
9805 			 * misaligned parts and copy aligned the rest.  If they are
9806 			 * aligned but len is unaligned we simply need to copy
9807 			 * the end bit unaligned.  We'll need to split the misaligned
9808 			 * bits of the region in this case !
9809 			 */
9810 			/* ALWAYS UNLOCKS THE dst_map MAP */
9811 			kr = vm_map_copy_overwrite_unaligned(
9812 				dst_map,
9813 				tmp_entry,
9814 				copy,
9815 				base_addr,
9816 				discard_on_success);
9817 			if (kr != KERN_SUCCESS) {
9818 				if (next_copy != NULL) {
9819 					copy->cpy_hdr.nentries +=
9820 					    remaining_entries;
9821 					copy->cpy_hdr.links.prev->vme_next =
9822 					    next_copy;
9823 					copy->cpy_hdr.links.prev =
9824 					    previous_prev;
9825 					copy->size += copy_size;
9826 				}
9827 				return kr;
9828 			}
9829 		}
9830 		total_size -= copy_size;
9831 		if (total_size == 0) {
9832 			break;
9833 		}
9834 		base_addr += copy_size;
9835 		copy_size = 0;
9836 		copy->offset = new_offset;
9837 		if (next_copy != NULL) {
9838 			copy->cpy_hdr.nentries = remaining_entries;
9839 			copy->cpy_hdr.links.next = next_copy;
9840 			copy->cpy_hdr.links.prev = previous_prev;
9841 			next_copy->vme_prev = vm_map_copy_to_entry(copy);
9842 			copy->size = total_size;
9843 		}
9844 		vm_map_lock(dst_map);
9845 		while (TRUE) {
9846 			if (!vm_map_lookup_entry(dst_map,
9847 			    base_addr, &tmp_entry)) {
9848 				vm_map_unlock(dst_map);
9849 				return KERN_INVALID_ADDRESS;
9850 			}
9851 			if (tmp_entry->in_transition) {
9852 				entry->needs_wakeup = TRUE;
9853 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9854 			} else {
9855 				break;
9856 			}
9857 		}
9858 		vm_map_clip_start(dst_map,
9859 		    tmp_entry,
9860 		    vm_map_trunc_page(base_addr,
9861 		    VM_MAP_PAGE_MASK(dst_map)));
9862 
9863 		entry = tmp_entry;
9864 	} /* while */
9865 
9866 	/*
9867 	 *	Throw away the vm_map_copy object
9868 	 */
9869 	if (discard_on_success) {
9870 		vm_map_copy_discard(copy);
9871 	}
9872 
9873 	return KERN_SUCCESS;
9874 }/* vm_map_copy_overwrite */
9875 
/*
 *	Routine:	vm_map_copy_overwrite
 *
 *	Description:
 *		Overwrite "copy_size" bytes at "dst_addr" in "dst_map" with
 *		the data carried by the entry-list "copy" map.
 *
 *		If source and destination share the same mis-alignment with
 *		respect to the effective page size and the copy is large
 *		enough to be worth it, the work is split into an unaligned
 *		"head" (up to the first page boundary), a page-aligned
 *		middle, and an unaligned "tail", each handed to
 *		vm_map_copy_overwrite_nested().  Otherwise the whole thing
 *		is done in a single "blunt" nested call.
 *
 *		On success, "copy" (and any head/tail fragments split off of
 *		it) are discarded.  On failure, the fragments are re-linked
 *		into "copy" so the caller still owns the original copy map.
 */
kern_return_t
vm_map_copy_overwrite(
	vm_map_t        dst_map,
	vm_map_offset_t dst_addr,
	vm_map_copy_t   copy,
	vm_map_size_t   copy_size,
	boolean_t       interruptible)
{
	vm_map_size_t   head_size, tail_size;
	vm_map_copy_t   head_copy, tail_copy;
	vm_map_offset_t head_addr, tail_addr;
	vm_map_entry_t  entry;
	kern_return_t   kr;
	vm_map_offset_t effective_page_mask, effective_page_size;
	uint16_t        copy_page_shift;

	head_size = 0;
	tail_size = 0;
	head_copy = NULL;
	tail_copy = NULL;
	head_addr = 0;
	tail_addr = 0;

	/*
	 *	Check for null copy object.
	 */
	if (copy == VM_MAP_COPY_NULL) {
		return KERN_SUCCESS;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	if (interruptible ||
	    copy->type != VM_MAP_COPY_ENTRY_LIST) {
		/*
		 * We can't split the "copy" map if we're interruptible
		 * or if we don't have a "copy" map...
		 */
blunt_copy:
		return vm_map_copy_overwrite_nested(dst_map,
		           dst_addr,
		           copy,
		           interruptible,
		           (pmap_t) NULL,
		           TRUE);
	}

	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
	if (copy_page_shift < PAGE_SHIFT ||
	    VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
		goto blunt_copy;
	}

	/*
	 * NOTE(review): the "true" arm below looks unreachable -- the
	 * check just above already jumps to blunt_copy whenever
	 * VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT.  Confirm before
	 * relying on that branch.
	 */
	if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
		effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
	} else {
		/* use the coarsest page mask of dst_map, copy and kernel */
		effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
		effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
		    effective_page_mask);
	}
	effective_page_size = effective_page_mask + 1;

	if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
		/*
		 * Too small to bother with optimizing...
		 */
		goto blunt_copy;
	}

	if ((dst_addr & effective_page_mask) !=
	    (copy->offset & effective_page_mask)) {
		/*
		 * Incompatible mis-alignment of source and destination...
		 */
		goto blunt_copy;
	}

	/*
	 * Proper alignment or identical mis-alignment at the beginning.
	 * Let's try and do a small unaligned copy first (if needed)
	 * and then an aligned copy for the rest.
	 */
	if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
		/* head: from dst_addr up to the next page boundary */
		head_addr = dst_addr;
		head_size = (effective_page_size -
		    (copy->offset & effective_page_mask));
		head_size = MIN(head_size, copy_size);
	}
	if (!vm_map_page_aligned(copy->offset + copy_size,
	    effective_page_mask)) {
		/*
		 * Mis-alignment at the end.
		 * Do an aligned copy up to the last page and
		 * then an unaligned copy for the remaining bytes.
		 */
		tail_size = ((copy->offset + copy_size) &
		    effective_page_mask);
		tail_size = MIN(tail_size, copy_size);
		tail_addr = dst_addr + copy_size - tail_size;
		assert(tail_addr >= head_addr + head_size);
	}
	assert(head_size + tail_size <= copy_size);

	if (head_size + tail_size == copy_size) {
		/*
		 * It's all unaligned, no optimization possible...
		 */
		goto blunt_copy;
	}

	/*
	 * Can't optimize if there are any submaps in the
	 * destination due to the way we free the "copy" map
	 * progressively in vm_map_copy_overwrite_nested()
	 * in that case.
	 */
	vm_map_lock_read(dst_map);
	if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
		vm_map_unlock_read(dst_map);
		goto blunt_copy;
	}
	for (;
	    (entry != vm_map_to_entry(dst_map) &&
	    entry->vme_start < dst_addr + copy_size);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			vm_map_unlock_read(dst_map);
			goto blunt_copy;
		}
	}
	vm_map_unlock_read(dst_map);

	if (head_size) {
		/*
		 * Unaligned copy of the first "head_size" bytes, to reach
		 * a page boundary.
		 */

		/*
		 * Extract "head_copy" out of "copy".
		 */
		head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
		head_copy->cpy_hdr.entries_pageable =
		    copy->cpy_hdr.entries_pageable;
		head_copy->cpy_hdr.page_shift = copy_page_shift;

		/* the head can't extend past the first source entry */
		entry = vm_map_copy_first_entry(copy);
		if (entry->vme_end < copy->offset + head_size) {
			head_size = entry->vme_end - copy->offset;
		}

		head_copy->offset = copy->offset;
		head_copy->size = head_size;
		copy->offset += head_size;
		copy->size -= head_size;
		copy_size -= head_size;
		assert(copy_size > 0);

		/* move the head's portion of the first entry to head_copy */
		vm_map_copy_clip_end(copy, entry, copy->offset);
		vm_map_copy_entry_unlink(copy, entry);
		vm_map_copy_entry_link(head_copy,
		    vm_map_copy_to_entry(head_copy),
		    entry);

		/*
		 * Do the unaligned copy.
		 */
		kr = vm_map_copy_overwrite_nested(dst_map,
		    head_addr,
		    head_copy,
		    interruptible,
		    (pmap_t) NULL,
		    FALSE);
		if (kr != KERN_SUCCESS) {
			goto done;
		}
	}

	if (tail_size) {
		/*
		 * Extract "tail_copy" out of "copy".
		 */
		tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
		tail_copy->cpy_hdr.entries_pageable =
		    copy->cpy_hdr.entries_pageable;
		tail_copy->cpy_hdr.page_shift = copy_page_shift;

		tail_copy->offset = copy->offset + copy_size - tail_size;
		tail_copy->size = tail_size;

		copy->size -= tail_size;
		copy_size -= tail_size;
		assert(copy_size > 0);

		/* move the tail's portion of the last entry to tail_copy */
		entry = vm_map_copy_last_entry(copy);
		vm_map_copy_clip_start(copy, entry, tail_copy->offset);
		entry = vm_map_copy_last_entry(copy);
		vm_map_copy_entry_unlink(copy, entry);
		vm_map_copy_entry_link(tail_copy,
		    vm_map_copy_last_entry(tail_copy),
		    entry);
	}

	/*
	 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
	 * we want to avoid TOCTOU issues w.r.t copy->size but
	 * we don't need to change vm_map_copy_overwrite_nested()
	 * and all other vm_map_copy_overwrite variants.
	 *
	 * So we assign the original copy_size that was passed into
	 * this routine back to copy.
	 *
	 * This use of local 'copy_size' passed into this routine is
	 * to try and protect against TOCTOU attacks where the kernel
	 * has been exploited. We don't expect this to be an issue
	 * during normal system operation.
	 */
	assertf(copy->size == copy_size,
	    "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
	copy->size = copy_size;

	/*
	 * Copy most (or possibly all) of the data.
	 */
	kr = vm_map_copy_overwrite_nested(dst_map,
	    dst_addr + head_size,
	    copy,
	    interruptible,
	    (pmap_t) NULL,
	    FALSE);
	if (kr != KERN_SUCCESS) {
		goto done;
	}

	if (tail_size) {
		kr = vm_map_copy_overwrite_nested(dst_map,
		    tail_addr,
		    tail_copy,
		    interruptible,
		    (pmap_t) NULL,
		    FALSE);
	}

done:
	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
	if (kr == KERN_SUCCESS) {
		/*
		 * Discard all the copy maps.
		 */
		if (head_copy) {
			vm_map_copy_discard(head_copy);
			head_copy = NULL;
		}
		vm_map_copy_discard(copy);
		if (tail_copy) {
			vm_map_copy_discard(tail_copy);
			tail_copy = NULL;
		}
	} else {
		/*
		 * Re-assemble the original copy map.
		 */
		if (head_copy) {
			/* re-link the head entry at the front of "copy" */
			entry = vm_map_copy_first_entry(head_copy);
			vm_map_copy_entry_unlink(head_copy, entry);
			vm_map_copy_entry_link(copy,
			    vm_map_copy_to_entry(copy),
			    entry);
			copy->offset -= head_size;
			copy->size += head_size;
			vm_map_copy_discard(head_copy);
			head_copy = NULL;
		}
		if (tail_copy) {
			/* re-link the tail entry at the back of "copy" */
			entry = vm_map_copy_last_entry(tail_copy);
			vm_map_copy_entry_unlink(tail_copy, entry);
			vm_map_copy_entry_link(copy,
			    vm_map_copy_last_entry(copy),
			    entry);
			copy->size += tail_size;
			vm_map_copy_discard(tail_copy);
			tail_copy = NULL;
		}
	}
	return kr;
}
10166 
10167 
10168 /*
10169  *	Routine: vm_map_copy_overwrite_unaligned	[internal use only]
10170  *
 *	Description:
 *	Physically copy unaligned data
 *
 *	Implementation:
 *	Unaligned parts of pages have to be physically copied.  We use
 *	a modified form of vm_fault_copy (which understands non-aligned
 *	page offsets and sizes) to do the copy.  We attempt to copy as
 *	much memory in one go as possible, however vm_fault_copy copies
 *	within 1 memory object so we have to find the smaller of "amount left",
 *	"source object data size" and "target object data size".  With
 *	unaligned data we don't need to split regions, therefore the source
 *	(copy) object should be one map entry, the target range may be split
 *	over multiple map entries however.  In any event we are pessimistic
 *	about these assumptions.
10185  *
10186  *	Callers of this function must call vm_map_copy_require on
10187  *	previously created vm_map_copy_t or pass a newly created
10188  *	one to ensure that it hasn't been forged.
10189  *
10190  *	Assumptions:
10191  *	dst_map is locked on entry and is return locked on success,
10192  *	unlocked on error.
10193  */
10194 
static kern_return_t
vm_map_copy_overwrite_unaligned(
	vm_map_t        dst_map,
	vm_map_entry_t  entry,
	vm_map_copy_t   copy,
	vm_map_offset_t start,
	boolean_t       discard_on_success)
{
	vm_map_entry_t          copy_entry;
	vm_map_entry_t          copy_entry_next;
	vm_map_version_t        version;       /* map timestamp snapshot, to detect changes while unlocked */
	vm_object_t             dst_object;
	vm_object_offset_t      dst_offset;    /* offset of "start" within the current dst entry */
	vm_object_offset_t      src_offset;    /* offset consumed so far within the current copy entry */
	vm_object_offset_t      entry_offset;
	vm_map_offset_t         entry_end;
	vm_map_size_t           src_size,
	    dst_size,
	    copy_size,
	    amount_left;                       /* total bytes still to be copied */
	kern_return_t           kr = KERN_SUCCESS;


	copy_entry = vm_map_copy_first_entry(copy);

	/* entered with dst_map write-locked; downgrade to read for the copy loop */
	vm_map_lock_write_to_read(dst_map);

	src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
	amount_left = copy->size;
/*
 *	unaligned so we never clipped this entry, we need the offset into
 *	the vm_object not just the data.
 */
	while (amount_left > 0) {
		if (entry == vm_map_to_entry(dst_map)) {
			vm_map_unlock_read(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/* "start" must be within the current map entry */
		assert((start >= entry->vme_start) && (start < entry->vme_end));

		/*
		 *	Check protection again
		 */
		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock_read(dst_map);
			return KERN_PROTECTION_FAILURE;
		}
		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock_read(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		dst_offset = start - entry->vme_start;

		dst_size = entry->vme_end - start;

		src_size = copy_entry->vme_end -
		    (copy_entry->vme_start + src_offset);

		if (dst_size < src_size) {
/*
 *			we can only copy dst_size bytes before
 *			we have to get the next destination entry
 */
			copy_size = dst_size;
		} else {
/*
 *			we can only copy src_size bytes before
 *			we have to get the next source copy entry
 */
			copy_size = src_size;
		}

		if (copy_size > amount_left) {
			copy_size = amount_left;
		}
/*
 *		Entry needs copy, create a shadow object for the
 *		copy-on-write region.
 */
		if (entry->needs_copy) {
			/* upgrade to write lock; if it was dropped, re-validate the entry */
			if (vm_map_lock_read_to_write(dst_map)) {
				vm_map_lock_read(dst_map);
				goto RetryLookup;
			}
			VME_OBJECT_SHADOW(entry,
			    (vm_map_size_t)(entry->vme_end
			    - entry->vme_start),
			    vm_map_always_shadow(dst_map));
			entry->needs_copy = FALSE;
			vm_map_lock_write_to_read(dst_map);
		}
		dst_object = VME_OBJECT(entry);
/*
 *		unlike with the virtual (aligned) copy we're going
 *		to fault on it therefore we need a target object.
 */
		if (dst_object == VM_OBJECT_NULL) {
			if (vm_map_lock_read_to_write(dst_map)) {
				vm_map_lock_read(dst_map);
				goto RetryLookup;
			}
			dst_object = vm_object_allocate((vm_map_size_t)
			    entry->vme_end - entry->vme_start);
			VME_OBJECT_SET(entry, dst_object, false, 0);
			VME_OFFSET_SET(entry, 0);
			assert(entry->use_pmap);
			vm_map_lock_write_to_read(dst_map);
		}
/*
 *		Take an object reference and unlock map. The "entry" may
 *		disappear or change when the map is unlocked.
 */
		vm_object_reference(dst_object);
		/* snapshot what we need from "entry" before dropping the lock */
		version.main_timestamp = dst_map->timestamp;
		entry_offset = VME_OFFSET(entry);
		entry_end = entry->vme_end;
		vm_map_unlock_read(dst_map);
/*
 *		Copy as much as possible in one pass
 */
		kr = vm_fault_copy(
			VME_OBJECT(copy_entry),
			VME_OFFSET(copy_entry) + src_offset,
			&copy_size,
			dst_object,
			entry_offset + dst_offset,
			dst_map,
			&version,
			THREAD_UNINT );

		/* vm_fault_copy updates copy_size to what was actually copied */
		start += copy_size;
		src_offset += copy_size;
		amount_left -= copy_size;
/*
 *		Release the object reference
 */
		vm_object_deallocate(dst_object);
/*
 *		If a hard error occurred, return it now
 */
		if (kr != KERN_SUCCESS) {
			return kr;
		}

		if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
		    || amount_left == 0) {
/*
 *			all done with this copy entry, dispose.
 */
			copy_entry_next = copy_entry->vme_next;

			if (discard_on_success) {
				vm_map_copy_entry_unlink(copy, copy_entry);
				assert(!copy_entry->is_sub_map);
				vm_object_deallocate(VME_OBJECT(copy_entry));
				vm_map_copy_entry_dispose(copy_entry);
			}

			if (copy_entry_next == vm_map_copy_to_entry(copy) &&
			    amount_left) {
/*
 *				not finished copying but run out of source
 */
				return KERN_INVALID_ADDRESS;
			}

			copy_entry = copy_entry_next;

			src_offset = 0;
		}

		if (amount_left == 0) {
			return KERN_SUCCESS;
		}

		vm_map_lock_read(dst_map);
		if (version.main_timestamp == dst_map->timestamp) {
			/* map unchanged while unlocked: "entry" is still valid */
			if (start == entry_end) {
/*
 *				destination region is split.  Use the version
 *				information to avoid a lookup in the normal
 *				case.
 */
				entry = entry->vme_next;
/*
 *				should be contiguous. Fail if we encounter
 *				a hole in the destination.
 */
				if (start != entry->vme_start) {
					vm_map_unlock_read(dst_map);
					return KERN_INVALID_ADDRESS;
				}
			}
		} else {
/*
 *			Map version check failed.
 *			we must lookup the entry because somebody
 *			might have changed the map behind our backs.
 */
RetryLookup:
			if (!vm_map_lookup_entry(dst_map, start, &entry)) {
				vm_map_unlock_read(dst_map);
				return KERN_INVALID_ADDRESS;
			}
		}
	}/* while */

	return KERN_SUCCESS;
}/* vm_map_copy_overwrite_unaligned */
10407 
10408 /*
10409  *	Routine: vm_map_copy_overwrite_aligned	[internal use only]
10410  *
10411  *	Description:
10412  *	Does all the vm_trickery possible for whole pages.
10413  *
10414  *	Implementation:
10415  *
10416  *	If there are no permanent objects in the destination,
10417  *	and the source and destination map entry zones match,
10418  *	and the destination map entry is not shared,
10419  *	then the map entries can be deleted and replaced
10420  *	with those from the copy.  The following code is the
10421  *	basic idea of what to do, but there are lots of annoying
10422  *	little details about getting protection and inheritance
10423  *	right.  Should add protection, inheritance, and sharing checks
10424  *	to the above pass and make sure that no wiring is involved.
10425  *
10426  *	Callers of this function must call vm_map_copy_require on
10427  *	previously created vm_map_copy_t or pass a newly created
10428  *	one to ensure that it hasn't been forged.
10429  */
10430 
/*
 * Diagnostic counters for vm_map_copy_overwrite_aligned(): each counts
 * how often the optimized (virtual) copy path was abandoned in favor of
 * the slow (physical) copy path, broken down by reason -- source backed
 * by a non-internal object, source with a non-symmetric copy strategy,
 * or a small copy out of a large source object.
 */
int vm_map_copy_overwrite_aligned_src_not_internal = 0;
int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
int vm_map_copy_overwrite_aligned_src_large = 0;
10434 
10435 static kern_return_t
vm_map_copy_overwrite_aligned(vm_map_t dst_map,vm_map_entry_t tmp_entry,vm_map_copy_t copy,vm_map_offset_t start,__unused pmap_t pmap)10436 vm_map_copy_overwrite_aligned(
10437 	vm_map_t        dst_map,
10438 	vm_map_entry_t  tmp_entry,
10439 	vm_map_copy_t   copy,
10440 	vm_map_offset_t start,
10441 	__unused pmap_t pmap)
10442 {
10443 	vm_object_t     object;
10444 	vm_map_entry_t  copy_entry;
10445 	vm_map_size_t   copy_size;
10446 	vm_map_size_t   size;
10447 	vm_map_entry_t  entry;
10448 
10449 	while ((copy_entry = vm_map_copy_first_entry(copy))
10450 	    != vm_map_copy_to_entry(copy)) {
10451 		copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10452 
10453 		entry = tmp_entry;
10454 		if (entry->is_sub_map) {
10455 			/* unnested when clipped earlier */
10456 			assert(!entry->use_pmap);
10457 		}
10458 		if (entry == vm_map_to_entry(dst_map)) {
10459 			vm_map_unlock(dst_map);
10460 			return KERN_INVALID_ADDRESS;
10461 		}
10462 		size = (entry->vme_end - entry->vme_start);
10463 		/*
10464 		 *	Make sure that no holes popped up in the
10465 		 *	address map, and that the protection is
10466 		 *	still valid, in case the map was unlocked
10467 		 *	earlier.
10468 		 */
10469 
10470 		if ((entry->vme_start != start) || ((entry->is_sub_map)
10471 		    && !entry->needs_copy)) {
10472 			vm_map_unlock(dst_map);
10473 			return KERN_INVALID_ADDRESS;
10474 		}
10475 		assert(entry != vm_map_to_entry(dst_map));
10476 
10477 		/*
10478 		 *	Check protection again
10479 		 */
10480 
10481 		if (!(entry->protection & VM_PROT_WRITE)) {
10482 			vm_map_unlock(dst_map);
10483 			return KERN_PROTECTION_FAILURE;
10484 		}
10485 
10486 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10487 			vm_map_unlock(dst_map);
10488 			return KERN_PROTECTION_FAILURE;
10489 		}
10490 
10491 		/*
10492 		 *	Adjust to source size first
10493 		 */
10494 
10495 		if (copy_size < size) {
10496 			if (entry->map_aligned &&
10497 			    !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10498 			    VM_MAP_PAGE_MASK(dst_map))) {
10499 				/* no longer map-aligned */
10500 				entry->map_aligned = FALSE;
10501 			}
10502 			vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10503 			size = copy_size;
10504 		}
10505 
10506 		/*
10507 		 *	Adjust to destination size
10508 		 */
10509 
10510 		if (size < copy_size) {
10511 			vm_map_copy_clip_end(copy, copy_entry,
10512 			    copy_entry->vme_start + size);
10513 			copy_size = size;
10514 		}
10515 
10516 		assert((entry->vme_end - entry->vme_start) == size);
10517 		assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10518 		assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10519 
10520 		/*
10521 		 *	If the destination contains temporary unshared memory,
10522 		 *	we can perform the copy by throwing it away and
10523 		 *	installing the source data.
10524 		 */
10525 
10526 		object = VME_OBJECT(entry);
10527 		if ((!entry->is_shared &&
10528 		    ((object == VM_OBJECT_NULL) ||
10529 		    (object->internal && !object->true_share))) ||
10530 		    entry->needs_copy) {
10531 			vm_object_t     old_object = VME_OBJECT(entry);
10532 			vm_object_offset_t      old_offset = VME_OFFSET(entry);
10533 			vm_object_offset_t      offset;
10534 
10535 			/*
10536 			 * Ensure that the source and destination aren't
10537 			 * identical
10538 			 */
10539 			if (old_object == VME_OBJECT(copy_entry) &&
10540 			    old_offset == VME_OFFSET(copy_entry)) {
10541 				vm_map_copy_entry_unlink(copy, copy_entry);
10542 				vm_map_copy_entry_dispose(copy_entry);
10543 
10544 				if (old_object != VM_OBJECT_NULL) {
10545 					vm_object_deallocate(old_object);
10546 				}
10547 
10548 				start = tmp_entry->vme_end;
10549 				tmp_entry = tmp_entry->vme_next;
10550 				continue;
10551 			}
10552 
10553 #if XNU_TARGET_OS_OSX
10554 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10555 #define __TRADEOFF1_COPY_SIZE (128 * 1024)      /* 128 KB */
10556 			if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10557 			    VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10558 			    copy_size <= __TRADEOFF1_COPY_SIZE) {
10559 				/*
10560 				 * Virtual vs. Physical copy tradeoff #1.
10561 				 *
10562 				 * Copying only a few pages out of a large
10563 				 * object:  do a physical copy instead of
10564 				 * a virtual copy, to avoid possibly keeping
10565 				 * the entire large object alive because of
10566 				 * those few copy-on-write pages.
10567 				 */
10568 				vm_map_copy_overwrite_aligned_src_large++;
10569 				goto slow_copy;
10570 			}
10571 #endif /* XNU_TARGET_OS_OSX */
10572 
10573 			if ((dst_map->pmap != kernel_pmap) &&
10574 			    (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10575 			    (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10576 				vm_object_t new_object, new_shadow;
10577 
10578 				/*
10579 				 * We're about to map something over a mapping
10580 				 * established by malloc()...
10581 				 */
10582 				new_object = VME_OBJECT(copy_entry);
10583 				if (new_object != VM_OBJECT_NULL) {
10584 					vm_object_lock_shared(new_object);
10585 				}
10586 				while (new_object != VM_OBJECT_NULL &&
10587 #if XNU_TARGET_OS_OSX
10588 				    !new_object->true_share &&
10589 				    new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10590 #endif /* XNU_TARGET_OS_OSX */
10591 				    new_object->internal) {
10592 					new_shadow = new_object->shadow;
10593 					if (new_shadow == VM_OBJECT_NULL) {
10594 						break;
10595 					}
10596 					vm_object_lock_shared(new_shadow);
10597 					vm_object_unlock(new_object);
10598 					new_object = new_shadow;
10599 				}
10600 				if (new_object != VM_OBJECT_NULL) {
10601 					if (!new_object->internal) {
10602 						/*
10603 						 * The new mapping is backed
10604 						 * by an external object.  We
10605 						 * don't want malloc'ed memory
10606 						 * to be replaced with such a
10607 						 * non-anonymous mapping, so
10608 						 * let's go off the optimized
10609 						 * path...
10610 						 */
10611 						vm_map_copy_overwrite_aligned_src_not_internal++;
10612 						vm_object_unlock(new_object);
10613 						goto slow_copy;
10614 					}
10615 #if XNU_TARGET_OS_OSX
10616 					if (new_object->true_share ||
10617 					    new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10618 						/*
10619 						 * Same if there's a "true_share"
10620 						 * object in the shadow chain, or
10621 						 * an object with a non-default
10622 						 * (SYMMETRIC) copy strategy.
10623 						 */
10624 						vm_map_copy_overwrite_aligned_src_not_symmetric++;
10625 						vm_object_unlock(new_object);
10626 						goto slow_copy;
10627 					}
10628 #endif /* XNU_TARGET_OS_OSX */
10629 					vm_object_unlock(new_object);
10630 				}
10631 				/*
10632 				 * The new mapping is still backed by
10633 				 * anonymous (internal) memory, so it's
10634 				 * OK to substitute it for the original
10635 				 * malloc() mapping.
10636 				 */
10637 			}
10638 
10639 			if (old_object != VM_OBJECT_NULL) {
10640 				assert(!entry->vme_permanent);
10641 				if (entry->is_sub_map) {
10642 					if (entry->use_pmap) {
10643 #ifndef NO_NESTED_PMAP
10644 						pmap_unnest(dst_map->pmap,
10645 						    (addr64_t)entry->vme_start,
10646 						    entry->vme_end - entry->vme_start);
10647 #endif  /* NO_NESTED_PMAP */
10648 						if (dst_map->mapped_in_other_pmaps) {
10649 							/* clean up parent */
10650 							/* map/maps */
10651 							vm_map_submap_pmap_clean(
10652 								dst_map, entry->vme_start,
10653 								entry->vme_end,
10654 								VME_SUBMAP(entry),
10655 								VME_OFFSET(entry));
10656 						}
10657 					} else {
10658 						vm_map_submap_pmap_clean(
10659 							dst_map, entry->vme_start,
10660 							entry->vme_end,
10661 							VME_SUBMAP(entry),
10662 							VME_OFFSET(entry));
10663 					}
10664 					vm_map_deallocate(VME_SUBMAP(entry));
10665 				} else {
10666 					if (dst_map->mapped_in_other_pmaps) {
10667 						vm_object_pmap_protect_options(
10668 							VME_OBJECT(entry),
10669 							VME_OFFSET(entry),
10670 							entry->vme_end
10671 							- entry->vme_start,
10672 							PMAP_NULL,
10673 							PAGE_SIZE,
10674 							entry->vme_start,
10675 							VM_PROT_NONE,
10676 							PMAP_OPTIONS_REMOVE);
10677 					} else {
10678 						pmap_remove_options(
10679 							dst_map->pmap,
10680 							(addr64_t)(entry->vme_start),
10681 							(addr64_t)(entry->vme_end),
10682 							PMAP_OPTIONS_REMOVE);
10683 					}
10684 					vm_object_deallocate(old_object);
10685 				}
10686 			}
10687 
10688 			if (entry->iokit_acct) {
10689 				/* keep using iokit accounting */
10690 				entry->use_pmap = FALSE;
10691 			} else {
10692 				/* use pmap accounting */
10693 				entry->use_pmap = TRUE;
10694 			}
10695 			assert(!entry->vme_permanent);
10696 			VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
10697 			object = VME_OBJECT(entry);
10698 			entry->needs_copy = copy_entry->needs_copy;
10699 			entry->wired_count = 0;
10700 			entry->user_wired_count = 0;
10701 			offset = VME_OFFSET(copy_entry);
10702 			VME_OFFSET_SET(entry, offset);
10703 
10704 			vm_map_copy_entry_unlink(copy, copy_entry);
10705 			vm_map_copy_entry_dispose(copy_entry);
10706 
10707 			/*
10708 			 * we could try to push pages into the pmap at this point, BUT
10709 			 * this optimization only saved on average 2 us per page if ALL
10710 			 * the pages in the source were currently mapped
10711 			 * and ALL the pages in the dest were touched, if there were fewer
10712 			 * than 2/3 of the pages touched, this optimization actually cost more cycles
10713 			 * it also puts a lot of pressure on the pmap layer w/r to mapping structures
10714 			 */
10715 
10716 			/*
10717 			 *	Set up for the next iteration.  The map
10718 			 *	has not been unlocked, so the next
10719 			 *	address should be at the end of this
10720 			 *	entry, and the next map entry should be
10721 			 *	the one following it.
10722 			 */
10723 
10724 			start = tmp_entry->vme_end;
10725 			tmp_entry = tmp_entry->vme_next;
10726 		} else {
10727 			vm_map_version_t        version;
10728 			vm_object_t             dst_object;
10729 			vm_object_offset_t      dst_offset;
10730 			kern_return_t           r;
10731 
10732 slow_copy:
10733 			if (entry->needs_copy) {
10734 				VME_OBJECT_SHADOW(entry,
10735 				    (entry->vme_end -
10736 				    entry->vme_start),
10737 				    vm_map_always_shadow(dst_map));
10738 				entry->needs_copy = FALSE;
10739 			}
10740 
10741 			dst_object = VME_OBJECT(entry);
10742 			dst_offset = VME_OFFSET(entry);
10743 
10744 			/*
10745 			 *	Take an object reference, and record
10746 			 *	the map version information so that the
10747 			 *	map can be safely unlocked.
10748 			 */
10749 
10750 			if (dst_object == VM_OBJECT_NULL) {
10751 				/*
10752 				 * We would usually have just taken the
10753 				 * optimized path above if the destination
10754 				 * object has not been allocated yet.  But we
10755 				 * now disable that optimization if the copy
10756 				 * entry's object is not backed by anonymous
10757 				 * memory to avoid replacing malloc'ed
10758 				 * (i.e. re-usable) anonymous memory with a
10759 				 * not-so-anonymous mapping.
10760 				 * So we have to handle this case here and
10761 				 * allocate a new VM object for this map entry.
10762 				 */
10763 				dst_object = vm_object_allocate(
10764 					entry->vme_end - entry->vme_start);
10765 				dst_offset = 0;
10766 				VME_OBJECT_SET(entry, dst_object, false, 0);
10767 				VME_OFFSET_SET(entry, dst_offset);
10768 				assert(entry->use_pmap);
10769 			}
10770 
10771 			vm_object_reference(dst_object);
10772 
10773 			/* account for unlock bumping up timestamp */
10774 			version.main_timestamp = dst_map->timestamp + 1;
10775 
10776 			vm_map_unlock(dst_map);
10777 
10778 			/*
10779 			 *	Copy as much as possible in one pass
10780 			 */
10781 
10782 			copy_size = size;
10783 			r = vm_fault_copy(
10784 				VME_OBJECT(copy_entry),
10785 				VME_OFFSET(copy_entry),
10786 				&copy_size,
10787 				dst_object,
10788 				dst_offset,
10789 				dst_map,
10790 				&version,
10791 				THREAD_UNINT );
10792 
10793 			/*
10794 			 *	Release the object reference
10795 			 */
10796 
10797 			vm_object_deallocate(dst_object);
10798 
10799 			/*
10800 			 *	If a hard error occurred, return it now
10801 			 */
10802 
10803 			if (r != KERN_SUCCESS) {
10804 				return r;
10805 			}
10806 
10807 			if (copy_size != 0) {
10808 				/*
10809 				 *	Dispose of the copied region
10810 				 */
10811 
10812 				vm_map_copy_clip_end(copy, copy_entry,
10813 				    copy_entry->vme_start + copy_size);
10814 				vm_map_copy_entry_unlink(copy, copy_entry);
10815 				vm_object_deallocate(VME_OBJECT(copy_entry));
10816 				vm_map_copy_entry_dispose(copy_entry);
10817 			}
10818 
10819 			/*
10820 			 *	Pick up in the destination map where we left off.
10821 			 *
10822 			 *	Use the version information to avoid a lookup
10823 			 *	in the normal case.
10824 			 */
10825 
10826 			start += copy_size;
10827 			vm_map_lock(dst_map);
10828 			if (version.main_timestamp == dst_map->timestamp &&
10829 			    copy_size != 0) {
10830 				/* We can safely use saved tmp_entry value */
10831 
10832 				if (tmp_entry->map_aligned &&
10833 				    !VM_MAP_PAGE_ALIGNED(
10834 					    start,
10835 					    VM_MAP_PAGE_MASK(dst_map))) {
10836 					/* no longer map-aligned */
10837 					tmp_entry->map_aligned = FALSE;
10838 				}
10839 				vm_map_clip_end(dst_map, tmp_entry, start);
10840 				tmp_entry = tmp_entry->vme_next;
10841 			} else {
10842 				/* Must do lookup of tmp_entry */
10843 
10844 				if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
10845 					vm_map_unlock(dst_map);
10846 					return KERN_INVALID_ADDRESS;
10847 				}
10848 				if (tmp_entry->map_aligned &&
10849 				    !VM_MAP_PAGE_ALIGNED(
10850 					    start,
10851 					    VM_MAP_PAGE_MASK(dst_map))) {
10852 					/* no longer map-aligned */
10853 					tmp_entry->map_aligned = FALSE;
10854 				}
10855 				vm_map_clip_start(dst_map, tmp_entry, start);
10856 			}
10857 		}
10858 	}/* while */
10859 
10860 	return KERN_SUCCESS;
10861 }/* vm_map_copy_overwrite_aligned */
10862 
10863 /*
10864  *	Routine: vm_map_copyin_kernel_buffer [internal use only]
10865  *
10866  *	Description:
10867  *		Copy in data to a kernel buffer from space in the
10868  *		source map. The original space may be optionally
10869  *		deallocated.
10870  *
10871  *		If successful, returns a new copy object.
10872  */
10873 static kern_return_t
vm_map_copyin_kernel_buffer(vm_map_t src_map,vm_map_offset_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)10874 vm_map_copyin_kernel_buffer(
10875 	vm_map_t        src_map,
10876 	vm_map_offset_t src_addr,
10877 	vm_map_size_t   len,
10878 	boolean_t       src_destroy,
10879 	vm_map_copy_t   *copy_result)
10880 {
10881 	kern_return_t kr;
10882 	vm_map_copy_t copy;
10883 	void *kdata;
10884 
10885 	if (len > msg_ool_size_small) {
10886 		return KERN_INVALID_ARGUMENT;
10887 	}
10888 
10889 	kdata = kalloc_data(len, Z_WAITOK);
10890 	if (kdata == NULL) {
10891 		return KERN_RESOURCE_SHORTAGE;
10892 	}
10893 	kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
10894 	if (kr != KERN_SUCCESS) {
10895 		kfree_data(kdata, len);
10896 		return kr;
10897 	}
10898 
10899 	copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
10900 	copy->cpy_kdata = kdata;
10901 	copy->size = len;
10902 	copy->offset = 0;
10903 
10904 	if (src_destroy) {
10905 		vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
10906 
10907 		if (src_map == kernel_map) {
10908 			flags |= VM_MAP_REMOVE_KUNWIRE;
10909 		}
10910 
10911 		(void)vm_map_remove_guard(src_map,
10912 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
10913 		    vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
10914 		    flags, KMEM_GUARD_NONE);
10915 	}
10916 
10917 	*copy_result = copy;
10918 	return KERN_SUCCESS;
10919 }
10920 
10921 /*
10922  *	Routine: vm_map_copyout_kernel_buffer	[internal use only]
10923  *
10924  *	Description:
10925  *		Copy out data from a kernel buffer into space in the
 *		destination map. The space may be optionally dynamically
10927  *		allocated.
10928  *
10929  *		If successful, consumes the copy object.
10930  *		Otherwise, the caller is responsible for it.
10931  *
10932  *		Callers of this function must call vm_map_copy_require on
10933  *		previously created vm_map_copy_t or pass a newly created
10934  *		one to ensure that it hasn't been forged.
10935  */
/* Diagnostic counter: bumped each time copyout() into a foreign map fails. */
static int vm_map_copyout_kernel_buffer_failures = 0;
static kern_return_t
vm_map_copyout_kernel_buffer(
	vm_map_t                map,
	vm_map_address_t        *addr,  /* IN/OUT */
	vm_map_copy_t           copy,
	vm_map_size_t           copy_size,
	boolean_t               overwrite,
	boolean_t               consume_on_success)
{
	kern_return_t kr = KERN_SUCCESS;
	/* current thread; used below to tell whether "map" is the caller's own */
	thread_t thread = current_thread();

	assert(copy->size == copy_size);

	/*
	 * check for corrupted vm_map_copy structure
	 * (kernel-buffer copies are always small and have a zero offset)
	 */
	if (copy_size > msg_ool_size_small || copy->offset) {
		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
		    (long long)copy->size, (long long)copy->offset);
	}

	if (!overwrite) {
		/*
		 * Allocate space in the target map for the data
		 */
		vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();

		if (map == kernel_map) {
			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
		}

		*addr = 0;
		kr = vm_map_enter(map,
		    addr,
		    vm_map_round_page(copy_size,
		    VM_MAP_PAGE_MASK(map)),
		    (vm_map_offset_t) 0,
		    vmk_flags,
		    VM_OBJECT_NULL,
		    (vm_object_offset_t) 0,
		    FALSE,
		    VM_PROT_DEFAULT,
		    VM_PROT_ALL,
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
#if KASAN
		if (map->pmap == kernel_pmap) {
			/* tell KASAN about the freshly allocated kernel range */
			kasan_notify_address(*addr, copy->size);
		}
#endif
	}

	/*
	 * Copyout the data from the kernel buffer to the target map.
	 */
	if (thread->map == map) {
		/*
		 * If the target map is the current map, just do
		 * the copy.
		 */
		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			kr = KERN_INVALID_ADDRESS;
		}
	} else {
		vm_map_t oldmap;

		/*
		 * If the target map is another map, assume the
		 * target's address space identity for the duration
		 * of the copy.
		 */
		vm_map_reference(map);
		oldmap = vm_map_switch(map);

		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			vm_map_copyout_kernel_buffer_failures++;
			kr = KERN_INVALID_ADDRESS;
		}

		/* restore the caller's address space and drop our map ref */
		(void) vm_map_switch(oldmap);
		vm_map_deallocate(map);
	}

	if (kr != KERN_SUCCESS) {
		/* the copy failed, clean up */
		if (!overwrite) {
			/*
			 * Deallocate the space we allocated in the target map.
			 */
			(void) vm_map_remove(map,
			    vm_map_trunc_page(*addr,
			    VM_MAP_PAGE_MASK(map)),
			    vm_map_round_page((*addr +
			    vm_map_round_page(copy_size,
			    VM_MAP_PAGE_MASK(map))),
			    VM_MAP_PAGE_MASK(map)));
			*addr = 0;
		}
	} else {
		/* copy was successful, discard the copy structure */
		if (consume_on_success) {
			kfree_data(copy->cpy_kdata, copy_size);
			zfree_id(ZONE_ID_VM_MAP_COPY, copy);
		}
	}

	return kr;
}
11050 
11051 /*
11052  *	Routine:	vm_map_copy_insert      [internal use only]
11053  *
11054  *	Description:
11055  *		Link a copy chain ("copy") into a map at the
11056  *		specified location (after "where").
11057  *
11058  *		Callers of this function must call vm_map_copy_require on
11059  *		previously created vm_map_copy_t or pass a newly created
11060  *		one to ensure that it hasn't been forged.
11061  *	Side effects:
11062  *		The copy chain is destroyed.
11063  */
11064 static void
vm_map_copy_insert(vm_map_t map,vm_map_entry_t after_where,vm_map_copy_t copy)11065 vm_map_copy_insert(
11066 	vm_map_t        map,
11067 	vm_map_entry_t  after_where,
11068 	vm_map_copy_t   copy)
11069 {
11070 	vm_map_entry_t  entry;
11071 
11072 	while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11073 		entry = vm_map_copy_first_entry(copy);
11074 		vm_map_copy_entry_unlink(copy, entry);
11075 		vm_map_store_entry_link(map, after_where, entry,
11076 		    VM_MAP_KERNEL_FLAGS_NONE);
11077 		after_where = entry;
11078 	}
11079 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11080 }
11081 
11082 /*
11083  * Callers of this function must call vm_map_copy_require on
11084  * previously created vm_map_copy_t or pass a newly created
11085  * one to ensure that it hasn't been forged.
11086  */
11087 void
vm_map_copy_remap(vm_map_t map,vm_map_entry_t where,vm_map_copy_t copy,vm_map_offset_t adjustment,vm_prot_t cur_prot,vm_prot_t max_prot,vm_inherit_t inheritance)11088 vm_map_copy_remap(
11089 	vm_map_t        map,
11090 	vm_map_entry_t  where,
11091 	vm_map_copy_t   copy,
11092 	vm_map_offset_t adjustment,
11093 	vm_prot_t       cur_prot,
11094 	vm_prot_t       max_prot,
11095 	vm_inherit_t    inheritance)
11096 {
11097 	vm_map_entry_t  copy_entry, new_entry;
11098 
11099 	for (copy_entry = vm_map_copy_first_entry(copy);
11100 	    copy_entry != vm_map_copy_to_entry(copy);
11101 	    copy_entry = copy_entry->vme_next) {
11102 		/* get a new VM map entry for the map */
11103 		new_entry = vm_map_entry_create(map);
11104 		/* copy the "copy entry" to the new entry */
11105 		vm_map_entry_copy(map, new_entry, copy_entry);
11106 		/* adjust "start" and "end" */
11107 		new_entry->vme_start += adjustment;
11108 		new_entry->vme_end += adjustment;
11109 		/* clear some attributes */
11110 		new_entry->inheritance = inheritance;
11111 		new_entry->protection = cur_prot;
11112 		new_entry->max_protection = max_prot;
11113 		new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11114 		/* take an extra reference on the entry's "object" */
11115 		if (new_entry->is_sub_map) {
11116 			assert(!new_entry->use_pmap); /* not nested */
11117 			vm_map_reference(VME_SUBMAP(new_entry));
11118 		} else {
11119 			vm_object_reference(VME_OBJECT(new_entry));
11120 		}
11121 		/* insert the new entry in the map */
11122 		vm_map_store_entry_link(map, where, new_entry,
11123 		    VM_MAP_KERNEL_FLAGS_NONE);
11124 		/* continue inserting the "copy entries" after the new entry */
11125 		where = new_entry;
11126 	}
11127 }
11128 
11129 
11130 /*
11131  * Returns true if *size matches (or is in the range of) copy->size.
11132  * Upon returning true, the *size field is updated with the actual size of the
11133  * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11134  */
11135 boolean_t
vm_map_copy_validate_size(vm_map_t dst_map,vm_map_copy_t copy,vm_map_size_t * size)11136 vm_map_copy_validate_size(
11137 	vm_map_t                dst_map,
11138 	vm_map_copy_t           copy,
11139 	vm_map_size_t           *size)
11140 {
11141 	if (copy == VM_MAP_COPY_NULL) {
11142 		return FALSE;
11143 	}
11144 
11145 	/*
11146 	 * Assert that the vm_map_copy is coming from the right
11147 	 * zone and hasn't been forged
11148 	 */
11149 	vm_map_copy_require(copy);
11150 
11151 	vm_map_size_t copy_sz = copy->size;
11152 	vm_map_size_t sz = *size;
11153 	switch (copy->type) {
11154 	case VM_MAP_COPY_KERNEL_BUFFER:
11155 		if (sz == copy_sz) {
11156 			return TRUE;
11157 		}
11158 		break;
11159 	case VM_MAP_COPY_ENTRY_LIST:
11160 		/*
11161 		 * potential page-size rounding prevents us from exactly
11162 		 * validating this flavor of vm_map_copy, but we can at least
11163 		 * assert that it's within a range.
11164 		 */
11165 		if (copy_sz >= sz &&
11166 		    copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11167 			*size = copy_sz;
11168 			return TRUE;
11169 		}
11170 		break;
11171 	default:
11172 		break;
11173 	}
11174 	return FALSE;
11175 }
11176 
11177 /*
11178  *	Routine:	vm_map_copyout_size
11179  *
11180  *	Description:
11181  *		Copy out a copy chain ("copy") into newly-allocated
11182  *		space in the destination map. Uses a prevalidated
11183  *		size for the copy object (vm_map_copy_validate_size).
11184  *
11185  *		If successful, consumes the copy object.
11186  *		Otherwise, the caller is responsible for it.
11187  */
11188 kern_return_t
vm_map_copyout_size(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_t copy_size)11189 vm_map_copyout_size(
11190 	vm_map_t                dst_map,
11191 	vm_map_address_t        *dst_addr,      /* OUT */
11192 	vm_map_copy_t           copy,
11193 	vm_map_size_t           copy_size)
11194 {
11195 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11196 	           TRUE,                     /* consume_on_success */
11197 	           VM_PROT_DEFAULT,
11198 	           VM_PROT_ALL,
11199 	           VM_INHERIT_DEFAULT);
11200 }
11201 
11202 /*
11203  *	Routine:	vm_map_copyout
11204  *
11205  *	Description:
11206  *		Copy out a copy chain ("copy") into newly-allocated
11207  *		space in the destination map.
11208  *
11209  *		If successful, consumes the copy object.
11210  *		Otherwise, the caller is responsible for it.
11211  */
11212 kern_return_t
vm_map_copyout(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy)11213 vm_map_copyout(
11214 	vm_map_t                dst_map,
11215 	vm_map_address_t        *dst_addr,      /* OUT */
11216 	vm_map_copy_t           copy)
11217 {
11218 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11219 	           TRUE,                     /* consume_on_success */
11220 	           VM_PROT_DEFAULT,
11221 	           VM_PROT_ALL,
11222 	           VM_INHERIT_DEFAULT);
11223 }
11224 
/*
 *	Routine:	vm_map_copyout_internal
 *
 *	Copy out a copy chain ("copy") into newly-allocated space in
 *	"dst_map", returning the chosen address in *dst_addr.
 *	"copy_size" must match copy->size (checked below).
 *	If "consume_on_success" is TRUE the copy object is consumed on
 *	success (its entries are linked directly into dst_map); otherwise
 *	its entries are duplicated into dst_map and the caller keeps "copy".
 */
kern_return_t
vm_map_copyout_internal(
	vm_map_t                dst_map,
	vm_map_address_t        *dst_addr,      /* OUT */
	vm_map_copy_t           copy,
	vm_map_size_t           copy_size,
	boolean_t               consume_on_success,
	vm_prot_t               cur_protection,
	vm_prot_t               max_protection,
	vm_inherit_t            inheritance)
{
	vm_map_size_t           size;
	vm_map_size_t           adjustment;
	vm_map_offset_t         start;
	vm_object_offset_t      vm_copy_start;
	vm_map_entry_t          last;
	vm_map_entry_t          entry;
	vm_map_copy_t           original_copy;
	kern_return_t           kr;
	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();

	/*
	 *	Check for null copy object.
	 */

	if (copy == VM_MAP_COPY_NULL) {
		*dst_addr = 0;
		return KERN_SUCCESS;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	if (copy->size != copy_size) {
		*dst_addr = 0;
		return KERN_FAILURE;
	}

	/*
	 *	Check for special kernel buffer allocated
	 *	by new_ipc_kmsg_copyin.
	 */

	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
		return vm_map_copyout_kernel_buffer(dst_map, dst_addr,
		           copy, copy_size, FALSE,
		           consume_on_success);
	}

	/*
	 * Remember the caller's copy object: the page-shift adjustment
	 * below may substitute a new one, and we must discard the right
	 * object at the end.
	 */
	original_copy = copy;
	if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
		/*
		 * The copy was built with a different page size than the
		 * destination map uses; rebuild it to match.
		 */
		vm_map_copy_t target_copy;
		vm_map_offset_t overmap_start, overmap_end, trimmed_start;

		target_copy = VM_MAP_COPY_NULL;
		DEBUG4K_ADJUST("adjusting...\n");
		kr = vm_map_copy_adjust_to_target(
			copy,
			0, /* offset */
			copy->size, /* size */
			dst_map,
			TRUE, /* copy */
			&target_copy,
			&overmap_start,
			&overmap_end,
			&trimmed_start);
		if (kr != KERN_SUCCESS) {
			DEBUG4K_COPY("adjust failed 0x%x\n", kr);
			return kr;
		}
		DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
		if (target_copy != copy) {
			copy = target_copy;
		}
		copy_size = copy->size;
	}

	/*
	 *	Find space for the data
	 */

	vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
	    VM_MAP_COPY_PAGE_MASK(copy));
	size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
	    VM_MAP_COPY_PAGE_MASK(copy))
	    - vm_copy_start;

	vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map);

	vm_map_lock(dst_map);
	kr = vm_map_locate_space(dst_map, size, 0, vmk_flags,
	    &start, &last);
	if (kr != KERN_SUCCESS) {
		vm_map_unlock(dst_map);
		return kr;
	}

	/* delta from the copy's page-truncated offset to the chosen address */
	adjustment = start - vm_copy_start;
	if (!consume_on_success) {
		/*
		 * We're not allowed to consume "copy", so we'll have to
		 * copy its map entries into the destination map below.
		 * No need to re-allocate map entries from the correct
		 * (pageable or not) zone, since we'll get new map entries
		 * during the transfer.
		 * We'll also adjust the map entries's "start" and "end"
		 * during the transfer, to keep "copy"'s entries consistent
		 * with its "offset".
		 */
		goto after_adjustments;
	}

	/*
	 *	Since we're going to just drop the map
	 *	entries from the copy into the destination
	 *	map, they must come from the same pool.
	 */

	if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
		/*
		 * Mismatches occur when dealing with the default
		 * pager.
		 */
		vm_map_entry_t  next, new;

		/*
		 * Find the zone that the copies were allocated from
		 */

		entry = vm_map_copy_first_entry(copy);

		/*
		 * Reinitialize the copy so that vm_map_copy_entry_link
		 * will work.
		 */
		vm_map_store_copy_reset(copy, entry);
		copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;

		/*
		 * Copy each entry.
		 */
		while (entry != vm_map_copy_to_entry(copy)) {
			new = vm_map_copy_entry_create(copy);
			vm_map_entry_copy_full(new, entry);
			new->vme_no_copy_on_read = FALSE;
			assert(!new->iokit_acct);
			if (new->is_sub_map) {
				/* clr address space specifics */
				new->use_pmap = FALSE;
			}
			vm_map_copy_entry_link(copy,
			    vm_map_copy_last_entry(copy),
			    new);
			next = entry->vme_next;
			/* the old entry came from the wrong zone; dispose it */
			vm_map_entry_dispose(entry);
			entry = next;
		}
	}

	/*
	 *	Adjust the addresses in the copy chain, and
	 *	reset the region attributes.
	 */

	for (entry = vm_map_copy_first_entry(copy);
	    entry != vm_map_copy_to_entry(copy);
	    entry = entry->vme_next) {
		if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
			/*
			 * We're injecting this copy entry into a map that
			 * has the standard page alignment, so clear
			 * "map_aligned" (which might have been inherited
			 * from the original map entry).
			 */
			entry->map_aligned = FALSE;
		}

		entry->vme_start += adjustment;
		entry->vme_end += adjustment;

		if (entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
			    VM_MAP_PAGE_MASK(dst_map)));
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
			    VM_MAP_PAGE_MASK(dst_map)));
		}

		entry->inheritance = VM_INHERIT_DEFAULT;
		entry->protection = VM_PROT_DEFAULT;
		entry->max_protection = VM_PROT_ALL;
		entry->behavior = VM_BEHAVIOR_DEFAULT;

		/*
		 * If the entry is now wired,
		 * map the pages into the destination map.
		 */
		if (entry->wired_count != 0) {
			vm_map_offset_t va;
			vm_object_offset_t       offset;
			vm_object_t object;
			vm_prot_t prot;
			int     type_of_fault;

			/* TODO4K would need to use actual page size */
			assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);

			object = VME_OBJECT(entry);
			offset = VME_OFFSET(entry);
			va = entry->vme_start;

			pmap_pageable(dst_map->pmap,
			    entry->vme_start,
			    entry->vme_end,
			    TRUE);

			/* fault in each page of the wired range */
			while (va < entry->vme_end) {
				vm_page_t       m;
				struct vm_object_fault_info fault_info = {};

				/*
				 * Look up the page in the object.
				 * Assert that the page will be found in the
				 * top object:
				 * either
				 *	the object was newly created by
				 *	vm_object_copy_slowly, and has
				 *	copies of all of the pages from
				 *	the source object
				 * or
				 *	the object was moved from the old
				 *	map entry; because the old map
				 *	entry was wired, all of the pages
				 *	were in the top-level object.
				 *	(XXX not true if we wire pages for
				 *	 reading)
				 */
				vm_object_lock(object);

				m = vm_page_lookup(object, offset);
				if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
				    m->vmp_absent) {
					panic("vm_map_copyout: wiring %p", m);
				}

				prot = entry->protection;

				if (override_nx(dst_map, VME_ALIAS(entry)) &&
				    prot) {
					prot |= VM_PROT_EXECUTE;
				}

				type_of_fault = DBG_CACHE_HIT_FAULT;

				fault_info.user_tag = VME_ALIAS(entry);
				fault_info.pmap_options = 0;
				if (entry->iokit_acct ||
				    (!entry->is_sub_map && !entry->use_pmap)) {
					fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
				}
				if (entry->vme_xnu_user_debug &&
				    !VM_PAGE_OBJECT(m)->code_signed) {
					/*
					 * Modified code-signed executable
					 * region: this page does not belong
					 * to a code-signed VM object, so it
					 * must have been copied and should
					 * therefore be typed XNU_USER_DEBUG
					 * rather than XNU_USER_EXEC.
					 */
					fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
				}

				vm_fault_enter(m,
				    dst_map->pmap,
				    va,
				    PAGE_SIZE, 0,
				    prot,
				    prot,
				    VM_PAGE_WIRED(m),
				    FALSE,            /* change_wiring */
				    VM_KERN_MEMORY_NONE,            /* tag - not wiring */
				    &fault_info,
				    NULL,             /* need_retry */
				    &type_of_fault);

				vm_object_unlock(object);

				offset += PAGE_SIZE_64;
				va += PAGE_SIZE;
			}
		}
	}

after_adjustments:

	/*
	 *	Correct the page alignment for the result
	 */

	*dst_addr = start + (copy->offset - vm_copy_start);

#if KASAN
	kasan_notify_address(*dst_addr, size);
#endif

	/*
	 *	Update the hints and the map size
	 */

	if (consume_on_success) {
		SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
	} else {
		SAVE_HINT_MAP_WRITE(dst_map, last);
	}

	dst_map->size += size;

	/*
	 *	Link in the copy
	 */

	if (consume_on_success) {
		vm_map_copy_insert(dst_map, last, copy);
		if (copy != original_copy) {
			/* the adjusted copy was linked in; drop the original */
			vm_map_copy_discard(original_copy);
			original_copy = VM_MAP_COPY_NULL;
		}
	} else {
		vm_map_copy_remap(dst_map, last, copy, adjustment,
		    cur_protection, max_protection,
		    inheritance);
		if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
			/* discard the adjusted copy; caller keeps the original */
			vm_map_copy_discard(copy);
			copy = original_copy;
		}
	}


	vm_map_unlock(dst_map);

	/*
	 * XXX	If wiring_required, call vm_map_pageable
	 */

	return KERN_SUCCESS;
}
11574 
11575 /*
11576  *	Routine:	vm_map_copyin
11577  *
11578  *	Description:
11579  *		see vm_map_copyin_common.  Exported via Unsupported.exports.
11580  *
11581  */
11582 
11583 #undef vm_map_copyin
11584 
11585 kern_return_t
vm_map_copyin(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)11586 vm_map_copyin(
11587 	vm_map_t                        src_map,
11588 	vm_map_address_t        src_addr,
11589 	vm_map_size_t           len,
11590 	boolean_t                       src_destroy,
11591 	vm_map_copy_t           *copy_result)   /* OUT */
11592 {
11593 	return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11594 	           FALSE, copy_result, FALSE);
11595 }
11596 
11597 /*
11598  *	Routine:	vm_map_copyin_common
11599  *
11600  *	Description:
11601  *		Copy the specified region (src_addr, len) from the
11602  *		source address space (src_map), possibly removing
11603  *		the region from the source address space (src_destroy).
11604  *
11605  *	Returns:
11606  *		A vm_map_copy_t object (copy_result), suitable for
11607  *		insertion into another address space (using vm_map_copyout),
11608  *		copying over another address space region (using
11609  *		vm_map_copy_overwrite).  If the copy is unused, it
11610  *		should be destroyed (using vm_map_copy_discard).
11611  *
11612  *	In/out conditions:
11613  *		The source map should not be locked on entry.
11614  */
11615 
/*
 * Bookkeeping record pushed onto a linked-list stack ("parent_maps" in
 * vm_map_copyin_internal) each time the copy traversal descends into a
 * submap, so the walk can later resume in the parent map where it left off.
 */
typedef struct submap_map {
	vm_map_t        parent_map;     /* map suspended while copying from its submap */
	vm_map_offset_t base_start;     /* "src_start" in parent_map at descent time */
	vm_map_offset_t base_end;       /* "src_end" in parent_map at descent time */
	vm_map_size_t   base_len;       /* length of the range covered by the submap entry */
	struct submap_map *next;        /* next (outer) stack level, NULL at base map */
} submap_map_t;
11623 
11624 kern_return_t
vm_map_copyin_common(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t src_destroy,__unused boolean_t src_volatile,vm_map_copy_t * copy_result,boolean_t use_maxprot)11625 vm_map_copyin_common(
11626 	vm_map_t        src_map,
11627 	vm_map_address_t src_addr,
11628 	vm_map_size_t   len,
11629 	boolean_t       src_destroy,
11630 	__unused boolean_t      src_volatile,
11631 	vm_map_copy_t   *copy_result,   /* OUT */
11632 	boolean_t       use_maxprot)
11633 {
11634 	int flags;
11635 
11636 	flags = 0;
11637 	if (src_destroy) {
11638 		flags |= VM_MAP_COPYIN_SRC_DESTROY;
11639 	}
11640 	if (use_maxprot) {
11641 		flags |= VM_MAP_COPYIN_USE_MAXPROT;
11642 	}
11643 	return vm_map_copyin_internal(src_map,
11644 	           src_addr,
11645 	           len,
11646 	           flags,
11647 	           copy_result);
11648 }
11649 kern_return_t
vm_map_copyin_internal(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,int flags,vm_map_copy_t * copy_result)11650 vm_map_copyin_internal(
11651 	vm_map_t        src_map,
11652 	vm_map_address_t src_addr,
11653 	vm_map_size_t   len,
11654 	int             flags,
11655 	vm_map_copy_t   *copy_result)   /* OUT */
11656 {
11657 	vm_map_entry_t  tmp_entry;      /* Result of last map lookup --
11658 	                                 * in multi-level lookup, this
11659 	                                 * entry contains the actual
11660 	                                 * vm_object/offset.
11661 	                                 */
11662 	vm_map_entry_t  new_entry = VM_MAP_ENTRY_NULL;  /* Map entry for copy */
11663 
11664 	vm_map_offset_t src_start;      /* Start of current entry --
11665 	                                 * where copy is taking place now
11666 	                                 */
11667 	vm_map_offset_t src_end;        /* End of entire region to be
11668 	                                 * copied */
11669 	vm_map_offset_t src_base;
11670 	vm_map_t        base_map = src_map;
11671 	boolean_t       map_share = FALSE;
11672 	submap_map_t    *parent_maps = NULL;
11673 
11674 	vm_map_copy_t   copy;           /* Resulting copy */
11675 	vm_map_address_t copy_addr;
11676 	vm_map_size_t   copy_size;
11677 	boolean_t       src_destroy;
11678 	boolean_t       use_maxprot;
11679 	boolean_t       preserve_purgeable;
11680 	boolean_t       entry_was_shared;
11681 	vm_map_entry_t  saved_src_entry;
11682 
11683 	if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
11684 		return KERN_INVALID_ARGUMENT;
11685 	}
11686 
11687 #if CONFIG_KERNEL_TBI
11688 	if (src_map->pmap == kernel_pmap) {
11689 		src_addr = VM_KERNEL_TBI_FILL(src_addr);
11690 	}
11691 #endif /* CONFIG_KERNEL_TBI && KASAN_TBI */
11692 
11693 	src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
11694 	use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
11695 	preserve_purgeable =
11696 	    (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
11697 
11698 	/*
11699 	 *	Check for copies of zero bytes.
11700 	 */
11701 
11702 	if (len == 0) {
11703 		*copy_result = VM_MAP_COPY_NULL;
11704 		return KERN_SUCCESS;
11705 	}
11706 
11707 	/*
11708 	 *	Check that the end address doesn't overflow
11709 	 */
11710 	src_end = src_addr + len;
11711 	if (src_end < src_addr) {
11712 		return KERN_INVALID_ADDRESS;
11713 	}
11714 
11715 	/*
11716 	 *	Compute (page aligned) start and end of region
11717 	 */
11718 	src_start = vm_map_trunc_page(src_addr,
11719 	    VM_MAP_PAGE_MASK(src_map));
11720 	src_end = vm_map_round_page(src_end,
11721 	    VM_MAP_PAGE_MASK(src_map));
11722 
11723 	/*
11724 	 * If the copy is sufficiently small, use a kernel buffer instead
11725 	 * of making a virtual copy.  The theory being that the cost of
11726 	 * setting up VM (and taking C-O-W faults) dominates the copy costs
11727 	 * for small regions.
11728 	 */
11729 	if ((len <= msg_ool_size_small) &&
11730 	    !use_maxprot &&
11731 	    !preserve_purgeable &&
11732 	    !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
11733 	    /*
11734 	     * Since the "msg_ool_size_small" threshold was increased and
11735 	     * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
11736 	     * address space limits, we revert to doing a virtual copy if the
11737 	     * copied range goes beyond those limits.  Otherwise, mach_vm_read()
11738 	     * of the commpage would now fail when it used to work.
11739 	     */
11740 	    (src_start >= vm_map_min(src_map) &&
11741 	    src_start < vm_map_max(src_map) &&
11742 	    src_end >= vm_map_min(src_map) &&
11743 	    src_end < vm_map_max(src_map))) {
11744 		return vm_map_copyin_kernel_buffer(src_map, src_addr, len,
11745 		           src_destroy, copy_result);
11746 	}
11747 
11748 	/*
11749 	 *	Allocate a header element for the list.
11750 	 *
11751 	 *	Use the start and end in the header to
11752 	 *	remember the endpoints prior to rounding.
11753 	 */
11754 
11755 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
11756 	copy->cpy_hdr.entries_pageable = TRUE;
11757 	copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
11758 	copy->offset = src_addr;
11759 	copy->size = len;
11760 
11761 	new_entry = vm_map_copy_entry_create(copy);
11762 
11763 #define RETURN(x)                                               \
11764 	MACRO_BEGIN                                             \
11765 	vm_map_unlock(src_map);                                 \
11766 	if(src_map != base_map)                                 \
11767 	        vm_map_deallocate(src_map);                     \
11768 	if (new_entry != VM_MAP_ENTRY_NULL)                     \
11769 	        vm_map_copy_entry_dispose(new_entry);           \
11770 	vm_map_copy_discard(copy);                              \
11771 	{                                                       \
11772 	        submap_map_t	*_ptr;                          \
11773                                                                 \
11774 	        for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
11775 	                parent_maps=parent_maps->next;          \
11776 	                if (_ptr->parent_map != base_map)       \
11777 	                        vm_map_deallocate(_ptr->parent_map);    \
11778 	                kfree_type(submap_map_t, _ptr);         \
11779 	        }                                               \
11780 	}                                                       \
11781 	MACRO_RETURN(x);                                        \
11782 	MACRO_END
11783 
11784 	/*
11785 	 *	Find the beginning of the region.
11786 	 */
11787 
11788 	vm_map_lock(src_map);
11789 
11790 	/*
11791 	 * Lookup the original "src_addr" rather than the truncated
11792 	 * "src_start", in case "src_start" falls in a non-map-aligned
11793 	 * map entry *before* the map entry that contains "src_addr"...
11794 	 */
11795 	if (!vm_map_lookup_entry(src_map, src_addr, &tmp_entry)) {
11796 		RETURN(KERN_INVALID_ADDRESS);
11797 	}
11798 	if (!tmp_entry->is_sub_map) {
11799 		/*
11800 		 * ... but clip to the map-rounded "src_start" rather than
11801 		 * "src_addr" to preserve map-alignment.  We'll adjust the
11802 		 * first copy entry at the end, if needed.
11803 		 */
11804 		vm_map_clip_start(src_map, tmp_entry, src_start);
11805 	}
11806 	if (src_start < tmp_entry->vme_start) {
11807 		/*
11808 		 * Move "src_start" up to the start of the
11809 		 * first map entry to copy.
11810 		 */
11811 		src_start = tmp_entry->vme_start;
11812 	}
11813 	/* set for later submap fix-up */
11814 	copy_addr = src_start;
11815 
11816 	/*
11817 	 *	Go through entries until we get to the end.
11818 	 */
11819 
11820 	while (TRUE) {
11821 		vm_map_entry_t  src_entry = tmp_entry;  /* Top-level entry */
11822 		vm_map_size_t   src_size;               /* Size of source
11823 		                                         * map entry (in both
11824 		                                         * maps)
11825 		                                         */
11826 
11827 		vm_object_t             src_object;     /* Object to copy */
11828 		vm_object_offset_t      src_offset;
11829 
11830 		vm_object_t             new_copy_object;/* vm_object_copy_* result */
11831 
11832 		boolean_t       src_needs_copy;         /* Should source map
11833 		                                         * be made read-only
11834 		                                         * for copy-on-write?
11835 		                                         */
11836 
11837 		boolean_t       new_entry_needs_copy;   /* Will new entry be COW? */
11838 
11839 		boolean_t       was_wired;              /* Was source wired? */
11840 		boolean_t       saved_used_for_jit;     /* Saved used_for_jit. */
11841 #if __arm64e__
11842 		boolean_t       saved_used_for_tpro;    /* Saved used_for_tpro */
11843 #endif
11844 		vm_map_version_t version;               /* Version before locks
11845 		                                         * dropped to make copy
11846 		                                         */
11847 		kern_return_t   result;                 /* Return value from
11848 		                                         * copy_strategically.
11849 		                                         */
11850 		while (tmp_entry->is_sub_map) {
11851 			vm_map_size_t submap_len;
11852 			submap_map_t *ptr;
11853 
11854 			ptr = kalloc_type(submap_map_t, Z_WAITOK);
11855 			ptr->next = parent_maps;
11856 			parent_maps = ptr;
11857 			ptr->parent_map = src_map;
11858 			ptr->base_start = src_start;
11859 			ptr->base_end = src_end;
11860 			submap_len = tmp_entry->vme_end - src_start;
11861 			if (submap_len > (src_end - src_start)) {
11862 				submap_len = src_end - src_start;
11863 			}
11864 			ptr->base_len = submap_len;
11865 
11866 			src_start -= tmp_entry->vme_start;
11867 			src_start += VME_OFFSET(tmp_entry);
11868 			src_end = src_start + submap_len;
11869 			src_map = VME_SUBMAP(tmp_entry);
11870 			vm_map_lock(src_map);
11871 			/* keep an outstanding reference for all maps in */
11872 			/* the parents tree except the base map */
11873 			vm_map_reference(src_map);
11874 			vm_map_unlock(ptr->parent_map);
11875 			if (!vm_map_lookup_entry(
11876 				    src_map, src_start, &tmp_entry)) {
11877 				RETURN(KERN_INVALID_ADDRESS);
11878 			}
11879 			map_share = TRUE;
11880 			if (!tmp_entry->is_sub_map) {
11881 				vm_map_clip_start(src_map, tmp_entry, src_start);
11882 			}
11883 			src_entry = tmp_entry;
11884 		}
11885 		/* we are now in the lowest level submap... */
11886 
11887 		if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
11888 		    (VME_OBJECT(tmp_entry)->phys_contiguous)) {
		/* This is not supported for now. In future */
11890 			/* we will need to detect the phys_contig   */
11891 			/* condition and then upgrade copy_slowly   */
11892 			/* to do physical copy from the device mem  */
11893 			/* based object. We can piggy-back off of   */
11894 			/* the was wired boolean to set-up the      */
11895 			/* proper handling */
11896 			RETURN(KERN_PROTECTION_FAILURE);
11897 		}
11898 		/*
11899 		 *	Create a new address map entry to hold the result.
11900 		 *	Fill in the fields from the appropriate source entries.
11901 		 *	We must unlock the source map to do this if we need
11902 		 *	to allocate a map entry.
11903 		 */
11904 		if (new_entry == VM_MAP_ENTRY_NULL) {
11905 			version.main_timestamp = src_map->timestamp;
11906 			vm_map_unlock(src_map);
11907 
11908 			new_entry = vm_map_copy_entry_create(copy);
11909 
11910 			vm_map_lock(src_map);
11911 			if ((version.main_timestamp + 1) != src_map->timestamp) {
11912 				if (!vm_map_lookup_entry(src_map, src_start,
11913 				    &tmp_entry)) {
11914 					RETURN(KERN_INVALID_ADDRESS);
11915 				}
11916 				if (!tmp_entry->is_sub_map) {
11917 					vm_map_clip_start(src_map, tmp_entry, src_start);
11918 				}
11919 				continue; /* restart w/ new tmp_entry */
11920 			}
11921 		}
11922 
11923 		/*
11924 		 *	Verify that the region can be read.
11925 		 */
11926 		if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
11927 		    !use_maxprot) ||
11928 		    (src_entry->max_protection & VM_PROT_READ) == 0) {
11929 			RETURN(KERN_PROTECTION_FAILURE);
11930 		}
11931 
11932 		/*
11933 		 *	Clip against the endpoints of the entire region.
11934 		 */
11935 
11936 		vm_map_clip_end(src_map, src_entry, src_end);
11937 
11938 		src_size = src_entry->vme_end - src_start;
11939 		src_object = VME_OBJECT(src_entry);
11940 		src_offset = VME_OFFSET(src_entry);
11941 		was_wired = (src_entry->wired_count != 0);
11942 
11943 		vm_map_entry_copy(src_map, new_entry, src_entry);
11944 		if (new_entry->is_sub_map) {
11945 			/* clr address space specifics */
11946 			new_entry->use_pmap = FALSE;
11947 		} else {
11948 			/*
11949 			 * We're dealing with a copy-on-write operation,
11950 			 * so the resulting mapping should not inherit the
11951 			 * original mapping's accounting settings.
11952 			 * "iokit_acct" should have been cleared in
11953 			 * vm_map_entry_copy().
11954 			 * "use_pmap" should be reset to its default (TRUE)
11955 			 * so that the new mapping gets accounted for in
11956 			 * the task's memory footprint.
11957 			 */
11958 			assert(!new_entry->iokit_acct);
11959 			new_entry->use_pmap = TRUE;
11960 		}
11961 
11962 		/*
11963 		 *	Attempt non-blocking copy-on-write optimizations.
11964 		 */
11965 
11966 		/*
11967 		 * If we are destroying the source, and the object
11968 		 * is internal, we could move the object reference
11969 		 * from the source to the copy.  The copy is
11970 		 * copy-on-write only if the source is.
11971 		 * We make another reference to the object, because
11972 		 * destroying the source entry will deallocate it.
11973 		 *
11974 		 * This memory transfer has to be atomic, (to prevent
11975 		 * the VM object from being shared or copied while
11976 		 * it's being moved here), so we could only do this
11977 		 * if we won't have to unlock the VM map until the
11978 		 * original mapping has been fully removed.
11979 		 */
11980 
11981 RestartCopy:
11982 		if ((src_object == VM_OBJECT_NULL ||
11983 		    (!was_wired && !map_share && !tmp_entry->is_shared
11984 		    && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
11985 		    vm_object_copy_quickly(
11986 			    VME_OBJECT(new_entry),
11987 			    src_offset,
11988 			    src_size,
11989 			    &src_needs_copy,
11990 			    &new_entry_needs_copy)) {
11991 			new_entry->needs_copy = new_entry_needs_copy;
11992 
11993 			/*
11994 			 *	Handle copy-on-write obligations
11995 			 */
11996 
11997 			if (src_needs_copy && !tmp_entry->needs_copy) {
11998 				vm_prot_t prot;
11999 
12000 				prot = src_entry->protection & ~VM_PROT_WRITE;
12001 
12002 				if (override_nx(src_map, VME_ALIAS(src_entry))
12003 				    && prot) {
12004 					prot |= VM_PROT_EXECUTE;
12005 				}
12006 
12007 				vm_object_pmap_protect(
12008 					src_object,
12009 					src_offset,
12010 					src_size,
12011 					(src_entry->is_shared ?
12012 					PMAP_NULL
12013 					: src_map->pmap),
12014 					VM_MAP_PAGE_SIZE(src_map),
12015 					src_entry->vme_start,
12016 					prot);
12017 
12018 				assert(tmp_entry->wired_count == 0);
12019 				tmp_entry->needs_copy = TRUE;
12020 			}
12021 
12022 			/*
12023 			 *	The map has never been unlocked, so it's safe
12024 			 *	to move to the next entry rather than doing
12025 			 *	another lookup.
12026 			 */
12027 
12028 			goto CopySuccessful;
12029 		}
12030 
12031 		entry_was_shared = tmp_entry->is_shared;
12032 
12033 		/*
12034 		 *	Take an object reference, so that we may
12035 		 *	release the map lock(s).
12036 		 */
12037 
12038 		assert(src_object != VM_OBJECT_NULL);
12039 		vm_object_reference(src_object);
12040 
12041 		/*
12042 		 *	Record the timestamp for later verification.
12043 		 *	Unlock the map.
12044 		 */
12045 
12046 		version.main_timestamp = src_map->timestamp;
12047 		vm_map_unlock(src_map); /* Increments timestamp once! */
12048 		saved_src_entry = src_entry;
12049 		tmp_entry = VM_MAP_ENTRY_NULL;
12050 		src_entry = VM_MAP_ENTRY_NULL;
12051 
12052 		/*
12053 		 *	Perform the copy
12054 		 */
12055 
12056 		if (was_wired ||
12057 		    (debug4k_no_cow_copyin &&
12058 		    VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
12059 CopySlowly:
12060 			vm_object_lock(src_object);
12061 			result = vm_object_copy_slowly(
12062 				src_object,
12063 				src_offset,
12064 				src_size,
12065 				THREAD_UNINT,
12066 				&new_copy_object);
12067 			/* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
12068 			saved_used_for_jit = new_entry->used_for_jit;
12069 #if __arm64e__
12070 			saved_used_for_tpro = new_entry->used_for_tpro;
12071 #endif
12072 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12073 			new_entry->used_for_jit = saved_used_for_jit;
12074 #if __arm64e__
12075 			new_entry->used_for_tpro = saved_used_for_tpro;
12076 #endif
12077 			VME_OFFSET_SET(new_entry,
12078 			    src_offset - vm_object_trunc_page(src_offset));
12079 			new_entry->needs_copy = FALSE;
12080 		} else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
12081 		    (entry_was_shared || map_share)) {
12082 			vm_object_t new_object;
12083 
12084 			vm_object_lock_shared(src_object);
12085 			new_object = vm_object_copy_delayed(
12086 				src_object,
12087 				src_offset,
12088 				src_size,
12089 				TRUE);
12090 			if (new_object == VM_OBJECT_NULL) {
12091 				goto CopySlowly;
12092 			}
12093 
12094 			VME_OBJECT_SET(new_entry, new_object, false, 0);
12095 			assert(new_entry->wired_count == 0);
12096 			new_entry->needs_copy = TRUE;
12097 			assert(!new_entry->iokit_acct);
12098 			assert(new_object->purgable == VM_PURGABLE_DENY);
12099 			assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
12100 			result = KERN_SUCCESS;
12101 		} else {
12102 			vm_object_offset_t new_offset;
12103 			new_offset = VME_OFFSET(new_entry);
12104 			result = vm_object_copy_strategically(src_object,
12105 			    src_offset,
12106 			    src_size,
12107 			    &new_copy_object,
12108 			    &new_offset,
12109 			    &new_entry_needs_copy);
12110 			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
12111 			saved_used_for_jit = new_entry->used_for_jit;
12112 #if __arm64e__
12113 			saved_used_for_tpro = new_entry->used_for_tpro;
12114 #endif
12115 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12116 			new_entry->used_for_jit = saved_used_for_jit;
12117 #if __arm64e__
12118 			new_entry->used_for_tpro = saved_used_for_tpro;
12119 #endif
12120 			if (new_offset != VME_OFFSET(new_entry)) {
12121 				VME_OFFSET_SET(new_entry, new_offset);
12122 			}
12123 
12124 			new_entry->needs_copy = new_entry_needs_copy;
12125 		}
12126 
12127 		if (result == KERN_SUCCESS &&
12128 		    ((preserve_purgeable &&
12129 		    src_object->purgable != VM_PURGABLE_DENY) ||
12130 		    new_entry->used_for_jit
12131 #if __arm64e__
12132 		    || new_entry->used_for_tpro
12133 #endif
12134 		    )) {
12135 			/*
12136 			 * Purgeable objects should be COPY_NONE, true share;
			 * this should be propagated to the copy.
12138 			 *
12139 			 * Also force mappings the pmap specially protects to
12140 			 * be COPY_NONE; trying to COW these mappings would
12141 			 * change the effective protections, which could have
12142 			 * side effects if the pmap layer relies on the
12143 			 * specified protections.
12144 			 */
12145 
12146 			vm_object_t     new_object;
12147 
12148 			new_object = VME_OBJECT(new_entry);
12149 			assert(new_object != src_object);
12150 			vm_object_lock(new_object);
12151 			assert(new_object->ref_count == 1);
12152 			assert(new_object->shadow == VM_OBJECT_NULL);
12153 			assert(new_object->copy == VM_OBJECT_NULL);
12154 			assert(new_object->vo_owner == NULL);
12155 
12156 			new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
12157 
12158 			if (preserve_purgeable &&
12159 			    src_object->purgable != VM_PURGABLE_DENY) {
12160 				new_object->true_share = TRUE;
12161 
12162 				/* start as non-volatile with no owner... */
12163 				new_object->purgable = VM_PURGABLE_NONVOLATILE;
12164 				vm_purgeable_nonvolatile_enqueue(new_object, NULL);
12165 				/* ... and move to src_object's purgeable state */
12166 				if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
12167 					int state;
12168 					state = src_object->purgable;
12169 					vm_object_purgable_control(
12170 						new_object,
12171 						VM_PURGABLE_SET_STATE_FROM_KERNEL,
12172 						&state);
12173 				}
12174 				/* no pmap accounting for purgeable objects */
12175 				new_entry->use_pmap = FALSE;
12176 			}
12177 
12178 			vm_object_unlock(new_object);
12179 			new_object = VM_OBJECT_NULL;
12180 		}
12181 
12182 		if (result != KERN_SUCCESS &&
12183 		    result != KERN_MEMORY_RESTART_COPY) {
12184 			vm_map_lock(src_map);
12185 			RETURN(result);
12186 		}
12187 
12188 		/*
12189 		 *	Throw away the extra reference
12190 		 */
12191 
12192 		vm_object_deallocate(src_object);
12193 
12194 		/*
12195 		 *	Verify that the map has not substantially
12196 		 *	changed while the copy was being made.
12197 		 */
12198 
12199 		vm_map_lock(src_map);
12200 
12201 		if ((version.main_timestamp + 1) == src_map->timestamp) {
12202 			/* src_map hasn't changed: src_entry is still valid */
12203 			src_entry = saved_src_entry;
12204 			goto VerificationSuccessful;
12205 		}
12206 
12207 		/*
12208 		 *	Simple version comparison failed.
12209 		 *
12210 		 *	Retry the lookup and verify that the
12211 		 *	same object/offset are still present.
12212 		 *
12213 		 *	[Note: a memory manager that colludes with
12214 		 *	the calling task can detect that we have
12215 		 *	cheated.  While the map was unlocked, the
12216 		 *	mapping could have been changed and restored.]
12217 		 */
12218 
12219 		if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
12220 			if (result != KERN_MEMORY_RESTART_COPY) {
12221 				vm_object_deallocate(VME_OBJECT(new_entry));
12222 				VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
12223 				/* reset accounting state */
12224 				new_entry->iokit_acct = FALSE;
12225 				new_entry->use_pmap = TRUE;
12226 			}
12227 			RETURN(KERN_INVALID_ADDRESS);
12228 		}
12229 
12230 		src_entry = tmp_entry;
12231 		vm_map_clip_start(src_map, src_entry, src_start);
12232 
12233 		if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
12234 		    !use_maxprot) ||
12235 		    ((src_entry->max_protection & VM_PROT_READ) == 0)) {
12236 			goto VerificationFailed;
12237 		}
12238 
12239 		if (src_entry->vme_end < new_entry->vme_end) {
12240 			/*
12241 			 * This entry might have been shortened
12242 			 * (vm_map_clip_end) or been replaced with
12243 			 * an entry that ends closer to "src_start"
12244 			 * than before.
12245 			 * Adjust "new_entry" accordingly; copying
12246 			 * less memory would be correct but we also
12247 			 * redo the copy (see below) if the new entry
12248 			 * no longer points at the same object/offset.
12249 			 */
12250 			assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
12251 			    VM_MAP_COPY_PAGE_MASK(copy)));
12252 			new_entry->vme_end = src_entry->vme_end;
12253 			src_size = new_entry->vme_end - src_start;
12254 		} else if (src_entry->vme_end > new_entry->vme_end) {
12255 			/*
12256 			 * This entry might have been extended
12257 			 * (vm_map_entry_simplify() or coalesce)
12258 			 * or been replaced with an entry that ends farther
12259 			 * from "src_start" than before.
12260 			 *
12261 			 * We've called vm_object_copy_*() only on
12262 			 * the previous <start:end> range, so we can't
12263 			 * just extend new_entry.  We have to re-do
12264 			 * the copy based on the new entry as if it was
12265 			 * pointing at a different object/offset (see
12266 			 * "Verification failed" below).
12267 			 */
12268 		}
12269 
12270 		if ((VME_OBJECT(src_entry) != src_object) ||
12271 		    (VME_OFFSET(src_entry) != src_offset) ||
12272 		    (src_entry->vme_end > new_entry->vme_end)) {
12273 			/*
12274 			 *	Verification failed.
12275 			 *
12276 			 *	Start over with this top-level entry.
12277 			 */
12278 
12279 VerificationFailed:     ;
12280 
12281 			vm_object_deallocate(VME_OBJECT(new_entry));
12282 			tmp_entry = src_entry;
12283 			continue;
12284 		}
12285 
12286 		/*
12287 		 *	Verification succeeded.
12288 		 */
12289 
12290 VerificationSuccessful:;
12291 
12292 		if (result == KERN_MEMORY_RESTART_COPY) {
12293 			goto RestartCopy;
12294 		}
12295 
12296 		/*
12297 		 *	Copy succeeded.
12298 		 */
12299 
12300 CopySuccessful: ;
12301 
12302 		/*
12303 		 *	Link in the new copy entry.
12304 		 */
12305 
12306 		vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
12307 		    new_entry);
12308 
12309 		/*
12310 		 *	Determine whether the entire region
12311 		 *	has been copied.
12312 		 */
12313 		src_base = src_start;
12314 		src_start = new_entry->vme_end;
12315 		new_entry = VM_MAP_ENTRY_NULL;
12316 		while ((src_start >= src_end) && (src_end != 0)) {
12317 			submap_map_t    *ptr;
12318 
12319 			if (src_map == base_map) {
12320 				/* back to the top */
12321 				break;
12322 			}
12323 
12324 			ptr = parent_maps;
12325 			assert(ptr != NULL);
12326 			parent_maps = parent_maps->next;
12327 
12328 			/* fix up the damage we did in that submap */
12329 			vm_map_simplify_range(src_map,
12330 			    src_base,
12331 			    src_end);
12332 
12333 			vm_map_unlock(src_map);
12334 			vm_map_deallocate(src_map);
12335 			vm_map_lock(ptr->parent_map);
12336 			src_map = ptr->parent_map;
12337 			src_base = ptr->base_start;
12338 			src_start = ptr->base_start + ptr->base_len;
12339 			src_end = ptr->base_end;
12340 			if (!vm_map_lookup_entry(src_map,
12341 			    src_start,
12342 			    &tmp_entry) &&
12343 			    (src_end > src_start)) {
12344 				RETURN(KERN_INVALID_ADDRESS);
12345 			}
12346 			kfree_type(submap_map_t, ptr);
12347 			if (parent_maps == NULL) {
12348 				map_share = FALSE;
12349 			}
12350 			src_entry = tmp_entry->vme_prev;
12351 		}
12352 
12353 		if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
12354 		    (src_start >= src_addr + len) &&
12355 		    (src_addr + len != 0)) {
12356 			/*
12357 			 * Stop copying now, even though we haven't reached
12358 			 * "src_end".  We'll adjust the end of the last copy
12359 			 * entry at the end, if needed.
12360 			 *
			 * If src_map's alignment is different from the
12362 			 * system's page-alignment, there could be
12363 			 * extra non-map-aligned map entries between
12364 			 * the original (non-rounded) "src_addr + len"
12365 			 * and the rounded "src_end".
12366 			 * We do not want to copy those map entries since
12367 			 * they're not part of the copied range.
12368 			 */
12369 			break;
12370 		}
12371 
12372 		if ((src_start >= src_end) && (src_end != 0)) {
12373 			break;
12374 		}
12375 
12376 		/*
12377 		 *	Verify that there are no gaps in the region
12378 		 */
12379 
12380 		tmp_entry = src_entry->vme_next;
12381 		if ((tmp_entry->vme_start != src_start) ||
12382 		    (tmp_entry == vm_map_to_entry(src_map))) {
12383 			RETURN(KERN_INVALID_ADDRESS);
12384 		}
12385 	}
12386 
12387 	/*
12388 	 * If the source should be destroyed, do it now, since the
12389 	 * copy was successful.
12390 	 */
12391 	if (src_destroy) {
12392 		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
12393 
12394 		if (src_map == kernel_map) {
12395 			remove_flags |= VM_MAP_REMOVE_KUNWIRE;
12396 		}
12397 		(void)vm_map_remove_and_unlock(src_map,
12398 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
12399 		    src_end,
12400 		    remove_flags,
12401 		    KMEM_GUARD_NONE);
12402 	} else {
12403 		/* fix up the damage we did in the base map */
12404 		vm_map_simplify_range(
12405 			src_map,
12406 			vm_map_trunc_page(src_addr,
12407 			VM_MAP_PAGE_MASK(src_map)),
12408 			vm_map_round_page(src_end,
12409 			VM_MAP_PAGE_MASK(src_map)));
12410 		vm_map_unlock(src_map);
12411 	}
12412 
12413 	tmp_entry = VM_MAP_ENTRY_NULL;
12414 
12415 	if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
12416 	    VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12417 		vm_map_offset_t original_start, original_offset, original_end;
12418 
12419 		assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12420 
12421 		/* adjust alignment of first copy_entry's "vme_start" */
12422 		tmp_entry = vm_map_copy_first_entry(copy);
12423 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12424 			vm_map_offset_t adjustment;
12425 
12426 			original_start = tmp_entry->vme_start;
12427 			original_offset = VME_OFFSET(tmp_entry);
12428 
12429 			/* map-align the start of the first copy entry... */
12430 			adjustment = (tmp_entry->vme_start -
12431 			    vm_map_trunc_page(
12432 				    tmp_entry->vme_start,
12433 				    VM_MAP_PAGE_MASK(src_map)));
12434 			tmp_entry->vme_start -= adjustment;
12435 			VME_OFFSET_SET(tmp_entry,
12436 			    VME_OFFSET(tmp_entry) - adjustment);
12437 			copy_addr -= adjustment;
12438 			assert(tmp_entry->vme_start < tmp_entry->vme_end);
12439 			/* ... adjust for mis-aligned start of copy range */
12440 			adjustment =
12441 			    (vm_map_trunc_page(copy->offset,
12442 			    PAGE_MASK) -
12443 			    vm_map_trunc_page(copy->offset,
12444 			    VM_MAP_PAGE_MASK(src_map)));
12445 			if (adjustment) {
12446 				assert(page_aligned(adjustment));
12447 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12448 				tmp_entry->vme_start += adjustment;
12449 				VME_OFFSET_SET(tmp_entry,
12450 				    (VME_OFFSET(tmp_entry) +
12451 				    adjustment));
12452 				copy_addr += adjustment;
12453 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12454 			}
12455 
12456 			/*
12457 			 * Assert that the adjustments haven't exposed
12458 			 * more than was originally copied...
12459 			 */
12460 			assert(tmp_entry->vme_start >= original_start);
12461 			assert(VME_OFFSET(tmp_entry) >= original_offset);
12462 			/*
12463 			 * ... and that it did not adjust outside of a
12464 			 * a single 16K page.
12465 			 */
12466 			assert(vm_map_trunc_page(tmp_entry->vme_start,
12467 			    VM_MAP_PAGE_MASK(src_map)) ==
12468 			    vm_map_trunc_page(original_start,
12469 			    VM_MAP_PAGE_MASK(src_map)));
12470 		}
12471 
12472 		/* adjust alignment of last copy_entry's "vme_end" */
12473 		tmp_entry = vm_map_copy_last_entry(copy);
12474 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12475 			vm_map_offset_t adjustment;
12476 
12477 			original_end = tmp_entry->vme_end;
12478 
12479 			/* map-align the end of the last copy entry... */
12480 			tmp_entry->vme_end =
12481 			    vm_map_round_page(tmp_entry->vme_end,
12482 			    VM_MAP_PAGE_MASK(src_map));
12483 			/* ... adjust for mis-aligned end of copy range */
12484 			adjustment =
12485 			    (vm_map_round_page((copy->offset +
12486 			    copy->size),
12487 			    VM_MAP_PAGE_MASK(src_map)) -
12488 			    vm_map_round_page((copy->offset +
12489 			    copy->size),
12490 			    PAGE_MASK));
12491 			if (adjustment) {
12492 				assert(page_aligned(adjustment));
12493 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12494 				tmp_entry->vme_end -= adjustment;
12495 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12496 			}
12497 
12498 			/*
12499 			 * Assert that the adjustments haven't exposed
12500 			 * more than was originally copied...
12501 			 */
12502 			assert(tmp_entry->vme_end <= original_end);
12503 			/*
12504 			 * ... and that it did not adjust outside of a
12505 			 * a single 16K page.
12506 			 */
12507 			assert(vm_map_round_page(tmp_entry->vme_end,
12508 			    VM_MAP_PAGE_MASK(src_map)) ==
12509 			    vm_map_round_page(original_end,
12510 			    VM_MAP_PAGE_MASK(src_map)));
12511 		}
12512 	}
12513 
12514 	/* Fix-up start and end points in copy.  This is necessary */
12515 	/* when the various entries in the copy object were picked */
12516 	/* up from different sub-maps */
12517 
12518 	tmp_entry = vm_map_copy_first_entry(copy);
12519 	copy_size = 0; /* compute actual size */
12520 	while (tmp_entry != vm_map_copy_to_entry(copy)) {
12521 		assert(VM_MAP_PAGE_ALIGNED(
12522 			    copy_addr + (tmp_entry->vme_end -
12523 			    tmp_entry->vme_start),
12524 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12525 		assert(VM_MAP_PAGE_ALIGNED(
12526 			    copy_addr,
12527 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12528 
12529 		/*
12530 		 * The copy_entries will be injected directly into the
12531 		 * destination map and might not be "map aligned" there...
12532 		 */
12533 		tmp_entry->map_aligned = FALSE;
12534 
12535 		tmp_entry->vme_end = copy_addr +
12536 		    (tmp_entry->vme_end - tmp_entry->vme_start);
12537 		tmp_entry->vme_start = copy_addr;
12538 		assert(tmp_entry->vme_start < tmp_entry->vme_end);
12539 		copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12540 		copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12541 		tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12542 	}
12543 
12544 	if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12545 	    copy_size < copy->size) {
12546 		/*
12547 		 * The actual size of the VM map copy is smaller than what
12548 		 * was requested by the caller.  This must be because some
12549 		 * PAGE_SIZE-sized pages are missing at the end of the last
12550 		 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12551 		 * The caller might not have been aware of those missing
12552 		 * pages and might not want to be aware of it, which is
12553 		 * fine as long as they don't try to access (and crash on)
12554 		 * those missing pages.
12555 		 * Let's adjust the size of the "copy", to avoid failing
12556 		 * in vm_map_copyout() or vm_map_copy_overwrite().
12557 		 */
12558 		assert(vm_map_round_page(copy_size,
12559 		    VM_MAP_PAGE_MASK(src_map)) ==
12560 		    vm_map_round_page(copy->size,
12561 		    VM_MAP_PAGE_MASK(src_map)));
12562 		copy->size = copy_size;
12563 	}
12564 
12565 	*copy_result = copy;
12566 	return KERN_SUCCESS;
12567 
12568 #undef  RETURN
12569 }
12570 
12571 kern_return_t
vm_map_copy_extract(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t do_copy,vm_map_copy_t * copy_result,vm_prot_t * cur_prot,vm_prot_t * max_prot,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)12572 vm_map_copy_extract(
12573 	vm_map_t                src_map,
12574 	vm_map_address_t        src_addr,
12575 	vm_map_size_t           len,
12576 	boolean_t               do_copy,
12577 	vm_map_copy_t           *copy_result,   /* OUT */
12578 	vm_prot_t               *cur_prot,      /* IN/OUT */
12579 	vm_prot_t               *max_prot,      /* IN/OUT */
12580 	vm_inherit_t            inheritance,
12581 	vm_map_kernel_flags_t   vmk_flags)
12582 {
12583 	vm_map_copy_t   copy;
12584 	kern_return_t   kr;
12585 	vm_prot_t required_cur_prot, required_max_prot;
12586 
12587 	/*
12588 	 *	Check for copies of zero bytes.
12589 	 */
12590 
12591 	if (len == 0) {
12592 		*copy_result = VM_MAP_COPY_NULL;
12593 		return KERN_SUCCESS;
12594 	}
12595 
12596 	/*
12597 	 *	Check that the end address doesn't overflow
12598 	 */
12599 	if (src_addr + len < src_addr) {
12600 		return KERN_INVALID_ADDRESS;
12601 	}
12602 
12603 	if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12604 		DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12605 	}
12606 
12607 	required_cur_prot = *cur_prot;
12608 	required_max_prot = *max_prot;
12609 
12610 	/*
12611 	 *	Allocate a header element for the list.
12612 	 *
12613 	 *	Use the start and end in the header to
12614 	 *	remember the endpoints prior to rounding.
12615 	 */
12616 
12617 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12618 	copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
12619 	copy->offset = 0;
12620 	copy->size = len;
12621 
12622 	kr = vm_map_remap_extract(src_map,
12623 	    src_addr,
12624 	    len,
12625 	    do_copy,             /* copy */
12626 	    copy,
12627 	    cur_prot,            /* IN/OUT */
12628 	    max_prot,            /* IN/OUT */
12629 	    inheritance,
12630 	    vmk_flags);
12631 	if (kr != KERN_SUCCESS) {
12632 		vm_map_copy_discard(copy);
12633 		return kr;
12634 	}
12635 	if (required_cur_prot != VM_PROT_NONE) {
12636 		assert((*cur_prot & required_cur_prot) == required_cur_prot);
12637 		assert((*max_prot & required_max_prot) == required_max_prot);
12638 	}
12639 
12640 	*copy_result = copy;
12641 	return KERN_SUCCESS;
12642 }
12643 
/*
 *	vm_map_fork_share:
 *
 *	Share one entry of "old_map" with "new_map" during fork()
 *	(VM_INHERIT_SHARE).  Makes sure the backing object is suitable
 *	for sharing (allocating or shadowing it if necessary), takes a
 *	reference on it, then clones the entry into "new_map" and marks
 *	both entries as shared.
 *	Both maps are expected to be locked by the caller
 *	(vm_map_fork()); this routine takes no locks on them itself.
 */
static void
vm_map_fork_share(
	vm_map_t        old_map,
	vm_map_entry_t  old_entry,
	vm_map_t        new_map)
{
	vm_object_t     object;
	vm_map_entry_t  new_entry;

	/*
	 *	New sharing code.  New map entry
	 *	references original object.  Internal
	 *	objects use asynchronous copy algorithm for
	 *	future copies.  First make sure we have
	 *	the right object.  If we need a shadow,
	 *	or someone else already has one, then
	 *	make a new shadow and share it.
	 */

	/*
	 * "object" is only initialized (and later read) on the
	 * non-submap paths below.
	 */
	if (!old_entry->is_sub_map) {
		object = VME_OBJECT(old_entry);
	}

	if (old_entry->is_sub_map) {
		assert(old_entry->wired_count == 0);
#ifndef NO_NESTED_PMAP
#if !PMAP_FORK_NEST
		if (old_entry->use_pmap) {
			kern_return_t   result;

			/* nest the submap's pmap into the child's pmap */
			result = pmap_nest(new_map->pmap,
			    (VME_SUBMAP(old_entry))->pmap,
			    (addr64_t)old_entry->vme_start,
			    (uint64_t)(old_entry->vme_end - old_entry->vme_start));
			if (result) {
				panic("vm_map_fork_share: pmap_nest failed!");
			}
		}
#endif /* !PMAP_FORK_NEST */
#endif  /* NO_NESTED_PMAP */
	} else if (object == VM_OBJECT_NULL) {
		/* no backing object yet: create one covering the entry */
		object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
		    old_entry->vme_start));
		VME_OFFSET_SET(old_entry, 0);
		VME_OBJECT_SET(old_entry, object, false, 0);
		old_entry->use_pmap = TRUE;
//		assert(!old_entry->needs_copy);
	} else if (object->copy_strategy !=
	    MEMORY_OBJECT_COPY_SYMMETRIC) {
		/*
		 *	We are already using an asymmetric
		 *	copy, and therefore we already have
		 *	the right object.
		 */

		assert(!old_entry->needs_copy);
	} else if (old_entry->needs_copy ||       /* case 1 */
	    object->shadowed ||                 /* case 2 */
	    (!object->true_share &&             /* case 3 */
	    !old_entry->is_shared &&
	    (object->vo_size >
	    (vm_map_size_t)(old_entry->vme_end -
	    old_entry->vme_start)))) {
		/*
		 *	We need to create a shadow.
		 *	There are three cases here.
		 *	In the first case, we need to
		 *	complete a deferred symmetrical
		 *	copy that we participated in.
		 *	In the second and third cases,
		 *	we need to create the shadow so
		 *	that changes that we make to the
		 *	object do not interfere with
		 *	any symmetrical copies which
		 *	have occured (case 2) or which
		 *	might occur (case 3).
		 *
		 *	The first case is when we had
		 *	deferred shadow object creation
		 *	via the entry->needs_copy mechanism.
		 *	This mechanism only works when
		 *	only one entry points to the source
		 *	object, and we are about to create
		 *	a second entry pointing to the
		 *	same object. The problem is that
		 *	there is no way of mapping from
		 *	an object to the entries pointing
		 *	to it. (Deferred shadow creation
		 *	works with one entry because occurs
		 *	at fault time, and we walk from the
		 *	entry to the object when handling
		 *	the fault.)
		 *
		 *	The second case is when the object
		 *	to be shared has already been copied
		 *	with a symmetric copy, but we point
		 *	directly to the object without
		 *	needs_copy set in our entry. (This
		 *	can happen because different ranges
		 *	of an object can be pointed to by
		 *	different entries. In particular,
		 *	a single entry pointing to an object
		 *	can be split by a call to vm_inherit,
		 *	which, combined with task_create, can
		 *	result in the different entries
		 *	having different needs_copy values.)
		 *	The shadowed flag in the object allows
		 *	us to detect this case. The problem
		 *	with this case is that if this object
		 *	has or will have shadows, then we
		 *	must not perform an asymmetric copy
		 *	of this object, since such a copy
		 *	allows the object to be changed, which
		 *	will break the previous symmetrical
		 *	copies (which rely upon the object
		 *	not changing). In a sense, the shadowed
		 *	flag says "don't change this object".
		 *	We fix this by creating a shadow
		 *	object for this object, and sharing
		 *	that. This works because we are free
		 *	to change the shadow object (and thus
		 *	to use an asymmetric copy strategy);
		 *	this is also semantically correct,
		 *	since this object is temporary, and
		 *	therefore a copy of the object is
		 *	as good as the object itself. (This
		 *	is not true for permanent objects,
		 *	since the pager needs to see changes,
		 *	which won't happen if the changes
		 *	are made to a copy.)
		 *
		 *	The third case is when the object
		 *	to be shared has parts sticking
		 *	outside of the entry we're working
		 *	with, and thus may in the future
		 *	be subject to a symmetrical copy.
		 *	(This is a preemptive version of
		 *	case 2.)
		 */
		VME_OBJECT_SHADOW(old_entry,
		    (vm_map_size_t) (old_entry->vme_end -
		    old_entry->vme_start),
		    vm_map_always_shadow(old_map));

		/*
		 *	If we're making a shadow for other than
		 *	copy on write reasons, then we have
		 *	to remove write permission.
		 */

		if (!old_entry->needs_copy &&
		    (old_entry->protection & VM_PROT_WRITE)) {
			vm_prot_t prot;

			assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));

			prot = old_entry->protection & ~VM_PROT_WRITE;

			assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));

			if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
				prot |= VM_PROT_EXECUTE;
			}


			if (old_map->mapped_in_other_pmaps) {
				/*
				 * The map may be reflected in other pmaps:
				 * downgrade protections through the object
				 * so every pmap is covered.
				 */
				vm_object_pmap_protect(
					VME_OBJECT(old_entry),
					VME_OFFSET(old_entry),
					(old_entry->vme_end -
					old_entry->vme_start),
					PMAP_NULL,
					PAGE_SIZE,
					old_entry->vme_start,
					prot);
			} else {
				pmap_protect(old_map->pmap,
				    old_entry->vme_start,
				    old_entry->vme_end,
				    prot);
			}
		}

		old_entry->needs_copy = FALSE;
		object = VME_OBJECT(old_entry);
	}


	/*
	 *	If object was using a symmetric copy strategy,
	 *	change its copy strategy to the default
	 *	asymmetric copy strategy, which is copy_delay
	 *	in the non-norma case and copy_call in the
	 *	norma case. Bump the reference count for the
	 *	new entry.
	 */

	if (old_entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(old_entry));
	} else {
		vm_object_lock(object);
		vm_object_reference_locked(object);
		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		}
		vm_object_unlock(object);
	}

	/*
	 *	Clone the entry, using object ref from above.
	 *	Mark both entries as shared.
	 */

	new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
	vm_map_entry_copy(old_map, new_entry, old_entry);
	old_entry->is_shared = TRUE;
	new_entry->is_shared = TRUE;

	/*
	 * We're dealing with a shared mapping, so the resulting mapping
	 * should inherit some of the original mapping's accounting settings.
	 * "iokit_acct" should have been cleared in vm_map_entry_copy().
	 * "use_pmap" should stay the same as before (if it hasn't been reset
	 * to TRUE when we cleared "iokit_acct").
	 */
	assert(!new_entry->iokit_acct);

	/*
	 *	If old entry's inheritence is VM_INHERIT_NONE,
	 *	the new entry is for corpse fork, remove the
	 *	write permission from the new entry.
	 */
	if (old_entry->inheritance == VM_INHERIT_NONE) {
		new_entry->protection &= ~VM_PROT_WRITE;
		new_entry->max_protection &= ~VM_PROT_WRITE;
	}

	/*
	 *	Insert the entry into the new map -- we
	 *	know we're inserting at the end of the new
	 *	map.
	 */

	vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
	    VM_MAP_KERNEL_FLAGS_NONE);

	/*
	 *	Update the physical map
	 */

	if (old_entry->is_sub_map) {
		/* Bill Angell pmap support goes here */
	} else {
		/* eagerly copy the parent's mappings into the child's pmap */
		pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
		    old_entry->vme_end - old_entry->vme_start,
		    old_entry->vme_start);
	}
}
12902 
/*
 *	vm_map_fork_copy:
 *
 *	Slow-path copy of one entry of "old_map" into "new_map" during
 *	fork(): a full vm_map_copyin of the entry's range.  The old map
 *	must be locked on entry; it is unlocked for the duration of the
 *	copy and re-locked before returning.  Because the map can change
 *	while unlocked, *old_entry_p is updated to the entry where the
 *	caller's traversal should resume.
 *	Returns TRUE if the range was copied into "new_map" (the caller
 *	should add its size to the new map's total), FALSE if the range
 *	was skipped.
 */
static boolean_t
vm_map_fork_copy(
	vm_map_t        old_map,
	vm_map_entry_t  *old_entry_p,
	vm_map_t        new_map,
	int             vm_map_copyin_flags)
{
	vm_map_entry_t old_entry = *old_entry_p;
	vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
	vm_map_offset_t start = old_entry->vme_start;
	vm_map_copy_t copy;
	/* remember the insertion point before dropping the lock */
	vm_map_entry_t last = vm_map_last_entry(new_map);

	vm_map_unlock(old_map);
	/*
	 *	Use maxprot version of copyin because we
	 *	care about whether this memory can ever
	 *	be accessed, not just whether it's accessible
	 *	right now.
	 */
	vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
	if (vm_map_copyin_internal(old_map, start, entry_size,
	    vm_map_copyin_flags, &copy)
	    != KERN_SUCCESS) {
		/*
		 *	The map might have changed while it
		 *	was unlocked, check it again.  Skip
		 *	any blank space or permanently
		 *	unreadable region.
		 */
		vm_map_lock(old_map);
		if (!vm_map_lookup_entry(old_map, start, &last) ||
		    (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
			last = last->vme_next;
		}
		*old_entry_p = last;

		/*
		 * XXX	For some error returns, want to
		 * XXX	skip to the next element.  Note
		 *	that INVALID_ADDRESS and
		 *	PROTECTION_FAILURE are handled above.
		 */

		return FALSE;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/*
	 *	Insert the copy into the new map
	 */
	vm_map_copy_insert(new_map, last, copy);

	/*
	 *	Pick up the traversal at the end of
	 *	the copied region.
	 */

	vm_map_lock(old_map);
	start += entry_size;
	if (!vm_map_lookup_entry(old_map, start, &last)) {
		/* no entry covers "start": resume at the next entry */
		last = last->vme_next;
	} else {
		if (last->vme_start == start) {
			/*
			 * No need to clip here and we don't
			 * want to cause any unnecessary
			 * unnesting...
			 */
		} else {
			vm_map_clip_start(old_map, last, start);
		}
	}
	*old_entry_p = last;

	return TRUE;
}
12985 
12986 #if PMAP_FORK_NEST
12987 #define PMAP_FORK_NEST_DEBUG 0
12988 static inline void
vm_map_fork_unnest(pmap_t new_pmap,vm_map_offset_t pre_nested_start,vm_map_offset_t pre_nested_end,vm_map_offset_t start,vm_map_offset_t end)12989 vm_map_fork_unnest(
12990 	pmap_t new_pmap,
12991 	vm_map_offset_t pre_nested_start,
12992 	vm_map_offset_t pre_nested_end,
12993 	vm_map_offset_t start,
12994 	vm_map_offset_t end)
12995 {
12996 	kern_return_t kr;
12997 	vm_map_offset_t nesting_mask, start_unnest, end_unnest;
12998 
12999 	assertf(pre_nested_start <= pre_nested_end,
13000 	    "pre_nested start 0x%llx end 0x%llx",
13001 	    (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13002 	assertf(start <= end,
13003 	    "start 0x%llx end 0x%llx",
13004 	    (uint64_t) start, (uint64_t)end);
13005 
13006 	if (pre_nested_start == pre_nested_end) {
13007 		/* nothing was pre-nested: done */
13008 		return;
13009 	}
13010 	if (end <= pre_nested_start) {
13011 		/* fully before pre-nested range: done */
13012 		return;
13013 	}
13014 	if (start >= pre_nested_end) {
13015 		/* fully after pre-nested range: done */
13016 		return;
13017 	}
13018 	/* ignore parts of range outside of pre_nested range */
13019 	if (start < pre_nested_start) {
13020 		start = pre_nested_start;
13021 	}
13022 	if (end > pre_nested_end) {
13023 		end = pre_nested_end;
13024 	}
13025 	nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
13026 	start_unnest = start & ~nesting_mask;
13027 	end_unnest = (end + nesting_mask) & ~nesting_mask;
13028 	kr = pmap_unnest(new_pmap,
13029 	    (addr64_t)start_unnest,
13030 	    (uint64_t)(end_unnest - start_unnest));
13031 #if PMAP_FORK_NEST_DEBUG
13032 	printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
13033 #endif /* PMAP_FORK_NEST_DEBUG */
13034 	assertf(kr == KERN_SUCCESS,
13035 	    "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
13036 	    (uint64_t)start, (uint64_t)end, new_pmap,
13037 	    (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
13038 	    kr);
13039 }
13040 #endif /* PMAP_FORK_NEST */
13041 
13042 void
vm_map_inherit_limits(vm_map_t new_map,const struct _vm_map * old_map)13043 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13044 {
13045 	new_map->size_limit = old_map->size_limit;
13046 	new_map->data_limit = old_map->data_limit;
13047 	new_map->user_wire_limit = old_map->user_wire_limit;
13048 	new_map->reserved_regions = old_map->reserved_regions;
13049 }
13050 
13051 /*
13052  *	vm_map_fork:
13053  *
13054  *	Create and return a new map based on the old
13055  *	map, according to the inheritance values on the
13056  *	regions in that map and the options.
13057  *
13058  *	The source map must not be locked.
13059  */
vm_map_t
vm_map_fork(
	ledger_t        ledger,
	vm_map_t        old_map,
	int             options)
{
	pmap_t          new_pmap;
	vm_map_t        new_map;
	vm_map_entry_t  old_entry;
	vm_map_size_t   new_size = 0, entry_size;
	vm_map_entry_t  new_entry;
	boolean_t       src_needs_copy;
	boolean_t       new_entry_needs_copy;
	boolean_t       pmap_is64bit;
	int             vm_map_copyin_flags;
	vm_inherit_t    old_entry_inheritance;
	int             map_create_options;
	/*
	 * Only initialized and consulted when the
	 * VM_MAP_FORK_CORPSE_FOOTPRINT option is set.
	 */
	kern_return_t   footprint_collect_kr;

	if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
	    VM_MAP_FORK_PRESERVE_PURGEABLE |
	    VM_MAP_FORK_CORPSE_FOOTPRINT)) {
		/* unsupported option */
		return VM_MAP_NULL;
	}

	pmap_is64bit =
#if defined(__i386__) || defined(__x86_64__)
	    old_map->pmap->pm_task_map != TASK_MAP_32BIT;
#elif defined(__arm64__)
	    old_map->pmap->is_64bit;
#else
#error Unknown architecture.
#endif

	/* build the child pmap's creation flags from the parent's traits */
	unsigned int pmap_flags = 0;
	pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
#if defined(HAS_APPLE_PAC)
	pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
#endif
#if CONFIG_ROSETTA
	pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
#endif
#if PMAP_CREATE_FORCE_4K_PAGES
	if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
	    PAGE_SIZE != FOURK_PAGE_SIZE) {
		pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
	}
#endif /* PMAP_CREATE_FORCE_4K_PAGES */
	new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
	if (new_pmap == NULL) {
		return VM_MAP_NULL;
	}

	/* extra reference on old_map, dropped via vm_map_deallocate() below */
	vm_map_reference(old_map);
	vm_map_lock(old_map);

	map_create_options = 0;
	if (old_map->hdr.entries_pageable) {
		map_create_options |= VM_MAP_CREATE_PAGEABLE;
	}
	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
		map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
		footprint_collect_kr = KERN_SUCCESS;
	}
	new_map = vm_map_create_options(new_pmap,
	    old_map->min_offset,
	    old_map->max_offset,
	    map_create_options);

	/* inherit cs_enforcement */
	vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);

	vm_map_lock(new_map);
	vm_commit_pagezero_status(new_map);
	/* inherit the parent map's page size */
	vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));

	/* inherit the parent rlimits */
	vm_map_inherit_limits(new_map, old_map);

#if CONFIG_MAP_RANGES
	/* inherit the parent map's VM ranges */
	vm_map_range_fork(new_map, old_map);
#endif

#if CODE_SIGNING_MONITOR
	/* Prepare the monitor for the fork */
	csm_fork_prepare(old_map->pmap, new_pmap);
#endif

#if PMAP_FORK_NEST
	/*
	 * Pre-nest the shared region's pmap.
	 * Unnecessary pre-nesting is undone per-entry in the loop below
	 * via vm_map_fork_unnest().
	 */
	vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
	pmap_fork_nest(old_map->pmap, new_pmap,
	    &pre_nested_start, &pre_nested_end);
#if PMAP_FORK_NEST_DEBUG
	printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
#endif /* PMAP_FORK_NEST_DEBUG */
#endif /* PMAP_FORK_NEST */

	/* walk every entry of the parent map */
	for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
		/*
		 * Abort any corpse collection if the system is shutting down.
		 */
		if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
		    get_system_inshutdown()) {
#if PMAP_FORK_NEST
			new_entry = vm_map_last_entry(new_map);
			if (new_entry == vm_map_to_entry(new_map)) {
				/* unnest all that was pre-nested */
				vm_map_fork_unnest(new_pmap,
				    pre_nested_start, pre_nested_end,
				    vm_map_min(new_map), vm_map_max(new_map));
			} else if (new_entry->vme_end < vm_map_max(new_map)) {
				/* unnest hole at the end, if pre-nested */
				vm_map_fork_unnest(new_pmap,
				    pre_nested_start, pre_nested_end,
				    new_entry->vme_end, vm_map_max(new_map));
			}
#endif /* PMAP_FORK_NEST */
			vm_map_corpse_footprint_collect_done(new_map);
			vm_map_unlock(new_map);
			vm_map_unlock(old_map);
			vm_map_deallocate(new_map);
			vm_map_deallocate(old_map);
			printf("Aborting corpse map due to system shutdown\n");
			return VM_MAP_NULL;
		}

		entry_size = old_entry->vme_end - old_entry->vme_start;

#if PMAP_FORK_NEST
		/*
		 * Undo any unnecessary pre-nesting.
		 */
		vm_map_offset_t prev_end;
		if (old_entry == vm_map_first_entry(old_map)) {
			prev_end = vm_map_min(old_map);
		} else {
			prev_end = old_entry->vme_prev->vme_end;
		}
		if (prev_end < old_entry->vme_start) {
			/* unnest hole before this entry, if pre-nested */
			vm_map_fork_unnest(new_pmap,
			    pre_nested_start, pre_nested_end,
			    prev_end, old_entry->vme_start);
		}
		if (old_entry->is_sub_map && old_entry->use_pmap) {
			/* keep this entry nested in the child */
#if PMAP_FORK_NEST_DEBUG
			printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
#endif /* PMAP_FORK_NEST_DEBUG */
		} else {
			/* undo nesting for this entry, if pre-nested */
			vm_map_fork_unnest(new_pmap,
			    pre_nested_start, pre_nested_end,
			    old_entry->vme_start, old_entry->vme_end);
		}
#endif /* PMAP_FORK_NEST */

		old_entry_inheritance = old_entry->inheritance;
		/*
		 * If caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option
		 * share VM_INHERIT_NONE entries that are not backed by a
		 * device pager.
		 */
		if (old_entry_inheritance == VM_INHERIT_NONE &&
		    (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
		    (old_entry->protection & VM_PROT_READ) &&
		    !(!old_entry->is_sub_map &&
		    VME_OBJECT(old_entry) != NULL &&
		    VME_OBJECT(old_entry)->pager != NULL &&
		    is_device_pager_ops(
			    VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
			old_entry_inheritance = VM_INHERIT_SHARE;
		}

		if (old_entry_inheritance != VM_INHERIT_NONE &&
		    (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
		    footprint_collect_kr == KERN_SUCCESS) {
			/*
			 * The corpse won't have old_map->pmap to query
			 * footprint information, so collect that data now
			 * and store it in new_map->vmmap_corpse_footprint
			 * for later autopsy.
			 */
			footprint_collect_kr =
			    vm_map_corpse_footprint_collect(old_map,
			    old_entry,
			    new_map);
		}

		switch (old_entry_inheritance) {
		case VM_INHERIT_NONE:
			break;

		case VM_INHERIT_SHARE:
			vm_map_fork_share(old_map, old_entry, new_map);
			new_size += entry_size;
			break;

		case VM_INHERIT_COPY:

			/*
			 *	Inline the copy_quickly case;
			 *	upon failure, fall back on call
			 *	to vm_map_fork_copy.
			 */

			if (old_entry->is_sub_map) {
				break;
			}
			if ((old_entry->wired_count != 0) ||
			    ((VME_OBJECT(old_entry) != NULL) &&
			    (VME_OBJECT(old_entry)->true_share))) {
				goto slow_vm_map_fork_copy;
			}

			new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
			vm_map_entry_copy(old_map, new_entry, old_entry);
			if (old_entry->vme_permanent) {
				/* inherit "permanent" on fork() */
				new_entry->vme_permanent = TRUE;
			}

			if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
				new_map->jit_entry_exists = TRUE;
			}

			if (new_entry->is_sub_map) {
				/* clear address space specifics */
				new_entry->use_pmap = FALSE;
			} else {
				/*
				 * We're dealing with a copy-on-write operation,
				 * so the resulting mapping should not inherit
				 * the original mapping's accounting settings.
				 * "iokit_acct" should have been cleared in
				 * vm_map_entry_copy().
				 * "use_pmap" should be reset to its default
				 * (TRUE) so that the new mapping gets
				 * accounted for in the task's memory footprint.
				 */
				assert(!new_entry->iokit_acct);
				new_entry->use_pmap = TRUE;
			}

			if (!vm_object_copy_quickly(
				    VME_OBJECT(new_entry),
				    VME_OFFSET(old_entry),
				    (old_entry->vme_end -
				    old_entry->vme_start),
				    &src_needs_copy,
				    &new_entry_needs_copy)) {
				vm_map_entry_dispose(new_entry);
				goto slow_vm_map_fork_copy;
			}

			/*
			 *	Handle copy-on-write obligations
			 */

			if (src_needs_copy && !old_entry->needs_copy) {
				vm_prot_t prot;

				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));

				prot = old_entry->protection & ~VM_PROT_WRITE;

				if (override_nx(old_map, VME_ALIAS(old_entry))
				    && prot) {
					prot |= VM_PROT_EXECUTE;
				}

				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));

				/* write-protect the source so the first write faults */
				vm_object_pmap_protect(
					VME_OBJECT(old_entry),
					VME_OFFSET(old_entry),
					(old_entry->vme_end -
					old_entry->vme_start),
					((old_entry->is_shared
					|| old_map->mapped_in_other_pmaps)
					? PMAP_NULL :
					old_map->pmap),
					VM_MAP_PAGE_SIZE(old_map),
					old_entry->vme_start,
					prot);

				assert(old_entry->wired_count == 0);
				old_entry->needs_copy = TRUE;
			}
			new_entry->needs_copy = new_entry_needs_copy;

			/*
			 *	Insert the entry at the end
			 *	of the map.
			 */

			vm_map_store_entry_link(new_map,
			    vm_map_last_entry(new_map),
			    new_entry,
			    VM_MAP_KERNEL_FLAGS_NONE);
			new_size += entry_size;
			break;

slow_vm_map_fork_copy:
			vm_map_copyin_flags = 0;
			if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
				vm_map_copyin_flags |=
				    VM_MAP_COPYIN_PRESERVE_PURGEABLE;
			}
			/* vm_map_fork_copy() advances old_entry itself */
			if (vm_map_fork_copy(old_map,
			    &old_entry,
			    new_map,
			    vm_map_copyin_flags)) {
				new_size += entry_size;
			}
			continue;
		}
		old_entry = old_entry->vme_next;
	}

#if PMAP_FORK_NEST
	/* undo pre-nesting beyond the last entry of the child map */
	new_entry = vm_map_last_entry(new_map);
	if (new_entry == vm_map_to_entry(new_map)) {
		/* unnest all that was pre-nested */
		vm_map_fork_unnest(new_pmap,
		    pre_nested_start, pre_nested_end,
		    vm_map_min(new_map), vm_map_max(new_map));
	} else if (new_entry->vme_end < vm_map_max(new_map)) {
		/* unnest hole at the end, if pre-nested */
		vm_map_fork_unnest(new_pmap,
		    pre_nested_start, pre_nested_end,
		    new_entry->vme_end, vm_map_max(new_map));
	}
#endif /* PMAP_FORK_NEST */

#if defined(__arm64__)
	pmap_insert_commpage(new_map->pmap);
#endif /* __arm64__ */

	new_map->size = new_size;

	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
		vm_map_corpse_footprint_collect_done(new_map);
	}

	/* Propagate JIT entitlement for the pmap layer. */
	if (pmap_get_jit_entitled(old_map->pmap)) {
		/* Tell the pmap that it supports JIT. */
		pmap_set_jit_entitled(new_map->pmap);
	}

	/* Propagate TPRO settings for the pmap layer */
	if (pmap_get_tpro(old_map->pmap)) {
		/* Tell the pmap that it supports TPRO */
		pmap_set_tpro(new_map->pmap);
	}

	vm_map_unlock(new_map);
	vm_map_unlock(old_map);
	/* drop the reference taken by vm_map_reference() above */
	vm_map_deallocate(old_map);

	return new_map;
}
13429 
13430 /*
13431  * vm_map_exec:
13432  *
13433  *      Setup the "new_map" with the proper execution environment according
13434  *	to the type of executable (platform, 64bit, chroot environment).
13435  *	Map the comm page and shared region, etc...
13436  */
13437 kern_return_t
vm_map_exec(vm_map_t new_map,task_t task,boolean_t is64bit,void * fsroot,cpu_type_t cpu,cpu_subtype_t cpu_subtype,boolean_t reslide,boolean_t is_driverkit,uint32_t rsr_version)13438 vm_map_exec(
13439 	vm_map_t        new_map,
13440 	task_t          task,
13441 	boolean_t       is64bit,
13442 	void            *fsroot,
13443 	cpu_type_t      cpu,
13444 	cpu_subtype_t   cpu_subtype,
13445 	boolean_t       reslide,
13446 	boolean_t       is_driverkit,
13447 	uint32_t        rsr_version)
13448 {
13449 	SHARED_REGION_TRACE_DEBUG(
13450 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13451 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13452 		(void *)VM_KERNEL_ADDRPERM(new_map),
13453 		(void *)VM_KERNEL_ADDRPERM(task),
13454 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13455 		cpu,
13456 		cpu_subtype));
13457 	(void) vm_commpage_enter(new_map, task, is64bit);
13458 
13459 	(void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13460 
13461 	SHARED_REGION_TRACE_DEBUG(
13462 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13463 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13464 		(void *)VM_KERNEL_ADDRPERM(new_map),
13465 		(void *)VM_KERNEL_ADDRPERM(task),
13466 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13467 		cpu,
13468 		cpu_subtype));
13469 
13470 	/*
13471 	 * Some devices have region(s) of memory that shouldn't get allocated by
13472 	 * user processes. The following code creates dummy vm_map_entry_t's for each
13473 	 * of the regions that needs to be reserved to prevent any allocations in
13474 	 * those regions.
13475 	 */
13476 	kern_return_t kr = KERN_FAILURE;
13477 	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
13478 	vmk_flags.vmkf_beyond_max = true;
13479 
13480 	const struct vm_reserved_region *regions = NULL;
13481 	size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
13482 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13483 
13484 	for (size_t i = 0; i < num_regions; ++i) {
13485 		vm_map_offset_t address = regions[i].vmrr_addr;
13486 
13487 		kr = vm_map_enter(
13488 			new_map,
13489 			&address,
13490 			regions[i].vmrr_size,
13491 			(vm_map_offset_t)0,
13492 			vmk_flags,
13493 			VM_OBJECT_NULL,
13494 			(vm_object_offset_t)0,
13495 			FALSE,
13496 			VM_PROT_NONE,
13497 			VM_PROT_NONE,
13498 			VM_INHERIT_COPY);
13499 
13500 		if (kr != KERN_SUCCESS) {
13501 			panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
13502 		}
13503 	}
13504 
13505 	new_map->reserved_regions = (num_regions ? TRUE : FALSE);
13506 
13507 	return KERN_SUCCESS;
13508 }
13509 
/*
 * Diagnostic counters for the copy-on-write paths taken by
 * vm_map_lookup_and_lock_object() when a write (or forced-copy)
 * fault hits a submap mapping:
 *   "copy_slowly"        - wired submap entries copied with
 *                          vm_object_copy_slowly()
 *   "copy_strategically" - objects with a non-symmetric copy
 *                          strategy copied with
 *                          vm_object_copy_strategically()
 *   "copy_shadow"        - symmetric-copy objects handled by
 *                          setting up a shadow object
 * For each path: number of copies, total and largest size copied,
 * and (where applicable) how often the path had to restart or
 * returned an error.
 */
uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13523 /*
13524  *	vm_map_lookup_and_lock_object:
13525  *
13526  *	Finds the VM object, offset, and
13527  *	protection for a given virtual address in the
13528  *	specified map, assuming a page fault of the
13529  *	type specified.
13530  *
13531  *	Returns the (object, offset, protection) for
13532  *	this address, whether it is wired down, and whether
13533  *	this map has the only reference to the data in question.
13534  *	In order to later verify this lookup, a "version"
13535  *	is returned.
13536  *	If contended != NULL, *contended will be set to
13537  *	true iff the thread had to spin or block to acquire
13538  *	an exclusive lock.
13539  *
13540  *	The map MUST be locked by the caller and WILL be
13541  *	locked on exit.  In order to guarantee the
13542  *	existence of the returned object, it is returned
13543  *	locked.
13544  *
13545  *	If a lookup is requested with "write protection"
13546  *	specified, the map may be changed to perform virtual
13547  *	copying operations, although the data referenced will
13548  *	remain the same.
13549  */
13550 kern_return_t
vm_map_lookup_and_lock_object(vm_map_t * var_map,vm_map_offset_t vaddr,vm_prot_t fault_type,int object_lock_type,vm_map_version_t * out_version,vm_object_t * object,vm_object_offset_t * offset,vm_prot_t * out_prot,boolean_t * wired,vm_object_fault_info_t fault_info,vm_map_t * real_map,bool * contended)13551 vm_map_lookup_and_lock_object(
13552 	vm_map_t                *var_map,       /* IN/OUT */
13553 	vm_map_offset_t         vaddr,
13554 	vm_prot_t               fault_type,
13555 	int                     object_lock_type,
13556 	vm_map_version_t        *out_version,   /* OUT */
13557 	vm_object_t             *object,        /* OUT */
13558 	vm_object_offset_t      *offset,        /* OUT */
13559 	vm_prot_t               *out_prot,      /* OUT */
13560 	boolean_t               *wired,         /* OUT */
13561 	vm_object_fault_info_t  fault_info,     /* OUT */
13562 	vm_map_t                *real_map,      /* OUT */
13563 	bool                    *contended)     /* OUT */
13564 {
13565 	vm_map_entry_t                  entry;
13566 	vm_map_t                        map = *var_map;
13567 	vm_map_t                        old_map = *var_map;
13568 	vm_map_t                        cow_sub_map_parent = VM_MAP_NULL;
13569 	vm_map_offset_t                 cow_parent_vaddr = 0;
13570 	vm_map_offset_t                 old_start = 0;
13571 	vm_map_offset_t                 old_end = 0;
13572 	vm_prot_t                       prot;
13573 	boolean_t                       mask_protections;
13574 	boolean_t                       force_copy;
13575 	boolean_t                       no_force_copy_if_executable;
13576 	boolean_t                       submap_needed_copy;
13577 	vm_prot_t                       original_fault_type;
13578 	vm_map_size_t                   fault_page_mask;
13579 
13580 	/*
13581 	 * VM_PROT_MASK means that the caller wants us to use "fault_type"
13582 	 * as a mask against the mapping's actual protections, not as an
13583 	 * absolute value.
13584 	 */
13585 	mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
13586 	force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
13587 	no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
13588 	fault_type &= VM_PROT_ALL;
13589 	original_fault_type = fault_type;
13590 	if (contended) {
13591 		*contended = false;
13592 	}
13593 
13594 	*real_map = map;
13595 
13596 	fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
13597 	vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
13598 
13599 RetryLookup:
13600 	fault_type = original_fault_type;
13601 
13602 	/*
13603 	 *	If the map has an interesting hint, try it before calling
13604 	 *	full blown lookup routine.
13605 	 */
13606 	entry = map->hint;
13607 
13608 	if ((entry == vm_map_to_entry(map)) ||
13609 	    (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
13610 		vm_map_entry_t  tmp_entry;
13611 
13612 		/*
13613 		 *	Entry was either not a valid hint, or the vaddr
13614 		 *	was not contained in the entry, so do a full lookup.
13615 		 */
13616 		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
13617 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13618 				vm_map_unlock(cow_sub_map_parent);
13619 			}
13620 			if ((*real_map != map)
13621 			    && (*real_map != cow_sub_map_parent)) {
13622 				vm_map_unlock(*real_map);
13623 			}
13624 			return KERN_INVALID_ADDRESS;
13625 		}
13626 
13627 		entry = tmp_entry;
13628 	}
13629 	if (map == old_map) {
13630 		old_start = entry->vme_start;
13631 		old_end = entry->vme_end;
13632 	}
13633 
13634 	/*
13635 	 *	Handle submaps.  Drop lock on upper map, submap is
13636 	 *	returned locked.
13637 	 */
13638 
13639 	submap_needed_copy = FALSE;
13640 submap_recurse:
13641 	if (entry->is_sub_map) {
13642 		vm_map_offset_t         local_vaddr;
13643 		vm_map_offset_t         end_delta;
13644 		vm_map_offset_t         start_delta;
13645 		vm_map_offset_t         top_entry_saved_start;
13646 		vm_object_offset_t      top_entry_saved_offset;
13647 		vm_map_entry_t          submap_entry, saved_submap_entry;
13648 		vm_object_offset_t      submap_entry_offset;
13649 		vm_object_size_t        submap_entry_size;
13650 		vm_prot_t               subentry_protection;
13651 		vm_prot_t               subentry_max_protection;
13652 		boolean_t               subentry_no_copy_on_read;
13653 		boolean_t               subentry_permanent;
13654 		boolean_t               subentry_csm_associated;
13655 		boolean_t               mapped_needs_copy = FALSE;
13656 		vm_map_version_t        version;
13657 
13658 		assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
13659 		    "map %p (%d) entry %p submap %p (%d)\n",
13660 		    map, VM_MAP_PAGE_SHIFT(map), entry,
13661 		    VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
13662 
13663 		local_vaddr = vaddr;
13664 		top_entry_saved_start = entry->vme_start;
13665 		top_entry_saved_offset = VME_OFFSET(entry);
13666 
13667 		if ((entry->use_pmap &&
13668 		    !((fault_type & VM_PROT_WRITE) ||
13669 		    force_copy))) {
13670 			/* if real_map equals map we unlock below */
13671 			if ((*real_map != map) &&
13672 			    (*real_map != cow_sub_map_parent)) {
13673 				vm_map_unlock(*real_map);
13674 			}
13675 			*real_map = VME_SUBMAP(entry);
13676 		}
13677 
13678 		if (entry->needs_copy &&
13679 		    ((fault_type & VM_PROT_WRITE) ||
13680 		    force_copy)) {
13681 			if (!mapped_needs_copy) {
13682 				if (vm_map_lock_read_to_write(map)) {
13683 					vm_map_lock_read(map);
13684 					*real_map = map;
13685 					goto RetryLookup;
13686 				}
13687 				vm_map_lock_read(VME_SUBMAP(entry));
13688 				*var_map = VME_SUBMAP(entry);
13689 				cow_sub_map_parent = map;
13690 				/* reset base to map before cow object */
13691 				/* this is the map which will accept   */
13692 				/* the new cow object */
13693 				old_start = entry->vme_start;
13694 				old_end = entry->vme_end;
13695 				cow_parent_vaddr = vaddr;
13696 				mapped_needs_copy = TRUE;
13697 			} else {
13698 				vm_map_lock_read(VME_SUBMAP(entry));
13699 				*var_map = VME_SUBMAP(entry);
13700 				if ((cow_sub_map_parent != map) &&
13701 				    (*real_map != map)) {
13702 					vm_map_unlock(map);
13703 				}
13704 			}
13705 		} else {
13706 			if (entry->needs_copy) {
13707 				submap_needed_copy = TRUE;
13708 			}
13709 			vm_map_lock_read(VME_SUBMAP(entry));
13710 			*var_map = VME_SUBMAP(entry);
13711 			/* leave map locked if it is a target */
13712 			/* cow sub_map above otherwise, just  */
13713 			/* follow the maps down to the object */
13714 			/* here we unlock knowing we are not  */
13715 			/* revisiting the map.  */
13716 			if ((*real_map != map) && (map != cow_sub_map_parent)) {
13717 				vm_map_unlock_read(map);
13718 			}
13719 		}
13720 
13721 		entry = NULL;
13722 		map = *var_map;
13723 
13724 		/* calculate the offset in the submap for vaddr */
13725 		local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
13726 		assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
13727 		    "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
13728 		    (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);
13729 
13730 RetrySubMap:
13731 		if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
13732 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13733 				vm_map_unlock(cow_sub_map_parent);
13734 			}
13735 			if ((*real_map != map)
13736 			    && (*real_map != cow_sub_map_parent)) {
13737 				vm_map_unlock(*real_map);
13738 			}
13739 			*real_map = map;
13740 			return KERN_INVALID_ADDRESS;
13741 		}
13742 
13743 		/* find the attenuated shadow of the underlying object */
13744 		/* on our target map */
13745 
13746 		/* in english the submap object may extend beyond the     */
13747 		/* region mapped by the entry or, may only fill a portion */
13748 		/* of it.  For our purposes, we only care if the object   */
13749 		/* doesn't fill.  In this case the area which will        */
13750 		/* ultimately be clipped in the top map will only need    */
13751 		/* to be as big as the portion of the underlying entry    */
13752 		/* which is mapped */
13753 		start_delta = submap_entry->vme_start > top_entry_saved_offset ?
13754 		    submap_entry->vme_start - top_entry_saved_offset : 0;
13755 
13756 		end_delta =
13757 		    (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
13758 		    submap_entry->vme_end ?
13759 		    0 : (top_entry_saved_offset +
13760 		    (old_end - old_start))
13761 		    - submap_entry->vme_end;
13762 
13763 		old_start += start_delta;
13764 		old_end -= end_delta;
13765 
13766 		if (submap_entry->is_sub_map) {
13767 			entry = submap_entry;
13768 			vaddr = local_vaddr;
13769 			goto submap_recurse;
13770 		}
13771 
13772 		if (((fault_type & VM_PROT_WRITE) ||
13773 		    force_copy)
13774 		    && cow_sub_map_parent) {
13775 			vm_object_t     sub_object, copy_object;
13776 			vm_object_offset_t copy_offset;
13777 			vm_map_offset_t local_start;
13778 			vm_map_offset_t local_end;
13779 			boolean_t       object_copied = FALSE;
13780 			vm_object_offset_t object_copied_offset = 0;
13781 			boolean_t       object_copied_needs_copy = FALSE;
13782 			kern_return_t   kr = KERN_SUCCESS;
13783 
13784 			if (vm_map_lock_read_to_write(map)) {
13785 				vm_map_lock_read(map);
13786 				old_start -= start_delta;
13787 				old_end += end_delta;
13788 				goto RetrySubMap;
13789 			}
13790 
13791 
13792 			sub_object = VME_OBJECT(submap_entry);
13793 			if (sub_object == VM_OBJECT_NULL) {
13794 				sub_object =
13795 				    vm_object_allocate(
13796 					(vm_map_size_t)
13797 					(submap_entry->vme_end -
13798 					submap_entry->vme_start));
13799 				VME_OBJECT_SET(submap_entry, sub_object, false, 0);
13800 				VME_OFFSET_SET(submap_entry, 0);
13801 				assert(!submap_entry->is_sub_map);
13802 				assert(submap_entry->use_pmap);
13803 			}
13804 			local_start =  local_vaddr -
13805 			    (cow_parent_vaddr - old_start);
13806 			local_end = local_vaddr +
13807 			    (old_end - cow_parent_vaddr);
13808 			vm_map_clip_start(map, submap_entry, local_start);
13809 			vm_map_clip_end(map, submap_entry, local_end);
13810 			if (submap_entry->is_sub_map) {
13811 				/* unnesting was done when clipping */
13812 				assert(!submap_entry->use_pmap);
13813 			}
13814 
13815 			/* This is the COW case, lets connect */
13816 			/* an entry in our space to the underlying */
13817 			/* object in the submap, bypassing the  */
13818 			/* submap. */
13819 			submap_entry_offset = VME_OFFSET(submap_entry);
13820 			submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
13821 
13822 			if ((submap_entry->wired_count != 0 ||
13823 			    sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
13824 			    (submap_entry->protection & VM_PROT_EXECUTE) &&
13825 			    no_force_copy_if_executable) {
13826 //				printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
13827 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13828 					vm_map_unlock(cow_sub_map_parent);
13829 				}
13830 				if ((*real_map != map)
13831 				    && (*real_map != cow_sub_map_parent)) {
13832 					vm_map_unlock(*real_map);
13833 				}
13834 				*real_map = map;
13835 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
13836 				vm_map_lock_write_to_read(map);
13837 				kr = KERN_PROTECTION_FAILURE;
13838 				DTRACE_VM4(submap_no_copy_executable,
13839 				    vm_map_t, map,
13840 				    vm_object_offset_t, submap_entry_offset,
13841 				    vm_object_size_t, submap_entry_size,
13842 				    int, kr);
13843 				return kr;
13844 			}
13845 
13846 			if (submap_entry->wired_count != 0) {
13847 				vm_object_reference(sub_object);
13848 
13849 				assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
13850 				    "submap_entry %p offset 0x%llx\n",
13851 				    submap_entry, VME_OFFSET(submap_entry));
13852 
13853 				DTRACE_VM6(submap_copy_slowly,
13854 				    vm_map_t, cow_sub_map_parent,
13855 				    vm_map_offset_t, vaddr,
13856 				    vm_map_t, map,
13857 				    vm_object_size_t, submap_entry_size,
13858 				    int, submap_entry->wired_count,
13859 				    int, sub_object->copy_strategy);
13860 
13861 				saved_submap_entry = submap_entry;
13862 				version.main_timestamp = map->timestamp;
13863 				vm_map_unlock(map); /* Increments timestamp by 1 */
13864 				submap_entry = VM_MAP_ENTRY_NULL;
13865 
13866 				vm_object_lock(sub_object);
13867 				kr = vm_object_copy_slowly(sub_object,
13868 				    submap_entry_offset,
13869 				    submap_entry_size,
13870 				    FALSE,
13871 				    &copy_object);
13872 				object_copied = TRUE;
13873 				object_copied_offset = 0;
13874 				/* 4k: account for extra offset in physical page */
13875 				object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
13876 				object_copied_needs_copy = FALSE;
13877 				vm_object_deallocate(sub_object);
13878 
13879 				vm_map_lock(map);
13880 
13881 				if (kr != KERN_SUCCESS &&
13882 				    kr != KERN_MEMORY_RESTART_COPY) {
13883 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13884 						vm_map_unlock(cow_sub_map_parent);
13885 					}
13886 					if ((*real_map != map)
13887 					    && (*real_map != cow_sub_map_parent)) {
13888 						vm_map_unlock(*real_map);
13889 					}
13890 					*real_map = map;
13891 					vm_object_deallocate(copy_object);
13892 					copy_object = VM_OBJECT_NULL;
13893 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
13894 					vm_map_lock_write_to_read(map);
13895 					DTRACE_VM4(submap_copy_error_slowly,
13896 					    vm_object_t, sub_object,
13897 					    vm_object_offset_t, submap_entry_offset,
13898 					    vm_object_size_t, submap_entry_size,
13899 					    int, kr);
13900 					vm_map_lookup_and_lock_object_copy_slowly_error++;
13901 					return kr;
13902 				}
13903 
13904 				if ((kr == KERN_SUCCESS) &&
13905 				    (version.main_timestamp + 1) == map->timestamp) {
13906 					submap_entry = saved_submap_entry;
13907 				} else {
13908 					saved_submap_entry = NULL;
13909 					old_start -= start_delta;
13910 					old_end += end_delta;
13911 					vm_object_deallocate(copy_object);
13912 					copy_object = VM_OBJECT_NULL;
13913 					vm_map_lock_write_to_read(map);
13914 					vm_map_lookup_and_lock_object_copy_slowly_restart++;
13915 					goto RetrySubMap;
13916 				}
13917 				vm_map_lookup_and_lock_object_copy_slowly_count++;
13918 				vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
13919 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
13920 					vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
13921 				}
13922 			} else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
13923 				submap_entry_offset = VME_OFFSET(submap_entry);
13924 				copy_object = VM_OBJECT_NULL;
13925 				object_copied_offset = submap_entry_offset;
13926 				object_copied_needs_copy = FALSE;
13927 				DTRACE_VM6(submap_copy_strategically,
13928 				    vm_map_t, cow_sub_map_parent,
13929 				    vm_map_offset_t, vaddr,
13930 				    vm_map_t, map,
13931 				    vm_object_size_t, submap_entry_size,
13932 				    int, submap_entry->wired_count,
13933 				    int, sub_object->copy_strategy);
13934 				kr = vm_object_copy_strategically(
13935 					sub_object,
13936 					submap_entry_offset,
13937 					submap_entry->vme_end - submap_entry->vme_start,
13938 					&copy_object,
13939 					&object_copied_offset,
13940 					&object_copied_needs_copy);
13941 				if (kr == KERN_MEMORY_RESTART_COPY) {
13942 					old_start -= start_delta;
13943 					old_end += end_delta;
13944 					vm_object_deallocate(copy_object);
13945 					copy_object = VM_OBJECT_NULL;
13946 					vm_map_lock_write_to_read(map);
13947 					vm_map_lookup_and_lock_object_copy_strategically_restart++;
13948 					goto RetrySubMap;
13949 				}
13950 				if (kr != KERN_SUCCESS) {
13951 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13952 						vm_map_unlock(cow_sub_map_parent);
13953 					}
13954 					if ((*real_map != map)
13955 					    && (*real_map != cow_sub_map_parent)) {
13956 						vm_map_unlock(*real_map);
13957 					}
13958 					*real_map = map;
13959 					vm_object_deallocate(copy_object);
13960 					copy_object = VM_OBJECT_NULL;
13961 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
13962 					vm_map_lock_write_to_read(map);
13963 					DTRACE_VM4(submap_copy_error_strategically,
13964 					    vm_object_t, sub_object,
13965 					    vm_object_offset_t, submap_entry_offset,
13966 					    vm_object_size_t, submap_entry_size,
13967 					    int, kr);
13968 					vm_map_lookup_and_lock_object_copy_strategically_error++;
13969 					return kr;
13970 				}
13971 				assert(copy_object != VM_OBJECT_NULL);
13972 				assert(copy_object != sub_object);
13973 				object_copied = TRUE;
13974 				vm_map_lookup_and_lock_object_copy_strategically_count++;
13975 				vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
13976 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
13977 					vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
13978 				}
13979 			} else {
13980 				/* set up shadow object */
13981 				object_copied = FALSE;
13982 				copy_object = sub_object;
13983 				vm_object_lock(sub_object);
13984 				vm_object_reference_locked(sub_object);
13985 				sub_object->shadowed = TRUE;
13986 				vm_object_unlock(sub_object);
13987 
13988 				assert(submap_entry->wired_count == 0);
13989 				submap_entry->needs_copy = TRUE;
13990 
13991 				prot = submap_entry->protection;
13992 				assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
13993 				prot = prot & ~VM_PROT_WRITE;
13994 				assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
13995 
13996 				if (override_nx(old_map,
13997 				    VME_ALIAS(submap_entry))
13998 				    && prot) {
13999 					prot |= VM_PROT_EXECUTE;
14000 				}
14001 
14002 				vm_object_pmap_protect(
14003 					sub_object,
14004 					VME_OFFSET(submap_entry),
14005 					submap_entry->vme_end -
14006 					submap_entry->vme_start,
14007 					(submap_entry->is_shared
14008 					|| map->mapped_in_other_pmaps) ?
14009 					PMAP_NULL : map->pmap,
14010 					VM_MAP_PAGE_SIZE(map),
14011 					submap_entry->vme_start,
14012 					prot);
14013 				vm_map_lookup_and_lock_object_copy_shadow_count++;
14014 				vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
14015 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
14016 					vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
14017 				}
14018 			}
14019 
14020 			/*
14021 			 * Adjust the fault offset to the submap entry.
14022 			 */
14023 			copy_offset = (local_vaddr -
14024 			    submap_entry->vme_start +
14025 			    VME_OFFSET(submap_entry));
14026 
			/* This works differently than the */
14028 			/* normal submap case. We go back  */
14029 			/* to the parent of the cow map and*/
14030 			/* clip out the target portion of  */
14031 			/* the sub_map, substituting the   */
14032 			/* new copy object,                */
14033 
14034 			subentry_protection = submap_entry->protection;
14035 			subentry_max_protection = submap_entry->max_protection;
14036 			subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
14037 			subentry_permanent = submap_entry->vme_permanent;
14038 			subentry_csm_associated = submap_entry->csm_associated;
14039 
14040 			vm_map_unlock(map);
14041 			submap_entry = NULL; /* not valid after map unlock */
14042 
14043 			local_start = old_start;
14044 			local_end = old_end;
14045 			map = cow_sub_map_parent;
14046 			*var_map = cow_sub_map_parent;
14047 			vaddr = cow_parent_vaddr;
14048 			cow_sub_map_parent = NULL;
14049 
14050 			if (!vm_map_lookup_entry(map,
14051 			    vaddr, &entry)) {
14052 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14053 					vm_map_unlock(cow_sub_map_parent);
14054 				}
14055 				if ((*real_map != map)
14056 				    && (*real_map != cow_sub_map_parent)) {
14057 					vm_map_unlock(*real_map);
14058 				}
14059 				*real_map = map;
14060 				vm_object_deallocate(
14061 					copy_object);
14062 				copy_object = VM_OBJECT_NULL;
14063 				vm_map_lock_write_to_read(map);
14064 				DTRACE_VM4(submap_lookup_post_unlock,
14065 				    uint64_t, (uint64_t)entry->vme_start,
14066 				    uint64_t, (uint64_t)entry->vme_end,
14067 				    vm_map_offset_t, vaddr,
14068 				    int, object_copied);
14069 				return KERN_INVALID_ADDRESS;
14070 			}
14071 
14072 			/* clip out the portion of space */
14073 			/* mapped by the sub map which   */
14074 			/* corresponds to the underlying */
14075 			/* object */
14076 
14077 			/*
14078 			 * Clip (and unnest) the smallest nested chunk
14079 			 * possible around the faulting address...
14080 			 */
14081 			local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14082 			local_end = local_start + pmap_shared_region_size_min(map->pmap);
14083 			/*
14084 			 * ... but don't go beyond the "old_start" to "old_end"
14085 			 * range, to avoid spanning over another VM region
14086 			 * with a possibly different VM object and/or offset.
14087 			 */
14088 			if (local_start < old_start) {
14089 				local_start = old_start;
14090 			}
14091 			if (local_end > old_end) {
14092 				local_end = old_end;
14093 			}
14094 			/*
14095 			 * Adjust copy_offset to the start of the range.
14096 			 */
14097 			copy_offset -= (vaddr - local_start);
14098 
14099 			vm_map_clip_start(map, entry, local_start);
14100 			vm_map_clip_end(map, entry, local_end);
14101 			if (entry->is_sub_map) {
14102 				/* unnesting was done when clipping */
14103 				assert(!entry->use_pmap);
14104 			}
14105 
14106 			/* substitute copy object for */
14107 			/* shared map entry           */
14108 			vm_map_deallocate(VME_SUBMAP(entry));
14109 			assert(!entry->iokit_acct);
14110 			entry->use_pmap = TRUE;
14111 			VME_OBJECT_SET(entry, copy_object, false, 0);
14112 
14113 			/* propagate the submap entry's protections */
14114 			if (entry->protection != VM_PROT_READ) {
14115 				/*
14116 				 * Someone has already altered the top entry's
14117 				 * protections via vm_protect(VM_PROT_COPY).
14118 				 * Respect these new values and ignore the
14119 				 * submap entry's protections.
14120 				 */
14121 			} else {
14122 				/*
14123 				 * Regular copy-on-write: propagate the submap
14124 				 * entry's protections to the top map entry.
14125 				 */
14126 				entry->protection |= subentry_protection;
14127 			}
14128 			entry->max_protection |= subentry_max_protection;
14129 			/* propagate some attributes from subentry */
14130 			entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14131 			entry->vme_permanent = subentry_permanent;
14132 			entry->csm_associated = subentry_csm_associated;
14133 
14134 			if ((entry->protection & VM_PROT_WRITE) &&
14135 			    (entry->protection & VM_PROT_EXECUTE) &&
14136 #if XNU_TARGET_OS_OSX
14137 			    map->pmap != kernel_pmap &&
14138 			    (vm_map_cs_enforcement(map)
14139 #if __arm64__
14140 			    || !VM_MAP_IS_EXOTIC(map)
14141 #endif /* __arm64__ */
14142 			    ) &&
14143 #endif /* XNU_TARGET_OS_OSX */
14144 #if CODE_SIGNING_MONITOR
14145 			    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
14146 #endif
14147 			    !(entry->used_for_jit) &&
14148 			    VM_MAP_POLICY_WX_STRIP_X(map)) {
14149 				DTRACE_VM3(cs_wx,
14150 				    uint64_t, (uint64_t)entry->vme_start,
14151 				    uint64_t, (uint64_t)entry->vme_end,
14152 				    vm_prot_t, entry->protection);
14153 				printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
14154 				    proc_selfpid(),
14155 				    (get_bsdtask_info(current_task())
14156 				    ? proc_name_address(get_bsdtask_info(current_task()))
14157 				    : "?"),
14158 				    __FUNCTION__, __LINE__,
14159 #if DEVELOPMENT || DEBUG
14160 				    (uint64_t)entry->vme_start,
14161 				    (uint64_t)entry->vme_end,
14162 #else /* DEVELOPMENT || DEBUG */
14163 				    (uint64_t)0,
14164 				    (uint64_t)0,
14165 #endif /* DEVELOPMENT || DEBUG */
14166 				    entry->protection);
14167 				entry->protection &= ~VM_PROT_EXECUTE;
14168 			}
14169 
14170 			if (object_copied) {
14171 				VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14172 				entry->needs_copy = object_copied_needs_copy;
14173 				entry->is_shared = FALSE;
14174 			} else {
14175 				assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14176 				assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14177 				assert(entry->wired_count == 0);
14178 				VME_OFFSET_SET(entry, copy_offset);
14179 				entry->needs_copy = TRUE;
14180 				if (map != old_map) {
14181 					entry->is_shared = TRUE;
14182 				}
14183 			}
14184 			if (entry->inheritance == VM_INHERIT_SHARE) {
14185 				entry->inheritance = VM_INHERIT_COPY;
14186 			}
14187 
14188 			vm_map_lock_write_to_read(map);
14189 		} else {
14190 			if ((cow_sub_map_parent)
14191 			    && (cow_sub_map_parent != *real_map)
14192 			    && (cow_sub_map_parent != map)) {
14193 				vm_map_unlock(cow_sub_map_parent);
14194 			}
14195 			entry = submap_entry;
14196 			vaddr = local_vaddr;
14197 		}
14198 	}
14199 
14200 	/*
14201 	 *	Check whether this task is allowed to have
14202 	 *	this page.
14203 	 */
14204 
14205 	prot = entry->protection;
14206 
14207 	if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14208 		/*
14209 		 * HACK -- if not a stack, then allow execution
14210 		 */
14211 		prot |= VM_PROT_EXECUTE;
14212 	}
14213 
14214 	if (mask_protections) {
14215 		fault_type &= prot;
14216 		if (fault_type == VM_PROT_NONE) {
14217 			goto protection_failure;
14218 		}
14219 	}
14220 	if (((fault_type & prot) != fault_type)
14221 #if __arm64__
14222 	    /* prefetch abort in execute-only page */
14223 	    && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14224 #elif defined(__x86_64__)
14225 	    /* Consider the UEXEC bit when handling an EXECUTE fault */
14226 	    && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14227 #endif
14228 	    ) {
14229 protection_failure:
14230 		if (*real_map != map) {
14231 			vm_map_unlock(*real_map);
14232 		}
14233 		*real_map = map;
14234 
14235 		if ((fault_type & VM_PROT_EXECUTE) && prot) {
14236 			log_stack_execution_failure((addr64_t)vaddr, prot);
14237 		}
14238 
14239 		DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14240 		DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14241 		/*
14242 		 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14243 		 *
14244 		 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14245 		 */
14246 		return KERN_PROTECTION_FAILURE;
14247 	}
14248 
14249 	/*
14250 	 *	If this page is not pageable, we have to get
14251 	 *	it for all possible accesses.
14252 	 */
14253 
14254 	*wired = (entry->wired_count != 0);
14255 	if (*wired) {
14256 		fault_type = prot;
14257 	}
14258 
14259 	/*
14260 	 *	If the entry was copy-on-write, we either ...
14261 	 */
14262 
14263 	if (entry->needs_copy) {
14264 		/*
14265 		 *	If we want to write the page, we may as well
14266 		 *	handle that now since we've got the map locked.
14267 		 *
14268 		 *	If we don't need to write the page, we just
14269 		 *	demote the permissions allowed.
14270 		 */
14271 
14272 		if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14273 			/*
14274 			 *	Make a new object, and place it in the
14275 			 *	object chain.  Note that no new references
14276 			 *	have appeared -- one just moved from the
14277 			 *	map to the new object.
14278 			 */
14279 
14280 			if (vm_map_lock_read_to_write(map)) {
14281 				vm_map_lock_read(map);
14282 				goto RetryLookup;
14283 			}
14284 
14285 			if (VME_OBJECT(entry)->shadowed == FALSE) {
14286 				vm_object_lock(VME_OBJECT(entry));
14287 				VME_OBJECT(entry)->shadowed = TRUE;
14288 				vm_object_unlock(VME_OBJECT(entry));
14289 			}
14290 			VME_OBJECT_SHADOW(entry,
14291 			    (vm_map_size_t) (entry->vme_end -
14292 			    entry->vme_start),
14293 			    vm_map_always_shadow(map));
14294 			entry->needs_copy = FALSE;
14295 
14296 			vm_map_lock_write_to_read(map);
14297 		}
14298 		if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14299 			/*
14300 			 *	We're attempting to read a copy-on-write
14301 			 *	page -- don't allow writes.
14302 			 */
14303 
14304 			prot &= (~VM_PROT_WRITE);
14305 		}
14306 	}
14307 
14308 	if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14309 		/*
14310 		 * We went through a "needs_copy" submap without triggering
14311 		 * a copy, so granting write access to the page would bypass
14312 		 * that submap's "needs_copy".
14313 		 */
14314 		assert(!(fault_type & VM_PROT_WRITE));
14315 		assert(!*wired);
14316 		assert(!force_copy);
14317 		// printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14318 		prot &= ~VM_PROT_WRITE;
14319 	}
14320 
14321 	/*
14322 	 *	Create an object if necessary.
14323 	 */
14324 	if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14325 		if (vm_map_lock_read_to_write(map)) {
14326 			vm_map_lock_read(map);
14327 			goto RetryLookup;
14328 		}
14329 
14330 		VME_OBJECT_SET(entry,
14331 		    vm_object_allocate(
14332 			    (vm_map_size_t)(entry->vme_end -
14333 			    entry->vme_start)), false, 0);
14334 		VME_OFFSET_SET(entry, 0);
14335 		assert(entry->use_pmap);
14336 		vm_map_lock_write_to_read(map);
14337 	}
14338 
14339 	/*
14340 	 *	Return the object/offset from this entry.  If the entry
14341 	 *	was copy-on-write or empty, it has been fixed up.  Also
14342 	 *	return the protection.
14343 	 */
14344 
14345 	*offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14346 	*object = VME_OBJECT(entry);
14347 	*out_prot = prot;
14348 	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14349 
14350 	if (fault_info) {
14351 		fault_info->interruptible = THREAD_UNINT; /* for now... */
14352 		/* ... the caller will change "interruptible" if needed */
14353 		fault_info->cluster_size = 0;
14354 		fault_info->user_tag = VME_ALIAS(entry);
14355 		fault_info->pmap_options = 0;
14356 		if (entry->iokit_acct ||
14357 		    (!entry->is_sub_map && !entry->use_pmap)) {
14358 			fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14359 		}
14360 		fault_info->behavior = entry->behavior;
14361 		fault_info->lo_offset = VME_OFFSET(entry);
14362 		fault_info->hi_offset =
14363 		    (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14364 		fault_info->no_cache  = entry->no_cache;
14365 		fault_info->stealth = FALSE;
14366 		fault_info->io_sync = FALSE;
14367 		if (entry->used_for_jit ||
14368 #if CODE_SIGNING_MONITOR
14369 		    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
14370 #endif
14371 		    entry->vme_resilient_codesign) {
14372 			fault_info->cs_bypass = TRUE;
14373 		} else {
14374 			fault_info->cs_bypass = FALSE;
14375 		}
14376 		fault_info->csm_associated = FALSE;
14377 #if CODE_SIGNING_MONITOR
14378 		if (entry->csm_associated) {
14379 			/*
14380 			 * The pmap layer will validate this page
14381 			 * before allowing it to be executed from.
14382 			 */
14383 			fault_info->csm_associated = TRUE;
14384 		}
14385 #endif
14386 		fault_info->mark_zf_absent = FALSE;
14387 		fault_info->batch_pmap_op = FALSE;
14388 		fault_info->resilient_media = entry->vme_resilient_media;
14389 		fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
14390 		fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14391 		if (entry->translated_allow_execute) {
14392 			fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14393 		}
14394 	}
14395 
14396 	/*
14397 	 *	Lock the object to prevent it from disappearing
14398 	 */
14399 	if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14400 		if (contended == NULL) {
14401 			vm_object_lock(*object);
14402 		} else {
14403 			*contended = vm_object_lock_check_contended(*object);
14404 		}
14405 	} else {
14406 		vm_object_lock_shared(*object);
14407 	}
14408 
14409 	/*
14410 	 *	Save the version number
14411 	 */
14412 
14413 	out_version->main_timestamp = map->timestamp;
14414 
14415 	return KERN_SUCCESS;
14416 }
14417 
14418 
14419 /*
14420  *	vm_map_verify:
14421  *
14422  *	Verifies that the map in question has not changed
14423  *	since the given version. The map has to be locked
14424  *	("shared" mode is fine) before calling this function
14425  *	and it will be returned locked too.
14426  */
14427 boolean_t
vm_map_verify(vm_map_t map,vm_map_version_t * version)14428 vm_map_verify(
14429 	vm_map_t                map,
14430 	vm_map_version_t        *version)       /* REF */
14431 {
14432 	boolean_t       result;
14433 
14434 	vm_map_lock_assert_held(map);
14435 	result = (map->timestamp == version->main_timestamp);
14436 
14437 	return result;
14438 }
14439 
14440 /*
14441  *	TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
14442  *	Goes away after regular vm_region_recurse function migrates to
14443  *	64 bits
14444  *	vm_region_recurse: A form of vm_region which follows the
14445  *	submaps in a target map
14446  *
14447  */
14448 
kern_return_t
vm_map_region_recurse_64(
	vm_map_t                 map,
	vm_map_offset_t *address,               /* IN/OUT */
	vm_map_size_t           *size,                  /* OUT */
	natural_t               *nesting_depth, /* IN/OUT */
	vm_region_submap_info_64_t      submap_info,    /* IN/OUT */
	mach_msg_type_number_t  *count) /* IN/OUT */
{
	mach_msg_type_number_t  original_count;
	vm_region_extended_info_data_t  extended;
	vm_map_entry_t                  tmp_entry;
	vm_map_offset_t                 user_address;
	unsigned int                    user_max_depth;

	/*
	 * "curr_entry" is the VM map entry preceding or including the
	 * address we're looking for.
	 * "curr_map" is the map or sub-map containing "curr_entry".
	 * "curr_address" is the equivalent of the top map's "user_address"
	 * in the current map.
	 * "curr_offset" is the cumulated offset of "curr_map" in the
	 * target task's address space.
	 * "curr_depth" is the depth of "curr_map" in the chain of
	 * sub-maps.
	 *
	 * "curr_max_below" and "curr_max_above" limit the range (around
	 * "curr_address") we should take into account in the current (sub)map.
	 * They limit the range to what's visible through the map entries
	 * we've traversed from the top map to the current map.
	 *
	 */
	vm_map_entry_t                  curr_entry;
	vm_map_address_t                curr_address;
	vm_map_offset_t                 curr_offset;
	vm_map_t                        curr_map;
	unsigned int                    curr_depth;
	vm_map_offset_t                 curr_max_below, curr_max_above;
	vm_map_offset_t                 curr_skip;

	/*
	 * "next_" is the same as "curr_" but for the VM region immediately
	 * after the address we're looking for.  We need to keep track of this
	 * too because we want to return info about that region if the
	 * address we're looking for is not mapped.
	 */
	vm_map_entry_t                  next_entry;
	vm_map_offset_t                 next_offset;
	vm_map_offset_t                 next_address;
	vm_map_t                        next_map;
	unsigned int                    next_depth;
	vm_map_offset_t                 next_max_below, next_max_above;
	vm_map_offset_t                 next_skip;

	boolean_t                       look_for_pages;
	vm_region_submap_short_info_64_t short_info;
	boolean_t                       do_region_footprint;
	int                             effective_page_size, effective_page_shift;
	boolean_t                       submap_needed_copy;

	if (map == VM_MAP_NULL) {
		/* no address space to work on */
		return KERN_INVALID_ARGUMENT;
	}

	effective_page_shift = vm_self_region_page_shift(map);
	effective_page_size = (1 << effective_page_shift);

	if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
		/*
		 * "info" structure is not big enough and
		 * would overflow
		 */
		return KERN_INVALID_ARGUMENT;
	}

	do_region_footprint = task_self_region_footprint();
	original_count = *count;

	/*
	 * Decide which flavor of info structure the caller provided:
	 * anything smaller than the V0 layout gets the "short" info only
	 * (no page-level statistics); otherwise fill the largest V0/V1/V2
	 * layout that fits in the caller's buffer.
	 */
	if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
		*count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
		look_for_pages = FALSE;
		short_info = (vm_region_submap_short_info_64_t) submap_info;
		submap_info = NULL;
	} else {
		look_for_pages = TRUE;
		*count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
		short_info = NULL;

		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
			*count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
		}
		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
			*count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
		}
	}

	user_address = *address;
	user_max_depth = *nesting_depth;
	submap_needed_copy = FALSE;

	/* locking is skipped entirely when called from the kernel debugger */
	if (not_in_kdp) {
		vm_map_lock_read(map);
	}

	/*
	 * Restart point: we come back here (with an advanced "user_address")
	 * after popping back up from a submap in which nothing was mapped
	 * at the address we descended with.
	 */
recurse_again:
	curr_entry = NULL;
	curr_map = map;
	curr_address = user_address;
	curr_offset = 0;
	curr_skip = 0;
	curr_depth = 0;
	curr_max_above = ((vm_map_offset_t) -1) - curr_address;
	curr_max_below = curr_address;

	next_entry = NULL;
	next_map = NULL;
	next_address = 0;
	next_offset = 0;
	next_skip = 0;
	next_depth = 0;
	next_max_above = (vm_map_offset_t) -1;
	next_max_below = (vm_map_offset_t) -1;

	/*
	 * Walk down the chain of submaps toward the target address,
	 * one nesting level per iteration, until we hit a leaf entry
	 * or reach the caller's maximum depth.
	 */
	for (;;) {
		if (vm_map_lookup_entry(curr_map,
		    curr_address,
		    &tmp_entry)) {
			/* tmp_entry contains the address we're looking for */
			curr_entry = tmp_entry;
		} else {
			vm_map_offset_t skip;
			/*
			 * The address is not mapped.  "tmp_entry" is the
			 * map entry preceding the address.  We want the next
			 * one, if it exists.
			 */
			curr_entry = tmp_entry->vme_next;

			if (curr_entry == vm_map_to_entry(curr_map) ||
			    (curr_entry->vme_start >=
			    curr_address + curr_max_above)) {
				/* no next entry at this level: stop looking */
				if (not_in_kdp) {
					vm_map_unlock_read(curr_map);
				}
				curr_entry = NULL;
				curr_map = NULL;
				curr_skip = 0;
				curr_offset = 0;
				curr_depth = 0;
				curr_max_above = 0;
				curr_max_below = 0;
				break;
			}

			/* adjust current address and offset */
			skip = curr_entry->vme_start - curr_address;
			curr_address = curr_entry->vme_start;
			curr_skip += skip;
			curr_offset += skip;
			curr_max_above -= skip;
			curr_max_below = 0;
		}

		/*
		 * Is the next entry at this level closer to the address (or
		 * deeper in the submap chain) than the one we had
		 * so far ?
		 */
		tmp_entry = curr_entry->vme_next;
		if (tmp_entry == vm_map_to_entry(curr_map)) {
			/* no next entry at this level */
		} else if (tmp_entry->vme_start >=
		    curr_address + curr_max_above) {
			/*
			 * tmp_entry is beyond the scope of what we mapped of
			 * this submap in the upper level: ignore it.
			 */
		} else if ((next_entry == NULL) ||
		    (tmp_entry->vme_start + curr_offset <=
		    next_entry->vme_start + next_offset)) {
			/*
			 * We didn't have a "next_entry" or this one is
			 * closer to the address we're looking for:
			 * use this "tmp_entry" as the new "next_entry".
			 */
			if (next_entry != NULL) {
				/* unlock the last "next_map" */
				if (next_map != curr_map && not_in_kdp) {
					vm_map_unlock_read(next_map);
				}
			}
			next_entry = tmp_entry;
			next_map = curr_map;
			next_depth = curr_depth;
			next_address = next_entry->vme_start;
			next_skip = curr_skip;
			next_skip += (next_address - curr_address);
			next_offset = curr_offset;
			next_offset += (next_address - curr_address);
			next_max_above = MIN(next_max_above, curr_max_above);
			next_max_above = MIN(next_max_above,
			    next_entry->vme_end - next_address);
			next_max_below = MIN(next_max_below, curr_max_below);
			next_max_below = MIN(next_max_below,
			    next_address - next_entry->vme_start);
		}

		/*
		 * "curr_max_{above,below}" allow us to keep track of the
		 * portion of the submap that is actually mapped at this level:
		 * the rest of that submap is irrelevant to us, since it's not
		 * mapped here.
		 * The relevant portion of the map starts at
		 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
		 */
		curr_max_above = MIN(curr_max_above,
		    curr_entry->vme_end - curr_address);
		curr_max_below = MIN(curr_max_below,
		    curr_address - curr_entry->vme_start);

		if (!curr_entry->is_sub_map ||
		    curr_depth >= user_max_depth) {
			/*
			 * We hit a leaf map or we reached the maximum depth
			 * we could, so stop looking.  Keep the current map
			 * locked.
			 */
			break;
		}

		/*
		 * Get down to the next submap level.
		 */

		if (curr_entry->needs_copy) {
			/* everything below this is effectively copy-on-write */
			submap_needed_copy = TRUE;
		}

		/*
		 * Lock the next level and unlock the current level,
		 * unless we need to keep it locked to access the "next_entry"
		 * later.
		 */
		if (not_in_kdp) {
			vm_map_lock_read(VME_SUBMAP(curr_entry));
		}
		if (curr_map == next_map) {
			/* keep "next_map" locked in case we need it */
		} else {
			/* release this map */
			if (not_in_kdp) {
				vm_map_unlock_read(curr_map);
			}
		}

		/*
		 * Adjust the offset.  "curr_entry" maps the submap
		 * at relative address "curr_entry->vme_start" in the
		 * curr_map but skips the first "VME_OFFSET(curr_entry)"
		 * bytes of the submap.
		 * "curr_offset" always represents the offset of a virtual
		 * address in the curr_map relative to the absolute address
		 * space (i.e. the top-level VM map).
		 */
		curr_offset +=
		    (VME_OFFSET(curr_entry) - curr_entry->vme_start);
		curr_address = user_address + curr_offset;
		/* switch to the submap */
		curr_map = VME_SUBMAP(curr_entry);
		curr_depth++;
		curr_entry = NULL;
	}

// LP64todo: all the current tools are 32bit, obviously never worked for 64b
// so probably should be a real 32b ID vs. ptr.
// Current users just check for equality

	if (curr_entry == NULL) {
		/* no VM region contains the address... */

		if (do_region_footprint && /* we want footprint numbers */
		    next_entry == NULL && /* & there are no more regions */
		    /* & we haven't already provided our fake region: */
		    user_address <= vm_map_last_entry(map)->vme_end) {
			ledger_amount_t ledger_resident, ledger_compressed;

			/*
			 * Add a fake memory region to account for
			 * purgeable and/or ledger-tagged memory that
			 * counts towards this task's memory footprint,
			 * i.e. the resident/compressed pages of non-volatile
			 * objects owned by that task.
			 */
			task_ledgers_footprint(map->pmap->ledger,
			    &ledger_resident,
			    &ledger_compressed);
			if (ledger_resident + ledger_compressed == 0) {
				/* no purgeable memory usage to report */
				return KERN_INVALID_ADDRESS;
			}
			/* fake region to show nonvolatile footprint */
			if (look_for_pages) {
				submap_info->protection = VM_PROT_DEFAULT;
				submap_info->max_protection = VM_PROT_DEFAULT;
				submap_info->inheritance = VM_INHERIT_DEFAULT;
				submap_info->offset = 0;
				submap_info->user_tag = -1;
				submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
				submap_info->pages_shared_now_private = 0;
				submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
				submap_info->pages_dirtied = submap_info->pages_resident;
				submap_info->ref_count = 1;
				submap_info->shadow_depth = 0;
				submap_info->external_pager = 0;
				submap_info->share_mode = SM_PRIVATE;
				if (submap_needed_copy) {
					submap_info->share_mode = SM_COW;
				}
				submap_info->is_submap = 0;
				submap_info->behavior = VM_BEHAVIOR_DEFAULT;
				submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				submap_info->user_wired_count = 0;
				submap_info->pages_reusable = 0;
			} else {
				short_info->user_tag = -1;
				short_info->offset = 0;
				short_info->protection = VM_PROT_DEFAULT;
				short_info->inheritance = VM_INHERIT_DEFAULT;
				short_info->max_protection = VM_PROT_DEFAULT;
				short_info->behavior = VM_BEHAVIOR_DEFAULT;
				short_info->user_wired_count = 0;
				short_info->is_submap = 0;
				short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				short_info->external_pager = 0;
				short_info->shadow_depth = 0;
				short_info->share_mode = SM_PRIVATE;
				if (submap_needed_copy) {
					short_info->share_mode = SM_COW;
				}
				short_info->ref_count = 1;
			}
			*nesting_depth = 0;
			*size = (vm_map_size_t) (ledger_resident + ledger_compressed);
//			*address = user_address;
			*address = vm_map_last_entry(map)->vme_end;
			return KERN_SUCCESS;
		}

		if (next_entry == NULL) {
			/* ... and no VM region follows it either */
			return KERN_INVALID_ADDRESS;
		}
		/* ... gather info about the next VM region */
		curr_entry = next_entry;
		curr_map = next_map;    /* still locked ... */
		curr_address = next_address;
		curr_skip = next_skip;
		curr_offset = next_offset;
		curr_depth = next_depth;
		curr_max_above = next_max_above;
		curr_max_below = next_max_below;
	} else {
		/* we won't need "next_entry" after all */
		if (next_entry != NULL) {
			/* release "next_map" */
			if (next_map != curr_map && not_in_kdp) {
				vm_map_unlock_read(next_map);
			}
		}
	}
	/* reset the "next_" state; it has been consumed or released above */
	next_entry = NULL;
	next_map = NULL;
	next_offset = 0;
	next_skip = 0;
	next_depth = 0;
	next_max_below = -1;
	next_max_above = -1;

	if (curr_entry->is_sub_map &&
	    curr_depth < user_max_depth) {
		/*
		 * We're not as deep as we could be:  we must have
		 * gone back up after not finding anything mapped
		 * below the original top-level map entry's.
		 * Let's move "curr_address" forward and recurse again.
		 */
		user_address = curr_address;
		goto recurse_again;
	}

	*nesting_depth = curr_depth;
	*size = curr_max_above + curr_max_below;
	*address = user_address + curr_skip - curr_max_below;

	/* copy the entry-level attributes into whichever info layout applies */
	if (look_for_pages) {
		submap_info->user_tag = VME_ALIAS(curr_entry);
		submap_info->offset = VME_OFFSET(curr_entry);
		submap_info->protection = curr_entry->protection;
		submap_info->inheritance = curr_entry->inheritance;
		submap_info->max_protection = curr_entry->max_protection;
		submap_info->behavior = curr_entry->behavior;
		submap_info->user_wired_count = curr_entry->user_wired_count;
		submap_info->is_submap = curr_entry->is_sub_map;
		if (curr_entry->is_sub_map) {
			submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
		} else {
			submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
		}
	} else {
		short_info->user_tag = VME_ALIAS(curr_entry);
		short_info->offset = VME_OFFSET(curr_entry);
		short_info->protection = curr_entry->protection;
		short_info->inheritance = curr_entry->inheritance;
		short_info->max_protection = curr_entry->max_protection;
		short_info->behavior = curr_entry->behavior;
		short_info->user_wired_count = curr_entry->user_wired_count;
		short_info->is_submap = curr_entry->is_sub_map;
		if (curr_entry->is_sub_map) {
			short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
		} else {
			short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
		}
	}

	/* no page-level statistics gathered yet */
	extended.pages_resident = 0;
	extended.pages_swapped_out = 0;
	extended.pages_shared_now_private = 0;
	extended.pages_dirtied = 0;
	extended.pages_reusable = 0;
	extended.external_pager = 0;
	extended.shadow_depth = 0;
	extended.share_mode = SM_EMPTY;
	extended.ref_count = 0;

	/*
	 * Gather page-level statistics by walking the object chain
	 * (leaf entries only); skipped in the kernel-debugger context.
	 */
	if (not_in_kdp) {
		if (!curr_entry->is_sub_map) {
			vm_map_offset_t range_start, range_end;
			/* clip the walk to the visible portion of the entry */
			range_start = MAX((curr_address - curr_max_below),
			    curr_entry->vme_start);
			range_end = MIN((curr_address + curr_max_above),
			    curr_entry->vme_end);
			vm_map_region_walk(curr_map,
			    range_start,
			    curr_entry,
			    (VME_OFFSET(curr_entry) +
			    (range_start -
			    curr_entry->vme_start)),
			    range_end - range_start,
			    &extended,
			    look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
			if (extended.external_pager &&
			    extended.ref_count == 2 &&
			    extended.share_mode == SM_SHARED) {
				extended.share_mode = SM_PRIVATE;
			}
			/* a needs_copy submap above us makes this effectively COW */
			if (submap_needed_copy) {
				extended.share_mode = SM_COW;
			}
		} else {
			if (curr_entry->use_pmap) {
				extended.share_mode = SM_TRUESHARED;
			} else {
				extended.share_mode = SM_PRIVATE;
			}
			extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
		}
	}

	if (look_for_pages) {
		submap_info->pages_resident = extended.pages_resident;
		submap_info->pages_swapped_out = extended.pages_swapped_out;
		submap_info->pages_shared_now_private =
		    extended.pages_shared_now_private;
		submap_info->pages_dirtied = extended.pages_dirtied;
		submap_info->external_pager = extended.external_pager;
		submap_info->shadow_depth = extended.shadow_depth;
		submap_info->share_mode = extended.share_mode;
		submap_info->ref_count = extended.ref_count;

		/* V1/V2 fields only exist if the caller's buffer is big enough */
		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
			submap_info->pages_reusable = extended.pages_reusable;
		}
		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
			if (curr_entry->is_sub_map) {
				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRPERM(VME_SUBMAP(curr_entry));
			} else if (VME_OBJECT(curr_entry)) {
				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRPERM(VME_OBJECT(curr_entry));
			} else {
				submap_info->object_id_full = 0ull;
			}
		}
	} else {
		short_info->external_pager = extended.external_pager;
		short_info->shadow_depth = extended.shadow_depth;
		short_info->share_mode = extended.share_mode;
		short_info->ref_count = extended.ref_count;
	}

	if (not_in_kdp) {
		vm_map_unlock_read(curr_map);
	}

	return KERN_SUCCESS;
}
14956 
14957 /*
14958  *	vm_region:
14959  *
14960  *	User call to obtain information about a region in
14961  *	a task's address map. Currently, only one flavor is
14962  *	supported.
14963  *
14964  *	XXX The reserved and behavior fields cannot be filled
14965  *	    in until the vm merge from the IK is completed, and
14966  *	    vm_reserve is implemented.
14967  */
14968 
14969 kern_return_t
vm_map_region(vm_map_t map,vm_map_offset_t * address,vm_map_size_t * size,vm_region_flavor_t flavor,vm_region_info_t info,mach_msg_type_number_t * count,mach_port_t * object_name)14970 vm_map_region(
14971 	vm_map_t                 map,
14972 	vm_map_offset_t *address,               /* IN/OUT */
14973 	vm_map_size_t           *size,                  /* OUT */
14974 	vm_region_flavor_t       flavor,                /* IN */
14975 	vm_region_info_t         info,                  /* OUT */
14976 	mach_msg_type_number_t  *count, /* IN/OUT */
14977 	mach_port_t             *object_name)           /* OUT */
14978 {
14979 	vm_map_entry_t          tmp_entry;
14980 	vm_map_entry_t          entry;
14981 	vm_map_offset_t         start;
14982 
14983 	if (map == VM_MAP_NULL) {
14984 		return KERN_INVALID_ARGUMENT;
14985 	}
14986 
14987 	switch (flavor) {
14988 	case VM_REGION_BASIC_INFO:
14989 		/* legacy for old 32-bit objects info */
14990 	{
14991 		vm_region_basic_info_t  basic;
14992 
14993 		if (*count < VM_REGION_BASIC_INFO_COUNT) {
14994 			return KERN_INVALID_ARGUMENT;
14995 		}
14996 
14997 		basic = (vm_region_basic_info_t) info;
14998 		*count = VM_REGION_BASIC_INFO_COUNT;
14999 
15000 		vm_map_lock_read(map);
15001 
15002 		start = *address;
15003 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15004 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15005 				vm_map_unlock_read(map);
15006 				return KERN_INVALID_ADDRESS;
15007 			}
15008 		} else {
15009 			entry = tmp_entry;
15010 		}
15011 
15012 		start = entry->vme_start;
15013 
15014 		basic->offset = (uint32_t)VME_OFFSET(entry);
15015 		basic->protection = entry->protection;
15016 		basic->inheritance = entry->inheritance;
15017 		basic->max_protection = entry->max_protection;
15018 		basic->behavior = entry->behavior;
15019 		basic->user_wired_count = entry->user_wired_count;
15020 		basic->reserved = entry->is_sub_map;
15021 		*address = start;
15022 		*size = (entry->vme_end - start);
15023 
15024 		if (object_name) {
15025 			*object_name = IP_NULL;
15026 		}
15027 		if (entry->is_sub_map) {
15028 			basic->shared = FALSE;
15029 		} else {
15030 			basic->shared = entry->is_shared;
15031 		}
15032 
15033 		vm_map_unlock_read(map);
15034 		return KERN_SUCCESS;
15035 	}
15036 
15037 	case VM_REGION_BASIC_INFO_64:
15038 	{
15039 		vm_region_basic_info_64_t       basic;
15040 
15041 		if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
15042 			return KERN_INVALID_ARGUMENT;
15043 		}
15044 
15045 		basic = (vm_region_basic_info_64_t) info;
15046 		*count = VM_REGION_BASIC_INFO_COUNT_64;
15047 
15048 		vm_map_lock_read(map);
15049 
15050 		start = *address;
15051 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15052 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15053 				vm_map_unlock_read(map);
15054 				return KERN_INVALID_ADDRESS;
15055 			}
15056 		} else {
15057 			entry = tmp_entry;
15058 		}
15059 
15060 		start = entry->vme_start;
15061 
15062 		basic->offset = VME_OFFSET(entry);
15063 		basic->protection = entry->protection;
15064 		basic->inheritance = entry->inheritance;
15065 		basic->max_protection = entry->max_protection;
15066 		basic->behavior = entry->behavior;
15067 		basic->user_wired_count = entry->user_wired_count;
15068 		basic->reserved = entry->is_sub_map;
15069 		*address = start;
15070 		*size = (entry->vme_end - start);
15071 
15072 		if (object_name) {
15073 			*object_name = IP_NULL;
15074 		}
15075 		if (entry->is_sub_map) {
15076 			basic->shared = FALSE;
15077 		} else {
15078 			basic->shared = entry->is_shared;
15079 		}
15080 
15081 		vm_map_unlock_read(map);
15082 		return KERN_SUCCESS;
15083 	}
15084 	case VM_REGION_EXTENDED_INFO:
15085 		if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
15086 			return KERN_INVALID_ARGUMENT;
15087 		}
15088 		OS_FALLTHROUGH;
15089 	case VM_REGION_EXTENDED_INFO__legacy:
15090 		if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
15091 			return KERN_INVALID_ARGUMENT;
15092 		}
15093 
15094 		{
15095 			vm_region_extended_info_t       extended;
15096 			mach_msg_type_number_t original_count;
15097 			int effective_page_size, effective_page_shift;
15098 
15099 			extended = (vm_region_extended_info_t) info;
15100 
15101 			effective_page_shift = vm_self_region_page_shift(map);
15102 			effective_page_size = (1 << effective_page_shift);
15103 
15104 			vm_map_lock_read(map);
15105 
15106 			start = *address;
15107 			if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15108 				if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15109 					vm_map_unlock_read(map);
15110 					return KERN_INVALID_ADDRESS;
15111 				}
15112 			} else {
15113 				entry = tmp_entry;
15114 			}
15115 			start = entry->vme_start;
15116 
15117 			extended->protection = entry->protection;
15118 			extended->user_tag = VME_ALIAS(entry);
15119 			extended->pages_resident = 0;
15120 			extended->pages_swapped_out = 0;
15121 			extended->pages_shared_now_private = 0;
15122 			extended->pages_dirtied = 0;
15123 			extended->external_pager = 0;
15124 			extended->shadow_depth = 0;
15125 
15126 			original_count = *count;
15127 			if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
15128 				*count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
15129 			} else {
15130 				extended->pages_reusable = 0;
15131 				*count = VM_REGION_EXTENDED_INFO_COUNT;
15132 			}
15133 
15134 			vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
15135 
15136 			if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
15137 				extended->share_mode = SM_PRIVATE;
15138 			}
15139 
15140 			if (object_name) {
15141 				*object_name = IP_NULL;
15142 			}
15143 			*address = start;
15144 			*size = (entry->vme_end - start);
15145 
15146 			vm_map_unlock_read(map);
15147 			return KERN_SUCCESS;
15148 		}
15149 	case VM_REGION_TOP_INFO:
15150 	{
15151 		vm_region_top_info_t    top;
15152 
15153 		if (*count < VM_REGION_TOP_INFO_COUNT) {
15154 			return KERN_INVALID_ARGUMENT;
15155 		}
15156 
15157 		top = (vm_region_top_info_t) info;
15158 		*count = VM_REGION_TOP_INFO_COUNT;
15159 
15160 		vm_map_lock_read(map);
15161 
15162 		start = *address;
15163 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15164 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15165 				vm_map_unlock_read(map);
15166 				return KERN_INVALID_ADDRESS;
15167 			}
15168 		} else {
15169 			entry = tmp_entry;
15170 		}
15171 		start = entry->vme_start;
15172 
15173 		top->private_pages_resident = 0;
15174 		top->shared_pages_resident = 0;
15175 
15176 		vm_map_region_top_walk(entry, top);
15177 
15178 		if (object_name) {
15179 			*object_name = IP_NULL;
15180 		}
15181 		*address = start;
15182 		*size = (entry->vme_end - start);
15183 
15184 		vm_map_unlock_read(map);
15185 		return KERN_SUCCESS;
15186 	}
15187 	default:
15188 		return KERN_INVALID_ARGUMENT;
15189 	}
15190 }
15191 
/*
 * Number of pages of "obj" that a mapping of "entry_size" pages can
 * account as resident: only wired pages count when the object is marked
 * all-reusable, otherwise resident minus reusable pages.  The result is
 * clamped to the size of the mapping.
 */
#define OBJ_RESIDENT_COUNT(obj, entry_size)                             \
	MIN((entry_size),                                               \
	    ((obj)->all_reusable ?                                      \
	     (obj)->wired_page_count :                                  \
	     (obj)->resident_page_count - (obj)->reusable_page_count))
15197 
void
vm_map_region_top_walk(
	vm_map_entry_t             entry,
	vm_region_top_info_t       top)
{
	/*
	 * Fill in "top" (VM_REGION_TOP_INFO) for the given map entry:
	 * classify its sharing mode and attribute its resident pages to
	 * the private or shared counter, following the backing object's
	 * shadow chain when one exists.  The caller is expected to keep
	 * "entry" stable (map lock held) for the duration of the walk.
	 */
	if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
		/* submap or no backing VM object: report an empty region */
		top->share_mode = SM_EMPTY;
		top->ref_count = 0;
		top->obj_id = 0;
		return;
	}

	{
		struct  vm_object *obj, *tmp_obj;
		int             ref_count;
		uint32_t        entry_size;     /* size of the mapping, in pages */

		entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);

		obj = VME_OBJECT(entry);

		vm_object_lock(obj);

		/* discount the reference held by in-flight paging activity */
		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
			ref_count--;
		}

		assert(obj->reusable_page_count <= obj->resident_page_count);
		if (obj->shadow) {
			/* a shadow chain means this mapping is copy-on-write */
			if (ref_count == 1) {
				top->private_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			}
			top->ref_count  = ref_count;
			top->share_mode = SM_COW;

			/* accumulate resident pages down the shadow chain */
			while ((tmp_obj = obj->shadow)) {
				/* lock-couple: take the shadow's lock before dropping ours */
				vm_object_lock(tmp_obj);
				vm_object_unlock(obj);
				obj = tmp_obj;

				if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
					ref_count--;
				}

				assert(obj->reusable_page_count <= obj->resident_page_count);
				top->shared_pages_resident +=
				    OBJ_RESIDENT_COUNT(obj, entry_size);
				/* "- 1" drops the reference taken by the shadow link itself */
				top->ref_count += ref_count - 1;
			}
		} else {
			if (entry->superpage_size) {
				/* superpage mappings are reported wholesale as private */
				top->share_mode = SM_LARGE_PAGE;
				top->shared_pages_resident = 0;
				top->private_pages_resident = entry_size;
			} else if (entry->needs_copy) {
				top->share_mode = SM_COW;
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				/*
				 * ref_count == 2 with a named object still counts
				 * as private: the name holds the extra reference.
				 */
				if (ref_count == 1 ||
				    (ref_count == 2 && obj->named)) {
					top->share_mode = SM_PRIVATE;
					top->private_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				} else {
					top->share_mode = SM_SHARED;
					top->shared_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				}
			}
			top->ref_count = ref_count;
		}
		/* XXX K64: obj_id will be truncated */
		top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRPERM(obj);

		vm_object_unlock(obj);
	}
}
15282 
/*
 * Populate "extended" (VM_REGION_EXTENDED_INFO) for "entry", covering
 * "range" bytes starting at object offset "offset", mapped at VA "va".
 * When "look_for_pages" is TRUE, every page in the range is examined
 * individually; otherwise only object-level information (shadow depth,
 * pager type, share mode) is collected.  "count" selects between the
 * legacy info layout and the one that includes pages_reusable.
 * Called with the map locked for read by the vm_region machinery.
 */
void
vm_map_region_walk(
	vm_map_t                        map,
	vm_map_offset_t                 va,
	vm_map_entry_t                  entry,
	vm_object_offset_t              offset,
	vm_object_size_t                range,
	vm_region_extended_info_t       extended,
	boolean_t                       look_for_pages,
	mach_msg_type_number_t count)
{
	struct vm_object *obj, *tmp_obj;
	vm_map_offset_t       last_offset;
	int               i;
	int               ref_count;
	struct vm_object        *shadow_object;
	unsigned short          shadow_depth;
	boolean_t         do_region_footprint;
	int                     effective_page_size, effective_page_shift;
	vm_map_offset_t         effective_page_mask;

	/* whether the current task asked for "footprint"-style per-page disposition */
	do_region_footprint = task_self_region_footprint();

	/* submap, no backing object, or plain phys-contiguous object: nothing to walk */
	if ((entry->is_sub_map) ||
	    (VME_OBJECT(entry) == 0) ||
	    (VME_OBJECT(entry)->phys_contiguous &&
	    !entry->superpage_size)) {
		extended->share_mode = SM_EMPTY;
		extended->ref_count = 0;
		return;
	}

	if (entry->superpage_size) {
		/* superpage mapping: report it wholesale, no per-page walk */
		extended->shadow_depth = 0;
		extended->share_mode = SM_LARGE_PAGE;
		extended->ref_count = 1;
		extended->external_pager = 0;

		/* TODO4K: Superpage in 4k mode? */
		extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
		extended->shadow_depth = 0; /* (redundant: already cleared above) */
		return;
	}

	/* page size as seen by the caller's address space, not the map's */
	effective_page_shift = vm_self_region_page_shift(map);
	effective_page_size = (1 << effective_page_shift);
	effective_page_mask = effective_page_size - 1;

	offset = vm_map_trunc_page(offset, effective_page_mask);

	obj = VME_OBJECT(entry);

	vm_object_lock(obj);

	/* discount the reference held by in-flight paging activity */
	if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
		ref_count--;
	}

	if (look_for_pages) {
		/* examine each page in [offset, offset + range) */
		for (last_offset = offset + range;
		    offset < last_offset;
		    offset += effective_page_size, va += effective_page_size) {
			if (do_region_footprint) {
				int disp;

				disp = 0;
				if (map->has_corpse_footprint) {
					/*
					 * Query the page info data we saved
					 * while forking the corpse.
					 */
					vm_map_corpse_footprint_query_page_info(
						map,
						va,
						&disp);
				} else {
					/*
					 * Query the pmap.
					 */
					vm_map_footprint_query_page_info(
						map,
						entry,
						va,
						&disp);
				}
				/* translate the disposition bits into the counters */
				if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
					extended->pages_resident++;
				}
				if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
					extended->pages_reusable++;
				}
				if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
					extended->pages_dirtied++;
				}
				if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
					extended->pages_swapped_out++;
				}
				continue;
			}

			vm_map_region_look_for_page(map, va, obj,
			    vm_object_trunc_page(offset), ref_count,
			    0, extended, count);
		}

		if (do_region_footprint) {
			/*
			 * NB: jumps into the "else" arm below so that footprint
			 * mode also gathers the object-level info.
			 */
			goto collect_object_info;
		}
	} else {
collect_object_info:
		/* object-level info only: measure shadow depth and pager type */
		shadow_object = obj->shadow;
		shadow_depth = 0;

		if (!(obj->internal)) {
			extended->external_pager = 1;
		}

		if (shadow_object != VM_OBJECT_NULL) {
			vm_object_lock(shadow_object);
			/* lock-coupled walk down the chain, counting depth */
			for (;
			    shadow_object != VM_OBJECT_NULL;
			    shadow_depth++) {
				vm_object_t     next_shadow;

				if (!(shadow_object->internal)) {
					extended->external_pager = 1;
				}

				next_shadow = shadow_object->shadow;
				if (next_shadow) {
					vm_object_lock(next_shadow);
				}
				vm_object_unlock(shadow_object);
				shadow_object = next_shadow;
			}
		}
		extended->shadow_depth = shadow_depth;
	}

	/* classify the share mode from shadowing / reference count */
	if (extended->shadow_depth || entry->needs_copy) {
		extended->share_mode = SM_COW;
	} else {
		if (ref_count == 1) {
			extended->share_mode = SM_PRIVATE;
		} else {
			if (obj->true_share) {
				extended->share_mode = SM_TRUESHARED;
			} else {
				extended->share_mode = SM_SHARED;
			}
		}
	}
	/* discount the references held by the shadow chain itself */
	extended->ref_count = ref_count - extended->shadow_depth;

	/* accumulate the reference counts of each shadow level */
	for (i = 0; i < extended->shadow_depth; i++) {
		if ((tmp_obj = obj->shadow) == 0) {
			break;
		}
		vm_object_lock(tmp_obj);
		vm_object_unlock(obj);

		if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) {
			ref_count--;
		}

		extended->ref_count += ref_count;
		obj = tmp_obj;
	}
	vm_object_unlock(obj);

	if (extended->share_mode == SM_SHARED) {
		/*
		 * Refine SM_SHARED: if every reference on the object comes
		 * from entries in this very map, the sharing is only an
		 * intra-map alias, not sharing with another task.
		 */
		vm_map_entry_t       cur;
		vm_map_entry_t       last;
		int      my_refs;

		obj = VME_OBJECT(entry);
		last = vm_map_to_entry(map);
		my_refs = 0;

		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
			ref_count--;
		}
		for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
			my_refs += vm_map_region_count_obj_refs(cur, obj);
		}

		if (my_refs == ref_count) {
			extended->share_mode = SM_PRIVATE_ALIASED;
		} else if (my_refs > 1) {
			extended->share_mode = SM_SHARED_ALIASED;
		}
	}
}
15476 
15477 
15478 /* object is locked on entry and locked on return */
15479 
15480 
/*
 * Helper for vm_map_region_walk(): account for the page at "offset" in
 * "object" (or, failing that, in its shadow chain) in the extended-info
 * counters.  "object" is locked on entry and remains locked on return;
 * any shadow object locked during the descent is unlocked before
 * returning.  "max_refcnt" carries the largest (paging-adjusted)
 * reference count seen so far along the chain.
 */
static void
vm_map_region_look_for_page(
	__unused vm_map_t               map,
	__unused vm_map_offset_t        va,
	vm_object_t                     object,
	vm_object_offset_t              offset,
	int                             max_refcnt,
	unsigned short                  depth,
	vm_region_extended_info_t       extended,
	mach_msg_type_number_t count)
{
	vm_page_t       p;
	vm_object_t     shadow;
	int             ref_count;
	vm_object_t     caller_object;

	shadow = object->shadow;
	/* remember the caller's object: its lock must survive this call */
	caller_object = object;


	while (TRUE) {
		if (!(object->internal)) {
			extended->external_pager = 1;
		}

		if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
			/*
			 * Page found at this level.  A shadowed object whose
			 * chain never exceeded one reference is effectively a
			 * private copy of a formerly shared page.
			 */
			if (shadow && (max_refcnt == 1)) {
				extended->pages_shared_now_private++;
			}

			/* dirty if marked so in software or modified per the pmap */
			if (!p->vmp_fictitious &&
			    (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
				extended->pages_dirtied++;
			} else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
				/* pages_reusable only exists in the non-legacy layout */
				if (p->vmp_reusable || object->all_reusable) {
					extended->pages_reusable++;
				}
			}

			extended->pages_resident++;

			if (object != caller_object) {
				vm_object_unlock(object);
			}

			return;
		}
		if (object->internal &&
		    object->alive &&
		    !object->terminating &&
		    object->pager_ready) {
			if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset)
			    == VM_EXTERNAL_STATE_EXISTS) {
				/* the pager has that page */
				extended->pages_swapped_out++;
				if (object != caller_object) {
					vm_object_unlock(object);
				}
				return;
			}
		}

		if (shadow) {
			/* descend one level; take the shadow's lock first */
			vm_object_lock(shadow);

			/* discount the reference held by in-flight paging activity */
			if ((ref_count = shadow->ref_count) > 1 && shadow->paging_in_progress) {
				ref_count--;
			}

			/* record the deepest level reached across the whole walk */
			if (++depth > extended->shadow_depth) {
				extended->shadow_depth = depth;
			}

			if (ref_count > max_refcnt) {
				max_refcnt = ref_count;
			}

			/* never drop the caller's lock, only our own */
			if (object != caller_object) {
				vm_object_unlock(object);
			}

			/* translate the offset into the shadow's address space */
			offset = offset + object->vo_shadow_offset;
			object = shadow;
			shadow = object->shadow;
			continue;
		}
		if (object != caller_object) {
			vm_object_unlock(object);
		}
		break;
	}
}
15573 
15574 static int
vm_map_region_count_obj_refs(vm_map_entry_t entry,vm_object_t object)15575 vm_map_region_count_obj_refs(
15576 	vm_map_entry_t    entry,
15577 	vm_object_t       object)
15578 {
15579 	int ref_count;
15580 	vm_object_t chk_obj;
15581 	vm_object_t tmp_obj;
15582 
15583 	if (entry->is_sub_map || VME_OBJECT(entry) == VM_OBJECT_NULL) {
15584 		return 0;
15585 	}
15586 
15587 	ref_count = 0;
15588 	chk_obj = VME_OBJECT(entry);
15589 	vm_object_lock(chk_obj);
15590 
15591 	while (chk_obj) {
15592 		if (chk_obj == object) {
15593 			ref_count++;
15594 		}
15595 		tmp_obj = chk_obj->shadow;
15596 		if (tmp_obj) {
15597 			vm_object_lock(tmp_obj);
15598 		}
15599 		vm_object_unlock(chk_obj);
15600 
15601 		chk_obj = tmp_obj;
15602 	}
15603 
15604 	return ref_count;
15605 }
15606 
15607 
15608 /*
15609  *	Routine:	vm_map_simplify
15610  *
15611  *	Description:
15612  *		Attempt to simplify the map representation in
15613  *		the vicinity of the given starting address.
15614  *	Note:
15615  *		This routine is intended primarily to keep the
15616  *		kernel maps more compact -- they generally don't
15617  *		benefit from the "expand a map entry" technology
15618  *		at allocation time because the adjacent entry
15619  *		is often wired down.
15620  */
/*
 * Attempt to coalesce "this_entry" with its immediate predecessor.
 * The predecessor is unlinked and disposed of, and "this_entry" is
 * extended downward to cover its range, but only when the two entries
 * are exactly adjacent, map the same object at contiguous offsets,
 * and agree on every attribute that affects behavior.
 * Called with the map locked for write.
 */
void
vm_map_simplify_entry(
	vm_map_t        map,
	vm_map_entry_t  this_entry)
{
	vm_map_entry_t  prev_entry;

	prev_entry = this_entry->vme_prev;

	if ((this_entry != vm_map_to_entry(map)) &&
	    (prev_entry != vm_map_to_entry(map)) &&

	    /* address ranges are exactly adjacent */
	    (prev_entry->vme_end == this_entry->vme_start) &&

	    /* same kind of backing (object or submap), contiguous offsets */
	    (prev_entry->is_sub_map == this_entry->is_sub_map) &&
	    (prev_entry->vme_object_value == this_entry->vme_object_value) &&
	    (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
	    ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
	    prev_entry->vme_start))
	    == VME_OFFSET(this_entry)) &&

	    /* all behavioral attributes must match */
	    (prev_entry->behavior == this_entry->behavior) &&
	    (prev_entry->needs_copy == this_entry->needs_copy) &&
	    (prev_entry->protection == this_entry->protection) &&
	    (prev_entry->max_protection == this_entry->max_protection) &&
	    (prev_entry->inheritance == this_entry->inheritance) &&
	    (prev_entry->use_pmap == this_entry->use_pmap) &&
	    (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
	    (prev_entry->no_cache == this_entry->no_cache) &&
	    (prev_entry->vme_permanent == this_entry->vme_permanent) &&
	    (prev_entry->map_aligned == this_entry->map_aligned) &&
	    (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
	    (prev_entry->used_for_jit == this_entry->used_for_jit) &&
#if __arm64e__
	    (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
#endif
	    (prev_entry->csm_associated == this_entry->csm_associated) &&
	    (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
	    (prev_entry->iokit_acct == this_entry->iokit_acct) &&
	    (prev_entry->vme_resilient_codesign ==
	    this_entry->vme_resilient_codesign) &&
	    (prev_entry->vme_resilient_media ==
	    this_entry->vme_resilient_media) &&
	    (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
	    (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&

	    (prev_entry->wired_count == this_entry->wired_count) &&
	    (prev_entry->user_wired_count == this_entry->user_wired_count) &&

	    /* neither entry may be in a transient state */
	    ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
	    (prev_entry->in_transition == FALSE) &&
	    (this_entry->in_transition == FALSE) &&
	    (prev_entry->needs_wakeup == FALSE) &&
	    (this_entry->needs_wakeup == FALSE) &&
	    (prev_entry->is_shared == this_entry->is_shared) &&
	    (prev_entry->superpage_size == FALSE) &&
	    (this_entry->superpage_size == FALSE)
	    ) {
		/*
		 * Clear the permanent bit on the predecessor so it can be
		 * unlinked; the surviving entry retains the bit.
		 */
		if (prev_entry->vme_permanent) {
			assert(this_entry->vme_permanent);
			prev_entry->vme_permanent = false;
		}
		vm_map_store_entry_unlink(map, prev_entry, true);
		assert(prev_entry->vme_start < this_entry->vme_end);
		if (prev_entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
			    VM_MAP_PAGE_MASK(map)));
		}
		/* grow the surviving entry downward over the merged range */
		this_entry->vme_start = prev_entry->vme_start;
		VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));

		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, this_entry, TRUE);
		}

		/* drop the predecessor's reference on its backing store */
		if (prev_entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(prev_entry));
		} else {
			vm_object_deallocate(VME_OBJECT(prev_entry));
		}
		vm_map_entry_dispose(prev_entry);
		SAVE_HINT_MAP_WRITE(map, this_entry);
	}
}
15705 
15706 void
vm_map_simplify(vm_map_t map,vm_map_offset_t start)15707 vm_map_simplify(
15708 	vm_map_t        map,
15709 	vm_map_offset_t start)
15710 {
15711 	vm_map_entry_t  this_entry;
15712 
15713 	vm_map_lock(map);
15714 	if (vm_map_lookup_entry(map, start, &this_entry)) {
15715 		vm_map_simplify_entry(map, this_entry);
15716 		vm_map_simplify_entry(map, this_entry->vme_next);
15717 	}
15718 	vm_map_unlock(map);
15719 }
15720 
15721 static void
vm_map_simplify_range(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)15722 vm_map_simplify_range(
15723 	vm_map_t        map,
15724 	vm_map_offset_t start,
15725 	vm_map_offset_t end)
15726 {
15727 	vm_map_entry_t  entry;
15728 
15729 	/*
15730 	 * The map should be locked (for "write") by the caller.
15731 	 */
15732 
15733 	if (start >= end) {
15734 		/* invalid address range */
15735 		return;
15736 	}
15737 
15738 	start = vm_map_trunc_page(start,
15739 	    VM_MAP_PAGE_MASK(map));
15740 	end = vm_map_round_page(end,
15741 	    VM_MAP_PAGE_MASK(map));
15742 
15743 	if (!vm_map_lookup_entry(map, start, &entry)) {
15744 		/* "start" is not mapped and "entry" ends before "start" */
15745 		if (entry == vm_map_to_entry(map)) {
15746 			/* start with first entry in the map */
15747 			entry = vm_map_first_entry(map);
15748 		} else {
15749 			/* start with next entry */
15750 			entry = entry->vme_next;
15751 		}
15752 	}
15753 
15754 	while (entry != vm_map_to_entry(map) &&
15755 	    entry->vme_start <= end) {
15756 		/* try and coalesce "entry" with its previous entry */
15757 		vm_map_simplify_entry(map, entry);
15758 		entry = entry->vme_next;
15759 	}
15760 }
15761 
15762 
15763 /*
15764  *	Routine:	vm_map_machine_attribute
15765  *	Purpose:
15766  *		Provide machine-specific attributes to mappings,
15767  *		such as cachability etc. for machines that provide
15768  *		them.  NUMA architectures and machines with big/strange
15769  *		caches will use this.
15770  *	Note:
15771  *		Responsibilities for locking and checking are handled here,
15772  *		everything else in the pmap module. If any non-volatile
15773  *		information must be kept, the pmap module should handle
15774  *		it itself. [This assumes that attributes do not
15775  *		need to be inherited, which seems ok to me]
15776  */
kern_return_t
vm_map_machine_attribute(
	vm_map_t                        map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_machine_attribute_t  attribute,
	vm_machine_attribute_val_t* value)              /* IN/OUT */
{
	kern_return_t   ret;
	vm_map_size_t sync_size;
	vm_map_entry_t entry;

	if (start < vm_map_min(map) || end > vm_map_max(map)) {
		return KERN_INVALID_ADDRESS;
	}

	/* Figure how much memory we need to flush (in page increments) */
	sync_size = end - start;

	vm_map_lock(map);

	if (attribute != MATTR_CACHE) {
		/* If we don't have to find physical addresses, we */
		/* don't have to do an explicit traversal here.    */
		ret = pmap_attribute(map->pmap, start, end - start,
		    attribute, value);
		vm_map_unlock(map);
		return ret;
	}

	/* MATTR_CACHE: walk the range entry by entry, page by page */
	ret = KERN_SUCCESS;                                                                             /* Assume it all worked */

	while (sync_size) {
		if (vm_map_lookup_entry(map, start, &entry)) {
			vm_map_size_t   sub_size;
			/* clamp this pass to the end of the entry or the range */
			if ((entry->vme_end - start) > sync_size) {
				sub_size = sync_size;
				sync_size = 0;
			} else {
				sub_size = entry->vme_end - start;
				sync_size -= sub_size;
			}
			if (entry->is_sub_map) {
				vm_map_offset_t sub_start;
				vm_map_offset_t sub_end;

				/* translate into the submap's address space and recurse */
				sub_start = (start - entry->vme_start)
				    + VME_OFFSET(entry);
				sub_end = sub_start + sub_size;
				/*
				 * NOTE(review): the recursive call's return value
				 * is discarded; a failure in a submap is not
				 * reflected in "ret".
				 */
				vm_map_machine_attribute(
					VME_SUBMAP(entry),
					sub_start,
					sub_end,
					attribute, value);
			} else if (VME_OBJECT(entry)) {
				vm_page_t               m;
				vm_object_t             object;
				vm_object_t             base_object;
				vm_object_t             last_object;
				vm_object_offset_t      offset;
				vm_object_offset_t      base_offset;
				vm_map_size_t           range;
				range = sub_size;
				/* offset of "start" within the backing object */
				offset = (start - entry->vme_start)
				    + VME_OFFSET(entry);
				offset = vm_object_trunc_page(offset);
				base_offset = offset;
				object = VME_OBJECT(entry);
				base_object = object;
				last_object = NULL;

				vm_object_lock(object);

				while (range) {
					m = vm_page_lookup(
						object, offset);

					if (m && !m->vmp_fictitious) {
						/* page is resident here: sync its cache lines */
						ret =
						    pmap_attribute_cache_sync(
							VM_PAGE_GET_PHYS_PAGE(m),
							PAGE_SIZE,
							attribute, value);
					} else if (object->shadow) {
						/*
						 * Page not at this level: descend one
						 * shadow level (lock-coupled) and retry
						 * the same page.
						 */
						offset = offset + object->vo_shadow_offset;
						last_object = object;
						object = object->shadow;
						vm_object_lock(last_object->shadow);
						vm_object_unlock(last_object);
						continue;
					}
					if (range < PAGE_SIZE) {
						range = 0;
					} else {
						range -= PAGE_SIZE;
					}

					/* pop back up to the top object for the next page */
					if (base_object != object) {
						vm_object_unlock(object);
						vm_object_lock(base_object);
						object = base_object;
					}
					/* Bump to the next page */
					base_offset += PAGE_SIZE;
					offset = base_offset;
				}
				vm_object_unlock(object);
			}
			start += sub_size;
		} else {
			/* hole in the range: fail the whole operation */
			vm_map_unlock(map);
			return KERN_FAILURE;
		}
	}

	vm_map_unlock(map);

	return ret;
}
15896 
15897 /*
15898  *	vm_map_behavior_set:
15899  *
15900  *	Sets the paging reference behavior of the specified address
15901  *	range in the target map.  Paging reference behavior affects
15902  *	how pagein operations resulting from faults on the map will be
15903  *	clustered.
15904  */
kern_return_t
vm_map_behavior_set(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_behavior_t   new_behavior)
{
	vm_map_entry_t  entry;
	vm_map_entry_t  temp_entry;

	/* reject a backwards or out-of-map range up front */
	if (start > end ||
	    start < vm_map_min(map) ||
	    end > vm_map_max(map)) {
		return KERN_NO_SPACE;
	}

	switch (new_behavior) {
	/*
	 * This first block of behaviors all set a persistent state on the specified
	 * memory range.  All we have to do here is to record the desired behavior
	 * in the vm_map_entry_t's.
	 */

	case VM_BEHAVIOR_DEFAULT:
	case VM_BEHAVIOR_RANDOM:
	case VM_BEHAVIOR_SEQUENTIAL:
	case VM_BEHAVIOR_RSEQNTL:
	case VM_BEHAVIOR_ZERO_WIRED_PAGES:
		vm_map_lock(map);

		/*
		 *	The entire address range must be valid for the map.
		 *      Note that vm_map_range_check() does a
		 *	vm_map_lookup_entry() internally and returns the
		 *	entry containing the start of the address range if
		 *	the entire range is valid.
		 */
		if (vm_map_range_check(map, start, end, &temp_entry)) {
			entry = temp_entry;
			/* split off the portion before "start" */
			vm_map_clip_start(map, entry, start);
		} else {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		/* stamp the behavior onto every entry overlapping the range */
		while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
			/* split off the portion past "end" */
			vm_map_clip_end(map, entry, end);
			if (entry->is_sub_map) {
				assert(!entry->use_pmap);
			}

			/* ZERO_WIRED_PAGES is a flag of its own, not a behavior value */
			if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
				entry->zero_wired_pages = TRUE;
			} else {
				entry->behavior = new_behavior;
			}
			entry = entry->vme_next;
		}

		vm_map_unlock(map);
		break;

	/*
	 * The rest of these are different from the above in that they cause
	 * an immediate action to take place as opposed to setting a behavior that
	 * affects future actions.
	 */

	case VM_BEHAVIOR_WILLNEED:
		return vm_map_willneed(map, start, end);

	case VM_BEHAVIOR_DONTNEED:
		return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);

	case VM_BEHAVIOR_FREE:
		return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);

	case VM_BEHAVIOR_REUSABLE:
		return vm_map_reusable_pages(map, start, end);

	case VM_BEHAVIOR_REUSE:
		return vm_map_reuse_pages(map, start, end);

	case VM_BEHAVIOR_CAN_REUSE:
		return vm_map_can_reuse(map, start, end);

#if MACH_ASSERT
	case VM_BEHAVIOR_PAGEOUT:
		return vm_map_pageout(map, start, end);
#endif /* MACH_ASSERT */

	default:
		return KERN_INVALID_ARGUMENT;
	}

	return KERN_SUCCESS;
}
16002 
16003 
16004 /*
16005  * Internals for madvise(MADV_WILLNEED) system call.
16006  *
16007  * The implementation is to do:-
16008  * a) read-ahead if the mapping corresponds to a mapped regular file
16009  * b) or, fault in the pages (zero-fill, decompress etc) if it's an anonymous mapping
16010  */
16011 
16012 
/*
 *	vm_map_willneed:
 *
 *	Kernel side of madvise(MADV_WILLNEED) for the range [start, end):
 *	for each VM map entry in the range, either pre-fault the pages
 *	(anonymous / internal objects) or issue an asynchronous pager
 *	read-ahead request (file-backed objects).
 *
 *	Returns KERN_INVALID_ADDRESS if the range contains a hole (the
 *	madvise semantics require a fully allocated range), and
 *	KERN_SUCCESS otherwise — including when the read-ahead I/O could
 *	not be issued, since madvise() advice is not supposed to fail.
 */
static kern_return_t
vm_map_willneed(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end
	)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	memory_object_t                 pager;
	struct vm_object_fault_info     fault_info = {};
	kern_return_t                   kr;
	vm_object_size_t                len;
	vm_object_offset_t              offset;

	fault_info.interruptible = THREAD_UNINT;        /* ignored value */
	fault_info.behavior      = VM_BEHAVIOR_SEQUENTIAL;
	fault_info.stealth       = TRUE;

	/*
	 * The MADV_WILLNEED operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && start < end;) {
		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.  After that, the offset will always be zero to
		 * correspond to the beginning of the current vm_map_entry.
		 */
		offset = (start - entry->vme_start) + VME_OFFSET(entry);

		/*
		 * Set the length so we don't go beyond the end of the
		 * map_entry or beyond the end of the range we were given.
		 * This range could span also multiple map entries all of which
		 * map different files, so make sure we only do the right amount
		 * of I/O for each object.  Note that it's possible for there
		 * to be multiple map entries all referring to the same object
		 * but with different page permissions, but it's not worth
		 * trying to optimize that case.
		 */
		len = MIN(entry->vme_end - start, end - start);

		if ((vm_size_t) len != len) {
			/* 32-bit overflow: clamp to the largest page-aligned vm_size_t */
			len = (vm_size_t) (0 - PAGE_SIZE);
		}
		fault_info.cluster_size = (vm_size_t) len;
		fault_info.lo_offset    = offset;
		fault_info.hi_offset    = offset + len;
		fault_info.user_tag     = VME_ALIAS(entry);
		fault_info.pmap_options = 0;
		if (entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap)) {
			/* this entry uses alternate accounting */
			fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
		}
		fault_info.fi_xnu_user_debug = entry->vme_xnu_user_debug;

		/*
		 * If the entry is a submap OR there's no read permission
		 * to this mapping, then just skip it.
		 */
		if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
			entry = entry->vme_next;
			start = entry->vme_start;
			continue;
		}

		object = VME_OBJECT(entry);

		if (object == NULL ||
		    (object && object->internal)) {
			/*
			 * Memory range backed by anonymous memory.
			 */
			vm_size_t region_size = 0, effective_page_size = 0;
			vm_map_offset_t addr = 0, effective_page_mask = 0;

			region_size = len;
			addr = start;

			effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
			effective_page_size = effective_page_mask + 1;

			/*
			 * Drop the map lock across the (possibly long)
			 * pre-fault loop; the range is re-validated below
			 * before the next iteration.
			 */
			vm_map_unlock_read(map);

			while (region_size) {
				vm_pre_fault(
					vm_map_trunc_page(addr, effective_page_mask),
					VM_PROT_READ | VM_PROT_WRITE);

				region_size -= effective_page_size;
				addr += effective_page_size;
			}
		} else {
			/*
			 * Find the file object backing this map entry.  If there is
			 * none, then we simply ignore the "will need" advice for this
			 * entry and go on to the next one.
			 */
			if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
				entry = entry->vme_next;
				start = entry->vme_start;
				continue;
			}

			/*
			 * NOTE(review): find_vnode_object() appears to return
			 * the object locked (it is unlocked just below) — confirm.
			 */
			vm_object_paging_begin(object);
			pager = object->pager;
			vm_object_unlock(object);

			/*
			 * The data_request() could take a long time, so let's
			 * release the map lock to avoid blocking other threads.
			 */
			vm_map_unlock_read(map);

			/*
			 * Get the data from the object asynchronously.
			 *
			 * Note that memory_object_data_request() places limits on the
			 * amount of I/O it will do.  Regardless of the len we
			 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
			 * silently truncates the len to that size.  This isn't
			 * necessarily bad since madvise shouldn't really be used to
			 * page in unlimited amounts of data.  Other Unix variants
			 * limit the willneed case as well.  If this turns out to be an
			 * issue for developers, then we can always adjust the policy
			 * here and still be backwards compatible since this is all
			 * just "advice".
			 */
			kr = memory_object_data_request(
				pager,
				vm_object_trunc_page(offset) + object->paging_offset,
				0,      /* ignored */
				VM_PROT_READ,
				(memory_object_fault_info_t)&fault_info);

			vm_object_lock(object);
			vm_object_paging_end(object);
			vm_object_unlock(object);

			/*
			 * If we couldn't do the I/O for some reason, just give up on
			 * the madvise.  We still return success to the user since
			 * madvise isn't supposed to fail when the advice can't be
			 * taken.
			 */

			if (kr != KERN_SUCCESS) {
				return KERN_SUCCESS;
			}
		}

		/*
		 * Both branches that reach this point dropped the map read
		 * lock before doing their (potentially slow) work.
		 */
		start += len;
		if (start >= end) {
			/* done */
			return KERN_SUCCESS;
		}

		/* look up next entry, re-taking the lock dropped above */
		vm_map_lock_read(map);
		if (!vm_map_lookup_entry(map, start, &entry)) {
			/*
			 * There's a new hole in the address range.
			 */
			vm_map_unlock_read(map);
			return KERN_INVALID_ADDRESS;
		}
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
16204 
16205 static boolean_t
vm_map_entry_is_reusable(vm_map_entry_t entry)16206 vm_map_entry_is_reusable(
16207 	vm_map_entry_t entry)
16208 {
16209 	/* Only user map entries */
16210 
16211 	vm_object_t object;
16212 
16213 	if (entry->is_sub_map) {
16214 		return FALSE;
16215 	}
16216 
16217 	switch (VME_ALIAS(entry)) {
16218 	case VM_MEMORY_MALLOC:
16219 	case VM_MEMORY_MALLOC_SMALL:
16220 	case VM_MEMORY_MALLOC_LARGE:
16221 	case VM_MEMORY_REALLOC:
16222 	case VM_MEMORY_MALLOC_TINY:
16223 	case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16224 	case VM_MEMORY_MALLOC_LARGE_REUSED:
16225 		/*
16226 		 * This is a malloc() memory region: check if it's still
16227 		 * in its original state and can be re-used for more
16228 		 * malloc() allocations.
16229 		 */
16230 		break;
16231 	default:
16232 		/*
16233 		 * Not a malloc() memory region: let the caller decide if
16234 		 * it's re-usable.
16235 		 */
16236 		return TRUE;
16237 	}
16238 
16239 	if (/*entry->is_shared ||*/
16240 		entry->is_sub_map ||
16241 		entry->in_transition ||
16242 		entry->protection != VM_PROT_DEFAULT ||
16243 		entry->max_protection != VM_PROT_ALL ||
16244 		entry->inheritance != VM_INHERIT_DEFAULT ||
16245 		entry->no_cache ||
16246 		entry->vme_permanent ||
16247 		entry->superpage_size != FALSE ||
16248 		entry->zero_wired_pages ||
16249 		entry->wired_count != 0 ||
16250 		entry->user_wired_count != 0) {
16251 		return FALSE;
16252 	}
16253 
16254 	object = VME_OBJECT(entry);
16255 	if (object == VM_OBJECT_NULL) {
16256 		return TRUE;
16257 	}
16258 	if (
16259 #if 0
16260 		/*
16261 		 * Let's proceed even if the VM object is potentially
16262 		 * shared.
16263 		 * We check for this later when processing the actual
16264 		 * VM pages, so the contents will be safe if shared.
16265 		 *
16266 		 * But we can still mark this memory region as "reusable" to
16267 		 * acknowledge that the caller did let us know that the memory
16268 		 * could be re-used and should not be penalized for holding
16269 		 * on to it.  This allows its "resident size" to not include
16270 		 * the reusable range.
16271 		 */
16272 		object->ref_count == 1 &&
16273 #endif
16274 		object->wired_page_count == 0 &&
16275 		object->copy == VM_OBJECT_NULL &&
16276 		object->shadow == VM_OBJECT_NULL &&
16277 		object->internal &&
16278 		object->purgable == VM_PURGABLE_DENY &&
16279 		object->wimg_bits == VM_WIMG_USE_DEFAULT &&
16280 		!object->code_signed) {
16281 		return TRUE;
16282 	}
16283 	return FALSE;
16284 }
16285 
/*
 *	vm_map_reuse_pages:
 *
 *	Implements the MADV_REUSE advice for [start, end): tell the VM
 *	that previously "reusable" pages are being re-used, via
 *	vm_object_reuse_pages() on each entry's object, and re-tag
 *	VM_MEMORY_MALLOC_LARGE_REUSABLE entries as
 *	VM_MEMORY_MALLOC_LARGE_REUSED.
 *
 *	Returns KERN_INVALID_ADDRESS if the range has a hole or contains
 *	an entry that fails vm_map_entry_is_reusable().
 */
static kern_return_t
vm_map_reuse_pages(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	vm_object_offset_t              start_offset, end_offset;

	/*
	 * The MADV_REUSE operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		/*
		 * XXX TODO4K
		 * need to figure out what reusable means for a
		 * portion of a native page.
		 */
		return KERN_SUCCESS;
	}

	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap);       /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		vm_page_stats_reusable.reuse_pages_failure++;
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		/*
		 * Sanity check on the VM map entry.
		 */
		if (!vm_map_entry_is_reusable(entry)) {
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reuse_pages_failure++;
			return KERN_INVALID_ADDRESS;
		}

		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.
		 */
		if (entry->vme_start < start) {
			start_offset = start - entry->vme_start;
		} else {
			start_offset = 0;
		}
		/* clip to the end of the entry or the end of the request */
		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
		/* convert map offsets to object offsets */
		start_offset += VME_OFFSET(entry);
		end_offset += VME_OFFSET(entry);

		object = VME_OBJECT(entry);
		if (object != VM_OBJECT_NULL) {
			vm_object_lock(object);
			vm_object_reuse_pages(object, start_offset, end_offset,
			    TRUE);
			vm_object_unlock(object);
		}

		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
			/*
			 * XXX
			 * We do not hold the VM map exclusively here.
			 * The "alias" field is not that critical, so it's
			 * safe to update it here, as long as it is the only
			 * one that can be modified while holding the VM map
			 * "shared".
			 */
			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
		}
	}

	vm_map_unlock_read(map);
	vm_page_stats_reusable.reuse_pages_success++;
	return KERN_SUCCESS;
}
16378 
16379 
/*
 *	vm_map_reusable_pages:
 *
 *	Implements the MADV_REUSABLE advice for [start, end): mark the
 *	pages as "reusable" by deactivating them — and, when the backing
 *	object is not shared, allowing their contents to be discarded.
 *	Entries tagged MALLOC_LARGE / MALLOC_LARGE_REUSED are re-tagged
 *	VM_MEMORY_MALLOC_LARGE_REUSABLE.
 *
 *	Every entry in the range must pass vm_map_entry_is_reusable()
 *	(else KERN_INVALID_ADDRESS) and must be writable or marked
 *	used_for_jit / used_for_tpro (else KERN_PROTECTION_FAILURE,
 *	since non-writable contents can't be discarded).
 */
static kern_return_t
vm_map_reusable_pages(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	vm_object_offset_t              start_offset, end_offset;
	vm_map_offset_t                 pmap_offset;

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		/*
		 * XXX TODO4K
		 * need to figure out what reusable means for a portion
		 * of a native page.
		 */
		return KERN_SUCCESS;
	}

	/*
	 * The MADV_REUSABLE operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap);       /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		vm_page_stats_reusable.reusable_pages_failure++;
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		/*
		 * kill_pages: 1 => pages' contents may be discarded by
		 * vm_object_deactivate_pages(); -1 => object is shared,
		 * only account for it in the stats.
		 */
		int kill_pages = 0;
		boolean_t reusable_no_write = FALSE;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (!vm_map_entry_is_reusable(entry)) {
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_INVALID_ADDRESS;
		}

		if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
#if __arm64e__
		    && !entry->used_for_tpro
#endif
		    ) {
			/* not writable: can't discard contents */
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_nonwritable++;
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.
		 */
		if (entry->vme_start < start) {
			start_offset = start - entry->vme_start;
			pmap_offset = start;
		} else {
			start_offset = 0;
			pmap_offset = entry->vme_start;
		}
		/* clip to the end of the entry or the end of the request */
		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
		/* convert map offsets to object offsets */
		start_offset += VME_OFFSET(entry);
		end_offset += VME_OFFSET(entry);

		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			continue;
		}

		if (entry->protection & VM_PROT_EXECUTE) {
			/*
			 * Executable mappings might be write-protected by
			 * hardware, so do not attempt to write to these pages.
			 */
			reusable_no_write = TRUE;
		}

		vm_object_lock(object);
		if (((object->ref_count == 1) ||
		    (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
		    object->copy == VM_OBJECT_NULL)) &&
		    object->shadow == VM_OBJECT_NULL &&
		    /*
		     * "iokit_acct" entries are billed for their virtual size
		     * (rather than for their resident pages only), so they
		     * wouldn't benefit from making pages reusable, and it
		     * would be hard to keep track of pages that are both
		     * "iokit_acct" and "reusable" in the pmap stats and
		     * ledgers.
		     */
		    !(entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap))) {
			if (object->ref_count != 1) {
				vm_page_stats_reusable.reusable_shared++;
			}
			kill_pages = 1;
		} else {
			kill_pages = -1;
		}
		if (kill_pages != -1) {
			vm_object_deactivate_pages(object,
			    start_offset,
			    end_offset - start_offset,
			    kill_pages,
			    TRUE /*reusable_pages*/,
			    reusable_no_write,
			    map->pmap,
			    pmap_offset);
		} else {
			/* shared object: leave the pages alone, just account */
			vm_page_stats_reusable.reusable_pages_shared++;
			DTRACE_VM4(vm_map_reusable_pages_shared,
			    unsigned int, VME_ALIAS(entry),
			    vm_map_t, map,
			    vm_map_entry_t, entry,
			    vm_object_t, object);
		}
		vm_object_unlock(object);

		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
		    VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
			/*
			 * XXX
			 * We do not hold the VM map exclusively here.
			 * The "alias" field is not that critical, so it's
			 * safe to update it here, as long as it is the only
			 * one that can be modified while holding the VM map
			 * "shared".
			 */
			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
		}
	}

	vm_map_unlock_read(map);
	vm_page_stats_reusable.reusable_pages_success++;
	return KERN_SUCCESS;
}
16537 
16538 
16539 static kern_return_t
vm_map_can_reuse(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16540 vm_map_can_reuse(
16541 	vm_map_t        map,
16542 	vm_map_offset_t start,
16543 	vm_map_offset_t end)
16544 {
16545 	vm_map_entry_t                  entry;
16546 
16547 	/*
16548 	 * The MADV_REUSABLE operation doesn't require any changes to the
16549 	 * vm_map_entry_t's, so the read lock is sufficient.
16550 	 */
16551 
16552 	vm_map_lock_read(map);
16553 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16554 
16555 	/*
16556 	 * The madvise semantics require that the address range be fully
16557 	 * allocated with no holes.  Otherwise, we're required to return
16558 	 * an error.
16559 	 */
16560 
16561 	if (!vm_map_range_check(map, start, end, &entry)) {
16562 		vm_map_unlock_read(map);
16563 		vm_page_stats_reusable.can_reuse_failure++;
16564 		return KERN_INVALID_ADDRESS;
16565 	}
16566 
16567 	/*
16568 	 * Examine each vm_map_entry_t in the range.
16569 	 */
16570 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16571 	    entry = entry->vme_next) {
16572 		/*
16573 		 * Sanity check on the VM map entry.
16574 		 */
16575 		if (!vm_map_entry_is_reusable(entry)) {
16576 			vm_map_unlock_read(map);
16577 			vm_page_stats_reusable.can_reuse_failure++;
16578 			return KERN_INVALID_ADDRESS;
16579 		}
16580 	}
16581 
16582 	vm_map_unlock_read(map);
16583 	vm_page_stats_reusable.can_reuse_success++;
16584 	return KERN_SUCCESS;
16585 }
16586 
16587 
#if MACH_ASSERT
/*
 *	vm_map_pageout:
 *
 *	Debug-only (MACH_ASSERT) implementation of VM_BEHAVIOR_PAGEOUT:
 *	hand every internal VM object mapped in [start, end) to
 *	vm_object_pageout(), descending one level into submaps.
 *	Non-internal (pager-backed) objects are skipped.
 *
 *	Returns KERN_INVALID_ADDRESS if the range (or a submap's
 *	corresponding range) contains a hole.
 */
static kern_return_t
vm_map_pageout(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;

	/*
	 * The MADV_PAGEOUT operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		vm_object_t     object;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (entry->is_sub_map) {
			vm_map_t submap;
			vm_map_offset_t submap_start;
			vm_map_offset_t submap_end;
			vm_map_entry_t submap_entry;

			submap = VME_SUBMAP(entry);
			submap_start = VME_OFFSET(entry);
			submap_end = submap_start + (entry->vme_end -
			    entry->vme_start);

			/* nested read lock: submap lock under the map lock */
			vm_map_lock_read(submap);

			if (!vm_map_range_check(submap,
			    submap_start,
			    submap_end,
			    &submap_entry)) {
				vm_map_unlock_read(submap);
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}

			/* do not recurse into deeper submaps */
			if (submap_entry->is_sub_map) {
				vm_map_unlock_read(submap);
				continue;
			}

			object = VME_OBJECT(submap_entry);
			if (object == VM_OBJECT_NULL || !object->internal) {
				vm_map_unlock_read(submap);
				continue;
			}

			/*
			 * NOTE(review): only the first entry of the submap's
			 * range is processed before moving on — apparently
			 * sufficient for this debug-only path; confirm.
			 */
			vm_object_pageout(object);

			vm_map_unlock_read(submap);
			submap = VM_MAP_NULL;
			submap_entry = VM_MAP_ENTRY_NULL;
			continue;
		}

		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL || !object->internal) {
			continue;
		}

		vm_object_pageout(object);
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
#endif /* MACH_ASSERT */
16678 
16679 
16680 /*
16681  *	Routine:	vm_map_entry_insert
16682  *
16683  *	Description:	This routine inserts a new vm_entry in a locked map.
16684  */
/*
 *	vm_map_entry_insert:
 *
 *	Create a new entry covering [start, end), initialize it from the
 *	given object/offset, protections, inheritance and kernel flags,
 *	link it into "map" after "insp_entry", grow the map's size and
 *	update its hints.  The map must be locked exclusively by the
 *	caller.  Returns the newly linked entry.
 */
static vm_map_entry_t
vm_map_entry_insert(
	vm_map_t                map,
	vm_map_entry_t          insp_entry,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_map_kernel_flags_t   vmk_flags,
	boolean_t               needs_copy,
	vm_prot_t               cur_protection,
	vm_prot_t               max_protection,
	vm_inherit_t            inheritance,
	boolean_t               clear_map_aligned)
{
	vm_map_entry_t  new_entry;
	boolean_t map_aligned = FALSE;

	assert(insp_entry != (vm_map_entry_t)0);
	vm_map_lock_assert_exclusive(map);

#if DEVELOPMENT || DEBUG
	/* catch (offset + size) overflow early on debug kernels */
	vm_object_offset_t      end_offset = 0;
	assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
#endif /* DEVELOPMENT || DEBUG */

	/*
	 * Track "map_aligned" only for maps whose page size differs from
	 * the native one; downgrade it if the caller asked us to and the
	 * bounds aren't aligned to the map's page size.
	 */
	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
		map_aligned = TRUE;
	}
	if (clear_map_aligned &&
	    (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
	    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
		map_aligned = FALSE;
	}
	if (map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
		assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
	} else {
		assert(page_aligned(start));
		assert(page_aligned(end));
	}
	assert(start < end);

	new_entry = vm_map_entry_create(map);

	new_entry->vme_start = start;
	new_entry->vme_end = end;

	/* the entry points at either a submap or a VM object, never both */
	if (vmk_flags.vmkf_submap) {
		new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
		VME_SUBMAP_SET(new_entry, (vm_map_t)object);
	} else {
		VME_OBJECT_SET(new_entry, object, false, 0);
	}
	VME_OFFSET_SET(new_entry, offset);
	VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);

	new_entry->map_aligned = map_aligned;
	new_entry->needs_copy = needs_copy;
	new_entry->inheritance = inheritance;
	new_entry->protection = cur_protection;
	new_entry->max_protection = max_protection;
	/*
	 * submap: "use_pmap" means "nested".
	 * default: false.
	 *
	 * object: "use_pmap" means "use pmap accounting" for footprint.
	 * default: true.
	 */
	new_entry->use_pmap = !vmk_flags.vmkf_submap;
	new_entry->no_cache = vmk_flags.vmf_no_cache;
	new_entry->vme_permanent = vmk_flags.vmf_permanent;
	new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
	new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
	new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);

	if (vmk_flags.vmkf_map_jit) {
		/*
		 * Only grant "used_for_jit" to the first JIT entry unless
		 * the map's policy allows multiple JIT regions.
		 */
		if (!(map->jit_entry_exists) ||
		    VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
			new_entry->used_for_jit = TRUE;
			map->jit_entry_exists = TRUE;
		}
	}

	/*
	 *	Insert the new entry into the list.
	 */

	vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
	map->size += end - start;

	/*
	 *	Update the free space hint and the lookup hint.
	 */

	SAVE_HINT_MAP_WRITE(map, new_entry);
	return new_entry;
}
16783 
16784 /*
16785  *	Routine:	vm_map_remap_extract
16786  *
16787  *	Description:	This routine returns a vm_entry list from a map.
16788  */
16789 static kern_return_t
vm_map_remap_extract(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size,boolean_t copy,vm_map_copy_t map_copy,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)16790 vm_map_remap_extract(
16791 	vm_map_t                map,
16792 	vm_map_offset_t         addr,
16793 	vm_map_size_t           size,
16794 	boolean_t               copy,
16795 	vm_map_copy_t           map_copy,
16796 	vm_prot_t               *cur_protection,   /* IN/OUT */
16797 	vm_prot_t               *max_protection,   /* IN/OUT */
16798 	/* What, no behavior? */
16799 	vm_inherit_t            inheritance,
16800 	vm_map_kernel_flags_t   vmk_flags)
16801 {
16802 	struct vm_map_header   *map_header = &map_copy->cpy_hdr;
16803 	kern_return_t           result;
16804 	vm_map_size_t           mapped_size;
16805 	vm_map_size_t           tmp_size;
16806 	vm_map_entry_t          src_entry;     /* result of last map lookup */
16807 	vm_map_entry_t          new_entry;
16808 	vm_object_offset_t      offset;
16809 	vm_map_offset_t         map_address;
16810 	vm_map_offset_t         src_start;     /* start of entry to map */
16811 	vm_map_offset_t         src_end;       /* end of region to be mapped */
16812 	vm_object_t             object;
16813 	vm_map_version_t        version;
16814 	boolean_t               src_needs_copy;
16815 	boolean_t               new_entry_needs_copy;
16816 	vm_map_entry_t          saved_src_entry;
16817 	boolean_t               src_entry_was_wired;
16818 	vm_prot_t               max_prot_for_prot_copy;
16819 	vm_map_offset_t         effective_page_mask;
16820 	bool                    pageable, same_map;
16821 	boolean_t               vm_remap_legacy;
16822 	vm_prot_t               required_cur_prot, required_max_prot;
16823 	vm_object_t             new_copy_object;     /* vm_object_copy_* result */
16824 	boolean_t               saved_used_for_jit;     /* Saved used_for_jit. */
16825 #if __arm64e__
16826 	boolean_t               saved_used_for_tpro;    /* Saved used_for_tpro. */
16827 #endif
16828 
16829 	pageable = vmk_flags.vmkf_copy_pageable;
16830 	same_map = vmk_flags.vmkf_copy_same_map;
16831 
16832 	effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
16833 
16834 	assert(map != VM_MAP_NULL);
16835 	assert(size != 0);
16836 	assert(size == vm_map_round_page(size, effective_page_mask));
16837 	assert(inheritance == VM_INHERIT_NONE ||
16838 	    inheritance == VM_INHERIT_COPY ||
16839 	    inheritance == VM_INHERIT_SHARE);
16840 	assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
16841 	assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
16842 	assert((*cur_protection & *max_protection) == *cur_protection);
16843 
16844 	/*
16845 	 *	Compute start and end of region.
16846 	 */
16847 	src_start = vm_map_trunc_page(addr, effective_page_mask);
16848 	src_end = vm_map_round_page(src_start + size, effective_page_mask);
16849 
16850 	/*
16851 	 *	Initialize map_header.
16852 	 */
16853 	map_header->nentries = 0;
16854 	map_header->entries_pageable = pageable;
16855 //	map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
16856 	map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
16857 	map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
16858 	vm_map_store_init(map_header);
16859 
16860 	if (copy && vmk_flags.vmkf_remap_prot_copy) {
16861 		/*
16862 		 * Special case for vm_map_protect(VM_PROT_COPY):
16863 		 * we want to set the new mappings' max protection to the
16864 		 * specified *max_protection...
16865 		 */
16866 		max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
16867 		/* ... but we want to use the vm_remap() legacy mode */
16868 		*max_protection = VM_PROT_NONE;
16869 		*cur_protection = VM_PROT_NONE;
16870 	} else {
16871 		max_prot_for_prot_copy = VM_PROT_NONE;
16872 	}
16873 
16874 	if (*cur_protection == VM_PROT_NONE &&
16875 	    *max_protection == VM_PROT_NONE) {
16876 		/*
16877 		 * vm_remap() legacy mode:
16878 		 * Extract all memory regions in the specified range and
16879 		 * collect the strictest set of protections allowed on the
16880 		 * entire range, so the caller knows what they can do with
16881 		 * the remapped range.
16882 		 * We start with VM_PROT_ALL and we'll remove the protections
16883 		 * missing from each memory region.
16884 		 */
16885 		vm_remap_legacy = TRUE;
16886 		*cur_protection = VM_PROT_ALL;
16887 		*max_protection = VM_PROT_ALL;
16888 		required_cur_prot = VM_PROT_NONE;
16889 		required_max_prot = VM_PROT_NONE;
16890 	} else {
16891 		/*
16892 		 * vm_remap_new() mode:
16893 		 * Extract all memory regions in the specified range and
16894 		 * ensure that they have at least the protections specified
16895 		 * by the caller via *cur_protection and *max_protection.
16896 		 * The resulting mapping should have these protections.
16897 		 */
16898 		vm_remap_legacy = FALSE;
16899 		if (copy) {
16900 			required_cur_prot = VM_PROT_NONE;
16901 			required_max_prot = VM_PROT_READ;
16902 		} else {
16903 			required_cur_prot = *cur_protection;
16904 			required_max_prot = *max_protection;
16905 		}
16906 	}
16907 
16908 	map_address = 0;
16909 	mapped_size = 0;
16910 	result = KERN_SUCCESS;
16911 
16912 	/*
16913 	 *	The specified source virtual space might correspond to
16914 	 *	multiple map entries, need to loop on them.
16915 	 */
16916 	vm_map_lock(map);
16917 
16918 	if (map->pmap == kernel_pmap) {
16919 		map_copy->is_kernel_range = true;
16920 		map_copy->orig_range = kmem_addr_get_range(addr, size);
16921 #if CONFIG_MAP_RANGES
16922 	} else if (map->uses_user_ranges) {
16923 		map_copy->is_user_range = true;
16924 		map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
16925 #endif /* CONFIG_MAP_RANGES */
16926 	}
16927 
16928 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16929 		/*
16930 		 * This address space uses sub-pages so the range might
16931 		 * not be re-mappable in an address space with larger
16932 		 * pages. Re-assemble any broken-up VM map entries to
16933 		 * improve our chances of making it work.
16934 		 */
16935 		vm_map_simplify_range(map, src_start, src_end);
16936 	}
16937 	while (mapped_size != size) {
16938 		vm_map_size_t   entry_size;
16939 
16940 		/*
16941 		 *	Find the beginning of the region.
16942 		 */
16943 		if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
16944 			result = KERN_INVALID_ADDRESS;
16945 			break;
16946 		}
16947 
16948 		if (src_start < src_entry->vme_start ||
16949 		    (mapped_size && src_start != src_entry->vme_start)) {
16950 			result = KERN_INVALID_ADDRESS;
16951 			break;
16952 		}
16953 
16954 		tmp_size = size - mapped_size;
16955 		if (src_end > src_entry->vme_end) {
16956 			tmp_size -= (src_end - src_entry->vme_end);
16957 		}
16958 
16959 		entry_size = (vm_map_size_t)(src_entry->vme_end -
16960 		    src_entry->vme_start);
16961 
16962 		if (src_entry->is_sub_map &&
16963 		    vmk_flags.vmkf_copy_single_object) {
16964 			vm_map_t submap;
16965 			vm_map_offset_t submap_start;
16966 			vm_map_size_t submap_size;
16967 			boolean_t submap_needs_copy;
16968 
16969 			/*
16970 			 * No check for "required protection" on "src_entry"
16971 			 * because the protections that matter are the ones
16972 			 * on the submap's VM map entry, which will be checked
16973 			 * during the call to vm_map_remap_extract() below.
16974 			 */
16975 			submap_size = src_entry->vme_end - src_start;
16976 			if (submap_size > size) {
16977 				submap_size = size;
16978 			}
16979 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
16980 			submap = VME_SUBMAP(src_entry);
16981 			if (copy) {
16982 				/*
16983 				 * The caller wants a copy-on-write re-mapping,
16984 				 * so let's extract from the submap accordingly.
16985 				 */
16986 				submap_needs_copy = TRUE;
16987 			} else if (src_entry->needs_copy) {
16988 				/*
16989 				 * The caller wants a shared re-mapping but the
16990 				 * submap is mapped with "needs_copy", so its
16991 				 * contents can't be shared as is. Extract the
16992 				 * contents of the submap as "copy-on-write".
16993 				 * The re-mapping won't be shared with the
16994 				 * original mapping but this is equivalent to
16995 				 * what happened with the original "remap from
16996 				 * submap" code.
16997 				 * The shared region is mapped "needs_copy", for
16998 				 * example.
16999 				 */
17000 				submap_needs_copy = TRUE;
17001 			} else {
17002 				/*
17003 				 * The caller wants a shared re-mapping and
17004 				 * this mapping can be shared (no "needs_copy"),
17005 				 * so let's extract from the submap accordingly.
17006 				 * Kernel submaps are mapped without
17007 				 * "needs_copy", for example.
17008 				 */
17009 				submap_needs_copy = FALSE;
17010 			}
17011 			vm_map_reference(submap);
17012 			vm_map_unlock(map);
17013 			src_entry = NULL;
17014 			if (vm_remap_legacy) {
17015 				*cur_protection = VM_PROT_NONE;
17016 				*max_protection = VM_PROT_NONE;
17017 			}
17018 
17019 			DTRACE_VM7(remap_submap_recurse,
17020 			    vm_map_t, map,
17021 			    vm_map_offset_t, addr,
17022 			    vm_map_size_t, size,
17023 			    boolean_t, copy,
17024 			    vm_map_offset_t, submap_start,
17025 			    vm_map_size_t, submap_size,
17026 			    boolean_t, submap_needs_copy);
17027 
17028 			result = vm_map_remap_extract(submap,
17029 			    submap_start,
17030 			    submap_size,
17031 			    submap_needs_copy,
17032 			    map_copy,
17033 			    cur_protection,
17034 			    max_protection,
17035 			    inheritance,
17036 			    vmk_flags);
17037 			vm_map_deallocate(submap);
17038 			return result;
17039 		}
17040 
17041 		if (src_entry->is_sub_map) {
17042 			/* protections for submap mapping are irrelevant here */
17043 		} else if (((src_entry->protection & required_cur_prot) !=
17044 		    required_cur_prot) ||
17045 		    ((src_entry->max_protection & required_max_prot) !=
17046 		    required_max_prot)) {
17047 			if (vmk_flags.vmkf_copy_single_object &&
17048 			    mapped_size != 0) {
17049 				/*
17050 				 * Single object extraction.
17051 				 * We can't extract more with the required
17052 				 * protection but we've extracted some, so
17053 				 * stop there and declare success.
17054 				 * The caller should check the size of
17055 				 * the copy entry we've extracted.
17056 				 */
17057 				result = KERN_SUCCESS;
17058 			} else {
17059 				/*
17060 				 * VM range extraction.
17061 				 * Required proctection is not available
17062 				 * for this part of the range: fail.
				 * Required protection is not available
17064 				result = KERN_PROTECTION_FAILURE;
17065 			}
17066 			break;
17067 		}
17068 
17069 		if (src_entry->is_sub_map) {
17070 			vm_map_t submap;
17071 			vm_map_offset_t submap_start;
17072 			vm_map_size_t submap_size;
17073 			vm_map_copy_t submap_copy;
17074 			vm_prot_t submap_curprot, submap_maxprot;
17075 			boolean_t submap_needs_copy;
17076 
17077 			/*
17078 			 * No check for "required protection" on "src_entry"
17079 			 * because the protections that matter are the ones
17080 			 * on the submap's VM map entry, which will be checked
17081 			 * during the call to vm_map_copy_extract() below.
17082 			 */
17083 			object = VM_OBJECT_NULL;
17084 			submap_copy = VM_MAP_COPY_NULL;
17085 
17086 			/* find equivalent range in the submap */
17087 			submap = VME_SUBMAP(src_entry);
17088 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17089 			submap_size = tmp_size;
17090 			if (copy) {
17091 				/*
17092 				 * The caller wants a copy-on-write re-mapping,
17093 				 * so let's extract from the submap accordingly.
17094 				 */
17095 				submap_needs_copy = TRUE;
17096 			} else if (src_entry->needs_copy) {
17097 				/*
17098 				 * The caller wants a shared re-mapping but the
17099 				 * submap is mapped with "needs_copy", so its
17100 				 * contents can't be shared as is. Extract the
17101 				 * contents of the submap as "copy-on-write".
17102 				 * The re-mapping won't be shared with the
17103 				 * original mapping but this is equivalent to
17104 				 * what happened with the original "remap from
17105 				 * submap" code.
17106 				 * The shared region is mapped "needs_copy", for
17107 				 * example.
17108 				 */
17109 				submap_needs_copy = TRUE;
17110 			} else {
17111 				/*
17112 				 * The caller wants a shared re-mapping and
17113 				 * this mapping can be shared (no "needs_copy"),
17114 				 * so let's extract from the submap accordingly.
17115 				 * Kernel submaps are mapped without
17116 				 * "needs_copy", for example.
17117 				 */
17118 				submap_needs_copy = FALSE;
17119 			}
17120 			/* extra ref to keep submap alive */
17121 			vm_map_reference(submap);
17122 
17123 			DTRACE_VM7(remap_submap_recurse,
17124 			    vm_map_t, map,
17125 			    vm_map_offset_t, addr,
17126 			    vm_map_size_t, size,
17127 			    boolean_t, copy,
17128 			    vm_map_offset_t, submap_start,
17129 			    vm_map_size_t, submap_size,
17130 			    boolean_t, submap_needs_copy);
17131 
17132 			/*
17133 			 * The map can be safely unlocked since we
17134 			 * already hold a reference on the submap.
17135 			 *
17136 			 * No timestamp since we don't care if the map
17137 			 * gets modified while we're down in the submap.
17138 			 * We'll resume the extraction at src_start + tmp_size
17139 			 * anyway.
17140 			 */
17141 			vm_map_unlock(map);
17142 			src_entry = NULL; /* not valid once map is unlocked */
17143 
17144 			if (vm_remap_legacy) {
17145 				submap_curprot = VM_PROT_NONE;
17146 				submap_maxprot = VM_PROT_NONE;
17147 				if (max_prot_for_prot_copy) {
17148 					submap_maxprot = max_prot_for_prot_copy;
17149 				}
17150 			} else {
17151 				assert(!max_prot_for_prot_copy);
17152 				submap_curprot = *cur_protection;
17153 				submap_maxprot = *max_protection;
17154 			}
17155 			result = vm_map_copy_extract(submap,
17156 			    submap_start,
17157 			    submap_size,
17158 			    submap_needs_copy,
17159 			    &submap_copy,
17160 			    &submap_curprot,
17161 			    &submap_maxprot,
17162 			    inheritance,
17163 			    vmk_flags);
17164 
17165 			/* release extra ref on submap */
17166 			vm_map_deallocate(submap);
17167 			submap = VM_MAP_NULL;
17168 
17169 			if (result != KERN_SUCCESS) {
17170 				vm_map_lock(map);
17171 				break;
17172 			}
17173 
17174 			/* transfer submap_copy entries to map_header */
17175 			while (vm_map_copy_first_entry(submap_copy) !=
17176 			    vm_map_copy_to_entry(submap_copy)) {
17177 				vm_map_entry_t copy_entry;
17178 				vm_map_size_t copy_entry_size;
17179 
17180 				copy_entry = vm_map_copy_first_entry(submap_copy);
17181 
17182 				/*
17183 				 * Prevent kernel_object from being exposed to
17184 				 * user space.
17185 				 */
17186 				if (__improbable(copy_entry->vme_kernel_object)) {
17187 					printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17188 					    proc_selfpid(),
17189 					    (get_bsdtask_info(current_task())
17190 					    ? proc_name_address(get_bsdtask_info(current_task()))
17191 					    : "?"));
17192 					DTRACE_VM(extract_kernel_only);
17193 					result = KERN_INVALID_RIGHT;
17194 					vm_map_copy_discard(submap_copy);
17195 					submap_copy = VM_MAP_COPY_NULL;
17196 					vm_map_lock(map);
17197 					break;
17198 				}
17199 
17200 				vm_map_copy_entry_unlink(submap_copy, copy_entry);
17201 				copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
17202 				copy_entry->vme_start = map_address;
17203 				copy_entry->vme_end = map_address + copy_entry_size;
17204 				map_address += copy_entry_size;
17205 				mapped_size += copy_entry_size;
17206 				src_start += copy_entry_size;
17207 				assert(src_start <= src_end);
17208 				_vm_map_store_entry_link(map_header,
17209 				    map_header->links.prev,
17210 				    copy_entry);
17211 			}
17212 			/* done with submap_copy */
17213 			vm_map_copy_discard(submap_copy);
17214 
17215 			if (vm_remap_legacy) {
17216 				*cur_protection &= submap_curprot;
17217 				*max_protection &= submap_maxprot;
17218 			}
17219 
17220 			/* re-acquire the map lock and continue to next entry */
17221 			vm_map_lock(map);
17222 			continue;
17223 		} else {
17224 			object = VME_OBJECT(src_entry);
17225 
17226 			/*
17227 			 * Prevent kernel_object from being exposed to
17228 			 * user space.
17229 			 */
17230 			if (__improbable(object == kernel_object)) {
17231 				printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17232 				    proc_selfpid(),
17233 				    (get_bsdtask_info(current_task())
17234 				    ? proc_name_address(get_bsdtask_info(current_task()))
17235 				    : "?"));
17236 				DTRACE_VM(extract_kernel_only);
17237 				result = KERN_INVALID_RIGHT;
17238 				break;
17239 			}
17240 
17241 			if (src_entry->iokit_acct) {
17242 				/*
17243 				 * This entry uses "IOKit accounting".
17244 				 */
17245 			} else if (object != VM_OBJECT_NULL &&
17246 			    (object->purgable != VM_PURGABLE_DENY ||
17247 			    object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
17248 				/*
17249 				 * Purgeable objects have their own accounting:
17250 				 * no pmap accounting for them.
17251 				 */
17252 				assertf(!src_entry->use_pmap,
17253 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17254 				    map,
17255 				    src_entry,
17256 				    (uint64_t)src_entry->vme_start,
17257 				    (uint64_t)src_entry->vme_end,
17258 				    src_entry->protection,
17259 				    src_entry->max_protection,
17260 				    VME_ALIAS(src_entry));
17261 			} else {
17262 				/*
17263 				 * Not IOKit or purgeable:
17264 				 * must be accounted by pmap stats.
17265 				 */
17266 				assertf(src_entry->use_pmap,
17267 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17268 				    map,
17269 				    src_entry,
17270 				    (uint64_t)src_entry->vme_start,
17271 				    (uint64_t)src_entry->vme_end,
17272 				    src_entry->protection,
17273 				    src_entry->max_protection,
17274 				    VME_ALIAS(src_entry));
17275 			}
17276 
17277 			if (object == VM_OBJECT_NULL) {
17278 				assert(!src_entry->needs_copy);
17279 				if (src_entry->max_protection == VM_PROT_NONE) {
17280 					assert(src_entry->protection == VM_PROT_NONE);
17281 					/*
17282 					 * No VM object and no permissions:
17283 					 * this must be a reserved range with
17284 					 * nothing to share or copy.
17285 					 * There could also be all sorts of
17286 					 * pmap shenanigans within that reserved
17287 					 * range, so let's just copy the map
17288 					 * entry as is to remap a similar
17289 					 * reserved range.
17290 					 */
17291 					offset = 0; /* no object => no offset */
17292 					goto copy_src_entry;
17293 				}
17294 				object = vm_object_allocate(entry_size);
17295 				VME_OFFSET_SET(src_entry, 0);
17296 				VME_OBJECT_SET(src_entry, object, false, 0);
17297 				assert(src_entry->use_pmap);
17298 				assert(!map->mapped_in_other_pmaps);
17299 			} else if (src_entry->wired_count ||
17300 			    object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17301 				/*
17302 				 * A wired memory region should not have
17303 				 * any pending copy-on-write and needs to
17304 				 * keep pointing at the VM object that
17305 				 * contains the wired pages.
17306 				 * If we're sharing this memory (copy=false),
17307 				 * we'll share this VM object.
17308 				 * If we're copying this memory (copy=true),
17309 				 * we'll call vm_object_copy_slowly() below
17310 				 * and use the new VM object for the remapping.
17311 				 *
17312 				 * Or, we are already using an asymmetric
17313 				 * copy, and therefore we already have
17314 				 * the right object.
17315 				 */
17316 				assert(!src_entry->needs_copy);
17317 			} else if (src_entry->needs_copy || object->shadowed ||
17318 			    (object->internal && !object->true_share &&
17319 			    !src_entry->is_shared &&
17320 			    object->vo_size > entry_size)) {
17321 				VME_OBJECT_SHADOW(src_entry, entry_size,
17322 				    vm_map_always_shadow(map));
17323 				assert(src_entry->use_pmap);
17324 
17325 				if (!src_entry->needs_copy &&
17326 				    (src_entry->protection & VM_PROT_WRITE)) {
17327 					vm_prot_t prot;
17328 
17329 					assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
17330 
17331 					prot = src_entry->protection & ~VM_PROT_WRITE;
17332 
17333 					if (override_nx(map,
17334 					    VME_ALIAS(src_entry))
17335 					    && prot) {
17336 						prot |= VM_PROT_EXECUTE;
17337 					}
17338 
17339 					assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
17340 
17341 					if (map->mapped_in_other_pmaps) {
17342 						vm_object_pmap_protect(
17343 							VME_OBJECT(src_entry),
17344 							VME_OFFSET(src_entry),
17345 							entry_size,
17346 							PMAP_NULL,
17347 							PAGE_SIZE,
17348 							src_entry->vme_start,
17349 							prot);
17350 #if MACH_ASSERT
17351 					} else if (__improbable(map->pmap == PMAP_NULL)) {
17352 						extern boolean_t vm_tests_in_progress;
17353 						assert(vm_tests_in_progress);
17354 						/*
17355 						 * Some VM tests (in vm_tests.c)
17356 						 * sometimes want to use a VM
17357 						 * map without a pmap.
17358 						 * Otherwise, this should never
17359 						 * happen.
17360 						 */
17361 #endif /* MACH_ASSERT */
17362 					} else {
17363 						pmap_protect(vm_map_pmap(map),
17364 						    src_entry->vme_start,
17365 						    src_entry->vme_end,
17366 						    prot);
17367 					}
17368 				}
17369 
17370 				object = VME_OBJECT(src_entry);
17371 				src_entry->needs_copy = FALSE;
17372 			}
17373 
17374 
17375 			vm_object_lock(object);
17376 			vm_object_reference_locked(object); /* object ref. for new entry */
17377 			assert(!src_entry->needs_copy);
17378 			if (object->copy_strategy ==
17379 			    MEMORY_OBJECT_COPY_SYMMETRIC) {
17380 				/*
17381 				 * If we want to share this object (copy==0),
17382 				 * it needs to be COPY_DELAY.
17383 				 * If we want to copy this object (copy==1),
17384 				 * we can't just set "needs_copy" on our side
17385 				 * and expect the other side to do the same
17386 				 * (symmetrically), so we can't let the object
17387 				 * stay COPY_SYMMETRIC.
17388 				 * So we always switch from COPY_SYMMETRIC to
17389 				 * COPY_DELAY.
17390 				 */
17391 				object->copy_strategy =
17392 				    MEMORY_OBJECT_COPY_DELAY;
17393 				object->true_share = TRUE;
17394 			}
17395 			vm_object_unlock(object);
17396 		}
17397 
17398 		offset = (VME_OFFSET(src_entry) +
17399 		    (src_start - src_entry->vme_start));
17400 
17401 copy_src_entry:
17402 		new_entry = _vm_map_entry_create(map_header);
17403 		vm_map_entry_copy(map, new_entry, src_entry);
17404 		if (new_entry->is_sub_map) {
17405 			/* clr address space specifics */
17406 			new_entry->use_pmap = FALSE;
17407 		} else if (copy) {
17408 			/*
17409 			 * We're dealing with a copy-on-write operation,
17410 			 * so the resulting mapping should not inherit the
17411 			 * original mapping's accounting settings.
17412 			 * "use_pmap" should be reset to its default (TRUE)
17413 			 * so that the new mapping gets accounted for in
17414 			 * the task's memory footprint.
17415 			 */
17416 			new_entry->use_pmap = TRUE;
17417 		}
17418 		/* "iokit_acct" was cleared in vm_map_entry_copy() */
17419 		assert(!new_entry->iokit_acct);
17420 
17421 		new_entry->map_aligned = FALSE;
17422 
17423 		new_entry->vme_start = map_address;
17424 		new_entry->vme_end = map_address + tmp_size;
17425 		assert(new_entry->vme_start < new_entry->vme_end);
17426 		if (copy && vmk_flags.vmkf_remap_prot_copy) {
17427 			/* security: keep "permanent" and "csm_associated" */
17428 			new_entry->vme_permanent = src_entry->vme_permanent;
17429 			new_entry->csm_associated = src_entry->csm_associated;
17430 			/*
17431 			 * Remapping for vm_map_protect(VM_PROT_COPY)
17432 			 * to convert a read-only mapping into a
17433 			 * copy-on-write version of itself but
17434 			 * with write access:
17435 			 * keep the original inheritance but let's not
17436 			 * add VM_PROT_WRITE to the max protection yet
17437 			 * since we want to do more security checks against
17438 			 * the target map.
17439 			 */
17440 			new_entry->inheritance = src_entry->inheritance;
17441 			new_entry->protection &= max_prot_for_prot_copy;
17442 		} else {
17443 			new_entry->inheritance = inheritance;
17444 			if (!vm_remap_legacy) {
17445 				new_entry->protection = *cur_protection;
17446 				new_entry->max_protection = *max_protection;
17447 			}
17448 		}
17449 		VME_OFFSET_SET(new_entry, offset);
17450 
17451 		/*
17452 		 * The new region has to be copied now if required.
17453 		 */
17454 RestartCopy:
17455 		if (!copy) {
17456 			if (src_entry->used_for_jit == TRUE) {
17457 				if (same_map) {
17458 				} else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
17459 					/*
17460 					 * Cannot allow an entry describing a JIT
17461 					 * region to be shared across address spaces.
17462 					 */
17463 					result = KERN_INVALID_ARGUMENT;
17464 					vm_object_deallocate(object);
17465 					vm_map_entry_dispose(new_entry);
17466 					new_entry = VM_MAP_ENTRY_NULL;
17467 					break;
17468 				}
17469 			}
17470 
17471 			src_entry->is_shared = TRUE;
17472 			new_entry->is_shared = TRUE;
17473 			if (!(new_entry->is_sub_map)) {
17474 				new_entry->needs_copy = FALSE;
17475 			}
17476 		} else if (src_entry->is_sub_map) {
17477 			/* make this a COW sub_map if not already */
17478 			assert(new_entry->wired_count == 0);
17479 			new_entry->needs_copy = TRUE;
17480 			object = VM_OBJECT_NULL;
17481 		} else if (src_entry->wired_count == 0 &&
17482 		    !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
17483 		    vm_object_copy_quickly(VME_OBJECT(new_entry),
17484 		    VME_OFFSET(new_entry),
17485 		    (new_entry->vme_end -
17486 		    new_entry->vme_start),
17487 		    &src_needs_copy,
17488 		    &new_entry_needs_copy)) {
17489 			new_entry->needs_copy = new_entry_needs_copy;
17490 			new_entry->is_shared = FALSE;
17491 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
17492 
17493 			/*
17494 			 * Handle copy_on_write semantics.
17495 			 */
17496 			if (src_needs_copy && !src_entry->needs_copy) {
17497 				vm_prot_t prot;
17498 
17499 				assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
17500 
17501 				prot = src_entry->protection & ~VM_PROT_WRITE;
17502 
17503 				if (override_nx(map,
17504 				    VME_ALIAS(src_entry))
17505 				    && prot) {
17506 					prot |= VM_PROT_EXECUTE;
17507 				}
17508 
17509 				assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
17510 
17511 				vm_object_pmap_protect(object,
17512 				    offset,
17513 				    entry_size,
17514 				    ((src_entry->is_shared
17515 				    || map->mapped_in_other_pmaps) ?
17516 				    PMAP_NULL : map->pmap),
17517 				    VM_MAP_PAGE_SIZE(map),
17518 				    src_entry->vme_start,
17519 				    prot);
17520 
17521 				assert(src_entry->wired_count == 0);
17522 				src_entry->needs_copy = TRUE;
17523 			}
17524 			/*
17525 			 * Throw away the old object reference of the new entry.
17526 			 */
17527 			vm_object_deallocate(object);
17528 		} else {
17529 			new_entry->is_shared = FALSE;
17530 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
17531 
17532 			src_entry_was_wired = (src_entry->wired_count > 0);
17533 			saved_src_entry = src_entry;
17534 			src_entry = VM_MAP_ENTRY_NULL;
17535 
17536 			/*
17537 			 * The map can be safely unlocked since we
17538 			 * already hold a reference on the object.
17539 			 *
17540 			 * Record the timestamp of the map for later
17541 			 * verification, and unlock the map.
17542 			 */
17543 			version.main_timestamp = map->timestamp;
17544 			vm_map_unlock(map);     /* Increments timestamp once! */
17545 
17546 			/*
17547 			 * Perform the copy.
17548 			 */
17549 			if (src_entry_was_wired > 0 ||
17550 			    (debug4k_no_cow_copyin &&
17551 			    VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
17552 				vm_object_lock(object);
17553 				result = vm_object_copy_slowly(
17554 					object,
17555 					offset,
17556 					(new_entry->vme_end -
17557 					new_entry->vme_start),
17558 					THREAD_UNINT,
17559 					&new_copy_object);
17560 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
17561 				saved_used_for_jit = new_entry->used_for_jit;
17562 #if __arm64e__
17563 				saved_used_for_tpro = new_entry->used_for_tpro;
17564 #endif
17565 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
17566 				new_entry->used_for_jit = saved_used_for_jit;
17567 #if __arm64e__
17568 				new_entry->used_for_tpro = saved_used_for_tpro;
17569 #endif
17570 				VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
17571 				new_entry->needs_copy = FALSE;
17572 			} else {
17573 				vm_object_offset_t new_offset;
17574 
17575 				new_offset = VME_OFFSET(new_entry);
17576 				result = vm_object_copy_strategically(
17577 					object,
17578 					offset,
17579 					(new_entry->vme_end -
17580 					new_entry->vme_start),
17581 					&new_copy_object,
17582 					&new_offset,
17583 					&new_entry_needs_copy);
17584 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
17585 				saved_used_for_jit = new_entry->used_for_jit;
17586 #if __arm64e__
17587 				saved_used_for_tpro = new_entry->used_for_tpro;
17588 #endif
17589 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
17590 				new_entry->used_for_jit = saved_used_for_jit;
17591 #if __arm64e__
17592 				new_entry->used_for_tpro = saved_used_for_tpro;
17593 #endif
17594 				if (new_offset != VME_OFFSET(new_entry)) {
17595 					VME_OFFSET_SET(new_entry, new_offset);
17596 				}
17597 
17598 				new_entry->needs_copy = new_entry_needs_copy;
17599 			}
17600 
17601 			/*
17602 			 * Throw away the old object reference of the new entry.
17603 			 */
17604 			vm_object_deallocate(object);
17605 
17606 			if (result != KERN_SUCCESS &&
17607 			    result != KERN_MEMORY_RESTART_COPY) {
17608 				vm_map_entry_dispose(new_entry);
17609 				vm_map_lock(map);
17610 				break;
17611 			}
17612 
17613 			/*
17614 			 * Verify that the map has not substantially
17615 			 * changed while the copy was being made.
17616 			 */
17617 
17618 			vm_map_lock(map);
17619 			if (version.main_timestamp + 1 != map->timestamp) {
17620 				/*
17621 				 * Simple version comparison failed.
17622 				 *
17623 				 * Retry the lookup and verify that the
17624 				 * same object/offset are still present.
17625 				 */
17626 				saved_src_entry = VM_MAP_ENTRY_NULL;
17627 				vm_object_deallocate(VME_OBJECT(new_entry));
17628 				vm_map_entry_dispose(new_entry);
17629 				if (result == KERN_MEMORY_RESTART_COPY) {
17630 					result = KERN_SUCCESS;
17631 				}
17632 				continue;
17633 			}
17634 			/* map hasn't changed: src_entry is still valid */
17635 			src_entry = saved_src_entry;
17636 			saved_src_entry = VM_MAP_ENTRY_NULL;
17637 
17638 			if (result == KERN_MEMORY_RESTART_COPY) {
17639 				vm_object_reference(object);
17640 				goto RestartCopy;
17641 			}
17642 		}
17643 
17644 		_vm_map_store_entry_link(map_header,
17645 		    map_header->links.prev, new_entry);
17646 
17647 		/* protections for submap mapping are irrelevant here */
17648 		if (vm_remap_legacy && !src_entry->is_sub_map) {
17649 			*cur_protection &= src_entry->protection;
17650 			*max_protection &= src_entry->max_protection;
17651 		}
17652 
17653 		map_address += tmp_size;
17654 		mapped_size += tmp_size;
17655 		src_start += tmp_size;
17656 
17657 		if (vmk_flags.vmkf_copy_single_object) {
17658 			if (mapped_size != size) {
17659 				DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
17660 				    map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
17661 				if (src_entry->vme_next != vm_map_to_entry(map) &&
17662 				    src_entry->vme_next->vme_object_value ==
17663 				    src_entry->vme_object_value) {
17664 					/* XXX TODO4K */
17665 					DEBUG4K_ERROR("could have extended copy to next entry...\n");
17666 				}
17667 			}
17668 			break;
17669 		}
17670 	} /* end while */
17671 
17672 	vm_map_unlock(map);
17673 	if (result != KERN_SUCCESS) {
17674 		/*
17675 		 * Free all allocated elements.
17676 		 */
17677 		for (src_entry = map_header->links.next;
17678 		    src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
17679 		    src_entry = new_entry) {
17680 			new_entry = src_entry->vme_next;
17681 			_vm_map_store_entry_unlink(map_header, src_entry, false);
17682 			if (src_entry->is_sub_map) {
17683 				vm_map_deallocate(VME_SUBMAP(src_entry));
17684 			} else {
17685 				vm_object_deallocate(VME_OBJECT(src_entry));
17686 			}
17687 			vm_map_entry_dispose(src_entry);
17688 		}
17689 	}
17690 	return result;
17691 }
17692 
17693 bool
vm_map_is_exotic(vm_map_t map)17694 vm_map_is_exotic(
17695 	vm_map_t map)
17696 {
17697 	return VM_MAP_IS_EXOTIC(map);
17698 }
17699 
17700 bool
vm_map_is_alien(vm_map_t map)17701 vm_map_is_alien(
17702 	vm_map_t map)
17703 {
17704 	return VM_MAP_IS_ALIEN(map);
17705 }
17706 
17707 #if XNU_TARGET_OS_OSX
/*
 * vm_map_mark_alien:
 * Set the map's "is_alien" flag (queried via vm_map_is_alien() /
 * VM_MAP_IS_ALIEN()).  The flag is set under the exclusive map lock
 * to serialize with other mutations of the map; it is never cleared
 * here.  macOS-only (XNU_TARGET_OS_OSX).
 */
void
vm_map_mark_alien(
	vm_map_t map)
{
	vm_map_lock(map);
	map->is_alien = true;
	vm_map_unlock(map);
}
17716 
/*
 * vm_map_single_jit:
 * Set the map's "single_jit" flag under the exclusive map lock.
 * The flag is only ever set here, never cleared.  macOS-only
 * (XNU_TARGET_OS_OSX).
 * NOTE(review): presumably restricts the map to a single JIT
 * region — confirm against the consumers of map->single_jit.
 */
void
vm_map_single_jit(
	vm_map_t map)
{
	vm_map_lock(map);
	map->single_jit = true;
	vm_map_unlock(map);
}
17725 #endif /* XNU_TARGET_OS_OSX */
17726 
17727 /*
17728  * Callers of this function must call vm_map_copy_require on
17729  * previously created vm_map_copy_t or pass a newly created
17730  * one to ensure that it hasn't been forged.
17731  */
17732 static kern_return_t
vm_map_copy_to_physcopy(vm_map_copy_t copy_map,vm_map_t target_map)17733 vm_map_copy_to_physcopy(
17734 	vm_map_copy_t   copy_map,
17735 	vm_map_t        target_map)
17736 {
17737 	vm_map_size_t           size;
17738 	vm_map_entry_t          entry;
17739 	vm_map_entry_t          new_entry;
17740 	vm_object_t             new_object;
17741 	unsigned int            pmap_flags;
17742 	pmap_t                  new_pmap;
17743 	vm_map_t                new_map;
17744 	vm_map_address_t        src_start, src_end, src_cur;
17745 	vm_map_address_t        dst_start, dst_end, dst_cur;
17746 	kern_return_t           kr;
17747 	void                    *kbuf;
17748 
17749 	/*
17750 	 * Perform the equivalent of vm_allocate() and memcpy().
17751 	 * Replace the mappings in "copy_map" with the newly allocated mapping.
17752 	 */
17753 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
17754 
17755 	assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
17756 
17757 	/* create a new pmap to map "copy_map" */
17758 	pmap_flags = 0;
17759 	assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
17760 #if PMAP_CREATE_FORCE_4K_PAGES
17761 	pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
17762 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
17763 	pmap_flags |= PMAP_CREATE_64BIT;
17764 	new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
17765 	if (new_pmap == NULL) {
17766 		return KERN_RESOURCE_SHORTAGE;
17767 	}
17768 
17769 	/* allocate new VM object */
17770 	size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
17771 	new_object = vm_object_allocate(size);
17772 	assert(new_object);
17773 
17774 	/* allocate new VM map entry */
17775 	new_entry = vm_map_copy_entry_create(copy_map);
17776 	assert(new_entry);
17777 
17778 	/* finish initializing new VM map entry */
17779 	new_entry->protection = VM_PROT_DEFAULT;
17780 	new_entry->max_protection = VM_PROT_DEFAULT;
17781 	new_entry->use_pmap = TRUE;
17782 
17783 	/* make new VM map entry point to new VM object */
17784 	new_entry->vme_start = 0;
17785 	new_entry->vme_end = size;
17786 	VME_OBJECT_SET(new_entry, new_object, false, 0);
17787 	VME_OFFSET_SET(new_entry, 0);
17788 
17789 	/* create a new pageable VM map to map "copy_map" */
17790 	new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
17791 	    VM_MAP_CREATE_PAGEABLE);
17792 	assert(new_map);
17793 	vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
17794 
17795 	/* map "copy_map" in the new VM map */
17796 	src_start = 0;
17797 	kr = vm_map_copyout_internal(
17798 		new_map,
17799 		&src_start,
17800 		copy_map,
17801 		copy_map->size,
17802 		FALSE, /* consume_on_success */
17803 		VM_PROT_DEFAULT,
17804 		VM_PROT_DEFAULT,
17805 		VM_INHERIT_DEFAULT);
17806 	assert(kr == KERN_SUCCESS);
17807 	src_end = src_start + copy_map->size;
17808 
17809 	/* map "new_object" in the new VM map */
17810 	vm_object_reference(new_object);
17811 	dst_start = 0;
17812 	kr = vm_map_enter(new_map,
17813 	    &dst_start,
17814 	    size,
17815 	    0,               /* mask */
17816 	    VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
17817 	    new_object,
17818 	    0,               /* offset */
17819 	    FALSE,               /* needs copy */
17820 	    VM_PROT_DEFAULT,
17821 	    VM_PROT_DEFAULT,
17822 	    VM_INHERIT_DEFAULT);
17823 	assert(kr == KERN_SUCCESS);
17824 	dst_end = dst_start + size;
17825 
17826 	/* get a kernel buffer */
17827 	kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
17828 
17829 	/* physically copy "copy_map" mappings to new VM object */
17830 	for (src_cur = src_start, dst_cur = dst_start;
17831 	    src_cur < src_end;
17832 	    src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
17833 		vm_size_t bytes;
17834 
17835 		bytes = PAGE_SIZE;
17836 		if (src_cur + PAGE_SIZE > src_end) {
17837 			/* partial copy for last page */
17838 			bytes = src_end - src_cur;
17839 			assert(bytes > 0 && bytes < PAGE_SIZE);
17840 			/* rest of dst page should be zero-filled */
17841 		}
17842 		/* get bytes from src mapping */
17843 		kr = copyinmap(new_map, src_cur, kbuf, bytes);
17844 		if (kr != KERN_SUCCESS) {
17845 			DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
17846 		}
17847 		/* put bytes in dst mapping */
17848 		assert(dst_cur < dst_end);
17849 		assert(dst_cur + bytes <= dst_end);
17850 		kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
17851 		if (kr != KERN_SUCCESS) {
17852 			DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
17853 		}
17854 	}
17855 
17856 	/* free kernel buffer */
17857 	kfree_data(kbuf, PAGE_SIZE);
17858 
17859 	/* destroy new map */
17860 	vm_map_destroy(new_map);
17861 	new_map = VM_MAP_NULL;
17862 
17863 	/* dispose of the old map entries in "copy_map" */
17864 	while (vm_map_copy_first_entry(copy_map) !=
17865 	    vm_map_copy_to_entry(copy_map)) {
17866 		entry = vm_map_copy_first_entry(copy_map);
17867 		vm_map_copy_entry_unlink(copy_map, entry);
17868 		if (entry->is_sub_map) {
17869 			vm_map_deallocate(VME_SUBMAP(entry));
17870 		} else {
17871 			vm_object_deallocate(VME_OBJECT(entry));
17872 		}
17873 		vm_map_copy_entry_dispose(entry);
17874 	}
17875 
17876 	/* change "copy_map"'s page_size to match "target_map" */
17877 	copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
17878 	copy_map->offset = 0;
17879 	copy_map->size = size;
17880 
17881 	/* insert new map entry in "copy_map" */
17882 	assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
17883 	vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
17884 
17885 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
17886 	return KERN_SUCCESS;
17887 }
17888 
void
vm_map_copy_adjust_get_target_copy_map(
	vm_map_copy_t   copy_map,
	vm_map_copy_t   *target_copy_map_p);
/*
 * vm_map_copy_adjust_get_target_copy_map:
 *	Ensure the caller has a "target_copy_map" it is allowed to modify.
 *	If *target_copy_map_p is already set, it is used as-is.
 *	Otherwise, a full clone of "copy_map" (same offset, size and
 *	page_shift, and a copy of every entry) is created and returned
 *	via *target_copy_map_p.  Each cloned entry takes its own
 *	reference on the submap or VM object it maps.
 */
void
vm_map_copy_adjust_get_target_copy_map(
	vm_map_copy_t   copy_map,
	vm_map_copy_t   *target_copy_map_p)
{
	vm_map_copy_t   target_copy_map;
	vm_map_entry_t  entry, target_entry;

	if (*target_copy_map_p != VM_MAP_COPY_NULL) {
		/* the caller already has a "target_copy_map": use it */
		return;
	}

	/* the caller wants us to create a new copy of "copy_map" */
	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
	target_copy_map = vm_map_copy_allocate(copy_map->type);
	target_copy_map->offset = copy_map->offset;
	target_copy_map->size = copy_map->size;
	target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
	/* clone every entry, appending to keep the original ordering */
	for (entry = vm_map_copy_first_entry(copy_map);
	    entry != vm_map_copy_to_entry(copy_map);
	    entry = entry->vme_next) {
		target_entry = vm_map_copy_entry_create(target_copy_map);
		vm_map_entry_copy_full(target_entry, entry);
		/* the clone needs its own reference on what it maps */
		if (target_entry->is_sub_map) {
			vm_map_reference(VME_SUBMAP(target_entry));
		} else {
			vm_object_reference(VME_OBJECT(target_entry));
		}
		vm_map_copy_entry_link(
			target_copy_map,
			vm_map_copy_last_entry(target_copy_map),
			target_entry);
	}
	entry = VM_MAP_ENTRY_NULL;
	*target_copy_map_p = target_copy_map;
}
17930 
/*
 * vm_map_copy_trim:
 *	Remove the range [trim_start, trim_end) from "copy_map" and
 *	release the references held by the removed entries, shrinking
 *	copy_map->size accordingly.  "trim_start"/"trim_end" are given
 *	relative to the start of the first entry and are converted to
 *	absolute addresses below.  The clipping is performed at
 *	"new_page_shift" granularity; the copy_map's original
 *	page_shift is restored before returning.
 *
 * Callers of this function must call vm_map_copy_require on
 * previously created vm_map_copy_t or pass a newly created
 * one to ensure that it hasn't been forged.
 */
static void
vm_map_copy_trim(
	vm_map_copy_t   copy_map,
	uint16_t        new_page_shift,
	vm_map_offset_t trim_start,
	vm_map_offset_t trim_end)
{
	uint16_t        copy_page_shift;
	vm_map_entry_t  entry, next_entry;

	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
	assert(copy_map->cpy_hdr.nentries > 0);

	/* convert the relative offsets into absolute map addresses */
	trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
	trim_end += vm_map_copy_first_entry(copy_map)->vme_start;

	/* use the new page_shift to do the clipping */
	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
	copy_map->cpy_hdr.page_shift = new_page_shift;

	/* grab vme_next before unlinking so the walk survives disposal */
	for (entry = vm_map_copy_first_entry(copy_map);
	    entry != vm_map_copy_to_entry(copy_map);
	    entry = next_entry) {
		next_entry = entry->vme_next;
		if (entry->vme_end <= trim_start) {
			/* entry fully before trim range: skip */
			continue;
		}
		if (entry->vme_start >= trim_end) {
			/* entry fully after trim range: done */
			break;
		}
		/* clip entry if needed */
		vm_map_copy_clip_start(copy_map, entry, trim_start);
		vm_map_copy_clip_end(copy_map, entry, trim_end);
		/* dispose of entry */
		copy_map->size -= entry->vme_end - entry->vme_start;
		vm_map_copy_entry_unlink(copy_map, entry);
		/* drop the reference the entry held on its submap/object */
		if (entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(entry));
		} else {
			vm_object_deallocate(VME_OBJECT(entry));
		}
		vm_map_copy_entry_dispose(entry);
		entry = VM_MAP_ENTRY_NULL;
	}

	/* restore copy_map's original page_shift */
	copy_map->cpy_hdr.page_shift = copy_page_shift;
}
17986 
/*
 * Make any necessary adjustments to "copy_map" to allow it to be
 * mapped into "target_map".
 * If no changes were necessary, "target_copy_map" points to the
 * untouched "copy_map".
 * If changes are necessary, changes will be made to "target_copy_map".
 * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
 * copy the original "copy_map" to it before applying the changes.
 * The caller should discard "target_copy_map" if it's not the same as
 * the original "copy_map".
 *
 * Parameters:
 *	src_copy_map	copy map to adjust (validated, not forged)
 *	offset, size	sub-range of interest within src_copy_map
 *	target_map	map whose page size the result must conform to
 *	copy		TRUE if the mappings will be physically copied;
 *			allows a physical-copy fallback when internal
 *			entries are misaligned for the target page size
 *	*target_copy_map_p  in: optional pre-made clone to adjust;
 *			out: the (possibly new) adjusted copy map
 *	*overmap_start_p, *overmap_end_p  out: how far the result
 *			over-maps past each end to reach target-page
 *			alignment
 *	*trimmed_start_p  out: how much was trimmed from the start
 */
/* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
kern_return_t
vm_map_copy_adjust_to_target(
	vm_map_copy_t           src_copy_map,
	vm_map_offset_t         offset,
	vm_map_size_t           size,
	vm_map_t                target_map,
	boolean_t               copy,
	vm_map_copy_t           *target_copy_map_p,
	vm_map_offset_t         *overmap_start_p,
	vm_map_offset_t         *overmap_end_p,
	vm_map_offset_t         *trimmed_start_p)
{
	vm_map_copy_t           copy_map, target_copy_map;
	vm_map_size_t           target_size;
	vm_map_size_t           src_copy_map_size;
	vm_map_size_t           overmap_start, overmap_end;
	int                     misalignments;
	vm_map_entry_t          entry, target_entry;
	vm_map_offset_t         addr_adjustment;
	vm_map_offset_t         new_start, new_end;
	int                     copy_page_mask, target_page_mask;
	uint16_t                copy_page_shift, target_page_shift;
	vm_map_offset_t         trimmed_end;

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(src_copy_map);
	assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);

	/*
	 * Start working with "src_copy_map" but we'll switch
	 * to "target_copy_map" as soon as we start making adjustments.
	 */
	copy_map = src_copy_map;
	src_copy_map_size = src_copy_map->size;

	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
	copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
	target_page_mask = VM_MAP_PAGE_MASK(target_map);

	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, *target_copy_map_p);

	target_copy_map = *target_copy_map_p;
	if (target_copy_map != VM_MAP_COPY_NULL) {
		vm_map_copy_require(target_copy_map);
	}

	/* requested sub-range must fit in the source copy map */
	if (offset + size > copy_map->size) {
		DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)offset, (uint64_t)size);
		return KERN_INVALID_ARGUMENT;
	}

	/* trim the end */
	trimmed_end = 0;
	new_end = VM_MAP_ROUND_PAGE(offset + size, target_page_mask);
	if (new_end < copy_map->size) {
		trimmed_end = src_copy_map_size - new_end;
		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
		/* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    new_end, copy_map->size);
	}

	/* trim the start */
	new_start = VM_MAP_TRUNC_PAGE(offset, target_page_mask);
	if (new_start != 0) {
		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)0, (uint64_t)new_start);
		/* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    0, new_start);
	}
	*trimmed_start_p = new_start;

	/* target_size starts with what's left after trimming */
	target_size = copy_map->size;
	assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
	    "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
	    (uint64_t)target_size, (uint64_t)src_copy_map_size,
	    (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);

	/* check for misalignments but don't adjust yet */
	misalignments = 0;
	overmap_start = 0;
	overmap_end = 0;
	if (copy_page_shift < target_page_shift) {
		/*
		 * Remapping from 4K to 16K: check the VM object alignments
		 * throughout the range.
		 * If the start and end of the range are mis-aligned, we can
		 * over-map to re-align, and adjust the "overmap" start/end
		 * and "target_size" of the range accordingly.
		 * If there is any mis-alignment within the range:
		 *     if "copy":
		 *         we can do immediate-copy instead of copy-on-write,
		 *     else:
		 *         no way to remap and share; fail.
		 */
		/*
		 * First pass: only count problems; overmap_start/overmap_end
		 * act as 0/1 flags here (see the asserts below).
		 */
		for (entry = vm_map_copy_first_entry(copy_map);
		    entry != vm_map_copy_to_entry(copy_map);
		    entry = entry->vme_next) {
			vm_object_offset_t object_offset_start, object_offset_end;

			object_offset_start = VME_OFFSET(entry);
			object_offset_end = object_offset_start;
			object_offset_end += entry->vme_end - entry->vme_start;
			if (object_offset_start & target_page_mask) {
				if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
					overmap_start++;
				} else {
					misalignments++;
				}
			}
			if (object_offset_end & target_page_mask) {
				if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
					overmap_end++;
				} else {
					misalignments++;
				}
			}
		}
	}
	entry = VM_MAP_ENTRY_NULL;

	/* decide how to deal with misalignments */
	assert(overmap_start <= 1);
	assert(overmap_end <= 1);
	if (!overmap_start && !overmap_end && !misalignments) {
		/* copy_map is properly aligned for target_map ... */
		if (*trimmed_start_p) {
			/* ... but we trimmed it, so still need to adjust */
		} else {
			/* ... and we didn't trim anything: we're done */
			if (target_copy_map == VM_MAP_COPY_NULL) {
				target_copy_map = copy_map;
			}
			*target_copy_map_p = target_copy_map;
			*overmap_start_p = 0;
			*overmap_end_p = 0;
			DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
			return KERN_SUCCESS;
		}
	} else if (misalignments && !copy) {
		/* can't "share" if misaligned */
		DEBUG4K_ADJUST("unsupported sharing\n");
#if MACH_ASSERT
		if (debug4k_panic_on_misaligned_sharing) {
			panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
		}
#endif /* MACH_ASSERT */
		DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
		return KERN_NOT_SUPPORTED;
	} else {
		/* can't virtual-copy if misaligned (but can physical-copy) */
		DEBUG4K_ADJUST("mis-aligned copying\n");
	}

	/* get a "target_copy_map" if needed and switch to it */
	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
	copy_map = target_copy_map;

	if (misalignments && copy) {
		vm_map_size_t target_copy_map_size;

		/*
		 * Can't do copy-on-write with misaligned mappings.
		 * Replace the mappings with a physical copy of the original
		 * mappings' contents.
		 */
		target_copy_map_size = target_copy_map->size;
		kern_return_t kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		*target_copy_map_p = target_copy_map;
		*overmap_start_p = 0;
		/* physcopy may have grown the map to target-page alignment */
		*overmap_end_p = target_copy_map->size - target_copy_map_size;
		DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
		return KERN_SUCCESS;
	}

	/* apply the adjustments */
	misalignments = 0;
	overmap_start = 0;
	overmap_end = 0;
	/* remove copy_map->offset, so that everything starts at offset 0 */
	addr_adjustment = copy_map->offset;
	/* also remove whatever we trimmed from the start */
	addr_adjustment += *trimmed_start_p;
	for (target_entry = vm_map_copy_first_entry(target_copy_map);
	    target_entry != vm_map_copy_to_entry(target_copy_map);
	    target_entry = target_entry->vme_next) {
		vm_object_offset_t object_offset_start, object_offset_end;

		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
		object_offset_start = VME_OFFSET(target_entry);
		if (object_offset_start & target_page_mask) {
			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
			if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
				/*
				 * start of 1st entry is mis-aligned:
				 * re-adjust by over-mapping.
				 */
				overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
			} else {
				misalignments++;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
				assert(copy);
			}
		}

		/* grow the first entry backwards; shift the others forward */
		if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
			target_size += overmap_start;
		} else {
			target_entry->vme_start += overmap_start;
		}
		target_entry->vme_end += overmap_start;

		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
		if (object_offset_end & target_page_mask) {
			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
			if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
				/*
				 * end of last entry is mis-aligned: re-adjust by over-mapping.
				 */
				overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
				target_entry->vme_end += overmap_end;
				target_size += overmap_end;
			} else {
				misalignments++;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
				assert(copy);
			}
		}
		/* rebase the entry's addresses to start at 0 */
		target_entry->vme_start -= addr_adjustment;
		target_entry->vme_end -= addr_adjustment;
		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
	}

	target_copy_map->size = target_size;
	target_copy_map->offset += overmap_start;
	target_copy_map->offset -= addr_adjustment;
	target_copy_map->cpy_hdr.page_shift = target_page_shift;

//	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
//	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
	assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
	assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));

	*target_copy_map_p = target_copy_map;
	*overmap_start_p = overmap_start;
	*overmap_end_p = overmap_end;

	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
	return KERN_SUCCESS;
}
18266 
/*
 * vm_map_range_physical_size:
 *	Compute, in *phys_size, the amount of address space the range
 *	[start, start+size) of "map" would occupy once aligned to the
 *	kernel's native page size.  For maps whose page size already
 *	matches PAGE_SIZE this is just the map-page-rounded size;
 *	otherwise the range is extracted and run through
 *	vm_map_copy_adjust_to_target() against kernel_map to account
 *	for trimming and over-mapping.
 *	Returns KERN_SUCCESS, KERN_INVALID_ARGUMENT on wraparound, or
 *	the error from vm_map_copy_extract(); *phys_size is 0 on error.
 */
kern_return_t
vm_map_range_physical_size(
	vm_map_t         map,
	vm_map_address_t start,
	mach_vm_size_t   size,
	mach_vm_size_t * phys_size)
{
	kern_return_t   kr;
	vm_map_copy_t   copy_map, target_copy_map;
	vm_map_offset_t adjusted_start, adjusted_end;
	vm_map_size_t   adjusted_size;
	vm_prot_t       cur_prot, max_prot;
	vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
	vm_map_kernel_flags_t vmk_flags;

	/* empty range: nothing to measure */
	if (size == 0) {
		DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
		*phys_size = 0;
		return KERN_SUCCESS;
	}

	/* round the range out to the map's own page size */
	adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
	adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
	if (__improbable(os_add_overflow(start, size, &end) ||
	    adjusted_end <= adjusted_start)) {
		/* wraparound */
		printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
		*phys_size = 0;
		return KERN_INVALID_ARGUMENT;
	}
	assert(adjusted_end > adjusted_start);
	adjusted_size = adjusted_end - adjusted_start;
	*phys_size = adjusted_size;
	if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
		/* map pages are already native-sized: no adjustment needed */
		return KERN_SUCCESS;
	}
	if (start == 0) {
		/*
		 * NOTE(review): start == 0 is special-cased with plain
		 * PAGE_MASK rounding instead of the extract+adjust path --
		 * presumably because address 0 can't be extracted; confirm.
		 */
		adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
		adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
		if (__improbable(adjusted_end <= adjusted_start)) {
			/* wraparound */
			printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
			*phys_size = 0;
			return KERN_INVALID_ARGUMENT;
		}
		assert(adjusted_end > adjusted_start);
		adjusted_size = adjusted_end - adjusted_start;
		*phys_size = adjusted_size;
		return KERN_SUCCESS;
	}

	/* extract the range (shared, same map) to measure it */
	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
	vmk_flags.vmkf_copy_pageable = TRUE;
	vmk_flags.vmkf_copy_same_map = TRUE;
	assert(adjusted_size != 0);
	cur_prot = VM_PROT_NONE; /* legacy mode */
	max_prot = VM_PROT_NONE; /* legacy mode */
	kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
	    FALSE /* copy */,
	    &copy_map,
	    &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
	    vmk_flags);
	if (kr != KERN_SUCCESS) {
		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
		//assert(0);
		*phys_size = 0;
		return kr;
	}
	assert(copy_map != VM_MAP_COPY_NULL);
	target_copy_map = copy_map;
	DEBUG4K_ADJUST("adjusting...\n");
	/* adjust against kernel_map to get the native-page-size footprint */
	kr = vm_map_copy_adjust_to_target(
		copy_map,
		start - adjusted_start, /* offset */
		size, /* size */
		kernel_map,
		FALSE,                          /* copy */
		&target_copy_map,
		&overmap_start,
		&overmap_end,
		&trimmed_start);
	if (kr == KERN_SUCCESS) {
		if (target_copy_map->size != *phys_size) {
			DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
		}
		*phys_size = target_copy_map->size;
	} else {
		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
		//assert(0);
		*phys_size = 0;
	}
	/* we only needed the copy map to measure it: discard it */
	vm_map_copy_discard(copy_map);
	copy_map = VM_MAP_COPY_NULL;

	return kr;
}
18363 
18364 
/*
 * memory_entry_check_for_adjustment:
 *	Given a named-entry port, check whether its backing copy map
 *	would need over-mapping adjustments (reported via
 *	*overmap_start / *overmap_end) to be mapped with "src_map"'s
 *	page size.  Only performs the check when src_map uses pages
 *	smaller than the kernel's native page size.
 *	Returns KERN_SUCCESS or the error from
 *	vm_map_copy_adjust_to_target().
 */
kern_return_t
memory_entry_check_for_adjustment(
	vm_map_t                        src_map,
	ipc_port_t                      port,
	vm_map_offset_t         *overmap_start,
	vm_map_offset_t         *overmap_end)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_copy_t copy_map = VM_MAP_COPY_NULL, target_copy_map = VM_MAP_COPY_NULL;

	assert(port);
	assertf(ip_kotype(port) == IKOT_NAMED_ENTRY, "Port Type expected: %d...received:%d\n", IKOT_NAMED_ENTRY, ip_kotype(port));

	vm_named_entry_t        named_entry;

	named_entry = mach_memory_entry_from_port(port);
	/* hold the lock while inspecting the entry's backing copy map */
	named_entry_lock(named_entry);
	copy_map = named_entry->backing.copy;
	target_copy_map = copy_map;

	if (src_map && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT) {
		vm_map_offset_t trimmed_start;

		trimmed_start = 0;
		DEBUG4K_ADJUST("adjusting...\n");
		/*
		 * NOTE(review): if the adjustment allocates a new
		 * target_copy_map (!= copy_map), it is not discarded here;
		 * only the overmap amounts are reported -- confirm whether
		 * the adjusted map is intentionally dropped.
		 */
		kr = vm_map_copy_adjust_to_target(
			copy_map,
			0, /* offset */
			copy_map->size, /* size */
			src_map,
			FALSE, /* copy */
			&target_copy_map,
			overmap_start,
			overmap_end,
			&trimmed_start);
		assert(trimmed_start == 0);
	}
	named_entry_unlock(named_entry);

	return kr;
}
18406 
18407 
18408 /*
18409  *	Routine:	vm_remap
18410  *
18411  *			Map portion of a task's address space.
18412  *			Mapped region must not overlap more than
18413  *			one vm memory object. Protections and
18414  *			inheritance attributes remain the same
18415  *			as in the original task and are	out parameters.
18416  *			Source and Target task can be identical
18417  *			Other attributes are identical as for vm_map()
18418  */
18419 kern_return_t
vm_map_remap(vm_map_t target_map,vm_map_address_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_t src_map,vm_map_offset_t memory_address,boolean_t copy,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance)18420 vm_map_remap(
18421 	vm_map_t                target_map,
18422 	vm_map_address_t        *address,
18423 	vm_map_size_t           size,
18424 	vm_map_offset_t         mask,
18425 	vm_map_kernel_flags_t   vmk_flags,
18426 	vm_map_t                src_map,
18427 	vm_map_offset_t         memory_address,
18428 	boolean_t               copy,
18429 	vm_prot_t               *cur_protection, /* IN/OUT */
18430 	vm_prot_t               *max_protection, /* IN/OUT */
18431 	vm_inherit_t            inheritance)
18432 {
18433 	kern_return_t           result;
18434 	vm_map_entry_t          entry;
18435 	vm_map_entry_t          insp_entry = VM_MAP_ENTRY_NULL;
18436 	vm_map_entry_t          new_entry;
18437 	vm_map_copy_t           copy_map;
18438 	vm_map_offset_t         offset_in_mapping;
18439 	vm_map_size_t           target_size = 0;
18440 	vm_map_size_t           src_page_mask, target_page_mask;
18441 	vm_map_offset_t         overmap_start, overmap_end, trimmed_start;
18442 	vm_map_offset_t         initial_memory_address;
18443 	vm_map_size_t           initial_size;
18444 	VM_MAP_ZAP_DECLARE(zap_list);
18445 
18446 	if (target_map == VM_MAP_NULL) {
18447 		return KERN_INVALID_ARGUMENT;
18448 	}
18449 
18450 	initial_memory_address = memory_address;
18451 	initial_size = size;
18452 	src_page_mask = VM_MAP_PAGE_MASK(src_map);
18453 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
18454 
18455 	switch (inheritance) {
18456 	case VM_INHERIT_NONE:
18457 	case VM_INHERIT_COPY:
18458 	case VM_INHERIT_SHARE:
18459 		if (size != 0 && src_map != VM_MAP_NULL) {
18460 			break;
18461 		}
18462 		OS_FALLTHROUGH;
18463 	default:
18464 		return KERN_INVALID_ARGUMENT;
18465 	}
18466 
18467 	if (src_page_mask != target_page_mask) {
18468 		if (copy) {
18469 			DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
18470 		} else {
18471 			DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
18472 		}
18473 	}
18474 
18475 	/*
18476 	 * If the user is requesting that we return the address of the
18477 	 * first byte of the data (rather than the base of the page),
18478 	 * then we use different rounding semantics: specifically,
18479 	 * we assume that (memory_address, size) describes a region
18480 	 * all of whose pages we must cover, rather than a base to be truncated
18481 	 * down and a size to be added to that base.  So we figure out
18482 	 * the highest page that the requested region includes and make
18483 	 * sure that the size will cover it.
18484 	 *
18485 	 * The key example we're worried about it is of the form:
18486 	 *
18487 	 *              memory_address = 0x1ff0, size = 0x20
18488 	 *
18489 	 * With the old semantics, we round down the memory_address to 0x1000
18490 	 * and round up the size to 0x1000, resulting in our covering *only*
18491 	 * page 0x1000.  With the new semantics, we'd realize that the region covers
18492 	 * 0x1ff0-0x2010, and compute a size of 0x2000.  Thus, we cover both page
18493 	 * 0x1000 and page 0x2000 in the region we remap.
18494 	 */
18495 	if (vmk_flags.vmf_return_data_addr) {
18496 		vm_map_offset_t range_start, range_end;
18497 
18498 		range_start = vm_map_trunc_page(memory_address, src_page_mask);
18499 		range_end = vm_map_round_page(memory_address + size, src_page_mask);
18500 		memory_address = range_start;
18501 		size = range_end - range_start;
18502 		offset_in_mapping = initial_memory_address - memory_address;
18503 	} else {
18504 		/*
18505 		 * IMPORTANT:
18506 		 * This legacy code path is broken: for the range mentioned
18507 		 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
18508 		 * two 4k pages, it yields [ memory_address = 0x1000,
18509 		 * size = 0x1000 ], which covers only the first 4k page.
18510 		 * BUT some code unfortunately depends on this bug, so we
18511 		 * can't fix it without breaking something.
18512 		 * New code should get automatically opted in the new
18513 		 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flags.
18514 		 */
18515 		offset_in_mapping = 0;
18516 		memory_address = vm_map_trunc_page(memory_address, src_page_mask);
18517 		size = vm_map_round_page(size, src_page_mask);
18518 		initial_memory_address = memory_address;
18519 		initial_size = size;
18520 	}
18521 
18522 
18523 	if (size == 0) {
18524 		return KERN_INVALID_ARGUMENT;
18525 	}
18526 
18527 	if (vmk_flags.vmf_resilient_media) {
18528 		/* must be copy-on-write to be "media resilient" */
18529 		if (!copy) {
18530 			return KERN_INVALID_ARGUMENT;
18531 		}
18532 	}
18533 
18534 	vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
18535 	vmk_flags.vmkf_copy_same_map = (src_map == target_map);
18536 
18537 	assert(size != 0);
18538 	result = vm_map_copy_extract(src_map,
18539 	    memory_address,
18540 	    size,
18541 	    copy, &copy_map,
18542 	    cur_protection, /* IN/OUT */
18543 	    max_protection, /* IN/OUT */
18544 	    inheritance,
18545 	    vmk_flags);
18546 	if (result != KERN_SUCCESS) {
18547 		return result;
18548 	}
18549 	assert(copy_map != VM_MAP_COPY_NULL);
18550 
18551 	/*
18552 	 * Handle the policy for vm map ranges
18553 	 *
18554 	 * If the maps differ, the target_map policy applies like for vm_map()
18555 	 * For same mapping remaps, we preserve the range.
18556 	 */
18557 	if (vmk_flags.vmkf_copy_same_map) {
18558 		vmk_flags.vmkf_range_id = copy_map->orig_range;
18559 	} else {
18560 		vm_map_kernel_flags_update_range_id(&vmk_flags, target_map);
18561 	}
18562 
18563 	overmap_start = 0;
18564 	overmap_end = 0;
18565 	trimmed_start = 0;
18566 	target_size = size;
18567 	if (src_page_mask != target_page_mask) {
18568 		vm_map_copy_t target_copy_map;
18569 
18570 		target_copy_map = copy_map; /* can modify "copy_map" itself */
18571 		DEBUG4K_ADJUST("adjusting...\n");
18572 		result = vm_map_copy_adjust_to_target(
18573 			copy_map,
18574 			offset_in_mapping, /* offset */
18575 			initial_size,
18576 			target_map,
18577 			copy,
18578 			&target_copy_map,
18579 			&overmap_start,
18580 			&overmap_end,
18581 			&trimmed_start);
18582 		if (result != KERN_SUCCESS) {
18583 			DEBUG4K_COPY("failed to adjust 0x%x\n", result);
18584 			vm_map_copy_discard(copy_map);
18585 			return result;
18586 		}
18587 		if (trimmed_start == 0) {
18588 			/* nothing trimmed: no adjustment needed */
18589 		} else if (trimmed_start >= offset_in_mapping) {
18590 			/* trimmed more than offset_in_mapping: nothing left */
18591 			assert(overmap_start == 0);
18592 			assert(overmap_end == 0);
18593 			offset_in_mapping = 0;
18594 		} else {
18595 			/* trimmed some of offset_in_mapping: adjust */
18596 			assert(overmap_start == 0);
18597 			assert(overmap_end == 0);
18598 			offset_in_mapping -= trimmed_start;
18599 		}
18600 		offset_in_mapping += overmap_start;
18601 		target_size = target_copy_map->size;
18602 	}
18603 
18604 	/*
18605 	 * Allocate/check a range of free virtual address
18606 	 * space for the target
18607 	 */
18608 	*address = vm_map_trunc_page(*address, target_page_mask);
18609 	vm_map_lock(target_map);
18610 	target_size = vm_map_round_page(target_size, target_page_mask);
18611 	result = vm_map_remap_range_allocate(target_map, address,
18612 	    target_size, mask, vmk_flags,
18613 	    &insp_entry, &zap_list);
18614 
18615 	for (entry = vm_map_copy_first_entry(copy_map);
18616 	    entry != vm_map_copy_to_entry(copy_map);
18617 	    entry = new_entry) {
18618 		new_entry = entry->vme_next;
18619 		vm_map_copy_entry_unlink(copy_map, entry);
18620 		if (result == KERN_SUCCESS) {
18621 			if (vmk_flags.vmkf_remap_prot_copy) {
18622 				/*
18623 				 * This vm_map_remap() is for a
18624 				 * vm_protect(VM_PROT_COPY), so the caller
18625 				 * expects to be allowed to add write access
18626 				 * to this new mapping.  This is done by
18627 				 * adding VM_PROT_WRITE to each entry's
18628 				 * max_protection... unless some security
18629 				 * settings disallow it.
18630 				 */
18631 				bool allow_write = false;
18632 				if (entry->vme_permanent) {
18633 					/* immutable mapping... */
18634 					if ((entry->max_protection & VM_PROT_EXECUTE) &&
18635 					    developer_mode_state()) {
18636 						/*
18637 						 * ... but executable and
18638 						 * possibly being debugged,
18639 						 * so let's allow it to become
18640 						 * writable, for breakpoints
18641 						 * and dtrace probes, for
18642 						 * example.
18643 						 */
18644 						allow_write = true;
18645 					} else {
18646 						printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
18647 						    proc_selfpid(),
18648 						    (get_bsdtask_info(current_task())
18649 						    ? proc_name_address(get_bsdtask_info(current_task()))
18650 						    : "?"),
18651 						    (uint64_t)memory_address,
18652 						    (uint64_t)size,
18653 						    entry->protection,
18654 						    entry->max_protection,
18655 						    developer_mode_state());
18656 						DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
18657 						    vm_map_entry_t, entry,
18658 						    vm_map_offset_t, entry->vme_start,
18659 						    vm_map_offset_t, entry->vme_end,
18660 						    vm_prot_t, entry->protection,
18661 						    vm_prot_t, entry->max_protection,
18662 						    int, VME_ALIAS(entry));
18663 					}
18664 				} else {
18665 					allow_write = true;
18666 				}
18667 
18668 				/*
18669 				 * VM_PROT_COPY: allow this mapping to become
18670 				 * writable, unless it was "permanent".
18671 				 */
18672 				if (allow_write) {
18673 					entry->max_protection |= VM_PROT_WRITE;
18674 				}
18675 			}
18676 			if (vmk_flags.vmf_resilient_codesign) {
18677 				/* no codesigning -> read-only access */
18678 				entry->max_protection = VM_PROT_READ;
18679 				entry->protection = VM_PROT_READ;
18680 				entry->vme_resilient_codesign = TRUE;
18681 			}
18682 			entry->vme_start += *address;
18683 			entry->vme_end += *address;
18684 			assert(!entry->map_aligned);
18685 			if (vmk_flags.vmf_resilient_media &&
18686 			    !entry->is_sub_map &&
18687 			    (VME_OBJECT(entry) == VM_OBJECT_NULL ||
18688 			    VME_OBJECT(entry)->internal)) {
18689 				entry->vme_resilient_media = TRUE;
18690 			}
18691 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
18692 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
18693 			assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
18694 			vm_map_store_entry_link(target_map, insp_entry, entry,
18695 			    vmk_flags);
18696 			insp_entry = entry;
18697 		} else {
18698 			if (!entry->is_sub_map) {
18699 				vm_object_deallocate(VME_OBJECT(entry));
18700 			} else {
18701 				vm_map_deallocate(VME_SUBMAP(entry));
18702 			}
18703 			vm_map_copy_entry_dispose(entry);
18704 		}
18705 	}
18706 
18707 	if (vmk_flags.vmf_resilient_codesign) {
18708 		*cur_protection = VM_PROT_READ;
18709 		*max_protection = VM_PROT_READ;
18710 	}
18711 
18712 	if (result == KERN_SUCCESS) {
18713 		target_map->size += target_size;
18714 		SAVE_HINT_MAP_WRITE(target_map, insp_entry);
18715 	}
18716 	vm_map_unlock(target_map);
18717 
18718 	vm_map_zap_dispose(&zap_list);
18719 
18720 	if (result == KERN_SUCCESS && target_map->wiring_required) {
18721 		result = vm_map_wire_kernel(target_map, *address,
18722 		    *address + size, *cur_protection, VM_KERN_MEMORY_MLOCK,
18723 		    TRUE);
18724 	}
18725 
18726 	/*
18727 	 * If requested, return the address of the data pointed to by the
18728 	 * request, rather than the base of the resulting page.
18729 	 */
18730 	if (vmk_flags.vmf_return_data_addr) {
18731 		*address += offset_in_mapping;
18732 	}
18733 
18734 	if (src_page_mask != target_page_mask) {
18735 		DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx  result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)size, copy, target_map, (uint64_t)*address, (uint64_t)offset_in_mapping, result);
18736 	}
18737 	vm_map_copy_discard(copy_map);
18738 	copy_map = VM_MAP_COPY_NULL;
18739 
18740 	return result;
18741 }
18742 
18743 /*
18744  *	Routine:	vm_map_remap_range_allocate
18745  *
18746  *	Description:
18747  *		Allocate a range in the specified virtual address map.
18748  *		returns the address and the map entry just before the allocated
18749  *		range
18750  *
18751  *	Map must be locked.
18752  */
18753 
18754 static kern_return_t
vm_map_remap_range_allocate(vm_map_t map,vm_map_address_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_entry_t * map_entry,vm_map_zap_t zap_list)18755 vm_map_remap_range_allocate(
18756 	vm_map_t                map,
18757 	vm_map_address_t        *address,       /* IN/OUT */
18758 	vm_map_size_t           size,
18759 	vm_map_offset_t         mask,
18760 	vm_map_kernel_flags_t   vmk_flags,
18761 	vm_map_entry_t          *map_entry,     /* OUT */
18762 	vm_map_zap_t            zap_list)
18763 {
18764 	vm_map_entry_t  entry;
18765 	vm_map_offset_t start;
18766 	kern_return_t   kr;
18767 
18768 	start = *address;
18769 
18770 	if (!vmk_flags.vmf_fixed) {
18771 		kr = vm_map_locate_space(map, size, mask, vmk_flags,
18772 		    &start, &entry);
18773 		if (kr != KERN_SUCCESS) {
18774 			return kr;
18775 		}
18776 		*address = start;
18777 	} else {
18778 		vm_map_offset_t effective_min_offset, effective_max_offset;
18779 		vm_map_entry_t  temp_entry;
18780 		vm_map_offset_t end;
18781 
18782 		effective_min_offset = map->min_offset;
18783 		effective_max_offset = map->max_offset;
18784 
18785 		/*
18786 		 *	Verify that:
18787 		 *		the address doesn't itself violate
18788 		 *		the mask requirement.
18789 		 */
18790 
18791 		if ((start & mask) != 0) {
18792 			return KERN_NO_SPACE;
18793 		}
18794 
18795 #if CONFIG_MAP_RANGES
18796 		if (map->uses_user_ranges) {
18797 			struct mach_vm_range r;
18798 
18799 			vm_map_user_range_resolve(map, start, 1, &r);
18800 			if (r.max_address == 0) {
18801 				return KERN_INVALID_ADDRESS;
18802 			}
18803 
18804 			effective_min_offset = r.min_address;
18805 			effective_max_offset = r.max_address;
18806 		}
18807 #endif /* CONFIG_MAP_RANGES */
18808 		if (map == kernel_map) {
18809 			mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
18810 			effective_min_offset = r->min_address;
18811 			effective_min_offset = r->max_address;
18812 		}
18813 
18814 		/*
18815 		 *	...	the address is within bounds
18816 		 */
18817 
18818 		end = start + size;
18819 
18820 		if ((start < effective_min_offset) ||
18821 		    (end > effective_max_offset) ||
18822 		    (start >= end)) {
18823 			return KERN_INVALID_ADDRESS;
18824 		}
18825 
18826 		/*
18827 		 * If we're asked to overwrite whatever was mapped in that
18828 		 * range, first deallocate that range.
18829 		 */
18830 		if (vmk_flags.vmf_overwrite) {
18831 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
18832 
18833 			/*
18834 			 * We use a "zap_list" to avoid having to unlock
18835 			 * the "map" in vm_map_delete(), which would compromise
18836 			 * the atomicity of the "deallocate" and then "remap"
18837 			 * combination.
18838 			 */
18839 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
18840 
18841 			if (vmk_flags.vmkf_overwrite_immutable) {
18842 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
18843 			}
18844 			if (vmk_flags.vmkf_remap_prot_copy) {
18845 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
18846 			}
18847 			kr = vm_map_delete(map, start, end, remove_flags,
18848 			    KMEM_GUARD_NONE, zap_list).kmr_return;
18849 			if (kr != KERN_SUCCESS) {
18850 				/* XXX FBDP restore zap_list? */
18851 				return kr;
18852 			}
18853 		}
18854 
18855 		/*
18856 		 *	...	the starting address isn't allocated
18857 		 */
18858 
18859 		if (vm_map_lookup_entry(map, start, &temp_entry)) {
18860 			return KERN_NO_SPACE;
18861 		}
18862 
18863 		entry = temp_entry;
18864 
18865 		/*
18866 		 *	...	the next region doesn't overlap the
18867 		 *		end point.
18868 		 */
18869 
18870 		if ((entry->vme_next != vm_map_to_entry(map)) &&
18871 		    (entry->vme_next->vme_start < end)) {
18872 			return KERN_NO_SPACE;
18873 		}
18874 	}
18875 	*map_entry = entry;
18876 	return KERN_SUCCESS;
18877 }
18878 
18879 /*
18880  *	vm_map_switch:
18881  *
18882  *	Set the address map for the current thread to the specified map
18883  */
18884 
18885 vm_map_t
vm_map_switch(vm_map_t map)18886 vm_map_switch(
18887 	vm_map_t        map)
18888 {
18889 	thread_t        thread = current_thread();
18890 	vm_map_t        oldmap = thread->map;
18891 
18892 
18893 	/*
18894 	 *	Deactivate the current map and activate the requested map
18895 	 */
18896 	mp_disable_preemption();
18897 	PMAP_SWITCH_USER(thread, map, cpu_number());
18898 	mp_enable_preemption();
18899 	return oldmap;
18900 }
18901 
18902 
18903 /*
18904  *	Routine:	vm_map_write_user
18905  *
18906  *	Description:
18907  *		Copy out data from a kernel space into space in the
18908  *		destination map. The space must already exist in the
18909  *		destination map.
18910  *		NOTE:  This routine should only be called by threads
18911  *		which can block on a page fault. i.e. kernel mode user
18912  *		threads.
18913  *
18914  */
18915 kern_return_t
vm_map_write_user(vm_map_t map,void * src_p,vm_map_address_t dst_addr,vm_size_t size)18916 vm_map_write_user(
18917 	vm_map_t                map,
18918 	void                    *src_p,
18919 	vm_map_address_t        dst_addr,
18920 	vm_size_t               size)
18921 {
18922 	kern_return_t   kr = KERN_SUCCESS;
18923 
18924 	if (current_map() == map) {
18925 		if (copyout(src_p, dst_addr, size)) {
18926 			kr = KERN_INVALID_ADDRESS;
18927 		}
18928 	} else {
18929 		vm_map_t        oldmap;
18930 
18931 		/* take on the identity of the target map while doing */
18932 		/* the transfer */
18933 
18934 		vm_map_reference(map);
18935 		oldmap = vm_map_switch(map);
18936 		if (copyout(src_p, dst_addr, size)) {
18937 			kr = KERN_INVALID_ADDRESS;
18938 		}
18939 		vm_map_switch(oldmap);
18940 		vm_map_deallocate(map);
18941 	}
18942 	return kr;
18943 }
18944 
18945 /*
18946  *	Routine:	vm_map_read_user
18947  *
18948  *	Description:
18949  *		Copy in data from a user space source map into the
18950  *		kernel map. The space must already exist in the
18951  *		kernel map.
18952  *		NOTE:  This routine should only be called by threads
18953  *		which can block on a page fault. i.e. kernel mode user
18954  *		threads.
18955  *
18956  */
18957 kern_return_t
vm_map_read_user(vm_map_t map,vm_map_address_t src_addr,void * dst_p,vm_size_t size)18958 vm_map_read_user(
18959 	vm_map_t                map,
18960 	vm_map_address_t        src_addr,
18961 	void                    *dst_p,
18962 	vm_size_t               size)
18963 {
18964 	kern_return_t   kr = KERN_SUCCESS;
18965 
18966 	if (current_map() == map) {
18967 		if (copyin(src_addr, dst_p, size)) {
18968 			kr = KERN_INVALID_ADDRESS;
18969 		}
18970 	} else {
18971 		vm_map_t        oldmap;
18972 
18973 		/* take on the identity of the target map while doing */
18974 		/* the transfer */
18975 
18976 		vm_map_reference(map);
18977 		oldmap = vm_map_switch(map);
18978 		if (copyin(src_addr, dst_p, size)) {
18979 			kr = KERN_INVALID_ADDRESS;
18980 		}
18981 		vm_map_switch(oldmap);
18982 		vm_map_deallocate(map);
18983 	}
18984 	return kr;
18985 }
18986 
18987 
18988 /*
18989  *	vm_map_check_protection:
18990  *
18991  *	Assert that the target map allows the specified
18992  *	privilege on the entire address region given.
18993  *	The entire region must be allocated.
18994  */
18995 boolean_t
vm_map_check_protection(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t protection)18996 vm_map_check_protection(vm_map_t map, vm_map_offset_t start,
18997     vm_map_offset_t end, vm_prot_t protection)
18998 {
18999 	vm_map_entry_t entry;
19000 	vm_map_entry_t tmp_entry;
19001 
19002 	vm_map_lock(map);
19003 
19004 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
19005 		vm_map_unlock(map);
19006 		return FALSE;
19007 	}
19008 
19009 	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19010 		vm_map_unlock(map);
19011 		return FALSE;
19012 	}
19013 
19014 	entry = tmp_entry;
19015 
19016 	while (start < end) {
19017 		if (entry == vm_map_to_entry(map)) {
19018 			vm_map_unlock(map);
19019 			return FALSE;
19020 		}
19021 
19022 		/*
19023 		 *	No holes allowed!
19024 		 */
19025 
19026 		if (start < entry->vme_start) {
19027 			vm_map_unlock(map);
19028 			return FALSE;
19029 		}
19030 
19031 		/*
19032 		 * Check protection associated with entry.
19033 		 */
19034 
19035 		if ((entry->protection & protection) != protection) {
19036 			vm_map_unlock(map);
19037 			return FALSE;
19038 		}
19039 
19040 		/* go to next entry */
19041 
19042 		start = entry->vme_end;
19043 		entry = entry->vme_next;
19044 	}
19045 	vm_map_unlock(map);
19046 	return TRUE;
19047 }
19048 
/*
 *	vm_map_purgable_control:
 *
 *	Apply a purgeability control operation ("control") to the purgeable
 *	object mapped at "address" in "map".  For SET operations, "*state"
 *	is the requested new state; for GET, the current state is returned
 *	in "*state".  VM_PURGABLE_PURGE_ALL purges every volatile object
 *	system-wide and ignores "address"/"state".
 *
 *	Returns KERN_INVALID_ARGUMENT for a bad map/control/state or a
 *	non-purgeable object, KERN_INVALID_ADDRESS if "address" isn't a
 *	valid non-submap mapping, and KERN_PROTECTION_FAILURE when trying
 *	to change state through a non-writable mapping.
 */
kern_return_t
vm_map_purgable_control(
	vm_map_t                map,
	vm_map_offset_t         address,
	vm_purgable_t           control,
	int                     *state)
{
	vm_map_entry_t          entry;
	vm_object_t             object;
	kern_return_t           kr;
	boolean_t               was_nonvolatile;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlaying object.  Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (control != VM_PURGABLE_SET_STATE &&
	    control != VM_PURGABLE_GET_STATE &&
	    control != VM_PURGABLE_PURGE_ALL &&
	    control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (control == VM_PURGABLE_PURGE_ALL) {
		/* system-wide purge: no per-mapping work to do */
		vm_purgeable_object_purge_all();
		return KERN_SUCCESS;
	}

	/* for SET operations, reject state values with invalid bits */
	if ((control == VM_PURGABLE_SET_STATE ||
	    control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
	    (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
	    ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
		return KERN_INVALID_ARGUMENT;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->protection & VM_PROT_WRITE) == 0 &&
	    control != VM_PURGABLE_GET_STATE) {
		/*
		 * Can't apply purgable controls to something you can't write.
		 */
		vm_map_unlock_read(map);
		return KERN_PROTECTION_FAILURE;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL ||
	    object->purgable == VM_PURGABLE_DENY) {
		/*
		 * Object must already be present and be purgeable.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	/* object lock taken while still holding the map read lock */
	vm_object_lock(object);

#if 00
	if (VME_OFFSET(entry) != 0 ||
	    entry->vme_end - entry->vme_start != object->vo_size) {
		/*
		 * Can only apply purgable controls to the whole (existing)
		 * object at once.
		 */
		vm_map_unlock_read(map);
		vm_object_unlock(object);
		return KERN_INVALID_ARGUMENT;
	}
#endif

	assert(!entry->is_sub_map);
	assert(!entry->use_pmap); /* purgeable has its own accounting */

	/*
	 * The map lock is no longer needed: the object lock keeps the
	 * object stable while we change its purgeability state.
	 */
	vm_map_unlock_read(map);

	was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);

	kr = vm_object_purgable_control(object, control, state);

	/*
	 * If the kernel itself just made this object volatile, record it
	 * as the "volatilizer" (debug builds only).
	 */
	if (was_nonvolatile &&
	    object->purgable != VM_PURGABLE_NONVOLATILE &&
	    map->pmap == kernel_pmap) {
#if DEBUG
		object->vo_purgeable_volatilizer = kernel_task;
#endif /* DEBUG */
	}

	vm_object_unlock(object);

	return kr;
}
19153 
/*
 *	vm_map_footprint_query_page_info:
 *
 *	Compute the VM_PAGE_QUERY_* disposition bits, for "footprint"
 *	reporting purposes, of the page at "curr_s_offset" within
 *	"map_entry" of "map".  The result is returned in *disposition_p
 *	(0 means "does not contribute / nothing mapped").
 *
 *	The map must be locked and must not be using corpse footprint data.
 */
void
vm_map_footprint_query_page_info(
	vm_map_t        map,
	vm_map_entry_t  map_entry,
	vm_map_offset_t curr_s_offset,
	int             *disposition_p)
{
	int             pmap_disp;
	vm_object_t     object = VM_OBJECT_NULL;
	int             disposition;
	int             effective_page_size;

	vm_map_lock_assert_held(map);
	assert(!map->has_corpse_footprint);
	assert(curr_s_offset >= map_entry->vme_start);
	assert(curr_s_offset < map_entry->vme_end);

	if (map_entry->is_sub_map) {
		if (!map_entry->use_pmap) {
			/* nested pmap: no footprint */
			*disposition_p = 0;
			return;
		}
	} else {
		object = VME_OBJECT(map_entry);
		if (object == VM_OBJECT_NULL) {
			/* nothing mapped here: no need to ask */
			*disposition_p = 0;
			return;
		}
	}

	/* page size used to convert offsets to page indices below */
	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

	pmap_disp = 0;

	/*
	 * Query the pmap.
	 */
	pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);

	/*
	 * Compute this page's disposition.
	 */
	disposition = 0;

	/* deal with "alternate accounting" first */
	if (!map_entry->is_sub_map &&
	    object->vo_no_footprint) {
		/* does not count in footprint */
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
	} else if (!map_entry->is_sub_map &&
	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
	    (object->purgable == VM_PURGABLE_DENY &&
	    object->vo_ledger_tag)) &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    (object->resident_page_count +
		    vm_compressor_pager_get_count(object->pager)))) {
			/*
			 * Non-volatile purgeable object owned
			 * by this task: report the first
			 * "#resident + #compressed" pages as
			 * "resident" (to show that they
			 * contribute to the footprint) but not
			 * "dirty" (to avoid double-counting
			 * with the fake "non-volatile" region
			 * we'll report at the end of the
			 * address space to account for all
			 * (mapped or not) non-volatile memory
			 * owned by this task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    (object->purgable == VM_PURGABLE_VOLATILE ||
	    object->purgable == VM_PURGABLE_EMPTY) &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    object->wired_page_count)) {
			/*
			 * Volatile|empty purgeable object owned
			 * by this task: report the first
			 * "#wired" pages as "resident" (to
			 * show that they contribute to the
			 * footprint) but not "dirty" (to avoid
			 * double-counting with the fake
			 * "non-volatile" region we'll report
			 * at the end of the address space to
			 * account for all (mapped or not)
			 * non-volatile memory owned by this
			 * task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    map_entry->iokit_acct &&
	    object->internal &&
	    object->purgable == VM_PURGABLE_DENY) {
		/*
		 * Non-purgeable IOKit memory: phys_footprint
		 * includes the entire virtual mapping.
		 */
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
	} else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
	    PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
		/* alternate accounting */
#if __arm64__ && (DEVELOPMENT || DEBUG)
		if (map->pmap->footprint_was_suspended) {
			/*
			 * The assertion below can fail if dyld
			 * suspended footprint accounting
			 * while doing some adjustments to
			 * this page;  the mapping would say
			 * "use pmap accounting" but the page
			 * would be marked "alternate
			 * accounting".
			 */
		} else
#endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
		{
			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		}
		disposition = 0;
	} else {
		/* regular pmap-accounted page: trust the pmap's answer */
		if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
			disposition |= VM_PAGE_QUERY_PAGE_REF;
			if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
			} else {
				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
			}
			if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
				disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
			}
		} else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
			disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
		}
	}

	*disposition_p = disposition;
}
19311 
19312 kern_return_t
vm_map_page_query_internal(vm_map_t target_map,vm_map_offset_t offset,int * disposition,int * ref_count)19313 vm_map_page_query_internal(
19314 	vm_map_t        target_map,
19315 	vm_map_offset_t offset,
19316 	int             *disposition,
19317 	int             *ref_count)
19318 {
19319 	kern_return_t                   kr;
19320 	vm_page_info_basic_data_t       info;
19321 	mach_msg_type_number_t          count;
19322 
19323 	count = VM_PAGE_INFO_BASIC_COUNT;
19324 	kr = vm_map_page_info(target_map,
19325 	    offset,
19326 	    VM_PAGE_INFO_BASIC,
19327 	    (vm_page_info_t) &info,
19328 	    &count);
19329 	if (kr == KERN_SUCCESS) {
19330 		*disposition = info.disposition;
19331 		*ref_count = info.ref_count;
19332 	} else {
19333 		*disposition = 0;
19334 		*ref_count = 0;
19335 	}
19336 
19337 	return kr;
19338 }
19339 
19340 kern_return_t
vm_map_page_info(vm_map_t map,vm_map_offset_t offset,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)19341 vm_map_page_info(
19342 	vm_map_t                map,
19343 	vm_map_offset_t         offset,
19344 	vm_page_info_flavor_t   flavor,
19345 	vm_page_info_t          info,
19346 	mach_msg_type_number_t  *count)
19347 {
19348 	return vm_map_page_range_info_internal(map,
19349 	           offset, /* start of range */
19350 	           (offset + 1), /* this will get rounded in the call to the page boundary */
19351 	           (int)-1, /* effective_page_shift: unspecified */
19352 	           flavor,
19353 	           info,
19354 	           count);
19355 }
19356 
19357 kern_return_t
vm_map_page_range_info_internal(vm_map_t map,vm_map_offset_t start_offset,vm_map_offset_t end_offset,int effective_page_shift,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)19358 vm_map_page_range_info_internal(
19359 	vm_map_t                map,
19360 	vm_map_offset_t         start_offset,
19361 	vm_map_offset_t         end_offset,
19362 	int                     effective_page_shift,
19363 	vm_page_info_flavor_t   flavor,
19364 	vm_page_info_t          info,
19365 	mach_msg_type_number_t  *count)
19366 {
19367 	vm_map_entry_t          map_entry = VM_MAP_ENTRY_NULL;
19368 	vm_object_t             object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
19369 	vm_page_t               m = VM_PAGE_NULL;
19370 	kern_return_t           retval = KERN_SUCCESS;
19371 	int                     disposition = 0;
19372 	int                     ref_count = 0;
19373 	int                     depth = 0, info_idx = 0;
19374 	vm_page_info_basic_t    basic_info = 0;
19375 	vm_map_offset_t         offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
19376 	vm_map_offset_t         start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
19377 	boolean_t               do_region_footprint;
19378 	ledger_amount_t         ledger_resident, ledger_compressed;
19379 	int                     effective_page_size;
19380 	vm_map_offset_t         effective_page_mask;
19381 
19382 	switch (flavor) {
19383 	case VM_PAGE_INFO_BASIC:
19384 		if (*count != VM_PAGE_INFO_BASIC_COUNT) {
19385 			/*
19386 			 * The "vm_page_info_basic_data" structure was not
19387 			 * properly padded, so allow the size to be off by
19388 			 * one to maintain backwards binary compatibility...
19389 			 */
19390 			if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
19391 				return KERN_INVALID_ARGUMENT;
19392 			}
19393 		}
19394 		break;
19395 	default:
19396 		return KERN_INVALID_ARGUMENT;
19397 	}
19398 
19399 	if (effective_page_shift == -1) {
19400 		effective_page_shift = vm_self_region_page_shift_safely(map);
19401 		if (effective_page_shift == -1) {
19402 			return KERN_INVALID_ARGUMENT;
19403 		}
19404 	}
19405 	effective_page_size = (1 << effective_page_shift);
19406 	effective_page_mask = effective_page_size - 1;
19407 
19408 	do_region_footprint = task_self_region_footprint();
19409 	disposition = 0;
19410 	ref_count = 0;
19411 	depth = 0;
19412 	info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
19413 	retval = KERN_SUCCESS;
19414 
19415 	offset_in_page = start_offset & effective_page_mask;
19416 	start = vm_map_trunc_page(start_offset, effective_page_mask);
19417 	end = vm_map_round_page(end_offset, effective_page_mask);
19418 
19419 	if (end < start) {
19420 		return KERN_INVALID_ARGUMENT;
19421 	}
19422 
19423 	assert((end - start) <= MAX_PAGE_RANGE_QUERY);
19424 
19425 	vm_map_lock_read(map);
19426 
19427 	task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
19428 
19429 	for (curr_s_offset = start; curr_s_offset < end;) {
19430 		/*
19431 		 * New lookup needs reset of these variables.
19432 		 */
19433 		curr_object = object = VM_OBJECT_NULL;
19434 		offset_in_object = 0;
19435 		ref_count = 0;
19436 		depth = 0;
19437 
19438 		if (do_region_footprint &&
19439 		    curr_s_offset >= vm_map_last_entry(map)->vme_end) {
19440 			/*
19441 			 * Request for "footprint" info about a page beyond
19442 			 * the end of address space: this must be for
19443 			 * the fake region vm_map_region_recurse_64()
19444 			 * reported to account for non-volatile purgeable
19445 			 * memory owned by this task.
19446 			 */
19447 			disposition = 0;
19448 
19449 			if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
19450 			    (unsigned) ledger_compressed) {
19451 				/*
19452 				 * We haven't reported all the "non-volatile
19453 				 * compressed" pages yet, so report this fake
19454 				 * page as "compressed".
19455 				 */
19456 				disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
19457 			} else {
19458 				/*
19459 				 * We've reported all the non-volatile
19460 				 * compressed page but not all the non-volatile
19461 				 * pages , so report this fake page as
19462 				 * "resident dirty".
19463 				 */
19464 				disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19465 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19466 				disposition |= VM_PAGE_QUERY_PAGE_REF;
19467 			}
19468 			switch (flavor) {
19469 			case VM_PAGE_INFO_BASIC:
19470 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19471 				basic_info->disposition = disposition;
19472 				basic_info->ref_count = 1;
19473 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
19474 				basic_info->offset = 0;
19475 				basic_info->depth = 0;
19476 
19477 				info_idx++;
19478 				break;
19479 			}
19480 			curr_s_offset += effective_page_size;
19481 			continue;
19482 		}
19483 
19484 		/*
19485 		 * First, find the map entry covering "curr_s_offset", going down
19486 		 * submaps if necessary.
19487 		 */
19488 		if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
19489 			/* no entry -> no object -> no page */
19490 
19491 			if (curr_s_offset < vm_map_min(map)) {
19492 				/*
19493 				 * Illegal address that falls below map min.
19494 				 */
19495 				curr_e_offset = MIN(end, vm_map_min(map));
19496 			} else if (curr_s_offset >= vm_map_max(map)) {
19497 				/*
19498 				 * Illegal address that falls on/after map max.
19499 				 */
19500 				curr_e_offset = end;
19501 			} else if (map_entry == vm_map_to_entry(map)) {
19502 				/*
19503 				 * Hit a hole.
19504 				 */
19505 				if (map_entry->vme_next == vm_map_to_entry(map)) {
19506 					/*
19507 					 * Empty map.
19508 					 */
19509 					curr_e_offset = MIN(map->max_offset, end);
19510 				} else {
19511 					/*
19512 					 * Hole at start of the map.
19513 					 */
19514 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
19515 				}
19516 			} else {
19517 				if (map_entry->vme_next == vm_map_to_entry(map)) {
19518 					/*
19519 					 * Hole at the end of the map.
19520 					 */
19521 					curr_e_offset = MIN(map->max_offset, end);
19522 				} else {
19523 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
19524 				}
19525 			}
19526 
19527 			assert(curr_e_offset >= curr_s_offset);
19528 
19529 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
19530 
19531 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19532 
19533 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
19534 
19535 			curr_s_offset = curr_e_offset;
19536 
19537 			info_idx += num_pages;
19538 
19539 			continue;
19540 		}
19541 
19542 		/* compute offset from this map entry's start */
19543 		offset_in_object = curr_s_offset - map_entry->vme_start;
19544 
19545 		/* compute offset into this map entry's object (or submap) */
19546 		offset_in_object += VME_OFFSET(map_entry);
19547 
19548 		if (map_entry->is_sub_map) {
19549 			vm_map_t sub_map = VM_MAP_NULL;
19550 			vm_page_info_t submap_info = 0;
19551 			vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
19552 
19553 			range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
19554 
19555 			submap_s_offset = offset_in_object;
19556 			submap_e_offset = submap_s_offset + range_len;
19557 
19558 			sub_map = VME_SUBMAP(map_entry);
19559 
19560 			vm_map_reference(sub_map);
19561 			vm_map_unlock_read(map);
19562 
19563 			submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19564 
19565 			assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
19566 			    "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
19567 
19568 			retval = vm_map_page_range_info_internal(sub_map,
19569 			    submap_s_offset,
19570 			    submap_e_offset,
19571 			    effective_page_shift,
19572 			    VM_PAGE_INFO_BASIC,
19573 			    (vm_page_info_t) submap_info,
19574 			    count);
19575 
19576 			assert(retval == KERN_SUCCESS);
19577 
19578 			vm_map_lock_read(map);
19579 			vm_map_deallocate(sub_map);
19580 
19581 			/* Move the "info" index by the number of pages we inspected.*/
19582 			info_idx += range_len >> effective_page_shift;
19583 
19584 			/* Move our current offset by the size of the range we inspected.*/
19585 			curr_s_offset += range_len;
19586 
19587 			continue;
19588 		}
19589 
19590 		object = VME_OBJECT(map_entry);
19591 
19592 		if (object == VM_OBJECT_NULL) {
19593 			/*
19594 			 * We don't have an object here and, hence,
19595 			 * no pages to inspect. We'll fill up the
19596 			 * info structure appropriately.
19597 			 */
19598 
19599 			curr_e_offset = MIN(map_entry->vme_end, end);
19600 
19601 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
19602 
19603 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19604 
19605 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
19606 
19607 			curr_s_offset = curr_e_offset;
19608 
19609 			info_idx += num_pages;
19610 
19611 			continue;
19612 		}
19613 
19614 		if (do_region_footprint) {
19615 			disposition = 0;
19616 			if (map->has_corpse_footprint) {
19617 				/*
19618 				 * Query the page info data we saved
19619 				 * while forking the corpse.
19620 				 */
19621 				vm_map_corpse_footprint_query_page_info(
19622 					map,
19623 					curr_s_offset,
19624 					&disposition);
19625 			} else {
19626 				/*
19627 				 * Query the live pmap for footprint info
19628 				 * about this page.
19629 				 */
19630 				vm_map_footprint_query_page_info(
19631 					map,
19632 					map_entry,
19633 					curr_s_offset,
19634 					&disposition);
19635 			}
19636 			switch (flavor) {
19637 			case VM_PAGE_INFO_BASIC:
19638 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19639 				basic_info->disposition = disposition;
19640 				basic_info->ref_count = 1;
19641 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
19642 				basic_info->offset = 0;
19643 				basic_info->depth = 0;
19644 
19645 				info_idx++;
19646 				break;
19647 			}
19648 			curr_s_offset += effective_page_size;
19649 			continue;
19650 		}
19651 
19652 		vm_object_reference(object);
19653 		/*
19654 		 * Shared mode -- so we can allow other readers
19655 		 * to grab the lock too.
19656 		 */
19657 		vm_object_lock_shared(object);
19658 
19659 		curr_e_offset = MIN(map_entry->vme_end, end);
19660 
19661 		vm_map_unlock_read(map);
19662 
19663 		map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
19664 
19665 		curr_object = object;
19666 
19667 		for (; curr_s_offset < curr_e_offset;) {
19668 			if (object == curr_object) {
19669 				ref_count = curr_object->ref_count - 1; /* account for our object reference above. */
19670 			} else {
19671 				ref_count = curr_object->ref_count;
19672 			}
19673 
19674 			curr_offset_in_object = offset_in_object;
19675 
19676 			for (;;) {
19677 				m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
19678 
19679 				if (m != VM_PAGE_NULL) {
19680 					disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19681 					break;
19682 				} else {
19683 					if (curr_object->internal &&
19684 					    curr_object->alive &&
19685 					    !curr_object->terminating &&
19686 					    curr_object->pager_ready) {
19687 						if (VM_COMPRESSOR_PAGER_STATE_GET(curr_object, vm_object_trunc_page(curr_offset_in_object))
19688 						    == VM_EXTERNAL_STATE_EXISTS) {
19689 							/* the pager has that page */
19690 							disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
19691 							break;
19692 						}
19693 					}
19694 
19695 					/*
19696 					 * Go down the VM object shadow chain until we find the page
19697 					 * we're looking for.
19698 					 */
19699 
19700 					if (curr_object->shadow != VM_OBJECT_NULL) {
19701 						vm_object_t shadow = VM_OBJECT_NULL;
19702 
19703 						curr_offset_in_object += curr_object->vo_shadow_offset;
19704 						shadow = curr_object->shadow;
19705 
19706 						vm_object_lock_shared(shadow);
19707 						vm_object_unlock(curr_object);
19708 
19709 						curr_object = shadow;
19710 						depth++;
19711 						continue;
19712 					} else {
19713 						break;
19714 					}
19715 				}
19716 			}
19717 
19718 			/* The ref_count is not strictly accurate, it measures the number   */
19719 			/* of entities holding a ref on the object, they may not be mapping */
19720 			/* the object or may not be mapping the section holding the         */
19721 			/* target page but its still a ball park number and though an over- */
19722 			/* count, it picks up the copy-on-write cases                       */
19723 
19724 			/* We could also get a picture of page sharing from pmap_attributes */
19725 			/* but this would under count as only faulted-in mappings would     */
19726 			/* show up.							    */
19727 
19728 			if ((curr_object == object) && curr_object->shadow) {
19729 				disposition |= VM_PAGE_QUERY_PAGE_COPIED;
19730 			}
19731 
19732 			if (!curr_object->internal) {
19733 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
19734 			}
19735 
19736 			if (m != VM_PAGE_NULL) {
19737 				if (m->vmp_fictitious) {
19738 					disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
19739 				} else {
19740 					if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
19741 						disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19742 					}
19743 
19744 					if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
19745 						disposition |= VM_PAGE_QUERY_PAGE_REF;
19746 					}
19747 
19748 					if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
19749 						disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
19750 					}
19751 
19752 					/*
19753 					 * XXX TODO4K:
19754 					 * when this routine deals with 4k
19755 					 * pages, check the appropriate CS bit
19756 					 * here.
19757 					 */
19758 					if (m->vmp_cs_validated) {
19759 						disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
19760 					}
19761 					if (m->vmp_cs_tainted) {
19762 						disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
19763 					}
19764 					if (m->vmp_cs_nx) {
19765 						disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
19766 					}
19767 					if (m->vmp_reusable || curr_object->all_reusable) {
19768 						disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
19769 					}
19770 				}
19771 			}
19772 
19773 			switch (flavor) {
19774 			case VM_PAGE_INFO_BASIC:
19775 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19776 				basic_info->disposition = disposition;
19777 				basic_info->ref_count = ref_count;
19778 				basic_info->object_id = (vm_object_id_t) (uintptr_t)
19779 				    VM_KERNEL_ADDRPERM(curr_object);
19780 				basic_info->offset =
19781 				    (memory_object_offset_t) curr_offset_in_object + offset_in_page;
19782 				basic_info->depth = depth;
19783 
19784 				info_idx++;
19785 				break;
19786 			}
19787 
19788 			disposition = 0;
19789 			offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
19790 
19791 			/*
19792 			 * Move to next offset in the range and in our object.
19793 			 */
19794 			curr_s_offset += effective_page_size;
19795 			offset_in_object += effective_page_size;
19796 			curr_offset_in_object = offset_in_object;
19797 
19798 			if (curr_object != object) {
19799 				vm_object_unlock(curr_object);
19800 
19801 				curr_object = object;
19802 
19803 				vm_object_lock_shared(curr_object);
19804 			} else {
19805 				vm_object_lock_yield_shared(curr_object);
19806 			}
19807 		}
19808 
19809 		vm_object_unlock(curr_object);
19810 		vm_object_deallocate(curr_object);
19811 
19812 		vm_map_lock_read(map);
19813 	}
19814 
19815 	vm_map_unlock_read(map);
19816 	return retval;
19817 }
19818 
19819 /*
19820  *	vm_map_msync
19821  *
19822  *	Synchronises the memory range specified with its backing store
19823  *	image by either flushing or cleaning the contents to the appropriate
19824  *	memory manager engaging in a memory object synchronize dialog with
19825  *	the manager.  The client doesn't return until the manager issues
19826  *	m_o_s_completed message.  MIG Magically converts user task parameter
19827  *	to the task's address map.
19828  *
19829  *	interpretation of sync_flags
19830  *	VM_SYNC_INVALIDATE	- discard pages, only return precious
19831  *				  pages to manager.
19832  *
19833  *	VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
19834  *				- discard pages, write dirty or precious
19835  *				  pages back to memory manager.
19836  *
19837  *	VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
19838  *				- write dirty or precious pages back to
19839  *				  the memory manager.
19840  *
19841  *	VM_SYNC_CONTIGUOUS	- does everything normally, but if there
19842  *				  is a hole in the region, and we would
19843  *				  have returned KERN_SUCCESS, return
19844  *				  KERN_INVALID_ADDRESS instead.
19845  *
19846  *	NOTE
19847  *	The memory object attributes have not yet been implemented, this
19848  *	function will have to deal with the invalidate attribute
19849  *
19850  *	RETURNS
19851  *	KERN_INVALID_TASK		Bad task parameter
19852  *	KERN_INVALID_ARGUMENT		both sync and async were specified.
19853  *	KERN_SUCCESS			The usual.
19854  *	KERN_INVALID_ADDRESS		There was a hole in the region.
19855  */
19856 
kern_return_t
vm_map_msync(
	vm_map_t                map,
	vm_map_address_t        address,
	vm_map_size_t           size,
	vm_sync_t               sync_flags)
{
	vm_map_entry_t          entry;
	vm_map_size_t           amount_left;
	vm_object_offset_t      offset;
	vm_object_offset_t      start_offset, end_offset;
	boolean_t               do_sync_req;
	boolean_t               had_hole = FALSE;
	vm_map_offset_t         pmap_offset;

	/* "synchronous" and "asynchronous" are mutually exclusive */
	if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
	    (sync_flags & VM_SYNC_SYNCHRONOUS)) {
		return KERN_INVALID_ARGUMENT;
	}

	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
		DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
	}

	/*
	 * align address and size on page boundaries
	 */
	size = (vm_map_round_page(address + size,
	    VM_MAP_PAGE_MASK(map)) -
	    vm_map_trunc_page(address,
	    VM_MAP_PAGE_MASK(map)));
	address = vm_map_trunc_page(address,
	    VM_MAP_PAGE_MASK(map));

	if (map == VM_MAP_NULL) {
		return KERN_INVALID_TASK;
	}

	if (size == 0) {
		return KERN_SUCCESS;
	}

	amount_left = size;

	/*
	 * Walk the range one map entry at a time, re-taking the map lock
	 * and re-looking up the current address on each iteration (the
	 * map may change while it is unlocked between iterations).
	 */
	while (amount_left > 0) {
		vm_object_size_t        flush_size;
		vm_object_t             object;

		vm_map_lock(map);
		if (!vm_map_lookup_entry(map,
		    address,
		    &entry)) {
			vm_map_size_t   skip;

			/*
			 * hole in the address map.
			 */
			had_hole = TRUE;

			if (sync_flags & VM_SYNC_KILLPAGES) {
				/*
				 * For VM_SYNC_KILLPAGES, there should be
				 * no holes in the range, since we couldn't
				 * prevent someone else from allocating in
				 * that hole and we wouldn't want to "kill"
				 * their pages.
				 */
				vm_map_unlock(map);
				break;
			}

			/*
			 * Check for empty map.
			 */
			if (entry == vm_map_to_entry(map) &&
			    entry->vme_next == entry) {
				vm_map_unlock(map);
				break;
			}
			/*
			 * Check that we don't wrap and that
			 * we have at least one real map entry.
			 */
			if ((map->hdr.nentries == 0) ||
			    (entry->vme_next->vme_start < address)) {
				vm_map_unlock(map);
				break;
			}
			/*
			 * Move up to the next entry if needed
			 */
			skip = (entry->vme_next->vme_start - address);
			if (skip >= amount_left) {
				amount_left = 0;
			} else {
				amount_left -= skip;
			}
			address = entry->vme_next->vme_start;
			vm_map_unlock(map);
			continue;
		}

		offset = address - entry->vme_start;
		pmap_offset = address;

		/*
		 * do we have more to flush than is contained in this
		 * entry ?
		 */
		if (amount_left + entry->vme_start + offset > entry->vme_end) {
			flush_size = entry->vme_end -
			    (entry->vme_start + offset);
		} else {
			flush_size = amount_left;
		}
		amount_left -= flush_size;
		address += flush_size;

		if (entry->is_sub_map == TRUE) {
			vm_map_t        local_map;
			vm_map_offset_t local_offset;

			local_map = VME_SUBMAP(entry);
			local_offset = VME_OFFSET(entry);
			/* take a ref so the submap survives unlocking "map" */
			vm_map_reference(local_map);
			vm_map_unlock(map);
			/* recurse into the submap for this portion of the range */
			if (vm_map_msync(
				    local_map,
				    local_offset,
				    flush_size,
				    sync_flags) == KERN_INVALID_ADDRESS) {
				had_hole = TRUE;
			}
			vm_map_deallocate(local_map);
			continue;
		}
		object = VME_OBJECT(entry);

		/*
		 * We can't sync this object if the object has not been
		 * created yet
		 */
		if (object == VM_OBJECT_NULL) {
			vm_map_unlock(map);
			continue;
		}
		offset += VME_OFFSET(entry);

		vm_object_lock(object);

		if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
			int kill_pages = 0;

			if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
				/*
				 * This is a destructive operation and so we
				 * err on the side of limiting the range of
				 * the operation.
				 */
				start_offset = vm_object_round_page(offset);
				end_offset = vm_object_trunc_page(offset + flush_size);

				if (end_offset <= start_offset) {
					vm_object_unlock(object);
					vm_map_unlock(map);
					continue;
				}

				pmap_offset += start_offset - offset;
			} else {
				start_offset = offset;
				end_offset = offset + flush_size;
			}

			if (sync_flags & VM_SYNC_KILLPAGES) {
				/*
				 * kill_pages == 1 only when the object has no
				 * shadow and either a single reference or an
				 * asymmetric copy strategy with no copy object;
				 * kill_pages == -1 skips the deactivation below.
				 */
				if (((object->ref_count == 1) ||
				    ((object->copy_strategy !=
				    MEMORY_OBJECT_COPY_SYMMETRIC) &&
				    (object->copy == VM_OBJECT_NULL))) &&
				    (object->shadow == VM_OBJECT_NULL)) {
					if (object->ref_count != 1) {
						vm_page_stats_reusable.free_shared++;
					}
					kill_pages = 1;
				} else {
					kill_pages = -1;
				}
			}
			if (kill_pages != -1) {
				vm_object_deactivate_pages(
					object,
					start_offset,
					(vm_object_size_t) (end_offset - start_offset),
					kill_pages,
					FALSE, /* reusable_pages */
					FALSE, /* reusable_no_write */
					map->pmap,
					pmap_offset);
			}
			vm_object_unlock(object);
			vm_map_unlock(map);
			continue;
		}
		/*
		 * We can't sync this object if there isn't a pager.
		 * Don't bother to sync internal objects, since there can't
		 * be any "permanent" storage for these objects anyway.
		 */
		if ((object->pager == MEMORY_OBJECT_NULL) ||
		    (object->internal) || (object->private)) {
			vm_object_unlock(object);
			vm_map_unlock(map);
			continue;
		}
		/*
		 * keep reference on the object until syncing is done
		 */
		vm_object_reference_locked(object);
		vm_object_unlock(object);

		vm_map_unlock(map);

		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
			start_offset = vm_object_trunc_page(offset);
			end_offset = vm_object_round_page(offset + flush_size);
		} else {
			start_offset = offset;
			end_offset = offset + flush_size;
		}

		/* NOTE(review): do_sync_req is never consulted after this call */
		do_sync_req = vm_object_sync(object,
		    start_offset,
		    (end_offset - start_offset),
		    sync_flags & VM_SYNC_INVALIDATE,
		    ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
		    (sync_flags & VM_SYNC_ASYNCHRONOUS)),
		    sync_flags & VM_SYNC_SYNCHRONOUS);

		if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
			/*
			 * clear out the clustering and read-ahead hints
			 */
			vm_object_lock(object);

			object->pages_created = 0;
			object->pages_used = 0;
			object->sequential = 0;
			object->last_alloc = 0;

			vm_object_unlock(object);
		}
		vm_object_deallocate(object);
	} /* while */

	/* for proper msync() behaviour */
	if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
		return KERN_INVALID_ADDRESS;
	}

	return KERN_SUCCESS;
}/* vm_msync */
20118 
20119 void
vm_named_entry_associate_vm_object(vm_named_entry_t named_entry,vm_object_t object,vm_object_offset_t offset,vm_object_size_t size,vm_prot_t prot)20120 vm_named_entry_associate_vm_object(
20121 	vm_named_entry_t        named_entry,
20122 	vm_object_t             object,
20123 	vm_object_offset_t      offset,
20124 	vm_object_size_t        size,
20125 	vm_prot_t               prot)
20126 {
20127 	vm_map_copy_t copy;
20128 	vm_map_entry_t copy_entry;
20129 
20130 	assert(!named_entry->is_sub_map);
20131 	assert(!named_entry->is_copy);
20132 	assert(!named_entry->is_object);
20133 	assert(!named_entry->internal);
20134 	assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
20135 
20136 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
20137 	copy->offset = offset;
20138 	copy->size = size;
20139 	copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
20140 
20141 	copy_entry = vm_map_copy_entry_create(copy);
20142 	copy_entry->protection = prot;
20143 	copy_entry->max_protection = prot;
20144 	copy_entry->use_pmap = TRUE;
20145 	copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
20146 	copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
20147 	VME_OBJECT_SET(copy_entry, object, false, 0);
20148 	VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
20149 	vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
20150 
20151 	named_entry->backing.copy = copy;
20152 	named_entry->is_object = TRUE;
20153 	if (object->internal) {
20154 		named_entry->internal = TRUE;
20155 	}
20156 
20157 	DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
20158 	    named_entry, copy, object, offset, size, prot);
20159 }
20160 
20161 vm_object_t
vm_named_entry_to_vm_object(vm_named_entry_t named_entry)20162 vm_named_entry_to_vm_object(
20163 	vm_named_entry_t named_entry)
20164 {
20165 	vm_map_copy_t   copy;
20166 	vm_map_entry_t  copy_entry;
20167 	vm_object_t     object;
20168 
20169 	assert(!named_entry->is_sub_map);
20170 	assert(!named_entry->is_copy);
20171 	assert(named_entry->is_object);
20172 	copy = named_entry->backing.copy;
20173 	assert(copy != VM_MAP_COPY_NULL);
20174 	/*
20175 	 * Assert that the vm_map_copy is coming from the right
20176 	 * zone and hasn't been forged
20177 	 */
20178 	vm_map_copy_require(copy);
20179 	assert(copy->cpy_hdr.nentries == 1);
20180 	copy_entry = vm_map_copy_first_entry(copy);
20181 	object = VME_OBJECT(copy_entry);
20182 
20183 	DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
20184 
20185 	return object;
20186 }
20187 
20188 /*
20189  *	Routine:	convert_port_entry_to_map
20190  *	Purpose:
20191  *		Convert from a port specifying an entry or a task
20192  *		to a map. Doesn't consume the port ref; produces a map ref,
20193  *		which may be null.  Unlike convert_port_to_map, the
20194  *		port may be task or a named entry backed.
20195  *	Conditions:
20196  *		Nothing locked.
20197  */
20198 
20199 vm_map_t
convert_port_entry_to_map(ipc_port_t port)20200 convert_port_entry_to_map(
20201 	ipc_port_t      port)
20202 {
20203 	vm_map_t map = VM_MAP_NULL;
20204 	vm_named_entry_t named_entry;
20205 
20206 	if (!IP_VALID(port)) {
20207 		return VM_MAP_NULL;
20208 	}
20209 
20210 	if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
20211 		return convert_port_to_map(port);
20212 	}
20213 
20214 	named_entry = mach_memory_entry_from_port(port);
20215 
20216 	if ((named_entry->is_sub_map) &&
20217 	    (named_entry->protection & VM_PROT_WRITE)) {
20218 		map = named_entry->backing.map;
20219 		if (map->pmap != PMAP_NULL) {
20220 			if (map->pmap == kernel_pmap) {
20221 				panic("userspace has access "
20222 				    "to a kernel map %p", map);
20223 			}
20224 			pmap_require(map->pmap);
20225 		}
20226 		vm_map_reference(map);
20227 	}
20228 
20229 	return map;
20230 }
20231 
20232 /*
20233  * Export routines to other components for the things we access locally through
20234  * macros.
20235  */
20236 #undef current_map
vm_map_t
current_map(void)
{
	/* out-of-line version of the current_map() macro for other components */
	return current_map_fast();
}
20242 
20243 /*
20244  *	vm_map_reference:
20245  *
20246  *	Takes a reference on the specified map.
20247  */
20248 void
vm_map_reference(vm_map_t map)20249 vm_map_reference(
20250 	vm_map_t        map)
20251 {
20252 	if (__probable(map != VM_MAP_NULL)) {
20253 		vm_map_require(map);
20254 		os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
20255 	}
20256 }
20257 
20258 /*
20259  *	vm_map_deallocate:
20260  *
20261  *	Removes a reference from the specified map,
20262  *	destroying it if no references remain.
20263  *	The map should not be locked.
20264  */
20265 void
vm_map_deallocate(vm_map_t map)20266 vm_map_deallocate(
20267 	vm_map_t        map)
20268 {
20269 	if (__probable(map != VM_MAP_NULL)) {
20270 		vm_map_require(map);
20271 		if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
20272 			vm_map_destroy(map);
20273 		}
20274 	}
20275 }
20276 
void
vm_map_inspect_deallocate(
	vm_map_inspect_t      map)
{
	/* drop a reference held through an inspect-typed map pointer */
	vm_map_deallocate((vm_map_t)map);
}
20283 
void
vm_map_read_deallocate(
	vm_map_read_t      map)
{
	/* drop a reference held through a read-typed map pointer */
	vm_map_deallocate((vm_map_t)map);
}
20290 
20291 
20292 void
vm_map_disable_NX(vm_map_t map)20293 vm_map_disable_NX(vm_map_t map)
20294 {
20295 	if (map == NULL) {
20296 		return;
20297 	}
20298 	if (map->pmap == NULL) {
20299 		return;
20300 	}
20301 
20302 	pmap_disable_NX(map->pmap);
20303 }
20304 
20305 void
vm_map_disallow_data_exec(vm_map_t map)20306 vm_map_disallow_data_exec(vm_map_t map)
20307 {
20308 	if (map == NULL) {
20309 		return;
20310 	}
20311 
20312 	map->map_disallow_data_exec = TRUE;
20313 }
20314 
20315 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
20316  * more descriptive.
20317  */
void
vm_map_set_32bit(vm_map_t map)
{
#if defined(__arm64__)
	/* on arm64, the pmap layer knows this device's 32-bit VA limit */
	map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
20327 
20328 
void
vm_map_set_64bit(vm_map_t map)
{
#if defined(__arm64__)
	/* on arm64, the pmap layer knows this device's 64-bit VA limit */
	map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
#endif
}
20338 
20339 /*
20340  * Expand the maximum size of an existing map to the maximum supported.
20341  */
void
vm_map_set_jumbo(vm_map_t map)
{
#if defined (__arm64__) && !XNU_TARGET_OS_OSX
	/* grow max_offset as far as possible; vm_map_set_max_addr clamps it */
	vm_map_set_max_addr(map, ~0);
#else /* arm64 */
	(void) map;
#endif
}
20351 
20352 /*
20353  * This map has a JIT entitlement
20354  */
void
vm_map_set_jit_entitled(vm_map_t map)
{
#if defined (__arm64__)
	/* the JIT entitlement is tracked at the pmap level on arm64 */
	pmap_set_jit_entitled(map->pmap);
#else /* arm64 */
	/* no-op elsewhere */
	(void) map;
#endif
}
20364 
20365 /*
20366  * Get status of this maps TPRO flag
20367  */
boolean_t
vm_map_tpro(vm_map_t map)
{
#if defined (__arm64e__)
	/* TPRO state is tracked at the pmap level */
	return pmap_get_tpro(map->pmap);
#else /* arm64e */
	/* TPRO is only supported on arm64e */
	(void) map;
	return false;
#endif
}
20378 
20379 /*
20380  * This map has TPRO enabled
20381  */
void
vm_map_set_tpro(vm_map_t map)
{
#if defined (__arm64e__)
	/* TPRO state is tracked at the pmap level */
	pmap_set_tpro(map->pmap);
#else /* arm64e */
	/* TPRO is only supported on arm64e */
	(void) map;
#endif
}
20391 
20392 /*
20393  * Expand the maximum size of an existing map.
20394  */
void
vm_map_set_max_addr(vm_map_t map, vm_map_offset_t new_max_offset)
{
#if defined(__arm64__)
	vm_map_offset_t max_supported_offset;
	vm_map_offset_t old_max_offset;

	vm_map_lock(map);

	old_max_offset = map->max_offset;
	max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), ARM_PMAP_MAX_OFFSET_JUMBO);

	new_max_offset = trunc_page(new_max_offset);

	/* The address space cannot be shrunk using this routine. */
	if (old_max_offset >= new_max_offset) {
		vm_map_unlock(map);
		return;
	}

	/* clamp the request to what the pmap layer supports */
	if (max_supported_offset < new_max_offset) {
		new_max_offset = max_supported_offset;
	}

	map->max_offset = new_max_offset;

	/* the newly exposed VA range must be covered by the hole list */
	if (map->holelistenabled) {
		if (map->holes_list->prev->vme_end == old_max_offset) {
			/*
			 * There is already a hole at the end of the map; simply make it bigger.
			 */
			map->holes_list->prev->vme_end = map->max_offset;
		} else {
			/*
			 * There is no hole at the end, so we need to create a new hole
			 * for the new empty space we're creating.
			 */
			struct vm_map_links *new_hole;

			new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
			new_hole->start = old_max_offset;
			new_hole->end = map->max_offset;
			/* link the new hole at the tail of the circular hole list */
			new_hole->prev = map->holes_list->prev;
			new_hole->next = (struct vm_map_entry *)map->holes_list;
			map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
			map->holes_list->prev = (struct vm_map_entry *)new_hole;
		}
	}

	vm_map_unlock(map);
#else
	(void)map;
	(void)new_max_offset;
#endif
}
20450 
vm_map_offset_t
vm_compute_max_offset(boolean_t is64)
{
#if defined(__arm64__)
	/* on arm64, the pmap layer knows this device's VA limits */
	return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
20460 
void
vm_map_get_max_aslr_slide_section(
	vm_map_t                map __unused,
	int64_t                 *max_sections,
	int64_t                 *section_size)
{
#if defined(__arm64__)
	/* arm64: up to 3 independently slid sections of ARM_TT_TWIG_SIZE each */
	*max_sections = 3;
	*section_size = ARM_TT_TWIG_SIZE;
#else
	/* other platforms: a single section, no fixed section size */
	*max_sections = 1;
	*section_size = 0;
#endif
}
20475 
uint64_t
vm_map_get_max_aslr_slide_pages(vm_map_t map)
{
#if defined(__arm64__)
	/* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
	 * limited embedded address space; this is also meant to minimize pmap
	 * memory usage on 16KB page systems.
	 */
	return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
#else
	/* 2^16 pages of slide for 64-bit maps, 2^8 pages for 32-bit maps */
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
20489 
uint64_t
vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
{
#if defined(__arm64__)
	/* We limit the loader slide to 4MB, in order to ensure at least 8 bits
	 * of independent entropy on 16KB page systems.
	 */
	return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
#else
	/* 2^16 pages of slide for 64-bit maps, 2^8 pages for 32-bit maps */
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
20502 
20503 boolean_t
vm_map_is_64bit(vm_map_t map)20504 vm_map_is_64bit(
20505 	vm_map_t map)
20506 {
20507 	return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
20508 }
20509 
20510 boolean_t
vm_map_has_hard_pagezero(vm_map_t map,vm_map_offset_t pagezero_size)20511 vm_map_has_hard_pagezero(
20512 	vm_map_t        map,
20513 	vm_map_offset_t pagezero_size)
20514 {
20515 	/*
20516 	 * XXX FBDP
20517 	 * We should lock the VM map (for read) here but we can get away
20518 	 * with it for now because there can't really be any race condition:
20519 	 * the VM map's min_offset is changed only when the VM map is created
20520 	 * and when the zero page is established (when the binary gets loaded),
20521 	 * and this routine gets called only when the task terminates and the
20522 	 * VM map is being torn down, and when a new map is created via
20523 	 * load_machfile()/execve().
20524 	 */
20525 	return map->min_offset >= pagezero_size;
20526 }
20527 
20528 /*
20529  * Raise a VM map's maximun offset.
20530  */
20531 kern_return_t
vm_map_raise_max_offset(vm_map_t map,vm_map_offset_t new_max_offset)20532 vm_map_raise_max_offset(
20533 	vm_map_t        map,
20534 	vm_map_offset_t new_max_offset)
20535 {
20536 	kern_return_t   ret;
20537 
20538 	vm_map_lock(map);
20539 	ret = KERN_INVALID_ADDRESS;
20540 
20541 	if (new_max_offset >= map->max_offset) {
20542 		if (!vm_map_is_64bit(map)) {
20543 			if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
20544 				map->max_offset = new_max_offset;
20545 				ret = KERN_SUCCESS;
20546 			}
20547 		} else {
20548 			if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
20549 				map->max_offset = new_max_offset;
20550 				ret = KERN_SUCCESS;
20551 			}
20552 		}
20553 	}
20554 
20555 	vm_map_unlock(map);
20556 	return ret;
20557 }
20558 
20559 
20560 /*
20561  * Raise a VM map's minimum offset.
20562  * To strictly enforce "page zero" reservation.
20563  */
20564 kern_return_t
vm_map_raise_min_offset(vm_map_t map,vm_map_offset_t new_min_offset)20565 vm_map_raise_min_offset(
20566 	vm_map_t        map,
20567 	vm_map_offset_t new_min_offset)
20568 {
20569 	vm_map_entry_t  first_entry;
20570 
20571 	new_min_offset = vm_map_round_page(new_min_offset,
20572 	    VM_MAP_PAGE_MASK(map));
20573 
20574 	vm_map_lock(map);
20575 
20576 	if (new_min_offset < map->min_offset) {
20577 		/*
20578 		 * Can't move min_offset backwards, as that would expose
20579 		 * a part of the address space that was previously, and for
20580 		 * possibly good reasons, inaccessible.
20581 		 */
20582 		vm_map_unlock(map);
20583 		return KERN_INVALID_ADDRESS;
20584 	}
20585 	if (new_min_offset >= map->max_offset) {
20586 		/* can't go beyond the end of the address space */
20587 		vm_map_unlock(map);
20588 		return KERN_INVALID_ADDRESS;
20589 	}
20590 
20591 	first_entry = vm_map_first_entry(map);
20592 	if (first_entry != vm_map_to_entry(map) &&
20593 	    first_entry->vme_start < new_min_offset) {
20594 		/*
20595 		 * Some memory was already allocated below the new
20596 		 * minimun offset.  It's too late to change it now...
20597 		 */
20598 		vm_map_unlock(map);
20599 		return KERN_NO_SPACE;
20600 	}
20601 
20602 	map->min_offset = new_min_offset;
20603 
20604 	if (map->holelistenabled) {
20605 		assert(map->holes_list);
20606 		map->holes_list->start = new_min_offset;
20607 		assert(new_min_offset < map->holes_list->end);
20608 	}
20609 
20610 	vm_map_unlock(map);
20611 
20612 	return KERN_SUCCESS;
20613 }
20614 
20615 /*
20616  * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
20617  * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
20618  * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
20619  * have to reach over to the BSD data structures.
20620  */
20621 
20622 uint64_t vm_map_set_size_limit_count = 0;
20623 kern_return_t
vm_map_set_size_limit(vm_map_t map,uint64_t new_size_limit)20624 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
20625 {
20626 	kern_return_t kr;
20627 
20628 	vm_map_lock(map);
20629 	if (new_size_limit < map->size) {
20630 		/* new limit should not be lower than its current size */
20631 		DTRACE_VM2(vm_map_set_size_limit_fail,
20632 		    vm_map_size_t, map->size,
20633 		    uint64_t, new_size_limit);
20634 		kr = KERN_FAILURE;
20635 	} else if (new_size_limit == map->size_limit) {
20636 		/* no change */
20637 		kr = KERN_SUCCESS;
20638 	} else {
20639 		/* set new limit */
20640 		DTRACE_VM2(vm_map_set_size_limit,
20641 		    vm_map_size_t, map->size,
20642 		    uint64_t, new_size_limit);
20643 		if (new_size_limit != RLIM_INFINITY) {
20644 			vm_map_set_size_limit_count++;
20645 		}
20646 		map->size_limit = new_size_limit;
20647 		kr = KERN_SUCCESS;
20648 	}
20649 	vm_map_unlock(map);
20650 	return kr;
20651 }
20652 
20653 uint64_t vm_map_set_data_limit_count = 0;
20654 kern_return_t
vm_map_set_data_limit(vm_map_t map,uint64_t new_data_limit)20655 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
20656 {
20657 	kern_return_t kr;
20658 
20659 	vm_map_lock(map);
20660 	if (new_data_limit < map->size) {
20661 		/* new limit should not be lower than its current size */
20662 		DTRACE_VM2(vm_map_set_data_limit_fail,
20663 		    vm_map_size_t, map->size,
20664 		    uint64_t, new_data_limit);
20665 		kr = KERN_FAILURE;
20666 	} else if (new_data_limit == map->data_limit) {
20667 		/* no change */
20668 		kr = KERN_SUCCESS;
20669 	} else {
20670 		/* set new limit */
20671 		DTRACE_VM2(vm_map_set_data_limit,
20672 		    vm_map_size_t, map->size,
20673 		    uint64_t, new_data_limit);
20674 		if (new_data_limit != RLIM_INFINITY) {
20675 			vm_map_set_data_limit_count++;
20676 		}
20677 		map->data_limit = new_data_limit;
20678 		kr = KERN_SUCCESS;
20679 	}
20680 	vm_map_unlock(map);
20681 	return kr;
20682 }
20683 
20684 void
vm_map_set_user_wire_limit(vm_map_t map,vm_size_t limit)20685 vm_map_set_user_wire_limit(vm_map_t     map,
20686     vm_size_t    limit)
20687 {
20688 	vm_map_lock(map);
20689 	map->user_wire_limit = limit;
20690 	vm_map_unlock(map);
20691 }
20692 
20693 
20694 void
vm_map_switch_protect(vm_map_t map,boolean_t val)20695 vm_map_switch_protect(vm_map_t     map,
20696     boolean_t    val)
20697 {
20698 	vm_map_lock(map);
20699 	map->switch_protect = val;
20700 	vm_map_unlock(map);
20701 }
20702 
20703 extern int cs_process_enforcement_enable;
20704 boolean_t
vm_map_cs_enforcement(vm_map_t map)20705 vm_map_cs_enforcement(
20706 	vm_map_t map)
20707 {
20708 	if (cs_process_enforcement_enable) {
20709 		return TRUE;
20710 	}
20711 	return map->cs_enforcement;
20712 }
20713 
20714 kern_return_t
vm_map_cs_wx_enable(__unused vm_map_t map)20715 vm_map_cs_wx_enable(
20716 	__unused vm_map_t map)
20717 {
20718 #if CODE_SIGNING_MONITOR
20719 	kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
20720 	if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
20721 		return KERN_SUCCESS;
20722 	}
20723 	return ret;
20724 #else
20725 	/* The VM manages WX memory entirely on its own */
20726 	return true;
20727 #endif
20728 }
20729 
20730 void
vm_map_cs_debugged_set(vm_map_t map,boolean_t val)20731 vm_map_cs_debugged_set(
20732 	vm_map_t map,
20733 	boolean_t val)
20734 {
20735 	vm_map_lock(map);
20736 	map->cs_debugged = val;
20737 	vm_map_unlock(map);
20738 }
20739 
20740 void
vm_map_cs_enforcement_set(vm_map_t map,boolean_t val)20741 vm_map_cs_enforcement_set(
20742 	vm_map_t map,
20743 	boolean_t val)
20744 {
20745 	vm_map_lock(map);
20746 	map->cs_enforcement = val;
20747 	pmap_set_vm_map_cs_enforced(map->pmap, val);
20748 	vm_map_unlock(map);
20749 }
20750 
20751 /*
20752  * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
20753  * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
20754  * bump both counters.
20755  */
20756 void
vm_map_iokit_mapped_region(vm_map_t map,vm_size_t bytes)20757 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
20758 {
20759 	pmap_t pmap = vm_map_pmap(map);
20760 
20761 	ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
20762 	ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
20763 }
20764 
20765 void
vm_map_iokit_unmapped_region(vm_map_t map,vm_size_t bytes)20766 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
20767 {
20768 	pmap_t pmap = vm_map_pmap(map);
20769 
20770 	ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
20771 	ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
20772 }
20773 
20774 /* Add (generate) code signature for memory range */
20775 #if CONFIG_DYNAMIC_CODE_SIGNING
/*
 * vm_map_sign:
 * Mark every resident page in [start, end) of "map" as code-signing
 * validated, and disconnect each page from all pmaps so any later write
 * attempt is noticed (the page would fault and re-enter validation).
 *
 * The range must be fully covered by a single non-submap entry whose VM
 * object already exists, and every page in the range must be resident and
 * in an ordinary state; otherwise an error is returned and no further
 * pages are touched (pages already marked stay marked).
 *
 * Returns KERN_SUCCESS, KERN_INVALID_ARGUMENT (bad map / range not covered
 * by one entry / no object), KERN_INVALID_ADDRESS (lookup failed or
 * submap), or KERN_FAILURE (non-resident or busy/unusual page).
 */
kern_return_t
vm_map_sign(vm_map_t map,
    vm_map_offset_t start,
    vm_map_offset_t end)
{
	vm_map_entry_t entry;
	vm_page_t m;
	vm_object_t object;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlaying object.  Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->vme_start > start) || (entry->vme_end < end)) {
		/*
		 * Map entry doesn't cover the requested range. Not handling
		 * this situation currently.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL) {
		/*
		 * Object must already be present or we can't sign.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * NOTE(review): the map lock is dropped here but "entry" is still
	 * dereferenced inside the loop below (entry->vme_start).  This
	 * presumably relies on the caller/context keeping the entry stable
	 * while only the object lock is held — confirm before changing.
	 */
	vm_object_lock(object);
	vm_map_unlock_read(map);

	while (start < end) {
		uint32_t refmod;

		m = vm_page_lookup(object,
		    start - entry->vme_start + VME_OFFSET(entry));
		if (m == VM_PAGE_NULL) {
			/* shoud we try to fault a page here? we can probably
			 * demand it exists and is locked for this request */
			vm_object_unlock(object);
			return KERN_FAILURE;
		}
		/* deal with special page status */
		if (m->vmp_busy ||
		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
			vm_object_unlock(object);
			return KERN_FAILURE;
		}

		/* Page is OK... now "validate" it */
		/* This is the place where we'll call out to create a code
		 * directory, later */
		/* XXX TODO4K: deal with 4k subpages individually? */
		m->vmp_cs_validated = VMP_CS_ALL_TRUE;

		/* The page is now "clean" for codesigning purposes. That means
		 * we don't consider it as modified (wpmapped) anymore. But
		 * we'll disconnect the page so we note any future modification
		 * attempts. */
		m->vmp_wpmapped = FALSE;
		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

		/* Pull the dirty status from the pmap, since we cleared the
		 * wpmapped bit */
		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
			SET_PAGE_DIRTY(m, FALSE);
		}

		/* On to the next page */
		start += PAGE_SIZE;
	}
	vm_object_unlock(object);

	return KERN_SUCCESS;
}
20868 #endif
20869 
/*
 * vm_map_partial_reap:
 * Walk "map" and delete every non-submap entry whose VM object is internal
 * (anonymous) and referenced only by this mapping, accumulating the number
 * of resident and compressed pages reclaimed into the caller-supplied
 * counters.  Deleted entries are collected on a zap list and disposed of
 * after the map lock is dropped.
 *
 * NOTE(review): *reclaimed_resident and *reclaimed_compressed are only
 * added to, never zeroed here — presumably the caller initializes them.
 * Always returns KERN_SUCCESS (vm_map_delete's result is ignored).
 */
kern_return_t
vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
{
	vm_map_entry_t  entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t  next_entry;
	kern_return_t   kr = KERN_SUCCESS;
	VM_MAP_ZAP_DECLARE(zap_list);

	vm_map_lock(map);

	/* Capture vme_next before deleting, since the entry may go away. */
	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = next_entry) {
		next_entry = entry->vme_next;

		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    (VME_OBJECT(entry)->internal == TRUE) &&
		    (VME_OBJECT(entry)->ref_count == 1)) {
			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);

			(void)vm_map_delete(map, entry->vme_start,
			    entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
			    KMEM_GUARD_NONE, &zap_list);
		}
	}

	vm_map_unlock(map);

	/* Free the zapped entries outside the map lock. */
	vm_map_zap_dispose(&zap_list);

	return kr;
}
20904 
20905 
20906 #if DEVELOPMENT || DEBUG
20907 
/*
 * vm_map_disconnect_page_mappings (DEVELOPMENT || DEBUG only):
 * Remove all pmap mappings for every entry in "map" (except entries with
 * no object or a physically-contiguous object), so that subsequent
 * accesses re-fault.  When "do_unnest" is TRUE, nested submap entries are
 * first un-nested so that only this task's own pmap is modified.
 *
 * Returns the map's phys_mem ledger balance, sampled before the
 * disconnect, converted to a page count.
 */
int
vm_map_disconnect_page_mappings(
	vm_map_t map,
	boolean_t do_unnest)
{
	vm_map_entry_t entry;
	ledger_amount_t byte_count = 0;

	if (do_unnest == TRUE) {
#ifndef NO_NESTED_PMAP
		vm_map_lock(map);

		for (entry = vm_map_first_entry(map);
		    entry != vm_map_to_entry(map);
		    entry = entry->vme_next) {
			if (entry->is_sub_map && entry->use_pmap) {
				/*
				 * Make sure the range between the start of this entry and
				 * the end of this entry is no longer nested, so that
				 * we will only remove mappings from the pmap in use by this
				 * this task
				 */
				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
			}
		}
		vm_map_unlock(map);
#endif
	}
	vm_map_lock_read(map);

	/* Snapshot resident memory before tearing down the mappings. */
	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
		    (VME_OBJECT(entry)->phys_contiguous))) {
			continue;
		}
		if (entry->is_sub_map) {
			/* any nested submaps should have been unnested above */
			assert(!entry->use_pmap);
		}

		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
	}
	vm_map_unlock_read(map);

	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
}
20957 
/*
 * vm_map_inject_error (DEVELOPMENT || DEBUG only):
 * Inject a decompression error for the page at "vaddr" in "map", so that a
 * later fault on that page fails.  The page must be paged out to the
 * compressor: returns KERN_MEMORY_ERROR if there is no object at the
 * address, KERN_MEMORY_PRESENT if the object has no pager (page not
 * compressed), otherwise the result of the compressor-pager injection.
 */
kern_return_t
vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
{
	vm_object_t object = NULL;
	vm_object_offset_t offset;
	vm_prot_t prot;
	boolean_t wired;
	vm_map_version_t version;
	vm_map_t real_map;
	int result = KERN_FAILURE;

	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
	vm_map_lock(map);

	/*
	 * On success this returns with "object" locked and "real_map" locked
	 * (possibly a submap different from "map").
	 * NOTE(review): the unlock of real_map below assumes the lookup sets
	 * real_map even on failure — confirm against
	 * vm_map_lookup_and_lock_object() before relying on this.
	 */
	result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
	    NULL, &real_map, NULL);
	if (object == NULL) {
		result = KERN_MEMORY_ERROR;
	} else if (object->pager) {
		result = vm_compressor_pager_inject_error(object->pager,
		    offset);
	} else {
		/* no pager: the page isn't in the compressor */
		result = KERN_MEMORY_PRESENT;
	}

	if (object != NULL) {
		vm_object_unlock(object);
	}

	if (real_map != map) {
		vm_map_unlock(real_map);
	}
	vm_map_unlock(map);

	return result;
}
20995 
20996 #endif
20997 
20998 
20999 #if CONFIG_FREEZE
21000 
21001 
21002 extern struct freezer_context freezer_context_global;
21003 AbsoluteTime c_freezer_last_yield_ts = 0;
21004 
21005 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
21006 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
21007 
/*
 * vm_map_freeze:
 * Compress ("freeze") the dirty pages of "task"'s map, up to
 * "dirty_budget" pages.  When the freezer is backed by swap
 * (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE), a first "evaluation" pass sizes the
 * task's private vs. shared dirty pages and rejects the task if it has too
 * much shared memory or too low a private:shared ratio; a second pass (or
 * the only pass, with an in-memory compressor) performs the actual
 * pageout.  "eval_only" stops after the evaluation pass (unsupported in
 * the non-swap configuration).
 *
 * The map is held with the exclusive lock for the whole operation to block
 * faults and lookups mid-freeze.
 *
 * Out-parameters: *shared_count is reported in MB; *wired_count and
 * *dirty_count accumulate page counts.  NOTE(review): *purgeable_count and
 * *clean_count are zeroed here but never incremented in this function.
 *
 * Returns KERN_SUCCESS, KERN_NO_SPACE (compressor/swap exhausted),
 * KERN_INVALID_ARGUMENT (eval_only without swap), or KERN_FAILURE
 * (evaluation rejected the task); *freezer_error_code has the detail.
 */
kern_return_t
vm_map_freeze(
	task_t       task,
	unsigned int *purgeable_count,
	unsigned int *wired_count,
	unsigned int *clean_count,
	unsigned int *dirty_count,
	unsigned int dirty_budget,
	unsigned int *shared_count,
	int          *freezer_error_code,
	boolean_t    eval_only)
{
	vm_map_entry_t  entry2 = VM_MAP_ENTRY_NULL;
	kern_return_t   kr = KERN_SUCCESS;
	boolean_t       evaluation_phase = TRUE;
	vm_object_t     cur_shared_object = NULL;
	int             cur_shared_obj_ref_cnt = 0;
	unsigned int    dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;

	*purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;

	/*
	 * We need the exclusive lock here so that we can
	 * block any page faults or lookups while we are
	 * in the middle of freezing this vm map.
	 */
	vm_map_t map = task->map;

	vm_map_lock(map);

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
		if (vm_compressor_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
		}

		if (vm_swap_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
		}

		kr = KERN_NO_SPACE;
		goto done;
	}

	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
		/*
		 * In-memory compressor backing the freezer. No disk.
		 * So no need to do the evaluation phase.
		 */
		evaluation_phase = FALSE;

		if (eval_only == TRUE) {
			/*
			 * We don't support 'eval_only' mode
			 * in this non-swap config.
			 */
			*freezer_error_code = FREEZER_ERROR_GENERIC;
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);
	}
/* The pageout pass re-enters here with evaluation_phase == FALSE. */
again:

	for (entry2 = vm_map_first_entry(map);
	    entry2 != vm_map_to_entry(map);
	    entry2 = entry2->vme_next) {
		vm_object_t src_object;

		if (entry2->is_sub_map) {
			continue;
		}

		/* only anonymous, non-contiguous objects can be frozen */
		src_object = VME_OBJECT(entry2);
		if (!src_object ||
		    src_object->phys_contiguous ||
		    !src_object->internal) {
			continue;
		}

		/* If eligible, scan the entry, moving eligible pages over to our parent object */

		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
			/*
			 * We skip purgeable objects during evaluation phase only.
			 * If we decide to freeze this process, we'll explicitly
			 * purge these objects before we go around again with
			 * 'evaluation_phase' set to FALSE.
			 */

			if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
				/*
				 * We want to purge objects that may not belong to this task but are mapped
				 * in this task alone. Since we already purged this task's purgeable memory
				 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
				 * on this task's purgeable objects. Hence the check for only volatile objects.
				 */
				if (evaluation_phase == FALSE &&
				    (src_object->purgable == VM_PURGABLE_VOLATILE) &&
				    (src_object->ref_count == 1)) {
					vm_object_lock(src_object);
					vm_object_purge(src_object, 0);
					vm_object_unlock(src_object);
				}
				continue;
			}

			/*
			 * Pages belonging to this object could be swapped to disk.
			 * Make sure it's not a shared object because we could end
			 * up just bringing it back in again.
			 *
			 * We try to optimize somewhat by checking for objects that are mapped
			 * more than once within our own map. But we don't do full searches,
			 * we just look at the entries following our current entry.
			 */

			if (src_object->ref_count > 1) {
				if (src_object != cur_shared_object) {
					obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
					dirty_shared_count += obj_pages_snapshot;

					cur_shared_object = src_object;
					cur_shared_obj_ref_cnt = 1;
					continue;
				} else {
					cur_shared_obj_ref_cnt++;
					if (src_object->ref_count == cur_shared_obj_ref_cnt) {
						/*
						 * Fall through to below and treat this object as private.
						 * So deduct its pages from our shared total and add it to the
						 * private total.
						 */

						dirty_shared_count -= obj_pages_snapshot;
						dirty_private_count += obj_pages_snapshot;
					} else {
						continue;
					}
				}
			}


			if (src_object->ref_count == 1) {
				dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
			}

			if (evaluation_phase == TRUE) {
				continue;
			}
		}

		uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
		*wired_count += src_object->wired_page_count;

		if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
			if (vm_compressor_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
			}

			if (vm_swap_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
			}

			kr = KERN_NO_SPACE;
			break;
		}
		if (paged_out_count >= dirty_budget) {
			break;
		}
		dirty_budget -= paged_out_count;
	}

	/* report shared memory in MB, not pages */
	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
	if (evaluation_phase) {
		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;

		if (dirty_shared_count > shared_pages_threshold) {
			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
			kr = KERN_FAILURE;
			goto done;
		}

		if (dirty_shared_count &&
		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
			kr = KERN_FAILURE;
			goto done;
		}

		/* evaluation passed: reset counters and do the real pageout */
		evaluation_phase = FALSE;
		dirty_shared_count = dirty_private_count = 0;

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);

		if (eval_only) {
			kr = KERN_SUCCESS;
			goto done;
		}

		vm_purgeable_purge_task_owned(task);

		goto again;
	} else {
		kr = KERN_SUCCESS;
	}

done:
	vm_map_unlock(map);

	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
		vm_object_compressed_freezer_done();
	}
	return kr;
}
21227 
21228 #endif
21229 
21230 /*
21231  * vm_map_entry_should_cow_for_true_share:
21232  *
21233  * Determines if the map entry should be clipped and setup for copy-on-write
21234  * to avoid applying "true_share" to a large VM object when only a subset is
21235  * targeted.
21236  *
21237  * For now, we target only the map entries created for the Objective C
21238  * Garbage Collector, which initially have the following properties:
21239  *	- alias == VM_MEMORY_MALLOC
21240  *      - wired_count == 0
21241  *      - !needs_copy
21242  * and a VM object with:
21243  *      - internal
21244  *      - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
21245  *      - !true_share
21246  *      - vo_size == ANON_CHUNK_SIZE
21247  *
21248  * Only non-kernel map entries.
21249  */
21250 boolean_t
vm_map_entry_should_cow_for_true_share(vm_map_entry_t entry)21251 vm_map_entry_should_cow_for_true_share(
21252 	vm_map_entry_t  entry)
21253 {
21254 	vm_object_t     object;
21255 
21256 	if (entry->is_sub_map) {
21257 		/* entry does not point at a VM object */
21258 		return FALSE;
21259 	}
21260 
21261 	if (entry->needs_copy) {
21262 		/* already set for copy_on_write: done! */
21263 		return FALSE;
21264 	}
21265 
21266 	if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
21267 	    VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
21268 		/* not a malloc heap or Obj-C Garbage Collector heap */
21269 		return FALSE;
21270 	}
21271 
21272 	if (entry->wired_count) {
21273 		/* wired: can't change the map entry... */
21274 		vm_counters.should_cow_but_wired++;
21275 		return FALSE;
21276 	}
21277 
21278 	object = VME_OBJECT(entry);
21279 
21280 	if (object == VM_OBJECT_NULL) {
21281 		/* no object yet... */
21282 		return FALSE;
21283 	}
21284 
21285 	if (!object->internal) {
21286 		/* not an internal object */
21287 		return FALSE;
21288 	}
21289 
21290 	if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
21291 		/* not the default copy strategy */
21292 		return FALSE;
21293 	}
21294 
21295 	if (object->true_share) {
21296 		/* already true_share: too late to avoid it */
21297 		return FALSE;
21298 	}
21299 
21300 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
21301 	    object->vo_size != ANON_CHUNK_SIZE) {
21302 		/* ... not an object created for the ObjC Garbage Collector */
21303 		return FALSE;
21304 	}
21305 
21306 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
21307 	    object->vo_size != 2048 * 4096) {
21308 		/* ... not a "MALLOC_SMALL" heap */
21309 		return FALSE;
21310 	}
21311 
21312 	/*
21313 	 * All the criteria match: we have a large object being targeted for "true_share".
21314 	 * To limit the adverse side-effects linked with "true_share", tell the caller to
21315 	 * try and avoid setting up the entire object for "true_share" by clipping the
21316 	 * targeted range and setting it up for copy-on-write.
21317 	 */
21318 	return TRUE;
21319 }
21320 
21321 vm_map_offset_t
vm_map_round_page_mask(vm_map_offset_t offset,vm_map_offset_t mask)21322 vm_map_round_page_mask(
21323 	vm_map_offset_t offset,
21324 	vm_map_offset_t mask)
21325 {
21326 	return VM_MAP_ROUND_PAGE(offset, mask);
21327 }
21328 
21329 vm_map_offset_t
vm_map_trunc_page_mask(vm_map_offset_t offset,vm_map_offset_t mask)21330 vm_map_trunc_page_mask(
21331 	vm_map_offset_t offset,
21332 	vm_map_offset_t mask)
21333 {
21334 	return VM_MAP_TRUNC_PAGE(offset, mask);
21335 }
21336 
21337 boolean_t
vm_map_page_aligned(vm_map_offset_t offset,vm_map_offset_t mask)21338 vm_map_page_aligned(
21339 	vm_map_offset_t offset,
21340 	vm_map_offset_t mask)
21341 {
21342 	return ((offset) & mask) == 0;
21343 }
21344 
21345 int
vm_map_page_shift(vm_map_t map)21346 vm_map_page_shift(
21347 	vm_map_t map)
21348 {
21349 	return VM_MAP_PAGE_SHIFT(map);
21350 }
21351 
21352 int
vm_map_page_size(vm_map_t map)21353 vm_map_page_size(
21354 	vm_map_t map)
21355 {
21356 	return VM_MAP_PAGE_SIZE(map);
21357 }
21358 
21359 vm_map_offset_t
vm_map_page_mask(vm_map_t map)21360 vm_map_page_mask(
21361 	vm_map_t map)
21362 {
21363 	return VM_MAP_PAGE_MASK(map);
21364 }
21365 
21366 kern_return_t
vm_map_set_page_shift(vm_map_t map,int pageshift)21367 vm_map_set_page_shift(
21368 	vm_map_t        map,
21369 	int             pageshift)
21370 {
21371 	if (map->hdr.nentries != 0) {
21372 		/* too late to change page size */
21373 		return KERN_FAILURE;
21374 	}
21375 
21376 	map->hdr.page_shift = (uint16_t)pageshift;
21377 
21378 	return KERN_SUCCESS;
21379 }
21380 
/*
 * vm_map_query_volatile:
 * Walk "map" (which the caller must have locked) and total up the virtual
 * size, resident pages, compressed pages, and pmap-resident/compressed
 * pages of its writable volatile/empty purgeable mappings.  To avoid
 * counting a split object more than once, only entries starting at object
 * offset 0 are counted.  Results are returned in bytes through the
 * out-parameters.  Always returns KERN_SUCCESS.
 */
kern_return_t
vm_map_query_volatile(
	vm_map_t        map,
	mach_vm_size_t  *volatile_virtual_size_p,
	mach_vm_size_t  *volatile_resident_size_p,
	mach_vm_size_t  *volatile_compressed_size_p,
	mach_vm_size_t  *volatile_pmap_size_p,
	mach_vm_size_t  *volatile_compressed_pmap_size_p)
{
	mach_vm_size_t  volatile_virtual_size;
	mach_vm_size_t  volatile_resident_count;
	mach_vm_size_t  volatile_compressed_count;
	mach_vm_size_t  volatile_pmap_count;
	mach_vm_size_t  volatile_compressed_pmap_count;
	mach_vm_size_t  resident_count;
	vm_map_entry_t  entry;
	vm_object_t     object;

	/* map should be locked by caller */

	volatile_virtual_size = 0;
	volatile_resident_count = 0;
	volatile_compressed_count = 0;
	volatile_pmap_count = 0;
	volatile_compressed_pmap_count = 0;

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		mach_vm_size_t  pmap_resident_bytes, pmap_compressed_bytes;

		/* only writable, purgeable-volatile/empty object mappings count */
		if (entry->is_sub_map) {
			continue;
		}
		if (!(entry->protection & VM_PROT_WRITE)) {
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			continue;
		}
		if (object->purgable != VM_PURGABLE_VOLATILE &&
		    object->purgable != VM_PURGABLE_EMPTY) {
			continue;
		}
		if (VME_OFFSET(entry)) {
			/*
			 * If the map entry has been split and the object now
			 * appears several times in the VM map, we don't want
			 * to count the object's resident_page_count more than
			 * once.  We count it only for the first one, starting
			 * at offset 0 and ignore the other VM map entries.
			 */
			continue;
		}
		/*
		 * NOTE(review): because of the "continue" above,
		 * VME_OFFSET(entry) is always 0 from here on, so the
		 * adjustment below never subtracts anything.
		 */
		resident_count = object->resident_page_count;
		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
			resident_count = 0;
		} else {
			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
		}

		volatile_virtual_size += entry->vme_end - entry->vme_start;
		volatile_resident_count += resident_count;
		if (object->pager) {
			volatile_compressed_count +=
			    vm_compressor_pager_get_count(object->pager);
		}
		pmap_compressed_bytes = 0;
		pmap_resident_bytes =
		    pmap_query_resident(map->pmap,
		    entry->vme_start,
		    entry->vme_end,
		    &pmap_compressed_bytes);
		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
		volatile_compressed_pmap_count += (pmap_compressed_bytes
		    / PAGE_SIZE);
	}

	/* map is still locked on return */

	*volatile_virtual_size_p = volatile_virtual_size;
	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;

	return KERN_SUCCESS;
}
21470 
21471 void
vm_map_sizes(vm_map_t map,vm_map_size_t * psize,vm_map_size_t * pfree,vm_map_size_t * plargest_free)21472 vm_map_sizes(vm_map_t map,
21473     vm_map_size_t * psize,
21474     vm_map_size_t * pfree,
21475     vm_map_size_t * plargest_free)
21476 {
21477 	vm_map_entry_t  entry;
21478 	vm_map_offset_t prev;
21479 	vm_map_size_t   free, total_free, largest_free;
21480 	boolean_t       end;
21481 
21482 	if (!map) {
21483 		*psize = *pfree = *plargest_free = 0;
21484 		return;
21485 	}
21486 	total_free = largest_free = 0;
21487 
21488 	vm_map_lock_read(map);
21489 	if (psize) {
21490 		*psize = map->max_offset - map->min_offset;
21491 	}
21492 
21493 	prev = map->min_offset;
21494 	for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
21495 		end = (entry == vm_map_to_entry(map));
21496 
21497 		if (end) {
21498 			free = entry->vme_end   - prev;
21499 		} else {
21500 			free = entry->vme_start - prev;
21501 		}
21502 
21503 		total_free += free;
21504 		if (free > largest_free) {
21505 			largest_free = free;
21506 		}
21507 
21508 		if (end) {
21509 			break;
21510 		}
21511 		prev = entry->vme_end;
21512 	}
21513 	vm_map_unlock_read(map);
21514 	if (pfree) {
21515 		*pfree = total_free;
21516 	}
21517 	if (plargest_free) {
21518 		*plargest_free = largest_free;
21519 	}
21520 }
21521 
#if VM_SCAN_FOR_SHADOW_CHAIN
int vm_map_shadow_max(vm_map_t map);
/*
 * vm_map_shadow_max:
 *	Walk every entry of "map" and return the length of the longest
 *	VM object shadow chain found.  Each chain is descended with
 *	hand-over-hand shared locking: the next object is locked before
 *	the current one is released.
 */
int
vm_map_shadow_max(
	vm_map_t map)
{
	int             depth, max_depth;
	vm_map_entry_t  entry;
	vm_object_t     object, next_object;

	if (map == NULL) {
		return 0;
	}

	max_depth = 0;

	vm_map_lock_read(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			/* sub-map entries have no shadow chain to measure */
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == NULL) {
			continue;
		}
		/* descend this entry's shadow chain, counting its links */
		depth = 0;
		vm_object_lock_shared(object);
		while (object->shadow != NULL) {
			next_object = object->shadow;
			/* hand-over-hand: take the shadow's lock first */
			vm_object_lock_shared(next_object);
			vm_object_unlock(object);
			object = next_object;
			depth++;
		}
		vm_object_unlock(object);
		if (depth > max_depth) {
			max_depth = depth;
		}
	}

	vm_map_unlock_read(map);

	return max_depth;
}
#endif /* VM_SCAN_FOR_SHADOW_CHAIN */
21569 
/*
 * vm_commit_pagezero_status:
 *	Advise the pmap layer of this map's minimum (lowest mappable)
 *	address, which delimits the "page zero" region at the bottom of
 *	the address space.
 */
void
vm_commit_pagezero_status(vm_map_t lmap)
{
	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
}
21575 
#if XNU_TARGET_OS_OSX
/*
 * vm_map_set_high_start:
 *	Record "high_start" in the map's vmmap_high_start field.
 *	NOTE(review): presumably a lower bound used when allocating
 *	"high" virtual addresses — confirm against the consumers of
 *	vmmap_high_start.
 */
void
vm_map_set_high_start(
	vm_map_t        map,
	vm_map_offset_t high_start)
{
	map->vmmap_high_start = high_start;
}
#endif /* XNU_TARGET_OS_OSX */
21585 
#if CODE_SIGNING_MONITOR

/*
 * vm_map_entry_cs_associate:
 *	Notify the code-signing monitor of an executable mapping (or of
 *	an executable mapping being made writable for debugging), so the
 *	relevant code signatures, JIT region or debug region can be
 *	associated with the pmap for this address range.
 *	On success the entry is marked "csm_associated" so that
 *	vm_fault() can skip its own code-signing validation for faults
 *	on this range; on a real failure, execute permissions are
 *	stripped from the entry (unless overwriting an immutable
 *	mapping).
 *	Called with "map" locked exclusively.
 */
kern_return_t
vm_map_entry_cs_associate(
	vm_map_t                map,
	vm_map_entry_t          entry,
	vm_map_kernel_flags_t   vmk_flags)
{
	vm_object_t cs_object, cs_shadow, backing_object;
	vm_object_offset_t cs_offset, backing_offset;
	void *cs_blobs;
	struct vnode *cs_vnode;
	kern_return_t cs_ret;

	/*
	 * Nothing to associate for: maps without a pmap, sub-map
	 * entries, monitor-exempt address spaces, or entries with no
	 * VM object.
	 */
	if (map->pmap == NULL ||
	    entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
	    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
	    VME_OBJECT(entry) == VM_OBJECT_NULL) {
		return KERN_SUCCESS;
	}

	if (!(entry->protection & VM_PROT_EXECUTE)) {
		/*
		 * This memory region is not executable, so the code-signing
		 * monitor would usually not care about it...
		 */
		if (vmk_flags.vmkf_remap_prot_copy &&
		    (entry->max_protection & VM_PROT_EXECUTE)) {
			/*
			 * ... except if the memory region is being remapped
			 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
			 * which is what a debugger or dtrace would be doing
			 * to prepare to modify an executable page to insert
			 * a breakpoint or activate a probe.
			 * In that case, fall through so that we can mark
			 * this region as being "debugged" and no longer
			 * strictly code-signed.
			 */
		} else {
			/*
			 * Really not executable, so no need to tell the
			 * code-signing monitor.
			 */
			return KERN_SUCCESS;
		}
	}

	vm_map_lock_assert_exclusive(map);

	/* JIT regions are associated as such, without code signatures */
	if (entry->used_for_jit) {
		cs_ret = csm_associate_jit_region(
			map->pmap,
			entry->vme_start,
			entry->vme_end - entry->vme_start);
		goto done;
	}

	/*
	 * Debugger/dtrace remapping (see above): register the range as
	 * a "debug" region and mark the entry accordingly on success.
	 */
	if (vmk_flags.vmkf_remap_prot_copy) {
		cs_ret = csm_associate_debug_region(
			map->pmap,
			entry->vme_start,
			entry->vme_end - entry->vme_start);
		if (cs_ret == KERN_SUCCESS) {
			entry->vme_xnu_user_debug = TRUE;
		}
#if DEVELOPMENT || DEBUG
		if (vm_log_xnu_user_debug) {
			printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ]  vme_xnu_user_debug=%d cs_ret %d\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
			    __FUNCTION__, __LINE__,
			    map, entry,
			    (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
			    entry->vme_xnu_user_debug,
			    cs_ret);
		}
#endif /* DEVELOPMENT || DEBUG */
		goto done;
	}

	cs_object = VME_OBJECT(entry);
	vm_object_lock_shared(cs_object);
	cs_offset = VME_OFFSET(entry);

	/* find the VM object backed by the code-signed vnode */
	for (;;) {
		/* go to the bottom of cs_object's shadow chain */
		for (;
		    cs_object->shadow != VM_OBJECT_NULL;
		    cs_object = cs_shadow) {
			cs_shadow = cs_object->shadow;
			/* accumulate the offset through each shadow level */
			cs_offset += cs_object->vo_shadow_offset;
			/* hand-over-hand locking down the chain */
			vm_object_lock_shared(cs_shadow);
			vm_object_unlock(cs_object);
		}
		if (cs_object->internal ||
		    cs_object->pager == MEMORY_OBJECT_NULL) {
			/* anonymous memory or no pager: nothing to associate */
			vm_object_unlock(cs_object);
			return KERN_SUCCESS;
		}

		cs_offset += cs_object->paging_offset;

		/*
		 * cs_object could be backed by a:
		 *      vnode_pager
		 *	apple_protect_pager
		 *      shared_region_pager
		 *	fourk_pager (multiple backing objects -> fail?)
		 * ask the pager if it has a backing VM object
		 */
		if (!memory_object_backing_object(cs_object->pager,
		    cs_offset,
		    &backing_object,
		    &backing_offset)) {
			/* no backing object: cs_object is it */
			break;
		}

		/* look down the backing object's shadow chain */
		vm_object_lock_shared(backing_object);
		vm_object_unlock(cs_object);
		cs_object = backing_object;
		cs_offset = backing_offset;
	}

	cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
	if (cs_vnode == NULL) {
		/* no vnode, no code signatures to associate */
		cs_ret = KERN_SUCCESS;
	} else {
		/* associate the vnode's code-signing blobs with this range */
		cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
		    &cs_blobs);
		assert(cs_ret == KERN_SUCCESS);
		cs_ret = cs_associate_blob_with_mapping(map->pmap,
		    entry->vme_start,
		    (entry->vme_end - entry->vme_start),
		    cs_offset,
		    cs_blobs);
	}
	vm_object_unlock(cs_object);
	cs_object = VM_OBJECT_NULL;

done:
	if (cs_ret == KERN_SUCCESS) {
		DTRACE_VM2(vm_map_entry_cs_associate_success,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end);
		if (vm_map_executable_immutable) {
			/*
			 * Prevent this executable
			 * mapping from being unmapped
			 * or modified.
			 */
			entry->vme_permanent = TRUE;
		}
		/*
		 * pmap says it will validate the
		 * code-signing validity of pages
		 * faulted in via this mapping, so
		 * this map entry should be marked so
		 * that vm_fault() bypasses code-signing
		 * validation for faults coming through
		 * this mapping.
		 */
		entry->csm_associated = TRUE;
	} else if (cs_ret == KERN_NOT_SUPPORTED) {
		/*
		 * pmap won't check the code-signing
		 * validity of pages faulted in via
		 * this mapping, so VM should keep
		 * doing it.
		 */
		DTRACE_VM3(vm_map_entry_cs_associate_off,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end,
		    int, cs_ret);
	} else {
		/*
		 * A real error: do not allow
		 * execution in this mapping.
		 */
		DTRACE_VM3(vm_map_entry_cs_associate_failure,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end,
		    int, cs_ret);
		if (vmk_flags.vmkf_overwrite_immutable) {
			/*
			 * We can get here when we remap an apple_protect pager
			 * on top of an already cs_associated executable mapping
			 * with the same code signatures, so we don't want to
			 * lose VM_PROT_EXECUTE in that case...
			 */
		} else {
			entry->protection &= ~VM_PROT_ALLEXEC;
			entry->max_protection &= ~VM_PROT_ALLEXEC;
		}
	}

	return cs_ret;
}

#endif /* CODE_SIGNING_MONITOR */
21789 
21790 /*
21791  * FORKED CORPSE FOOTPRINT
21792  *
21793  * A forked corpse gets a copy of the original VM map but its pmap is mostly
21794  * empty since it never ran and never got to fault in any pages.
21795  * Collecting footprint info (via "sysctl vm.self_region_footprint") for
21796  * a forked corpse would therefore return very little information.
21797  *
21798  * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
21799  * to vm_map_fork() to collect footprint information from the original VM map
21800  * and its pmap, and store it in the forked corpse's VM map.  That information
21801  * is stored in place of the VM map's "hole list" since we'll never need to
21802  * lookup for holes in the corpse's map.
21803  *
21804  * The corpse's footprint info looks like this:
21805  *
21806  * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
21807  * as follows:
21808  *                     +---------------------------------------+
21809  *            header-> | cf_size                               |
21810  *                     +-------------------+-------------------+
21811  *                     | cf_last_region    | cf_last_zeroes    |
21812  *                     +-------------------+-------------------+
21813  *           region1-> | cfr_vaddr                             |
21814  *                     +-------------------+-------------------+
21815  *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
21816  *                     +---------------------------------------+
21817  *                     | d4 | d5 | ...                         |
21818  *                     +---------------------------------------+
21819  *                     | ...                                   |
21820  *                     +-------------------+-------------------+
21821  *                     | dy | dz | na | na | cfr_vaddr...      | <-region2
21822  *                     +-------------------+-------------------+
21823  *                     | cfr_vaddr (ctd)   | cfr_num_pages     |
21824  *                     +---------------------------------------+
21825  *                     | d0 | d1 ...                           |
21826  *                     +---------------------------------------+
21827  *                       ...
21828  *                     +---------------------------------------+
21829  *       last region-> | cfr_vaddr                             |
21830  *                     +---------------------------------------+
21831  *                     + cfr_num_pages     | d0 | d1 | d2 | d3 |
21832  *                     +---------------------------------------+
21833  *                       ...
21834  *                     +---------------------------------------+
21835  *                     | dx | dy | dz | na | na | na | na | na |
21836  *                     +---------------------------------------+
21837  *
21838  * where:
21839  *      cf_size:	total size of the buffer (rounded to page size)
21840  *      cf_last_region:	offset in the buffer of the last "region" sub-header
21841  *	cf_last_zeroes: number of trailing "zero" dispositions at the end
21842  *			of last region
21843  *	cfr_vaddr:	virtual address of the start of the covered "region"
21844  *	cfr_num_pages:	number of pages in the covered "region"
21845  *	d*:		disposition of the page at that virtual address
21846  * Regions in the buffer are word-aligned.
21847  *
21848  * We estimate the size of the buffer based on the number of memory regions
21849  * and the virtual size of the address space.  While copying each memory region
21850  * during vm_map_fork(), we also collect the footprint info for that region
21851  * and store it in the buffer, packing it as much as possible (coalescing
21852  * contiguous memory regions to avoid having too many region headers and
21853  * avoiding long streaks of "zero" page dispositions by splitting footprint
21854  * "regions", so the number of regions in the footprint buffer might not match
21855  * the number of memory regions in the address space.
21856  *
21857  * We also have to copy the original task's "nonvolatile" ledgers since that's
21858  * part of the footprint and will need to be reported to any tool asking for
21859  * the footprint information of the forked corpse.
21860  */
21861 
/* statistics on corpse footprint collection (see collect/collect_done below) */
uint64_t vm_map_corpse_footprint_count = 0;     /* footprints collected so far */
uint64_t vm_map_corpse_footprint_size_avg = 0;  /* running average of used buffer size */
uint64_t vm_map_corpse_footprint_size_max = 0;  /* largest used buffer size seen */
uint64_t vm_map_corpse_footprint_full = 0;      /* collections that ran out of buffer space */
uint64_t vm_map_corpse_footprint_no_buf = 0;    /* collections that failed to allocate a buffer */
21867 
/*
 * Header at the start of a corpse footprint buffer; followed by a
 * sequence of word-aligned "regions" (see the layout diagram above).
 */
struct vm_map_corpse_footprint_header {
	vm_size_t       cf_size;        /* allocated buffer size */
	uint32_t        cf_last_region; /* offset of last region in buffer */
	union {
		uint32_t cfu_last_zeroes; /* during creation:
		                           * number of "zero" dispositions at
		                           * end of last region */
		uint32_t cfu_hint_region; /* during lookup:
		                           * offset of last looked up region */
#define cf_last_zeroes cfu.cfu_last_zeroes
#define cf_hint_region cfu.cfu_hint_region
	} cfu;
};
/* compact 8-bit encoding of a page disposition (see conversions below) */
typedef uint8_t cf_disp_t;
/* one contiguous run of page dispositions, starting at cfr_vaddr */
struct vm_map_corpse_footprint_region {
	vm_map_offset_t cfr_vaddr;      /* region start virtual address */
	uint32_t        cfr_num_pages;  /* number of pages in this "region" */
	cf_disp_t   cfr_disposition[0]; /* disposition of each page */
} __attribute__((packed));
21887 
21888 static cf_disp_t
vm_page_disposition_to_cf_disp(int disposition)21889 vm_page_disposition_to_cf_disp(
21890 	int disposition)
21891 {
21892 	assert(sizeof(cf_disp_t) == 1);
21893 	/* relocate bits that don't fit in a "uint8_t" */
21894 	if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
21895 		disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
21896 	}
21897 	/* cast gets rid of extra bits */
21898 	return (cf_disp_t) disposition;
21899 }
21900 
21901 static int
vm_page_cf_disp_to_disposition(cf_disp_t cf_disp)21902 vm_page_cf_disp_to_disposition(
21903 	cf_disp_t cf_disp)
21904 {
21905 	int disposition;
21906 
21907 	assert(sizeof(cf_disp_t) == 1);
21908 	disposition = (int) cf_disp;
21909 	/* move relocated bits back in place */
21910 	if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
21911 		disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
21912 		disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
21913 	}
21914 	return disposition;
21915 }
21916 
21917 /*
21918  * vm_map_corpse_footprint_new_region:
21919  *      closes the current footprint "region" and creates a new one
21920  *
21921  * Returns NULL if there's not enough space in the buffer for a new region.
21922  */
21923 static struct vm_map_corpse_footprint_region *
vm_map_corpse_footprint_new_region(struct vm_map_corpse_footprint_header * footprint_header)21924 vm_map_corpse_footprint_new_region(
21925 	struct vm_map_corpse_footprint_header *footprint_header)
21926 {
21927 	uintptr_t       footprint_edge;
21928 	uint32_t        new_region_offset;
21929 	struct vm_map_corpse_footprint_region *footprint_region;
21930 	struct vm_map_corpse_footprint_region *new_footprint_region;
21931 
21932 	footprint_edge = ((uintptr_t)footprint_header +
21933 	    footprint_header->cf_size);
21934 	footprint_region = ((struct vm_map_corpse_footprint_region *)
21935 	    ((char *)footprint_header +
21936 	    footprint_header->cf_last_region));
21937 	assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
21938 	    footprint_edge);
21939 
21940 	/* get rid of trailing zeroes in the last region */
21941 	assert(footprint_region->cfr_num_pages >=
21942 	    footprint_header->cf_last_zeroes);
21943 	footprint_region->cfr_num_pages -=
21944 	    footprint_header->cf_last_zeroes;
21945 	footprint_header->cf_last_zeroes = 0;
21946 
21947 	/* reuse this region if it's now empty */
21948 	if (footprint_region->cfr_num_pages == 0) {
21949 		return footprint_region;
21950 	}
21951 
21952 	/* compute offset of new region */
21953 	new_region_offset = footprint_header->cf_last_region;
21954 	new_region_offset += sizeof(*footprint_region);
21955 	new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
21956 	new_region_offset = roundup(new_region_offset, sizeof(int));
21957 
21958 	/* check if we're going over the edge */
21959 	if (((uintptr_t)footprint_header +
21960 	    new_region_offset +
21961 	    sizeof(*footprint_region)) >=
21962 	    footprint_edge) {
21963 		/* over the edge: no new region */
21964 		return NULL;
21965 	}
21966 
21967 	/* adjust offset of last region in header */
21968 	footprint_header->cf_last_region = new_region_offset;
21969 
21970 	new_footprint_region = (struct vm_map_corpse_footprint_region *)
21971 	    ((char *)footprint_header +
21972 	    footprint_header->cf_last_region);
21973 	new_footprint_region->cfr_vaddr = 0;
21974 	new_footprint_region->cfr_num_pages = 0;
21975 	/* caller needs to initialize new region */
21976 
21977 	return new_footprint_region;
21978 }
21979 
21980 /*
21981  * vm_map_corpse_footprint_collect:
21982  *	collect footprint information for "old_entry" in "old_map" and
21983  *	stores it in "new_map"'s vmmap_footprint_info.
21984  */
21985 kern_return_t
vm_map_corpse_footprint_collect(vm_map_t old_map,vm_map_entry_t old_entry,vm_map_t new_map)21986 vm_map_corpse_footprint_collect(
21987 	vm_map_t        old_map,
21988 	vm_map_entry_t  old_entry,
21989 	vm_map_t        new_map)
21990 {
21991 	vm_map_offset_t va;
21992 	kern_return_t   kr;
21993 	struct vm_map_corpse_footprint_header *footprint_header;
21994 	struct vm_map_corpse_footprint_region *footprint_region;
21995 	struct vm_map_corpse_footprint_region *new_footprint_region;
21996 	cf_disp_t       *next_disp_p;
21997 	uintptr_t       footprint_edge;
21998 	uint32_t        num_pages_tmp;
21999 	int             effective_page_size;
22000 
22001 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
22002 
22003 	va = old_entry->vme_start;
22004 
22005 	vm_map_lock_assert_exclusive(old_map);
22006 	vm_map_lock_assert_exclusive(new_map);
22007 
22008 	assert(new_map->has_corpse_footprint);
22009 	assert(!old_map->has_corpse_footprint);
22010 	if (!new_map->has_corpse_footprint ||
22011 	    old_map->has_corpse_footprint) {
22012 		/*
22013 		 * This can only transfer footprint info from a
22014 		 * map with a live pmap to a map with a corpse footprint.
22015 		 */
22016 		return KERN_NOT_SUPPORTED;
22017 	}
22018 
22019 	if (new_map->vmmap_corpse_footprint == NULL) {
22020 		vm_offset_t     buf;
22021 		vm_size_t       buf_size;
22022 
22023 		buf = 0;
22024 		buf_size = (sizeof(*footprint_header) +
22025 		    (old_map->hdr.nentries
22026 		    *
22027 		    (sizeof(*footprint_region) +
22028 		    +3))            /* potential alignment for each region */
22029 		    +
22030 		    ((old_map->size / effective_page_size)
22031 		    *
22032 		    sizeof(cf_disp_t)));      /* disposition for each page */
22033 //		printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
22034 		buf_size = round_page(buf_size);
22035 
22036 		/* limit buffer to 1 page to validate overflow detection */
22037 //		buf_size = PAGE_SIZE;
22038 
22039 		/* limit size to a somewhat sane amount */
22040 #if XNU_TARGET_OS_OSX
22041 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (8*1024*1024)   /* 8MB */
22042 #else /* XNU_TARGET_OS_OSX */
22043 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (256*1024)      /* 256KB */
22044 #endif /* XNU_TARGET_OS_OSX */
22045 		if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
22046 			buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
22047 		}
22048 
22049 		/*
22050 		 * Allocate the pageable buffer (with a trailing guard page).
22051 		 * It will be zero-filled on demand.
22052 		 */
22053 		kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
22054 		    KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
22055 		    VM_KERN_MEMORY_DIAG);
22056 		if (kr != KERN_SUCCESS) {
22057 			vm_map_corpse_footprint_no_buf++;
22058 			return kr;
22059 		}
22060 
22061 		/* initialize header and 1st region */
22062 		footprint_header = (struct vm_map_corpse_footprint_header *)buf;
22063 		new_map->vmmap_corpse_footprint = footprint_header;
22064 
22065 		footprint_header->cf_size = buf_size;
22066 		footprint_header->cf_last_region =
22067 		    sizeof(*footprint_header);
22068 		footprint_header->cf_last_zeroes = 0;
22069 
22070 		footprint_region = (struct vm_map_corpse_footprint_region *)
22071 		    ((char *)footprint_header +
22072 		    footprint_header->cf_last_region);
22073 		footprint_region->cfr_vaddr = 0;
22074 		footprint_region->cfr_num_pages = 0;
22075 	} else {
22076 		/* retrieve header and last region */
22077 		footprint_header = (struct vm_map_corpse_footprint_header *)
22078 		    new_map->vmmap_corpse_footprint;
22079 		footprint_region = (struct vm_map_corpse_footprint_region *)
22080 		    ((char *)footprint_header +
22081 		    footprint_header->cf_last_region);
22082 	}
22083 	footprint_edge = ((uintptr_t)footprint_header +
22084 	    footprint_header->cf_size);
22085 
22086 	if ((footprint_region->cfr_vaddr +
22087 	    (((vm_map_offset_t)footprint_region->cfr_num_pages) *
22088 	    effective_page_size))
22089 	    != old_entry->vme_start) {
22090 		uint64_t num_pages_delta, num_pages_delta_size;
22091 		uint32_t region_offset_delta_size;
22092 
22093 		/*
22094 		 * Not the next contiguous virtual address:
22095 		 * start a new region or store "zero" dispositions for
22096 		 * the missing pages?
22097 		 */
22098 		/* size of gap in actual page dispositions */
22099 		num_pages_delta = ((old_entry->vme_start -
22100 		    footprint_region->cfr_vaddr) / effective_page_size)
22101 		    - footprint_region->cfr_num_pages;
22102 		num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
22103 		/* size of gap as a new footprint region header */
22104 		region_offset_delta_size =
22105 		    (sizeof(*footprint_region) +
22106 		    roundup(((footprint_region->cfr_num_pages -
22107 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
22108 		    sizeof(int)) -
22109 		    ((footprint_region->cfr_num_pages -
22110 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
22111 //		printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
22112 		if (region_offset_delta_size < num_pages_delta_size ||
22113 		    os_add3_overflow(footprint_region->cfr_num_pages,
22114 		    (uint32_t) num_pages_delta,
22115 		    1,
22116 		    &num_pages_tmp)) {
22117 			/*
22118 			 * Storing data for this gap would take more space
22119 			 * than inserting a new footprint region header:
22120 			 * let's start a new region and save space. If it's a
22121 			 * tie, let's avoid using a new region, since that
22122 			 * would require more region hops to find the right
22123 			 * range during lookups.
22124 			 *
22125 			 * If the current region's cfr_num_pages would overflow
22126 			 * if we added "zero" page dispositions for the gap,
22127 			 * no choice but to start a new region.
22128 			 */
22129 //			printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
22130 			new_footprint_region =
22131 			    vm_map_corpse_footprint_new_region(footprint_header);
22132 			/* check that we're not going over the edge */
22133 			if (new_footprint_region == NULL) {
22134 				goto over_the_edge;
22135 			}
22136 			footprint_region = new_footprint_region;
22137 			/* initialize new region as empty */
22138 			footprint_region->cfr_vaddr = old_entry->vme_start;
22139 			footprint_region->cfr_num_pages = 0;
22140 		} else {
22141 			/*
22142 			 * Store "zero" page dispositions for the missing
22143 			 * pages.
22144 			 */
22145 //			printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
22146 			for (; num_pages_delta > 0; num_pages_delta--) {
22147 				next_disp_p = (cf_disp_t *)
22148 				    ((uintptr_t) footprint_region +
22149 				    sizeof(*footprint_region));
22150 				next_disp_p += footprint_region->cfr_num_pages;
22151 				/* check that we're not going over the edge */
22152 				if ((uintptr_t)next_disp_p >= footprint_edge) {
22153 					goto over_the_edge;
22154 				}
22155 				/* store "zero" disposition for this gap page */
22156 				footprint_region->cfr_num_pages++;
22157 				*next_disp_p = (cf_disp_t) 0;
22158 				footprint_header->cf_last_zeroes++;
22159 			}
22160 		}
22161 	}
22162 
22163 	for (va = old_entry->vme_start;
22164 	    va < old_entry->vme_end;
22165 	    va += effective_page_size) {
22166 		int             disposition;
22167 		cf_disp_t       cf_disp;
22168 
22169 		vm_map_footprint_query_page_info(old_map,
22170 		    old_entry,
22171 		    va,
22172 		    &disposition);
22173 		cf_disp = vm_page_disposition_to_cf_disp(disposition);
22174 
22175 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
22176 
22177 		if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
22178 			/*
22179 			 * Ignore "zero" dispositions at start of
22180 			 * region: just move start of region.
22181 			 */
22182 			footprint_region->cfr_vaddr += effective_page_size;
22183 			continue;
22184 		}
22185 
22186 		/* would region's cfr_num_pages overflow? */
22187 		if (os_add_overflow(footprint_region->cfr_num_pages, 1,
22188 		    &num_pages_tmp)) {
22189 			/* overflow: create a new region */
22190 			new_footprint_region =
22191 			    vm_map_corpse_footprint_new_region(
22192 				footprint_header);
22193 			if (new_footprint_region == NULL) {
22194 				goto over_the_edge;
22195 			}
22196 			footprint_region = new_footprint_region;
22197 			footprint_region->cfr_vaddr = va;
22198 			footprint_region->cfr_num_pages = 0;
22199 		}
22200 
22201 		next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
22202 		    sizeof(*footprint_region));
22203 		next_disp_p += footprint_region->cfr_num_pages;
22204 		/* check that we're not going over the edge */
22205 		if ((uintptr_t)next_disp_p >= footprint_edge) {
22206 			goto over_the_edge;
22207 		}
22208 		/* store this dispostion */
22209 		*next_disp_p = cf_disp;
22210 		footprint_region->cfr_num_pages++;
22211 
22212 		if (cf_disp != 0) {
22213 			/* non-zero disp: break the current zero streak */
22214 			footprint_header->cf_last_zeroes = 0;
22215 			/* done */
22216 			continue;
22217 		}
22218 
22219 		/* zero disp: add to the current streak of zeroes */
22220 		footprint_header->cf_last_zeroes++;
22221 		if ((footprint_header->cf_last_zeroes +
22222 		    roundup(((footprint_region->cfr_num_pages -
22223 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
22224 		    (sizeof(int) - 1),
22225 		    sizeof(int))) <
22226 		    (sizeof(*footprint_header))) {
22227 			/*
22228 			 * There are not enough trailing "zero" dispositions
22229 			 * (+ the extra padding we would need for the previous
22230 			 * region); creating a new region would not save space
22231 			 * at this point, so let's keep this "zero" disposition
22232 			 * in this region and reconsider later.
22233 			 */
22234 			continue;
22235 		}
22236 		/*
22237 		 * Create a new region to avoid having too many consecutive
22238 		 * "zero" dispositions.
22239 		 */
22240 		new_footprint_region =
22241 		    vm_map_corpse_footprint_new_region(footprint_header);
22242 		if (new_footprint_region == NULL) {
22243 			goto over_the_edge;
22244 		}
22245 		footprint_region = new_footprint_region;
22246 		/* initialize the new region as empty ... */
22247 		footprint_region->cfr_num_pages = 0;
22248 		/* ... and skip this "zero" disp */
22249 		footprint_region->cfr_vaddr = va + effective_page_size;
22250 	}
22251 
22252 	return KERN_SUCCESS;
22253 
22254 over_the_edge:
22255 //	printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
22256 	vm_map_corpse_footprint_full++;
22257 	return KERN_RESOURCE_SHORTAGE;
22258 }
22259 
22260 /*
22261  * vm_map_corpse_footprint_collect_done:
22262  *	completes the footprint collection by getting rid of any remaining
22263  *	trailing "zero" dispositions and trimming the unused part of the
22264  *	kernel buffer
22265  */
22266 void
vm_map_corpse_footprint_collect_done(vm_map_t new_map)22267 vm_map_corpse_footprint_collect_done(
22268 	vm_map_t        new_map)
22269 {
22270 	struct vm_map_corpse_footprint_header *footprint_header;
22271 	struct vm_map_corpse_footprint_region *footprint_region;
22272 	vm_size_t       buf_size, actual_size;
22273 	kern_return_t   kr;
22274 
22275 	assert(new_map->has_corpse_footprint);
22276 	if (!new_map->has_corpse_footprint ||
22277 	    new_map->vmmap_corpse_footprint == NULL) {
22278 		return;
22279 	}
22280 
22281 	footprint_header = (struct vm_map_corpse_footprint_header *)
22282 	    new_map->vmmap_corpse_footprint;
22283 	buf_size = footprint_header->cf_size;
22284 
22285 	footprint_region = (struct vm_map_corpse_footprint_region *)
22286 	    ((char *)footprint_header +
22287 	    footprint_header->cf_last_region);
22288 
22289 	/* get rid of trailing zeroes in last region */
22290 	assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
22291 	footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
22292 	footprint_header->cf_last_zeroes = 0;
22293 
22294 	actual_size = (vm_size_t)(footprint_header->cf_last_region +
22295 	    sizeof(*footprint_region) +
22296 	    (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));
22297 
22298 //	printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
22299 	vm_map_corpse_footprint_size_avg =
22300 	    (((vm_map_corpse_footprint_size_avg *
22301 	    vm_map_corpse_footprint_count) +
22302 	    actual_size) /
22303 	    (vm_map_corpse_footprint_count + 1));
22304 	vm_map_corpse_footprint_count++;
22305 	if (actual_size > vm_map_corpse_footprint_size_max) {
22306 		vm_map_corpse_footprint_size_max = actual_size;
22307 	}
22308 
22309 	actual_size = round_page(actual_size);
22310 	if (buf_size > actual_size) {
22311 		kr = vm_deallocate(kernel_map,
22312 		    ((vm_address_t)footprint_header +
22313 		    actual_size +
22314 		    PAGE_SIZE),                 /* trailing guard page */
22315 		    (buf_size - actual_size));
22316 		assertf(kr == KERN_SUCCESS,
22317 		    "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
22318 		    footprint_header,
22319 		    (uint64_t) buf_size,
22320 		    (uint64_t) actual_size,
22321 		    kr);
22322 		kr = vm_protect(kernel_map,
22323 		    ((vm_address_t)footprint_header +
22324 		    actual_size),
22325 		    PAGE_SIZE,
22326 		    FALSE,             /* set_maximum */
22327 		    VM_PROT_NONE);
22328 		assertf(kr == KERN_SUCCESS,
22329 		    "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
22330 		    footprint_header,
22331 		    (uint64_t) buf_size,
22332 		    (uint64_t) actual_size,
22333 		    kr);
22334 	}
22335 
22336 	footprint_header->cf_size = actual_size;
22337 }
22338 
22339 /*
22340  * vm_map_corpse_footprint_query_page_info:
22341  *	retrieves the disposition of the page at virtual address "vaddr"
22342  *	in the forked corpse's VM map
22343  *
22344  * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
22345  */
kern_return_t
vm_map_corpse_footprint_query_page_info(
	vm_map_t        map,
	vm_map_offset_t va,
	int             *disposition_p)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	uint32_t        footprint_region_offset;
	vm_map_offset_t region_start, region_end;
	int             disp_idx;
	kern_return_t   kr;
	int             effective_page_size;
	cf_disp_t       cf_disp;

	if (!map->has_corpse_footprint) {
		/* not a forked corpse with a collected footprint */
		*disposition_p = 0;
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	footprint_header = map->vmmap_corpse_footprint;
	if (footprint_header == NULL) {
		*disposition_p = 0;
//		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	/* start looking at the hint ("cf_hint_region") */
	footprint_region_offset = footprint_header->cf_hint_region;

	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

lookup_again:
	if (footprint_region_offset < sizeof(*footprint_header)) {
		/* hint too low: start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	if (footprint_region_offset >= footprint_header->cf_last_region) {
		/* hint too high: re-start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header + footprint_region_offset);
	region_start = footprint_region->cfr_vaddr;
	region_end = (region_start +
	    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
	    effective_page_size));
	if (va < region_start &&
	    footprint_region_offset != sizeof(*footprint_header)) {
		/* our range starts before the hint region */

		/* reset the hint (in a racy way...) */
		footprint_header->cf_hint_region = sizeof(*footprint_header);
		/* lookup "va" again from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
		goto lookup_again;
	}

	/* scan forward, region by region, until one covers "va" or we run out */
	while (va >= region_end) {
		if (footprint_region_offset >= footprint_header->cf_last_region) {
			break;
		}
		/* skip the region's header */
		footprint_region_offset += sizeof(*footprint_region);
		/* skip the region's page dispositions */
		footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
		/* align to next word boundary */
		footprint_region_offset =
		    roundup(footprint_region_offset,
		    sizeof(int));
		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header + footprint_region_offset);
		region_start = footprint_region->cfr_vaddr;
		region_end = (region_start +
		    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
		    effective_page_size));
	}
	if (va < region_start || va >= region_end) {
		/* page not found */
		*disposition_p = 0;
//		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_SUCCESS;
		goto done;
	}

	/* "va" found: set the lookup hint for next lookup (in a racy way...) */
	footprint_header->cf_hint_region = footprint_region_offset;

	/* get page disposition for "va" in this region */
	disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
	cf_disp = footprint_region->cfr_disposition[disp_idx];
	/* translate the compact footprint disposition to a VM page disposition */
	*disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
	kr = KERN_SUCCESS;
done:
//	if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
	/* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
	DTRACE_VM4(footprint_query_page_info,
	    vm_map_t, map,
	    vm_map_offset_t, va,
	    int, *disposition_p,
	    kern_return_t, kr);

	return kr;
}
22452 
22453 void
vm_map_corpse_footprint_destroy(vm_map_t map)22454 vm_map_corpse_footprint_destroy(
22455 	vm_map_t        map)
22456 {
22457 	if (map->has_corpse_footprint &&
22458 	    map->vmmap_corpse_footprint != 0) {
22459 		struct vm_map_corpse_footprint_header *footprint_header;
22460 		vm_size_t buf_size;
22461 		kern_return_t kr;
22462 
22463 		footprint_header = map->vmmap_corpse_footprint;
22464 		buf_size = footprint_header->cf_size;
22465 		kr = vm_deallocate(kernel_map,
22466 		    (vm_offset_t) map->vmmap_corpse_footprint,
22467 		    ((vm_size_t) buf_size
22468 		    + PAGE_SIZE));                 /* trailing guard page */
22469 		assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
22470 		map->vmmap_corpse_footprint = 0;
22471 		map->has_corpse_footprint = FALSE;
22472 	}
22473 }
22474 
22475 /*
22476  * vm_map_copy_footprint_ledgers:
22477  *	copies any ledger that's relevant to the memory footprint of "old_task"
22478  *	into the forked corpse's task ("new_task")
22479  */
22480 void
vm_map_copy_footprint_ledgers(task_t old_task,task_t new_task)22481 vm_map_copy_footprint_ledgers(
22482 	task_t  old_task,
22483 	task_t  new_task)
22484 {
22485 	vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
22486 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
22487 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
22488 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
22489 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
22490 	vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
22491 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
22492 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
22493 	vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
22494 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
22495 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
22496 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
22497 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
22498 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
22499 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
22500 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
22501 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
22502 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
22503 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
22504 	vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
22505 }
22506 
22507 /*
22508  * vm_map_copy_ledger:
22509  *	copy a single ledger from "old_task" to "new_task"
22510  */
22511 void
vm_map_copy_ledger(task_t old_task,task_t new_task,int ledger_entry)22512 vm_map_copy_ledger(
22513 	task_t  old_task,
22514 	task_t  new_task,
22515 	int     ledger_entry)
22516 {
22517 	ledger_amount_t old_balance, new_balance, delta;
22518 
22519 	assert(new_task->map->has_corpse_footprint);
22520 	if (!new_task->map->has_corpse_footprint) {
22521 		return;
22522 	}
22523 
22524 	/* turn off sanity checks for the ledger we're about to mess with */
22525 	ledger_disable_panic_on_negative(new_task->ledger,
22526 	    ledger_entry);
22527 
22528 	/* adjust "new_task" to match "old_task" */
22529 	ledger_get_balance(old_task->ledger,
22530 	    ledger_entry,
22531 	    &old_balance);
22532 	ledger_get_balance(new_task->ledger,
22533 	    ledger_entry,
22534 	    &new_balance);
22535 	if (new_balance == old_balance) {
22536 		/* new == old: done */
22537 	} else if (new_balance > old_balance) {
22538 		/* new > old ==> new -= new - old */
22539 		delta = new_balance - old_balance;
22540 		ledger_debit(new_task->ledger,
22541 		    ledger_entry,
22542 		    delta);
22543 	} else {
22544 		/* new < old ==> new += old - new */
22545 		delta = old_balance - new_balance;
22546 		ledger_credit(new_task->ledger,
22547 		    ledger_entry,
22548 		    delta);
22549 	}
22550 }
22551 
22552 /*
22553  * vm_map_get_pmap:
22554  * returns the pmap associated with the vm_map
22555  */
pmap_t
vm_map_get_pmap(vm_map_t map)
{
	/* thin accessor around the vm_map_pmap() macro */
	return vm_map_pmap(map);
}
22561 
22562 #if CONFIG_MAP_RANGES
22563 static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];
22564 
22565 /*
22566  * vm_map_range_map_init:
22567  *  initializes the VM range ID map to enable index lookup
22568  *  of user VM ranges based on VM tag from userspace.
22569  */
22570 static void
vm_map_range_map_init(void)22571 vm_map_range_map_init(void)
22572 {
22573 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC);
22574 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
22575 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
22576 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
22577 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
22578 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
22579 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
22580 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
22581 }
22582 
22583 /*
22584  * vm_map_range_configure:
22585  *	configures the user vm_map ranges by increasing the maximum VA range of
22586  *  the map and carving out a range at the end of VA space (searching backwards
22587  *  in the newly expanded map).
22588  */
kern_return_t
vm_map_range_configure(vm_map_t map)
{
	vm_map_size_t           addr_space_size;
	vm_map_offset_t         start, end, saved_max, random_addr;
	kern_return_t           kr;

	/* Should not be applying ranges to kernel map or kernel map submaps */
	assert(map != kernel_map);
	assert(vm_map_pmap(map) != kernel_pmap);

	/* save the existing max offset */
	vm_map_lock_read(map);
	saved_max = vm_map_max(map);
	vm_map_unlock_read(map);

	/*
	 * Check that we're not already jumbo'd. If so we cannot guarantee that
	 * we can set up the ranges safely without interfering with the existing
	 * map.
	 */
	if (saved_max > vm_compute_max_offset(vm_map_is_64bit(map))) {
		return KERN_NO_SPACE;
	}

	/* expand the default VM space to the largest possible address */
	vm_map_set_jumbo(map);

	vm_map_lock(map);
	/* amount of VA space gained by going jumbo */
	addr_space_size = vm_map_max(map) - saved_max;

	if (addr_space_size <= VM_MAP_USER_RANGE_MAX) {
		/* not enough new VA space to carve out a full heap range */
		vm_map_unlock(map);
		return KERN_NO_SPACE;
	}

	/* pick a page-aligned random slide within the leftover slack */
	addr_space_size -= VM_MAP_USER_RANGE_MAX;
	random_addr = (vm_map_offset_t)random();
	random_addr <<= VM_MAP_PAGE_SHIFT(map);
	random_addr %= addr_space_size;

	/*
	 * round off the start so we begin on a L2 TT boundary and ensure we have
	 * at least a ARM_TT_L2_SIZE sized hole between existing map range and
	 * new range(s).
	 */
	start = vm_map_round_page(saved_max + random_addr + 1, ARM_TT_L2_OFFMASK);
	end = MIN(vm_map_max(map), start + VM_MAP_USER_RANGE_MAX);
	assert(start > saved_max);
	assert(end <= vm_map_max(map));

	/* default range covers the "normal" heap range */
	map->user_range[UMEM_RANGE_ID_DEFAULT].min_address = vm_map_min(map);
	map->user_range[UMEM_RANGE_ID_DEFAULT].max_address = saved_max;

	/* heap range covers the new extended range */
	map->user_range[UMEM_RANGE_ID_HEAP].min_address = start;
	map->user_range[UMEM_RANGE_ID_HEAP].max_address = end;

	vm_map_unlock(map);

	/*
	 * Poke holes so that ASAN or people listing regions
	 * do not think this space is free.
	 */

	/* permanent VM_PROT_NONE mapping between old max and the heap range */
	if (start != saved_max) {
		kr = vm_map_enter(map, &saved_max, start - saved_max,
		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
		assert(kr == KERN_SUCCESS);
	}

	/* permanent VM_PROT_NONE mapping between the heap range and new max */
	if (end != vm_map_max(map)) {
		kr = vm_map_enter(map, &end, vm_map_max(map) - end,
		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
		assert(kr == KERN_SUCCESS);
	}

	vm_map_lock(map);

	/* only publish the ranges once they are fully configured */
	map->uses_user_ranges = true;

	vm_map_unlock(map);

	return KERN_SUCCESS;
}
22677 
22678 /*
22679  * vm_map_range_fork:
22680  *	clones the array of ranges from old_map to new_map in support
22681  *  of a VM map fork.
22682  */
22683 void
vm_map_range_fork(vm_map_t new_map,vm_map_t old_map)22684 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
22685 {
22686 	if (!old_map->uses_user_ranges) {
22687 		/* nothing to do */
22688 		return;
22689 	}
22690 
22691 	for (size_t i = 0; i < UMEM_RANGE_COUNT; i++) {
22692 		new_map->user_range[i] = old_map->user_range[i];
22693 	}
22694 
22695 	new_map->uses_user_ranges = true;
22696 }
22697 
22698 /*
22699  * vm_map_get_user_range:
22700  *	copy the VM user range for the given VM map and range ID.
22701  */
22702 kern_return_t
vm_map_get_user_range(vm_map_t map,vm_map_range_id_t range_id,mach_vm_range_t range)22703 vm_map_get_user_range(
22704 	vm_map_t                map,
22705 	vm_map_range_id_t       range_id,
22706 	mach_vm_range_t         range)
22707 {
22708 	if (map == NULL ||
22709 	    !map->uses_user_ranges ||
22710 	    range_id > UMEM_RANGE_ID_MAX ||
22711 	    range == NULL) {
22712 		return KERN_INVALID_ARGUMENT;
22713 	}
22714 
22715 	*range = map->user_range[range_id];
22716 	return KERN_SUCCESS;
22717 }
22718 
22719 static vm_map_range_id_t
vm_map_user_range_resolve(vm_map_t map,mach_vm_address_t addr,mach_vm_size_t size,mach_vm_range_t range)22720 vm_map_user_range_resolve(
22721 	vm_map_t                map,
22722 	mach_vm_address_t       addr,
22723 	mach_vm_size_t          size,
22724 	mach_vm_range_t         range)
22725 {
22726 	vm_map_lock_assert_held(map);
22727 
22728 	for (vm_map_range_id_t i = 0; i < UMEM_RANGE_COUNT; i++) {
22729 		mach_vm_range_t r = &map->user_range[i];
22730 
22731 		if (mach_vm_range_contains(r, addr, size)) {
22732 			if (range) {
22733 				*range = *r;
22734 			}
22735 			return i;
22736 		}
22737 	}
22738 
22739 	if (range) {
22740 		range->min_address = range->max_address = 0;
22741 	}
22742 	return UMEM_RANGE_ID_DEFAULT;
22743 }
22744 
22745 #endif /* CONFIG_MAP_RANGES */
22746 
void
vm_map_kernel_flags_update_range_id(vm_map_kernel_flags_t *vmkf, vm_map_t map)
{
	if (map == kernel_map) {
		/* kernel allocations with no explicit range go to the data range */
		if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
			vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
		}
#if CONFIG_MAP_RANGES
	} else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
	    vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT &&
	    bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
		/* user mappings with a malloc-family VM tag are steered to the heap range */
		vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
#endif /* CONFIG_MAP_RANGES */
	}
}
22762 
22763 /*
22764  * vm_map_entry_has_device_pager:
22765  * Check if the vm map entry specified by the virtual address has a device pager.
22766  * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
22767  */
boolean_t
vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
{
	vm_map_entry_t entry;
	vm_object_t object;
	boolean_t result;

	if (map == NULL) {
		return FALSE;
	}

	vm_map_lock(map);
	/* descend through any submaps until we find a terminal entry */
	while (TRUE) {
		if (!vm_map_lookup_entry(map, vaddr, &entry)) {
			/* no entry covers "vaddr" */
			result = FALSE;
			break;
		}
		if (entry->is_sub_map) {
			// Check the submap
			vm_map_t submap = VME_SUBMAP(entry);
			assert(submap != NULL);
			/* lock hand-over: take submap lock before releasing parent */
			vm_map_lock(submap);
			vm_map_unlock(map);
			map = submap;
			continue;
		}
		object = VME_OBJECT(entry);
		if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
			result = TRUE;
			break;
		}
		result = FALSE;
		break;
	}

	/* "map" is the innermost map we ended up holding locked */
	vm_map_unlock(map);
	return result;
}
22806 
22807 
22808 #if MACH_ASSERT
22809 
/* panic policy knobs for imbalanced pmap ledgers (see vm_map_pmap_check_ledgers) */
extern int pmap_ledgers_panic;
extern int pmap_ledgers_panic_leeway;

/*
 * Declares the drift counters tracked for one ledger: how many times it was
 * over/under zero, plus the total and worst-case amount in each direction.
 */
#define LEDGER_DRIFT(__LEDGER)                    \
	int             __LEDGER##_over;          \
	ledger_amount_t __LEDGER##_over_total;    \
	ledger_amount_t __LEDGER##_over_max;      \
	int             __LEDGER##_under;         \
	ledger_amount_t __LEDGER##_under_total;   \
	ledger_amount_t __LEDGER##_under_max

/* global accumulator of ledger imbalances observed when pmaps are checked */
struct {
	uint64_t        num_pmaps_checked;

	LEDGER_DRIFT(phys_footprint);
	LEDGER_DRIFT(internal);
	LEDGER_DRIFT(internal_compressed);
	LEDGER_DRIFT(external);
	LEDGER_DRIFT(reusable);
	LEDGER_DRIFT(iokit_mapped);
	LEDGER_DRIFT(alternate_accounting);
	LEDGER_DRIFT(alternate_accounting_compressed);
	LEDGER_DRIFT(page_table);
	LEDGER_DRIFT(purgeable_volatile);
	LEDGER_DRIFT(purgeable_nonvolatile);
	LEDGER_DRIFT(purgeable_volatile_compressed);
	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
	LEDGER_DRIFT(tagged_nofootprint);
	LEDGER_DRIFT(tagged_footprint);
	LEDGER_DRIFT(tagged_nofootprint_compressed);
	LEDGER_DRIFT(tagged_footprint_compressed);
	LEDGER_DRIFT(network_volatile);
	LEDGER_DRIFT(network_nonvolatile);
	LEDGER_DRIFT(network_volatile_compressed);
	LEDGER_DRIFT(network_nonvolatile_compressed);
	LEDGER_DRIFT(media_nofootprint);
	LEDGER_DRIFT(media_footprint);
	LEDGER_DRIFT(media_nofootprint_compressed);
	LEDGER_DRIFT(media_footprint_compressed);
	LEDGER_DRIFT(graphics_nofootprint);
	LEDGER_DRIFT(graphics_footprint);
	LEDGER_DRIFT(graphics_nofootprint_compressed);
	LEDGER_DRIFT(graphics_footprint_compressed);
	LEDGER_DRIFT(neural_nofootprint);
	LEDGER_DRIFT(neural_footprint);
	LEDGER_DRIFT(neural_nofootprint_compressed);
	LEDGER_DRIFT(neural_footprint_compressed);
} pmap_ledgers_drift;
22858 
/*
 * vm_map_pmap_check_ledgers:
 *	audits every pmap-maintained ledger of "ledger" for a non-zero
 *	balance, logging each imbalance, recording drift statistics in
 *	"pmap_ledgers_drift", and panicking (when pmap_ledgers_panic is
 *	set) if any imbalance exceeds the configured leeway.
 */
void
vm_map_pmap_check_ledgers(
	pmap_t          pmap,
	ledger_t        ledger,
	int             pid,
	char            *procname)
{
	ledger_amount_t bal;
	boolean_t       do_panic;

	do_panic = FALSE;

	pmap_ledgers_drift.num_pmaps_checked++;

/*
 * Check one ledger entry: log any non-zero balance, accumulate the
 * over/under drift statistics, and arm "do_panic" when the entry is
 * panic-on-negative or the imbalance exceeds the leeway.
 */
#define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
MACRO_BEGIN                                                             \
	int panic_on_negative = TRUE;                                   \
	ledger_get_balance(ledger,                                      \
	                   task_ledgers.__LEDGER,                       \
	                   &bal);                                       \
	ledger_get_panic_on_negative(ledger,                            \
	                             task_ledgers.__LEDGER,             \
	                             &panic_on_negative);               \
	if (bal != 0) {                                                 \
	        if (panic_on_negative ||                                \
	            (pmap_ledgers_panic &&                              \
	             pmap_ledgers_panic_leeway > 0 &&                   \
	             (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||  \
	              bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
	                do_panic = TRUE;                                \
	        }                                                       \
	        printf("LEDGER BALANCE proc %d (%s) "                   \
	               "\"%s\" = %lld\n",                               \
	               pid, procname, #__LEDGER, bal);                  \
	        if (bal > 0) {                                          \
	                pmap_ledgers_drift.__LEDGER##_over++;           \
	                pmap_ledgers_drift.__LEDGER##_over_total += bal; \
	                if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
	                        pmap_ledgers_drift.__LEDGER##_over_max = bal; \
	                }                                               \
	        } else if (bal < 0) {                                   \
	                pmap_ledgers_drift.__LEDGER##_under++;          \
	                pmap_ledgers_drift.__LEDGER##_under_total += bal; \
	                if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
	                        pmap_ledgers_drift.__LEDGER##_under_max = bal; \
	                }                                               \
	        }                                                       \
	}                                                               \
MACRO_END

	/* audit every ledger the pmap layer maintains */
	LEDGER_CHECK_BALANCE(phys_footprint);
	LEDGER_CHECK_BALANCE(internal);
	LEDGER_CHECK_BALANCE(internal_compressed);
	LEDGER_CHECK_BALANCE(external);
	LEDGER_CHECK_BALANCE(reusable);
	LEDGER_CHECK_BALANCE(iokit_mapped);
	LEDGER_CHECK_BALANCE(alternate_accounting);
	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
	LEDGER_CHECK_BALANCE(page_table);
	LEDGER_CHECK_BALANCE(purgeable_volatile);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(tagged_nofootprint);
	LEDGER_CHECK_BALANCE(tagged_footprint);
	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
	LEDGER_CHECK_BALANCE(network_volatile);
	LEDGER_CHECK_BALANCE(network_nonvolatile);
	LEDGER_CHECK_BALANCE(network_volatile_compressed);
	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(media_nofootprint);
	LEDGER_CHECK_BALANCE(media_footprint);
	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(media_footprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_nofootprint);
	LEDGER_CHECK_BALANCE(graphics_footprint);
	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
	LEDGER_CHECK_BALANCE(neural_nofootprint);
	LEDGER_CHECK_BALANCE(neural_footprint);
	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(neural_footprint_compressed);

	if (do_panic) {
		if (pmap_ledgers_panic) {
			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
			    pmap, pid, procname);
		} else {
			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
			    pmap, pid, procname);
		}
	}
}
22953 
void
vm_map_pmap_set_process(
	vm_map_t map,
	int pid,
	char *procname)
{
	/* tag the map's pmap with its owning pid/name for debug reporting */
	pmap_set_process(vm_map_pmap(map), pid, procname);
}
22962 
22963 #endif /* MACH_ASSERT */
22964