/*
 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	vm/vm_map.c
 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
 *	Date:	1985
 *
 *	Virtual memory mapping module.
 */

#include <mach/vm_types.h>
#include <mach_assert.h>

#include <vm/vm_options.h>

#include <libkern/OSAtomic.h>

#include <mach/kern_return.h>
#include <mach/port.h>
#include <mach/vm_attributes.h>
#include <mach/vm_param.h>
#include <mach/vm_behavior.h>
#include <mach/vm_statistics.h>
#include <mach/memory_object.h>
#include <mach/mach_vm.h>
#include <machine/cpu_capabilities.h>
#include <mach/sdt.h>

#include <kern/assert.h>
#include <kern/backtrace.h>
#include <kern/counter.h>
#include <kern/exc_guard.h>
#include <kern/kalloc.h>
#include <kern/zalloc_internal.h>

#include <vm/cpm.h>
#include <vm/vm_compressor.h>
#include <vm/vm_compressor_pager.h>
#include <vm/vm_init.h>
#include <vm/vm_fault.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <ipc/ipc_port.h>
#include <kern/sched_prim.h>
#include <kern/misc_protos.h>

#include <mach/vm_map_server.h>
#include <mach/mach_host_server.h>
#include <vm/vm_memtag.h>
#include <vm/vm_protos.h>
#include <vm/vm_purgeable_internal.h>
#include <vm/vm_reclaim_internal.h>

#include <vm/vm_shared_region.h>
#include <vm/vm_map_store.h>

#include <san/kasan.h>

#include <sys/resource.h>
#include <sys/random.h>
#include <sys/codesign.h>
#include <sys/code_signing.h>
#include <sys/mman.h>
#include <sys/reboot.h>
#include <sys/kdebug_triage.h>

#include <libkern/section_keywords.h>

#if DEVELOPMENT || DEBUG
extern int proc_selfcsflags(void);
int vm_log_xnu_user_debug = 0;
int panic_on_unsigned_execute = 0;
int panic_on_mlock_failure = 0;
#endif /* DEVELOPMENT || DEBUG */

#if MACH_ASSERT
int debug4k_filter = 0;
char debug4k_proc_name[1024] = "";
int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
int debug4k_panic_on_misaligned_sharing = 0;
const char *debug4k_category_name[] = {
	"error",        /* 0 */
	"life",         /* 1 */
	"load",         /* 2 */
	"fault",        /* 3 */
	"copy",         /* 4 */
	"share",        /* 5 */
	"adjust",       /* 6 */
	"pmap",         /* 7 */
	"mementry",     /* 8 */
	"iokit",        /* 9 */
	"upl",          /* 10 */
	"exc",          /* 11 */
	"vfs"           /* 12 */
};
#endif /* MACH_ASSERT */
int debug4k_no_cow_copyin = 0;


#if __arm64__
extern const int fourk_binary_compatibility_unsafe;
extern const int fourk_binary_compatibility_allow_wx;
#endif /* __arm64__ */
extern void qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *));
extern int proc_selfpid(void);
extern char *proc_name_address(void *p);
extern char *proc_best_name(struct proc *p);

#if VM_MAP_DEBUG_APPLE_PROTECT
int vm_map_debug_apple_protect = 0;
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */
#if VM_MAP_DEBUG_FOURK
int vm_map_debug_fourk = 0;
#endif /* VM_MAP_DEBUG_FOURK */

#if DEBUG || DEVELOPMENT
static TUNABLE(bool, vm_map_executable_immutable,
    "vm_map_executable_immutable", true);
#else
#define vm_map_executable_immutable true
#endif

os_refgrp_decl(static, map_refgrp, "vm_map", NULL);

extern u_int32_t random(void);  /* from <libkern/libkern.h> */
/* Internal prototypes
 */

typedef struct vm_map_zap {
	vm_map_entry_t          vmz_head;
	vm_map_entry_t         *vmz_tail;
} *vm_map_zap_t;

#define VM_MAP_ZAP_DECLARE(zap) \
	struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
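
/*
 * Illustrative sketch (not part of the build): the typical pattern for a
 * zap list is to collect entries while the map lock is held and free them
 * after it is dropped.  "vm_map_zap_dispose" is assumed here for the final
 * teardown step; see the callers of vm_map_delete() for the real flow.
 *
 *	VM_MAP_ZAP_DECLARE(zap_list);
 *
 *	vm_map_lock(map);
 *	vm_map_delete(map, start, end, VM_MAP_REMOVE_NO_FLAGS,
 *	    KMEM_GUARD_NONE, &zap_list);
 *	vm_map_unlock(map);
 *	vm_map_zap_dispose(&zap_list);
 */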

static vm_map_entry_t   vm_map_entry_insert(
	vm_map_t                map,
	vm_map_entry_t          insp_entry,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_map_kernel_flags_t   vmk_flags,
	boolean_t               needs_copy,
	vm_prot_t               cur_protection,
	vm_prot_t               max_protection,
	vm_inherit_t            inheritance,
	boolean_t               clear_map_aligned);

static void vm_map_simplify_range(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end);   /* forward */

static boolean_t        vm_map_range_check(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_entry_t  *entry);

static void vm_map_submap_pmap_clean(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_t        sub_map,
	vm_map_offset_t offset);

static void             vm_map_pmap_enter(
	vm_map_t                map,
	vm_map_offset_t         addr,
	vm_map_offset_t         end_addr,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_prot_t               protection);

static void             _vm_map_clip_end(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         end);

static void             _vm_map_clip_start(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         start);

static kmem_return_t vm_map_delete(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vmr_flags_t     flags,
	kmem_guard_t    guard,
	vm_map_zap_t    zap);

static void             vm_map_copy_insert(
	vm_map_t        map,
	vm_map_entry_t  after_where,
	vm_map_copy_t   copy);

static kern_return_t    vm_map_copy_overwrite_unaligned(
	vm_map_t        dst_map,
	vm_map_entry_t  entry,
	vm_map_copy_t   copy,
	vm_map_address_t start,
	boolean_t       discard_on_success);

static kern_return_t    vm_map_copy_overwrite_aligned(
	vm_map_t        dst_map,
	vm_map_entry_t  tmp_entry,
	vm_map_copy_t   copy,
	vm_map_offset_t start,
	pmap_t          pmap);

static kern_return_t    vm_map_copyin_kernel_buffer(
	vm_map_t        src_map,
	vm_map_address_t src_addr,
	vm_map_size_t   len,
	boolean_t       src_destroy,
	vm_map_copy_t   *copy_result);  /* OUT */

static kern_return_t    vm_map_copyout_kernel_buffer(
	vm_map_t        map,
	vm_map_address_t *addr, /* IN/OUT */
	vm_map_copy_t   copy,
	vm_map_size_t   copy_size,
	boolean_t       overwrite,
	boolean_t       consume_on_success);

static void             vm_map_fork_share(
	vm_map_t        old_map,
	vm_map_entry_t  old_entry,
	vm_map_t        new_map);

static boolean_t        vm_map_fork_copy(
	vm_map_t        old_map,
	vm_map_entry_t  *old_entry_p,
	vm_map_t        new_map,
	int             vm_map_copyin_flags);

static kern_return_t    vm_map_wire_nested(
	vm_map_t                   map,
	vm_map_offset_t            start,
	vm_map_offset_t            end,
	vm_prot_t                  caller_prot,
	vm_tag_t                   tag,
	boolean_t                  user_wire,
	pmap_t                     map_pmap,
	vm_map_offset_t            pmap_addr,
	ppnum_t                    *physpage_p);

static kern_return_t    vm_map_unwire_nested(
	vm_map_t                   map,
	vm_map_offset_t            start,
	vm_map_offset_t            end,
	boolean_t                  user_wire,
	pmap_t                     map_pmap,
	vm_map_offset_t            pmap_addr);

static kern_return_t    vm_map_overwrite_submap_recurse(
	vm_map_t                   dst_map,
	vm_map_offset_t            dst_addr,
	vm_map_size_t              dst_size);

static kern_return_t    vm_map_copy_overwrite_nested(
	vm_map_t                   dst_map,
	vm_map_offset_t            dst_addr,
	vm_map_copy_t              copy,
	boolean_t                  interruptible,
	pmap_t                     pmap,
	boolean_t                  discard_on_success);

static kern_return_t    vm_map_remap_extract(
	vm_map_t                map,
	vm_map_offset_t         addr,
	vm_map_size_t           size,
	boolean_t               copy,
	vm_map_copy_t           map_copy,
	vm_prot_t               *cur_protection,
	vm_prot_t               *max_protection,
	vm_inherit_t            inheritance,
	vm_map_kernel_flags_t   vmk_flags);

static kern_return_t    vm_map_remap_range_allocate(
	vm_map_t                map,
	vm_map_address_t        *address,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_entry_t          *map_entry,
	vm_map_zap_t            zap_list);

static void             vm_map_region_look_for_page(
	vm_map_t                   map,
	vm_map_offset_t            va,
	vm_object_t                object,
	vm_object_offset_t         offset,
	int                        max_refcnt,
	unsigned short             depth,
	vm_region_extended_info_t  extended,
	mach_msg_type_number_t count);

static int              vm_map_region_count_obj_refs(
	vm_map_entry_t             entry,
	vm_object_t                object);


static kern_return_t    vm_map_willneed(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end);

static kern_return_t    vm_map_reuse_pages(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end);

static kern_return_t    vm_map_reusable_pages(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end);

static kern_return_t    vm_map_can_reuse(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end);

static kern_return_t    vm_map_zero(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end);

static kern_return_t    vm_map_random_address_for_size(
	vm_map_t                map,
	vm_map_offset_t        *address,
	vm_map_size_t           size,
	vm_map_kernel_flags_t   vmk_flags);


#if CONFIG_MAP_RANGES

static vm_map_range_id_t vm_map_user_range_resolve(
	vm_map_t                map,
	mach_vm_address_t       addr,
	mach_vm_address_t       size,
	mach_vm_range_t         range);

#endif /* CONFIG_MAP_RANGES */
#if MACH_ASSERT
static kern_return_t    vm_map_pageout(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end);
#endif /* MACH_ASSERT */

kern_return_t vm_map_corpse_footprint_collect(
	vm_map_t        old_map,
	vm_map_entry_t  old_entry,
	vm_map_t        new_map);
void vm_map_corpse_footprint_collect_done(
	vm_map_t        new_map);
void vm_map_corpse_footprint_destroy(
	vm_map_t        map);
kern_return_t vm_map_corpse_footprint_query_page_info(
	vm_map_t        map,
	vm_map_offset_t va,
	int             *disposition_p);
void vm_map_footprint_query_page_info(
	vm_map_t        map,
	vm_map_entry_t  map_entry,
	vm_map_offset_t curr_s_offset,
	int             *disposition_p);

#if CONFIG_MAP_RANGES
static void vm_map_range_map_init(void);
#endif /* CONFIG_MAP_RANGES */

pid_t find_largest_process_vm_map_entries(void);

extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code,
    mach_exception_data_type_t subcode);

/*
 * Macros to copy a vm_map_entry. We must be careful to correctly
 * manage the wired page count. vm_map_entry_copy() creates a new
 * map entry to the same memory - the wired count in the new entry
 * must be set to zero. vm_map_entry_copy_full() creates a new
 * entry that is identical to the old entry.  This preserves the
 * wire count; it's used for map splitting and zone changing in
 * vm_map_copyout.
 */

static inline void
vm_map_entry_copy_csm_assoc(
	vm_map_t map __unused,
	vm_map_entry_t new __unused,
	vm_map_entry_t old __unused)
{
#if CODE_SIGNING_MONITOR
	/* when code signing monitor is enabled, we want to reset on copy */
	new->csm_associated = FALSE;
#else
	/* when code signing monitor is not enabled, assert as a sanity check */
	assert(new->csm_associated == FALSE);
#endif
#if DEVELOPMENT || DEBUG
	if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
		printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] resetting vme_xnu_user_debug\n",
		    proc_selfpid(),
		    (get_bsdtask_info(current_task())
		    ? proc_name_address(get_bsdtask_info(current_task()))
		    : "?"),
		    __FUNCTION__, __LINE__,
		    map, new, new->vme_start, new->vme_end);
	}
#endif /* DEVELOPMENT || DEBUG */
	new->vme_xnu_user_debug = FALSE;
}

/*
 * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
 * But for security reasons on some platforms, we don't want the
 * new mapping to be "used for jit", so we reset the flag here.
 */
static inline void
vm_map_entry_copy_code_signing(
	vm_map_t map,
	vm_map_entry_t new,
	vm_map_entry_t old __unused)
{
	if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
		assert(new->used_for_jit == old->used_for_jit);
	} else {
		if (old->used_for_jit) {
			DTRACE_VM3(cs_wx,
			    uint64_t, new->vme_start,
			    uint64_t, new->vme_end,
			    vm_prot_t, new->protection);
			printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__,
			    "removing execute access");
			new->protection &= ~VM_PROT_EXECUTE;
			new->max_protection &= ~VM_PROT_EXECUTE;
		}
		new->used_for_jit = FALSE;
	}
}

static inline void
vm_map_entry_copy_full(
	vm_map_entry_t new,
	vm_map_entry_t old)
{
#if MAP_ENTRY_CREATION_DEBUG
	btref_put(new->vme_creation_bt);
	btref_retain(old->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	btref_put(new->vme_insertion_bt);
	btref_retain(old->vme_insertion_bt);
#endif
#if VM_BTLOG_TAGS
	/* Discard the btref that might be in the new entry */
	if (new->vme_kernel_object) {
		btref_put(new->vme_tag_btref);
	}
	/* Retain the btref in the old entry to account for its copy */
	if (old->vme_kernel_object) {
		btref_retain(old->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */
	*new = *old;
}

static inline void
vm_map_entry_copy(
	vm_map_t map,
	vm_map_entry_t new,
	vm_map_entry_t old)
{
	vm_map_entry_copy_full(new, old);

	new->is_shared = FALSE;
	new->needs_wakeup = FALSE;
	new->in_transition = FALSE;
	new->wired_count = 0;
	new->user_wired_count = 0;
	new->vme_permanent = FALSE;
	vm_map_entry_copy_code_signing(map, new, old);
	vm_map_entry_copy_csm_assoc(map, new, old);
	if (new->iokit_acct) {
		assertf(!new->use_pmap, "old %p new %p\n", old, new);
		new->iokit_acct = FALSE;
		new->use_pmap = TRUE;
	}
	new->vme_resilient_codesign = FALSE;
	new->vme_resilient_media = FALSE;
	new->vme_atomic = FALSE;
	new->vme_no_copy_on_read = FALSE;
}
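
/*
 * Worked example of the wired-count rule described above (illustrative
 * comment only): given an old entry with old->wired_count == 2,
 *
 *	vm_map_entry_copy_full(new, old);  // new->wired_count == 2
 *	vm_map_entry_copy(map, new, old);  // new->wired_count == 0
 *
 * The second form is the one to use when the copy maps the same memory
 * from a second entry, since the wired pages are only accounted once.
 */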

/*
 * Normal lock_read_to_write() returns FALSE/0 on failure.
 * These functions evaluate to zero on success and non-zero value on failure.
 */
__attribute__((always_inline))
int
vm_map_lock_read_to_write(vm_map_t map)
{
	if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
		DTRACE_VM(vm_map_lock_upgrade);
		return 0;
	}
	return 1;
}
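
/*
 * Minimal usage sketch (illustrative comment only).  Note the inverted
 * convention: a non-zero return means the upgrade failed and, per
 * lck_rw_lock_shared_to_exclusive(), the read lock has been dropped,
 * so the caller must re-take the lock and re-validate its lookups.
 *
 *	if (vm_map_lock_read_to_write(map)) {
 *	        vm_map_lock(map);   // upgrade failed: the lock was lost
 *	        // ... re-lookup any entries found under the read lock ...
 *	}
 */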

__attribute__((always_inline))
boolean_t
vm_map_try_lock(vm_map_t map)
{
	if (lck_rw_try_lock_exclusive(&(map)->lock)) {
		DTRACE_VM(vm_map_lock_w);
		return TRUE;
	}
	return FALSE;
}

__attribute__((always_inline))
boolean_t
vm_map_try_lock_read(vm_map_t map)
{
	if (lck_rw_try_lock_shared(&(map)->lock)) {
		DTRACE_VM(vm_map_lock_r);
		return TRUE;
	}
	return FALSE;
}

/*!
 * @function kdp_vm_map_is_acquired_exclusive
 *
 * @abstract
 * Checks if vm map is acquired exclusive.
 *
 * @discussion
 * NOT SAFE: To be used only by kernel debugger.
 *
 * @param map map to check
 *
 * @returns TRUE if the map is acquired exclusively.
 */
boolean_t
kdp_vm_map_is_acquired_exclusive(vm_map_t map)
{
	return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
}

/*
 * Routines to get the page size the caller should
 * use while inspecting the target address space.
 * Use the "_safely" variant if the caller is dealing with a user-provided
 * array whose size depends on the page size, to avoid any overflow or
 * underflow of a user-allocated buffer.
 */
int
vm_self_region_page_shift_safely(
	vm_map_t target_map)
{
	int effective_page_shift = 0;

	if (PAGE_SIZE == (4096)) {
		/* x86_64 and 4k watches: always use 4k */
		return PAGE_SHIFT;
	}
	/* did caller provide an explicit page size for this thread to use? */
	effective_page_shift = thread_self_region_page_shift();
	if (effective_page_shift) {
		/* use the explicitly-provided page size */
		return effective_page_shift;
	}
	/* no explicit page size: use the caller's page size... */
	effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
	if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
		/* page size match: safe to use */
		return effective_page_shift;
	}
	/* page size mismatch */
	return -1;
}
int
vm_self_region_page_shift(
	vm_map_t target_map)
{
	int effective_page_shift;

	effective_page_shift = vm_self_region_page_shift_safely(target_map);
	if (effective_page_shift == -1) {
		/* no safe value but OK to guess for caller */
		effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
		    VM_MAP_PAGE_SHIFT(target_map));
	}
	return effective_page_shift;
}
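
/*
 * Illustrative sketch (not part of the build, hypothetical names) of why
 * the "_safely" variant matters: when sizing a caller-supplied per-page
 * array, a page-size mismatch between the inspecting and target maps must
 * be an error rather than a guess, or the buffer could be over/under-run.
 *
 *	int shift = vm_self_region_page_shift_safely(target_map);
 *	if (shift == -1) {
 *	        return KERN_INVALID_ARGUMENT;   // don't guess for buffers
 *	}
 *	count = (vm_map_size_t)(end - start) >> shift;  // entries needed
 */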


/*
 *	Decide if we want to allow processes to execute from their data or stack areas.
 *	override_nx() returns true if we do.  Data/stack execution can be enabled independently
 *	for 32 and 64 bit processes.  Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
 *	or allow_stack_exec to enable data execution for that type of data area for that particular
 *	ABI (or both by or'ing the flags together).  These are initialized in the architecture
 *	specific pmap files since the default behavior varies according to architecture.  The
 *	main reason it varies is because of the need to provide binary compatibility with old
 *	applications that were written before these restrictions came into being.  In the old
 *	days, an app could execute anything it could read, but this has slowly been tightened
 *	up over time.  The default behavior is:
 *
 *	32-bit PPC apps		may execute from both stack and data areas
 *	32-bit Intel apps	may execute from data areas but not stack
 *	64-bit PPC/Intel apps	may not execute from either data or stack
 *
 *	An application on any architecture may override these defaults by explicitly
 *	adding PROT_EXEC permission to the page in question with the mprotect(2)
 *	system call.  This code here just determines what happens when an app tries to
 *      execute from a page that lacks execute permission.
 *
 *	Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
 *	default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
 *	a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
 *	execution from data areas for a particular binary even if the arch normally permits it. As
 *	a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
 *	to support some complicated use cases, notably browsers with out-of-process plugins that
 *	are not all NX-safe.
 */

extern int allow_data_exec, allow_stack_exec;

int
override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
{
	int current_abi;

	if (map->pmap == kernel_pmap) {
		return FALSE;
	}

	/*
	 * Determine if the app is running in 32 or 64 bit mode.
	 */

	if (vm_map_is_64bit(map)) {
		current_abi = VM_ABI_64;
	} else {
		current_abi = VM_ABI_32;
	}

	/*
	 * Determine if we should allow the execution based on whether it's a
	 * stack or data area and the current architecture.
	 */

	if (user_tag == VM_MEMORY_STACK) {
		return allow_stack_exec & current_abi;
	}

	return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
}
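
/*
 * Typical caller pattern (illustrative comment only, loosely based on the
 * fault path; "fault_type", "entry" and "prot" are hypothetical locals):
 * when a fault requests execute on a page whose protection lacks it,
 * override_nx() decides whether the legacy-compatibility policy grants it.
 *
 *	if ((fault_type & VM_PROT_EXECUTE) &&
 *	    !(entry->protection & VM_PROT_EXECUTE) &&
 *	    override_nx(map, VME_ALIAS(entry))) {
 *	        prot |= VM_PROT_EXECUTE;   // legacy data/stack exec allowed
 *	}
 */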


/*
 *	Virtual memory maps provide for the mapping, protection,
 *	and sharing of virtual memory objects.  In addition,
 *	this module provides for an efficient virtual copy of
 *	memory from one map to another.
 *
 *	Synchronization is required prior to most operations.
 *
 *	Maps consist of an ordered doubly-linked list of simple
 *	entries; a single hint is used to speed up lookups.
 *
 *	Sharing maps have been deleted from this version of Mach.
 *	All shared objects are now mapped directly into the respective
 *	maps.  This requires a change in the copy on write strategy;
 *	the asymmetric (delayed) strategy is used for shared temporary
 *	objects instead of the symmetric (shadow) strategy.  All maps
 *	are now "top level" maps (either task map, kernel map or submap
 *	of the kernel map).
 *
 *	Since portions of maps are specified by start/end addresses,
 *	which may not align with existing map entries, all
 *	routines merely "clip" entries to these start/end values.
 *	[That is, an entry is split into two, bordering at a
 *	start or end value.]  Note that these clippings may not
 *	always be necessary (as the two resulting entries are then
 *	not changed); however, the clipping is done for convenience.
 *	No attempt is currently made to "glue back together" two
 *	abutting entries.
 *
 *	The symmetric (shadow) copy strategy implements virtual copy
 *	by copying VM object references from one map to
 *	another, and then marking both regions as copy-on-write.
 *	It is important to note that only one writeable reference
 *	to a VM object region exists in any map when this strategy
 *	is used -- this means that shadow object creation can be
 *	delayed until a write operation occurs.  The asymmetric (delayed)
 *	strategy allows multiple maps to have writeable references to
 *	the same region of a vm object, and hence cannot delay creating
 *	its copy objects.  See vm_object_copy_quickly() in vm_object.c.
 *	Copying of permanent objects is completely different; see
 *	vm_object_copy_strategically() in vm_object.c.
 */

ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);

#define VM_MAP_ZONE_NAME        "maps"
#define VM_MAP_ZFLAGS           (ZC_NOENCRYPT | ZC_VM)

#define VM_MAP_ENTRY_ZONE_NAME  "VM map entries"
#define VM_MAP_ENTRY_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)

#define VM_MAP_HOLES_ZONE_NAME  "VM map holes"
#define VM_MAP_HOLES_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)

/*
 * Asserts that a vm_map_copy object is coming from the
 * vm_map_copy_zone to ensure that it isn't a fake constructed
 * anywhere else.
 */
void
vm_map_copy_require(struct vm_map_copy *copy)
{
	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
}

/*
 *	vm_map_require:
 *
 *	Ensures that the argument is memory allocated from the genuine
 *	vm map zone. (See zone_id_require_allow_foreign).
 */
void
vm_map_require(vm_map_t map)
{
	zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
}

#define VM_MAP_EARLY_COUNT_MAX         16
static __startup_data vm_offset_t      map_data;
static __startup_data vm_size_t        map_data_size;
static __startup_data vm_offset_t      kentry_data;
static __startup_data vm_size_t        kentry_data_size;
static __startup_data vm_offset_t      map_holes_data;
static __startup_data vm_size_t        map_holes_data_size;
static __startup_data vm_map_t        *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
static __startup_data uint32_t         early_map_count;

#if XNU_TARGET_OS_OSX
#define         NO_COALESCE_LIMIT  ((1024 * 128) - 1)
#else /* XNU_TARGET_OS_OSX */
#define         NO_COALESCE_LIMIT  0
#endif /* XNU_TARGET_OS_OSX */

/* Skip acquiring locks if we're in the midst of a kernel core dump */
unsigned int not_in_kdp = 1;

unsigned int vm_map_set_cache_attr_count = 0;

kern_return_t
vm_map_set_cache_attr(
	vm_map_t        map,
	vm_map_offset_t va)
{
	vm_map_entry_t  map_entry;
	vm_object_t     object;
	kern_return_t   kr = KERN_SUCCESS;

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, va, &map_entry) ||
	    map_entry->is_sub_map) {
		/*
		 * that memory is not properly mapped
		 */
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}
	object = VME_OBJECT(map_entry);

	if (object == VM_OBJECT_NULL) {
		/*
		 * there should be a VM object here at this point
		 */
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}
	vm_object_lock(object);
	object->set_cache_attr = TRUE;
	vm_object_unlock(object);

	vm_map_set_cache_attr_count++;
done:
	vm_map_unlock_read(map);

	return kr;
}


#if CONFIG_CODE_DECRYPTION
/*
 * vm_map_apple_protected:
 * This remaps the requested part of the object with an object backed by
 * the decrypting pager.
 * crypt_info contains entry points and session data for the crypt module.
 * The crypt_info block will be copied by vm_map_apple_protected. The data structures
 * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
 */
kern_return_t
vm_map_apple_protected(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_object_offset_t      crypto_backing_offset,
	struct pager_crypt_info *crypt_info,
	uint32_t                cryptid)
{
	boolean_t       map_locked;
	kern_return_t   kr;
	vm_map_entry_t  map_entry;
	struct vm_map_entry tmp_entry;
	memory_object_t unprotected_mem_obj;
	vm_object_t     protected_object;
	vm_map_offset_t map_addr;
	vm_map_offset_t start_aligned, end_aligned;
	vm_object_offset_t      crypto_start, crypto_end;
	boolean_t       cache_pager;

	map_locked = FALSE;
	unprotected_mem_obj = MEMORY_OBJECT_NULL;

	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}
	start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
	end_aligned = vm_map_round_page(end, PAGE_MASK_64);
	start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
	end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));

#if __arm64__
	/*
	 * "start" and "end" might be 4K-aligned but not 16K-aligned,
	 * so we might have to loop and establish up to 3 mappings:
	 *
	 * + the first 16K-page, which might overlap with the previous
	 *   4K-aligned mapping,
	 * + the center,
	 * + the last 16K-page, which might overlap with the next
	 *   4K-aligned mapping.
	 * Each of these mapping might be backed by a vnode pager (if
	 * properly page-aligned) or a "fourk_pager", itself backed by a
	 * vnode pager (if 4K-aligned but not page-aligned).
	 */
#endif /* __arm64__ */

	map_addr = start_aligned;
	for (map_addr = start_aligned;
	    map_addr < end;
	    map_addr = tmp_entry.vme_end) {
		vm_map_lock(map);
		map_locked = TRUE;

		/* lookup the protected VM object */
		if (!vm_map_lookup_entry(map,
		    map_addr,
		    &map_entry) ||
		    map_entry->is_sub_map ||
		    VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
			/* that memory is not properly mapped */
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}
		/* ensure mapped memory is mapped as executable,
		 * except for the model decryption flow */
		if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
		    !(map_entry->protection & VM_PROT_EXECUTE)) {
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		/* get the protected object to be decrypted */
		protected_object = VME_OBJECT(map_entry);
		if (protected_object == VM_OBJECT_NULL) {
			/* there should be a VM object here at this point */
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}
		/* ensure protected object stays alive while map is unlocked */
		vm_object_reference(protected_object);

		/* limit the map entry to the area we want to cover */
		vm_map_clip_start(map, map_entry, start_aligned);
		vm_map_clip_end(map, map_entry, end_aligned);

		tmp_entry = *map_entry;
		map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
		vm_map_unlock(map);
		map_locked = FALSE;

		/*
		 * This map entry might be only partially encrypted
		 * (if not fully "page-aligned").
		 */
		crypto_start = 0;
		crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
		if (tmp_entry.vme_start < start) {
			if (tmp_entry.vme_start != start_aligned) {
				kr = KERN_INVALID_ADDRESS;
				vm_object_deallocate(protected_object);
				goto done;
			}
			crypto_start += (start - tmp_entry.vme_start);
		}
		if (tmp_entry.vme_end > end) {
			if (tmp_entry.vme_end != end_aligned) {
				kr = KERN_INVALID_ADDRESS;
				vm_object_deallocate(protected_object);
				goto done;
			}
			crypto_end -= (tmp_entry.vme_end - end);
		}

		/*
		 * This "extra backing offset" is needed to get the decryption
		 * routine to use the right key.  It adjusts for the possibly
		 * relative offset of an interposed "4K" pager...
		 */
		if (crypto_backing_offset == (vm_object_offset_t) -1) {
			crypto_backing_offset = VME_OFFSET(&tmp_entry);
		}

		cache_pager = TRUE;
#if XNU_TARGET_OS_OSX
		if (vm_map_is_alien(map)) {
			cache_pager = FALSE;
		}
#endif /* XNU_TARGET_OS_OSX */

		/*
		 * Lookup (and create if necessary) the protected memory object
		 * matching that VM object.
		 * If successful, this also grabs a reference on the memory object,
		 * to guarantee that it doesn't go away before we get a chance to map
		 * it.
		 */
		unprotected_mem_obj = apple_protect_pager_setup(
			protected_object,
			VME_OFFSET(&tmp_entry),
			crypto_backing_offset,
			crypt_info,
			crypto_start,
			crypto_end,
			cache_pager);

		/* release extra ref on protected object */
		vm_object_deallocate(protected_object);

		if (unprotected_mem_obj == NULL) {
			kr = KERN_FAILURE;
			goto done;
		}

		/* can overwrite an immutable mapping */
		vm_map_kernel_flags_t vmk_flags = {
			.vmf_fixed = true,
			.vmf_overwrite = true,
			.vmkf_overwrite_immutable = true,
		};
		/* make the new mapping as "permanent" as the one it replaces */
		vmk_flags.vmf_permanent = tmp_entry.vme_permanent;
#if __arm64__
		if (tmp_entry.used_for_jit &&
		    (VM_MAP_PAGE_SHIFT(map) != FOURK_PAGE_SHIFT ||
		    PAGE_SHIFT != FOURK_PAGE_SHIFT) &&
		    fourk_binary_compatibility_unsafe &&
		    fourk_binary_compatibility_allow_wx) {
			printf("** FOURK_COMPAT [%d]: "
			    "allowing write+execute at 0x%llx\n",
			    proc_selfpid(), tmp_entry.vme_start);
			vmk_flags.vmkf_map_jit = TRUE;
		}
#endif /* __arm64__ */

		/* map this memory object in place of the current one */
		map_addr = tmp_entry.vme_start;
		kr = vm_map_enter_mem_object(map,
		    &map_addr,
		    (tmp_entry.vme_end -
		    tmp_entry.vme_start),
		    (mach_vm_offset_t) 0,
		    vmk_flags,
		    (ipc_port_t)(uintptr_t) unprotected_mem_obj,
		    0,
		    TRUE,
		    tmp_entry.protection,
		    tmp_entry.max_protection,
		    tmp_entry.inheritance);
		assertf(kr == KERN_SUCCESS,
		    "kr = 0x%x\n", kr);
		assertf(map_addr == tmp_entry.vme_start,
		    "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
		    (uint64_t)map_addr,
		    (uint64_t) tmp_entry.vme_start,
		    &tmp_entry);

#if VM_MAP_DEBUG_APPLE_PROTECT
		if (vm_map_debug_apple_protect) {
			printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
			    " backing:[object:%p,offset:0x%llx,"
			    "crypto_backing_offset:0x%llx,"
			    "crypto_start:0x%llx,crypto_end:0x%llx]\n",
			    map,
			    (uint64_t) map_addr,
			    (uint64_t) (map_addr + (tmp_entry.vme_end -
			    tmp_entry.vme_start)),
			    unprotected_mem_obj,
			    protected_object,
			    VME_OFFSET(&tmp_entry),
			    crypto_backing_offset,
			    crypto_start,
			    crypto_end);
		}
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */

		/*
		 * Release the reference obtained by
		 * apple_protect_pager_setup().
		 * The mapping (if it succeeded) is now holding a reference on
		 * the memory object.
		 */
		memory_object_deallocate(unprotected_mem_obj);
		unprotected_mem_obj = MEMORY_OBJECT_NULL;

		/* continue with next map entry */
		crypto_backing_offset += (tmp_entry.vme_end -
		    tmp_entry.vme_start);
		crypto_backing_offset -= crypto_start;
	}
	kr = KERN_SUCCESS;

done:
	if (map_locked) {
		vm_map_unlock(map);
	}
	return kr;
}
#endif  /* CONFIG_CODE_DECRYPTION */


LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);

#if XNU_TARGET_OS_OSX
#define MALLOC_NO_COW_DEFAULT 1
#define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 1
#else /* XNU_TARGET_OS_OSX */
#define MALLOC_NO_COW_DEFAULT 1
#define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 0
#endif /* XNU_TARGET_OS_OSX */
TUNABLE(int, malloc_no_cow, "malloc_no_cow", MALLOC_NO_COW_DEFAULT);
TUNABLE(int, malloc_no_cow_except_fork, "malloc_no_cow_except_fork", MALLOC_NO_COW_EXCEPT_FORK_DEFAULT);
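
/*
 * The tunables above are standard boot-args: for example, booting with
 * "malloc_no_cow=0" turns the no-copy-on-write malloc policy off entirely,
 * and vm_memory_malloc_no_cow_mask (below) can likewise be overridden via
 * the "vm_memory_malloc_no_cow_mask" boot-arg parsed in vm_map_init().
 */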
uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
#if DEBUG
int vm_check_map_sanity = 0;
#endif

/*
 *	vm_map_init:
 *
 *	Initialize the vm_map module.  Must be called before
 *	any other vm_map routines.
 *
 *	Map and entry structures are allocated from zones -- we must
 *	initialize those zones.
 *
 *	There are three zones of interest:
 *
 *	vm_map_zone:		used to allocate maps.
 *	vm_map_entry_zone:	used to allocate map entries.
 *
 *	LP32:
 *	vm_map_entry_reserved_zone:     fallback zone for kernel map entries
 *
 *	The kernel allocates map entries from a special zone that is initially
 *	"crammed" with memory.  It would be difficult (perhaps impossible) for
 *	the kernel to allocate more memory to an entry zone when it became
 *	empty since the very act of allocating memory implies the creation
 *	of a new entry.
 */
__startup_func
void
vm_map_init(void)
{

#if MACH_ASSERT
	PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
	    sizeof(debug4k_filter));
#endif /* MACH_ASSERT */

	zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
	    VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);

	/*
	 * Don't quarantine because we always need elements available
	 * Disallow GC on this zone... to aid the GC.
	 */
	zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
	    ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
		z->z_elems_rsv = (uint16_t)(32 *
		(ml_early_cpu_max_number() + 1));
	});

	zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
	    ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
		z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
	});

	zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
	    ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);

	/*
	 * Add the stolen memory to zones, adjust zone size and stolen counts.
	 */
	zone_cram_early(vm_map_zone, map_data, map_data_size);
	zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
	zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1202 	printf("VM boostrap: %d maps, %d entries and %d holes available\n",
	    zone_count_free(vm_map_zone),
	    zone_count_free(vm_map_entry_zone),
	    zone_count_free(vm_map_holes_zone));

	/*
	 * Since these are covered by zones, remove them from stolen page accounting.
	 */
	VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));

#if VM_MAP_DEBUG_APPLE_PROTECT
	PE_parse_boot_argn("vm_map_debug_apple_protect",
	    &vm_map_debug_apple_protect,
	    sizeof(vm_map_debug_apple_protect));
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */
#if VM_MAP_DEBUG_FOURK
	PE_parse_boot_argn("vm_map_debug_fourk",
	    &vm_map_debug_fourk,
	    sizeof(vm_map_debug_fourk));
#endif /* VM_MAP_DEBUG_FOURK */

	if (malloc_no_cow) {
		vm_memory_malloc_no_cow_mask = 0ULL;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
#if XNU_TARGET_OS_OSX
		/*
		 * On macOS, keep copy-on-write for MALLOC_LARGE because
		 * realloc() may use vm_copy() to transfer the old contents
		 * to the new location.
		 */
#else /* XNU_TARGET_OS_OSX */
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
#endif /* XNU_TARGET_OS_OSX */
//		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
//		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
//		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
		PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
		    &vm_memory_malloc_no_cow_mask,
		    sizeof(vm_memory_malloc_no_cow_mask));
	}

#if CONFIG_MAP_RANGES
	vm_map_range_map_init();
#endif /* CONFIG_MAP_RANGES */

#if DEBUG
	PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
	if (vm_check_map_sanity) {
		kprintf("VM sanity checking enabled\n");
	} else {
		kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
	}
#endif /* DEBUG */

#if DEVELOPMENT || DEBUG
	PE_parse_boot_argn("panic_on_unsigned_execute",
	    &panic_on_unsigned_execute,
	    sizeof(panic_on_unsigned_execute));
	PE_parse_boot_argn("panic_on_mlock_failure",
	    &panic_on_mlock_failure,
	    sizeof(panic_on_mlock_failure));
#endif /* DEVELOPMENT || DEBUG */
}

__startup_func
static void
vm_map_steal_memory(void)
{
	/*
	 * We need to reserve enough memory to support bootstrapping VM maps
	 * and the zone subsystem.
	 *
	 * The VM Maps that need to function before zones can support them
	 * are the ones registered with vm_map_will_allocate_early_map(),
	 * which are:
	 * - the kernel map
	 * - the various submaps used by zones (pgz, meta, ...)
	 *
	 * We also need enough entries and holes to support them
	 * until zone_metadata_init() is called, which is when
	 * the zone allocator becomes capable of expanding dynamically.
	 *
	 * We need:
	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
	 * - To allow for 3-4 entries per map, but the kernel map
	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
	 *   to describe the submaps, so double it (and make it 8x too)
	 * - To allow for holes between entries,
	 *   hence needs the same budget as entries
	 */
	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
	    VM_MAP_EARLY_COUNT_MAX);

	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);
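
	/*
	 * Worked example of the budget above (illustrative comment only):
	 * with VM_MAP_EARLY_COUNT_MAX == 16, this reserves room for 16 maps,
	 * 8 * 16 == 128 map entries, and 128 hole links, before the zone
	 * allocator can expand on its own.
	 */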

	/*
	 * Steal a contiguous range of memory so that a simple range check
	 * can validate early addresses being freed/crammed to these
	 * zones
	 */
	map_data       = zone_early_mem_init(map_data_size + kentry_data_size +
	    map_holes_data_size);
	kentry_data    = map_data + map_data_size;
	map_holes_data = kentry_data + kentry_data_size;
}
STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);

__startup_func
static void
vm_kernel_bootstrapped(void)
{
	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);

	printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
	    zone_count_free(vm_map_zone),
	    zone_count_free(vm_map_entry_zone),
	    zone_count_free(vm_map_holes_zone));
}
STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_bootstrapped);

void
vm_map_disable_hole_optimization(vm_map_t map)
{
	vm_map_entry_t  head_entry, hole_entry, next_hole_entry;

	if (map->holelistenabled) {
		head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);

		while (hole_entry != NULL) {
			next_hole_entry = hole_entry->vme_next;

			hole_entry->vme_next = NULL;
			hole_entry->vme_prev = NULL;
			zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);

			if (next_hole_entry == head_entry) {
				hole_entry = NULL;
			} else {
				hole_entry = next_hole_entry;
			}
		}

		map->holes_list = NULL;
		map->holelistenabled = FALSE;

		map->first_free = vm_map_first_entry(map);
		SAVE_HINT_HOLE_WRITE(map, NULL);
	}
}

boolean_t
vm_kernel_map_is_kernel(vm_map_t map)
{
	return map->pmap == kernel_pmap;
}

/*
 *	vm_map_create:
 *
 *	Creates and returns a new empty VM map with
 *	the given physical map structure, and having
 *	the given lower and upper address bounds.
 */

extern vm_map_t vm_map_create_external(
	pmap_t                  pmap,
	vm_map_offset_t         min_off,
	vm_map_offset_t         max_off,
	boolean_t               pageable);

vm_map_t
vm_map_create_external(
	pmap_t                  pmap,
	vm_map_offset_t         min,
	vm_map_offset_t         max,
	boolean_t               pageable)
{
	vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;

	if (pageable) {
		options |= VM_MAP_CREATE_PAGEABLE;
	}
	return vm_map_create_options(pmap, min, max, options);
}

__startup_func
void
vm_map_will_allocate_early_map(vm_map_t *owner)
{
	if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
		panic("VM_MAP_EARLY_COUNT_MAX is too low");
	}

	early_map_owners[early_map_count++] = owner;
}

__startup_func
void
vm_map_relocate_early_maps(vm_offset_t delta)
{
	for (uint32_t i = 0; i < early_map_count; i++) {
		vm_address_t addr = (vm_address_t)*early_map_owners[i];

		*early_map_owners[i] = (vm_map_t)(addr + delta);
	}

	early_map_count = ~0u;
}

/*
 *	Routine:	vm_map_relocate_early_elem
 *
 *	Purpose:
 *		Early zone elements are allocated in a temporary part
 *		of the address space.
 *
 *		Once the zones live in their final place, the early
 *		VM maps, map entries and map holes need to be relocated.
 *
 *		It involves rewriting any vm_map_t, vm_map_entry_t or
 *		pointers to vm_map_links. Other pointers to other types
 *		are fine.
 *
 *		Fortunately, pointers to those types are self-contained
 *		in those zones, _except_ for pointers to VM maps,
 *		which are tracked during early boot and fixed with
 *		vm_map_relocate_early_maps().
 */
__startup_func
void
vm_map_relocate_early_elem(
	uint32_t                zone_id,
	vm_offset_t             new_addr,
	vm_offset_t             delta)
{
#define relocate(type_t, field)  ({ \
	typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field;   \
	if (*__field) {                                                        \
	        *__field = (typeof(*__field))((vm_offset_t)*__field + delta);  \
	}                                                                      \
})
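
	/*
	 * For example (illustrative comment only),
	 *	relocate(vm_map_t, hint);
	 * expands to: take the "hint" field of the map now living at
	 * new_addr and, if it is non-NULL, slide the stored pointer by
	 * "delta" so it points into the zone's final location.
	 */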

	switch (zone_id) {
	case ZONE_ID_VM_MAP:
	case ZONE_ID_VM_MAP_ENTRY:
	case ZONE_ID_VM_MAP_HOLES:
		break;

	default:
		panic("Unexpected zone ID %d", zone_id);
	}

	if (zone_id == ZONE_ID_VM_MAP) {
		relocate(vm_map_t, hdr.links.prev);
		relocate(vm_map_t, hdr.links.next);
		((vm_map_t)new_addr)->pmap = kernel_pmap;
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_t, hdr.rb_head_store.rbh_root);
#endif /* VM_MAP_STORE_USE_RB */
		relocate(vm_map_t, hint);
		relocate(vm_map_t, hole_hint);
		relocate(vm_map_t, first_free);
		return;
	}

	relocate(struct vm_map_links *, prev);
	relocate(struct vm_map_links *, next);

	if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_entry_t, store.entry.rbe_left);
		relocate(vm_map_entry_t, store.entry.rbe_right);
		relocate(vm_map_entry_t, store.entry.rbe_parent);
#endif /* VM_MAP_STORE_USE_RB */
		if (((vm_map_entry_t)new_addr)->is_sub_map) {
			/* no object to relocate because we haven't made any */
			((vm_map_entry_t)new_addr)->vme_submap +=
			    delta >> VME_SUBMAP_SHIFT;
		}
#if MAP_ENTRY_CREATION_DEBUG
		relocate(vm_map_entry_t, vme_creation_maphdr);
#endif /* MAP_ENTRY_CREATION_DEBUG */
	}

#undef relocate
}

vm_map_t
vm_map_create_options(
	pmap_t                  pmap,
	vm_map_offset_t         min,
	vm_map_offset_t         max,
	vm_map_create_options_t options)
{
	vm_map_t result;

#if DEBUG || DEVELOPMENT
	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
		if (early_map_count != ~0u && early_map_count !=
		    zone_count_allocated(vm_map_zone) + 1) {
			panic("allocating %dth early map, owner not known",
			    zone_count_allocated(vm_map_zone) + 1);
		}
		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
			panic("allocating %dth early map for non kernel pmap",
			    early_map_count);
		}
	}
#endif /* DEBUG || DEVELOPMENT */

	result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);

	vm_map_store_init(&result->hdr);
	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
	vm_map_set_page_shift(result, PAGE_SHIFT);

	result->size_limit      = RLIM_INFINITY;        /* default unlimited */
	result->data_limit      = RLIM_INFINITY;        /* default unlimited */
	result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
	result->pmap = pmap;
	result->min_offset = min;
	result->max_offset = max;
	result->first_free = vm_map_to_entry(result);
	result->hint = vm_map_to_entry(result);

	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
		assert(pmap == kernel_pmap);
		result->never_faults = true;
	}

	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
		result->has_corpse_footprint = true;
	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
		struct vm_map_links *hole_entry;

		hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
		hole_entry->start = min;
#if defined(__arm64__)
		hole_entry->end = result->max_offset;
#else
		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
#endif
		result->holes_list = result->hole_hint = hole_entry;
		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
		result->holelistenabled = true;
	}

	vm_map_lock_init(result);

	return result;
}

/*
 * Adjusts a submap that was made by kmem_suballoc()
 * before it knew where it would be mapped,
 * so that it has the right min/max offsets.
 *
 * We do not need to hold any locks:
 * only the caller knows about this map,
 * and it is not published on any entry yet.
 */
static void
vm_map_adjust_offsets(
	vm_map_t                map,
	vm_map_offset_t         min_off,
	vm_map_offset_t         max_off)
{
	assert(map->min_offset == 0);
	assert(map->max_offset == max_off - min_off);
	assert(map->hdr.nentries == 0);
	assert(os_ref_get_count_raw(&map->map_refcnt) == 2);

	map->min_offset = min_off;
	map->max_offset = max_off;

	if (map->holelistenabled) {
		struct vm_map_links *hole = map->holes_list;

		hole->start = min_off;
#if defined(__arm64__)
		hole->end = max_off;
#else
		hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
#endif
	}
}
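
/*
 * Expected call sequence (illustrative comment only): kmem_suballoc()
 * first creates the submap with a zero-based range, then, once the parent
 * map has chosen an address, slides the bounds into place, e.g.
 *
 *	submap = vm_map_create_options(kernel_pmap, 0, size, options);
 *	// ... parent map picks "addr" for the submap entry ...
 *	vm_map_adjust_offsets(submap, addr, addr + size);
 *
 * which is exactly the shape the asserts above check for.
 */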
1605 
1606 
1607 vm_map_size_t
vm_map_adjusted_size(vm_map_t map)1608 vm_map_adjusted_size(vm_map_t map)
1609 {
1610 	const struct vm_reserved_region *regions = NULL;
1611 	size_t num_regions = 0;
1612 	mach_vm_size_t  reserved_size = 0, map_size = 0;
1613 
1614 	if (map == NULL || (map->size == 0)) {
1615 		return 0;
1616 	}
1617 
1618 	map_size = map->size;
1619 
1620 	if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1621 		/*
1622 		 * No special reserved regions, not an exotic map, or the task
1623 		 * is terminating (in which case these special regions might
1624 		 * already have been deallocated).
1625 		 */
1626 		return map_size;
1627 	}
1628 
1629 	num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1630 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1631 
1632 	while (num_regions) {
1633 		reserved_size += regions[--num_regions].vmrr_size;
1634 	}
1635 
1636 	/*
1637 	 * There are a few places where the map is being switched out due to
1638 	 * 'termination' without that bit being set (e.g. exec and corpse purging).
1639 	 * In those cases, we could have the map's regions being deallocated on
1640 	 * a core while some accounting process is trying to get the map's size.
1641 	 * So this assert can't be enabled until all those places are uniform in
1642 	 * their use of the 'map->terminated' bit.
1643 	 *
1644 	 * assert(map_size >= reserved_size);
1645 	 */
1646 
1647 	return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1648 }
1649 
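/*
 * Editor's note: worked example (hypothetical values).  With map->size ==
 * 0x5000 and two reserved regions of 0x1000 each, the adjusted size is
 * 0x5000 - 0x2000 = 0x3000.  If the regions were already torn down so that
 * map_size < reserved_size, the raw map size is returned instead, per the
 * guarded subtraction above.
 */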
1650 /*
1651  *	vm_map_entry_create:	[ internal use only ]
1652  *
1653  *	Allocates a VM map entry for insertion in the
1654  *	given map (or map copy).  No fields are filled.
1655  *
1656  *	The VM entry will be zero initialized, except for:
1657  *	- behavior set to VM_BEHAVIOR_DEFAULT
1658  *	- inheritance set to VM_INHERIT_DEFAULT
1659  */
1660 #define vm_map_entry_create(map)    _vm_map_entry_create(&(map)->hdr)
1661 
1662 #define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)
1663 
1664 static vm_map_entry_t
1665 _vm_map_entry_create(
1666 	struct vm_map_header    *map_header __unused)
1667 {
1668 	vm_map_entry_t entry = NULL;
1669 
1670 	entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);
1671 
1672 	/*
1673 	 * Help the compiler with what we know to be true,
1674 	 * so that the subsequent bitfield inits have good codegen.
1675 	 *
1676 	 * See rdar://87041299
1677 	 */
1678 	__builtin_assume(entry->vme_object_value == 0);
1679 	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
1680 	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);
1681 
1682 	static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
1683 	    "VME_ALIAS_MASK covers tags");
1684 
1685 	static_assert(VM_BEHAVIOR_DEFAULT == 0,
1686 	    "can skip zeroing of the behavior field");
1687 	entry->inheritance = VM_INHERIT_DEFAULT;
1688 
1689 #if MAP_ENTRY_CREATION_DEBUG
1690 	entry->vme_creation_maphdr = map_header;
1691 	entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
1692 	    BTREF_GET_NOWAIT);
1693 #endif
1694 	return entry;
1695 }
1696 
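/*
 * Editor's note (illustrative sketch, not part of the original source):
 * the codegen effect of the __builtin_assume() calls above, in miniature.
 * Promising the compiler that the underlying word is zero lets bitfield
 * initializations fold into plain constant stores instead of
 * load/modify/store sequences.
 */
#if 0 /* example only */
struct bits {
	uint64_t a : 1;
	uint64_t b : 1;
	uint64_t rest : 62;
};

static void
example_set_fields(struct bits *p)
{
	/* promise the compiler the word is already zero ... */
	__builtin_assume(*(uint64_t *)p == 0);
	/* ... so these two inits can become one constant store */
	p->a = 1;
	p->b = 1;
}
#endif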
1697 /*
1698  *	vm_map_entry_dispose:	[ internal use only ]
1699  *
1700  *	Inverse of vm_map_entry_create.
1701  *
1702  *	The write map lock is held, so there is no need
1703  *	to do anything special to ensure the correctness
1704  *	of the stores.
1705  */
1706 static void
1707 vm_map_entry_dispose(
1708 	vm_map_entry_t          entry)
1709 {
1710 #if VM_BTLOG_TAGS
1711 	if (entry->vme_kernel_object) {
1712 		btref_put(entry->vme_tag_btref);
1713 	}
1714 #endif /* VM_BTLOG_TAGS */
1715 #if MAP_ENTRY_CREATION_DEBUG
1716 	btref_put(entry->vme_creation_bt);
1717 #endif
1718 #if MAP_ENTRY_INSERTION_DEBUG
1719 	btref_put(entry->vme_insertion_bt);
1720 #endif
1721 	zfree(vm_map_entry_zone, entry);
1722 }
1723 
1724 #define vm_map_copy_entry_dispose(copy_entry) \
1725 	vm_map_entry_dispose(copy_entry)
1726 
1727 static vm_map_entry_t
1728 vm_map_zap_first_entry(
1729 	vm_map_zap_t            list)
1730 {
1731 	return list->vmz_head;
1732 }
1733 
1734 static vm_map_entry_t
1735 vm_map_zap_last_entry(
1736 	vm_map_zap_t            list)
1737 {
1738 	assert(vm_map_zap_first_entry(list));
1739 	return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
1740 }
1741 
1742 static void
1743 vm_map_zap_append(
1744 	vm_map_zap_t            list,
1745 	vm_map_entry_t          entry)
1746 {
1747 	entry->vme_next = VM_MAP_ENTRY_NULL;
1748 	*list->vmz_tail = entry;
1749 	list->vmz_tail = &entry->vme_next;
1750 }
1751 
1752 static vm_map_entry_t
1753 vm_map_zap_pop(
1754 	vm_map_zap_t            list)
1755 {
1756 	vm_map_entry_t head = list->vmz_head;
1757 
1758 	if (head != VM_MAP_ENTRY_NULL &&
1759 	    (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1760 		list->vmz_tail = &list->vmz_head;
1761 	}
1762 
1763 	return head;
1764 }
1765 
1766 static void
1767 vm_map_zap_dispose(
1768 	vm_map_zap_t            list)
1769 {
1770 	vm_map_entry_t          entry;
1771 
1772 	while ((entry = vm_map_zap_pop(list))) {
1773 		if (entry->is_sub_map) {
1774 			vm_map_deallocate(VME_SUBMAP(entry));
1775 		} else {
1776 			vm_object_deallocate(VME_OBJECT(entry));
1777 		}
1778 
1779 		vm_map_entry_dispose(entry);
1780 	}
1781 }
1782 
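/*
 * Editor's note (illustrative sketch, not part of the original source):
 * the zap-list pattern used throughout this file, as seen in
 * vm_map_destroy() below.  Entries are unlinked under the map lock; their
 * objects/submaps are released only after the lock is dropped.
 */
#if 0 /* example only */
static void
example_remove_range(vm_map_t map, vm_map_offset_t start, vm_map_offset_t end)
{
	VM_MAP_ZAP_DECLARE(zap);

	vm_map_lock(map);
	/* unlink entries under the map lock, collecting them on "zap" */
	(void)vm_map_delete(map, start, end, VM_MAP_REMOVE_NO_FLAGS,
	    KMEM_GUARD_NONE, &zap);
	vm_map_unlock(map);

	/* deallocate objects/submaps outside the map lock */
	vm_map_zap_dispose(&zap);
}
#endif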
1783 #if MACH_ASSERT
1784 static boolean_t first_free_check = FALSE;
1785 boolean_t
1786 first_free_is_valid(
1787 	vm_map_t        map)
1788 {
1789 	if (!first_free_check) {
1790 		return TRUE;
1791 	}
1792 
1793 	return first_free_is_valid_store( map );
1794 }
1795 #endif /* MACH_ASSERT */
1796 
1797 
1798 #define vm_map_copy_entry_link(copy, after_where, entry)                \
1799 	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))
1800 
1801 #define vm_map_copy_entry_unlink(copy, entry)                           \
1802 	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1803 
1804 /*
1805  *	vm_map_destroy:
1806  *
1807  *	Actually destroy a map.
1808  */
1809 void
1810 vm_map_destroy(
1811 	vm_map_t        map)
1812 {
1813 	/* final cleanup: this is not allowed to fail */
1814 	vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;
1815 
1816 	VM_MAP_ZAP_DECLARE(zap);
1817 
1818 	vm_map_lock(map);
1819 
1820 	map->terminated = true;
1821 	/* clean up regular map entries */
1822 	(void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
1823 	    KMEM_GUARD_NONE, &zap);
1824 	/* clean up leftover special mappings (commpage, GPU carveout, etc.) */
1825 	(void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
1826 	    KMEM_GUARD_NONE, &zap);
1827 
1828 	vm_map_disable_hole_optimization(map);
1829 	vm_map_corpse_footprint_destroy(map);
1830 
1831 	vm_map_unlock(map);
1832 
1833 	vm_map_zap_dispose(&zap);
1834 
1835 	assert(map->hdr.nentries == 0);
1836 
1837 	if (map->pmap) {
1838 		pmap_destroy(map->pmap);
1839 	}
1840 
1841 	lck_rw_destroy(&map->lock, &vm_map_lck_grp);
1842 
1843 #if CONFIG_MAP_RANGES
1844 	kfree_data(map->extra_ranges,
1845 	    map->extra_ranges_count * sizeof(struct vm_map_user_range));
1846 #endif
1847 
1848 	zfree_id(ZONE_ID_VM_MAP, map);
1849 }
1850 
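/*
 * Editor's note (illustrative sketch, not part of the original source):
 * vm_map_destroy() is the terminal step of the map lifecycle; assuming
 * the usual reference-counting discipline (the refcount is initialized at
 * map creation), callers reach it by dropping the last reference rather
 * than by calling it directly.
 */
#if 0 /* example only */
static void
example_release(vm_map_t map)
{
	/* the last release funnels into vm_map_destroy() */
	vm_map_deallocate(map);
}
#endif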
1851 /*
1852  * Returns pid of the task with the largest number of VM map entries.
1853  * Used in the zone-map-exhaustion jetsam path.
1854  */
1855 pid_t
1856 find_largest_process_vm_map_entries(void)
1857 {
1858 	pid_t victim_pid = -1;
1859 	int max_vm_map_entries = 0;
1860 	task_t task = TASK_NULL;
1861 	queue_head_t *task_list = &tasks;
1862 
1863 	lck_mtx_lock(&tasks_threads_lock);
1864 	queue_iterate(task_list, task, task_t, tasks) {
1865 		if (task == kernel_task || !task->active) {
1866 			continue;
1867 		}
1868 
1869 		vm_map_t task_map = task->map;
1870 		if (task_map != VM_MAP_NULL) {
1871 			int task_vm_map_entries = task_map->hdr.nentries;
1872 			if (task_vm_map_entries > max_vm_map_entries) {
1873 				max_vm_map_entries = task_vm_map_entries;
1874 				victim_pid = pid_from_task(task);
1875 			}
1876 		}
1877 	}
1878 	lck_mtx_unlock(&tasks_threads_lock);
1879 
1880 	printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
1881 	return victim_pid;
1882 }
1883 
1884 
1885 /*
1886  *	vm_map_lookup_entry:	[ internal use only ]
1887  *
1888  *	Calls into the vm map store layer to find the map
1889  *	entry containing (or immediately preceding) the
1890  *	specified address in the given map; the entry is returned
1891  *	in the "entry" parameter.  The boolean
1892  *	result indicates whether the address is
1893  *	actually contained in the map.
1894  */
1895 boolean_t
1896 vm_map_lookup_entry(
1897 	vm_map_t        map,
1898 	vm_map_offset_t address,
1899 	vm_map_entry_t  *entry)         /* OUT */
1900 {
1901 	if (VM_KERNEL_ADDRESS(address)) {
1902 		address = VM_KERNEL_STRIP_UPTR(address);
1903 	}
1904 
1905 
1906 #if CONFIG_PROB_GZALLOC
1907 	if (map->pmap == kernel_pmap) {
1908 		assertf(!pgz_owned(address),
1909 		    "it is the responsibility of callers to unguard PGZ addresses");
1910 	}
1911 #endif /* CONFIG_PROB_GZALLOC */
1912 	return vm_map_store_lookup_entry( map, address, entry );
1913 }
1914 
1915 boolean_t
1916 vm_map_lookup_entry_or_next(
1917 	vm_map_t        map,
1918 	vm_map_offset_t address,
1919 	vm_map_entry_t  *entry)         /* OUT */
1920 {
1921 	if (vm_map_lookup_entry(map, address, entry)) {
1922 		return true;
1923 	}
1924 
1925 	*entry = (*entry)->vme_next;
1926 	return false;
1927 }
1928 
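/*
 * Editor's note (illustrative sketch, not part of the original source):
 * on a miss, vm_map_lookup_entry() leaves "entry" pointing at the entry
 * preceding the address, while the _or_next variant above advances to the
 * one that follows, which suits forward scans:
 */
#if 0 /* example only */
static void
example_forward_scan(vm_map_t map, vm_map_offset_t addr)
{
	vm_map_entry_t entry;

	if (!vm_map_lookup_entry_or_next(map, addr, &entry)) {
		/*
		 * "addr" fell into a hole: "entry" is the first mapping
		 * above it, or vm_map_to_entry(map) if there is none.
		 */
	}
}
#endif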
1929 #if CONFIG_PROB_GZALLOC
1930 boolean_t
1931 vm_map_lookup_entry_allow_pgz(
1932 	vm_map_t        map,
1933 	vm_map_offset_t address,
1934 	vm_map_entry_t  *entry)         /* OUT */
1935 {
1936 	if (VM_KERNEL_ADDRESS(address)) {
1937 		address = VM_KERNEL_STRIP_UPTR(address);
1938 	}
1939 	return vm_map_store_lookup_entry( map, address, entry );
1940 }
1941 #endif /* CONFIG_PROB_GZALLOC */
1942 
1943 /*
1944  *	Routine:	vm_map_range_invalid_panic
1945  *	Purpose:
1946  *			Panic on detection of an invalid range id.
1947  */
1948 __abortlike
1949 static void
1950 vm_map_range_invalid_panic(
1951 	vm_map_t                map,
1952 	vm_map_range_id_t       range_id)
1953 {
1954 	panic("invalid range ID (%u) for map %p", range_id, map);
1955 }
1956 
1957 /*
1958  *	Routine:	vm_map_get_range
1959  *	Purpose:
1960  *			Adjust bounds based on security policy.
1961  */
1962 static struct mach_vm_range
1963 vm_map_get_range(
1964 	vm_map_t                map,
1965 	vm_map_address_t       *address,
1966 	vm_map_kernel_flags_t  *vmk_flags,
1967 	vm_map_size_t           size,
1968 	bool                   *is_ptr)
1969 {
1970 	struct mach_vm_range effective_range = {};
1971 	vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;
1972 
1973 	if (map == kernel_map) {
1974 		effective_range = kmem_ranges[range_id];
1975 
1976 		if (startup_phase >= STARTUP_SUB_KMEM) {
1977 			/*
1978 			 * The hint provided by the caller is zeroed because the range is
1979 			 * restricted to a subset of the entire kernel_map VA; the hint could
1980 			 * fall outside that range and cause vm_map_store_find_space to fail.
1981 			 */
1982 			*address = 0ull;
1983 			/*
1984 			 * Ensure that range_id passed in by the caller is within meaningful
1985 			 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
1986 			 * to fail as the corresponding range is invalid. Range id larger than
1987 			 * KMEM_RANGE_ID_MAX will lead to an OOB access.
1988 			 */
1989 			if ((range_id == KMEM_RANGE_ID_NONE) ||
1990 			    (range_id > KMEM_RANGE_ID_MAX)) {
1991 				vm_map_range_invalid_panic(map, range_id);
1992 			}
1993 
1994 			/*
1995 			 * Pointer ranges use kmem_locate_space to do allocations.
1996 			 *
1997 			 * Non pointer fronts look like [ Small | Large | Permanent ]
1998 			 * Non-pointer fronts look like [ Small | Large | Permanent ].
1999 			 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
2000 			 * use the entire range.
2001 			 */
2002 			if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
2003 				*is_ptr = true;
2004 			} else if (size >= KMEM_SMALLMAP_THRESHOLD) {
2005 				effective_range = kmem_large_ranges[range_id];
2006 			}
2007 		}
2008 #if CONFIG_MAP_RANGES
2009 	} else if (map->uses_user_ranges) {
2010 		switch (range_id) {
2011 		case UMEM_RANGE_ID_DEFAULT:
2012 			effective_range = map->default_range;
2013 			break;
2014 		case UMEM_RANGE_ID_HEAP:
2015 			effective_range = map->data_range;
2016 			break;
2017 		case UMEM_RANGE_ID_FIXED:
2018 			/*
2019 			 * anywhere allocations with an address in "FIXED"
2020 			 * make no sense; leave the range empty
2021 			 */
2022 			break;
2023 
2024 		default:
2025 			vm_map_range_invalid_panic(map, range_id);
2026 		}
2027 #endif /* CONFIG_MAP_RANGES */
2028 	} else {
2029 		/*
2030 		 * If the minimum is 0, bump it up by the map's page size.
2031 		 * We want to limit allocations of PAGEZERO to explicit
2032 		 * requests, since its normal use is to catch dereferences
2033 		 * of NULL; many applications also treat pointers with a
2034 		 * value of 0 as special, and suddenly having address 0
2035 		 * contain usable memory would tend to confuse them.
2036 		 */
2037 		effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
2038 		effective_range.max_address = map->max_offset;
2039 	}
2040 
2041 	return effective_range;
2042 }
2043 
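/*
 * Editor's note: worked example (hypothetical values).  For a user map
 * with min_offset == 0 on a 16K-page system, the default policy above
 * yields the range [0x4000, map->max_offset): "anywhere" allocations can
 * therefore never land on page zero unless the caller asks for it
 * explicitly with a fixed-address mapping.
 */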
2044 /*
2045  *	Routine:	vm_map_locate_space
2046  *	Purpose:
2047  *		Finds a range in the specified virtual address map,
2048  *		returning the start of that range,
2049  *		as well as the entry right before it.
2050  */
2051 kern_return_t
2052 vm_map_locate_space(
2053 	vm_map_t                map,
2054 	vm_map_size_t           size,
2055 	vm_map_offset_t         mask,
2056 	vm_map_kernel_flags_t   vmk_flags,
2057 	vm_map_offset_t        *start_inout,
2058 	vm_map_entry_t         *entry_out)
2059 {
2060 	struct mach_vm_range effective_range = {};
2061 	vm_map_size_t   guard_offset;
2062 	vm_map_offset_t hint, limit;
2063 	vm_map_entry_t  entry;
2064 	bool            is_kmem_ptr_range = false;
2065 
2066 	/*
2067 	 * Only supported by vm_map_enter() with a fixed address.
2068 	 */
2069 	assert(!vmk_flags.vmkf_beyond_max);
2070 
2071 	if (__improbable(map->wait_for_space)) {
2072 		/*
2073 		 * support for "wait_for_space" is minimal;
2074 		 * its only consumer is the ipc_kernel_copy_map.
2075 		 */
2076 		assert(!map->holelistenabled &&
2077 		    !vmk_flags.vmkf_last_free &&
2078 		    !vmk_flags.vmkf_keep_map_locked &&
2079 		    !vmk_flags.vmkf_map_jit &&
2080 		    !vmk_flags.vmf_random_addr &&
2081 		    *start_inout <= map->min_offset);
2082 	} else if (vmk_flags.vmkf_last_free) {
2083 		assert(!vmk_flags.vmkf_map_jit &&
2084 		    !vmk_flags.vmf_random_addr);
2085 	}
2086 
2087 	if (vmk_flags.vmkf_guard_before) {
2088 		guard_offset = VM_MAP_PAGE_SIZE(map);
2089 		assert(size > guard_offset);
2090 		size -= guard_offset;
2091 	} else {
2092 		assert(size != 0);
2093 		guard_offset = 0;
2094 	}
2095 
2096 	/*
2097 	 * Validate range_id from flags and get associated range
2098 	 */
2099 	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
2100 	    &is_kmem_ptr_range);
2101 
2102 	if (is_kmem_ptr_range) {
2103 		return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
2104 		           vmk_flags.vmkf_last_free, start_inout, entry_out);
2105 	}
2106 
2107 #if XNU_TARGET_OS_OSX
2108 	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2109 		assert(map != kernel_map);
2110 		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2111 	}
2112 #endif /* XNU_TARGET_OS_OSX */
2113 
2114 again:
2115 	if (vmk_flags.vmkf_last_free) {
2116 		hint = *start_inout;
2117 
2118 		if (hint == 0 || hint > effective_range.max_address) {
2119 			hint = effective_range.max_address;
2120 		}
2121 		if (hint <= effective_range.min_address) {
2122 			return KERN_NO_SPACE;
2123 		}
2124 		limit = effective_range.min_address;
2125 	} else {
2126 		hint = *start_inout;
2127 
2128 		if (vmk_flags.vmkf_map_jit) {
2129 			if (map->jit_entry_exists &&
2130 			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
2131 				return KERN_INVALID_ARGUMENT;
2132 			}
2133 			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
2134 				vmk_flags.vmf_random_addr = true;
2135 			}
2136 		}
2137 
2138 		if (vmk_flags.vmf_random_addr) {
2139 			kern_return_t kr;
2140 
2141 			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
2142 			if (kr != KERN_SUCCESS) {
2143 				return kr;
2144 			}
2145 		}
2146 #if __x86_64__
2147 		else if ((hint == 0 || hint == vm_map_min(map)) &&
2148 		    !map->disable_vmentry_reuse &&
2149 		    map->vmmap_high_start != 0) {
2150 			hint = map->vmmap_high_start;
2151 		}
2152 #endif /* __x86_64__ */
2153 
2154 		if (hint < effective_range.min_address) {
2155 			hint = effective_range.min_address;
2156 		}
2157 		if (effective_range.max_address <= hint) {
2158 			return KERN_NO_SPACE;
2159 		}
2160 
2161 		limit = effective_range.max_address;
2162 	}
2163 	entry = vm_map_store_find_space(map,
2164 	    hint, limit, vmk_flags.vmkf_last_free,
2165 	    guard_offset, size, mask,
2166 	    start_inout);
2167 
2168 	if (__improbable(entry == NULL)) {
2169 		if (map->wait_for_space &&
2170 		    guard_offset + size <=
2171 		    effective_range.max_address - effective_range.min_address) {
2172 			assert_wait((event_t)map, THREAD_ABORTSAFE);
2173 			vm_map_unlock(map);
2174 			thread_block(THREAD_CONTINUE_NULL);
2175 			vm_map_lock(map);
2176 			goto again;
2177 		}
2178 		return KERN_NO_SPACE;
2179 	}
2180 
2181 	if (entry_out) {
2182 		*entry_out = entry;
2183 	}
2184 	return KERN_SUCCESS;
2185 }
2186 
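/*
 * Editor's note (illustrative sketch, not part of the original source):
 * the wakeup side of the "wait_for_space" handshake above lives in the
 * deallocation path, which is not shown in this excerpt.  Conceptually,
 * once space is freed:
 */
#if 0 /* example only */
static void
example_space_freed(vm_map_t map)
{
	if (map->wait_for_space) {
		/* unblock allocators parked in vm_map_locate_space() */
		thread_wakeup((event_t)map);
	}
}
#endif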
2187 
2188 /*
2189  *	Routine:	vm_map_find_space
2190  *	Purpose:
2191  *		Allocate a range in the specified virtual address map,
2192  *		returning the entry allocated for that range.
2193  *		Used by kmem_alloc, etc.
2194  *
2195  *		The map must NOT be locked. It will be returned locked
2196  *		on KERN_SUCCESS, unlocked on failure.
2197  *
2198  *		If an entry is allocated, the object/offset fields
2199  *		are initialized to zero.
2200  */
2201 kern_return_t
2202 vm_map_find_space(
2203 	vm_map_t                map,
2204 	vm_map_offset_t         hint_address,
2205 	vm_map_size_t           size,
2206 	vm_map_offset_t         mask,
2207 	vm_map_kernel_flags_t   vmk_flags,
2208 	vm_map_entry_t          *o_entry)       /* OUT */
2209 {
2210 	vm_map_entry_t          new_entry, entry;
2211 	kern_return_t           kr;
2212 
2213 	if (size == 0) {
2214 		return KERN_INVALID_ARGUMENT;
2215 	}
2216 
2217 	new_entry = vm_map_entry_create(map);
2218 	new_entry->use_pmap = true;
2219 	new_entry->protection = VM_PROT_DEFAULT;
2220 	new_entry->max_protection = VM_PROT_ALL;
2221 
2222 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
2223 		new_entry->map_aligned = true;
2224 	}
2225 	if (vmk_flags.vmf_permanent) {
2226 		new_entry->vme_permanent = true;
2227 	}
2228 
2229 	vm_map_lock(map);
2230 
2231 	kr = vm_map_locate_space(map, size, mask, vmk_flags,
2232 	    &hint_address, &entry);
2233 	if (kr != KERN_SUCCESS) {
2234 		vm_map_unlock(map);
2235 		vm_map_entry_dispose(new_entry);
2236 		return kr;
2237 	}
2238 	new_entry->vme_start = hint_address;
2239 	new_entry->vme_end = hint_address + size;
2240 
2241 	/*
2242 	 *	At this point,
2243 	 *
2244 	 *	- new_entry's "vme_start" and "vme_end" should define
2245 	 *	  the endpoints of the available new range,
2246 	 *
2247 	 *	- and "entry" should refer to the region before
2248 	 *	  the new range,
2249 	 *
2250 	 *	- and the map should still be locked.
2251 	 */
2252 
2253 	assert(page_aligned(new_entry->vme_start));
2254 	assert(page_aligned(new_entry->vme_end));
2255 	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
2256 	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));
2257 
2258 	/*
2259 	 *	Insert the new entry into the list
2260 	 */
2261 
2262 	vm_map_store_entry_link(map, entry, new_entry,
2263 	    VM_MAP_KERNEL_FLAGS_NONE);
2264 	map->size += size;
2265 
2266 	/*
2267 	 *	Update the lookup hint
2268 	 */
2269 	SAVE_HINT_MAP_WRITE(map, new_entry);
2270 
2271 	*o_entry = new_entry;
2272 	return KERN_SUCCESS;
2273 }
2274 
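/*
 * Editor's note (illustrative sketch, not part of the original source):
 * kmem-style use of vm_map_find_space(); the function name and locals are
 * hypothetical.  Note the asymmetric locking contract documented above:
 * the map comes back locked on success, unlocked on failure.
 */
#if 0 /* example only */
static kern_return_t
example_alloc_va(vm_map_size_t size, vm_map_offset_t *addr_out)
{
	vm_map_entry_t entry;
	kern_return_t kr;

	kr = vm_map_find_space(kernel_map, 0, size, 0,
	    VM_MAP_KERNEL_FLAGS_NONE, &entry);
	if (kr != KERN_SUCCESS) {
		return kr;              /* map is unlocked on failure */
	}
	*addr_out = entry->vme_start;   /* entry covers [vme_start, vme_end) */
	vm_map_unlock(kernel_map);      /* returned locked on success */
	return KERN_SUCCESS;
}
#endif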
2275 int vm_map_pmap_enter_print = FALSE;
2276 int vm_map_pmap_enter_enable = FALSE;
2277 
2278 /*
2279  *	Routine:	vm_map_pmap_enter [internal only]
2280  *
2281  *	Description:
2282  *		Force pages from the specified object to be entered into
2283  *		the pmap at the specified address if they are present.
2284  *		The scan ends as soon as a page is not found in the object.
2285  *
2286  *	Returns:
2287  *		Nothing.
2288  *
2289  *	In/out conditions:
2290  *		The source map should not be locked on entry.
2291  */
2292 __unused static void
2293 vm_map_pmap_enter(
2294 	vm_map_t                map,
2295 	vm_map_offset_t         addr,
2296 	vm_map_offset_t         end_addr,
2297 	vm_object_t             object,
2298 	vm_object_offset_t      offset,
2299 	vm_prot_t               protection)
2300 {
2301 	int                     type_of_fault;
2302 	kern_return_t           kr;
2303 	uint8_t                 object_lock_type = 0;
2304 	struct vm_object_fault_info fault_info = {};
2305 
2306 	if (map->pmap == 0) {
2307 		return;
2308 	}
2309 
2310 	assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);
2311 
2312 	while (addr < end_addr) {
2313 		vm_page_t       m;
2314 
2315 
2316 		/*
2317 		 * TODO:
2318 		 * From vm_map_enter(), we come into this function without the map
2319 		 * lock held or the object lock held.
2320 		 * We haven't taken a reference on the object either.
2321 		 * We should do a proper lookup on the map to make sure
2322 		 * that things are sane before we go locking objects that
2323 		 * could have been deallocated from under us.
2324 		 */
2325 
2326 		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2327 		vm_object_lock(object);
2328 
2329 		m = vm_page_lookup(object, offset);
2330 
2331 		if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
2332 		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
2333 			vm_object_unlock(object);
2334 			return;
2335 		}
2336 
2337 		if (vm_map_pmap_enter_print) {
2338 			printf("vm_map_pmap_enter:");
2339 			printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
2340 			    map, (unsigned long long)addr, object, (unsigned long long)offset);
2341 		}
2342 		type_of_fault = DBG_CACHE_HIT_FAULT;
2343 		kr = vm_fault_enter(m, map->pmap,
2344 		    addr,
2345 		    PAGE_SIZE, 0,
2346 		    protection, protection,
2347 		    VM_PAGE_WIRED(m),
2348 		    FALSE,                 /* change_wiring */
2349 		    VM_KERN_MEMORY_NONE,                 /* tag - not wiring */
2350 		    &fault_info,
2351 		    NULL,                  /* need_retry */
2352 		    &type_of_fault,
2353 		    &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/
2354 
2355 		vm_object_unlock(object);
2356 
2357 		offset += PAGE_SIZE_64;
2358 		addr += PAGE_SIZE;
2359 	}
2360 }
2361 
2362 #define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
2363 static kern_return_t
2364 vm_map_random_address_for_size(
2365 	vm_map_t                map,
2366 	vm_map_offset_t        *address,
2367 	vm_map_size_t           size,
2368 	vm_map_kernel_flags_t   vmk_flags)
2369 {
2370 	kern_return_t   kr = KERN_SUCCESS;
2371 	int             tries = 0;
2372 	vm_map_offset_t random_addr = 0;
2373 	vm_map_offset_t hole_end;
2374 
2375 	vm_map_entry_t  next_entry = VM_MAP_ENTRY_NULL;
2376 	vm_map_entry_t  prev_entry = VM_MAP_ENTRY_NULL;
2377 	vm_map_size_t   vm_hole_size = 0;
2378 	vm_map_size_t   addr_space_size;
2379 	bool            is_kmem_ptr;
2380 	struct mach_vm_range effective_range;
2381 
2382 	effective_range = vm_map_get_range(map, address, &vmk_flags, size,
2383 	    &is_kmem_ptr);
2384 
2385 	addr_space_size = effective_range.max_address - effective_range.min_address;
2386 	if (size >= addr_space_size) {
2387 		return KERN_NO_SPACE;
2388 	}
2389 	addr_space_size -= size;
2390 
2391 	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));
2392 
2393 	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2394 		if (startup_phase < STARTUP_SUB_ZALLOC) {
2395 			random_addr = (vm_map_offset_t)early_random();
2396 		} else {
2397 			random_addr = (vm_map_offset_t)random();
2398 		}
2399 		random_addr <<= VM_MAP_PAGE_SHIFT(map);
2400 		random_addr = vm_map_trunc_page(
2401 			effective_range.min_address + (random_addr % addr_space_size),
2402 			VM_MAP_PAGE_MASK(map));
2403 
2404 #if CONFIG_PROB_GZALLOC
2405 		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
2406 			continue;
2407 		}
2408 #endif /* CONFIG_PROB_GZALLOC */
2409 
2410 		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
2411 			if (prev_entry == vm_map_to_entry(map)) {
2412 				next_entry = vm_map_first_entry(map);
2413 			} else {
2414 				next_entry = prev_entry->vme_next;
2415 			}
2416 			if (next_entry == vm_map_to_entry(map)) {
2417 				hole_end = vm_map_max(map);
2418 			} else {
2419 				hole_end = next_entry->vme_start;
2420 			}
2421 			vm_hole_size = hole_end - random_addr;
2422 			if (vm_hole_size >= size) {
2423 				*address = random_addr;
2424 				break;
2425 			}
2426 		}
2427 		tries++;
2428 	}
2429 
2430 	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2431 		kr = KERN_NO_SPACE;
2432 	}
2433 	return kr;
2434 }
2435 
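/*
 * Editor's note: worked example (hypothetical numbers).  For a 16K-page
 * map (page shift 14), range [0x100000000, 0x200000000) and size 0x8000:
 *
 *   addr_space_size = 0x100000000 - 0x8000
 *   candidate = trunc_page(0x100000000 + ((r << 14) % addr_space_size))
 *
 * The candidate is accepted only if the hole at that address is at least
 * "size" bytes long; otherwise the loop retries, up to
 * MAX_TRIES_TO_GET_RANDOM_ADDRESS (1000) times before giving up with
 * KERN_NO_SPACE.
 */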
2436 static boolean_t
2437 vm_memory_malloc_no_cow(
2438 	int alias)
2439 {
2440 	uint64_t alias_mask;
2441 
2442 	if (!malloc_no_cow) {
2443 		return FALSE;
2444 	}
2445 	if (alias > 63) {
2446 		return FALSE;
2447 	}
2448 	alias_mask = 1ULL << alias;
2449 	if (alias_mask & vm_memory_malloc_no_cow_mask) {
2450 		return TRUE;
2451 	}
2452 	return FALSE;
2453 }
2454 
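/*
 * Editor's note (illustrative sketch, not part of the original source):
 * vm_memory_malloc_no_cow_mask is a 64-bit set indexed by VM_MEMORY_*
 * alias, which is why aliases above 63 are rejected.  A setup step might
 * populate it like this (the alias choices are illustrative):
 */
#if 0 /* example only */
static void
example_configure_no_cow_mask(void)
{
	vm_memory_malloc_no_cow_mask = (1ULL << VM_MEMORY_MALLOC) |
	    (1ULL << VM_MEMORY_MALLOC_SMALL) |
	    (1ULL << VM_MEMORY_MALLOC_TINY);
}
#endif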
2455 uint64_t vm_map_enter_RLIMIT_AS_count = 0;
2456 uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2457 /*
2458  *	Routine:	vm_map_enter
2459  *
2460  *	Description:
2461  *		Allocate a range in the specified virtual address map.
2462  *		The resulting range will refer to memory defined by
2463  *		the given memory object and offset into that object.
2464  *
2465  *		Arguments are as defined in the vm_map call.
2466  */
2467 static unsigned int vm_map_enter_restore_successes = 0;
2468 static unsigned int vm_map_enter_restore_failures = 0;
2469 kern_return_t
2470 vm_map_enter(
2471 	vm_map_t                map,
2472 	vm_map_offset_t         *address,       /* IN/OUT */
2473 	vm_map_size_t           size,
2474 	vm_map_offset_t         mask,
2475 	vm_map_kernel_flags_t   vmk_flags,
2476 	vm_object_t             object,
2477 	vm_object_offset_t      offset,
2478 	boolean_t               needs_copy,
2479 	vm_prot_t               cur_protection,
2480 	vm_prot_t               max_protection,
2481 	vm_inherit_t            inheritance)
2482 {
2483 	vm_map_entry_t          entry, new_entry;
2484 	vm_map_offset_t         start, tmp_start, tmp_offset;
2485 	vm_map_offset_t         end, tmp_end;
2486 	vm_map_offset_t         tmp2_start, tmp2_end;
2487 	vm_map_offset_t         step;
2488 	kern_return_t           result = KERN_SUCCESS;
2489 	bool                    map_locked = FALSE;
2490 	bool                    pmap_empty = TRUE;
2491 	bool                    new_mapping_established = FALSE;
2492 	const bool              keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2493 	const bool              anywhere = !vmk_flags.vmf_fixed;
2494 	const bool              purgable = vmk_flags.vmf_purgeable;
2495 	const bool              overwrite = vmk_flags.vmf_overwrite;
2496 	const bool              no_cache = vmk_flags.vmf_no_cache;
2497 	const bool              is_submap = vmk_flags.vmkf_submap;
2498 	const bool              permanent = vmk_flags.vmf_permanent;
2499 	const bool              no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2500 	const bool              entry_for_jit = vmk_flags.vmkf_map_jit;
2501 	const bool              iokit_acct = vmk_flags.vmkf_iokit_acct;
2502 	const bool              resilient_codesign = vmk_flags.vmf_resilient_codesign;
2503 	const bool              resilient_media = vmk_flags.vmf_resilient_media;
2504 	const bool              entry_for_tpro = vmk_flags.vmf_tpro;
2505 	const unsigned int      superpage_size = vmk_flags.vmf_superpage_size;
2506 	const vm_tag_t          alias = vmk_flags.vm_tag;
2507 	vm_tag_t                user_alias;
2508 	kern_return_t           kr;
2509 	bool                    clear_map_aligned = FALSE;
2510 	vm_map_size_t           chunk_size = 0;
2511 	vm_object_t             caller_object;
2512 	VM_MAP_ZAP_DECLARE(zap_old_list);
2513 	VM_MAP_ZAP_DECLARE(zap_new_list);
2514 
2515 	caller_object = object;
2516 
2517 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2518 
2519 	if (vmk_flags.vmf_4gb_chunk) {
2520 #if defined(__LP64__)
2521 		chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2522 #else /* __LP64__ */
2523 		chunk_size = ANON_CHUNK_SIZE;
2524 #endif /* __LP64__ */
2525 	} else {
2526 		chunk_size = ANON_CHUNK_SIZE;
2527 	}
2528 
2529 
2530 
2531 	if (superpage_size) {
2532 		switch (superpage_size) {
2533 			/*
2534 			 * Note that the current implementation only supports
2535 			 * a single size for superpages, SUPERPAGE_SIZE, per
2536 			 * architecture. Once more sizes are supported,
2537 			 * SUPERPAGE_SIZE will have to be replaced with a
2538 			 * lookup of the size based on superpage_size.
2539 			 */
2540 #ifdef __x86_64__
2541 		case SUPERPAGE_SIZE_ANY:
2542 			/* handle it like 2 MB and round up to page size */
2543 			size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2544 			OS_FALLTHROUGH;
2545 		case SUPERPAGE_SIZE_2MB:
2546 			break;
2547 #endif
2548 		default:
2549 			return KERN_INVALID_ARGUMENT;
2550 		}
2551 		mask = SUPERPAGE_SIZE - 1;
2552 		if (size & (SUPERPAGE_SIZE - 1)) {
2553 			return KERN_INVALID_ARGUMENT;
2554 		}
2555 		inheritance = VM_INHERIT_NONE;  /* fork() children won't inherit superpages */
2556 	}
2557 
2558 
2559 	if ((cur_protection & VM_PROT_WRITE) &&
2560 	    (cur_protection & VM_PROT_EXECUTE) &&
2561 #if XNU_TARGET_OS_OSX
2562 	    map->pmap != kernel_pmap &&
2563 	    (cs_process_global_enforcement() ||
2564 	    (vmk_flags.vmkf_cs_enforcement_override
2565 	    ? vmk_flags.vmkf_cs_enforcement
2566 	    : (vm_map_cs_enforcement(map)
2567 #if __arm64__
2568 	    || !VM_MAP_IS_EXOTIC(map)
2569 #endif /* __arm64__ */
2570 	    ))) &&
2571 #endif /* XNU_TARGET_OS_OSX */
2572 #if CODE_SIGNING_MONITOR
2573 	    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2574 #endif
2575 	    (VM_MAP_POLICY_WX_FAIL(map) ||
2576 	    VM_MAP_POLICY_WX_STRIP_X(map)) &&
2577 	    !entry_for_jit) {
2578 		boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2579 
2580 		DTRACE_VM3(cs_wx,
2581 		    uint64_t, 0,
2582 		    uint64_t, 0,
2583 		    vm_prot_t, cur_protection);
2584 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2585 		    proc_selfpid(),
2586 		    (get_bsdtask_info(current_task())
2587 		    ? proc_name_address(get_bsdtask_info(current_task()))
2588 		    : "?"),
2589 		    __FUNCTION__,
2590 		    (vm_protect_wx_fail ? "failing" : "turning off execute"));
2591 		cur_protection &= ~VM_PROT_EXECUTE;
2592 		if (vm_protect_wx_fail) {
2593 			return KERN_PROTECTION_FAILURE;
2594 		}
2595 	}
2596 
2597 	if (entry_for_jit
2598 	    && cur_protection != VM_PROT_ALL) {
2599 		/*
2600 		 * Native macOS processes and all non-macOS processes are
2601 		 * expected to create JIT regions via mmap(MAP_JIT, RWX) but
2602 		 * the RWX requirement was not enforced, and thus, we must live
2603 		 * with our sins. We are now dealing with a JIT mapping without
2604 		 * RWX.
2605 		 *
2606 		 * We deal with these by letting the MAP_JIT stick in order
2607 		 * to avoid CS violations when these pages are mapped executable
2608 		 * down the line. In order to appease the page table monitor (you
2609 		 * know what I'm talking about), these pages will end up being
2610 		 * marked as XNU_USER_DEBUG, which will be allowed because we
2611 		 * don't enforce the code signing monitor on macOS systems. If
2612 		 * the user-space application ever changes permissions to RWX,
2613 		 * which they are allowed to since the mapping was originally
2614 		 * created with MAP_JIT, then they'll switch over to using the
2615 		 * XNU_USER_JIT type, and won't be allowed to downgrade any
2616 		 * more after that.
2617 		 *
2618 		 * When not on macOS, a MAP_JIT mapping without VM_PROT_ALL is
2619 		 * strictly disallowed.
2620 		 */
2621 
2622 #if XNU_TARGET_OS_OSX
2623 		/*
2624 		 * Continue to allow non-RWX JIT
2625 		 */
2626 #else
2627 		/* non-macOS: reject JIT regions without RWX */
2628 		DTRACE_VM3(cs_wx,
2629 		    uint64_t, 0,
2630 		    uint64_t, 0,
2631 		    vm_prot_t, cur_protection);
2632 		printf("CODE SIGNING: %d[%s] %s(%d): JIT requires RWX: failing.\n",
2633 		    proc_selfpid(),
2634 		    (get_bsdtask_info(current_task())
2635 		    ? proc_name_address(get_bsdtask_info(current_task()))
2636 		    : "?"),
2637 		    __FUNCTION__,
2638 		    cur_protection);
2639 		return KERN_PROTECTION_FAILURE;
2640 #endif
2641 	}
2642 
2643 	/*
2644 	 * If the task has requested executable lockdown,
2645 	 * deny any new executable mapping.
2646 	 */
2647 	if (map->map_disallow_new_exec == TRUE) {
2648 		if (cur_protection & VM_PROT_EXECUTE) {
2649 			return KERN_PROTECTION_FAILURE;
2650 		}
2651 	}
2652 
2653 	if (resilient_codesign) {
2654 		assert(!is_submap);
2655 		int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2656 		if ((cur_protection | max_protection) & reject_prot) {
2657 			return KERN_PROTECTION_FAILURE;
2658 		}
2659 	}
2660 
2661 	if (resilient_media) {
2662 		assert(!is_submap);
2663 //		assert(!needs_copy);
2664 		if (object != VM_OBJECT_NULL &&
2665 		    !object->internal) {
2666 			/*
2667 			 * This mapping is directly backed by an external
2668 			 * memory manager (e.g. a vnode pager for a file):
2669 			 * we would not have any safe place to inject
2670 			 * a zero-filled page if an actual page is not
2671 			 * available, without possibly impacting the actual
2672 			 * contents of the mapped object (e.g. the file),
2673 			 * so we can't provide any media resiliency here.
2674 			 */
2675 			return KERN_INVALID_ARGUMENT;
2676 		}
2677 	}
2678 
2679 	if (entry_for_tpro) {
2680 		/*
2681 		 * TPRO overrides the effective permissions of the region
2682 		 * and explicitly maps as RW. Ensure we have been passed
2683 		 * the expected permissions. We accept `cur_protections`
2684 		 * RO as that will be handled on fault.
2685 		 */
2686 		if (!(max_protection & VM_PROT_READ) ||
2687 		    !(max_protection & VM_PROT_WRITE) ||
2688 		    !(cur_protection & VM_PROT_READ)) {
2689 			return KERN_PROTECTION_FAILURE;
2690 		}
2691 
2692 		/*
2693 		 * We can now downgrade the cur_protection to RO. This is a mild lie
2694 		 * to the VM layer. But TPRO will be responsible for toggling the
2695 		 * protections between RO/RW
2696 		 * protections between RO/RW.
2697 		cur_protection = VM_PROT_READ;
2698 	}
2699 
2700 	if (is_submap) {
2701 		vm_map_t submap;
2702 		if (purgable) {
2703 			/* submaps cannot be purgeable */
2704 			return KERN_INVALID_ARGUMENT;
2705 		}
2706 		if (object == VM_OBJECT_NULL) {
2707 			/* submaps cannot be created lazily */
2708 			return KERN_INVALID_ARGUMENT;
2709 		}
2710 		submap = (vm_map_t) object;
2711 		if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
2712 			/* page size mismatch */
2713 			return KERN_INVALID_ARGUMENT;
2714 		}
2715 	}
2716 	if (vmk_flags.vmkf_already) {
2717 		/*
2718 		 * VM_FLAGS_ALREADY says that it's OK if the same mapping
2719 		 * is already present.  For it to be meaningful, the requested
2720 		 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
2721 		 * we shouldn't try to remove what was mapped there first
2722 		 * (!VM_FLAGS_OVERWRITE).
2723 		 */
2724 		if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
2725 			return KERN_INVALID_ARGUMENT;
2726 		}
2727 	}
2728 
2729 	if (size == 0 ||
2730 	    (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
2731 		*address = 0;
2732 		return KERN_INVALID_ARGUMENT;
2733 	}
2734 
2735 	if (map->pmap == kernel_pmap) {
2736 		user_alias = VM_KERN_MEMORY_NONE;
2737 	} else {
2738 		user_alias = alias;
2739 	}
2740 
2741 	if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
2742 		chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
2743 	}
2744 
2745 #define RETURN(value)   { result = value; goto BailOut; }
2746 
2747 	assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
2748 	assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
2749 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
2750 		assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
2751 		assertf(page_aligned(size), "0x%llx", (uint64_t)size);
2752 	}
2753 
2754 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2755 	    !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
2756 		/*
2757 		 * In most cases, the caller rounds the size up to the
2758 		 * map's page size.
2759 		 * If we get a size that is explicitly not map-aligned here,
2760 		 * we'll have to respect the caller's wish and mark the
2761 		 * mapping as "not map-aligned" to avoid tripping the
2762 		 * map alignment checks later.
2763 		 */
2764 		clear_map_aligned = TRUE;
2765 	}
2766 	if (!anywhere &&
2767 	    VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2768 	    !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
2769 		/*
2770 		 * We've been asked to map at a fixed address and that
2771 		 * address is not aligned to the map's specific alignment.
2772 		 * The caller should know what it's doing (i.e. most likely
2773 		 * mapping some fragmented copy map, transferring memory from
2774 		 * a VM map with a different alignment), so clear map_aligned
2775 		 * for this new VM map entry and proceed.
2776 		 */
2777 		clear_map_aligned = TRUE;
2778 	}
2779 
2780 	/*
2781 	 * Only zero-fill objects are allowed to be purgable.
2782 	 * LP64todo - limit purgable objects to 32-bits for now
2783 	 */
2784 	if (purgable &&
2785 	    (offset != 0 ||
2786 	    (object != VM_OBJECT_NULL &&
2787 	    (object->vo_size != size ||
2788 	    object->purgable == VM_PURGABLE_DENY))
2789 #if __LP64__
2790 	    || size > ANON_MAX_SIZE
2791 #endif
2792 	    )) {
2793 		return KERN_INVALID_ARGUMENT;
2794 	}
2795 
2796 	start = *address;
2797 
2798 	if (anywhere) {
2799 		vm_map_lock(map);
2800 		map_locked = TRUE;
2801 
2802 		result = vm_map_locate_space(map, size, mask, vmk_flags,
2803 		    &start, &entry);
2804 		if (result != KERN_SUCCESS) {
2805 			goto BailOut;
2806 		}
2807 
2808 		*address = start;
2809 		end = start + size;
2810 		assert(VM_MAP_PAGE_ALIGNED(*address,
2811 		    VM_MAP_PAGE_MASK(map)));
2812 	} else {
2813 		vm_map_offset_t effective_min_offset, effective_max_offset;
2814 
2815 		effective_min_offset = map->min_offset;
2816 		effective_max_offset = map->max_offset;
2817 
2818 		if (vmk_flags.vmkf_beyond_max) {
2819 			/*
2820 			 * Allow an insertion beyond the map's max offset.
2821 			 */
2822 			effective_max_offset = 0x00000000FFFFF000ULL;
2823 			if (vm_map_is_64bit(map)) {
2824 				effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2825 			}
2826 #if XNU_TARGET_OS_OSX
2827 		} else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2828 			effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2829 #endif /* XNU_TARGET_OS_OSX */
2830 		}
2831 
2832 		if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2833 		    !overwrite &&
2834 		    user_alias == VM_MEMORY_REALLOC) {
2835 			/*
2836 			 * Force realloc() to switch to a new allocation,
2837 			 * to prevent 4k-fragmented virtual ranges.
2838 			 */
2839 //			DEBUG4K_ERROR("no realloc in place");
2840 			return KERN_NO_SPACE;
2841 		}
2842 
2843 		/*
2844 		 *	Verify that:
2845 		 *		the address doesn't itself violate
2846 		 *		the mask requirement.
2847 		 */
2848 
2849 		vm_map_lock(map);
2850 		map_locked = TRUE;
2851 		if ((start & mask) != 0) {
2852 			RETURN(KERN_NO_SPACE);
2853 		}
2854 
2855 #if CONFIG_MAP_RANGES
2856 		if (map->uses_user_ranges) {
2857 			struct mach_vm_range r;
2858 
2859 			vm_map_user_range_resolve(map, start, 1, &r);
2860 			if (r.max_address == 0) {
2861 				RETURN(KERN_INVALID_ADDRESS);
2862 			}
2863 			effective_min_offset = r.min_address;
2864 			effective_max_offset = r.max_address;
2865 		}
2866 #endif /* CONFIG_MAP_RANGES */
2867 
2868 		if ((startup_phase >= STARTUP_SUB_KMEM) && !is_submap &&
2869 		    (map == kernel_map)) {
2870 			mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
2871 			effective_min_offset = r->min_address;
2872 			effective_max_offset = r->max_address;
2873 		}
2874 
2875 		/*
2876 		 *	...	the address is within bounds
2877 		 */
2878 
2879 		end = start + size;
2880 
2881 		if ((start < effective_min_offset) ||
2882 		    (end > effective_max_offset) ||
2883 		    (start >= end)) {
2884 			RETURN(KERN_INVALID_ADDRESS);
2885 		}
2886 
2887 		if (overwrite) {
2888 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE;
2889 			kern_return_t remove_kr;
2890 
2891 			/*
2892 			 * Fixed mapping and "overwrite" flag: attempt to
2893 			 * remove all existing mappings in the specified
2894 			 * address range, saving them in our "zap_old_list".
2895 			 *
2896 			 * This avoids releasing the VM map lock in
2897 			 * vm_map_entry_delete() and allows atomicity
2898 			 * when we want to replace some mappings with a new one.
2899 			 * It also allows us to restore the old VM mappings if the
2900 			 * new mapping fails.
2901 			 */
2902 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2903 
2904 			if (vmk_flags.vmkf_overwrite_immutable) {
2905 				/* we can overwrite immutable mappings */
2906 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2907 			}
2908 			if (vmk_flags.vmkf_remap_prot_copy) {
2909 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2910 			}
2911 			remove_kr = vm_map_delete(map, start, end, remove_flags,
2912 			    KMEM_GUARD_NONE, &zap_old_list).kmr_return;
2913 			if (remove_kr) {
2914 				/* XXX FBDP restore zap_old_list? */
2915 				RETURN(remove_kr);
2916 			}
2917 		}
2918 
2919 		/*
2920 		 *	...	the starting address isn't allocated
2921 		 */
2922 
2923 		if (vm_map_lookup_entry(map, start, &entry)) {
2924 			if (!(vmk_flags.vmkf_already)) {
2925 				RETURN(KERN_NO_SPACE);
2926 			}
2927 			/*
2928 			 * Check if what's already there is what we want.
2929 			 */
2930 			tmp_start = start;
2931 			tmp_offset = offset;
2932 			if (entry->vme_start < start) {
2933 				tmp_start -= start - entry->vme_start;
2934 				tmp_offset -= start - entry->vme_start;
2935 			}
2936 			for (; entry->vme_start < end;
2937 			    entry = entry->vme_next) {
2938 				/*
2939 				 * Check if the mapping's attributes
2940 				 * match the existing map entry.
2941 				 */
2942 				if (entry == vm_map_to_entry(map) ||
2943 				    entry->vme_start != tmp_start ||
2944 				    entry->is_sub_map != is_submap ||
2945 				    VME_OFFSET(entry) != tmp_offset ||
2946 				    entry->needs_copy != needs_copy ||
2947 				    entry->protection != cur_protection ||
2948 				    entry->max_protection != max_protection ||
2949 				    entry->inheritance != inheritance ||
2950 				    entry->iokit_acct != iokit_acct ||
2951 				    VME_ALIAS(entry) != alias) {
2952 					/* not the same mapping ! */
2953 					RETURN(KERN_NO_SPACE);
2954 				}
2955 				/*
2956 				 * Check if the same object is being mapped.
2957 				 */
2958 				if (is_submap) {
2959 					if (VME_SUBMAP(entry) !=
2960 					    (vm_map_t) object) {
2961 						/* not the same submap */
2962 						RETURN(KERN_NO_SPACE);
2963 					}
2964 				} else {
2965 					if (VME_OBJECT(entry) != object) {
2966 						/* not the same VM object... */
2967 						vm_object_t obj2;
2968 
2969 						obj2 = VME_OBJECT(entry);
2970 						if ((obj2 == VM_OBJECT_NULL ||
2971 						    obj2->internal) &&
2972 						    (object == VM_OBJECT_NULL ||
2973 						    object->internal)) {
2974 							/*
2975 							 * ... but both are
2976 							 * anonymous memory,
2977 							 * so equivalent.
2978 							 */
2979 						} else {
2980 							RETURN(KERN_NO_SPACE);
2981 						}
2982 					}
2983 				}
2984 
2985 				tmp_offset += entry->vme_end - entry->vme_start;
2986 				tmp_start += entry->vme_end - entry->vme_start;
2987 				if (entry->vme_end >= end) {
2988 					/* reached the end of our mapping */
2989 					break;
2990 				}
2991 			}
2992 			/* it all matches:  let's use what's already there ! */
2993 			RETURN(KERN_MEMORY_PRESENT);
2994 		}
2995 
2996 		/*
2997 		 *	...	the next region doesn't overlap the
2998 		 *		end point.
2999 		 */
3000 
3001 		if ((entry->vme_next != vm_map_to_entry(map)) &&
3002 		    (entry->vme_next->vme_start < end)) {
3003 			RETURN(KERN_NO_SPACE);
3004 		}
3005 	}
3006 
3007 	/*
3008 	 *	At this point,
3009 	 *		"start" and "end" should define the endpoints of the
3010 	 *			available new range, and
3011 	 *		"entry" should refer to the region before the new
3012 	 *			range, and
3013 	 *
3014 	 *		the map should be locked.
3015 	 */
3016 
3017 	/*
3018 	 *	See whether we can avoid creating a new entry (and object) by
3019 	 *	extending one of our neighbors.  [So far, we only attempt to
3020 	 *	extend from below.]  Note that we can never extend/join
3021 	 *	purgable objects because they need to remain distinct
3022 	 *	entities in order to implement their "volatile object"
3023 	 *	semantics.
3024 	 */
3025 
3026 	if (purgable ||
3027 	    entry_for_jit ||
3028 	    entry_for_tpro ||
3029 	    vm_memory_malloc_no_cow(user_alias)) {
3030 		if (object == VM_OBJECT_NULL) {
3031 			object = vm_object_allocate(size);
3032 			vm_object_lock(object);
3033 			object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3034 			VM_OBJECT_SET_TRUE_SHARE(object, FALSE);
3035 			if (malloc_no_cow_except_fork &&
3036 			    !purgable &&
3037 			    !entry_for_jit &&
3038 			    !entry_for_tpro &&
3039 			    vm_memory_malloc_no_cow(user_alias)) {
3040 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY_FORK;
3041 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
3042 			}
3043 			if (purgable) {
3044 				task_t owner;
3045 				VM_OBJECT_SET_PURGABLE(object, VM_PURGABLE_NONVOLATILE);
3046 				if (map->pmap == kernel_pmap) {
3047 					/*
3048 					 * Purgeable mappings made in a kernel
3049 					 * map are "owned" by the kernel itself
3050 					 * rather than the current user task
3051 					 * because they're likely to be used by
3052 					 * more than this user task (see
3053 					 * execargs_purgeable_allocate(), for
3054 					 * example).
3055 					 */
3056 					owner = kernel_task;
3057 				} else {
3058 					owner = current_task();
3059 				}
3060 				assert(object->vo_owner == NULL);
3061 				assert(object->resident_page_count == 0);
3062 				assert(object->wired_page_count == 0);
3063 				vm_purgeable_nonvolatile_enqueue(object, owner);
3064 			}
3065 			vm_object_unlock(object);
3066 			offset = (vm_object_offset_t)0;
3067 		}
3068 	} else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
3069 		/* no coalescing if address space uses sub-pages */
3070 	} else if ((is_submap == FALSE) &&
3071 	    (object == VM_OBJECT_NULL) &&
3072 	    (entry != vm_map_to_entry(map)) &&
3073 	    (entry->vme_end == start) &&
3074 	    (!entry->is_shared) &&
3075 	    (!entry->is_sub_map) &&
3076 	    (!entry->in_transition) &&
3077 	    (!entry->needs_wakeup) &&
3078 	    (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
3079 	    (entry->protection == cur_protection) &&
3080 	    (entry->max_protection == max_protection) &&
3081 	    (entry->inheritance == inheritance) &&
3082 	    ((user_alias == VM_MEMORY_REALLOC) ||
3083 	    (VME_ALIAS(entry) == alias)) &&
3084 	    (entry->no_cache == no_cache) &&
3085 	    (entry->vme_permanent == permanent) &&
3086 	    /* no coalescing for immutable executable mappings */
3087 	    !((entry->protection & VM_PROT_EXECUTE) &&
3088 	    entry->vme_permanent) &&
3089 	    (!entry->superpage_size && !superpage_size) &&
3090 	    /*
3091 	     * No coalescing if not map-aligned, to avoid propagating
3092 	     * that condition any further than needed:
3093 	     */
3094 	    (!entry->map_aligned || !clear_map_aligned) &&
3095 	    (!entry->zero_wired_pages) &&
3096 	    (!entry->used_for_jit && !entry_for_jit) &&
3097 #if __arm64e__
3098 	    (!entry->used_for_tpro && !entry_for_tpro) &&
3099 #endif
3100 	    (!entry->csm_associated) &&
3101 	    (entry->iokit_acct == iokit_acct) &&
3102 	    (!entry->vme_resilient_codesign) &&
3103 	    (!entry->vme_resilient_media) &&
3104 	    (!entry->vme_atomic) &&
3105 	    (entry->vme_no_copy_on_read == no_copy_on_read) &&
3106 
3107 	    ((entry->vme_end - entry->vme_start) + size <=
3108 	    (user_alias == VM_MEMORY_REALLOC ?
3109 	    ANON_CHUNK_SIZE :
3110 	    NO_COALESCE_LIMIT)) &&
3111 
3112 	    (entry->wired_count == 0)) {        /* implies user_wired_count == 0 */
3113 		if (vm_object_coalesce(VME_OBJECT(entry),
3114 		    VM_OBJECT_NULL,
3115 		    VME_OFFSET(entry),
3116 		    (vm_object_offset_t) 0,
3117 		    (vm_map_size_t)(entry->vme_end - entry->vme_start),
3118 		    (vm_map_size_t)(end - entry->vme_end))) {
3119 			/*
3120 			 *	Coalesced the two objects - can extend
3121 			 *	the previous map entry to include the
3122 			 *	new range.
3123 			 */
3124 			map->size += (end - entry->vme_end);
3125 			assert(entry->vme_start < end);
3126 			assert(VM_MAP_PAGE_ALIGNED(end,
3127 			    VM_MAP_PAGE_MASK(map)));
3128 			if (__improbable(vm_debug_events)) {
3129 				DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
3130 			}
3131 			entry->vme_end = end;
3132 			if (map->holelistenabled) {
3133 				vm_map_store_update_first_free(map, entry, TRUE);
3134 			} else {
3135 				vm_map_store_update_first_free(map, map->first_free, TRUE);
3136 			}
3137 			new_mapping_established = TRUE;
3138 			RETURN(KERN_SUCCESS);
3139 		}
3140 	}
3141 
3142 	step = superpage_size ? SUPERPAGE_SIZE : (end - start);
3143 	new_entry = NULL;
3144 
3145 	if (vmk_flags.vmkf_submap_adjust) {
3146 		vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
3147 		offset = start;
3148 	}
3149 
3150 	for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3151 		tmp2_end = tmp2_start + step;
3152 		/*
3153 		 *	Create a new entry
3154 		 *
3155 		 * XXX FBDP
3156 		 * The reserved "page zero" in each process's address space can
3157 		 * be arbitrarily large.  Splitting it into separate objects and
3158 		 * therefore different VM map entries serves no purpose and just
3159 		 * slows down operations on the VM map, so let's not split the
3160 		 * allocation into chunks if the max protection is NONE.  That
3161 		 * memory should never be accessible, so it will never get to the
3162 		 * default pager.
3163 		 */
3164 		tmp_start = tmp2_start;
3165 		if (!is_submap &&
3166 		    object == VM_OBJECT_NULL &&
3167 		    size > chunk_size &&
3168 		    max_protection != VM_PROT_NONE &&
3169 		    superpage_size == 0) {
3170 			tmp_end = tmp_start + chunk_size;
3171 		} else {
3172 			tmp_end = tmp2_end;
3173 		}
3174 		do {
3175 			if (!is_submap &&
3176 			    object != VM_OBJECT_NULL &&
3177 			    object->internal &&
3178 			    offset + (tmp_end - tmp_start) > object->vo_size) {
3179 //				printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3180 				DTRACE_VM5(vm_map_enter_overmap,
3181 				    vm_map_t, map,
3182 				    vm_map_address_t, tmp_start,
3183 				    vm_map_address_t, tmp_end,
3184 				    vm_object_offset_t, offset,
3185 				    vm_object_size_t, object->vo_size);
3186 			}
3187 			new_entry = vm_map_entry_insert(map,
3188 			    entry, tmp_start, tmp_end,
3189 			    object, offset, vmk_flags,
3190 			    needs_copy,
3191 			    cur_protection, max_protection,
3192 			    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3193 			    VM_INHERIT_NONE : inheritance),
3194 			    clear_map_aligned);
3195 
3196 			assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias));
3197 
3198 			if (resilient_codesign) {
3199 				int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3200 				if (!((cur_protection | max_protection) & reject_prot)) {
3201 					new_entry->vme_resilient_codesign = TRUE;
3202 				}
3203 			}
3204 
3205 			if (resilient_media &&
3206 			    (object == VM_OBJECT_NULL ||
3207 			    object->internal)) {
3208 				new_entry->vme_resilient_media = TRUE;
3209 			}
3210 
3211 			assert(!new_entry->iokit_acct);
3212 			if (!is_submap &&
3213 			    object != VM_OBJECT_NULL &&
3214 			    (object->purgable != VM_PURGABLE_DENY ||
3215 			    object->vo_ledger_tag)) {
3216 				assert(new_entry->use_pmap);
3217 				assert(!new_entry->iokit_acct);
3218 				/*
3219 				 * Turn off pmap accounting since
3220 				 * purgeable (or tagged) objects have their
3221 				 * own ledgers.
3222 				 */
3223 				new_entry->use_pmap = FALSE;
3224 			} else if (!is_submap &&
3225 			    iokit_acct &&
3226 			    object != VM_OBJECT_NULL &&
3227 			    object->internal) {
3228 				/* alternate accounting */
3229 				assert(!new_entry->iokit_acct);
3230 				assert(new_entry->use_pmap);
3231 				new_entry->iokit_acct = TRUE;
3232 				new_entry->use_pmap = FALSE;
3233 				DTRACE_VM4(
3234 					vm_map_iokit_mapped_region,
3235 					vm_map_t, map,
3236 					vm_map_offset_t, new_entry->vme_start,
3237 					vm_map_offset_t, new_entry->vme_end,
3238 					int, VME_ALIAS(new_entry));
3239 				vm_map_iokit_mapped_region(
3240 					map,
3241 					(new_entry->vme_end -
3242 					new_entry->vme_start));
3243 			} else if (!is_submap) {
3244 				assert(!new_entry->iokit_acct);
3245 				assert(new_entry->use_pmap);
3246 			}
3247 
3248 			if (is_submap) {
3249 				vm_map_t        submap;
3250 				boolean_t       submap_is_64bit;
3251 				boolean_t       use_pmap;
3252 
3253 				assert(new_entry->is_sub_map);
3254 				assert(!new_entry->use_pmap);
3255 				assert(!new_entry->iokit_acct);
3256 				submap = (vm_map_t) object;
3257 				submap_is_64bit = vm_map_is_64bit(submap);
3258 				use_pmap = vmk_flags.vmkf_nested_pmap;
3259 #ifndef NO_NESTED_PMAP
3260 				if (use_pmap && submap->pmap == NULL) {
3261 					ledger_t ledger = map->pmap->ledger;
3262 					/* we need a sub pmap to nest... */
3263 					submap->pmap = pmap_create_options(ledger, 0,
3264 					    submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3265 					if (submap->pmap == NULL) {
3266 						/* let's proceed without nesting... */
3267 					}
3268 #if defined(__arm64__)
3269 					else {
3270 						pmap_set_nested(submap->pmap);
3271 					}
3272 #endif
3273 				}
3274 				if (use_pmap && submap->pmap != NULL) {
3275 					if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3276 						DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3277 						kr = KERN_FAILURE;
3278 					} else {
3279 						kr = pmap_nest(map->pmap,
3280 						    submap->pmap,
3281 						    tmp_start,
3282 						    tmp_end - tmp_start);
3283 					}
3284 					if (kr != KERN_SUCCESS) {
3285 						printf("vm_map_enter: "
3286 						    "pmap_nest(0x%llx,0x%llx) "
3287 						    "error 0x%x\n",
3288 						    (long long)tmp_start,
3289 						    (long long)tmp_end,
3290 						    kr);
3291 					} else {
3292 						/* we're now nested! */
3293 						new_entry->use_pmap = TRUE;
3294 						pmap_empty = FALSE;
3295 					}
3296 				}
3297 #endif /* NO_NESTED_PMAP */
3298 			}
3299 			entry = new_entry;
3300 
3301 			if (superpage_size) {
3302 				vm_page_t pages, m;
3303 				vm_object_t sp_object;
3304 				vm_object_offset_t sp_offset;
3305 
3306 				VME_OFFSET_SET(entry, 0);
3307 
3308 				/* allocate one superpage */
3309 				kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3310 				if (kr != KERN_SUCCESS) {
3311 					/* deallocate whole range... */
3312 					new_mapping_established = TRUE;
3313 					/* ... but only up to "tmp_end" */
3314 					size -= end - tmp_end;
3315 					RETURN(kr);
3316 				}
3317 
3318 				/* create one vm_object per superpage */
3319 				sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3320 				vm_object_lock(sp_object);
3321 				sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3322 				VM_OBJECT_SET_PHYS_CONTIGUOUS(sp_object, TRUE);
3323 				sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3324 				VME_OBJECT_SET(entry, sp_object, false, 0);
3325 				assert(entry->use_pmap);
3326 
3327 				/* enter the base pages into the object */
3328 				for (sp_offset = 0;
3329 				    sp_offset < SUPERPAGE_SIZE;
3330 				    sp_offset += PAGE_SIZE) {
3331 					m = pages;
3332 					pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3333 					pages = NEXT_PAGE(m);
3334 					*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3335 					vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3336 				}
3337 				vm_object_unlock(sp_object);
3338 			}
3339 		} while (tmp_end != tmp2_end &&
3340 		    (tmp_start = tmp_end) &&
3341 		    (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3342 		    tmp_end + chunk_size : tmp2_end));
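		/*
		 * Note: the "&&" chain in the while condition above is a
		 * compact way to advance the chunk window.  A sketch of the
		 * equivalent control flow (illustrative only):
		 *
		 *	while (tmp_end != tmp2_end) {
		 *		tmp_start = tmp_end;
		 *		if (tmp2_end - tmp_end > chunk_size)
		 *			tmp_end += chunk_size;
		 *		else
		 *			tmp_end = tmp2_end;
		 *		map the next [tmp_start, tmp_end) chunk;
		 *	}
		 */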
3343 	}
3344 
3345 	new_mapping_established = TRUE;
3346 
3347 BailOut:
3348 	assert(map_locked == TRUE);
3349 
3350 	/*
3351 	 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3352 	 * If we have identified and possibly established the new mapping(s),
3353 	 * make sure we did not go beyond the address space limit.
3354 	 */
3355 	if (result == KERN_SUCCESS) {
3356 		if (map->size_limit != RLIM_INFINITY &&
3357 		    map->size > map->size_limit) {
3358 			/*
3359 			 * Establishing the requested mappings would exceed
3360 			 * the process's RLIMIT_AS limit: fail with
3361 			 * KERN_NO_SPACE.
3362 			 */
3363 			result = KERN_NO_SPACE;
3364 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3365 			    proc_selfpid(),
3366 			    (get_bsdtask_info(current_task())
3367 			    ? proc_name_address(get_bsdtask_info(current_task()))
3368 			    : "?"),
3369 			    __FUNCTION__,
3370 			    (uint64_t) map->size,
3371 			    (uint64_t) map->size_limit);
3372 			DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3373 			    vm_map_size_t, map->size,
3374 			    uint64_t, map->size_limit);
3375 			vm_map_enter_RLIMIT_AS_count++;
3376 		} else if (map->data_limit != RLIM_INFINITY &&
3377 		    map->size > map->data_limit) {
3378 			/*
3379 			 * Establishing the requested mappings would exceed
3380 			 * the process's RLIMIT_DATA limit: fail with
3381 			 * KERN_NO_SPACE.
3382 			 */
3383 			result = KERN_NO_SPACE;
3384 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3385 			    proc_selfpid(),
3386 			    (get_bsdtask_info(current_task())
3387 			    ? proc_name_address(get_bsdtask_info(current_task()))
3388 			    : "?"),
3389 			    __FUNCTION__,
3390 			    (uint64_t) map->size,
3391 			    (uint64_t) map->data_limit);
3392 			DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3393 			    vm_map_size_t, map->size,
3394 			    uint64_t, map->data_limit);
3395 			vm_map_enter_RLIMIT_DATA_count++;
3396 		}
3397 	}
3398 
3399 	if (result == KERN_SUCCESS) {
3400 		vm_prot_t pager_prot;
3401 		memory_object_t pager;
3402 
3403 #if DEBUG
3404 		if (pmap_empty &&
3405 		    !(vmk_flags.vmkf_no_pmap_check)) {
3406 			assert(pmap_is_empty(map->pmap,
3407 			    *address,
3408 			    *address + size));
3409 		}
3410 #endif /* DEBUG */
3411 
3412 		/*
3413 		 * For "named" VM objects, let the pager know that the
3414 		 * memory object is being mapped.  Some pagers need to keep
3415 		 * track of this, to know when they can reclaim the memory
3416 		 * object, for example.
3417 		 * VM calls memory_object_map() for each mapping (specifying
3418 		 * the protection of each mapping) and calls
3419 		 * memory_object_last_unmap() when all the mappings are gone.
3420 		 */
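		/*
		 * Illustrative sketch (not this file's code): a pager relying
		 * on this protocol might keep a mapping count, e.g.:
		 *
		 *	kern_return_t
		 *	my_pager_map(memory_object_t mo, vm_prot_t prot)
		 *	{
		 *		my_pager_t p = mo_to_my_pager(mo);
		 *		os_atomic_inc(&p->map_count, relaxed);
		 *		return KERN_SUCCESS;
		 *	}
		 *
		 * with the matching "last unmap" callout dropping the count
		 * and allowing reclaim at zero.  "my_pager_*" and "map_count"
		 * are hypothetical names, not xnu interfaces.
		 */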
3421 		pager_prot = max_protection;
3422 		if (needs_copy) {
3423 			/*
3424 			 * Copy-On-Write mapping: won't modify
3425 			 * the memory object.
3426 			 */
3427 			pager_prot &= ~VM_PROT_WRITE;
3428 		}
3429 		if (!is_submap &&
3430 		    object != VM_OBJECT_NULL &&
3431 		    object->named &&
3432 		    object->pager != MEMORY_OBJECT_NULL) {
3433 			vm_object_lock(object);
3434 			pager = object->pager;
3435 			if (object->named &&
3436 			    pager != MEMORY_OBJECT_NULL) {
3437 				assert(object->pager_ready);
3438 				vm_object_mapping_wait(object, THREAD_UNINT);
3439 				vm_object_mapping_begin(object);
3440 				vm_object_unlock(object);
3441 
3442 				kr = memory_object_map(pager, pager_prot);
3443 				assert(kr == KERN_SUCCESS);
3444 
3445 				vm_object_lock(object);
3446 				vm_object_mapping_end(object);
3447 			}
3448 			vm_object_unlock(object);
3449 		}
3450 	}
3451 
3452 	assert(map_locked == TRUE);
3453 
3454 	if (new_mapping_established) {
3455 		/*
3456 		 * If we release the map lock for any reason below,
3457 		 * another thread could deallocate our new mapping,
3458 		 * releasing the caller's reference on "caller_object",
3459 		 * which was transferred to the mapping.
3460 		 * If this was the only reference, the object could be
3461 		 * destroyed.
3462 		 *
3463 		 * We need to take an extra reference on "caller_object"
3464 		 * to keep it alive, so that we can still return the
3465 		 * caller's reference on failure.
3466 		 */
3467 		if (is_submap) {
3468 			vm_map_reference((vm_map_t)caller_object);
3469 		} else {
3470 			vm_object_reference(caller_object);
3471 		}
3472 	}
3473 
3474 	if (!keep_map_locked) {
3475 		vm_map_unlock(map);
3476 		map_locked = FALSE;
3477 		entry = VM_MAP_ENTRY_NULL;
3478 		new_entry = VM_MAP_ENTRY_NULL;
3479 	}
3480 
3481 	/*
3482 	 * We can't hold the map lock if we enter this block.
3483 	 */
3484 
3485 	if (result == KERN_SUCCESS) {
3486 		/*	Wire down the new entry if the user
3487 		 *	requested all new map entries be wired.
3488 		 */
3489 		if ((map->wiring_required) || (superpage_size)) {
3490 			assert(!keep_map_locked);
3491 			pmap_empty = FALSE; /* pmap won't be empty */
3492 			kr = vm_map_wire_kernel(map, start, end,
3493 			    cur_protection, VM_KERN_MEMORY_MLOCK,
3494 			    TRUE);
3495 			result = kr;
3496 		}
3497 
3498 	}
3499 
3500 	if (result != KERN_SUCCESS) {
3501 		if (new_mapping_established) {
3502 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3503 
3504 			/*
3505 			 * We have to get rid of the new mappings since we
3506 			 * won't make them available to the user.
3507 			 * Try to do that atomically, to minimize the risk
3508 			 * that someone else creates new mappings in that range.
3509 			 */
3510 			if (!map_locked) {
3511 				vm_map_lock(map);
3512 				map_locked = TRUE;
3513 			}
3514 			remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3515 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3516 			if (permanent) {
3517 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3518 			}
3519 			(void) vm_map_delete(map,
3520 			    *address, *address + size,
3521 			    remove_flags,
3522 			    KMEM_GUARD_NONE, &zap_new_list);
3523 		}
3524 
3525 		if (vm_map_zap_first_entry(&zap_old_list)) {
3526 			vm_map_entry_t entry1, entry2;
3527 
3528 			/*
3529 			 * The new mapping failed.  Attempt to restore
3530 			 * the old mappings, saved in the "zap_old_map".
3531 			 */
3532 			if (!map_locked) {
3533 				vm_map_lock(map);
3534 				map_locked = TRUE;
3535 			}
3536 
3537 			/* first check if the coast is still clear */
3538 			start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3539 			end   = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3540 
3541 			if (vm_map_lookup_entry(map, start, &entry1) ||
3542 			    vm_map_lookup_entry(map, end, &entry2) ||
3543 			    entry1 != entry2) {
3544 				/*
3545 				 * Part of that range has already been
3546 				 * re-mapped:  we can't restore the old
3547 				 * mappings...
3548 				 */
3549 				vm_map_enter_restore_failures++;
3550 			} else {
3551 				/*
3552 				 * Transfer the saved map entries from
3553 				 * "zap_old_map" to the original "map",
3554 				 * inserting them all after "entry1".
3555 				 */
3556 				while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3557 					vm_map_size_t entry_size;
3558 
3559 					entry_size = (entry2->vme_end -
3560 					    entry2->vme_start);
3561 					vm_map_store_entry_link(map, entry1, entry2,
3562 					    VM_MAP_KERNEL_FLAGS_NONE);
3563 					map->size += entry_size;
3564 					entry1 = entry2;
3565 				}
3566 				if (map->wiring_required) {
3567 					/*
3568 					 * XXX TODO: we should rewire the
3569 					 * old pages here...
3570 					 */
3571 				}
3572 				vm_map_enter_restore_successes++;
3573 			}
3574 		}
3575 	}
3576 
3577 	/*
3578 	 * The caller is responsible for releasing the lock if it requested to
3579 	 * keep the map locked.
3580 	 */
3581 	if (map_locked && !keep_map_locked) {
3582 		vm_map_unlock(map);
3583 	}
3584 
3585 	vm_map_zap_dispose(&zap_old_list);
3586 	vm_map_zap_dispose(&zap_new_list);
3587 
3588 	if (new_mapping_established) {
3589 		/*
3590 		 * The caller had a reference on "caller_object" and we
3591 		 * transferred that reference to the mapping.
3592 		 * We also took an extra reference on "caller_object" to keep
3593 		 * it alive while the map was unlocked.
3594 		 */
3595 		if (result == KERN_SUCCESS) {
3596 			/*
3597 			 * On success, the caller's reference on the object gets
3598 			 * transferred to the mapping.
3599 			 * Release our extra reference.
3600 			 */
3601 			if (is_submap) {
3602 				vm_map_deallocate((vm_map_t)caller_object);
3603 			} else {
3604 				vm_object_deallocate(caller_object);
3605 			}
3606 		} else {
3607 			/*
3608 			 * On error, the caller expects to still have a
3609 			 * reference on the object it gave us.
3610 			 * Let's use our extra reference for that.
3611 			 */
3612 		}
3613 	}
3614 
3615 	return result;
3616 
3617 #undef  RETURN
3618 }
3619 
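/*
 * Hedged usage sketch (not part of this file's logic): a kernel caller
 * mapping "size" bytes of anonymous memory anywhere in "map" might do:
 *
 *	vm_map_offset_t addr = 0;
 *	kr = vm_map_enter(map, &addr, size, 0,
 *	    VM_MAP_KERNEL_FLAGS_ANYWHERE(),
 *	    VM_OBJECT_NULL, 0, FALSE,
 *	    VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
 *
 * VM_MAP_KERNEL_FLAGS_ANYWHERE() is assumed here; the exact flags
 * initializer varies across xnu versions.
 */
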
3620 #if __arm64__
3621 extern const struct memory_object_pager_ops fourk_pager_ops;
3622 kern_return_t
3623 vm_map_enter_fourk(
3624 	vm_map_t                map,
3625 	vm_map_offset_t         *address,       /* IN/OUT */
3626 	vm_map_size_t           size,
3627 	vm_map_offset_t         mask,
3628 	vm_map_kernel_flags_t   vmk_flags,
3629 	vm_object_t             object,
3630 	vm_object_offset_t      offset,
3631 	boolean_t               needs_copy,
3632 	vm_prot_t               cur_protection,
3633 	vm_prot_t               max_protection,
3634 	vm_inherit_t            inheritance)
3635 {
3636 	vm_map_entry_t          entry, new_entry;
3637 	vm_map_offset_t         start, fourk_start;
3638 	vm_map_offset_t         end, fourk_end;
3639 	vm_map_size_t           fourk_size;
3640 	kern_return_t           result = KERN_SUCCESS;
3641 	boolean_t               map_locked = FALSE;
3642 	boolean_t               pmap_empty = TRUE;
3643 	boolean_t               new_mapping_established = FALSE;
3644 	const bool              keep_map_locked = vmk_flags.vmkf_keep_map_locked;
3645 	const bool              anywhere = !vmk_flags.vmf_fixed;
3646 	const bool              purgable = vmk_flags.vmf_purgeable;
3647 	const bool              overwrite = vmk_flags.vmf_overwrite;
3648 	const bool              is_submap = vmk_flags.vmkf_submap;
3649 	const bool              entry_for_jit = vmk_flags.vmkf_map_jit;
3650 	const unsigned int      superpage_size = vmk_flags.vmf_superpage_size;
3651 	vm_map_offset_t         effective_min_offset, effective_max_offset;
3652 	kern_return_t           kr;
3653 	boolean_t               clear_map_aligned = FALSE;
3654 	memory_object_t         fourk_mem_obj;
3655 	vm_object_t             fourk_object;
3656 	vm_map_offset_t         fourk_pager_offset;
3657 	int                     fourk_pager_index_start, fourk_pager_index_num;
3658 	int                     cur_idx;
3659 	boolean_t               fourk_copy;
3660 	vm_object_t             copy_object;
3661 	vm_object_offset_t      copy_offset;
3662 	VM_MAP_ZAP_DECLARE(zap_list);
3663 
3664 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
3665 		panic("%s:%d", __FUNCTION__, __LINE__);
3666 	}
3667 	fourk_mem_obj = MEMORY_OBJECT_NULL;
3668 	fourk_object = VM_OBJECT_NULL;
3669 
3670 	if (superpage_size) {
3671 		return KERN_NOT_SUPPORTED;
3672 	}
3673 
3674 	if ((cur_protection & VM_PROT_WRITE) &&
3675 	    (cur_protection & VM_PROT_EXECUTE) &&
3676 #if XNU_TARGET_OS_OSX
3677 	    map->pmap != kernel_pmap &&
3678 	    (vm_map_cs_enforcement(map)
3679 #if __arm64__
3680 	    || !VM_MAP_IS_EXOTIC(map)
3681 #endif /* __arm64__ */
3682 	    ) &&
3683 #endif /* XNU_TARGET_OS_OSX */
3684 #if CODE_SIGNING_MONITOR
3685 	    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
3686 #endif
3687 	    !entry_for_jit) {
3688 		DTRACE_VM3(cs_wx,
3689 		    uint64_t, 0,
3690 		    uint64_t, 0,
3691 		    vm_prot_t, cur_protection);
3692 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. "
3693 		    "turning off execute\n",
3694 		    proc_selfpid(),
3695 		    (get_bsdtask_info(current_task())
3696 		    ? proc_name_address(get_bsdtask_info(current_task()))
3697 		    : "?"),
3698 		    __FUNCTION__);
3699 		cur_protection &= ~VM_PROT_EXECUTE;
3700 	}
3701 
3702 	/*
3703 	 * If the task has requested executable lockdown,
3704 	 * deny any new executable mapping.
3705 	 */
3706 	if (map->map_disallow_new_exec == TRUE) {
3707 		if (cur_protection & VM_PROT_EXECUTE) {
3708 			return KERN_PROTECTION_FAILURE;
3709 		}
3710 	}
3711 
3712 	if (is_submap) {
3713 		return KERN_NOT_SUPPORTED;
3714 	}
3715 	if (vmk_flags.vmkf_already) {
3716 		return KERN_NOT_SUPPORTED;
3717 	}
3718 	if (purgable || entry_for_jit) {
3719 		return KERN_NOT_SUPPORTED;
3720 	}
3721 
3722 	effective_min_offset = map->min_offset;
3723 
3724 	if (vmk_flags.vmkf_beyond_max) {
3725 		return KERN_NOT_SUPPORTED;
3726 	} else {
3727 		effective_max_offset = map->max_offset;
3728 	}
3729 
3730 	if (size == 0 ||
3731 	    (offset & FOURK_PAGE_MASK) != 0) {
3732 		*address = 0;
3733 		return KERN_INVALID_ARGUMENT;
3734 	}
3735 
3736 #define RETURN(value)   { result = value; goto BailOut; }
3737 
3738 	assert(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK));
3739 	assert(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK));
3740 
3741 	if (!anywhere && overwrite) {
3742 		return KERN_NOT_SUPPORTED;
3743 	}
3744 
3745 	fourk_start = *address;
3746 	fourk_size = size;
3747 	fourk_end = fourk_start + fourk_size;
3748 
3749 	start = vm_map_trunc_page(*address, VM_MAP_PAGE_MASK(map));
3750 	end = vm_map_round_page(fourk_end, VM_MAP_PAGE_MASK(map));
3751 	size = end - start;
3752 
3753 	if (anywhere) {
3754 		return KERN_NOT_SUPPORTED;
3755 	} else {
3756 		/*
3757 		 *	Verify that:
3758 		 *		the address doesn't itself violate
3759 		 *		the mask requirement.
3760 		 */
3761 
3762 		vm_map_lock(map);
3763 		map_locked = TRUE;
3764 		if ((start & mask) != 0) {
3765 			RETURN(KERN_NO_SPACE);
3766 		}
3767 
3768 		/*
3769 		 *	...	the address is within bounds
3770 		 */
3771 
3772 		end = start + size;
3773 
3774 		if ((start < effective_min_offset) ||
3775 		    (end > effective_max_offset) ||
3776 		    (start >= end)) {
3777 			RETURN(KERN_INVALID_ADDRESS);
3778 		}
3779 
3780 		/*
3781 		 *	...	the starting address isn't allocated
3782 		 */
3783 		if (vm_map_lookup_entry(map, start, &entry)) {
3784 			vm_object_t cur_object, shadow_object;
3785 
3786 			/*
3787 			 * We might already have some 4K mappings
3788 			 * in a 16K page here.
3789 			 */
3790 
3791 			if (entry->vme_end - entry->vme_start
3792 			    != SIXTEENK_PAGE_SIZE) {
3793 				RETURN(KERN_NO_SPACE);
3794 			}
3795 			if (entry->is_sub_map) {
3796 				RETURN(KERN_NO_SPACE);
3797 			}
3798 			if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
3799 				RETURN(KERN_NO_SPACE);
3800 			}
3801 
3802 			/* go all the way down the shadow chain */
3803 			cur_object = VME_OBJECT(entry);
3804 			vm_object_lock(cur_object);
3805 			while (cur_object->shadow != VM_OBJECT_NULL) {
3806 				shadow_object = cur_object->shadow;
3807 				vm_object_lock(shadow_object);
3808 				vm_object_unlock(cur_object);
3809 				cur_object = shadow_object;
3810 				shadow_object = VM_OBJECT_NULL;
3811 			}
3812 			if (cur_object->internal ||
3813 			    cur_object->pager == NULL) {
3814 				vm_object_unlock(cur_object);
3815 				RETURN(KERN_NO_SPACE);
3816 			}
3817 			if (cur_object->pager->mo_pager_ops
3818 			    != &fourk_pager_ops) {
3819 				vm_object_unlock(cur_object);
3820 				RETURN(KERN_NO_SPACE);
3821 			}
3822 			fourk_object = cur_object;
3823 			fourk_mem_obj = fourk_object->pager;
3824 
3825 			/* keep the "4K" object alive */
3826 			vm_object_reference_locked(fourk_object);
3827 			memory_object_reference(fourk_mem_obj);
3828 			vm_object_unlock(fourk_object);
3829 
3830 			/* merge permissions */
3831 			entry->protection |= cur_protection;
3832 			entry->max_protection |= max_protection;
3833 
3834 			if ((entry->protection & VM_PROT_WRITE) &&
3835 			    (entry->protection & VM_PROT_ALLEXEC) &&
3836 			    fourk_binary_compatibility_unsafe &&
3837 			    fourk_binary_compatibility_allow_wx) {
3838 				/* write+execute: need to be "jit" */
3839 				entry->used_for_jit = TRUE;
3840 			}
3841 			goto map_in_fourk_pager;
3842 		}
3843 
3844 		/*
3845 		 *	...	the next region doesn't overlap the
3846 		 *		end point.
3847 		 */
3848 
3849 		if ((entry->vme_next != vm_map_to_entry(map)) &&
3850 		    (entry->vme_next->vme_start < end)) {
3851 			RETURN(KERN_NO_SPACE);
3852 		}
3853 	}
3854 
3855 	/*
3856 	 *	At this point,
3857 	 *		"start" and "end" should define the endpoints of the
3858 	 *			available new range, and
3859 	 *		"entry" should refer to the region before the new
3860 	 *			range, and
3861 	 *
3862 	 *		the map should be locked.
3863 	 */
3864 
3865 	/* create a new "4K" pager */
3866 	fourk_mem_obj = fourk_pager_create();
3867 	fourk_object = fourk_pager_to_vm_object(fourk_mem_obj);
3868 	assert(fourk_object);
3869 
3870 	/* keep the "4K" object alive */
3871 	vm_object_reference(fourk_object);
3872 
3873 	/* create a "copy" object, to map the "4K" object copy-on-write */
3874 	fourk_copy = TRUE;
3875 	result = vm_object_copy_strategically(fourk_object,
3876 	    0,
3877 	    end - start,
3878 	    false,                                   /* forking */
3879 	    &copy_object,
3880 	    &copy_offset,
3881 	    &fourk_copy);
3882 	assert(result == KERN_SUCCESS);
3883 	assert(copy_object != VM_OBJECT_NULL);
3884 	assert(copy_offset == 0);
3885 
3886 	/* map the "4K" pager's copy object */
3887 	new_entry = vm_map_entry_insert(map,
3888 	    entry,
3889 	    vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)),
3890 	    vm_map_round_page(end, VM_MAP_PAGE_MASK(map)),
3891 	    copy_object,
3892 	    0,                      /* offset */
3893 	    vmk_flags,
3894 	    FALSE,                  /* needs_copy */
3895 	    cur_protection, max_protection,
3896 	    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3897 	    VM_INHERIT_NONE : inheritance),
3898 	    clear_map_aligned);
3899 	entry = new_entry;
3900 
3901 #if VM_MAP_DEBUG_FOURK
3902 	if (vm_map_debug_fourk) {
3903 		printf("FOURK_PAGER: map %p [0x%llx:0x%llx] new pager %p\n",
3904 		    map,
3905 		    (uint64_t) entry->vme_start,
3906 		    (uint64_t) entry->vme_end,
3907 		    fourk_mem_obj);
3908 	}
3909 #endif /* VM_MAP_DEBUG_FOURK */
3910 
3911 	new_mapping_established = TRUE;
3912 
3913 map_in_fourk_pager:
3914 	/* "map" the original "object" where it belongs in the "4K" pager */
3915 	fourk_pager_offset = (fourk_start & SIXTEENK_PAGE_MASK);
3916 	fourk_pager_index_start = (int) (fourk_pager_offset / FOURK_PAGE_SIZE);
3917 	if (fourk_size > SIXTEENK_PAGE_SIZE) {
3918 		fourk_pager_index_num = 4;
3919 	} else {
3920 		fourk_pager_index_num = (int) (fourk_size / FOURK_PAGE_SIZE);
3921 	}
3922 	if (fourk_pager_index_start + fourk_pager_index_num > 4) {
3923 		fourk_pager_index_num = 4 - fourk_pager_index_start;
3924 	}
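	/*
	 * Worked example (illustrative): with fourk_start = 0xB000,
	 * fourk_pager_offset = 0xB000 & SIXTEENK_PAGE_MASK = 0x3000, so
	 * fourk_pager_index_start = 3; a 4K-sized request then gives
	 * fourk_pager_index_num = 1, and the clipping above keeps
	 * start + num <= 4 (four 4K sub-pages per 16K page).
	 */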
3925 	for (cur_idx = 0;
3926 	    cur_idx < fourk_pager_index_num;
3927 	    cur_idx++) {
3928 		vm_object_t             old_object;
3929 		vm_object_offset_t      old_offset;
3930 
3931 		kr = fourk_pager_populate(fourk_mem_obj,
3932 		    TRUE,                       /* overwrite */
3933 		    fourk_pager_index_start + cur_idx,
3934 		    object,
3935 		    (object
3936 		    ? (offset +
3937 		    (cur_idx * FOURK_PAGE_SIZE))
3938 		    : 0),
3939 		    &old_object,
3940 		    &old_offset);
3941 #if VM_MAP_DEBUG_FOURK
3942 		if (vm_map_debug_fourk) {
3943 			if (old_object == (vm_object_t) -1 &&
3944 			    old_offset == (vm_object_offset_t) -1) {
3945 				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3946 				    "pager [%p:0x%llx] "
3947 				    "populate[%d] "
3948 				    "[object:%p,offset:0x%llx]\n",
3949 				    map,
3950 				    (uint64_t) entry->vme_start,
3951 				    (uint64_t) entry->vme_end,
3952 				    fourk_mem_obj,
3953 				    VME_OFFSET(entry),
3954 				    fourk_pager_index_start + cur_idx,
3955 				    object,
3956 				    (object
3957 				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3958 				    : 0));
3959 			} else {
3960 				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3961 				    "pager [%p:0x%llx] "
3962 				    "populate[%d] [object:%p,offset:0x%llx] "
3963 				    "old [%p:0x%llx]\n",
3964 				    map,
3965 				    (uint64_t) entry->vme_start,
3966 				    (uint64_t) entry->vme_end,
3967 				    fourk_mem_obj,
3968 				    VME_OFFSET(entry),
3969 				    fourk_pager_index_start + cur_idx,
3970 				    object,
3971 				    (object
3972 				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3973 				    : 0),
3974 				    old_object,
3975 				    old_offset);
3976 			}
3977 		}
3978 #endif /* VM_MAP_DEBUG_FOURK */
3979 
3980 		assert(kr == KERN_SUCCESS);
3981 		if (object != old_object &&
3982 		    object != VM_OBJECT_NULL &&
3983 		    object != (vm_object_t) -1) {
3984 			vm_object_reference(object);
3985 		}
3986 		if (object != old_object &&
3987 		    old_object != VM_OBJECT_NULL &&
3988 		    old_object != (vm_object_t) -1) {
3989 			vm_object_deallocate(old_object);
3990 		}
3991 	}
3992 
3993 BailOut:
3994 	assert(map_locked == TRUE);
3995 
3996 	if (result == KERN_SUCCESS) {
3997 		vm_prot_t pager_prot;
3998 		memory_object_t pager;
3999 
4000 #if DEBUG
4001 		if (pmap_empty &&
4002 		    !(vmk_flags.vmkf_no_pmap_check)) {
4003 			assert(pmap_is_empty(map->pmap,
4004 			    *address,
4005 			    *address + size));
4006 		}
4007 #endif /* DEBUG */
4008 
4009 		/*
4010 		 * For "named" VM objects, let the pager know that the
4011 		 * memory object is being mapped.  Some pagers need to keep
4012 		 * track of this, to know when they can reclaim the memory
4013 		 * object, for example.
4014 		 * VM calls memory_object_map() for each mapping (specifying
4015 		 * the protection of each mapping) and calls
4016 		 * memory_object_last_unmap() when all the mappings are gone.
4017 		 */
4018 		pager_prot = max_protection;
4019 		if (needs_copy) {
4020 			/*
4021 			 * Copy-On-Write mapping: won't modify
4022 			 * the memory object.
4023 			 */
4024 			pager_prot &= ~VM_PROT_WRITE;
4025 		}
4026 		if (!is_submap &&
4027 		    object != VM_OBJECT_NULL &&
4028 		    object->named &&
4029 		    object->pager != MEMORY_OBJECT_NULL) {
4030 			vm_object_lock(object);
4031 			pager = object->pager;
4032 			if (object->named &&
4033 			    pager != MEMORY_OBJECT_NULL) {
4034 				assert(object->pager_ready);
4035 				vm_object_mapping_wait(object, THREAD_UNINT);
4036 				vm_object_mapping_begin(object);
4037 				vm_object_unlock(object);
4038 
4039 				kr = memory_object_map(pager, pager_prot);
4040 				assert(kr == KERN_SUCCESS);
4041 
4042 				vm_object_lock(object);
4043 				vm_object_mapping_end(object);
4044 			}
4045 			vm_object_unlock(object);
4046 		}
4047 		if (!is_submap &&
4048 		    fourk_object != VM_OBJECT_NULL &&
4049 		    fourk_object->named &&
4050 		    fourk_object->pager != MEMORY_OBJECT_NULL) {
4051 			vm_object_lock(fourk_object);
4052 			pager = fourk_object->pager;
4053 			if (fourk_object->named &&
4054 			    pager != MEMORY_OBJECT_NULL) {
4055 				assert(fourk_object->pager_ready);
4056 				vm_object_mapping_wait(fourk_object,
4057 				    THREAD_UNINT);
4058 				vm_object_mapping_begin(fourk_object);
4059 				vm_object_unlock(fourk_object);
4060 
4061 				kr = memory_object_map(pager, VM_PROT_READ);
4062 				assert(kr == KERN_SUCCESS);
4063 
4064 				vm_object_lock(fourk_object);
4065 				vm_object_mapping_end(fourk_object);
4066 			}
4067 			vm_object_unlock(fourk_object);
4068 		}
4069 	}
4070 
4071 	if (fourk_object != VM_OBJECT_NULL) {
4072 		vm_object_deallocate(fourk_object);
4073 		fourk_object = VM_OBJECT_NULL;
4074 		memory_object_deallocate(fourk_mem_obj);
4075 		fourk_mem_obj = MEMORY_OBJECT_NULL;
4076 	}
4077 
4078 	assert(map_locked == TRUE);
4079 
4080 	if (!keep_map_locked) {
4081 		vm_map_unlock(map);
4082 		map_locked = FALSE;
4083 	}
4084 
4085 	/*
4086 	 * We can't hold the map lock if we enter this block.
4087 	 */
4088 
4089 	if (result == KERN_SUCCESS) {
4090 		/*	Wire down the new entry if the user
4091 		 *	requested all new map entries be wired.
4092 		 */
4093 		if ((map->wiring_required) || (superpage_size)) {
4094 			assert(!keep_map_locked);
4095 			pmap_empty = FALSE; /* pmap won't be empty */
4096 			kr = vm_map_wire_kernel(map, start, end,
4097 			    new_entry->protection, VM_KERN_MEMORY_MLOCK,
4098 			    TRUE);
4099 			result = kr;
4100 		}
4101 
4102 	}
4103 
4104 	if (result != KERN_SUCCESS) {
4105 		if (new_mapping_established) {
4106 			/*
4107 			 * We have to get rid of the new mappings since we
4108 			 * won't make them available to the user.
4109 			 * Try to do that atomically, to minimize the risk
4110 			 * that someone else creates new mappings in that range.
4111 			 */
4112 
4113 			if (!map_locked) {
4114 				vm_map_lock(map);
4115 				map_locked = TRUE;
4116 			}
4117 			(void)vm_map_delete(map, *address, *address + size,
4118 			    VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_NO_YIELD,
4119 			    KMEM_GUARD_NONE, &zap_list);
4120 		}
4121 	}
4122 
4123 	/*
4124 	 * The caller is responsible for releasing the lock if it requested to
4125 	 * keep the map locked.
4126 	 */
4127 	if (map_locked && !keep_map_locked) {
4128 		vm_map_unlock(map);
4129 	}
4130 
4131 	vm_map_zap_dispose(&zap_list);
4132 
4133 	return result;
4134 
4135 #undef  RETURN
4136 }
4137 #endif /* __arm64__ */
4138 
4139 /*
4140  * Counters for the prefault optimization.
4141  */
4142 int64_t vm_prefault_nb_pages = 0;
4143 int64_t vm_prefault_nb_bailout = 0;
4144 
4145 static kern_return_t
4146 vm_map_enter_mem_object_helper(
4147 	vm_map_t                target_map,
4148 	vm_map_offset_t         *address,
4149 	vm_map_size_t           initial_size,
4150 	vm_map_offset_t         mask,
4151 	vm_map_kernel_flags_t   vmk_flags,
4152 	ipc_port_t              port,
4153 	vm_object_offset_t      offset,
4154 	boolean_t               copy,
4155 	vm_prot_t               cur_protection,
4156 	vm_prot_t               max_protection,
4157 	vm_inherit_t            inheritance,
4158 	upl_page_list_ptr_t     page_list,
4159 	unsigned int            page_list_count)
4160 {
4161 	vm_map_address_t        map_addr;
4162 	vm_map_size_t           map_size;
4163 	vm_object_t             object;
4164 	vm_object_size_t        size;
4165 	kern_return_t           result;
4166 	boolean_t               mask_cur_protection, mask_max_protection;
4167 	boolean_t               kernel_prefault, try_prefault = (page_list_count != 0);
4168 	vm_map_offset_t         offset_in_mapping = 0;
4169 #if __arm64__
4170 	boolean_t               fourk = vmk_flags.vmkf_fourk;
4171 #endif /* __arm64__ */
4172 
4173 	if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4174 		/* XXX TODO4K prefaulting depends on page size... */
4175 		try_prefault = FALSE;
4176 	}
4177 
4178 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
4179 	vm_map_kernel_flags_update_range_id(&vmk_flags, target_map);
4180 
4181 	mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4182 	mask_max_protection = max_protection & VM_PROT_IS_MASK;
4183 	cur_protection &= ~VM_PROT_IS_MASK;
4184 	max_protection &= ~VM_PROT_IS_MASK;
4185 
4186 	/*
4187 	 * Check arguments for validity
4188 	 */
4189 	if ((target_map == VM_MAP_NULL) ||
4190 	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4191 	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4192 	    (inheritance > VM_INHERIT_LAST_VALID) ||
4193 	    (try_prefault && (copy || !page_list)) ||
4194 	    initial_size == 0) {
4195 		return KERN_INVALID_ARGUMENT;
4196 	}
4197 
4198 	if (__improbable((cur_protection & max_protection) != cur_protection)) {
4199 		/* cur is more permissive than max */
4200 		cur_protection &= max_protection;
4201 	}
4202 
4203 #if __arm64__
4204 	if (cur_protection & VM_PROT_EXECUTE) {
4205 		cur_protection |= VM_PROT_READ;
4206 	}
4207 
4208 	if (fourk && VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4209 		/* no "fourk" if map is using a sub-page page size */
4210 		fourk = FALSE;
4211 	}
4212 	if (fourk) {
4213 		map_addr = vm_map_trunc_page(*address, FOURK_PAGE_MASK);
4214 		map_size = vm_map_round_page(initial_size, FOURK_PAGE_MASK);
4215 	} else
4216 #endif /* __arm64__ */
4217 	{
4218 		map_addr = vm_map_trunc_page(*address,
4219 		    VM_MAP_PAGE_MASK(target_map));
4220 		map_size = vm_map_round_page(initial_size,
4221 		    VM_MAP_PAGE_MASK(target_map));
4222 	}
4223 	if (map_size == 0) {
4224 		return KERN_INVALID_ARGUMENT;
4225 	}
4226 	size = vm_object_round_page(initial_size);
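	/*
	 * Example (illustrative, assuming a 16K-page target map):
	 * *address = 0x5000 and initial_size = 0x2000 give
	 * map_addr = 0x4000 and map_size = 0x4000 (truncated/rounded to
	 * the 16K boundary), while "size" is rounded to the VM object
	 * page size, which need not match the map's page size.
	 */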
4227 
4228 	/*
4229 	 * Find the vm object (if any) corresponding to this port.
4230 	 */
4231 	if (!IP_VALID(port)) {
4232 		object = VM_OBJECT_NULL;
4233 		offset = 0;
4234 		copy = FALSE;
4235 	} else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4236 		vm_named_entry_t        named_entry;
4237 		vm_object_offset_t      data_offset;
4238 
4239 		named_entry = mach_memory_entry_from_port(port);
4240 
4241 		if (vmk_flags.vmf_return_data_addr ||
4242 		    vmk_flags.vmf_return_4k_data_addr) {
4243 			data_offset = named_entry->data_offset;
4244 			offset += named_entry->data_offset;
4245 		} else {
4246 			data_offset = 0;
4247 		}
4248 
4249 		/* a few checks to make sure the user is obeying the rules */
4250 		if (mask_max_protection) {
4251 			max_protection &= named_entry->protection;
4252 		}
4253 		if (mask_cur_protection) {
4254 			cur_protection &= named_entry->protection;
4255 		}
4256 		if ((named_entry->protection & max_protection) !=
4257 		    max_protection) {
4258 			return KERN_INVALID_RIGHT;
4259 		}
4260 		if ((named_entry->protection & cur_protection) !=
4261 		    cur_protection) {
4262 			return KERN_INVALID_RIGHT;
4263 		}
4264 		if (offset + size <= offset) {
4265 			/* overflow */
4266 			return KERN_INVALID_ARGUMENT;
4267 		}
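		/*
		 * The check above catches unsigned wraparound: e.g.
		 * offset = 0xFFFFFFFFFFFFF000 with size = 0x2000 wraps
		 * around to 0x1000, which is <= offset.
		 */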
4268 		if (named_entry->size < (offset + initial_size)) {
4269 			return KERN_INVALID_ARGUMENT;
4270 		}
4271 
4272 		if (named_entry->is_copy) {
4273 			/* for a vm_map_copy, we can only map it whole */
4274 			if ((size != named_entry->size) &&
4275 			    (vm_map_round_page(size,
4276 			    VM_MAP_PAGE_MASK(target_map)) ==
4277 			    named_entry->size)) {
4278 				/* XXX FBDP use the rounded size... */
4279 				size = vm_map_round_page(
4280 					size,
4281 					VM_MAP_PAGE_MASK(target_map));
4282 			}
4283 		}
4284 
4285 		/* the caller's parameter "offset" is relative to the start */
4286 		/* of the named entry; convert it to an offset in the object */
4287 		offset = offset + named_entry->offset;
4288 
4289 		if (!VM_MAP_PAGE_ALIGNED(size,
4290 		    VM_MAP_PAGE_MASK(target_map))) {
4291 			/*
4292 			 * Let's not map more than requested;
4293 			 * vm_map_enter() will handle this "not map-aligned"
4294 			 * case.
4295 			 */
4296 			map_size = size;
4297 		}
4298 
4299 		named_entry_lock(named_entry);
4300 		if (named_entry->is_sub_map) {
4301 			vm_map_t                submap;
4302 
4303 			if (vmk_flags.vmf_return_data_addr ||
4304 			    vmk_flags.vmf_return_4k_data_addr) {
4305 				panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4306 			}
4307 
4308 			submap = named_entry->backing.map;
4309 			vm_map_reference(submap);
4310 			named_entry_unlock(named_entry);
4311 
4312 			vmk_flags.vmkf_submap = TRUE;
4313 
4314 			result = vm_map_enter(target_map,
4315 			    &map_addr,
4316 			    map_size,
4317 			    mask,
4318 			    vmk_flags,
4319 			    (vm_object_t)(uintptr_t) submap,
4320 			    offset,
4321 			    copy,
4322 			    cur_protection,
4323 			    max_protection,
4324 			    inheritance);
4325 			if (result != KERN_SUCCESS) {
4326 				vm_map_deallocate(submap);
4327 			} else {
4328 				/*
4329 				 * No need to lock "submap" just to check its
4330 				 * "mapped" flag: that flag is never reset
4331 				 * once it's been set and if we race, we'll
4332 				 * just end up setting it twice, which is OK.
4333 				 */
4334 				if (submap->mapped_in_other_pmaps == FALSE &&
4335 				    vm_map_pmap(submap) != PMAP_NULL &&
4336 				    vm_map_pmap(submap) !=
4337 				    vm_map_pmap(target_map)) {
4338 					/*
4339 					 * This submap is being mapped in a map
4340 					 * that uses a different pmap.
4341 					 * Set its "mapped_in_other_pmaps" flag
4342 					 * to indicate that we now need to
4343 					 * remove mappings from all pmaps rather
4344 					 * than just the submap's pmap.
4345 					 */
4346 					vm_map_lock(submap);
4347 					submap->mapped_in_other_pmaps = TRUE;
4348 					vm_map_unlock(submap);
4349 				}
4350 				*address = map_addr;
4351 			}
4352 			return result;
4353 		} else if (named_entry->is_copy) {
4354 			kern_return_t   kr;
4355 			vm_map_copy_t   copy_map;
4356 			vm_map_entry_t  copy_entry;
4357 			vm_map_offset_t copy_addr;
4358 			vm_map_copy_t   target_copy_map;
4359 			vm_map_offset_t overmap_start, overmap_end;
4360 			vm_map_offset_t trimmed_start;
4361 			vm_map_size_t   target_size;
4362 
4363 			if (!vm_map_kernel_flags_check_vmflags(vmk_flags,
4364 			    (VM_FLAGS_FIXED |
4365 			    VM_FLAGS_ANYWHERE |
4366 			    VM_FLAGS_OVERWRITE |
4367 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4368 			    VM_FLAGS_RETURN_DATA_ADDR))) {
4369 				named_entry_unlock(named_entry);
4370 				return KERN_INVALID_ARGUMENT;
4371 			}
4372 
4373 			copy_map = named_entry->backing.copy;
4374 			assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4375 			if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4376 				/* unsupported type; should not happen */
4377 				printf("vm_map_enter_mem_object: "
4378 				    "memory_entry->backing.copy "
4379 				    "unsupported type 0x%x\n",
4380 				    copy_map->type);
4381 				named_entry_unlock(named_entry);
4382 				return KERN_INVALID_ARGUMENT;
4383 			}
4384 
4385 			if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4386 				DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, offset, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4387 			}
4388 
4389 			if (vmk_flags.vmf_return_data_addr ||
4390 			    vmk_flags.vmf_return_4k_data_addr) {
4391 				offset_in_mapping = offset & VM_MAP_PAGE_MASK(target_map);
4392 				if (vmk_flags.vmf_return_4k_data_addr) {
4393 					offset_in_mapping &= ~((signed)(0xFFF));
4394 				}
4395 			}
4396 
4397 			target_copy_map = VM_MAP_COPY_NULL;
4398 			target_size = copy_map->size;
4399 			overmap_start = 0;
4400 			overmap_end = 0;
4401 			trimmed_start = 0;
4402 			if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4403 				DEBUG4K_ADJUST("adjusting...\n");
4404 				kr = vm_map_copy_adjust_to_target(
4405 					copy_map,
4406 					offset /* includes data_offset */,
4407 					initial_size,
4408 					target_map,
4409 					copy,
4410 					&target_copy_map,
4411 					&overmap_start,
4412 					&overmap_end,
4413 					&trimmed_start);
4414 				if (kr != KERN_SUCCESS) {
4415 					named_entry_unlock(named_entry);
4416 					return kr;
4417 				}
4418 				target_size = target_copy_map->size;
4419 				if (trimmed_start >= data_offset) {
4420 					data_offset = offset & VM_MAP_PAGE_MASK(target_map);
4421 				} else {
4422 					data_offset -= trimmed_start;
4423 				}
4424 			} else {
4425 				/*
4426 				 * Assert that the vm_map_copy is coming from the right
4427 				 * zone and hasn't been forged
4428 				 */
4429 				vm_map_copy_require(copy_map);
4430 				target_copy_map = copy_map;
4431 			}
4432 
4433 			vm_map_kernel_flags_t rsv_flags = vmk_flags;
4434 
4435 			vm_map_kernel_flags_and_vmflags(&rsv_flags,
4436 			    (VM_FLAGS_FIXED |
4437 			    VM_FLAGS_ANYWHERE |
4438 			    VM_FLAGS_OVERWRITE |
4439 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4440 			    VM_FLAGS_RETURN_DATA_ADDR));
4441 
4442 			/* reserve a contiguous range */
4443 			kr = vm_map_enter(target_map,
4444 			    &map_addr,
4445 			    vm_map_round_page(target_size, VM_MAP_PAGE_MASK(target_map)),
4446 			    mask,
4447 			    rsv_flags,
4448 			    VM_OBJECT_NULL,
4449 			    0,
4450 			    FALSE,               /* copy */
4451 			    cur_protection,
4452 			    max_protection,
4453 			    inheritance);
4454 			if (kr != KERN_SUCCESS) {
4455 				DEBUG4K_ERROR("kr 0x%x\n", kr);
4456 				if (target_copy_map != copy_map) {
4457 					vm_map_copy_discard(target_copy_map);
4458 					target_copy_map = VM_MAP_COPY_NULL;
4459 				}
4460 				named_entry_unlock(named_entry);
4461 				return kr;
4462 			}
4463 
4464 			copy_addr = map_addr;
4465 
4466 			for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4467 			    copy_entry != vm_map_copy_to_entry(target_copy_map);
4468 			    copy_entry = copy_entry->vme_next) {
4469 				vm_map_t                copy_submap = VM_MAP_NULL;
4470 				vm_object_t             copy_object = VM_OBJECT_NULL;
4471 				vm_map_size_t           copy_size;
4472 				vm_object_offset_t      copy_offset;
4473 				boolean_t               do_copy = false;
4474 
4475 				if (copy_entry->is_sub_map) {
4476 					copy_submap = VME_SUBMAP(copy_entry);
4477 					copy_object = (vm_object_t)copy_submap;
4478 				} else {
4479 					copy_object = VME_OBJECT(copy_entry);
4480 				}
4481 				copy_offset = VME_OFFSET(copy_entry);
4482 				copy_size = (copy_entry->vme_end -
4483 				    copy_entry->vme_start);
4484 
4485 				/* sanity check */
4486 				if ((copy_addr + copy_size) >
4487 				    (map_addr +
4488 				    overmap_start + overmap_end +
4489 				    named_entry->size /* XXX full size */)) {
4490 					/* over-mapping too much!? */
4491 					kr = KERN_INVALID_ARGUMENT;
4492 					DEBUG4K_ERROR("kr 0x%x\n", kr);
4493 					/* abort */
4494 					break;
4495 				}
4496 
4497 				/* take a reference on the object */
4498 				if (copy_entry->is_sub_map) {
4499 					vm_map_reference(copy_submap);
4500 				} else {
4501 					if (!copy &&
4502 					    copy_object != VM_OBJECT_NULL &&
4503 					    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4504 						bool is_writable;
4505 
4506 						/*
4507 						 * We need to resolve our side of this
4508 						 * "symmetric" copy-on-write now; we
4509 						 * need a new object to map and share,
4510 						 * instead of the current one which
4511 						 * might still be shared with the
4512 						 * original mapping.
4513 						 *
4514 						 * Note: A "vm_map_copy_t" does not
4515 						 * have a lock but we're protected by
4516 						 * the named entry's lock here.
4517 						 */
4518 						// assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4519 						VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4520 						assert(copy_object != VME_OBJECT(copy_entry));
4521 						is_writable = false;
4522 						if (copy_entry->protection & VM_PROT_WRITE) {
4523 							is_writable = true;
4524 #if __arm64e__
4525 						} else if (copy_entry->used_for_tpro) {
4526 							is_writable = true;
4527 #endif /* __arm64e__ */
4528 						}
4529 						if (!copy_entry->needs_copy && is_writable) {
4530 							vm_prot_t prot;
4531 
4532 							prot = copy_entry->protection & ~VM_PROT_WRITE;
4533 							vm_object_pmap_protect(copy_object,
4534 							    copy_offset,
4535 							    copy_size,
4536 							    PMAP_NULL,
4537 							    PAGE_SIZE,
4538 							    0,
4539 							    prot);
4540 						}
4541 						copy_entry->needs_copy = FALSE;
4542 						copy_entry->is_shared = TRUE;
4543 						copy_object = VME_OBJECT(copy_entry);
4544 						copy_offset = VME_OFFSET(copy_entry);
4545 						vm_object_lock(copy_object);
4546 						/* we're about to make a shared mapping of this object */
4547 						copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4548 						VM_OBJECT_SET_TRUE_SHARE(copy_object, TRUE);
4549 						vm_object_unlock(copy_object);
4550 					}
4551 
4552 					if (copy_object != VM_OBJECT_NULL &&
4553 					    copy_object->named &&
4554 					    copy_object->pager != MEMORY_OBJECT_NULL &&
4555 					    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4556 						memory_object_t pager;
4557 						vm_prot_t       pager_prot;
4558 
4559 						/*
4560 						 * For "named" VM objects, let the pager know that the
4561 						 * memory object is being mapped.  Some pagers need to keep
4562 						 * track of this, to know when they can reclaim the memory
4563 						 * object, for example.
4564 						 * VM calls memory_object_map() for each mapping (specifying
4565 						 * the protection of each mapping) and calls
4566 						 * memory_object_last_unmap() when all the mappings are gone.
4567 						 */
4568 						pager_prot = max_protection;
4569 						if (copy) {
4570 							/*
4571 							 * Copy-On-Write mapping: won't modify the
4572 							 * memory object.
4573 							 */
4574 							pager_prot &= ~VM_PROT_WRITE;
4575 						}
4576 						vm_object_lock(copy_object);
4577 						pager = copy_object->pager;
4578 						if (copy_object->named &&
4579 						    pager != MEMORY_OBJECT_NULL &&
4580 						    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4581 							assert(copy_object->pager_ready);
4582 							vm_object_mapping_wait(copy_object, THREAD_UNINT);
4583 							vm_object_mapping_begin(copy_object);
4584 							vm_object_unlock(copy_object);
4585 
4586 							kr = memory_object_map(pager, pager_prot);
4587 							assert(kr == KERN_SUCCESS);
4588 
4589 							vm_object_lock(copy_object);
4590 							vm_object_mapping_end(copy_object);
4591 						}
4592 						vm_object_unlock(copy_object);
4593 					}
4594 
4595 					/*
4596 					 *	Perform the copy if requested
4597 					 */
4598 
4599 					if (copy && copy_object != VM_OBJECT_NULL) {
4600 						vm_object_t             new_object;
4601 						vm_object_offset_t      new_offset;
4602 
4603 						result = vm_object_copy_strategically(copy_object, copy_offset,
4604 						    copy_size,
4605 						    false,                                   /* forking */
4606 						    &new_object, &new_offset,
4607 						    &do_copy);
4608 
4609 
4610 						if (result == KERN_MEMORY_RESTART_COPY) {
4611 							boolean_t success;
4612 							boolean_t src_needs_copy;
4613 
4614 							/*
4615 							 * XXX
4616 							 * We currently ignore src_needs_copy.
4617 							 * This really is the issue of how to make
4618 							 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4619 							 * non-kernel users to use. Solution forthcoming.
4620 							 * In the meantime, since we don't allow non-kernel
4621 							 * memory managers to specify symmetric copy,
4622 							 * we won't run into problems here.
4623 							 */
4624 							new_object = copy_object;
4625 							new_offset = copy_offset;
4626 							success = vm_object_copy_quickly(new_object,
4627 							    new_offset,
4628 							    copy_size,
4629 							    &src_needs_copy,
4630 							    &do_copy);
4631 							assert(success);
4632 							result = KERN_SUCCESS;
4633 						}
4634 						if (result != KERN_SUCCESS) {
4635 							kr = result;
4636 							break;
4637 						}
4638 
4639 						copy_object = new_object;
4640 						copy_offset = new_offset;
4641 						/*
4642 						 * No extra object reference for the mapping:
4643 						 * the mapping should be the only thing keeping
4644 						 * this new object alive.
4645 						 */
4646 					} else {
4647 						/*
4648 						 * We already have the right object
4649 						 * to map.
4650 						 */
4651 						copy_object = VME_OBJECT(copy_entry);
4652 						/* take an extra ref for the mapping below */
4653 						vm_object_reference(copy_object);
4654 					}
4655 				}
4656 
4657 				/*
4658 				 * If the caller does not want a specific
4659 				 * tag for this new mapping:  use
4660 				 * the tag of the original mapping.
4661 				 */
4662 				vm_map_kernel_flags_t vmk_remap_flags = {
4663 					.vmkf_submap = copy_entry->is_sub_map,
4664 				};
4665 
4666 				vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
4667 				    vm_map_kernel_flags_vmflags(vmk_flags),
4668 				    vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4669 
4670 				/* over-map the object into destination */
4671 				vmk_remap_flags.vmf_fixed = true;
4672 				vmk_remap_flags.vmf_overwrite = true;
4673 
4674 				if (!copy && !copy_entry->is_sub_map) {
4675 					/*
4676 					 * copy-on-write should have been
4677 					 * resolved at this point, or we would
4678 					 * end up sharing instead of copying.
4679 					 */
4680 					assert(!copy_entry->needs_copy);
4681 				}
4682 #if XNU_TARGET_OS_OSX
4683 				if (copy_entry->used_for_jit) {
4684 					vmk_remap_flags.vmkf_map_jit = TRUE;
4685 				}
4686 #endif /* XNU_TARGET_OS_OSX */
4687 
4688 				kr = vm_map_enter(target_map,
4689 				    &copy_addr,
4690 				    copy_size,
4691 				    (vm_map_offset_t) 0,
4692 				    vmk_remap_flags,
4693 				    copy_object,
4694 				    copy_offset,
4695 				    ((copy_object == NULL)
4696 				    ? FALSE
4697 				    : (copy || copy_entry->needs_copy)),
4698 				    cur_protection,
4699 				    max_protection,
4700 				    inheritance);
4701 				if (kr != KERN_SUCCESS) {
4702 					DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4703 					if (copy_entry->is_sub_map) {
4704 						vm_map_deallocate(copy_submap);
4705 					} else {
4706 						vm_object_deallocate(copy_object);
4707 					}
4708 					/* abort */
4709 					break;
4710 				}
4711 
4712 				/* next mapping */
4713 				copy_addr += copy_size;
4714 			}
4715 
4716 			if (kr == KERN_SUCCESS) {
4717 				if (vmk_flags.vmf_return_data_addr ||
4718 				    vmk_flags.vmf_return_4k_data_addr) {
4719 					*address = map_addr + offset_in_mapping;
4720 				} else {
4721 					*address = map_addr;
4722 				}
4723 				if (overmap_start) {
4724 					*address += overmap_start;
4725 					DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t) offset_in_mapping, (uint64_t)overmap_start, (uint64_t)*address);
4726 				}
4727 			}
4728 			named_entry_unlock(named_entry);
4729 			if (target_copy_map != copy_map) {
4730 				vm_map_copy_discard(target_copy_map);
4731 				target_copy_map = VM_MAP_COPY_NULL;
4732 			}
4733 
4734 			if (kr != KERN_SUCCESS && !vmk_flags.vmf_overwrite) {
4735 				/* deallocate the contiguous range */
4736 				(void) vm_deallocate(target_map,
4737 				    map_addr,
4738 				    map_size);
4739 			}
4740 
4741 			return kr;
4742 		}
4743 
4744 		if (named_entry->is_object) {
4745 			unsigned int    access;
4746 			unsigned int    wimg_mode;
4747 
4748 			/* we are mapping a VM object */
4749 
4750 			access = named_entry->access;
4751 
4752 			if (vmk_flags.vmf_return_data_addr ||
4753 			    vmk_flags.vmf_return_4k_data_addr) {
4754 				offset_in_mapping = offset - VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4755 				if (vmk_flags.vmf_return_4k_data_addr) {
4756 					offset_in_mapping &= ~((signed)(0xFFF));
4757 				}
4758 				offset = VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4759 				map_size = VM_MAP_ROUND_PAGE((offset + offset_in_mapping + initial_size) - offset, VM_MAP_PAGE_MASK(target_map));
4760 			}
4761 
4762 			object = vm_named_entry_to_vm_object(named_entry);
4763 			assert(object != VM_OBJECT_NULL);
4764 			vm_object_lock(object);
4765 			named_entry_unlock(named_entry);
4766 
4767 			vm_object_reference_locked(object);
4768 
4769 			wimg_mode = object->wimg_bits;
4770 			vm_prot_to_wimg(access, &wimg_mode);
4771 			if (object->wimg_bits != wimg_mode) {
4772 				vm_object_change_wimg_mode(object, wimg_mode);
4773 			}
4774 
4775 			vm_object_unlock(object);
4776 		} else {
4777 			panic("invalid VM named entry %p", named_entry);
4778 		}
4779 	} else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4780 		/*
4781 		 * JMM - This is temporary until we unify named entries
4782 		 * and raw memory objects.
4783 		 *
4784 		 * Detected fake ip_kotype for a memory object.  In
4785 		 * this case, the port isn't really a port at all, but
4786 		 * instead is just a raw memory object.
4787 		 */
4788 		if (vmk_flags.vmf_return_data_addr ||
4789 		    vmk_flags.vmf_return_4k_data_addr) {
4790 			panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4791 		}
4792 
4793 		object = memory_object_to_vm_object((memory_object_t)port);
4794 		if (object == VM_OBJECT_NULL) {
4795 			return KERN_INVALID_OBJECT;
4796 		}
4797 		vm_object_reference(object);
4798 
4799 		/* wait for object (if any) to be ready */
4800 		if (object != VM_OBJECT_NULL) {
4801 			if (is_kernel_object(object)) {
4802 				printf("Warning: Attempt to map kernel object"
4803 				    " by a non-private kernel entity\n");
4804 				return KERN_INVALID_OBJECT;
4805 			}
4806 			if (!object->pager_ready) {
4807 				vm_object_lock(object);
4808 
4809 				while (!object->pager_ready) {
4810 					vm_object_wait(object,
4811 					    VM_OBJECT_EVENT_PAGER_READY,
4812 					    THREAD_UNINT);
4813 					vm_object_lock(object);
4814 				}
4815 				vm_object_unlock(object);
4816 			}
4817 		}
4818 	} else {
4819 		return KERN_INVALID_OBJECT;
4820 	}
4821 
4822 	if (object != VM_OBJECT_NULL &&
4823 	    object->named &&
4824 	    object->pager != MEMORY_OBJECT_NULL &&
4825 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4826 		memory_object_t pager;
4827 		vm_prot_t       pager_prot;
4828 		kern_return_t   kr;
4829 
4830 		/*
4831 		 * For "named" VM objects, let the pager know that the
4832 		 * memory object is being mapped.  Some pagers need to keep
4833 		 * track of this, to know when they can reclaim the memory
4834 		 * object, for example.
4835 		 * VM calls memory_object_map() for each mapping (specifying
4836 		 * the protection of each mapping) and calls
4837 		 * memory_object_last_unmap() when all the mappings are gone.
4838 		 */
4839 		pager_prot = max_protection;
4840 		if (copy) {
4841 			/*
4842 			 * Copy-On-Write mapping: won't modify the
4843 			 * memory object.
4844 			 */
4845 			pager_prot &= ~VM_PROT_WRITE;
4846 		}
4847 		vm_object_lock(object);
4848 		pager = object->pager;
4849 		if (object->named &&
4850 		    pager != MEMORY_OBJECT_NULL &&
4851 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4852 			assert(object->pager_ready);
4853 			vm_object_mapping_wait(object, THREAD_UNINT);
4854 			vm_object_mapping_begin(object);
4855 			vm_object_unlock(object);
4856 
4857 			kr = memory_object_map(pager, pager_prot);
4858 			assert(kr == KERN_SUCCESS);
4859 
4860 			vm_object_lock(object);
4861 			vm_object_mapping_end(object);
4862 		}
4863 		vm_object_unlock(object);
4864 	}
4865 
4866 	/*
4867 	 *	Perform the copy if requested
4868 	 */
4869 
4870 	if (copy) {
4871 		vm_object_t             new_object;
4872 		vm_object_offset_t      new_offset;
4873 
4874 		result = vm_object_copy_strategically(object, offset,
4875 		    map_size,
4876 		    false,                                   /* forking */
4877 		    &new_object, &new_offset,
4878 		    &copy);
4879 
4880 
4881 		if (result == KERN_MEMORY_RESTART_COPY) {
4882 			boolean_t success;
4883 			boolean_t src_needs_copy;
4884 
4885 			/*
4886 			 * XXX
4887 			 * We currently ignore src_needs_copy.
4888 			 * This really is the issue of how to make
4889 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4890 			 * non-kernel users to use. Solution forthcoming.
4891 			 * In the meantime, since we don't allow non-kernel
4892 			 * memory managers to specify symmetric copy,
4893 			 * we won't run into problems here.
4894 			 */
4895 			new_object = object;
4896 			new_offset = offset;
4897 			success = vm_object_copy_quickly(new_object,
4898 			    new_offset,
4899 			    map_size,
4900 			    &src_needs_copy,
4901 			    &copy);
4902 			assert(success);
4903 			result = KERN_SUCCESS;
4904 		}
4905 		/*
4906 		 *	Throw away the reference to the
4907 		 *	original object, as it won't be mapped.
4908 		 */
4909 
4910 		vm_object_deallocate(object);
4911 
4912 		if (result != KERN_SUCCESS) {
4913 			return result;
4914 		}
4915 
4916 		object = new_object;
4917 		offset = new_offset;
4918 	}
4919 
4920 	/*
4921 	 * If non-kernel users want to try to prefault pages, the mapping
4922 	 * and the prefault need to be atomic.
4923 	 */
4924 	kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4925 	vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4926 
4927 #if __arm64__
4928 	if (fourk) {
4929 		/* map this object in a "4K" pager */
4930 		result = vm_map_enter_fourk(target_map,
4931 		    &map_addr,
4932 		    map_size,
4933 		    (vm_map_offset_t) mask,
4934 		    vmk_flags,
4935 		    object,
4936 		    offset,
4937 		    copy,
4938 		    cur_protection,
4939 		    max_protection,
4940 		    inheritance);
4941 	} else
4942 #endif /* __arm64__ */
4943 	{
4944 		result = vm_map_enter(target_map,
4945 		    &map_addr, map_size,
4946 		    (vm_map_offset_t)mask,
4947 		    vmk_flags,
4948 		    object, offset,
4949 		    copy,
4950 		    cur_protection, max_protection,
4951 		    inheritance);
4952 	}
4953 	if (result != KERN_SUCCESS) {
4954 		vm_object_deallocate(object);
4955 	}
4956 
4957 	/*
4958 	 * Try to prefault, and do not forget to release the vm map lock.
4959 	 */
4960 	if (result == KERN_SUCCESS && try_prefault) {
4961 		mach_vm_address_t va = map_addr;
4962 		kern_return_t kr = KERN_SUCCESS;
4963 		unsigned int i = 0;
4964 		int pmap_options;
4965 
4966 		pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4967 		if (object->internal) {
4968 			pmap_options |= PMAP_OPTIONS_INTERNAL;
4969 		}
4970 
4971 		for (i = 0; i < page_list_count; ++i) {
4972 			if (!UPL_VALID_PAGE(page_list, i)) {
4973 				if (kernel_prefault) {
4974 					assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4975 					result = KERN_MEMORY_ERROR;
4976 					break;
4977 				}
4978 			} else {
4979 				/*
4980 				 * If this call fails, we should stop trying
4981 				 * to optimize, since further calls are likely
4982 				 * to fail too.
4983 				 *
4984 				 * We are not going to report an error for such
4985 				 * a failure, though: this is an optimization,
4986 				 * not something critical.
4987 				 */
4988 				kr = pmap_enter_options(target_map->pmap,
4989 				    va, UPL_PHYS_PAGE(page_list, i),
4990 				    cur_protection, VM_PROT_NONE,
4991 				    0, TRUE, pmap_options, NULL, PMAP_MAPPING_TYPE_INFER);
4992 				if (kr != KERN_SUCCESS) {
4993 					OSIncrementAtomic64(&vm_prefault_nb_bailout);
4994 					if (kernel_prefault) {
4995 						result = kr;
4996 					}
4997 					break;
4998 				}
4999 				OSIncrementAtomic64(&vm_prefault_nb_pages);
5000 			}
5001 
5002 			/* Next virtual address */
5003 			va += PAGE_SIZE;
5004 		}
5005 		if (vmk_flags.vmkf_keep_map_locked) {
5006 			vm_map_unlock(target_map);
5007 		}
5008 	}
5009 
5010 	if (vmk_flags.vmf_return_data_addr ||
5011 	    vmk_flags.vmf_return_4k_data_addr) {
5012 		*address = map_addr + offset_in_mapping;
5013 	} else {
5014 		*address = map_addr;
5015 	}
5016 	return result;
5017 }
5018 
5019 kern_return_t
5020 vm_map_enter_mem_object(
5021 	vm_map_t                target_map,
5022 	vm_map_offset_t         *address,
5023 	vm_map_size_t           initial_size,
5024 	vm_map_offset_t         mask,
5025 	vm_map_kernel_flags_t   vmk_flags,
5026 	ipc_port_t              port,
5027 	vm_object_offset_t      offset,
5028 	boolean_t               copy,
5029 	vm_prot_t               cur_protection,
5030 	vm_prot_t               max_protection,
5031 	vm_inherit_t            inheritance)
5032 {
5033 	kern_return_t ret;
5034 
5035 	/* range_id is set by vm_map_enter_mem_object_helper */
5036 	ret = vm_map_enter_mem_object_helper(target_map,
5037 	    address,
5038 	    initial_size,
5039 	    mask,
5040 	    vmk_flags,
5041 	    port,
5042 	    offset,
5043 	    copy,
5044 	    cur_protection,
5045 	    max_protection,
5046 	    inheritance,
5047 	    NULL,
5048 	    0);
5049 
5050 #if KASAN
5051 	if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
5052 		kasan_notify_address(*address, initial_size);
5053 	}
5054 #endif
5055 
5056 	return ret;
5057 }
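
/*
 * Illustrative sketch (hypothetical call site, not from this file): a
 * kernel-internal caller holding a send right to a memory entry port
 * could map it with the wrapper above roughly like this; the names
 * 'mem_entry_port' and 'size' are assumptions for the example.
 *
 *	vm_map_offset_t addr = 0;
 *	kern_return_t kr;
 *
 *	kr = vm_map_enter_mem_object(current_map(), &addr, size,
 *	    0,                              // mask: no alignment constraint
 *	    VM_MAP_KERNEL_FLAGS_ANYWHERE(), // let the map choose an address
 *	    mem_entry_port,
 *	    0,                              // offset into the memory entry
 *	    FALSE,                          // no copy-on-write copy
 *	    VM_PROT_DEFAULT, VM_PROT_DEFAULT,
 *	    VM_INHERIT_DEFAULT);
 */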
5058 
5059 kern_return_t
5060 vm_map_enter_mem_object_prefault(
5061 	vm_map_t                target_map,
5062 	vm_map_offset_t         *address,
5063 	vm_map_size_t           initial_size,
5064 	vm_map_offset_t         mask,
5065 	vm_map_kernel_flags_t   vmk_flags,
5066 	ipc_port_t              port,
5067 	vm_object_offset_t      offset,
5068 	vm_prot_t               cur_protection,
5069 	vm_prot_t               max_protection,
5070 	upl_page_list_ptr_t     page_list,
5071 	unsigned int            page_list_count)
5072 {
5073 	kern_return_t ret;
5074 
5075 	/* range_id is set by vm_map_enter_mem_object_helper */
5076 	ret = vm_map_enter_mem_object_helper(target_map,
5077 	    address,
5078 	    initial_size,
5079 	    mask,
5080 	    vmk_flags,
5081 	    port,
5082 	    offset,
5083 	    FALSE,
5084 	    cur_protection,
5085 	    max_protection,
5086 	    VM_INHERIT_DEFAULT,
5087 	    page_list,
5088 	    page_list_count);
5089 
5090 #if KASAN
5091 	if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
5092 		kasan_notify_address(*address, initial_size);
5093 	}
5094 #endif
5095 
5096 	return ret;
5097 }
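
/*
 * Illustrative sketch (hypothetical caller): a driver that already
 * holds a UPL for the pages being mapped can hand the UPL's page list
 * to this prefault variant so the pmap is populated up front, instead
 * of taking one soft fault per page on first touch.  'upl', 'addr' and
 * 'size' are assumptions for the example.
 *
 *	kr = vm_map_enter_mem_object_prefault(map, &addr, size,
 *	    0, vmk_flags, port, 0,
 *	    VM_PROT_DEFAULT, VM_PROT_DEFAULT,
 *	    UPL_GET_INTERNAL_PAGE_LIST(upl),
 *	    (unsigned int) atop(size));
 */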
5098 
5099 
5100 kern_return_t
5101 vm_map_enter_mem_object_control(
5102 	vm_map_t                target_map,
5103 	vm_map_offset_t         *address,
5104 	vm_map_size_t           initial_size,
5105 	vm_map_offset_t         mask,
5106 	vm_map_kernel_flags_t   vmk_flags,
5107 	memory_object_control_t control,
5108 	vm_object_offset_t      offset,
5109 	boolean_t               copy,
5110 	vm_prot_t               cur_protection,
5111 	vm_prot_t               max_protection,
5112 	vm_inherit_t            inheritance)
5113 {
5114 	vm_map_address_t        map_addr;
5115 	vm_map_size_t           map_size;
5116 	vm_object_t             object;
5117 	vm_object_size_t        size;
5118 	kern_return_t           result;
5119 	memory_object_t         pager;
5120 	vm_prot_t               pager_prot;
5121 	kern_return_t           kr;
5122 #if __arm64__
5123 	boolean_t               fourk = vmk_flags.vmkf_fourk;
5124 #endif /* __arm64__ */
5125 
5126 	/*
5127 	 * Check arguments for validity
5128 	 */
5129 	if ((target_map == VM_MAP_NULL) ||
5130 	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
5131 	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
5132 	    (inheritance > VM_INHERIT_LAST_VALID) ||
5133 	    initial_size == 0) {
5134 		return KERN_INVALID_ARGUMENT;
5135 	}
5136 
5137 	if (__improbable((cur_protection & max_protection) != cur_protection)) {
5138 		/* cur is more permissive than max */
5139 		cur_protection &= max_protection;
5140 	}
5141 
5142 #if __arm64__
5143 	if (fourk && VM_MAP_PAGE_MASK(target_map) < PAGE_MASK) {
5144 		fourk = FALSE;
5145 	}
5146 
5147 	if (fourk) {
5148 		map_addr = vm_map_trunc_page(*address,
5149 		    FOURK_PAGE_MASK);
5150 		map_size = vm_map_round_page(initial_size,
5151 		    FOURK_PAGE_MASK);
5152 	} else
5153 #endif /* __arm64__ */
5154 	{
5155 		map_addr = vm_map_trunc_page(*address,
5156 		    VM_MAP_PAGE_MASK(target_map));
5157 		map_size = vm_map_round_page(initial_size,
5158 		    VM_MAP_PAGE_MASK(target_map));
5159 	}
5160 	size = vm_object_round_page(initial_size);
5161 
5162 	object = memory_object_control_to_vm_object(control);
5163 
5164 	if (object == VM_OBJECT_NULL) {
5165 		return KERN_INVALID_OBJECT;
5166 	}
5167 
5168 	if (is_kernel_object(object)) {
5169 		printf("Warning: Attempt to map kernel object"
5170 		    " by a non-private kernel entity\n");
5171 		return KERN_INVALID_OBJECT;
5172 	}
5173 
5174 	vm_object_lock(object);
5175 	object->ref_count++;
5176 
5177 	/*
5178 	 * For "named" VM objects, let the pager know that the
5179 	 * memory object is being mapped.  Some pagers need to keep
5180 	 * track of this, to know when they can reclaim the memory
5181 	 * object, for example.
5182 	 * VM calls memory_object_map() for each mapping (specifying
5183 	 * the protection of each mapping) and calls
5184 	 * memory_object_last_unmap() when all the mappings are gone.
5185 	 */
5186 	pager_prot = max_protection;
5187 	if (copy) {
5188 		pager_prot &= ~VM_PROT_WRITE;
5189 	}
5190 	pager = object->pager;
5191 	if (object->named &&
5192 	    pager != MEMORY_OBJECT_NULL &&
5193 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
5194 		assert(object->pager_ready);
5195 		vm_object_mapping_wait(object, THREAD_UNINT);
5196 		vm_object_mapping_begin(object);
5197 		vm_object_unlock(object);
5198 
5199 		kr = memory_object_map(pager, pager_prot);
5200 		assert(kr == KERN_SUCCESS);
5201 
5202 		vm_object_lock(object);
5203 		vm_object_mapping_end(object);
5204 	}
5205 	vm_object_unlock(object);
5206 
5207 	/*
5208 	 *	Perform the copy if requested
5209 	 */
5210 
5211 	if (copy) {
5212 		vm_object_t             new_object;
5213 		vm_object_offset_t      new_offset;
5214 
5215 		result = vm_object_copy_strategically(object, offset, size,
5216 		    false,                                   /* forking */
5217 		    &new_object, &new_offset,
5218 		    &copy);
5219 
5220 
5221 		if (result == KERN_MEMORY_RESTART_COPY) {
5222 			boolean_t success;
5223 			boolean_t src_needs_copy;
5224 
5225 			/*
5226 			 * XXX
5227 			 * We currently ignore src_needs_copy.
5228 			 * This really is the issue of how to make
5229 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
5230 			 * non-kernel users to use. Solution forthcoming.
5231 			 * In the meantime, since we don't allow non-kernel
5232 			 * memory managers to specify symmetric copy,
5233 			 * we won't run into problems here.
5234 			 */
5235 			new_object = object;
5236 			new_offset = offset;
5237 			success = vm_object_copy_quickly(new_object,
5238 			    new_offset, size,
5239 			    &src_needs_copy,
5240 			    &copy);
5241 			assert(success);
5242 			result = KERN_SUCCESS;
5243 		}
5244 		/*
5245 		 *	Throw away the reference to the
5246 		 *	original object, as it won't be mapped.
5247 		 */
5248 
5249 		vm_object_deallocate(object);
5250 
5251 		if (result != KERN_SUCCESS) {
5252 			return result;
5253 		}
5254 
5255 		object = new_object;
5256 		offset = new_offset;
5257 	}
5258 
5259 #if __arm64__
5260 	if (fourk) {
5261 		result = vm_map_enter_fourk(target_map,
5262 		    &map_addr,
5263 		    map_size,
5264 		    (vm_map_offset_t)mask,
5265 		    vmk_flags,
5266 		    object, offset,
5267 		    copy,
5268 		    cur_protection, max_protection,
5269 		    inheritance);
5270 	} else
5271 #endif /* __arm64__ */
5272 	{
5273 		result = vm_map_enter(target_map,
5274 		    &map_addr, map_size,
5275 		    (vm_map_offset_t)mask,
5276 		    vmk_flags,
5277 		    object, offset,
5278 		    copy,
5279 		    cur_protection, max_protection,
5280 		    inheritance);
5281 	}
5282 	if (result != KERN_SUCCESS) {
5283 		vm_object_deallocate(object);
5284 	}
5285 	*address = map_addr;
5286 
5287 	return result;
5288 }
5289 
5290 
5291 #if     VM_CPM
5292 
5293 #ifdef MACH_ASSERT
5294 extern pmap_paddr_t     avail_start, avail_end;
5295 #endif
5296 
5297 /*
5298  *	Allocate memory in the specified map, with the caveat that
5299  *	the memory is physically contiguous.  This call may fail
5300  *	if the system can't find sufficient contiguous memory.
5301  *	This call may cause or lead to heart-stopping amounts of
5302  *	paging activity.
5303  *
5304  *	Memory obtained from this call should be freed in the
5305  *	normal way, viz., via vm_deallocate.
5306  */
5307 kern_return_t
5308 vm_map_enter_cpm(
5309 	vm_map_t                map,
5310 	vm_map_offset_t        *addr,
5311 	vm_map_size_t           size,
5312 	vm_map_kernel_flags_t   vmk_flags)
5313 {
5314 	vm_object_t             cpm_obj;
5315 	pmap_t                  pmap;
5316 	vm_page_t               m, pages;
5317 	kern_return_t           kr;
5318 	vm_map_offset_t         va, start, end, offset;
5319 #if     MACH_ASSERT
5320 	vm_map_offset_t         prev_addr = 0;
5321 #endif  /* MACH_ASSERT */
5322 	uint8_t                 object_lock_type = 0;
5323 
5324 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
5325 		/* XXX TODO4K do we need to support this? */
5326 		*addr = 0;
5327 		return KERN_NOT_SUPPORTED;
5328 	}
5329 
5330 	if (size == 0) {
5331 		*addr = 0;
5332 		return KERN_SUCCESS;
5333 	}
5334 	if (vmk_flags.vmf_fixed) {
5335 		*addr = vm_map_trunc_page(*addr,
5336 		    VM_MAP_PAGE_MASK(map));
5337 	} else {
5338 		*addr = vm_map_min(map);
5339 	}
5340 	size = vm_map_round_page(size,
5341 	    VM_MAP_PAGE_MASK(map));
5342 
5343 	/*
5344 	 * LP64todo - cpm_allocate should probably allow
5345 	 * allocations of >4GB, but not with the current
5346 	 * algorithm, so just cast down the size for now.
5347 	 */
5348 	if (size > VM_MAX_ADDRESS) {
5349 		return KERN_RESOURCE_SHORTAGE;
5350 	}
5351 	if ((kr = cpm_allocate(CAST_DOWN(vm_size_t, size),
5352 	    &pages, 0, 0, TRUE, 0 /* no 'flags' in scope in this stale VM_CPM-only path */)) != KERN_SUCCESS) {
5353 		return kr;
5354 	}
5355 
5356 	cpm_obj = vm_object_allocate((vm_object_size_t)size);
5357 	assert(cpm_obj != VM_OBJECT_NULL);
5358 	assert(cpm_obj->internal);
5359 	assert(cpm_obj->vo_size == (vm_object_size_t)size);
5360 	assert(cpm_obj->can_persist == FALSE);
5361 	assert(cpm_obj->pager_created == FALSE);
5362 	assert(cpm_obj->pageout == FALSE);
5363 	assert(cpm_obj->shadow == VM_OBJECT_NULL);
5364 
5365 	/*
5366 	 *	Insert pages into object.
5367 	 */
5368 	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5369 	vm_object_lock(cpm_obj);
5370 	for (offset = 0; offset < size; offset += PAGE_SIZE) {
5371 		m = pages;
5372 		pages = NEXT_PAGE(m);
5373 		*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
5374 
5375 		assert(!m->vmp_gobbled);
5376 		assert(!m->vmp_wanted);
5377 		assert(!m->vmp_pageout);
5378 		assert(!m->vmp_tabled);
5379 		assert(VM_PAGE_WIRED(m));
5380 		assert(m->vmp_busy);
5381 		assert(VM_PAGE_GET_PHYS_PAGE(m) >= (avail_start >> PAGE_SHIFT) && VM_PAGE_GET_PHYS_PAGE(m) <= (avail_end >> PAGE_SHIFT));
5382 
5383 		m->vmp_busy = FALSE;
5384 		vm_page_insert(m, cpm_obj, offset);
5385 	}
5386 	assert(cpm_obj->resident_page_count == size / PAGE_SIZE);
5387 	vm_object_unlock(cpm_obj);
5388 
5389 	/*
5390 	 *	Hang onto a reference on the object in case a
5391 	 *	multi-threaded application for some reason decides
5392 	 *	to deallocate the portion of the address space into
5393 	 *	which we will insert this object.
5394 	 *
5395 	 *	Unfortunately, we must insert the object now before
5396 	 *	we can talk to the pmap module about which addresses
5397 	 *	must be wired down.  Hence, the race with a multi-
5398 	 *	threaded app.
5399 	 */
5400 	vm_object_reference(cpm_obj);
5401 
5402 	/*
5403 	 *	Insert object into map.
5404 	 */
5405 
5406 	kr = vm_map_enter(
5407 		map,
5408 		addr,
5409 		size,
5410 		(vm_map_offset_t)0,
5411 		vmk_flags,
5412 		cpm_obj,
5413 		(vm_object_offset_t)0,
5414 		FALSE,
5415 		VM_PROT_ALL,
5416 		VM_PROT_ALL,
5417 		VM_INHERIT_DEFAULT);
5418 
5419 	if (kr != KERN_SUCCESS) {
5420 		/*
5421 		 *	A CPM object doesn't have can_persist set,
5422 		 *	so all we have to do is deallocate it to
5423 		 *	free up these pages.
5424 		 */
5425 		assert(cpm_obj->pager_created == FALSE);
5426 		assert(cpm_obj->can_persist == FALSE);
5427 		assert(cpm_obj->pageout == FALSE);
5428 		assert(cpm_obj->shadow == VM_OBJECT_NULL);
5429 		vm_object_deallocate(cpm_obj); /* kill acquired ref */
5430 		vm_object_deallocate(cpm_obj); /* kill creation ref */
5431 	}
5432 
5433 	/*
5434 	 *	Inform the physical mapping system that the
5435 	 *	range of addresses may not fault, so that
5436 	 *	page tables and such can be locked down as well.
5437 	 */
5438 	start = *addr;
5439 	end = start + size;
5440 	pmap = vm_map_pmap(map);
5441 	pmap_pageable(pmap, start, end, FALSE);
5442 
5443 	/*
5444 	 *	Enter each page into the pmap, to avoid faults.
5445 	 *	Note that this loop could be coded more efficiently,
5446 	 *	if the need arose, rather than looking up each page
5447 	 *	again.
5448 	 */
5449 	for (offset = 0, va = start; offset < size;
5450 	    va += PAGE_SIZE, offset += PAGE_SIZE) {
5451 		int type_of_fault;
5452 
5453 		vm_object_lock(cpm_obj);
5454 		m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5455 		assert(m != VM_PAGE_NULL);
5456 
5457 		vm_page_zero_fill(m);
5458 
5459 		type_of_fault = DBG_ZERO_FILL_FAULT;
5460 
5461 		vm_fault_enter(m, pmap, va,
5462 		    PAGE_SIZE, 0,
5463 		    VM_PROT_ALL, VM_PROT_WRITE,
5464 		    VM_PAGE_WIRED(m),
5465 		    FALSE,                             /* change_wiring */
5466 		    VM_KERN_MEMORY_NONE,                             /* tag - not wiring */
5467 		    FALSE,                             /* cs_bypass */
5468 		    0,                                 /* user_tag */
5469 		    0,                             /* pmap_options */
5470 		    NULL,                              /* need_retry */
5471 		    &type_of_fault,
5472 		    &object_lock_type);                 /* Exclusive lock mode. Will remain unchanged.*/
5473 
5474 		vm_object_unlock(cpm_obj);
5475 	}
5476 
5477 #if     MACH_ASSERT
5478 	/*
5479 	 *	Verify ordering in address space.
5480 	 */
5481 	for (offset = 0; offset < size; offset += PAGE_SIZE) {
5482 		vm_object_lock(cpm_obj);
5483 		m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5484 		vm_object_unlock(cpm_obj);
5485 		if (m == VM_PAGE_NULL) {
5486 			panic("vm_allocate_cpm:  obj %p off 0x%llx no page",
5487 			    cpm_obj, (uint64_t)offset);
5488 		}
5489 		assert(m->vmp_tabled);
5490 		assert(!m->vmp_busy);
5491 		assert(!m->vmp_wanted);
5492 		assert(!m->vmp_fictitious);
5493 		assert(!m->vmp_private);
5494 		assert(!m->vmp_absent);
5495 		assert(!m->vmp_cleaning);
5496 		assert(!m->vmp_laundry);
5497 		assert(!m->vmp_precious);
5498 		assert(!m->vmp_clustered);
5499 		if (offset != 0) {
5500 			if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
5501 				printf("start 0x%llx end 0x%llx va 0x%llx\n",
5502 				    (uint64_t)start, (uint64_t)end, (uint64_t)va);
5503 				printf("obj %p off 0x%llx\n", cpm_obj, (uint64_t)offset);
5504 				printf("m %p prev_address 0x%llx\n", m, (uint64_t)prev_addr);
5505 				panic("vm_allocate_cpm:  pages not contig!");
5506 			}
5507 		}
5508 		prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
5509 	}
5510 #endif  /* MACH_ASSERT */
5511 
5512 	vm_object_deallocate(cpm_obj); /* kill extra ref */
5513 
5514 	return kr;
5515 }
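
/*
 * Illustrative sketch (hypothetical, VM_CPM kernels only): allocate a
 * physically contiguous buffer and release it like any other mapping,
 * per the comment above ("freed in the normal way, viz., via
 * vm_deallocate").
 *
 *	vm_map_offset_t addr = 0;
 *	kr = vm_map_enter_cpm(kernel_map, &addr, size,
 *	    VM_MAP_KERNEL_FLAGS_ANYWHERE());
 *	...
 *	vm_deallocate(kernel_map, addr, size);
 */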
5516 
5517 
5518 #else   /* VM_CPM */
5519 
5520 /*
5521  *	Interface is defined in all cases, but unless the kernel
5522  *	is built explicitly for this option, the interface does
5523  *	nothing.
5524  */
5525 
5526 kern_return_t
5527 vm_map_enter_cpm(
5528 	__unused vm_map_t                map,
5529 	__unused vm_map_offset_t        *addr,
5530 	__unused vm_map_size_t           size,
5531 	__unused vm_map_kernel_flags_t   vmk_flags)
5532 {
5533 	return KERN_FAILURE;
5534 }
5535 #endif /* VM_CPM */
5536 
5537 /* Not used without nested pmaps */
5538 #ifndef NO_NESTED_PMAP
5539 /*
5540  * Clip and unnest a portion of a nested submap mapping.
5541  */
5542 
5543 
5544 static void
5545 vm_map_clip_unnest(
5546 	vm_map_t        map,
5547 	vm_map_entry_t  entry,
5548 	vm_map_offset_t start_unnest,
5549 	vm_map_offset_t end_unnest)
5550 {
5551 	vm_map_offset_t old_start_unnest = start_unnest;
5552 	vm_map_offset_t old_end_unnest = end_unnest;
5553 
5554 	assert(entry->is_sub_map);
5555 	assert(VME_SUBMAP(entry) != NULL);
5556 	assert(entry->use_pmap);
5557 
5558 	/*
5559 	 * Query the platform for the optimal unnest range.
5560 	 * DRK: There's some duplication of effort here, since
5561 	 * callers may have adjusted the range to some extent. This
5562 	 * routine was introduced to support 1GiB subtree nesting
5563 	 * for x86 platforms, which can also nest on 2MiB boundaries
5564 	 * depending on size/alignment.
5565 	 */
5566 	if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
5567 		assert(VME_SUBMAP(entry)->is_nested_map);
5568 		assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
5569 		log_unnest_badness(map,
5570 		    old_start_unnest,
5571 		    old_end_unnest,
5572 		    VME_SUBMAP(entry)->is_nested_map,
5573 		    (entry->vme_start +
5574 		    VME_SUBMAP(entry)->lowest_unnestable_start -
5575 		    VME_OFFSET(entry)));
5576 	}
5577 
5578 	if (entry->vme_start > start_unnest ||
5579 	    entry->vme_end < end_unnest) {
5580 		panic("vm_map_clip_unnest(0x%llx,0x%llx): "
5581 		    "bad nested entry: start=0x%llx end=0x%llx\n",
5582 		    (long long)start_unnest, (long long)end_unnest,
5583 		    (long long)entry->vme_start, (long long)entry->vme_end);
5584 	}
5585 
5586 	if (start_unnest > entry->vme_start) {
5587 		_vm_map_clip_start(&map->hdr,
5588 		    entry,
5589 		    start_unnest);
5590 		if (map->holelistenabled) {
5591 			vm_map_store_update_first_free(map, NULL, FALSE);
5592 		} else {
5593 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5594 		}
5595 	}
5596 	if (entry->vme_end > end_unnest) {
5597 		_vm_map_clip_end(&map->hdr,
5598 		    entry,
5599 		    end_unnest);
5600 		if (map->holelistenabled) {
5601 			vm_map_store_update_first_free(map, NULL, FALSE);
5602 		} else {
5603 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5604 		}
5605 	}
5606 
5607 	pmap_unnest(map->pmap,
5608 	    entry->vme_start,
5609 	    entry->vme_end - entry->vme_start);
5610 	if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
5611 		/* clean up parent map/maps */
5612 		vm_map_submap_pmap_clean(
5613 			map, entry->vme_start,
5614 			entry->vme_end,
5615 			VME_SUBMAP(entry),
5616 			VME_OFFSET(entry));
5617 	}
5618 	entry->use_pmap = FALSE;
5619 	if ((map->pmap != kernel_pmap) &&
5620 	    (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
5621 		VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
5622 	}
5623 }
5624 #endif  /* NO_NESTED_PMAP */
5625 
5626 __abortlike
5627 static void
5628 __vm_map_clip_atomic_entry_panic(
5629 	vm_map_t        map,
5630 	vm_map_entry_t  entry,
5631 	vm_map_offset_t where)
5632 {
5633 	panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
5634 	    "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
5635 	    (uint64_t)entry->vme_start,
5636 	    (uint64_t)entry->vme_end,
5637 	    (uint64_t)where);
5638 }
5639 
5640 /*
5641  *	vm_map_clip_start:	[ internal use only ]
5642  *
5643  *	Asserts that the given entry begins at or after
5644  *	the specified address; if necessary,
5645  *	it splits the entry into two.
5646  */
5647 void
5648 vm_map_clip_start(
5649 	vm_map_t        map,
5650 	vm_map_entry_t  entry,
5651 	vm_map_offset_t startaddr)
5652 {
5653 #ifndef NO_NESTED_PMAP
5654 	if (entry->is_sub_map &&
5655 	    entry->use_pmap &&
5656 	    startaddr >= entry->vme_start) {
5657 		vm_map_offset_t start_unnest, end_unnest;
5658 
5659 		/*
5660 		 * Make sure "startaddr" is no longer in a nested range
5661 		 * before we clip.  Unnest only the minimum range the platform
5662 		 * can handle.
5663 		 * vm_map_clip_unnest may perform additional adjustments to
5664 		 * the unnest range.
5665 		 */
5666 		start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
5667 		end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
5668 		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5669 	}
5670 #endif /* NO_NESTED_PMAP */
5671 	if (startaddr > entry->vme_start) {
5672 		if (!entry->is_sub_map &&
5673 		    VME_OBJECT(entry) &&
5674 		    VME_OBJECT(entry)->phys_contiguous) {
5675 			pmap_remove(map->pmap,
5676 			    (addr64_t)(entry->vme_start),
5677 			    (addr64_t)(entry->vme_end));
5678 		}
5679 		if (entry->vme_atomic) {
5680 			__vm_map_clip_atomic_entry_panic(map, entry, startaddr);
5681 		}
5682 
5683 		DTRACE_VM5(
5684 			vm_map_clip_start,
5685 			vm_map_t, map,
5686 			vm_map_offset_t, entry->vme_start,
5687 			vm_map_offset_t, entry->vme_end,
5688 			vm_map_offset_t, startaddr,
5689 			int, VME_ALIAS(entry));
5690 
5691 		_vm_map_clip_start(&map->hdr, entry, startaddr);
5692 		if (map->holelistenabled) {
5693 			vm_map_store_update_first_free(map, NULL, FALSE);
5694 		} else {
5695 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5696 		}
5697 	}
5698 }
5699 
5700 
5701 #define vm_map_copy_clip_start(copy, entry, startaddr) \
5702 	MACRO_BEGIN \
5703 	if ((startaddr) > (entry)->vme_start) \
5704 	        _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
5705 	MACRO_END
5706 
5707 /*
5708  *	This routine is called only when it is known that
5709  *	the entry must be split.
5710  */
5711 static void
5712 _vm_map_clip_start(
5713 	struct vm_map_header    *map_header,
5714 	vm_map_entry_t          entry,
5715 	vm_map_offset_t         start)
5716 {
5717 	vm_map_entry_t  new_entry;
5718 
5719 	/*
5720 	 *	Split off the front portion --
5721 	 *	note that we must insert the new
5722 	 *	entry BEFORE this one, so that
5723 	 *	this entry has the specified starting
5724 	 *	address.
5725 	 */
5726 
5727 	if (entry->map_aligned) {
5728 		assert(VM_MAP_PAGE_ALIGNED(start,
5729 		    VM_MAP_HDR_PAGE_MASK(map_header)));
5730 	}
5731 
5732 	new_entry = _vm_map_entry_create(map_header);
5733 	vm_map_entry_copy_full(new_entry, entry);
5734 
5735 	new_entry->vme_end = start;
5736 	assert(new_entry->vme_start < new_entry->vme_end);
5737 	VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
5738 	if (__improbable(start >= entry->vme_end)) {
5739 		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start);
5740 	}
5741 	assert(start < entry->vme_end);
5742 	entry->vme_start = start;
5743 
5744 #if VM_BTLOG_TAGS
5745 	if (new_entry->vme_kernel_object) {
5746 		btref_retain(new_entry->vme_tag_btref);
5747 	}
5748 #endif /* VM_BTLOG_TAGS */
5749 
5750 	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);
5751 
5752 	if (entry->is_sub_map) {
5753 		vm_map_reference(VME_SUBMAP(new_entry));
5754 	} else {
5755 		vm_object_reference(VME_OBJECT(new_entry));
5756 	}
5757 }
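
/*
 * Worked example (illustrative addresses): clipping an entry covering
 * [0x100000, 0x300000) at start == 0x200000 inserts a new entry
 * [0x100000, 0x200000) BEFORE the original, while the original becomes
 * [0x200000, 0x300000) with VME_OFFSET advanced by 0x100000, so every
 * virtual address still resolves to the same page of the same object.
 */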
5758 
5759 
5760 /*
5761  *	vm_map_clip_end:	[ internal use only ]
5762  *
5763  *	Asserts that the given entry ends at or before
5764  *	the specified address; if necessary,
5765  *	it splits the entry into two.
5766  */
5767 void
5768 vm_map_clip_end(
5769 	vm_map_t        map,
5770 	vm_map_entry_t  entry,
5771 	vm_map_offset_t endaddr)
5772 {
5773 	if (endaddr > entry->vme_end) {
5774 		/*
5775 		 * Within the scope of this clipping, limit "endaddr" to
5776 		 * the end of this map entry...
5777 		 */
5778 		endaddr = entry->vme_end;
5779 	}
5780 #ifndef NO_NESTED_PMAP
5781 	if (entry->is_sub_map && entry->use_pmap) {
5782 		vm_map_offset_t start_unnest, end_unnest;
5783 
5784 		/*
5785 		 * Make sure the range between the start of this entry and
5786 		 * the new "endaddr" is no longer nested before we clip.
5787 		 * Unnest only the minimum range the platform can handle.
5788 		 * vm_map_clip_unnest may perform additional adjustments to
5789 		 * the unnest range.
5790 		 */
5791 		start_unnest = entry->vme_start;
5792 		end_unnest =
5793 		    (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
5794 		    ~(pmap_shared_region_size_min(map->pmap) - 1);
5795 		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5796 	}
5797 #endif /* NO_NESTED_PMAP */
5798 	if (endaddr < entry->vme_end) {
5799 		if (!entry->is_sub_map &&
5800 		    VME_OBJECT(entry) &&
5801 		    VME_OBJECT(entry)->phys_contiguous) {
5802 			pmap_remove(map->pmap,
5803 			    (addr64_t)(entry->vme_start),
5804 			    (addr64_t)(entry->vme_end));
5805 		}
5806 		if (entry->vme_atomic) {
5807 			__vm_map_clip_atomic_entry_panic(map, entry, endaddr);
5808 		}
5809 		DTRACE_VM5(
5810 			vm_map_clip_end,
5811 			vm_map_t, map,
5812 			vm_map_offset_t, entry->vme_start,
5813 			vm_map_offset_t, entry->vme_end,
5814 			vm_map_offset_t, endaddr,
5815 			int, VME_ALIAS(entry));
5816 
5817 		_vm_map_clip_end(&map->hdr, entry, endaddr);
5818 		if (map->holelistenabled) {
5819 			vm_map_store_update_first_free(map, NULL, FALSE);
5820 		} else {
5821 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5822 		}
5823 	}
5824 }
5825 
5826 
5827 #define vm_map_copy_clip_end(copy, entry, endaddr) \
5828 	MACRO_BEGIN \
5829 	if ((endaddr) < (entry)->vme_end) \
5830 	        _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
5831 	MACRO_END
5832 
5833 /*
5834  *	This routine is called only when it is known that
5835  *	the entry must be split.
5836  */
5837 static void
5838 _vm_map_clip_end(
5839 	struct vm_map_header    *map_header,
5840 	vm_map_entry_t          entry,
5841 	vm_map_offset_t         end)
5842 {
5843 	vm_map_entry_t  new_entry;
5844 
5845 	/*
5846 	 *	Create a new entry and insert it
5847 	 *	AFTER the specified entry
5848 	 */
5849 
5850 	if (entry->map_aligned) {
5851 		assert(VM_MAP_PAGE_ALIGNED(end,
5852 		    VM_MAP_HDR_PAGE_MASK(map_header)));
5853 	}
5854 
5855 	new_entry = _vm_map_entry_create(map_header);
5856 	vm_map_entry_copy_full(new_entry, entry);
5857 
5858 	if (__improbable(end <= entry->vme_start)) {
5859 		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end);
5860 	}
5861 	assert(entry->vme_start < end);
5862 	new_entry->vme_start = entry->vme_end = end;
5863 	VME_OFFSET_SET(new_entry,
5864 	    VME_OFFSET(new_entry) + (end - entry->vme_start));
5865 	assert(new_entry->vme_start < new_entry->vme_end);
5866 
5867 #if VM_BTLOG_TAGS
5868 	if (new_entry->vme_kernel_object) {
5869 		btref_retain(new_entry->vme_tag_btref);
5870 	}
5871 #endif /* VM_BTLOG_TAGS */
5872 
5873 	_vm_map_store_entry_link(map_header, entry, new_entry);
5874 
5875 	if (entry->is_sub_map) {
5876 		vm_map_reference(VME_SUBMAP(new_entry));
5877 	} else {
5878 		vm_object_reference(VME_OBJECT(new_entry));
5879 	}
5880 }
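
/*
 * Worked example (illustrative addresses): clipping an entry covering
 * [0x100000, 0x300000) at end == 0x200000 shrinks the original to
 * [0x100000, 0x200000) and inserts a new entry [0x200000, 0x300000)
 * AFTER it, with the new entry's VME_OFFSET advanced by 0x100000.
 */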
5881 
5882 
5883 /*
5884  *	VM_MAP_RANGE_CHECK:	[ internal use only ]
5885  *
5886  *	Asserts that the starting and ending region
5887  *	addresses fall within the valid range of the map.
5888  */
5889 #define VM_MAP_RANGE_CHECK(map, start, end)     \
5890 	MACRO_BEGIN                             \
5891 	if (start < vm_map_min(map))            \
5892 	        start = vm_map_min(map);        \
5893 	if (end > vm_map_max(map))              \
5894 	        end = vm_map_max(map);          \
5895 	if (start > end)                        \
5896 	        start = end;                    \
5897 	MACRO_END
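
/*
 * Example (illustrative values): for a map spanning
 * [0x1000, 0xfffff000), VM_MAP_RANGE_CHECK(map, start, end) with
 * start == 0 and end == 0xffffffff0000 clamps start to 0x1000 and end
 * to 0xfffff000; a range entirely outside the map degenerates into an
 * empty range (start == end) rather than producing an error.
 */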
5898 
5899 /*
5900  *	vm_map_range_check:	[ internal use only ]
5901  *
5902  *	Check that the region defined by the specified start and
5903  *	end addresses is wholly contained within a single map
5904  *	entry or set of adjacent map entries of the specified map,
5905  *	i.e. the specified region contains no unmapped space.
5906  *	If any or all of the region is unmapped, FALSE is returned.
5907  *	Otherwise, TRUE is returned and if the output argument 'entry'
5908  *	is not NULL it points to the map entry containing the start
5909  *	of the region.
5910  *
5911  *	The map is locked for reading on entry and is left locked.
5912  */
5913 static boolean_t
5914 vm_map_range_check(
5915 	vm_map_t                map,
5916 	vm_map_offset_t         start,
5917 	vm_map_offset_t         end,
5918 	vm_map_entry_t          *entry)
5919 {
5920 	vm_map_entry_t          cur;
5921 	vm_map_offset_t         prev;
5922 
5923 	/*
5924 	 *      Basic sanity checks first
5925 	 */
5926 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5927 		return FALSE;
5928 	}
5929 
5930 	/*
5931 	 *      Check first if the region starts within a valid
5932 	 *	mapping for the map.
5933 	 */
5934 	if (!vm_map_lookup_entry(map, start, &cur)) {
5935 		return FALSE;
5936 	}
5937 
5938 	/*
5939 	 *	Optimize for the case that the region is contained
5940 	 *	in a single map entry.
5941 	 */
5942 	if (entry != (vm_map_entry_t *) NULL) {
5943 		*entry = cur;
5944 	}
5945 	if (end <= cur->vme_end) {
5946 		return TRUE;
5947 	}
5948 
5949 	/*
5950 	 *      If the region is not wholly contained within a
5951 	 *      single entry, walk the entries looking for holes.
5952 	 */
5953 	prev = cur->vme_end;
5954 	cur = cur->vme_next;
5955 	while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5956 		if (end <= cur->vme_end) {
5957 			return TRUE;
5958 		}
5959 		prev = cur->vme_end;
5960 		cur = cur->vme_next;
5961 	}
5962 	return FALSE;
5963 }
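
/*
 * Example (illustrative): with adjacent entries [0x1000, 0x3000) and
 * [0x3000, 0x6000), vm_map_range_check(map, 0x2000, 0x5000, &entry)
 * returns TRUE with 'entry' pointing at the first entry.  If the
 * second entry instead began at 0x4000, the hole at [0x3000, 0x4000)
 * would stop the walk and the routine would return FALSE.
 */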
5964 
5965 /*
5966  *	vm_map_protect:
5967  *
5968  *	Sets the protection of the specified address
5969  *	region in the target map.  If "set_max" is
5970  *	specified, the maximum protection is to be set;
5971  *	otherwise, only the current protection is affected.
5972  */
5973 kern_return_t
5974 vm_map_protect(
5975 	vm_map_t        map,
5976 	vm_map_offset_t start,
5977 	vm_map_offset_t end,
5978 	vm_prot_t       new_prot,
5979 	boolean_t       set_max)
5980 {
5981 	vm_map_entry_t                  current;
5982 	vm_map_offset_t                 prev;
5983 	vm_map_entry_t                  entry;
5984 	vm_prot_t                       new_max;
5985 	int                             pmap_options = 0;
5986 	kern_return_t                   kr;
5987 
5988 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
5989 		return KERN_INVALID_ARGUMENT;
5990 	}
5991 
5992 	if (new_prot & VM_PROT_COPY) {
5993 		vm_map_offset_t         new_start;
5994 		vm_prot_t               cur_prot, max_prot;
5995 		vm_map_kernel_flags_t   kflags;
5996 
5997 		/* LP64todo - see below */
5998 		if (start >= map->max_offset) {
5999 			return KERN_INVALID_ADDRESS;
6000 		}
6001 
6002 		if ((new_prot & VM_PROT_ALLEXEC) &&
6003 		    map->pmap != kernel_pmap &&
6004 		    (vm_map_cs_enforcement(map)
6005 #if XNU_TARGET_OS_OSX && __arm64__
6006 		    || !VM_MAP_IS_EXOTIC(map)
6007 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
6008 		    ) &&
6009 		    VM_MAP_POLICY_WX_FAIL(map)) {
6010 			DTRACE_VM3(cs_wx,
6011 			    uint64_t, (uint64_t) start,
6012 			    uint64_t, (uint64_t) end,
6013 			    vm_prot_t, new_prot);
6014 			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
6015 			    proc_selfpid(),
6016 			    (get_bsdtask_info(current_task())
6017 			    ? proc_name_address(get_bsdtask_info(current_task()))
6018 			    : "?"),
6019 			    __FUNCTION__, __LINE__,
6020 #if DEVELOPMENT || DEBUG
6021 			    (uint64_t)start,
6022 			    (uint64_t)end,
6023 #else /* DEVELOPMENT || DEBUG */
6024 			    (uint64_t)0,
6025 			    (uint64_t)0,
6026 #endif /* DEVELOPMENT || DEBUG */
6027 			    new_prot);
6028 			return KERN_PROTECTION_FAILURE;
6029 		}
6030 
6031 		/*
6032 		 * Let vm_map_remap_extract() know that it will need to:
6033 		 * + make a copy of the mapping
6034 		 * + add VM_PROT_WRITE to the max protections
6035 		 * + remove any protections that are no longer allowed from the
6036 		 *   max protections (to avoid any WRITE/EXECUTE conflict, for
6037 		 *   example).
6038 		 * Note that "max_prot" is an IN/OUT parameter only for this
6039 		 * specific (VM_PROT_COPY) case.  It's usually an OUT parameter
6040 		 * only.
6041 		 */
6042 		max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
6043 		cur_prot = VM_PROT_NONE;
6044 		kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
6045 		kflags.vmkf_remap_prot_copy = true;
6046 		kflags.vmkf_tpro_enforcement_override = !vm_map_tpro_enforcement(map);
6047 		new_start = start;
6048 		kr = vm_map_remap(map,
6049 		    &new_start,
6050 		    end - start,
6051 		    0, /* mask */
6052 		    kflags,
6053 		    map,
6054 		    start,
6055 		    TRUE, /* copy-on-write remapping! */
6056 		    &cur_prot, /* IN/OUT */
6057 		    &max_prot, /* IN/OUT */
6058 		    VM_INHERIT_DEFAULT);
6059 		if (kr != KERN_SUCCESS) {
6060 			return kr;
6061 		}
6062 		new_prot &= ~VM_PROT_COPY;
6063 	}
6064 
6065 	vm_map_lock(map);
6066 
6067 	/* LP64todo - remove this check when vm_map_commpage64()
6068 	 * no longer has to stuff in a map_entry for the commpage
6069 	 * above the map's max_offset.
6070 	 */
6071 	if (start >= map->max_offset) {
6072 		vm_map_unlock(map);
6073 		return KERN_INVALID_ADDRESS;
6074 	}
6075 
6076 	while (1) {
6077 		/*
6078 		 *      Lookup the entry.  If it doesn't start in a valid
6079 		 *	entry, return an error.
6080 		 */
6081 		if (!vm_map_lookup_entry(map, start, &entry)) {
6082 			vm_map_unlock(map);
6083 			return KERN_INVALID_ADDRESS;
6084 		}
6085 
6086 		if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
6087 			start = SUPERPAGE_ROUND_DOWN(start);
6088 			continue;
6089 		}
6090 		break;
6091 	}
6092 	if (entry->superpage_size) {
6093 		end = SUPERPAGE_ROUND_UP(end);
6094 	}
6095 
6096 	/*
6097 	 *	Make a first pass to check for protection and address
6098 	 *	violations.
6099 	 */
6100 
6101 	current = entry;
6102 	prev = current->vme_start;
6103 	while ((current != vm_map_to_entry(map)) &&
6104 	    (current->vme_start < end)) {
6105 		/*
6106 		 * If there is a hole, return an error.
6107 		 */
6108 		if (current->vme_start != prev) {
6109 			vm_map_unlock(map);
6110 			return KERN_INVALID_ADDRESS;
6111 		}
6112 
6113 		new_max = current->max_protection;
6114 
6115 #if defined(__x86_64__)
6116 		/* Allow max mask to include execute prot bits if this map doesn't enforce CS */
6117 		if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
6118 			new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
6119 		}
6120 #elif CODE_SIGNING_MONITOR
6121 		if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
6122 			new_max |= VM_PROT_EXECUTE;
6123 		}
6124 #endif
6125 		if ((new_prot & new_max) != new_prot) {
6126 			vm_map_unlock(map);
6127 			return KERN_PROTECTION_FAILURE;
6128 		}
6129 
6130 		if (current->used_for_jit &&
6131 		    pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
6132 			vm_map_unlock(map);
6133 			return KERN_PROTECTION_FAILURE;
6134 		}
6135 
6136 #if __arm64e__
6137 		/* Disallow remapping hw assisted TPRO mappings */
6138 		if (current->used_for_tpro) {
6139 			vm_map_unlock(map);
6140 			return KERN_PROTECTION_FAILURE;
6141 		}
6142 #endif /* __arm64e__ */
6143 
6144 
6145 		if ((new_prot & VM_PROT_WRITE) &&
6146 		    (new_prot & VM_PROT_ALLEXEC) &&
6147 #if XNU_TARGET_OS_OSX
6148 		    map->pmap != kernel_pmap &&
6149 		    (vm_map_cs_enforcement(map)
6150 #if __arm64__
6151 		    || !VM_MAP_IS_EXOTIC(map)
6152 #endif /* __arm64__ */
6153 		    ) &&
6154 #endif /* XNU_TARGET_OS_OSX */
6155 #if CODE_SIGNING_MONITOR
6156 		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
6157 #endif
6158 		    !(current->used_for_jit)) {
6159 			DTRACE_VM3(cs_wx,
6160 			    uint64_t, (uint64_t) current->vme_start,
6161 			    uint64_t, (uint64_t) current->vme_end,
6162 			    vm_prot_t, new_prot);
6163 			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
6164 			    proc_selfpid(),
6165 			    (get_bsdtask_info(current_task())
6166 			    ? proc_name_address(get_bsdtask_info(current_task()))
6167 			    : "?"),
6168 			    __FUNCTION__, __LINE__,
6169 #if DEVELOPMENT || DEBUG
6170 			    (uint64_t)current->vme_start,
6171 			    (uint64_t)current->vme_end,
6172 #else /* DEVELOPMENT || DEBUG */
6173 			    (uint64_t)0,
6174 			    (uint64_t)0,
6175 #endif /* DEVELOPMENT || DEBUG */
6176 			    new_prot);
6177 			new_prot &= ~VM_PROT_ALLEXEC;
6178 			if (VM_MAP_POLICY_WX_FAIL(map)) {
6179 				vm_map_unlock(map);
6180 				return KERN_PROTECTION_FAILURE;
6181 			}
6182 		}
6183 
6184 		/*
6185 		 * If the task has requested executable lockdown,
6186 		 * deny both:
6187 		 * - adding executable protections OR
6188 		 * - adding write protections to an existing executable mapping.
6189 		 */
6190 		if (map->map_disallow_new_exec == TRUE) {
6191 			if ((new_prot & VM_PROT_ALLEXEC) ||
6192 			    ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
6193 				vm_map_unlock(map);
6194 				return KERN_PROTECTION_FAILURE;
6195 			}
6196 		}
6197 
6198 		prev = current->vme_end;
6199 		current = current->vme_next;
6200 	}
6201 
6202 #if __arm64__
6203 	if (end > prev &&
6204 	    end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
6205 		vm_map_entry_t prev_entry;
6206 
6207 		prev_entry = current->vme_prev;
6208 		if (prev_entry != vm_map_to_entry(map) &&
6209 		    !prev_entry->map_aligned &&
6210 		    (vm_map_round_page(prev_entry->vme_end,
6211 		    VM_MAP_PAGE_MASK(map))
6212 		    == end)) {
6213 			/*
6214 			 * The last entry in our range is not "map-aligned"
6215 			 * but it would have reached all the way to "end"
6216 			 * if it had been map-aligned, so this is not really
6217 			 * a hole in the range and we can proceed.
6218 			 */
6219 			prev = end;
6220 		}
6221 	}
6222 #endif /* __arm64__ */
6223 
6224 	if (end > prev) {
6225 		vm_map_unlock(map);
6226 		return KERN_INVALID_ADDRESS;
6227 	}
6228 
6229 	/*
6230 	 *	Go back and fix up protections.
6231 	 *	Clip to start here if the range starts within
6232 	 *	the entry.
6233 	 */
6234 
6235 	current = entry;
6236 	if (current != vm_map_to_entry(map)) {
6237 		/* clip and unnest if necessary */
6238 		vm_map_clip_start(map, current, start);
6239 	}
6240 
6241 	while ((current != vm_map_to_entry(map)) &&
6242 	    (current->vme_start < end)) {
6243 		vm_prot_t       old_prot;
6244 
6245 		vm_map_clip_end(map, current, end);
6246 
6247 #if DEVELOPMENT || DEBUG
6248 		if (current->csm_associated && vm_log_xnu_user_debug) {
6249 			printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
6250 			    proc_selfpid(),
6251 			    (get_bsdtask_info(current_task())
6252 			    ? proc_name_address(get_bsdtask_info(current_task()))
6253 			    : "?"),
6254 			    __FUNCTION__,
6255 			    (uint64_t)start,
6256 			    (uint64_t)end,
6257 			    new_prot,
6258 			    map, current,
6259 			    current->vme_start,
6260 			    current->vme_end,
6261 			    current->protection,
6262 			    current->max_protection);
6263 		}
6264 #endif /* DEVELOPMENT || DEBUG */
6265 
6266 		if (current->is_sub_map) {
6267 			/* clipping did unnest if needed */
6268 			assert(!current->use_pmap);
6269 		}
6270 
6271 		old_prot = current->protection;
6272 
6273 		if (set_max) {
6274 			current->max_protection = new_prot;
6275 			/* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
6276 			current->protection = (new_prot & old_prot);
6277 		} else {
6278 			current->protection = new_prot;
6279 		}
6280 
6281 #if CODE_SIGNING_MONITOR
6282 		if (!current->vme_xnu_user_debug &&
6283 		    /* a !csm_associated mapping becoming executable */
6284 		    ((!current->csm_associated &&
6285 		    !(old_prot & VM_PROT_EXECUTE) &&
6286 		    (current->protection & VM_PROT_EXECUTE))
6287 		    ||
6288 		    /* a csm_associated mapping becoming writable */
6289 		    (current->csm_associated &&
6290 		    !(old_prot & VM_PROT_WRITE) &&
6291 		    (current->protection & VM_PROT_WRITE)))) {
6292 			/*
6293 			 * This mapping has not already been marked as
6294 			 * "user_debug" and it is either:
6295 			 * 1. not code-signing-monitored and becoming executable
6296 			 * 2. code-signing-monitored and becoming writable,
6297 			 * so inform the CodeSigningMonitor and mark the
6298 			 * mapping as "user_debug" if appropriate.
6299 			 */
6300 			vm_map_kernel_flags_t vmk_flags;
6301 			vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
6302 			/* pretend it's a vm_protect(VM_PROT_COPY)... */
6303 			vmk_flags.vmkf_remap_prot_copy = true;
6304 			kr = vm_map_entry_cs_associate(map, current, vmk_flags);
6305 #if DEVELOPMENT || DEBUG
6306 			if (vm_log_xnu_user_debug) {
6307 				printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
6308 				    proc_selfpid(),
6309 				    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
6310 				    __FUNCTION__, __LINE__,
6311 				    map, current,
6312 				    current->vme_start, current->vme_end,
6313 				    old_prot, current->protection,
6314 				    kr, current->vme_xnu_user_debug);
6315 			}
6316 #endif /* DEVELOPMENT || DEBUG */
6317 		}
6318 #endif /* CODE_SIGNING_MONITOR */
6319 
6320 		/*
6321 		 *	Update physical map if necessary.
6322 		 *	If the request is to turn off write protection,
6323 		 *	we won't do it for real (in pmap). This is because
6324 		 *	it would cause copy-on-write to fail.  We've already
6325 	 *	set the new protection in the map, so if a
6326 		 *	write-protect fault occurred, it will be fixed up
6327 		 *	properly, COW or not.
6328 		 */
6329 		if (current->protection != old_prot) {
6330 			/* Look one level in, since we support nested pmaps */
6331 			/* from mapped submaps which are direct entries */
6332 			/* in our map */
6333 
6334 			vm_prot_t prot;
6335 
6336 			prot = current->protection;
6337 			if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
6338 				prot &= ~VM_PROT_WRITE;
6339 			} else {
6340 				assert(!VME_OBJECT(current)->code_signed);
6341 				assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6342 				if (prot & VM_PROT_WRITE) {
6343 					/*
6344 					 * For write requests on the
6345 					 * compressor, we will ask the
6346 					 * pmap layer to prevent us from
6347 					 * taking a write fault when we
6348 					 * attempt to access the mapping
6349 					 * next.
6350 					 */
6351 					pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
6352 				}
6353 			}
6354 
6355 			if (override_nx(map, VME_ALIAS(current)) && prot) {
6356 				prot |= VM_PROT_EXECUTE;
6357 			}
6358 
6359 #if DEVELOPMENT || DEBUG
6360 			if (!(old_prot & VM_PROT_EXECUTE) &&
6361 			    (prot & VM_PROT_EXECUTE) &&
6362 			    panic_on_unsigned_execute &&
6363 			    (proc_selfcsflags() & CS_KILL)) {
6364 				panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
6365 			}
6366 #endif /* DEVELOPMENT || DEBUG */
6367 
6368 			if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
6369 				if (current->wired_count) {
6370 					panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
6371 					    map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
6372 				}
6373 
6374 				/* If the pmap layer cares about this
6375 				 * protection type, force a fault for
6376 				 * each page so that vm_fault will
6377 				 * repopulate the page with the full
6378 				 * set of protections.
6379 				 */
6380 				/*
6381 				 * TODO: We don't seem to need this,
6382 				 * but this is due to an internal
6383 				 * implementation detail of
6384 				 * pmap_protect.  Do we want to rely
6385 				 * on this?
6386 				 */
6387 				prot = VM_PROT_NONE;
6388 			}
6389 
6390 			if (current->is_sub_map && current->use_pmap) {
6391 				pmap_protect(VME_SUBMAP(current)->pmap,
6392 				    current->vme_start,
6393 				    current->vme_end,
6394 				    prot);
6395 			} else {
6396 				pmap_protect_options(map->pmap,
6397 				    current->vme_start,
6398 				    current->vme_end,
6399 				    prot,
6400 				    pmap_options,
6401 				    NULL);
6402 			}
6403 		}
6404 		current = current->vme_next;
6405 	}
6406 
6407 	current = entry;
6408 	while ((current != vm_map_to_entry(map)) &&
6409 	    (current->vme_start <= end)) {
6410 		vm_map_simplify_entry(map, current);
6411 		current = current->vme_next;
6412 	}
6413 
6414 	vm_map_unlock(map);
6415 	return KERN_SUCCESS;
6416 }
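
/*
 * Illustrative usage (hypothetical range): make a region read-only
 * without touching its maximum protection:
 *
 *	kr = vm_map_protect(map, start, end, VM_PROT_READ, FALSE);
 *
 * With set_max == TRUE, the call instead sets max_protection to
 * new_prot and clamps the current protection to (new_prot & old_prot),
 * as in the set_max branch above.
 */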
6417 
6418 /*
6419  *	vm_map_inherit:
6420  *
6421  *	Sets the inheritance of the specified address
6422  *	range in the target map.  Inheritance
6423  *	affects how the map will be shared with
6424  *	child maps at the time of vm_map_fork.
6425  */
6426 kern_return_t
6427 vm_map_inherit(
6428 	vm_map_t        map,
6429 	vm_map_offset_t start,
6430 	vm_map_offset_t end,
6431 	vm_inherit_t    new_inheritance)
6432 {
6433 	vm_map_entry_t  entry;
6434 	vm_map_entry_t  temp_entry;
6435 
6436 	vm_map_lock(map);
6437 
6438 	VM_MAP_RANGE_CHECK(map, start, end);
6439 
6440 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
6441 		vm_map_unlock(map);
6442 		return KERN_INVALID_ADDRESS;
6443 	}
6444 
6445 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
6446 		entry = temp_entry;
6447 	} else {
6448 		temp_entry = temp_entry->vme_next;
6449 		entry = temp_entry;
6450 	}
6451 
6452 	/* first check entire range for submaps which can't support the */
6453 	/* given inheritance. */
6454 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6455 		if (entry->is_sub_map) {
6456 			if (new_inheritance == VM_INHERIT_COPY) {
6457 				vm_map_unlock(map);
6458 				return KERN_INVALID_ARGUMENT;
6459 			}
6460 		}
6461 
6462 		entry = entry->vme_next;
6463 	}
6464 
6465 	entry = temp_entry;
6466 	if (entry != vm_map_to_entry(map)) {
6467 		/* clip and unnest if necessary */
6468 		vm_map_clip_start(map, entry, start);
6469 	}
6470 
6471 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6472 		vm_map_clip_end(map, entry, end);
6473 		if (entry->is_sub_map) {
6474 			/* clip did unnest if needed */
6475 			assert(!entry->use_pmap);
6476 		}
6477 
6478 		entry->inheritance = new_inheritance;
6479 
6480 		entry = entry->vme_next;
6481 	}
6482 
6483 	vm_map_unlock(map);
6484 	return KERN_SUCCESS;
6485 }
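
/*
 * Illustrative usage (hypothetical range): have fork() give the child
 * its own copy-on-write copy of a region instead of sharing it:
 *
 *	kr = vm_map_inherit(map, start, end, VM_INHERIT_COPY);
 *
 * As the first loop above shows, VM_INHERIT_COPY is rejected with
 * KERN_INVALID_ARGUMENT if the range covers a submap entry.
 */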
6486 
6487 /*
6488  * Update the accounting for the amount of wired memory in this map.  If the user has
6489  * exceeded the defined limits, then we fail.  Wiring on behalf of the kernel never fails.
6490  */
6491 
6492 static kern_return_t
6493 add_wire_counts(
6494 	vm_map_t        map,
6495 	vm_map_entry_t  entry,
6496 	boolean_t       user_wire)
6497 {
6498 	vm_map_size_t   size;
6499 
6500 	bool first_wire = entry->wired_count == 0 && entry->user_wired_count == 0;
6501 
6502 	if (user_wire) {
6503 		unsigned int total_wire_count =  vm_page_wire_count + vm_lopage_free_count;
6504 
6505 		/*
6506 		 * We're wiring memory at the request of the user.  Check if this is the first time the user is wiring
6507 		 * this map entry.
6508 		 */
6509 
6510 		if (entry->user_wired_count == 0) {
6511 			size = entry->vme_end - entry->vme_start;
6512 
6513 			/*
6514 			 * Since this is the first time the user is wiring this map entry, check to see if we're
6515 			 * exceeding the user wire limits.  There is a per-map limit, which is the smaller of the
6516 			 * process's rlimit and the global vm_per_task_user_wire_limit that caps it.  There is also
6517 			 * a system-wide limit on the amount of memory all users can wire.  If the user is over either
6518 			 * limit, we fail.
6519 			 */
6520 
6521 			if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6522 			    size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6523 				if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6524 #if DEVELOPMENT || DEBUG
6525 					if (panic_on_mlock_failure) {
6526 						panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6527 					}
6528 #endif /* DEVELOPMENT || DEBUG */
6529 					os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6530 				} else {
6531 					os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6532 #if DEVELOPMENT || DEBUG
6533 					if (panic_on_mlock_failure) {
6534 						panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6535 					}
6536 #endif /* DEVELOPMENT || DEBUG */
6537 				}
6538 				return KERN_RESOURCE_SHORTAGE;
6539 			}
6540 
6541 			/*
6542 			 * The first time the user wires an entry, we also increment the wired_count and add this to
6543 			 * the total that has been wired in the map.
6544 			 */
6545 
6546 			if (entry->wired_count >= MAX_WIRE_COUNT) {
6547 				return KERN_FAILURE;
6548 			}
6549 
6550 			entry->wired_count++;
6551 			map->user_wire_size += size;
6552 		}
6553 
6554 		if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6555 			return KERN_FAILURE;
6556 		}
6557 
6558 		entry->user_wired_count++;
6559 	} else {
6560 		/*
6561 		 * The kernel's wiring the memory.  Just bump the count and continue.
6562 		 */
6563 
6564 		if (entry->wired_count >= MAX_WIRE_COUNT) {
6565 			panic("vm_map_wire: too many wirings");
6566 		}
6567 
6568 		entry->wired_count++;
6569 	}
6570 
6571 	if (first_wire) {
6572 		vme_btref_consider_and_set(entry, __builtin_frame_address(0));
6573 	}
6574 
6575 	return KERN_SUCCESS;
6576 }
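
/*
 * Worked example (illustrative numbers): with both user wire limits at
 * 64MB and map->user_wire_size already at 60MB, a first-time user wire
 * of an 8MB entry fails the per-task check above (60MB + 8MB > 64MB)
 * and returns KERN_RESOURCE_SHORTAGE before any counts are updated.
 */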
6577 
6578 /*
6579  * Update the memory wiring accounting now that the given map entry is being unwired.
6580  */
6581 
6582 static void
6583 subtract_wire_counts(
6584 	vm_map_t        map,
6585 	vm_map_entry_t  entry,
6586 	boolean_t       user_wire)
6587 {
6588 	if (user_wire) {
6589 		/*
6590 		 * We're unwiring memory at the request of the user.  See if we're removing the last user wire reference.
6591 		 */
6592 
6593 		if (entry->user_wired_count == 1) {
6594 			/*
6595 			 * We're removing the last user wire reference.  Decrement the wired_count and the total
6596 			 * user wired memory for this map.
6597 			 */
6598 
6599 			assert(entry->wired_count >= 1);
6600 			entry->wired_count--;
6601 			map->user_wire_size -= entry->vme_end - entry->vme_start;
6602 		}
6603 
6604 		assert(entry->user_wired_count >= 1);
6605 		entry->user_wired_count--;
6606 	} else {
6607 		/*
6608 		 * The kernel is unwiring the memory.   Just update the count.
6609 		 */
6610 
6611 		assert(entry->wired_count >= 1);
6612 		entry->wired_count--;
6613 	}
6614 
6615 	vme_btref_consider_and_put(entry);
6616 }
6617 
6618 int cs_executable_wire = 0;
6619 
6620 /*
6621  *	vm_map_wire:
6622  *
6623  *	Sets the pageability of the specified address range in the
6624  *	target map as wired.  Regions specified as not pageable require
6625  *	locked-down physical memory and physical page maps.  The
6626  *	access_type variable indicates types of accesses that must not
6627  *	generate page faults.  This is checked against protection of
6628  *	memory being locked-down.
6629  *
6630  *	The map must not be locked, but a reference must remain to the
6631  *	map throughout the call.
6632  */
6633 static kern_return_t
6634 vm_map_wire_nested(
6635 	vm_map_t                map,
6636 	vm_map_offset_t         start,
6637 	vm_map_offset_t         end,
6638 	vm_prot_t               caller_prot,
6639 	vm_tag_t                tag,
6640 	boolean_t               user_wire,
6641 	pmap_t                  map_pmap,
6642 	vm_map_offset_t         pmap_addr,
6643 	ppnum_t                 *physpage_p)
6644 {
6645 	vm_map_entry_t          entry;
6646 	vm_prot_t               access_type;
6647 	struct vm_map_entry     *first_entry, tmp_entry;
6648 	vm_map_t                real_map;
6649 	vm_map_offset_t         s, e;
6650 	kern_return_t           rc;
6651 	boolean_t               need_wakeup;
6652 	boolean_t               main_map = FALSE;
6653 	wait_interrupt_t        interruptible_state;
6654 	thread_t                cur_thread;
6655 	unsigned int            last_timestamp;
6656 	vm_map_size_t           size;
6657 	boolean_t               wire_and_extract;
6658 	vm_prot_t               extra_prots;
6659 
6660 	extra_prots = VM_PROT_COPY;
6661 	extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6662 #if XNU_TARGET_OS_OSX
6663 	if (map->pmap == kernel_pmap ||
6664 	    !vm_map_cs_enforcement(map)) {
6665 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6666 	}
6667 #endif /* XNU_TARGET_OS_OSX */
6668 #if CODE_SIGNING_MONITOR
6669 	if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
6670 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6671 	}
6672 #endif /* CODE_SIGNING_MONITOR */
6673 
6674 	access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6675 
6676 	wire_and_extract = FALSE;
6677 	if (physpage_p != NULL) {
6678 		/*
6679 		 * The caller wants the physical page number of the
6680 		 * wired page.  We return only one physical page number
6681 		 * so this works for only one page at a time.
6682 		 */
6683 		if ((end - start) != PAGE_SIZE) {
6684 			return KERN_INVALID_ARGUMENT;
6685 		}
6686 		wire_and_extract = TRUE;
6687 		*physpage_p = 0;
6688 	}
6689 
6690 	vm_map_lock(map);
6691 	if (map_pmap == NULL) {
6692 		main_map = TRUE;
6693 	}
6694 	last_timestamp = map->timestamp;
6695 
6696 	VM_MAP_RANGE_CHECK(map, start, end);
6697 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6698 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6699 
6700 	if (start == end) {
6701 		/* We wired what the caller asked for, zero pages */
6702 		vm_map_unlock(map);
6703 		return KERN_SUCCESS;
6704 	}
6705 
6706 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
6707 		vm_map_unlock(map);
6708 		return KERN_INVALID_ADDRESS;
6709 	}
6710 
6711 	need_wakeup = FALSE;
6712 	cur_thread = current_thread();
6713 
6714 	s = start;
6715 	rc = KERN_SUCCESS;
6716 
6717 	if (vm_map_lookup_entry(map, s, &first_entry)) {
6718 		entry = first_entry;
6719 		/*
6720 		 * vm_map_clip_start will be done later.
6721 		 * We don't want to unnest any nested submaps here!
6722 		 */
6723 	} else {
6724 		/* Start address is not in map */
6725 		rc = KERN_INVALID_ADDRESS;
6726 		goto done;
6727 	}
6728 
6729 	while ((entry != vm_map_to_entry(map)) && (s < end)) {
6730 		/*
6731 		 * At this point, we have wired from "start" to "s".
6732 		 * We still need to wire from "s" to "end".
6733 		 *
6734 		 * "entry" hasn't been clipped, so it could start before "s"
6735 		 * and/or end after "end".
6736 		 */
6737 
6738 		/* "e" is how far we want to wire in this entry */
6739 		e = entry->vme_end;
6740 		if (e > end) {
6741 			e = end;
6742 		}
6743 
6744 		/*
6745 		 * If another thread is wiring/unwiring this entry then
6746 		 * block after informing the other thread to wake us up.
6747 		 */
6748 		if (entry->in_transition) {
6749 			wait_result_t wait_result;
6750 
6751 			/*
6752 			 * We have not clipped the entry.  Make sure that
6753 			 * the start address is in range so that the lookup
6754 			 * below will succeed.
6755 			 * "s" is the current starting point: we've already
6756 			 * wired from "start" to "s" and we still have
6757 			 * to wire from "s" to "end".
6758 			 */
6759 
6760 			entry->needs_wakeup = TRUE;
6761 
6762 			/*
6763 			 * wake up anybody waiting on entries that we have
6764 			 * already wired.
6765 			 */
6766 			if (need_wakeup) {
6767 				vm_map_entry_wakeup(map);
6768 				need_wakeup = FALSE;
6769 			}
6770 			/*
6771 			 * User wiring is interruptible
6772 			 */
6773 			wait_result = vm_map_entry_wait(map,
6774 			    (user_wire) ? THREAD_ABORTSAFE :
6775 			    THREAD_UNINT);
6776 			if (user_wire && wait_result == THREAD_INTERRUPTED) {
6777 				/*
6778 				 * undo the wirings we have done so far
6779 				 * We do not clear the needs_wakeup flag,
6780 				 * because we cannot tell if we were the
6781 				 * only one waiting.
6782 				 */
6783 				rc = KERN_FAILURE;
6784 				goto done;
6785 			}
6786 
6787 			/*
6788 			 * Cannot avoid a lookup here. reset timestamp.
6789 			 */
6790 			last_timestamp = map->timestamp;
6791 
6792 			/*
6793 			 * The entry could have been clipped, look it up again.
6794 			 * Worst that can happen is that it may not exist anymore.
6795 			 */
6796 			if (!vm_map_lookup_entry(map, s, &first_entry)) {
6797 				/*
6798 				 * User: undo everything up to the previous
6799 				 * entry.  Let vm_map_unwire worry about
6800 				 * checking the validity of the range.
6801 				 */
6802 				rc = KERN_FAILURE;
6803 				goto done;
6804 			}
6805 			entry = first_entry;
6806 			continue;
6807 		}
6808 
6809 		if (entry->is_sub_map) {
6810 			vm_map_offset_t sub_start;
6811 			vm_map_offset_t sub_end;
6812 			vm_map_offset_t local_start;
6813 			vm_map_offset_t local_end;
6814 			pmap_t          pmap;
6815 
6816 			if (wire_and_extract) {
6817 				/*
6818 				 * Wiring would result in copy-on-write
6819 				 * which would not be compatible with
6820 				 * the sharing we have with the original
6821 				 * provider of this memory.
6822 				 */
6823 				rc = KERN_INVALID_ARGUMENT;
6824 				goto done;
6825 			}
6826 
6827 			vm_map_clip_start(map, entry, s);
6828 			vm_map_clip_end(map, entry, end);
6829 
6830 			sub_start = VME_OFFSET(entry);
6831 			sub_end = entry->vme_end;
6832 			sub_end += VME_OFFSET(entry) - entry->vme_start;
6833 
6834 			local_end = entry->vme_end;
6835 			if (map_pmap == NULL) {
6836 				vm_object_t             object;
6837 				vm_object_offset_t      offset;
6838 				vm_prot_t               prot;
6839 				boolean_t               wired;
6840 				vm_map_entry_t          local_entry;
6841 				vm_map_version_t         version;
6842 				vm_map_t                lookup_map;
6843 
6844 				if (entry->use_pmap) {
6845 					pmap = VME_SUBMAP(entry)->pmap;
6846 					/* ppc implementation requires that */
6847 					/* submap's pmap address ranges line */
6848 					/* up with parent map */
6849 #ifdef notdef
6850 					pmap_addr = sub_start;
6851 #endif
6852 					pmap_addr = s;
6853 				} else {
6854 					pmap = map->pmap;
6855 					pmap_addr = s;
6856 				}
6857 
6858 				if (entry->wired_count) {
6859 					if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6860 						goto done;
6861 					}
6862 
6863 					/*
6864 					 * The map was not unlocked:
6865 					 * no need to goto re-lookup.
6866 					 * Just go directly to next entry.
6867 					 */
6868 					entry = entry->vme_next;
6869 					s = entry->vme_start;
6870 					continue;
6871 				}
6872 
6873 				/* call vm_map_lookup_and_lock_object */
6874 				/* to cause any needs_copy to be */
6875 				/* evaluated */
6876 				local_start = entry->vme_start;
6877 				lookup_map = map;
6878 				vm_map_lock_write_to_read(map);
6879 				rc = vm_map_lookup_and_lock_object(
6880 					&lookup_map, local_start,
6881 					(access_type | extra_prots),
6882 					OBJECT_LOCK_EXCLUSIVE,
6883 					&version, &object,
6884 					&offset, &prot, &wired,
6885 					NULL,
6886 					&real_map, NULL);
6887 				if (rc != KERN_SUCCESS) {
6888 					vm_map_unlock_read(lookup_map);
6889 					assert(map_pmap == NULL);
6890 					vm_map_unwire(map, start,
6891 					    s, user_wire);
6892 					return rc;
6893 				}
6894 				vm_object_unlock(object);
6895 				if (real_map != lookup_map) {
6896 					vm_map_unlock(real_map);
6897 				}
6898 				vm_map_unlock_read(lookup_map);
6899 				vm_map_lock(map);
6900 
6901 				/* we unlocked, so must re-lookup */
6902 				if (!vm_map_lookup_entry(map,
6903 				    local_start,
6904 				    &local_entry)) {
6905 					rc = KERN_FAILURE;
6906 					goto done;
6907 				}
6908 
6909 				/*
6910 				 * entry could have been "simplified",
6911 				 * so re-clip
6912 				 */
6913 				entry = local_entry;
6914 				assert(s == local_start);
6915 				vm_map_clip_start(map, entry, s);
6916 				vm_map_clip_end(map, entry, end);
6917 				/* re-compute "e" */
6918 				e = entry->vme_end;
6919 				if (e > end) {
6920 					e = end;
6921 				}
6922 
6923 				/* did we have a change of type? */
6924 				if (!entry->is_sub_map) {
6925 					last_timestamp = map->timestamp;
6926 					continue;
6927 				}
6928 			} else {
6929 				local_start = entry->vme_start;
6930 				pmap = map_pmap;
6931 			}
6932 
6933 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6934 				goto done;
6935 			}
6936 
6937 			entry->in_transition = TRUE;
6938 
6939 			vm_map_unlock(map);
6940 			rc = vm_map_wire_nested(VME_SUBMAP(entry),
6941 			    sub_start, sub_end,
6942 			    caller_prot, tag,
6943 			    user_wire, pmap, pmap_addr,
6944 			    NULL);
6945 			vm_map_lock(map);
6946 
6947 			/*
6948 			 * Find the entry again.  It could have been clipped
6949 			 * after we unlocked the map.
6950 			 */
6951 			if (!vm_map_lookup_entry(map, local_start,
6952 			    &first_entry)) {
6953 				panic("vm_map_wire: re-lookup failed");
6954 			}
6955 			entry = first_entry;
6956 
6957 			assert(local_start == s);
6958 			/* re-compute "e" */
6959 			e = entry->vme_end;
6960 			if (e > end) {
6961 				e = end;
6962 			}
6963 
6964 			last_timestamp = map->timestamp;
6965 			while ((entry != vm_map_to_entry(map)) &&
6966 			    (entry->vme_start < e)) {
6967 				assert(entry->in_transition);
6968 				entry->in_transition = FALSE;
6969 				if (entry->needs_wakeup) {
6970 					entry->needs_wakeup = FALSE;
6971 					need_wakeup = TRUE;
6972 				}
6973 				if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6974 					subtract_wire_counts(map, entry, user_wire);
6975 				}
6976 				entry = entry->vme_next;
6977 			}
6978 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
6979 				goto done;
6980 			}
6981 
6982 			/* no need to relookup again */
6983 			s = entry->vme_start;
6984 			continue;
6985 		}
6986 
6987 		/*
6988 		 * If this entry is already wired then increment
6989 		 * the appropriate wire reference count.
6990 		 */
6991 		if (entry->wired_count) {
6992 			if ((entry->protection & access_type) != access_type) {
6993 				/* found a protection problem */
6994 
6995 				/*
6996 				 * XXX FBDP
6997 				 * We should always return an error
6998 				 * in this case but since we didn't
6999 				 * enforce it before, let's do
7000 				 * it only for the new "wire_and_extract"
7001 				 * code path for now...
7002 				 */
7003 				if (wire_and_extract) {
7004 					rc = KERN_PROTECTION_FAILURE;
7005 					goto done;
7006 				}
7007 			}
7008 
7009 			/*
7010 			 * entry is already wired down, get our reference
7011 			 * after clipping to our range.
7012 			 */
7013 			vm_map_clip_start(map, entry, s);
7014 			vm_map_clip_end(map, entry, end);
7015 
7016 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
7017 				goto done;
7018 			}
7019 
7020 			if (wire_and_extract) {
7021 				vm_object_t             object;
7022 				vm_object_offset_t      offset;
7023 				vm_page_t               m;
7024 
7025 				/*
7026 				 * We don't have to "wire" the page again
7027 				 * but we still have to "extract" its
7028 				 * physical page number, after some sanity
7029 				 * checks.
7030 				 */
7031 				assert((entry->vme_end - entry->vme_start)
7032 				    == PAGE_SIZE);
7033 				assert(!entry->needs_copy);
7034 				assert(!entry->is_sub_map);
7035 				assert(VME_OBJECT(entry));
7036 				if (((entry->vme_end - entry->vme_start)
7037 				    != PAGE_SIZE) ||
7038 				    entry->needs_copy ||
7039 				    entry->is_sub_map ||
7040 				    VME_OBJECT(entry) == VM_OBJECT_NULL) {
7041 					rc = KERN_INVALID_ARGUMENT;
7042 					goto done;
7043 				}
7044 
7045 				object = VME_OBJECT(entry);
7046 				offset = VME_OFFSET(entry);
7047 				/* need exclusive lock to update m->dirty */
7048 				if (entry->protection & VM_PROT_WRITE) {
7049 					vm_object_lock(object);
7050 				} else {
7051 					vm_object_lock_shared(object);
7052 				}
7053 				m = vm_page_lookup(object, offset);
7054 				assert(m != VM_PAGE_NULL);
7055 				assert(VM_PAGE_WIRED(m));
7056 				if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
7057 					*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
7058 					if (entry->protection & VM_PROT_WRITE) {
7059 						vm_object_lock_assert_exclusive(
7060 							object);
7061 						m->vmp_dirty = TRUE;
7062 					}
7063 				} else {
7064 					/* not already wired !? */
7065 					*physpage_p = 0;
7066 				}
7067 				vm_object_unlock(object);
7068 			}
7069 
7070 			/* map was not unlocked: no need to relookup */
7071 			entry = entry->vme_next;
7072 			s = entry->vme_start;
7073 			continue;
7074 		}
7075 
7076 		/*
7077 		 * Unwired entry or wire request transmitted via submap
7078 		 */
7079 
7080 		/*
7081 		 * Wiring would copy the pages to the shadow object.
7082 		 * The shadow object would not be code-signed so
7083 		 * attempting to execute code from these copied pages
7084 		 * would trigger a code-signing violation.
7085 		 */
7086 
7087 		if ((entry->protection & VM_PROT_EXECUTE)
7088 #if XNU_TARGET_OS_OSX
7089 		    &&
7090 		    map->pmap != kernel_pmap &&
7091 		    (vm_map_cs_enforcement(map)
7092 #if __arm64__
7093 		    || !VM_MAP_IS_EXOTIC(map)
7094 #endif /* __arm64__ */
7095 		    )
7096 #endif /* XNU_TARGET_OS_OSX */
7097 #if CODE_SIGNING_MONITOR
7098 		    &&
7099 		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
7100 #endif
7101 		    ) {
7102 #if MACH_ASSERT
7103 			printf("pid %d[%s] wiring executable range from "
7104 			    "0x%llx to 0x%llx: rejected to preserve "
7105 			    "code-signing\n",
7106 			    proc_selfpid(),
7107 			    (get_bsdtask_info(current_task())
7108 			    ? proc_name_address(get_bsdtask_info(current_task()))
7109 			    : "?"),
7110 			    (uint64_t) entry->vme_start,
7111 			    (uint64_t) entry->vme_end);
7112 #endif /* MACH_ASSERT */
7113 			DTRACE_VM2(cs_executable_wire,
7114 			    uint64_t, (uint64_t)entry->vme_start,
7115 			    uint64_t, (uint64_t)entry->vme_end);
7116 			cs_executable_wire++;
7117 			rc = KERN_PROTECTION_FAILURE;
7118 			goto done;
7119 		}
7120 
7121 		/*
7122 		 * Perform actions of vm_map_lookup that need the write
7123 		 * lock on the map: create a shadow object for a
7124 		 * copy-on-write region, or an object for a zero-fill
7125 		 * region.
7126 		 */
7127 		size = entry->vme_end - entry->vme_start;
7128 		/*
7129 		 * If wiring a copy-on-write page, we need to copy it now
7130 		 * even if we're only (currently) requesting read access.
7131 		 * This is aggressive, but once it's wired we can't move it.
7132 		 */
7133 		if (entry->needs_copy) {
7134 			if (wire_and_extract) {
7135 				/*
7136 				 * We're supposed to share with the original
7137 				 * provider so should not be "needs_copy"
7138 				 */
7139 				rc = KERN_INVALID_ARGUMENT;
7140 				goto done;
7141 			}
7142 
7143 			VME_OBJECT_SHADOW(entry, size,
7144 			    vm_map_always_shadow(map));
7145 			entry->needs_copy = FALSE;
7146 		} else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
7147 			if (wire_and_extract) {
7148 				/*
7149 				 * We're supposed to share with the original
7150 				 * provider so should already have an object.
7151 				 */
7152 				rc = KERN_INVALID_ARGUMENT;
7153 				goto done;
7154 			}
7155 			VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
7156 			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
7157 			assert(entry->use_pmap);
7158 		} else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7159 			if (wire_and_extract) {
7160 				/*
7161 				 * We're supposed to share with the original
7162 				 * provider so should not be COPY_SYMMETRIC.
7163 				 */
7164 				rc = KERN_INVALID_ARGUMENT;
7165 				goto done;
7166 			}
7167 			/*
7168 			 * Force an unrequested "copy-on-write" but only for
7169 			 * the range we're wiring.
7170 			 */
7171 //			printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
7172 			vm_map_clip_start(map, entry, s);
7173 			vm_map_clip_end(map, entry, end);
7174 			/* recompute "size" */
7175 			size = entry->vme_end - entry->vme_start;
7176 			/* make a shadow object */
7177 			vm_object_t orig_object;
7178 			vm_object_offset_t orig_offset;
7179 			orig_object = VME_OBJECT(entry);
7180 			orig_offset = VME_OFFSET(entry);
7181 			VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
7182 			if (VME_OBJECT(entry) != orig_object) {
7183 				/*
7184 				 * This mapping has not been shared (or it would be
7185 				 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
7186 				 * not been copied-on-write (or it would be marked
7187 				 * as "needs_copy" and would have been handled above
7188 				 * and also already write-protected).
7189 				 * We still need to write-protect here to prevent
7190 				 * other threads from modifying these pages while
7191 				 * we're in the process of copying and wiring
7192 				 * the copied pages.
7193 				 * Since the mapping is neither shared nor COWed,
7194 				 * we only need to write-protect the PTEs for this
7195 				 * mapping.
7196 				 */
7197 				vm_object_pmap_protect(orig_object,
7198 				    orig_offset,
7199 				    size,
7200 				    map->pmap,
7201 				    VM_MAP_PAGE_SIZE(map),
7202 				    entry->vme_start,
7203 				    entry->protection & ~VM_PROT_WRITE);
7204 			}
7205 		}
7206 		if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7207 			/*
7208 			 * Make the object COPY_DELAY to get a stable object
7209 			 * to wire.
7210 			 * That should avoid creating long shadow chains while
7211 			 * wiring/unwiring the same range repeatedly.
7212 			 * That also prevents part of the object from being
7213 			 * wired while another part is "needs_copy", which
7214 			 * could result in conflicting rules wrt copy-on-write.
7215 			 */
7216 			vm_object_t object;
7217 
7218 			object = VME_OBJECT(entry);
7219 			vm_object_lock(object);
7220 			if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7221 				assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
7222 				    "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
7223 				    object, (uint64_t)object->vo_size,
7224 				    entry,
7225 				    (uint64_t)entry->vme_start,
7226 				    (uint64_t)entry->vme_end,
7227 				    (uint64_t)VME_OFFSET(entry),
7228 				    (uint64_t)size);
7229 				assertf(object->ref_count == 1,
7230 				    "object %p ref_count %d\n",
7231 				    object, object->ref_count);
7232 				assertf(!entry->needs_copy,
7233 				    "entry %p\n", entry);
7234 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7235 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
7236 			}
7237 			vm_object_unlock(object);
7238 		}
7239 
7240 		vm_map_clip_start(map, entry, s);
7241 		vm_map_clip_end(map, entry, end);
7242 
7243 		/* re-compute "e" */
7244 		e = entry->vme_end;
7245 		if (e > end) {
7246 			e = end;
7247 		}
7248 
7249 		/*
7250 		 * Check for holes and protection mismatch.
7251 		 * Holes: Next entry should be contiguous unless this
7252 		 *	  is the end of the region.
7253 		 * Protection: Access requested must be allowed, unless
7254 		 *	wiring is by protection class
7255 		 */
7256 		if ((entry->vme_end < end) &&
7257 		    ((entry->vme_next == vm_map_to_entry(map)) ||
7258 		    (entry->vme_next->vme_start > entry->vme_end))) {
7259 			/* found a hole */
7260 			rc = KERN_INVALID_ADDRESS;
7261 			goto done;
7262 		}
7263 		if ((entry->protection & access_type) != access_type) {
7264 			/* found a protection problem */
7265 			rc = KERN_PROTECTION_FAILURE;
7266 			goto done;
7267 		}
7268 
7269 		assert(entry->wired_count == 0 && entry->user_wired_count == 0);
7270 
7271 		if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
7272 			goto done;
7273 		}
7274 
7275 		entry->in_transition = TRUE;
7276 
7277 		/*
7278 		 * This entry might get split once we unlock the map.
7279 		 * In vm_fault_wire(), we need the current range as
7280 		 * defined by this entry.  In order for this to work
7281 		 * along with a simultaneous clip operation, we make a
7282 		 * temporary copy of this entry and use that for the
7283 		 * wiring.  Note that the underlying objects do not
7284 		 * change during a clip.
7285 		 */
7286 		tmp_entry = *entry;
7287 
7288 		/*
7289 		 * The in_transition state guarantees that the entry
7290 		 * (or entries for this range, if a split occurred) will be
7291 		 * there when the map lock is acquired for the second time.
7292 		 */
7293 		vm_map_unlock(map);
7294 
7295 		if (!user_wire && cur_thread != THREAD_NULL) {
7296 			interruptible_state = thread_interrupt_level(THREAD_UNINT);
7297 		} else {
7298 			interruptible_state = THREAD_UNINT;
7299 		}
7300 
7301 		if (map_pmap) {
7302 			rc = vm_fault_wire(map,
7303 			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
7304 			    physpage_p);
7305 		} else {
7306 			rc = vm_fault_wire(map,
7307 			    &tmp_entry, caller_prot, tag, map->pmap,
7308 			    tmp_entry.vme_start,
7309 			    physpage_p);
7310 		}
7311 
7312 		if (!user_wire && cur_thread != THREAD_NULL) {
7313 			thread_interrupt_level(interruptible_state);
7314 		}
7315 
7316 		vm_map_lock(map);
7317 
7318 		if (last_timestamp + 1 != map->timestamp) {
7319 			/*
7320 			 * Find the entry again.  It could have been clipped
7321 			 * after we unlocked the map.
7322 			 */
7323 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7324 			    &first_entry)) {
7325 				panic("vm_map_wire: re-lookup failed");
7326 			}
7327 
7328 			entry = first_entry;
7329 		}
7330 
7331 		last_timestamp = map->timestamp;
7332 
7333 		while ((entry != vm_map_to_entry(map)) &&
7334 		    (entry->vme_start < tmp_entry.vme_end)) {
7335 			assert(entry->in_transition);
7336 			entry->in_transition = FALSE;
7337 			if (entry->needs_wakeup) {
7338 				entry->needs_wakeup = FALSE;
7339 				need_wakeup = TRUE;
7340 			}
7341 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
7342 				subtract_wire_counts(map, entry, user_wire);
7343 			}
7344 			entry = entry->vme_next;
7345 		}
7346 
7347 		if (rc != KERN_SUCCESS) {               /* from vm_*_wire */
7348 			goto done;
7349 		}
7350 
7351 		if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
7352 		    (tmp_entry.vme_end != end) &&    /* AND, we are not at the end of the requested range */
7353 		    (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
7354 			/* found a "new" hole */
7355 			s = tmp_entry.vme_end;
7356 			rc = KERN_INVALID_ADDRESS;
7357 			goto done;
7358 		}
7359 
7360 		s = entry->vme_start;
7361 	} /* end while loop through map entries */
7362 
7363 done:
7364 	if (rc == KERN_SUCCESS) {
7365 		/* repair any damage we may have made to the VM map */
7366 		vm_map_simplify_range(map, start, end);
7367 	}
7368 
7369 	vm_map_unlock(map);
7370 
7371 	/*
7372 	 * wake up anybody waiting on entries we wired.
7373 	 */
7374 	if (need_wakeup) {
7375 		vm_map_entry_wakeup(map);
7376 	}
7377 
7378 	if (rc != KERN_SUCCESS) {
7379 		/* undo what has been wired so far */
7380 		vm_map_unwire_nested(map, start, s, user_wire,
7381 		    map_pmap, pmap_addr);
7382 		if (physpage_p) {
7383 			*physpage_p = 0;
7384 		}
7385 	}
7386 
7387 	return rc;
7388 }
7389 
7390 kern_return_t
7391 vm_map_wire_external(
7392 	vm_map_t                map,
7393 	vm_map_offset_t         start,
7394 	vm_map_offset_t         end,
7395 	vm_prot_t               caller_prot,
7396 	boolean_t               user_wire)
7397 {
7398 	kern_return_t   kret;
7399 
7400 	kret = vm_map_wire_nested(map, start, end, caller_prot, vm_tag_bt(),
7401 	    user_wire, (pmap_t)NULL, 0, NULL);
7402 	return kret;
7403 }
7404 
7405 kern_return_t
7406 vm_map_wire_kernel(
7407 	vm_map_t                map,
7408 	vm_map_offset_t         start,
7409 	vm_map_offset_t         end,
7410 	vm_prot_t               caller_prot,
7411 	vm_tag_t                tag,
7412 	boolean_t               user_wire)
7413 {
7414 	kern_return_t   kret;
7415 
7416 	kret = vm_map_wire_nested(map, start, end, caller_prot, tag,
7417 	    user_wire, (pmap_t)NULL, 0, NULL);
7418 	return kret;
7419 }
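
/*
 * Usage sketch (illustrative only; VM_KERN_MEMORY_FILE is just an
 * example vm_tag_t): a kernel caller pinning a page-aligned range
 * pairs the wire and unwire entry points:
 *
 *	kern_return_t kr;
 *
 *	kr = vm_map_wire_kernel(map, start, end,
 *	    VM_PROT_READ | VM_PROT_WRITE,
 *	    VM_KERN_MEMORY_FILE,
 *	    FALSE);	// user_wire=FALSE: kernel wiring reference
 *	if (kr == KERN_SUCCESS) {
 *		// ... access the pinned range without faults ...
 *		kr = vm_map_unwire(map, start, end, FALSE);
 *	}
 *
 * With user_wire=TRUE (e.g. servicing mlock()), the user_wired_count
 * accounting and the limit checks in add_wire_counts() apply instead.
 */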
7420 
7421 kern_return_t
7422 vm_map_wire_and_extract_external(
7423 	vm_map_t        map,
7424 	vm_map_offset_t start,
7425 	vm_prot_t       caller_prot,
7426 	boolean_t       user_wire,
7427 	ppnum_t         *physpage_p)
7428 {
7429 	kern_return_t   kret;
7430 
7431 	kret = vm_map_wire_nested(map,
7432 	    start,
7433 	    start + VM_MAP_PAGE_SIZE(map),
7434 	    caller_prot,
7435 	    vm_tag_bt(),
7436 	    user_wire,
7437 	    (pmap_t)NULL,
7438 	    0,
7439 	    physpage_p);
7440 	if (kret != KERN_SUCCESS &&
7441 	    physpage_p != NULL) {
7442 		*physpage_p = 0;
7443 	}
7444 	return kret;
7445 }
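
/*
 * Usage sketch (illustrative only): wire exactly one VM map page and
 * extract its physical page number:
 *
 *	ppnum_t pn = 0;
 *	kern_return_t kr;
 *
 *	kr = vm_map_wire_and_extract_external(map, page_addr,
 *	    VM_PROT_READ | VM_PROT_WRITE, FALSE, &pn);
 *	if (kr == KERN_SUCCESS && pn != 0) {
 *		// pn is the physical page number of the wired page
 *	}
 *
 * The range is implicitly a single VM map page: vm_map_wire_nested()
 * returns KERN_INVALID_ARGUMENT if physpage_p is supplied for a wider
 * range, and *physpage_p is cleared on any failure.
 */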
7446 
7447 /*
7448  *	vm_map_unwire:
7449  *
7450  *	Sets the pageability of the specified address range in the target
7451  *	Sets the pageability of the specified address range in the target map
7452  *
7453  *	The map must not be locked, but a reference must remain to the map
7454  *	throughout the call.
7455  *
7456  *	The kernel will panic on failures.  User unwire ignores holes and
7457  *	unwired and in-transition entries to avoid losing memory by leaving
7458  *	it unwired.
7459  */
7460 static kern_return_t
7461 vm_map_unwire_nested(
7462 	vm_map_t                map,
7463 	vm_map_offset_t         start,
7464 	vm_map_offset_t         end,
7465 	boolean_t               user_wire,
7466 	pmap_t                  map_pmap,
7467 	vm_map_offset_t         pmap_addr)
7468 {
7469 	vm_map_entry_t          entry;
7470 	struct vm_map_entry     *first_entry, tmp_entry;
7471 	boolean_t               need_wakeup;
7472 	boolean_t               main_map = FALSE;
7473 	unsigned int            last_timestamp;
7474 
7475 	vm_map_lock(map);
7476 	if (map_pmap == NULL) {
7477 		main_map = TRUE;
7478 	}
7479 	last_timestamp = map->timestamp;
7480 
7481 	VM_MAP_RANGE_CHECK(map, start, end);
7482 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
7483 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
7484 
7485 	if (start == end) {
7486 		/* We unwired what the caller asked for: zero pages */
7487 		vm_map_unlock(map);
7488 		return KERN_SUCCESS;
7489 	}
7490 
7491 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
7492 		vm_map_unlock(map);
7493 		return KERN_INVALID_ADDRESS;
7494 	}
7495 
7496 	if (vm_map_lookup_entry(map, start, &first_entry)) {
7497 		entry = first_entry;
7498 		/*
7499 		 * vm_map_clip_start will be done later.
7500 		 * We don't want to unnest any nested submaps here!
7501 		 */
7502 	} else {
7503 		if (!user_wire) {
7504 			panic("vm_map_unwire: start not found");
7505 		}
7506 		/*	Start address is not in map. */
7507 		vm_map_unlock(map);
7508 		return KERN_INVALID_ADDRESS;
7509 	}
7510 
7511 	if (entry->superpage_size) {
7512 		/* superpages are always wired */
7513 		vm_map_unlock(map);
7514 		return KERN_INVALID_ADDRESS;
7515 	}
7516 
7517 	need_wakeup = FALSE;
7518 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
7519 		if (entry->in_transition) {
7520 			/*
7521 			 * 1)
7522 			 * Another thread is wiring down this entry.  Were it
7523 			 * not for that other thread, we would be unwiring
7524 			 * an unwired entry, which is not
7525 			 * permitted.  If we wait, we will be unwiring memory
7526 			 * we did not wire.
7527 			 *
7528 			 * 2)
7529 			 * Another thread is unwiring this entry.  We did not
7530 			 * have a reference to it; if we did, this
7531 			 * entry would not be getting unwired now.
7532 			 */
7533 			if (!user_wire) {
7534 				/*
7535 				 * XXX FBDP
7536 				 * This could happen:  there could be some
7537 				 * overlapping vslock/vsunlock operations
7538 				 * going on.
7539 				 * We should probably just wait and retry,
7540 				 * but then we have to be careful that this
7541 				 * entry could get "simplified" after
7542 				 * "in_transition" gets unset and before
7543 				 * we re-lookup the entry, so we would
7544 				 * have to re-clip the entry to avoid
7545 				 * re-unwiring what we have already unwired...
7546 				 * See vm_map_wire_nested().
7547 				 *
7548 				 * Or we could just ignore "in_transition"
7549 				 * here and proceed to decrement the wired
7550 				 * count(s) on this entry.  That should be fine
7551 				 * as long as "wired_count" doesn't drop all
7552 				 * the way to 0 (and we should panic if THAT
7553 				 * happens).
7554 				 */
7555 				panic("vm_map_unwire: in_transition entry");
7556 			}
7557 
7558 			entry = entry->vme_next;
7559 			continue;
7560 		}
7561 
7562 		if (entry->is_sub_map) {
7563 			vm_map_offset_t sub_start;
7564 			vm_map_offset_t sub_end;
7565 			vm_map_offset_t local_end;
7566 			pmap_t          pmap;
7567 
7568 			vm_map_clip_start(map, entry, start);
7569 			vm_map_clip_end(map, entry, end);
7570 
7571 			sub_start = VME_OFFSET(entry);
7572 			sub_end = entry->vme_end - entry->vme_start;
7573 			sub_end += VME_OFFSET(entry);
7574 			local_end = entry->vme_end;
7575 			if (map_pmap == NULL) {
7576 				if (entry->use_pmap) {
7577 					pmap = VME_SUBMAP(entry)->pmap;
7578 					pmap_addr = sub_start;
7579 				} else {
7580 					pmap = map->pmap;
7581 					pmap_addr = start;
7582 				}
7583 				if (entry->wired_count == 0 ||
7584 				    (user_wire && entry->user_wired_count == 0)) {
7585 					if (!user_wire) {
7586 						panic("vm_map_unwire: entry is unwired");
7587 					}
7588 					entry = entry->vme_next;
7589 					continue;
7590 				}
7591 
7592 				/*
7593 				 * Check for holes
7594 				 * Holes: Next entry should be contiguous unless
7595 				 * this is the end of the region.
7596 				 */
7597 				if (((entry->vme_end < end) &&
7598 				    ((entry->vme_next == vm_map_to_entry(map)) ||
7599 				    (entry->vme_next->vme_start
7600 				    > entry->vme_end)))) {
7601 					if (!user_wire) {
7602 						panic("vm_map_unwire: non-contiguous region");
7603 					}
7604 /*
7605  *                                       entry = entry->vme_next;
7606  *                                       continue;
7607  */
7608 				}
7609 
7610 				subtract_wire_counts(map, entry, user_wire);
7611 
7612 				if (entry->wired_count != 0) {
7613 					entry = entry->vme_next;
7614 					continue;
7615 				}
7616 
7617 				entry->in_transition = TRUE;
7618 				tmp_entry = *entry;/* see comment in vm_map_wire() */
7619 
7620 				/*
7621 				 * We can unlock the map now. The in_transition state
7622 				 * guarantees existence of the entry.
7623 				 */
7624 				vm_map_unlock(map);
7625 				vm_map_unwire_nested(VME_SUBMAP(entry),
7626 				    sub_start, sub_end, user_wire, pmap, pmap_addr);
7627 				vm_map_lock(map);
7628 
7629 				if (last_timestamp + 1 != map->timestamp) {
7630 					/*
7631 					 * Find the entry again.  It could have been
7632 					 * clipped or deleted after we unlocked the map.
7633 					 */
7634 					if (!vm_map_lookup_entry(map,
7635 					    tmp_entry.vme_start,
7636 					    &first_entry)) {
7637 						if (!user_wire) {
7638 							panic("vm_map_unwire: re-lookup failed");
7639 						}
7640 						entry = first_entry->vme_next;
7641 					} else {
7642 						entry = first_entry;
7643 					}
7644 				}
7645 				last_timestamp = map->timestamp;
7646 
7647 				/*
7648 				 * clear transition bit for all constituent entries
7649 				 * that were in the original entry (saved in
7650 				 * tmp_entry).  Also check for waiters.
7651 				 */
7652 				while ((entry != vm_map_to_entry(map)) &&
7653 				    (entry->vme_start < tmp_entry.vme_end)) {
7654 					assert(entry->in_transition);
7655 					entry->in_transition = FALSE;
7656 					if (entry->needs_wakeup) {
7657 						entry->needs_wakeup = FALSE;
7658 						need_wakeup = TRUE;
7659 					}
7660 					entry = entry->vme_next;
7661 				}
7662 				continue;
7663 			} else {
7664 				tmp_entry = *entry;
7665 				vm_map_unlock(map);
7666 				vm_map_unwire_nested(VME_SUBMAP(entry),
7667 				    sub_start, sub_end, user_wire, map_pmap,
7668 				    pmap_addr);
7669 				vm_map_lock(map);
7670 
7671 				if (last_timestamp + 1 != map->timestamp) {
7672 					/*
7673 					 * Find the entry again.  It could have been
7674 					 * clipped or deleted after we unlocked the map.
7675 					 */
7676 					if (!vm_map_lookup_entry(map,
7677 					    tmp_entry.vme_start,
7678 					    &first_entry)) {
7679 						if (!user_wire) {
7680 							panic("vm_map_unwire: re-lookup failed");
7681 						}
7682 						entry = first_entry->vme_next;
7683 					} else {
7684 						entry = first_entry;
7685 					}
7686 				}
7687 				last_timestamp = map->timestamp;
7688 			}
7689 		}
7690 
7691 
7692 		if ((entry->wired_count == 0) ||
7693 		    (user_wire && entry->user_wired_count == 0)) {
7694 			if (!user_wire) {
7695 				panic("vm_map_unwire: entry is unwired");
7696 			}
7697 
7698 			entry = entry->vme_next;
7699 			continue;
7700 		}
7701 
7702 		assert(entry->wired_count > 0 &&
7703 		    (!user_wire || entry->user_wired_count > 0));
7704 
7705 		vm_map_clip_start(map, entry, start);
7706 		vm_map_clip_end(map, entry, end);
7707 
7708 		/*
7709 		 * Check for holes
7710 		 * Holes: Next entry should be contiguous unless
7711 		 *	  this is the end of the region.
7712 		 */
7713 		if (((entry->vme_end < end) &&
7714 		    ((entry->vme_next == vm_map_to_entry(map)) ||
7715 		    (entry->vme_next->vme_start > entry->vme_end)))) {
7716 			if (!user_wire) {
7717 				panic("vm_map_unwire: non-contiguous region");
7718 			}
7719 			entry = entry->vme_next;
7720 			continue;
7721 		}
7722 
7723 		subtract_wire_counts(map, entry, user_wire);
7724 
7725 		if (entry->wired_count != 0) {
7726 			entry = entry->vme_next;
7727 			continue;
7728 		}
7729 
7730 		if (entry->zero_wired_pages) {
7731 			entry->zero_wired_pages = FALSE;
7732 		}
7733 
7734 		entry->in_transition = TRUE;
7735 		tmp_entry = *entry;     /* see comment in vm_map_wire() */
7736 
7737 		/*
7738 		 * We can unlock the map now. The in_transition state
7739 		 * guarantees existence of the entry.
7740 		 */
7741 		vm_map_unlock(map);
7742 		if (map_pmap) {
7743 			vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
7744 			    pmap_addr, tmp_entry.vme_end);
7745 		} else {
7746 			vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
7747 			    tmp_entry.vme_start, tmp_entry.vme_end);
7748 		}
7749 		vm_map_lock(map);
7750 
7751 		if (last_timestamp + 1 != map->timestamp) {
7752 			/*
7753 			 * Find the entry again.  It could have been clipped
7754 			 * or deleted after we unlocked the map.
7755 			 */
7756 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7757 			    &first_entry)) {
7758 				if (!user_wire) {
7759 					panic("vm_map_unwire: re-lookup failed");
7760 				}
7761 				entry = first_entry->vme_next;
7762 			} else {
7763 				entry = first_entry;
7764 			}
7765 		}
7766 		last_timestamp = map->timestamp;
7767 
7768 		/*
7769 		 * clear transition bit for all constituent entries that
7770 		 * were in the original entry (saved in tmp_entry).  Also
7771 		 * check for waiters.
7772 		 */
7773 		while ((entry != vm_map_to_entry(map)) &&
7774 		    (entry->vme_start < tmp_entry.vme_end)) {
7775 			assert(entry->in_transition);
7776 			entry->in_transition = FALSE;
7777 			if (entry->needs_wakeup) {
7778 				entry->needs_wakeup = FALSE;
7779 				need_wakeup = TRUE;
7780 			}
7781 			entry = entry->vme_next;
7782 		}
7783 	}
7784 
7785 	/*
7786 	 * We might have fragmented the address space when we wired this
7787 	 * range of addresses.  Attempt to re-coalesce these VM map entries
7788 	 * with their neighbors now that they're no longer wired.
7789 	 * Under some circumstances, address space fragmentation can
7790 	 * prevent VM object shadow chain collapsing, which can cause
7791 	 * swap space leaks.
7792 	 */
7793 	vm_map_simplify_range(map, start, end);
7794 
7795 	vm_map_unlock(map);
7796 	/*
7797 	 * wake up anybody waiting on entries that we have unwired.
7798 	 */
7799 	if (need_wakeup) {
7800 		vm_map_entry_wakeup(map);
7801 	}
7802 	return KERN_SUCCESS;
7803 }
7804 
7805 kern_return_t
7806 vm_map_unwire(
7807 	vm_map_t                map,
7808 	vm_map_offset_t         start,
7809 	vm_map_offset_t         end,
7810 	boolean_t               user_wire)
7811 {
7812 	return vm_map_unwire_nested(map, start, end,
7813 	           user_wire, (pmap_t)NULL, 0);
7814 }
7815 
7816 
7817 /*
7818  *	vm_map_entry_zap:	[ internal use only ]
7819  *
7820  *	Remove the entry from the target map
7821  *	and put it on a zap list.
7822  */
7823 static void
7824 vm_map_entry_zap(
7825 	vm_map_t                map,
7826 	vm_map_entry_t          entry,
7827 	vm_map_zap_t            zap)
7828 {
7829 	vm_map_offset_t s, e;
7830 
7831 	s = entry->vme_start;
7832 	e = entry->vme_end;
7833 	assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7834 	assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7835 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7836 		assert(page_aligned(s));
7837 		assert(page_aligned(e));
7838 	}
7839 	if (entry->map_aligned == TRUE) {
7840 		assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7841 		assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7842 	}
7843 	assert(entry->wired_count == 0);
7844 	assert(entry->user_wired_count == 0);
7845 	assert(!entry->vme_permanent);
7846 
7847 	vm_map_store_entry_unlink(map, entry, false);
7848 	map->size -= e - s;
7849 
7850 	vm_map_zap_append(zap, entry);
7851 }
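
/*
 * A zap list lets the caller unlink entries while still holding the
 * map lock and defer the actual teardown until the lock is dropped.
 * Typical shape (a sketch, assuming the companion zap-list helpers
 * used elsewhere in this file):
 *
 *	VM_MAP_ZAP_DECLARE(zap_list);
 *
 *	vm_map_lock(map);
 *	// ... find doomed entries ...
 *	vm_map_entry_zap(map, entry, &zap_list);
 *	vm_map_unlock(map);
 *	vm_map_zap_dispose(&zap_list);
 */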
7852 
7853 static void
7854 vm_map_submap_pmap_clean(
7855 	vm_map_t        map,
7856 	vm_map_offset_t start,
7857 	vm_map_offset_t end,
7858 	vm_map_t        sub_map,
7859 	vm_map_offset_t offset)
7860 {
7861 	vm_map_offset_t submap_start;
7862 	vm_map_offset_t submap_end;
7863 	vm_map_size_t   remove_size;
7864 	vm_map_entry_t  entry;
7865 
7866 	submap_end = offset + (end - start);
7867 	submap_start = offset;
7868 
7869 	vm_map_lock_read(sub_map);
7870 	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
7871 		remove_size = (entry->vme_end - entry->vme_start);
7872 		if (offset > entry->vme_start) {
7873 			remove_size -= offset - entry->vme_start;
7874 		}
7875 
7876 
7877 		if (submap_end < entry->vme_end) {
7878 			remove_size -=
7879 			    entry->vme_end - submap_end;
7880 		}
7881 		if (entry->is_sub_map) {
7882 			vm_map_submap_pmap_clean(
7883 				sub_map,
7884 				start,
7885 				start + remove_size,
7886 				VME_SUBMAP(entry),
7887 				VME_OFFSET(entry));
7888 		} else {
7889 			if (map->mapped_in_other_pmaps &&
7890 			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7891 			    VME_OBJECT(entry) != NULL) {
7892 				vm_object_pmap_protect_options(
7893 					VME_OBJECT(entry),
7894 					(VME_OFFSET(entry) +
7895 					offset -
7896 					entry->vme_start),
7897 					remove_size,
7898 					PMAP_NULL,
7899 					PAGE_SIZE,
7900 					entry->vme_start,
7901 					VM_PROT_NONE,
7902 					PMAP_OPTIONS_REMOVE);
7903 			} else {
7904 				pmap_remove(map->pmap,
7905 				    (addr64_t)start,
7906 				    (addr64_t)(start + remove_size));
7907 			}
7908 		}
7909 	}
7910 
7911 	entry = entry->vme_next;
7912 
7913 	while ((entry != vm_map_to_entry(sub_map))
7914 	    && (entry->vme_start < submap_end)) {
7915 		remove_size = (entry->vme_end - entry->vme_start);
7916 		if (submap_end < entry->vme_end) {
7917 			remove_size -= entry->vme_end - submap_end;
7918 		}
7919 		if (entry->is_sub_map) {
7920 			vm_map_submap_pmap_clean(
7921 				sub_map,
7922 				(start + entry->vme_start) - offset,
7923 				((start + entry->vme_start) - offset) + remove_size,
7924 				VME_SUBMAP(entry),
7925 				VME_OFFSET(entry));
7926 		} else {
7927 			if (map->mapped_in_other_pmaps &&
7928 			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7929 			    VME_OBJECT(entry) != NULL) {
7930 				vm_object_pmap_protect_options(
7931 					VME_OBJECT(entry),
7932 					VME_OFFSET(entry),
7933 					remove_size,
7934 					PMAP_NULL,
7935 					PAGE_SIZE,
7936 					entry->vme_start,
7937 					VM_PROT_NONE,
7938 					PMAP_OPTIONS_REMOVE);
7939 			} else {
7940 				pmap_remove(map->pmap,
7941 				    (addr64_t)((start + entry->vme_start)
7942 				    - offset),
7943 				    (addr64_t)(((start + entry->vme_start)
7944 				    - offset) + remove_size));
7945 			}
7946 		}
7947 		entry = entry->vme_next;
7948 	}
7949 	vm_map_unlock_read(sub_map);
7950 	return;
7951 }
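
/*
 * Address translation note (informal): for a parent range
 * [start, end) backed by this submap at offset "offset", a submap
 * entry beginning at entry->vme_start maps back to the parent
 * virtual address
 *
 *	parent_va = (start + entry->vme_start) - offset;
 *
 * which is exactly the base address passed to pmap_remove() in the
 * loop above.
 */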
7952 
7953 /*
7954  *     virt_memory_guard_ast:
7955  *
7956  *     Handle the AST callout for a virtual memory guard:
7957  *     raise an EXC_GUARD exception and terminate the task
7958  *     if configured to do so.
7959  */
7960 void
7961 virt_memory_guard_ast(
7962 	thread_t thread,
7963 	mach_exception_data_type_t code,
7964 	mach_exception_data_type_t subcode)
7965 {
7966 	task_t task = get_threadtask(thread);
7967 	assert(task != kernel_task);
7968 	assert(task == current_task());
7969 	kern_return_t sync_exception_result;
7970 	uint32_t behavior;
7971 
7972 	behavior = task->task_exc_guard;
7973 
7974 	/* Is delivery enabled */
7975 	if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7976 		return;
7977 	}
7978 
7979 	/* If only once, make sure we're that once */
7980 	while (behavior & TASK_EXC_GUARD_VM_ONCE) {
7981 		uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;
7982 
7983 		if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
7984 			break;
7985 		}
7986 		behavior = task->task_exc_guard;
7987 		if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7988 			return;
7989 		}
7990 	}
7991 
7992 	const bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
7993 	/* Raise exception synchronously and see if handler claimed it */
7994 	sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode, fatal);
7995 
7996 	if (fatal) {
7997 		/*
7998 		 * If Synchronous EXC_GUARD delivery was successful then
7999 		 * kill the process and return, else kill the process
8000 		 * and deliver the exception via EXC_CORPSE_NOTIFY.
8001 		 */
8002 		if (sync_exception_result == KERN_SUCCESS) {
8003 			task_bsdtask_kill(current_task());
8004 		} else {
8005 			exit_with_guard_exception(current_proc(), code, subcode);
8006 		}
8007 	} else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
8008 		/*
8009 		 * If the synchronous EXC_GUARD delivery was not successful,
8010 		 * raise a simulated crash.
8011 		 */
8012 		if (sync_exception_result != KERN_SUCCESS) {
8013 			task_violated_guard(code, subcode, NULL, FALSE);
8014 		}
8015 	}
8016 }
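
/*
 * Note on the "once" semantics (informal): when TASK_EXC_GUARD_VM_ONCE
 * is set, the compare-and-swap above atomically clears
 * TASK_EXC_GUARD_VM_DELIVER, so concurrent guard violations deliver at
 * most one EXC_GUARD; a thread that loses the race re-reads
 * task_exc_guard and returns once it sees delivery disabled.
 */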
8017 
8018 /*
8019  *     vm_map_guard_exception:
8020  *
8021  *     Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
8022  *
8023  *     Right now, we do this when we find nothing mapped, or a
8024  *     gap in the mapping when a user address space deallocate
8025  *     was requested. We report the address of the first gap found.
8026  */
8027 static void
8028 vm_map_guard_exception(
8029 	vm_map_offset_t gap_start,
8030 	unsigned reason)
8031 {
8032 	mach_exception_code_t code = 0;
8033 	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
8034 	unsigned int target = 0; /* should we pass in pid associated with map? */
8035 	mach_exception_data_type_t subcode = (uint64_t)gap_start;
8036 	boolean_t fatal = FALSE;
8037 
8038 	task_t task = current_task_early();
8039 
8040 	/* Can't deliver exceptions to a NULL task (early boot) or kernel task */
8041 	if (task == NULL || task == kernel_task) {
8042 		return;
8043 	}
8044 
8045 	EXC_GUARD_ENCODE_TYPE(code, guard_type);
8046 	EXC_GUARD_ENCODE_FLAVOR(code, reason);
8047 	EXC_GUARD_ENCODE_TARGET(code, target);
8048 
8049 	if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
8050 		fatal = TRUE;
8051 	}
8052 	thread_guard_violation(current_thread(), code, subcode, fatal);
8053 }
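
/*
 * Encoding sketch (illustrative): a deallocate that found its first
 * gap at 0x10a000 would raise EXC_GUARD with, conceptually:
 *
 *	code    = { type:   GUARD_TYPE_VIRT_MEMORY,
 *	            flavor: reason,	// e.g. a "dealloc gap" flavor
 *	            target: 0 };
 *	subcode = 0x10a000;		// address of the first gap
 *
 * The EXC_GUARD_ENCODE_* macros above pack those fields into the
 * 64-bit exception code; the reason value is whatever flavor the
 * caller detected.
 */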
8054 
8055 static kern_return_t
8056 vm_map_delete_submap_recurse(
8057 	vm_map_t submap,
8058 	vm_map_offset_t submap_start,
8059 	vm_map_offset_t submap_end)
8060 {
8061 	vm_map_entry_t submap_entry;
8062 
8063 	/*
8064 	 * Verify that the submap does not contain any "permanent" entries
8065 	 * within the specified range.
8066 	 * We do not care about gaps.
8067 	 */
8068 
8069 	vm_map_lock(submap);
8070 
8071 	if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
8072 		submap_entry = submap_entry->vme_next;
8073 	}
8074 
8075 	for (;
8076 	    submap_entry != vm_map_to_entry(submap) &&
8077 	    submap_entry->vme_start < submap_end;
8078 	    submap_entry = submap_entry->vme_next) {
8079 		if (submap_entry->vme_permanent) {
8080 			/* "permanent" entry -> fail */
8081 			vm_map_unlock(submap);
8082 			return KERN_PROTECTION_FAILURE;
8083 		}
8084 	}
8085 	/* no "permanent" entries in the range -> success */
8086 	vm_map_unlock(submap);
8087 	return KERN_SUCCESS;
8088 }
8089 
8090 __abortlike
8091 static void
8092 __vm_map_delete_misaligned_panic(
8093 	vm_map_t                map,
8094 	vm_map_offset_t         start,
8095 	vm_map_offset_t         end)
8096 {
8097 	panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
8098 	    map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
8099 }
8100 
8101 __abortlike
8102 static void
8103 __vm_map_delete_failed_panic(
8104 	vm_map_t                map,
8105 	vm_map_offset_t         start,
8106 	vm_map_offset_t         end,
8107 	kern_return_t           kr)
8108 {
8109 	panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
8110 	    map, (uint64_t)start, (uint64_t)end, kr);
8111 }
8112 
8113 __abortlike
8114 static void
8115 __vm_map_delete_gap_panic(
8116 	vm_map_t                map,
8117 	vm_map_offset_t         where,
8118 	vm_map_offset_t         start,
8119 	vm_map_offset_t         end)
8120 {
8121 	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
8122 	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
8123 }
8124 
8125 __abortlike
8126 static void
8127 __vm_map_delete_permanent_panic(
8128 	vm_map_t                map,
8129 	vm_map_offset_t         start,
8130 	vm_map_offset_t         end,
8131 	vm_map_entry_t          entry)
8132 {
8133 	panic("vm_map_delete(%p,0x%llx,0x%llx): "
8134 	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
8135 	    map, (uint64_t)start, (uint64_t)end, entry,
8136 	    (uint64_t)entry->vme_start,
8137 	    (uint64_t)entry->vme_end);
8138 }
8139 
8140 __options_decl(vm_map_delete_state_t, uint32_t, {
8141 	VMDS_NONE               = 0x0000,
8142 
8143 	VMDS_FOUND_GAP          = 0x0001,
8144 	VMDS_GAPS_OK            = 0x0002,
8145 
8146 	VMDS_KERNEL_PMAP        = 0x0004,
8147 	VMDS_NEEDS_LOOKUP       = 0x0008,
8148 	VMDS_NEEDS_WAKEUP       = 0x0010,
8149 	VMDS_KERNEL_KMEMPTR     = 0x0020
8150 });
8151 
8152 /*
8153  *	vm_map_delete:	[ internal use only ]
8154  *
8155  *	Deallocates the given address range from the target map.
8156  *	Removes all user wirings. Unwires one kernel wiring if
8157  *	VM_MAP_REMOVE_KUNWIRE is set.  Waits for kernel wirings to go
8158  *	away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set.  Sleeps
8159  *	interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
8160  *
8161  *
8162  *	When the map is a kernel map, then any error in removing mappings
8163  *	will lead to a panic so that clients do not have to repeat the panic
8164  *	code at each call site.  If VM_MAP_REMOVE_INTERRUPTIBLE
8165  *	is also passed, then KERN_ABORTED will not lead to a panic.
8166  *
8167  *	This routine is called with map locked and leaves map locked.
8168  */
8169 static kmem_return_t
8170 vm_map_delete(
8171 	vm_map_t                map,
8172 	vm_map_offset_t         start,
8173 	vm_map_offset_t         end,
8174 	vmr_flags_t             flags,
8175 	kmem_guard_t            guard,
8176 	vm_map_zap_t            zap_list)
8177 {
8178 	vm_map_entry_t          entry, next;
8179 	int                     interruptible;
8180 	vm_map_offset_t         gap_start = 0;
8181 	vm_map_offset_t         clear_in_transition_end = 0;
8182 	__unused vm_map_offset_t save_start = start;
8183 	__unused vm_map_offset_t save_end = end;
8184 	vm_map_delete_state_t   state = VMDS_NONE;
8185 	kmem_return_t           ret = { };
8186 	vm_map_range_id_t       range_id = 0;
8187 	struct kmem_page_meta  *meta = NULL;
8188 	uint32_t                size_idx, slot_idx;
8189 	struct mach_vm_range    slot;
8190 
8191 	if (vm_map_pmap(map) == kernel_pmap) {
8192 		state |= VMDS_KERNEL_PMAP;
8193 		range_id = kmem_addr_get_range(start, end - start);
8194 		if (kmem_is_ptr_range(range_id)) {
8195 			state |= VMDS_KERNEL_KMEMPTR;
8196 			slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
8197 			    &size_idx, &slot);
8198 		}
8199 	}
8200 
8201 	if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
8202 		state |= VMDS_GAPS_OK;
8203 	}
8204 
8205 	if (map->corpse_source &&
8206 	    !(flags & VM_MAP_REMOVE_TO_OVERWRITE) &&
8207 	    !map->terminated) {
8208 		/*
8209 		 * The map is being used for corpse-related diagnostics.
8210 		 * So skip any entry removal to avoid perturbing the map state.
8211 		 * The cleanup will happen in task_terminate_internal after the
8212 		 * call to task_port_no_senders.
8213 		 */
8214 		goto out;
8215 	}
8216 
8217 	interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
8218 	    THREAD_ABORTSAFE : THREAD_UNINT;
8219 
8220 	if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
8221 	    (start & VM_MAP_PAGE_MASK(map))) {
8222 		__vm_map_delete_misaligned_panic(map, start, end);
8223 	}
8224 
8225 	if ((state & VMDS_GAPS_OK) == 0) {
8226 		/*
8227 		 * If the map isn't terminated then all deletions must have
8228 		 * no gaps, and be within the [min, max) of the map.
8229 		 *
8230 		 * We got here without VM_MAP_RANGE_CHECK() being called,
8231 		 * and hence must validate bounds manually.
8232 		 *
8233 		 * It is worth noting that because vm_deallocate() will
8234 		 * round_page() the deallocation size, it's possible for "end"
8235 		 * to be 0 here due to overflow. We hence must treat it as being
8236 		 * beyond vm_map_max(map).
8237 		 *
8238 		 * Similarly, end < start means some wraparound happened,
8239 		 * which should cause an error or panic.
8240 		 */
8241 		if (end == 0 || end > vm_map_max(map)) {
8242 			state |= VMDS_FOUND_GAP;
8243 			gap_start = vm_map_max(map);
8244 			if (state & VMDS_KERNEL_PMAP) {
8245 				__vm_map_delete_gap_panic(map,
8246 				    gap_start, start, end);
8247 			}
8248 			goto out;
8249 		}
8250 
8251 		if (end < start) {
8252 			if (state & VMDS_KERNEL_PMAP) {
8253 				__vm_map_delete_gap_panic(map,
8254 				    vm_map_max(map), start, end);
8255 			}
8256 			ret.kmr_return = KERN_INVALID_ARGUMENT;
8257 			goto out;
8258 		}
8259 
8260 		if (start < vm_map_min(map)) {
8261 			state |= VMDS_FOUND_GAP;
8262 			gap_start = start;
8263 			if (state & VMDS_KERNEL_PMAP) {
8264 				__vm_map_delete_gap_panic(map,
8265 				    gap_start, start, end);
8266 			}
8267 			goto out;
8268 		}
8269 	} else {
8270 		/*
8271 		 * If the map is terminated, we must accept start/end
8272 		 * being beyond the boundaries of the map as this is
8273 		 * how some of the mappings like commpage mappings
8274 		 * can be destroyed (they're outside of those bounds).
8275 		 *
8276 		 * end < start is still something we can't cope with,
8277 		 * so just bail.
8278 		 */
8279 		if (end < start) {
8280 			goto out;
8281 		}
8282 	}
8283 
8284 
8285 	/*
8286 	 *	Find the start of the region.
8287 	 *
8288 	 *	If in a superpage, extend the range
8289 	 *	to include the start of the mapping.
8290 	 */
8291 	while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8292 		if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8293 			start = SUPERPAGE_ROUND_DOWN(start);
8294 		} else {
8295 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8296 			break;
8297 		}
8298 	}
8299 
8300 	if (entry->superpage_size) {
8301 		end = SUPERPAGE_ROUND_UP(end);
8302 	}
8303 
8304 	/*
8305 	 *	Step through all entries in this region
8306 	 */
8307 	for (vm_map_offset_t s = start; s < end;) {
8308 		/*
8309 		 * At this point, we have deleted all the memory entries
8310 		 * in [start, s) and are proceeding with the [s, end) range.
8311 		 *
8312 		 * This loop might drop the map lock, and it is possible that
8313 		 * some memory was already reallocated within [start, s)
8314 		 * and we don't want to mess with those entries.
8315 		 *
8316 		 * Some of those entries could even have been re-assembled
8317 		 * with an entry after "s" (in vm_map_simplify_entry()), so
8318 		 * we may have to vm_map_clip_start() again.
8319 		 *
8320 		 * When clear_in_transition_end is set, we had marked
8321 		 * [start, clear_in_transition_end) as "in_transition"
8322 		 * during a previous iteration and we need to clear it.
8323 		 */
8324 
8325 		/*
8326 		 * Step 1: If needed (because we dropped locks),
8327 		 *         lookup the entry again.
8328 		 *
8329 		 *         If we're coming back from unwiring (Step 5),
8330 		 *         we also need to mark the entries as no longer
8331 		 *         in transition after that.
8332 		 */
8333 
8334 		if (state & VMDS_NEEDS_LOOKUP) {
8335 			state &= ~VMDS_NEEDS_LOOKUP;
8336 
8337 			if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8338 				SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8339 			}
8340 
8341 			if (state & VMDS_KERNEL_KMEMPTR) {
8342 				kmem_validate_slot(s, meta, size_idx, slot_idx);
8343 			}
8344 		}
8345 
8346 		if (clear_in_transition_end) {
8347 			for (vm_map_entry_t it = entry;
8348 			    it != vm_map_to_entry(map) &&
8349 			    it->vme_start < clear_in_transition_end;
8350 			    it = it->vme_next) {
8351 				assert(it->in_transition);
8352 				it->in_transition = FALSE;
8353 				if (it->needs_wakeup) {
8354 					it->needs_wakeup = FALSE;
8355 					state |= VMDS_NEEDS_WAKEUP;
8356 				}
8357 			}
8358 
8359 			clear_in_transition_end = 0;
8360 		}
8361 
8362 
8363 		/*
8364 		 * Step 2: Perform various policy checks
8365 		 *         before we do _anything_ to this entry.
8366 		 */
8367 
8368 		if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8369 			if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8370 				/*
8371 				 * Either we found a gap already,
8372 				 * or we are tearing down a map,
8373 				 * keep going.
8374 				 */
8375 			} else if (state & VMDS_KERNEL_PMAP) {
8376 				__vm_map_delete_gap_panic(map, s, start, end);
8377 			} else if (s < end) {
8378 				state |= VMDS_FOUND_GAP;
8379 				gap_start = s;
8380 			}
8381 
8382 			if (entry == vm_map_to_entry(map) ||
8383 			    end <= entry->vme_start) {
8384 				break;
8385 			}
8386 
8387 			s = entry->vme_start;
8388 		}
8389 
8390 		if (state & VMDS_KERNEL_PMAP) {
8391 			/*
8392 			 * In the kernel map and its submaps,
8393 			 * permanent entries never die, even
8394 			 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8395 			 */
8396 			if (entry->vme_permanent) {
8397 				__vm_map_delete_permanent_panic(map, start, end, entry);
8398 			}
8399 
8400 			if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8401 				end = entry->vme_end;
8402 				flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8403 			}
8404 
8405 			/*
8406 			 * In the kernel map and its submaps,
8407 			 * the removal of an atomic/guarded entry is strict.
8408 			 *
8409 			 * An atomic entry is processed only if it was
8410 			 * specifically targeted.
8411 			 *
8412 			 * We might have deleted non-atomic entries before
8413 			 * we reach this point, however...
8414 			 */
8415 			kmem_entry_validate_guard(map, entry,
8416 			    start, end - start, guard);
8417 		}
8418 
8419 		/*
8420 		 * Step 2.1: handle "permanent" and "submap" entries
8421 		 * *before* clipping to avoid triggering some unnecessary
8422 		 * un-nesting of the shared region.
8423 		 */
8424 		if (entry->vme_permanent && entry->is_sub_map) {
8425 //			printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8426 			/*
8427 			 * Un-mapping a "permanent" mapping of a user-space
8428 			 * submap is not allowed unless...
8429 			 */
8430 			if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8431 				/*
8432 				 * a. explicitly requested by the kernel caller.
8433 				 */
8434 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8435 			} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8436 			    developer_mode_state()) {
8437 				/*
8438 				 * b. we're in "developer" mode (for
8439 				 *    breakpoints, dtrace probes, ...).
8440 				 */
8441 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8442 			} else if (map->terminated) {
8443 				/*
8444 				 * c. this is the final address space cleanup.
8445 				 */
8446 //				printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8447 			} else {
8448 				vm_map_offset_t submap_start, submap_end;
8449 				kern_return_t submap_kr;
8450 
8451 				/*
8452 				 * Check if there are any "permanent" mappings
8453 				 * in this range in the submap.
8454 				 */
8455 				if (entry->in_transition) {
8456 					/* can that even happen ? */
8457 					goto in_transition;
8458 				}
8459 				/* compute the clipped range in the submap */
8460 				submap_start = s - entry->vme_start;
8461 				submap_start += VME_OFFSET(entry);
8462 				submap_end = end - entry->vme_start;
8463 				submap_end += VME_OFFSET(entry);
8464 				submap_kr = vm_map_delete_submap_recurse(
8465 					VME_SUBMAP(entry),
8466 					submap_start,
8467 					submap_end);
8468 				if (submap_kr != KERN_SUCCESS) {
8469 					/*
8470 					 * There are some "permanent" mappings
8471 					 * in the submap: we are not allowed
8472 					 * to remove this range.
8473 					 */
8474 					printf("%d[%s] removing permanent submap entry "
8475 					    "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8476 					    proc_selfpid(),
8477 					    (get_bsdtask_info(current_task())
8478 					    ? proc_name_address(get_bsdtask_info(current_task()))
8479 					    : "?"), entry,
8480 					    (uint64_t)entry->vme_start,
8481 					    (uint64_t)entry->vme_end,
8482 					    entry->protection,
8483 					    entry->max_protection);
8484 					DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8485 					    vm_map_entry_t, entry,
8486 					    vm_map_offset_t, entry->vme_start,
8487 					    vm_map_offset_t, entry->vme_end,
8488 					    vm_prot_t, entry->protection,
8489 					    vm_prot_t, entry->max_protection,
8490 					    int, VME_ALIAS(entry));
8491 					ret.kmr_return = KERN_PROTECTION_FAILURE;
8492 					goto out;
8493 				}
8494 				/* no permanent mappings: proceed */
8495 			}
8496 		}
8497 
8498 		/*
8499 		 * Step 3: Perform any clipping needed.
8500 		 *
8501 		 *         After this, "entry" starts at "s", ends before "end"
8502 		 */
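		/*
		 * Illustrative example (hypothetical addresses): an entry
		 * spanning [0x1000, 0x5000) with s = 0x2000 and end = 0x4000
		 * is clipped below into [0x1000, 0x2000), [0x2000, 0x4000)
		 * (the entry we keep working on) and [0x4000, 0x5000).
		 */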
8503 
8504 		if (entry->vme_start < s) {
8505 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8506 			    entry->map_aligned &&
8507 			    !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8508 				/*
8509 				 * The entry will no longer be map-aligned
8510 				 * after clipping and the caller said it's OK.
8511 				 */
8512 				entry->map_aligned = FALSE;
8513 			}
8514 			vm_map_clip_start(map, entry, s);
8515 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8516 		}
8517 
8518 		if (end < entry->vme_end) {
8519 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8520 			    entry->map_aligned &&
8521 			    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8522 				/*
8523 				 * The entry will no longer be map-aligned
8524 				 * after clipping and the caller said it's OK.
8525 				 */
8526 				entry->map_aligned = FALSE;
8527 			}
8528 			vm_map_clip_end(map, entry, end);
8529 		}
8530 
8531 		if (entry->vme_permanent && entry->is_sub_map) {
8532 			/*
8533 			 * We already went through step 2.1 which did not deny
8534 			 * the removal of this "permanent" and "is_sub_map"
8535 			 * entry.
8536 			 * Now that we've clipped what we actually want to
8537 			 * delete, undo the "permanent" part to allow the
8538 			 * removal to proceed.
8539 			 */
8540 			DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8541 			    vm_map_entry_t, entry,
8542 			    vm_map_offset_t, entry->vme_start,
8543 			    vm_map_offset_t, entry->vme_end,
8544 			    vm_prot_t, entry->protection,
8545 			    vm_prot_t, entry->max_protection,
8546 			    int, VME_ALIAS(entry));
8547 			entry->vme_permanent = false;
8548 		}
8549 
8550 		assert(s == entry->vme_start);
8551 		assert(entry->vme_end <= end);
8552 
8553 
8554 		/*
8555 		 * Step 4: If the entry is in flux, wait for this to resolve.
8556 		 */
8557 
8558 		if (entry->in_transition) {
8559 			wait_result_t wait_result;
8560 
8561 in_transition:
8562 			/*
8563 			 * Another thread is wiring/unwiring this entry.
8564 			 * Let the other thread know we are waiting.
8565 			 */
8566 
8567 			entry->needs_wakeup = TRUE;
8568 
8569 			/*
8570 			 * wake up anybody waiting on entries that we have
8571 			 * already unwired/deleted.
8572 			 */
8573 			if (state & VMDS_NEEDS_WAKEUP) {
8574 				vm_map_entry_wakeup(map);
8575 				state &= ~VMDS_NEEDS_WAKEUP;
8576 			}
8577 
8578 			wait_result = vm_map_entry_wait(map, interruptible);
8579 
8580 			if (interruptible &&
8581 			    wait_result == THREAD_INTERRUPTED) {
8582 				/*
8583 				 * We do not clear the needs_wakeup flag,
8584 				 * since we cannot tell if we were the only one.
8585 				 */
8586 				ret.kmr_return = KERN_ABORTED;
8587 				return ret;
8588 			}
8589 
8590 			/*
8591 			 * The entry could have been clipped or it
8592 			 * may not exist anymore.  Look it up again.
8593 			 */
8594 			state |= VMDS_NEEDS_LOOKUP;
8595 			continue;
8596 		}
8597 
8598 
8599 		/*
8600 		 * Step 5: Handle wiring
8601 		 */
8602 
8603 		if (entry->wired_count) {
8604 			struct vm_map_entry tmp_entry;
8605 			boolean_t           user_wire;
8606 			unsigned int        last_timestamp;
8607 
8608 			user_wire = entry->user_wired_count > 0;
8609 
8610 			/*
8611 			 *      Remove a kernel wiring if requested
8612 			 */
8613 			if (flags & VM_MAP_REMOVE_KUNWIRE) {
8614 				entry->wired_count--;
8615 				vme_btref_consider_and_put(entry);
8616 			}
8617 
8618 			/*
8619 			 *	Remove all user wirings for proper accounting
8620 			 */
8621 			while (entry->user_wired_count) {
8622 				subtract_wire_counts(map, entry, user_wire);
8623 			}
8624 
8625 			/*
8626 			 * All our DMA I/O operations in IOKit are currently
8627 			 * done by wiring through the map entries of the task
8628 			 * requesting the I/O.
8629 			 *
8630 			 * Because of this, we must always wait for kernel wirings
8631 			 * to go away on the entries before deleting them.
8632 			 *
8633 			 * Any caller who wants to actually remove a kernel wiring
8634 			 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8635 			 * properly remove one wiring instead of blasting through
8636 			 * them all.
8637 			 */
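			/*
			 * Illustrative sketch (hypothetical caller) of the
			 * contract above: a subsystem that took exactly one
			 * kernel wiring would remove it at deletion time with
			 *
			 *	kmem_return_t kmr;
			 *	kmr = vm_map_remove_guard(map, addr, addr + size,
			 *	    VM_MAP_REMOVE_KUNWIRE, guard);
			 *
			 * rather than relying on this code waiting for the
			 * wiring to drain.
			 */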
8638 			if (entry->wired_count != 0) {
8639 				assert(map != kernel_map);
8640 				/*
8641 				 * Cannot continue.  Typical case is when
8642 				 * a user thread has physical io pending
8643 				 * on this page.  Either wait for the
8644 				 * kernel wiring to go away or return an
8645 				 * error.
8646 				 */
8647 				wait_result_t wait_result;
8648 
8649 				entry->needs_wakeup = TRUE;
8650 				wait_result = vm_map_entry_wait(map,
8651 				    interruptible);
8652 
8653 				if (interruptible &&
8654 				    wait_result == THREAD_INTERRUPTED) {
8655 					/*
8656 					 * We do not clear the
8657 					 * needs_wakeup flag, since we
8658 					 * cannot tell if we were the
8659 					 * only one.
8660 					 */
8661 					ret.kmr_return = KERN_ABORTED;
8662 					return ret;
8663 				}
8664 
8665 
8666 				/*
8667 				 * The entry could have been clipped or
8668 				 * it may not exist anymore.  Look it
8669 				 * up again.
8670 				 */
8671 				state |= VMDS_NEEDS_LOOKUP;
8672 				continue;
8673 			}
8674 
8675 			/*
8676 			 * We can unlock the map now.
8677 			 *
8678 			 * The entry might be split once we unlock the map,
8679 			 * but we need the range as defined by this entry
8680 			 * to be stable. So we must make a local copy.
8681 			 *
8682 			 * The underlying objects do not change during clips,
8683 			 * and the in_transition state guarantees existence
8684 			 * of the entry.
8685 			 */
8686 			last_timestamp = map->timestamp;
8687 			entry->in_transition = TRUE;
8688 			tmp_entry = *entry;
8689 			vm_map_unlock(map);
8690 
8691 			if (tmp_entry.is_sub_map) {
8692 				vm_map_t sub_map;
8693 				vm_map_offset_t sub_start, sub_end;
8694 				pmap_t pmap;
8695 				vm_map_offset_t pmap_addr;
8696 
8697 
8698 				sub_map = VME_SUBMAP(&tmp_entry);
8699 				sub_start = VME_OFFSET(&tmp_entry);
8700 				sub_end = sub_start + (tmp_entry.vme_end -
8701 				    tmp_entry.vme_start);
8702 				if (tmp_entry.use_pmap) {
8703 					pmap = sub_map->pmap;
8704 					pmap_addr = tmp_entry.vme_start;
8705 				} else {
8706 					pmap = map->pmap;
8707 					pmap_addr = tmp_entry.vme_start;
8708 				}
8709 				(void) vm_map_unwire_nested(sub_map,
8710 				    sub_start, sub_end,
8711 				    user_wire,
8712 				    pmap, pmap_addr);
8713 			} else {
8714 				vm_map_offset_t entry_end = tmp_entry.vme_end;
8715 				vm_map_offset_t max_end;
8716 
8717 				if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8718 					max_end = end - VM_MAP_PAGE_SIZE(map);
8719 					if (entry_end > max_end) {
8720 						entry_end = max_end;
8721 					}
8722 				}
8723 
8724 				if (tmp_entry.vme_kernel_object) {
8725 					pmap_protect_options(
8726 						map->pmap,
8727 						tmp_entry.vme_start,
8728 						entry_end,
8729 						VM_PROT_NONE,
8730 						PMAP_OPTIONS_REMOVE,
8731 						NULL);
8732 				}
8733 				vm_fault_unwire(map, &tmp_entry,
8734 				    tmp_entry.vme_kernel_object, map->pmap,
8735 				    tmp_entry.vme_start, entry_end);
8736 			}
8737 
8738 			vm_map_lock(map);
8739 
8740 			/*
8741 			 * Unwiring happened, we can now go back to deleting
8742 			 * them (after we clear the in_transition bit for the range).
8743 			 */
8744 			if (last_timestamp + 1 != map->timestamp) {
8745 				state |= VMDS_NEEDS_LOOKUP;
8746 			}
8747 			clear_in_transition_end = tmp_entry.vme_end;
8748 			continue;
8749 		}
8750 
8751 		assert(entry->wired_count == 0);
8752 		assert(entry->user_wired_count == 0);
8753 
8754 
8755 		/*
8756 		 * Step 6: Entry is unwired and ready for us to delete !
8757 		 */
8758 
8759 		if (!entry->vme_permanent) {
8760 			/*
8761 			 * Typical case: the entry really shouldn't be permanent
8762 			 */
8763 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8764 		    (entry->protection & VM_PROT_EXECUTE) &&
8765 		    developer_mode_state()) {
8766 			/*
8767 			 * Allow debuggers to undo executable mappings
8768 			 * when developer mode is on.
8769 			 */
8770 #if 0
8771 			printf("FBDP %d[%s] removing permanent executable entry "
8772 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8773 			    proc_selfpid(),
8774 			    (current_task()->bsd_info
8775 			    ? proc_name_address(current_task()->bsd_info)
8776 			    : "?"), entry,
8777 			    (uint64_t)entry->vme_start,
8778 			    (uint64_t)entry->vme_end,
8779 			    entry->protection,
8780 			    entry->max_protection);
8781 #endif
8782 			entry->vme_permanent = FALSE;
8783 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8784 #if 0
8785 			printf("FBDP %d[%s] removing permanent entry "
8786 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8787 			    proc_selfpid(),
8788 			    (current_task()->bsd_info
8789 			    ? proc_name_address(current_task()->bsd_info)
8790 			    : "?"), entry,
8791 			    (uint64_t)entry->vme_start,
8792 			    (uint64_t)entry->vme_end,
8793 			    entry->protection,
8794 			    entry->max_protection);
8795 #endif
8796 			entry->vme_permanent = FALSE;
8797 #if CODE_SIGNING_MONITOR
8798 		} else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8799 			entry->vme_permanent = FALSE;
8800 
8801 			printf("%d[%s] %s(0x%llx,0x%llx): "
8802 			    "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8803 			    "prot 0x%x/0x%x\n",
8804 			    proc_selfpid(),
8805 			    (get_bsdtask_info(current_task())
8806 			    ? proc_name_address(get_bsdtask_info(current_task()))
8807 			    : "?"),
8808 			    __FUNCTION__,
8809 			    (uint64_t)start,
8810 			    (uint64_t)end,
8811 			    (uint64_t)entry->vme_start,
8812 			    (uint64_t)entry->vme_end,
8813 			    entry->protection,
8814 			    entry->max_protection);
8815 #endif
8816 		} else {
8817 			DTRACE_VM6(vm_map_delete_permanent,
8818 			    vm_map_entry_t, entry,
8819 			    vm_map_offset_t, entry->vme_start,
8820 			    vm_map_offset_t, entry->vme_end,
8821 			    vm_prot_t, entry->protection,
8822 			    vm_prot_t, entry->max_protection,
8823 			    int, VME_ALIAS(entry));
8824 		}
8825 
8826 		if (entry->is_sub_map) {
8827 			assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8828 			    "map %p (%d) entry %p submap %p (%d)\n",
8829 			    map, VM_MAP_PAGE_SHIFT(map), entry,
8830 			    VME_SUBMAP(entry),
8831 			    VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8832 			if (entry->use_pmap) {
8833 #ifndef NO_NESTED_PMAP
8834 				int pmap_flags;
8835 
8836 				if (map->terminated) {
8837 					/*
8838 					 * This is the final cleanup of the
8839 					 * address space being terminated.
8840 					 * No new mappings are expected and
8841 					 * we don't really need to unnest the
8842 					 * shared region (and lose the "global"
8843 					 * pmap mappings, if applicable).
8844 					 *
8845 					 * Tell the pmap layer that we're
8846 					 * "clean" wrt nesting.
8847 					 */
8848 					pmap_flags = PMAP_UNNEST_CLEAN;
8849 				} else {
8850 					/*
8851 					 * We're unmapping part of the nested
8852 					 * shared region, so we can't keep the
8853 					 * nested pmap.
8854 					 */
8855 					pmap_flags = 0;
8856 				}
8857 				pmap_unnest_options(
8858 					map->pmap,
8859 					(addr64_t)entry->vme_start,
8860 					entry->vme_end - entry->vme_start,
8861 					pmap_flags);
8862 #endif  /* NO_NESTED_PMAP */
8863 				if (map->mapped_in_other_pmaps &&
8864 				    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8865 					/* clean up parent map/maps */
8866 					vm_map_submap_pmap_clean(
8867 						map, entry->vme_start,
8868 						entry->vme_end,
8869 						VME_SUBMAP(entry),
8870 						VME_OFFSET(entry));
8871 				}
8872 			} else {
8873 				vm_map_submap_pmap_clean(
8874 					map, entry->vme_start, entry->vme_end,
8875 					VME_SUBMAP(entry),
8876 					VME_OFFSET(entry));
8877 			}
8878 		} else if (entry->vme_kernel_object ||
8879 		    VME_OBJECT(entry) == compressor_object) {
8880 			/*
8881 			 * nothing to do
8882 			 */
8883 		} else if (map->mapped_in_other_pmaps &&
8884 		    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8885 			vm_object_pmap_protect_options(
8886 				VME_OBJECT(entry), VME_OFFSET(entry),
8887 				entry->vme_end - entry->vme_start,
8888 				PMAP_NULL,
8889 				PAGE_SIZE,
8890 				entry->vme_start,
8891 				VM_PROT_NONE,
8892 				PMAP_OPTIONS_REMOVE);
8893 		} else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8894 		    (state & VMDS_KERNEL_PMAP)) {
8895 			/* Remove the translations associated
8896 			 * with this range.  An entry with no
8897 			 * object would normally need no pmap
8898 			 * work, but the kernel map and its
8899 			 * descendants are always processed:
8900 			 * the platform could potentially
8901 			 * create "backdoor" mappings invisible
8902 			 * to the VM.  It is expected that
8903 			 * objectless, non-kernel ranges have
8904 			 * no such VM-invisible translations.
8905 			 */
8906 			pmap_remove_options(map->pmap,
8907 			    (addr64_t)entry->vme_start,
8908 			    (addr64_t)entry->vme_end,
8909 			    PMAP_OPTIONS_REMOVE);
8910 		}
8911 
8912 #if DEBUG
8913 		/*
8914 		 * All pmap mappings for this map entry must have been
8915 		 * cleared by now.
8916 		 */
8917 		assert(pmap_is_empty(map->pmap,
8918 		    entry->vme_start,
8919 		    entry->vme_end));
8920 #endif /* DEBUG */
8921 
8922 		if (entry->iokit_acct) {
8923 			/* alternate accounting */
8924 			DTRACE_VM4(vm_map_iokit_unmapped_region,
8925 			    vm_map_t, map,
8926 			    vm_map_offset_t, entry->vme_start,
8927 			    vm_map_offset_t, entry->vme_end,
8928 			    int, VME_ALIAS(entry));
8929 			vm_map_iokit_unmapped_region(map,
8930 			    (entry->vme_end -
8931 			    entry->vme_start));
8932 			entry->iokit_acct = FALSE;
8933 			entry->use_pmap = FALSE;
8934 		}
8935 
8936 		/* move "s" forward */
8937 		s    = entry->vme_end;
8938 		next = entry->vme_next;
8939 		if (!entry->map_aligned) {
8940 			vm_map_offset_t rounded_s;
8941 
8942 			/*
8943 			 * Skip artificial gap due to mis-aligned entry
8944 			 * on devices with a page size smaller than the
8945 			 * map's page size (i.e. 16k task on a 4k device).
8946 			 */
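			/*
			 * Worked example (hypothetical values): a 16k task
			 * on a 4k device has VM_MAP_PAGE_MASK(map) == 0x3fff;
			 * if the deleted entry ends at s = 0x7000, then
			 * rounded_s = 0x8000 and "s" skips the artificial
			 * 4k-granular gap (capped at next->vme_start).
			 */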
8947 			rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8948 			if (next == vm_map_to_entry(map)) {
8949 				s = rounded_s;
8950 			} else if (s < rounded_s) {
8951 				s = MIN(rounded_s, next->vme_start);
8952 			}
8953 		}
8954 		ret.kmr_size += s - entry->vme_start;
8955 
8956 		if (entry->vme_permanent) {
8957 			/*
8958 			 * A permanent entry cannot be removed, so leave it
8959 			 * in place but remove all access permissions.
8960 			 */
8961 			if (!entry->csm_associated) {
8962 				printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8963 				    __FUNCTION__, __LINE__,
8964 				    proc_selfpid(),
8965 				    (get_bsdtask_info(current_task())
8966 				    ? proc_name_address(get_bsdtask_info(current_task()))
8967 				    : "?"),
8968 				    map,
8969 				    entry,
8970 				    (uint64_t)entry->vme_start,
8971 				    (uint64_t)entry->vme_end,
8972 				    entry->is_sub_map,
8973 				    entry->protection,
8974 				    entry->max_protection);
8975 			}
8976 			DTRACE_VM6(vm_map_delete_permanent_prot_none,
8977 			    vm_map_entry_t, entry,
8978 			    vm_map_offset_t, entry->vme_start,
8979 			    vm_map_offset_t, entry->vme_end,
8980 			    vm_prot_t, entry->protection,
8981 			    vm_prot_t, entry->max_protection,
8982 			    int, VME_ALIAS(entry));
8983 			entry->protection = VM_PROT_NONE;
8984 			entry->max_protection = VM_PROT_NONE;
8985 		} else {
8986 			vm_map_entry_zap(map, entry, zap_list);
8987 		}
8988 
8989 		entry = next;
8990 		next  = VM_MAP_ENTRY_NULL;
8991 
8992 		if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8993 			unsigned int last_timestamp = map->timestamp++;
8994 
8995 			if (lck_rw_lock_yield_exclusive(&map->lock,
8996 			    LCK_RW_YIELD_ANY_WAITER)) {
8997 				if (last_timestamp != map->timestamp + 1) {
8998 					state |= VMDS_NEEDS_LOOKUP;
8999 				}
9000 			} else {
9001 				/* we didn't yield, undo our change */
9002 				map->timestamp--;
9003 			}
9004 		}
9005 	}
9006 
9007 	if (map->wait_for_space) {
9008 		thread_wakeup((event_t) map);
9009 	}
9010 
9011 	if (state & VMDS_NEEDS_WAKEUP) {
9012 		vm_map_entry_wakeup(map);
9013 	}
9014 
9015 out:
9016 	if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
9017 		__vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
9018 	}
9019 
9020 	if (state & VMDS_KERNEL_KMEMPTR) {
9021 		kmem_free_space(start, end, range_id, &slot);
9022 	}
9023 
9024 	if (state & VMDS_FOUND_GAP) {
9025 		DTRACE_VM3(kern_vm_deallocate_gap,
9026 		    vm_map_offset_t, gap_start,
9027 		    vm_map_offset_t, save_start,
9028 		    vm_map_offset_t, save_end);
9029 		if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
9030 			ret.kmr_return = KERN_INVALID_VALUE;
9031 		} else {
9032 			vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
9033 		}
9034 	}
9035 
9036 	return ret;
9037 }
9038 
9039 kmem_return_t
9040 vm_map_remove_and_unlock(
9041 	vm_map_t        map,
9042 	vm_map_offset_t start,
9043 	vm_map_offset_t end,
9044 	vmr_flags_t     flags,
9045 	kmem_guard_t    guard)
9046 {
9047 	kmem_return_t ret;
9048 	VM_MAP_ZAP_DECLARE(zap);
9049 
9050 	ret = vm_map_delete(map, start, end, flags, guard, &zap);
9051 	vm_map_unlock(map);
9052 
9053 	vm_map_zap_dispose(&zap);
9054 
9055 	return ret;
9056 }
9057 
9058 /*
9059  *	vm_map_remove_guard:
9060  *
9061  *	Remove the given address range from the target map.
9062  *	This is the exported form of vm_map_delete.
9063  */
9064 kmem_return_t
9065 vm_map_remove_guard(
9066 	vm_map_t        map,
9067 	vm_map_offset_t start,
9068 	vm_map_offset_t end,
9069 	vmr_flags_t     flags,
9070 	kmem_guard_t    guard)
9071 {
9072 	vm_map_lock(map);
9073 	return vm_map_remove_and_unlock(map, start, end, flags, guard);
9074 }
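
/*
 * Illustrative sketch (hypothetical caller): removing a range with no
 * guard semantics; "kmr_return" and "kmr_size" report the outcome.
 *
 *	kmem_return_t kmr;
 *	kmr = vm_map_remove_guard(map, start, end,
 *	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
 *	if (kmr.kmr_return != KERN_SUCCESS) { ... }
 */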
9075 
9076 /*
9077  *	vm_map_terminate:
9078  *
9079  *	Clean out a task's map.
9080  */
9081 kern_return_t
9082 vm_map_terminate(
9083 	vm_map_t        map)
9084 {
9085 	vm_map_lock(map);
9086 	map->terminated = TRUE;
9087 	vm_map_disable_hole_optimization(map);
9088 	(void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
9089 	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
9090 	return KERN_SUCCESS;
9091 }
9092 
9093 /*
9094  *	Routine:	vm_map_copy_allocate
9095  *
9096  *	Description:
9097  *		Allocates and initializes a map copy object.
9098  */
9099 static vm_map_copy_t
9100 vm_map_copy_allocate(uint16_t type)
9101 {
9102 	vm_map_copy_t new_copy;
9103 
9104 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
9105 	new_copy->type = type;
9106 	if (type == VM_MAP_COPY_ENTRY_LIST) {
9107 		new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
9108 		vm_map_store_init(&new_copy->cpy_hdr);
9109 	}
9110 	return new_copy;
9111 }
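
/*
 * Illustrative sketch (not a real call site): an entry-list copy object
 * created here is torn down with vm_map_copy_discard() below.
 *
 *	vm_map_copy_t c = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
 *	...populate via vm_map_copy_entry_link()...
 *	vm_map_copy_discard(c);
 */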
9112 
9113 /*
9114  *	Routine:	vm_map_copy_discard
9115  *
9116  *	Description:
9117  *		Dispose of a map copy object (returned by
9118  *		vm_map_copyin).
9119  */
9120 void
9121 vm_map_copy_discard(
9122 	vm_map_copy_t   copy)
9123 {
9124 	if (copy == VM_MAP_COPY_NULL) {
9125 		return;
9126 	}
9127 
9128 	/*
9129 	 * Assert that the vm_map_copy is coming from the right
9130 	 * zone and hasn't been forged
9131 	 */
9132 	vm_map_copy_require(copy);
9133 
9134 	switch (copy->type) {
9135 	case VM_MAP_COPY_ENTRY_LIST:
9136 		while (vm_map_copy_first_entry(copy) !=
9137 		    vm_map_copy_to_entry(copy)) {
9138 			vm_map_entry_t  entry = vm_map_copy_first_entry(copy);
9139 
9140 			vm_map_copy_entry_unlink(copy, entry);
9141 			if (entry->is_sub_map) {
9142 				vm_map_deallocate(VME_SUBMAP(entry));
9143 			} else {
9144 				vm_object_deallocate(VME_OBJECT(entry));
9145 			}
9146 			vm_map_copy_entry_dispose(entry);
9147 		}
9148 		break;
9149 	case VM_MAP_COPY_KERNEL_BUFFER:
9150 
9151 		/*
9152 		 * The data buffer was allocated by a separate call to
9153 		 * kalloc_data(); the vm_map_copy_t itself comes out of
9154 		 * the ZONE_ID_VM_MAP_COPY zone and is freed below.
9155 		 */
9156 		if (copy->size > msg_ool_size_small || copy->offset) {
9157 			panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
9158 			    (long long)copy->size, (long long)copy->offset);
9159 		}
9160 		kfree_data(copy->cpy_kdata, copy->size);
9161 	}
9162 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
9163 }
9164 
9165 #if XNU_PLATFORM_MacOSX
9166 
9167 /*
9168  *	Routine:	vm_map_copy_copy
9169  *
9170  *	Description:
9171  *			Move the information in a map copy object to
9172  *			a new map copy object, leaving the old one
9173  *			empty.
9174  *
9175  *			This is used by kernel routines that need
9176  *			to look at out-of-line data (in copyin form)
9177  *			before deciding whether to return SUCCESS.
9178  *			If the routine returns FAILURE, the original
9179  *			copy object will be deallocated; therefore,
9180  *			these routines must make a copy of the copy
9181  *			object and leave the original empty so that
9182  *			deallocation will not fail.
9183  */
9184 vm_map_copy_t
9185 vm_map_copy_copy(
9186 	vm_map_copy_t   copy)
9187 {
9188 	vm_map_copy_t   new_copy;
9189 
9190 	if (copy == VM_MAP_COPY_NULL) {
9191 		return VM_MAP_COPY_NULL;
9192 	}
9193 
9194 	/*
9195 	 * Assert that the vm_map_copy is coming from the right
9196 	 * zone and hasn't been forged
9197 	 */
9198 	vm_map_copy_require(copy);
9199 
9200 	/*
9201 	 * Allocate a new copy object, and copy the information
9202 	 * from the old one into it.
9203 	 */
9204 
9205 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9206 	memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
9207 #if __has_feature(ptrauth_calls)
9208 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9209 		new_copy->cpy_kdata = copy->cpy_kdata;
9210 	}
9211 #endif
9212 
9213 	if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
9214 		/*
9215 		 * The links in the entry chain must be
9216 		 * changed to point to the new copy object.
9217 		 */
9218 		vm_map_copy_first_entry(copy)->vme_prev
9219 		        = vm_map_copy_to_entry(new_copy);
9220 		vm_map_copy_last_entry(copy)->vme_next
9221 		        = vm_map_copy_to_entry(new_copy);
9222 	}
9223 
9224 	/*
9225 	 * Change the old copy object into one that contains
9226 	 * nothing to be deallocated.
9227 	 */
9228 	bzero(copy, sizeof(struct vm_map_copy));
9229 	copy->type = VM_MAP_COPY_KERNEL_BUFFER;
9230 
9231 	/*
9232 	 * Return the new object.
9233 	 */
9234 	return new_copy;
9235 }
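
/*
 * Illustrative sketch of the pattern described above (hypothetical
 * kernel routine "examine_ool_data"): move the data into a private
 * copy object before deciding; on failure, the caller's deallocation
 * of the original, now empty, copy object is harmless.
 *
 *	vm_map_copy_t priv = vm_map_copy_copy(copy);
 *	if (examine_ool_data(priv) != KERN_SUCCESS) {
 *		vm_map_copy_discard(priv);
 *		return KERN_FAILURE;	// caller discards the emptied "copy"
 *	}
 */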
9236 
9237 #endif /* XNU_PLATFORM_MacOSX */
9238 
9239 static boolean_t
9240 vm_map_entry_is_overwritable(
9241 	vm_map_t        dst_map __unused,
9242 	vm_map_entry_t  entry)
9243 {
9244 	if (!(entry->protection & VM_PROT_WRITE)) {
9245 		/* can't overwrite if not writable */
9246 		return FALSE;
9247 	}
9248 #if !__x86_64__
9249 	if (entry->used_for_jit &&
9250 	    vm_map_cs_enforcement(dst_map) &&
9251 	    !dst_map->cs_debugged) {
9252 		/*
9253 		 * Can't overwrite a JIT region while cs_enforced
9254 		 * and not cs_debugged.
9255 		 */
9256 		return FALSE;
9257 	}
9258 
9259 #if __arm64e__
9260 	/* Do not allow overwriting of HW-assisted TPRO entries */
9261 	if (entry->used_for_tpro) {
9262 		return FALSE;
9263 	}
9264 #endif /* __arm64e__ */
9265 
9266 	if (entry->vme_permanent) {
9267 		if (entry->is_sub_map) {
9268 			/*
9269 			 * We can't tell if the submap contains "permanent"
9270 			 * entries within the range targeted by the caller.
9271 			 * The caller will have to check for that with
9272 			 * vm_map_overwrite_submap_recurse() for example.
9273 			 */
9274 		} else {
9275 			/*
9276 			 * Do not allow overwriting of a "permanent"
9277 			 * entry.
9278 			 */
9279 			DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
9280 			    vm_map_entry_t, entry,
9281 			    vm_map_offset_t, entry->vme_start,
9282 			    vm_map_offset_t, entry->vme_end,
9283 			    vm_prot_t, entry->protection,
9284 			    vm_prot_t, entry->max_protection,
9285 			    int, VME_ALIAS(entry));
9286 			return FALSE;
9287 		}
9288 	}
9289 #endif /* !__x86_64__ */
9290 	return TRUE;
9291 }
9292 
9293 static kern_return_t
9294 vm_map_overwrite_submap_recurse(
9295 	vm_map_t        dst_map,
9296 	vm_map_offset_t dst_addr,
9297 	vm_map_size_t   dst_size)
9298 {
9299 	vm_map_offset_t dst_end;
9300 	vm_map_entry_t  tmp_entry;
9301 	vm_map_entry_t  entry;
9302 	kern_return_t   result;
9303 	boolean_t       encountered_sub_map = FALSE;
9304 
9305 
9306 
9307 	/*
9308 	 *	Verify that the destination is all writeable
9309 	 *	initially.  We have to trunc the destination
9310 	 *	address and round the copy size or we'll end up
9311 	 *	splitting entries in strange ways.
9312 	 */
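	/*
	 * e.g. (hypothetical values, 4k map pages): dst_addr = 0x5100 and
	 * dst_size = 0x2000 give dst_end = 0x8000 below, while the
	 * clipping further down starts from trunc_page(0x5100) = 0x5000.
	 */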
9313 
9314 	dst_end = vm_map_round_page(dst_addr + dst_size,
9315 	    VM_MAP_PAGE_MASK(dst_map));
9316 	vm_map_lock(dst_map);
9317 
9318 start_pass_1:
9319 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9320 		vm_map_unlock(dst_map);
9321 		return KERN_INVALID_ADDRESS;
9322 	}
9323 
9324 	vm_map_clip_start(dst_map,
9325 	    tmp_entry,
9326 	    vm_map_trunc_page(dst_addr,
9327 	    VM_MAP_PAGE_MASK(dst_map)));
9328 	if (tmp_entry->is_sub_map) {
9329 		/* clipping did unnest if needed */
9330 		assert(!tmp_entry->use_pmap);
9331 	}
9332 
9333 	for (entry = tmp_entry;;) {
9334 		vm_map_entry_t  next;
9335 
9336 		next = entry->vme_next;
9337 		while (entry->is_sub_map) {
9338 			vm_map_offset_t sub_start;
9339 			vm_map_offset_t sub_end;
9340 			vm_map_offset_t local_end;
9341 
9342 			if (entry->in_transition) {
9343 				/*
9344 				 * Say that we are waiting, and wait for entry.
9345 				 */
9346 				entry->needs_wakeup = TRUE;
9347 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9348 
9349 				goto start_pass_1;
9350 			}
9351 
9352 			encountered_sub_map = TRUE;
9353 			sub_start = VME_OFFSET(entry);
9354 
9355 			if (entry->vme_end < dst_end) {
9356 				sub_end = entry->vme_end;
9357 			} else {
9358 				sub_end = dst_end;
9359 			}
9360 			sub_end -= entry->vme_start;
9361 			sub_end += VME_OFFSET(entry);
9362 			local_end = entry->vme_end;
9363 			vm_map_unlock(dst_map);
9364 
9365 			result = vm_map_overwrite_submap_recurse(
9366 				VME_SUBMAP(entry),
9367 				sub_start,
9368 				sub_end - sub_start);
9369 
9370 			if (result != KERN_SUCCESS) {
9371 				return result;
9372 			}
9373 			if (dst_end <= entry->vme_end) {
9374 				return KERN_SUCCESS;
9375 			}
9376 			vm_map_lock(dst_map);
9377 			if (!vm_map_lookup_entry(dst_map, local_end,
9378 			    &tmp_entry)) {
9379 				vm_map_unlock(dst_map);
9380 				return KERN_INVALID_ADDRESS;
9381 			}
9382 			entry = tmp_entry;
9383 			next = entry->vme_next;
9384 		}
9385 
9386 		if (!(entry->protection & VM_PROT_WRITE)) {
9387 			vm_map_unlock(dst_map);
9388 			return KERN_PROTECTION_FAILURE;
9389 		}
9390 
9391 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9392 			vm_map_unlock(dst_map);
9393 			return KERN_PROTECTION_FAILURE;
9394 		}
9395 
9396 		/*
9397 		 *	If the entry is in transition, we must wait
9398 		 *	for it to exit that state.  Anything could happen
9399 		 *	when we unlock the map, so start over.
9400 		 */
9401 		if (entry->in_transition) {
9402 			/*
9403 			 * Say that we are waiting, and wait for entry.
9404 			 */
9405 			entry->needs_wakeup = TRUE;
9406 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9407 
9408 			goto start_pass_1;
9409 		}
9410 
9411 /*
9412  *		our range is contained completely within this map entry
9413  */
9414 		if (dst_end <= entry->vme_end) {
9415 			vm_map_unlock(dst_map);
9416 			return KERN_SUCCESS;
9417 		}
9418 /*
9419  *		check that range specified is contiguous region
9420  */
9421 		if ((next == vm_map_to_entry(dst_map)) ||
9422 		    (next->vme_start != entry->vme_end)) {
9423 			vm_map_unlock(dst_map);
9424 			return KERN_INVALID_ADDRESS;
9425 		}
9426 
9427 		/*
9428 		 *	Check for permanent objects in the destination.
9429 		 */
9430 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9431 		    ((!VME_OBJECT(entry)->internal) ||
9432 		    (VME_OBJECT(entry)->true_share))) {
9433 			if (encountered_sub_map) {
9434 				vm_map_unlock(dst_map);
9435 				return KERN_FAILURE;
9436 			}
9437 		}
9438 
9439 
9440 		entry = next;
9441 	}/* for */
9442 	vm_map_unlock(dst_map);
9443 	return KERN_SUCCESS;
9444 }
9445 
9446 /*
9447  *	Routine:	vm_map_copy_overwrite
9448  *
9449  *	Description:
9450  *		Copy the memory described by the map copy
9451  *		object (copy; returned by vm_map_copyin) onto
9452  *		the specified destination region (dst_map, dst_addr).
9453  *		The destination must be writeable.
9454  *
9455  *		Unlike vm_map_copyout, this routine actually
9456  *		writes over previously-mapped memory.  If the
9457  *		previous mapping was to a permanent (user-supplied)
9458  *		memory object, it is preserved.
9459  *
9460  *		The attributes (protection and inheritance) of the
9461  *		destination region are preserved.
9462  *
9463  *		If successful, consumes the copy object.
9464  *		Otherwise, the caller is responsible for it.
9465  *
9466  *	Implementation notes:
9467  *		To overwrite aligned temporary virtual memory, it is
9468  *		sufficient to remove the previous mapping and insert
9469  *		the new copy.  This replacement is done either on
9470  *		the whole region (if no permanent virtual memory
9471  *		objects are embedded in the destination region) or
9472  *		in individual map entries.
9473  *
9474  *		To overwrite permanent virtual memory, it is necessary
9475  *		to copy each page, as the external memory management
9476  *		interface currently does not provide any optimizations.
9477  *
9478  *		Unaligned memory also has to be copied.  It is possible
9479  *		to use 'vm_trickery' to copy the aligned data.  This is
9480  *		not done but not hard to implement.
9481  *
9482  *		Once a page of permanent memory has been overwritten,
9483  *		it is impossible to interrupt this function; otherwise,
9484  *		the call would be neither atomic nor location-independent.
9485  *		The kernel-state portion of a user thread must be
9486  *		interruptible.
9487  *
9488  *		It may be expensive to forward all requests that might
9489  *		overwrite permanent memory (vm_write, vm_copy) to
9490  *		uninterruptible kernel threads.  This routine may be
9491  *		called by interruptible threads; however, success is
9492  *		not guaranteed -- if the request cannot be performed
9493  *		atomically and interruptibly, an error indication is
9494  *		returned.
9495  *
9496  *		Callers of this function must call vm_map_copy_require on
9497  *		previously created vm_map_copy_t or pass a newly created
9498  *		one to ensure that it hasn't been forged.
9499  */
9500 static kern_return_t
9501 vm_map_copy_overwrite_nested(
9502 	vm_map_t                dst_map,
9503 	vm_map_address_t        dst_addr,
9504 	vm_map_copy_t           copy,
9505 	boolean_t               interruptible,
9506 	pmap_t                  pmap,
9507 	boolean_t               discard_on_success)
9508 {
9509 	vm_map_offset_t         dst_end;
9510 	vm_map_entry_t          tmp_entry;
9511 	vm_map_entry_t          entry;
9512 	kern_return_t           kr;
9513 	boolean_t               aligned = TRUE;
9514 	boolean_t               contains_permanent_objects = FALSE;
9515 	boolean_t               encountered_sub_map = FALSE;
9516 	vm_map_offset_t         base_addr;
9517 	vm_map_size_t           copy_size;
9518 	vm_map_size_t           total_size;
9519 	uint16_t                copy_page_shift;
9520 
9521 	/*
9522 	 *	Check for special kernel buffer allocated
9523 	 *	by new_ipc_kmsg_copyin.
9524 	 */
9525 
9526 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9527 		kr = vm_map_copyout_kernel_buffer(
9528 			dst_map, &dst_addr,
9529 			copy, copy->size, TRUE, discard_on_success);
9530 		return kr;
9531 	}
9532 
9533 	/*
9534 	 *      Only works for entry lists at the moment.  Will
9535 	 *	support page lists later.
9536 	 */
9537 
9538 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9539 
9540 	if (copy->size == 0) {
9541 		if (discard_on_success) {
9542 			vm_map_copy_discard(copy);
9543 		}
9544 		return KERN_SUCCESS;
9545 	}
9546 
9547 	copy_page_shift = copy->cpy_hdr.page_shift;
9548 
9549 	/*
9550 	 *	Verify that the destination is all writeable
9551 	 *	initially.  We have to trunc the destination
9552 	 *	address and round the copy size or we'll end up
9553 	 *	splitting entries in strange ways.
9554 	 */
9555 
9556 	if (!VM_MAP_PAGE_ALIGNED(copy->size,
9557 	    VM_MAP_PAGE_MASK(dst_map)) ||
9558 	    !VM_MAP_PAGE_ALIGNED(copy->offset,
9559 	    VM_MAP_PAGE_MASK(dst_map)) ||
9560 	    !VM_MAP_PAGE_ALIGNED(dst_addr,
9561 	    VM_MAP_PAGE_MASK(dst_map)) ||
9562 	    copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9563 		aligned = FALSE;
9564 		dst_end = vm_map_round_page(dst_addr + copy->size,
9565 		    VM_MAP_PAGE_MASK(dst_map));
9566 	} else {
9567 		dst_end = dst_addr + copy->size;
9568 	}
9569 
9570 	vm_map_lock(dst_map);
9571 
9572 	/* LP64todo - remove this check when vm_map_commpage64()
9573 	 * no longer has to stuff in a map_entry for the commpage
9574 	 * above the map's max_offset.
9575 	 */
9576 	if (dst_addr >= dst_map->max_offset) {
9577 		vm_map_unlock(dst_map);
9578 		return KERN_INVALID_ADDRESS;
9579 	}
9580 
9581 start_pass_1:
9582 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9583 		vm_map_unlock(dst_map);
9584 		return KERN_INVALID_ADDRESS;
9585 	}
9586 	vm_map_clip_start(dst_map,
9587 	    tmp_entry,
9588 	    vm_map_trunc_page(dst_addr,
9589 	    VM_MAP_PAGE_MASK(dst_map)));
9590 	for (entry = tmp_entry;;) {
9591 		vm_map_entry_t  next = entry->vme_next;
9592 
9593 		while (entry->is_sub_map) {
9594 			vm_map_offset_t sub_start;
9595 			vm_map_offset_t sub_end;
9596 			vm_map_offset_t local_end;
9597 
9598 			if (entry->in_transition) {
9599 				/*
9600 				 * Say that we are waiting, and wait for entry.
9601 				 */
9602 				entry->needs_wakeup = TRUE;
9603 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9604 
9605 				goto start_pass_1;
9606 			}
9607 
9608 			local_end = entry->vme_end;
9609 			if (!(entry->needs_copy)) {
9610 				/* if needs_copy we are a COW submap */
9611 				/* in such a case we just replace so */
9612 				/* there is no need for the          */
9613 				/* following check.                  */
9614 				encountered_sub_map = TRUE;
9615 				sub_start = VME_OFFSET(entry);
9616 
9617 				if (entry->vme_end < dst_end) {
9618 					sub_end = entry->vme_end;
9619 				} else {
9620 					sub_end = dst_end;
9621 				}
9622 				sub_end -= entry->vme_start;
9623 				sub_end += VME_OFFSET(entry);
9624 				vm_map_unlock(dst_map);
9625 
9626 				kr = vm_map_overwrite_submap_recurse(
9627 					VME_SUBMAP(entry),
9628 					sub_start,
9629 					sub_end - sub_start);
9630 				if (kr != KERN_SUCCESS) {
9631 					return kr;
9632 				}
9633 				vm_map_lock(dst_map);
9634 			}
9635 
9636 			if (dst_end <= entry->vme_end) {
9637 				goto start_overwrite;
9638 			}
9639 			if (!vm_map_lookup_entry(dst_map, local_end,
9640 			    &entry)) {
9641 				vm_map_unlock(dst_map);
9642 				return KERN_INVALID_ADDRESS;
9643 			}
9644 			next = entry->vme_next;
9645 		}
9646 
9647 		if (!(entry->protection & VM_PROT_WRITE)) {
9648 			vm_map_unlock(dst_map);
9649 			return KERN_PROTECTION_FAILURE;
9650 		}
9651 
9652 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9653 			vm_map_unlock(dst_map);
9654 			return KERN_PROTECTION_FAILURE;
9655 		}
9656 
9657 		/*
9658 		 *	If the entry is in transition, we must wait
9659 		 *	for it to exit that state.  Anything could happen
9660 		 *	when we unlock the map, so start over.
9661 		 */
9662 		if (entry->in_transition) {
9663 			/*
9664 			 * Say that we are waiting, and wait for entry.
9665 			 */
9666 			entry->needs_wakeup = TRUE;
9667 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9668 
9669 			goto start_pass_1;
9670 		}
9671 
9672 /*
9673  *		our range is contained completely within this map entry
9674  */
9675 		if (dst_end <= entry->vme_end) {
9676 			break;
9677 		}
9678 /*
9679  *		check that range specified is contiguous region
9680  */
9681 		if ((next == vm_map_to_entry(dst_map)) ||
9682 		    (next->vme_start != entry->vme_end)) {
9683 			vm_map_unlock(dst_map);
9684 			return KERN_INVALID_ADDRESS;
9685 		}
9686 
9687 
9688 		/*
9689 		 *	Check for permanent objects in the destination.
9690 		 */
9691 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9692 		    ((!VME_OBJECT(entry)->internal) ||
9693 		    (VME_OBJECT(entry)->true_share))) {
9694 			contains_permanent_objects = TRUE;
9695 		}
9696 
9697 		entry = next;
9698 	}/* for */
9699 
9700 start_overwrite:
9701 	/*
9702 	 *	If there are permanent objects in the destination, then
9703 	 *	the copy cannot be interrupted.
9704 	 */
9705 
9706 	if (interruptible && contains_permanent_objects) {
9707 		vm_map_unlock(dst_map);
9708 		return KERN_FAILURE;   /* XXX */
9709 	}
9710 
9711 	/*
9712 	 *
9713 	 *	Make a second pass, overwriting the data.
9714 	 *	At the beginning of each loop iteration,
9715 	 *	the next entry to be overwritten is "tmp_entry"
9716 	 *	(initially, the value returned from the lookup above),
9717 	 *	and the starting address expected in that entry
9718 	 *	is "base_addr".
9719 	 */
9720 
9721 	total_size = copy->size;
9722 	if (encountered_sub_map) {
9723 		copy_size = 0;
9724 		/* re-calculate tmp_entry since we've had the map */
9725 		/* unlocked */
9726 		if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9727 			vm_map_unlock(dst_map);
9728 			return KERN_INVALID_ADDRESS;
9729 		}
9730 	} else {
9731 		copy_size = copy->size;
9732 	}
9733 
9734 	base_addr = dst_addr;
9735 	while (TRUE) {
9736 		/* deconstruct the copy object and do in parts */
9737 		/* only in sub_map, interruptable case */
9738 		vm_map_entry_t  copy_entry;
9739 		vm_map_entry_t  previous_prev = VM_MAP_ENTRY_NULL;
9740 		vm_map_entry_t  next_copy = VM_MAP_ENTRY_NULL;
9741 		int             nentries;
9742 		int             remaining_entries = 0;
9743 		vm_map_offset_t new_offset = 0;
9744 
9745 		for (entry = tmp_entry; copy_size == 0;) {
9746 			vm_map_entry_t  next;
9747 
9748 			next = entry->vme_next;
9749 
9750 			/* tmp_entry and base address are moved along */
9751 			/* each time we encounter a sub-map.  Otherwise */
9752 			/* entry can outpace tmp_entry, and the copy_size */
9753 			/* may reflect the distance between them. */
9754 			/* If the current entry is found to be in transition, */
9755 			/* we will start over at the beginning or at the last */
9756 			/* encountered submap, as dictated by base_addr, */
9757 			/* and we will zero copy_size accordingly. */
9758 			if (entry->in_transition) {
9759 				/*
9760 				 * Say that we are waiting, and wait for entry.
9761 				 */
9762 				entry->needs_wakeup = TRUE;
9763 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9764 
9765 				if (!vm_map_lookup_entry(dst_map, base_addr,
9766 				    &tmp_entry)) {
9767 					vm_map_unlock(dst_map);
9768 					return KERN_INVALID_ADDRESS;
9769 				}
9770 				copy_size = 0;
9771 				entry = tmp_entry;
9772 				continue;
9773 			}
9774 			if (entry->is_sub_map) {
9775 				vm_map_offset_t sub_start;
9776 				vm_map_offset_t sub_end;
9777 				vm_map_offset_t local_end;
9778 
9779 				if (entry->needs_copy) {
9780 					/* if this is a COW submap */
9781 					/* just back the range with a */
9782 					/* anonymous entry */
9783 					assert(!entry->vme_permanent);
9784 					if (entry->vme_end < dst_end) {
9785 						sub_end = entry->vme_end;
9786 					} else {
9787 						sub_end = dst_end;
9788 					}
9789 					if (entry->vme_start < base_addr) {
9790 						sub_start = base_addr;
9791 					} else {
9792 						sub_start = entry->vme_start;
9793 					}
9794 					vm_map_clip_end(
9795 						dst_map, entry, sub_end);
9796 					vm_map_clip_start(
9797 						dst_map, entry, sub_start);
9798 					assert(!entry->use_pmap);
9799 					assert(!entry->iokit_acct);
9800 					entry->use_pmap = TRUE;
9801 					vm_map_deallocate(VME_SUBMAP(entry));
9802 					assert(!entry->vme_permanent);
9803 					VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9804 					VME_OFFSET_SET(entry, 0);
9805 					entry->is_shared = FALSE;
9806 					entry->needs_copy = FALSE;
9807 					entry->protection = VM_PROT_DEFAULT;
9808 					entry->max_protection = VM_PROT_ALL;
9809 					entry->wired_count = 0;
9810 					entry->user_wired_count = 0;
9811 					if (entry->inheritance
9812 					    == VM_INHERIT_SHARE) {
9813 						entry->inheritance = VM_INHERIT_COPY;
9814 					}
9815 					continue;
9816 				}
9817 				/* first take care of any non-sub_map */
9818 				/* entries to send */
9819 				if (base_addr < entry->vme_start) {
9820 					/* stuff to send */
9821 					copy_size =
9822 					    entry->vme_start - base_addr;
9823 					break;
9824 				}
9825 				sub_start = VME_OFFSET(entry);
9826 
9827 				if (entry->vme_end < dst_end) {
9828 					sub_end = entry->vme_end;
9829 				} else {
9830 					sub_end = dst_end;
9831 				}
9832 				sub_end -= entry->vme_start;
9833 				sub_end += VME_OFFSET(entry);
9834 				local_end = entry->vme_end;
9835 				vm_map_unlock(dst_map);
9836 				copy_size = sub_end - sub_start;
9837 
9838 				/* adjust the copy object */
9839 				if (total_size > copy_size) {
9840 					vm_map_size_t   local_size = 0;
9841 					vm_map_size_t   entry_size;
9842 
9843 					nentries = 1;
9844 					new_offset = copy->offset;
9845 					copy_entry = vm_map_copy_first_entry(copy);
9846 					while (copy_entry !=
9847 					    vm_map_copy_to_entry(copy)) {
9848 						entry_size = copy_entry->vme_end -
9849 						    copy_entry->vme_start;
9850 						if ((local_size < copy_size) &&
9851 						    ((local_size + entry_size)
9852 						    >= copy_size)) {
9853 							vm_map_copy_clip_end(copy,
9854 							    copy_entry,
9855 							    copy_entry->vme_start +
9856 							    (copy_size - local_size));
9857 							entry_size = copy_entry->vme_end -
9858 							    copy_entry->vme_start;
9859 							local_size += entry_size;
9860 							new_offset += entry_size;
9861 						}
9862 						if (local_size >= copy_size) {
9863 							next_copy = copy_entry->vme_next;
9864 							copy_entry->vme_next =
9865 							    vm_map_copy_to_entry(copy);
9866 							previous_prev =
9867 							    copy->cpy_hdr.links.prev;
9868 							copy->cpy_hdr.links.prev = copy_entry;
9869 							copy->size = copy_size;
9870 							remaining_entries =
9871 							    copy->cpy_hdr.nentries;
9872 							remaining_entries -= nentries;
9873 							copy->cpy_hdr.nentries = nentries;
9874 							break;
9875 						} else {
9876 							local_size += entry_size;
9877 							new_offset += entry_size;
9878 							nentries++;
9879 						}
9880 						copy_entry = copy_entry->vme_next;
9881 					}
9882 				}
9883 
9884 				if ((entry->use_pmap) && (pmap == NULL)) {
9885 					kr = vm_map_copy_overwrite_nested(
9886 						VME_SUBMAP(entry),
9887 						sub_start,
9888 						copy,
9889 						interruptible,
9890 						VME_SUBMAP(entry)->pmap,
9891 						TRUE);
9892 				} else if (pmap != NULL) {
9893 					kr = vm_map_copy_overwrite_nested(
9894 						VME_SUBMAP(entry),
9895 						sub_start,
9896 						copy,
9897 						interruptible, pmap,
9898 						TRUE);
9899 				} else {
9900 					kr = vm_map_copy_overwrite_nested(
9901 						VME_SUBMAP(entry),
9902 						sub_start,
9903 						copy,
9904 						interruptible,
9905 						dst_map->pmap,
9906 						TRUE);
9907 				}
9908 				if (kr != KERN_SUCCESS) {
9909 					if (next_copy != NULL) {
9910 						copy->cpy_hdr.nentries +=
9911 						    remaining_entries;
9912 						copy->cpy_hdr.links.prev->vme_next =
9913 						    next_copy;
9914 						copy->cpy_hdr.links.prev
9915 						        = previous_prev;
9916 						copy->size = total_size;
9917 					}
9918 					return kr;
9919 				}
9920 				if (dst_end <= local_end) {
9921 					return KERN_SUCCESS;
9922 				}
9923 				/* otherwise copy no longer exists, it was */
9924 				/* destroyed after successful copy_overwrite */
9925 				copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9926 				copy->offset = new_offset;
9927 				copy->cpy_hdr.page_shift = copy_page_shift;
9928 
9929 				total_size -= copy_size;
9930 				copy_size = 0;
9931 				/* put back remainder of copy in container */
9932 				if (next_copy != NULL) {
9933 					copy->cpy_hdr.nentries = remaining_entries;
9934 					copy->cpy_hdr.links.next = next_copy;
9935 					copy->cpy_hdr.links.prev = previous_prev;
9936 					copy->size = total_size;
9937 					next_copy->vme_prev =
9938 					    vm_map_copy_to_entry(copy);
9939 					next_copy = NULL;
9940 				}
9941 				base_addr = local_end;
9942 				vm_map_lock(dst_map);
9943 				if (!vm_map_lookup_entry(dst_map,
9944 				    local_end, &tmp_entry)) {
9945 					vm_map_unlock(dst_map);
9946 					return KERN_INVALID_ADDRESS;
9947 				}
9948 				entry = tmp_entry;
9949 				continue;
9950 			}
9951 			if (dst_end <= entry->vme_end) {
9952 				copy_size = dst_end - base_addr;
9953 				break;
9954 			}
9955 
9956 			if ((next == vm_map_to_entry(dst_map)) ||
9957 			    (next->vme_start != entry->vme_end)) {
9958 				vm_map_unlock(dst_map);
9959 				return KERN_INVALID_ADDRESS;
9960 			}
9961 
9962 			entry = next;
9963 		}/* for */
9964 
9965 		next_copy = NULL;
9966 		nentries = 1;
9967 
9968 		/* adjust the copy object */
9969 		if (total_size > copy_size) {
9970 			vm_map_size_t   local_size = 0;
9971 			vm_map_size_t   entry_size;
9972 
9973 			new_offset = copy->offset;
9974 			copy_entry = vm_map_copy_first_entry(copy);
9975 			while (copy_entry != vm_map_copy_to_entry(copy)) {
9976 				entry_size = copy_entry->vme_end -
9977 				    copy_entry->vme_start;
9978 				if ((local_size < copy_size) &&
9979 				    ((local_size + entry_size)
9980 				    >= copy_size)) {
9981 					vm_map_copy_clip_end(copy, copy_entry,
9982 					    copy_entry->vme_start +
9983 					    (copy_size - local_size));
9984 					entry_size = copy_entry->vme_end -
9985 					    copy_entry->vme_start;
9986 					local_size += entry_size;
9987 					new_offset += entry_size;
9988 				}
9989 				if (local_size >= copy_size) {
9990 					next_copy = copy_entry->vme_next;
9991 					copy_entry->vme_next =
9992 					    vm_map_copy_to_entry(copy);
9993 					previous_prev =
9994 					    copy->cpy_hdr.links.prev;
9995 					copy->cpy_hdr.links.prev = copy_entry;
9996 					copy->size = copy_size;
9997 					remaining_entries =
9998 					    copy->cpy_hdr.nentries;
9999 					remaining_entries -= nentries;
10000 					copy->cpy_hdr.nentries = nentries;
10001 					break;
10002 				} else {
10003 					local_size += entry_size;
10004 					new_offset += entry_size;
10005 					nentries++;
10006 				}
10007 				copy_entry = copy_entry->vme_next;
10008 			}
10009 		}
10010 
10011 		if (aligned) {
10012 			pmap_t  local_pmap;
10013 
10014 			if (pmap) {
10015 				local_pmap = pmap;
10016 			} else {
10017 				local_pmap = dst_map->pmap;
10018 			}
10019 
10020 			if ((kr =  vm_map_copy_overwrite_aligned(
10021 				    dst_map, tmp_entry, copy,
10022 				    base_addr, local_pmap)) != KERN_SUCCESS) {
10023 				if (next_copy != NULL) {
10024 					copy->cpy_hdr.nentries +=
10025 					    remaining_entries;
10026 					copy->cpy_hdr.links.prev->vme_next =
10027 					    next_copy;
10028 					copy->cpy_hdr.links.prev =
10029 					    previous_prev;
10030 					copy->size += copy_size;
10031 				}
10032 				return kr;
10033 			}
10034 			vm_map_unlock(dst_map);
10035 		} else {
10036 			/*
10037 			 * Performance gain:
10038 			 *
10039 			 * If the copy and dst address are misaligned but have the
10040 			 * same offset within the page, we can copy the misaligned
10041 			 * parts unaligned and copy the rest aligned.  If they are
10042 			 * aligned but the length is unaligned, we simply need to
10043 			 * copy the end bit unaligned.  We'll need to split the
10044 			 * misaligned bits of the region in this case!
10045 			 */
10046 			/* ALWAYS UNLOCKS THE dst_map MAP */
10047 			kr = vm_map_copy_overwrite_unaligned(
10048 				dst_map,
10049 				tmp_entry,
10050 				copy,
10051 				base_addr,
10052 				discard_on_success);
10053 			if (kr != KERN_SUCCESS) {
10054 				if (next_copy != NULL) {
10055 					copy->cpy_hdr.nentries +=
10056 					    remaining_entries;
10057 					copy->cpy_hdr.links.prev->vme_next =
10058 					    next_copy;
10059 					copy->cpy_hdr.links.prev =
10060 					    previous_prev;
10061 					copy->size += copy_size;
10062 				}
10063 				return kr;
10064 			}
10065 		}
10066 		total_size -= copy_size;
10067 		if (total_size == 0) {
10068 			break;
10069 		}
10070 		base_addr += copy_size;
10071 		copy_size = 0;
10072 		copy->offset = new_offset;
10073 		if (next_copy != NULL) {
10074 			copy->cpy_hdr.nentries = remaining_entries;
10075 			copy->cpy_hdr.links.next = next_copy;
10076 			copy->cpy_hdr.links.prev = previous_prev;
10077 			next_copy->vme_prev = vm_map_copy_to_entry(copy);
10078 			copy->size = total_size;
10079 		}
10080 		vm_map_lock(dst_map);
10081 		while (TRUE) {
10082 			if (!vm_map_lookup_entry(dst_map,
10083 			    base_addr, &tmp_entry)) {
10084 				vm_map_unlock(dst_map);
10085 				return KERN_INVALID_ADDRESS;
10086 			}
10087 			if (tmp_entry->in_transition) {
10088 				tmp_entry->needs_wakeup = TRUE;
10089 				vm_map_entry_wait(dst_map, THREAD_UNINT);
10090 			} else {
10091 				break;
10092 			}
10093 		}
10094 		vm_map_clip_start(dst_map,
10095 		    tmp_entry,
10096 		    vm_map_trunc_page(base_addr,
10097 		    VM_MAP_PAGE_MASK(dst_map)));
10098 
10099 		entry = tmp_entry;
10100 	} /* while */
10101 
10102 	/*
10103 	 *	Throw away the vm_map_copy object
10104 	 */
10105 	if (discard_on_success) {
10106 		vm_map_copy_discard(copy);
10107 	}
10108 
10109 	return KERN_SUCCESS;
10110 }/* vm_map_copy_overwrite */
10111 
10112 kern_return_t
10113 vm_map_copy_overwrite(
10114 	vm_map_t        dst_map,
10115 	vm_map_offset_t dst_addr,
10116 	vm_map_copy_t   copy,
10117 	vm_map_size_t   copy_size,
10118 	boolean_t       interruptible)
10119 {
10120 	vm_map_size_t   head_size, tail_size;
10121 	vm_map_copy_t   head_copy, tail_copy;
10122 	vm_map_offset_t head_addr, tail_addr;
10123 	vm_map_entry_t  entry;
10124 	kern_return_t   kr;
10125 	vm_map_offset_t effective_page_mask, effective_page_size;
10126 	uint16_t        copy_page_shift;
10127 
10128 	head_size = 0;
10129 	tail_size = 0;
10130 	head_copy = NULL;
10131 	tail_copy = NULL;
10132 	head_addr = 0;
10133 	tail_addr = 0;
10134 
10135 	/*
10136 	 *	Check for null copy object.
10137 	 */
10138 	if (copy == VM_MAP_COPY_NULL) {
10139 		return KERN_SUCCESS;
10140 	}
10141 
10142 	if (__improbable(vm_map_range_overflows(dst_map, dst_addr, copy_size))) {
10143 		return KERN_INVALID_ADDRESS;
10144 	}
10145 
10146 	/*
10147 	 * Assert that the vm_map_copy is coming from the right
10148 	 * zone and hasn't been forged
10149 	 */
10150 	vm_map_copy_require(copy);
10151 
10152 	if (interruptible ||
10153 	    copy->type != VM_MAP_COPY_ENTRY_LIST) {
10154 		/*
10155 		 * We can't split the "copy" map if we're interruptible
10156 		 * or if we don't have a "copy" map...
10157 		 */
10158 blunt_copy:
10159 		kr = vm_map_copy_overwrite_nested(dst_map,
10160 		    dst_addr,
10161 		    copy,
10162 		    interruptible,
10163 		    (pmap_t) NULL,
10164 		    TRUE);
10165 		if (kr) {
10166 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), kr /* arg */);
10167 		}
10168 		return kr;
10169 	}
10170 
10171 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
10172 	if (copy_page_shift < PAGE_SHIFT ||
10173 	    VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10174 		goto blunt_copy;
10175 	}
10176 
10177 	if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10178 		effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
10179 	} else {
10180 		effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
10181 		effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
10182 		    effective_page_mask);
10183 	}
10184 	effective_page_size = effective_page_mask + 1;
10185 
10186 	if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
10187 		/*
10188 		 * Too small to bother with optimizing...
10189 		 */
10190 		goto blunt_copy;
10191 	}
10192 
10193 	if ((dst_addr & effective_page_mask) !=
10194 	    (copy->offset & effective_page_mask)) {
10195 		/*
10196 		 * Incompatible mis-alignment of source and destination...
10197 		 */
10198 		goto blunt_copy;
10199 	}
10200 
10201 	/*
10202 	 * Proper alignment or identical mis-alignment at the beginning.
10203 	 * Let's try and do a small unaligned copy first (if needed)
10204 	 * and then an aligned copy for the rest.
10205 	 */
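	/*
	 * Worked example (hypothetical values, 16k effective pages,
	 * effective_page_mask == 0x3fff): dst_addr = 0x10200,
	 * copy->offset = 0x8200 and copy_size = 0x10000 yield
	 * head_addr = 0x10200, head_size = 0x4000 - 0x200 = 0x3e00,
	 * tail_size = (0x8200 + 0x10000) & 0x3fff = 0x200, and
	 * tail_addr = 0x10200 + 0x10000 - 0x200 = 0x20000.
	 */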
10206 	if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
10207 		head_addr = dst_addr;
10208 		head_size = (effective_page_size -
10209 		    (copy->offset & effective_page_mask));
10210 		head_size = MIN(head_size, copy_size);
10211 	}
10212 	if (!vm_map_page_aligned(copy->offset + copy_size,
10213 	    effective_page_mask)) {
10214 		/*
10215 		 * Mis-alignment at the end.
10216 		 * Do an aligned copy up to the last page and
10217 		 * then an unaligned copy for the remaining bytes.
10218 		 */
10219 		tail_size = ((copy->offset + copy_size) &
10220 		    effective_page_mask);
10221 		tail_size = MIN(tail_size, copy_size);
10222 		tail_addr = dst_addr + copy_size - tail_size;
10223 		assert(tail_addr >= head_addr + head_size);
10224 	}
10225 	assert(head_size + tail_size <= copy_size);
10226 
10227 	if (head_size + tail_size == copy_size) {
10228 		/*
10229 		 * It's all unaligned, no optimization possible...
10230 		 */
10231 		goto blunt_copy;
10232 	}
10233 
10234 	/*
10235 	 * Can't optimize if there are any submaps in the
10236 	 * destination due to the way we free the "copy" map
10237 	 * progressively in vm_map_copy_overwrite_nested()
10238 	 * in that case.
10239 	 */
10240 	vm_map_lock_read(dst_map);
10241 	if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
10242 		vm_map_unlock_read(dst_map);
10243 		goto blunt_copy;
10244 	}
10245 	for (;
10246 	    (entry != vm_map_to_entry(dst_map) &&
10247 	    entry->vme_start < dst_addr + copy_size);
10248 	    entry = entry->vme_next) {
10249 		if (entry->is_sub_map) {
10250 			vm_map_unlock_read(dst_map);
10251 			goto blunt_copy;
10252 		}
10253 	}
10254 	vm_map_unlock_read(dst_map);
10255 
10256 	if (head_size) {
10257 		/*
10258 		 * Unaligned copy of the first "head_size" bytes, to reach
10259 		 * a page boundary.
10260 		 */
10261 
10262 		/*
10263 		 * Extract "head_copy" out of "copy".
10264 		 */
10265 		head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10266 		head_copy->cpy_hdr.entries_pageable =
10267 		    copy->cpy_hdr.entries_pageable;
10268 		head_copy->cpy_hdr.page_shift = copy_page_shift;
10269 
10270 		entry = vm_map_copy_first_entry(copy);
10271 		if (entry->vme_end < copy->offset + head_size) {
10272 			head_size = entry->vme_end - copy->offset;
10273 		}
10274 
10275 		head_copy->offset = copy->offset;
10276 		head_copy->size = head_size;
10277 		copy->offset += head_size;
10278 		copy->size -= head_size;
10279 		copy_size -= head_size;
10280 		assert(copy_size > 0);
10281 
10282 		vm_map_copy_clip_end(copy, entry, copy->offset);
10283 		vm_map_copy_entry_unlink(copy, entry);
10284 		vm_map_copy_entry_link(head_copy,
10285 		    vm_map_copy_to_entry(head_copy),
10286 		    entry);
10287 
10288 		/*
10289 		 * Do the unaligned copy.
10290 		 */
10291 		kr = vm_map_copy_overwrite_nested(dst_map,
10292 		    head_addr,
10293 		    head_copy,
10294 		    interruptible,
10295 		    (pmap_t) NULL,
10296 		    FALSE);
10297 		if (kr != KERN_SUCCESS) {
10298 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_HEAD_NESTED_ERROR), kr /* arg */);
10299 			goto done;
10300 		}
10301 	}
10302 
10303 	if (tail_size) {
10304 		/*
10305 		 * Extract "tail_copy" out of "copy".
10306 		 */
10307 		tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10308 		tail_copy->cpy_hdr.entries_pageable =
10309 		    copy->cpy_hdr.entries_pageable;
10310 		tail_copy->cpy_hdr.page_shift = copy_page_shift;
10311 
10312 		tail_copy->offset = copy->offset + copy_size - tail_size;
10313 		tail_copy->size = tail_size;
10314 
10315 		copy->size -= tail_size;
10316 		copy_size -= tail_size;
10317 		assert(copy_size > 0);
10318 
10319 		entry = vm_map_copy_last_entry(copy);
10320 		vm_map_copy_clip_start(copy, entry, tail_copy->offset);
10321 		entry = vm_map_copy_last_entry(copy);
10322 		vm_map_copy_entry_unlink(copy, entry);
10323 		vm_map_copy_entry_link(tail_copy,
10324 		    vm_map_copy_last_entry(tail_copy),
10325 		    entry);
10326 	}
10327 
10328 	/*
10329 	 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
10330 	 * we want to avoid TOCTOU issues w.r.t. copy->size, but
10331 	 * we don't need to change vm_map_copy_overwrite_nested()
10332 	 * and all other vm_map_copy_overwrite variants.
10333 	 *
10334 	 * So we assign the original copy_size that was passed into
10335 	 * this routine back to copy.
10336 	 *
10337 	 * This use of the local 'copy_size' passed into this routine is
10338 	 * to try to protect against TOCTOU attacks where the kernel
10339 	 * has been exploited. We don't expect this to be an issue
10340 	 * during normal system operation.
10341 	 */
10342 	assertf(copy->size == copy_size,
10343 	    "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
10344 	copy->size = copy_size;
10345 
10346 	/*
10347 	 * Copy most (or possibly all) of the data.
10348 	 */
10349 	kr = vm_map_copy_overwrite_nested(dst_map,
10350 	    dst_addr + head_size,
10351 	    copy,
10352 	    interruptible,
10353 	    (pmap_t) NULL,
10354 	    FALSE);
10355 	if (kr != KERN_SUCCESS) {
10356 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_NESTED_ERROR), kr /* arg */);
10357 		goto done;
10358 	}
10359 
10360 	if (tail_size) {
10361 		kr = vm_map_copy_overwrite_nested(dst_map,
10362 		    tail_addr,
10363 		    tail_copy,
10364 		    interruptible,
10365 		    (pmap_t) NULL,
10366 		    FALSE);
10367 		if (kr) {
10368 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_TAIL_NESTED_ERROR), kr /* arg */);
10369 		}
10370 	}
10371 
10372 done:
10373 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
10374 	if (kr == KERN_SUCCESS) {
10375 		/*
10376 		 * Discard all the copy maps.
10377 		 */
10378 		if (head_copy) {
10379 			vm_map_copy_discard(head_copy);
10380 			head_copy = NULL;
10381 		}
10382 		vm_map_copy_discard(copy);
10383 		if (tail_copy) {
10384 			vm_map_copy_discard(tail_copy);
10385 			tail_copy = NULL;
10386 		}
10387 	} else {
10388 		/*
10389 		 * Re-assemble the original copy map.
10390 		 */
10391 		if (head_copy) {
10392 			entry = vm_map_copy_first_entry(head_copy);
10393 			vm_map_copy_entry_unlink(head_copy, entry);
10394 			vm_map_copy_entry_link(copy,
10395 			    vm_map_copy_to_entry(copy),
10396 			    entry);
10397 			copy->offset -= head_size;
10398 			copy->size += head_size;
10399 			vm_map_copy_discard(head_copy);
10400 			head_copy = NULL;
10401 		}
10402 		if (tail_copy) {
10403 			entry = vm_map_copy_last_entry(tail_copy);
10404 			vm_map_copy_entry_unlink(tail_copy, entry);
10405 			vm_map_copy_entry_link(copy,
10406 			    vm_map_copy_last_entry(copy),
10407 			    entry);
10408 			copy->size += tail_size;
10409 			vm_map_copy_discard(tail_copy);
10410 			tail_copy = NULL;
10411 		}
10412 	}
10413 	return kr;
10414 }
10415 
10416 
10417 /*
10418  *	Routine: vm_map_copy_overwrite_unaligned	[internal use only]
10419  *
10420  *	Decription:
10421  *	Description:
10422  *	Physically copy unaligned data.
10423  *	Implementation:
10424  *	Unaligned parts of pages have to be physically copied.  We use
10425  *	a modified form of vm_fault_copy (which understands none-aligned
10426  *	a modified form of vm_fault_copy (which understands non-aligned
10427  *	page offsets and sizes) to do the copy.  We attempt to copy as
10428  *	much memory in one go as possible; however, vm_fault_copy copies
10429  *	within one memory object, so we have to find the smallest of "amount
10430  *	left", "source object data size" and "target object data size".  With
10431  *	unaligned data we don't need to split regions, therefore the source
10432  *	(copy) object should be one map entry; the target range may be split
10433  *	over multiple map entries, however.  In any event we are pessimistic
10434  *
10435  *	Callers of this function must call vm_map_copy_require on
10436  *	previously created vm_map_copy_t or pass a newly created
10437  *	one to ensure that it hasn't been forged.
10438  *
10439  *	Assumptions:
10440  *	dst_map is locked on entry and is return locked on success,
10441  *	unlocked on error.
10442  */
10443 
10444 static kern_return_t
10445 vm_map_copy_overwrite_unaligned(
10446 	vm_map_t        dst_map,
10447 	vm_map_entry_t  entry,
10448 	vm_map_copy_t   copy,
10449 	vm_map_offset_t start,
10450 	boolean_t       discard_on_success)
10451 {
10452 	vm_map_entry_t          copy_entry;
10453 	vm_map_entry_t          copy_entry_next;
10454 	vm_map_version_t        version;
10455 	vm_object_t             dst_object;
10456 	vm_object_offset_t      dst_offset;
10457 	vm_object_offset_t      src_offset;
10458 	vm_object_offset_t      entry_offset;
10459 	vm_map_offset_t         entry_end;
10460 	vm_map_size_t           src_size,
10461 	    dst_size,
10462 	    copy_size,
10463 	    amount_left;
10464 	kern_return_t           kr = KERN_SUCCESS;
10465 
10466 
10467 	copy_entry = vm_map_copy_first_entry(copy);
10468 
10469 	vm_map_lock_write_to_read(dst_map);
10470 
10471 	src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
10472 	amount_left = copy->size;
10473 /*
10474  *	unaligned, so we never clipped this entry; we need the offset into
10475  *	the vm_object, not just the data.
10476  */
10477 	while (amount_left > 0) {
10478 		if (entry == vm_map_to_entry(dst_map)) {
10479 			vm_map_unlock_read(dst_map);
10480 			return KERN_INVALID_ADDRESS;
10481 		}
10482 
10483 		/* "start" must be within the current map entry */
10484 		assert((start >= entry->vme_start) && (start < entry->vme_end));
10485 
10486 		/*
10487 		 *	Check protection again
10488 		 */
10489 		if (!(entry->protection & VM_PROT_WRITE)) {
10490 			vm_map_unlock_read(dst_map);
10491 			return KERN_PROTECTION_FAILURE;
10492 		}
10493 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10494 			vm_map_unlock_read(dst_map);
10495 			return KERN_PROTECTION_FAILURE;
10496 		}
10497 
10498 		/*
10499 		 *	If the entry is in transition, we must wait
10500 		 *	for it to exit that state.  Anything could happen
10501 		 *	when we unlock the map, so start over.
10502 		 */
10503 		if (entry->in_transition) {
10504 			/*
10505 			 * Say that we are waiting, and wait for entry.
10506 			 */
10507 			entry->needs_wakeup = TRUE;
10508 			vm_map_entry_wait(dst_map, THREAD_UNINT);
10509 
10510 			goto RetryLookup;
10511 		}
10512 
10513 		dst_offset = start - entry->vme_start;
10514 
10515 		dst_size = entry->vme_end - start;
10516 
10517 		src_size = copy_entry->vme_end -
10518 		    (copy_entry->vme_start + src_offset);
10519 
10520 		if (dst_size < src_size) {
10521 /*
10522  *			we can only copy dst_size bytes before
10523  *			we have to get the next destination entry
10524  */
10525 			copy_size = dst_size;
10526 		} else {
10527 /*
10528  *			we can only copy src_size bytes before
10529  *			we have to get the next source copy entry
10530  */
10531 			copy_size = src_size;
10532 		}
10533 
10534 		if (copy_size > amount_left) {
10535 			copy_size = amount_left;
10536 		}
10537 /*
10538  *		Entry needs copy: create a shadow object for the
10539  *		copy-on-write region.
10540  */
10541 		if (entry->needs_copy) {
10542 			if (vm_map_lock_read_to_write(dst_map)) {
10543 				vm_map_lock_read(dst_map);
10544 				goto RetryLookup;
10545 			}
10546 			VME_OBJECT_SHADOW(entry,
10547 			    (vm_map_size_t)(entry->vme_end
10548 			    - entry->vme_start),
10549 			    vm_map_always_shadow(dst_map));
10550 			entry->needs_copy = FALSE;
10551 			vm_map_lock_write_to_read(dst_map);
10552 		}
10553 		dst_object = VME_OBJECT(entry);
10554 /*
10555  *		unlike with the virtual (aligned) copy, we're going
10556  *		to fault on it, therefore we need a target object.
10557  */
10558 		if (dst_object == VM_OBJECT_NULL) {
10559 			if (vm_map_lock_read_to_write(dst_map)) {
10560 				vm_map_lock_read(dst_map);
10561 				goto RetryLookup;
10562 			}
10563 			dst_object = vm_object_allocate((vm_map_size_t)
10564 			    entry->vme_end - entry->vme_start);
10565 			VME_OBJECT_SET(entry, dst_object, false, 0);
10566 			VME_OFFSET_SET(entry, 0);
10567 			assert(entry->use_pmap);
10568 			vm_map_lock_write_to_read(dst_map);
10569 		}
10570 /*
10571  *		Take an object reference and unlock map. The "entry" may
10572  *		disappear or change when the map is unlocked.
10573  */
10574 		vm_object_reference(dst_object);
10575 		version.main_timestamp = dst_map->timestamp;
10576 		entry_offset = VME_OFFSET(entry);
10577 		entry_end = entry->vme_end;
10578 		vm_map_unlock_read(dst_map);
10579 /*
10580  *		Copy as much as possible in one pass
10581  */
10582 		kr = vm_fault_copy(
10583 			VME_OBJECT(copy_entry),
10584 			VME_OFFSET(copy_entry) + src_offset,
10585 			&copy_size,
10586 			dst_object,
10587 			entry_offset + dst_offset,
10588 			dst_map,
10589 			&version,
10590 			THREAD_UNINT );
10591 
10592 		start += copy_size;
10593 		src_offset += copy_size;
10594 		amount_left -= copy_size;
10595 /*
10596  *		Release the object reference
10597  */
10598 		vm_object_deallocate(dst_object);
10599 /*
10600  *		If a hard error occurred, return it now
10601  */
10602 		if (kr != KERN_SUCCESS) {
10603 			return kr;
10604 		}
10605 
10606 		if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
10607 		    || amount_left == 0) {
10608 /*
10609  *			all done with this copy entry, dispose.
10610  */
10611 			copy_entry_next = copy_entry->vme_next;
10612 
10613 			if (discard_on_success) {
10614 				vm_map_copy_entry_unlink(copy, copy_entry);
10615 				assert(!copy_entry->is_sub_map);
10616 				vm_object_deallocate(VME_OBJECT(copy_entry));
10617 				vm_map_copy_entry_dispose(copy_entry);
10618 			}
10619 
10620 			if (copy_entry_next == vm_map_copy_to_entry(copy) &&
10621 			    amount_left) {
10622 /*
10623  *				not finished copying but ran out of source
10624  */
10625 				return KERN_INVALID_ADDRESS;
10626 			}
10627 
10628 			copy_entry = copy_entry_next;
10629 
10630 			src_offset = 0;
10631 		}
10632 
10633 		if (amount_left == 0) {
10634 			return KERN_SUCCESS;
10635 		}
10636 
10637 		vm_map_lock_read(dst_map);
10638 		if (version.main_timestamp == dst_map->timestamp) {
10639 			if (start == entry_end) {
10640 /*
10641  *				destination region is split.  Use the version
10642  *				information to avoid a lookup in the normal
10643  *				case.
10644  */
10645 				entry = entry->vme_next;
10646 /*
10647  *				should be contiguous. Fail if we encounter
10648  *				a hole in the destination.
10649  */
10650 				if (start != entry->vme_start) {
10651 					vm_map_unlock_read(dst_map);
10652 					return KERN_INVALID_ADDRESS;
10653 				}
10654 			}
10655 		} else {
10656 /*
10657  *			Map version check failed.
10658  *			We must look up the entry because somebody
10659  *			might have changed the map behind our backs.
10660  */
10661 RetryLookup:
10662 			if (!vm_map_lookup_entry(dst_map, start, &entry)) {
10663 				vm_map_unlock_read(dst_map);
10664 				return KERN_INVALID_ADDRESS;
10665 			}
10666 		}
10667 	}/* while */
10668 
10669 	return KERN_SUCCESS;
10670 }/* vm_map_copy_overwrite_unaligned */
10671 
10672 /*
10673  *	Routine: vm_map_copy_overwrite_aligned	[internal use only]
10674  *
10675  *	Description:
10676  *	Does all the vm_trickery possible for whole pages.
10677  *
10678  *	Implementation:
10679  *
10680  *	If there are no permanent objects in the destination,
10681  *	and the source and destination map entry zones match,
10682  *	and the destination map entry is not shared,
10683  *	then the map entries can be deleted and replaced
10684  *	with those from the copy.  The following code is the
10685  *	basic idea of what to do, but there are lots of annoying
10686  *	little details about getting protection and inheritance
10687  *	right.  Should add protection, inheritance, and sharing checks
10688  *	to the above pass and make sure that no wiring is involved.
10689  *
10690  *	Callers of this function must call vm_map_copy_require on
10691  *	previously created vm_map_copy_t or pass a newly created
10692  *	one to ensure that it hasn't been forged.
10693  */
10694 
10695 int vm_map_copy_overwrite_aligned_src_not_internal = 0;
10696 int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
10697 int vm_map_copy_overwrite_aligned_src_large = 0;
10698 
10699 static kern_return_t
10700 vm_map_copy_overwrite_aligned(
10701 	vm_map_t        dst_map,
10702 	vm_map_entry_t  tmp_entry,
10703 	vm_map_copy_t   copy,
10704 	vm_map_offset_t start,
10705 	__unused pmap_t pmap)
10706 {
10707 	vm_object_t     object;
10708 	vm_map_entry_t  copy_entry;
10709 	vm_map_size_t   copy_size;
10710 	vm_map_size_t   size;
10711 	vm_map_entry_t  entry;
10712 
10713 	while ((copy_entry = vm_map_copy_first_entry(copy))
10714 	    != vm_map_copy_to_entry(copy)) {
10715 		copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10716 
10717 		entry = tmp_entry;
10718 		if (entry->is_sub_map) {
10719 			/* unnested when clipped earlier */
10720 			assert(!entry->use_pmap);
10721 		}
10722 		if (entry == vm_map_to_entry(dst_map)) {
10723 			vm_map_unlock(dst_map);
10724 			return KERN_INVALID_ADDRESS;
10725 		}
10726 		size = (entry->vme_end - entry->vme_start);
10727 		/*
10728 		 *	Make sure that no holes popped up in the
10729 		 *	address map, and that the protection is
10730 		 *	still valid, in case the map was unlocked
10731 		 *	earlier.
10732 		 */
10733 
10734 		if ((entry->vme_start != start) || ((entry->is_sub_map)
10735 		    && !entry->needs_copy)) {
10736 			vm_map_unlock(dst_map);
10737 			return KERN_INVALID_ADDRESS;
10738 		}
10739 		assert(entry != vm_map_to_entry(dst_map));
10740 
10741 		/*
10742 		 *	Check protection again
10743 		 */
10744 
10745 		if (!(entry->protection & VM_PROT_WRITE)) {
10746 			vm_map_unlock(dst_map);
10747 			return KERN_PROTECTION_FAILURE;
10748 		}
10749 
10750 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10751 			vm_map_unlock(dst_map);
10752 			return KERN_PROTECTION_FAILURE;
10753 		}
10754 
10755 		/*
10756 		 *	If the entry is in transition, we must wait
10757 		 *	for it to exit that state.  Anything could happen
10758 		 *	when we unlock the map, so start over.
10759 		 */
10760 		if (entry->in_transition) {
10761 			/*
10762 			 * Say that we are waiting, and wait for entry.
10763 			 */
10764 			entry->needs_wakeup = TRUE;
10765 			vm_map_entry_wait(dst_map, THREAD_UNINT);
10766 
10767 			goto RetryLookup;
10768 		}
10769 
10770 		/*
10771 		 *	Adjust to source size first
10772 		 */
10773 
10774 		if (copy_size < size) {
10775 			if (entry->map_aligned &&
10776 			    !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10777 			    VM_MAP_PAGE_MASK(dst_map))) {
10778 				/* no longer map-aligned */
10779 				entry->map_aligned = FALSE;
10780 			}
10781 			vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10782 			size = copy_size;
10783 		}
10784 
10785 		/*
10786 		 *	Adjust to destination size
10787 		 */
10788 
10789 		if (size < copy_size) {
10790 			vm_map_copy_clip_end(copy, copy_entry,
10791 			    copy_entry->vme_start + size);
10792 			copy_size = size;
10793 		}
10794 
10795 		assert((entry->vme_end - entry->vme_start) == size);
10796 		assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10797 		assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10798 
10799 		/*
10800 		 *	If the destination contains temporary unshared memory,
10801 		 *	we can perform the copy by throwing it away and
10802 		 *	installing the source data.
10803 		 *
10804 		 *	Exceptions for mappings with special semantics:
10805 		 *	+ "permanent" entries,
10806 		 *	+ JIT regions,
10807 		 *	+ TPRO regions,
10808 		 *      + pmap-specific protection policies,
10809 		 *	+ VM objects with COPY_NONE copy strategy.
10810 		 */
10811 
10812 		object = VME_OBJECT(entry);
10813 		if ((!entry->is_shared &&
10814 		    !entry->vme_permanent &&
10815 		    !entry->used_for_jit &&
10816 #if __arm64e__
10817 		    !entry->used_for_tpro &&
10818 #endif /* __arm64e__ */
10819 		    !(entry->protection & VM_PROT_EXECUTE) &&
10820 		    !pmap_has_prot_policy(dst_map->pmap, entry->translated_allow_execute, entry->protection) &&
10821 		    ((object == VM_OBJECT_NULL) ||
10822 		    (object->internal &&
10823 		    !object->true_share &&
10824 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE))) ||
10825 		    entry->needs_copy) {
10826 			vm_object_t     old_object = VME_OBJECT(entry);
10827 			vm_object_offset_t      old_offset = VME_OFFSET(entry);
10828 			vm_object_offset_t      offset;
10829 
10830 			/*
10831 			 * Ensure that the source and destination aren't
10832 			 * identical
10833 			 */
10834 			if (old_object == VME_OBJECT(copy_entry) &&
10835 			    old_offset == VME_OFFSET(copy_entry)) {
10836 				vm_map_copy_entry_unlink(copy, copy_entry);
10837 				vm_map_copy_entry_dispose(copy_entry);
10838 
10839 				if (old_object != VM_OBJECT_NULL) {
10840 					vm_object_deallocate(old_object);
10841 				}
10842 
10843 				start = tmp_entry->vme_end;
10844 				tmp_entry = tmp_entry->vme_next;
10845 				continue;
10846 			}
10847 
10848 #if XNU_TARGET_OS_OSX
10849 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10850 #define __TRADEOFF1_COPY_SIZE (128 * 1024)      /* 128 KB */
10851 			if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10852 			    VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10853 			    copy_size <= __TRADEOFF1_COPY_SIZE) {
10854 				/*
10855 				 * Virtual vs. Physical copy tradeoff #1.
10856 				 *
10857 				 * Copying only a few pages out of a large
10858 				 * object:  do a physical copy instead of
10859 				 * a virtual copy, to avoid possibly keeping
10860 				 * the entire large object alive because of
10861 				 * those few copy-on-write pages.
10862 				 */
10863 				vm_map_copy_overwrite_aligned_src_large++;
10864 				goto slow_copy;
10865 			}
10866 #endif /* XNU_TARGET_OS_OSX */
10867 
10868 			if ((dst_map->pmap != kernel_pmap) &&
10869 			    (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10870 			    (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10871 				vm_object_t new_object, new_shadow;
10872 
10873 				/*
10874 				 * We're about to map something over a mapping
10875 				 * established by malloc()...
10876 				 */
10877 				new_object = VME_OBJECT(copy_entry);
10878 				if (new_object != VM_OBJECT_NULL) {
10879 					vm_object_lock_shared(new_object);
10880 				}
10881 				while (new_object != VM_OBJECT_NULL &&
10882 #if XNU_TARGET_OS_OSX
10883 				    !new_object->true_share &&
10884 				    new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10885 #endif /* XNU_TARGET_OS_OSX */
10886 				    new_object->internal) {
10887 					new_shadow = new_object->shadow;
10888 					if (new_shadow == VM_OBJECT_NULL) {
10889 						break;
10890 					}
10891 					vm_object_lock_shared(new_shadow);
10892 					vm_object_unlock(new_object);
10893 					new_object = new_shadow;
10894 				}
10895 				if (new_object != VM_OBJECT_NULL) {
10896 					if (!new_object->internal) {
10897 						/*
10898 						 * The new mapping is backed
10899 						 * by an external object.  We
10900 						 * don't want malloc'ed memory
10901 						 * to be replaced with such a
10902 						 * non-anonymous mapping, so
10903 						 * let's go off the optimized
10904 						 * path...
10905 						 */
10906 						vm_map_copy_overwrite_aligned_src_not_internal++;
10907 						vm_object_unlock(new_object);
10908 						goto slow_copy;
10909 					}
10910 #if XNU_TARGET_OS_OSX
10911 					if (new_object->true_share ||
10912 					    new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10913 						/*
10914 						 * Same if there's a "true_share"
10915 						 * object in the shadow chain, or
10916 						 * an object with a non-default
10917 						 * (SYMMETRIC) copy strategy.
10918 						 */
10919 						vm_map_copy_overwrite_aligned_src_not_symmetric++;
10920 						vm_object_unlock(new_object);
10921 						goto slow_copy;
10922 					}
10923 #endif /* XNU_TARGET_OS_OSX */
10924 					vm_object_unlock(new_object);
10925 				}
10926 				/*
10927 				 * The new mapping is still backed by
10928 				 * anonymous (internal) memory, so it's
10929 				 * OK to substitute it for the original
10930 				 * malloc() mapping.
10931 				 */
10932 			}
10933 
10934 			if (old_object != VM_OBJECT_NULL) {
10935 				assert(!entry->vme_permanent);
10936 				if (entry->is_sub_map) {
10937 					if (entry->use_pmap) {
10938 #ifndef NO_NESTED_PMAP
10939 						pmap_unnest(dst_map->pmap,
10940 						    (addr64_t)entry->vme_start,
10941 						    entry->vme_end - entry->vme_start);
10942 #endif  /* NO_NESTED_PMAP */
10943 						if (dst_map->mapped_in_other_pmaps) {
10944 							/* clean up parent */
10945 							/* map/maps */
10946 							vm_map_submap_pmap_clean(
10947 								dst_map, entry->vme_start,
10948 								entry->vme_end,
10949 								VME_SUBMAP(entry),
10950 								VME_OFFSET(entry));
10951 						}
10952 					} else {
10953 						vm_map_submap_pmap_clean(
10954 							dst_map, entry->vme_start,
10955 							entry->vme_end,
10956 							VME_SUBMAP(entry),
10957 							VME_OFFSET(entry));
10958 					}
10959 					vm_map_deallocate(VME_SUBMAP(entry));
10960 				} else {
10961 					if (dst_map->mapped_in_other_pmaps) {
10962 						vm_object_pmap_protect_options(
10963 							VME_OBJECT(entry),
10964 							VME_OFFSET(entry),
10965 							entry->vme_end
10966 							- entry->vme_start,
10967 							PMAP_NULL,
10968 							PAGE_SIZE,
10969 							entry->vme_start,
10970 							VM_PROT_NONE,
10971 							PMAP_OPTIONS_REMOVE);
10972 					} else {
10973 						pmap_remove_options(
10974 							dst_map->pmap,
10975 							(addr64_t)(entry->vme_start),
10976 							(addr64_t)(entry->vme_end),
10977 							PMAP_OPTIONS_REMOVE);
10978 					}
10979 					vm_object_deallocate(old_object);
10980 				}
10981 			}
10982 
10983 			if (entry->iokit_acct) {
10984 				/* keep using iokit accounting */
10985 				entry->use_pmap = FALSE;
10986 			} else {
10987 				/* use pmap accounting */
10988 				entry->use_pmap = TRUE;
10989 			}
10990 			assert(!entry->vme_permanent);
10991 			VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
10992 			object = VME_OBJECT(entry);
10993 			entry->needs_copy = copy_entry->needs_copy;
10994 			entry->wired_count = 0;
10995 			entry->user_wired_count = 0;
10996 			offset = VME_OFFSET(copy_entry);
10997 			VME_OFFSET_SET(entry, offset);
10998 
10999 			vm_map_copy_entry_unlink(copy, copy_entry);
11000 			vm_map_copy_entry_dispose(copy_entry);
11001 
11002 			/*
11003 			 * We could try to push pages into the pmap at this point, BUT
11004 			 * this optimization only saved on average 2 us per page if ALL
11005 			 * the pages in the source were currently mapped and ALL the pages
11006 			 * in the dest were touched.  If fewer than 2/3 of the pages were
11007 			 * touched, this optimization actually cost more cycles.  It also puts
11008 			 * a lot of pressure on the pmap layer w.r.t. mapping structures.
11009 			 */
11010 
11011 			/*
11012 			 *	Set up for the next iteration.  The map
11013 			 *	has not been unlocked, so the next
11014 			 *	address should be at the end of this
11015 			 *	entry, and the next map entry should be
11016 			 *	the one following it.
11017 			 */
11018 
11019 			start = tmp_entry->vme_end;
11020 			tmp_entry = tmp_entry->vme_next;
11021 		} else {
11022 			vm_map_version_t        version;
11023 			vm_object_t             dst_object;
11024 			vm_object_offset_t      dst_offset;
11025 			kern_return_t           r;
11026 
11027 slow_copy:
11028 			if (entry->needs_copy) {
11029 				VME_OBJECT_SHADOW(entry,
11030 				    (entry->vme_end -
11031 				    entry->vme_start),
11032 				    vm_map_always_shadow(dst_map));
11033 				entry->needs_copy = FALSE;
11034 			}
11035 
11036 			dst_object = VME_OBJECT(entry);
11037 			dst_offset = VME_OFFSET(entry);
11038 
11039 			/*
11040 			 *	Take an object reference, and record
11041 			 *	the map version information so that the
11042 			 *	map can be safely unlocked.
11043 			 */
11044 
11045 			if (dst_object == VM_OBJECT_NULL) {
11046 				/*
11047 				 * We would usually have just taken the
11048 				 * optimized path above if the destination
11049 				 * object has not been allocated yet.  But we
11050 				 * now disable that optimization if the copy
11051 				 * entry's object is not backed by anonymous
11052 				 * memory to avoid replacing malloc'ed
11053 				 * (i.e. re-usable) anonymous memory with a
11054 				 * not-so-anonymous mapping.
11055 				 * So we have to handle this case here and
11056 				 * allocate a new VM object for this map entry.
11057 				 */
11058 				dst_object = vm_object_allocate(
11059 					entry->vme_end - entry->vme_start);
11060 				dst_offset = 0;
11061 				VME_OBJECT_SET(entry, dst_object, false, 0);
11062 				VME_OFFSET_SET(entry, dst_offset);
11063 				assert(entry->use_pmap);
11064 			}
11065 
11066 			vm_object_reference(dst_object);
11067 
11068 			/* account for unlock bumping up timestamp */
11069 			version.main_timestamp = dst_map->timestamp + 1;
11070 
11071 			vm_map_unlock(dst_map);
11072 
11073 			/*
11074 			 *	Copy as much as possible in one pass
11075 			 */
11076 
11077 			copy_size = size;
11078 			r = vm_fault_copy(
11079 				VME_OBJECT(copy_entry),
11080 				VME_OFFSET(copy_entry),
11081 				&copy_size,
11082 				dst_object,
11083 				dst_offset,
11084 				dst_map,
11085 				&version,
11086 				THREAD_UNINT );
11087 
11088 			/*
11089 			 *	Release the object reference
11090 			 */
11091 
11092 			vm_object_deallocate(dst_object);
11093 
11094 			/*
11095 			 *	If a hard error occurred, return it now
11096 			 */
11097 
11098 			if (r != KERN_SUCCESS) {
11099 				return r;
11100 			}
11101 
11102 			if (copy_size != 0) {
11103 				/*
11104 				 *	Dispose of the copied region
11105 				 */
11106 
11107 				vm_map_copy_clip_end(copy, copy_entry,
11108 				    copy_entry->vme_start + copy_size);
11109 				vm_map_copy_entry_unlink(copy, copy_entry);
11110 				vm_object_deallocate(VME_OBJECT(copy_entry));
11111 				vm_map_copy_entry_dispose(copy_entry);
11112 			}
11113 
11114 			/*
11115 			 *	Pick up in the destination map where we left off.
11116 			 *
11117 			 *	Use the version information to avoid a lookup
11118 			 *	in the normal case.
11119 			 */
11120 
11121 			start += copy_size;
11122 			vm_map_lock(dst_map);
11123 			if (version.main_timestamp == dst_map->timestamp &&
11124 			    copy_size != 0) {
11125 				/* We can safely use saved tmp_entry value */
11126 
11127 				if (tmp_entry->map_aligned &&
11128 				    !VM_MAP_PAGE_ALIGNED(
11129 					    start,
11130 					    VM_MAP_PAGE_MASK(dst_map))) {
11131 					/* no longer map-aligned */
11132 					tmp_entry->map_aligned = FALSE;
11133 				}
11134 				vm_map_clip_end(dst_map, tmp_entry, start);
11135 				tmp_entry = tmp_entry->vme_next;
11136 			} else {
11137 				/* Must do lookup of tmp_entry */
11138 
11139 RetryLookup:
11140 				if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
11141 					vm_map_unlock(dst_map);
11142 					return KERN_INVALID_ADDRESS;
11143 				}
11144 				if (tmp_entry->map_aligned &&
11145 				    !VM_MAP_PAGE_ALIGNED(
11146 					    start,
11147 					    VM_MAP_PAGE_MASK(dst_map))) {
11148 					/* no longer map-aligned */
11149 					tmp_entry->map_aligned = FALSE;
11150 				}
11151 				vm_map_clip_start(dst_map, tmp_entry, start);
11152 			}
11153 		}
11154 	}/* while */
11155 
11156 	return KERN_SUCCESS;
11157 }/* vm_map_copy_overwrite_aligned */
11158 
11159 /*
11160  *	Routine: vm_map_copyin_kernel_buffer [internal use only]
11161  *
11162  *	Description:
11163  *		Copy in data to a kernel buffer from space in the
11164  *		source map. The original space may be optionally
11165  *		deallocated.
11166  *
11167  *		If successful, returns a new copy object.
11168  */
11169 static kern_return_t
11170 vm_map_copyin_kernel_buffer(
11171 	vm_map_t        src_map,
11172 	vm_map_offset_t src_addr,
11173 	vm_map_size_t   len,
11174 	boolean_t       src_destroy,
11175 	vm_map_copy_t   *copy_result)
11176 {
11177 	kern_return_t kr;
11178 	vm_map_copy_t copy;
11179 	void *kdata;
11180 
11181 	if (len > msg_ool_size_small) {
11182 		return KERN_INVALID_ARGUMENT;
11183 	}
11184 
11185 	kdata = kalloc_data(len, Z_WAITOK);
11186 	if (kdata == NULL) {
11187 		return KERN_RESOURCE_SHORTAGE;
11188 	}
11189 	kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
11190 	if (kr != KERN_SUCCESS) {
11191 		kfree_data(kdata, len);
11192 		return kr;
11193 	}
11194 
11195 	copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
11196 	copy->cpy_kdata = kdata;
11197 	copy->size = len;
11198 	copy->offset = 0;
11199 
11200 	if (src_destroy) {
11201 		vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
11202 
11203 		if (src_map == kernel_map) {
11204 			flags |= VM_MAP_REMOVE_KUNWIRE;
11205 		}
11206 
11207 		(void)vm_map_remove_guard(src_map,
11208 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11209 		    vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
11210 		    flags, KMEM_GUARD_NONE);
11211 	}
11212 
11213 	*copy_result = copy;
11214 	return KERN_SUCCESS;
11215 }
11216 
11217 /*
11218  *	Routine: vm_map_copyout_kernel_buffer	[internal use only]
11219  *
11220  *	Description:
11221  *		Copy out data from a kernel buffer into space in the
11222  *		destination map. The space may optionally be dynamically
11223  *		allocated.
11224  *
11225  *		If successful, consumes the copy object.
11226  *		Otherwise, the caller is responsible for it.
11227  *
11228  *		Callers of this function must call vm_map_copy_require on
11229  *		previously created vm_map_copy_t or pass a newly created
11230  *		one to ensure that it hasn't been forged.
11231  */
11232 static int vm_map_copyout_kernel_buffer_failures = 0;
11233 static kern_return_t
11234 vm_map_copyout_kernel_buffer(
11235 	vm_map_t                map,
11236 	vm_map_address_t        *addr,  /* IN/OUT */
11237 	vm_map_copy_t           copy,
11238 	vm_map_size_t           copy_size,
11239 	boolean_t               overwrite,
11240 	boolean_t               consume_on_success)
11241 {
11242 	kern_return_t kr = KERN_SUCCESS;
11243 	thread_t thread = current_thread();
11244 
11245 	assert(copy->size == copy_size);
11246 
11247 	/*
11248 	 * check for corrupted vm_map_copy structure
11249 	 */
11250 	if (copy_size > msg_ool_size_small || copy->offset) {
11251 		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
11252 		    (long long)copy->size, (long long)copy->offset);
11253 	}
11254 
11255 	if (!overwrite) {
11256 		/*
11257 		 * Allocate space in the target map for the data
11258 		 */
11259 		vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11260 
11261 		if (map == kernel_map) {
11262 			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
11263 		}
11264 
11265 		*addr = 0;
11266 		kr = vm_map_enter(map,
11267 		    addr,
11268 		    vm_map_round_page(copy_size,
11269 		    VM_MAP_PAGE_MASK(map)),
11270 		    (vm_map_offset_t) 0,
11271 		    vmk_flags,
11272 		    VM_OBJECT_NULL,
11273 		    (vm_object_offset_t) 0,
11274 		    FALSE,
11275 		    VM_PROT_DEFAULT,
11276 		    VM_PROT_ALL,
11277 		    VM_INHERIT_DEFAULT);
11278 		if (kr != KERN_SUCCESS) {
11279 			return kr;
11280 		}
11281 #if KASAN
11282 		if (map->pmap == kernel_pmap) {
11283 			kasan_notify_address(*addr, copy->size);
11284 		}
11285 #endif
11286 	}
11287 
11288 	/*
11289 	 * Copyout the data from the kernel buffer to the target map.
11290 	 */
11291 	if (thread->map == map) {
11292 		/*
11293 		 * If the target map is the current map, just do
11294 		 * the copy.
11295 		 */
11296 		assert((vm_size_t)copy_size == copy_size);
11297 		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11298 			kr = KERN_INVALID_ADDRESS;
11299 		}
11300 	} else {
11301 		vm_map_t oldmap;
11302 
11303 		/*
11304 		 * If the target map is another map, assume the
11305 		 * target's address space identity for the duration
11306 		 * of the copy.
11307 		 */
11308 		vm_map_reference(map);
11309 		oldmap = vm_map_switch(map);
11310 
11311 		assert((vm_size_t)copy_size == copy_size);
11312 		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11313 			vm_map_copyout_kernel_buffer_failures++;
11314 			kr = KERN_INVALID_ADDRESS;
11315 		}
11316 
11317 		(void) vm_map_switch(oldmap);
11318 		vm_map_deallocate(map);
11319 	}
11320 
11321 	if (kr != KERN_SUCCESS) {
11322 		/* the copy failed, clean up */
11323 		if (!overwrite) {
11324 			/*
11325 			 * Deallocate the space we allocated in the target map.
11326 			 */
11327 			(void) vm_map_remove(map,
11328 			    vm_map_trunc_page(*addr,
11329 			    VM_MAP_PAGE_MASK(map)),
11330 			    vm_map_round_page((*addr +
11331 			    vm_map_round_page(copy_size,
11332 			    VM_MAP_PAGE_MASK(map))),
11333 			    VM_MAP_PAGE_MASK(map)));
11334 			*addr = 0;
11335 		}
11336 	} else {
11337 		/* copy was successful, discard the copy structure */
11338 		if (consume_on_success) {
11339 			kfree_data(copy->cpy_kdata, copy_size);
11340 			zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11341 		}
11342 	}
11343 
11344 	return kr;
11345 }
11346 
11347 /*
11348  *	Routine:	vm_map_copy_insert      [internal use only]
11349  *
11350  *	Description:
11351  *		Link a copy chain ("copy") into a map at the
11352  *		specified location (after "where").
11353  *
11354  *		Callers of this function must call vm_map_copy_require on
11355  *		previously created vm_map_copy_t or pass a newly created
11356  *		one to ensure that it hasn't been forged.
11357  *	Side effects:
11358  *		The copy chain is destroyed.
11359  */
11360 static void
11361 vm_map_copy_insert(
11362 	vm_map_t        map,
11363 	vm_map_entry_t  after_where,
11364 	vm_map_copy_t   copy)
11365 {
11366 	vm_map_entry_t  entry;
11367 
11368 	while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11369 		entry = vm_map_copy_first_entry(copy);
11370 		vm_map_copy_entry_unlink(copy, entry);
11371 		vm_map_store_entry_link(map, after_where, entry,
11372 		    VM_MAP_KERNEL_FLAGS_NONE);
11373 		after_where = entry;
11374 	}
11375 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11376 }
11377 
11378 /*
11379  * Callers of this function must call vm_map_copy_require on
11380  * previously created vm_map_copy_t or pass a newly created
11381  * one to ensure that it hasn't been forged.
11382  */
11383 void
11384 vm_map_copy_remap(
11385 	vm_map_t        map,
11386 	vm_map_entry_t  where,
11387 	vm_map_copy_t   copy,
11388 	vm_map_offset_t adjustment,
11389 	vm_prot_t       cur_prot,
11390 	vm_prot_t       max_prot,
11391 	vm_inherit_t    inheritance)
11392 {
11393 	vm_map_entry_t  copy_entry, new_entry;
11394 
11395 	for (copy_entry = vm_map_copy_first_entry(copy);
11396 	    copy_entry != vm_map_copy_to_entry(copy);
11397 	    copy_entry = copy_entry->vme_next) {
11398 		/* get a new VM map entry for the map */
11399 		new_entry = vm_map_entry_create(map);
11400 		/* copy the "copy entry" to the new entry */
11401 		vm_map_entry_copy(map, new_entry, copy_entry);
11402 		/* adjust "start" and "end" */
11403 		new_entry->vme_start += adjustment;
11404 		new_entry->vme_end += adjustment;
11405 		/* clear some attributes */
11406 		new_entry->inheritance = inheritance;
11407 		new_entry->protection = cur_prot;
11408 		new_entry->max_protection = max_prot;
11409 		new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11410 		/* take an extra reference on the entry's "object" */
11411 		if (new_entry->is_sub_map) {
11412 			assert(!new_entry->use_pmap); /* not nested */
11413 			vm_map_reference(VME_SUBMAP(new_entry));
11414 		} else {
11415 			vm_object_reference(VME_OBJECT(new_entry));
11416 		}
11417 		/* insert the new entry in the map */
11418 		vm_map_store_entry_link(map, where, new_entry,
11419 		    VM_MAP_KERNEL_FLAGS_NONE);
11420 		/* continue inserting the "copy entries" after the new entry */
11421 		where = new_entry;
11422 	}
11423 }
11424 
11425 
11426 /*
11427  * Returns true if *size matches (or is in the range of) copy->size.
11428  * Upon returning true, the *size field is updated with the actual size of the
11429  * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11430  */
11431 boolean_t
11432 vm_map_copy_validate_size(
11433 	vm_map_t                dst_map,
11434 	vm_map_copy_t           copy,
11435 	vm_map_size_t           *size)
11436 {
11437 	if (copy == VM_MAP_COPY_NULL) {
11438 		return FALSE;
11439 	}
11440 
11441 	/*
11442 	 * Assert that the vm_map_copy is coming from the right
11443 	 * zone and hasn't been forged
11444 	 */
11445 	vm_map_copy_require(copy);
11446 
11447 	vm_map_size_t copy_sz = copy->size;
11448 	vm_map_size_t sz = *size;
11449 	switch (copy->type) {
11450 	case VM_MAP_COPY_KERNEL_BUFFER:
11451 		if (sz == copy_sz) {
11452 			return TRUE;
11453 		}
11454 		break;
11455 	case VM_MAP_COPY_ENTRY_LIST:
11456 		/*
11457 		 * potential page-size rounding prevents us from exactly
11458 		 * validating this flavor of vm_map_copy, but we can at least
11459 		 * assert that it's within a range.
11460 		 */
11461 		if (copy_sz >= sz &&
11462 		    copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11463 			*size = copy_sz;
11464 			return TRUE;
11465 		}
11466 		break;
11467 	default:
11468 		break;
11469 	}
11470 	return FALSE;
11471 }
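
/*
 * Typical usage (illustrative sketch only; "advertised_size" and
 * "rcv_addr" are hypothetical caller variables, not names from this
 * file): an IPC-style caller validates a sender-advertised size
 * against the copy object before committing to a copyout of that size.
 *
 *	vm_map_address_t rcv_addr = 0;
 *	vm_map_size_t size = advertised_size;
 *
 *	if (!vm_map_copy_validate_size(dst_map, copy, &size)) {
 *		return KERN_INVALID_ARGUMENT;
 *	}
 *	kr = vm_map_copyout_size(dst_map, &rcv_addr, copy, size);
 */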
11472 
11473 /*
11474  *	Routine:	vm_map_copyout_size
11475  *
11476  *	Description:
11477  *		Copy out a copy chain ("copy") into newly-allocated
11478  *		space in the destination map. Uses a prevalidated
11479  *		size for the copy object (vm_map_copy_validate_size).
11480  *
11481  *		If successful, consumes the copy object.
11482  *		Otherwise, the caller is responsible for it.
11483  */
11484 kern_return_t
11485 vm_map_copyout_size(
11486 	vm_map_t                dst_map,
11487 	vm_map_address_t        *dst_addr,      /* OUT */
11488 	vm_map_copy_t           copy,
11489 	vm_map_size_t           copy_size)
11490 {
11491 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11492 	           TRUE,                     /* consume_on_success */
11493 	           VM_PROT_DEFAULT,
11494 	           VM_PROT_ALL,
11495 	           VM_INHERIT_DEFAULT);
11496 }
11497 
11498 /*
11499  *	Routine:	vm_map_copyout
11500  *
11501  *	Description:
11502  *		Copy out a copy chain ("copy") into newly-allocated
11503  *		space in the destination map.
11504  *
11505  *		If successful, consumes the copy object.
11506  *		Otherwise, the caller is responsible for it.
11507  */
11508 kern_return_t
11509 vm_map_copyout(
11510 	vm_map_t                dst_map,
11511 	vm_map_address_t        *dst_addr,      /* OUT */
11512 	vm_map_copy_t           copy)
11513 {
11514 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11515 	           TRUE,                     /* consume_on_success */
11516 	           VM_PROT_DEFAULT,
11517 	           VM_PROT_ALL,
11518 	           VM_INHERIT_DEFAULT);
11519 }
11520 
11521 kern_return_t
11522 vm_map_copyout_internal(
11523 	vm_map_t                dst_map,
11524 	vm_map_address_t        *dst_addr,      /* OUT */
11525 	vm_map_copy_t           copy,
11526 	vm_map_size_t           copy_size,
11527 	boolean_t               consume_on_success,
11528 	vm_prot_t               cur_protection,
11529 	vm_prot_t               max_protection,
11530 	vm_inherit_t            inheritance)
11531 {
11532 	vm_map_size_t           size;
11533 	vm_map_size_t           adjustment;
11534 	vm_map_offset_t         start;
11535 	vm_object_offset_t      vm_copy_start;
11536 	vm_map_entry_t          last;
11537 	vm_map_entry_t          entry;
11538 	vm_map_copy_t           original_copy;
11539 	kern_return_t           kr;
11540 	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11541 
11542 	/*
11543 	 *	Check for null copy object.
11544 	 */
11545 
11546 	if (copy == VM_MAP_COPY_NULL) {
11547 		*dst_addr = 0;
11548 		return KERN_SUCCESS;
11549 	}
11550 
11551 	/*
11552 	 * Assert that the vm_map_copy is coming from the right
11553 	 * zone and hasn't been forged
11554 	 */
11555 	vm_map_copy_require(copy);
11556 
11557 	if (copy->size != copy_size) {
11558 		*dst_addr = 0;
11559 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR), KERN_FAILURE /* arg */);
11560 		return KERN_FAILURE;
11561 	}
11562 
11563 	/*
11564 	 *	Check for special kernel buffer allocated
11565 	 *	by new_ipc_kmsg_copyin.
11566 	 */
11567 
11568 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
11569 		kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr,
11570 		    copy, copy_size, FALSE,
11571 		    consume_on_success);
11572 		if (kr) {
11573 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */);
11574 		}
11575 		return kr;
11576 	}
11577 
11578 	original_copy = copy;
11579 	if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
11580 		vm_map_copy_t target_copy;
11581 		vm_map_offset_t overmap_start, overmap_end, trimmed_start;
11582 
11583 		target_copy = VM_MAP_COPY_NULL;
11584 		DEBUG4K_ADJUST("adjusting...\n");
11585 		kr = vm_map_copy_adjust_to_target(
11586 			copy,
11587 			0, /* offset */
11588 			copy->size, /* size */
11589 			dst_map,
11590 			TRUE, /* copy */
11591 			&target_copy,
11592 			&overmap_start,
11593 			&overmap_end,
11594 			&trimmed_start);
11595 		if (kr != KERN_SUCCESS) {
11596 			DEBUG4K_COPY("adjust failed 0x%x\n", kr);
11597 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */);
11598 			return kr;
11599 		}
11600 		DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
11601 		if (target_copy != copy) {
11602 			copy = target_copy;
11603 		}
11604 		copy_size = copy->size;
11605 	}
11606 
11607 	/*
11608 	 *	Find space for the data
11609 	 */
11610 
11611 	vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
11612 	    VM_MAP_COPY_PAGE_MASK(copy));
11613 	size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
11614 	    VM_MAP_COPY_PAGE_MASK(copy))
11615 	    - vm_copy_start;
11616 
11617 	vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map);
11618 
11619 	vm_map_lock(dst_map);
11620 	kr = vm_map_locate_space(dst_map, size, 0, vmk_flags,
11621 	    &start, &last);
11622 	if (kr != KERN_SUCCESS) {
11623 		vm_map_unlock(dst_map);
11624 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */);
11625 		return kr;
11626 	}
11627 
11628 	adjustment = start - vm_copy_start;
11629 	if (!consume_on_success) {
11630 		/*
11631 		 * We're not allowed to consume "copy", so we'll have to
11632 		 * copy its map entries into the destination map below.
11633 		 * No need to re-allocate map entries from the correct
11634 		 * (pageable or not) zone, since we'll get new map entries
11635 		 * during the transfer.
11636 		 * We'll also adjust the map entries's "start" and "end"
11637 		 * during the transfer, to keep "copy"'s entries consistent
11638 		 * with its "offset".
11639 		 */
11640 		goto after_adjustments;
11641 	}
11642 
11643 	/*
11644 	 *	Since we're going to just drop the map
11645 	 *	entries from the copy into the destination
11646 	 *	map, they must come from the same pool.
11647 	 */
11648 
11649 	if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
11650 		/*
11651 		 * Mismatches occur when dealing with the default
11652 		 * pager.
11653 		 */
11654 		vm_map_entry_t  next, new;
11655 
11656 		/*
11657 		 * Find the zone that the copies were allocated from
11658 		 */
11659 
11660 		entry = vm_map_copy_first_entry(copy);
11661 
11662 		/*
11663 		 * Reinitialize the copy so that vm_map_copy_entry_link
11664 		 * will work.
11665 		 */
11666 		vm_map_store_copy_reset(copy, entry);
11667 		copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;
11668 
11669 		/*
11670 		 * Copy each entry.
11671 		 */
11672 		while (entry != vm_map_copy_to_entry(copy)) {
11673 			new = vm_map_copy_entry_create(copy);
11674 			vm_map_entry_copy_full(new, entry);
11675 			new->vme_no_copy_on_read = FALSE;
11676 			assert(!new->iokit_acct);
11677 			if (new->is_sub_map) {
11678 				/* clr address space specifics */
11679 				new->use_pmap = FALSE;
11680 			}
11681 			vm_map_copy_entry_link(copy,
11682 			    vm_map_copy_last_entry(copy),
11683 			    new);
11684 			next = entry->vme_next;
11685 			vm_map_entry_dispose(entry);
11686 			entry = next;
11687 		}
11688 	}
11689 
11690 	/*
11691 	 *	Adjust the addresses in the copy chain, and
11692 	 *	reset the region attributes.
11693 	 */
11694 
11695 	for (entry = vm_map_copy_first_entry(copy);
11696 	    entry != vm_map_copy_to_entry(copy);
11697 	    entry = entry->vme_next) {
11698 		if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
11699 			/*
11700 			 * We're injecting this copy entry into a map that
11701 			 * has the standard page alignment, so clear
11702 			 * "map_aligned" (which might have been inherited
11703 			 * from the original map entry).
11704 			 */
11705 			entry->map_aligned = FALSE;
11706 		}
11707 
11708 		entry->vme_start += adjustment;
11709 		entry->vme_end += adjustment;
11710 
11711 		if (entry->map_aligned) {
11712 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
11713 			    VM_MAP_PAGE_MASK(dst_map)));
11714 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
11715 			    VM_MAP_PAGE_MASK(dst_map)));
11716 		}
11717 
11718 		entry->inheritance = VM_INHERIT_DEFAULT;
11719 		entry->protection = VM_PROT_DEFAULT;
11720 		entry->max_protection = VM_PROT_ALL;
11721 		entry->behavior = VM_BEHAVIOR_DEFAULT;
11722 
11723 		/*
11724 		 * If the entry is now wired,
11725 		 * map the pages into the destination map.
11726 		 */
11727 		if (entry->wired_count != 0) {
11728 			vm_map_offset_t va;
11729 			vm_object_offset_t       offset;
11730 			vm_object_t object;
11731 			vm_prot_t prot;
11732 			int     type_of_fault;
11733 			uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;
11734 
11735 			/* TODO4K would need to use actual page size */
11736 			assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);
11737 
11738 			object = VME_OBJECT(entry);
11739 			offset = VME_OFFSET(entry);
11740 			va = entry->vme_start;
11741 
11742 			pmap_pageable(dst_map->pmap,
11743 			    entry->vme_start,
11744 			    entry->vme_end,
11745 			    TRUE);
11746 
11747 			while (va < entry->vme_end) {
11748 				vm_page_t       m;
11749 				struct vm_object_fault_info fault_info = {};
11750 
11751 				/*
11752 				 * Look up the page in the object.
11753 				 * Assert that the page will be found in the
11754 				 * top object:
11755 				 * either
11756 				 *	the object was newly created by
11757 				 *	vm_object_copy_slowly, and has
11758 				 *	copies of all of the pages from
11759 				 *	the source object
11760 				 * or
11761 				 *	the object was moved from the old
11762 				 *	map entry; because the old map
11763 				 *	entry was wired, all of the pages
11764 				 *	were in the top-level object.
11765 				 *	(XXX not true if we wire pages for
11766 				 *	 reading)
11767 				 */
11768 				vm_object_lock(object);
11769 
11770 				m = vm_page_lookup(object, offset);
11771 				if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
11772 				    m->vmp_absent) {
11773 					panic("vm_map_copyout: wiring %p", m);
11774 				}
11775 
11776 				prot = entry->protection;
11777 
11778 				if (override_nx(dst_map, VME_ALIAS(entry)) &&
11779 				    prot) {
11780 					prot |= VM_PROT_EXECUTE;
11781 				}
11782 
11783 				type_of_fault = DBG_CACHE_HIT_FAULT;
11784 
11785 				fault_info.user_tag = VME_ALIAS(entry);
11786 				fault_info.pmap_options = 0;
11787 				if (entry->iokit_acct ||
11788 				    (!entry->is_sub_map && !entry->use_pmap)) {
11789 					fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
11790 				}
11791 				if (entry->vme_xnu_user_debug &&
11792 				    !VM_PAGE_OBJECT(m)->code_signed) {
11793 					/*
11794 					 * Modified code-signed executable
11795 					 * region: this page does not belong
11796 					 * to a code-signed VM object, so it
11797 					 * must have been copied and should
11798 					 * therefore be typed XNU_USER_DEBUG
11799 					 * rather than XNU_USER_EXEC.
11800 					 */
11801 					fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
11802 				}
11803 
11804 				vm_fault_enter(m,
11805 				    dst_map->pmap,
11806 				    va,
11807 				    PAGE_SIZE, 0,
11808 				    prot,
11809 				    prot,
11810 				    VM_PAGE_WIRED(m),
11811 				    FALSE,            /* change_wiring */
11812 				    VM_KERN_MEMORY_NONE,            /* tag - not wiring */
11813 				    &fault_info,
11814 				    NULL,             /* need_retry */
11815 				    &type_of_fault,
11816 				    &object_lock_type); /* Exclusive mode lock; will remain unchanged. */
11817 
11818 				vm_object_unlock(object);
11819 
11820 				offset += PAGE_SIZE_64;
11821 				va += PAGE_SIZE;
11822 			}
11823 		}
11824 	}
11825 
11826 after_adjustments:
11827 
11828 	/*
11829 	 *	Correct the page alignment for the result
11830 	 */
11831 
11832 	*dst_addr = start + (copy->offset - vm_copy_start);
11833 
11834 #if KASAN
11835 	kasan_notify_address(*dst_addr, size);
11836 #endif
11837 
11838 	/*
11839 	 *	Update the hints and the map size
11840 	 */
11841 
11842 	if (consume_on_success) {
11843 		SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
11844 	} else {
11845 		SAVE_HINT_MAP_WRITE(dst_map, last);
11846 	}
11847 
11848 	dst_map->size += size;
11849 
11850 	/*
11851 	 *	Link in the copy
11852 	 */
11853 
11854 	if (consume_on_success) {
11855 		vm_map_copy_insert(dst_map, last, copy);
11856 		if (copy != original_copy) {
11857 			vm_map_copy_discard(original_copy);
11858 			original_copy = VM_MAP_COPY_NULL;
11859 		}
11860 	} else {
11861 		vm_map_copy_remap(dst_map, last, copy, adjustment,
11862 		    cur_protection, max_protection,
11863 		    inheritance);
11864 		if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
11865 			vm_map_copy_discard(copy);
11866 			copy = original_copy;
11867 		}
11868 	}
11869 
11870 
11871 	vm_map_unlock(dst_map);
11872 
11873 	/*
11874 	 * XXX	If wiring_required, call vm_map_pageable
11875 	 */
11876 
11877 	return KERN_SUCCESS;
11878 }
11879 
11880 /*
11881  *	Routine:	vm_map_copyin
11882  *
11883  *	Description:
11884  *		see vm_map_copyin_common.  Exported via Unsupported.exports.
11885  *
11886  */
11887 
11888 #undef vm_map_copyin
11889 
11890 kern_return_t
11891 vm_map_copyin(
11892 	vm_map_t                        src_map,
11893 	vm_map_address_t        src_addr,
11894 	vm_map_size_t           len,
11895 	boolean_t                       src_destroy,
11896 	vm_map_copy_t           *copy_result)   /* OUT */
11897 {
11898 	return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11899 	           FALSE, copy_result, FALSE);
11900 }
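
/*
 * Example round trip (illustrative sketch, assuming a caller that
 * holds references on both maps): copy a range out of src_map and
 * paste it into dst_map at a kernel-chosen address.
 *
 *	vm_map_copy_t copy;
 *	vm_map_address_t dst = 0;
 *	kern_return_t kr;
 *
 *	kr = vm_map_copyin(src_map, src_addr, len, FALSE, &copy);
 *	if (kr == KERN_SUCCESS) {
 *		kr = vm_map_copyout(dst_map, &dst, copy);
 *		if (kr != KERN_SUCCESS) {
 *			vm_map_copy_discard(copy);	// still ours on failure
 *		}
 *	}
 */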
11901 
11902 /*
11903  *	Routine:	vm_map_copyin_common
11904  *
11905  *	Description:
11906  *		Copy the specified region (src_addr, len) from the
11907  *		source address space (src_map), possibly removing
11908  *		the region from the source address space (src_destroy).
11909  *
11910  *	Returns:
11911  *		A vm_map_copy_t object (copy_result), suitable for
11912  *		insertion into another address space (using vm_map_copyout),
11913  *		copying over another address space region (using
11914  *		vm_map_copy_overwrite).  If the copy is unused, it
11915  *		should be destroyed (using vm_map_copy_discard).
11916  *
11917  *	In/out conditions:
11918  *		The source map should not be locked on entry.
11919  */
11920 
11921 typedef struct submap_map {
11922 	vm_map_t        parent_map;
11923 	vm_map_offset_t base_start;
11924 	vm_map_offset_t base_end;
11925 	vm_map_size_t   base_len;
11926 	struct submap_map *next;
11927 } submap_map_t;
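
/*
 * Each submap_map_t frame records the span of the parent ("base") map
 * that a submap traversal came from, so vm_map_copyin_internal() can
 * pop back out to the parent map once the copy reaches base_end.
 */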
11928 
11929 kern_return_t
11930 vm_map_copyin_common(
11931 	vm_map_t        src_map,
11932 	vm_map_address_t src_addr,
11933 	vm_map_size_t   len,
11934 	boolean_t       src_destroy,
11935 	__unused boolean_t      src_volatile,
11936 	vm_map_copy_t   *copy_result,   /* OUT */
11937 	boolean_t       use_maxprot)
11938 {
11939 	int flags;
11940 
11941 	flags = 0;
11942 	if (src_destroy) {
11943 		flags |= VM_MAP_COPYIN_SRC_DESTROY;
11944 	}
11945 	if (use_maxprot) {
11946 		flags |= VM_MAP_COPYIN_USE_MAXPROT;
11947 	}
11948 	return vm_map_copyin_internal(src_map,
11949 	           src_addr,
11950 	           len,
11951 	           flags,
11952 	           copy_result);
11953 }
11954 kern_return_t
11955 vm_map_copyin_internal(
11956 	vm_map_t        src_map,
11957 	vm_map_address_t src_addr,
11958 	vm_map_size_t   len,
11959 	int             flags,
11960 	vm_map_copy_t   *copy_result)   /* OUT */
11961 {
11962 	vm_map_entry_t  tmp_entry;      /* Result of last map lookup --
11963 	                                 * in multi-level lookup, this
11964 	                                 * entry contains the actual
11965 	                                 * vm_object/offset.
11966 	                                 */
11967 	vm_map_entry_t  new_entry = VM_MAP_ENTRY_NULL;  /* Map entry for copy */
11968 
11969 	vm_map_offset_t src_start;      /* Start of current entry --
11970 	                                 * where copy is taking place now
11971 	                                 */
11972 	vm_map_offset_t src_end;        /* End of entire region to be
11973 	                                 * copied */
11974 	vm_map_offset_t src_base;
11975 	vm_map_t        base_map = src_map;
11976 	boolean_t       map_share = FALSE;
11977 	submap_map_t    *parent_maps = NULL;
11978 
11979 	vm_map_copy_t   copy;           /* Resulting copy */
11980 	vm_map_address_t copy_addr;
11981 	vm_map_size_t   copy_size;
11982 	boolean_t       src_destroy;
11983 	boolean_t       use_maxprot;
11984 	boolean_t       preserve_purgeable;
11985 	boolean_t       entry_was_shared;
11986 	vm_map_entry_t  saved_src_entry;
11987 
11988 
11989 	if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
11990 		return KERN_INVALID_ARGUMENT;
11991 	}
11992 
11993 #if CONFIG_KERNEL_TAGGING
11994 	if (src_map->pmap == kernel_pmap) {
11995 		src_addr = vm_memtag_canonicalize_address(src_addr);
11996 	}
11997 #endif /* CONFIG_KERNEL_TAGGING */
11998 
11999 	src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
12000 	use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
12001 	preserve_purgeable =
12002 	    (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
12003 
12004 	/*
12005 	 *	Check for copies of zero bytes.
12006 	 */
12007 
12008 	if (len == 0) {
12009 		*copy_result = VM_MAP_COPY_NULL;
12010 		return KERN_SUCCESS;
12011 	}
12012 
12013 	/*
12014 	 *	Check that the end address doesn't overflow
12015 	 */
12016 	if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
12017 		return KERN_INVALID_ADDRESS;
12018 	}
12019 	src_end = src_addr + len;
12020 	if (src_end < src_addr) {
12021 		return KERN_INVALID_ADDRESS;
12022 	}
12023 
12024 	/*
12025 	 *	Compute (page aligned) start and end of region
12026 	 */
12027 	src_start = vm_map_trunc_page(src_addr,
12028 	    VM_MAP_PAGE_MASK(src_map));
12029 	src_end = vm_map_round_page(src_end,
12030 	    VM_MAP_PAGE_MASK(src_map));
12031 	if (src_end < src_addr) {
12032 		return KERN_INVALID_ADDRESS;
12033 	}
12034 
12035 	/*
12036 	 * If the copy is sufficiently small, use a kernel buffer instead
12037 	 * of making a virtual copy.  The theory being that the cost of
12038 	 * setting up VM (and taking C-O-W faults) dominates the copy costs
12039 	 * for small regions.
12040 	 */
12041 	if ((len <= msg_ool_size_small) &&
12042 	    !use_maxprot &&
12043 	    !preserve_purgeable &&
12044 	    !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
12045 	    /*
12046 	     * Since the "msg_ool_size_small" threshold was increased and
12047 	     * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
12048 	     * address space limits, we revert to doing a virtual copy if the
12049 	     * copied range goes beyond those limits.  Otherwise, mach_vm_read()
12050 	     * of the commpage would now fail when it used to work.
12051 	     */
12052 	    (src_start >= vm_map_min(src_map) &&
12053 	    src_start < vm_map_max(src_map) &&
12054 	    src_end >= vm_map_min(src_map) &&
12055 	    src_end < vm_map_max(src_map))) {
12056 		return vm_map_copyin_kernel_buffer(src_map, src_addr, len,
12057 		           src_destroy, copy_result);
12058 	}
12059 
12060 	/*
12061 	 *	Allocate a header element for the list.
12062 	 *
12063 	 *	Use the start and end in the header to
12064 	 *	remember the endpoints prior to rounding.
12065 	 */
12066 
12067 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12068 	copy->cpy_hdr.entries_pageable = TRUE;
12069 	copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
12070 	copy->offset = src_addr;
12071 	copy->size = len;
12072 
12073 	new_entry = vm_map_copy_entry_create(copy);
12074 
12075 #define RETURN(x)                                               \
12076 	MACRO_BEGIN                                             \
12077 	vm_map_unlock(src_map);                                 \
12078 	if(src_map != base_map)                                 \
12079 	        vm_map_deallocate(src_map);                     \
12080 	if (new_entry != VM_MAP_ENTRY_NULL)                     \
12081 	        vm_map_copy_entry_dispose(new_entry);           \
12082 	vm_map_copy_discard(copy);                              \
12083 	{                                                       \
12084 	        submap_map_t	*_ptr;                          \
12085                                                                 \
12086 	        for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
12087 	                parent_maps=parent_maps->next;          \
12088 	                if (_ptr->parent_map != base_map)       \
12089 	                        vm_map_deallocate(_ptr->parent_map);    \
12090 	                kfree_type(submap_map_t, _ptr);         \
12091 	        }                                               \
12092 	}                                                       \
12093 	MACRO_RETURN(x);                                        \
12094 	MACRO_END
12095 
12096 	/*
12097 	 *	Find the beginning of the region.
12098 	 */
12099 
12100 	vm_map_lock(src_map);
12101 
12102 	/*
12103 	 * Lookup the original "src_addr" rather than the truncated
12104 	 * "src_start", in case "src_start" falls in a non-map-aligned
12105 	 * map entry *before* the map entry that contains "src_addr"...
12106 	 */
12107 	if (!vm_map_lookup_entry(src_map, src_addr, &tmp_entry)) {
12108 		RETURN(KERN_INVALID_ADDRESS);
12109 	}
12110 	if (!tmp_entry->is_sub_map) {
12111 		/*
12112 		 * ... but clip to the map-rounded "src_start" rather than
12113 		 * "src_addr" to preserve map-alignment.  We'll adjust the
12114 		 * first copy entry at the end, if needed.
12115 		 */
12116 		vm_map_clip_start(src_map, tmp_entry, src_start);
12117 	}
12118 	if (src_start < tmp_entry->vme_start) {
12119 		/*
12120 		 * Move "src_start" up to the start of the
12121 		 * first map entry to copy.
12122 		 */
12123 		src_start = tmp_entry->vme_start;
12124 	}
12125 	/* set for later submap fix-up */
12126 	copy_addr = src_start;
12127 
12128 	/*
12129 	 *	Go through entries until we get to the end.
12130 	 */
12131 
12132 	while (TRUE) {
12133 		vm_map_entry_t  src_entry = tmp_entry;  /* Top-level entry */
12134 		vm_map_size_t   src_size;               /* Size of source
12135 		                                         * map entry (in both
12136 		                                         * maps)
12137 		                                         */
12138 
12139 		vm_object_t             src_object;     /* Object to copy */
12140 		vm_object_offset_t      src_offset;
12141 
12142 		vm_object_t             new_copy_object;/* vm_object_copy_* result */
12143 
12144 		boolean_t       src_needs_copy;         /* Should source map
12145 		                                         * be made read-only
12146 		                                         * for copy-on-write?
12147 		                                         */
12148 
12149 		boolean_t       new_entry_needs_copy;   /* Will new entry be COW? */
12150 
12151 		boolean_t       was_wired;              /* Was source wired? */
12152 		boolean_t       saved_used_for_jit;     /* Saved used_for_jit. */
12153 		vm_map_version_t version;               /* Version before locks
12154 		                                         * dropped to make copy
12155 		                                         */
12156 		kern_return_t   result;                 /* Return value from
12157 		                                         * copy_strategically.
12158 		                                         */
12159 		while (tmp_entry->is_sub_map) {
12160 			vm_map_size_t submap_len;
12161 			submap_map_t *ptr;
12162 
12163 			ptr = kalloc_type(submap_map_t, Z_WAITOK);
12164 			ptr->next = parent_maps;
12165 			parent_maps = ptr;
12166 			ptr->parent_map = src_map;
12167 			ptr->base_start = src_start;
12168 			ptr->base_end = src_end;
12169 			submap_len = tmp_entry->vme_end - src_start;
12170 			if (submap_len > (src_end - src_start)) {
12171 				submap_len = src_end - src_start;
12172 			}
12173 			ptr->base_len = submap_len;
12174 
12175 			src_start -= tmp_entry->vme_start;
12176 			src_start += VME_OFFSET(tmp_entry);
12177 			src_end = src_start + submap_len;
12178 			src_map = VME_SUBMAP(tmp_entry);
12179 			vm_map_lock(src_map);
12180 			/* keep an outstanding reference for all maps in */
12181 			/* the parent tree except the base map */
12182 			vm_map_reference(src_map);
12183 			vm_map_unlock(ptr->parent_map);
12184 			if (!vm_map_lookup_entry(
12185 				    src_map, src_start, &tmp_entry)) {
12186 				RETURN(KERN_INVALID_ADDRESS);
12187 			}
12188 			map_share = TRUE;
12189 			if (!tmp_entry->is_sub_map) {
12190 				vm_map_clip_start(src_map, tmp_entry, src_start);
12191 			}
12192 			src_entry = tmp_entry;
12193 		}
12194 		/* we are now in the lowest level submap... */
12195 
12196 		if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
12197 		    (VME_OBJECT(tmp_entry)->phys_contiguous)) {
12198 		/* This is not supported for now. In the future */
12199 		/* we will need to detect the phys_contig       */
12200 		/* condition and then upgrade copy_slowly       */
12201 		/* to do a physical copy from the device-memory */
12202 		/* backed object. We can piggy-back off of      */
12203 		/* the was_wired boolean to set up the          */
12204 		/* proper handling */
12205 			RETURN(KERN_PROTECTION_FAILURE);
12206 		}
12207 		/*
12208 		 *	Create a new address map entry to hold the result.
12209 		 *	Fill in the fields from the appropriate source entries.
12210 		 *	We must unlock the source map to do this if we need
12211 		 *	to allocate a map entry.
12212 		 */
12213 		if (new_entry == VM_MAP_ENTRY_NULL) {
12214 			version.main_timestamp = src_map->timestamp;
12215 			vm_map_unlock(src_map);
12216 
12217 			new_entry = vm_map_copy_entry_create(copy);
12218 
12219 			vm_map_lock(src_map);
12220 			if ((version.main_timestamp + 1) != src_map->timestamp) {
12221 				if (!vm_map_lookup_entry(src_map, src_start,
12222 				    &tmp_entry)) {
12223 					RETURN(KERN_INVALID_ADDRESS);
12224 				}
12225 				if (!tmp_entry->is_sub_map) {
12226 					vm_map_clip_start(src_map, tmp_entry, src_start);
12227 				}
12228 				continue; /* restart w/ new tmp_entry */
12229 			}
12230 		}
12231 
12232 		/*
12233 		 *	Verify that the region can be read.
12234 		 */
12235 		if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
12236 		    !use_maxprot) ||
12237 		    (src_entry->max_protection & VM_PROT_READ) == 0) {
12238 			RETURN(KERN_PROTECTION_FAILURE);
12239 		}
12240 
12241 		/*
12242 		 *	Clip against the endpoints of the entire region.
12243 		 */
12244 
12245 		vm_map_clip_end(src_map, src_entry, src_end);
12246 
12247 		src_size = src_entry->vme_end - src_start;
12248 		src_object = VME_OBJECT(src_entry);
12249 		src_offset = VME_OFFSET(src_entry);
12250 		was_wired = (src_entry->wired_count != 0);
12251 
12252 		vm_map_entry_copy(src_map, new_entry, src_entry);
12253 		if (new_entry->is_sub_map) {
12254 			/* clr address space specifics */
12255 			new_entry->use_pmap = FALSE;
12256 		} else {
12257 			/*
12258 			 * We're dealing with a copy-on-write operation,
12259 			 * so the resulting mapping should not inherit the
12260 			 * original mapping's accounting settings.
12261 			 * "iokit_acct" should have been cleared in
12262 			 * vm_map_entry_copy().
12263 			 * "use_pmap" should be reset to its default (TRUE)
12264 			 * so that the new mapping gets accounted for in
12265 			 * the task's memory footprint.
12266 			 */
12267 			assert(!new_entry->iokit_acct);
12268 			new_entry->use_pmap = TRUE;
12269 		}
12270 
12271 		/*
12272 		 *	Attempt non-blocking copy-on-write optimizations.
12273 		 */
12274 
12275 		/*
12276 		 * If we are destroying the source, and the object
12277 		 * is internal, we could move the object reference
12278 		 * from the source to the copy.  The copy is
12279 		 * copy-on-write only if the source is.
12280 		 * We make another reference to the object, because
12281 		 * destroying the source entry will deallocate it.
12282 		 *
12283 		 * This memory transfer has to be atomic (to prevent
12284 		 * the VM object from being shared or copied while
12285 		 * it's being moved here), so we can only do this
12286 		 * if we won't have to unlock the VM map until the
12287 		 * original mapping has been fully removed.
12288 		 */
12289 
12290 RestartCopy:
12291 		if ((src_object == VM_OBJECT_NULL ||
12292 		    (!was_wired && !map_share && !tmp_entry->is_shared
12293 		    && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
12294 		    vm_object_copy_quickly(
12295 			    VME_OBJECT(new_entry),
12296 			    src_offset,
12297 			    src_size,
12298 			    &src_needs_copy,
12299 			    &new_entry_needs_copy)) {
12300 			new_entry->needs_copy = new_entry_needs_copy;
12301 
12302 			/*
12303 			 *	Handle copy-on-write obligations
12304 			 */
12305 
12306 			if (src_needs_copy && !tmp_entry->needs_copy) {
12307 				vm_prot_t prot;
12308 
12309 				prot = src_entry->protection & ~VM_PROT_WRITE;
12310 
12311 				if (override_nx(src_map, VME_ALIAS(src_entry))
12312 				    && prot) {
12313 					prot |= VM_PROT_EXECUTE;
12314 				}
12315 
12316 				vm_object_pmap_protect(
12317 					src_object,
12318 					src_offset,
12319 					src_size,
12320 					(src_entry->is_shared ?
12321 					PMAP_NULL
12322 					: src_map->pmap),
12323 					VM_MAP_PAGE_SIZE(src_map),
12324 					src_entry->vme_start,
12325 					prot);
12326 
12327 				assert(tmp_entry->wired_count == 0);
12328 				tmp_entry->needs_copy = TRUE;
12329 			}
12330 
12331 			/*
12332 			 *	The map has never been unlocked, so it's safe
12333 			 *	to move to the next entry rather than doing
12334 			 *	another lookup.
12335 			 */
12336 
12337 			goto CopySuccessful;
12338 		}
12339 
12340 		entry_was_shared = tmp_entry->is_shared;
12341 
12342 		/*
12343 		 *	Take an object reference, so that we may
12344 		 *	release the map lock(s).
12345 		 */
12346 
12347 		assert(src_object != VM_OBJECT_NULL);
12348 		vm_object_reference(src_object);
12349 
12350 		/*
12351 		 *	Record the timestamp for later verification.
12352 		 *	Unlock the map.
12353 		 */
12354 
12355 		version.main_timestamp = src_map->timestamp;
12356 		vm_map_unlock(src_map); /* Increments timestamp once! */
12357 		saved_src_entry = src_entry;
12358 		tmp_entry = VM_MAP_ENTRY_NULL;
12359 		src_entry = VM_MAP_ENTRY_NULL;
12360 
12361 		/*
12362 		 *	Perform the copy
12363 		 */
12364 
12365 		if (was_wired ||
12366 		    (src_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK &&
12367 		    !(flags & VM_MAP_COPYIN_FORK)) ||
12368 		    (debug4k_no_cow_copyin &&
12369 		    VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
12370 CopySlowly:
12371 			vm_object_lock(src_object);
12372 			result = vm_object_copy_slowly(
12373 				src_object,
12374 				src_offset,
12375 				src_size,
12376 				THREAD_UNINT,
12377 				&new_copy_object);
12378 			/* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
12379 			saved_used_for_jit = new_entry->used_for_jit;
12380 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12381 			new_entry->used_for_jit = saved_used_for_jit;
12382 			VME_OFFSET_SET(new_entry,
12383 			    src_offset - vm_object_trunc_page(src_offset));
12384 			new_entry->needs_copy = FALSE;
12385 		} else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
12386 		    (entry_was_shared || map_share)) {
12387 			vm_object_t new_object;
12388 
12389 			vm_object_lock_shared(src_object);
12390 			new_object = vm_object_copy_delayed(
12391 				src_object,
12392 				src_offset,
12393 				src_size,
12394 				TRUE);
12395 			if (new_object == VM_OBJECT_NULL) {
12396 				goto CopySlowly;
12397 			}
12398 
12399 			VME_OBJECT_SET(new_entry, new_object, false, 0);
12400 			assert(new_entry->wired_count == 0);
12401 			new_entry->needs_copy = TRUE;
12402 			assert(!new_entry->iokit_acct);
12403 			assert(new_object->purgable == VM_PURGABLE_DENY);
12404 			assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
12405 			result = KERN_SUCCESS;
12406 		} else {
12407 			vm_object_offset_t new_offset;
12408 			new_offset = VME_OFFSET(new_entry);
12409 			result = vm_object_copy_strategically(src_object,
12410 			    src_offset,
12411 			    src_size,
12412 			    (flags & VM_MAP_COPYIN_FORK),
12413 			    &new_copy_object,
12414 			    &new_offset,
12415 			    &new_entry_needs_copy);
12416 			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
12417 			saved_used_for_jit = new_entry->used_for_jit;
12418 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12419 			new_entry->used_for_jit = saved_used_for_jit;
12420 			if (new_offset != VME_OFFSET(new_entry)) {
12421 				VME_OFFSET_SET(new_entry, new_offset);
12422 			}
12423 
12424 			new_entry->needs_copy = new_entry_needs_copy;
12425 		}
12426 
12427 		if (result == KERN_SUCCESS &&
12428 		    ((preserve_purgeable &&
12429 		    src_object->purgable != VM_PURGABLE_DENY) ||
12430 		    new_entry->used_for_jit)) {
12431 			/*
12432 			 * Purgeable objects should be COPY_NONE, true share;
12433 		 * this should be propagated to the copy.
12434 			 *
12435 			 * Also force mappings the pmap specially protects to
12436 			 * be COPY_NONE; trying to COW these mappings would
12437 			 * change the effective protections, which could have
12438 			 * side effects if the pmap layer relies on the
12439 			 * specified protections.
12440 			 */
12441 
12442 			vm_object_t     new_object;
12443 
12444 			new_object = VME_OBJECT(new_entry);
12445 			assert(new_object != src_object);
12446 			vm_object_lock(new_object);
12447 			assert(new_object->ref_count == 1);
12448 			assert(new_object->shadow == VM_OBJECT_NULL);
12449 			assert(new_object->vo_copy == VM_OBJECT_NULL);
12450 			assert(new_object->vo_owner == NULL);
12451 
12452 			new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
12453 
12454 			if (preserve_purgeable &&
12455 			    src_object->purgable != VM_PURGABLE_DENY) {
12456 				VM_OBJECT_SET_TRUE_SHARE(new_object, TRUE);
12457 
12458 				/* start as non-volatile with no owner... */
12459 				VM_OBJECT_SET_PURGABLE(new_object, VM_PURGABLE_NONVOLATILE);
12460 				vm_purgeable_nonvolatile_enqueue(new_object, NULL);
12461 				/* ... and move to src_object's purgeable state */
12462 				if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
12463 					int state;
12464 					state = src_object->purgable;
12465 					vm_object_purgable_control(
12466 						new_object,
12467 						VM_PURGABLE_SET_STATE_FROM_KERNEL,
12468 						&state);
12469 				}
12470 				/* no pmap accounting for purgeable objects */
12471 				new_entry->use_pmap = FALSE;
12472 			}
12473 
12474 			vm_object_unlock(new_object);
12475 			new_object = VM_OBJECT_NULL;
12476 		}
12477 
12478 		if (result != KERN_SUCCESS &&
12479 		    result != KERN_MEMORY_RESTART_COPY) {
12480 			vm_map_lock(src_map);
12481 			RETURN(result);
12482 		}
12483 
12484 		/*
12485 		 *	Throw away the extra reference
12486 		 */
12487 
12488 		vm_object_deallocate(src_object);
12489 
12490 		/*
12491 		 *	Verify that the map has not substantially
12492 		 *	changed while the copy was being made.
12493 		 */
12494 
12495 		vm_map_lock(src_map);
12496 
12497 		if ((version.main_timestamp + 1) == src_map->timestamp) {
12498 			/* src_map hasn't changed: src_entry is still valid */
12499 			src_entry = saved_src_entry;
12500 			goto VerificationSuccessful;
12501 		}
12502 
12503 		/*
12504 		 *	Simple version comparison failed.
12505 		 *
12506 		 *	Retry the lookup and verify that the
12507 		 *	same object/offset are still present.
12508 		 *
12509 		 *	[Note: a memory manager that colludes with
12510 		 *	the calling task can detect that we have
12511 		 *	cheated.  While the map was unlocked, the
12512 		 *	mapping could have been changed and restored.]
12513 		 */
12514 
12515 		if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
12516 			if (result != KERN_MEMORY_RESTART_COPY) {
12517 				vm_object_deallocate(VME_OBJECT(new_entry));
12518 				VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
12519 				/* reset accounting state */
12520 				new_entry->iokit_acct = FALSE;
12521 				new_entry->use_pmap = TRUE;
12522 			}
12523 			RETURN(KERN_INVALID_ADDRESS);
12524 		}
12525 
12526 		src_entry = tmp_entry;
12527 		vm_map_clip_start(src_map, src_entry, src_start);
12528 
12529 		if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
12530 		    !use_maxprot) ||
12531 		    ((src_entry->max_protection & VM_PROT_READ) == 0)) {
12532 			goto VerificationFailed;
12533 		}
12534 
12535 		if (src_entry->vme_end < new_entry->vme_end) {
12536 			/*
12537 			 * This entry might have been shortened
12538 			 * (vm_map_clip_end) or been replaced with
12539 			 * an entry that ends closer to "src_start"
12540 			 * than before.
12541 			 * Adjust "new_entry" accordingly; copying
12542 			 * less memory would be correct but we also
12543 			 * redo the copy (see below) if the new entry
12544 			 * no longer points at the same object/offset.
12545 			 */
12546 			assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
12547 			    VM_MAP_COPY_PAGE_MASK(copy)));
12548 			new_entry->vme_end = src_entry->vme_end;
12549 			src_size = new_entry->vme_end - src_start;
12550 		} else if (src_entry->vme_end > new_entry->vme_end) {
12551 			/*
12552 			 * This entry might have been extended
12553 			 * (vm_map_entry_simplify() or coalesce)
12554 			 * or been replaced with an entry that ends farther
12555 			 * from "src_start" than before.
12556 			 *
12557 			 * We've called vm_object_copy_*() only on
12558 			 * the previous <start:end> range, so we can't
12559 			 * just extend new_entry.  We have to re-do
12560 			 * the copy based on the new entry as if it was
12561 			 * pointing at a different object/offset (see
12562 			 * "Verification failed" below).
12563 			 */
12564 		}
12565 
12566 		if ((VME_OBJECT(src_entry) != src_object) ||
12567 		    (VME_OFFSET(src_entry) != src_offset) ||
12568 		    (src_entry->vme_end > new_entry->vme_end)) {
12569 			/*
12570 			 *	Verification failed.
12571 			 *
12572 			 *	Start over with this top-level entry.
12573 			 */
12574 
12575 VerificationFailed:     ;
12576 
12577 			vm_object_deallocate(VME_OBJECT(new_entry));
12578 			tmp_entry = src_entry;
12579 			continue;
12580 		}
12581 
12582 		/*
12583 		 *	Verification succeeded.
12584 		 */
12585 
12586 VerificationSuccessful:;
12587 
12588 		if (result == KERN_MEMORY_RESTART_COPY) {
12589 			goto RestartCopy;
12590 		}
12591 
12592 		/*
12593 		 *	Copy succeeded.
12594 		 */
12595 
12596 CopySuccessful: ;
12597 
12598 		/*
12599 		 *	Link in the new copy entry.
12600 		 */
12601 
12602 		vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
12603 		    new_entry);
12604 
12605 		/*
12606 		 *	Determine whether the entire region
12607 		 *	has been copied.
12608 		 */
12609 		src_base = src_start;
12610 		src_start = new_entry->vme_end;
12611 		new_entry = VM_MAP_ENTRY_NULL;
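		/*
		 * If we've consumed this submap level (src_start has reached
		 * src_end), pop back up to the parent map(s) saved on the
		 * "parent_maps" stack and resume the copy there.
		 */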
12612 		while ((src_start >= src_end) && (src_end != 0)) {
12613 			submap_map_t    *ptr;
12614 
12615 			if (src_map == base_map) {
12616 				/* back to the top */
12617 				break;
12618 			}
12619 
12620 			ptr = parent_maps;
12621 			assert(ptr != NULL);
12622 			parent_maps = parent_maps->next;
12623 
12624 			/* fix up the damage we did in that submap */
12625 			vm_map_simplify_range(src_map,
12626 			    src_base,
12627 			    src_end);
12628 
12629 			vm_map_unlock(src_map);
12630 			vm_map_deallocate(src_map);
12631 			vm_map_lock(ptr->parent_map);
12632 			src_map = ptr->parent_map;
12633 			src_base = ptr->base_start;
12634 			src_start = ptr->base_start + ptr->base_len;
12635 			src_end = ptr->base_end;
12636 			if (!vm_map_lookup_entry(src_map,
12637 			    src_start,
12638 			    &tmp_entry) &&
12639 			    (src_end > src_start)) {
12640 				RETURN(KERN_INVALID_ADDRESS);
12641 			}
12642 			kfree_type(submap_map_t, ptr);
12643 			if (parent_maps == NULL) {
12644 				map_share = FALSE;
12645 			}
12646 			src_entry = tmp_entry->vme_prev;
12647 		}
12648 
12649 		if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
12650 		    (src_start >= src_addr + len) &&
12651 		    (src_addr + len != 0)) {
12652 			/*
12653 			 * Stop copying now, even though we haven't reached
12654 			 * "src_end".  We'll adjust the end of the last copy
12655 			 * entry at the end, if needed.
12656 			 *
12657 			 * If src_map's alignment is different from the
12658 			 * system's page-alignment, there could be
12659 			 * extra non-map-aligned map entries between
12660 			 * the original (non-rounded) "src_addr + len"
12661 			 * and the rounded "src_end".
12662 			 * We do not want to copy those map entries since
12663 			 * they're not part of the copied range.
12664 			 */
12665 			break;
12666 		}
12667 
12668 		if ((src_start >= src_end) && (src_end != 0)) {
12669 			break;
12670 		}
12671 
12672 		/*
12673 		 *	Verify that there are no gaps in the region
12674 		 */
12675 
12676 		tmp_entry = src_entry->vme_next;
12677 		if ((tmp_entry->vme_start != src_start) ||
12678 		    (tmp_entry == vm_map_to_entry(src_map))) {
12679 			RETURN(KERN_INVALID_ADDRESS);
12680 		}
12681 	}
12682 
12683 	/*
12684 	 * If the source should be destroyed, do it now, since the
12685 	 * copy was successful.
12686 	 */
12687 	if (src_destroy) {
12688 		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
12689 
12690 		if (src_map == kernel_map) {
12691 			remove_flags |= VM_MAP_REMOVE_KUNWIRE;
12692 		}
12693 		(void)vm_map_remove_and_unlock(src_map,
12694 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
12695 		    src_end,
12696 		    remove_flags,
12697 		    KMEM_GUARD_NONE);
12698 	} else {
12699 		/* fix up the damage we did in the base map */
12700 		vm_map_simplify_range(
12701 			src_map,
12702 			vm_map_trunc_page(src_addr,
12703 			VM_MAP_PAGE_MASK(src_map)),
12704 			vm_map_round_page(src_end,
12705 			VM_MAP_PAGE_MASK(src_map)));
12706 		vm_map_unlock(src_map);
12707 	}
12708 
12709 	tmp_entry = VM_MAP_ENTRY_NULL;
12710 
12711 	if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
12712 	    VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12713 		vm_map_offset_t original_start, original_offset, original_end;
12714 
12715 		assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12716 
12717 		/* adjust alignment of first copy_entry's "vme_start" */
12718 		tmp_entry = vm_map_copy_first_entry(copy);
12719 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12720 			vm_map_offset_t adjustment;
12721 
12722 			original_start = tmp_entry->vme_start;
12723 			original_offset = VME_OFFSET(tmp_entry);
12724 
12725 			/* map-align the start of the first copy entry... */
12726 			adjustment = (tmp_entry->vme_start -
12727 			    vm_map_trunc_page(
12728 				    tmp_entry->vme_start,
12729 				    VM_MAP_PAGE_MASK(src_map)));
12730 			tmp_entry->vme_start -= adjustment;
12731 			VME_OFFSET_SET(tmp_entry,
12732 			    VME_OFFSET(tmp_entry) - adjustment);
12733 			copy_addr -= adjustment;
12734 			assert(tmp_entry->vme_start < tmp_entry->vme_end);
12735 			/* ... adjust for mis-aligned start of copy range */
12736 			adjustment =
12737 			    (vm_map_trunc_page(copy->offset,
12738 			    PAGE_MASK) -
12739 			    vm_map_trunc_page(copy->offset,
12740 			    VM_MAP_PAGE_MASK(src_map)));
12741 			if (adjustment) {
12742 				assert(page_aligned(adjustment));
12743 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12744 				tmp_entry->vme_start += adjustment;
12745 				VME_OFFSET_SET(tmp_entry,
12746 				    (VME_OFFSET(tmp_entry) +
12747 				    adjustment));
12748 				copy_addr += adjustment;
12749 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12750 			}
12751 
12752 			/*
12753 			 * Assert that the adjustments haven't exposed
12754 			 * more than was originally copied...
12755 			 */
12756 			assert(tmp_entry->vme_start >= original_start);
12757 			assert(VME_OFFSET(tmp_entry) >= original_offset);
12758 			/*
12759 			 * ... and that it did not adjust outside of a
12760 			 * single 16K page.
12761 			 */
12762 			assert(vm_map_trunc_page(tmp_entry->vme_start,
12763 			    VM_MAP_PAGE_MASK(src_map)) ==
12764 			    vm_map_trunc_page(original_start,
12765 			    VM_MAP_PAGE_MASK(src_map)));
12766 		}
12767 
12768 		/* adjust alignment of last copy_entry's "vme_end" */
12769 		tmp_entry = vm_map_copy_last_entry(copy);
12770 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12771 			vm_map_offset_t adjustment;
12772 
12773 			original_end = tmp_entry->vme_end;
12774 
12775 			/* map-align the end of the last copy entry... */
12776 			tmp_entry->vme_end =
12777 			    vm_map_round_page(tmp_entry->vme_end,
12778 			    VM_MAP_PAGE_MASK(src_map));
12779 			/* ... adjust for mis-aligned end of copy range */
12780 			adjustment =
12781 			    (vm_map_round_page((copy->offset +
12782 			    copy->size),
12783 			    VM_MAP_PAGE_MASK(src_map)) -
12784 			    vm_map_round_page((copy->offset +
12785 			    copy->size),
12786 			    PAGE_MASK));
12787 			if (adjustment) {
12788 				assert(page_aligned(adjustment));
12789 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12790 				tmp_entry->vme_end -= adjustment;
12791 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12792 			}
12793 
12794 			/*
12795 			 * Assert that the adjustments haven't exposed
12796 			 * more than was originally copied...
12797 			 */
12798 			assert(tmp_entry->vme_end <= original_end);
12799 			/*
12800 			 * ... and that it did not adjust outside of a
12801 			 * single 16K page.
12802 			 */
12803 			assert(vm_map_round_page(tmp_entry->vme_end,
12804 			    VM_MAP_PAGE_MASK(src_map)) ==
12805 			    vm_map_round_page(original_end,
12806 			    VM_MAP_PAGE_MASK(src_map)));
12807 		}
12808 	}
12809 
12810 	/* Fix-up start and end points in copy.  This is necessary */
12811 	/* when the various entries in the copy object were picked */
12812 	/* up from different sub-maps */
12813 
12814 	tmp_entry = vm_map_copy_first_entry(copy);
12815 	copy_size = 0; /* compute actual size */
12816 	while (tmp_entry != vm_map_copy_to_entry(copy)) {
12817 		assert(VM_MAP_PAGE_ALIGNED(
12818 			    copy_addr + (tmp_entry->vme_end -
12819 			    tmp_entry->vme_start),
12820 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12821 		assert(VM_MAP_PAGE_ALIGNED(
12822 			    copy_addr,
12823 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12824 
12825 		/*
12826 		 * The copy_entries will be injected directly into the
12827 		 * destination map and might not be "map aligned" there...
12828 		 */
12829 		tmp_entry->map_aligned = FALSE;
12830 
12831 		tmp_entry->vme_end = copy_addr +
12832 		    (tmp_entry->vme_end - tmp_entry->vme_start);
12833 		tmp_entry->vme_start = copy_addr;
12834 		assert(tmp_entry->vme_start < tmp_entry->vme_end);
12835 		copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12836 		copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12837 		tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12838 	}
12839 
12840 	if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12841 	    copy_size < copy->size) {
12842 		/*
12843 		 * The actual size of the VM map copy is smaller than what
12844 		 * was requested by the caller.  This must be because some
12845 		 * PAGE_SIZE-sized pages are missing at the end of the last
12846 		 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12847 		 * The caller might not have been aware of those missing
12848 		 * pages and might not want to be aware of them, which is
12849 		 * fine as long as it doesn't try to access (and crash on)
12850 		 * those missing pages.
12851 		 * Let's adjust the size of the "copy", to avoid failing
12852 		 * in vm_map_copyout() or vm_map_copy_overwrite().
12853 		 */
12854 		assert(vm_map_round_page(copy_size,
12855 		    VM_MAP_PAGE_MASK(src_map)) ==
12856 		    vm_map_round_page(copy->size,
12857 		    VM_MAP_PAGE_MASK(src_map)));
12858 		copy->size = copy_size;
12859 	}
12860 
12861 	*copy_result = copy;
12862 	return KERN_SUCCESS;
12863 
12864 #undef  RETURN
12865 }
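/*
 * A minimal sketch (not in the original source) of driving
 * vm_map_copyin_internal() directly with the flags decoded above; the
 * helper name and map are hypothetical.  VM_MAP_COPYIN_SRC_DESTROY turns
 * the copy into a move: on success the source range is unmapped and
 * survives only in the returned vm_map_copy_t.
 */
#if 0 /* illustrative sketch only -- not compiled */
static kern_return_t
example_move_range(
	vm_map_t         map,           /* hypothetical map */
	vm_map_address_t addr,
	vm_map_size_t    len,
	vm_map_copy_t    *copy_result)  /* OUT */
{
	return vm_map_copyin_internal(map, addr, len,
	           VM_MAP_COPYIN_SRC_DESTROY, copy_result);
}
#endif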
12866 
12867 kern_return_t
12868 vm_map_copy_extract(
12869 	vm_map_t                src_map,
12870 	vm_map_address_t        src_addr,
12871 	vm_map_size_t           len,
12872 	boolean_t               do_copy,
12873 	vm_map_copy_t           *copy_result,   /* OUT */
12874 	vm_prot_t               *cur_prot,      /* IN/OUT */
12875 	vm_prot_t               *max_prot,      /* IN/OUT */
12876 	vm_inherit_t            inheritance,
12877 	vm_map_kernel_flags_t   vmk_flags)
12878 {
12879 	vm_map_copy_t   copy;
12880 	kern_return_t   kr;
12881 	vm_prot_t required_cur_prot, required_max_prot;
12882 
12883 	/*
12884 	 *	Check for copies of zero bytes.
12885 	 */
12886 
12887 	if (len == 0) {
12888 		*copy_result = VM_MAP_COPY_NULL;
12889 		return KERN_SUCCESS;
12890 	}
12891 
12892 	/*
12893 	 *	Check that the end address doesn't overflow
12894 	 */
12895 	if (src_addr + len < src_addr) {
12896 		return KERN_INVALID_ADDRESS;
12897 	}
12898 	if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
12899 		return KERN_INVALID_ADDRESS;
12900 	}
12901 
12902 	if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12903 		DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12904 	}
12905 
12906 	required_cur_prot = *cur_prot;
12907 	required_max_prot = *max_prot;
12908 
12909 	/*
12910 	 *	Allocate a header element for the list.
12911 	 *
12912 	 *	Use the start and end in the header to
12913 	 *	remember the endpoints prior to rounding.
12914 	 */
12915 
12916 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12917 	copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
12918 	copy->offset = 0;
12919 	copy->size = len;
12920 
12921 	kr = vm_map_remap_extract(src_map,
12922 	    src_addr,
12923 	    len,
12924 	    do_copy,             /* copy */
12925 	    copy,
12926 	    cur_prot,            /* IN/OUT */
12927 	    max_prot,            /* IN/OUT */
12928 	    inheritance,
12929 	    vmk_flags);
12930 	if (kr != KERN_SUCCESS) {
12931 		vm_map_copy_discard(copy);
12932 		return kr;
12933 	}
12934 	if (required_cur_prot != VM_PROT_NONE) {
12935 		assert((*cur_prot & required_cur_prot) == required_cur_prot);
12936 		assert((*max_prot & required_max_prot) == required_max_prot);
12937 	}
12938 
12939 	*copy_result = copy;
12940 	return KERN_SUCCESS;
12941 }
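/*
 * A hedged sketch (not part of the original source) of the IN/OUT
 * protection contract asserted above: the caller passes the protections
 * it requires and, on success, receives the protections the extracted
 * range actually allows.  The helper name is hypothetical.
 */
#if 0 /* illustrative sketch only -- not compiled */
static kern_return_t
example_extract_readable(
	vm_map_t         src_map,       /* hypothetical source map */
	vm_map_address_t src_addr,
	vm_map_size_t    len,
	vm_map_copy_t    *copy_result)  /* OUT */
{
	/* IN: require at least read access... */
	vm_prot_t cur_prot = VM_PROT_READ;
	vm_prot_t max_prot = VM_PROT_READ;

	/* ...OUT: on success they report what the mapping permits */
	return vm_map_copy_extract(src_map, src_addr, len,
	           TRUE,                /* do_copy */
	           copy_result,
	           &cur_prot, &max_prot,
	           VM_INHERIT_DEFAULT,
	           VM_MAP_KERNEL_FLAGS_NONE);
}
#endif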
12942 
12943 static void
12944 vm_map_fork_share(
12945 	vm_map_t        old_map,
12946 	vm_map_entry_t  old_entry,
12947 	vm_map_t        new_map)
12948 {
12949 	vm_object_t     object;
12950 	vm_map_entry_t  new_entry;
12951 
12952 	/*
12953 	 *	New sharing code.  New map entry
12954 	 *	references original object.  Internal
12955 	 *	objects use asynchronous copy algorithm for
12956 	 *	future copies.  First make sure we have
12957 	 *	the right object.  If we need a shadow,
12958 	 *	or someone else already has one, then
12959 	 *	make a new shadow and share it.
12960 	 */
12961 
12962 	if (!old_entry->is_sub_map) {
12963 		object = VME_OBJECT(old_entry);
12964 	}
12965 
12966 	if (old_entry->is_sub_map) {
12967 		assert(old_entry->wired_count == 0);
12968 #ifndef NO_NESTED_PMAP
12969 #if !PMAP_FORK_NEST
12970 		if (old_entry->use_pmap) {
12971 			kern_return_t   result;
12972 
12973 			result = pmap_nest(new_map->pmap,
12974 			    (VME_SUBMAP(old_entry))->pmap,
12975 			    (addr64_t)old_entry->vme_start,
12976 			    (uint64_t)(old_entry->vme_end - old_entry->vme_start));
12977 			if (result) {
12978 				panic("vm_map_fork_share: pmap_nest failed!");
12979 			}
12980 		}
12981 #endif /* !PMAP_FORK_NEST */
12982 #endif  /* NO_NESTED_PMAP */
12983 	} else if (object == VM_OBJECT_NULL) {
12984 		object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
12985 		    old_entry->vme_start));
12986 		VME_OFFSET_SET(old_entry, 0);
12987 		VME_OBJECT_SET(old_entry, object, false, 0);
12988 		old_entry->use_pmap = TRUE;
12989 //		assert(!old_entry->needs_copy);
12990 	} else if (object->copy_strategy !=
12991 	    MEMORY_OBJECT_COPY_SYMMETRIC) {
12992 		/*
12993 		 *	We are already using an asymmetric
12994 		 *	copy, and therefore we already have
12995 		 *	the right object.
12996 		 */
12997 
12998 		assert(!old_entry->needs_copy);
12999 	} else if (old_entry->needs_copy ||       /* case 1 */
13000 	    object->shadowed ||                 /* case 2 */
13001 	    (!object->true_share &&             /* case 3 */
13002 	    !old_entry->is_shared &&
13003 	    (object->vo_size >
13004 	    (vm_map_size_t)(old_entry->vme_end -
13005 	    old_entry->vme_start)))) {
13006 		bool is_writable;
13007 
13008 		/*
13009 		 *	We need to create a shadow.
13010 		 *	There are three cases here.
13011 		 *	In the first case, we need to
13012 		 *	complete a deferred symmetrical
13013 		 *	copy that we participated in.
13014 		 *	In the second and third cases,
13015 		 *	we need to create the shadow so
13016 		 *	that changes that we make to the
13017 		 *	object do not interfere with
13018 		 *	any symmetrical copies which
13019 		 *	have occurred (case 2) or which
13020 		 *	might occur (case 3).
13021 		 *
13022 		 *	The first case is when we had
13023 		 *	deferred shadow object creation
13024 		 *	via the entry->needs_copy mechanism.
13025 		 *	This mechanism only works when
13026 		 *	only one entry points to the source
13027 		 *	object, and we are about to create
13028 		 *	a second entry pointing to the
13029 		 *	same object. The problem is that
13030 		 *	there is no way of mapping from
13031 		 *	an object to the entries pointing
13032 		 *	to it. (Deferred shadow creation
13033 		 *	works with one entry because it occurs
13034 		 *	at fault time, and we walk from the
13035 		 *	entry to the object when handling
13036 		 *	the fault.)
13037 		 *
13038 		 *	The second case is when the object
13039 		 *	to be shared has already been copied
13040 		 *	with a symmetric copy, but we point
13041 		 *	directly to the object without
13042 		 *	needs_copy set in our entry. (This
13043 		 *	can happen because different ranges
13044 		 *	of an object can be pointed to by
13045 		 *	different entries. In particular,
13046 		 *	a single entry pointing to an object
13047 		 *	can be split by a call to vm_inherit,
13048 		 *	which, combined with task_create, can
13049 		 *	result in the different entries
13050 		 *	having different needs_copy values.)
13051 		 *	The shadowed flag in the object allows
13052 		 *	us to detect this case. The problem
13053 		 *	with this case is that if this object
13054 		 *	has or will have shadows, then we
13055 		 *	must not perform an asymmetric copy
13056 		 *	of this object, since such a copy
13057 		 *	allows the object to be changed, which
13058 		 *	will break the previous symmetrical
13059 		 *	copies (which rely upon the object
13060 		 *	not changing). In a sense, the shadowed
13061 		 *	flag says "don't change this object".
13062 		 *	We fix this by creating a shadow
13063 		 *	object for this object, and sharing
13064 		 *	that. This works because we are free
13065 		 *	to change the shadow object (and thus
13066 		 *	to use an asymmetric copy strategy);
13067 		 *	this is also semantically correct,
13068 		 *	since this object is temporary, and
13069 		 *	therefore a copy of the object is
13070 		 *	as good as the object itself. (This
13071 		 *	is not true for permanent objects,
13072 		 *	since the pager needs to see changes,
13073 		 *	which won't happen if the changes
13074 		 *	are made to a copy.)
13075 		 *
13076 		 *	The third case is when the object
13077 		 *	to be shared has parts sticking
13078 		 *	outside of the entry we're working
13079 		 *	with, and thus may in the future
13080 		 *	be subject to a symmetrical copy.
13081 		 *	(This is a preemptive version of
13082 		 *	case 2.)
13083 		 */
13084 		VME_OBJECT_SHADOW(old_entry,
13085 		    (vm_map_size_t) (old_entry->vme_end -
13086 		    old_entry->vme_start),
13087 		    vm_map_always_shadow(old_map));
13088 
13089 		/*
13090 		 *	If we're making a shadow for other than
13091 		 *	copy on write reasons, then we have
13092 		 *	to remove write permission.
13093 		 */
13094 
13095 		is_writable = false;
13096 		if (old_entry->protection & VM_PROT_WRITE) {
13097 			is_writable = true;
13098 #if __arm64e__
13099 		} else if (old_entry->used_for_tpro) {
13100 			is_writable = true;
13101 #endif /* __arm64e__ */
13102 		}
13103 		if (!old_entry->needs_copy && is_writable) {
13104 			vm_prot_t prot;
13105 
13106 			if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13107 				panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13108 				    __FUNCTION__, old_map, old_map->pmap,
13109 				    old_entry,
13110 				    (uint64_t)old_entry->vme_start,
13111 				    (uint64_t)old_entry->vme_end,
13112 				    old_entry->protection);
13113 			}
13114 
13115 			prot = old_entry->protection & ~VM_PROT_WRITE;
13116 
13117 			if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13118 				panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13119 				    __FUNCTION__, old_map, old_map->pmap,
13120 				    old_entry,
13121 				    (uint64_t)old_entry->vme_start,
13122 				    (uint64_t)old_entry->vme_end,
13123 				    prot);
13124 			}
13125 
13126 			if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
13127 				prot |= VM_PROT_EXECUTE;
13128 			}
13129 
13130 
13131 			if (old_map->mapped_in_other_pmaps) {
13132 				vm_object_pmap_protect(
13133 					VME_OBJECT(old_entry),
13134 					VME_OFFSET(old_entry),
13135 					(old_entry->vme_end -
13136 					old_entry->vme_start),
13137 					PMAP_NULL,
13138 					PAGE_SIZE,
13139 					old_entry->vme_start,
13140 					prot);
13141 			} else {
13142 				pmap_protect(old_map->pmap,
13143 				    old_entry->vme_start,
13144 				    old_entry->vme_end,
13145 				    prot);
13146 			}
13147 		}
13148 
13149 		old_entry->needs_copy = FALSE;
13150 		object = VME_OBJECT(old_entry);
13151 	}
13152 
13153 
13154 	/*
13155 	 *	If object was using a symmetric copy strategy,
13156 	 *	change its copy strategy to the default
13157 	 *	asymmetric copy strategy, which is copy_delay
13158 	 *	in the non-norma case and copy_call in the
13159 	 *	norma case. Bump the reference count for the
13160 	 *	new entry.
13161 	 */
13162 
13163 	if (old_entry->is_sub_map) {
13164 		vm_map_reference(VME_SUBMAP(old_entry));
13165 	} else {
13166 		vm_object_lock(object);
13167 		vm_object_reference_locked(object);
13168 		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
13169 			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
13170 		}
13171 		vm_object_unlock(object);
13172 	}
13173 
13174 	/*
13175 	 *	Clone the entry, using object ref from above.
13176 	 *	Mark both entries as shared.
13177 	 */
13178 
13179 	new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
13180 	vm_map_entry_copy(old_map, new_entry, old_entry);
13181 	old_entry->is_shared = TRUE;
13182 	new_entry->is_shared = TRUE;
13183 
13184 	/*
13185 	 * We're dealing with a shared mapping, so the resulting mapping
13186 	 * should inherit some of the original mapping's accounting settings.
13187 	 * "iokit_acct" should have been cleared in vm_map_entry_copy().
13188 	 * "use_pmap" should stay the same as before (if it hasn't been reset
13189 	 * to TRUE when we cleared "iokit_acct").
13190 	 */
13191 	assert(!new_entry->iokit_acct);
13192 
13193 	/*
13194 	 *	If the old entry's inheritance is VM_INHERIT_NONE,
13195 	 *	the new entry is for a corpse fork; remove the
13196 	 *	write permission from the new entry.
13197 	 */
13198 	if (old_entry->inheritance == VM_INHERIT_NONE) {
13199 		new_entry->protection &= ~VM_PROT_WRITE;
13200 		new_entry->max_protection &= ~VM_PROT_WRITE;
13201 	}
13202 
13203 	/*
13204 	 *	Insert the entry into the new map -- we
13205 	 *	know we're inserting at the end of the new
13206 	 *	map.
13207 	 */
13208 
13209 	vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
13210 	    VM_MAP_KERNEL_FLAGS_NONE);
13211 
13212 	/*
13213 	 *	Update the physical map
13214 	 */
13215 
13216 	if (old_entry->is_sub_map) {
13217 		/* Bill Angell pmap support goes here */
13218 	} else {
13219 		pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
13220 		    old_entry->vme_end - old_entry->vme_start,
13221 		    old_entry->vme_start);
13222 	}
13223 }
13224 
13225 static boolean_t
13226 vm_map_fork_copy(
13227 	vm_map_t        old_map,
13228 	vm_map_entry_t  *old_entry_p,
13229 	vm_map_t        new_map,
13230 	int             vm_map_copyin_flags)
13231 {
13232 	vm_map_entry_t old_entry = *old_entry_p;
13233 	vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
13234 	vm_map_offset_t start = old_entry->vme_start;
13235 	vm_map_copy_t copy;
13236 	vm_map_entry_t last = vm_map_last_entry(new_map);
13237 
13238 	vm_map_unlock(old_map);
13239 	/*
13240 	 *	Use maxprot version of copyin because we
13241 	 *	care about whether this memory can ever
13242 	 *	be accessed, not just whether it's accessible
13243 	 *	right now.
13244 	 */
13245 	vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
13246 	if (vm_map_copyin_internal(old_map, start, entry_size,
13247 	    vm_map_copyin_flags, &copy)
13248 	    != KERN_SUCCESS) {
13249 		/*
13250 		 *	The map might have changed while it
13251 		 *	was unlocked, check it again.  Skip
13252 		 *	any blank space or permanently
13253 		 *	unreadable region.
13254 		 */
13255 		vm_map_lock(old_map);
13256 		if (!vm_map_lookup_entry(old_map, start, &last) ||
13257 		    (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
13258 			last = last->vme_next;
13259 		}
13260 		*old_entry_p = last;
13261 
13262 		/*
13263 		 * XXX	For some error returns, want to
13264 		 * XXX	skip to the next element.  Note
13265 		 *	that INVALID_ADDRESS and
13266 		 *	PROTECTION_FAILURE are handled above.
13267 		 */
13268 
13269 		return FALSE;
13270 	}
13271 
13272 	/*
13273 	 * Assert that the vm_map_copy is coming from the right
13274 	 * zone and hasn't been forged
13275 	 */
13276 	vm_map_copy_require(copy);
13277 
13278 	/*
13279 	 *	Insert the copy into the new map
13280 	 */
13281 	vm_map_copy_insert(new_map, last, copy);
13282 
13283 	/*
13284 	 *	Pick up the traversal at the end of
13285 	 *	the copied region.
13286 	 */
13287 
13288 	vm_map_lock(old_map);
13289 	start += entry_size;
13290 	if (!vm_map_lookup_entry(old_map, start, &last)) {
13291 		last = last->vme_next;
13292 	} else {
13293 		if (last->vme_start == start) {
13294 			/*
13295 			 * No need to clip here and we don't
13296 			 * want to cause any unnecessary
13297 			 * unnesting...
13298 			 */
13299 		} else {
13300 			vm_map_clip_start(old_map, last, start);
13301 		}
13302 	}
13303 	*old_entry_p = last;
13304 
13305 	return TRUE;
13306 }
13307 
13308 #if PMAP_FORK_NEST
13309 #define PMAP_FORK_NEST_DEBUG 0
13310 static inline void
13311 vm_map_fork_unnest(
13312 	pmap_t new_pmap,
13313 	vm_map_offset_t pre_nested_start,
13314 	vm_map_offset_t pre_nested_end,
13315 	vm_map_offset_t start,
13316 	vm_map_offset_t end)
13317 {
13318 	kern_return_t kr;
13319 	vm_map_offset_t nesting_mask, start_unnest, end_unnest;
13320 
13321 	assertf(pre_nested_start <= pre_nested_end,
13322 	    "pre_nested start 0x%llx end 0x%llx",
13323 	    (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13324 	assertf(start <= end,
13325 	    "start 0x%llx end 0x%llx",
13326 	    (uint64_t) start, (uint64_t)end);
13327 
13328 	if (pre_nested_start == pre_nested_end) {
13329 		/* nothing was pre-nested: done */
13330 		return;
13331 	}
13332 	if (end <= pre_nested_start) {
13333 		/* fully before pre-nested range: done */
13334 		return;
13335 	}
13336 	if (start >= pre_nested_end) {
13337 		/* fully after pre-nested range: done */
13338 		return;
13339 	}
13340 	/* ignore parts of range outside of pre_nested range */
13341 	if (start < pre_nested_start) {
13342 		start = pre_nested_start;
13343 	}
13344 	if (end > pre_nested_end) {
13345 		end = pre_nested_end;
13346 	}
13347 	nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
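	/*
	 * Round out to whole nesting granules.  For example, with a
	 * hypothetical 32MB granule (nesting_mask 0x1ffffff), a start of
	 * 0x1c04000 rounds down to 0x0 and an end of 0x2008000 rounds up
	 * to 0x4000000.
	 */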
13348 	start_unnest = start & ~nesting_mask;
13349 	end_unnest = (end + nesting_mask) & ~nesting_mask;
13350 	kr = pmap_unnest(new_pmap,
13351 	    (addr64_t)start_unnest,
13352 	    (uint64_t)(end_unnest - start_unnest));
13353 #if PMAP_FORK_NEST_DEBUG
13354 	printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
13355 #endif /* PMAP_FORK_NEST_DEBUG */
13356 	assertf(kr == KERN_SUCCESS,
13357 	    "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
13358 	    (uint64_t)start, (uint64_t)end, new_pmap,
13359 	    (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
13360 	    kr);
13361 }
13362 #endif /* PMAP_FORK_NEST */
13363 
13364 void
13365 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13366 {
13367 	new_map->size_limit = old_map->size_limit;
13368 	new_map->data_limit = old_map->data_limit;
13369 	new_map->user_wire_limit = old_map->user_wire_limit;
13370 	new_map->reserved_regions = old_map->reserved_regions;
13371 }
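/*
 * A hedged usage sketch (not from the original source) for vm_map_fork()
 * below: forking a task's map for corpse creation.  The ledger, map and
 * helper name are hypothetical; the option bits are the ones handled by
 * vm_map_fork() itself.
 */
#if 0 /* illustrative sketch only -- not compiled */
static vm_map_t
example_fork_for_corpse(ledger_t ledger, vm_map_t task_map)
{
	/*
	 * Share inherit-none entries and collect footprint data so the
	 * corpse can be autopsied without the original task's pmap.
	 */
	return vm_map_fork(ledger, task_map,
	           VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
	           VM_MAP_FORK_CORPSE_FOOTPRINT);
}
#endif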
13372 
13373 /*
13374  *	vm_map_fork:
13375  *
13376  *	Create and return a new map based on the old
13377  *	map, according to the inheritance values on the
13378  *	regions in that map and the options.
13379  *
13380  *	The source map must not be locked.
13381  */
13382 vm_map_t
13383 vm_map_fork(
13384 	ledger_t        ledger,
13385 	vm_map_t        old_map,
13386 	int             options)
13387 {
13388 	pmap_t          new_pmap;
13389 	vm_map_t        new_map;
13390 	vm_map_entry_t  old_entry;
13391 	vm_map_size_t   new_size = 0, entry_size;
13392 	vm_map_entry_t  new_entry;
13393 	boolean_t       src_needs_copy;
13394 	boolean_t       new_entry_needs_copy;
13395 	boolean_t       pmap_is64bit;
13396 	int             vm_map_copyin_flags;
13397 	vm_inherit_t    old_entry_inheritance;
13398 	int             map_create_options;
13399 	kern_return_t   footprint_collect_kr;
13400 
13401 	if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
13402 	    VM_MAP_FORK_PRESERVE_PURGEABLE |
13403 	    VM_MAP_FORK_CORPSE_FOOTPRINT)) {
13404 		/* unsupported option */
13405 		return VM_MAP_NULL;
13406 	}
13407 
13408 	pmap_is64bit =
13409 #if defined(__i386__) || defined(__x86_64__)
13410 	    old_map->pmap->pm_task_map != TASK_MAP_32BIT;
13411 #elif defined(__arm64__)
13412 	    old_map->pmap->is_64bit;
13413 #else
13414 #error Unknown architecture.
13415 #endif
13416 
13417 	unsigned int pmap_flags = 0;
13418 	pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
13419 #if defined(HAS_APPLE_PAC)
13420 	pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
13421 #endif
13422 #if CONFIG_ROSETTA
13423 	pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
13424 #endif
13425 #if PMAP_CREATE_FORCE_4K_PAGES
13426 	if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
13427 	    PAGE_SIZE != FOURK_PAGE_SIZE) {
13428 		pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
13429 	}
13430 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
13431 	new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
13432 	if (new_pmap == NULL) {
13433 		return VM_MAP_NULL;
13434 	}
13435 
13436 	vm_map_reference(old_map);
13437 	vm_map_lock(old_map);
13438 
13439 	map_create_options = 0;
13440 	if (old_map->hdr.entries_pageable) {
13441 		map_create_options |= VM_MAP_CREATE_PAGEABLE;
13442 	}
13443 	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13444 		map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
13445 		footprint_collect_kr = KERN_SUCCESS;
13446 	}
13447 	new_map = vm_map_create_options(new_pmap,
13448 	    old_map->min_offset,
13449 	    old_map->max_offset,
13450 	    map_create_options);
13451 
13452 	/* inherit cs_enforcement */
13453 	vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
13454 
13455 	vm_map_lock(new_map);
13456 	vm_commit_pagezero_status(new_map);
13457 	/* inherit the parent map's page size */
13458 	vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));
13459 
13460 	/* inherit the parent rlimits */
13461 	vm_map_inherit_limits(new_map, old_map);
13462 
13463 #if CONFIG_MAP_RANGES
13464 	/* inherit the parent map's VM ranges */
13465 	vm_map_range_fork(new_map, old_map);
13466 #endif
13467 
13468 #if CODE_SIGNING_MONITOR
13469 	/* Prepare the monitor for the fork */
13470 	csm_fork_prepare(old_map->pmap, new_pmap);
13471 #endif
13472 
13473 #if PMAP_FORK_NEST
13474 	/*
13475 	 * Pre-nest the shared region's pmap.
13476 	 */
13477 	vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
13478 	pmap_fork_nest(old_map->pmap, new_pmap,
13479 	    &pre_nested_start, &pre_nested_end);
13480 #if PMAP_FORK_NEST_DEBUG
13481 	printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13482 #endif /* PMAP_FORK_NEST_DEBUG */
13483 #endif /* PMAP_FORK_NEST */
13484 
13485 	for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
13486 		/*
13487 		 * Abort any corpse collection if the system is shutting down.
13488 		 */
13489 		if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13490 		    get_system_inshutdown()) {
13491 #if PMAP_FORK_NEST
13492 			new_entry = vm_map_last_entry(new_map);
13493 			if (new_entry == vm_map_to_entry(new_map)) {
13494 				/* unnest all that was pre-nested */
13495 				vm_map_fork_unnest(new_pmap,
13496 				    pre_nested_start, pre_nested_end,
13497 				    vm_map_min(new_map), vm_map_max(new_map));
13498 			} else if (new_entry->vme_end < vm_map_max(new_map)) {
13499 				/* unnest hole at the end, if pre-nested */
13500 				vm_map_fork_unnest(new_pmap,
13501 				    pre_nested_start, pre_nested_end,
13502 				    new_entry->vme_end, vm_map_max(new_map));
13503 			}
13504 #endif /* PMAP_FORK_NEST */
13505 			vm_map_corpse_footprint_collect_done(new_map);
13506 			vm_map_unlock(new_map);
13507 			vm_map_unlock(old_map);
13508 			vm_map_deallocate(new_map);
13509 			vm_map_deallocate(old_map);
13510 			printf("Aborting corpse map due to system shutdown\n");
13511 			return VM_MAP_NULL;
13512 		}
13513 
13514 		entry_size = old_entry->vme_end - old_entry->vme_start;
13515 
13516 #if PMAP_FORK_NEST
13517 		/*
13518 		 * Undo any unnecessary pre-nesting.
13519 		 */
13520 		vm_map_offset_t prev_end;
13521 		if (old_entry == vm_map_first_entry(old_map)) {
13522 			prev_end = vm_map_min(old_map);
13523 		} else {
13524 			prev_end = old_entry->vme_prev->vme_end;
13525 		}
13526 		if (prev_end < old_entry->vme_start) {
13527 			/* unnest hole before this entry, if pre-nested */
13528 			vm_map_fork_unnest(new_pmap,
13529 			    pre_nested_start, pre_nested_end,
13530 			    prev_end, old_entry->vme_start);
13531 		}
13532 		if (old_entry->is_sub_map && old_entry->use_pmap) {
13533 			/* keep this entry nested in the child */
13534 #if PMAP_FORK_NEST_DEBUG
13535 			printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
13536 #endif /* PMAP_FORK_NEST_DEBUG */
13537 		} else {
13538 			/* undo nesting for this entry, if pre-nested */
13539 			vm_map_fork_unnest(new_pmap,
13540 			    pre_nested_start, pre_nested_end,
13541 			    old_entry->vme_start, old_entry->vme_end);
13542 		}
13543 #endif /* PMAP_FORK_NEST */
13544 
13545 		old_entry_inheritance = old_entry->inheritance;
13546 		/*
13547 		 * If the caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE
13548 		 * option, share VM_INHERIT_NONE entries that are not backed
13549 		 * by a device pager.
13550 		 */
13551 		if (old_entry_inheritance == VM_INHERIT_NONE &&
13552 		    (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
13553 		    (old_entry->protection & VM_PROT_READ) &&
13554 		    !(!old_entry->is_sub_map &&
13555 		    VME_OBJECT(old_entry) != NULL &&
13556 		    VME_OBJECT(old_entry)->pager != NULL &&
13557 		    is_device_pager_ops(
13558 			    VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
13559 			old_entry_inheritance = VM_INHERIT_SHARE;
13560 		}
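
		/*
		 * Equivalent pseudocode (sketch; "device_pager_backed" is a
		 * hypothetical predicate, not a real helper): a readable
		 * VM_INHERIT_NONE entry is shared with the child unless it
		 * is backed by a device pager:
		 *
		 *	if (readable && !device_pager_backed(old_entry))
		 *		old_entry_inheritance = VM_INHERIT_SHARE;
		 */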
13561 
13562 		if (old_entry_inheritance != VM_INHERIT_NONE &&
13563 		    (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13564 		    footprint_collect_kr == KERN_SUCCESS) {
13565 			/*
13566 			 * The corpse won't have old_map->pmap to query
13567 			 * footprint information, so collect that data now
13568 			 * and store it in new_map->vmmap_corpse_footprint
13569 			 * for later autopsy.
13570 			 */
13571 			footprint_collect_kr =
13572 			    vm_map_corpse_footprint_collect(old_map,
13573 			    old_entry,
13574 			    new_map);
13575 		}
13576 
13577 		switch (old_entry_inheritance) {
13578 		case VM_INHERIT_NONE:
13579 			break;
13580 
13581 		case VM_INHERIT_SHARE:
13582 			vm_map_fork_share(old_map, old_entry, new_map);
13583 			new_size += entry_size;
13584 			break;
13585 
13586 		case VM_INHERIT_COPY:
13587 
13588 			/*
13589 			 *	Inline the copy_quickly case;
13590 			 *	upon failure, fall back on a call
13591 			 *	to vm_map_fork_copy.
13592 			 */
13593 
13594 			if (old_entry->is_sub_map) {
13595 				break;
13596 			}
13597 			if ((old_entry->wired_count != 0) ||
13598 			    ((VME_OBJECT(old_entry) != NULL) &&
13599 			    (VME_OBJECT(old_entry)->true_share))) {
13600 				goto slow_vm_map_fork_copy;
13601 			}
13602 
13603 			new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
13604 			vm_map_entry_copy(old_map, new_entry, old_entry);
13605 			if (old_entry->vme_permanent) {
13606 				/* inherit "permanent" on fork() */
13607 				new_entry->vme_permanent = TRUE;
13608 			}
13609 
13610 			if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
13611 				new_map->jit_entry_exists = TRUE;
13612 			}
13613 
13614 			if (new_entry->is_sub_map) {
13615 				/* clear address space specifics */
13616 				new_entry->use_pmap = FALSE;
13617 			} else {
13618 				/*
13619 				 * We're dealing with a copy-on-write operation,
13620 				 * so the resulting mapping should not inherit
13621 				 * the original mapping's accounting settings.
13622 				 * "iokit_acct" should have been cleared in
13623 				 * vm_map_entry_copy().
13624 				 * "use_pmap" should be reset to its default
13625 				 * (TRUE) so that the new mapping gets
13626 				 * accounted for in the task's memory footprint.
13627 				 */
13628 				assert(!new_entry->iokit_acct);
13629 				new_entry->use_pmap = TRUE;
13630 			}
13631 
13632 			if (!vm_object_copy_quickly(
13633 				    VME_OBJECT(new_entry),
13634 				    VME_OFFSET(old_entry),
13635 				    (old_entry->vme_end -
13636 				    old_entry->vme_start),
13637 				    &src_needs_copy,
13638 				    &new_entry_needs_copy)) {
13639 				vm_map_entry_dispose(new_entry);
13640 				goto slow_vm_map_fork_copy;
13641 			}
13642 
13643 			/*
13644 			 *	Handle copy-on-write obligations
13645 			 */
13646 
13647 			if (src_needs_copy && !old_entry->needs_copy) {
13648 				vm_prot_t prot;
13649 
13650 				if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13651 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13652 					    __FUNCTION__,
13653 					    old_map, old_map->pmap, old_entry,
13654 					    (uint64_t)old_entry->vme_start,
13655 					    (uint64_t)old_entry->vme_end,
13656 					    old_entry->protection);
13657 				}
13658 
13659 				prot = old_entry->protection & ~VM_PROT_WRITE;
13660 
13661 				if (override_nx(old_map, VME_ALIAS(old_entry))
13662 				    && prot) {
13663 					prot |= VM_PROT_EXECUTE;
13664 				}
13665 
13666 				if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13667 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13668 					    __FUNCTION__,
13669 					    old_map, old_map->pmap, old_entry,
13670 					    (uint64_t)old_entry->vme_start,
13671 					    (uint64_t)old_entry->vme_end,
13672 					    prot);
13673 				}
13674 
13675 				vm_object_pmap_protect(
13676 					VME_OBJECT(old_entry),
13677 					VME_OFFSET(old_entry),
13678 					(old_entry->vme_end -
13679 					old_entry->vme_start),
13680 					((old_entry->is_shared
13681 					|| old_map->mapped_in_other_pmaps)
13682 					? PMAP_NULL :
13683 					old_map->pmap),
13684 					VM_MAP_PAGE_SIZE(old_map),
13685 					old_entry->vme_start,
13686 					prot);
13687 
13688 				assert(old_entry->wired_count == 0);
13689 				old_entry->needs_copy = TRUE;
13690 			}
13691 			new_entry->needs_copy = new_entry_needs_copy;
13692 
13693 			/*
13694 			 *	Insert the entry at the end
13695 			 *	of the map.
13696 			 */
13697 
13698 			vm_map_store_entry_link(new_map,
13699 			    vm_map_last_entry(new_map),
13700 			    new_entry,
13701 			    VM_MAP_KERNEL_FLAGS_NONE);
13702 			new_size += entry_size;
13703 			break;
13704 
13705 slow_vm_map_fork_copy:
13706 			vm_map_copyin_flags = VM_MAP_COPYIN_FORK;
13707 			if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
13708 				vm_map_copyin_flags |=
13709 				    VM_MAP_COPYIN_PRESERVE_PURGEABLE;
13710 			}
13711 			if (vm_map_fork_copy(old_map,
13712 			    &old_entry,
13713 			    new_map,
13714 			    vm_map_copyin_flags)) {
13715 				new_size += entry_size;
13716 			}
13717 			continue;
13718 		}
13719 		old_entry = old_entry->vme_next;
13720 	}
13721 
13722 #if PMAP_FORK_NEST
13723 	new_entry = vm_map_last_entry(new_map);
13724 	if (new_entry == vm_map_to_entry(new_map)) {
13725 		/* unnest all that was pre-nested */
13726 		vm_map_fork_unnest(new_pmap,
13727 		    pre_nested_start, pre_nested_end,
13728 		    vm_map_min(new_map), vm_map_max(new_map));
13729 	} else if (new_entry->vme_end < vm_map_max(new_map)) {
13730 		/* unnest hole at the end, if pre-nested */
13731 		vm_map_fork_unnest(new_pmap,
13732 		    pre_nested_start, pre_nested_end,
13733 		    new_entry->vme_end, vm_map_max(new_map));
13734 	}
13735 #endif /* PMAP_FORK_NEST */
13736 
13737 #if defined(__arm64__)
13738 	pmap_insert_commpage(new_map->pmap);
13739 #endif /* __arm64__ */
13740 
13741 	new_map->size = new_size;
13742 
13743 	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13744 		vm_map_corpse_footprint_collect_done(new_map);
13745 	}
13746 
13747 	/* Propagate JIT entitlement for the pmap layer. */
13748 	if (pmap_get_jit_entitled(old_map->pmap)) {
13749 		/* Tell the pmap that it supports JIT. */
13750 		pmap_set_jit_entitled(new_map->pmap);
13751 	}
13752 
13753 	/* Propagate TPRO settings for the pmap layer */
13754 	if (pmap_get_tpro(old_map->pmap)) {
13755 		/* Tell the pmap that it supports TPRO */
13756 		pmap_set_tpro(new_map->pmap);
13757 	}
13758 
13759 
13760 	vm_map_unlock(new_map);
13761 	vm_map_unlock(old_map);
13762 	vm_map_deallocate(old_map);
13763 
13764 	return new_map;
13765 }
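
/*
 * Illustrative usage (sketch, assuming the vm_map_fork(ledger, old_map,
 * options) signature declared earlier in this file): a regular fork()
 * duplicates the parent's address space with no options, while corpse
 * creation passes VM_MAP_FORK_CORPSE_FOOTPRINT (possibly combined with
 * the other VM_MAP_FORK_* flags handled above) so footprint data is
 * collected for later autopsy.
 *
 *	new_map = vm_map_fork(ledger, old_map, 0);
 *	corpse_map = vm_map_fork(ledger, old_map,
 *	    VM_MAP_FORK_CORPSE_FOOTPRINT);
 */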
13766 
13767 /*
13768  * vm_map_exec:
13769  *
13770  *	Set up the "new_map" with the proper execution environment according
13771  *	to the type of executable (platform, 64-bit, chroot environment).
13772  *	Map the comm page, shared region, etc.
13773  */
13774 kern_return_t
13775 vm_map_exec(
13776 	vm_map_t        new_map,
13777 	task_t          task,
13778 	boolean_t       is64bit,
13779 	void            *fsroot,
13780 	cpu_type_t      cpu,
13781 	cpu_subtype_t   cpu_subtype,
13782 	boolean_t       reslide,
13783 	boolean_t       is_driverkit,
13784 	uint32_t        rsr_version)
13785 {
13786 	SHARED_REGION_TRACE_DEBUG(
13787 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13788 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13789 		(void *)VM_KERNEL_ADDRPERM(new_map),
13790 		(void *)VM_KERNEL_ADDRPERM(task),
13791 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13792 		cpu,
13793 		cpu_subtype));
13794 	(void) vm_commpage_enter(new_map, task, is64bit);
13795 
13796 	(void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13797 
13798 	SHARED_REGION_TRACE_DEBUG(
13799 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13800 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13801 		(void *)VM_KERNEL_ADDRPERM(new_map),
13802 		(void *)VM_KERNEL_ADDRPERM(task),
13803 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13804 		cpu,
13805 		cpu_subtype));
13806 
13807 	/*
13808 	 * Some devices have region(s) of memory that shouldn't get allocated by
13809 	 * user processes. The following code creates dummy vm_map_entry_t's for each
13810 	 * of the regions that need to be reserved to prevent any allocations in
13811 	 * those regions.
13812 	 */
13813 	kern_return_t kr = KERN_FAILURE;
13814 	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
13815 	vmk_flags.vmkf_beyond_max = true;
13816 
13817 	const struct vm_reserved_region *regions = NULL;
13818 	size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
13819 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13820 
13821 	for (size_t i = 0; i < num_regions; ++i) {
13822 		vm_map_offset_t address = regions[i].vmrr_addr;
13823 
13824 		kr = vm_map_enter(
13825 			new_map,
13826 			&address,
13827 			regions[i].vmrr_size,
13828 			(vm_map_offset_t)0,
13829 			vmk_flags,
13830 			VM_OBJECT_NULL,
13831 			(vm_object_offset_t)0,
13832 			FALSE,
13833 			VM_PROT_NONE,
13834 			VM_PROT_NONE,
13835 			VM_INHERIT_COPY);
13836 
13837 		if (kr != KERN_SUCCESS) {
13838 			panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
13839 		}
13840 	}
13841 
13842 	new_map->reserved_regions = (num_regions ? TRUE : FALSE);
13843 
13844 	return KERN_SUCCESS;
13845 }
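
/*
 * Illustrative usage (sketch; "exec_setup_vm" is a hypothetical wrapper,
 * not part of xnu): the exec path calls vm_map_exec() on the fresh map
 * built for the new image, with the CPU identifiers taken from the
 * Mach-O header.
 *
 *	static kern_return_t
 *	exec_setup_vm(task_t task, vm_map_t new_map, boolean_t is64bit,
 *	    cpu_type_t cpu, cpu_subtype_t cpu_subtype)
 *	{
 *		return vm_map_exec(new_map, task, is64bit,
 *		    NULL,		// fsroot: not chroot'ed
 *		    cpu, cpu_subtype,
 *		    FALSE,		// reslide
 *		    FALSE,		// is_driverkit
 *		    0);			// rsr_version
 *	}
 */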
13846 
13847 uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
13848 uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
13849 uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
13850 uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
13851 uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
13852 uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
13853 uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
13854 uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
13855 uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
13856 uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
13857 uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
13858 uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
13859 uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13860 /*
13861  *	vm_map_lookup_and_lock_object:
13862  *
13863  *	Finds the VM object, offset, and
13864  *	protection for a given virtual address in the
13865  *	specified map, assuming a page fault of the
13866  *	type specified.
13867  *
13868  *	Returns the (object, offset, protection) for
13869  *	this address, whether it is wired down, and whether
13870  *	this map has the only reference to the data in question.
13871  *	In order to later verify this lookup, a "version"
13872  *	is returned.
13873  *	If contended != NULL, *contended will be set to
13874  *	true iff the thread had to spin or block to acquire
13875  *	an exclusive lock.
13876  *
13877  *	The map MUST be locked by the caller and WILL be
13878  *	locked on exit.  In order to guarantee the
13879  *	existence of the returned object, it is returned
13880  *	locked.
13881  *
13882  *	If a lookup is requested with "write protection"
13883  *	specified, the map may be changed to perform virtual
13884  *	copying operations, although the data referenced will
13885  *	remain the same.
13886  */
13887 kern_return_t
13888 vm_map_lookup_and_lock_object(
13889 	vm_map_t                *var_map,       /* IN/OUT */
13890 	vm_map_offset_t         vaddr,
13891 	vm_prot_t               fault_type,
13892 	int                     object_lock_type,
13893 	vm_map_version_t        *out_version,   /* OUT */
13894 	vm_object_t             *object,        /* OUT */
13895 	vm_object_offset_t      *offset,        /* OUT */
13896 	vm_prot_t               *out_prot,      /* OUT */
13897 	boolean_t               *wired,         /* OUT */
13898 	vm_object_fault_info_t  fault_info,     /* OUT */
13899 	vm_map_t                *real_map,      /* OUT */
13900 	bool                    *contended)     /* OUT */
13901 {
13902 	vm_map_entry_t                  entry;
13903 	vm_map_t                        map = *var_map;
13904 	vm_map_t                        old_map = *var_map;
13905 	vm_map_t                        cow_sub_map_parent = VM_MAP_NULL;
13906 	vm_map_offset_t                 cow_parent_vaddr = 0;
13907 	vm_map_offset_t                 old_start = 0;
13908 	vm_map_offset_t                 old_end = 0;
13909 	vm_prot_t                       prot;
13910 	boolean_t                       mask_protections;
13911 	boolean_t                       force_copy;
13912 	boolean_t                       no_force_copy_if_executable;
13913 	boolean_t                       submap_needed_copy;
13914 	vm_prot_t                       original_fault_type;
13915 	vm_map_size_t                   fault_page_mask;
13916 
13917 	/*
13918 	 * VM_PROT_IS_MASK means that the caller wants us to use "fault_type"
13919 	 * as a mask against the mapping's actual protections, not as an
13920 	 * absolute value.
13921 	 */
13922 	mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
13923 	force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
13924 	no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
13925 	fault_type &= VM_PROT_ALL;
13926 	original_fault_type = fault_type;
13927 	if (contended) {
13928 		*contended = false;
13929 	}
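
	/*
	 * Example (sketch): a caller passing
	 * (VM_PROT_READ | VM_PROT_WRITE | VM_PROT_IS_MASK) against an entry
	 * that only permits VM_PROT_READ has the fault demoted to a read
	 * fault below; without VM_PROT_IS_MASK the same request would fail
	 * with KERN_PROTECTION_FAILURE.
	 */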
13930 
13931 	*real_map = map;
13932 
13933 	fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
13934 	vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
13935 
13936 RetryLookup:
13937 	fault_type = original_fault_type;
13938 
13939 	/*
13940 	 *	If the map has an interesting hint, try it before calling
13941 	 *	the full-blown lookup routine.
13942 	 */
13943 	entry = map->hint;
13944 
13945 	if ((entry == vm_map_to_entry(map)) ||
13946 	    (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
13947 		vm_map_entry_t  tmp_entry;
13948 
13949 		/*
13950 		 *	Entry was either not a valid hint, or the vaddr
13951 		 *	was not contained in the entry, so do a full lookup.
13952 		 */
13953 		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
13954 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13955 				vm_map_unlock(cow_sub_map_parent);
13956 			}
13957 			if ((*real_map != map)
13958 			    && (*real_map != cow_sub_map_parent)) {
13959 				vm_map_unlock(*real_map);
13960 			}
13961 			return KERN_INVALID_ADDRESS;
13962 		}
13963 
13964 		entry = tmp_entry;
13965 	}
13966 	if (map == old_map) {
13967 		old_start = entry->vme_start;
13968 		old_end = entry->vme_end;
13969 	}
13970 
13971 	/*
13972 	 *	Handle submaps.  Drop lock on upper map, submap is
13973 	 *	returned locked.
13974 	 */
13975 
13976 	submap_needed_copy = FALSE;
13977 submap_recurse:
13978 	if (entry->is_sub_map) {
13979 		vm_map_offset_t         local_vaddr;
13980 		vm_map_offset_t         end_delta;
13981 		vm_map_offset_t         start_delta;
13982 		vm_map_offset_t         top_entry_saved_start;
13983 		vm_object_offset_t      top_entry_saved_offset;
13984 		vm_map_entry_t          submap_entry, saved_submap_entry;
13985 		vm_object_offset_t      submap_entry_offset;
13986 		vm_object_size_t        submap_entry_size;
13987 		vm_prot_t               subentry_protection;
13988 		vm_prot_t               subentry_max_protection;
13989 		boolean_t               subentry_no_copy_on_read;
13990 		boolean_t               subentry_permanent;
13991 		boolean_t               subentry_csm_associated;
13992 #if __arm64e__
13993 		boolean_t               subentry_used_for_tpro;
13994 #endif /* __arm64e__ */
13995 		boolean_t               mapped_needs_copy = FALSE;
13996 		vm_map_version_t        version;
13997 
13998 		assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
13999 		    "map %p (%d) entry %p submap %p (%d)\n",
14000 		    map, VM_MAP_PAGE_SHIFT(map), entry,
14001 		    VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
14002 
14003 		local_vaddr = vaddr;
14004 		top_entry_saved_start = entry->vme_start;
14005 		top_entry_saved_offset = VME_OFFSET(entry);
14006 
14007 		if ((entry->use_pmap &&
14008 		    !((fault_type & VM_PROT_WRITE) ||
14009 		    force_copy))) {
14010 			/* if real_map equals map we unlock below */
14011 			if ((*real_map != map) &&
14012 			    (*real_map != cow_sub_map_parent)) {
14013 				vm_map_unlock(*real_map);
14014 			}
14015 			*real_map = VME_SUBMAP(entry);
14016 		}
14017 
14018 		if (entry->needs_copy &&
14019 		    ((fault_type & VM_PROT_WRITE) ||
14020 		    force_copy)) {
14021 			if (!mapped_needs_copy) {
14022 				if (vm_map_lock_read_to_write(map)) {
14023 					vm_map_lock_read(map);
14024 					*real_map = map;
14025 					goto RetryLookup;
14026 				}
14027 				vm_map_lock_read(VME_SUBMAP(entry));
14028 				*var_map = VME_SUBMAP(entry);
14029 				cow_sub_map_parent = map;
14030 				/* reset base to the map before the   */
14031 				/* cow object; this is the map that   */
14032 				/* will accept the new cow object     */
14033 				old_start = entry->vme_start;
14034 				old_end = entry->vme_end;
14035 				cow_parent_vaddr = vaddr;
14036 				mapped_needs_copy = TRUE;
14037 			} else {
14038 				vm_map_lock_read(VME_SUBMAP(entry));
14039 				*var_map = VME_SUBMAP(entry);
14040 				if ((cow_sub_map_parent != map) &&
14041 				    (*real_map != map)) {
14042 					vm_map_unlock(map);
14043 				}
14044 			}
14045 		} else {
14046 			if (entry->needs_copy) {
14047 				submap_needed_copy = TRUE;
14048 			}
14049 			vm_map_lock_read(VME_SUBMAP(entry));
14050 			*var_map = VME_SUBMAP(entry);
14051 			/* Leave the map locked if it is the  */
14052 			/* target cow sub_map above; otherwise */
14053 			/* just follow the maps down to the   */
14054 			/* object.  Here we unlock, knowing we */
14055 			/* are not revisiting the map.  */
14056 			if ((*real_map != map) && (map != cow_sub_map_parent)) {
14057 				vm_map_unlock_read(map);
14058 			}
14059 		}
14060 
14061 		entry = NULL;
14062 		map = *var_map;
14063 
14064 		/* calculate the offset in the submap for vaddr */
14065 		local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
14066 		assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
14067 		    "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
14068 		    (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);
14069 
14070 RetrySubMap:
14071 		if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
14072 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14073 				vm_map_unlock(cow_sub_map_parent);
14074 			}
14075 			if ((*real_map != map)
14076 			    && (*real_map != cow_sub_map_parent)) {
14077 				vm_map_unlock(*real_map);
14078 			}
14079 			*real_map = map;
14080 			return KERN_INVALID_ADDRESS;
14081 		}
14082 
14083 		/* find the attenuated shadow of the underlying object */
14084 		/* on our target map */
14085 
14086 		/* In plain English: the submap object may extend beyond  */
14087 		/* the region mapped by the entry, or may fill only a     */
14088 		/* portion of it.  For our purposes, we only care if the  */
14089 		/* object doesn't fill.  In that case, the area that will */
14090 		/* ultimately be clipped in the top map will only need    */
14091 		/* to be as big as the portion of the underlying entry    */
14092 		/* that is actually mapped. */
14093 		start_delta = submap_entry->vme_start > top_entry_saved_offset ?
14094 		    submap_entry->vme_start - top_entry_saved_offset : 0;
14095 
14096 		end_delta =
14097 		    (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
14098 		    submap_entry->vme_end ?
14099 		    0 : (top_entry_saved_offset +
14100 		    (old_end - old_start))
14101 		    - submap_entry->vme_end;
14102 
14103 		old_start += start_delta;
14104 		old_end -= end_delta;
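
		/*
		 * Worked example (sketch): if the top-level entry maps
		 * submap offsets [0x4000, 0x10000) but the submap entry
		 * only covers [0x6000, 0xC000), then start_delta is 0x2000
		 * and end_delta is 0x4000, shrinking [old_start, old_end)
		 * to the portion actually backed by this submap entry.
		 */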
14105 
14106 		if (submap_entry->is_sub_map) {
14107 			entry = submap_entry;
14108 			vaddr = local_vaddr;
14109 			goto submap_recurse;
14110 		}
14111 
14112 		if (((fault_type & VM_PROT_WRITE) ||
14113 		    force_copy)
14114 		    && cow_sub_map_parent) {
14115 			vm_object_t     sub_object, copy_object;
14116 			vm_object_offset_t copy_offset;
14117 			vm_map_offset_t local_start;
14118 			vm_map_offset_t local_end;
14119 			boolean_t       object_copied = FALSE;
14120 			vm_object_offset_t object_copied_offset = 0;
14121 			boolean_t       object_copied_needs_copy = FALSE;
14122 			kern_return_t   kr = KERN_SUCCESS;
14123 
14124 			if (vm_map_lock_read_to_write(map)) {
14125 				vm_map_lock_read(map);
14126 				old_start -= start_delta;
14127 				old_end += end_delta;
14128 				goto RetrySubMap;
14129 			}
14130 
14131 
14132 			sub_object = VME_OBJECT(submap_entry);
14133 			if (sub_object == VM_OBJECT_NULL) {
14134 				sub_object =
14135 				    vm_object_allocate(
14136 					(vm_map_size_t)
14137 					(submap_entry->vme_end -
14138 					submap_entry->vme_start));
14139 				VME_OBJECT_SET(submap_entry, sub_object, false, 0);
14140 				VME_OFFSET_SET(submap_entry, 0);
14141 				assert(!submap_entry->is_sub_map);
14142 				assert(submap_entry->use_pmap);
14143 			}
14144 			local_start =  local_vaddr -
14145 			    (cow_parent_vaddr - old_start);
14146 			local_end = local_vaddr +
14147 			    (old_end - cow_parent_vaddr);
14148 			vm_map_clip_start(map, submap_entry, local_start);
14149 			vm_map_clip_end(map, submap_entry, local_end);
14150 			if (submap_entry->is_sub_map) {
14151 				/* unnesting was done when clipping */
14152 				assert(!submap_entry->use_pmap);
14153 			}
14154 
14155 			/* This is the COW case: let's connect */
14156 			/* an entry in our space to the underlying */
14157 			/* object in the submap, bypassing the  */
14158 			/* submap. */
14159 			submap_entry_offset = VME_OFFSET(submap_entry);
14160 			submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
14161 
14162 			if ((submap_entry->wired_count != 0 ||
14163 			    sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
14164 			    (submap_entry->protection & VM_PROT_EXECUTE) &&
14165 			    no_force_copy_if_executable) {
14166 //				printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
14167 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14168 					vm_map_unlock(cow_sub_map_parent);
14169 				}
14170 				if ((*real_map != map)
14171 				    && (*real_map != cow_sub_map_parent)) {
14172 					vm_map_unlock(*real_map);
14173 				}
14174 				*real_map = map;
14175 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
14176 				vm_map_lock_write_to_read(map);
14177 				kr = KERN_PROTECTION_FAILURE;
14178 				DTRACE_VM4(submap_no_copy_executable,
14179 				    vm_map_t, map,
14180 				    vm_object_offset_t, submap_entry_offset,
14181 				    vm_object_size_t, submap_entry_size,
14182 				    int, kr);
14183 				return kr;
14184 			}
14185 
14186 			if (submap_entry->wired_count != 0) {
14187 				vm_object_reference(sub_object);
14188 
14189 				assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
14190 				    "submap_entry %p offset 0x%llx\n",
14191 				    submap_entry, VME_OFFSET(submap_entry));
14192 
14193 				DTRACE_VM6(submap_copy_slowly,
14194 				    vm_map_t, cow_sub_map_parent,
14195 				    vm_map_offset_t, vaddr,
14196 				    vm_map_t, map,
14197 				    vm_object_size_t, submap_entry_size,
14198 				    int, submap_entry->wired_count,
14199 				    int, sub_object->copy_strategy);
14200 
14201 				saved_submap_entry = submap_entry;
14202 				version.main_timestamp = map->timestamp;
14203 				vm_map_unlock(map); /* Increments timestamp by 1 */
14204 				submap_entry = VM_MAP_ENTRY_NULL;
14205 
14206 				vm_object_lock(sub_object);
14207 				kr = vm_object_copy_slowly(sub_object,
14208 				    submap_entry_offset,
14209 				    submap_entry_size,
14210 				    FALSE,
14211 				    &copy_object);
14212 				object_copied = TRUE;
14213 				object_copied_offset = 0;
14214 				/* 4k: account for extra offset in physical page */
14215 				object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
14216 				object_copied_needs_copy = FALSE;
14217 				vm_object_deallocate(sub_object);
14218 
14219 				vm_map_lock(map);
14220 
14221 				if (kr != KERN_SUCCESS &&
14222 				    kr != KERN_MEMORY_RESTART_COPY) {
14223 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14224 						vm_map_unlock(cow_sub_map_parent);
14225 					}
14226 					if ((*real_map != map)
14227 					    && (*real_map != cow_sub_map_parent)) {
14228 						vm_map_unlock(*real_map);
14229 					}
14230 					*real_map = map;
14231 					vm_object_deallocate(copy_object);
14232 					copy_object = VM_OBJECT_NULL;
14233 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
14234 					vm_map_lock_write_to_read(map);
14235 					DTRACE_VM4(submap_copy_error_slowly,
14236 					    vm_object_t, sub_object,
14237 					    vm_object_offset_t, submap_entry_offset,
14238 					    vm_object_size_t, submap_entry_size,
14239 					    int, kr);
14240 					vm_map_lookup_and_lock_object_copy_slowly_error++;
14241 					return kr;
14242 				}
14243 
14244 				if ((kr == KERN_SUCCESS) &&
14245 				    (version.main_timestamp + 1) == map->timestamp) {
14246 					submap_entry = saved_submap_entry;
14247 				} else {
14248 					saved_submap_entry = NULL;
14249 					old_start -= start_delta;
14250 					old_end += end_delta;
14251 					vm_object_deallocate(copy_object);
14252 					copy_object = VM_OBJECT_NULL;
14253 					vm_map_lock_write_to_read(map);
14254 					vm_map_lookup_and_lock_object_copy_slowly_restart++;
14255 					goto RetrySubMap;
14256 				}
14257 				vm_map_lookup_and_lock_object_copy_slowly_count++;
14258 				vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
14259 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
14260 					vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
14261 				}
14262 			} else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
14263 				submap_entry_offset = VME_OFFSET(submap_entry);
14264 				copy_object = VM_OBJECT_NULL;
14265 				object_copied_offset = submap_entry_offset;
14266 				object_copied_needs_copy = FALSE;
14267 				DTRACE_VM6(submap_copy_strategically,
14268 				    vm_map_t, cow_sub_map_parent,
14269 				    vm_map_offset_t, vaddr,
14270 				    vm_map_t, map,
14271 				    vm_object_size_t, submap_entry_size,
14272 				    int, submap_entry->wired_count,
14273 				    int, sub_object->copy_strategy);
14274 				kr = vm_object_copy_strategically(
14275 					sub_object,
14276 					submap_entry_offset,
14277 					submap_entry->vme_end - submap_entry->vme_start,
14278 					false, /* forking */
14279 					&copy_object,
14280 					&object_copied_offset,
14281 					&object_copied_needs_copy);
14282 				if (kr == KERN_MEMORY_RESTART_COPY) {
14283 					old_start -= start_delta;
14284 					old_end += end_delta;
14285 					vm_object_deallocate(copy_object);
14286 					copy_object = VM_OBJECT_NULL;
14287 					vm_map_lock_write_to_read(map);
14288 					vm_map_lookup_and_lock_object_copy_strategically_restart++;
14289 					goto RetrySubMap;
14290 				}
14291 				if (kr != KERN_SUCCESS) {
14292 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14293 						vm_map_unlock(cow_sub_map_parent);
14294 					}
14295 					if ((*real_map != map)
14296 					    && (*real_map != cow_sub_map_parent)) {
14297 						vm_map_unlock(*real_map);
14298 					}
14299 					*real_map = map;
14300 					vm_object_deallocate(copy_object);
14301 					copy_object = VM_OBJECT_NULL;
14302 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
14303 					vm_map_lock_write_to_read(map);
14304 					DTRACE_VM4(submap_copy_error_strategically,
14305 					    vm_object_t, sub_object,
14306 					    vm_object_offset_t, submap_entry_offset,
14307 					    vm_object_size_t, submap_entry_size,
14308 					    int, kr);
14309 					vm_map_lookup_and_lock_object_copy_strategically_error++;
14310 					return kr;
14311 				}
14312 				assert(copy_object != VM_OBJECT_NULL);
14313 				assert(copy_object != sub_object);
14314 				object_copied = TRUE;
14315 				vm_map_lookup_and_lock_object_copy_strategically_count++;
14316 				vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
14317 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
14318 					vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
14319 				}
14320 			} else {
14321 				/* set up shadow object */
14322 				object_copied = FALSE;
14323 				copy_object = sub_object;
14324 				vm_object_lock(sub_object);
14325 				vm_object_reference_locked(sub_object);
14326 				VM_OBJECT_SET_SHADOWED(sub_object, TRUE);
14327 				vm_object_unlock(sub_object);
14328 
14329 				assert(submap_entry->wired_count == 0);
14330 				submap_entry->needs_copy = TRUE;
14331 
14332 				prot = submap_entry->protection;
14333 				if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14334 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14335 					    __FUNCTION__,
14336 					    map, map->pmap, submap_entry,
14337 					    (uint64_t)submap_entry->vme_start,
14338 					    (uint64_t)submap_entry->vme_end,
14339 					    prot);
14340 				}
14341 				prot = prot & ~VM_PROT_WRITE;
14342 				if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14343 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14344 					    __FUNCTION__,
14345 					    map, map->pmap, submap_entry,
14346 					    (uint64_t)submap_entry->vme_start,
14347 					    (uint64_t)submap_entry->vme_end,
14348 					    prot);
14349 				}
14350 
14351 				if (override_nx(old_map,
14352 				    VME_ALIAS(submap_entry))
14353 				    && prot) {
14354 					prot |= VM_PROT_EXECUTE;
14355 				}
14356 
14357 				vm_object_pmap_protect(
14358 					sub_object,
14359 					VME_OFFSET(submap_entry),
14360 					submap_entry->vme_end -
14361 					submap_entry->vme_start,
14362 					(submap_entry->is_shared
14363 					|| map->mapped_in_other_pmaps) ?
14364 					PMAP_NULL : map->pmap,
14365 					VM_MAP_PAGE_SIZE(map),
14366 					submap_entry->vme_start,
14367 					prot);
14368 				vm_map_lookup_and_lock_object_copy_shadow_count++;
14369 				vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
14370 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
14371 					vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
14372 				}
14373 			}
14374 
14375 			/*
14376 			 * Adjust the fault offset to the submap entry.
14377 			 */
14378 			copy_offset = (local_vaddr -
14379 			    submap_entry->vme_start +
14380 			    VME_OFFSET(submap_entry));
14381 
14382 			/* This works differently than the */
14383 			/* normal submap case. We go back  */
14384 			/* to the parent of the cow map and*/
14385 			/* clip out the target portion of  */
14386 			/* the sub_map, substituting the   */
14387 			/* new copy object.                */
14388 
14389 			subentry_protection = submap_entry->protection;
14390 			subentry_max_protection = submap_entry->max_protection;
14391 			subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
14392 			subentry_permanent = submap_entry->vme_permanent;
14393 			subentry_csm_associated = submap_entry->csm_associated;
14394 #if __arm64e__
14395 			subentry_used_for_tpro = submap_entry->used_for_tpro;
14396 #endif // __arm64e__
14397 			vm_map_unlock(map);
14398 			submap_entry = NULL; /* not valid after map unlock */
14399 
14400 			local_start = old_start;
14401 			local_end = old_end;
14402 			map = cow_sub_map_parent;
14403 			*var_map = cow_sub_map_parent;
14404 			vaddr = cow_parent_vaddr;
14405 			cow_sub_map_parent = NULL;
14406 
14407 			if (!vm_map_lookup_entry(map,
14408 			    vaddr, &entry)) {
14409 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14410 					vm_map_unlock(cow_sub_map_parent);
14411 				}
14412 				if ((*real_map != map)
14413 				    && (*real_map != cow_sub_map_parent)) {
14414 					vm_map_unlock(*real_map);
14415 				}
14416 				*real_map = map;
14417 				vm_object_deallocate(
14418 					copy_object);
14419 				copy_object = VM_OBJECT_NULL;
14420 				vm_map_lock_write_to_read(map);
14421 				DTRACE_VM4(submap_lookup_post_unlock,
14422 				    uint64_t, (uint64_t)entry->vme_start,
14423 				    uint64_t, (uint64_t)entry->vme_end,
14424 				    vm_map_offset_t, vaddr,
14425 				    int, object_copied);
14426 				return KERN_INVALID_ADDRESS;
14427 			}
14428 
14429 			/* clip out the portion of space */
14430 			/* mapped by the sub map which   */
14431 			/* corresponds to the underlying */
14432 			/* object */
14433 
14434 			/*
14435 			 * Clip (and unnest) the smallest nested chunk
14436 			 * possible around the faulting address...
14437 			 */
14438 			local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14439 			local_end = local_start + pmap_shared_region_size_min(map->pmap);
14440 			/*
14441 			 * ... but don't go beyond the "old_start" to "old_end"
14442 			 * range, to avoid spanning over another VM region
14443 			 * with a possibly different VM object and/or offset.
14444 			 */
14445 			if (local_start < old_start) {
14446 				local_start = old_start;
14447 			}
14448 			if (local_end > old_end) {
14449 				local_end = old_end;
14450 			}
14451 			/*
14452 			 * Adjust copy_offset to the start of the range.
14453 			 */
14454 			copy_offset -= (vaddr - local_start);
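
			/*
			 * Worked example (sketch, assuming a 32MB nesting
			 * granule): a fault at 0x1A3C0000 gives
			 * local_start = 0x1A000000 and local_end =
			 * 0x1C000000, after which both bounds are clamped
			 * to the top entry's [old_start, old_end) range.
			 */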
14455 
14456 			vm_map_clip_start(map, entry, local_start);
14457 			vm_map_clip_end(map, entry, local_end);
14458 			if (entry->is_sub_map) {
14459 				/* unnesting was done when clipping */
14460 				assert(!entry->use_pmap);
14461 			}
14462 
14463 			/* substitute copy object for */
14464 			/* shared map entry           */
14465 			vm_map_deallocate(VME_SUBMAP(entry));
14466 			assert(!entry->iokit_acct);
14467 			entry->use_pmap = TRUE;
14468 			VME_OBJECT_SET(entry, copy_object, false, 0);
14469 
14470 			/* propagate the submap entry's protections */
14471 			if (entry->protection != VM_PROT_READ) {
14472 				/*
14473 				 * Someone has already altered the top entry's
14474 				 * protections via vm_protect(VM_PROT_COPY).
14475 				 * Respect these new values and ignore the
14476 				 * submap entry's protections.
14477 				 */
14478 			} else {
14479 				/*
14480 				 * Regular copy-on-write: propagate the submap
14481 				 * entry's protections to the top map entry.
14482 				 */
14483 				entry->protection |= subentry_protection;
14484 			}
14485 			entry->max_protection |= subentry_max_protection;
14486 			/* propagate some attributes from subentry */
14487 			entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14488 			entry->vme_permanent = subentry_permanent;
14489 			entry->csm_associated = subentry_csm_associated;
14490 #if __arm64e__
14491 			/* propagate TPRO iff the destination map has TPRO enabled */
14492 			if (subentry_used_for_tpro && vm_map_tpro(map)) {
14493 				entry->used_for_tpro = subentry_used_for_tpro;
14494 			}
14495 #endif /* __arm64e__ */
14496 			if ((entry->protection & VM_PROT_WRITE) &&
14497 			    (entry->protection & VM_PROT_EXECUTE) &&
14498 #if XNU_TARGET_OS_OSX
14499 			    map->pmap != kernel_pmap &&
14500 			    (vm_map_cs_enforcement(map)
14501 #if __arm64__
14502 			    || !VM_MAP_IS_EXOTIC(map)
14503 #endif /* __arm64__ */
14504 			    ) &&
14505 #endif /* XNU_TARGET_OS_OSX */
14506 #if CODE_SIGNING_MONITOR
14507 			    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
14508 #endif
14509 			    !(entry->used_for_jit) &&
14510 			    VM_MAP_POLICY_WX_STRIP_X(map)) {
14511 				DTRACE_VM3(cs_wx,
14512 				    uint64_t, (uint64_t)entry->vme_start,
14513 				    uint64_t, (uint64_t)entry->vme_end,
14514 				    vm_prot_t, entry->protection);
14515 				printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
14516 				    proc_selfpid(),
14517 				    (get_bsdtask_info(current_task())
14518 				    ? proc_name_address(get_bsdtask_info(current_task()))
14519 				    : "?"),
14520 				    __FUNCTION__, __LINE__,
14521 #if DEVELOPMENT || DEBUG
14522 				    (uint64_t)entry->vme_start,
14523 				    (uint64_t)entry->vme_end,
14524 #else /* DEVELOPMENT || DEBUG */
14525 				    (uint64_t)0,
14526 				    (uint64_t)0,
14527 #endif /* DEVELOPMENT || DEBUG */
14528 				    entry->protection);
14529 				entry->protection &= ~VM_PROT_EXECUTE;
14530 			}
14531 
14532 			if (object_copied) {
14533 				VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14534 				entry->needs_copy = object_copied_needs_copy;
14535 				entry->is_shared = FALSE;
14536 			} else {
14537 				assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14538 				assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14539 				assert(entry->wired_count == 0);
14540 				VME_OFFSET_SET(entry, copy_offset);
14541 				entry->needs_copy = TRUE;
14542 				if (map != old_map) {
14543 					entry->is_shared = TRUE;
14544 				}
14545 			}
14546 			if (entry->inheritance == VM_INHERIT_SHARE) {
14547 				entry->inheritance = VM_INHERIT_COPY;
14548 			}
14549 
14550 			vm_map_lock_write_to_read(map);
14551 		} else {
14552 			if ((cow_sub_map_parent)
14553 			    && (cow_sub_map_parent != *real_map)
14554 			    && (cow_sub_map_parent != map)) {
14555 				vm_map_unlock(cow_sub_map_parent);
14556 			}
14557 			entry = submap_entry;
14558 			vaddr = local_vaddr;
14559 		}
14560 	}
14561 
14562 	/*
14563 	 *	Check whether this task is allowed to have
14564 	 *	this page.
14565 	 */
14566 
14567 	prot = entry->protection;
14568 
14569 	if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14570 		/*
14571 		 * HACK -- if not a stack, then allow execution
14572 		 */
14573 		prot |= VM_PROT_EXECUTE;
14574 	}
14575 
14576 #if __arm64e__
14577 	/*
14578 	 * If the entry we're dealing with is TPRO and we have a write
14579 	 * fault, inject VM_PROT_WRITE into protections. This allows us
14580 	 * to maintain RO permissions when not marked as TPRO.
14581 	 */
14582 	if (entry->used_for_tpro && (fault_type & VM_PROT_WRITE)) {
14583 		prot |= VM_PROT_WRITE;
14584 	}
14585 #endif /* __arm64e__ */
14586 	if (mask_protections) {
14587 		fault_type &= prot;
14588 		if (fault_type == VM_PROT_NONE) {
14589 			goto protection_failure;
14590 		}
14591 	}
14592 	if (((fault_type & prot) != fault_type)
14593 #if __arm64__
14594 	    /* prefetch abort in execute-only page */
14595 	    && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14596 #elif defined(__x86_64__)
14597 	    /* Consider the UEXEC bit when handling an EXECUTE fault */
14598 	    && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14599 #endif
14600 	    ) {
14601 protection_failure:
14602 		if (*real_map != map) {
14603 			vm_map_unlock(*real_map);
14604 		}
14605 		*real_map = map;
14606 
14607 		if ((fault_type & VM_PROT_EXECUTE) && prot) {
14608 			log_stack_execution_failure((addr64_t)vaddr, prot);
14609 		}
14610 
14611 		DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14612 		DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14613 		/*
14614 		 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14615 		 *
14616 		 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14617 		 */
14618 		return KERN_PROTECTION_FAILURE;
14619 	}
14620 
14621 	/*
14622 	 *	If this page is not pageable, we have to get
14623 	 *	it for all possible accesses.
14624 	 */
14625 
14626 	*wired = (entry->wired_count != 0);
14627 	if (*wired) {
14628 		fault_type = prot;
14629 	}
14630 
14631 	/*
14632 	 *	If the entry was copy-on-write, we either copy now or demote access.
14633 	 */
14634 
14635 	if (entry->needs_copy) {
14636 		/*
14637 		 *	If we want to write the page, we may as well
14638 		 *	handle that now since we've got the map locked.
14639 		 *
14640 		 *	If we don't need to write the page, we just
14641 		 *	demote the permissions allowed.
14642 		 */
14643 
14644 		if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14645 			/*
14646 			 *	Make a new object, and place it in the
14647 			 *	object chain.  Note that no new references
14648 			 *	have appeared -- one just moved from the
14649 			 *	map to the new object.
14650 			 */
14651 
14652 			if (vm_map_lock_read_to_write(map)) {
14653 				vm_map_lock_read(map);
14654 				goto RetryLookup;
14655 			}
14656 
14657 			if (VME_OBJECT(entry)->shadowed == FALSE) {
14658 				vm_object_lock(VME_OBJECT(entry));
14659 				VM_OBJECT_SET_SHADOWED(VME_OBJECT(entry), TRUE);
14660 				vm_object_unlock(VME_OBJECT(entry));
14661 			}
14662 			VME_OBJECT_SHADOW(entry,
14663 			    (vm_map_size_t) (entry->vme_end -
14664 			    entry->vme_start),
14665 			    vm_map_always_shadow(map));
14666 			entry->needs_copy = FALSE;
14667 
14668 			vm_map_lock_write_to_read(map);
14669 		}
14670 		if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14671 			/*
14672 			 *	We're attempting to read a copy-on-write
14673 			 *	page -- don't allow writes.
14674 			 */
14675 
14676 			prot &= (~VM_PROT_WRITE);
14677 		}
14678 	}
14679 
14680 	if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14681 		/*
14682 		 * We went through a "needs_copy" submap without triggering
14683 		 * a copy, so granting write access to the page would bypass
14684 		 * that submap's "needs_copy".
14685 		 */
14686 		assert(!(fault_type & VM_PROT_WRITE));
14687 		assert(!*wired);
14688 		assert(!force_copy);
14689 		// printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14690 		prot &= ~VM_PROT_WRITE;
14691 	}
14692 
14693 	/*
14694 	 *	Create an object if necessary.
14695 	 */
14696 	if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14697 		if (vm_map_lock_read_to_write(map)) {
14698 			vm_map_lock_read(map);
14699 			goto RetryLookup;
14700 		}
14701 
14702 		VME_OBJECT_SET(entry,
14703 		    vm_object_allocate(
14704 			    (vm_map_size_t)(entry->vme_end -
14705 			    entry->vme_start)), false, 0);
14706 		VME_OFFSET_SET(entry, 0);
14707 		assert(entry->use_pmap);
14708 		vm_map_lock_write_to_read(map);
14709 	}
14710 
14711 	/*
14712 	 *	Return the object/offset from this entry.  If the entry
14713 	 *	was copy-on-write or empty, it has been fixed up.  Also
14714 	 *	return the protection.
14715 	 */
14716 
14717 	*offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14718 	*object = VME_OBJECT(entry);
14719 	*out_prot = prot;
14720 	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14721 
14722 	if (fault_info) {
14723 		fault_info->interruptible = THREAD_UNINT; /* for now... */
14724 		/* ... the caller will change "interruptible" if needed */
14725 		fault_info->cluster_size = 0;
14726 		fault_info->user_tag = VME_ALIAS(entry);
14727 		fault_info->pmap_options = 0;
14728 		if (entry->iokit_acct ||
14729 		    (!entry->is_sub_map && !entry->use_pmap)) {
14730 			fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14731 		}
14732 		fault_info->behavior = entry->behavior;
14733 		fault_info->lo_offset = VME_OFFSET(entry);
14734 		fault_info->hi_offset =
14735 		    (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14736 		fault_info->no_cache  = entry->no_cache;
14737 		fault_info->stealth = FALSE;
14738 		fault_info->io_sync = FALSE;
14739 		if (entry->used_for_jit ||
14740 #if CODE_SIGNING_MONITOR
14741 		    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
14742 #endif
14743 		    entry->vme_resilient_codesign) {
14744 			fault_info->cs_bypass = TRUE;
14745 		} else {
14746 			fault_info->cs_bypass = FALSE;
14747 		}
14748 		fault_info->csm_associated = FALSE;
14749 #if CODE_SIGNING_MONITOR
14750 		if (entry->csm_associated) {
14751 			/*
14752 			 * The pmap layer will validate this page
14753 			 * before allowing it to be executed from.
14754 			 */
14755 			fault_info->csm_associated = TRUE;
14756 		}
14757 #endif
14758 		fault_info->mark_zf_absent = FALSE;
14759 		fault_info->batch_pmap_op = FALSE;
14760 		fault_info->resilient_media = entry->vme_resilient_media;
14761 		fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
14762 		fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14763 #if __arm64e__
14764 		fault_info->fi_used_for_tpro = entry->used_for_tpro;
14765 #else /* __arm64e__ */
14766 		fault_info->fi_used_for_tpro = FALSE;
14767 #endif
14768 		if (entry->translated_allow_execute) {
14769 			fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14770 		}
14771 	}
14772 
14773 	/*
14774 	 *	Lock the object to prevent it from disappearing
14775 	 */
14776 	if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14777 		if (contended == NULL) {
14778 			vm_object_lock(*object);
14779 		} else {
14780 			*contended = vm_object_lock_check_contended(*object);
14781 		}
14782 	} else {
14783 		vm_object_lock_shared(*object);
14784 	}
14785 
14786 	/*
14787 	 *	Save the version number
14788 	 */
14789 
14790 	out_version->main_timestamp = map->timestamp;
14791 
14792 	return KERN_SUCCESS;
14793 }
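
/*
 * Illustrative caller pattern (sketch; setup and error paths elided):
 * fault handlers look the address up under the map lock, receive the
 * object locked, and keep the returned version so they can detect that
 * the map changed while the lock was dropped (see vm_map_verify below).
 *
 *	vm_map_version_t version;
 *	vm_object_t object;
 *	vm_object_offset_t offset;
 *	vm_prot_t prot;
 *	boolean_t wired;
 *	vm_map_t real_map;
 *	kern_return_t kr;
 *
 *	vm_map_lock_read(map);
 *	kr = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
 *	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot,
 *	    &wired, NULL, &real_map, NULL);
 *	if (kr == KERN_SUCCESS) {
 *		// ... resolve the page, then:
 *		vm_object_unlock(object);
 *		if (real_map != map) {
 *			vm_map_unlock(real_map);
 *		}
 *	}
 */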
14794 
14795 
14796 /*
14797  *	vm_map_verify:
14798  *
14799  *	Verifies that the map in question has not changed
14800  *	since the given version. The map has to be locked
14801  *	("shared" mode is fine) before calling this function
14802  *	and it will be returned locked too.
14803  */
14804 boolean_t
14805 vm_map_verify(
14806 	vm_map_t                map,
14807 	vm_map_version_t        *version)       /* REF */
14808 {
14809 	boolean_t       result;
14810 
14811 	vm_map_lock_assert_held(map);
14812 	result = (map->timestamp == version->main_timestamp);
14813 
14814 	return result;
14815 }
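
/*
 * Typical retry idiom (sketch; "RetryFault" is a hypothetical label):
 * after dropping the map lock to page in or zero-fill, re-take the lock
 * and verify before trusting any entry or object resolved under the old
 * version.
 *
 *	vm_map_lock_read(map);
 *	if (!vm_map_verify(map, &version)) {
 *		vm_map_unlock_read(map);
 *		goto RetryFault;	// map changed: redo the lookup
 *	}
 */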
14816 
14817 /*
14818  *	TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
14819  *	Goes away after regular vm_region_recurse function migrates to
14820  *	64 bits
14821  *	vm_region_recurse: A form of vm_region which follows the
14822  *	submaps in a target map
14823  *
14824  */
14825 
14826 kern_return_t
14827 vm_map_region_recurse_64(
14828 	vm_map_t                 map,
14829 	vm_map_offset_t *address,               /* IN/OUT */
14830 	vm_map_size_t           *size,                  /* OUT */
14831 	natural_t               *nesting_depth, /* IN/OUT */
14832 	vm_region_submap_info_64_t      submap_info,    /* IN/OUT */
14833 	mach_msg_type_number_t  *count) /* IN/OUT */
14834 {
14835 	mach_msg_type_number_t  original_count;
14836 	vm_region_extended_info_data_t  extended;
14837 	vm_map_entry_t                  tmp_entry;
14838 	vm_map_offset_t                 user_address;
14839 	unsigned int                    user_max_depth;
14840 
14841 	/*
14842 	 * "curr_entry" is the VM map entry preceding or including the
14843 	 * address we're looking for.
14844 	 * "curr_map" is the map or sub-map containing "curr_entry".
14845 	 * "curr_address" is the equivalent of the top map's "user_address"
14846 	 * in the current map.
14847 	 * "curr_offset" is the cumulative offset of "curr_map" in the
14848 	 * target task's address space.
14849 	 * "curr_depth" is the depth of "curr_map" in the chain of
14850 	 * sub-maps.
14851 	 *
14852 	 * "curr_max_below" and "curr_max_above" limit the range (around
14853 	 * "curr_address") we should take into account in the current (sub)map.
14854 	 * They limit the range to what's visible through the map entries
14855 	 * we've traversed from the top map to the current map.
14856 	 *
14857 	 */
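	/*
	 * Example (sketch): with *nesting_depth == 1 and an address inside
	 * the shared region, the walk below descends one level, leaving
	 * "curr_map" pointing at the shared-region submap, "curr_depth" at
	 * 1, and "curr_offset" translating the submap address back into
	 * the task's own address space.
	 */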
14858 	vm_map_entry_t                  curr_entry;
14859 	vm_map_address_t                curr_address;
14860 	vm_map_offset_t                 curr_offset;
14861 	vm_map_t                        curr_map;
14862 	unsigned int                    curr_depth;
14863 	vm_map_offset_t                 curr_max_below, curr_max_above;
14864 	vm_map_offset_t                 curr_skip;
14865 
14866 	/*
14867 	 * "next_" is the same as "curr_" but for the VM region immediately
14868 	 * after the address we're looking for.  We need to keep track of this
14869 	 * too because we want to return info about that region if the
14870 	 * address we're looking for is not mapped.
14871 	 */
14872 	vm_map_entry_t                  next_entry;
14873 	vm_map_offset_t                 next_offset;
14874 	vm_map_offset_t                 next_address;
14875 	vm_map_t                        next_map;
14876 	unsigned int                    next_depth;
14877 	vm_map_offset_t                 next_max_below, next_max_above;
14878 	vm_map_offset_t                 next_skip;
14879 
14880 	boolean_t                       look_for_pages;
14881 	vm_region_submap_short_info_64_t short_info;
14882 	boolean_t                       do_region_footprint;
14883 	int                             effective_page_size, effective_page_shift;
14884 	boolean_t                       submap_needed_copy;
14885 
14886 	if (map == VM_MAP_NULL) {
14887 		/* no address space to work on */
14888 		return KERN_INVALID_ARGUMENT;
14889 	}
14890 
14891 	effective_page_shift = vm_self_region_page_shift(map);
14892 	effective_page_size = (1 << effective_page_shift);
14893 
14894 	if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
14895 		/*
14896 		 * "info" structure is not big enough and
14897 		 * would overflow
14898 		 */
14899 		return KERN_INVALID_ARGUMENT;
14900 	}
14901 
14902 	do_region_footprint = task_self_region_footprint();
14903 	original_count = *count;
14904 
14905 	if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
14906 		*count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
14907 		look_for_pages = FALSE;
14908 		short_info = (vm_region_submap_short_info_64_t) submap_info;
14909 		submap_info = NULL;
14910 	} else {
14911 		look_for_pages = TRUE;
14912 		*count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
14913 		short_info = NULL;
14914 
14915 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
14916 			*count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
14917 		}
14918 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
14919 			*count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
14920 		}
14921 	}
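
	/*
	 * Example (sketch): a caller built against the V2 interface passes
	 * *count == VM_REGION_SUBMAP_INFO_V2_COUNT_64 and receives the
	 * fullest struct; a legacy caller passing less than the V0 count
	 * is served through the vm_region_submap_short_info_64_t layout
	 * instead, and no page-level scanning is done (look_for_pages is
	 * FALSE).
	 */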
14922 
14923 	user_address = *address;
14924 	user_max_depth = *nesting_depth;
14925 	submap_needed_copy = FALSE;
14926 
14927 	if (not_in_kdp) {
14928 		vm_map_lock_read(map);
14929 	}
14930 
14931 recurse_again:
14932 	curr_entry = NULL;
14933 	curr_map = map;
14934 	curr_address = user_address;
14935 	curr_offset = 0;
14936 	curr_skip = 0;
14937 	curr_depth = 0;
14938 	curr_max_above = ((vm_map_offset_t) -1) - curr_address;
14939 	curr_max_below = curr_address;
14940 
14941 	next_entry = NULL;
14942 	next_map = NULL;
14943 	next_address = 0;
14944 	next_offset = 0;
14945 	next_skip = 0;
14946 	next_depth = 0;
14947 	next_max_above = (vm_map_offset_t) -1;
14948 	next_max_below = (vm_map_offset_t) -1;
14949 
14950 	for (;;) {
14951 		if (vm_map_lookup_entry(curr_map,
14952 		    curr_address,
14953 		    &tmp_entry)) {
14954 			/* tmp_entry contains the address we're looking for */
14955 			curr_entry = tmp_entry;
14956 		} else {
14957 			vm_map_offset_t skip;
14958 			/*
14959 			 * The address is not mapped.  "tmp_entry" is the
14960 			 * map entry preceding the address.  We want the next
14961 			 * one, if it exists.
14962 			 */
14963 			curr_entry = tmp_entry->vme_next;
14964 
14965 			if (curr_entry == vm_map_to_entry(curr_map) ||
14966 			    (curr_entry->vme_start >=
14967 			    curr_address + curr_max_above)) {
14968 				/* no next entry at this level: stop looking */
14969 				if (not_in_kdp) {
14970 					vm_map_unlock_read(curr_map);
14971 				}
14972 				curr_entry = NULL;
14973 				curr_map = NULL;
14974 				curr_skip = 0;
14975 				curr_offset = 0;
14976 				curr_depth = 0;
14977 				curr_max_above = 0;
14978 				curr_max_below = 0;
14979 				break;
14980 			}
14981 
14982 			/* adjust current address and offset */
14983 			skip = curr_entry->vme_start - curr_address;
14984 			curr_address = curr_entry->vme_start;
14985 			curr_skip += skip;
14986 			curr_offset += skip;
14987 			curr_max_above -= skip;
14988 			curr_max_below = 0;
14989 		}
14990 
14991 		/*
14992 		 * Is the next entry at this level closer to the address (or
14993 		 * deeper in the submap chain) than the one we had
14994 		 * so far?
14995 		 */
14996 		tmp_entry = curr_entry->vme_next;
14997 		if (tmp_entry == vm_map_to_entry(curr_map)) {
14998 			/* no next entry at this level */
14999 		} else if (tmp_entry->vme_start >=
15000 		    curr_address + curr_max_above) {
15001 			/*
15002 			 * tmp_entry is beyond the scope of what we mapped of
15003 			 * this submap in the upper level: ignore it.
15004 			 */
15005 		} else if ((next_entry == NULL) ||
15006 		    (tmp_entry->vme_start + curr_offset <=
15007 		    next_entry->vme_start + next_offset)) {
15008 			/*
15009 			 * We didn't have a "next_entry" or this one is
15010 			 * closer to the address we're looking for:
15011 			 * use this "tmp_entry" as the new "next_entry".
15012 			 */
15013 			if (next_entry != NULL) {
15014 				/* unlock the last "next_map" */
15015 				if (next_map != curr_map && not_in_kdp) {
15016 					vm_map_unlock_read(next_map);
15017 				}
15018 			}
15019 			next_entry = tmp_entry;
15020 			next_map = curr_map;
15021 			next_depth = curr_depth;
15022 			next_address = next_entry->vme_start;
15023 			next_skip = curr_skip;
15024 			next_skip += (next_address - curr_address);
15025 			next_offset = curr_offset;
15026 			next_offset += (next_address - curr_address);
15027 			next_max_above = MIN(next_max_above, curr_max_above);
15028 			next_max_above = MIN(next_max_above,
15029 			    next_entry->vme_end - next_address);
15030 			next_max_below = MIN(next_max_below, curr_max_below);
15031 			next_max_below = MIN(next_max_below,
15032 			    next_address - next_entry->vme_start);
15033 		}
15034 
15035 		/*
15036 		 * "curr_max_{above,below}" allow us to keep track of the
15037 		 * portion of the submap that is actually mapped at this level:
15038 		 * the rest of that submap is irrelevant to us, since it's not
15039 		 * mapped here.
15040 		 * The relevant portion of the map starts at
15041 		 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
15042 		 */
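		/*
		 * For example (illustrative numbers): with a 0x2000-byte
		 * entry and "curr_address" 0x800 bytes into it, these MIN()
		 * clamps leave at most 0x1800 bytes above and 0x800 bytes
		 * below the address relevant at the next depth.
		 */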
15043 		curr_max_above = MIN(curr_max_above,
15044 		    curr_entry->vme_end - curr_address);
15045 		curr_max_below = MIN(curr_max_below,
15046 		    curr_address - curr_entry->vme_start);
15047 
15048 		if (!curr_entry->is_sub_map ||
15049 		    curr_depth >= user_max_depth) {
15050 			/*
15051 			 * We hit a leaf map or we reached the maximum depth
15052 			 * we could, so stop looking.  Keep the current map
15053 			 * locked.
15054 			 */
15055 			break;
15056 		}
15057 
15058 		/*
15059 		 * Get down to the next submap level.
15060 		 */
15061 
15062 		if (curr_entry->needs_copy) {
15063 			/* everything below this is effectively copy-on-write */
15064 			submap_needed_copy = TRUE;
15065 		}
15066 
15067 		/*
15068 		 * Lock the next level and unlock the current level,
15069 		 * unless we need to keep it locked to access the "next_entry"
15070 		 * later.
15071 		 */
15072 		if (not_in_kdp) {
15073 			vm_map_lock_read(VME_SUBMAP(curr_entry));
15074 		}
15075 		if (curr_map == next_map) {
15076 			/* keep "next_map" locked in case we need it */
15077 		} else {
15078 			/* release this map */
15079 			if (not_in_kdp) {
15080 				vm_map_unlock_read(curr_map);
15081 			}
15082 		}
15083 
15084 		/*
15085 		 * Adjust the offset.  "curr_entry" maps the submap
15086 		 * at relative address "curr_entry->vme_start" in the
15087 		 * curr_map but skips the first "VME_OFFSET(curr_entry)"
15088 		 * bytes of the submap.
15089 		 * "curr_offset" always represents the offset of a virtual
15090 		 * address in the curr_map relative to the absolute address
15091 		 * space (i.e. the top-level VM map).
15092 		 */
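		/*
		 * For example (illustrative numbers): a top-level entry at
		 * vme_start 0x1000 with VME_OFFSET 0x3000 grows "curr_offset"
		 * by 0x2000, so a "user_address" of 0x1500 gets looked up at
		 * 0x3500 in the submap.
		 */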
15093 		curr_offset +=
15094 		    (VME_OFFSET(curr_entry) - curr_entry->vme_start);
15095 		curr_address = user_address + curr_offset;
15096 		/* switch to the submap */
15097 		curr_map = VME_SUBMAP(curr_entry);
15098 		curr_depth++;
15099 		curr_entry = NULL;
15100 	}
15101 
15102 // LP64todo: all the current tools are 32-bit and obviously never worked for 64-bit,
15103 // so this should probably be a real 32-bit ID vs. a pointer.
15104 // Current users just check for equality.
15105 
15106 	if (curr_entry == NULL) {
15107 		/* no VM region contains the address... */
15108 
15109 		if (do_region_footprint && /* we want footprint numbers */
15110 		    next_entry == NULL && /* & there are no more regions */
15111 		    /* & we haven't already provided our fake region: */
15112 		    user_address <= vm_map_last_entry(map)->vme_end) {
15113 			ledger_amount_t ledger_resident, ledger_compressed;
15114 
15115 			/*
15116 			 * Add a fake memory region to account for
15117 			 * purgeable and/or ledger-tagged memory that
15118 			 * counts towards this task's memory footprint,
15119 			 * i.e. the resident/compressed pages of non-volatile
15120 			 * objects owned by that task.
15121 			 */
15122 			task_ledgers_footprint(map->pmap->ledger,
15123 			    &ledger_resident,
15124 			    &ledger_compressed);
15125 			if (ledger_resident + ledger_compressed == 0) {
15126 				/* no purgeable memory usage to report */
15127 				return KERN_INVALID_ADDRESS;
15128 			}
15129 			/* fake region to show nonvolatile footprint */
15130 			if (look_for_pages) {
15131 				submap_info->protection = VM_PROT_DEFAULT;
15132 				submap_info->max_protection = VM_PROT_DEFAULT;
15133 				submap_info->inheritance = VM_INHERIT_DEFAULT;
15134 				submap_info->offset = 0;
15135 				submap_info->user_tag = -1;
15136 				submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
15137 				submap_info->pages_shared_now_private = 0;
15138 				submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
15139 				submap_info->pages_dirtied = submap_info->pages_resident;
15140 				submap_info->ref_count = 1;
15141 				submap_info->shadow_depth = 0;
15142 				submap_info->external_pager = 0;
15143 				submap_info->share_mode = SM_PRIVATE;
15144 				if (submap_needed_copy) {
15145 					submap_info->share_mode = SM_COW;
15146 				}
15147 				submap_info->is_submap = 0;
15148 				submap_info->behavior = VM_BEHAVIOR_DEFAULT;
15149 				submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15150 				submap_info->user_wired_count = 0;
15151 				submap_info->pages_reusable = 0;
15152 			} else {
15153 				short_info->user_tag = -1;
15154 				short_info->offset = 0;
15155 				short_info->protection = VM_PROT_DEFAULT;
15156 				short_info->inheritance = VM_INHERIT_DEFAULT;
15157 				short_info->max_protection = VM_PROT_DEFAULT;
15158 				short_info->behavior = VM_BEHAVIOR_DEFAULT;
15159 				short_info->user_wired_count = 0;
15160 				short_info->is_submap = 0;
15161 				short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15162 				short_info->external_pager = 0;
15163 				short_info->shadow_depth = 0;
15164 				short_info->share_mode = SM_PRIVATE;
15165 				if (submap_needed_copy) {
15166 					short_info->share_mode = SM_COW;
15167 				}
15168 				short_info->ref_count = 1;
15169 			}
15170 			*nesting_depth = 0;
15171 			*size = (vm_map_size_t) (ledger_resident + ledger_compressed);
15172 //			*address = user_address;
15173 			*address = vm_map_last_entry(map)->vme_end;
15174 			return KERN_SUCCESS;
15175 		}
15176 
15177 		if (next_entry == NULL) {
15178 			/* ... and no VM region follows it either */
15179 			return KERN_INVALID_ADDRESS;
15180 		}
15181 		/* ... gather info about the next VM region */
15182 		curr_entry = next_entry;
15183 		curr_map = next_map;    /* still locked ... */
15184 		curr_address = next_address;
15185 		curr_skip = next_skip;
15186 		curr_offset = next_offset;
15187 		curr_depth = next_depth;
15188 		curr_max_above = next_max_above;
15189 		curr_max_below = next_max_below;
15190 	} else {
15191 		/* we won't need "next_entry" after all */
15192 		if (next_entry != NULL) {
15193 			/* release "next_map" */
15194 			if (next_map != curr_map && not_in_kdp) {
15195 				vm_map_unlock_read(next_map);
15196 			}
15197 		}
15198 	}
15199 	next_entry = NULL;
15200 	next_map = NULL;
15201 	next_offset = 0;
15202 	next_skip = 0;
15203 	next_depth = 0;
15204 	next_max_below = -1;
15205 	next_max_above = -1;
15206 
15207 	if (curr_entry->is_sub_map &&
15208 	    curr_depth < user_max_depth) {
15209 		/*
15210 		 * We're not as deep as we could be:  we must have
15211 		 * gone back up after not finding anything mapped
15212 		 * below the original top-level map entry's range.
15213 		 * Let's move "curr_address" forward and recurse again.
15214 		 */
15215 		user_address = curr_address;
15216 		goto recurse_again;
15217 	}
15218 
15219 	*nesting_depth = curr_depth;
15220 	*size = curr_max_above + curr_max_below;
15221 	*address = user_address + curr_skip - curr_max_below;
15222 
15223 	if (look_for_pages) {
15224 		submap_info->user_tag = VME_ALIAS(curr_entry);
15225 		submap_info->offset = VME_OFFSET(curr_entry);
15226 		submap_info->protection = curr_entry->protection;
15227 		submap_info->inheritance = curr_entry->inheritance;
15228 		submap_info->max_protection = curr_entry->max_protection;
15229 		submap_info->behavior = curr_entry->behavior;
15230 		submap_info->user_wired_count = curr_entry->user_wired_count;
15231 		submap_info->is_submap = curr_entry->is_sub_map;
15232 		if (curr_entry->is_sub_map) {
15233 			submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15234 		} else {
15235 			submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15236 		}
15237 	} else {
15238 		short_info->user_tag = VME_ALIAS(curr_entry);
15239 		short_info->offset = VME_OFFSET(curr_entry);
15240 		short_info->protection = curr_entry->protection;
15241 		short_info->inheritance = curr_entry->inheritance;
15242 		short_info->max_protection = curr_entry->max_protection;
15243 		short_info->behavior = curr_entry->behavior;
15244 		short_info->user_wired_count = curr_entry->user_wired_count;
15245 		short_info->is_submap = curr_entry->is_sub_map;
15246 		if (curr_entry->is_sub_map) {
15247 			short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15248 		} else {
15249 			short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15250 		}
15251 	}
15252 
15253 	extended.pages_resident = 0;
15254 	extended.pages_swapped_out = 0;
15255 	extended.pages_shared_now_private = 0;
15256 	extended.pages_dirtied = 0;
15257 	extended.pages_reusable = 0;
15258 	extended.external_pager = 0;
15259 	extended.shadow_depth = 0;
15260 	extended.share_mode = SM_EMPTY;
15261 	extended.ref_count = 0;
15262 
15263 	if (not_in_kdp) {
15264 		if (!curr_entry->is_sub_map) {
15265 			vm_map_offset_t range_start, range_end;
15266 			range_start = MAX((curr_address - curr_max_below),
15267 			    curr_entry->vme_start);
15268 			range_end = MIN((curr_address + curr_max_above),
15269 			    curr_entry->vme_end);
15270 			vm_map_region_walk(curr_map,
15271 			    range_start,
15272 			    curr_entry,
15273 			    (VME_OFFSET(curr_entry) +
15274 			    (range_start -
15275 			    curr_entry->vme_start)),
15276 			    range_end - range_start,
15277 			    &extended,
15278 			    look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
15279 			if (extended.external_pager &&
15280 			    extended.ref_count == 2 &&
15281 			    extended.share_mode == SM_SHARED) {
15282 				extended.share_mode = SM_PRIVATE;
15283 			}
15284 			if (submap_needed_copy) {
15285 				extended.share_mode = SM_COW;
15286 			}
15287 		} else {
15288 			if (curr_entry->use_pmap) {
15289 				extended.share_mode = SM_TRUESHARED;
15290 			} else {
15291 				extended.share_mode = SM_PRIVATE;
15292 			}
15293 			extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
15294 		}
15295 	}
15296 
15297 	if (look_for_pages) {
15298 		submap_info->pages_resident = extended.pages_resident;
15299 		submap_info->pages_swapped_out = extended.pages_swapped_out;
15300 		submap_info->pages_shared_now_private =
15301 		    extended.pages_shared_now_private;
15302 		submap_info->pages_dirtied = extended.pages_dirtied;
15303 		submap_info->external_pager = extended.external_pager;
15304 		submap_info->shadow_depth = extended.shadow_depth;
15305 		submap_info->share_mode = extended.share_mode;
15306 		submap_info->ref_count = extended.ref_count;
15307 
15308 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15309 			submap_info->pages_reusable = extended.pages_reusable;
15310 		}
15311 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15312 			if (curr_entry->is_sub_map) {
15313 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_SUBMAP(curr_entry));
15314 			} else if (VME_OBJECT(curr_entry)) {
15315 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_OBJECT(curr_entry));
15316 			} else {
15317 				submap_info->object_id_full = 0ull;
15318 			}
15319 		}
15320 	} else {
15321 		short_info->external_pager = extended.external_pager;
15322 		short_info->shadow_depth = extended.shadow_depth;
15323 		short_info->share_mode = extended.share_mode;
15324 		short_info->ref_count = extended.ref_count;
15325 	}
15326 
15327 	if (not_in_kdp) {
15328 		vm_map_unlock_read(curr_map);
15329 	}
15330 
15331 	return KERN_SUCCESS;
15332 }
15333 
15334 /*
15335  *	vm_region:
15336  *
15337  *	User call to obtain information about a region in
15338  *	a task's address map.  Several flavors of region
15339  *	information are supported.
15340  *
15341  *	XXX The reserved and behavior fields cannot be filled
15342  *	    in until the vm merge from the IK is completed, and
15343  *	    vm_reserve is implemented.
15344  */
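/*
 * Illustrative sketch (not part of this file): user space typically reaches
 * this routine through mach_vm_region(), along these lines:
 *
 *	mach_vm_address_t addr = 0;
 *	mach_vm_size_t size = 0;
 *	vm_region_basic_info_data_64_t info;
 *	mach_msg_type_number_t count = VM_REGION_BASIC_INFO_COUNT_64;
 *	mach_port_t object_name = MACH_PORT_NULL;
 *	kern_return_t kr = mach_vm_region(mach_task_self(), &addr, &size,
 *	    VM_REGION_BASIC_INFO_64, (vm_region_info_t)&info,
 *	    &count, &object_name);
 *
 * On success, "addr" and "size" describe the first region at or above the
 * requested address.
 */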
15345 
15346 kern_return_t
15347 vm_map_region(
15348 	vm_map_t                 map,
15349 	vm_map_offset_t *address,               /* IN/OUT */
15350 	vm_map_size_t           *size,                  /* OUT */
15351 	vm_region_flavor_t       flavor,                /* IN */
15352 	vm_region_info_t         info,                  /* OUT */
15353 	mach_msg_type_number_t  *count, /* IN/OUT */
15354 	mach_port_t             *object_name)           /* OUT */
15355 {
15356 	vm_map_entry_t          tmp_entry;
15357 	vm_map_entry_t          entry;
15358 	vm_map_offset_t         start;
15359 
15360 	if (map == VM_MAP_NULL) {
15361 		return KERN_INVALID_ARGUMENT;
15362 	}
15363 
15364 	switch (flavor) {
15365 	case VM_REGION_BASIC_INFO:
15366 		/* legacy for old 32-bit objects info */
15367 	{
15368 		vm_region_basic_info_t  basic;
15369 
15370 		if (*count < VM_REGION_BASIC_INFO_COUNT) {
15371 			return KERN_INVALID_ARGUMENT;
15372 		}
15373 
15374 		basic = (vm_region_basic_info_t) info;
15375 		*count = VM_REGION_BASIC_INFO_COUNT;
15376 
15377 		vm_map_lock_read(map);
15378 
15379 		start = *address;
15380 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15381 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15382 				vm_map_unlock_read(map);
15383 				return KERN_INVALID_ADDRESS;
15384 			}
15385 		} else {
15386 			entry = tmp_entry;
15387 		}
15388 
15389 		start = entry->vme_start;
15390 
15391 		basic->offset = (uint32_t)VME_OFFSET(entry);
15392 		basic->protection = entry->protection;
15393 		basic->inheritance = entry->inheritance;
15394 		basic->max_protection = entry->max_protection;
15395 		basic->behavior = entry->behavior;
15396 		basic->user_wired_count = entry->user_wired_count;
15397 		basic->reserved = entry->is_sub_map;
15398 		*address = start;
15399 		*size = (entry->vme_end - start);
15400 
15401 		if (object_name) {
15402 			*object_name = IP_NULL;
15403 		}
15404 		if (entry->is_sub_map) {
15405 			basic->shared = FALSE;
15406 		} else {
15407 			basic->shared = entry->is_shared;
15408 		}
15409 
15410 		vm_map_unlock_read(map);
15411 		return KERN_SUCCESS;
15412 	}
15413 
15414 	case VM_REGION_BASIC_INFO_64:
15415 	{
15416 		vm_region_basic_info_64_t       basic;
15417 
15418 		if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
15419 			return KERN_INVALID_ARGUMENT;
15420 		}
15421 
15422 		basic = (vm_region_basic_info_64_t) info;
15423 		*count = VM_REGION_BASIC_INFO_COUNT_64;
15424 
15425 		vm_map_lock_read(map);
15426 
15427 		start = *address;
15428 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15429 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15430 				vm_map_unlock_read(map);
15431 				return KERN_INVALID_ADDRESS;
15432 			}
15433 		} else {
15434 			entry = tmp_entry;
15435 		}
15436 
15437 		start = entry->vme_start;
15438 
15439 		basic->offset = VME_OFFSET(entry);
15440 		basic->protection = entry->protection;
15441 		basic->inheritance = entry->inheritance;
15442 		basic->max_protection = entry->max_protection;
15443 		basic->behavior = entry->behavior;
15444 		basic->user_wired_count = entry->user_wired_count;
15445 		basic->reserved = entry->is_sub_map;
15446 		*address = start;
15447 		*size = (entry->vme_end - start);
15448 
15449 		if (object_name) {
15450 			*object_name = IP_NULL;
15451 		}
15452 		if (entry->is_sub_map) {
15453 			basic->shared = FALSE;
15454 		} else {
15455 			basic->shared = entry->is_shared;
15456 		}
15457 
15458 		vm_map_unlock_read(map);
15459 		return KERN_SUCCESS;
15460 	}
15461 	case VM_REGION_EXTENDED_INFO:
15462 		if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
15463 			return KERN_INVALID_ARGUMENT;
15464 		}
15465 		OS_FALLTHROUGH;
15466 	case VM_REGION_EXTENDED_INFO__legacy:
15467 		if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
15468 			return KERN_INVALID_ARGUMENT;
15469 		}
15470 
15471 		{
15472 			vm_region_extended_info_t       extended;
15473 			mach_msg_type_number_t original_count;
15474 			int effective_page_size, effective_page_shift;
15475 
15476 			extended = (vm_region_extended_info_t) info;
15477 
15478 			effective_page_shift = vm_self_region_page_shift(map);
15479 			effective_page_size = (1 << effective_page_shift);
15480 
15481 			vm_map_lock_read(map);
15482 
15483 			start = *address;
15484 			if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15485 				if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15486 					vm_map_unlock_read(map);
15487 					return KERN_INVALID_ADDRESS;
15488 				}
15489 			} else {
15490 				entry = tmp_entry;
15491 			}
15492 			start = entry->vme_start;
15493 
15494 			extended->protection = entry->protection;
15495 			extended->user_tag = VME_ALIAS(entry);
15496 			extended->pages_resident = 0;
15497 			extended->pages_swapped_out = 0;
15498 			extended->pages_shared_now_private = 0;
15499 			extended->pages_dirtied = 0;
15500 			extended->external_pager = 0;
15501 			extended->shadow_depth = 0;
15502 
15503 			original_count = *count;
15504 			if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
15505 				*count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
15506 			} else {
15507 				extended->pages_reusable = 0;
15508 				*count = VM_REGION_EXTENDED_INFO_COUNT;
15509 			}
15510 
15511 			vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
15512 
15513 			if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
15514 				extended->share_mode = SM_PRIVATE;
15515 			}
15516 
15517 			if (object_name) {
15518 				*object_name = IP_NULL;
15519 			}
15520 			*address = start;
15521 			*size = (entry->vme_end - start);
15522 
15523 			vm_map_unlock_read(map);
15524 			return KERN_SUCCESS;
15525 		}
15526 	case VM_REGION_TOP_INFO:
15527 	{
15528 		vm_region_top_info_t    top;
15529 
15530 		if (*count < VM_REGION_TOP_INFO_COUNT) {
15531 			return KERN_INVALID_ARGUMENT;
15532 		}
15533 
15534 		top = (vm_region_top_info_t) info;
15535 		*count = VM_REGION_TOP_INFO_COUNT;
15536 
15537 		vm_map_lock_read(map);
15538 
15539 		start = *address;
15540 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15541 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15542 				vm_map_unlock_read(map);
15543 				return KERN_INVALID_ADDRESS;
15544 			}
15545 		} else {
15546 			entry = tmp_entry;
15547 		}
15548 		start = entry->vme_start;
15549 
15550 		top->private_pages_resident = 0;
15551 		top->shared_pages_resident = 0;
15552 
15553 		vm_map_region_top_walk(entry, top);
15554 
15555 		if (object_name) {
15556 			*object_name = IP_NULL;
15557 		}
15558 		*address = start;
15559 		*size = (entry->vme_end - start);
15560 
15561 		vm_map_unlock_read(map);
15562 		return KERN_SUCCESS;
15563 	}
15564 	default:
15565 		return KERN_INVALID_ARGUMENT;
15566 	}
15567 }
15568 
15569 #define OBJ_RESIDENT_COUNT(obj, entry_size)                             \
15570 	MIN((entry_size),                                               \
15571 	    ((obj)->all_reusable ?                                      \
15572 	     (obj)->wired_page_count :                                  \
15573 	     (obj)->resident_page_count - (obj)->reusable_page_count))
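/*
 * OBJ_RESIDENT_COUNT() reports how many of an object's pages count as
 * resident for a mapping of "entry_size" pages: for an all-reusable object
 * only the wired pages count; otherwise the resident pages minus the
 * reusable ones count, clamped to the size of the mapping.
 */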
15574 
15575 void
15576 vm_map_region_top_walk(
15577 	vm_map_entry_t             entry,
15578 	vm_region_top_info_t       top)
15579 {
15580 	if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
15581 		top->share_mode = SM_EMPTY;
15582 		top->ref_count = 0;
15583 		top->obj_id = 0;
15584 		return;
15585 	}
15586 
15587 	{
15588 		struct  vm_object *obj, *tmp_obj;
15589 		int             ref_count;
15590 		uint32_t        entry_size;
15591 
15592 		entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);
15593 
15594 		obj = VME_OBJECT(entry);
15595 
15596 		vm_object_lock(obj);
15597 
15598 		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15599 			ref_count--;
15600 		}
15601 
15602 		assert(obj->reusable_page_count <= obj->resident_page_count);
15603 		if (obj->shadow) {
15604 			if (ref_count == 1) {
15605 				top->private_pages_resident =
15606 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15607 			} else {
15608 				top->shared_pages_resident =
15609 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15610 			}
15611 			top->ref_count  = ref_count;
15612 			top->share_mode = SM_COW;
15613 
15614 			while ((tmp_obj = obj->shadow)) {
15615 				vm_object_lock(tmp_obj);
15616 				vm_object_unlock(obj);
15617 				obj = tmp_obj;
15618 
15619 				if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15620 					ref_count--;
15621 				}
15622 
15623 				assert(obj->reusable_page_count <= obj->resident_page_count);
15624 				top->shared_pages_resident +=
15625 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15626 				top->ref_count += ref_count - 1;
15627 			}
15628 		} else {
15629 			if (entry->superpage_size) {
15630 				top->share_mode = SM_LARGE_PAGE;
15631 				top->shared_pages_resident = 0;
15632 				top->private_pages_resident = entry_size;
15633 			} else if (entry->needs_copy) {
15634 				top->share_mode = SM_COW;
15635 				top->shared_pages_resident =
15636 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15637 			} else {
15638 				if (ref_count == 1 ||
15639 				    (ref_count == 2 && obj->named)) {
15640 					top->share_mode = SM_PRIVATE;
15641 					top->private_pages_resident =
15642 					    OBJ_RESIDENT_COUNT(obj,
15643 					    entry_size);
15644 				} else {
15645 					top->share_mode = SM_SHARED;
15646 					top->shared_pages_resident =
15647 					    OBJ_RESIDENT_COUNT(obj,
15648 					    entry_size);
15649 				}
15650 			}
15651 			top->ref_count = ref_count;
15652 		}
15653 
15654 		vm_object_unlock(obj);
15655 
15656 		/* XXX K64: obj_id will be truncated */
15657 		top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRHASH(obj);
15658 	}
15659 }
15660 
15661 void
15662 vm_map_region_walk(
15663 	vm_map_t                        map,
15664 	vm_map_offset_t                 va,
15665 	vm_map_entry_t                  entry,
15666 	vm_object_offset_t              offset,
15667 	vm_object_size_t                range,
15668 	vm_region_extended_info_t       extended,
15669 	boolean_t                       look_for_pages,
15670 	mach_msg_type_number_t count)
15671 {
15672 	struct vm_object *obj, *tmp_obj;
15673 	vm_map_offset_t       last_offset;
15674 	int               i;
15675 	int               ref_count;
15676 	struct vm_object        *shadow_object;
15677 	unsigned short          shadow_depth;
15678 	boolean_t         do_region_footprint;
15679 	int                     effective_page_size, effective_page_shift;
15680 	vm_map_offset_t         effective_page_mask;
15681 
15682 	do_region_footprint = task_self_region_footprint();
15683 
15684 	if ((entry->is_sub_map) ||
15685 	    (VME_OBJECT(entry) == 0) ||
15686 	    (VME_OBJECT(entry)->phys_contiguous &&
15687 	    !entry->superpage_size)) {
15688 		extended->share_mode = SM_EMPTY;
15689 		extended->ref_count = 0;
15690 		return;
15691 	}
15692 
15693 	if (entry->superpage_size) {
15694 		extended->shadow_depth = 0;
15695 		extended->share_mode = SM_LARGE_PAGE;
15696 		extended->ref_count = 1;
15697 		extended->external_pager = 0;
15698 
15699 		/* TODO4K: Superpage in 4k mode? */
15700 		extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
15702 		return;
15703 	}
15704 
15705 	effective_page_shift = vm_self_region_page_shift(map);
15706 	effective_page_size = (1 << effective_page_shift);
15707 	effective_page_mask = effective_page_size - 1;
15708 
15709 	offset = vm_map_trunc_page(offset, effective_page_mask);
15710 
15711 	obj = VME_OBJECT(entry);
15712 
15713 	vm_object_lock(obj);
15714 
15715 	if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15716 		ref_count--;
15717 	}
15718 
15719 	if (look_for_pages) {
15720 		for (last_offset = offset + range;
15721 		    offset < last_offset;
15722 		    offset += effective_page_size, va += effective_page_size) {
15723 			if (do_region_footprint) {
15724 				int disp;
15725 
15726 				disp = 0;
15727 				if (map->has_corpse_footprint) {
15728 					/*
15729 					 * Query the page info data we saved
15730 					 * while forking the corpse.
15731 					 */
15732 					vm_map_corpse_footprint_query_page_info(
15733 						map,
15734 						va,
15735 						&disp);
15736 				} else {
15737 					/*
15738 					 * Query the pmap.
15739 					 */
15740 					vm_map_footprint_query_page_info(
15741 						map,
15742 						entry,
15743 						va,
15744 						&disp);
15745 				}
15746 				if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
15747 					extended->pages_resident++;
15748 				}
15749 				if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
15750 					extended->pages_reusable++;
15751 				}
15752 				if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
15753 					extended->pages_dirtied++;
15754 				}
15755 				if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
15756 					extended->pages_swapped_out++;
15757 				}
15758 				continue;
15759 			}
15760 
15761 			vm_map_region_look_for_page(map, va, obj,
15762 			    vm_object_trunc_page(offset), ref_count,
15763 			    0, extended, count);
15764 		}
15765 
15766 		if (do_region_footprint) {
15767 			goto collect_object_info;
15768 		}
15769 	} else {
15770 collect_object_info:
15771 		shadow_object = obj->shadow;
15772 		shadow_depth = 0;
15773 
15774 		if (!(obj->internal)) {
15775 			extended->external_pager = 1;
15776 		}
15777 
15778 		if (shadow_object != VM_OBJECT_NULL) {
15779 			vm_object_lock(shadow_object);
15780 			for (;
15781 			    shadow_object != VM_OBJECT_NULL;
15782 			    shadow_depth++) {
15783 				vm_object_t     next_shadow;
15784 
15785 				if (!(shadow_object->internal)) {
15786 					extended->external_pager = 1;
15787 				}
15788 
15789 				next_shadow = shadow_object->shadow;
15790 				if (next_shadow) {
15791 					vm_object_lock(next_shadow);
15792 				}
15793 				vm_object_unlock(shadow_object);
15794 				shadow_object = next_shadow;
15795 			}
15796 		}
15797 		extended->shadow_depth = shadow_depth;
15798 	}
15799 
15800 	if (extended->shadow_depth || entry->needs_copy) {
15801 		extended->share_mode = SM_COW;
15802 	} else {
15803 		if (ref_count == 1) {
15804 			extended->share_mode = SM_PRIVATE;
15805 		} else {
15806 			if (obj->true_share) {
15807 				extended->share_mode = SM_TRUESHARED;
15808 			} else {
15809 				extended->share_mode = SM_SHARED;
15810 			}
15811 		}
15812 	}
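	/*
	 * In summary: a shadow chain or a pending copy-on-write reports
	 * SM_COW; a single reference reports SM_PRIVATE; otherwise the
	 * object is shared, as SM_TRUESHARED if it was marked "true_share"
	 * and SM_SHARED if not (possibly refined to an *_ALIASED mode
	 * below).
	 */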
15813 	extended->ref_count = ref_count - extended->shadow_depth;
15814 
15815 	for (i = 0; i < extended->shadow_depth; i++) {
15816 		if ((tmp_obj = obj->shadow) == 0) {
15817 			break;
15818 		}
15819 		vm_object_lock(tmp_obj);
15820 		vm_object_unlock(obj);
15821 
15822 		if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) {
15823 			ref_count--;
15824 		}
15825 
15826 		extended->ref_count += ref_count;
15827 		obj = tmp_obj;
15828 	}
15829 	vm_object_unlock(obj);
15830 
15831 	if (extended->share_mode == SM_SHARED) {
15832 		vm_map_entry_t       cur;
15833 		vm_map_entry_t       last;
15834 		int      my_refs;
15835 
15836 		obj = VME_OBJECT(entry);
15837 		last = vm_map_to_entry(map);
15838 		my_refs = 0;
15839 
15840 		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15841 			ref_count--;
15842 		}
15843 		for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
15844 			my_refs += vm_map_region_count_obj_refs(cur, obj);
15845 		}
15846 
15847 		if (my_refs == ref_count) {
15848 			extended->share_mode = SM_PRIVATE_ALIASED;
15849 		} else if (my_refs > 1) {
15850 			extended->share_mode = SM_SHARED_ALIASED;
15851 		}
15852 	}
15853 }
15854 
15855 
15856 /* object is locked on entry and locked on return */
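/*
 * Walk the shadow chain starting at "object" looking for the page at
 * "offset", updating the resident/dirty/reusable/swapped-out counters in
 * "extended" for the first level where the page (or its compressed copy)
 * is found.
 */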
15857 
15858 
15859 static void
15860 vm_map_region_look_for_page(
15861 	__unused vm_map_t               map,
15862 	__unused vm_map_offset_t        va,
15863 	vm_object_t                     object,
15864 	vm_object_offset_t              offset,
15865 	int                             max_refcnt,
15866 	unsigned short                  depth,
15867 	vm_region_extended_info_t       extended,
15868 	mach_msg_type_number_t count)
15869 {
15870 	vm_page_t       p;
15871 	vm_object_t     shadow;
15872 	int             ref_count;
15873 	vm_object_t     caller_object;
15874 
15875 	shadow = object->shadow;
15876 	caller_object = object;
15877 
15878 
15879 	while (TRUE) {
15880 		if (!(object->internal)) {
15881 			extended->external_pager = 1;
15882 		}
15883 
15884 		if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
15885 			if (shadow && (max_refcnt == 1)) {
15886 				extended->pages_shared_now_private++;
15887 			}
15888 
15889 			if (!p->vmp_fictitious &&
15890 			    (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
15891 				extended->pages_dirtied++;
15892 			} else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
15893 				if (p->vmp_reusable || object->all_reusable) {
15894 					extended->pages_reusable++;
15895 				}
15896 			}
15897 
15898 			extended->pages_resident++;
15899 
15900 			if (object != caller_object) {
15901 				vm_object_unlock(object);
15902 			}
15903 
15904 			return;
15905 		}
15906 		if (object->internal &&
15907 		    object->alive &&
15908 		    !object->terminating &&
15909 		    object->pager_ready) {
15910 			if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset)
15911 			    == VM_EXTERNAL_STATE_EXISTS) {
15912 				/* the pager has that page */
15913 				extended->pages_swapped_out++;
15914 				if (object != caller_object) {
15915 					vm_object_unlock(object);
15916 				}
15917 				return;
15918 			}
15919 		}
15920 
15921 		if (shadow) {
15922 			vm_object_lock(shadow);
15923 
15924 			if ((ref_count = shadow->ref_count) > 1 && shadow->paging_in_progress) {
15925 				ref_count--;
15926 			}
15927 
15928 			if (++depth > extended->shadow_depth) {
15929 				extended->shadow_depth = depth;
15930 			}
15931 
15932 			if (ref_count > max_refcnt) {
15933 				max_refcnt = ref_count;
15934 			}
15935 
15936 			if (object != caller_object) {
15937 				vm_object_unlock(object);
15938 			}
15939 
15940 			offset = offset + object->vo_shadow_offset;
15941 			object = shadow;
15942 			shadow = object->shadow;
15943 			continue;
15944 		}
15945 		if (object != caller_object) {
15946 			vm_object_unlock(object);
15947 		}
15948 		break;
15949 	}
15950 }
15951 
15952 static int
15953 vm_map_region_count_obj_refs(
15954 	vm_map_entry_t    entry,
15955 	vm_object_t       object)
15956 {
15957 	int ref_count;
15958 	vm_object_t chk_obj;
15959 	vm_object_t tmp_obj;
15960 
15961 	if (entry->is_sub_map || VME_OBJECT(entry) == VM_OBJECT_NULL) {
15962 		return 0;
15963 	}
15964 
15965 	ref_count = 0;
15966 	chk_obj = VME_OBJECT(entry);
15967 	vm_object_lock(chk_obj);
15968 
15969 	while (chk_obj) {
15970 		if (chk_obj == object) {
15971 			ref_count++;
15972 		}
15973 		tmp_obj = chk_obj->shadow;
15974 		if (tmp_obj) {
15975 			vm_object_lock(tmp_obj);
15976 		}
15977 		vm_object_unlock(chk_obj);
15978 
15979 		chk_obj = tmp_obj;
15980 	}
15981 
15982 	return ref_count;
15983 }
15984 
15985 
15986 /*
15987  *	Routine:	vm_map_simplify
15988  *
15989  *	Description:
15990  *		Attempt to simplify the map representation in
15991  *		the vicinity of the given starting address.
15992  *	Note:
15993  *		This routine is intended primarily to keep the
15994  *		kernel maps more compact -- they generally don't
15995  *		benefit from the "expand a map entry" technology
15996  *		at allocation time because the adjacent entry
15997  *		is often wired down.
15998  */
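/*
 *	For example: two adjacent entries [A,B) and [B,C) backed by the same
 *	object at contiguous offsets, with identical protection, inheritance,
 *	wiring and flags, are coalesced into a single entry [A,C).
 */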
15999 void
16000 vm_map_simplify_entry(
16001 	vm_map_t        map,
16002 	vm_map_entry_t  this_entry)
16003 {
16004 	vm_map_entry_t  prev_entry;
16005 
16006 	prev_entry = this_entry->vme_prev;
16007 
16008 	if ((this_entry != vm_map_to_entry(map)) &&
16009 	    (prev_entry != vm_map_to_entry(map)) &&
16010 
16011 	    (prev_entry->vme_end == this_entry->vme_start) &&
16012 
16013 	    (prev_entry->is_sub_map == this_entry->is_sub_map) &&
16014 	    (prev_entry->vme_object_value == this_entry->vme_object_value) &&
16015 	    (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
16016 	    ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
16017 	    prev_entry->vme_start))
16018 	    == VME_OFFSET(this_entry)) &&
16019 
16020 	    (prev_entry->behavior == this_entry->behavior) &&
16021 	    (prev_entry->needs_copy == this_entry->needs_copy) &&
16022 	    (prev_entry->protection == this_entry->protection) &&
16023 	    (prev_entry->max_protection == this_entry->max_protection) &&
16024 	    (prev_entry->inheritance == this_entry->inheritance) &&
16025 	    (prev_entry->use_pmap == this_entry->use_pmap) &&
16026 	    (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
16027 	    (prev_entry->no_cache == this_entry->no_cache) &&
16028 	    (prev_entry->vme_permanent == this_entry->vme_permanent) &&
16029 	    (prev_entry->map_aligned == this_entry->map_aligned) &&
16030 	    (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
16031 	    (prev_entry->used_for_jit == this_entry->used_for_jit) &&
16032 #if __arm64e__
16033 	    (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
16034 #endif
16035 	    (prev_entry->csm_associated == this_entry->csm_associated) &&
16036 	    (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
16037 	    (prev_entry->iokit_acct == this_entry->iokit_acct) &&
16038 	    (prev_entry->vme_resilient_codesign ==
16039 	    this_entry->vme_resilient_codesign) &&
16040 	    (prev_entry->vme_resilient_media ==
16041 	    this_entry->vme_resilient_media) &&
16042 	    (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
16043 	    (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&
16044 
16045 	    (prev_entry->wired_count == this_entry->wired_count) &&
16046 	    (prev_entry->user_wired_count == this_entry->user_wired_count) &&
16047 
16048 	    ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
16049 	    (prev_entry->in_transition == FALSE) &&
16050 	    (this_entry->in_transition == FALSE) &&
16051 	    (prev_entry->needs_wakeup == FALSE) &&
16052 	    (this_entry->needs_wakeup == FALSE) &&
16053 	    (prev_entry->is_shared == this_entry->is_shared) &&
16054 	    (prev_entry->superpage_size == FALSE) &&
16055 	    (this_entry->superpage_size == FALSE)
16056 	    ) {
16057 		if (prev_entry->vme_permanent) {
16058 			assert(this_entry->vme_permanent);
16059 			prev_entry->vme_permanent = false;
16060 		}
16061 		vm_map_store_entry_unlink(map, prev_entry, true);
16062 		assert(prev_entry->vme_start < this_entry->vme_end);
16063 		if (prev_entry->map_aligned) {
16064 			assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
16065 			    VM_MAP_PAGE_MASK(map)));
16066 		}
16067 		this_entry->vme_start = prev_entry->vme_start;
16068 		VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));
16069 
16070 		if (map->holelistenabled) {
16071 			vm_map_store_update_first_free(map, this_entry, TRUE);
16072 		}
16073 
16074 		if (prev_entry->is_sub_map) {
16075 			vm_map_deallocate(VME_SUBMAP(prev_entry));
16076 		} else {
16077 			vm_object_deallocate(VME_OBJECT(prev_entry));
16078 		}
16079 		vm_map_entry_dispose(prev_entry);
16080 		SAVE_HINT_MAP_WRITE(map, this_entry);
16081 	}
16082 }
16083 
16084 void
16085 vm_map_simplify(
16086 	vm_map_t        map,
16087 	vm_map_offset_t start)
16088 {
16089 	vm_map_entry_t  this_entry;
16090 
16091 	vm_map_lock(map);
16092 	if (vm_map_lookup_entry(map, start, &this_entry)) {
16093 		vm_map_simplify_entry(map, this_entry);
16094 		vm_map_simplify_entry(map, this_entry->vme_next);
16095 	}
16096 	vm_map_unlock(map);
16097 }
16098 
16099 static void
16100 vm_map_simplify_range(
16101 	vm_map_t        map,
16102 	vm_map_offset_t start,
16103 	vm_map_offset_t end)
16104 {
16105 	vm_map_entry_t  entry;
16106 
16107 	/*
16108 	 * The map should be locked (for "write") by the caller.
16109 	 */
16110 
16111 	if (start >= end) {
16112 		/* invalid address range */
16113 		return;
16114 	}
16115 
16116 	start = vm_map_trunc_page(start,
16117 	    VM_MAP_PAGE_MASK(map));
16118 	end = vm_map_round_page(end,
16119 	    VM_MAP_PAGE_MASK(map));
16120 
16121 	if (!vm_map_lookup_entry(map, start, &entry)) {
16122 		/* "start" is not mapped and "entry" ends before "start" */
16123 		if (entry == vm_map_to_entry(map)) {
16124 			/* start with first entry in the map */
16125 			entry = vm_map_first_entry(map);
16126 		} else {
16127 			/* start with next entry */
16128 			entry = entry->vme_next;
16129 		}
16130 	}
16131 
16132 	while (entry != vm_map_to_entry(map) &&
16133 	    entry->vme_start <= end) {
16134 		/* try and coalesce "entry" with its previous entry */
16135 		vm_map_simplify_entry(map, entry);
16136 		entry = entry->vme_next;
16137 	}
16138 }
16139 
16140 
16141 /*
16142  *	Routine:	vm_map_machine_attribute
16143  *	Purpose:
16144  *		Provide machine-specific attributes to mappings,
16145  *		such as cacheability, etc., for machines that provide
16146  *		them.  NUMA architectures and machines with big/strange
16147  *		caches will use this.
16148  *	Note:
16149  *		Responsibilities for locking and checking are handled here,
16150  *		Responsibilities for locking and checking are handled here;
16151  *		everything else is in the pmap module. If any non-volatile
16152  *		it itself. [This assumes that attributes do not
16153  *		need to be inherited, which seems ok to me]
16154  */
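/*
 * Illustrative sketch (not part of this file): a caller flushing the cache
 * for a range might do:
 *
 *	vm_machine_attribute_val_t val = MATTR_VAL_CACHE_FLUSH;
 *	kern_return_t kr = vm_map_machine_attribute(map, start, end,
 *	    MATTR_CACHE, &val);
 */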
16155 kern_return_t
16156 vm_map_machine_attribute(
16157 	vm_map_t                        map,
16158 	vm_map_offset_t         start,
16159 	vm_map_offset_t         end,
16160 	vm_machine_attribute_t  attribute,
16161 	vm_machine_attribute_val_t* value)              /* IN/OUT */
16162 {
16163 	kern_return_t   ret;
16164 	vm_map_size_t sync_size;
16165 	vm_map_entry_t entry;
16166 
16167 	if (start < vm_map_min(map) || end > vm_map_max(map)) {
16168 		return KERN_INVALID_ADDRESS;
16169 	}
16170 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16171 		return KERN_INVALID_ADDRESS;
16172 	}
16173 
16174 	/* Figure how much memory we need to flush (in page increments) */
16175 	sync_size = end - start;
16176 
16177 	vm_map_lock(map);
16178 
16179 	if (attribute != MATTR_CACHE) {
16180 		/* If we don't have to find physical addresses, we */
16181 		/* don't have to do an explicit traversal here.    */
16182 		ret = pmap_attribute(map->pmap, start, end - start,
16183 		    attribute, value);
16184 		vm_map_unlock(map);
16185 		return ret;
16186 	}
16187 
16188 	ret = KERN_SUCCESS;                                                                             /* Assume it all worked */
16189 
16190 	while (sync_size) {
16191 		if (vm_map_lookup_entry(map, start, &entry)) {
16192 			vm_map_size_t   sub_size;
16193 			if ((entry->vme_end - start) > sync_size) {
16194 				sub_size = sync_size;
16195 				sync_size = 0;
16196 			} else {
16197 				sub_size = entry->vme_end - start;
16198 				sync_size -= sub_size;
16199 			}
16200 			if (entry->is_sub_map) {
16201 				vm_map_offset_t sub_start;
16202 				vm_map_offset_t sub_end;
16203 
16204 				sub_start = (start - entry->vme_start)
16205 				    + VME_OFFSET(entry);
16206 				sub_end = sub_start + sub_size;
16207 				vm_map_machine_attribute(
16208 					VME_SUBMAP(entry),
16209 					sub_start,
16210 					sub_end,
16211 					attribute, value);
16212 			} else if (VME_OBJECT(entry)) {
16213 				vm_page_t               m;
16214 				vm_object_t             object;
16215 				vm_object_t             base_object;
16216 				vm_object_t             last_object;
16217 				vm_object_offset_t      offset;
16218 				vm_object_offset_t      base_offset;
16219 				vm_map_size_t           range;
16220 				range = sub_size;
16221 				offset = (start - entry->vme_start)
16222 				    + VME_OFFSET(entry);
16223 				offset = vm_object_trunc_page(offset);
16224 				base_offset = offset;
16225 				object = VME_OBJECT(entry);
16226 				base_object = object;
16227 				last_object = NULL;
16228 
16229 				vm_object_lock(object);
16230 
16231 				while (range) {
16232 					m = vm_page_lookup(
16233 						object, offset);
16234 
16235 					if (m && !m->vmp_fictitious) {
16236 						ret =
16237 						    pmap_attribute_cache_sync(
16238 							VM_PAGE_GET_PHYS_PAGE(m),
16239 							PAGE_SIZE,
16240 							attribute, value);
16241 					} else if (object->shadow) {
16242 						offset = offset + object->vo_shadow_offset;
16243 						last_object = object;
16244 						object = object->shadow;
16245 						vm_object_lock(last_object->shadow);
16246 						vm_object_unlock(last_object);
16247 						continue;
16248 					}
16249 					if (range < PAGE_SIZE) {
16250 						range = 0;
16251 					} else {
16252 						range -= PAGE_SIZE;
16253 					}
16254 
16255 					if (base_object != object) {
16256 						vm_object_unlock(object);
16257 						vm_object_lock(base_object);
16258 						object = base_object;
16259 					}
16260 					/* Bump to the next page */
16261 					base_offset += PAGE_SIZE;
16262 					offset = base_offset;
16263 				}
16264 				vm_object_unlock(object);
16265 			}
16266 			start += sub_size;
16267 		} else {
16268 			vm_map_unlock(map);
16269 			return KERN_FAILURE;
16270 		}
16271 	}
16272 
16273 	vm_map_unlock(map);
16274 
16275 	return ret;
16276 }
16277 
16278 /*
16279  *	vm_map_behavior_set:
16280  *
16281  *	Sets the paging reference behavior of the specified address
16282  *	range in the target map.  Paging reference behavior affects
16283  *	how pagein operations resulting from faults on the map will be
16284  *	clustered.
16285  */
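/*
 * This is the kernel side of madvise(2)-style advice: for instance, the BSD
 * layer maps MADV_SEQUENTIAL to VM_BEHAVIOR_SEQUENTIAL, MADV_WILLNEED to
 * VM_BEHAVIOR_WILLNEED and MADV_FREE_REUSABLE to VM_BEHAVIOR_REUSABLE (the
 * exact mapping lives outside this file).
 */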
16286 kern_return_t
16287 vm_map_behavior_set(
16288 	vm_map_t        map,
16289 	vm_map_offset_t start,
16290 	vm_map_offset_t end,
16291 	vm_behavior_t   new_behavior)
16292 {
16293 	vm_map_entry_t  entry;
16294 	vm_map_entry_t  temp_entry;
16295 
16296 	if (start > end ||
16297 	    start < vm_map_min(map) ||
16298 	    end > vm_map_max(map)) {
16299 		return KERN_NO_SPACE;
16300 	}
16301 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16302 		return KERN_INVALID_ADDRESS;
16303 	}
16304 
16305 	switch (new_behavior) {
16306 	/*
16307 	 * This first block of behaviors all set a persistent state on the specified
16308 	 * memory range.  All we have to do here is to record the desired behavior
16309 	 * in the vm_map_entry_t's.
16310 	 */
16311 
16312 	case VM_BEHAVIOR_DEFAULT:
16313 	case VM_BEHAVIOR_RANDOM:
16314 	case VM_BEHAVIOR_SEQUENTIAL:
16315 	case VM_BEHAVIOR_RSEQNTL:
16316 	case VM_BEHAVIOR_ZERO_WIRED_PAGES:
16317 		vm_map_lock(map);
16318 
16319 		/*
16320 		 *	The entire address range must be valid for the map.
16321 		 *      Note that vm_map_range_check() does a
16322 		 *	vm_map_lookup_entry() internally and returns the
16323 		 *	entry containing the start of the address range if
16324 		 *	the entire range is valid.
16325 		 */
16326 		if (vm_map_range_check(map, start, end, &temp_entry)) {
16327 			entry = temp_entry;
16328 			vm_map_clip_start(map, entry, start);
16329 		} else {
16330 			vm_map_unlock(map);
16331 			return KERN_INVALID_ADDRESS;
16332 		}
16333 
16334 		while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
16335 			vm_map_clip_end(map, entry, end);
16336 			if (entry->is_sub_map) {
16337 				assert(!entry->use_pmap);
16338 			}
16339 
16340 			if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16341 				entry->zero_wired_pages = TRUE;
16342 			} else {
16343 				entry->behavior = new_behavior;
16344 			}
16345 			entry = entry->vme_next;
16346 		}
16347 
16348 		vm_map_unlock(map);
16349 		break;
16350 
16351 	/*
16352 	 * The rest of these are different from the above in that they cause
16353 	 * an immediate action to take place as opposed to setting a behavior that
16354 	 * affects future actions.
16355 	 */
16356 
16357 	case VM_BEHAVIOR_WILLNEED:
16358 		return vm_map_willneed(map, start, end);
16359 
16360 	case VM_BEHAVIOR_DONTNEED:
16361 		return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
16362 
16363 	case VM_BEHAVIOR_FREE:
16364 		return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
16365 
16366 	case VM_BEHAVIOR_REUSABLE:
16367 		return vm_map_reusable_pages(map, start, end);
16368 
16369 	case VM_BEHAVIOR_REUSE:
16370 		return vm_map_reuse_pages(map, start, end);
16371 
16372 	case VM_BEHAVIOR_CAN_REUSE:
16373 		return vm_map_can_reuse(map, start, end);
16374 
16375 #if MACH_ASSERT
16376 	case VM_BEHAVIOR_PAGEOUT:
16377 		return vm_map_pageout(map, start, end);
16378 #endif /* MACH_ASSERT */
16379 
16380 	case VM_BEHAVIOR_ZERO:
16381 		return vm_map_zero(map, start, end);
16382 
16383 	default:
16384 		return KERN_INVALID_ARGUMENT;
16385 	}
16386 
16387 	return KERN_SUCCESS;
16388 }
16389 
16390 
16391 /*
16392  * Internals for madvise(MADV_WILLNEED) system call.
16393  *
16394  * The implementation is to:
16395  * a) read ahead if the mapping corresponds to a mapped regular file, or
16396  * b) fault in the pages (zero-fill, decompress, etc.) if it's an anonymous mapping.
16397  */
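/*
 * Illustrative sketch (not part of this file): this is typically reached
 * via madvise(addr, len, MADV_WILLNEED), with [addr, addr + len) required
 * to be fully mapped.
 */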
16398 
16399 
16400 static kern_return_t
16401 vm_map_willneed(
16402 	vm_map_t        map,
16403 	vm_map_offset_t start,
16404 	vm_map_offset_t end
16405 	)
16406 {
16407 	vm_map_entry_t                  entry;
16408 	vm_object_t                     object;
16409 	memory_object_t                 pager;
16410 	struct vm_object_fault_info     fault_info = {};
16411 	kern_return_t                   kr;
16412 	vm_object_size_t                len;
16413 	vm_object_offset_t              offset;
16414 
16415 	fault_info.interruptible = THREAD_UNINT;        /* ignored value */
16416 	fault_info.behavior      = VM_BEHAVIOR_SEQUENTIAL;
16417 	fault_info.stealth       = TRUE;
16418 
16419 	/*
16420 	 * The MADV_WILLNEED operation doesn't require any changes to the
16421 	 * vm_map_entry_t's, so the read lock is sufficient.
16422 	 */
16423 
16424 	vm_map_lock_read(map);
16425 
16426 	/*
16427 	 * The madvise semantics require that the address range be fully
16428 	 * allocated with no holes.  Otherwise, we're required to return
16429 	 * an error.
16430 	 */
16431 
16432 	if (!vm_map_range_check(map, start, end, &entry)) {
16433 		vm_map_unlock_read(map);
16434 		return KERN_INVALID_ADDRESS;
16435 	}
16436 
16437 	/*
16438 	 * Examine each vm_map_entry_t in the range.
16439 	 */
16440 	for (; entry != vm_map_to_entry(map) && start < end;) {
16441 		/*
16442 		 * The first time through, the start address could be anywhere
16443 		 * within the vm_map_entry we found.  So adjust the offset to
16444 		 * correspond.  After that, the offset will always be zero to
16445 		 * correspond to the beginning of the current vm_map_entry.
16446 		 */
16447 		offset = (start - entry->vme_start) + VME_OFFSET(entry);
16448 
16449 		/*
16450 		 * Set the length so we don't go beyond the end of the
16451 		 * map_entry or beyond the end of the range we were given.
16452 		 * This range could also span multiple map entries, all of which
16453 		 * map different files, so make sure we only do the right amount
16454 		 * of I/O for each object.  Note that it's possible for there
16455 		 * to be multiple map entries all referring to the same object
16456 		 * but with different page permissions, but it's not worth
16457 		 * trying to optimize that case.
16458 		 */
16459 		len = MIN(entry->vme_end - start, end - start);
16460 
16461 		if ((vm_size_t) len != len) {
16462 			/* 32-bit overflow */
16463 			len = (vm_size_t) (0 - PAGE_SIZE);
16464 		}
16465 		fault_info.cluster_size = (vm_size_t) len;
16466 		fault_info.lo_offset    = offset;
16467 		fault_info.hi_offset    = offset + len;
16468 		fault_info.user_tag     = VME_ALIAS(entry);
16469 		fault_info.pmap_options = 0;
16470 		if (entry->iokit_acct ||
16471 		    (!entry->is_sub_map && !entry->use_pmap)) {
16472 			fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
16473 		}
16474 		fault_info.fi_xnu_user_debug = entry->vme_xnu_user_debug;
16475 
16476 		/*
16477 		 * If the entry is a submap OR there's no read permission
16478 		 * to this mapping, then just skip it.
16479 		 */
16480 		if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
16481 			entry = entry->vme_next;
16482 			start = entry->vme_start;
16483 			continue;
16484 		}
16485 
16486 		object = VME_OBJECT(entry);
16487 
16488 		if (object == NULL ||
16489 		    (object && object->internal)) {
16490 			/*
16491 			 * Memory range backed by anonymous memory.
16492 			 */
16493 			vm_size_t region_size = 0, effective_page_size = 0;
16494 			vm_map_offset_t addr = 0, effective_page_mask = 0;
16495 
16496 			region_size = len;
16497 			addr = start;
16498 
16499 			effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
16500 			effective_page_size = effective_page_mask + 1;
16501 
16502 			vm_map_unlock_read(map);
16503 
16504 			while (region_size) {
16505 				vm_pre_fault(
16506 					vm_map_trunc_page(addr, effective_page_mask),
16507 					VM_PROT_READ | VM_PROT_WRITE);
16508 
16509 				region_size -= effective_page_size;
16510 				addr += effective_page_size;
16511 			}
16512 		} else {
16513 			/*
16514 			 * Find the file object backing this map entry.  If there is
16515 			 * none, then we simply ignore the "will need" advice for this
16516 			 * entry and go on to the next one.
16517 			 */
16518 			if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
16519 				entry = entry->vme_next;
16520 				start = entry->vme_start;
16521 				continue;
16522 			}
16523 
16524 			vm_object_paging_begin(object);
16525 			pager = object->pager;
16526 			vm_object_unlock(object);
16527 
16528 			/*
16529 			 * The data_request() could take a long time, so let's
16530 			 * release the map lock to avoid blocking other threads.
16531 			 */
16532 			vm_map_unlock_read(map);
16533 
16534 			/*
16535 			 * Get the data from the object asynchronously.
16536 			 *
16537 			 * Note that memory_object_data_request() places limits on the
16538 			 * amount of I/O it will do.  Regardless of the len we
16539 			 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
16540 			 * silently truncates the len to that size.  This isn't
16541 			 * necessarily bad since madvise shouldn't really be used to
16542 			 * page in unlimited amounts of data.  Other Unix variants
16543 			 * limit the willneed case as well.  If this turns out to be an
16544 			 * issue for developers, then we can always adjust the policy
16545 			 * here and still be backwards compatible since this is all
16546 			 * just "advice".
16547 			 */
16548 			kr = memory_object_data_request(
16549 				pager,
16550 				vm_object_trunc_page(offset) + object->paging_offset,
16551 				0,      /* ignored */
16552 				VM_PROT_READ,
16553 				(memory_object_fault_info_t)&fault_info);
16554 
16555 			vm_object_lock(object);
16556 			vm_object_paging_end(object);
16557 			vm_object_unlock(object);
16558 
16559 			/*
16560 			 * If we couldn't do the I/O for some reason, just give up on
16561 			 * the madvise.  We still return success to the user since
16562 			 * madvise isn't supposed to fail when the advice can't be
16563 			 * taken.
16564 			 */
16565 
16566 			if (kr != KERN_SUCCESS) {
16567 				return KERN_SUCCESS;
16568 			}
16569 		}
16570 
16571 		start += len;
16572 		if (start >= end) {
16573 			/* done */
16574 			return KERN_SUCCESS;
16575 		}
16576 
16577 		/* look up next entry */
16578 		vm_map_lock_read(map);
16579 		if (!vm_map_lookup_entry(map, start, &entry)) {
16580 			/*
16581 			 * There's a new hole in the address range.
16582 			 */
16583 			vm_map_unlock_read(map);
16584 			return KERN_INVALID_ADDRESS;
16585 		}
16586 	}
16587 
16588 	vm_map_unlock_read(map);
16589 	return KERN_SUCCESS;
16590 }
16591 
16592 static boolean_t
16593 vm_map_entry_is_reusable(
16594 	vm_map_entry_t entry)
16595 {
16596 	/* Only user map entries */
16597 
16598 	vm_object_t object;
16599 
16600 	if (entry->is_sub_map) {
16601 		return FALSE;
16602 	}
16603 
16604 	switch (VME_ALIAS(entry)) {
16605 	case VM_MEMORY_MALLOC:
16606 	case VM_MEMORY_MALLOC_SMALL:
16607 	case VM_MEMORY_MALLOC_LARGE:
16608 	case VM_MEMORY_REALLOC:
16609 	case VM_MEMORY_MALLOC_TINY:
16610 	case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16611 	case VM_MEMORY_MALLOC_LARGE_REUSED:
16612 		/*
16613 		 * This is a malloc() memory region: check if it's still
16614 		 * in its original state and can be re-used for more
16615 		 * malloc() allocations.
16616 		 */
16617 		break;
16618 	default:
16619 		/*
16620 		 * Not a malloc() memory region: let the caller decide if
16621 		 * it's re-usable.
16622 		 */
16623 		return TRUE;
16624 	}
16625 
16626 	if (/*entry->is_shared ||*/
16627 		entry->is_sub_map ||
16628 		entry->in_transition ||
16629 		entry->protection != VM_PROT_DEFAULT ||
16630 		entry->max_protection != VM_PROT_ALL ||
16631 		entry->inheritance != VM_INHERIT_DEFAULT ||
16632 		entry->no_cache ||
16633 		entry->vme_permanent ||
16634 		entry->superpage_size != FALSE ||
16635 		entry->zero_wired_pages ||
16636 		entry->wired_count != 0 ||
16637 		entry->user_wired_count != 0) {
16638 		return FALSE;
16639 	}
16640 
16641 	object = VME_OBJECT(entry);
16642 	if (object == VM_OBJECT_NULL) {
16643 		return TRUE;
16644 	}
16645 	if (
16646 #if 0
16647 		/*
16648 		 * Let's proceed even if the VM object is potentially
16649 		 * shared.
16650 		 * We check for this later when processing the actual
16651 		 * VM pages, so the contents will be safe if shared.
16652 		 *
16653 		 * But we can still mark this memory region as "reusable" to
16654 		 * acknowledge that the caller did let us know that the memory
16655 		 * could be re-used and should not be penalized for holding
16656 		 * on to it.  This allows its "resident size" to not include
16657 		 * the reusable range.
16658 		 */
16659 		object->ref_count == 1 &&
16660 #endif
16661 		object->vo_copy == VM_OBJECT_NULL &&
16662 		object->shadow == VM_OBJECT_NULL &&
16663 		object->internal &&
16664 		object->purgable == VM_PURGABLE_DENY &&
16665 		object->wimg_bits == VM_WIMG_USE_DEFAULT &&
16666 		!object->code_signed) {
16667 		return TRUE;
16668 	}
16669 	return FALSE;
16670 }
16671 
16672 static kern_return_t
16673 vm_map_reuse_pages(
16674 	vm_map_t        map,
16675 	vm_map_offset_t start,
16676 	vm_map_offset_t end)
16677 {
16678 	vm_map_entry_t                  entry;
16679 	vm_object_t                     object;
16680 	vm_object_offset_t              start_offset, end_offset;
16681 
16682 	/*
16683 	 * The MADV_REUSE operation doesn't require any changes to the
16684 	 * vm_map_entry_t's, so the read lock is sufficient.
16685 	 */
16686 
16687 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16688 		/*
16689 		 * XXX TODO4K
16690 		 * need to figure out what reusable means for a
16691 		 * portion of a native page.
16692 		 */
16693 		return KERN_SUCCESS;
16694 	}
16695 
16696 	vm_map_lock_read(map);
16697 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16698 
16699 	/*
16700 	 * The madvise semantics require that the address range be fully
16701 	 * allocated with no holes.  Otherwise, we're required to return
16702 	 * an error.
16703 	 */
16704 
16705 	if (!vm_map_range_check(map, start, end, &entry)) {
16706 		vm_map_unlock_read(map);
16707 		vm_page_stats_reusable.reuse_pages_failure++;
16708 		return KERN_INVALID_ADDRESS;
16709 	}
16710 
16711 	/*
16712 	 * Examine each vm_map_entry_t in the range.
16713 	 */
16714 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16715 	    entry = entry->vme_next) {
16716 		/*
16717 		 * Sanity check on the VM map entry.
16718 		 */
16719 		if (!vm_map_entry_is_reusable(entry)) {
16720 			vm_map_unlock_read(map);
16721 			vm_page_stats_reusable.reuse_pages_failure++;
16722 			return KERN_INVALID_ADDRESS;
16723 		}
16724 
16725 		/*
16726 		 * The first time through, the start address could be anywhere
16727 		 * within the vm_map_entry we found.  So adjust the offset to
16728 		 * correspond.
16729 		 */
16730 		if (entry->vme_start < start) {
16731 			start_offset = start - entry->vme_start;
16732 		} else {
16733 			start_offset = 0;
16734 		}
16735 		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16736 		start_offset += VME_OFFSET(entry);
16737 		end_offset += VME_OFFSET(entry);
16738 
16739 		object = VME_OBJECT(entry);
16740 		if (object != VM_OBJECT_NULL) {
16741 			vm_object_lock(object);
16742 			vm_object_reuse_pages(object, start_offset, end_offset,
16743 			    TRUE);
16744 			vm_object_unlock(object);
16745 		}
16746 
16747 		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
16748 			/*
16749 			 * XXX
16750 			 * We do not hold the VM map exclusively here.
16751 			 * The "alias" field is not that critical, so it's
16752 			 * safe to update it here, as long as it is the only
16753 			 * one that can be modified while holding the VM map
16754 			 * "shared".
16755 			 */
16756 			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
16757 		}
16758 	}
16759 
16760 	vm_map_unlock_read(map);
16761 	vm_page_stats_reusable.reuse_pages_success++;
16762 	return KERN_SUCCESS;
16763 }
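
/*
 * Illustration (not part of this file): user space reaches this routine via
 * madvise(MADV_FREE_REUSE), normally after a prior MADV_FREE_REUSABLE on the
 * same range.  A sketch of that lifecycle; the exact errno for a hole in the
 * range is chosen by the BSD madvise() front end, not by this file:
 */
#if 0
#include <sys/mman.h>

static void
reuse_lifecycle(void *addr, size_t size)
{
	/* Tell the VM the contents are disposable... */
	(void)madvise(addr, size, MADV_FREE_REUSABLE);
	/*
	 * ...and reclaim the range before touching it again.  A hole
	 * anywhere in [addr, addr + size) makes either call fail, per the
	 * madvise semantics enforced by vm_map_range_check() above.
	 */
	(void)madvise(addr, size, MADV_FREE_REUSE);
}
#endif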
16764 
16765 
16766 static kern_return_t
16767 vm_map_reusable_pages(
16768 	vm_map_t        map,
16769 	vm_map_offset_t start,
16770 	vm_map_offset_t end)
16771 {
16772 	vm_map_entry_t                  entry;
16773 	vm_object_t                     object;
16774 	vm_object_offset_t              start_offset, end_offset;
16775 	vm_map_offset_t                 pmap_offset;
16776 
16777 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16778 		/*
16779 		 * XXX TODO4K
16780 		 * need to figure out what reusable means for a portion
16781 		 * of a native page.
16782 		 */
16783 		return KERN_SUCCESS;
16784 	}
16785 
16786 	/*
16787 	 * The MADV_REUSABLE operation doesn't require any changes to the
16788 	 * vm_map_entry_t's, so the read lock is sufficient.
16789 	 */
16790 
16791 	vm_map_lock_read(map);
16792 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16793 
16794 	/*
16795 	 * The madvise semantics require that the address range be fully
16796 	 * allocated with no holes.  Otherwise, we're required to return
16797 	 * an error.
16798 	 */
16799 
16800 	if (!vm_map_range_check(map, start, end, &entry)) {
16801 		vm_map_unlock_read(map);
16802 		vm_page_stats_reusable.reusable_pages_failure++;
16803 		return KERN_INVALID_ADDRESS;
16804 	}
16805 
16806 	/*
16807 	 * Examine each vm_map_entry_t in the range.
16808 	 */
16809 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16810 	    entry = entry->vme_next) {
16811 		int kill_pages = 0;
16812 		boolean_t reusable_no_write = FALSE;
16813 
16814 		/*
16815 		 * Sanity check on the VM map entry.
16816 		 */
16817 		if (!vm_map_entry_is_reusable(entry)) {
16818 			vm_map_unlock_read(map);
16819 			vm_page_stats_reusable.reusable_pages_failure++;
16820 			return KERN_INVALID_ADDRESS;
16821 		}
16822 
16823 		if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
16824 #if __arm64e__
16825 		    && !entry->used_for_tpro
16826 #endif
16827 		    ) {
16828 			/* not writable: can't discard contents */
16829 			vm_map_unlock_read(map);
16830 			vm_page_stats_reusable.reusable_nonwritable++;
16831 			vm_page_stats_reusable.reusable_pages_failure++;
16832 			return KERN_PROTECTION_FAILURE;
16833 		}
16834 
16835 		/*
16836 		 * The first time through, the start address could be anywhere
16837 		 * within the vm_map_entry we found.  So adjust the offset to
16838 		 * correspond.
16839 		 */
16840 		if (entry->vme_start < start) {
16841 			start_offset = start - entry->vme_start;
16842 			pmap_offset = start;
16843 		} else {
16844 			start_offset = 0;
16845 			pmap_offset = entry->vme_start;
16846 		}
16847 		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16848 		start_offset += VME_OFFSET(entry);
16849 		end_offset += VME_OFFSET(entry);
16850 
16851 		object = VME_OBJECT(entry);
16852 		if (object == VM_OBJECT_NULL) {
16853 			continue;
16854 		}
16855 
16856 		if (entry->protection & VM_PROT_EXECUTE) {
16857 			/*
16858 			 * Executable mappings might be write-protected by
16859 			 * hardware, so do not attempt to write to these pages.
16860 			 */
16861 			reusable_no_write = TRUE;
16862 		}
16863 
16864 		if (entry->vme_xnu_user_debug) {
16865 			/*
16866 			 * User debug pages might be write-protected by hardware,
16867 			 * so do not attempt to write to these pages.
16868 			 */
16869 			reusable_no_write = TRUE;
16870 		}
16871 
16872 		vm_object_lock(object);
16873 		if (((object->ref_count == 1) ||
16874 		    (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
16875 		    object->vo_copy == VM_OBJECT_NULL)) &&
16876 		    object->shadow == VM_OBJECT_NULL &&
16877 		    /*
16878 		     * "iokit_acct" entries are billed for their virtual size
16879 		     * (rather than for their resident pages only), so they
16880 		     * wouldn't benefit from making pages reusable, and it
16881 		     * would be hard to keep track of pages that are both
16882 		     * "iokit_acct" and "reusable" in the pmap stats and
16883 		     * ledgers.
16884 		     */
16885 		    !(entry->iokit_acct ||
16886 		    (!entry->is_sub_map && !entry->use_pmap))) {
16887 			if (object->ref_count != 1) {
16888 				vm_page_stats_reusable.reusable_shared++;
16889 			}
16890 			kill_pages = 1;
16891 		} else {
16892 			kill_pages = -1;
16893 		}
16894 		if (kill_pages != -1) {
16895 			vm_object_deactivate_pages(object,
16896 			    start_offset,
16897 			    end_offset - start_offset,
16898 			    kill_pages,
16899 			    TRUE /*reusable_pages*/,
16900 			    reusable_no_write,
16901 			    map->pmap,
16902 			    pmap_offset);
16903 		} else {
16904 			vm_page_stats_reusable.reusable_pages_shared++;
16905 			DTRACE_VM4(vm_map_reusable_pages_shared,
16906 			    unsigned int, VME_ALIAS(entry),
16907 			    vm_map_t, map,
16908 			    vm_map_entry_t, entry,
16909 			    vm_object_t, object);
16910 		}
16911 		vm_object_unlock(object);
16912 
16913 		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
16914 		    VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
16915 			/*
16916 			 * XXX
16917 			 * We do not hold the VM map exclusively here.
16918 			 * The "alias" field is not that critical, so it's
16919 			 * safe to update it here, as long as it is the only
16920 			 * one that can be modified while holding the VM map
16921 			 * "shared".
16922 			 */
16923 			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
16924 		}
16925 	}
16926 
16927 	vm_map_unlock_read(map);
16928 	vm_page_stats_reusable.reusable_pages_success++;
16929 	return KERN_SUCCESS;
16930 }
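
/*
 * Illustration (not part of this file): since reusable pages may be
 * discarded and later refaulted as zero-fill, the routine above rejects
 * non-writable ranges with KERN_PROTECTION_FAILURE.  Sketch, assuming the
 * madvise() front end surfaces that as EPERM:
 */
#if 0
#include <sys/mman.h>

static int
mark_reusable(void *addr, size_t size)
{
	if (mprotect(addr, size, PROT_READ) == 0) {
		/* Expected to fail now: the range lost VM_PROT_WRITE. */
		return madvise(addr, size, MADV_FREE_REUSABLE);
	}
	return -1;
}
#endif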
16931 
16932 
16933 static kern_return_t
16934 vm_map_can_reuse(
16935 	vm_map_t        map,
16936 	vm_map_offset_t start,
16937 	vm_map_offset_t end)
16938 {
16939 	vm_map_entry_t                  entry;
16940 
16941 	/*
16942 	 * The MADV_CAN_REUSE operation doesn't require any changes to the
16943 	 * vm_map_entry_t's, so the read lock is sufficient.
16944 	 */
16945 
16946 	vm_map_lock_read(map);
16947 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16948 
16949 	/*
16950 	 * The madvise semantics require that the address range be fully
16951 	 * allocated with no holes.  Otherwise, we're required to return
16952 	 * an error.
16953 	 */
16954 
16955 	if (!vm_map_range_check(map, start, end, &entry)) {
16956 		vm_map_unlock_read(map);
16957 		vm_page_stats_reusable.can_reuse_failure++;
16958 		return KERN_INVALID_ADDRESS;
16959 	}
16960 
16961 	/*
16962 	 * Examine each vm_map_entry_t in the range.
16963 	 */
16964 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16965 	    entry = entry->vme_next) {
16966 		/*
16967 		 * Sanity check on the VM map entry.
16968 		 */
16969 		if (!vm_map_entry_is_reusable(entry)) {
16970 			vm_map_unlock_read(map);
16971 			vm_page_stats_reusable.can_reuse_failure++;
16972 			return KERN_INVALID_ADDRESS;
16973 		}
16974 	}
16975 
16976 	vm_map_unlock_read(map);
16977 	vm_page_stats_reusable.can_reuse_success++;
16978 	return KERN_SUCCESS;
16979 }
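
/*
 * Illustration (not part of this file): MADV_CAN_REUSE is a pure probe; it
 * runs the same vm_map_entry_is_reusable() checks as the operations above
 * but modifies nothing.  Sketch:
 */
#if 0
#include <sys/mman.h>

static int
range_can_reuse(void *addr, size_t size)
{
	/* Returns 0 iff every entry in the range passes the checks. */
	return madvise(addr, size, MADV_CAN_REUSE);
}
#endif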
16980 
16981 
16982 #if MACH_ASSERT
16983 static kern_return_t
16984 vm_map_pageout(
16985 	vm_map_t        map,
16986 	vm_map_offset_t start,
16987 	vm_map_offset_t end)
16988 {
16989 	vm_map_entry_t                  entry;
16990 
16991 	/*
16992 	 * The MADV_PAGEOUT operation doesn't require any changes to the
16993 	 * vm_map_entry_t's, so the read lock is sufficient.
16994 	 */
16995 
16996 	vm_map_lock_read(map);
16997 
16998 	/*
16999 	 * The madvise semantics require that the address range be fully
17000 	 * allocated with no holes.  Otherwise, we're required to return
17001 	 * an error.
17002 	 */
17003 
17004 	if (!vm_map_range_check(map, start, end, &entry)) {
17005 		vm_map_unlock_read(map);
17006 		return KERN_INVALID_ADDRESS;
17007 	}
17008 
17009 	/*
17010 	 * Examine each vm_map_entry_t in the range.
17011 	 */
17012 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17013 	    entry = entry->vme_next) {
17014 		vm_object_t     object;
17015 
17016 		/*
17017 		 * Sanity check on the VM map entry.
17018 		 */
17019 		if (entry->is_sub_map) {
17020 			vm_map_t submap;
17021 			vm_map_offset_t submap_start;
17022 			vm_map_offset_t submap_end;
17023 			vm_map_entry_t submap_entry;
17024 
17025 			submap = VME_SUBMAP(entry);
17026 			submap_start = VME_OFFSET(entry);
17027 			submap_end = submap_start + (entry->vme_end -
17028 			    entry->vme_start);
17029 
17030 			vm_map_lock_read(submap);
17031 
17032 			if (!vm_map_range_check(submap,
17033 			    submap_start,
17034 			    submap_end,
17035 			    &submap_entry)) {
17036 				vm_map_unlock_read(submap);
17037 				vm_map_unlock_read(map);
17038 				return KERN_INVALID_ADDRESS;
17039 			}
17040 
17041 			if (submap_entry->is_sub_map) {
17042 				vm_map_unlock_read(submap);
17043 				continue;
17044 			}
17045 
17046 			object = VME_OBJECT(submap_entry);
17047 			if (object == VM_OBJECT_NULL || !object->internal) {
17048 				vm_map_unlock_read(submap);
17049 				continue;
17050 			}
17051 
17052 			vm_object_pageout(object);
17053 
17054 			vm_map_unlock_read(submap);
17055 			submap = VM_MAP_NULL;
17056 			submap_entry = VM_MAP_ENTRY_NULL;
17057 			continue;
17058 		}
17059 
17060 		object = VME_OBJECT(entry);
17061 		if (object == VM_OBJECT_NULL || !object->internal) {
17062 			continue;
17063 		}
17064 
17065 		vm_object_pageout(object);
17066 	}
17067 
17068 	vm_map_unlock_read(map);
17069 	return KERN_SUCCESS;
17070 }
17071 #endif /* MACH_ASSERT */
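
/*
 * Illustration (not part of this file): vm_map_pageout() is compiled only
 * on MACH_ASSERT (development/debug) kernels, where madvise(MADV_PAGEOUT)
 * can push a range's internal objects out of memory.  Sketch, assuming such
 * a kernel; on release kernels the advice is presumably rejected:
 */
#if 0
#include <sys/mman.h>

static void
force_pageout(void *addr, size_t size)
{
	(void)madvise(addr, size, MADV_PAGEOUT);
}
#endif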
17072 
17073 /*
17074  * This function determines if the zero operation can be run on the
17075  * respective entry. Additional checks on the object are in
17076  * vm_object_zero_preflight.
17077  */
17078 static kern_return_t
17079 vm_map_zero_entry_preflight(vm_map_entry_t entry)
17080 {
17081 	/*
17082 	 * Zeroing is restricted to writable non-executable entries and non-JIT
17083 	 * regions.
17084 	 */
17085 	if (!(entry->protection & VM_PROT_WRITE) ||
17086 	    (entry->protection & VM_PROT_EXECUTE) ||
17087 	    entry->used_for_jit ||
17088 	    entry->vme_xnu_user_debug) {
17089 		return KERN_PROTECTION_FAILURE;
17090 	}
17091 
17092 	/*
17093 	 * Zeroing for copy on write isn't yet supported. Zeroing is also not
17094 	 * allowed for submaps.
17095 	 */
17096 	if (entry->needs_copy || entry->is_sub_map) {
17097 		return KERN_NO_ACCESS;
17098 	}
17099 
17100 	return KERN_SUCCESS;
17101 }
17102 
17103 /*
17104  * This function translates the given start and end addresses into offsets within the entry's VM object.
17105  */
17106 static void
17107 vm_map_get_bounds_in_object(
17108 	vm_map_entry_t      entry,
17109 	vm_map_offset_t     start,
17110 	vm_map_offset_t     end,
17111 	vm_map_offset_t    *start_offset,
17112 	vm_map_offset_t    *end_offset)
17113 {
17114 	if (entry->vme_start < start) {
17115 		*start_offset = start - entry->vme_start;
17116 	} else {
17117 		*start_offset = 0;
17118 	}
17119 	*end_offset = MIN(end, entry->vme_end) - entry->vme_start;
17120 	*start_offset += VME_OFFSET(entry);
17121 	*end_offset += VME_OFFSET(entry);
17122 }
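
/*
 * Worked example for the translation above (values are illustrative): for
 * an entry covering [0x100000, 0x108000) with VME_OFFSET(entry) == 0x20000,
 * a request for [0x102000, 0x10a000) yields
 *	*start_offset = (0x102000 - 0x100000) + 0x20000 = 0x22000
 *	*end_offset   = (MIN(0x10a000, 0x108000) - 0x100000) + 0x20000 = 0x28000
 * i.e. the clipped range [0x22000, 0x28000) within the entry's VM object.
 */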
17123 
17124 /*
17125  * This function iterates through the entries in the requested range
17126  * and zeroes any resident pages in the corresponding objects. Compressed
17127  * pages are dropped instead of being faulted in and zeroed.
17128  */
17129 static kern_return_t
17130 vm_map_zero(
17131 	vm_map_t        map,
17132 	vm_map_offset_t start,
17133 	vm_map_offset_t end)
17134 {
17135 	vm_map_entry_t                  entry;
17136 	vm_map_offset_t                 cur = start;
17137 	kern_return_t                   ret;
17138 
17139 	/*
17140 	 * This operation isn't supported where the map page size is less than
17141 	 * the hardware page size. Caller will need to handle error and
17142 	 * explicitly zero memory if needed.
17143 	 */
17144 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17145 		return KERN_NO_ACCESS;
17146 	}
17147 
17148 	/*
17149 	 * The MADV_ZERO operation doesn't require any changes to the
17150 	 * vm_map_entry_t's, so the read lock is sufficient.
17151 	 */
17152 	vm_map_lock_read(map);
17153 	assert(map->pmap != kernel_pmap);       /* protect alias access */
17154 
17155 	/*
17156 	 * The madvise semantics require that the address range be fully
17157 	 * allocated with no holes. Otherwise, we're required to return
17158 	 * an error. This check needs to be redone if the map has changed.
17159 	 */
17160 	if (!vm_map_range_check(map, cur, end, &entry)) {
17161 		vm_map_unlock_read(map);
17162 		return KERN_INVALID_ADDRESS;
17163 	}
17164 
17165 	/*
17166 	 * Examine each vm_map_entry_t in the range.
17167 	 */
17168 	while (entry != vm_map_to_entry(map) && entry->vme_start < end) {
17169 		vm_map_offset_t cur_offset;
17170 		vm_map_offset_t end_offset;
17171 		unsigned int last_timestamp = map->timestamp;
17172 		vm_object_t object = VME_OBJECT(entry);
17173 
17174 		ret = vm_map_zero_entry_preflight(entry);
17175 		if (ret != KERN_SUCCESS) {
17176 			vm_map_unlock_read(map);
17177 			return ret;
17178 		}
17179 
17180 		if (object == VM_OBJECT_NULL) {
17181 			entry = entry->vme_next;
17182 			continue;
17183 		}
17184 
17185 		vm_map_get_bounds_in_object(entry, cur, end, &cur_offset, &end_offset);
17186 		vm_object_lock(object);
17187 		/*
17188 		 * Take a reference on the object as vm_object_zero will drop the object
17189 		 * lock when it encounters a busy page.
17190 		 */
17191 		vm_object_reference_locked(object);
17192 		vm_map_unlock_read(map);
17193 
17194 		ret = vm_object_zero(object, cur_offset, end_offset);
17195 		vm_object_unlock(object);
17196 		vm_object_deallocate(object);
17197 		if (ret != KERN_SUCCESS) {
17198 			return ret;
17199 		}
17200 		/*
17201 		 * Update cur as vm_object_zero has succeeded.
17202 		 */
17203 		cur += (end_offset - cur_offset);
17204 		if (cur == end) {
17205 			return KERN_SUCCESS;
17206 		}
17207 
17208 		/*
17209 		 * If the map timestamp has changed, restart by looking up "cur"
17210 		 * in the map again.
17211 		 */
17212 		vm_map_lock_read(map);
17213 		if (last_timestamp != map->timestamp) {
17214 			/*
17215 			 * Look up "cur" in the map again.
17216 			 */
17217 			if (!vm_map_range_check(map, cur, end, &entry)) {
17218 				vm_map_unlock_read(map);
17219 				return KERN_INVALID_ADDRESS;
17220 			}
17221 			continue;
17222 		}
17223 		/*
17224 		 * If the map hasn't changed proceed with the next entry
17225 		 */
17226 		entry = entry->vme_next;
17227 	}
17228 
17229 	vm_map_unlock_read(map);
17230 	return KERN_SUCCESS;
17231 }
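
/*
 * Illustration (not part of this file): the loop above drops the map lock
 * around vm_object_zero() and restarts from "cur" whenever the map
 * timestamp moved, so the operation tolerates concurrent map changes.
 * User-space sketch; whether this build exposes an MADV_ZERO advice value
 * in <sys/mman.h> is an assumption, hence the guard:
 */
#if 0
#include <sys/mman.h>

static int
zero_range(void *addr, size_t size)
{
#ifdef MADV_ZERO
	/* Resident pages are zeroed; compressed pages are simply dropped. */
	return madvise(addr, size, MADV_ZERO);
#else
	(void)addr;
	(void)size;
	return -1;
#endif
}
#endif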
17232 
17233 
17234 /*
17235  *	Routine:	vm_map_entry_insert
17236  *
17237  *	Description:	This routine inserts a new vm_entry in a locked map.
17238  */
17239 static vm_map_entry_t
17240 vm_map_entry_insert(
17241 	vm_map_t                map,
17242 	vm_map_entry_t          insp_entry,
17243 	vm_map_offset_t         start,
17244 	vm_map_offset_t         end,
17245 	vm_object_t             object,
17246 	vm_object_offset_t      offset,
17247 	vm_map_kernel_flags_t   vmk_flags,
17248 	boolean_t               needs_copy,
17249 	vm_prot_t               cur_protection,
17250 	vm_prot_t               max_protection,
17251 	vm_inherit_t            inheritance,
17252 	boolean_t               clear_map_aligned)
17253 {
17254 	vm_map_entry_t  new_entry;
17255 	boolean_t map_aligned = FALSE;
17256 
17257 	assert(insp_entry != (vm_map_entry_t)0);
17258 	vm_map_lock_assert_exclusive(map);
17259 
17260 #if DEVELOPMENT || DEBUG
17261 	vm_object_offset_t      end_offset = 0;
17262 	assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
17263 #endif /* DEVELOPMENT || DEBUG */
17264 
17265 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
17266 		map_aligned = TRUE;
17267 	}
17268 	if (clear_map_aligned &&
17269 	    (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
17270 	    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
17271 		map_aligned = FALSE;
17272 	}
17273 	if (map_aligned) {
17274 		assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
17275 		assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
17276 	} else {
17277 		assert(page_aligned(start));
17278 		assert(page_aligned(end));
17279 	}
17280 	assert(start < end);
17281 
17282 	new_entry = vm_map_entry_create(map);
17283 
17284 	new_entry->vme_start = start;
17285 	new_entry->vme_end = end;
17286 
17287 	if (vmk_flags.vmkf_submap) {
17288 		new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
17289 		VME_SUBMAP_SET(new_entry, (vm_map_t)object);
17290 	} else {
17291 		VME_OBJECT_SET(new_entry, object, false, 0);
17292 	}
17293 	VME_OFFSET_SET(new_entry, offset);
17294 	VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);
17295 
17296 	new_entry->map_aligned = map_aligned;
17297 	new_entry->needs_copy = needs_copy;
17298 	new_entry->inheritance = inheritance;
17299 	new_entry->protection = cur_protection;
17300 	new_entry->max_protection = max_protection;
17301 	/*
17302 	 * submap: "use_pmap" means "nested".
17303 	 * default: false.
17304 	 *
17305 	 * object: "use_pmap" means "use pmap accounting" for footprint.
17306 	 * default: true.
17307 	 */
17308 	new_entry->use_pmap = !vmk_flags.vmkf_submap;
17309 	new_entry->no_cache = vmk_flags.vmf_no_cache;
17310 	new_entry->vme_permanent = vmk_flags.vmf_permanent;
17311 	new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
17312 	new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
17313 	new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);
17314 
17315 	if (vmk_flags.vmkf_map_jit) {
17316 		if (!(map->jit_entry_exists) ||
17317 		    VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
17318 			new_entry->used_for_jit = TRUE;
17319 			map->jit_entry_exists = TRUE;
17320 		}
17321 	}
17322 
17323 	/*
17324 	 *	Insert the new entry into the list.
17325 	 */
17326 
17327 	vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
17328 	map->size += end - start;
17329 
17330 	/*
17331 	 *	Update the free space hint and the lookup hint.
17332 	 */
17333 
17334 	SAVE_HINT_MAP_WRITE(map, new_entry);
17335 	return new_entry;
17336 }
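
/*
 * Illustration (not part of this file): vm_map_entry_insert() requires the
 * map to be locked exclusively and an insertion point from a prior lookup.
 * A minimal in-kernel sketch; "map", "start", "end", "object" and "offset"
 * are hypothetical, and the flag/protection choices are assumptions, not a
 * recipe:
 */
#if 0
	vm_map_entry_t insp_entry, new_entry;

	vm_map_lock(map);
	(void)vm_map_lookup_entry(map, start, &insp_entry);
	new_entry = vm_map_entry_insert(map, insp_entry, start, end,
	    object, offset, VM_MAP_KERNEL_FLAGS_NONE,
	    FALSE,                      /* needs_copy */
	    VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT,
	    FALSE);                     /* clear_map_aligned */
	vm_map_unlock(map);
#endif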
17337 
17338 /*
17339  *	Routine:	vm_map_remap_extract
17340  *
17341  *	Description:	This routine returns a vm_entry list from a map.
17342  */
17343 static kern_return_t
17344 vm_map_remap_extract(
17345 	vm_map_t                map,
17346 	vm_map_offset_t         addr,
17347 	vm_map_size_t           size,
17348 	boolean_t               copy,
17349 	vm_map_copy_t           map_copy,
17350 	vm_prot_t               *cur_protection,   /* IN/OUT */
17351 	vm_prot_t               *max_protection,   /* IN/OUT */
17352 	/* What, no behavior? */
17353 	vm_inherit_t            inheritance,
17354 	vm_map_kernel_flags_t   vmk_flags)
17355 {
17356 	struct vm_map_header   *map_header = &map_copy->cpy_hdr;
17357 	kern_return_t           result;
17358 	vm_map_size_t           mapped_size;
17359 	vm_map_size_t           tmp_size;
17360 	vm_map_entry_t          src_entry;     /* result of last map lookup */
17361 	vm_map_entry_t          new_entry;
17362 	vm_object_offset_t      offset;
17363 	vm_map_offset_t         map_address;
17364 	vm_map_offset_t         src_start;     /* start of entry to map */
17365 	vm_map_offset_t         src_end;       /* end of region to be mapped */
17366 	vm_object_t             object;
17367 	vm_map_version_t        version;
17368 	boolean_t               src_needs_copy;
17369 	boolean_t               new_entry_needs_copy;
17370 	vm_map_entry_t          saved_src_entry;
17371 	boolean_t               src_entry_was_wired;
17372 	vm_prot_t               max_prot_for_prot_copy;
17373 	vm_map_offset_t         effective_page_mask;
17374 	bool                    pageable, same_map;
17375 	boolean_t               vm_remap_legacy;
17376 	vm_prot_t               required_cur_prot, required_max_prot;
17377 	vm_object_t             new_copy_object;     /* vm_object_copy_* result */
17378 	boolean_t               saved_used_for_jit;  /* Saved used_for_jit. */
17379 
17380 	pageable = vmk_flags.vmkf_copy_pageable;
17381 	same_map = vmk_flags.vmkf_copy_same_map;
17382 
17383 	effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
17384 
17385 	assert(map != VM_MAP_NULL);
17386 	assert(size != 0);
17387 	assert(size == vm_map_round_page(size, effective_page_mask));
17388 	assert(inheritance == VM_INHERIT_NONE ||
17389 	    inheritance == VM_INHERIT_COPY ||
17390 	    inheritance == VM_INHERIT_SHARE);
17391 	assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17392 	assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17393 	assert((*cur_protection & *max_protection) == *cur_protection);
17394 
17395 	/*
17396 	 *	Compute start and end of region.
17397 	 */
17398 	src_start = vm_map_trunc_page(addr, effective_page_mask);
17399 	src_end = vm_map_round_page(src_start + size, effective_page_mask);
17400 
17401 	/*
17402 	 *	Initialize map_header.
17403 	 */
17404 	map_header->nentries = 0;
17405 	map_header->entries_pageable = pageable;
17406 //	map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
17407 	map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
17408 	map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
17409 	vm_map_store_init(map_header);
17410 
17411 	if (copy && vmk_flags.vmkf_remap_prot_copy) {
17412 		/*
17413 		 * Special case for vm_map_protect(VM_PROT_COPY):
17414 		 * we want to set the new mappings' max protection to the
17415 		 * specified *max_protection...
17416 		 */
17417 		max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
17418 		/* ... but we want to use the vm_remap() legacy mode */
17419 		*max_protection = VM_PROT_NONE;
17420 		*cur_protection = VM_PROT_NONE;
17421 	} else {
17422 		max_prot_for_prot_copy = VM_PROT_NONE;
17423 	}
17424 
17425 	if (*cur_protection == VM_PROT_NONE &&
17426 	    *max_protection == VM_PROT_NONE) {
17427 		/*
17428 		 * vm_remap() legacy mode:
17429 		 * Extract all memory regions in the specified range and
17430 		 * collect the strictest set of protections allowed on the
17431 		 * entire range, so the caller knows what they can do with
17432 		 * the remapped range.
17433 		 * We start with VM_PROT_ALL and we'll remove the protections
17434 		 * missing from each memory region.
17435 		 */
17436 		vm_remap_legacy = TRUE;
17437 		*cur_protection = VM_PROT_ALL;
17438 		*max_protection = VM_PROT_ALL;
17439 		required_cur_prot = VM_PROT_NONE;
17440 		required_max_prot = VM_PROT_NONE;
17441 	} else {
17442 		/*
17443 		 * vm_remap_new() mode:
17444 		 * Extract all memory regions in the specified range and
17445 		 * ensure that they have at least the protections specified
17446 		 * by the caller via *cur_protection and *max_protection.
17447 		 * The resulting mapping should have these protections.
17448 		 */
17449 		vm_remap_legacy = FALSE;
17450 		if (copy) {
17451 			required_cur_prot = VM_PROT_NONE;
17452 			required_max_prot = VM_PROT_READ;
17453 		} else {
17454 			required_cur_prot = *cur_protection;
17455 			required_max_prot = *max_protection;
17456 		}
17457 	}
17458 
17459 	map_address = 0;
17460 	mapped_size = 0;
17461 	result = KERN_SUCCESS;
17462 
17463 	/*
17464 	 *	The specified source virtual space might correspond to
17465 	 *	multiple map entries, need to loop on them.
17466 	 */
17467 	vm_map_lock(map);
17468 
17469 	if (map->pmap == kernel_pmap) {
17470 		map_copy->is_kernel_range = true;
17471 		map_copy->orig_range = kmem_addr_get_range(addr, size);
17472 #if CONFIG_MAP_RANGES
17473 	} else if (map->uses_user_ranges) {
17474 		map_copy->is_user_range = true;
17475 		map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
17476 #endif /* CONFIG_MAP_RANGES */
17477 	}
17478 
17479 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17480 		/*
17481 		 * This address space uses sub-pages so the range might
17482 		 * not be re-mappable in an address space with larger
17483 		 * pages. Re-assemble any broken-up VM map entries to
17484 		 * improve our chances of making it work.
17485 		 */
17486 		vm_map_simplify_range(map, src_start, src_end);
17487 	}
17488 	while (mapped_size != size) {
17489 		vm_map_size_t   entry_size;
17490 
17491 		/*
17492 		 *	Find the beginning of the region.
17493 		 */
17494 		if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
17495 			result = KERN_INVALID_ADDRESS;
17496 			break;
17497 		}
17498 
17499 		if (src_start < src_entry->vme_start ||
17500 		    (mapped_size && src_start != src_entry->vme_start)) {
17501 			result = KERN_INVALID_ADDRESS;
17502 			break;
17503 		}
17504 
17505 		tmp_size = size - mapped_size;
17506 		if (src_end > src_entry->vme_end) {
17507 			tmp_size -= (src_end - src_entry->vme_end);
17508 		}
17509 
17510 		entry_size = (vm_map_size_t)(src_entry->vme_end -
17511 		    src_entry->vme_start);
17512 
17513 		if (src_entry->is_sub_map &&
17514 		    vmk_flags.vmkf_copy_single_object) {
17515 			vm_map_t submap;
17516 			vm_map_offset_t submap_start;
17517 			vm_map_size_t submap_size;
17518 			boolean_t submap_needs_copy;
17519 
17520 			/*
17521 			 * No check for "required protection" on "src_entry"
17522 			 * because the protections that matter are the ones
17523 			 * on the submap's VM map entry, which will be checked
17524 			 * during the call to vm_map_remap_extract() below.
17525 			 */
17526 			submap_size = src_entry->vme_end - src_start;
17527 			if (submap_size > size) {
17528 				submap_size = size;
17529 			}
17530 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17531 			submap = VME_SUBMAP(src_entry);
17532 			if (copy) {
17533 				/*
17534 				 * The caller wants a copy-on-write re-mapping,
17535 				 * so let's extract from the submap accordingly.
17536 				 */
17537 				submap_needs_copy = TRUE;
17538 			} else if (src_entry->needs_copy) {
17539 				/*
17540 				 * The caller wants a shared re-mapping but the
17541 				 * submap is mapped with "needs_copy", so its
17542 				 * contents can't be shared as is. Extract the
17543 				 * contents of the submap as "copy-on-write".
17544 				 * The re-mapping won't be shared with the
17545 				 * original mapping but this is equivalent to
17546 				 * what happened with the original "remap from
17547 				 * submap" code.
17548 				 * The shared region is mapped "needs_copy", for
17549 				 * example.
17550 				 */
17551 				submap_needs_copy = TRUE;
17552 			} else {
17553 				/*
17554 				 * The caller wants a shared re-mapping and
17555 				 * this mapping can be shared (no "needs_copy"),
17556 				 * so let's extract from the submap accordingly.
17557 				 * Kernel submaps are mapped without
17558 				 * "needs_copy", for example.
17559 				 */
17560 				submap_needs_copy = FALSE;
17561 			}
17562 			vm_map_reference(submap);
17563 			vm_map_unlock(map);
17564 			src_entry = NULL;
17565 			if (vm_remap_legacy) {
17566 				*cur_protection = VM_PROT_NONE;
17567 				*max_protection = VM_PROT_NONE;
17568 			}
17569 
17570 			DTRACE_VM7(remap_submap_recurse,
17571 			    vm_map_t, map,
17572 			    vm_map_offset_t, addr,
17573 			    vm_map_size_t, size,
17574 			    boolean_t, copy,
17575 			    vm_map_offset_t, submap_start,
17576 			    vm_map_size_t, submap_size,
17577 			    boolean_t, submap_needs_copy);
17578 
17579 			result = vm_map_remap_extract(submap,
17580 			    submap_start,
17581 			    submap_size,
17582 			    submap_needs_copy,
17583 			    map_copy,
17584 			    cur_protection,
17585 			    max_protection,
17586 			    inheritance,
17587 			    vmk_flags);
17588 			vm_map_deallocate(submap);
17589 
17590 			if (result == KERN_SUCCESS &&
17591 			    submap_needs_copy &&
17592 			    !copy) {
17593 				/*
17594 				 * We were asked for a "shared"
17595 				 * re-mapping but had to ask for a
17596 				 * "copy-on-write" remapping of the
17597 				 * submap's mapping to honor the
17598 				 * submap's "needs_copy".
17599 				 * We now need to resolve that
17600 				 * pending "copy-on-write" to
17601 				 * get something we can share.
17602 				 */
17603 				vm_map_entry_t copy_entry;
17604 				vm_object_offset_t copy_offset;
17605 				vm_map_size_t copy_size;
17606 				vm_object_t copy_object;
17607 				copy_entry = vm_map_copy_first_entry(map_copy);
17608 				copy_size = copy_entry->vme_end - copy_entry->vme_start;
17609 				copy_object = VME_OBJECT(copy_entry);
17610 				copy_offset = VME_OFFSET(copy_entry);
17611 				if (copy_object == VM_OBJECT_NULL) {
17612 					assert(copy_offset == 0);
17613 					assert(!copy_entry->needs_copy);
17614 					if (copy_entry->max_protection == VM_PROT_NONE) {
17615 						assert(copy_entry->protection == VM_PROT_NONE);
17616 						/* nothing to share */
17617 					} else {
17618 						assert(copy_offset == 0);
17619 						copy_object = vm_object_allocate(copy_size);
17620 						VME_OFFSET_SET(copy_entry, 0);
17621 						VME_OBJECT_SET(copy_entry, copy_object, false, 0);
17622 						assert(copy_entry->use_pmap);
17623 					}
17624 				} else if (copy_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17625 					/* already shareable */
17626 					assert(!copy_entry->needs_copy);
17627 				} else if (copy_entry->needs_copy ||
17628 				    copy_object->shadowed ||
17629 				    (copy_object->internal &&
17630 				    !copy_object->true_share &&
17631 				    !copy_entry->is_shared &&
17632 				    copy_object->vo_size > copy_size)) {
17633 					VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
17634 					assert(copy_entry->use_pmap);
17635 					if (copy_entry->needs_copy) {
17636 						/* already write-protected */
17637 					} else {
17638 						vm_prot_t prot;
17639 						prot = copy_entry->protection & ~VM_PROT_WRITE;
17640 						vm_object_pmap_protect(copy_object,
17641 						    copy_offset,
17642 						    copy_size,
17643 						    PMAP_NULL,
17644 						    PAGE_SIZE,
17645 						    0,
17646 						    prot);
17647 					}
17648 					copy_entry->needs_copy = FALSE;
17649 				}
17650 				copy_object = VME_OBJECT(copy_entry);
17651 				copy_offset = VME_OFFSET(copy_entry);
17652 				if (copy_object &&
17653 				    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
17654 					copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
17655 					copy_object->true_share = TRUE;
17656 				}
17657 			}
17658 
17659 			return result;
17660 		}
17661 
17662 		if (src_entry->is_sub_map) {
17663 			/* protections for submap mapping are irrelevant here */
17664 		} else if (((src_entry->protection & required_cur_prot) !=
17665 		    required_cur_prot) ||
17666 		    ((src_entry->max_protection & required_max_prot) !=
17667 		    required_max_prot)) {
17668 			if (vmk_flags.vmkf_copy_single_object &&
17669 			    mapped_size != 0) {
17670 				/*
17671 				 * Single object extraction.
17672 				 * We can't extract more with the required
17673 				 * protection but we've extracted some, so
17674 				 * stop there and declare success.
17675 				 * The caller should check the size of
17676 				 * the copy entry we've extracted.
17677 				 */
17678 				result = KERN_SUCCESS;
17679 			} else {
17680 				/*
17681 				 * VM range extraction.
17682 				 * Required protection is not available
17683 				 * for this part of the range: fail.
17684 				 */
17685 				result = KERN_PROTECTION_FAILURE;
17686 			}
17687 			break;
17688 		}
17689 
17690 		if (src_entry->is_sub_map) {
17691 			vm_map_t submap;
17692 			vm_map_offset_t submap_start;
17693 			vm_map_size_t submap_size;
17694 			vm_map_copy_t submap_copy;
17695 			vm_prot_t submap_curprot, submap_maxprot;
17696 			boolean_t submap_needs_copy;
17697 
17698 			/*
17699 			 * No check for "required protection" on "src_entry"
17700 			 * because the protections that matter are the ones
17701 			 * on the submap's VM map entry, which will be checked
17702 			 * during the call to vm_map_copy_extract() below.
17703 			 */
17704 			object = VM_OBJECT_NULL;
17705 			submap_copy = VM_MAP_COPY_NULL;
17706 
17707 			/* find equivalent range in the submap */
17708 			submap = VME_SUBMAP(src_entry);
17709 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17710 			submap_size = tmp_size;
17711 			if (copy) {
17712 				/*
17713 				 * The caller wants a copy-on-write re-mapping,
17714 				 * so let's extract from the submap accordingly.
17715 				 */
17716 				submap_needs_copy = TRUE;
17717 			} else if (src_entry->needs_copy) {
17718 				/*
17719 				 * The caller wants a shared re-mapping but the
17720 				 * submap is mapped with "needs_copy", so its
17721 				 * contents can't be shared as is. Extract the
17722 				 * contents of the submap as "copy-on-write".
17723 				 * The re-mapping won't be shared with the
17724 				 * original mapping but this is equivalent to
17725 				 * what happened with the original "remap from
17726 				 * submap" code.
17727 				 * The shared region is mapped "needs_copy", for
17728 				 * example.
17729 				 */
17730 				submap_needs_copy = TRUE;
17731 			} else {
17732 				/*
17733 				 * The caller wants a shared re-mapping and
17734 				 * this mapping can be shared (no "needs_copy"),
17735 				 * so let's extract from the submap accordingly.
17736 				 * Kernel submaps are mapped without
17737 				 * "needs_copy", for example.
17738 				 */
17739 				submap_needs_copy = FALSE;
17740 			}
17741 			/* extra ref to keep submap alive */
17742 			vm_map_reference(submap);
17743 
17744 			DTRACE_VM7(remap_submap_recurse,
17745 			    vm_map_t, map,
17746 			    vm_map_offset_t, addr,
17747 			    vm_map_size_t, size,
17748 			    boolean_t, copy,
17749 			    vm_map_offset_t, submap_start,
17750 			    vm_map_size_t, submap_size,
17751 			    boolean_t, submap_needs_copy);
17752 
17753 			/*
17754 			 * The map can be safely unlocked since we
17755 			 * already hold a reference on the submap.
17756 			 *
17757 			 * No timestamp since we don't care if the map
17758 			 * gets modified while we're down in the submap.
17759 			 * We'll resume the extraction at src_start + tmp_size
17760 			 * anyway.
17761 			 */
17762 			vm_map_unlock(map);
17763 			src_entry = NULL; /* not valid once map is unlocked */
17764 
17765 			if (vm_remap_legacy) {
17766 				submap_curprot = VM_PROT_NONE;
17767 				submap_maxprot = VM_PROT_NONE;
17768 				if (max_prot_for_prot_copy) {
17769 					submap_maxprot = max_prot_for_prot_copy;
17770 				}
17771 			} else {
17772 				assert(!max_prot_for_prot_copy);
17773 				submap_curprot = *cur_protection;
17774 				submap_maxprot = *max_protection;
17775 			}
17776 			result = vm_map_copy_extract(submap,
17777 			    submap_start,
17778 			    submap_size,
17779 			    submap_needs_copy,
17780 			    &submap_copy,
17781 			    &submap_curprot,
17782 			    &submap_maxprot,
17783 			    inheritance,
17784 			    vmk_flags);
17785 
17786 			/* release extra ref on submap */
17787 			vm_map_deallocate(submap);
17788 			submap = VM_MAP_NULL;
17789 
17790 			if (result != KERN_SUCCESS) {
17791 				vm_map_lock(map);
17792 				break;
17793 			}
17794 
17795 			/* transfer submap_copy entries to map_header */
17796 			while (vm_map_copy_first_entry(submap_copy) !=
17797 			    vm_map_copy_to_entry(submap_copy)) {
17798 				vm_map_entry_t copy_entry;
17799 				vm_map_size_t copy_entry_size;
17800 
17801 				copy_entry = vm_map_copy_first_entry(submap_copy);
17802 
17803 				/*
17804 				 * Prevent kernel_object from being exposed to
17805 				 * user space.
17806 				 */
17807 				if (__improbable(copy_entry->vme_kernel_object)) {
17808 					printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17809 					    proc_selfpid(),
17810 					    (get_bsdtask_info(current_task())
17811 					    ? proc_name_address(get_bsdtask_info(current_task()))
17812 					    : "?"));
17813 					DTRACE_VM(extract_kernel_only);
17814 					result = KERN_INVALID_RIGHT;
17815 					vm_map_copy_discard(submap_copy);
17816 					submap_copy = VM_MAP_COPY_NULL;
17817 					vm_map_lock(map);
17818 					break;
17819 				}
17820 
17821 #ifdef __arm64e__
17822 				if (vmk_flags.vmkf_tpro_enforcement_override) {
17823 					copy_entry->used_for_tpro = FALSE;
17824 				}
17825 #endif /* __arm64e__ */
17826 
17827 				vm_map_copy_entry_unlink(submap_copy, copy_entry);
17828 				copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
17829 				copy_entry->vme_start = map_address;
17830 				copy_entry->vme_end = map_address + copy_entry_size;
17831 				map_address += copy_entry_size;
17832 				mapped_size += copy_entry_size;
17833 				src_start += copy_entry_size;
17834 				assert(src_start <= src_end);
17835 				_vm_map_store_entry_link(map_header,
17836 				    map_header->links.prev,
17837 				    copy_entry);
17838 			}
17839 			/* done with submap_copy */
17840 			vm_map_copy_discard(submap_copy);
17841 
17842 			if (vm_remap_legacy) {
17843 				*cur_protection &= submap_curprot;
17844 				*max_protection &= submap_maxprot;
17845 			}
17846 
17847 			/* re-acquire the map lock and continue to next entry */
17848 			vm_map_lock(map);
17849 			continue;
17850 		} else {
17851 			object = VME_OBJECT(src_entry);
17852 
17853 			/*
17854 			 * Prevent kernel_object from being exposed to
17855 			 * user space.
17856 			 */
17857 			if (__improbable(is_kernel_object(object))) {
17858 				printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17859 				    proc_selfpid(),
17860 				    (get_bsdtask_info(current_task())
17861 				    ? proc_name_address(get_bsdtask_info(current_task()))
17862 				    : "?"));
17863 				DTRACE_VM(extract_kernel_only);
17864 				result = KERN_INVALID_RIGHT;
17865 				break;
17866 			}
17867 
17868 			if (src_entry->iokit_acct) {
17869 				/*
17870 				 * This entry uses "IOKit accounting".
17871 				 */
17872 			} else if (object != VM_OBJECT_NULL &&
17873 			    (object->purgable != VM_PURGABLE_DENY ||
17874 			    object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
17875 				/*
17876 				 * Purgeable objects have their own accounting:
17877 				 * no pmap accounting for them.
17878 				 */
17879 				assertf(!src_entry->use_pmap,
17880 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17881 				    map,
17882 				    src_entry,
17883 				    (uint64_t)src_entry->vme_start,
17884 				    (uint64_t)src_entry->vme_end,
17885 				    src_entry->protection,
17886 				    src_entry->max_protection,
17887 				    VME_ALIAS(src_entry));
17888 			} else {
17889 				/*
17890 				 * Not IOKit or purgeable:
17891 				 * must be accounted by pmap stats.
17892 				 */
17893 				assertf(src_entry->use_pmap,
17894 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17895 				    map,
17896 				    src_entry,
17897 				    (uint64_t)src_entry->vme_start,
17898 				    (uint64_t)src_entry->vme_end,
17899 				    src_entry->protection,
17900 				    src_entry->max_protection,
17901 				    VME_ALIAS(src_entry));
17902 			}
17903 
17904 			if (object == VM_OBJECT_NULL) {
17905 				assert(!src_entry->needs_copy);
17906 				if (src_entry->max_protection == VM_PROT_NONE) {
17907 					assert(src_entry->protection == VM_PROT_NONE);
17908 					/*
17909 					 * No VM object and no permissions:
17910 					 * this must be a reserved range with
17911 					 * nothing to share or copy.
17912 					 * There could also be all sorts of
17913 					 * pmap shenanigans within that reserved
17914 					 * range, so let's just copy the map
17915 					 * entry as is to remap a similar
17916 					 * reserved range.
17917 					 */
17918 					offset = 0; /* no object => no offset */
17919 					goto copy_src_entry;
17920 				}
17921 				object = vm_object_allocate(entry_size);
17922 				VME_OFFSET_SET(src_entry, 0);
17923 				VME_OBJECT_SET(src_entry, object, false, 0);
17924 				assert(src_entry->use_pmap);
17925 				assert(!map->mapped_in_other_pmaps);
17926 			} else if (src_entry->wired_count ||
17927 			    object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17928 				/*
17929 				 * A wired memory region should not have
17930 				 * any pending copy-on-write and needs to
17931 				 * keep pointing at the VM object that
17932 				 * contains the wired pages.
17933 				 * If we're sharing this memory (copy=false),
17934 				 * we'll share this VM object.
17935 				 * If we're copying this memory (copy=true),
17936 				 * we'll call vm_object_copy_slowly() below
17937 				 * and use the new VM object for the remapping.
17938 				 *
17939 				 * Or, we are already using an asymmetric
17940 				 * copy, and therefore we already have
17941 				 * the right object.
17942 				 */
17943 				assert(!src_entry->needs_copy);
17944 			} else if (src_entry->needs_copy || object->shadowed ||
17945 			    (object->internal && !object->true_share &&
17946 			    !src_entry->is_shared &&
17947 			    object->vo_size > entry_size)) {
17948 				bool is_writable;
17949 
17950 				VME_OBJECT_SHADOW(src_entry, entry_size,
17951 				    vm_map_always_shadow(map));
17952 				assert(src_entry->use_pmap);
17953 
17954 				is_writable = false;
17955 				if (src_entry->protection & VM_PROT_WRITE) {
17956 					is_writable = true;
17957 #if __arm64e__
17958 				} else if (src_entry->used_for_tpro) {
17959 					is_writable = true;
17960 #endif /* __arm64e__ */
17961 				}
17962 				if (!src_entry->needs_copy && is_writable) {
17963 					vm_prot_t prot;
17964 
17965 					if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
17966 						panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
17967 						    __FUNCTION__,
17968 						    map, map->pmap,
17969 						    src_entry,
17970 						    (uint64_t)src_entry->vme_start,
17971 						    (uint64_t)src_entry->vme_end,
17972 						    src_entry->protection);
17973 					}
17974 
17975 					prot = src_entry->protection & ~VM_PROT_WRITE;
17976 
17977 					if (override_nx(map,
17978 					    VME_ALIAS(src_entry))
17979 					    && prot) {
17980 						prot |= VM_PROT_EXECUTE;
17981 					}
17982 
17983 					if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
17984 						panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
17985 						    __FUNCTION__,
17986 						    map, map->pmap,
17987 						    src_entry,
17988 						    (uint64_t)src_entry->vme_start,
17989 						    (uint64_t)src_entry->vme_end,
17990 						    prot);
17991 					}
17992 
17993 					if (map->mapped_in_other_pmaps) {
17994 						vm_object_pmap_protect(
17995 							VME_OBJECT(src_entry),
17996 							VME_OFFSET(src_entry),
17997 							entry_size,
17998 							PMAP_NULL,
17999 							PAGE_SIZE,
18000 							src_entry->vme_start,
18001 							prot);
18002 #if MACH_ASSERT
18003 					} else if (__improbable(map->pmap == PMAP_NULL)) {
18004 						extern boolean_t vm_tests_in_progress;
18005 						assert(vm_tests_in_progress);
18006 						/*
18007 						 * Some VM tests (in vm_tests.c)
18008 						 * sometimes want to use a VM
18009 						 * map without a pmap.
18010 						 * Otherwise, this should never
18011 						 * happen.
18012 						 */
18013 #endif /* MACH_ASSERT */
18014 					} else {
18015 						pmap_protect(vm_map_pmap(map),
18016 						    src_entry->vme_start,
18017 						    src_entry->vme_end,
18018 						    prot);
18019 					}
18020 				}
18021 
18022 				object = VME_OBJECT(src_entry);
18023 				src_entry->needs_copy = FALSE;
18024 			}
18025 
18026 
18027 			vm_object_lock(object);
18028 			vm_object_reference_locked(object); /* object ref. for new entry */
18029 			assert(!src_entry->needs_copy);
18030 			if (object->copy_strategy ==
18031 			    MEMORY_OBJECT_COPY_SYMMETRIC) {
18032 				/*
18033 				 * If we want to share this object (copy==0),
18034 				 * it needs to be COPY_DELAY.
18035 				 * If we want to copy this object (copy==1),
18036 				 * we can't just set "needs_copy" on our side
18037 				 * and expect the other side to do the same
18038 				 * (symmetrically), so we can't let the object
18039 				 * stay COPY_SYMMETRIC.
18040 				 * So we always switch from COPY_SYMMETRIC to
18041 				 * COPY_DELAY.
18042 				 */
18043 				object->copy_strategy =
18044 				    MEMORY_OBJECT_COPY_DELAY;
18045 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
18046 			}
18047 			vm_object_unlock(object);
18048 		}
18049 
18050 		offset = (VME_OFFSET(src_entry) +
18051 		    (src_start - src_entry->vme_start));
18052 
18053 copy_src_entry:
18054 		new_entry = _vm_map_entry_create(map_header);
18055 		vm_map_entry_copy(map, new_entry, src_entry);
18056 		if (new_entry->is_sub_map) {
18057 			/* clr address space specifics */
18058 			new_entry->use_pmap = FALSE;
18059 		} else if (copy) {
18060 			/*
18061 			 * We're dealing with a copy-on-write operation,
18062 			 * so the resulting mapping should not inherit the
18063 			 * original mapping's accounting settings.
18064 			 * "use_pmap" should be reset to its default (TRUE)
18065 			 * so that the new mapping gets accounted for in
18066 			 * the task's memory footprint.
18067 			 */
18068 			new_entry->use_pmap = TRUE;
18069 		}
18070 		/* "iokit_acct" was cleared in vm_map_entry_copy() */
18071 		assert(!new_entry->iokit_acct);
18072 
18073 		new_entry->map_aligned = FALSE;
18074 
18075 		new_entry->vme_start = map_address;
18076 		new_entry->vme_end = map_address + tmp_size;
18077 		assert(new_entry->vme_start < new_entry->vme_end);
18078 		if (copy && vmk_flags.vmkf_remap_prot_copy) {
18079 			/* security: keep "permanent" and "csm_associated" */
18080 			new_entry->vme_permanent = src_entry->vme_permanent;
18081 			new_entry->csm_associated = src_entry->csm_associated;
18082 			/*
18083 			 * Remapping for vm_map_protect(VM_PROT_COPY)
18084 			 * to convert a read-only mapping into a
18085 			 * copy-on-write version of itself but
18086 			 * with write access:
18087 			 * keep the original inheritance but let's not
18088 			 * add VM_PROT_WRITE to the max protection yet
18089 			 * since we want to do more security checks against
18090 			 * the target map.
18091 			 */
18092 			new_entry->inheritance = src_entry->inheritance;
18093 			new_entry->protection &= max_prot_for_prot_copy;
18094 		} else {
18095 			new_entry->inheritance = inheritance;
18096 			if (!vm_remap_legacy) {
18097 				new_entry->protection = *cur_protection;
18098 				new_entry->max_protection = *max_protection;
18099 			}
18100 		}
18101 #ifdef __arm64e__
18102 		if (copy && vmk_flags.vmkf_tpro_enforcement_override) {
18103 			new_entry->used_for_tpro = FALSE;
18104 		}
18105 #endif /* __arm64e__ */
18106 		VME_OFFSET_SET(new_entry, offset);
18107 
18108 		/*
18109 		 * The new region has to be copied now if required.
18110 		 */
18111 RestartCopy:
18112 		if (!copy) {
18113 			if (src_entry->used_for_jit == TRUE) {
18114 				if (same_map) {
18115 				} else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
18116 					/*
18117 					 * Cannot allow an entry describing a JIT
18118 					 * region to be shared across address spaces.
18119 					 */
18120 					result = KERN_INVALID_ARGUMENT;
18121 					vm_object_deallocate(object);
18122 					vm_map_entry_dispose(new_entry);
18123 					new_entry = VM_MAP_ENTRY_NULL;
18124 					break;
18125 				}
18126 			}
18127 
18128 			src_entry->is_shared = TRUE;
18129 			new_entry->is_shared = TRUE;
18130 			if (!(new_entry->is_sub_map)) {
18131 				new_entry->needs_copy = FALSE;
18132 			}
18133 		} else if (src_entry->is_sub_map) {
18134 			/* make this a COW sub_map if not already */
18135 			assert(new_entry->wired_count == 0);
18136 			new_entry->needs_copy = TRUE;
18137 			object = VM_OBJECT_NULL;
18138 		} else if (src_entry->wired_count == 0 &&
18139 		    !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
18140 		    vm_object_copy_quickly(VME_OBJECT(new_entry),
18141 		    VME_OFFSET(new_entry),
18142 		    (new_entry->vme_end -
18143 		    new_entry->vme_start),
18144 		    &src_needs_copy,
18145 		    &new_entry_needs_copy)) {
18146 			new_entry->needs_copy = new_entry_needs_copy;
18147 			new_entry->is_shared = FALSE;
18148 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18149 
18150 			/*
18151 			 * Handle copy_on_write semantics.
18152 			 */
18153 			if (src_needs_copy && !src_entry->needs_copy) {
18154 				vm_prot_t prot;
18155 
18156 				if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18157 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18158 					    __FUNCTION__,
18159 					    map, map->pmap, src_entry,
18160 					    (uint64_t)src_entry->vme_start,
18161 					    (uint64_t)src_entry->vme_end,
18162 					    src_entry->protection);
18163 				}
18164 
18165 				prot = src_entry->protection & ~VM_PROT_WRITE;
18166 
18167 				if (override_nx(map,
18168 				    VME_ALIAS(src_entry))
18169 				    && prot) {
18170 					prot |= VM_PROT_EXECUTE;
18171 				}
18172 
18173 				if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18174 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18175 					    __FUNCTION__,
18176 					    map, map->pmap, src_entry,
18177 					    (uint64_t)src_entry->vme_start,
18178 					    (uint64_t)src_entry->vme_end,
18179 					    prot);
18180 				}
18181 
18182 				vm_object_pmap_protect(object,
18183 				    offset,
18184 				    entry_size,
18185 				    ((src_entry->is_shared
18186 				    || map->mapped_in_other_pmaps) ?
18187 				    PMAP_NULL : map->pmap),
18188 				    VM_MAP_PAGE_SIZE(map),
18189 				    src_entry->vme_start,
18190 				    prot);
18191 
18192 				assert(src_entry->wired_count == 0);
18193 				src_entry->needs_copy = TRUE;
18194 			}
18195 			/*
18196 			 * Throw away the old object reference of the new entry.
18197 			 */
18198 			vm_object_deallocate(object);
18199 		} else {
18200 			new_entry->is_shared = FALSE;
18201 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18202 
18203 			src_entry_was_wired = (src_entry->wired_count > 0);
18204 			saved_src_entry = src_entry;
18205 			src_entry = VM_MAP_ENTRY_NULL;
18206 
18207 			/*
18208 			 * The map can be safely unlocked since we
18209 			 * already hold a reference on the object.
18210 			 *
18211 			 * Record the timestamp of the map for later
18212 			 * verification, and unlock the map.
18213 			 */
18214 			version.main_timestamp = map->timestamp;
18215 			vm_map_unlock(map);     /* Increments timestamp once! */
18216 
18217 			/*
18218 			 * Perform the copy.
18219 			 */
18220 			if (src_entry_was_wired > 0 ||
18221 			    (debug4k_no_cow_copyin &&
18222 			    VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
18223 				vm_object_lock(object);
18224 				result = vm_object_copy_slowly(
18225 					object,
18226 					offset,
18227 					(new_entry->vme_end -
18228 					new_entry->vme_start),
18229 					THREAD_UNINT,
18230 					&new_copy_object);
18231 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18232 				saved_used_for_jit = new_entry->used_for_jit;
18233 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18234 				new_entry->used_for_jit = saved_used_for_jit;
18235 				VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
18236 				new_entry->needs_copy = FALSE;
18237 			} else {
18238 				vm_object_offset_t new_offset;
18239 
18240 				new_offset = VME_OFFSET(new_entry);
18241 				result = vm_object_copy_strategically(
18242 					object,
18243 					offset,
18244 					(new_entry->vme_end -
18245 					new_entry->vme_start),
18246 					false, /* forking */
18247 					&new_copy_object,
18248 					&new_offset,
18249 					&new_entry_needs_copy);
18250 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18251 				saved_used_for_jit = new_entry->used_for_jit;
18252 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18253 				new_entry->used_for_jit = saved_used_for_jit;
18254 				if (new_offset != VME_OFFSET(new_entry)) {
18255 					VME_OFFSET_SET(new_entry, new_offset);
18256 				}
18257 
18258 				new_entry->needs_copy = new_entry_needs_copy;
18259 			}
18260 
18261 			/*
18262 			 * Throw away the old object reference of the new entry.
18263 			 */
18264 			vm_object_deallocate(object);
18265 
18266 			if (result != KERN_SUCCESS &&
18267 			    result != KERN_MEMORY_RESTART_COPY) {
18268 				vm_map_entry_dispose(new_entry);
18269 				vm_map_lock(map);
18270 				break;
18271 			}
18272 
18273 			/*
18274 			 * Verify that the map has not substantially
18275 			 * changed while the copy was being made.
18276 			 */
18277 
18278 			vm_map_lock(map);
18279 			if (version.main_timestamp + 1 != map->timestamp) {
18280 				/*
18281 				 * Simple version comparison failed.
18282 				 *
18283 				 * Retry the lookup and verify that the
18284 				 * same object/offset are still present.
18285 				 */
18286 				saved_src_entry = VM_MAP_ENTRY_NULL;
18287 				vm_object_deallocate(VME_OBJECT(new_entry));
18288 				vm_map_entry_dispose(new_entry);
18289 				if (result == KERN_MEMORY_RESTART_COPY) {
18290 					result = KERN_SUCCESS;
18291 				}
18292 				continue;
18293 			}
18294 			/* map hasn't changed: src_entry is still valid */
18295 			src_entry = saved_src_entry;
18296 			saved_src_entry = VM_MAP_ENTRY_NULL;
18297 
18298 			if (result == KERN_MEMORY_RESTART_COPY) {
18299 				vm_object_reference(object);
18300 				goto RestartCopy;
18301 			}
18302 		}
18303 
18304 		_vm_map_store_entry_link(map_header,
18305 		    map_header->links.prev, new_entry);
18306 
18307 		/* protections for submap mapping are irrelevant here */
18308 		if (vm_remap_legacy && !src_entry->is_sub_map) {
18309 			*cur_protection &= src_entry->protection;
18310 			*max_protection &= src_entry->max_protection;
18311 		}
18312 
18313 		map_address += tmp_size;
18314 		mapped_size += tmp_size;
18315 		src_start += tmp_size;
18316 
18317 		if (vmk_flags.vmkf_copy_single_object) {
18318 			if (mapped_size != size) {
18319 				DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
18320 				    map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
18321 				if (src_entry->vme_next != vm_map_to_entry(map) &&
18322 				    src_entry->vme_next->vme_object_value ==
18323 				    src_entry->vme_object_value) {
18324 					/* XXX TODO4K */
18325 					DEBUG4K_ERROR("could have extended copy to next entry...\n");
18326 				}
18327 			}
18328 			break;
18329 		}
18330 	} /* end while */
18331 
18332 	vm_map_unlock(map);
18333 	if (result != KERN_SUCCESS) {
18334 		/*
18335 		 * Free all allocated elements.
18336 		 */
18337 		for (src_entry = map_header->links.next;
18338 		    src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
18339 		    src_entry = new_entry) {
18340 			new_entry = src_entry->vme_next;
18341 			_vm_map_store_entry_unlink(map_header, src_entry, false);
18342 			if (src_entry->is_sub_map) {
18343 				vm_map_deallocate(VME_SUBMAP(src_entry));
18344 			} else {
18345 				vm_object_deallocate(VME_OBJECT(src_entry));
18346 			}
18347 			vm_map_entry_dispose(src_entry);
18348 		}
18349 	}
18350 	return result;
18351 }
18352 
18353 bool
18354 vm_map_is_exotic(
18355 	vm_map_t map)
18356 {
18357 	return VM_MAP_IS_EXOTIC(map);
18358 }
18359 
18360 bool
18361 vm_map_is_alien(
18362 	vm_map_t map)
18363 {
18364 	return VM_MAP_IS_ALIEN(map);
18365 }
18366 
18367 #if XNU_TARGET_OS_OSX
18368 void
18369 vm_map_mark_alien(
18370 	vm_map_t map)
18371 {
18372 	vm_map_lock(map);
18373 	map->is_alien = true;
18374 	vm_map_unlock(map);
18375 }
18376 
18377 void
18378 vm_map_single_jit(
18379 	vm_map_t map)
18380 {
18381 	vm_map_lock(map);
18382 	map->single_jit = true;
18383 	vm_map_unlock(map);
18384 }
18385 #endif /* XNU_TARGET_OS_OSX */
18386 
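/*
 * The four entry points above are thin, lock-protected accessors: the
 * predicates read a map flag, and the setters take the map lock for
 * writing before flipping it.  Hypothetical caller (illustration only;
 * note that this section defines no way to clear either flag):
 *
 *	if (!vm_map_is_alien(map)) {
 *		vm_map_mark_alien(map);
 *	}
 */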
18387 
18388 /*
18389  * Callers of this function must call vm_map_copy_require() on a
18390  * previously created vm_map_copy_t, or pass a newly created one,
18391  * to ensure that it hasn't been forged.
18392  */
18393 static kern_return_t
18394 vm_map_copy_to_physcopy(
18395 	vm_map_copy_t   copy_map,
18396 	vm_map_t        target_map)
18397 {
18398 	vm_map_size_t           size;
18399 	vm_map_entry_t          entry;
18400 	vm_map_entry_t          new_entry;
18401 	vm_object_t             new_object;
18402 	unsigned int            pmap_flags;
18403 	pmap_t                  new_pmap;
18404 	vm_map_t                new_map;
18405 	vm_map_address_t        src_start, src_end, src_cur;
18406 	vm_map_address_t        dst_start, dst_end, dst_cur;
18407 	kern_return_t           kr;
18408 	void                    *kbuf;
18409 
18410 	/*
18411 	 * Perform the equivalent of vm_allocate() and memcpy().
18412 	 * Replace the mappings in "copy_map" with the newly allocated mapping.
18413 	 */
18414 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18415 
18416 	assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map));
18417 
18418 	/* create a new pmap to map "copy_map" */
18419 	pmap_flags = 0;
18420 	assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
18421 #if PMAP_CREATE_FORCE_4K_PAGES
18422 	pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
18423 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
18424 	pmap_flags |= PMAP_CREATE_64BIT;
18425 	new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
18426 	if (new_pmap == NULL) {
18427 		return KERN_RESOURCE_SHORTAGE;
18428 	}
18429 
18430 	/* allocate new VM object */
18431 	size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
18432 	new_object = vm_object_allocate(size);
18433 	assert(new_object);
18434 
18435 	/* allocate new VM map entry */
18436 	new_entry = vm_map_copy_entry_create(copy_map);
18437 	assert(new_entry);
18438 
18439 	/* finish initializing new VM map entry */
18440 	new_entry->protection = VM_PROT_DEFAULT;
18441 	new_entry->max_protection = VM_PROT_DEFAULT;
18442 	new_entry->use_pmap = TRUE;
18443 
18444 	/* make new VM map entry point to new VM object */
18445 	new_entry->vme_start = 0;
18446 	new_entry->vme_end = size;
18447 	VME_OBJECT_SET(new_entry, new_object, false, 0);
18448 	VME_OFFSET_SET(new_entry, 0);
18449 
18450 	/* create a new pageable VM map to map "copy_map" */
18451 	new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
18452 	    VM_MAP_CREATE_PAGEABLE);
18453 	assert(new_map);
18454 	vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
18455 
18456 	/* map "copy_map" in the new VM map */
18457 	src_start = 0;
18458 	kr = vm_map_copyout_internal(
18459 		new_map,
18460 		&src_start,
18461 		copy_map,
18462 		copy_map->size,
18463 		FALSE, /* consume_on_success */
18464 		VM_PROT_DEFAULT,
18465 		VM_PROT_DEFAULT,
18466 		VM_INHERIT_DEFAULT);
18467 	assert(kr == KERN_SUCCESS);
18468 	src_end = src_start + copy_map->size;
18469 
18470 	/* map "new_object" in the new VM map */
18471 	vm_object_reference(new_object);
18472 	dst_start = 0;
18473 	kr = vm_map_enter(new_map,
18474 	    &dst_start,
18475 	    size,
18476 	    0,               /* mask */
18477 	    VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
18478 	    new_object,
18479 	    0,               /* offset */
18480 	    FALSE,               /* needs copy */
18481 	    VM_PROT_DEFAULT,
18482 	    VM_PROT_DEFAULT,
18483 	    VM_INHERIT_DEFAULT);
18484 	assert(kr == KERN_SUCCESS);
18485 	dst_end = dst_start + size;
18486 
18487 	/* get a kernel buffer */
18488 	kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
18489 
18490 	/* physically copy "copy_map" mappings to new VM object */
18491 	for (src_cur = src_start, dst_cur = dst_start;
18492 	    src_cur < src_end;
18493 	    src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
18494 		vm_size_t bytes;
18495 
18496 		bytes = PAGE_SIZE;
18497 		if (src_cur + PAGE_SIZE > src_end) {
18498 			/* partial copy for last page */
18499 			bytes = src_end - src_cur;
18500 			assert(bytes > 0 && bytes < PAGE_SIZE);
18501 			/* rest of dst page should be zero-filled */
18502 		}
18503 		/* get bytes from src mapping */
18504 		kr = copyinmap(new_map, src_cur, kbuf, bytes);
18505 		if (kr != KERN_SUCCESS) {
18506 			DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
18507 		}
18508 		/* put bytes in dst mapping */
18509 		assert(dst_cur < dst_end);
18510 		assert(dst_cur + bytes <= dst_end);
18511 		kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
18512 		if (kr != KERN_SUCCESS) {
18513 			DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
18514 		}
18515 	}
18516 
18517 	/* free kernel buffer */
18518 	kfree_data(kbuf, PAGE_SIZE);
18519 
18520 	/* destroy new map */
18521 	vm_map_destroy(new_map);
18522 	new_map = VM_MAP_NULL;
18523 
18524 	/* dispose of the old map entries in "copy_map" */
18525 	while (vm_map_copy_first_entry(copy_map) !=
18526 	    vm_map_copy_to_entry(copy_map)) {
18527 		entry = vm_map_copy_first_entry(copy_map);
18528 		vm_map_copy_entry_unlink(copy_map, entry);
18529 		if (entry->is_sub_map) {
18530 			vm_map_deallocate(VME_SUBMAP(entry));
18531 		} else {
18532 			vm_object_deallocate(VME_OBJECT(entry));
18533 		}
18534 		vm_map_copy_entry_dispose(entry);
18535 	}
18536 
18537 	/* change "copy_map"'s page_size to match "target_map" */
18538 	copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18539 	copy_map->offset = 0;
18540 	copy_map->size = size;
18541 
18542 	/* insert new map entry in "copy_map" */
18543 	assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
18544 	vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
18545 
18546 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18547 	return KERN_SUCCESS;
18548 }
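
/*
 * The copy loop above is the classic bounce-buffer idiom: source and
 * destination both live in "new_map", so each page is staged through a
 * temporary kernel buffer with copyinmap()/copyoutmap().  Reduced sketch
 * of the idiom (hypothetical, assuming the setup done above):
 *
 *	for (src_cur = src_start, dst_cur = dst_start;
 *	    src_cur < src_end;
 *	    src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
 *		vm_size_t n = MIN((vm_size_t)PAGE_SIZE,
 *		    (vm_size_t)(src_end - src_cur));
 *		copyinmap(new_map, src_cur, kbuf, n);	// map -> buffer
 *		copyoutmap(new_map, kbuf, dst_cur, n);	// buffer -> map
 *	}
 */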
18549 
18550 void
18551 vm_map_copy_adjust_get_target_copy_map(
18552 	vm_map_copy_t   copy_map,
18553 	vm_map_copy_t   *target_copy_map_p);
18554 void
18555 vm_map_copy_adjust_get_target_copy_map(
18556 	vm_map_copy_t   copy_map,
18557 	vm_map_copy_t   *target_copy_map_p)
18558 {
18559 	vm_map_copy_t   target_copy_map;
18560 	vm_map_entry_t  entry, target_entry;
18561 
18562 	if (*target_copy_map_p != VM_MAP_COPY_NULL) {
18563 		/* the caller already has a "target_copy_map": use it */
18564 		return;
18565 	}
18566 
18567 	/* the caller wants us to create a new copy of "copy_map" */
18568 	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18569 	target_copy_map = vm_map_copy_allocate(copy_map->type);
18570 	target_copy_map->offset = copy_map->offset;
18571 	target_copy_map->size = copy_map->size;
18572 	target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
18573 	for (entry = vm_map_copy_first_entry(copy_map);
18574 	    entry != vm_map_copy_to_entry(copy_map);
18575 	    entry = entry->vme_next) {
18576 		target_entry = vm_map_copy_entry_create(target_copy_map);
18577 		vm_map_entry_copy_full(target_entry, entry);
18578 		if (target_entry->is_sub_map) {
18579 			vm_map_reference(VME_SUBMAP(target_entry));
18580 		} else {
18581 			vm_object_reference(VME_OBJECT(target_entry));
18582 		}
18583 		vm_map_copy_entry_link(
18584 			target_copy_map,
18585 			vm_map_copy_last_entry(target_copy_map),
18586 			target_entry);
18587 	}
18588 	entry = VM_MAP_ENTRY_NULL;
18589 	*target_copy_map_p = target_copy_map;
18590 }
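
/*
 * Calling convention sketch (hypothetical caller): initialize the in/out
 * pointer to VM_MAP_COPY_NULL to request a private deep copy; any later
 * call with the same pointer is a no-op, so the copy is made at most
 * once per adjustment pass.
 *
 *	vm_map_copy_t target = VM_MAP_COPY_NULL;
 *	vm_map_copy_adjust_get_target_copy_map(copy_map, &target);
 *	// "target" now owns its own object/submap references and can be
 *	// clipped or trimmed without touching the original "copy_map".
 */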
18591 
18592 /*
18593  * Callers of this function must call vm_map_copy_require() on a
18594  * previously created vm_map_copy_t, or pass a newly created one,
18595  * to ensure that it hasn't been forged.
18596  */
18597 static void
18598 vm_map_copy_trim(
18599 	vm_map_copy_t   copy_map,
18600 	uint16_t        new_page_shift,
18601 	vm_map_offset_t trim_start,
18602 	vm_map_offset_t trim_end)
18603 {
18604 	uint16_t        copy_page_shift;
18605 	vm_map_entry_t  entry, next_entry;
18606 
18607 	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18608 	assert(copy_map->cpy_hdr.nentries > 0);
18609 
18610 	trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
18611 	trim_end += vm_map_copy_first_entry(copy_map)->vme_start;
18612 
18613 	/* use the new page_shift to do the clipping */
18614 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18615 	copy_map->cpy_hdr.page_shift = new_page_shift;
18616 
18617 	for (entry = vm_map_copy_first_entry(copy_map);
18618 	    entry != vm_map_copy_to_entry(copy_map);
18619 	    entry = next_entry) {
18620 		next_entry = entry->vme_next;
18621 		if (entry->vme_end <= trim_start) {
18622 			/* entry fully before trim range: skip */
18623 			continue;
18624 		}
18625 		if (entry->vme_start >= trim_end) {
18626 			/* entry fully after trim range: done */
18627 			break;
18628 		}
18629 		/* clip entry if needed */
18630 		vm_map_copy_clip_start(copy_map, entry, trim_start);
18631 		vm_map_copy_clip_end(copy_map, entry, trim_end);
18632 		/* dispose of entry */
18633 		copy_map->size -= entry->vme_end - entry->vme_start;
18634 		vm_map_copy_entry_unlink(copy_map, entry);
18635 		if (entry->is_sub_map) {
18636 			vm_map_deallocate(VME_SUBMAP(entry));
18637 		} else {
18638 			vm_object_deallocate(VME_OBJECT(entry));
18639 		}
18640 		vm_map_copy_entry_dispose(entry);
18641 		entry = VM_MAP_ENTRY_NULL;
18642 	}
18643 
18644 	/* restore copy_map's original page_shift */
18645 	copy_map->cpy_hdr.page_shift = copy_page_shift;
18646 }
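
/*
 * Why the page_shift save/override/restore above: the clip macros align
 * to the copy map's *current* page size, so trimming at target-map
 * granularity means temporarily adopting the target's shift.  Pattern
 * sketch (hypothetical, "hdr" standing in for &copy_map->cpy_hdr):
 *
 *	uint16_t old_shift = hdr->page_shift;
 *	hdr->page_shift = new_page_shift;	// clip at target granularity
 *	...clip and unlink the trimmed entries...
 *	hdr->page_shift = old_shift;		// restore for the caller
 */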
18647 
18648 /*
18649  * Make any necessary adjustments to "copy_map" to allow it to be
18650  * mapped into "target_map".
18651  * If no changes were necessary, "target_copy_map" points to the
18652  * untouched "copy_map".
18653  * If changes are necessary, changes will be made to "target_copy_map".
18654  * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
18655  * copy the original "copy_map" to it before applying the changes.
18656  * The caller should discard "target_copy_map" if it's not the same as
18657  * the original "copy_map".
18658  */
18659 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
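/*
 * Worked example of the overmap arithmetic below (values assumed purely
 * for illustration): remapping a 4K copy map (page shift 12) into a 16K
 * target map (page mask 0x3fff).  If the first entry's object offset is
 * 0x7000:
 *
 *	overmap_start = 0x7000 - trunc_page_mask_64(0x7000, 0x3fff)
 *	              = 0x7000 - 0x4000 = 0x3000
 *
 * so the mapping is grown by 0x3000 bytes at the front to start on a 16K
 * boundary; a misaligned end is likewise rounded up via "overmap_end".
 */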
18660 kern_return_t
18661 vm_map_copy_adjust_to_target(
18662 	vm_map_copy_t           src_copy_map,
18663 	vm_map_offset_t         offset,
18664 	vm_map_size_t           size,
18665 	vm_map_t                target_map,
18666 	boolean_t               copy,
18667 	vm_map_copy_t           *target_copy_map_p,
18668 	vm_map_offset_t         *overmap_start_p,
18669 	vm_map_offset_t         *overmap_end_p,
18670 	vm_map_offset_t         *trimmed_start_p)
18671 {
18672 	vm_map_copy_t           copy_map, target_copy_map;
18673 	vm_map_size_t           target_size;
18674 	vm_map_size_t           src_copy_map_size;
18675 	vm_map_size_t           overmap_start, overmap_end;
18676 	int                     misalignments;
18677 	vm_map_entry_t          entry, target_entry;
18678 	vm_map_offset_t         addr_adjustment;
18679 	vm_map_offset_t         new_start, new_end;
18680 	int                     copy_page_mask, target_page_mask;
18681 	uint16_t                copy_page_shift, target_page_shift;
18682 	vm_map_offset_t         trimmed_end;
18683 
18684 	/*
18685 	 * Assert that the vm_map_copy is coming from the right
18686 	 * zone and hasn't been forged
18687 	 */
18688 	vm_map_copy_require(src_copy_map);
18689 	assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18690 
18691 	/*
18692 	 * Start working with "src_copy_map" but we'll switch
18693 	 * to "target_copy_map" as soon as we start making adjustments.
18694 	 */
18695 	copy_map = src_copy_map;
18696 	src_copy_map_size = src_copy_map->size;
18697 
18698 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18699 	copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
18700 	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18701 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
18702 
18703 	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, *target_copy_map_p);
18704 
18705 	target_copy_map = *target_copy_map_p;
18706 	if (target_copy_map != VM_MAP_COPY_NULL) {
18707 		vm_map_copy_require(target_copy_map);
18708 	}
18709 
18710 	if (offset + size > copy_map->size) {
18711 		DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)offset, (uint64_t)size);
18712 		return KERN_INVALID_ARGUMENT;
18713 	}
18714 
18715 	/* trim the end */
18716 	trimmed_end = 0;
18717 	new_end = VM_MAP_ROUND_PAGE(offset + size, target_page_mask);
18718 	if (new_end < copy_map->size) {
18719 		trimmed_end = src_copy_map_size - new_end;
18720 		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
18721 		/* get "target_copy_map" if needed and adjust it */
18722 		vm_map_copy_adjust_get_target_copy_map(copy_map,
18723 		    &target_copy_map);
18724 		copy_map = target_copy_map;
18725 		vm_map_copy_trim(target_copy_map, target_page_shift,
18726 		    new_end, copy_map->size);
18727 	}
18728 
18729 	/* trim the start */
18730 	new_start = VM_MAP_TRUNC_PAGE(offset, target_page_mask);
18731 	if (new_start != 0) {
18732 		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)0, (uint64_t)new_start);
18733 		/* get "target_copy_map" if needed and adjust it */
18734 		vm_map_copy_adjust_get_target_copy_map(copy_map,
18735 		    &target_copy_map);
18736 		copy_map = target_copy_map;
18737 		vm_map_copy_trim(target_copy_map, target_page_shift,
18738 		    0, new_start);
18739 	}
18740 	*trimmed_start_p = new_start;
18741 
18742 	/* target_size starts with what's left after trimming */
18743 	target_size = copy_map->size;
18744 	assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
18745 	    "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
18746 	    (uint64_t)target_size, (uint64_t)src_copy_map_size,
18747 	    (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);
18748 
18749 	/* check for misalignments but don't adjust yet */
18750 	misalignments = 0;
18751 	overmap_start = 0;
18752 	overmap_end = 0;
18753 	if (copy_page_shift < target_page_shift) {
18754 		/*
18755 		 * Remapping from 4K to 16K: check the VM object alignments
18756 		 * throughout the range.
18757 		 * If the start and end of the range are mis-aligned, we can
18758 		 * over-map to re-align, and adjust the "overmap" start/end
18759 		 * and "target_size" of the range accordingly.
18760 		 * If there is any mis-alignment within the range:
18761 		 *     if "copy":
18762 		 *         we can do immediate-copy instead of copy-on-write,
18763 		 *     else:
18764 		 *         no way to remap and share; fail.
18765 		 */
18766 		for (entry = vm_map_copy_first_entry(copy_map);
18767 		    entry != vm_map_copy_to_entry(copy_map);
18768 		    entry = entry->vme_next) {
18769 			vm_object_offset_t object_offset_start, object_offset_end;
18770 
18771 			object_offset_start = VME_OFFSET(entry);
18772 			object_offset_end = object_offset_start;
18773 			object_offset_end += entry->vme_end - entry->vme_start;
18774 			if (object_offset_start & target_page_mask) {
18775 				if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
18776 					overmap_start++;
18777 				} else {
18778 					misalignments++;
18779 				}
18780 			}
18781 			if (object_offset_end & target_page_mask) {
18782 				if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
18783 					overmap_end++;
18784 				} else {
18785 					misalignments++;
18786 				}
18787 			}
18788 		}
18789 	}
18790 	entry = VM_MAP_ENTRY_NULL;
18791 
18792 	/* decide how to deal with misalignments */
18793 	assert(overmap_start <= 1);
18794 	assert(overmap_end <= 1);
18795 	if (!overmap_start && !overmap_end && !misalignments) {
18796 		/* copy_map is properly aligned for target_map ... */
18797 		if (*trimmed_start_p) {
18798 			/* ... but we trimmed it, so still need to adjust */
18799 		} else {
18800 			/* ... and we didn't trim anything: we're done */
18801 			if (target_copy_map == VM_MAP_COPY_NULL) {
18802 				target_copy_map = copy_map;
18803 			}
18804 			*target_copy_map_p = target_copy_map;
18805 			*overmap_start_p = 0;
18806 			*overmap_end_p = 0;
18807 			DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18808 			return KERN_SUCCESS;
18809 		}
18810 	} else if (misalignments && !copy) {
18811 		/* can't "share" if misaligned */
18812 		DEBUG4K_ADJUST("unsupported sharing\n");
18813 #if MACH_ASSERT
18814 		if (debug4k_panic_on_misaligned_sharing) {
18815 			panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
18816 		}
18817 #endif /* MACH_ASSERT */
18818 		DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
18819 		return KERN_NOT_SUPPORTED;
18820 	} else {
18821 		/* can't virtual-copy if misaligned (but can physical-copy) */
18822 		DEBUG4K_ADJUST("mis-aligned copying\n");
18823 	}
18824 
18825 	/* get a "target_copy_map" if needed and switch to it */
18826 	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
18827 	copy_map = target_copy_map;
18828 
18829 	if (misalignments && copy) {
18830 		vm_map_size_t target_copy_map_size;
18831 
18832 		/*
18833 		 * Can't do copy-on-write with misaligned mappings.
18834 		 * Replace the mappings with a physical copy of the original
18835 		 * mappings' contents.
18836 		 */
18837 		target_copy_map_size = target_copy_map->size;
18838 		kern_return_t kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
18839 		if (kr != KERN_SUCCESS) {
18840 			return kr;
18841 		}
18842 		*target_copy_map_p = target_copy_map;
18843 		*overmap_start_p = 0;
18844 		*overmap_end_p = target_copy_map->size - target_copy_map_size;
18845 		DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18846 		return KERN_SUCCESS;
18847 	}
18848 
18849 	/* apply the adjustments */
18850 	misalignments = 0;
18851 	overmap_start = 0;
18852 	overmap_end = 0;
18853 	/* remove copy_map->offset, so that everything starts at offset 0 */
18854 	addr_adjustment = copy_map->offset;
18855 	/* also remove whatever we trimmed from the start */
18856 	addr_adjustment += *trimmed_start_p;
18857 	for (target_entry = vm_map_copy_first_entry(target_copy_map);
18858 	    target_entry != vm_map_copy_to_entry(target_copy_map);
18859 	    target_entry = target_entry->vme_next) {
18860 		vm_object_offset_t object_offset_start, object_offset_end;
18861 
18862 		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18863 		object_offset_start = VME_OFFSET(target_entry);
18864 		if (object_offset_start & target_page_mask) {
18865 			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18866 			if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
18867 				/*
18868 				 * start of 1st entry is mis-aligned:
18869 				 * re-adjust by over-mapping.
18870 				 */
18871 				overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
18872 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
18873 				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
18874 			} else {
18875 				misalignments++;
18876 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
18877 				assert(copy);
18878 			}
18879 		}
18880 
18881 		if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
18882 			target_size += overmap_start;
18883 		} else {
18884 			target_entry->vme_start += overmap_start;
18885 		}
18886 		target_entry->vme_end += overmap_start;
18887 
18888 		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
18889 		if (object_offset_end & target_page_mask) {
18890 			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18891 			if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
18892 				/*
18893 				 * end of last entry is mis-aligned: re-adjust by over-mapping.
18894 				 */
18895 				overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
18896 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
18897 				target_entry->vme_end += overmap_end;
18898 				target_size += overmap_end;
18899 			} else {
18900 				misalignments++;
18901 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
18902 				assert(copy);
18903 			}
18904 		}
18905 		target_entry->vme_start -= addr_adjustment;
18906 		target_entry->vme_end -= addr_adjustment;
18907 		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18908 	}
18909 
18910 	target_copy_map->size = target_size;
18911 	target_copy_map->offset += overmap_start;
18912 	target_copy_map->offset -= addr_adjustment;
18913 	target_copy_map->cpy_hdr.page_shift = target_page_shift;
18914 
18915 //	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
18916 //	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
18917 	assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
18918 	assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));
18919 
18920 	*target_copy_map_p = target_copy_map;
18921 	*overmap_start_p = overmap_start;
18922 	*overmap_end_p = overmap_end;
18923 
18924 	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18925 	return KERN_SUCCESS;
18926 }
18927 
18928 kern_return_t
18929 vm_map_range_physical_size(
18930 	vm_map_t         map,
18931 	vm_map_address_t start,
18932 	mach_vm_size_t   size,
18933 	mach_vm_size_t * phys_size)
18934 {
18935 	kern_return_t   kr;
18936 	vm_map_copy_t   copy_map, target_copy_map;
18937 	vm_map_offset_t adjusted_start, adjusted_end;
18938 	vm_map_size_t   adjusted_size;
18939 	vm_prot_t       cur_prot, max_prot;
18940 	vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
18941 	vm_map_kernel_flags_t vmk_flags;
18942 
18943 	if (size == 0) {
18944 		DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
18945 		*phys_size = 0;
18946 		return KERN_SUCCESS;
18947 	}
18948 
18949 	adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
18950 	adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
18951 	if (__improbable(os_add_overflow(start, size, &end) ||
18952 	    adjusted_end <= adjusted_start)) {
18953 		/* wraparound */
18954 		printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
18955 		*phys_size = 0;
18956 		return KERN_INVALID_ARGUMENT;
18957 	}
18958 	if (__improbable(vm_map_range_overflows(map, start, size))) {
18959 		*phys_size = 0;
18960 		return KERN_INVALID_ADDRESS;
18961 	}
18962 	assert(adjusted_end > adjusted_start);
18963 	adjusted_size = adjusted_end - adjusted_start;
18964 	*phys_size = adjusted_size;
18965 	if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
18966 		return KERN_SUCCESS;
18967 	}
18968 	if (start == 0) {
18969 		adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
18970 		adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
18971 		if (__improbable(adjusted_end <= adjusted_start)) {
18972 			/* wraparound */
18973 			printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
18974 			*phys_size = 0;
18975 			return KERN_INVALID_ARGUMENT;
18976 		}
18977 		assert(adjusted_end > adjusted_start);
18978 		adjusted_size = adjusted_end - adjusted_start;
18979 		*phys_size = adjusted_size;
18980 		return KERN_SUCCESS;
18981 	}
18982 
18983 	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
18984 	vmk_flags.vmkf_copy_pageable = TRUE;
18985 	vmk_flags.vmkf_copy_same_map = TRUE;
18986 	assert(adjusted_size != 0);
18987 	cur_prot = VM_PROT_NONE; /* legacy mode */
18988 	max_prot = VM_PROT_NONE; /* legacy mode */
18989 	kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
18990 	    FALSE /* copy */,
18991 	    &copy_map,
18992 	    &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
18993 	    vmk_flags);
18994 	if (kr != KERN_SUCCESS) {
18995 		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
18996 		//assert(0);
18997 		*phys_size = 0;
18998 		return kr;
18999 	}
19000 	assert(copy_map != VM_MAP_COPY_NULL);
19001 	target_copy_map = copy_map;
19002 	DEBUG4K_ADJUST("adjusting...\n");
19003 	kr = vm_map_copy_adjust_to_target(
19004 		copy_map,
19005 		start - adjusted_start, /* offset */
19006 		size, /* size */
19007 		kernel_map,
19008 		FALSE,                          /* copy */
19009 		&target_copy_map,
19010 		&overmap_start,
19011 		&overmap_end,
19012 		&trimmed_start);
19013 	if (kr == KERN_SUCCESS) {
19014 		if (target_copy_map->size != *phys_size) {
19015 			DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
19016 		}
19017 		*phys_size = target_copy_map->size;
19018 	} else {
19019 		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19020 		//assert(0);
19021 		*phys_size = 0;
19022 	}
19023 	vm_map_copy_discard(copy_map);
19024 	copy_map = VM_MAP_COPY_NULL;
19025 
19026 	return kr;
19027 }
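
/*
 * Hypothetical usage sketch (illustration only): size a range of a 4K
 * map in terms of the kernel's native (possibly 16K) page size.
 *
 *	mach_vm_size_t phys_size;
 *	kern_return_t kr;
 *
 *	kr = vm_map_range_physical_size(map, addr, len, &phys_size);
 *	if (kr == KERN_SUCCESS) {
 *		// phys_size covers "len" bytes at "addr", rounded out
 *		// to whole pages of the native page size.
 *	}
 */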
19028 
19029 
19030 kern_return_t
19031 memory_entry_check_for_adjustment(
19032 	vm_map_t                        src_map,
19033 	ipc_port_t                      port,
19034 	vm_map_offset_t         *overmap_start,
19035 	vm_map_offset_t         *overmap_end)
19036 {
19037 	kern_return_t kr = KERN_SUCCESS;
19038 	vm_map_copy_t copy_map = VM_MAP_COPY_NULL, target_copy_map = VM_MAP_COPY_NULL;
19039 
19040 	assert(port);
19041 	assertf(ip_kotype(port) == IKOT_NAMED_ENTRY, "Port Type expected: %d...received:%d\n", IKOT_NAMED_ENTRY, ip_kotype(port));
19042 
19043 	vm_named_entry_t        named_entry;
19044 
19045 	named_entry = mach_memory_entry_from_port(port);
19046 	named_entry_lock(named_entry);
19047 	copy_map = named_entry->backing.copy;
19048 	target_copy_map = copy_map;
19049 
19050 	if (src_map && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT) {
19051 		vm_map_offset_t trimmed_start;
19052 
19053 		trimmed_start = 0;
19054 		DEBUG4K_ADJUST("adjusting...\n");
19055 		kr = vm_map_copy_adjust_to_target(
19056 			copy_map,
19057 			0, /* offset */
19058 			copy_map->size, /* size */
19059 			src_map,
19060 			FALSE, /* copy */
19061 			&target_copy_map,
19062 			overmap_start,
19063 			overmap_end,
19064 			&trimmed_start);
19065 		assert(trimmed_start == 0);
19066 	}
19067 	named_entry_unlock(named_entry);
19068 
19069 	return kr;
19070 }
19071 
19072 
19073 /*
19074  *	Routine:	vm_remap
19075  *
19076  *			Map a portion of a task's address space.
19077  *			The mapped region must not overlap more than
19078  *			one VM memory object. Protection and
19079  *			inheritance attributes remain the same as in
19080  *			the original task and are returned as out parameters.
19081  *			Source and target task can be identical.
19082  *			Other attributes are identical to those of vm_map().
19083  */
19084 kern_return_t
19085 vm_map_remap(
19086 	vm_map_t                target_map,
19087 	vm_map_address_t        *address,
19088 	vm_map_size_t           size,
19089 	vm_map_offset_t         mask,
19090 	vm_map_kernel_flags_t   vmk_flags,
19091 	vm_map_t                src_map,
19092 	vm_map_offset_t         memory_address,
19093 	boolean_t               copy,
19094 	vm_prot_t               *cur_protection, /* IN/OUT */
19095 	vm_prot_t               *max_protection, /* IN/OUT */
19096 	vm_inherit_t            inheritance)
19097 {
19098 	kern_return_t           result;
19099 	vm_map_entry_t          entry;
19100 	vm_map_entry_t          insp_entry = VM_MAP_ENTRY_NULL;
19101 	vm_map_entry_t          new_entry;
19102 	vm_map_copy_t           copy_map;
19103 	vm_map_offset_t         offset_in_mapping;
19104 	vm_map_size_t           target_size = 0;
19105 	vm_map_size_t           src_page_mask, target_page_mask;
19106 	vm_map_offset_t         overmap_start, overmap_end, trimmed_start;
19107 	vm_map_offset_t         initial_memory_address;
19108 	vm_map_size_t           initial_size;
19109 	VM_MAP_ZAP_DECLARE(zap_list);
19110 
19111 	if (target_map == VM_MAP_NULL) {
19112 		return KERN_INVALID_ARGUMENT;
19113 	}
19114 
19115 	if (__improbable(vm_map_range_overflows(src_map, memory_address, size))) {
19116 		return KERN_INVALID_ARGUMENT;
19117 	}
19118 
19119 	if (__improbable((*cur_protection & *max_protection) != *cur_protection)) {
19120 		/* cur is more permissive than max */
19121 		return KERN_INVALID_ARGUMENT;
19122 	}
19123 
19124 	initial_memory_address = memory_address;
19125 	initial_size = size;
19126 	src_page_mask = VM_MAP_PAGE_MASK(src_map);
19127 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
19128 
19129 	switch (inheritance) {
19130 	case VM_INHERIT_NONE:
19131 	case VM_INHERIT_COPY:
19132 	case VM_INHERIT_SHARE:
19133 		if (size != 0 && src_map != VM_MAP_NULL) {
19134 			break;
19135 		}
19136 		OS_FALLTHROUGH;
19137 	default:
19138 		return KERN_INVALID_ARGUMENT;
19139 	}
19140 
19141 	if (src_page_mask != target_page_mask) {
19142 		if (copy) {
19143 			DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19144 		} else {
19145 			DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19146 		}
19147 	}
19148 
19149 	/*
19150 	 * If the user is requesting that we return the address of the
19151 	 * first byte of the data (rather than the base of the page),
19152 	 * then we use different rounding semantics: specifically,
19153 	 * we assume that (memory_address, size) describes a region
19154 	 * all of whose pages we must cover, rather than a base to be truncated
19155 	 * down and a size to be added to that base.  So we figure out
19156 	 * the highest page that the requested region includes and make
19157 	 * sure that the size will cover it.
19158 	 *
19159 	 * The key example we're worried about it is of the form:
19160 	 * The key example we're worried about is of the form:
19161 	 *              memory_address = 0x1ff0, size = 0x20
19162 	 *
19163 	 * With the old semantics, we round down the memory_address to 0x1000
19164 	 * and round up the size to 0x1000, resulting in our covering *only*
19165 	 * page 0x1000.  With the new semantics, we'd realize that the region covers
19166 	 * 0x1ff0-0x2010, and compute a size of 0x2000.  Thus, we cover both page
19167 	 * 0x1000 and page 0x2000 in the region we remap.
19168 	 */
19169 	if (vmk_flags.vmf_return_data_addr) {
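	/*
	 * The example above in code form, assuming 4K pages (this is the
	 * arithmetic performed just below):
	 *
	 *	range_start = trunc(0x1ff0, 0xfff)        = 0x1000
	 *	range_end   = round(0x1ff0 + 0x20, 0xfff) = 0x3000
	 *	size        = range_end - range_start     = 0x2000
	 *	offset_in_mapping = 0x1ff0 - 0x1000       = 0xff0
	 */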
19170 		vm_map_offset_t range_start, range_end;
19171 
19172 		range_start = vm_map_trunc_page(memory_address, src_page_mask);
19173 		range_end = vm_map_round_page(memory_address + size, src_page_mask);
19174 		memory_address = range_start;
19175 		size = range_end - range_start;
19176 		offset_in_mapping = initial_memory_address - memory_address;
19177 	} else {
19178 		/*
19179 		 * IMPORTANT:
19180 		 * This legacy code path is broken: for the range mentioned
19181 		 * above [ memory_address = 0x1ff0, size = 0x20 ], which spans
19182 		 * two 4k pages, it yields [ memory_address = 0x1000,
19183 		 * size = 0x1000 ], which covers only the first 4k page.
19184 		 * BUT some code unfortunately depends on this bug, so we
19185 		 * can't fix it without breaking something.
19186 		 * New code is automatically opted into the new
19187 		 * behavior by the new VM_FLAGS_RETURN_DATA_ADDR flag.
19188 		 */
19189 		offset_in_mapping = 0;
19190 		memory_address = vm_map_trunc_page(memory_address, src_page_mask);
19191 		size = vm_map_round_page(size, src_page_mask);
19192 		initial_memory_address = memory_address;
19193 		initial_size = size;
19194 	}
19195 
19196 
19197 	if (size == 0) {
19198 		return KERN_INVALID_ARGUMENT;
19199 	}
19200 
19201 	if (vmk_flags.vmf_resilient_media) {
19202 		/* must be copy-on-write to be "media resilient" */
19203 		if (!copy) {
19204 			return KERN_INVALID_ARGUMENT;
19205 		}
19206 	}
19207 
19208 	vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
19209 	vmk_flags.vmkf_copy_same_map = (src_map == target_map);
19210 
19211 	assert(size != 0);
19212 	result = vm_map_copy_extract(src_map,
19213 	    memory_address,
19214 	    size,
19215 	    copy, &copy_map,
19216 	    cur_protection, /* IN/OUT */
19217 	    max_protection, /* IN/OUT */
19218 	    inheritance,
19219 	    vmk_flags);
19220 	if (result != KERN_SUCCESS) {
19221 		return result;
19222 	}
19223 	assert(copy_map != VM_MAP_COPY_NULL);
19224 
19225 	/*
19226 	 * Handle the policy for VM map ranges.
19227 	 *
19228 	 * If the maps differ, the target_map policy applies as for vm_map().
19229 	 * For same-map remaps, we preserve the range.
19230 	 */
19231 	if (vmk_flags.vmkf_copy_same_map) {
19232 		vmk_flags.vmkf_range_id = copy_map->orig_range;
19233 	} else {
19234 		vm_map_kernel_flags_update_range_id(&vmk_flags, target_map);
19235 	}
19236 
19237 	overmap_start = 0;
19238 	overmap_end = 0;
19239 	trimmed_start = 0;
19240 	target_size = size;
19241 	if (src_page_mask != target_page_mask) {
19242 		vm_map_copy_t target_copy_map;
19243 
19244 		target_copy_map = copy_map; /* can modify "copy_map" itself */
19245 		DEBUG4K_ADJUST("adjusting...\n");
19246 		result = vm_map_copy_adjust_to_target(
19247 			copy_map,
19248 			offset_in_mapping, /* offset */
19249 			initial_size,
19250 			target_map,
19251 			copy,
19252 			&target_copy_map,
19253 			&overmap_start,
19254 			&overmap_end,
19255 			&trimmed_start);
19256 		if (result != KERN_SUCCESS) {
19257 			DEBUG4K_COPY("failed to adjust 0x%x\n", result);
19258 			vm_map_copy_discard(copy_map);
19259 			return result;
19260 		}
19261 		if (trimmed_start == 0) {
19262 			/* nothing trimmed: no adjustment needed */
19263 		} else if (trimmed_start >= offset_in_mapping) {
19264 			/* trimmed more than offset_in_mapping: nothing left */
19265 			assert(overmap_start == 0);
19266 			assert(overmap_end == 0);
19267 			offset_in_mapping = 0;
19268 		} else {
19269 			/* trimmed some of offset_in_mapping: adjust */
19270 			assert(overmap_start == 0);
19271 			assert(overmap_end == 0);
19272 			offset_in_mapping -= trimmed_start;
19273 		}
19274 		offset_in_mapping += overmap_start;
19275 		target_size = target_copy_map->size;
19276 	}
19277 
19278 	/*
19279 	 * Allocate/check a range of free virtual address
19280 	 * space for the target
19281 	 */
19282 	*address = vm_map_trunc_page(*address, target_page_mask);
19283 	vm_map_lock(target_map);
19284 	target_size = vm_map_round_page(target_size, target_page_mask);
19285 	result = vm_map_remap_range_allocate(target_map, address,
19286 	    target_size, mask, vmk_flags,
19287 	    &insp_entry, &zap_list);
19288 
19289 	for (entry = vm_map_copy_first_entry(copy_map);
19290 	    entry != vm_map_copy_to_entry(copy_map);
19291 	    entry = new_entry) {
19292 		new_entry = entry->vme_next;
19293 		vm_map_copy_entry_unlink(copy_map, entry);
19294 		if (result == KERN_SUCCESS) {
19295 			if (vmk_flags.vmkf_remap_prot_copy) {
19296 				/*
19297 				 * This vm_map_remap() is for a
19298 				 * vm_protect(VM_PROT_COPY), so the caller
19299 				 * expects to be allowed to add write access
19300 				 * to this new mapping.  This is done by
19301 				 * adding VM_PROT_WRITE to each entry's
19302 				 * max_protection... unless some security
19303 				 * settings disallow it.
19304 				 */
19305 				bool allow_write = false;
19306 				if (entry->vme_permanent) {
19307 					/* immutable mapping... */
19308 					if ((entry->max_protection & VM_PROT_EXECUTE) &&
19309 					    developer_mode_state()) {
19310 						/*
19311 						 * ... but executable and
19312 						 * possibly being debugged,
19313 						 * so let's allow it to become
19314 						 * writable, for breakpoints
19315 						 * and dtrace probes, for
19316 						 * example.
19317 						 */
19318 						allow_write = true;
19319 					} else {
19320 						printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
19321 						    proc_selfpid(),
19322 						    (get_bsdtask_info(current_task())
19323 						    ? proc_name_address(get_bsdtask_info(current_task()))
19324 						    : "?"),
19325 						    (uint64_t)memory_address,
19326 						    (uint64_t)size,
19327 						    entry->protection,
19328 						    entry->max_protection,
19329 						    developer_mode_state());
19330 						DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
19331 						    vm_map_entry_t, entry,
19332 						    vm_map_offset_t, entry->vme_start,
19333 						    vm_map_offset_t, entry->vme_end,
19334 						    vm_prot_t, entry->protection,
19335 						    vm_prot_t, entry->max_protection,
19336 						    int, VME_ALIAS(entry));
19337 					}
19338 				} else {
19339 					allow_write = true;
19340 				}
19341 
19342 				/*
19343 				 * VM_PROT_COPY: allow this mapping to become
19344 				 * writable, unless it was "permanent".
19345 				 */
19346 				if (allow_write) {
19347 					entry->max_protection |= VM_PROT_WRITE;
19348 				}
19349 			}
19350 			if (vmk_flags.vmf_resilient_codesign) {
19351 				/* no codesigning -> read-only access */
19352 				entry->max_protection = VM_PROT_READ;
19353 				entry->protection = VM_PROT_READ;
19354 				entry->vme_resilient_codesign = TRUE;
19355 			}
19356 			entry->vme_start += *address;
19357 			entry->vme_end += *address;
19358 			assert(!entry->map_aligned);
19359 			if (vmk_flags.vmf_resilient_media &&
19360 			    !entry->is_sub_map &&
19361 			    (VME_OBJECT(entry) == VM_OBJECT_NULL ||
19362 			    VME_OBJECT(entry)->internal)) {
19363 				entry->vme_resilient_media = TRUE;
19364 			}
19365 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
19366 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
19367 			assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
19368 			vm_map_store_entry_link(target_map, insp_entry, entry,
19369 			    vmk_flags);
19370 			insp_entry = entry;
19371 		} else {
19372 			if (!entry->is_sub_map) {
19373 				vm_object_deallocate(VME_OBJECT(entry));
19374 			} else {
19375 				vm_map_deallocate(VME_SUBMAP(entry));
19376 			}
19377 			vm_map_copy_entry_dispose(entry);
19378 		}
19379 	}
19380 
19381 	if (vmk_flags.vmf_resilient_codesign) {
19382 		*cur_protection = VM_PROT_READ;
19383 		*max_protection = VM_PROT_READ;
19384 	}
19385 
19386 	if (result == KERN_SUCCESS) {
19387 		target_map->size += target_size;
19388 		SAVE_HINT_MAP_WRITE(target_map, insp_entry);
19389 	}
19390 	vm_map_unlock(target_map);
19391 
19392 	vm_map_zap_dispose(&zap_list);
19393 
19394 	if (result == KERN_SUCCESS && target_map->wiring_required) {
19395 		result = vm_map_wire_kernel(target_map, *address,
19396 		    *address + size, *cur_protection, VM_KERN_MEMORY_MLOCK,
19397 		    TRUE);
19398 	}
19399 
19400 	/*
19401 	 * If requested, return the address of the data pointed to by the
19402 	 * request, rather than the base of the resulting page.
19403 	 */
19404 	if (vmk_flags.vmf_return_data_addr) {
19405 		*address += offset_in_mapping;
19406 	}
19407 
19408 	if (src_page_mask != target_page_mask) {
19409 		DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx  result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)size, copy, target_map, (uint64_t)*address, (uint64_t)offset_in_mapping, result);
19410 	}
19411 	vm_map_copy_discard(copy_map);
19412 	copy_map = VM_MAP_COPY_NULL;
19413 
19414 	return result;
19415 }
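
/*
 * Hypothetical caller sketch (illustration only; flags and protections
 * assumed): share a read-only view of another task's buffer into the
 * current map, letting the kernel pick the address.
 *
 *	vm_map_address_t addr = 0;
 *	vm_prot_t cur = VM_PROT_READ, max = VM_PROT_READ;
 *	kern_return_t kr;
 *
 *	kr = vm_map_remap(current_map(), &addr, len, 0,
 *	    VM_MAP_KERNEL_FLAGS_ANYWHERE(), src_map, src_addr,
 *	    FALSE,		// share with the source, don't copy
 *	    &cur, &max,		// IN/OUT, as described above
 *	    VM_INHERIT_NONE);
 */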
19416 
19417 /*
19418  *	Routine:	vm_map_remap_range_allocate
19419  *
19420  *	Description:
19421  *		Allocate a range in the specified virtual address map.
19422  *		returns the address and the map entry just before the allocated
19423  *		Returns the address and the map entry just before the allocated
19424  *		range.
19425  *	Map must be locked.
19426  */
19427 
19428 static kern_return_t
19429 vm_map_remap_range_allocate(
19430 	vm_map_t                map,
19431 	vm_map_address_t        *address,       /* IN/OUT */
19432 	vm_map_size_t           size,
19433 	vm_map_offset_t         mask,
19434 	vm_map_kernel_flags_t   vmk_flags,
19435 	vm_map_entry_t          *map_entry,     /* OUT */
19436 	vm_map_zap_t            zap_list)
19437 {
19438 	vm_map_entry_t  entry;
19439 	vm_map_offset_t start;
19440 	kern_return_t   kr;
19441 
19442 	start = *address;
19443 
19444 	if (!vmk_flags.vmf_fixed) {
19445 		kr = vm_map_locate_space(map, size, mask, vmk_flags,
19446 		    &start, &entry);
19447 		if (kr != KERN_SUCCESS) {
19448 			return kr;
19449 		}
19450 		*address = start;
19451 	} else {
19452 		vm_map_offset_t effective_min_offset, effective_max_offset;
19453 		vm_map_entry_t  temp_entry;
19454 		vm_map_offset_t end;
19455 
19456 		effective_min_offset = map->min_offset;
19457 		effective_max_offset = map->max_offset;
19458 
19459 		/*
19460 		 *	Verify that:
19461 		 *		the address doesn't itself violate
19462 		 *		the mask requirement.
19463 		 */
19464 
19465 		if ((start & mask) != 0) {
19466 			return KERN_NO_SPACE;
19467 		}
19468 
19469 #if CONFIG_MAP_RANGES
19470 		if (map->uses_user_ranges) {
19471 			struct mach_vm_range r;
19472 
19473 			vm_map_user_range_resolve(map, start, 1, &r);
19474 			if (r.max_address == 0) {
19475 				return KERN_INVALID_ADDRESS;
19476 			}
19477 
19478 			effective_min_offset = r.min_address;
19479 			effective_max_offset = r.max_address;
19480 		}
19481 #endif /* CONFIG_MAP_RANGES */
19482 		if (map == kernel_map) {
19483 			mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
19484 			effective_min_offset = r->min_address;
19485 			effective_max_offset = r->max_address;
19486 		}
19487 
19488 		/*
19489 		 *	...	the address is within bounds
19490 		 */
19491 
19492 		end = start + size;
19493 
19494 		if ((start < effective_min_offset) ||
19495 		    (end > effective_max_offset) ||
19496 		    (start >= end)) {
19497 			return KERN_INVALID_ADDRESS;
19498 		}
19499 
19500 		/*
19501 		 * If we're asked to overwrite whatever was mapped in that
19502 		 * range, first deallocate that range.
19503 		 */
19504 		if (vmk_flags.vmf_overwrite) {
19505 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
19506 
19507 			/*
19508 			 * We use a "zap_list" to avoid having to unlock
19509 			 * the "map" in vm_map_delete(), which would compromise
19510 			 * the atomicity of the "deallocate" and then "remap"
19511 			 * combination.
19512 			 */
19513 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
19514 
19515 			if (vmk_flags.vmkf_overwrite_immutable) {
19516 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
19517 			}
19518 			if (vmk_flags.vmkf_remap_prot_copy) {
19519 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
19520 			}
19521 			kr = vm_map_delete(map, start, end, remove_flags,
19522 			    KMEM_GUARD_NONE, zap_list).kmr_return;
19523 			if (kr != KERN_SUCCESS) {
19524 				/* XXX FBDP restore zap_list? */
19525 				return kr;
19526 			}
19527 		}
19528 
19529 		/*
19530 		 *	...	the starting address isn't allocated
19531 		 */
19532 
19533 		if (vm_map_lookup_entry(map, start, &temp_entry)) {
19534 			return KERN_NO_SPACE;
19535 		}
19536 
19537 		entry = temp_entry;
19538 
19539 		/*
19540 		 *	...	the next region doesn't overlap the
19541 		 *		end point.
19542 		 */
19543 
19544 		if ((entry->vme_next != vm_map_to_entry(map)) &&
19545 		    (entry->vme_next->vme_start < end)) {
19546 			return KERN_NO_SPACE;
19547 		}
19548 	}
19549 	*map_entry = entry;
19550 	return KERN_SUCCESS;
19551 }
19552 
19553 /*
19554  *	vm_map_switch:
19555  *
19556  *	Set the address map for the current thread to the specified map
19557  */
19558 
19559 vm_map_t
19560 vm_map_switch(
19561 	vm_map_t        map)
19562 {
19563 	thread_t        thread = current_thread();
19564 	vm_map_t        oldmap = thread->map;
19565 
19566 
19567 	/*
19568 	 *	Deactivate the current map and activate the requested map
19569 	 */
19570 	mp_disable_preemption();
19571 	PMAP_SWITCH_USER(thread, map, cpu_number());
19572 	mp_enable_preemption();
19573 	return oldmap;
19574 }
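
/*
 * Canonical usage pattern, exactly as in vm_map_write_user() and
 * vm_map_read_user() below: hold a reference across the switch and
 * always switch back.
 *
 *	vm_map_reference(map);		// map must not go away
 *	oldmap = vm_map_switch(map);	// adopt the target address space
 *	...copyin()/copyout() against "map"...
 *	vm_map_switch(oldmap);		// restore the original map
 *	vm_map_deallocate(map);		// drop our reference
 */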
19575 
19576 
19577 /*
19578  *	Routine:	vm_map_write_user
19579  *
19580  *	Description:
19581  *		Copy out data from kernel space into space in the
19582  *		destination map. The space must already exist in the
19583  *		destination map.
19584  *		NOTE:  This routine should only be called by threads
19585  *		which can block on a page fault, i.e. kernel-mode user
19586  *		threads.
19587  *
19588  */
19589 kern_return_t
19590 vm_map_write_user(
19591 	vm_map_t                map,
19592 	void                    *src_p,
19593 	vm_map_address_t        dst_addr,
19594 	vm_size_t               size)
19595 {
19596 	kern_return_t   kr = KERN_SUCCESS;
19597 
19598 	if (__improbable(vm_map_range_overflows(map, dst_addr, size))) {
19599 		return KERN_INVALID_ADDRESS;
19600 	}
19601 
19602 	if (current_map() == map) {
19603 		if (copyout(src_p, dst_addr, size)) {
19604 			kr = KERN_INVALID_ADDRESS;
19605 		}
19606 	} else {
19607 		vm_map_t        oldmap;
19608 
19609 		/* take on the identity of the target map while doing */
19610 		/* the transfer */
19611 
19612 		vm_map_reference(map);
19613 		oldmap = vm_map_switch(map);
19614 		if (copyout(src_p, dst_addr, size)) {
19615 			kr = KERN_INVALID_ADDRESS;
19616 		}
19617 		vm_map_switch(oldmap);
19618 		vm_map_deallocate(map);
19619 	}
19620 	return kr;
19621 }
19622 
19623 /*
19624  *	Routine:	vm_map_read_user
19625  *
19626  *	Description:
19627  *		Copy in data from a user space source map into the
19628  *		kernel map. The space must already exist in the
19629  *		kernel map.
19630  *		NOTE:  This routine should only be called by threads
19631  *		which can block on a page fault, i.e. kernel-mode user
19632  *		threads.
19633  *
19634  */
19635 kern_return_t
19636 vm_map_read_user(
19637 	vm_map_t                map,
19638 	vm_map_address_t        src_addr,
19639 	void                    *dst_p,
19640 	vm_size_t               size)
19641 {
19642 	kern_return_t   kr = KERN_SUCCESS;
19643 
19644 	if (__improbable(vm_map_range_overflows(map, src_addr, size))) {
19645 		return KERN_INVALID_ADDRESS;
19646 	}
19647 
19648 	if (current_map() == map) {
19649 		if (copyin(src_addr, dst_p, size)) {
19650 			kr = KERN_INVALID_ADDRESS;
19651 		}
19652 	} else {
19653 		vm_map_t        oldmap;
19654 
19655 		/* take on the identity of the target map while doing */
19656 		/* the transfer */
19657 
19658 		vm_map_reference(map);
19659 		oldmap = vm_map_switch(map);
19660 		if (copyin(src_addr, dst_p, size)) {
19661 			kr = KERN_INVALID_ADDRESS;
19662 		}
19663 		vm_map_switch(oldmap);
19664 		vm_map_deallocate(map);
19665 	}
19666 	return kr;
19667 }
19668 
19669 
19670 /*
19671  *	vm_map_check_protection:
19672  *
19673  *	Assert that the target map allows the specified
19674  *	privilege on the entire address region given.
19675  *	The entire region must be allocated.
19676  */
19677 boolean_t
19678 vm_map_check_protection(vm_map_t map, vm_map_offset_t start,
19679     vm_map_offset_t end, vm_prot_t protection)
19680 {
19681 	vm_map_entry_t entry;
19682 	vm_map_entry_t tmp_entry;
19683 
19684 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
19685 		return FALSE;
19686 	}
19687 
19688 	vm_map_lock(map);
19689 
19690 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
19691 		vm_map_unlock(map);
19692 		return FALSE;
19693 	}
19694 
19695 	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19696 		vm_map_unlock(map);
19697 		return FALSE;
19698 	}
19699 
19700 	entry = tmp_entry;
19701 
19702 	while (start < end) {
19703 		if (entry == vm_map_to_entry(map)) {
19704 			vm_map_unlock(map);
19705 			return FALSE;
19706 		}
19707 
19708 		/*
19709 		 *	No holes allowed!
19710 		 */
19711 
19712 		if (start < entry->vme_start) {
19713 			vm_map_unlock(map);
19714 			return FALSE;
19715 		}
19716 
19717 		/*
19718 		 * Check protection associated with entry.
19719 		 */
19720 
19721 		if ((entry->protection & protection) != protection) {
19722 			vm_map_unlock(map);
19723 			return FALSE;
19724 		}
19725 
19726 		/* go to next entry */
19727 
19728 		start = entry->vme_end;
19729 		entry = entry->vme_next;
19730 	}
19731 	vm_map_unlock(map);
19732 	return TRUE;
19733 }
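
/*
 * Hypothetical usage (illustration only): reject a request up front if
 * any page of [start, end) is unmapped or lacks read permission.
 *
 *	if (!vm_map_check_protection(map, start, end, VM_PROT_READ)) {
 *		return KERN_PROTECTION_FAILURE;
 *	}
 */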
19734 
19735 kern_return_t
19736 vm_map_purgable_control(
19737 	vm_map_t                map,
19738 	vm_map_offset_t         address,
19739 	vm_purgable_t           control,
19740 	int                     *state)
19741 {
19742 	vm_map_entry_t          entry;
19743 	vm_object_t             object;
19744 	kern_return_t           kr;
19745 	boolean_t               was_nonvolatile;
19746 
19747 	/*
19748 	 * Vet all the input parameters and current type and state of the
19749 	 * underlying object.  Return with an error if anything is amiss.
19750 	 */
19751 	if (map == VM_MAP_NULL) {
19752 		return KERN_INVALID_ARGUMENT;
19753 	}
19754 
19755 	if (control != VM_PURGABLE_SET_STATE &&
19756 	    control != VM_PURGABLE_GET_STATE &&
19757 	    control != VM_PURGABLE_PURGE_ALL &&
19758 	    control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
19759 		return KERN_INVALID_ARGUMENT;
19760 	}
19761 
19762 	if (control == VM_PURGABLE_PURGE_ALL) {
19763 		vm_purgeable_object_purge_all();
19764 		return KERN_SUCCESS;
19765 	}
19766 
19767 	if ((control == VM_PURGABLE_SET_STATE ||
19768 	    control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
19769 	    (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
19770 	    ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
19771 		return KERN_INVALID_ARGUMENT;
19772 	}
19773 
19774 	vm_map_lock_read(map);
19775 
19776 	if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
19777 		/*
19778 		 * Must pass a valid non-submap address.
19779 		 */
19780 		vm_map_unlock_read(map);
19781 		return KERN_INVALID_ADDRESS;
19782 	}
19783 
19784 	if ((entry->protection & VM_PROT_WRITE) == 0 &&
19785 	    control != VM_PURGABLE_GET_STATE) {
19786 		/*
19787 		 * Can't apply purgable controls to something you can't write.
19788 		 */
19789 		vm_map_unlock_read(map);
19790 		return KERN_PROTECTION_FAILURE;
19791 	}
19792 
19793 	object = VME_OBJECT(entry);
19794 	if (object == VM_OBJECT_NULL ||
19795 	    object->purgable == VM_PURGABLE_DENY) {
19796 		/*
19797 		 * Object must already be present and be purgeable.
19798 		 */
19799 		vm_map_unlock_read(map);
19800 		return KERN_INVALID_ARGUMENT;
19801 	}
19802 
19803 	vm_object_lock(object);
19804 
19805 #if 00
19806 	if (VME_OFFSET(entry) != 0 ||
19807 	    entry->vme_end - entry->vme_start != object->vo_size) {
19808 		/*
19809 		 * Can only apply purgable controls to the whole (existing)
19810 		 * object at once.
19811 		 */
19812 		vm_map_unlock_read(map);
19813 		vm_object_unlock(object);
19814 		return KERN_INVALID_ARGUMENT;
19815 	}
19816 #endif
19817 
19818 	assert(!entry->is_sub_map);
19819 	assert(!entry->use_pmap); /* purgeable has its own accounting */
19820 
19821 	vm_map_unlock_read(map);
19822 
19823 	was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);
19824 
19825 	kr = vm_object_purgable_control(object, control, state);
19826 
19827 	if (was_nonvolatile &&
19828 	    object->purgable != VM_PURGABLE_NONVOLATILE &&
19829 	    map->pmap == kernel_pmap) {
19830 #if DEBUG
19831 		object->vo_purgeable_volatilizer = kernel_task;
19832 #endif /* DEBUG */
19833 	}
19834 
19835 	vm_object_unlock(object);
19836 
19837 	return kr;
19838 }
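/*
 * Usage sketch (editorial illustration, not part of the original source):
 * mark the purgeable object backing "address" volatile, so the system may
 * reclaim it under memory pressure, then read the state back.
 *
 *	int             state = VM_PURGABLE_VOLATILE;
 *	kern_return_t   kr;
 *
 *	kr = vm_map_purgable_control(map, address,
 *	    VM_PURGABLE_SET_STATE, &state);
 *
 *	state = 0;
 *	kr = vm_map_purgable_control(map, address,
 *	    VM_PURGABLE_GET_STATE, &state);
 */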
19839 
19840 void
19841 vm_map_footprint_query_page_info(
19842 	vm_map_t        map,
19843 	vm_map_entry_t  map_entry,
19844 	vm_map_offset_t curr_s_offset,
19845 	int             *disposition_p)
19846 {
19847 	int             pmap_disp;
19848 	vm_object_t     object = VM_OBJECT_NULL;
19849 	int             disposition;
19850 	int             effective_page_size;
19851 
19852 	vm_map_lock_assert_held(map);
19853 	assert(!map->has_corpse_footprint);
19854 	assert(curr_s_offset >= map_entry->vme_start);
19855 	assert(curr_s_offset < map_entry->vme_end);
19856 
19857 	if (map_entry->is_sub_map) {
19858 		if (!map_entry->use_pmap) {
19859 			/* nested pmap: no footprint */
19860 			*disposition_p = 0;
19861 			return;
19862 		}
19863 	} else {
19864 		object = VME_OBJECT(map_entry);
19865 		if (object == VM_OBJECT_NULL) {
19866 			/* nothing mapped here: no need to ask */
19867 			*disposition_p = 0;
19868 			return;
19869 		}
19870 	}
19871 
19872 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
19873 
19874 	pmap_disp = 0;
19875 
19876 	/*
19877 	 * Query the pmap.
19878 	 */
19879 	pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);
19880 
19881 	/*
19882 	 * Compute this page's disposition.
19883 	 */
19884 	disposition = 0;
19885 
19886 	/* deal with "alternate accounting" first */
19887 	if (!map_entry->is_sub_map &&
19888 	    object->vo_no_footprint) {
19889 		/* does not count in footprint */
19890 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19891 	} else if (!map_entry->is_sub_map &&
19892 	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
19893 	    (object->purgable == VM_PURGABLE_DENY &&
19894 	    object->vo_ledger_tag)) &&
19895 	    VM_OBJECT_OWNER(object) != NULL &&
19896 	    VM_OBJECT_OWNER(object)->map == map) {
19897 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19898 		if ((((curr_s_offset
19899 		    - map_entry->vme_start
19900 		    + VME_OFFSET(map_entry))
19901 		    / effective_page_size) <
19902 		    (object->resident_page_count +
19903 		    vm_compressor_pager_get_count(object->pager)))) {
19904 			/*
19905 			 * Non-volatile purgeable object owned
19906 			 * by this task: report the first
19907 			 * "#resident + #compressed" pages as
19908 			 * "resident" (to show that they
19909 			 * contribute to the footprint) but not
19910 			 * "dirty" (to avoid double-counting
19911 			 * with the fake "non-volatile" region
19912 			 * we'll report at the end of the
19913 			 * address space to account for all
19914 			 * (mapped or not) non-volatile memory
19915 			 * owned by this task.
19916 			 */
19917 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19918 		}
19919 	} else if (!map_entry->is_sub_map &&
19920 	    (object->purgable == VM_PURGABLE_VOLATILE ||
19921 	    object->purgable == VM_PURGABLE_EMPTY) &&
19922 	    VM_OBJECT_OWNER(object) != NULL &&
19923 	    VM_OBJECT_OWNER(object)->map == map) {
19924 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19925 		if ((((curr_s_offset
19926 		    - map_entry->vme_start
19927 		    + VME_OFFSET(map_entry))
19928 		    / effective_page_size) <
19929 		    object->wired_page_count)) {
19930 			/*
19931 			 * Volatile|empty purgeable object owned
19932 			 * by this task: report the first
19933 			 * "#wired" pages as "resident" (to
19934 			 * show that they contribute to the
19935 			 * footprint) but not "dirty" (to avoid
19936 			 * double-counting with the fake
19937 			 * "non-volatile" region we'll report
19938 			 * at the end of the address space to
19939 			 * account for all (mapped or not)
19940 			 * non-volatile memory owned by this
19941 			 * task.
19942 			 */
19943 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19944 		}
19945 	} else if (!map_entry->is_sub_map &&
19946 	    map_entry->iokit_acct &&
19947 	    object->internal &&
19948 	    object->purgable == VM_PURGABLE_DENY) {
19949 		/*
19950 		 * Non-purgeable IOKit memory: phys_footprint
19951 		 * includes the entire virtual mapping.
19952 		 */
19953 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19954 		disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19955 		disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19956 	} else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
19957 	    PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
19958 		/* alternate accounting */
19959 #if __arm64__ && (DEVELOPMENT || DEBUG)
19960 		if (map->pmap->footprint_was_suspended) {
19961 			/*
19962 			 * The assertion below can fail if dyld
19963 			 * suspended footprint accounting
19964 			 * while doing some adjustments to
19965 			 * this page;  the mapping would say
19966 			 * "use pmap accounting" but the page
19967 			 * would be marked "alternate
19968 			 * accounting".
19969 			 */
19970 		} else
19971 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
19972 		{
19973 			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19974 		}
19975 		disposition = 0;
19976 	} else {
19977 		if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
19978 			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19979 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19980 			disposition |= VM_PAGE_QUERY_PAGE_REF;
19981 			if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
19982 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19983 			} else {
19984 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
19985 			}
19986 			if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
19987 				disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
19988 			}
19989 		} else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
19990 			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19991 			disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
19992 		}
19993 	}
19994 
19995 	*disposition_p = disposition;
19996 }
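/*
 * Interpretation sketch (editorial illustration, not part of the original
 * source): the disposition returned above is a bit mask, so a caller --
 * which must hold the map lock, per the assertion at the top -- typically
 * tests individual VM_PAGE_QUERY_* bits:
 *
 *	int disp;
 *
 *	vm_map_footprint_query_page_info(map, entry, offset, &disp);
 *	if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
 *		// page counts as resident for footprint purposes
 *	}
 *	if (disp & VM_PAGE_QUERY_PAGE_PAGED_OUT) {
 *		// page counts as compressed
 *	}
 */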
19997 
19998 kern_return_t
19999 vm_map_page_query_internal(
20000 	vm_map_t        target_map,
20001 	vm_map_offset_t offset,
20002 	int             *disposition,
20003 	int             *ref_count)
20004 {
20005 	kern_return_t                   kr;
20006 	vm_page_info_basic_data_t       info;
20007 	mach_msg_type_number_t          count;
20008 
20009 	count = VM_PAGE_INFO_BASIC_COUNT;
20010 	kr = vm_map_page_info(target_map,
20011 	    offset,
20012 	    VM_PAGE_INFO_BASIC,
20013 	    (vm_page_info_t) &info,
20014 	    &count);
20015 	if (kr == KERN_SUCCESS) {
20016 		*disposition = info.disposition;
20017 		*ref_count = info.ref_count;
20018 	} else {
20019 		*disposition = 0;
20020 		*ref_count = 0;
20021 	}
20022 
20023 	return kr;
20024 }
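/*
 * Usage sketch (editorial illustration, not part of the original source):
 *
 *	int             disposition, ref_count;
 *	kern_return_t   kr;
 *
 *	kr = vm_map_page_query_internal(map, offset,
 *	    &disposition, &ref_count);
 *	if (kr == KERN_SUCCESS &&
 *	    (disposition & VM_PAGE_QUERY_PAGE_DIRTY)) {
 *		// the page at "offset" is resident and dirty
 *	}
 */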
20025 
20026 kern_return_t
20027 vm_map_page_info(
20028 	vm_map_t                map,
20029 	vm_map_offset_t         offset,
20030 	vm_page_info_flavor_t   flavor,
20031 	vm_page_info_t          info,
20032 	mach_msg_type_number_t  *count)
20033 {
20034 	return vm_map_page_range_info_internal(map,
20035 	           offset, /* start of range */
20036 	           (offset + 1), /* this will get rounded up to the page boundary in the call */
20037 	           (int)-1, /* effective_page_shift: unspecified */
20038 	           flavor,
20039 	           info,
20040 	           count);
20041 }
20042 
20043 kern_return_t
20044 vm_map_page_range_info_internal(
20045 	vm_map_t                map,
20046 	vm_map_offset_t         start_offset,
20047 	vm_map_offset_t         end_offset,
20048 	int                     effective_page_shift,
20049 	vm_page_info_flavor_t   flavor,
20050 	vm_page_info_t          info,
20051 	mach_msg_type_number_t  *count)
20052 {
20053 	vm_map_entry_t          map_entry = VM_MAP_ENTRY_NULL;
20054 	vm_object_t             object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
20055 	vm_page_t               m = VM_PAGE_NULL;
20056 	kern_return_t           retval = KERN_SUCCESS;
20057 	int                     disposition = 0;
20058 	int                     ref_count = 0;
20059 	int                     depth = 0, info_idx = 0;
20060 	vm_page_info_basic_t    basic_info = 0;
20061 	vm_map_offset_t         offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
20062 	vm_map_offset_t         start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
20063 	boolean_t               do_region_footprint;
20064 	ledger_amount_t         ledger_resident, ledger_compressed;
20065 	int                     effective_page_size;
20066 	vm_map_offset_t         effective_page_mask;
20067 
20068 	switch (flavor) {
20069 	case VM_PAGE_INFO_BASIC:
20070 		if (*count != VM_PAGE_INFO_BASIC_COUNT) {
20071 			/*
20072 			 * The "vm_page_info_basic_data" structure was not
20073 			 * properly padded, so allow the size to be off by
20074 			 * one to maintain backwards binary compatibility...
20075 			 */
20076 			if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
20077 				return KERN_INVALID_ARGUMENT;
20078 			}
20079 		}
20080 		break;
20081 	default:
20082 		return KERN_INVALID_ARGUMENT;
20083 	}
20084 
20085 	if (effective_page_shift == -1) {
20086 		effective_page_shift = vm_self_region_page_shift_safely(map);
20087 		if (effective_page_shift == -1) {
20088 			return KERN_INVALID_ARGUMENT;
20089 		}
20090 	}
20091 	effective_page_size = (1 << effective_page_shift);
20092 	effective_page_mask = effective_page_size - 1;
20093 
20094 	do_region_footprint = task_self_region_footprint();
20095 	disposition = 0;
20096 	ref_count = 0;
20097 	depth = 0;
20098 	info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
20099 	retval = KERN_SUCCESS;
20100 
20101 	if (__improbable(vm_map_range_overflows(map, start_offset, end_offset - start_offset))) {
20102 		return KERN_INVALID_ADDRESS;
20103 	}
20104 
20105 	offset_in_page = start_offset & effective_page_mask;
20106 	start = vm_map_trunc_page(start_offset, effective_page_mask);
20107 	end = vm_map_round_page(end_offset, effective_page_mask);
20108 
20109 	if (end < start) {
20110 		return KERN_INVALID_ARGUMENT;
20111 	}
20112 
20113 	assert((end - start) <= MAX_PAGE_RANGE_QUERY);
20114 
20115 	vm_map_lock_read(map);
20116 
20117 	task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
20118 
20119 	for (curr_s_offset = start; curr_s_offset < end;) {
20120 		/*
20121 		 * New lookup needs reset of these variables.
20122 		 */
20123 		curr_object = object = VM_OBJECT_NULL;
20124 		offset_in_object = 0;
20125 		ref_count = 0;
20126 		depth = 0;
20127 
20128 		if (do_region_footprint &&
20129 		    curr_s_offset >= vm_map_last_entry(map)->vme_end) {
20130 			/*
20131 			 * Request for "footprint" info about a page beyond
20132 			 * the end of address space: this must be for
20133 			 * the fake region vm_map_region_recurse_64()
20134 			 * reported to account for non-volatile purgeable
20135 			 * memory owned by this task.
20136 			 */
20137 			disposition = 0;
20138 
20139 			if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
20140 			    (unsigned) ledger_compressed) {
20141 				/*
20142 				 * We haven't reported all the "non-volatile
20143 				 * compressed" pages yet, so report this fake
20144 				 * page as "compressed".
20145 				 */
20146 				disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20147 			} else {
20148 				/*
20149 				 * We've reported all the non-volatile
20150 				 * compressed pages but not all the non-volatile
20151 				 * pages, so report this fake page as
20152 				 * "resident dirty".
20153 				 */
20154 				disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20155 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20156 				disposition |= VM_PAGE_QUERY_PAGE_REF;
20157 			}
20158 			switch (flavor) {
20159 			case VM_PAGE_INFO_BASIC:
20160 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20161 				basic_info->disposition = disposition;
20162 				basic_info->ref_count = 1;
20163 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20164 				basic_info->offset = 0;
20165 				basic_info->depth = 0;
20166 
20167 				info_idx++;
20168 				break;
20169 			}
20170 			curr_s_offset += effective_page_size;
20171 			continue;
20172 		}
20173 
20174 		/*
20175 		 * First, find the map entry covering "curr_s_offset", going down
20176 		 * submaps if necessary.
20177 		 */
20178 		if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
20179 			/* no entry -> no object -> no page */
20180 
20181 			if (curr_s_offset < vm_map_min(map)) {
20182 				/*
20183 				 * Illegal address that falls below map min.
20184 				 */
20185 				curr_e_offset = MIN(end, vm_map_min(map));
20186 			} else if (curr_s_offset >= vm_map_max(map)) {
20187 				/*
20188 				 * Illegal address that falls on/after map max.
20189 				 */
20190 				curr_e_offset = end;
20191 			} else if (map_entry == vm_map_to_entry(map)) {
20192 				/*
20193 				 * Hit a hole.
20194 				 */
20195 				if (map_entry->vme_next == vm_map_to_entry(map)) {
20196 					/*
20197 					 * Empty map.
20198 					 */
20199 					curr_e_offset = MIN(map->max_offset, end);
20200 				} else {
20201 					/*
20202 					 * Hole at start of the map.
20203 					 */
20204 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20205 				}
20206 			} else {
20207 				if (map_entry->vme_next == vm_map_to_entry(map)) {
20208 					/*
20209 					 * Hole at the end of the map.
20210 					 */
20211 					curr_e_offset = MIN(map->max_offset, end);
20212 				} else {
20213 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20214 				}
20215 			}
20216 
20217 			assert(curr_e_offset >= curr_s_offset);
20218 
20219 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20220 
20221 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20222 
20223 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20224 
20225 			curr_s_offset = curr_e_offset;
20226 
20227 			info_idx += num_pages;
20228 
20229 			continue;
20230 		}
20231 
20232 		/* compute offset from this map entry's start */
20233 		offset_in_object = curr_s_offset - map_entry->vme_start;
20234 
20235 		/* compute offset into this map entry's object (or submap) */
20236 		offset_in_object += VME_OFFSET(map_entry);
20237 
20238 		if (map_entry->is_sub_map) {
20239 			vm_map_t sub_map = VM_MAP_NULL;
20240 			vm_page_info_t submap_info = 0;
20241 			vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
20242 
20243 			range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
20244 
20245 			submap_s_offset = offset_in_object;
20246 			submap_e_offset = submap_s_offset + range_len;
20247 
20248 			sub_map = VME_SUBMAP(map_entry);
20249 
20250 			vm_map_reference(sub_map);
20251 			vm_map_unlock_read(map);
20252 
20253 			submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20254 
20255 			assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
20256 			    "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
20257 
20258 			retval = vm_map_page_range_info_internal(sub_map,
20259 			    submap_s_offset,
20260 			    submap_e_offset,
20261 			    effective_page_shift,
20262 			    VM_PAGE_INFO_BASIC,
20263 			    (vm_page_info_t) submap_info,
20264 			    count);
20265 
20266 			assert(retval == KERN_SUCCESS);
20267 
20268 			vm_map_lock_read(map);
20269 			vm_map_deallocate(sub_map);
20270 
20271 			/* Move the "info" index by the number of pages we inspected.*/
20272 			info_idx += range_len >> effective_page_shift;
20273 
20274 			/* Move our current offset by the size of the range we inspected.*/
20275 			curr_s_offset += range_len;
20276 
20277 			continue;
20278 		}
20279 
20280 		object = VME_OBJECT(map_entry);
20281 
20282 		if (object == VM_OBJECT_NULL) {
20283 			/*
20284 			 * We don't have an object here and, hence,
20285 			 * no pages to inspect. We'll fill up the
20286 			 * info structure appropriately.
20287 			 */
20288 
20289 			curr_e_offset = MIN(map_entry->vme_end, end);
20290 
20291 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20292 
20293 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20294 
20295 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20296 
20297 			curr_s_offset = curr_e_offset;
20298 
20299 			info_idx += num_pages;
20300 
20301 			continue;
20302 		}
20303 
20304 		if (do_region_footprint) {
20305 			disposition = 0;
20306 			if (map->has_corpse_footprint) {
20307 				/*
20308 				 * Query the page info data we saved
20309 				 * while forking the corpse.
20310 				 */
20311 				vm_map_corpse_footprint_query_page_info(
20312 					map,
20313 					curr_s_offset,
20314 					&disposition);
20315 			} else {
20316 				/*
20317 				 * Query the live pmap for footprint info
20318 				 * about this page.
20319 				 */
20320 				vm_map_footprint_query_page_info(
20321 					map,
20322 					map_entry,
20323 					curr_s_offset,
20324 					&disposition);
20325 			}
20326 			switch (flavor) {
20327 			case VM_PAGE_INFO_BASIC:
20328 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20329 				basic_info->disposition = disposition;
20330 				basic_info->ref_count = 1;
20331 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20332 				basic_info->offset = 0;
20333 				basic_info->depth = 0;
20334 
20335 				info_idx++;
20336 				break;
20337 			}
20338 			curr_s_offset += effective_page_size;
20339 			continue;
20340 		}
20341 
20342 		vm_object_reference(object);
20343 		/*
20344 		 * Shared mode -- so we can allow other readers
20345 		 * to grab the lock too.
20346 		 */
20347 		vm_object_lock_shared(object);
20348 
20349 		curr_e_offset = MIN(map_entry->vme_end, end);
20350 
20351 		vm_map_unlock_read(map);
20352 
20353 		map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
20354 
20355 		curr_object = object;
20356 
20357 		for (; curr_s_offset < curr_e_offset;) {
20358 			if (object == curr_object) {
20359 				ref_count = curr_object->ref_count - 1; /* account for our object reference above. */
20360 			} else {
20361 				ref_count = curr_object->ref_count;
20362 			}
20363 
20364 			curr_offset_in_object = offset_in_object;
20365 
20366 			for (;;) {
20367 				m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
20368 
20369 				if (m != VM_PAGE_NULL) {
20370 					disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20371 					break;
20372 				} else {
20373 					if (curr_object->internal &&
20374 					    curr_object->alive &&
20375 					    !curr_object->terminating &&
20376 					    curr_object->pager_ready) {
20377 						if (VM_COMPRESSOR_PAGER_STATE_GET(curr_object, vm_object_trunc_page(curr_offset_in_object))
20378 						    == VM_EXTERNAL_STATE_EXISTS) {
20379 							/* the pager has that page */
20380 							disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20381 							break;
20382 						}
20383 					}
20384 
20385 					/*
20386 					 * Go down the VM object shadow chain until we find the page
20387 					 * we're looking for.
20388 					 */
20389 
20390 					if (curr_object->shadow != VM_OBJECT_NULL) {
20391 						vm_object_t shadow = VM_OBJECT_NULL;
20392 
20393 						curr_offset_in_object += curr_object->vo_shadow_offset;
20394 						shadow = curr_object->shadow;
20395 
20396 						vm_object_lock_shared(shadow);
20397 						vm_object_unlock(curr_object);
20398 
20399 						curr_object = shadow;
20400 						depth++;
20401 						continue;
20402 					} else {
20403 						break;
20404 					}
20405 				}
20406 			}
20407 
20408 			/* The ref_count is not strictly accurate: it measures the number  */
20409 			/* of entities holding a ref on the object; they may not be mapping */
20410 			/* the object or may not be mapping the section holding the         */
20411 			/* target page, but it's still a ballpark number and, though an     */
20412 			/* over-count, it picks up the copy-on-write cases.                 */
20413 
20414 			/* We could also get a picture of page sharing from pmap_attributes */
20415 			/* but this would under-count, as only faulted-in mappings would    */
20416 			/* show up.							    */
20417 
20418 			if ((curr_object == object) && curr_object->shadow) {
20419 				disposition |= VM_PAGE_QUERY_PAGE_COPIED;
20420 			}
20421 
20422 			if (!curr_object->internal) {
20423 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20424 			}
20425 
20426 			if (m != VM_PAGE_NULL) {
20427 				if (m->vmp_fictitious) {
20428 					disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
20429 				} else {
20430 					if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
20431 						disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20432 					}
20433 
20434 					if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
20435 						disposition |= VM_PAGE_QUERY_PAGE_REF;
20436 					}
20437 
20438 					if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
20439 						disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
20440 					}
20441 
20442 					/*
20443 					 * XXX TODO4K:
20444 					 * when this routine deals with 4k
20445 					 * pages, check the appropriate CS bit
20446 					 * here.
20447 					 */
20448 					if (m->vmp_cs_validated) {
20449 						disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
20450 					}
20451 					if (m->vmp_cs_tainted) {
20452 						disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
20453 					}
20454 					if (m->vmp_cs_nx) {
20455 						disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
20456 					}
20457 					if (m->vmp_reusable || curr_object->all_reusable) {
20458 						disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20459 					}
20460 				}
20461 			}
20462 
20463 			switch (flavor) {
20464 			case VM_PAGE_INFO_BASIC:
20465 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20466 				basic_info->disposition = disposition;
20467 				basic_info->ref_count = ref_count;
20468 				basic_info->object_id = (vm_object_id_t) (uintptr_t)
20469 				    VM_KERNEL_ADDRHASH(curr_object);
20470 				basic_info->offset =
20471 				    (memory_object_offset_t) curr_offset_in_object + offset_in_page;
20472 				basic_info->depth = depth;
20473 
20474 				info_idx++;
20475 				break;
20476 			}
20477 
20478 			disposition = 0;
20479 			offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
20480 
20481 			/*
20482 			 * Move to next offset in the range and in our object.
20483 			 */
20484 			curr_s_offset += effective_page_size;
20485 			offset_in_object += effective_page_size;
20486 			curr_offset_in_object = offset_in_object;
20487 
20488 			if (curr_object != object) {
20489 				vm_object_unlock(curr_object);
20490 
20491 				curr_object = object;
20492 
20493 				vm_object_lock_shared(curr_object);
20494 			} else {
20495 				vm_object_lock_yield_shared(curr_object);
20496 			}
20497 		}
20498 
20499 		vm_object_unlock(curr_object);
20500 		vm_object_deallocate(curr_object);
20501 
20502 		vm_map_lock_read(map);
20503 	}
20504 
20505 	vm_map_unlock_read(map);
20506 	return retval;
20507 }
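/*
 * Usage sketch (editorial illustration, not part of the original source):
 * the caller supplies one vm_page_info_basic_data slot per page in the
 * (page-aligned) range, which must not exceed MAX_PAGE_RANGE_QUERY per the
 * assertion above; passing -1 for effective_page_shift selects the map's
 * own page shift.
 *
 *	struct vm_page_info_basic       info[4];
 *	mach_msg_type_number_t          count = VM_PAGE_INFO_BASIC_COUNT;
 *	kern_return_t                   kr;
 *
 *	kr = vm_map_page_range_info_internal(map, start,
 *	    start + 4 * PAGE_SIZE, -1, VM_PAGE_INFO_BASIC,
 *	    (vm_page_info_t)info, &count);
 */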
20508 
20509 /*
20510  *	vm_map_msync
20511  *
20512  *	Synchronises the memory range specified with its backing store
20513  *	image by either flushing or cleaning the contents to the appropriate
20514  *	memory manager engaging in a memory object synchronize dialog with
20515  *	the manager.  The client doesn't return until the manager issues
20516  *	m_o_s_completed message.  MIG Magically converts user task parameter
20517  *	to the task's address map.
20518  *
20519  *	interpretation of sync_flags
20520  *	VM_SYNC_INVALIDATE	- discard pages, only return precious
20521  *				  pages to manager.
20522  *
20523  *	VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
20524  *				- discard pages, write dirty or precious
20525  *				  pages back to memory manager.
20526  *
20527  *	VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
20528  *				- write dirty or precious pages back to
20529  *				  the memory manager.
20530  *
20531  *	VM_SYNC_CONTIGUOUS	- does everything normally, but if there
20532  *				  is a hole in the region, and we would
20533  *				  have returned KERN_SUCCESS, return
20534  *				  KERN_INVALID_ADDRESS instead.
20535  *
20536  *	NOTE
20537  *	The memory object attributes have not yet been implemented, this
20538  *	function will have to deal with the invalidate attribute
20539  *
20540  *	RETURNS
20541  *	KERN_INVALID_TASK		Bad task parameter
20542  *	KERN_INVALID_ARGUMENT		both sync and async were specified.
20543  *	KERN_SUCCESS			The usual.
20544  *	KERN_INVALID_ADDRESS		There was a hole in the region.
20545  */
20546 
20547 kern_return_t
20548 vm_map_msync(
20549 	vm_map_t                map,
20550 	vm_map_address_t        address,
20551 	vm_map_size_t           size,
20552 	vm_sync_t               sync_flags)
20553 {
20554 	vm_map_entry_t          entry;
20555 	vm_map_size_t           amount_left;
20556 	vm_object_offset_t      offset;
20557 	vm_object_offset_t      start_offset, end_offset;
20558 	boolean_t               do_sync_req;
20559 	boolean_t               had_hole = FALSE;
20560 	vm_map_offset_t         pmap_offset;
20561 
20562 	if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
20563 	    (sync_flags & VM_SYNC_SYNCHRONOUS)) {
20564 		return KERN_INVALID_ARGUMENT;
20565 	}
20566 
20567 	if (__improbable(vm_map_range_overflows(map, address, size))) {
20568 		return KERN_INVALID_ADDRESS;
20569 	}
20570 
20571 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20572 		DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
20573 	}
20574 
20575 	/*
20576 	 * align address and size on page boundaries
20577 	 */
20578 	size = (vm_map_round_page(address + size,
20579 	    VM_MAP_PAGE_MASK(map)) -
20580 	    vm_map_trunc_page(address,
20581 	    VM_MAP_PAGE_MASK(map)));
20582 	address = vm_map_trunc_page(address,
20583 	    VM_MAP_PAGE_MASK(map));
20584 
20585 	if (map == VM_MAP_NULL) {
20586 		return KERN_INVALID_TASK;
20587 	}
20588 
20589 	if (size == 0) {
20590 		return KERN_SUCCESS;
20591 	}
20592 
20593 	amount_left = size;
20594 
20595 	while (amount_left > 0) {
20596 		vm_object_size_t        flush_size;
20597 		vm_object_t             object;
20598 
20599 		vm_map_lock(map);
20600 		if (!vm_map_lookup_entry(map,
20601 		    address,
20602 		    &entry)) {
20603 			vm_map_size_t   skip;
20604 
20605 			/*
20606 			 * hole in the address map.
20607 			 */
20608 			had_hole = TRUE;
20609 
20610 			if (sync_flags & VM_SYNC_KILLPAGES) {
20611 				/*
20612 				 * For VM_SYNC_KILLPAGES, there should be
20613 				 * no holes in the range, since we couldn't
20614 				 * prevent someone else from allocating in
20615 				 * that hole and we wouldn't want to "kill"
20616 				 * their pages.
20617 				 */
20618 				vm_map_unlock(map);
20619 				break;
20620 			}
20621 
20622 			/*
20623 			 * Check for empty map.
20624 			 */
20625 			if (entry == vm_map_to_entry(map) &&
20626 			    entry->vme_next == entry) {
20627 				vm_map_unlock(map);
20628 				break;
20629 			}
20630 			/*
20631 			 * Check that we don't wrap and that
20632 			 * we have at least one real map entry.
20633 			 */
20634 			if ((map->hdr.nentries == 0) ||
20635 			    (entry->vme_next->vme_start < address)) {
20636 				vm_map_unlock(map);
20637 				break;
20638 			}
20639 			/*
20640 			 * Move up to the next entry if needed
20641 			 */
20642 			skip = (entry->vme_next->vme_start - address);
20643 			if (skip >= amount_left) {
20644 				amount_left = 0;
20645 			} else {
20646 				amount_left -= skip;
20647 			}
20648 			address = entry->vme_next->vme_start;
20649 			vm_map_unlock(map);
20650 			continue;
20651 		}
20652 
20653 		offset = address - entry->vme_start;
20654 		pmap_offset = address;
20655 
20656 		/*
20657 		 * do we have more to flush than is contained in this
20658 		 * entry ?
20659 		 */
20660 		if (amount_left + entry->vme_start + offset > entry->vme_end) {
20661 			flush_size = entry->vme_end -
20662 			    (entry->vme_start + offset);
20663 		} else {
20664 			flush_size = amount_left;
20665 		}
20666 		amount_left -= flush_size;
20667 		address += flush_size;
20668 
20669 		if (entry->is_sub_map == TRUE) {
20670 			vm_map_t        local_map;
20671 			vm_map_offset_t local_offset;
20672 
20673 			local_map = VME_SUBMAP(entry);
20674 			local_offset = VME_OFFSET(entry);
20675 			vm_map_reference(local_map);
20676 			vm_map_unlock(map);
20677 			if (vm_map_msync(
20678 				    local_map,
20679 				    local_offset,
20680 				    flush_size,
20681 				    sync_flags) == KERN_INVALID_ADDRESS) {
20682 				had_hole = TRUE;
20683 			}
20684 			vm_map_deallocate(local_map);
20685 			continue;
20686 		}
20687 		object = VME_OBJECT(entry);
20688 
20689 		/*
20690 		 * We can't sync this object if the object has not been
20691 		 * created yet
20692 		 */
20693 		if (object == VM_OBJECT_NULL) {
20694 			vm_map_unlock(map);
20695 			continue;
20696 		}
20697 		offset += VME_OFFSET(entry);
20698 
20699 		vm_object_lock(object);
20700 
20701 		if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
20702 			int kill_pages = 0;
20703 
20704 			if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20705 				/*
20706 				 * This is a destructive operation and so we
20707 				 * err on the side of limiting the range of
20708 				 * the operation.
20709 				 */
20710 				start_offset = vm_object_round_page(offset);
20711 				end_offset = vm_object_trunc_page(offset + flush_size);
20712 
20713 				if (end_offset <= start_offset) {
20714 					vm_object_unlock(object);
20715 					vm_map_unlock(map);
20716 					continue;
20717 				}
20718 
20719 				pmap_offset += start_offset - offset;
20720 			} else {
20721 				start_offset = offset;
20722 				end_offset = offset + flush_size;
20723 			}
20724 
20725 			if (sync_flags & VM_SYNC_KILLPAGES) {
20726 				if (((object->ref_count == 1) ||
20727 				    ((object->copy_strategy !=
20728 				    MEMORY_OBJECT_COPY_SYMMETRIC) &&
20729 				    (object->vo_copy == VM_OBJECT_NULL))) &&
20730 				    (object->shadow == VM_OBJECT_NULL)) {
20731 					if (object->ref_count != 1) {
20732 						vm_page_stats_reusable.free_shared++;
20733 					}
20734 					kill_pages = 1;
20735 				} else {
20736 					kill_pages = -1;
20737 				}
20738 			}
20739 			if (kill_pages != -1) {
20740 				vm_object_deactivate_pages(
20741 					object,
20742 					start_offset,
20743 					(vm_object_size_t) (end_offset - start_offset),
20744 					kill_pages,
20745 					FALSE, /* reusable_pages */
20746 					FALSE, /* reusable_no_write */
20747 					map->pmap,
20748 					pmap_offset);
20749 			}
20750 			vm_object_unlock(object);
20751 			vm_map_unlock(map);
20752 			continue;
20753 		}
20754 		/*
20755 		 * We can't sync this object if there isn't a pager.
20756 		 * Don't bother to sync internal objects, since there can't
20757 		 * be any "permanent" storage for these objects anyway.
20758 		 */
20759 		if ((object->pager == MEMORY_OBJECT_NULL) ||
20760 		    (object->internal) || (object->private)) {
20761 			vm_object_unlock(object);
20762 			vm_map_unlock(map);
20763 			continue;
20764 		}
20765 		/*
20766 		 * keep reference on the object until syncing is done
20767 		 */
20768 		vm_object_reference_locked(object);
20769 		vm_object_unlock(object);
20770 
20771 		vm_map_unlock(map);
20772 
20773 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20774 			start_offset = vm_object_trunc_page(offset);
20775 			end_offset = vm_object_round_page(offset + flush_size);
20776 		} else {
20777 			start_offset = offset;
20778 			end_offset = offset + flush_size;
20779 		}
20780 
20781 		do_sync_req = vm_object_sync(object,
20782 		    start_offset,
20783 		    (end_offset - start_offset),
20784 		    sync_flags & VM_SYNC_INVALIDATE,
20785 		    ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
20786 		    (sync_flags & VM_SYNC_ASYNCHRONOUS)),
20787 		    sync_flags & VM_SYNC_SYNCHRONOUS);
20788 
20789 		if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
20790 			/*
20791 			 * clear out the clustering and read-ahead hints
20792 			 */
20793 			vm_object_lock(object);
20794 
20795 			object->pages_created = 0;
20796 			object->pages_used = 0;
20797 			object->sequential = 0;
20798 			object->last_alloc = 0;
20799 
20800 			vm_object_unlock(object);
20801 		}
20802 		vm_object_deallocate(object);
20803 	} /* while */
20804 
20805 	/* for proper msync() behaviour */
20806 	if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
20807 		return KERN_INVALID_ADDRESS;
20808 	}
20809 
20810 	return KERN_SUCCESS;
20811 }/* vm_map_msync */
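/*
 * Usage sketch (editorial illustration, not part of the original source):
 * synchronously flush dirty pages in a range back to their pager, and
 * treat holes as an error -- msync(MS_SYNC)-like behaviour:
 *
 *	kern_return_t kr;
 *
 *	kr = vm_map_msync(map, addr, len,
 *	    VM_SYNC_SYNCHRONOUS | VM_SYNC_CONTIGUOUS);
 *	if (kr == KERN_INVALID_ADDRESS) {
 *		// there was a hole in the region
 *	}
 */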
20812 
20813 void
20814 vm_named_entry_associate_vm_object(
20815 	vm_named_entry_t        named_entry,
20816 	vm_object_t             object,
20817 	vm_object_offset_t      offset,
20818 	vm_object_size_t        size,
20819 	vm_prot_t               prot)
20820 {
20821 	vm_map_copy_t copy;
20822 	vm_map_entry_t copy_entry;
20823 
20824 	assert(!named_entry->is_sub_map);
20825 	assert(!named_entry->is_copy);
20826 	assert(!named_entry->is_object);
20827 	assert(!named_entry->internal);
20828 	assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
20829 
20830 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
20831 	copy->offset = offset;
20832 	copy->size = size;
20833 	copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
20834 
20835 	copy_entry = vm_map_copy_entry_create(copy);
20836 	copy_entry->protection = prot;
20837 	copy_entry->max_protection = prot;
20838 	copy_entry->use_pmap = TRUE;
20839 	copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
20840 	copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
20841 	VME_OBJECT_SET(copy_entry, object, false, 0);
20842 	VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
20843 	vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
20844 
20845 	named_entry->backing.copy = copy;
20846 	named_entry->is_object = TRUE;
20847 	if (object->internal) {
20848 		named_entry->internal = TRUE;
20849 	}
20850 
20851 	DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
20852 	    named_entry, copy, object, offset, size, prot);
20853 }
20854 
20855 vm_object_t
20856 vm_named_entry_to_vm_object(
20857 	vm_named_entry_t named_entry)
20858 {
20859 	vm_map_copy_t   copy;
20860 	vm_map_entry_t  copy_entry;
20861 	vm_object_t     object;
20862 
20863 	assert(!named_entry->is_sub_map);
20864 	assert(!named_entry->is_copy);
20865 	assert(named_entry->is_object);
20866 	copy = named_entry->backing.copy;
20867 	assert(copy != VM_MAP_COPY_NULL);
20868 	/*
20869 	 * Assert that the vm_map_copy is coming from the right
20870 	 * zone and hasn't been forged
20871 	 */
20872 	vm_map_copy_require(copy);
20873 	assert(copy->cpy_hdr.nentries == 1);
20874 	copy_entry = vm_map_copy_first_entry(copy);
20875 	object = VME_OBJECT(copy_entry);
20876 
20877 	DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
20878 
20879 	return object;
20880 }
20881 
20882 /*
20883  *	Routine:	convert_port_entry_to_map
20884  *	Purpose:
20885  *		Convert from a port specifying an entry or a task
20886  *		to a map. Doesn't consume the port ref; produces a map ref,
20887  *		which may be null.  Unlike convert_port_to_map, the
20888  *		port may be either task backed or named-entry backed.
20889  *	Conditions:
20890  *		Nothing locked.
20891  */
20892 
20893 vm_map_t
20894 convert_port_entry_to_map(
20895 	ipc_port_t      port)
20896 {
20897 	vm_map_t map = VM_MAP_NULL;
20898 	vm_named_entry_t named_entry;
20899 
20900 	if (!IP_VALID(port)) {
20901 		return VM_MAP_NULL;
20902 	}
20903 
20904 	if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
20905 		return convert_port_to_map(port);
20906 	}
20907 
20908 	named_entry = mach_memory_entry_from_port(port);
20909 
20910 	if ((named_entry->is_sub_map) &&
20911 	    (named_entry->protection & VM_PROT_WRITE)) {
20912 		map = named_entry->backing.map;
20913 		if (map->pmap != PMAP_NULL) {
20914 			if (map->pmap == kernel_pmap) {
20915 				panic("userspace has access "
20916 				    "to a kernel map %p", map);
20917 			}
20918 			pmap_require(map->pmap);
20919 		}
20920 		vm_map_reference(map);
20921 	}
20922 
20923 	return map;
20924 }
20925 
20926 /*
20927  * Export routines to other components for the things we access locally through
20928  * macros.
20929  */
20930 #undef current_map
20931 vm_map_t
20932 current_map(void)
20933 {
20934 	return current_map_fast();
20935 }
20936 
20937 /*
20938  *	vm_map_reference:
20939  *
20940  *	Takes a reference on the specified map.
20941  */
20942 void
20943 vm_map_reference(
20944 	vm_map_t        map)
20945 {
20946 	if (__probable(map != VM_MAP_NULL)) {
20947 		vm_map_require(map);
20948 		os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
20949 	}
20950 }
20951 
20952 /*
20953  *	vm_map_deallocate:
20954  *
20955  *	Removes a reference from the specified map,
20956  *	destroying it if no references remain.
20957  *	The map should not be locked.
20958  */
20959 void
20960 vm_map_deallocate(
20961 	vm_map_t        map)
20962 {
20963 	if (__probable(map != VM_MAP_NULL)) {
20964 		vm_map_require(map);
20965 		if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
20966 			vm_map_destroy(map);
20967 		}
20968 	}
20969 }
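/*
 * Usage sketch (editorial illustration, not part of the original source):
 * vm_map_reference()/vm_map_deallocate() bracket any use of a map that
 * might otherwise be destroyed underneath us, as in the copyin/copyout
 * helpers earlier in this file:
 *
 *	vm_map_reference(map);          // +1 ref: map stays alive
 *	// ... operate on the map ...
 *	vm_map_deallocate(map);         // -1 ref: may destroy the map
 */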
20970 
20971 void
20972 vm_map_inspect_deallocate(
20973 	vm_map_inspect_t      map)
20974 {
20975 	vm_map_deallocate((vm_map_t)map);
20976 }
20977 
20978 void
20979 vm_map_read_deallocate(
20980 	vm_map_read_t      map)
20981 {
20982 	vm_map_deallocate((vm_map_t)map);
20983 }
20984 
20985 
20986 void
20987 vm_map_disable_NX(vm_map_t map)
20988 {
20989 	if (map == NULL) {
20990 		return;
20991 	}
20992 	if (map->pmap == NULL) {
20993 		return;
20994 	}
20995 
20996 	pmap_disable_NX(map->pmap);
20997 }
20998 
20999 void
21000 vm_map_disallow_data_exec(vm_map_t map)
21001 {
21002 	if (map == NULL) {
21003 		return;
21004 	}
21005 
21006 	map->map_disallow_data_exec = TRUE;
21007 }
21008 
21009 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
21010  * more descriptive.
21011  */
21012 void
21013 vm_map_set_32bit(vm_map_t map)
21014 {
21015 #if defined(__arm64__)
21016 	map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
21017 #else
21018 	map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
21019 #endif
21020 }
21021 
21022 
21023 void
21024 vm_map_set_64bit(vm_map_t map)
21025 {
21026 #if defined(__arm64__)
21027 	map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
21028 #else
21029 	map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
21030 #endif
21031 }
21032 
21033 /*
21034  * Expand the maximum size of an existing map to the maximum supported.
21035  */
21036 void
21037 vm_map_set_jumbo(vm_map_t map)
21038 {
21039 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
21040 	vm_map_set_max_addr(map, ~0);
21041 #else /* arm64 */
21042 	(void) map;
21043 #endif
21044 }
21045 
21046 /*
21047  * This map has a JIT entitlement
21048  */
21049 void
21050 vm_map_set_jit_entitled(vm_map_t map)
21051 {
21052 #if defined (__arm64__)
21053 	pmap_set_jit_entitled(map->pmap);
21054 #else /* arm64 */
21055 	(void) map;
21056 #endif
21057 }
21058 
21059 /*
21060  * Get status of this map's TPRO flag
21061  */
21062 boolean_t
21063 vm_map_tpro(vm_map_t map)
21064 {
21065 #if defined (__arm64e__)
21066 	return pmap_get_tpro(map->pmap);
21067 #else /* arm64e */
21068 	(void) map;
21069 	return FALSE;
21070 #endif
21071 }
21072 
21073 /*
21074  * This map has TPRO enabled
21075  */
21076 void
21077 vm_map_set_tpro(vm_map_t map)
21078 {
21079 #if defined (__arm64e__)
21080 	pmap_set_tpro(map->pmap);
21081 #else /* arm64e */
21082 	(void) map;
21083 #endif
21084 }
21085 
21086 /*
21087  * Does this map have TPRO enforcement enabled?
21088  */
21089 boolean_t
21090 vm_map_tpro_enforcement(vm_map_t map)
21091 {
21092 	return map->tpro_enforcement;
21093 }
21094 
21095 /*
21096  * Set TPRO enforcement for this map
21097  */
21098 void
21099 vm_map_set_tpro_enforcement(vm_map_t map)
21100 {
21101 	if (vm_map_tpro(map)) {
21102 		vm_map_lock(map);
21103 		map->tpro_enforcement = TRUE;
21104 		vm_map_unlock(map);
21105 	}
21106 }
21107 
21108 /*
21109  * Enable TPRO on the requested region
21110  *
21111  * Note:
21112  *     This routine is primarily intended to be called during/soon after map
21113  *     creation before the associated task has been released to run. It is only
21114  *     currently safe when we have no resident pages.
21115  */
21116 boolean_t
21117 vm_map_set_tpro_range(
21118 	__unused vm_map_t map,
21119 	__unused vm_map_address_t start,
21120 	__unused vm_map_address_t end)
21121 {
21122 	return TRUE;
21123 }
21124 
21125 /*
21126  * Expand the maximum size of an existing map.
21127  */
21128 void
21129 vm_map_set_max_addr(vm_map_t map, vm_map_offset_t new_max_offset)
21130 {
21131 #if defined(__arm64__)
21132 	vm_map_offset_t max_supported_offset;
21133 	vm_map_offset_t old_max_offset;
21134 
21135 	vm_map_lock(map);
21136 
21137 	old_max_offset = map->max_offset;
21138 	max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), ARM_PMAP_MAX_OFFSET_JUMBO);
21139 
21140 	new_max_offset = trunc_page(new_max_offset);
21141 
21142 	/* The address space cannot be shrunk using this routine. */
21143 	if (old_max_offset >= new_max_offset) {
21144 		vm_map_unlock(map);
21145 		return;
21146 	}
21147 
21148 	if (max_supported_offset < new_max_offset) {
21149 		new_max_offset = max_supported_offset;
21150 	}
21151 
21152 	map->max_offset = new_max_offset;
21153 
21154 	if (map->holelistenabled) {
21155 		if (map->holes_list->prev->vme_end == old_max_offset) {
21156 			/*
21157 			 * There is already a hole at the end of the map; simply make it bigger.
21158 			 */
21159 			map->holes_list->prev->vme_end = map->max_offset;
21160 		} else {
21161 			/*
21162 			 * There is no hole at the end, so we need to create a new hole
21163 			 * for the new empty space we're creating.
21164 			 */
21165 			struct vm_map_links *new_hole;
21166 
21167 			new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
21168 			new_hole->start = old_max_offset;
21169 			new_hole->end = map->max_offset;
21170 			new_hole->prev = map->holes_list->prev;
21171 			new_hole->next = (struct vm_map_entry *)map->holes_list;
21172 			map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
21173 			map->holes_list->prev = (struct vm_map_entry *)new_hole;
21174 		}
21175 	}
21176 
21177 	vm_map_unlock(map);
21178 #else
21179 	(void)map;
21180 	(void)new_max_offset;
21181 #endif
21182 }
21183 
21184 vm_map_offset_t
21185 vm_compute_max_offset(boolean_t is64)
21186 {
21187 #if defined(__arm64__)
21188 	return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
21189 #else
21190 	return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
21191 #endif
21192 }
21193 
21194 void
21195 vm_map_get_max_aslr_slide_section(
21196 	vm_map_t                map __unused,
21197 	int64_t                 *max_sections,
21198 	int64_t                 *section_size)
21199 {
21200 #if defined(__arm64__)
21201 	*max_sections = 3;
21202 	*section_size = ARM_TT_TWIG_SIZE;
21203 #else
21204 	*max_sections = 1;
21205 	*section_size = 0;
21206 #endif
21207 }
21208 
21209 uint64_t
21210 vm_map_get_max_aslr_slide_pages(vm_map_t map)
21211 {
21212 #if defined(__arm64__)
21213 	/* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
21214 	 * limited embedded address space; this is also meant to minimize pmap
21215 	 * memory usage on 16KB page systems.
21216 	 */
21217 	return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
21218 #else
21219 	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21220 #endif
21221 }
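/*
 * Worked example (editorial illustration, not part of the original
 * source): on an arm64 system with 16KB pages, VM_MAP_PAGE_SHIFT(map)
 * is 14, so the routine returns 1 << (24 - 14) = 1024 pages, i.e.
 * 1024 * 16KB = 16MB of maximum slide, matching the comment above.
 */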
21222 
21223 uint64_t
21224 vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
21225 {
21226 #if defined(__arm64__)
21227 	/* We limit the loader slide to 4MB, in order to ensure at least 8 bits
21228 	 * of independent entropy on 16KB page systems.
21229 	 */
21230 	return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
21231 #else
21232 	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21233 #endif
21234 }
21235 
21236 boolean_t
21237 vm_map_is_64bit(
21238 	vm_map_t map)
21239 {
21240 	return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
21241 }
21242 
21243 boolean_t
21244 vm_map_has_hard_pagezero(
21245 	vm_map_t        map,
21246 	vm_map_offset_t pagezero_size)
21247 {
21248 	/*
21249 	 * XXX FBDP
21250 	 * We should lock the VM map (for read) here but we can get away
21251 	 * with it for now because there can't really be any race condition:
21252 	 * the VM map's min_offset is changed only when the VM map is created
21253 	 * and when the zero page is established (when the binary gets loaded),
21254 	 * and this routine gets called only when the task terminates and the
21255 	 * VM map is being torn down, and when a new map is created via
21256 	 * load_machfile()/execve().
21257 	 */
21258 	return map->min_offset >= pagezero_size;
21259 }
21260 
21261 /*
21262  * Raise a VM map's maximum offset.
21263  */
21264 kern_return_t
21265 vm_map_raise_max_offset(
21266 	vm_map_t        map,
21267 	vm_map_offset_t new_max_offset)
21268 {
21269 	kern_return_t   ret;
21270 
21271 	vm_map_lock(map);
21272 	ret = KERN_INVALID_ADDRESS;
21273 
21274 	if (new_max_offset >= map->max_offset) {
21275 		if (!vm_map_is_64bit(map)) {
21276 			if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
21277 				map->max_offset = new_max_offset;
21278 				ret = KERN_SUCCESS;
21279 			}
21280 		} else {
21281 			if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
21282 				map->max_offset = new_max_offset;
21283 				ret = KERN_SUCCESS;
21284 			}
21285 		}
21286 	}
21287 
21288 	vm_map_unlock(map);
21289 	return ret;
21290 }
21291 
21292 
21293 /*
21294  * Raise a VM map's minimum offset.
21295  * To strictly enforce "page zero" reservation.
21296  */
21297 kern_return_t
21298 vm_map_raise_min_offset(
21299 	vm_map_t        map,
21300 	vm_map_offset_t new_min_offset)
21301 {
21302 	vm_map_entry_t  first_entry;
21303 
21304 	new_min_offset = vm_map_round_page(new_min_offset,
21305 	    VM_MAP_PAGE_MASK(map));
21306 
21307 	vm_map_lock(map);
21308 
21309 	if (new_min_offset < map->min_offset) {
21310 		/*
21311 		 * Can't move min_offset backwards, as that would expose
21312 		 * a part of the address space that was previously, and for
21313 		 * possibly good reasons, inaccessible.
21314 		 */
21315 		vm_map_unlock(map);
21316 		return KERN_INVALID_ADDRESS;
21317 	}
21318 	if (new_min_offset >= map->max_offset) {
21319 		/* can't go beyond the end of the address space */
21320 		vm_map_unlock(map);
21321 		return KERN_INVALID_ADDRESS;
21322 	}
21323 
21324 	first_entry = vm_map_first_entry(map);
21325 	if (first_entry != vm_map_to_entry(map) &&
21326 	    first_entry->vme_start < new_min_offset) {
21327 		/*
21328 		 * Some memory was already allocated below the new
21329 		 * minimun offset.  It's too late to change it now...
21330 		 */
21331 		vm_map_unlock(map);
21332 		return KERN_NO_SPACE;
21333 	}
21334 
21335 	map->min_offset = new_min_offset;
21336 
21337 	if (map->holelistenabled) {
21338 		assert(map->holes_list);
21339 		map->holes_list->start = new_min_offset;
21340 		assert(new_min_offset < map->holes_list->end);
21341 	}
21342 
21343 	vm_map_unlock(map);
21344 
21345 	return KERN_SUCCESS;
21346 }
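/*
 * Usage sketch (editorial illustration, not part of the original source):
 * enforce a 4GB "page zero" for a 64-bit task, assuming nothing has been
 * mapped below that address yet:
 *
 *	kern_return_t kr;
 *
 *	kr = vm_map_raise_min_offset(map, (vm_map_offset_t)0x100000000ULL);
 *	if (kr == KERN_NO_SPACE) {
 *		// something was already mapped below the new minimum
 *	}
 */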
21347 
21348 /*
21349  * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
21350  * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
21351  * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
21352  * have to reach over to the BSD data structures.
21353  */
21354 
21355 uint64_t vm_map_set_size_limit_count = 0;
21356 kern_return_t
21357 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
21358 {
21359 	kern_return_t kr;
21360 
21361 	vm_map_lock(map);
21362 	if (new_size_limit < map->size) {
21363 		/* new limit should not be lower than its current size */
21364 		DTRACE_VM2(vm_map_set_size_limit_fail,
21365 		    vm_map_size_t, map->size,
21366 		    uint64_t, new_size_limit);
21367 		kr = KERN_FAILURE;
21368 	} else if (new_size_limit == map->size_limit) {
21369 		/* no change */
21370 		kr = KERN_SUCCESS;
21371 	} else {
21372 		/* set new limit */
21373 		DTRACE_VM2(vm_map_set_size_limit,
21374 		    vm_map_size_t, map->size,
21375 		    uint64_t, new_size_limit);
21376 		if (new_size_limit != RLIM_INFINITY) {
21377 			vm_map_set_size_limit_count++;
21378 		}
21379 		map->size_limit = new_size_limit;
21380 		kr = KERN_SUCCESS;
21381 	}
21382 	vm_map_unlock(map);
21383 	return kr;
21384 }
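/*
 * Usage sketch (editorial illustration, not part of the original source):
 * per the comment above, the BSD side would mirror a setrlimit(RLIMIT_AS)
 * value into the map; "rlim" is a hypothetical struct rlimit.
 *
 *	if (vm_map_set_size_limit(map, rlim.rlim_cur) != KERN_SUCCESS) {
 *		// new limit is below the map's current size
 *	}
 */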
21385 
21386 uint64_t vm_map_set_data_limit_count = 0;
21387 kern_return_t
21388 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
21389 {
21390 	kern_return_t kr;
21391 
21392 	vm_map_lock(map);
21393 	if (new_data_limit < map->size) {
21394 		/* new limit should not be lower than its current size */
21395 		DTRACE_VM2(vm_map_set_data_limit_fail,
21396 		    vm_map_size_t, map->size,
21397 		    uint64_t, new_data_limit);
21398 		kr = KERN_FAILURE;
21399 	} else if (new_data_limit == map->data_limit) {
21400 		/* no change */
21401 		kr = KERN_SUCCESS;
21402 	} else {
21403 		/* set new limit */
21404 		DTRACE_VM2(vm_map_set_data_limit,
21405 		    vm_map_size_t, map->size,
21406 		    uint64_t, new_data_limit);
21407 		if (new_data_limit != RLIM_INFINITY) {
21408 			vm_map_set_data_limit_count++;
21409 		}
21410 		map->data_limit = new_data_limit;
21411 		kr = KERN_SUCCESS;
21412 	}
21413 	vm_map_unlock(map);
21414 	return kr;
21415 }
21416 
21417 void
21418 vm_map_set_user_wire_limit(vm_map_t     map,
21419     vm_size_t    limit)
21420 {
21421 	vm_map_lock(map);
21422 	map->user_wire_limit = limit;
21423 	vm_map_unlock(map);
21424 }
21425 
21426 
21427 void
21428 vm_map_switch_protect(vm_map_t     map,
21429     boolean_t    val)
21430 {
21431 	vm_map_lock(map);
21432 	map->switch_protect = val;
21433 	vm_map_unlock(map);
21434 }
21435 
21436 extern int cs_process_enforcement_enable;
21437 boolean_t
21438 vm_map_cs_enforcement(
21439 	vm_map_t map)
21440 {
21441 	if (cs_process_enforcement_enable) {
21442 		return TRUE;
21443 	}
21444 	return map->cs_enforcement;
21445 }
21446 
21447 kern_return_t
21448 vm_map_cs_wx_enable(
21449 	__unused vm_map_t map)
21450 {
21451 #if CODE_SIGNING_MONITOR
21452 	kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
21453 	if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
21454 		return KERN_SUCCESS;
21455 	}
21456 	return ret;
21457 #else
21458 	/* The VM manages WX memory entirely on its own */
21459 	return KERN_SUCCESS;
21460 #endif
21461 }
21462 
21463 kern_return_t
21464 vm_map_csm_allow_jit(
21465 	__unused vm_map_t map)
21466 {
21467 #if CODE_SIGNING_MONITOR
21468 	return csm_allow_jit_region(vm_map_pmap(map));
21469 #else
21470 	/* No code signing monitor to enforce JIT policy */
21471 	return KERN_SUCCESS;
21472 #endif
21473 }
21474 
21475 void
21476 vm_map_cs_debugged_set(
21477 	vm_map_t map,
21478 	boolean_t val)
21479 {
21480 	vm_map_lock(map);
21481 	map->cs_debugged = val;
21482 	vm_map_unlock(map);
21483 }
21484 
21485 void
21486 vm_map_cs_enforcement_set(
21487 	vm_map_t map,
21488 	boolean_t val)
21489 {
21490 	vm_map_lock(map);
21491 	map->cs_enforcement = val;
21492 	pmap_set_vm_map_cs_enforced(map->pmap, val);
21493 	vm_map_unlock(map);
21494 }
21495 
21496 /*
21497  * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
21498  * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
21499  * bump both counters.
21500  */
21501 void
21502 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
21503 {
21504 	pmap_t pmap = vm_map_pmap(map);
21505 
21506 	ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21507 	ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21508 }
21509 
21510 void
21511 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
21512 {
21513 	pmap_t pmap = vm_map_pmap(map);
21514 
21515 	ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21516 	ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21517 }
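
/*
 * Illustrative pairing (a sketch, not a real driver path): because
 * phys_footprint is a composite ledger that is credited and debited
 * together with iokit_mapped, every mapping must eventually be balanced
 * by an unmapping of the same byte count or the footprint drifts:
 *
 *	vm_map_iokit_mapped_region(map, bytes);    // + iokit_mapped, + phys_footprint
 *	// ... region in use ...
 *	vm_map_iokit_unmapped_region(map, bytes);  // - iokit_mapped, - phys_footprint
 */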
21518 
21519 /* Add (generate) code signature for memory range */
21520 #if CONFIG_DYNAMIC_CODE_SIGNING
21521 kern_return_t
21522 vm_map_sign(vm_map_t map,
21523     vm_map_offset_t start,
21524     vm_map_offset_t end)
21525 {
21526 	vm_map_entry_t entry;
21527 	vm_page_t m;
21528 	vm_object_t object;
21529 
21530 	/*
21531 	 * Vet all the input parameters and current type and state of the
21532 	 * underlying object.  Return with an error if anything is amiss.
21533 	 */
21534 	if (map == VM_MAP_NULL) {
21535 		return KERN_INVALID_ARGUMENT;
21536 	}
21537 
21538 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
21539 		return KERN_INVALID_ADDRESS;
21540 	}
21541 
21542 	vm_map_lock_read(map);
21543 
21544 	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
21545 		/*
21546 		 * Must pass a valid non-submap address.
21547 		 */
21548 		vm_map_unlock_read(map);
21549 		return KERN_INVALID_ADDRESS;
21550 	}
21551 
21552 	if ((entry->vme_start > start) || (entry->vme_end < end)) {
21553 		/*
21554 		 * Map entry doesn't cover the requested range. Not handling
21555 		 * this situation currently.
21556 		 */
21557 		vm_map_unlock_read(map);
21558 		return KERN_INVALID_ARGUMENT;
21559 	}
21560 
21561 	object = VME_OBJECT(entry);
21562 	if (object == VM_OBJECT_NULL) {
21563 		/*
21564 		 * Object must already be present or we can't sign.
21565 		 */
21566 		vm_map_unlock_read(map);
21567 		return KERN_INVALID_ARGUMENT;
21568 	}
21569 
21570 	vm_object_lock(object);
21571 	vm_map_unlock_read(map);
21572 
21573 	while (start < end) {
21574 		uint32_t refmod;
21575 
21576 		m = vm_page_lookup(object,
21577 		    start - entry->vme_start + VME_OFFSET(entry));
21578 		if (m == VM_PAGE_NULL) {
21579 			/* should we try to fault a page here? we can probably
21580 			 * demand it exists and is locked for this request */
21581 			vm_object_unlock(object);
21582 			return KERN_FAILURE;
21583 		}
21584 		/* deal with special page status */
21585 		if (m->vmp_busy ||
21586 		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
21587 			vm_object_unlock(object);
21588 			return KERN_FAILURE;
21589 		}
21590 
21591 		/* Page is OK... now "validate" it */
21592 		/* This is the place where we'll call out to create a code
21593 		 * directory, later */
21594 		/* XXX TODO4K: deal with 4k subpages individually? */
21595 		m->vmp_cs_validated = VMP_CS_ALL_TRUE;
21596 
21597 		/* The page is now "clean" for codesigning purposes. That means
21598 		 * we don't consider it as modified (wpmapped) anymore. But
21599 		 * we'll disconnect the page so we note any future modification
21600 		 * attempts. */
21601 		m->vmp_wpmapped = FALSE;
21602 		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
21603 
21604 		/* Pull the dirty status from the pmap, since we cleared the
21605 		 * wpmapped bit */
21606 		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
21607 			SET_PAGE_DIRTY(m, FALSE);
21608 		}
21609 
21610 		/* On to the next page */
21611 		start += PAGE_SIZE;
21612 	}
21613 	vm_object_unlock(object);
21614 
21615 	return KERN_SUCCESS;
21616 }
21617 #endif
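
/*
 * Usage sketch for vm_map_sign() (hypothetical, and only meaningful on
 * CONFIG_DYNAMIC_CODE_SIGNING kernels): the caller aligns the range to
 * the map's page size, and the range must be covered by a single
 * non-submap entry whose object and pages are already resident:
 *
 *	vm_map_offset_t pgmask = vm_map_page_mask(map);
 *	vm_map_offset_t start = vm_map_trunc_page_mask(addr, pgmask);
 *	vm_map_offset_t end = vm_map_round_page_mask(addr + len, pgmask);
 *	kern_return_t kr = vm_map_sign(map, start, end);
 *	// KERN_FAILURE: a page was absent or busy
 *	// KERN_INVALID_ARGUMENT: range not covered by one entry/object
 */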
21618 
21619 kern_return_t
21620 vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
21621 {
21622 	vm_map_entry_t  entry = VM_MAP_ENTRY_NULL;
21623 	vm_map_entry_t  next_entry;
21624 	kern_return_t   kr = KERN_SUCCESS;
21625 	VM_MAP_ZAP_DECLARE(zap_list);
21626 
21627 	vm_map_lock(map);
21628 
21629 	for (entry = vm_map_first_entry(map);
21630 	    entry != vm_map_to_entry(map);
21631 	    entry = next_entry) {
21632 		next_entry = entry->vme_next;
21633 
21634 		if (!entry->is_sub_map &&
21635 		    VME_OBJECT(entry) &&
21636 		    (VME_OBJECT(entry)->internal == TRUE) &&
21637 		    (VME_OBJECT(entry)->ref_count == 1)) {
21638 			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
21639 			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);
21640 
21641 			(void)vm_map_delete(map, entry->vme_start,
21642 			    entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
21643 			    KMEM_GUARD_NONE, &zap_list);
21644 		}
21645 	}
21646 
21647 	vm_map_unlock(map);
21648 
21649 	vm_map_zap_dispose(&zap_list);
21650 
21651 	return kr;
21652 }
21653 
21654 
21655 #if DEVELOPMENT || DEBUG
21656 
21657 int
21658 vm_map_disconnect_page_mappings(
21659 	vm_map_t map,
21660 	boolean_t do_unnest)
21661 {
21662 	vm_map_entry_t entry;
21663 	ledger_amount_t byte_count = 0;
21664 
21665 	if (do_unnest == TRUE) {
21666 #ifndef NO_NESTED_PMAP
21667 		vm_map_lock(map);
21668 
21669 		for (entry = vm_map_first_entry(map);
21670 		    entry != vm_map_to_entry(map);
21671 		    entry = entry->vme_next) {
21672 			if (entry->is_sub_map && entry->use_pmap) {
21673 				/*
21674 				 * Make sure the range between the start of this entry and
21675 				 * the end of this entry is no longer nested, so that
21676 				 * we will only remove mappings from the pmap in use by
21677 				 * this task
21678 				 */
21679 				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
21680 			}
21681 		}
21682 		vm_map_unlock(map);
21683 #endif
21684 	}
21685 	vm_map_lock_read(map);
21686 
21687 	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);
21688 
21689 	for (entry = vm_map_first_entry(map);
21690 	    entry != vm_map_to_entry(map);
21691 	    entry = entry->vme_next) {
21692 		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
21693 		    (VME_OBJECT(entry)->phys_contiguous))) {
21694 			continue;
21695 		}
21696 		if (entry->is_sub_map) {
21697 			assert(!entry->use_pmap);
21698 		}
21699 
21700 		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
21701 	}
21702 	vm_map_unlock_read(map);
21703 
21704 	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
21705 }
21706 
21707 kern_return_t
21708 vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
21709 {
21710 	vm_object_t object = NULL;
21711 	vm_object_offset_t offset;
21712 	vm_prot_t prot;
21713 	boolean_t wired;
21714 	vm_map_version_t version;
21715 	vm_map_t real_map;
21716 	int result = KERN_FAILURE;
21717 
21718 	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
21719 	vm_map_lock(map);
21720 
21721 	result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
21722 	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
21723 	    NULL, &real_map, NULL);
21724 	if (object == NULL) {
21725 		result = KERN_MEMORY_ERROR;
21726 	} else if (object->pager) {
21727 		result = vm_compressor_pager_inject_error(object->pager,
21728 		    offset);
21729 	} else {
21730 		result = KERN_MEMORY_PRESENT;
21731 	}
21732 
21733 	if (object != NULL) {
21734 		vm_object_unlock(object);
21735 	}
21736 
21737 	if (real_map != map) {
21738 		vm_map_unlock(real_map);
21739 	}
21740 	vm_map_unlock(map);
21741 
21742 	return result;
21743 }
21744 
21745 #endif
21746 
21747 
21748 #if CONFIG_FREEZE
21749 
21750 
21751 extern struct freezer_context freezer_context_global;
21752 AbsoluteTime c_freezer_last_yield_ts = 0;
21753 
21754 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
21755 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
21756 
21757 kern_return_t
21758 vm_map_freeze(
21759 	task_t       task,
21760 	unsigned int *purgeable_count,
21761 	unsigned int *wired_count,
21762 	unsigned int *clean_count,
21763 	unsigned int *dirty_count,
21764 	unsigned int dirty_budget,
21765 	unsigned int *shared_count,
21766 	int          *freezer_error_code,
21767 	boolean_t    eval_only)
21768 {
21769 	vm_map_entry_t  entry2 = VM_MAP_ENTRY_NULL;
21770 	kern_return_t   kr = KERN_SUCCESS;
21771 	boolean_t       evaluation_phase = TRUE;
21772 	vm_object_t     cur_shared_object = NULL;
21773 	int             cur_shared_obj_ref_cnt = 0;
21774 	unsigned int    dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;
21775 
21776 	*purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;
21777 
21778 	/*
21779 	 * We need the exclusive lock here so that we can
21780 	 * block any page faults or lookups while we are
21781 	 * in the middle of freezing this vm map.
21782 	 */
21783 	vm_map_t map = task->map;
21784 
21785 	vm_map_lock(map);
21786 
21787 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
21788 
21789 	if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
21790 		if (vm_compressor_low_on_space()) {
21791 			*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
21792 		}
21793 
21794 		if (vm_swap_low_on_space()) {
21795 			*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
21796 		}
21797 
21798 		kr = KERN_NO_SPACE;
21799 		goto done;
21800 	}
21801 
21802 	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
21803 		/*
21804 		 * In-memory compressor backing the freezer. No disk.
21805 		 * So no need to do the evaluation phase.
21806 		 */
21807 		evaluation_phase = FALSE;
21808 
21809 		if (eval_only == TRUE) {
21810 			/*
21811 			 * We don't support 'eval_only' mode
21812 			 * in this non-swap config.
21813 			 */
21814 			*freezer_error_code = FREEZER_ERROR_GENERIC;
21815 			kr = KERN_INVALID_ARGUMENT;
21816 			goto done;
21817 		}
21818 
21819 		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
21820 		clock_get_uptime(&c_freezer_last_yield_ts);
21821 	}
21822 again:
21823 
21824 	for (entry2 = vm_map_first_entry(map);
21825 	    entry2 != vm_map_to_entry(map);
21826 	    entry2 = entry2->vme_next) {
21827 		vm_object_t src_object;
21828 
21829 		if (entry2->is_sub_map) {
21830 			continue;
21831 		}
21832 
21833 		src_object = VME_OBJECT(entry2);
21834 		if (!src_object ||
21835 		    src_object->phys_contiguous ||
21836 		    !src_object->internal) {
21837 			continue;
21838 		}
21839 
21840 		/* If eligible, scan the entry, moving eligible pages over to our parent object */
21841 
21842 		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
21843 			/*
21844 			 * We skip purgeable objects during evaluation phase only.
21845 			 * If we decide to freeze this process, we'll explicitly
21846 			 * purge these objects before we go around again with
21847 			 * 'evaluation_phase' set to FALSE.
21848 			 */
21849 
21850 			if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
21851 				/*
21852 				 * We want to purge objects that may not belong to this task but are mapped
21853 				 * in this task alone. Since we already purged this task's purgeable memory
21854 				 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
21855 				 * on this task's purgeable objects. Hence the check for only volatile objects.
21856 				 */
21857 				if (evaluation_phase ||
21858 				    src_object->purgable != VM_PURGABLE_VOLATILE ||
21859 				    src_object->ref_count != 1) {
21860 					continue;
21861 				}
21862 				vm_object_lock(src_object);
21863 				if (src_object->purgable == VM_PURGABLE_VOLATILE &&
21864 				    src_object->ref_count == 1) {
21865 					purgeable_q_t old_queue;
21866 
21867 					/* object should be on a purgeable queue */
21868 					assert(src_object->objq.next != NULL &&
21869 					    src_object->objq.prev != NULL);
21870 					/* move object from its volatile queue to the nonvolatile queue */
21871 					old_queue = vm_purgeable_object_remove(src_object);
21872 					assert(old_queue);
21873 					if (src_object->purgeable_when_ripe) {
21874 						/* remove a token from that volatile queue */
21875 						vm_page_lock_queues();
21876 						vm_purgeable_token_delete_first(old_queue);
21877 						vm_page_unlock_queues();
21878 					}
21879 					/* purge the object */
21880 					vm_object_purge(src_object, 0);
21881 				}
21882 				vm_object_unlock(src_object);
21883 				continue;
21884 			}
21885 
21886 			/*
21887 			 * Pages belonging to this object could be swapped to disk.
21888 			 * Make sure it's not a shared object because we could end
21889 			 * up just bringing it back in again.
21890 			 *
21891 			 * We try to optimize somewhat by checking for objects that are mapped
21892 			 * more than once within our own map. But we don't do full searches;
21893 			 * we just look at the entries following our current entry.
21894 			 */
21895 
21896 			if (src_object->ref_count > 1) {
21897 				if (src_object != cur_shared_object) {
21898 					obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
21899 					dirty_shared_count += obj_pages_snapshot;
21900 
21901 					cur_shared_object = src_object;
21902 					cur_shared_obj_ref_cnt = 1;
21903 					continue;
21904 				} else {
21905 					cur_shared_obj_ref_cnt++;
21906 					if (src_object->ref_count == cur_shared_obj_ref_cnt) {
21907 						/*
21908 						 * Fall through to below and treat this object as private.
21909 						 * So deduct its pages from our shared total and add it to the
21910 						 * private total.
21911 						 */
21912 
21913 						dirty_shared_count -= obj_pages_snapshot;
21914 						dirty_private_count += obj_pages_snapshot;
21915 					} else {
21916 						continue;
21917 					}
21918 				}
21919 			}
21920 
21921 
21922 			if (src_object->ref_count == 1) {
21923 				dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
21924 			}
21925 
21926 			if (evaluation_phase == TRUE) {
21927 				continue;
21928 			}
21929 		}
21930 
21931 		uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
21932 		*wired_count += src_object->wired_page_count;
21933 
21934 		if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
21935 			if (vm_compressor_low_on_space()) {
21936 				*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
21937 			}
21938 
21939 			if (vm_swap_low_on_space()) {
21940 				*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
21941 			}
21942 
21943 			kr = KERN_NO_SPACE;
21944 			break;
21945 		}
21946 		if (paged_out_count >= dirty_budget) {
21947 			break;
21948 		}
21949 		dirty_budget -= paged_out_count;
21950 	}
21951 
21952 	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
21953 	if (evaluation_phase) {
21954 		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;
21955 
21956 		if (dirty_shared_count > shared_pages_threshold) {
21957 			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
21958 			kr = KERN_FAILURE;
21959 			goto done;
21960 		}
21961 
21962 		if (dirty_shared_count &&
21963 		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
21964 			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
21965 			kr = KERN_FAILURE;
21966 			goto done;
21967 		}
21968 
21969 		evaluation_phase = FALSE;
21970 		dirty_shared_count = dirty_private_count = 0;
21971 
21972 		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
21973 		clock_get_uptime(&c_freezer_last_yield_ts);
21974 
21975 		if (eval_only) {
21976 			kr = KERN_SUCCESS;
21977 			goto done;
21978 		}
21979 
21980 		vm_purgeable_purge_task_owned(task);
21981 
21982 		goto again;
21983 	} else {
21984 		kr = KERN_SUCCESS;
21985 	}
21986 
21987 done:
21988 	vm_map_unlock(map);
21989 
21990 	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
21991 		vm_object_compressed_freezer_done();
21992 	}
21993 	return kr;
21994 }
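
/*
 * Call-pattern sketch (a hypothetical freezer caller): with swap-backed
 * freezing (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE), the function can be driven
 * twice -- once with eval_only == TRUE to run just the evaluation phase
 * (shared-memory and private/shared ratio checks), then again with
 * eval_only == FALSE to actually compress the pages:
 *
 *	kr = vm_map_freeze(task, &purgeable, &wired, &clean, &dirty,
 *	    budget, &shared, &err, TRUE);              // evaluate only
 *	if (kr == KERN_SUCCESS) {
 *		kr = vm_map_freeze(task, &purgeable, &wired, &clean,
 *		    &dirty, budget, &shared, &err, FALSE);  // freeze
 *	}
 */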
21995 
21996 #endif
21997 
21998 /*
21999  * vm_map_entry_should_cow_for_true_share:
22000  *
22001  * Determines if the map entry should be clipped and setup for copy-on-write
22002  * to avoid applying "true_share" to a large VM object when only a subset is
22003  * targeted.
22004  *
22005  * For now, we target only the map entries created for the Objective C
22006  * Garbage Collector, which initially have the following properties:
22007  *	- alias == VM_MEMORY_MALLOC
22008  *      - wired_count == 0
22009  *      - !needs_copy
22010  * and a VM object with:
22011  *      - internal
22012  *      - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
22013  *      - !true_share
22014  *      - vo_size == ANON_CHUNK_SIZE
22015  *
22016  * Only non-kernel map entries.
22017  */
22018 boolean_t
22019 vm_map_entry_should_cow_for_true_share(
22020 	vm_map_entry_t  entry)
22021 {
22022 	vm_object_t     object;
22023 
22024 	if (entry->is_sub_map) {
22025 		/* entry does not point at a VM object */
22026 		return FALSE;
22027 	}
22028 
22029 	if (entry->needs_copy) {
22030 		/* already set for copy_on_write: done! */
22031 		return FALSE;
22032 	}
22033 
22034 	if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
22035 	    VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
22036 		/* not a malloc heap or Obj-C Garbage Collector heap */
22037 		return FALSE;
22038 	}
22039 
22040 	if (entry->wired_count) {
22041 		/* wired: can't change the map entry... */
22042 		vm_counters.should_cow_but_wired++;
22043 		return FALSE;
22044 	}
22045 
22046 	object = VME_OBJECT(entry);
22047 
22048 	if (object == VM_OBJECT_NULL) {
22049 		/* no object yet... */
22050 		return FALSE;
22051 	}
22052 
22053 	if (!object->internal) {
22054 		/* not an internal object */
22055 		return FALSE;
22056 	}
22057 
22058 	if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
22059 		/* not the default copy strategy */
22060 		return FALSE;
22061 	}
22062 
22063 	if (object->true_share) {
22064 		/* already true_share: too late to avoid it */
22065 		return FALSE;
22066 	}
22067 
22068 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
22069 	    object->vo_size != ANON_CHUNK_SIZE) {
22070 		/* ... not an object created for the ObjC Garbage Collector */
22071 		return FALSE;
22072 	}
22073 
22074 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
22075 	    object->vo_size != 2048 * 4096) {
22076 		/* ... not a "MALLOC_SMALL" heap */
22077 		return FALSE;
22078 	}
22079 
22080 	/*
22081 	 * All the criteria match: we have a large object being targeted for "true_share".
22082 	 * To limit the adverse side-effects linked with "true_share", tell the caller to
22083 	 * try and avoid setting up the entire object for "true_share" by clipping the
22084 	 * targeted range and setting it up for copy-on-write.
22085 	 */
22086 	return TRUE;
22087 }
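
/*
 * Sketch of the expected caller pattern (an assumption based on the
 * description above, not a verbatim call site): when this returns TRUE
 * and the object is larger than the targeted range, the caller clips
 * the entry to that range and sets it up for copy-on-write rather than
 * marking the whole object "true_share":
 *
 *	if (vm_map_entry_should_cow_for_true_share(entry) &&
 *	    VME_OBJECT(entry)->vo_size > end - start) {
 *		vm_map_clip_start(map, entry, start);
 *		vm_map_clip_end(map, entry, end);
 *		// ... shadow the clipped entry for copy-on-write ...
 *	}
 */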
22088 
22089 uint64_t vm_map_range_overflows_count = 0;
22090 TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE);
22091 bool
22092 vm_map_range_overflows(
22093 	vm_map_t map,
22094 	vm_map_offset_t addr,
22095 	vm_map_size_t size)
22096 {
22097 	vm_map_offset_t start, end, sum;
22098 	vm_map_offset_t pgmask;
22099 
22100 	if (size == 0) {
22101 		/* empty range -> no overflow */
22102 		return false;
22103 	}
22104 	pgmask = vm_map_page_mask(map);
22105 	start = vm_map_trunc_page_mask(addr, pgmask);
22106 	end = vm_map_round_page_mask(addr + size, pgmask);
22107 	if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) {
22108 		vm_map_range_overflows_count++;
22109 		if (vm_map_range_overflows_log) {
22110 			printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n",
22111 			    proc_selfpid(),
22112 			    proc_best_name(current_proc()),
22113 			    (uint64_t)addr,
22114 			    (uint64_t)size,
22115 			    (uint64_t)pgmask);
22116 		}
22117 		DTRACE_VM4(vm_map_range_overflows,
22118 		    vm_map_t, map,
22119 		    uint32_t, pgmask,
22120 		    uint64_t, (uint64_t)addr,
22121 		    uint64_t, (uint64_t)size);
22122 		return true;
22123 	}
22124 	return false;
22125 }
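
/*
 * Worked example (assuming 16KB map pages, so pgmask == 0x3fff): with
 * addr == 0xfffffffffffff000 and size == 0x2000, addr + size wraps past
 * zero and os_add_overflow() trips.  Separately, even when the raw sum
 * does not overflow, rounding "end" up to a page boundary can wrap it
 * below "start", which the "end <= start" test catches.  Either way the
 * range is rejected before any map lookup is attempted.
 */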
22126 
22127 vm_map_offset_t
22128 vm_map_round_page_mask(
22129 	vm_map_offset_t offset,
22130 	vm_map_offset_t mask)
22131 {
22132 	return VM_MAP_ROUND_PAGE(offset, mask);
22133 }
22134 
22135 vm_map_offset_t
22136 vm_map_trunc_page_mask(
22137 	vm_map_offset_t offset,
22138 	vm_map_offset_t mask)
22139 {
22140 	return VM_MAP_TRUNC_PAGE(offset, mask);
22141 }
22142 
22143 boolean_t
22144 vm_map_page_aligned(
22145 	vm_map_offset_t offset,
22146 	vm_map_offset_t mask)
22147 {
22148 	return ((offset) & mask) == 0;
22149 }
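
/*
 * Worked example of the mask arithmetic above (16KB pages, mask 0x3fff):
 *
 *	vm_map_trunc_page_mask(0x5fff, 0x3fff) == 0x4000   // offset & ~mask
 *	vm_map_round_page_mask(0x4001, 0x3fff) == 0x8000   // (offset + mask) & ~mask
 *	vm_map_page_aligned(0x4000, 0x3fff)    == TRUE     // no low bits set
 */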
22150 
22151 int
22152 vm_map_page_shift(
22153 	vm_map_t map)
22154 {
22155 	return VM_MAP_PAGE_SHIFT(map);
22156 }
22157 
22158 int
22159 vm_map_page_size(
22160 	vm_map_t map)
22161 {
22162 	return VM_MAP_PAGE_SIZE(map);
22163 }
22164 
22165 vm_map_offset_t
22166 vm_map_page_mask(
22167 	vm_map_t map)
22168 {
22169 	return VM_MAP_PAGE_MASK(map);
22170 }
22171 
22172 kern_return_t
22173 vm_map_set_page_shift(
22174 	vm_map_t        map,
22175 	int             pageshift)
22176 {
22177 	if (map->hdr.nentries != 0) {
22178 		/* too late to change page size */
22179 		return KERN_FAILURE;
22180 	}
22181 
22182 	map->hdr.page_shift = (uint16_t)pageshift;
22183 
22184 	return KERN_SUCCESS;
22185 }
22186 
22187 kern_return_t
22188 vm_map_query_volatile(
22189 	vm_map_t        map,
22190 	mach_vm_size_t  *volatile_virtual_size_p,
22191 	mach_vm_size_t  *volatile_resident_size_p,
22192 	mach_vm_size_t  *volatile_compressed_size_p,
22193 	mach_vm_size_t  *volatile_pmap_size_p,
22194 	mach_vm_size_t  *volatile_compressed_pmap_size_p)
22195 {
22196 	mach_vm_size_t  volatile_virtual_size;
22197 	mach_vm_size_t  volatile_resident_count;
22198 	mach_vm_size_t  volatile_compressed_count;
22199 	mach_vm_size_t  volatile_pmap_count;
22200 	mach_vm_size_t  volatile_compressed_pmap_count;
22201 	mach_vm_size_t  resident_count;
22202 	vm_map_entry_t  entry;
22203 	vm_object_t     object;
22204 
22205 	/* map should be locked by caller */
22206 
22207 	volatile_virtual_size = 0;
22208 	volatile_resident_count = 0;
22209 	volatile_compressed_count = 0;
22210 	volatile_pmap_count = 0;
22211 	volatile_compressed_pmap_count = 0;
22212 
22213 	for (entry = vm_map_first_entry(map);
22214 	    entry != vm_map_to_entry(map);
22215 	    entry = entry->vme_next) {
22216 		mach_vm_size_t  pmap_resident_bytes, pmap_compressed_bytes;
22217 
22218 		if (entry->is_sub_map) {
22219 			continue;
22220 		}
22221 		if (!(entry->protection & VM_PROT_WRITE)) {
22222 			continue;
22223 		}
22224 		object = VME_OBJECT(entry);
22225 		if (object == VM_OBJECT_NULL) {
22226 			continue;
22227 		}
22228 		if (object->purgable != VM_PURGABLE_VOLATILE &&
22229 		    object->purgable != VM_PURGABLE_EMPTY) {
22230 			continue;
22231 		}
22232 		if (VME_OFFSET(entry)) {
22233 			/*
22234 			 * If the map entry has been split and the object now
22235 			 * appears several times in the VM map, we don't want
22236 			 * to count the object's resident_page_count more than
22237 			 * once.  We count it only for the first one, starting
22238 			 * at offset 0 and ignore the other VM map entries.
22239 			 */
22240 			continue;
22241 		}
22242 		resident_count = object->resident_page_count;
22243 		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
22244 			resident_count = 0;
22245 		} else {
22246 			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
22247 		}
22248 
22249 		volatile_virtual_size += entry->vme_end - entry->vme_start;
22250 		volatile_resident_count += resident_count;
22251 		if (object->pager) {
22252 			volatile_compressed_count +=
22253 			    vm_compressor_pager_get_count(object->pager);
22254 		}
22255 		pmap_compressed_bytes = 0;
22256 		pmap_resident_bytes =
22257 		    pmap_query_resident(map->pmap,
22258 		    entry->vme_start,
22259 		    entry->vme_end,
22260 		    &pmap_compressed_bytes);
22261 		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
22262 		volatile_compressed_pmap_count += (pmap_compressed_bytes
22263 		    / PAGE_SIZE);
22264 	}
22265 
22266 	/* map is still locked on return */
22267 
22268 	*volatile_virtual_size_p = volatile_virtual_size;
22269 	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
22270 	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
22271 	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
22272 	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;
22273 
22274 	return KERN_SUCCESS;
22275 }
22276 
22277 void
22278 vm_map_sizes(vm_map_t map,
22279     vm_map_size_t * psize,
22280     vm_map_size_t * pfree,
22281     vm_map_size_t * plargest_free)
22282 {
22283 	vm_map_entry_t  entry;
22284 	vm_map_offset_t prev;
22285 	vm_map_size_t   free, total_free, largest_free;
22286 	boolean_t       end;
22287 
22288 	if (!map) {
22289 		*psize = *pfree = *plargest_free = 0;
22290 		return;
22291 	}
22292 	total_free = largest_free = 0;
22293 
22294 	vm_map_lock_read(map);
22295 	if (psize) {
22296 		*psize = map->max_offset - map->min_offset;
22297 	}
22298 
22299 	prev = map->min_offset;
22300 	for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
22301 		end = (entry == vm_map_to_entry(map));
22302 
22303 		if (end) {
22304 			free = entry->vme_end   - prev;
22305 		} else {
22306 			free = entry->vme_start - prev;
22307 		}
22308 
22309 		total_free += free;
22310 		if (free > largest_free) {
22311 			largest_free = free;
22312 		}
22313 
22314 		if (end) {
22315 			break;
22316 		}
22317 		prev = entry->vme_end;
22318 	}
22319 	vm_map_unlock_read(map);
22320 	if (pfree) {
22321 		*pfree = total_free;
22322 	}
22323 	if (plargest_free) {
22324 		*plargest_free = largest_free;
22325 	}
22326 }
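
/*
 * Usage sketch (hypothetical caller): note that the NULL-map early
 * return above stores through all three pointers unconditionally, so
 * callers should supply valid storage for each, even though the main
 * path checks them individually:
 *
 *	vm_map_size_t size, free_total, free_largest;
 *	vm_map_sizes(map, &size, &free_total, &free_largest);
 */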
22327 
22328 #if VM_SCAN_FOR_SHADOW_CHAIN
22329 int vm_map_shadow_max(vm_map_t map);
22330 int
22331 vm_map_shadow_max(
22332 	vm_map_t map)
22333 {
22334 	int             shadows, shadows_max;
22335 	vm_map_entry_t  entry;
22336 	vm_object_t     object, next_object;
22337 
22338 	if (map == NULL) {
22339 		return 0;
22340 	}
22341 
22342 	shadows_max = 0;
22343 
22344 	vm_map_lock_read(map);
22345 
22346 	for (entry = vm_map_first_entry(map);
22347 	    entry != vm_map_to_entry(map);
22348 	    entry = entry->vme_next) {
22349 		if (entry->is_sub_map) {
22350 			continue;
22351 		}
22352 		object = VME_OBJECT(entry);
22353 		if (object == NULL) {
22354 			continue;
22355 		}
22356 		vm_object_lock_shared(object);
22357 		for (shadows = 0;
22358 		    object->shadow != NULL;
22359 		    shadows++, object = next_object) {
22360 			next_object = object->shadow;
22361 			vm_object_lock_shared(next_object);
22362 			vm_object_unlock(object);
22363 		}
22364 		vm_object_unlock(object);
22365 		if (shadows > shadows_max) {
22366 			shadows_max = shadows;
22367 		}
22368 	}
22369 
22370 	vm_map_unlock_read(map);
22371 
22372 	return shadows_max;
22373 }
22374 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
22375 
22376 void
22377 vm_commit_pagezero_status(vm_map_t lmap)
22378 {
22379 	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
22380 }
22381 
22382 #if __x86_64__
22383 void
22384 vm_map_set_high_start(
22385 	vm_map_t        map,
22386 	vm_map_offset_t high_start)
22387 {
22388 	map->vmmap_high_start = high_start;
22389 }
22390 #endif /* __x86_64__ */
22391 
22392 #if CODE_SIGNING_MONITOR
22393 
22394 kern_return_t
22395 vm_map_entry_cs_associate(
22396 	vm_map_t                map,
22397 	vm_map_entry_t          entry,
22398 	vm_map_kernel_flags_t   vmk_flags)
22399 {
22400 	vm_object_t cs_object, cs_shadow, backing_object;
22401 	vm_object_offset_t cs_offset, backing_offset;
22402 	void *cs_blobs;
22403 	struct vnode *cs_vnode;
22404 	kern_return_t cs_ret;
22405 
22406 	if (map->pmap == NULL ||
22407 	    entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
22408 	    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
22409 	    VME_OBJECT(entry) == VM_OBJECT_NULL) {
22410 		return KERN_SUCCESS;
22411 	}
22412 
22413 	if (!(entry->protection & VM_PROT_EXECUTE)) {
22414 		/*
22415 		 * This memory region is not executable, so the code-signing
22416 		 * monitor would usually not care about it...
22417 		 */
22418 		if (vmk_flags.vmkf_remap_prot_copy &&
22419 		    (entry->max_protection & VM_PROT_EXECUTE)) {
22420 			/*
22421 			 * ... except if the memory region is being remapped
22422 			 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
22423 			 * which is what a debugger or dtrace would be doing
22424 			 * to prepare to modify an executable page to insert
22425 			 * a breakpoint or activate a probe.
22426 			 * In that case, fall through so that we can mark
22427 			 * this region as being "debugged" and no longer
22428 			 * strictly code-signed.
22429 			 */
22430 		} else {
22431 			/*
22432 			 * Really not executable, so no need to tell the
22433 			 * code-signing monitor.
22434 			 */
22435 			return KERN_SUCCESS;
22436 		}
22437 	}
22438 
22439 	vm_map_lock_assert_exclusive(map);
22440 
22441 	/*
22442 	 * Check for a debug association mapping before we check for used_for_jit. This
22443 	 * allows non-RWX JIT on macOS systems to masquerade their mappings as USER_DEBUG
22444 	 * pages instead of USER_JIT. These non-RWX JIT pages cannot be marked as USER_JIT
22445 	 * since they are mapped with RW or RX permissions, which the page table monitor
22446 	 * denies on USER_JIT pages. Given that, if they're not mapped as USER_DEBUG,
22447 	 * they will be mapped as USER_EXEC, and that will cause another page table monitor
22448 	 * violation when those USER_EXEC pages are mapped as RW.
22449 	 *
22450 	 * Since these pages switch between RW and RX through mprotect, they mimic what
22451 	 * we expect a debugger to do. As the code signing monitor does not enforce mappings
22452 	 * on macOS systems, this works in our favor here and allows us to continue to
22453 	 * support these legacy-programmed applications without sacrificing security on
22454 	 * the page table or the code signing monitor. We don't need to explicitly check
22455 	 * for entry_for_jit here and the mapping permissions. If the initial mapping is
22456 	 * created with RX, then the application must map it as RW in order to first write
22457 	 * to the page (MAP_JIT mappings must be private and anonymous). The switch to
22458 	 * RX will cause vm_map_protect to mark the entry as vmkf_remap_prot_copy.
22459 	 * Similarly, if the mapping was created as RW, and then switched to RX,
22460 	 * vm_map_protect will again mark the entry as a copy, and both these cases
22461 	 * lead to this if-statement being entered.
22462 	 *
22463 	 * For more information: rdar://115313336.
22464 	 */
22465 	if (vmk_flags.vmkf_remap_prot_copy) {
22466 		cs_ret = csm_associate_debug_region(
22467 			map->pmap,
22468 			entry->vme_start,
22469 			entry->vme_end - entry->vme_start);
22470 
22471 		/*
22472 		 * csm_associate_debug_region returns not supported when the code signing
22473 		 * monitor is disabled. This is intentional, since cs_ret is checked towards
22474 		 * the end of the function, and if it is not supported, then we still want the
22475 		 * VM to perform code-signing enforcement on this entry. That said, if we don't
22476 		 * mark this as a xnu_user_debug page when the code-signing monitor is disabled,
22477 		 * then it never gets retyped to XNU_USER_DEBUG frame type, which then causes
22478 		 * an issue with debugging (since it'll be mapped in as XNU_USER_EXEC in some
22479 		 * cases, which will cause a violation when attempted to be mapped as writable).
22480 		 */
22481 		if ((cs_ret == KERN_SUCCESS) || (cs_ret == KERN_NOT_SUPPORTED)) {
22482 			entry->vme_xnu_user_debug = TRUE;
22483 		}
22484 #if DEVELOPMENT || DEBUG
22485 		if (vm_log_xnu_user_debug) {
22486 			printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ]  vme_xnu_user_debug=%d cs_ret %d\n",
22487 			    proc_selfpid(),
22488 			    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
22489 			    __FUNCTION__, __LINE__,
22490 			    map, entry,
22491 			    (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
22492 			    entry->vme_xnu_user_debug,
22493 			    cs_ret);
22494 		}
22495 #endif /* DEVELOPMENT || DEBUG */
22496 		goto done;
22497 	}
22498 
22499 	if (entry->used_for_jit) {
22500 		cs_ret = csm_associate_jit_region(
22501 			map->pmap,
22502 			entry->vme_start,
22503 			entry->vme_end - entry->vme_start);
22504 		goto done;
22505 	}
22506 
22507 	cs_object = VME_OBJECT(entry);
22508 	vm_object_lock_shared(cs_object);
22509 	cs_offset = VME_OFFSET(entry);
22510 
22511 	/* find the VM object backed by the code-signed vnode */
22512 	for (;;) {
22513 		/* go to the bottom of cs_object's shadow chain */
22514 		for (;
22515 		    cs_object->shadow != VM_OBJECT_NULL;
22516 		    cs_object = cs_shadow) {
22517 			cs_shadow = cs_object->shadow;
22518 			cs_offset += cs_object->vo_shadow_offset;
22519 			vm_object_lock_shared(cs_shadow);
22520 			vm_object_unlock(cs_object);
22521 		}
22522 		if (cs_object->internal ||
22523 		    cs_object->pager == MEMORY_OBJECT_NULL) {
22524 			vm_object_unlock(cs_object);
22525 			return KERN_SUCCESS;
22526 		}
22527 
22528 		cs_offset += cs_object->paging_offset;
22529 
22530 		/*
22531 		 * cs_object could be backed by a:
22532 		 *      vnode_pager
22533 		 *	apple_protect_pager
22534 		 *      shared_region_pager
22535 		 *	fourk_pager (multiple backing objects -> fail?)
22536 		 * ask the pager if it has a backing VM object
22537 		 */
22538 		if (!memory_object_backing_object(cs_object->pager,
22539 		    cs_offset,
22540 		    &backing_object,
22541 		    &backing_offset)) {
22542 			/* no backing object: cs_object is it */
22543 			break;
22544 		}
22545 
22546 		/* look down the backing object's shadow chain */
22547 		vm_object_lock_shared(backing_object);
22548 		vm_object_unlock(cs_object);
22549 		cs_object = backing_object;
22550 		cs_offset = backing_offset;
22551 	}
22552 
22553 	cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
22554 	if (cs_vnode == NULL) {
22555 		/* no vnode, no code signatures to associate */
22556 		cs_ret = KERN_SUCCESS;
22557 	} else {
22558 		cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
22559 		    &cs_blobs);
22560 		assert(cs_ret == KERN_SUCCESS);
22561 		cs_ret = cs_associate_blob_with_mapping(map->pmap,
22562 		    entry->vme_start,
22563 		    (entry->vme_end - entry->vme_start),
22564 		    cs_offset,
22565 		    cs_blobs);
22566 	}
22567 	vm_object_unlock(cs_object);
22568 	cs_object = VM_OBJECT_NULL;
22569 
22570 done:
22571 	if (cs_ret == KERN_SUCCESS) {
22572 		DTRACE_VM2(vm_map_entry_cs_associate_success,
22573 		    vm_map_offset_t, entry->vme_start,
22574 		    vm_map_offset_t, entry->vme_end);
22575 		if (vm_map_executable_immutable) {
22576 			/*
22577 			 * Prevent this executable
22578 			 * mapping from being unmapped
22579 			 * or modified.
22580 			 */
22581 			entry->vme_permanent = TRUE;
22582 		}
22583 		/*
22584 		 * pmap says it will validate the
22585 		 * code-signing validity of pages
22586 		 * faulted in via this mapping, so
22587 		 * this map entry should be marked so
22588 		 * that vm_fault() bypasses code-signing
22589 		 * validation for faults coming through
22590 		 * this mapping.
22591 		 */
22592 		entry->csm_associated = TRUE;
22593 	} else if (cs_ret == KERN_NOT_SUPPORTED) {
22594 		/*
22595 		 * pmap won't check the code-signing
22596 		 * validity of pages faulted in via
22597 		 * this mapping, so VM should keep
22598 		 * doing it.
22599 		 */
22600 		DTRACE_VM3(vm_map_entry_cs_associate_off,
22601 		    vm_map_offset_t, entry->vme_start,
22602 		    vm_map_offset_t, entry->vme_end,
22603 		    int, cs_ret);
22604 	} else {
22605 		/*
22606 		 * A real error: do not allow
22607 		 * execution in this mapping.
22608 		 */
22609 		DTRACE_VM3(vm_map_entry_cs_associate_failure,
22610 		    vm_map_offset_t, entry->vme_start,
22611 		    vm_map_offset_t, entry->vme_end,
22612 		    int, cs_ret);
22613 		if (vmk_flags.vmkf_overwrite_immutable) {
22614 			/*
22615 			 * We can get here when we remap an apple_protect pager
22616 			 * on top of an already cs_associated executable mapping
22617 			 * with the same code signatures, so we don't want to
22618 			 * lose VM_PROT_EXECUTE in that case...
22619 			 */
22620 		} else {
22621 			entry->protection &= ~VM_PROT_ALLEXEC;
22622 			entry->max_protection &= ~VM_PROT_ALLEXEC;
22623 		}
22624 	}
22625 
22626 	return cs_ret;
22627 }
22628 
22629 #endif /* CODE_SIGNING_MONITOR */
22630 
22631 inline bool
22632 vm_map_is_corpse_source(vm_map_t map)
22633 {
22634 	bool status = false;
22635 	if (map) {
22636 		vm_map_lock_read(map);
22637 		status = map->corpse_source;
22638 		vm_map_unlock_read(map);
22639 	}
22640 	return status;
22641 }
22642 
22643 inline void
22644 vm_map_set_corpse_source(vm_map_t map)
22645 {
22646 	if (map) {
22647 		vm_map_lock(map);
22648 		map->corpse_source = true;
22649 		vm_map_unlock(map);
22650 	}
22651 }
22652 
22653 inline void
22654 vm_map_unset_corpse_source(vm_map_t map)
22655 {
22656 	if (map) {
22657 		vm_map_lock(map);
22658 		map->corpse_source = false;
22659 		vm_map_unlock(map);
22660 	}
22661 }
22662 /*
22663  * FORKED CORPSE FOOTPRINT
22664  *
22665  * A forked corpse gets a copy of the original VM map but its pmap is mostly
22666  * empty since it never ran and never got to fault in any pages.
22667  * Collecting footprint info (via "sysctl vm.self_region_footprint") for
22668  * a forked corpse would therefore return very little information.
22669  *
22670  * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
22671  * to vm_map_fork() to collect footprint information from the original VM map
22672  * and its pmap, and store it in the forked corpse's VM map.  That information
22673  * is stored in place of the VM map's "hole list" since we'll never need to
22674  * lookup for holes in the corpse's map.
22675  *
22676  * The corpse's footprint info looks like this:
22677  *
22678  * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
22679  * as follows:
22680  *                     +---------------------------------------+
22681  *            header-> | cf_size                               |
22682  *                     +-------------------+-------------------+
22683  *                     | cf_last_region    | cf_last_zeroes    |
22684  *                     +-------------------+-------------------+
22685  *           region1-> | cfr_vaddr                             |
22686  *                     +-------------------+-------------------+
22687  *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
22688  *                     +---------------------------------------+
22689  *                     | d4 | d5 | ...                         |
22690  *                     +---------------------------------------+
22691  *                     | ...                                   |
22692  *                     +-------------------+-------------------+
22693  *                     | dy | dz | na | na | cfr_vaddr...      | <-region2
22694  *                     +-------------------+-------------------+
22695  *                     | cfr_vaddr (ctd)   | cfr_num_pages     |
22696  *                     +---------------------------------------+
22697  *                     | d0 | d1 ...                           |
22698  *                     +---------------------------------------+
22699  *                       ...
22700  *                     +---------------------------------------+
22701  *       last region-> | cfr_vaddr                             |
22702  *                     +---------------------------------------+
22703  *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
22704  *                     +---------------------------------------+
22705  *                       ...
22706  *                     +---------------------------------------+
22707  *                     | dx | dy | dz | na | na | na | na | na |
22708  *                     +---------------------------------------+
22709  *
22710  * where:
22711  *      cf_size:	total size of the buffer (rounded to page size)
22712  *      cf_last_region:	offset in the buffer of the last "region" sub-header
22713  *	cf_last_zeroes: number of trailing "zero" dispositions at the end
22714  *			of last region
22715  *	cfr_vaddr:	virtual address of the start of the covered "region"
22716  *	cfr_num_pages:	number of pages in the covered "region"
22717  *	d*:		disposition of the page at that virtual address
22718  * Regions in the buffer are word-aligned.
22719  *
22720  * We estimate the size of the buffer based on the number of memory regions
22721  * and the virtual size of the address space.  While copying each memory region
22722  * during vm_map_fork(), we also collect the footprint info for that region
22723  * and store it in the buffer, packing it as much as possible (coalescing
22724  * contiguous memory regions to avoid having too many region headers and
22725  * avoiding long streaks of "zero" page dispositions by splitting footprint
22726  * "regions", so the number of regions in the footprint buffer might not match
22727  * the number of memory regions in the address space.
22728  *
22729  * We also have to copy the original task's "nonvolatile" ledgers since that's
22730  * part of the footprint and will need to be reported to any tool asking for
22731  * the footprint information of the forked corpse.
22732  */
22733 
22734 uint64_t vm_map_corpse_footprint_count = 0;
22735 uint64_t vm_map_corpse_footprint_size_avg = 0;
22736 uint64_t vm_map_corpse_footprint_size_max = 0;
22737 uint64_t vm_map_corpse_footprint_full = 0;
22738 uint64_t vm_map_corpse_footprint_no_buf = 0;
22739 
22740 struct vm_map_corpse_footprint_header {
22741 	vm_size_t       cf_size;        /* allocated buffer size */
22742 	uint32_t        cf_last_region; /* offset of last region in buffer */
22743 	union {
22744 		uint32_t cfu_last_zeroes; /* during creation:
22745 		                           * number of "zero" dispositions at
22746 		                           * end of last region */
22747 		uint32_t cfu_hint_region; /* during lookup:
22748 		                           * offset of last looked up region */
22749 #define cf_last_zeroes cfu.cfu_last_zeroes
22750 #define cf_hint_region cfu.cfu_hint_region
22751 	} cfu;
22752 };
22753 typedef uint8_t cf_disp_t;
22754 struct vm_map_corpse_footprint_region {
22755 	vm_map_offset_t cfr_vaddr;      /* region start virtual address */
22756 	uint32_t        cfr_num_pages;  /* number of pages in this "region" */
22757 	cf_disp_t   cfr_disposition[0]; /* disposition of each page */
22758 } __attribute__((packed));
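
/*
 * A minimal lookup sketch (not the kernel's actual lookup routine) to
 * make the layout concrete: regions start right after the header, and
 * each one is followed by its dispositions plus word-alignment padding.
 * "hdr", "va" and "pgsize" are assumed inputs:
 *
 *	struct vm_map_corpse_footprint_region *r;
 *	uint32_t roff = sizeof(struct vm_map_corpse_footprint_header);
 *	while (roff <= hdr->cf_last_region) {
 *		r = (struct vm_map_corpse_footprint_region *)((char *)hdr + roff);
 *		if (va >= r->cfr_vaddr &&
 *		    va < r->cfr_vaddr + ((vm_map_offset_t)r->cfr_num_pages * pgsize)) {
 *			cf_disp_t d = r->cfr_disposition[(va - r->cfr_vaddr) / pgsize];
 *			return vm_page_cf_disp_to_disposition(d);
 *		}
 *		roff += sizeof(*r) + r->cfr_num_pages * sizeof(cf_disp_t);
 *		roff = (uint32_t)roundup(roff, sizeof(int));
 *	}
 *
 * The real lookup also starts from cf_hint_region to avoid rescanning
 * from the beginning on every query.
 */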
22759 
22760 static cf_disp_t
22761 vm_page_disposition_to_cf_disp(
22762 	int disposition)
22763 {
22764 	assert(sizeof(cf_disp_t) == 1);
22765 	/* relocate bits that don't fit in a "uint8_t" */
22766 	if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
22767 		disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
22768 	}
22769 	/* cast gets rid of extra bits */
22770 	return (cf_disp_t) disposition;
22771 }
22772 
22773 static int
22774 vm_page_cf_disp_to_disposition(
22775 	cf_disp_t cf_disp)
22776 {
22777 	int disposition;
22778 
22779 	assert(sizeof(cf_disp_t) == 1);
22780 	disposition = (int) cf_disp;
22781 	/* move relocated bits back in place */
22782 	if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
22783 		disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
22784 		disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
22785 	}
22786 	return disposition;
22787 }
22788 
22789 /*
22790  * vm_map_corpse_footprint_new_region:
22791  *      closes the current footprint "region" and creates a new one
22792  *
22793  * Returns NULL if there's not enough space in the buffer for a new region.
22794  */
22795 static struct vm_map_corpse_footprint_region *
22796 vm_map_corpse_footprint_new_region(
22797 	struct vm_map_corpse_footprint_header *footprint_header)
22798 {
22799 	uintptr_t       footprint_edge;
22800 	uint32_t        new_region_offset;
22801 	struct vm_map_corpse_footprint_region *footprint_region;
22802 	struct vm_map_corpse_footprint_region *new_footprint_region;
22803 
22804 	footprint_edge = ((uintptr_t)footprint_header +
22805 	    footprint_header->cf_size);
22806 	footprint_region = ((struct vm_map_corpse_footprint_region *)
22807 	    ((char *)footprint_header +
22808 	    footprint_header->cf_last_region));
22809 	assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
22810 	    footprint_edge);
22811 
22812 	/* get rid of trailing zeroes in the last region */
22813 	assert(footprint_region->cfr_num_pages >=
22814 	    footprint_header->cf_last_zeroes);
22815 	footprint_region->cfr_num_pages -=
22816 	    footprint_header->cf_last_zeroes;
22817 	footprint_header->cf_last_zeroes = 0;
22818 
22819 	/* reuse this region if it's now empty */
22820 	if (footprint_region->cfr_num_pages == 0) {
22821 		return footprint_region;
22822 	}
22823 
22824 	/* compute offset of new region */
22825 	new_region_offset = footprint_header->cf_last_region;
22826 	new_region_offset += sizeof(*footprint_region);
22827 	new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
22828 	new_region_offset = roundup(new_region_offset, sizeof(int));
22829 
22830 	/* check if we're going over the edge */
22831 	if (((uintptr_t)footprint_header +
22832 	    new_region_offset +
22833 	    sizeof(*footprint_region)) >=
22834 	    footprint_edge) {
22835 		/* over the edge: no new region */
22836 		return NULL;
22837 	}
22838 
22839 	/* adjust offset of last region in header */
22840 	footprint_header->cf_last_region = new_region_offset;
22841 
22842 	new_footprint_region = (struct vm_map_corpse_footprint_region *)
22843 	    ((char *)footprint_header +
22844 	    footprint_header->cf_last_region);
22845 	new_footprint_region->cfr_vaddr = 0;
22846 	new_footprint_region->cfr_num_pages = 0;
22847 	/* caller needs to initialize new region */
22848 
22849 	return new_footprint_region;
22850 }
22851 
22852 /*
22853  * vm_map_corpse_footprint_collect:
22854  *	collect footprint information for "old_entry" in "old_map" and
22855  *	stores it in "new_map"'s vmmap_footprint_info.
22856  */
22857 kern_return_t
22858 vm_map_corpse_footprint_collect(
22859 	vm_map_t        old_map,
22860 	vm_map_entry_t  old_entry,
22861 	vm_map_t        new_map)
22862 {
22863 	vm_map_offset_t va;
22864 	kern_return_t   kr;
22865 	struct vm_map_corpse_footprint_header *footprint_header;
22866 	struct vm_map_corpse_footprint_region *footprint_region;
22867 	struct vm_map_corpse_footprint_region *new_footprint_region;
22868 	cf_disp_t       *next_disp_p;
22869 	uintptr_t       footprint_edge;
22870 	uint32_t        num_pages_tmp;
22871 	int             effective_page_size;
22872 
22873 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
22874 
22875 	va = old_entry->vme_start;
22876 
22877 	vm_map_lock_assert_exclusive(old_map);
22878 	vm_map_lock_assert_exclusive(new_map);
22879 
22880 	assert(new_map->has_corpse_footprint);
22881 	assert(!old_map->has_corpse_footprint);
22882 	if (!new_map->has_corpse_footprint ||
22883 	    old_map->has_corpse_footprint) {
22884 		/*
22885 		 * This can only transfer footprint info from a
22886 		 * map with a live pmap to a map with a corpse footprint.
22887 		 */
22888 		return KERN_NOT_SUPPORTED;
22889 	}
22890 
22891 	if (new_map->vmmap_corpse_footprint == NULL) {
22892 		vm_offset_t     buf;
22893 		vm_size_t       buf_size;
22894 
22895 		buf = 0;
22896 		buf_size = (sizeof(*footprint_header) +
22897 		    (old_map->hdr.nentries
22898 		    *
22899 		    (sizeof(*footprint_region) +
22900 		    3))             /* potential alignment for each region */
22901 		    +
22902 		    ((old_map->size / effective_page_size)
22903 		    *
22904 		    sizeof(cf_disp_t)));      /* disposition for each page */
22905 //		printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
22906 		buf_size = round_page(buf_size);
22907 
22908 		/* limit buffer to 1 page to validate overflow detection */
22909 //		buf_size = PAGE_SIZE;
22910 
22911 		/* limit size to a somewhat sane amount */
22912 #if XNU_TARGET_OS_OSX
22913 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (8*1024*1024)   /* 8MB */
22914 #else /* XNU_TARGET_OS_OSX */
22915 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (256*1024)      /* 256KB */
22916 #endif /* XNU_TARGET_OS_OSX */
22917 		if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
22918 			buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
22919 		}
22920 
22921 		/*
22922 		 * Allocate the pageable buffer (with a trailing guard page).
22923 		 * It will be zero-filled on demand.
22924 		 */
22925 		kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
22926 		    KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
22927 		    VM_KERN_MEMORY_DIAG);
22928 		if (kr != KERN_SUCCESS) {
22929 			vm_map_corpse_footprint_no_buf++;
22930 			return kr;
22931 		}
22932 
22933 		/* initialize header and 1st region */
22934 		footprint_header = (struct vm_map_corpse_footprint_header *)buf;
22935 		new_map->vmmap_corpse_footprint = footprint_header;
22936 
22937 		footprint_header->cf_size = buf_size;
22938 		footprint_header->cf_last_region =
22939 		    sizeof(*footprint_header);
22940 		footprint_header->cf_last_zeroes = 0;
22941 
22942 		footprint_region = (struct vm_map_corpse_footprint_region *)
22943 		    ((char *)footprint_header +
22944 		    footprint_header->cf_last_region);
22945 		footprint_region->cfr_vaddr = 0;
22946 		footprint_region->cfr_num_pages = 0;
22947 	} else {
22948 		/* retrieve header and last region */
22949 		footprint_header = (struct vm_map_corpse_footprint_header *)
22950 		    new_map->vmmap_corpse_footprint;
22951 		footprint_region = (struct vm_map_corpse_footprint_region *)
22952 		    ((char *)footprint_header +
22953 		    footprint_header->cf_last_region);
22954 	}
22955 	footprint_edge = ((uintptr_t)footprint_header +
22956 	    footprint_header->cf_size);
22957 
22958 	if ((footprint_region->cfr_vaddr +
22959 	    (((vm_map_offset_t)footprint_region->cfr_num_pages) *
22960 	    effective_page_size))
22961 	    != old_entry->vme_start) {
22962 		uint64_t num_pages_delta, num_pages_delta_size;
22963 		uint32_t region_offset_delta_size;
22964 
22965 		/*
22966 		 * Not the next contiguous virtual address:
22967 		 * start a new region or store "zero" dispositions for
22968 		 * the missing pages?
22969 		 */
22970 		/* size of gap in actual page dispositions */
22971 		num_pages_delta = ((old_entry->vme_start -
22972 		    footprint_region->cfr_vaddr) / effective_page_size)
22973 		    - footprint_region->cfr_num_pages;
22974 		num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
22975 		/* size of gap as a new footprint region header */
22976 		region_offset_delta_size =
22977 		    (sizeof(*footprint_region) +
22978 		    roundup(((footprint_region->cfr_num_pages -
22979 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
22980 		    sizeof(int)) -
22981 		    ((footprint_region->cfr_num_pages -
22982 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
22983 //		printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
22984 		if (region_offset_delta_size < num_pages_delta_size ||
22985 		    os_add3_overflow(footprint_region->cfr_num_pages,
22986 		    (uint32_t) num_pages_delta,
22987 		    1,
22988 		    &num_pages_tmp)) {
22989 			/*
22990 			 * Storing data for this gap would take more space
22991 			 * than inserting a new footprint region header:
22992 			 * let's start a new region and save space. If it's a
22993 			 * tie, let's avoid using a new region, since that
22994 			 * would require more region hops to find the right
22995 			 * range during lookups.
22996 			 *
22997 			 * If the current region's cfr_num_pages would overflow
22998 			 * if we added "zero" page dispositions for the gap,
22999 			 * no choice but to start a new region.
23000 			 */
23001 //			printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
23002 			new_footprint_region =
23003 			    vm_map_corpse_footprint_new_region(footprint_header);
23004 			/* check that we're not going over the edge */
23005 			if (new_footprint_region == NULL) {
23006 				goto over_the_edge;
23007 			}
23008 			footprint_region = new_footprint_region;
23009 			/* initialize new region as empty */
23010 			footprint_region->cfr_vaddr = old_entry->vme_start;
23011 			footprint_region->cfr_num_pages = 0;
23012 		} else {
23013 			/*
23014 			 * Store "zero" page dispositions for the missing
23015 			 * pages.
23016 			 */
23017 //			printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
23018 			for (; num_pages_delta > 0; num_pages_delta--) {
23019 				next_disp_p = (cf_disp_t *)
23020 				    ((uintptr_t) footprint_region +
23021 				    sizeof(*footprint_region));
23022 				next_disp_p += footprint_region->cfr_num_pages;
23023 				/* check that we're not going over the edge */
23024 				if ((uintptr_t)next_disp_p >= footprint_edge) {
23025 					goto over_the_edge;
23026 				}
23027 				/* store "zero" disposition for this gap page */
23028 				footprint_region->cfr_num_pages++;
23029 				*next_disp_p = (cf_disp_t) 0;
23030 				footprint_header->cf_last_zeroes++;
23031 			}
23032 		}
23033 	}
23034 
23035 	for (va = old_entry->vme_start;
23036 	    va < old_entry->vme_end;
23037 	    va += effective_page_size) {
23038 		int             disposition;
23039 		cf_disp_t       cf_disp;
23040 
23041 		vm_map_footprint_query_page_info(old_map,
23042 		    old_entry,
23043 		    va,
23044 		    &disposition);
23045 		cf_disp = vm_page_disposition_to_cf_disp(disposition);
23046 
23047 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
23048 
23049 		if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
23050 			/*
23051 			 * Ignore "zero" dispositions at start of
23052 			 * region: just move start of region.
23053 			 */
23054 			footprint_region->cfr_vaddr += effective_page_size;
23055 			continue;
23056 		}
23057 
23058 		/* would region's cfr_num_pages overflow? */
23059 		if (os_add_overflow(footprint_region->cfr_num_pages, 1,
23060 		    &num_pages_tmp)) {
23061 			/* overflow: create a new region */
23062 			new_footprint_region =
23063 			    vm_map_corpse_footprint_new_region(
23064 				footprint_header);
23065 			if (new_footprint_region == NULL) {
23066 				goto over_the_edge;
23067 			}
23068 			footprint_region = new_footprint_region;
23069 			footprint_region->cfr_vaddr = va;
23070 			footprint_region->cfr_num_pages = 0;
23071 		}
23072 
23073 		next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
23074 		    sizeof(*footprint_region));
23075 		next_disp_p += footprint_region->cfr_num_pages;
23076 		/* check that we're not going over the edge */
23077 		if ((uintptr_t)next_disp_p >= footprint_edge) {
23078 			goto over_the_edge;
23079 		}
23080 		/* store this disposition */
23081 		*next_disp_p = cf_disp;
23082 		footprint_region->cfr_num_pages++;
23083 
23084 		if (cf_disp != 0) {
23085 			/* non-zero disp: break the current zero streak */
23086 			footprint_header->cf_last_zeroes = 0;
23087 			/* done */
23088 			continue;
23089 		}
23090 
23091 		/* zero disp: add to the current streak of zeroes */
23092 		footprint_header->cf_last_zeroes++;
23093 		if ((footprint_header->cf_last_zeroes +
23094 		    roundup(((footprint_region->cfr_num_pages -
23095 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
23096 		    (sizeof(int) - 1),
23097 		    sizeof(int))) <
23098 		    (sizeof(*footprint_header))) {
23099 			/*
23100 			 * There are not enough trailing "zero" dispositions
23101 			 * (+ the extra padding we would need for the previous
23102 			 * region); creating a new region would not save space
23103 			 * at this point, so let's keep this "zero" disposition
23104 			 * in this region and reconsider later.
23105 			 */
23106 			continue;
23107 		}
23108 		/*
23109 		 * Create a new region to avoid having too many consecutive
23110 		 * "zero" dispositions.
23111 		 */
23112 		new_footprint_region =
23113 		    vm_map_corpse_footprint_new_region(footprint_header);
23114 		if (new_footprint_region == NULL) {
23115 			goto over_the_edge;
23116 		}
23117 		footprint_region = new_footprint_region;
23118 		/* initialize the new region as empty ... */
23119 		footprint_region->cfr_num_pages = 0;
23120 		/* ... and skip this "zero" disp */
23121 		footprint_region->cfr_vaddr = va + effective_page_size;
23122 	}
23123 
23124 	return KERN_SUCCESS;
23125 
23126 over_the_edge:
23127 //	printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
23128 	vm_map_corpse_footprint_full++;
23129 	return KERN_RESOURCE_SHORTAGE;
23130 }
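/*
 * Sketch of the collected footprint buffer layout, reconstructed from the
 * walk above and from vm_map_corpse_footprint_query_page_info() below
 * (field widths are whatever the header/region structs declare):
 *
 *   [ header: cf_size | cf_last_region | cf_last_zeroes | cf_hint_region ]
 *   [ region: cfr_vaddr | cfr_num_pages | cfr_disposition[cfr_num_pages] ]
 *   [ padding up to a sizeof(int) boundary ]
 *   [ region: ... ]                                      ...up to cf_size
 */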
23131 
23132 /*
23133  * vm_map_corpse_footprint_collect_done:
23134  *	completes the footprint collection by getting rid of any remaining
23135  *	trailing "zero" dispositions and trimming the unused part of the
23136  *	kernel buffer
23137  */
23138 void
23139 vm_map_corpse_footprint_collect_done(
23140 	vm_map_t        new_map)
23141 {
23142 	struct vm_map_corpse_footprint_header *footprint_header;
23143 	struct vm_map_corpse_footprint_region *footprint_region;
23144 	vm_size_t       buf_size, actual_size;
23145 	kern_return_t   kr;
23146 
23147 	assert(new_map->has_corpse_footprint);
23148 	if (!new_map->has_corpse_footprint ||
23149 	    new_map->vmmap_corpse_footprint == NULL) {
23150 		return;
23151 	}
23152 
23153 	footprint_header = (struct vm_map_corpse_footprint_header *)
23154 	    new_map->vmmap_corpse_footprint;
23155 	buf_size = footprint_header->cf_size;
23156 
23157 	footprint_region = (struct vm_map_corpse_footprint_region *)
23158 	    ((char *)footprint_header +
23159 	    footprint_header->cf_last_region);
23160 
23161 	/* get rid of trailing zeroes in last region */
23162 	assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
23163 	footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
23164 	footprint_header->cf_last_zeroes = 0;
23165 
23166 	actual_size = (vm_size_t)(footprint_header->cf_last_region +
23167 	    sizeof(*footprint_region) +
23168 	    (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));
23169 
23170 //	printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
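	/* running average: avg' = (avg * count + actual_size) / (count + 1) */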
23171 	vm_map_corpse_footprint_size_avg =
23172 	    (((vm_map_corpse_footprint_size_avg *
23173 	    vm_map_corpse_footprint_count) +
23174 	    actual_size) /
23175 	    (vm_map_corpse_footprint_count + 1));
23176 	vm_map_corpse_footprint_count++;
23177 	if (actual_size > vm_map_corpse_footprint_size_max) {
23178 		vm_map_corpse_footprint_size_max = actual_size;
23179 	}
23180 
23181 	actual_size = round_page(actual_size);
23182 	if (buf_size > actual_size) {
23183 		kr = vm_deallocate(kernel_map,
23184 		    ((vm_address_t)footprint_header +
23185 		    actual_size +
23186 		    PAGE_SIZE),                 /* trailing guard page */
23187 		    (buf_size - actual_size));
23188 		assertf(kr == KERN_SUCCESS,
23189 		    "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
23190 		    footprint_header,
23191 		    (uint64_t) buf_size,
23192 		    (uint64_t) actual_size,
23193 		    kr);
23194 		kr = vm_protect(kernel_map,
23195 		    ((vm_address_t)footprint_header +
23196 		    actual_size),
23197 		    PAGE_SIZE,
23198 		    FALSE,             /* set_maximum */
23199 		    VM_PROT_NONE);
23200 		assertf(kr == KERN_SUCCESS,
23201 		    "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
23202 		    footprint_header,
23203 		    (uint64_t) buf_size,
23204 		    (uint64_t) actual_size,
23205 		    kr);
23206 	}
23207 
23208 	footprint_header->cf_size = actual_size;
23209 }
23210 
23211 /*
23212  * vm_map_corpse_footprint_query_page_info:
23213  *	retrieves the disposition of the page at virtual address "vaddr"
23214  *	in the forked corpse's VM map
23215  *
23216  * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
23217  */
23218 kern_return_t
23219 vm_map_corpse_footprint_query_page_info(
23220 	vm_map_t        map,
23221 	vm_map_offset_t va,
23222 	int             *disposition_p)
23223 {
23224 	struct vm_map_corpse_footprint_header *footprint_header;
23225 	struct vm_map_corpse_footprint_region *footprint_region;
23226 	uint32_t        footprint_region_offset;
23227 	vm_map_offset_t region_start, region_end;
23228 	int             disp_idx;
23229 	kern_return_t   kr;
23230 	int             effective_page_size;
23231 	cf_disp_t       cf_disp;
23232 
23233 	if (!map->has_corpse_footprint) {
23234 		*disposition_p = 0;
23235 		kr = KERN_INVALID_ARGUMENT;
23236 		goto done;
23237 	}
23238 
23239 	footprint_header = map->vmmap_corpse_footprint;
23240 	if (footprint_header == NULL) {
23241 		*disposition_p = 0;
23242 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23243 		kr = KERN_INVALID_ARGUMENT;
23244 		goto done;
23245 	}
23246 
23247 	/* start looking at the hint ("cf_hint_region") */
23248 	footprint_region_offset = footprint_header->cf_hint_region;
23249 
23250 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
23251 
23252 lookup_again:
23253 	if (footprint_region_offset < sizeof(*footprint_header)) {
23254 		/* hint too low: start from 1st region */
23255 		footprint_region_offset = sizeof(*footprint_header);
23256 	}
23257 	if (footprint_region_offset >= footprint_header->cf_last_region) {
23258 		/* hint too high: re-start from 1st region */
23259 		footprint_region_offset = sizeof(*footprint_header);
23260 	}
23261 	footprint_region = (struct vm_map_corpse_footprint_region *)
23262 	    ((char *)footprint_header + footprint_region_offset);
23263 	region_start = footprint_region->cfr_vaddr;
23264 	region_end = (region_start +
23265 	    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23266 	    effective_page_size));
23267 	if (va < region_start &&
23268 	    footprint_region_offset != sizeof(*footprint_header)) {
23269 		/* our range starts before the hint region */
23270 
23271 		/* reset the hint (in a racy way...) */
23272 		footprint_header->cf_hint_region = sizeof(*footprint_header);
23273 		/* lookup "va" again from 1st region */
23274 		footprint_region_offset = sizeof(*footprint_header);
23275 		goto lookup_again;
23276 	}
23277 
23278 	while (va >= region_end) {
23279 		if (footprint_region_offset >= footprint_header->cf_last_region) {
23280 			break;
23281 		}
23282 		/* skip the region's header */
23283 		footprint_region_offset += sizeof(*footprint_region);
23284 		/* skip the region's page dispositions */
23285 		footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
23286 		/* align to next word boundary */
23287 		footprint_region_offset =
23288 		    roundup(footprint_region_offset,
23289 		    sizeof(int));
23290 		footprint_region = (struct vm_map_corpse_footprint_region *)
23291 		    ((char *)footprint_header + footprint_region_offset);
23292 		region_start = footprint_region->cfr_vaddr;
23293 		region_end = (region_start +
23294 		    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23295 		    effective_page_size));
23296 	}
23297 	if (va < region_start || va >= region_end) {
23298 		/* page not found */
23299 		*disposition_p = 0;
23300 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23301 		kr = KERN_SUCCESS;
23302 		goto done;
23303 	}
23304 
23305 	/* "va" found: set the lookup hint for next lookup (in a racy way...) */
23306 	footprint_header->cf_hint_region = footprint_region_offset;
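	/*
	 * The hint is purely an optimization: it is read and written without
	 * synchronization, but an out-of-bounds value is clamped or restarted
	 * at "lookup_again" above, so a race can only cost a slower lookup,
	 * never a wrong disposition.
	 */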
23307 
23308 	/* get page disposition for "va" in this region */
23309 	disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
23310 	cf_disp = footprint_region->cfr_disposition[disp_idx];
23311 	*disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
23312 	kr = KERN_SUCCESS;
23313 done:
23314 //	if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23315 	/* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
23316 	DTRACE_VM4(footprint_query_page_info,
23317 	    vm_map_t, map,
23318 	    vm_map_offset_t, va,
23319 	    int, *disposition_p,
23320 	    kern_return_t, kr);
23321 
23322 	return kr;
23323 }
23324 
23325 void
23326 vm_map_corpse_footprint_destroy(
23327 	vm_map_t        map)
23328 {
23329 	if (map->has_corpse_footprint &&
23330 	    map->vmmap_corpse_footprint != 0) {
23331 		struct vm_map_corpse_footprint_header *footprint_header;
23332 		vm_size_t buf_size;
23333 		kern_return_t kr;
23334 
23335 		footprint_header = map->vmmap_corpse_footprint;
23336 		buf_size = footprint_header->cf_size;
23337 		kr = vm_deallocate(kernel_map,
23338 		    (vm_offset_t) map->vmmap_corpse_footprint,
23339 		    ((vm_size_t) buf_size
23340 		    + PAGE_SIZE));                 /* trailing guard page */
23341 		assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
23342 		map->vmmap_corpse_footprint = 0;
23343 		map->has_corpse_footprint = FALSE;
23344 	}
23345 }
23346 
23347 /*
23348  * vm_map_copy_footprint_ledgers:
23349  *	copies any ledger that's relevant to the memory footprint of "old_task"
23350  *	into the forked corpse's task ("new_task")
23351  */
23352 void
23353 vm_map_copy_footprint_ledgers(
23354 	task_t  old_task,
23355 	task_t  new_task)
23356 {
23357 	vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
23358 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
23359 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
23360 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
23361 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
23362 	vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
23363 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
23364 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
23365 	vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
23366 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
23367 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
23368 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
23369 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
23370 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
23371 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
23372 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
23373 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
23374 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
23375 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
23376 	vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
23377 }
23378 
23379 /*
23380  * vm_map_copy_ledger:
23381  *	copy a single ledger from "old_task" to "new_task"
23382  */
23383 void
23384 vm_map_copy_ledger(
23385 	task_t  old_task,
23386 	task_t  new_task,
23387 	int     ledger_entry)
23388 {
23389 	ledger_amount_t old_balance, new_balance, delta;
23390 
23391 	assert(new_task->map->has_corpse_footprint);
23392 	if (!new_task->map->has_corpse_footprint) {
23393 		return;
23394 	}
23395 
23396 	/* turn off sanity checks for the ledger we're about to mess with */
23397 	ledger_disable_panic_on_negative(new_task->ledger,
23398 	    ledger_entry);
23399 
23400 	/* adjust "new_task" to match "old_task" */
23401 	ledger_get_balance(old_task->ledger,
23402 	    ledger_entry,
23403 	    &old_balance);
23404 	ledger_get_balance(new_task->ledger,
23405 	    ledger_entry,
23406 	    &new_balance);
23407 	if (new_balance == old_balance) {
23408 		/* new == old: done */
23409 	} else if (new_balance > old_balance) {
23410 		/* new > old ==> new -= new - old */
23411 		delta = new_balance - old_balance;
23412 		ledger_debit(new_task->ledger,
23413 		    ledger_entry,
23414 		    delta);
23415 	} else {
23416 		/* new < old ==> new += old - new */
23417 		delta = old_balance - new_balance;
23418 		ledger_credit(new_task->ledger,
23419 		    ledger_entry,
23420 		    delta);
23421 	}
23422 }
23423 
23424 /*
23425  * vm_map_get_pmap:
23426  * returns the pmap associated with the vm_map
23427  */
23428 pmap_t
23429 vm_map_get_pmap(vm_map_t map)
23430 {
23431 	return vm_map_pmap(map);
23432 }
23433 
23434 #if CONFIG_MAP_RANGES
23435 static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];
23436 
23437 static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
23438 static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
23439 
23440 /*
23441  * vm_map_range_map_init:
23442  *  initializes the VM range ID map to enable index lookup
23443  *  of user VM ranges based on VM tag from userspace.
23444  */
23445 static void
23446 vm_map_range_map_init(void)
23447 {
23448 	/*
23449 	 * VM_MEMORY_MALLOC{,_NANO} are skipped on purpose:
23450 	 * - the former is malloc metadata which should be kept separate
23451 	 * - the latter has its own ranges
23452 	 */
23453 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
23454 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
23455 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
23456 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
23457 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
23458 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
23459 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
23460 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_TCMALLOC);
23461 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LIBNETWORK);
23462 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOACCELERATOR);
23463 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOSURFACE);
23464 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IMAGEIO);
23465 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREGRAPHICS);
23466 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_CORESERVICES);
23467 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREDATA);
23468 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LAYERKIT);
23469 }
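/*
 * Example of how this bitmap is consumed (see
 * vm_map_kernel_flags_update_range_id() later in this file): an allocation
 * tagged VM_MEMORY_MALLOC_SMALL with a default range ID tests positive in
 * vm_map_user_range_heap_map, so it is retargeted to UMEM_RANGE_ID_HEAP
 * and placed in the map's data range.
 */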
23470 
23471 static struct mach_vm_range
23472 vm_map_range_random_uniform(
23473 	vm_map_size_t           req_size,
23474 	vm_map_offset_t         min_addr,
23475 	vm_map_offset_t         max_addr,
23476 	vm_map_offset_t         offmask)
23477 {
23478 	vm_map_offset_t random_addr;
23479 	struct mach_vm_range alloc;
23480 
23481 	req_size = (req_size + offmask) & ~offmask;
23482 	min_addr = (min_addr + offmask) & ~offmask;
23483 	max_addr = max_addr & ~offmask;
23484 
23485 	read_random(&random_addr, sizeof(random_addr));
23486 	random_addr %= (max_addr - req_size - min_addr);
23487 	random_addr &= ~offmask;
23488 
23489 	alloc.min_address = min_addr + random_addr;
23490 	alloc.max_address = min_addr + random_addr + req_size;
23491 	return alloc;
23492 }
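/*
 * Illustrative numbers (matching the macOS arm64 policy below): with
 * req_size = 1T, min_addr = 0x61T, max_addr = 0x7fT and offmask = 64G - 1,
 * the random offset is drawn in [0, max - min - req), truncated down to a
 * 64G boundary, yielding a 64G-aligned 1T window inside [0x61T, 0x7fT).
 */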
23493 
23494 static vm_map_offset_t
23495 vm_map_range_offmask(void)
23496 {
23497 	uint32_t pte_depth;
23498 
23499 	/*
23500 	 * PTE optimizations
23501 	 *
23502 	 *
23503 	 * 16k pages systems
23504 	 * ~~~~~~~~~~~~~~~~~
23505 	 *
23506 	 * A single L1 (sub-)page covers the address space.
23507 	 * - L2 pages cover 64G,
23508 	 * - L3 pages cover 32M.
23509 	 *
23510 	 * On embedded, the dynamic VA range is 64G and uses a single L2 page.
23511 	 * As a result, we really only need to align the ranges to 32M to avoid
23512 	 * partial L3 pages.
23513 	 *
23514 	 * On macOS, the usage of L2 pages will increase, so we will want to
23515 	 * align ranges to 64G in order to utilize them fully.
23516 	 *
23517 	 *
23518 	 * 4k pages systems
23519 	 * ~~~~~~~~~~~~~~~~
23520 	 *
23521 	 * A single L0 (sub-)page covers the address space.
23522 	 * - L1 pages cover 512G,
23523 	 * - L2 pages cover 1G,
23524 	 * - L3 pages cover 2M.
23525 	 *
23526 	 * The long tail of processes on a system will tend to have a VA usage
23527 	 * (ignoring the shared regions) on the order of 100s of MB.
23528 	 * This is achievable with a single L1 and a few L2s without
23529 	 * randomization.
23530 	 *
23531 	 * However once randomization is introduced, the system will immediately
23532 	 * need several L1s and many more L2s. As a result:
23533 	 *
23534 	 * - on embedded devices, the cost of these extra pages isn't
23535 	 *   sustainable, and we just disable the feature entirely,
23536 	 *
23537 	 * - on macOS we align ranges to a 512G boundary so that the extra L1
23538 	 *   pages can be used to their full potential.
23539 	 */
23540 
23541 	/*
23542 	 * Note: this function assumes _non-exotic mappings_,
23543 	 * which is why it uses the native kernel's PAGE_SHIFT.
23544 	 */
23545 #if XNU_PLATFORM_MacOSX
23546 	pte_depth = PAGE_SHIFT > 12 ? 2 : 3;
23547 #else /* !XNU_PLATFORM_MacOSX */
23548 	pte_depth = PAGE_SHIFT > 12 ? 1 : 0;
23549 #endif /* !XNU_PLATFORM_MacOSX */
23550 
23551 	if (pte_depth == 0) {
23552 		return 0;
23553 	}
23554 
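	/*
	 * Worked examples of the formula below: 16k pages (PAGE_SHIFT == 14)
	 * have 2^11 PTEs per table page, so pte_depth 1 gives a 32M mask
	 * (one full L3 page) and pte_depth 2 gives 64G (one full L2 page);
	 * 4k pages (PAGE_SHIFT == 12) with pte_depth 3 give 512G (one L1).
	 */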
23555 	return (1ull << ((PAGE_SHIFT - 3) * pte_depth + PAGE_SHIFT)) - 1;
23556 }
23557 
23558 /*
23559  * vm_map_range_configure:
23560  *	configures the user vm_map ranges by increasing the maximum VA range of
23561  *  the map and carving out a range at the end of VA space (searching backwards
23562  *  in the newly expanded map).
23563  */
23564 kern_return_t
23565 vm_map_range_configure(vm_map_t map)
23566 {
23567 	const vm_map_offset_t offmask = vm_map_range_offmask();
23568 	struct mach_vm_range data_range;
23569 	vm_map_offset_t default_end;
23570 	kern_return_t kr;
23571 
23572 	if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) {
23573 		/*
23574 		 * No point doing vm ranges in a 32-bit address space.
23575 		 */
23576 		return KERN_NOT_SUPPORTED;
23577 	}
23578 
23579 	/* Should not be applying ranges to kernel map or kernel map submaps */
23580 	assert(vm_map_pmap(map) != kernel_pmap);
23581 
23582 #if XNU_PLATFORM_MacOSX
23583 
23584 	/*
23585 	 * on macOS, the address space is a massive 47 bits (128T),
23586 	 * with several carve outs that processes can't use:
23587 	 * - the shared region
23588 	 * - the commpage region
23589 	 * - the GPU carve out (if applicable)
23590 	 *
23591 	 * and when nano-malloc is in use it desires memory at the 96T mark.
23592 	 *
23593 	 * However, their location is architecture dependent:
23594 	 * - On intel, the shared region and commpage are
23595 	 *   at the very end of the usable address space (above +127T),
23596 	 *   and there is no GPU carve out, and pthread wants to place
23597 	 *   threads at the 112T mark (0x70T).
23598 	 *
23599 	 * - On arm64, these are in the same spot as on embedded devices:
23600 	 *   o shared region:   [ 6G,  10G)  [ will likely grow over time ]
23601 	 *   o commpage region: [63G,  64G)
23602 	 *   o GPU carve out:   [64G, 448G)
23603 	 *
23604 	 * This is convenient because the mappings at the end of the address
23605 	 * space (when they exist) are made by the kernel.
23606 	 *
23607 	 * The policy is to allocate a random 1T for the data heap
23608 	 * in the end of the address-space in the:
23609 	 * - [0x71, 0x7f) range on Intel (to leave space for pthread stacks)
23610 	 * - [0x61, 0x7f) range on ASM (to leave space for Nano malloc).
23611 	 */
23612 
23613 	/* see NANOZONE_SIGNATURE in libmalloc */
23614 #if __x86_64__
23615 	default_end = 0x71ull << 40;
23616 #else
23617 	default_end = 0x61ull << 40;
23618 #endif
23619 	data_range  = vm_map_range_random_uniform(1ull << 40,
23620 	        default_end, 0x7full << 40, offmask);
23621 
23622 #else /* !XNU_PLATFORM_MacOSX */
23623 
23624 	/*
23625 	 * Embedded devices:
23626 	 *
23627 	 *   The default VA Size scales with the device physical memory.
23628 	 *
23629 	 *   Out of that:
23630 	 *   - the "zero" page typically uses 4G + some slide
23631 	 *   - the shared region uses SHARED_REGION_SIZE bytes (4G)
23632 	 *
23633 	 *   Without the use of jumbo or any adjustment to the address space,
23634 	 *   a default VM map typically looks like this:
23635 	 *
23636 	 *       0G -->╒════════════╕
23637 	 *             │  pagezero  │
23638 	 *             │  + slide   │
23639 	 *      ~4G -->╞════════════╡<-- vm_map_min(map)
23640 	 *             │            │
23641 	 *       6G -->├────────────┤
23642 	 *             │   shared   │
23643 	 *             │   region   │
23644 	 *      10G -->├────────────┤
23645 	 *             │            │
23646 	 *   max_va -->├────────────┤<-- vm_map_max(map)
23647 	 *             │            │
23648 	 *             ╎   jumbo    ╎
23649 	 *             ╎            ╎
23650 	 *             │            │
23651 	 *      63G -->╞════════════╡<-- MACH_VM_MAX_ADDRESS
23652 	 *             │  commpage  │
23653 	 *      64G -->├────────────┤<-- MACH_VM_MIN_GPU_CARVEOUT_ADDRESS
23654 	 *             │            │
23655 	 *             ╎    GPU     ╎
23656 	 *             ╎  carveout  ╎
23657 	 *             │            │
23658 	 *     448G -->├────────────┤<-- MACH_VM_MAX_GPU_CARVEOUT_ADDRESS
23659 	 *             │            │
23660 	 *             ╎            ╎
23661 	 *             ╎            ╎
23662 	 *             │            │
23663 	 *     512G -->╘════════════╛<-- (1ull << ARM_16K_TT_L1_SHIFT)
23664 	 *
23665 	 *   When this drawing was made, "max_va" was smaller than
23666 	 *   ARM64_MAX_OFFSET_DEVICE_LARGE (~15.5G), leaving shy of
23667 	 *   12G of address space for the zero-page, slide, files,
23668 	 *   binaries, heap ...
23669 	 *
23670 	 *   We will want to make a "heap/data" carve out inside
23671 	 *   the jumbo range of half of that usable space, assuming
23672 	 *   that this is less than a fourth of the jumbo range.
23673 	 *
23674 	 *   The assert below intends to catch when max_va grows
23675 	 *   too large for this heuristic.
23676 	 */
23677 
23678 	vm_map_lock_read(map);
23679 	default_end = vm_map_max(map);
23680 	vm_map_unlock_read(map);
23681 
23682 	/*
23683 	 * Check that we're not already jumbo'd,
23684 	 * or our address space was somehow modified.
23685 	 *
23686 	 * If so we cannot guarantee that we can set up the ranges
23687 	 * safely without interfering with the existing map.
23688 	 */
23689 	if (default_end > vm_compute_max_offset(true)) {
23690 		return KERN_NO_SPACE;
23691 	}
23692 
23693 	if (pmap_max_offset(true, ARM_PMAP_MAX_OFFSET_DEFAULT)) {
23694 		/*
23695 		 * an override boot-arg was set, disable user-ranges
23696 		 *
23697 		 * XXX: this is problematic because it means these boot-args
23698 		 *      no longer test the behavior that changing the value
23699 		 *      of ARM64_MAX_OFFSET_DEVICE_* would have.
23700 		 */
23701 		return KERN_NOT_SUPPORTED;
23702 	}
23703 
23704 	/* expand the default VM space to the largest possible address */
23705 	vm_map_set_jumbo(map);
23706 
23707 	assert3u(7 * GiB(10) / 2, <=, vm_map_max(map) - default_end);
23708 	data_range = vm_map_range_random_uniform(GiB(10),
23709 	    default_end + PAGE_SIZE, vm_map_max(map), offmask);
23710 
23711 #endif /* !XNU_PLATFORM_MacOSX */
23712 
23713 	/*
23714 	 * Poke holes so that ASAN or people listing regions
23715 	 * do not think this space is free.
23716 	 */
23717 
23718 	if (default_end != data_range.min_address) {
23719 		kr = vm_map_enter(map, &default_end,
23720 		    data_range.min_address - default_end,
23721 		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
23722 		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
23723 		assert(kr == KERN_SUCCESS);
23724 	}
23725 
23726 	if (data_range.max_address != vm_map_max(map)) {
23727 		vm_map_entry_t entry;
23728 		vm_size_t size;
23729 
23730 		vm_map_lock_read(map);
23731 		vm_map_lookup_entry_or_next(map, data_range.max_address, &entry);
23732 		if (entry == vm_map_to_entry(map)) {
23733 			size = vm_map_max(map) - data_range.max_address;
23734 		} else {
23735 			size = entry->vme_start - data_range.max_address;
23736 		}
23737 		vm_map_unlock_read(map);
23738 
23739 		kr = vm_map_enter(map, &data_range.max_address, size,
23740 		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
23741 		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
23742 		assert(kr == KERN_SUCCESS);
23743 	}
23744 
23745 	vm_map_lock(map);
23746 	map->default_range.min_address = vm_map_min(map);
23747 	map->default_range.max_address = default_end;
23748 	map->data_range = data_range;
23749 	map->uses_user_ranges = true;
23750 	vm_map_unlock(map);
23751 
23752 	return KERN_SUCCESS;
23753 }
23754 
23755 /*
23756  * vm_map_range_fork:
23757  *	clones the array of ranges from old_map to new_map in support
23758  *  of a VM map fork.
23759  */
23760 void
23761 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
23762 {
23763 	if (!old_map->uses_user_ranges) {
23764 		/* nothing to do */
23765 		return;
23766 	}
23767 
23768 	new_map->default_range = old_map->default_range;
23769 	new_map->data_range = old_map->data_range;
23770 
23771 	if (old_map->extra_ranges_count) {
23772 		vm_map_user_range_t otable, ntable;
23773 		uint16_t count;
23774 
23775 		otable = old_map->extra_ranges;
23776 		count  = old_map->extra_ranges_count;
23777 		ntable = kalloc_data(count * sizeof(struct vm_map_user_range),
23778 		    Z_WAITOK | Z_ZERO | Z_NOFAIL);
23779 		memcpy(ntable, otable,
23780 		    count * sizeof(struct vm_map_user_range));
23781 
23782 		new_map->extra_ranges_count = count;
23783 		new_map->extra_ranges = ntable;
23784 	}
23785 
23786 	new_map->uses_user_ranges = true;
23787 }
23788 
23789 /*
23790  * vm_map_get_user_range:
23791  *	copy the VM user range for the given VM map and range ID.
23792  */
23793 kern_return_t
23794 vm_map_get_user_range(
23795 	vm_map_t                map,
23796 	vm_map_range_id_t       range_id,
23797 	mach_vm_range_t         range)
23798 {
23799 	if (map == NULL || !map->uses_user_ranges || range == NULL) {
23800 		return KERN_INVALID_ARGUMENT;
23801 	}
23802 
23803 	switch (range_id) {
23804 	case UMEM_RANGE_ID_DEFAULT:
23805 		*range = map->default_range;
23806 		return KERN_SUCCESS;
23807 
23808 	case UMEM_RANGE_ID_HEAP:
23809 		*range = map->data_range;
23810 		return KERN_SUCCESS;
23811 
23812 	default:
23813 		return KERN_INVALID_ARGUMENT;
23814 	}
23815 }
23816 
23817 static vm_map_range_id_t
23818 vm_map_user_range_resolve(
23819 	vm_map_t                map,
23820 	mach_vm_address_t       addr,
23821 	mach_vm_size_t          size,
23822 	mach_vm_range_t         range)
23823 {
23824 	struct mach_vm_range tmp;
23825 
23826 	vm_map_lock_assert_held(map);
23827 
23828 	static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
23829 	static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
23830 
23831 	if (mach_vm_range_contains(&map->default_range, addr, size)) {
23832 		if (range) {
23833 			*range = map->default_range;
23834 		}
23835 		return UMEM_RANGE_ID_DEFAULT;
23836 	}
23837 
23838 	if (mach_vm_range_contains(&map->data_range, addr, size)) {
23839 		if (range) {
23840 			*range = map->data_range;
23841 		}
23842 		return UMEM_RANGE_ID_HEAP;
23843 	}
23844 
23845 	for (size_t i = 0; i < map->extra_ranges_count; i++) {
23846 		vm_map_user_range_t r = &map->extra_ranges[i];
23847 
23848 		tmp.min_address = r->vmur_min_address;
23849 		tmp.max_address = r->vmur_max_address;
23850 
23851 		if (mach_vm_range_contains(&tmp, addr, size)) {
23852 			if (range) {
23853 				*range = tmp;
23854 			}
23855 			return r->vmur_range_id;
23856 		}
23857 	}
23858 
23859 	if (range) {
23860 		range->min_address = range->max_address = 0;
23861 	}
23862 	return UMEM_RANGE_ID_DEFAULT;
23863 }
23864 
23865 static int
23866 vm_map_user_range_cmp(const void *e1, const void *e2)
23867 {
23868 	const struct vm_map_user_range *r1 = e1;
23869 	const struct vm_map_user_range *r2 = e2;
23870 
23871 	if (r1->vmur_min_address != r2->vmur_min_address) {
23872 		return r1->vmur_min_address < r2->vmur_min_address ? -1 : 1;
23873 	}
23874 
23875 	return 0;
23876 }
23877 
23878 static int
23879 mach_vm_range_recipe_v1_cmp(const void *e1, const void *e2)
23880 {
23881 	const mach_vm_range_recipe_v1_t *r1 = e1;
23882 	const mach_vm_range_recipe_v1_t *r2 = e2;
23883 
23884 	if (r1->range.min_address != r2->range.min_address) {
23885 		return r1->range.min_address < r2->range.min_address ? -1 : 1;
23886 	}
23887 
23888 	return 0;
23889 }
23890 
23891 /*!
23892  * @function mach_vm_range_create_v1()
23893  *
23894  * @brief
23895  * Handle the backend for mach_vm_range_create() for the
23896  * MACH_VM_RANGE_FLAVOR_V1 flavor.
23897  *
23898  * @description
23899  * This call allows creating "ranges" in the map of a task
23900  * that have special semantics/policies around placement of
23901  * new allocations (in the vm_map_locate_space() sense).
23902  *
23903  * @returns
23904  * - KERN_SUCCESS on success
23905  * - KERN_INVALID_ARGUMENT for incorrect arguments
23906  * - KERN_NO_SPACE if the maximum amount of ranges would be exceeded
23907  * - KERN_MEMORY_PRESENT if any of the requested ranges
23908  *   overlaps with existing ranges or allocations in the map.
23909  */
23910 static kern_return_t
23911 mach_vm_range_create_v1(
23912 	vm_map_t                map,
23913 	mach_vm_range_recipe_v1_t *recipe,
23914 	uint32_t                new_count)
23915 {
23916 	const vm_offset_t mask = VM_MAP_PAGE_MASK(map);
23917 	vm_map_user_range_t table;
23918 	kern_return_t kr = KERN_SUCCESS;
23919 	uint16_t count;
23920 
23921 	struct mach_vm_range void1 = {
23922 		.min_address = map->default_range.max_address,
23923 		.max_address = map->data_range.min_address,
23924 	};
23925 	struct mach_vm_range void2 = {
23926 		.min_address = map->data_range.max_address,
23927 		.max_address = vm_map_max(map),
23928 	};
23929 
23930 	qsort(recipe, new_count, sizeof(mach_vm_range_recipe_v1_t),
23931 	    mach_vm_range_recipe_v1_cmp);
23932 
23933 	/*
23934 	 * Step 1: Validate that the recipes have no intersections.
23935 	 */
23936 
23937 	for (size_t i = 0; i < new_count; i++) {
23938 		mach_vm_range_t r = &recipe[i].range;
23939 		mach_vm_size_t s;
23940 
23941 		if (recipe[i].flags) {
23942 			return KERN_INVALID_ARGUMENT;
23943 		}
23944 
23945 		static_assert(UMEM_RANGE_ID_FIXED == MACH_VM_RANGE_FIXED);
23946 		switch (recipe[i].range_tag) {
23947 		case MACH_VM_RANGE_FIXED:
23948 			break;
23949 		default:
23950 			return KERN_INVALID_ARGUMENT;
23951 		}
23952 
23953 		if (!VM_MAP_PAGE_ALIGNED(r->min_address, mask) ||
23954 		    !VM_MAP_PAGE_ALIGNED(r->max_address, mask) ||
23955 		    r->min_address >= r->max_address) {
23956 			return KERN_INVALID_ARGUMENT;
23957 		}
23958 
23959 		s = mach_vm_range_size(r);
23960 		if (!mach_vm_range_contains(&void1, r->min_address, s) &&
23961 		    !mach_vm_range_contains(&void2, r->min_address, s)) {
23962 			return KERN_INVALID_ARGUMENT;
23963 		}
23964 
23965 		if (i > 0 && recipe[i - 1].range.max_address >
23966 		    recipe[i].range.min_address) {
23967 			return KERN_INVALID_ARGUMENT;
23968 		}
23969 	}
23970 
23971 	vm_map_lock(map);
23972 
23973 	table = map->extra_ranges;
23974 	count = map->extra_ranges_count;
23975 
23976 	if (count + new_count > VM_MAP_EXTRA_RANGES_MAX) {
23977 		kr = KERN_NO_SPACE;
23978 		goto out_unlock;
23979 	}
23980 
23981 	/*
23982 	 * Step 2: Check that there is no intersection with existing ranges.
23983 	 */
23984 
23985 	for (size_t i = 0, j = 0; i < new_count && j < count;) {
23986 		mach_vm_range_t     r1 = &recipe[i].range;
23987 		vm_map_user_range_t r2 = &table[j];
23988 
23989 		if (r1->max_address <= r2->vmur_min_address) {
23990 			i++;
23991 		} else if (r2->vmur_max_address <= r1->min_address) {
23992 			j++;
23993 		} else {
23994 			kr = KERN_MEMORY_PRESENT;
23995 			goto out_unlock;
23996 		}
23997 	}
23998 
23999 	/*
24000 	 * Step 3: commit the new ranges.
24001 	 */
24002 
24003 	static_assert(VM_MAP_EXTRA_RANGES_MAX * sizeof(struct vm_map_user_range) <=
24004 	    KALLOC_SAFE_ALLOC_SIZE);
24005 
24006 	table = krealloc_data(table,
24007 	    count * sizeof(struct vm_map_user_range),
24008 	    (count + new_count) * sizeof(struct vm_map_user_range),
24009 	    Z_ZERO | Z_WAITOK | Z_NOFAIL);
24010 
24011 	for (size_t i = 0; i < new_count; i++) {
24012 		static_assert(MACH_VM_MAX_ADDRESS < (1ull << 56));
24013 
24014 		table[count + i] = (struct vm_map_user_range){
24015 			.vmur_min_address = recipe[i].range.min_address,
24016 			.vmur_max_address = recipe[i].range.max_address,
24017 			.vmur_range_id    = (vm_map_range_id_t)recipe[i].range_tag,
24018 		};
24019 	}
24020 
24021 	qsort(table, count + new_count,
24022 	    sizeof(struct vm_map_user_range), vm_map_user_range_cmp);
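	/*
	 * Keeping the table sorted lets the overlap check in Step 2 stay a
	 * linear two-pointer merge against the (also sorted) recipes on any
	 * subsequent mach_vm_range_create() call.
	 */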
24023 
24024 	map->extra_ranges_count += new_count;
24025 	map->extra_ranges = table;
24026 
24027 out_unlock:
24028 	vm_map_unlock(map);
24029 
24030 	if (kr == KERN_SUCCESS) {
24031 		for (size_t i = 0; i < new_count; i++) {
24032 			vm_map_kernel_flags_t vmk_flags = {
24033 				.vmf_fixed = true,
24034 				.vmf_overwrite = true,
24035 				.vmkf_overwrite_immutable = true,
24036 				.vm_tag = recipe[i].vm_tag,
24037 			};
24038 			__assert_only kern_return_t kr2;
24039 
24040 			kr2 = vm_map_enter(map, &recipe[i].range.min_address,
24041 			    mach_vm_range_size(&recipe[i].range),
24042 			    0, vmk_flags, VM_OBJECT_NULL, 0, FALSE,
24043 			    VM_PROT_NONE, VM_PROT_ALL,
24044 			    VM_INHERIT_DEFAULT);
24045 			assert(kr2 == KERN_SUCCESS);
24046 		}
24047 	}
24048 	return kr;
24049 }
24050 
24051 kern_return_t
24052 mach_vm_range_create(
24053 	vm_map_t                map,
24054 	mach_vm_range_flavor_t  flavor,
24055 	mach_vm_range_recipes_raw_t recipe,
24056 	natural_t               size)
24057 {
24058 	if (map != current_map()) {
24059 		return KERN_INVALID_ARGUMENT;
24060 	}
24061 
24062 	if (!map->uses_user_ranges) {
24063 		return KERN_NOT_SUPPORTED;
24064 	}
24065 
24066 	if (size == 0) {
24067 		return KERN_SUCCESS;
24068 	}
24069 
24070 	if (flavor == MACH_VM_RANGE_FLAVOR_V1) {
24071 		mach_vm_range_recipe_v1_t *array;
24072 
24073 		if (size % sizeof(mach_vm_range_recipe_v1_t)) {
24074 			return KERN_INVALID_ARGUMENT;
24075 		}
24076 
24077 		size /= sizeof(mach_vm_range_recipe_v1_t);
24078 		if (size > VM_MAP_EXTRA_RANGES_MAX) {
24079 			return KERN_NO_SPACE;
24080 		}
24081 
24082 		array = (mach_vm_range_recipe_v1_t *)recipe;
24083 		return mach_vm_range_create_v1(map, array, size);
24084 	}
24085 
24086 	return KERN_INVALID_ARGUMENT;
24087 }
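/*
 * Hypothetical userspace usage sketch (illustrative only; the MIG-generated
 * user-level call takes the task port, and the addresses here are
 * placeholders):
 *
 *	mach_vm_range_recipe_v1_t recipe = {
 *		.range_tag = MACH_VM_RANGE_FIXED,
 *		.range     = { .min_address = ..., .max_address = ... },
 *	};
 *	kr = mach_vm_range_create(mach_task_self(), MACH_VM_RANGE_FLAVOR_V1,
 *	    (mach_vm_range_recipes_raw_t)&recipe, sizeof(recipe));
 *
 * The range must be page aligned, fall in one of the voids around the data
 * range, and carry flags == 0, or KERN_INVALID_ARGUMENT is returned.
 */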
24088 
24089 #else /* !CONFIG_MAP_RANGES */
24090 
24091 kern_return_t
24092 mach_vm_range_create(
24093 	vm_map_t                map,
24094 	mach_vm_range_flavor_t  flavor,
24095 	mach_vm_range_recipes_raw_t recipe,
24096 	natural_t               size)
24097 {
24098 #pragma unused(map, flavor, recipe, size)
24099 	return KERN_NOT_SUPPORTED;
24100 }
24101 
24102 #endif /* !CONFIG_MAP_RANGES */
24103 
24104 void
24105 vm_map_kernel_flags_update_range_id(vm_map_kernel_flags_t *vmkf, vm_map_t map)
24106 {
24107 	if (map == kernel_map) {
24108 		if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
24109 			vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
24110 		}
24111 #if CONFIG_MAP_RANGES
24112 	} else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
24113 	    vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT &&
24114 	    bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
24115 		vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
24116 #endif /* CONFIG_MAP_RANGES */
24117 	}
24118 }
24119 
24120 /*
24121  * vm_map_entry_has_device_pager:
24122  * Check if the vm map entry specified by the virtual address has a device pager.
24123  * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
24124  */
24125 boolean_t
24126 vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
24127 {
24128 	vm_map_entry_t entry;
24129 	vm_object_t object;
24130 	boolean_t result;
24131 
24132 	if (map == NULL) {
24133 		return FALSE;
24134 	}
24135 
24136 	vm_map_lock(map);
24137 	while (TRUE) {
24138 		if (!vm_map_lookup_entry(map, vaddr, &entry)) {
24139 			result = FALSE;
24140 			break;
24141 		}
24142 		if (entry->is_sub_map) {
24143 			// Check the submap
24144 			vm_map_t submap = VME_SUBMAP(entry);
24145 			assert(submap != NULL);
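			/*
			 * Take the submap's lock before dropping the parent
			 * map's lock so the submap cannot be torn down in
			 * between.
			 */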
24146 			vm_map_lock(submap);
24147 			vm_map_unlock(map);
24148 			map = submap;
24149 			continue;
24150 		}
24151 		object = VME_OBJECT(entry);
24152 		if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
24153 			result = TRUE;
24154 			break;
24155 		}
24156 		result = FALSE;
24157 		break;
24158 	}
24159 
24160 	vm_map_unlock(map);
24161 	return result;
24162 }
24163 
24164 
24165 #if MACH_ASSERT
24166 
24167 extern int pmap_ledgers_panic;
24168 extern int pmap_ledgers_panic_leeway;
24169 
24170 #define LEDGER_DRIFT(__LEDGER)                    \
24171 	int             __LEDGER##_over;          \
24172 	ledger_amount_t __LEDGER##_over_total;    \
24173 	ledger_amount_t __LEDGER##_over_max;      \
24174 	int             __LEDGER##_under;         \
24175 	ledger_amount_t __LEDGER##_under_total;   \
24176 	ledger_amount_t __LEDGER##_under_max
24177 
24178 struct {
24179 	uint64_t        num_pmaps_checked;
24180 
24181 	LEDGER_DRIFT(phys_footprint);
24182 	LEDGER_DRIFT(internal);
24183 	LEDGER_DRIFT(internal_compressed);
24184 	LEDGER_DRIFT(external);
24185 	LEDGER_DRIFT(reusable);
24186 	LEDGER_DRIFT(iokit_mapped);
24187 	LEDGER_DRIFT(alternate_accounting);
24188 	LEDGER_DRIFT(alternate_accounting_compressed);
24189 	LEDGER_DRIFT(page_table);
24190 	LEDGER_DRIFT(purgeable_volatile);
24191 	LEDGER_DRIFT(purgeable_nonvolatile);
24192 	LEDGER_DRIFT(purgeable_volatile_compressed);
24193 	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
24194 	LEDGER_DRIFT(tagged_nofootprint);
24195 	LEDGER_DRIFT(tagged_footprint);
24196 	LEDGER_DRIFT(tagged_nofootprint_compressed);
24197 	LEDGER_DRIFT(tagged_footprint_compressed);
24198 	LEDGER_DRIFT(network_volatile);
24199 	LEDGER_DRIFT(network_nonvolatile);
24200 	LEDGER_DRIFT(network_volatile_compressed);
24201 	LEDGER_DRIFT(network_nonvolatile_compressed);
24202 	LEDGER_DRIFT(media_nofootprint);
24203 	LEDGER_DRIFT(media_footprint);
24204 	LEDGER_DRIFT(media_nofootprint_compressed);
24205 	LEDGER_DRIFT(media_footprint_compressed);
24206 	LEDGER_DRIFT(graphics_nofootprint);
24207 	LEDGER_DRIFT(graphics_footprint);
24208 	LEDGER_DRIFT(graphics_nofootprint_compressed);
24209 	LEDGER_DRIFT(graphics_footprint_compressed);
24210 	LEDGER_DRIFT(neural_nofootprint);
24211 	LEDGER_DRIFT(neural_footprint);
24212 	LEDGER_DRIFT(neural_nofootprint_compressed);
24213 	LEDGER_DRIFT(neural_footprint_compressed);
24214 } pmap_ledgers_drift;
24215 
24216 void
24217 vm_map_pmap_check_ledgers(
24218 	pmap_t          pmap,
24219 	ledger_t        ledger,
24220 	int             pid,
24221 	char            *procname)
24222 {
24223 	ledger_amount_t bal;
24224 	boolean_t       do_panic;
24225 
24226 	do_panic = FALSE;
24227 
24228 	pmap_ledgers_drift.num_pmaps_checked++;
24229 
24230 #define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
24231 MACRO_BEGIN                                                             \
24232 	int panic_on_negative = TRUE;                                   \
24233 	ledger_get_balance(ledger,                                      \
24234 	                   task_ledgers.__LEDGER,                       \
24235 	                   &bal);                                       \
24236 	ledger_get_panic_on_negative(ledger,                            \
24237 	                             task_ledgers.__LEDGER,             \
24238 	                             &panic_on_negative);               \
24239 	if (bal != 0) {                                                 \
24240 	        if (panic_on_negative ||                                \
24241 	            (pmap_ledgers_panic &&                              \
24242 	             pmap_ledgers_panic_leeway > 0 &&                   \
24243 	             (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||  \
24244 	              bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
24245 	                do_panic = TRUE;                                \
24246 	        }                                                       \
24247 	        printf("LEDGER BALANCE proc %d (%s) "                   \
24248 	               "\"%s\" = %lld\n",                               \
24249 	               pid, procname, #__LEDGER, bal);                  \
24250 	        if (bal > 0) {                                          \
24251 	                pmap_ledgers_drift.__LEDGER##_over++;           \
24252 	                pmap_ledgers_drift.__LEDGER##_over_total += bal; \
24253 	                if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
24254 	                        pmap_ledgers_drift.__LEDGER##_over_max = bal; \
24255 	                }                                               \
24256 	        } else if (bal < 0) {                                   \
24257 	                pmap_ledgers_drift.__LEDGER##_under++;          \
24258 	                pmap_ledgers_drift.__LEDGER##_under_total += bal; \
24259 	                if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
24260 	                        pmap_ledgers_drift.__LEDGER##_under_max = bal; \
24261 	                }                                               \
24262 	        }                                                       \
24263 	}                                                               \
24264 MACRO_END
24265 
24266 	LEDGER_CHECK_BALANCE(phys_footprint);
24267 	LEDGER_CHECK_BALANCE(internal);
24268 	LEDGER_CHECK_BALANCE(internal_compressed);
24269 	LEDGER_CHECK_BALANCE(external);
24270 	LEDGER_CHECK_BALANCE(reusable);
24271 	LEDGER_CHECK_BALANCE(iokit_mapped);
24272 	LEDGER_CHECK_BALANCE(alternate_accounting);
24273 	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
24274 	LEDGER_CHECK_BALANCE(page_table);
24275 	LEDGER_CHECK_BALANCE(purgeable_volatile);
24276 	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
24277 	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
24278 	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
24279 	LEDGER_CHECK_BALANCE(tagged_nofootprint);
24280 	LEDGER_CHECK_BALANCE(tagged_footprint);
24281 	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
24282 	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
24283 	LEDGER_CHECK_BALANCE(network_volatile);
24284 	LEDGER_CHECK_BALANCE(network_nonvolatile);
24285 	LEDGER_CHECK_BALANCE(network_volatile_compressed);
24286 	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
24287 	LEDGER_CHECK_BALANCE(media_nofootprint);
24288 	LEDGER_CHECK_BALANCE(media_footprint);
24289 	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
24290 	LEDGER_CHECK_BALANCE(media_footprint_compressed);
24291 	LEDGER_CHECK_BALANCE(graphics_nofootprint);
24292 	LEDGER_CHECK_BALANCE(graphics_footprint);
24293 	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
24294 	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
24295 	LEDGER_CHECK_BALANCE(neural_nofootprint);
24296 	LEDGER_CHECK_BALANCE(neural_footprint);
24297 	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
24298 	LEDGER_CHECK_BALANCE(neural_footprint_compressed);
24299 
24300 	if (do_panic) {
24301 		if (pmap_ledgers_panic) {
24302 			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
24303 			    pmap, pid, procname);
24304 		} else {
24305 			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
24306 			    pmap, pid, procname);
24307 		}
24308 	}
24309 }
24310 
24311 void
24312 vm_map_pmap_set_process(
24313 	vm_map_t map,
24314 	int pid,
24315 	char *procname)
24316 {
24317 	pmap_set_process(vm_map_pmap(map), pid, procname);
24318 }
24319 
24320 #endif /* MACH_ASSERT */
24321