1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_map.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	Virtual memory mapping module.
64  */
65 
66 #include <mach/vm_types.h>
67 #include <mach_assert.h>
68 
69 #include <vm/vm_options.h>
70 
71 #include <libkern/OSAtomic.h>
72 
73 #include <mach/kern_return.h>
74 #include <mach/port.h>
75 #include <mach/vm_attributes.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_behavior.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/memory_object.h>
80 #include <mach/mach_vm.h>
81 #include <machine/cpu_capabilities.h>
82 #include <mach/sdt.h>
83 
84 #include <kern/assert.h>
85 #include <kern/backtrace.h>
86 #include <kern/counter.h>
87 #include <kern/exc_guard.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90 
91 #include <vm/cpm.h>
92 #include <vm/vm_compressor.h>
93 #include <vm/vm_compressor_pager.h>
94 #include <vm/vm_init.h>
95 #include <vm/vm_fault.h>
96 #include <vm/vm_map_internal.h>
97 #include <vm/vm_object.h>
98 #include <vm/vm_page.h>
99 #include <vm/vm_pageout.h>
100 #include <vm/pmap.h>
101 #include <vm/vm_kern.h>
102 #include <ipc/ipc_port.h>
103 #include <kern/sched_prim.h>
104 #include <kern/misc_protos.h>
105 
106 #include <mach/vm_map_server.h>
107 #include <mach/mach_host_server.h>
108 #include <vm/vm_memtag.h>
109 #include <vm/vm_protos.h>
110 #include <vm/vm_purgeable_internal.h>
111 #include <vm/vm_reclaim_internal.h>
112 
113 #include <vm/vm_protos.h>
114 #include <vm/vm_shared_region.h>
115 #include <vm/vm_map_store.h>
116 
117 #include <san/kasan.h>
118 
119 #include <sys/resource.h>
120 #include <sys/random.h>
121 #include <sys/codesign.h>
122 #include <sys/code_signing.h>
123 #include <sys/mman.h>
124 #include <sys/reboot.h>
125 #include <sys/kdebug_triage.h>
126 
127 #include <libkern/section_keywords.h>
128 
129 #if DEVELOPMENT || DEBUG
130 extern int proc_selfcsflags(void);
131 int vm_log_xnu_user_debug = 0;
132 int panic_on_unsigned_execute = 0;
133 int panic_on_mlock_failure = 0;
134 #endif /* DEVELOPMENT || DEBUG */
135 
136 #if MACH_ASSERT
137 int debug4k_filter = 0;
138 char debug4k_proc_name[1024] = "";
139 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
140 int debug4k_panic_on_misaligned_sharing = 0;
141 const char *debug4k_category_name[] = {
142 	"error",        /* 0 */
143 	"life",         /* 1 */
144 	"load",         /* 2 */
145 	"fault",        /* 3 */
146 	"copy",         /* 4 */
147 	"share",        /* 5 */
148 	"adjust",       /* 6 */
149 	"pmap",         /* 7 */
150 	"mementry",     /* 8 */
151 	"iokit",        /* 9 */
152 	"upl",          /* 10 */
153 	"exc",          /* 11 */
154 	"vfs"           /* 12 */
155 };
156 #endif /* MACH_ASSERT */
157 int debug4k_no_cow_copyin = 0;
158 
159 
160 #if __arm64__
161 extern const int fourk_binary_compatibility_unsafe;
162 extern const int fourk_binary_compatibility_allow_wx;
163 #endif /* __arm64__ */
164 extern void qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *));
165 extern int proc_selfpid(void);
166 extern char *proc_name_address(void *p);
167 extern char *proc_best_name(struct proc *p);
168 
169 #if VM_MAP_DEBUG_APPLE_PROTECT
170 int vm_map_debug_apple_protect = 0;
171 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
172 #if VM_MAP_DEBUG_FOURK
173 int vm_map_debug_fourk = 0;
174 #endif /* VM_MAP_DEBUG_FOURK */
175 
176 #if DEBUG || DEVELOPMENT
177 static TUNABLE(bool, vm_map_executable_immutable,
178     "vm_map_executable_immutable", true);
179 #else
180 #define vm_map_executable_immutable true
181 #endif
182 
183 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
184 
185 extern u_int32_t random(void);  /* from <libkern/libkern.h> */
186 /* Internal prototypes
187  */
188 
189 typedef struct vm_map_zap {
190 	vm_map_entry_t          vmz_head;
191 	vm_map_entry_t         *vmz_tail;
192 } *vm_map_zap_t;
193 
194 #define VM_MAP_ZAP_DECLARE(zap) \
195 	struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
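/*
 * Illustrative sketch of the intended zap-list pattern (not code from
 * this file): a zap list accumulates entries removed while the map lock
 * is held so that freeing them can be deferred until the lock is
 * dropped.  A caller of vm_map_delete() roughly does:
 *
 *	VM_MAP_ZAP_DECLARE(zap_list);
 *
 *	vm_map_lock(map);
 *	(void)vm_map_delete(map, start, end, flags, guard, &zap_list);
 *	vm_map_unlock(map);
 *	// entries queued on zap_list are then disposed of outside the lock
 *	// (the disposal helper is not shown in this section)
 */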
196 
197 static vm_map_entry_t   vm_map_entry_insert(
198 	vm_map_t                map,
199 	vm_map_entry_t          insp_entry,
200 	vm_map_offset_t         start,
201 	vm_map_offset_t         end,
202 	vm_object_t             object,
203 	vm_object_offset_t      offset,
204 	vm_map_kernel_flags_t   vmk_flags,
205 	boolean_t               needs_copy,
206 	vm_prot_t               cur_protection,
207 	vm_prot_t               max_protection,
208 	vm_inherit_t            inheritance,
209 	boolean_t               clear_map_aligned);
210 
211 static void vm_map_simplify_range(
212 	vm_map_t        map,
213 	vm_map_offset_t start,
214 	vm_map_offset_t end);   /* forward */
215 
216 static boolean_t        vm_map_range_check(
217 	vm_map_t        map,
218 	vm_map_offset_t start,
219 	vm_map_offset_t end,
220 	vm_map_entry_t  *entry);
221 
222 static void vm_map_submap_pmap_clean(
223 	vm_map_t        map,
224 	vm_map_offset_t start,
225 	vm_map_offset_t end,
226 	vm_map_t        sub_map,
227 	vm_map_offset_t offset);
228 
229 static void             vm_map_pmap_enter(
230 	vm_map_t                map,
231 	vm_map_offset_t         addr,
232 	vm_map_offset_t         end_addr,
233 	vm_object_t             object,
234 	vm_object_offset_t      offset,
235 	vm_prot_t               protection);
236 
237 static void             _vm_map_clip_end(
238 	struct vm_map_header    *map_header,
239 	vm_map_entry_t          entry,
240 	vm_map_offset_t         end);
241 
242 static void             _vm_map_clip_start(
243 	struct vm_map_header    *map_header,
244 	vm_map_entry_t          entry,
245 	vm_map_offset_t         start);
246 
247 static kmem_return_t vm_map_delete(
248 	vm_map_t        map,
249 	vm_map_offset_t start,
250 	vm_map_offset_t end,
251 	vmr_flags_t     flags,
252 	kmem_guard_t    guard,
253 	vm_map_zap_t    zap);
254 
255 static void             vm_map_copy_insert(
256 	vm_map_t        map,
257 	vm_map_entry_t  after_where,
258 	vm_map_copy_t   copy);
259 
260 static kern_return_t    vm_map_copy_overwrite_unaligned(
261 	vm_map_t        dst_map,
262 	vm_map_entry_t  entry,
263 	vm_map_copy_t   copy,
264 	vm_map_address_t start,
265 	boolean_t       discard_on_success);
266 
267 static kern_return_t    vm_map_copy_overwrite_aligned(
268 	vm_map_t        dst_map,
269 	vm_map_entry_t  tmp_entry,
270 	vm_map_copy_t   copy,
271 	vm_map_offset_t start,
272 	pmap_t          pmap);
273 
274 static kern_return_t    vm_map_copyin_kernel_buffer(
275 	vm_map_t        src_map,
276 	vm_map_address_t src_addr,
277 	vm_map_size_t   len,
278 	boolean_t       src_destroy,
279 	vm_map_copy_t   *copy_result);  /* OUT */
280 
281 static kern_return_t    vm_map_copyout_kernel_buffer(
282 	vm_map_t        map,
283 	vm_map_address_t *addr, /* IN/OUT */
284 	vm_map_copy_t   copy,
285 	vm_map_size_t   copy_size,
286 	boolean_t       overwrite,
287 	boolean_t       consume_on_success);
288 
289 static void             vm_map_fork_share(
290 	vm_map_t        old_map,
291 	vm_map_entry_t  old_entry,
292 	vm_map_t        new_map);
293 
294 static boolean_t        vm_map_fork_copy(
295 	vm_map_t        old_map,
296 	vm_map_entry_t  *old_entry_p,
297 	vm_map_t        new_map,
298 	int             vm_map_copyin_flags);
299 
300 static kern_return_t    vm_map_wire_nested(
301 	vm_map_t                   map,
302 	vm_map_offset_t            start,
303 	vm_map_offset_t            end,
304 	vm_prot_t                  caller_prot,
305 	vm_tag_t                   tag,
306 	boolean_t                  user_wire,
307 	pmap_t                     map_pmap,
308 	vm_map_offset_t            pmap_addr,
309 	ppnum_t                    *physpage_p);
310 
311 static kern_return_t    vm_map_unwire_nested(
312 	vm_map_t                   map,
313 	vm_map_offset_t            start,
314 	vm_map_offset_t            end,
315 	boolean_t                  user_wire,
316 	pmap_t                     map_pmap,
317 	vm_map_offset_t            pmap_addr);
318 
319 static kern_return_t    vm_map_overwrite_submap_recurse(
320 	vm_map_t                   dst_map,
321 	vm_map_offset_t            dst_addr,
322 	vm_map_size_t              dst_size);
323 
324 static kern_return_t    vm_map_copy_overwrite_nested(
325 	vm_map_t                   dst_map,
326 	vm_map_offset_t            dst_addr,
327 	vm_map_copy_t              copy,
328 	boolean_t                  interruptible,
329 	pmap_t                     pmap,
330 	boolean_t                  discard_on_success);
331 
332 static kern_return_t    vm_map_remap_extract(
333 	vm_map_t                map,
334 	vm_map_offset_t         addr,
335 	vm_map_size_t           size,
336 	boolean_t               copy,
337 	vm_map_copy_t           map_copy,
338 	vm_prot_t               *cur_protection,
339 	vm_prot_t               *max_protection,
340 	vm_inherit_t            inheritance,
341 	vm_map_kernel_flags_t   vmk_flags);
342 
343 static kern_return_t    vm_map_remap_range_allocate(
344 	vm_map_t                map,
345 	vm_map_address_t        *address,
346 	vm_map_size_t           size,
347 	vm_map_offset_t         mask,
348 	vm_map_kernel_flags_t   vmk_flags,
349 	vm_map_entry_t          *map_entry,
350 	vm_map_zap_t            zap_list);
351 
352 static void             vm_map_region_look_for_page(
353 	vm_map_t                   map,
354 	vm_map_offset_t            va,
355 	vm_object_t                object,
356 	vm_object_offset_t         offset,
357 	int                        max_refcnt,
358 	unsigned short             depth,
359 	vm_region_extended_info_t  extended,
360 	mach_msg_type_number_t count);
361 
362 static int              vm_map_region_count_obj_refs(
363 	vm_map_entry_t             entry,
364 	vm_object_t                object);
365 
366 
367 static kern_return_t    vm_map_willneed(
368 	vm_map_t        map,
369 	vm_map_offset_t start,
370 	vm_map_offset_t end);
371 
372 static kern_return_t    vm_map_reuse_pages(
373 	vm_map_t        map,
374 	vm_map_offset_t start,
375 	vm_map_offset_t end);
376 
377 static kern_return_t    vm_map_reusable_pages(
378 	vm_map_t        map,
379 	vm_map_offset_t start,
380 	vm_map_offset_t end);
381 
382 static kern_return_t    vm_map_can_reuse(
383 	vm_map_t        map,
384 	vm_map_offset_t start,
385 	vm_map_offset_t end);
386 
387 static kern_return_t    vm_map_zero(
388 	vm_map_t        map,
389 	vm_map_offset_t start,
390 	vm_map_offset_t end);
391 
392 static kern_return_t    vm_map_random_address_for_size(
393 	vm_map_t                map,
394 	vm_map_offset_t        *address,
395 	vm_map_size_t           size,
396 	vm_map_kernel_flags_t   vmk_flags);
397 
398 
399 #if CONFIG_MAP_RANGES
400 
401 static vm_map_range_id_t vm_map_user_range_resolve(
402 	vm_map_t                map,
403 	mach_vm_address_t       addr,
404 	mach_vm_address_t       size,
405 	mach_vm_range_t         range);
406 
407 #endif /* CONFIG_MAP_RANGES */
408 #if MACH_ASSERT
409 static kern_return_t    vm_map_pageout(
410 	vm_map_t        map,
411 	vm_map_offset_t start,
412 	vm_map_offset_t end);
413 #endif /* MACH_ASSERT */
414 
415 kern_return_t vm_map_corpse_footprint_collect(
416 	vm_map_t        old_map,
417 	vm_map_entry_t  old_entry,
418 	vm_map_t        new_map);
419 void vm_map_corpse_footprint_collect_done(
420 	vm_map_t        new_map);
421 void vm_map_corpse_footprint_destroy(
422 	vm_map_t        map);
423 kern_return_t vm_map_corpse_footprint_query_page_info(
424 	vm_map_t        map,
425 	vm_map_offset_t va,
426 	int             *disposition_p);
427 void vm_map_footprint_query_page_info(
428 	vm_map_t        map,
429 	vm_map_entry_t  map_entry,
430 	vm_map_offset_t curr_s_offset,
431 	int             *disposition_p);
432 
433 #if CONFIG_MAP_RANGES
434 static void vm_map_range_map_init(void);
435 #endif /* CONFIG_MAP_RANGES */
436 
437 pid_t find_largest_process_vm_map_entries(void);
438 
439 extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code,
440     mach_exception_data_type_t subcode);
441 
442 /*
443  * Macros to copy a vm_map_entry. We must be careful to correctly
444  * manage the wired page count. vm_map_entry_copy() creates a new
445  * map entry to the same memory - the wired count in the new entry
446  * must be set to zero. vm_map_entry_copy_full() creates a new
447  * entry that is identical to the old entry.  This preserves the
448  * wire count; it's used for map splitting and zone changing in
449  * vm_map_copyout.
450  */
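/*
 * Illustrative contrast (a sketch, not an API addition): clipping or
 * splitting an entry in place wants the clone to keep its wiring, while
 * creating an independent mapping of the same memory wants the clone to
 * start unwired:
 *
 *	vm_map_entry_copy_full(new, old);    // wired counts preserved
 *	vm_map_entry_copy(map, new, old);    // wired counts reset to zero
 */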
451 
452 static inline void
453 vm_map_entry_copy_csm_assoc(
454 	vm_map_t map __unused,
455 	vm_map_entry_t new __unused,
456 	vm_map_entry_t old __unused)
457 {
458 #if CODE_SIGNING_MONITOR
459 	/* when code signing monitor is enabled, we want to reset on copy */
460 	new->csm_associated = FALSE;
461 #else
462 	/* when code signing monitor is not enabled, assert as a sanity check */
463 	assert(new->csm_associated == FALSE);
464 #endif
465 #if DEVELOPMENT || DEBUG
466 	if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
467 		printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] resetting vme_xnu_user_debug\n",
468 		    proc_selfpid(),
469 		    (get_bsdtask_info(current_task())
470 		    ? proc_name_address(get_bsdtask_info(current_task()))
471 		    : "?"),
472 		    __FUNCTION__, __LINE__,
473 		    map, new, new->vme_start, new->vme_end);
474 	}
475 #endif /* DEVELOPMENT || DEBUG */
476 	new->vme_xnu_user_debug = FALSE;
477 }
478 
479 /*
480  * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
481  * But for security reasons on some platforms, we don't want the
482  * new mapping to be "used for jit", so we reset the flag here.
483  */
484 static inline void
485 vm_map_entry_copy_code_signing(
486 	vm_map_t map,
487 	vm_map_entry_t new,
488 	vm_map_entry_t old __unused)
489 {
490 	if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
491 		assert(new->used_for_jit == old->used_for_jit);
492 	} else {
493 		if (old->used_for_jit) {
494 			DTRACE_VM3(cs_wx,
495 			    uint64_t, new->vme_start,
496 			    uint64_t, new->vme_end,
497 			    vm_prot_t, new->protection);
498 			printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
499 			    proc_selfpid(),
500 			    (get_bsdtask_info(current_task())
501 			    ? proc_name_address(get_bsdtask_info(current_task()))
502 			    : "?"),
503 			    __FUNCTION__,
504 			    "removing execute access");
505 			new->protection &= ~VM_PROT_EXECUTE;
506 			new->max_protection &= ~VM_PROT_EXECUTE;
507 		}
508 		new->used_for_jit = FALSE;
509 	}
510 }
511 
512 static inline void
513 vm_map_entry_copy_full(
514 	vm_map_entry_t new,
515 	vm_map_entry_t old)
516 {
517 #if MAP_ENTRY_CREATION_DEBUG
518 	btref_put(new->vme_creation_bt);
519 	btref_retain(old->vme_creation_bt);
520 #endif
521 #if MAP_ENTRY_INSERTION_DEBUG
522 	btref_put(new->vme_insertion_bt);
523 	btref_retain(old->vme_insertion_bt);
524 #endif
525 #if VM_BTLOG_TAGS
526 	/* Discard the btref that might be in the new entry */
527 	if (new->vme_kernel_object) {
528 		btref_put(new->vme_tag_btref);
529 	}
530 	/* Retain the btref in the old entry to account for its copy */
531 	if (old->vme_kernel_object) {
532 		btref_retain(old->vme_tag_btref);
533 	}
534 #endif /* VM_BTLOG_TAGS */
535 	*new = *old;
536 }
537 
538 static inline void
539 vm_map_entry_copy(
540 	vm_map_t map,
541 	vm_map_entry_t new,
542 	vm_map_entry_t old)
543 {
544 	vm_map_entry_copy_full(new, old);
545 
546 	new->is_shared = FALSE;
547 	new->needs_wakeup = FALSE;
548 	new->in_transition = FALSE;
549 	new->wired_count = 0;
550 	new->user_wired_count = 0;
551 	new->vme_permanent = FALSE;
552 	vm_map_entry_copy_code_signing(map, new, old);
553 	vm_map_entry_copy_csm_assoc(map, new, old);
554 	if (new->iokit_acct) {
555 		assertf(!new->use_pmap, "old %p new %p\n", old, new);
556 		new->iokit_acct = FALSE;
557 		new->use_pmap = TRUE;
558 	}
559 	new->vme_resilient_codesign = FALSE;
560 	new->vme_resilient_media = FALSE;
561 	new->vme_atomic = FALSE;
562 	new->vme_no_copy_on_read = FALSE;
563 }
564 
565 /*
566  * Normal lock_read_to_write() returns FALSE/0 on failure.
567  * These functions evaluate to zero on success and non-zero value on failure.
568  */
569 __attribute__((always_inline))
570 int
571 vm_map_lock_read_to_write(vm_map_t map)
572 {
573 	if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
574 		DTRACE_VM(vm_map_lock_upgrade);
575 		return 0;
576 	}
577 	return 1;
578 }
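/*
 * Illustrative caller pattern (a sketch, assuming the usual lck_rw
 * semantics that a failed shared-to-exclusive upgrade drops the shared
 * lock): any state looked up under the read lock must be re-validated
 * after a failed upgrade:
 *
 *	if (vm_map_lock_read_to_write(map)) {
 *		// non-zero: upgrade failed and the read lock was released
 *		vm_map_lock_read(map);
 *		// ...re-lookup the entry before retrying...
 *	}
 *	// zero: the map is now locked for write
 */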
579 
580 __attribute__((always_inline))
581 boolean_t
582 vm_map_try_lock(vm_map_t map)
583 {
584 	if (lck_rw_try_lock_exclusive(&(map)->lock)) {
585 		DTRACE_VM(vm_map_lock_w);
586 		return TRUE;
587 	}
588 	return FALSE;
589 }
590 
591 __attribute__((always_inline))
592 boolean_t
593 vm_map_try_lock_read(vm_map_t map)
594 {
595 	if (lck_rw_try_lock_shared(&(map)->lock)) {
596 		DTRACE_VM(vm_map_lock_r);
597 		return TRUE;
598 	}
599 	return FALSE;
600 }
601 
602 /*!
603  * @function kdp_vm_map_is_acquired_exclusive
604  *
605  * @abstract
606  * Checks whether the vm map lock is held exclusively.
607  *
608  * @discussion
609  * NOT SAFE: To be used only by kernel debugger.
610  *
611  * @param map map to check
612  *
613  * @returns TRUE if the map is acquired exclusively.
614  */
615 boolean_t
616 kdp_vm_map_is_acquired_exclusive(vm_map_t map)
617 {
618 	return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
619 }
620 
621 /*
622  * Routines to get the page size the caller should
623  * use while inspecting the target address space.
624  * Use the "_safely" variant if the caller is dealing with a user-provided
625  * array whose size depends on the page size, to avoid any overflow or
626  * underflow of a user-allocated buffer.
627  */
628 int
629 vm_self_region_page_shift_safely(
630 	vm_map_t target_map)
631 {
632 	int effective_page_shift = 0;
633 
634 	if (PAGE_SIZE == (4096)) {
635 		/* x86_64 and 4k watches: always use 4k */
636 		return PAGE_SHIFT;
637 	}
638 	/* did caller provide an explicit page size for this thread to use? */
639 	effective_page_shift = thread_self_region_page_shift();
640 	if (effective_page_shift) {
641 		/* use the explicitly-provided page size */
642 		return effective_page_shift;
643 	}
644 	/* no explicit page size: use the caller's page size... */
645 	effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
646 	if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
647 		/* page size match: safe to use */
648 		return effective_page_shift;
649 	}
650 	/* page size mismatch */
651 	return -1;
652 }
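/*
 * Illustrative use of the "_safely" variant (a sketch; "region_size" and
 * "npages" are hypothetical names): when a user-supplied buffer is sized
 * in pages, refuse to guess on a page-size mismatch rather than risking
 * an over- or under-run:
 *
 *	int shift = vm_self_region_page_shift_safely(target_map);
 *	if (shift == -1) {
 *		return KERN_INVALID_ARGUMENT;	// page size mismatch
 *	}
 *	npages = (unsigned int)(region_size >> shift);
 */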
653 int
654 vm_self_region_page_shift(
655 	vm_map_t target_map)
656 {
657 	int effective_page_shift;
658 
659 	effective_page_shift = vm_self_region_page_shift_safely(target_map);
660 	if (effective_page_shift == -1) {
661 		/* no safe value but OK to guess for caller */
662 		effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
663 		    VM_MAP_PAGE_SHIFT(target_map));
664 	}
665 	return effective_page_shift;
666 }
667 
668 
669 /*
670  *	Decide if we want to allow processes to execute from their data or stack areas.
671  *	override_nx() returns true if we do.  Data/stack execution can be enabled independently
672  *	for 32 and 64 bit processes.  Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
673  *	or allow_stack_exec to enable data execution for that type of data area for that particular
674  *	ABI (or both by or'ing the flags together).  These are initialized in the architecture
675  *	specific pmap files since the default behavior varies according to architecture.  The
676  *	main reason it varies is because of the need to provide binary compatibility with old
677  *	applications that were written before these restrictions came into being.  In the old
678  *	days, an app could execute anything it could read, but this has slowly been tightened
679  *	up over time.  The default behavior is:
680  *
681  *	32-bit PPC apps		may execute from both stack and data areas
682  *	32-bit Intel apps	may execute from data areas but not stack
683  *	64-bit PPC/Intel apps	may not execute from either data or stack
684  *
685  *	An application on any architecture may override these defaults by explicitly
686  *	adding PROT_EXEC permission to the page in question with the mprotect(2)
687  *	system call.  This code here just determines what happens when an app tries to
688  *      execute from a page that lacks execute permission.
689  *
690  *	Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
691  *	default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
692  *	a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
693  *	execution from data areas for a particular binary even if the arch normally permits it. As
694  *	a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
695  *	to support some complicated use cases, notably browsers with out-of-process plugins that
696  *	are not all NX-safe.
697  */
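/*
 * Illustrative example of the flag encoding described above (a sketch):
 * allowing data-area execution for both ABIs amounts to the pmap layer
 * initializing
 *
 *	allow_data_exec = VM_ABI_32 | VM_ABI_64;
 *
 * after which override_nx() returns a non-zero value for a data-area
 * access in either ABI, unless the map has map_disallow_data_exec set.
 */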
698 
699 extern int allow_data_exec, allow_stack_exec;
700 
701 int
702 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
703 {
704 	int current_abi;
705 
706 	if (map->pmap == kernel_pmap) {
707 		return FALSE;
708 	}
709 
710 	/*
711 	 * Determine if the app is running in 32 or 64 bit mode.
712 	 */
713 
714 	if (vm_map_is_64bit(map)) {
715 		current_abi = VM_ABI_64;
716 	} else {
717 		current_abi = VM_ABI_32;
718 	}
719 
720 	/*
721 	 * Determine if we should allow the execution based on whether it's a
722 	 * stack or data area and the current architecture.
723 	 */
724 
725 	if (user_tag == VM_MEMORY_STACK) {
726 		return allow_stack_exec & current_abi;
727 	}
728 
729 	return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
730 }
731 
732 
733 /*
734  *	Virtual memory maps provide for the mapping, protection,
735  *	and sharing of virtual memory objects.  In addition,
736  *	this module provides for an efficient virtual copy of
737  *	memory from one map to another.
738  *
739  *	Synchronization is required prior to most operations.
740  *
741  *	Maps consist of an ordered doubly-linked list of simple
742  *	entries; a single hint is used to speed up lookups.
743  *
744  *	Sharing maps have been deleted from this version of Mach.
745  *	All shared objects are now mapped directly into the respective
746  *	maps.  This requires a change in the copy on write strategy;
747  *	the asymmetric (delayed) strategy is used for shared temporary
748  *	objects instead of the symmetric (shadow) strategy.  All maps
749  *	are now "top level" maps (either task map, kernel map or submap
750  *	of the kernel map).
751  *
752  *	Since portions of maps are specified by start/end addresses,
753  *	which may not align with existing map entries, all
754  *	routines merely "clip" entries to these start/end values.
755  *	[That is, an entry is split into two, bordering at a
756  *	start or end value.]  Note that these clippings may not
757  *	always be necessary (as the two resulting entries are then
758  *	not changed); however, the clipping is done for convenience.
759  *	No attempt is currently made to "glue back together" two
760  *	abutting entries.
761  *
762  *	The symmetric (shadow) copy strategy implements virtual copy
763  *	by copying VM object references from one map to
764  *	another, and then marking both regions as copy-on-write.
765  *	It is important to note that only one writeable reference
766  *	to a VM object region exists in any map when this strategy
767  *	is used -- this means that shadow object creation can be
768  *	delayed until a write operation occurs.  The asymmetric (delayed)
769  *	strategy allows multiple maps to have writeable references to
770  *	the same region of a vm object, and hence cannot delay creating
771  *	its copy objects.  See vm_object_copy_quickly() in vm_object.c.
772  *	Copying of permanent objects is completely different; see
773  *	vm_object_copy_strategically() in vm_object.c.
774  */
775 
776 ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);
777 
778 #define VM_MAP_ZONE_NAME        "maps"
779 #define VM_MAP_ZFLAGS           (ZC_NOENCRYPT | ZC_VM)
780 
781 #define VM_MAP_ENTRY_ZONE_NAME  "VM map entries"
782 #define VM_MAP_ENTRY_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)
783 
784 #define VM_MAP_HOLES_ZONE_NAME  "VM map holes"
785 #define VM_MAP_HOLES_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)
786 
787 /*
788  * Asserts that a vm_map_copy object is coming from the
789  * vm_map_copy_zone to ensure that it isn't a fake constructed
790  * anywhere else.
791  */
792 void
793 vm_map_copy_require(struct vm_map_copy *copy)
794 {
795 	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
796 }
797 
798 /*
799  *	vm_map_require:
800  *
801  *	Ensures that the argument is memory allocated from the genuine
802  *	vm map zone. (See zone_id_require_allow_foreign).
803  */
804 void
805 vm_map_require(vm_map_t map)
806 {
807 	zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
808 }
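/*
 * Illustrative hardening pattern (a sketch): code that receives a map
 * pointer from a less trusted source can check its provenance before
 * dereferencing it:
 *
 *	vm_map_require(map);	// panics if not from the vm_map zone
 *	vm_map_lock(map);
 *	// ... operate on the now-validated map ...
 */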
809 
810 #define VM_MAP_EARLY_COUNT_MAX         16
811 static __startup_data vm_offset_t      map_data;
812 static __startup_data vm_size_t        map_data_size;
813 static __startup_data vm_offset_t      kentry_data;
814 static __startup_data vm_size_t        kentry_data_size;
815 static __startup_data vm_offset_t      map_holes_data;
816 static __startup_data vm_size_t        map_holes_data_size;
817 static __startup_data vm_map_t        *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
818 static __startup_data uint32_t         early_map_count;
819 
820 #if XNU_TARGET_OS_OSX
821 #define         NO_COALESCE_LIMIT  ((1024 * 128) - 1)
822 #else /* XNU_TARGET_OS_OSX */
823 #define         NO_COALESCE_LIMIT  0
824 #endif /* XNU_TARGET_OS_OSX */
825 
826 /* Skip acquiring locks if we're in the midst of a kernel core dump */
827 unsigned int not_in_kdp = 1;
828 
829 unsigned int vm_map_set_cache_attr_count = 0;
830 
831 kern_return_t
832 vm_map_set_cache_attr(
833 	vm_map_t        map,
834 	vm_map_offset_t va)
835 {
836 	vm_map_entry_t  map_entry;
837 	vm_object_t     object;
838 	kern_return_t   kr = KERN_SUCCESS;
839 
840 	vm_map_lock_read(map);
841 
842 	if (!vm_map_lookup_entry(map, va, &map_entry) ||
843 	    map_entry->is_sub_map) {
844 		/*
845 		 * that memory is not properly mapped
846 		 */
847 		kr = KERN_INVALID_ARGUMENT;
848 		goto done;
849 	}
850 	object = VME_OBJECT(map_entry);
851 
852 	if (object == VM_OBJECT_NULL) {
853 		/*
854 		 * there should be a VM object here at this point
855 		 */
856 		kr = KERN_INVALID_ARGUMENT;
857 		goto done;
858 	}
859 	vm_object_lock(object);
860 	object->set_cache_attr = TRUE;
861 	vm_object_unlock(object);
862 
863 	vm_map_set_cache_attr_count++;
864 done:
865 	vm_map_unlock_read(map);
866 
867 	return kr;
868 }
869 
870 
871 #if CONFIG_CODE_DECRYPTION
872 /*
873  * vm_map_apple_protected:
874  * This remaps the requested part of the object with an object backed by
875  * the decrypting pager.
876  * crypt_info contains entry points and session data for the crypt module.
877  * The crypt_info block will be copied by vm_map_apple_protected. The data structures
878  * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
879  */
880 kern_return_t
881 vm_map_apple_protected(
882 	vm_map_t                map,
883 	vm_map_offset_t         start,
884 	vm_map_offset_t         end,
885 	vm_object_offset_t      crypto_backing_offset,
886 	struct pager_crypt_info *crypt_info,
887 	uint32_t                cryptid)
888 {
889 	boolean_t       map_locked;
890 	kern_return_t   kr;
891 	vm_map_entry_t  map_entry;
892 	struct vm_map_entry tmp_entry;
893 	memory_object_t unprotected_mem_obj;
894 	vm_object_t     protected_object;
895 	vm_map_offset_t map_addr;
896 	vm_map_offset_t start_aligned, end_aligned;
897 	vm_object_offset_t      crypto_start, crypto_end;
898 	boolean_t       cache_pager;
899 
900 	map_locked = FALSE;
901 	unprotected_mem_obj = MEMORY_OBJECT_NULL;
902 
903 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
904 		return KERN_INVALID_ADDRESS;
905 	}
906 	start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
907 	end_aligned = vm_map_round_page(end, PAGE_MASK_64);
908 	start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
909 	end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));
910 
911 #if __arm64__
912 	/*
913 	 * "start" and "end" might be 4K-aligned but not 16K-aligned,
914 	 * so we might have to loop and establish up to 3 mappings:
915 	 *
916 	 * + the first 16K-page, which might overlap with the previous
917 	 *   4K-aligned mapping,
918 	 * + the center,
919 	 * + the last 16K-page, which might overlap with the next
920 	 *   4K-aligned mapping.
921 	 * Each of these mapping might be backed by a vnode pager (if
922 	 * properly page-aligned) or a "fourk_pager", itself backed by a
923 	 * vnode pager (if 4K-aligned but not page-aligned).
924 	 */
925 #endif /* __arm64__ */
926 
927 	map_addr = start_aligned;
928 	for (map_addr = start_aligned;
929 	    map_addr < end;
930 	    map_addr = tmp_entry.vme_end) {
931 		vm_map_lock(map);
932 		map_locked = TRUE;
933 
934 		/* lookup the protected VM object */
935 		if (!vm_map_lookup_entry(map,
936 		    map_addr,
937 		    &map_entry) ||
938 		    map_entry->is_sub_map ||
939 		    VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
940 			/* that memory is not properly mapped */
941 			kr = KERN_INVALID_ARGUMENT;
942 			goto done;
943 		}
944 
945 		/* ensure mapped memory is mapped as executable,
946 		 * except for the model decryption flow */
947 		if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
948 		    !(map_entry->protection & VM_PROT_EXECUTE)) {
949 			kr = KERN_INVALID_ARGUMENT;
950 			goto done;
951 		}
952 
953 		/* get the protected object to be decrypted */
954 		protected_object = VME_OBJECT(map_entry);
955 		if (protected_object == VM_OBJECT_NULL) {
956 			/* there should be a VM object here at this point */
957 			kr = KERN_INVALID_ARGUMENT;
958 			goto done;
959 		}
960 		/* ensure protected object stays alive while map is unlocked */
961 		vm_object_reference(protected_object);
962 
963 		/* limit the map entry to the area we want to cover */
964 		vm_map_clip_start(map, map_entry, start_aligned);
965 		vm_map_clip_end(map, map_entry, end_aligned);
966 
967 		tmp_entry = *map_entry;
968 		map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
969 		vm_map_unlock(map);
970 		map_locked = FALSE;
971 
972 		/*
973 		 * This map entry might be only partially encrypted
974 		 * (if not fully "page-aligned").
975 		 */
976 		crypto_start = 0;
977 		crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
978 		if (tmp_entry.vme_start < start) {
979 			if (tmp_entry.vme_start != start_aligned) {
980 				kr = KERN_INVALID_ADDRESS;
981 				vm_object_deallocate(protected_object);
982 				goto done;
983 			}
984 			crypto_start += (start - tmp_entry.vme_start);
985 		}
986 		if (tmp_entry.vme_end > end) {
987 			if (tmp_entry.vme_end != end_aligned) {
988 				kr = KERN_INVALID_ADDRESS;
989 				vm_object_deallocate(protected_object);
990 				goto done;
991 			}
992 			crypto_end -= (tmp_entry.vme_end - end);
993 		}
994 
995 		/*
996 		 * This "extra backing offset" is needed to get the decryption
997 		 * routine to use the right key.  It adjusts for the possibly
998 		 * relative offset of an interposed "4K" pager...
999 		 */
1000 		if (crypto_backing_offset == (vm_object_offset_t) -1) {
1001 			crypto_backing_offset = VME_OFFSET(&tmp_entry);
1002 		}
1003 
1004 		cache_pager = TRUE;
1005 #if XNU_TARGET_OS_OSX
1006 		if (vm_map_is_alien(map)) {
1007 			cache_pager = FALSE;
1008 		}
1009 #endif /* XNU_TARGET_OS_OSX */
1010 
1011 		/*
1012 		 * Lookup (and create if necessary) the protected memory object
1013 		 * matching that VM object.
1014 		 * If successful, this also grabs a reference on the memory object,
1015 		 * to guarantee that it doesn't go away before we get a chance to map
1016 		 * it.
1017 		 */
1018 		unprotected_mem_obj = apple_protect_pager_setup(
1019 			protected_object,
1020 			VME_OFFSET(&tmp_entry),
1021 			crypto_backing_offset,
1022 			crypt_info,
1023 			crypto_start,
1024 			crypto_end,
1025 			cache_pager);
1026 
1027 		/* release extra ref on protected object */
1028 		vm_object_deallocate(protected_object);
1029 
1030 		if (unprotected_mem_obj == NULL) {
1031 			kr = KERN_FAILURE;
1032 			goto done;
1033 		}
1034 
1035 		/* can overwrite an immutable mapping */
1036 		vm_map_kernel_flags_t vmk_flags = {
1037 			.vmf_fixed = true,
1038 			.vmf_overwrite = true,
1039 			.vmkf_overwrite_immutable = true,
1040 		};
1041 #if __arm64__
1042 		if (tmp_entry.used_for_jit &&
1043 		    (VM_MAP_PAGE_SHIFT(map) != FOURK_PAGE_SHIFT ||
1044 		    PAGE_SHIFT != FOURK_PAGE_SHIFT) &&
1045 		    fourk_binary_compatibility_unsafe &&
1046 		    fourk_binary_compatibility_allow_wx) {
1047 			printf("** FOURK_COMPAT [%d]: "
1048 			    "allowing write+execute at 0x%llx\n",
1049 			    proc_selfpid(), tmp_entry.vme_start);
1050 			vmk_flags.vmkf_map_jit = TRUE;
1051 		}
1052 #endif /* __arm64__ */
1053 
1054 		/* map this memory object in place of the current one */
1055 		map_addr = tmp_entry.vme_start;
1056 		kr = vm_map_enter_mem_object(map,
1057 		    &map_addr,
1058 		    (tmp_entry.vme_end -
1059 		    tmp_entry.vme_start),
1060 		    (mach_vm_offset_t) 0,
1061 		    vmk_flags,
1062 		    (ipc_port_t)(uintptr_t) unprotected_mem_obj,
1063 		    0,
1064 		    TRUE,
1065 		    tmp_entry.protection,
1066 		    tmp_entry.max_protection,
1067 		    tmp_entry.inheritance);
1068 		assertf(kr == KERN_SUCCESS,
1069 		    "kr = 0x%x\n", kr);
1070 		assertf(map_addr == tmp_entry.vme_start,
1071 		    "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
1072 		    (uint64_t)map_addr,
1073 		    (uint64_t) tmp_entry.vme_start,
1074 		    &tmp_entry);
1075 
1076 #if VM_MAP_DEBUG_APPLE_PROTECT
1077 		if (vm_map_debug_apple_protect) {
1078 			printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
1079 			    " backing:[object:%p,offset:0x%llx,"
1080 			    "crypto_backing_offset:0x%llx,"
1081 			    "crypto_start:0x%llx,crypto_end:0x%llx]\n",
1082 			    map,
1083 			    (uint64_t) map_addr,
1084 			    (uint64_t) (map_addr + (tmp_entry.vme_end -
1085 			    tmp_entry.vme_start)),
1086 			    unprotected_mem_obj,
1087 			    protected_object,
1088 			    VME_OFFSET(&tmp_entry),
1089 			    crypto_backing_offset,
1090 			    crypto_start,
1091 			    crypto_end);
1092 		}
1093 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1094 
1095 		/*
1096 		 * Release the reference obtained by
1097 		 * apple_protect_pager_setup().
1098 		 * The mapping (if it succeeded) is now holding a reference on
1099 		 * the memory object.
1100 		 */
1101 		memory_object_deallocate(unprotected_mem_obj);
1102 		unprotected_mem_obj = MEMORY_OBJECT_NULL;
1103 
1104 		/* continue with next map entry */
1105 		crypto_backing_offset += (tmp_entry.vme_end -
1106 		    tmp_entry.vme_start);
1107 		crypto_backing_offset -= crypto_start;
1108 	}
1109 	kr = KERN_SUCCESS;
1110 
1111 done:
1112 	if (map_locked) {
1113 		vm_map_unlock(map);
1114 	}
1115 	return kr;
1116 }
1117 #endif  /* CONFIG_CODE_DECRYPTION */
1118 
1119 
1120 LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
1121 LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
1122 LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);
1123 
1124 #if XNU_TARGET_OS_OSX
1125 #define MALLOC_NO_COW_DEFAULT 1
1126 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 1
1127 #else /* XNU_TARGET_OS_OSX */
1128 #define MALLOC_NO_COW_DEFAULT 1
1129 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 0
1130 #endif /* XNU_TARGET_OS_OSX */
1131 TUNABLE(int, malloc_no_cow, "malloc_no_cow", MALLOC_NO_COW_DEFAULT);
1132 TUNABLE(int, malloc_no_cow_except_fork, "malloc_no_cow_except_fork", MALLOC_NO_COW_EXCEPT_FORK_DEFAULT);
1133 uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
1134 #if DEBUG
1135 int vm_check_map_sanity = 0;
1136 #endif
1137 
1138 /*
1139  *	vm_map_init:
1140  *
1141  *	Initialize the vm_map module.  Must be called before
1142  *	any other vm_map routines.
1143  *
1144  *	Map and entry structures are allocated from zones -- we must
1145  *	initialize those zones.
1146  *
1147  *	There are three zones of interest:
1148  *
1149  *	vm_map_zone:		used to allocate maps.
1150  *	vm_map_entry_zone:	used to allocate map entries.
1151  *
1152  *	LP32:
1153  *	vm_map_entry_reserved_zone:     fallback zone for kernel map entries
1154  *
1155  *	The kernel allocates map entries from a special zone that is initially
1156  *	"crammed" with memory.  It would be difficult (perhaps impossible) for
1157  *	the kernel to allocate more memory to an entry zone when it became
1158  *	empty since the very act of allocating memory implies the creation
1159  *	of a new entry.
1160  */
1161 __startup_func
1162 void
1163 vm_map_init(void)
1164 {
1165 
1166 #if MACH_ASSERT
1167 	PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1168 	    sizeof(debug4k_filter));
1169 #endif /* MACH_ASSERT */
1170 
1171 	zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1172 	    VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1173 
1174 	/*
1175 	 * Don't quarantine because we always need elements available
1176 	 * Disallow GC on this zone... to aid the GC.
1177 	 */
1178 	zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1179 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1180 	    ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1181 		z->z_elems_rsv = (uint16_t)(32 *
1182 		(ml_early_cpu_max_number() + 1));
1183 	});
1184 
1185 	zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1186 	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1187 	    ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1188 		z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
1189 	});
1190 
1191 	zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1192 	    ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);
1193 
1194 	/*
1195 	 * Add the stolen memory to zones, adjust zone size and stolen counts.
1196 	 */
1197 	zone_cram_early(vm_map_zone, map_data, map_data_size);
1198 	zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1199 	zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1200 	printf("VM bootstrap: %d maps, %d entries and %d holes available\n",
1201 	    zone_count_free(vm_map_zone),
1202 	    zone_count_free(vm_map_entry_zone),
1203 	    zone_count_free(vm_map_holes_zone));
1204 
1205 	/*
1206 	 * Since these are covered by zones, remove them from stolen page accounting.
1207 	 */
1208 	VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1209 
1210 #if VM_MAP_DEBUG_APPLE_PROTECT
1211 	PE_parse_boot_argn("vm_map_debug_apple_protect",
1212 	    &vm_map_debug_apple_protect,
1213 	    sizeof(vm_map_debug_apple_protect));
1214 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1215 #if VM_MAP_DEBUG_APPLE_FOURK
1216 	PE_parse_boot_argn("vm_map_debug_fourk",
1217 	    &vm_map_debug_fourk,
1218 	    sizeof(vm_map_debug_fourk));
1219 #endif /* VM_MAP_DEBUG_FOURK */
1220 
1221 	if (malloc_no_cow) {
1222 		vm_memory_malloc_no_cow_mask = 0ULL;
1223 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1224 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1225 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1226 #if XNU_TARGET_OS_OSX
1227 		/*
1228 		 * On macOS, keep copy-on-write for MALLOC_LARGE because
1229 		 * realloc() may use vm_copy() to transfer the old contents
1230 		 * to the new location.
1231 		 */
1232 #else /* XNU_TARGET_OS_OSX */
1233 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1234 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1235 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1236 #endif /* XNU_TARGET_OS_OSX */
1237 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1238 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1239 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1240 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1241 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1242 		PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1243 		    &vm_memory_malloc_no_cow_mask,
1244 		    sizeof(vm_memory_malloc_no_cow_mask));
1245 	}
1246 
1247 #if CONFIG_MAP_RANGES
1248 	vm_map_range_map_init();
1249 #endif /* CONFIG_MAP_RANGES */
1250 
1251 #if DEBUG
1252 	PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1253 	if (vm_check_map_sanity) {
1254 		kprintf("VM sanity checking enabled\n");
1255 	} else {
1256 		kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1257 	}
1258 #endif /* DEBUG */
1259 
1260 #if DEVELOPMENT || DEBUG
1261 	PE_parse_boot_argn("panic_on_unsigned_execute",
1262 	    &panic_on_unsigned_execute,
1263 	    sizeof(panic_on_unsigned_execute));
1264 	PE_parse_boot_argn("panic_on_mlock_failure",
1265 	    &panic_on_mlock_failure,
1266 	    sizeof(panic_on_mlock_failure));
1267 #endif /* DEVELOPMENT || DEBUG */
1268 }
1269 
1270 __startup_func
1271 static void
1272 vm_map_steal_memory(void)
1273 {
1274 	/*
1275 	 * We need to reserve enough memory to support bootstrapping VM maps
1276 	 * and the zone subsystem.
1277 	 *
1278 	 * The VM Maps that need to function before zones can support them
1279 	 * are the ones registered with vm_map_will_allocate_early_map(),
1280 	 * which are:
1281 	 * - the kernel map
1282 	 * - the various submaps used by zones (pgz, meta, ...)
1283 	 *
1284 	 * We also need enough entries and holes to support them
1285 	 * until zone_metadata_init() is called, which is when
1286 	 * the zone allocator becomes capable of expanding dynamically.
1287 	 *
1288 	 * We need:
1289 	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
1290 	 * - To allow for 3-4 entries per map, but the kernel map
1291 	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
1292 	 *   to describe the submaps, so double it (and make it 8x too)
1293 	 * - To allow for holes between entries,
1294 	 *   hence needs the same budget as entries
1295 	 */
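	/*
	 * Worked example with the defaults in this file (illustrative):
	 * VM_MAP_EARLY_COUNT_MAX is 16, so the reservations below cover
	 * 16 maps, 8 * 16 = 128 map entries and 128 map holes.
	 */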
1296 	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
1297 	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
1298 	    VM_MAP_EARLY_COUNT_MAX);
1299 
1300 	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
1301 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1302 	    8 * VM_MAP_EARLY_COUNT_MAX);
1303 
1304 	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
1305 	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1306 	    8 * VM_MAP_EARLY_COUNT_MAX);
1307 
1308 	/*
1309 	 * Steal a contiguous range of memory so that a simple range check
1310 	 * can validate early addresses being freed/crammed to these
1311 	 * zones
1312 	 */
1313 	map_data       = zone_early_mem_init(map_data_size + kentry_data_size +
1314 	    map_holes_data_size);
1315 	kentry_data    = map_data + map_data_size;
1316 	map_holes_data = kentry_data + kentry_data_size;
1317 }
1318 STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1319 
1320 __startup_func
1321 static void
1322 vm_kernel_boostraped(void)
1323 {
1324 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
1325 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
1326 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);
1327 
1328 	printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1329 	    zone_count_free(vm_map_zone),
1330 	    zone_count_free(vm_map_entry_zone),
1331 	    zone_count_free(vm_map_holes_zone));
1332 }
1333 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1334 
1335 void
1336 vm_map_disable_hole_optimization(vm_map_t map)
1337 {
1338 	vm_map_entry_t  head_entry, hole_entry, next_hole_entry;
1339 
1340 	if (map->holelistenabled) {
1341 		head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1342 
1343 		while (hole_entry != NULL) {
1344 			next_hole_entry = hole_entry->vme_next;
1345 
1346 			hole_entry->vme_next = NULL;
1347 			hole_entry->vme_prev = NULL;
1348 			zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1349 
1350 			if (next_hole_entry == head_entry) {
1351 				hole_entry = NULL;
1352 			} else {
1353 				hole_entry = next_hole_entry;
1354 			}
1355 		}
1356 
1357 		map->holes_list = NULL;
1358 		map->holelistenabled = FALSE;
1359 
1360 		map->first_free = vm_map_first_entry(map);
1361 		SAVE_HINT_HOLE_WRITE(map, NULL);
1362 	}
1363 }
1364 
1365 boolean_t
1366 vm_kernel_map_is_kernel(vm_map_t map)
1367 {
1368 	return map->pmap == kernel_pmap;
1369 }
1370 
1371 /*
1372  *	vm_map_create:
1373  *
1374  *	Creates and returns a new empty VM map with
1375  *	the given physical map structure, and having
1376  *	the given lower and upper address bounds.
1377  */
1378 
1379 extern vm_map_t vm_map_create_external(
1380 	pmap_t                  pmap,
1381 	vm_map_offset_t         min_off,
1382 	vm_map_offset_t         max_off,
1383 	boolean_t               pageable);
1384 
1385 vm_map_t
1386 vm_map_create_external(
1387 	pmap_t                  pmap,
1388 	vm_map_offset_t         min,
1389 	vm_map_offset_t         max,
1390 	boolean_t               pageable)
1391 {
1392 	vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1393 
1394 	if (pageable) {
1395 		options |= VM_MAP_CREATE_PAGEABLE;
1396 	}
1397 	return vm_map_create_options(pmap, min, max, options);
1398 }
1399 
1400 __startup_func
1401 void
1402 vm_map_will_allocate_early_map(vm_map_t *owner)
1403 {
1404 	if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1405 		panic("VM_MAP_EARLY_COUNT_MAX is too low");
1406 	}
1407 
1408 	early_map_owners[early_map_count++] = owner;
1409 }
1410 
1411 __startup_func
1412 void
1413 vm_map_relocate_early_maps(vm_offset_t delta)
1414 {
1415 	for (uint32_t i = 0; i < early_map_count; i++) {
1416 		vm_address_t addr = (vm_address_t)*early_map_owners[i];
1417 
1418 		*early_map_owners[i] = (vm_map_t)(addr + delta);
1419 	}
1420 
1421 	early_map_count = ~0u;
1422 }
1423 
1424 /*
1425  *	Routine:	vm_map_relocate_early_elem
1426  *
1427  *	Purpose:
1428  *		Early zone elements are allocated in a temporary part
1429  *		of the address space.
1430  *
1431  *		Once the zones live in their final place, the early
1432  *		VM maps, map entries and map holes need to be relocated.
1433  *
1434  *		It involves rewriting any vm_map_t, vm_map_entry_t or
1435  *		pointers to vm_map_links. Other pointers to other types
1436  *		are fine.
1437  *
1438  *		Fortunately, pointers to those types are self-contained
1439  *		in those zones, _except_ for pointers to VM maps,
1440  *		which are tracked during early boot and fixed with
1441  *		vm_map_relocate_early_maps().
1442  */
1443 __startup_func
1444 void
1445 vm_map_relocate_early_elem(
1446 	uint32_t                zone_id,
1447 	vm_offset_t             new_addr,
1448 	vm_offset_t             delta)
1449 {
1450 #define relocate(type_t, field)  ({ \
1451 	typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field;   \
1452 	if (*__field) {                                                        \
1453 	        *__field = (typeof(*__field))((vm_offset_t)*__field + delta);  \
1454 	}                                                                      \
1455 })
1456 
1457 	switch (zone_id) {
1458 	case ZONE_ID_VM_MAP:
1459 	case ZONE_ID_VM_MAP_ENTRY:
1460 	case ZONE_ID_VM_MAP_HOLES:
1461 		break;
1462 
1463 	default:
1464 		panic("Unexpected zone ID %d", zone_id);
1465 	}
1466 
1467 	if (zone_id == ZONE_ID_VM_MAP) {
1468 		relocate(vm_map_t, hdr.links.prev);
1469 		relocate(vm_map_t, hdr.links.next);
1470 		((vm_map_t)new_addr)->pmap = kernel_pmap;
1471 #ifdef VM_MAP_STORE_USE_RB
1472 		relocate(vm_map_t, hdr.rb_head_store.rbh_root);
1473 #endif /* VM_MAP_STORE_USE_RB */
1474 		relocate(vm_map_t, hint);
1475 		relocate(vm_map_t, hole_hint);
1476 		relocate(vm_map_t, first_free);
1477 		return;
1478 	}
1479 
1480 	relocate(struct vm_map_links *, prev);
1481 	relocate(struct vm_map_links *, next);
1482 
1483 	if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
1484 #ifdef VM_MAP_STORE_USE_RB
1485 		relocate(vm_map_entry_t, store.entry.rbe_left);
1486 		relocate(vm_map_entry_t, store.entry.rbe_right);
1487 		relocate(vm_map_entry_t, store.entry.rbe_parent);
1488 #endif /* VM_MAP_STORE_USE_RB */
1489 		if (((vm_map_entry_t)new_addr)->is_sub_map) {
1490 			/* no object to relocate because we haven't made any */
1491 			((vm_map_entry_t)new_addr)->vme_submap +=
1492 			    delta >> VME_SUBMAP_SHIFT;
1493 		}
1494 #if MAP_ENTRY_CREATION_DEBUG
1495 		relocate(vm_map_entry_t, vme_creation_maphdr);
1496 #endif /* MAP_ENTRY_CREATION_DEBUG */
1497 	}
1498 
1499 #undef relocate
1500 }
1501 
1502 vm_map_t
1503 vm_map_create_options(
1504 	pmap_t                  pmap,
1505 	vm_map_offset_t         min,
1506 	vm_map_offset_t         max,
1507 	vm_map_create_options_t options)
1508 {
1509 	vm_map_t result;
1510 
1511 #if DEBUG || DEVELOPMENT
1512 	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
1513 		if (early_map_count != ~0u && early_map_count !=
1514 		    zone_count_allocated(vm_map_zone) + 1) {
1515 			panic("allocating %dth early map, owner not known",
1516 			    zone_count_allocated(vm_map_zone) + 1);
1517 		}
1518 		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
1519 			panic("allocating %dth early map for non kernel pmap",
1520 			    early_map_count);
1521 		}
1522 	}
1523 #endif /* DEBUG || DEVELOPMENT */
1524 
1525 	result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);
1526 
1527 	vm_map_store_init(&result->hdr);
1528 	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
1529 	vm_map_set_page_shift(result, PAGE_SHIFT);
1530 
1531 	result->size_limit      = RLIM_INFINITY;        /* default unlimited */
1532 	result->data_limit      = RLIM_INFINITY;        /* default unlimited */
1533 	result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
1534 	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
1535 	result->pmap = pmap;
1536 	result->min_offset = min;
1537 	result->max_offset = max;
1538 	result->first_free = vm_map_to_entry(result);
1539 	result->hint = vm_map_to_entry(result);
1540 
1541 	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
1542 		assert(pmap == kernel_pmap);
1543 		result->never_faults = true;
1544 	}
1545 
1546 	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
1547 	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
1548 		result->has_corpse_footprint = true;
1549 	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
1550 		struct vm_map_links *hole_entry;
1551 
1552 		hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
1553 		hole_entry->start = min;
1554 #if defined(__arm64__)
1555 		hole_entry->end = result->max_offset;
1556 #else
1557 		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1558 #endif
1559 		result->holes_list = result->hole_hint = hole_entry;
1560 		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
1561 		result->holelistenabled = true;
1562 	}
1563 
1564 	vm_map_lock_init(result);
1565 
1566 	return result;
1567 }
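/*
 * Illustrative call (a sketch): a pageable map covering a given pmap's
 * address range would be created as
 *
 *	map = vm_map_create_options(pmap, min_off, max_off,
 *	    VM_MAP_CREATE_PAGEABLE);
 */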
1568 
1569 /*
1570  * Adjusts a submap that was made by kmem_suballoc()
1571  * before it knew where it would be mapped,
1572  * so that it has the right min/max offsets.
1573  *
1574  * We do not need to hold any locks:
1575  * only the caller knows about this map,
1576  * and it is not published on any entry yet.
1577  */
1578 static void
1579 vm_map_adjust_offsets(
1580 	vm_map_t                map,
1581 	vm_map_offset_t         min_off,
1582 	vm_map_offset_t         max_off)
1583 {
1584 	assert(map->min_offset == 0);
1585 	assert(map->max_offset == max_off - min_off);
1586 	assert(map->hdr.nentries == 0);
1587 	assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1588 
1589 	map->min_offset = min_off;
1590 	map->max_offset = max_off;
1591 
1592 	if (map->holelistenabled) {
1593 		struct vm_map_links *hole = map->holes_list;
1594 
1595 		hole->start = min_off;
1596 #if defined(__arm64__)
1597 		hole->end = max_off;
1598 #else
1599 		hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1600 #endif
1601 	}
1602 }
1603 
1604 
1605 vm_map_size_t
1606 vm_map_adjusted_size(vm_map_t map)
1607 {
1608 	const struct vm_reserved_region *regions = NULL;
1609 	size_t num_regions = 0;
1610 	mach_vm_size_t  reserved_size = 0, map_size = 0;
1611 
1612 	if (map == NULL || (map->size == 0)) {
1613 		return 0;
1614 	}
1615 
1616 	map_size = map->size;
1617 
1618 	if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1619 		/*
1620 		 * No special reserved regions or not an exotic map or the task
1621 		 * is terminating and these special regions might have already
1622 		 * been deallocated.
1623 		 */
1624 		return map_size;
1625 	}
1626 
1627 	num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1628 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1629 
1630 	while (num_regions) {
1631 		reserved_size += regions[--num_regions].vmrr_size;
1632 	}
1633 
1634 	/*
1635 	 * There are a few places where the map is being switched out due to
1636 	 * 'termination' without that bit being set (e.g. exec and corpse purging).
1637 	 * In those cases, we could have the map's regions being deallocated on
1638 	 * a core while some accounting process is trying to get the map's size.
1639 	 * So this assert can't be enabled till all those places are uniform in
1640 	 * their use of the 'map->terminated' bit.
1641 	 *
1642 	 * assert(map_size >= reserved_size);
1643 	 */
1644 
1645 	return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1646 }
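/*
 * Editorial example (hypothetical numbers, not from the original source):
 * for an exotic, non-terminated map with map->size == 0x10000 and a single
 * reserved region of vmrr_size == 0x4000, this returns
 * 0x10000 - 0x4000 = 0xC000; if the subtraction would underflow, the raw
 * map->size is returned instead (see the disabled assert above).
 */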
1647 
1648 /*
1649  *	vm_map_entry_create:	[ internal use only ]
1650  *
1651  *	Allocates a VM map entry for insertion in the
1652  *	given map (or map copy).  No fields are filled.
1653  *
1654  *	The VM entry will be zero initialized, except for:
1655  *	- behavior set to VM_BEHAVIOR_DEFAULT
1656  *	- inheritance set to VM_INHERIT_DEFAULT
1657  */
1658 #define vm_map_entry_create(map)    _vm_map_entry_create(&(map)->hdr)
1659 
1660 #define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)
1661 
1662 static vm_map_entry_t
1663 _vm_map_entry_create(
1664 	struct vm_map_header    *map_header __unused)
1665 {
1666 	vm_map_entry_t entry = NULL;
1667 
1668 	entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);
1669 
1670 	/*
1671 	 * Help the compiler with what we know to be true,
1672 	 * so that the bitfield initializations that follow have good codegen.
1673 	 *
1674 	 * See rdar://87041299
1675 	 */
1676 	__builtin_assume(entry->vme_object_value == 0);
1677 	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
1678 	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);
1679 
1680 	static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
1681 	    "VME_ALIAS_MASK covers tags");
1682 
1683 	static_assert(VM_BEHAVIOR_DEFAULT == 0,
1684 	    "can skip zeroing of the behavior field");
1685 	entry->inheritance = VM_INHERIT_DEFAULT;
1686 
1687 #if MAP_ENTRY_CREATION_DEBUG
1688 	entry->vme_creation_maphdr = map_header;
1689 	entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
1690 	    BTREF_GET_NOWAIT);
1691 #endif
1692 	return entry;
1693 }
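/*
 * Editorial usage sketch (hypothetical caller, not from the original
 * source): entries are allocated through the wrapper macros above so the
 * owning header can be recorded when MAP_ENTRY_CREATION_DEBUG is enabled:
 *
 *	vm_map_entry_t e = vm_map_entry_create(map);
 *	e->vme_start = start;
 *	e->vme_end   = start + size;
 *	vm_map_store_entry_link(map, where, e, VM_MAP_KERNEL_FLAGS_NONE);
 *
 * All other fields start out zeroed, with behavior and inheritance at
 * their defaults as noted in the block comment above.
 */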
1694 
1695 /*
1696  *	vm_map_entry_dispose:	[ internal use only ]
1697  *
1698  *	Inverse of vm_map_entry_create.
1699  *
1700  *	The write map lock is held, so there is no need to
1701  *	do anything special to ensure the correctness
1702  *	of the stores.
1703  */
1704 static void
1705 vm_map_entry_dispose(
1706 	vm_map_entry_t          entry)
1707 {
1708 #if VM_BTLOG_TAGS
1709 	if (entry->vme_kernel_object) {
1710 		btref_put(entry->vme_tag_btref);
1711 	}
1712 #endif /* VM_BTLOG_TAGS */
1713 #if MAP_ENTRY_CREATION_DEBUG
1714 	btref_put(entry->vme_creation_bt);
1715 #endif
1716 #if MAP_ENTRY_INSERTION_DEBUG
1717 	btref_put(entry->vme_insertion_bt);
1718 #endif
1719 	zfree(vm_map_entry_zone, entry);
1720 }
1721 
1722 #define vm_map_copy_entry_dispose(copy_entry) \
1723 	vm_map_entry_dispose(copy_entry)
1724 
1725 static vm_map_entry_t
1726 vm_map_zap_first_entry(
1727 	vm_map_zap_t            list)
1728 {
1729 	return list->vmz_head;
1730 }
1731 
1732 static vm_map_entry_t
1733 vm_map_zap_last_entry(
1734 	vm_map_zap_t            list)
1735 {
1736 	assert(vm_map_zap_first_entry(list));
1737 	return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
1738 }
1739 
1740 static void
1741 vm_map_zap_append(
1742 	vm_map_zap_t            list,
1743 	vm_map_entry_t          entry)
1744 {
1745 	entry->vme_next = VM_MAP_ENTRY_NULL;
1746 	*list->vmz_tail = entry;
1747 	list->vmz_tail = &entry->vme_next;
1748 }
1749 
1750 static vm_map_entry_t
1751 vm_map_zap_pop(
1752 	vm_map_zap_t            list)
1753 {
1754 	vm_map_entry_t head = list->vmz_head;
1755 
1756 	if (head != VM_MAP_ENTRY_NULL &&
1757 	    (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1758 		list->vmz_tail = &list->vmz_head;
1759 	}
1760 
1761 	return head;
1762 }
1763 
1764 static void
1765 vm_map_zap_dispose(
1766 	vm_map_zap_t            list)
1767 {
1768 	vm_map_entry_t          entry;
1769 
1770 	while ((entry = vm_map_zap_pop(list))) {
1771 		if (entry->is_sub_map) {
1772 			vm_map_deallocate(VME_SUBMAP(entry));
1773 		} else {
1774 			vm_object_deallocate(VME_OBJECT(entry));
1775 		}
1776 
1777 		vm_map_entry_dispose(entry);
1778 	}
1779 }
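/*
 * Editorial usage sketch (not from the original source): the "zap" list
 * collects entries torn out of a map while the map lock is held, so that
 * their objects or submaps are released only after the lock is dropped:
 *
 *	VM_MAP_ZAP_DECLARE(zap);
 *	vm_map_lock(map);
 *	(void)vm_map_delete(map, start, end, flags, KMEM_GUARD_NONE, &zap);
 *	vm_map_unlock(map);
 *	vm_map_zap_dispose(&zap);
 *
 * vm_map_destroy() below follows exactly this pattern.
 */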
1780 
1781 #if MACH_ASSERT
1782 static boolean_t first_free_check = FALSE;
1783 boolean_t
1784 first_free_is_valid(
1785 	vm_map_t        map)
1786 {
1787 	if (!first_free_check) {
1788 		return TRUE;
1789 	}
1790 
1791 	return first_free_is_valid_store( map );
1792 }
1793 #endif /* MACH_ASSERT */
1794 
1795 
1796 #define vm_map_copy_entry_link(copy, after_where, entry)                \
1797 	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))
1798 
1799 #define vm_map_copy_entry_unlink(copy, entry)                           \
1800 	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1801 
1802 /*
1803  *	vm_map_destroy:
1804  *
1805  *	Actually destroy a map.
1806  */
1807 void
1808 vm_map_destroy(
1809 	vm_map_t        map)
1810 {
1811 	/* final cleanup: this is not allowed to fail */
1812 	vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;
1813 
1814 	VM_MAP_ZAP_DECLARE(zap);
1815 
1816 	vm_map_lock(map);
1817 
1818 	map->terminated = true;
1819 	/* clean up regular map entries */
1820 	(void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
1821 	    KMEM_GUARD_NONE, &zap);
1822 	/* clean up leftover special mappings (commpage, GPU carveout, etc...) */
1823 	(void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
1824 	    KMEM_GUARD_NONE, &zap);
1825 
1826 	vm_map_disable_hole_optimization(map);
1827 	vm_map_corpse_footprint_destroy(map);
1828 
1829 	vm_map_unlock(map);
1830 
1831 	vm_map_zap_dispose(&zap);
1832 
1833 	assert(map->hdr.nentries == 0);
1834 
1835 	if (map->pmap) {
1836 		pmap_destroy(map->pmap);
1837 	}
1838 
1839 	lck_rw_destroy(&map->lock, &vm_map_lck_grp);
1840 
1841 #if CONFIG_MAP_RANGES
1842 	kfree_data(map->extra_ranges,
1843 	    map->extra_ranges_count * sizeof(struct vm_map_user_range));
1844 #endif
1845 
1846 	zfree_id(ZONE_ID_VM_MAP, map);
1847 }
1848 
1849 /*
1850  * Returns pid of the task with the largest number of VM map entries.
1851  * Used in the zone-map-exhaustion jetsam path.
1852  */
1853 pid_t
1854 find_largest_process_vm_map_entries(void)
1855 {
1856 	pid_t victim_pid = -1;
1857 	int max_vm_map_entries = 0;
1858 	task_t task = TASK_NULL;
1859 	queue_head_t *task_list = &tasks;
1860 
1861 	lck_mtx_lock(&tasks_threads_lock);
1862 	queue_iterate(task_list, task, task_t, tasks) {
1863 		if (task == kernel_task || !task->active) {
1864 			continue;
1865 		}
1866 
1867 		vm_map_t task_map = task->map;
1868 		if (task_map != VM_MAP_NULL) {
1869 			int task_vm_map_entries = task_map->hdr.nentries;
1870 			if (task_vm_map_entries > max_vm_map_entries) {
1871 				max_vm_map_entries = task_vm_map_entries;
1872 				victim_pid = pid_from_task(task);
1873 			}
1874 		}
1875 	}
1876 	lck_mtx_unlock(&tasks_threads_lock);
1877 
1878 	printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
1879 	return victim_pid;
1880 }
1881 
1882 
1883 /*
1884  *	vm_map_lookup_entry:	[ internal use only ]
1885  *
1886  *	Calls into the vm map store layer to find the map
1887  *	entry containing (or immediately preceding) the
1888  *	specified address in the given map; the entry is returned
1889  *	in the "entry" parameter.  The boolean
1890  *	result indicates whether the address is
1891  *	actually contained in the map.
1892  */
1893 boolean_t
1894 vm_map_lookup_entry(
1895 	vm_map_t        map,
1896 	vm_map_offset_t address,
1897 	vm_map_entry_t  *entry)         /* OUT */
1898 {
1899 	if (VM_KERNEL_ADDRESS(address)) {
1900 		address = VM_KERNEL_STRIP_UPTR(address);
1901 	}
1902 
1903 
1904 #if CONFIG_PROB_GZALLOC
1905 	if (map->pmap == kernel_pmap) {
1906 		assertf(!pgz_owned(address),
1907 		    "it is the responsibility of callers to unguard PGZ addresses");
1908 	}
1909 #endif /* CONFIG_PROB_GZALLOC */
1910 	return vm_map_store_lookup_entry( map, address, entry );
1911 }
1912 
1913 boolean_t
1914 vm_map_lookup_entry_or_next(
1915 	vm_map_t        map,
1916 	vm_map_offset_t address,
1917 	vm_map_entry_t  *entry)         /* OUT */
1918 {
1919 	if (vm_map_lookup_entry(map, address, entry)) {
1920 		return true;
1921 	}
1922 
1923 	*entry = (*entry)->vme_next;
1924 	return false;
1925 }
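/*
 * Editorial usage sketch (hypothetical caller): unlike
 * vm_map_lookup_entry(), the "_or_next" variant always leaves *entry
 * pointing at something useful for forward iteration:
 *
 *	vm_map_entry_t e;
 *
 *	if (!vm_map_lookup_entry_or_next(map, addr, &e)) {
 *		// "addr" falls in a hole; "e" is the first entry above it,
 *		// or vm_map_to_entry(map) if no such entry exists
 *	}
 */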
1926 
1927 #if CONFIG_PROB_GZALLOC
1928 boolean_t
1929 vm_map_lookup_entry_allow_pgz(
1930 	vm_map_t        map,
1931 	vm_map_offset_t address,
1932 	vm_map_entry_t  *entry)         /* OUT */
1933 {
1934 	if (VM_KERNEL_ADDRESS(address)) {
1935 		address = VM_KERNEL_STRIP_UPTR(address);
1936 	}
1937 	return vm_map_store_lookup_entry( map, address, entry );
1938 }
1939 #endif /* CONFIG_PROB_GZALLOC */
1940 
1941 /*
1942  *	Routine:	vm_map_range_invalid_panic
1943  *	Purpose:
1944  *			Panic on detection of an invalid range id.
1945  */
1946 __abortlike
1947 static void
1948 vm_map_range_invalid_panic(
1949 	vm_map_t                map,
1950 	vm_map_range_id_t       range_id)
1951 {
1952 	panic("invalid range ID (%u) for map %p", range_id, map);
1953 }
1954 
1955 /*
1956  *	Routine:	vm_map_get_range
1957  *	Purpose:
1958  *			Adjust bounds based on security policy.
1959  */
1960 static struct mach_vm_range
1961 vm_map_get_range(
1962 	vm_map_t                map,
1963 	vm_map_address_t       *address,
1964 	vm_map_kernel_flags_t  *vmk_flags,
1965 	vm_map_size_t           size,
1966 	bool                   *is_ptr)
1967 {
1968 	struct mach_vm_range effective_range = {};
1969 	vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;
1970 
1971 	if (map == kernel_map) {
1972 		effective_range = kmem_ranges[range_id];
1973 
1974 		if (startup_phase >= STARTUP_SUB_KMEM) {
1975 			/*
1976 			 * The hint provided by the caller is zeroed because the range is
1977 			 * restricted to a subset of the entire kernel_map VA, which could put
1978 			 * the hint outside the range and cause vm_map_store_find_space to fail.
1979 			 */
1980 			*address = 0ull;
1981 			/*
1982 			 * Ensure that range_id passed in by the caller is within meaningful
1983 			 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
1984 			 * to fail as the corresponding range is invalid. Range id larger than
1985 			 * KMEM_RANGE_ID_MAX will lead to an OOB access.
1986 			 */
1987 			if ((range_id == KMEM_RANGE_ID_NONE) ||
1988 			    (range_id > KMEM_RANGE_ID_MAX)) {
1989 				vm_map_range_invalid_panic(map, range_id);
1990 			}
1991 
1992 			/*
1993 			 * Pointer ranges use kmem_locate_space to do allocations.
1994 			 *
1995 			 * Non pointer fronts look like [ Small | Large | Permanent ]
1996 			 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
1997 			 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
1998 			 * use the entire range.
1999 			 */
2000 			if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
2001 				*is_ptr = true;
2002 			} else if (size >= KMEM_SMALLMAP_THRESHOLD) {
2003 				effective_range = kmem_large_ranges[range_id];
2004 			}
2005 		}
2006 #if CONFIG_MAP_RANGES
2007 	} else if (map->uses_user_ranges) {
2008 		switch (range_id) {
2009 		case UMEM_RANGE_ID_DEFAULT:
2010 			effective_range = map->default_range;
2011 			break;
2012 		case UMEM_RANGE_ID_HEAP:
2013 			effective_range = map->data_range;
2014 			break;
2015 		case UMEM_RANGE_ID_FIXED:
2016 			/*
2017 			 * anywhere allocations with an address in "FIXED"
2018 			 * makes no sense, leave the range empty
2019 			 */
2020 			break;
2021 
2022 		default:
2023 			vm_map_range_invalid_panic(map, range_id);
2024 		}
2025 #endif /* CONFIG_MAP_RANGES */
2026 	} else {
2027 		/*
2028 		 * If the minimum is 0, bump it up by PAGE_SIZE.  We want to
2029 		 * limit allocations in PAGEZERO to explicit requests, since
2030 		 * its normal use is to catch NULL dereferences.  Many
2031 		 * applications also treat pointers with a value of 0 as
2032 		 * special, so suddenly having address 0 contain usable
2033 		 * memory would tend to confuse those applications.
2034 		 */
2035 		effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
2036 		effective_range.max_address = map->max_offset;
2037 	}
2038 
2039 	return effective_range;
2040 }
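/*
 * Editorial summary of the three cases above (not from the original source):
 *  - kernel_map: bounds come from kmem_ranges[] (or kmem_large_ranges[]
 *    for large non-pointer allocations), selected by vmkf_range_id;
 *    pointer ranges are instead resolved through kmem_locate_space.
 *  - user maps with ranges (CONFIG_MAP_RANGES): bounds come from the
 *    map's default/data range, and stay empty for "FIXED" requests.
 *  - everything else: [MAX(min_offset, one map page), max_offset], which
 *    keeps page 0 out of "anywhere" allocations.
 */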
2041 
2042 /*
2043  *	Routine:	vm_map_locate_space
2044  *	Purpose:
2045  *		Finds a range in the specified virtual address map,
2046  *		returning the start of that range,
2047  *		as well as the entry right before it.
2048  */
2049 kern_return_t
2050 vm_map_locate_space(
2051 	vm_map_t                map,
2052 	vm_map_size_t           size,
2053 	vm_map_offset_t         mask,
2054 	vm_map_kernel_flags_t   vmk_flags,
2055 	vm_map_offset_t        *start_inout,
2056 	vm_map_entry_t         *entry_out)
2057 {
2058 	struct mach_vm_range effective_range = {};
2059 	vm_map_size_t   guard_offset;
2060 	vm_map_offset_t hint, limit;
2061 	vm_map_entry_t  entry;
2062 	bool            is_kmem_ptr_range = false;
2063 
2064 	/*
2065 	 * Only supported by vm_map_enter() with a fixed address.
2066 	 */
2067 	assert(!vmk_flags.vmkf_beyond_max);
2068 
2069 	if (__improbable(map->wait_for_space)) {
2070 		/*
2071 		 * support for "wait_for_space" is minimal;
2072 		 * its only consumer is the ipc_kernel_copy_map.
2073 		 */
2074 		assert(!map->holelistenabled &&
2075 		    !vmk_flags.vmkf_last_free &&
2076 		    !vmk_flags.vmkf_keep_map_locked &&
2077 		    !vmk_flags.vmkf_map_jit &&
2078 		    !vmk_flags.vmf_random_addr &&
2079 		    *start_inout <= map->min_offset);
2080 	} else if (vmk_flags.vmkf_last_free) {
2081 		assert(!vmk_flags.vmkf_map_jit &&
2082 		    !vmk_flags.vmf_random_addr);
2083 	}
2084 
2085 	if (vmk_flags.vmkf_guard_before) {
2086 		guard_offset = VM_MAP_PAGE_SIZE(map);
2087 		assert(size > guard_offset);
2088 		size -= guard_offset;
2089 	} else {
2090 		assert(size != 0);
2091 		guard_offset = 0;
2092 	}
2093 
2094 	/*
2095 	 * Validate range_id from flags and get associated range
2096 	 */
2097 	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
2098 	    &is_kmem_ptr_range);
2099 
2100 	if (is_kmem_ptr_range) {
2101 		return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
2102 		           vmk_flags.vmkf_last_free, start_inout, entry_out);
2103 	}
2104 
2105 #if XNU_TARGET_OS_OSX
2106 	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2107 		assert(map != kernel_map);
2108 		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2109 	}
2110 #endif /* XNU_TARGET_OS_OSX */
2111 
2112 again:
2113 	if (vmk_flags.vmkf_last_free) {
2114 		hint = *start_inout;
2115 
2116 		if (hint == 0 || hint > effective_range.max_address) {
2117 			hint = effective_range.max_address;
2118 		}
2119 		if (hint <= effective_range.min_address) {
2120 			return KERN_NO_SPACE;
2121 		}
2122 		limit = effective_range.min_address;
2123 	} else {
2124 		hint = *start_inout;
2125 
2126 		if (vmk_flags.vmkf_map_jit) {
2127 			if (map->jit_entry_exists &&
2128 			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
2129 				return KERN_INVALID_ARGUMENT;
2130 			}
2131 			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
2132 				vmk_flags.vmf_random_addr = true;
2133 			}
2134 		}
2135 
2136 		if (vmk_flags.vmf_random_addr) {
2137 			kern_return_t kr;
2138 
2139 			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
2140 			if (kr != KERN_SUCCESS) {
2141 				return kr;
2142 			}
2143 		}
2144 #if __x86_64__
2145 		else if ((hint == 0 || hint == vm_map_min(map)) &&
2146 		    !map->disable_vmentry_reuse &&
2147 		    map->vmmap_high_start != 0) {
2148 			hint = map->vmmap_high_start;
2149 		}
2150 #endif /* __x86_64__ */
2151 
2152 		if (hint < effective_range.min_address) {
2153 			hint = effective_range.min_address;
2154 		}
2155 		if (effective_range.max_address <= hint) {
2156 			return KERN_NO_SPACE;
2157 		}
2158 
2159 		limit = effective_range.max_address;
2160 	}
2161 	entry = vm_map_store_find_space(map,
2162 	    hint, limit, vmk_flags.vmkf_last_free,
2163 	    guard_offset, size, mask,
2164 	    start_inout);
2165 
2166 	if (__improbable(entry == NULL)) {
2167 		if (map->wait_for_space &&
2168 		    guard_offset + size <=
2169 		    effective_range.max_address - effective_range.min_address) {
2170 			assert_wait((event_t)map, THREAD_ABORTSAFE);
2171 			vm_map_unlock(map);
2172 			thread_block(THREAD_CONTINUE_NULL);
2173 			vm_map_lock(map);
2174 			goto again;
2175 		}
2176 		return KERN_NO_SPACE;
2177 	}
2178 
2179 	if (entry_out) {
2180 		*entry_out = entry;
2181 	}
2182 	return KERN_SUCCESS;
2183 }
2184 
2185 
2186 /*
2187  *	Routine:	vm_map_find_space
2188  *	Purpose:
2189  *		Allocate a range in the specified virtual address map,
2190  *		returning the entry allocated for that range.
2191  *		Used by kmem_alloc, etc.
2192  *
2193  *		The map must NOT be locked. It will be returned locked
2194  *		on KERN_SUCCESS, unlocked on failure.
2195  *
2196  *		If an entry is allocated, the object/offset fields
2197  *		are initialized to zero.
2198  */
2199 kern_return_t
2200 vm_map_find_space(
2201 	vm_map_t                map,
2202 	vm_map_offset_t         hint_address,
2203 	vm_map_size_t           size,
2204 	vm_map_offset_t         mask,
2205 	vm_map_kernel_flags_t   vmk_flags,
2206 	vm_map_entry_t          *o_entry)       /* OUT */
2207 {
2208 	vm_map_entry_t          new_entry, entry;
2209 	kern_return_t           kr;
2210 
2211 	if (size == 0) {
2212 		return KERN_INVALID_ARGUMENT;
2213 	}
2214 
2215 	new_entry = vm_map_entry_create(map);
2216 	new_entry->use_pmap = true;
2217 	new_entry->protection = VM_PROT_DEFAULT;
2218 	new_entry->max_protection = VM_PROT_ALL;
2219 
2220 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
2221 		new_entry->map_aligned = true;
2222 	}
2223 	if (vmk_flags.vmf_permanent) {
2224 		new_entry->vme_permanent = true;
2225 	}
2226 
2227 	vm_map_lock(map);
2228 
2229 	kr = vm_map_locate_space(map, size, mask, vmk_flags,
2230 	    &hint_address, &entry);
2231 	if (kr != KERN_SUCCESS) {
2232 		vm_map_unlock(map);
2233 		vm_map_entry_dispose(new_entry);
2234 		return kr;
2235 	}
2236 	new_entry->vme_start = hint_address;
2237 	new_entry->vme_end = hint_address + size;
2238 
2239 	/*
2240 	 *	At this point,
2241 	 *
2242 	 *	- new_entry's "vme_start" and "vme_end" should define
2243 	 *	  the endpoints of the available new range,
2244 	 *
2245 	 *	- and "entry" should refer to the region before
2246 	 *	  the new range,
2247 	 *
2248 	 *	- and the map should still be locked.
2249 	 */
2250 
2251 	assert(page_aligned(new_entry->vme_start));
2252 	assert(page_aligned(new_entry->vme_end));
2253 	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
2254 	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));
2255 
2256 	/*
2257 	 *	Insert the new entry into the list
2258 	 */
2259 
2260 	vm_map_store_entry_link(map, entry, new_entry,
2261 	    VM_MAP_KERNEL_FLAGS_NONE);
2262 	map->size += size;
2263 
2264 	/*
2265 	 *	Update the lookup hint
2266 	 */
2267 	SAVE_HINT_MAP_WRITE(map, new_entry);
2268 
2269 	*o_entry = new_entry;
2270 	return KERN_SUCCESS;
2271 }
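/*
 * Editorial usage sketch (hypothetical kmem-style caller, not from the
 * original source): the map is handed in unlocked and comes back locked
 * only on success, so a caller typically looks like:
 *
 *	vm_map_entry_t entry;
 *
 *	kr = vm_map_find_space(map, 0, size, 0, vmk_flags, &entry);
 *	if (kr == KERN_SUCCESS) {
 *		VME_OBJECT_SET(entry, object, false, 0);
 *		VME_OFFSET_SET(entry, 0);
 *		vm_map_unlock(map);
 *	}
 */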
2272 
2273 int vm_map_pmap_enter_print = FALSE;
2274 int vm_map_pmap_enter_enable = FALSE;
2275 
2276 /*
2277  *	Routine:	vm_map_pmap_enter [internal only]
2278  *
2279  *	Description:
2280  *		Force pages from the specified object to be entered into
2281  *		the pmap at the specified address if they are present.
2282  *		As soon as a page is not found in the object, the scan ends.
2283  *
2284  *	Returns:
2285  *		Nothing.
2286  *
2287  *	In/out conditions:
2288  *		The source map should not be locked on entry.
2289  */
2290 __unused static void
2291 vm_map_pmap_enter(
2292 	vm_map_t                map,
2293 	vm_map_offset_t         addr,
2294 	vm_map_offset_t         end_addr,
2295 	vm_object_t             object,
2296 	vm_object_offset_t      offset,
2297 	vm_prot_t               protection)
2298 {
2299 	int                     type_of_fault;
2300 	kern_return_t           kr;
2301 	uint8_t                 object_lock_type = 0;
2302 	struct vm_object_fault_info fault_info = {};
2303 
2304 	if (map->pmap == 0) {
2305 		return;
2306 	}
2307 
2308 	assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);
2309 
2310 	while (addr < end_addr) {
2311 		vm_page_t       m;
2312 
2313 
2314 		/*
2315 		 * TODO:
2316 		 * From vm_map_enter(), we come into this function without the map
2317 		 * lock held or the object lock held.
2318 		 * We haven't taken a reference on the object either.
2319 		 * We should do a proper lookup on the map to make sure
2320 		 * that things are sane before we go locking objects that
2321 		 * could have been deallocated from under us.
2322 		 */
2323 
2324 		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2325 		vm_object_lock(object);
2326 
2327 		m = vm_page_lookup(object, offset);
2328 
2329 		if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
2330 		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
2331 			vm_object_unlock(object);
2332 			return;
2333 		}
2334 
2335 		if (vm_map_pmap_enter_print) {
2336 			printf("vm_map_pmap_enter:");
2337 			printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
2338 			    map, (unsigned long long)addr, object, (unsigned long long)offset);
2339 		}
2340 		type_of_fault = DBG_CACHE_HIT_FAULT;
2341 		kr = vm_fault_enter(m, map->pmap,
2342 		    addr,
2343 		    PAGE_SIZE, 0,
2344 		    protection, protection,
2345 		    VM_PAGE_WIRED(m),
2346 		    FALSE,                 /* change_wiring */
2347 		    VM_KERN_MEMORY_NONE,                 /* tag - not wiring */
2348 		    &fault_info,
2349 		    NULL,                  /* need_retry */
2350 		    &type_of_fault,
2351 		    &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/
2352 
2353 		vm_object_unlock(object);
2354 
2355 		offset += PAGE_SIZE_64;
2356 		addr += PAGE_SIZE;
2357 	}
2358 }
2359 
2360 #define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
2361 static kern_return_t
2362 vm_map_random_address_for_size(
2363 	vm_map_t                map,
2364 	vm_map_offset_t        *address,
2365 	vm_map_size_t           size,
2366 	vm_map_kernel_flags_t   vmk_flags)
2367 {
2368 	kern_return_t   kr = KERN_SUCCESS;
2369 	int             tries = 0;
2370 	vm_map_offset_t random_addr = 0;
2371 	vm_map_offset_t hole_end;
2372 
2373 	vm_map_entry_t  next_entry = VM_MAP_ENTRY_NULL;
2374 	vm_map_entry_t  prev_entry = VM_MAP_ENTRY_NULL;
2375 	vm_map_size_t   vm_hole_size = 0;
2376 	vm_map_size_t   addr_space_size;
2377 	bool            is_kmem_ptr;
2378 	struct mach_vm_range effective_range;
2379 
2380 	effective_range = vm_map_get_range(map, address, &vmk_flags, size,
2381 	    &is_kmem_ptr);
2382 
2383 	addr_space_size = effective_range.max_address - effective_range.min_address;
2384 	if (size >= addr_space_size) {
2385 		return KERN_NO_SPACE;
2386 	}
2387 	addr_space_size -= size;
2388 
2389 	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));
2390 
2391 	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2392 		if (startup_phase < STARTUP_SUB_ZALLOC) {
2393 			random_addr = (vm_map_offset_t)early_random();
2394 		} else {
2395 			random_addr = (vm_map_offset_t)random();
2396 		}
2397 		random_addr <<= VM_MAP_PAGE_SHIFT(map);
2398 		random_addr = vm_map_trunc_page(
2399 			effective_range.min_address + (random_addr % addr_space_size),
2400 			VM_MAP_PAGE_MASK(map));
2401 
2402 #if CONFIG_PROB_GZALLOC
2403 		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
2404 			continue;
2405 		}
2406 #endif /* CONFIG_PROB_GZALLOC */
2407 
2408 		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
2409 			if (prev_entry == vm_map_to_entry(map)) {
2410 				next_entry = vm_map_first_entry(map);
2411 			} else {
2412 				next_entry = prev_entry->vme_next;
2413 			}
2414 			if (next_entry == vm_map_to_entry(map)) {
2415 				hole_end = vm_map_max(map);
2416 			} else {
2417 				hole_end = next_entry->vme_start;
2418 			}
2419 			vm_hole_size = hole_end - random_addr;
2420 			if (vm_hole_size >= size) {
2421 				*address = random_addr;
2422 				break;
2423 			}
2424 		}
2425 		tries++;
2426 	}
2427 
2428 	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2429 		kr = KERN_NO_SPACE;
2430 	}
2431 	return kr;
2432 }
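/*
 * Editorial arithmetic example (hypothetical numbers, not from the original
 * source): with a 16K map page (VM_MAP_PAGE_SHIFT == 14) and an effective
 * range of [0x100000000, 0x200000000), a raw random value R becomes
 *
 *	addr = trunc_page(0x100000000 + ((R << 14) % (range_size - size)))
 *
 * and is only accepted if it lands in a hole at least "size" bytes long,
 * retrying up to MAX_TRIES_TO_GET_RANDOM_ADDRESS times.
 */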
2433 
2434 static boolean_t
2435 vm_memory_malloc_no_cow(
2436 	int alias)
2437 {
2438 	uint64_t alias_mask;
2439 
2440 	if (!malloc_no_cow) {
2441 		return FALSE;
2442 	}
2443 	if (alias > 63) {
2444 		return FALSE;
2445 	}
2446 	alias_mask = 1ULL << alias;
2447 	if (alias_mask & vm_memory_malloc_no_cow_mask) {
2448 		return TRUE;
2449 	}
2450 	return FALSE;
2451 }
2452 
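/*
 * Editorial example (hypothetical alias value, not from the original
 * source): each alias below 64 maps to one bit of
 * vm_memory_malloc_no_cow_mask, so e.g.
 *
 *	(1ULL << 12) & vm_memory_malloc_no_cow_mask
 *
 * decides whether allocations tagged with alias 12 are excluded from
 * copy-on-write; aliases above 63 never qualify.
 */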
2453 uint64_t vm_map_enter_RLIMIT_AS_count = 0;
2454 uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2455 /*
2456  *	Routine:	vm_map_enter
2457  *
2458  *	Description:
2459  *		Allocate a range in the specified virtual address map.
2460  *		The resulting range will refer to memory defined by
2461  *		the given memory object and offset into that object.
2462  *
2463  *		Arguments are as defined in the vm_map call.
2464  */
2465 static unsigned int vm_map_enter_restore_successes = 0;
2466 static unsigned int vm_map_enter_restore_failures = 0;
2467 kern_return_t
2468 vm_map_enter(
2469 	vm_map_t                map,
2470 	vm_map_offset_t         *address,       /* IN/OUT */
2471 	vm_map_size_t           size,
2472 	vm_map_offset_t         mask,
2473 	vm_map_kernel_flags_t   vmk_flags,
2474 	vm_object_t             object,
2475 	vm_object_offset_t      offset,
2476 	boolean_t               needs_copy,
2477 	vm_prot_t               cur_protection,
2478 	vm_prot_t               max_protection,
2479 	vm_inherit_t            inheritance)
2480 {
2481 	vm_map_entry_t          entry, new_entry;
2482 	vm_map_offset_t         start, tmp_start, tmp_offset;
2483 	vm_map_offset_t         end, tmp_end;
2484 	vm_map_offset_t         tmp2_start, tmp2_end;
2485 	vm_map_offset_t         step;
2486 	kern_return_t           result = KERN_SUCCESS;
2487 	bool                    map_locked = FALSE;
2488 	bool                    pmap_empty = TRUE;
2489 	bool                    new_mapping_established = FALSE;
2490 	const bool              keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2491 	const bool              anywhere = !vmk_flags.vmf_fixed;
2492 	const bool              purgable = vmk_flags.vmf_purgeable;
2493 	const bool              overwrite = vmk_flags.vmf_overwrite;
2494 	const bool              no_cache = vmk_flags.vmf_no_cache;
2495 	const bool              is_submap = vmk_flags.vmkf_submap;
2496 	const bool              permanent = vmk_flags.vmf_permanent;
2497 	const bool              no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2498 	const bool              entry_for_jit = vmk_flags.vmkf_map_jit;
2499 	const bool              iokit_acct = vmk_flags.vmkf_iokit_acct;
2500 	const bool              resilient_codesign = vmk_flags.vmf_resilient_codesign;
2501 	const bool              resilient_media = vmk_flags.vmf_resilient_media;
2502 	const bool              entry_for_tpro = vmk_flags.vmf_tpro;
2503 	const unsigned int      superpage_size = vmk_flags.vmf_superpage_size;
2504 	const vm_tag_t          alias = vmk_flags.vm_tag;
2505 	vm_tag_t                user_alias;
2506 	kern_return_t           kr;
2507 	bool                    clear_map_aligned = FALSE;
2508 	vm_map_size_t           chunk_size = 0;
2509 	vm_object_t             caller_object;
2510 	VM_MAP_ZAP_DECLARE(zap_old_list);
2511 	VM_MAP_ZAP_DECLARE(zap_new_list);
2512 
2513 	caller_object = object;
2514 
2515 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2516 
2517 	if (vmk_flags.vmf_4gb_chunk) {
2518 #if defined(__LP64__)
2519 		chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2520 #else /* __LP64__ */
2521 		chunk_size = ANON_CHUNK_SIZE;
2522 #endif /* __LP64__ */
2523 	} else {
2524 		chunk_size = ANON_CHUNK_SIZE;
2525 	}
2526 
2527 
2528 
2529 	if (superpage_size) {
2530 		switch (superpage_size) {
2531 			/*
2532 			 * Note that the current implementation only supports
2533 			 * a single size for superpages, SUPERPAGE_SIZE, per
2534 			 * architecture. As soon as more sizes are supposed
2535 			 * to be supported, SUPERPAGE_SIZE has to be replaced
2536 			 * with a lookup of the size depending on superpage_size.
2537 			 */
2538 #ifdef __x86_64__
2539 		case SUPERPAGE_SIZE_ANY:
2540 			/* handle it like 2 MB and round up to page size */
2541 			size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2542 			OS_FALLTHROUGH;
2543 		case SUPERPAGE_SIZE_2MB:
2544 			break;
2545 #endif
2546 		default:
2547 			return KERN_INVALID_ARGUMENT;
2548 		}
2549 		mask = SUPERPAGE_SIZE - 1;
2550 		if (size & (SUPERPAGE_SIZE - 1)) {
2551 			return KERN_INVALID_ARGUMENT;
2552 		}
2553 		inheritance = VM_INHERIT_NONE;  /* fork() children won't inherit superpages */
2554 	}
2555 
2556 
2557 	if ((cur_protection & VM_PROT_WRITE) &&
2558 	    (cur_protection & VM_PROT_EXECUTE) &&
2559 #if XNU_TARGET_OS_OSX
2560 	    map->pmap != kernel_pmap &&
2561 	    (cs_process_global_enforcement() ||
2562 	    (vmk_flags.vmkf_cs_enforcement_override
2563 	    ? vmk_flags.vmkf_cs_enforcement
2564 	    : (vm_map_cs_enforcement(map)
2565 #if __arm64__
2566 	    || !VM_MAP_IS_EXOTIC(map)
2567 #endif /* __arm64__ */
2568 	    ))) &&
2569 #endif /* XNU_TARGET_OS_OSX */
2570 #if CODE_SIGNING_MONITOR
2571 	    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2572 #endif
2573 	    (VM_MAP_POLICY_WX_FAIL(map) ||
2574 	    VM_MAP_POLICY_WX_STRIP_X(map)) &&
2575 	    !entry_for_jit) {
2576 		boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2577 
2578 		DTRACE_VM3(cs_wx,
2579 		    uint64_t, 0,
2580 		    uint64_t, 0,
2581 		    vm_prot_t, cur_protection);
2582 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2583 		    proc_selfpid(),
2584 		    (get_bsdtask_info(current_task())
2585 		    ? proc_name_address(get_bsdtask_info(current_task()))
2586 		    : "?"),
2587 		    __FUNCTION__,
2588 		    (vm_protect_wx_fail ? "failing" : "turning off execute"));
2589 		cur_protection &= ~VM_PROT_EXECUTE;
2590 		if (vm_protect_wx_fail) {
2591 			return KERN_PROTECTION_FAILURE;
2592 		}
2593 	}
2594 
2595 	if (entry_for_jit
2596 	    && cur_protection != VM_PROT_ALL) {
2597 		/*
2598 		 * Native macOS processes and all non-macOS processes are
2599 		 * expected to create JIT regions via mmap(MAP_JIT, RWX) but
2600 		 * the RWX requirement was not enforced, and thus, we must live
2601 		 * with our sins. We are now dealing with a JIT mapping without
2602 		 * RWX.
2603 		 *
2604 		 * We deal with these by letting the MAP_JIT stick in order
2605 		 * to avoid CS violations when these pages are mapped executable
2606 		 * down the line. In order to appease the page table monitor (you
2607 		 * know what I'm talking about), these pages will end up being
2608 		 * marked as XNU_USER_DEBUG, which will be allowed because we
2609 		 * don't enforce the code signing monitor on macOS systems. If
2610 		 * the user-space application ever changes permissions to RWX,
2611 		 * which they are allowed to since the mapping was originally
2612 		 * created with MAP_JIT, then they'll switch over to using the
2613 		 * XNU_USER_JIT type, and won't be allowed to downgrade any
2614 		 * more after that.
2615 		 *
2616 		 * When not on macOS, a MAP_JIT mapping without VM_PROT_ALL is
2617 		 * strictly disallowed.
2618 		 */
2619 
2620 #if XNU_TARGET_OS_OSX
2621 		/*
2622 		 * Continue to allow non-RWX JIT
2623 		 */
2624 #else
2625 		/* non-macOS: reject JIT regions without RWX */
2626 		DTRACE_VM3(cs_wx,
2627 		    uint64_t, 0,
2628 		    uint64_t, 0,
2629 		    vm_prot_t, cur_protection);
2630 		printf("CODE SIGNING: %d[%s] %s(%d): JIT requires RWX: failing. \n",
2631 		    proc_selfpid(),
2632 		    (get_bsdtask_info(current_task())
2633 		    ? proc_name_address(get_bsdtask_info(current_task()))
2634 		    : "?"),
2635 		    __FUNCTION__,
2636 		    cur_protection);
2637 		return KERN_PROTECTION_FAILURE;
2638 #endif
2639 	}
2640 
2641 	/*
2642 	 * If the task has requested executable lockdown,
2643 	 * deny any new executable mapping.
2644 	 */
2645 	if (map->map_disallow_new_exec == TRUE) {
2646 		if (cur_protection & VM_PROT_EXECUTE) {
2647 			return KERN_PROTECTION_FAILURE;
2648 		}
2649 	}
2650 
2651 	if (resilient_codesign) {
2652 		assert(!is_submap);
2653 		int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2654 		if ((cur_protection | max_protection) & reject_prot) {
2655 			return KERN_PROTECTION_FAILURE;
2656 		}
2657 	}
2658 
2659 	if (resilient_media) {
2660 		assert(!is_submap);
2661 //		assert(!needs_copy);
2662 		if (object != VM_OBJECT_NULL &&
2663 		    !object->internal) {
2664 			/*
2665 			 * This mapping is directly backed by an external
2666 			 * memory manager (e.g. a vnode pager for a file):
2667 			 * we would not have any safe place to inject
2668 			 * a zero-filled page if an actual page is not
2669 			 * available, without possibly impacting the actual
2670 			 * contents of the mapped object (e.g. the file),
2671 			 * so we can't provide any media resiliency here.
2672 			 */
2673 			return KERN_INVALID_ARGUMENT;
2674 		}
2675 	}
2676 
2677 	if (entry_for_tpro) {
2678 		/*
2679 		 * TPRO overrides the effective permissions of the region
2680 		 * and explicitly maps as RW. Ensure we have been passed
2681 		 * the expected permissions. We accept `cur_protections`
2682 		 * RO as that will be handled on fault.
2683 		 */
2684 		if (!(max_protection & VM_PROT_READ) ||
2685 		    !(max_protection & VM_PROT_WRITE) ||
2686 		    !(cur_protection & VM_PROT_READ)) {
2687 			return KERN_PROTECTION_FAILURE;
2688 		}
2689 
2690 		/*
2691 		 * We can now downgrade the cur_protection to RO. This is a mild lie
2692 		 * to the VM layer. But TPRO will be responsible for toggling the
2693 		 * protections between RO/RW
2694 		 */
2695 		cur_protection = VM_PROT_READ;
2696 	}
2697 
2698 	if (is_submap) {
2699 		vm_map_t submap;
2700 		if (purgable) {
2701 			/* submaps can not be purgeable */
2702 			return KERN_INVALID_ARGUMENT;
2703 		}
2704 		if (object == VM_OBJECT_NULL) {
2705 			/* submaps can not be created lazily */
2706 			return KERN_INVALID_ARGUMENT;
2707 		}
2708 		submap = (vm_map_t) object;
2709 		if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
2710 			/* page size mismatch */
2711 			return KERN_INVALID_ARGUMENT;
2712 		}
2713 	}
2714 	if (vmk_flags.vmkf_already) {
2715 		/*
2716 		 * VM_FLAGS_ALREADY says that it's OK if the same mapping
2717 		 * is already present.  For it to be meaningful, the requested
2718 		 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
2719 		 * we shouldn't try to remove what was mapped there first
2720 		 * (!VM_FLAGS_OVERWRITE).
2721 		 */
2722 		if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
2723 			return KERN_INVALID_ARGUMENT;
2724 		}
2725 	}
2726 
2727 	if (size == 0 ||
2728 	    (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
2729 		*address = 0;
2730 		return KERN_INVALID_ARGUMENT;
2731 	}
2732 
2733 	if (map->pmap == kernel_pmap) {
2734 		user_alias = VM_KERN_MEMORY_NONE;
2735 	} else {
2736 		user_alias = alias;
2737 	}
2738 
2739 	if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
2740 		chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
2741 	}
2742 
2743 #define RETURN(value)   { result = value; goto BailOut; }
2744 
2745 	assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
2746 	assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
2747 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
2748 		assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
2749 		assertf(page_aligned(size), "0x%llx", (uint64_t)size);
2750 	}
2751 
2752 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2753 	    !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
2754 		/*
2755 		 * In most cases, the caller rounds the size up to the
2756 		 * map's page size.
2757 		 * If we get a size that is explicitly not map-aligned here,
2758 		 * we'll have to respect the caller's wish and mark the
2759 		 * mapping as "not map-aligned" to avoid tripping the
2760 		 * map alignment checks later.
2761 		 */
2762 		clear_map_aligned = TRUE;
2763 	}
2764 	if (!anywhere &&
2765 	    VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2766 	    !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
2767 		/*
2768 		 * We've been asked to map at a fixed address and that
2769 		 * address is not aligned to the map's specific alignment.
2770 		 * The caller should know what it's doing (i.e. most likely
2771 		 * mapping some fragmented copy map, transferring memory from
2772 		 * a VM map with a different alignment), so clear map_aligned
2773 		 * for this new VM map entry and proceed.
2774 		 */
2775 		clear_map_aligned = TRUE;
2776 	}
2777 
2778 	/*
2779 	 * Only zero-fill objects are allowed to be purgable.
2780 	 * LP64todo - limit purgable objects to 32-bits for now
2781 	 */
2782 	if (purgable &&
2783 	    (offset != 0 ||
2784 	    (object != VM_OBJECT_NULL &&
2785 	    (object->vo_size != size ||
2786 	    object->purgable == VM_PURGABLE_DENY))
2787 #if __LP64__
2788 	    || size > ANON_MAX_SIZE
2789 #endif
2790 	    )) {
2791 		return KERN_INVALID_ARGUMENT;
2792 	}
2793 
2794 	start = *address;
2795 
2796 	if (anywhere) {
2797 		vm_map_lock(map);
2798 		map_locked = TRUE;
2799 
2800 		result = vm_map_locate_space(map, size, mask, vmk_flags,
2801 		    &start, &entry);
2802 		if (result != KERN_SUCCESS) {
2803 			goto BailOut;
2804 		}
2805 
2806 		*address = start;
2807 		end = start + size;
2808 		assert(VM_MAP_PAGE_ALIGNED(*address,
2809 		    VM_MAP_PAGE_MASK(map)));
2810 	} else {
2811 		vm_map_offset_t effective_min_offset, effective_max_offset;
2812 
2813 		effective_min_offset = map->min_offset;
2814 		effective_max_offset = map->max_offset;
2815 
2816 		if (vmk_flags.vmkf_beyond_max) {
2817 			/*
2818 			 * Allow an insertion beyond the map's max offset.
2819 			 */
2820 			effective_max_offset = 0x00000000FFFFF000ULL;
2821 			if (vm_map_is_64bit(map)) {
2822 				effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2823 			}
2824 #if XNU_TARGET_OS_OSX
2825 		} else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2826 			effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2827 #endif /* XNU_TARGET_OS_OSX */
2828 		}
2829 
2830 		if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2831 		    !overwrite &&
2832 		    user_alias == VM_MEMORY_REALLOC) {
2833 			/*
2834 			 * Force realloc() to switch to a new allocation,
2835 			 * to prevent 4k-fragmented virtual ranges.
2836 			 */
2837 //			DEBUG4K_ERROR("no realloc in place");
2838 			return KERN_NO_SPACE;
2839 		}
2840 
2841 		/*
2842 		 *	Verify that:
2843 		 *		the address doesn't itself violate
2844 		 *		the mask requirement.
2845 		 */
2846 
2847 		vm_map_lock(map);
2848 		map_locked = TRUE;
2849 		if ((start & mask) != 0) {
2850 			RETURN(KERN_NO_SPACE);
2851 		}
2852 
2853 #if CONFIG_MAP_RANGES
2854 		if (map->uses_user_ranges) {
2855 			struct mach_vm_range r;
2856 
2857 			vm_map_user_range_resolve(map, start, 1, &r);
2858 			if (r.max_address == 0) {
2859 				RETURN(KERN_INVALID_ADDRESS);
2860 			}
2861 			effective_min_offset = r.min_address;
2862 			effective_max_offset = r.max_address;
2863 		}
2864 #endif /* CONFIG_MAP_RANGES */
2865 
2866 		if ((startup_phase >= STARTUP_SUB_KMEM) && !is_submap &&
2867 		    (map == kernel_map)) {
2868 			mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
2869 			effective_min_offset = r->min_address;
2870 			effective_max_offset = r->max_address;
2871 		}
2872 
2873 		/*
2874 		 *	...	the address is within bounds
2875 		 */
2876 
2877 		end = start + size;
2878 
2879 		if ((start < effective_min_offset) ||
2880 		    (end > effective_max_offset) ||
2881 		    (start >= end)) {
2882 			RETURN(KERN_INVALID_ADDRESS);
2883 		}
2884 
2885 		if (overwrite) {
2886 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE;
2887 			kern_return_t remove_kr;
2888 
2889 			/*
2890 			 * Fixed mapping and "overwrite" flag: attempt to
2891 			 * remove all existing mappings in the specified
2892 			 * address range, saving them in our "zap_old_list".
2893 			 *
2894 			 * This avoids releasing the VM map lock in
2895 			 * vm_map_entry_delete() and allows atomicity
2896 			 * when we want to replace some mappings with a new one.
2897 			 * It also allows us to restore the old VM mappings if the
2898 			 * new mapping fails.
2899 			 */
2900 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2901 
2902 			if (vmk_flags.vmkf_overwrite_immutable) {
2903 				/* we can overwrite immutable mappings */
2904 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2905 			}
2906 			if (vmk_flags.vmkf_remap_prot_copy) {
2907 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2908 			}
2909 			remove_kr = vm_map_delete(map, start, end, remove_flags,
2910 			    KMEM_GUARD_NONE, &zap_old_list).kmr_return;
2911 			if (remove_kr) {
2912 				/* XXX FBDP restore zap_old_list? */
2913 				RETURN(remove_kr);
2914 			}
2915 		}
2916 
2917 		/*
2918 		 *	...	the starting address isn't allocated
2919 		 */
2920 
2921 		if (vm_map_lookup_entry(map, start, &entry)) {
2922 			if (!(vmk_flags.vmkf_already)) {
2923 				RETURN(KERN_NO_SPACE);
2924 			}
2925 			/*
2926 			 * Check if what's already there is what we want.
2927 			 */
2928 			tmp_start = start;
2929 			tmp_offset = offset;
2930 			if (entry->vme_start < start) {
2931 				tmp_start -= start - entry->vme_start;
2932 				tmp_offset -= start - entry->vme_start;
2933 			}
2934 			for (; entry->vme_start < end;
2935 			    entry = entry->vme_next) {
2936 				/*
2937 				 * Check if the mapping's attributes
2938 				 * match the existing map entry.
2939 				 */
2940 				if (entry == vm_map_to_entry(map) ||
2941 				    entry->vme_start != tmp_start ||
2942 				    entry->is_sub_map != is_submap ||
2943 				    VME_OFFSET(entry) != tmp_offset ||
2944 				    entry->needs_copy != needs_copy ||
2945 				    entry->protection != cur_protection ||
2946 				    entry->max_protection != max_protection ||
2947 				    entry->inheritance != inheritance ||
2948 				    entry->iokit_acct != iokit_acct ||
2949 				    VME_ALIAS(entry) != alias) {
2950 					/* not the same mapping ! */
2951 					RETURN(KERN_NO_SPACE);
2952 				}
2953 				/*
2954 				 * Check if the same object is being mapped.
2955 				 */
2956 				if (is_submap) {
2957 					if (VME_SUBMAP(entry) !=
2958 					    (vm_map_t) object) {
2959 						/* not the same submap */
2960 						RETURN(KERN_NO_SPACE);
2961 					}
2962 				} else {
2963 					if (VME_OBJECT(entry) != object) {
2964 						/* not the same VM object... */
2965 						vm_object_t obj2;
2966 
2967 						obj2 = VME_OBJECT(entry);
2968 						if ((obj2 == VM_OBJECT_NULL ||
2969 						    obj2->internal) &&
2970 						    (object == VM_OBJECT_NULL ||
2971 						    object->internal)) {
2972 							/*
2973 							 * ... but both are
2974 							 * anonymous memory,
2975 							 * so equivalent.
2976 							 */
2977 						} else {
2978 							RETURN(KERN_NO_SPACE);
2979 						}
2980 					}
2981 				}
2982 
2983 				tmp_offset += entry->vme_end - entry->vme_start;
2984 				tmp_start += entry->vme_end - entry->vme_start;
2985 				if (entry->vme_end >= end) {
2986 					/* reached the end of our mapping */
2987 					break;
2988 				}
2989 			}
2990 			/* it all matches:  let's use what's already there ! */
2991 			RETURN(KERN_MEMORY_PRESENT);
2992 		}
2993 
2994 		/*
2995 		 *	...	the next region doesn't overlap the
2996 		 *		end point.
2997 		 */
2998 
2999 		if ((entry->vme_next != vm_map_to_entry(map)) &&
3000 		    (entry->vme_next->vme_start < end)) {
3001 			RETURN(KERN_NO_SPACE);
3002 		}
3003 	}
3004 
3005 	/*
3006 	 *	At this point,
3007 	 *		"start" and "end" should define the endpoints of the
3008 	 *			available new range, and
3009 	 *		"entry" should refer to the region before the new
3010 	 *			range, and
3011 	 *
3012 	 *		the map should be locked.
3013 	 */
3014 
3015 	/*
3016 	 *	See whether we can avoid creating a new entry (and object) by
3017 	 *	extending one of our neighbors.  [So far, we only attempt to
3018 	 *	extend from below.]  Note that we can never extend/join
3019 	 *	purgable objects because they need to remain distinct
3020 	 *	entities in order to implement their "volatile object"
3021 	 *	semantics.
3022 	 */
3023 
3024 	if (purgable ||
3025 	    entry_for_jit ||
3026 	    entry_for_tpro ||
3027 	    vm_memory_malloc_no_cow(user_alias)) {
3028 		if (object == VM_OBJECT_NULL) {
3029 			object = vm_object_allocate(size);
3030 			vm_object_lock(object);
3031 			object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3032 			VM_OBJECT_SET_TRUE_SHARE(object, FALSE);
3033 			if (malloc_no_cow_except_fork &&
3034 			    !purgable &&
3035 			    !entry_for_jit &&
3036 			    !entry_for_tpro &&
3037 			    vm_memory_malloc_no_cow(user_alias)) {
3038 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY_FORK;
3039 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
3040 			}
3041 			if (purgable) {
3042 				task_t owner;
3043 				VM_OBJECT_SET_PURGABLE(object, VM_PURGABLE_NONVOLATILE);
3044 				if (map->pmap == kernel_pmap) {
3045 					/*
3046 					 * Purgeable mappings made in a kernel
3047 					 * map are "owned" by the kernel itself
3048 					 * rather than the current user task
3049 					 * because they're likely to be used by
3050 					 * more than this user task (see
3051 					 * execargs_purgeable_allocate(), for
3052 					 * example).
3053 					 */
3054 					owner = kernel_task;
3055 				} else {
3056 					owner = current_task();
3057 				}
3058 				assert(object->vo_owner == NULL);
3059 				assert(object->resident_page_count == 0);
3060 				assert(object->wired_page_count == 0);
3061 				vm_purgeable_nonvolatile_enqueue(object, owner);
3062 			}
3063 			vm_object_unlock(object);
3064 			offset = (vm_object_offset_t)0;
3065 		}
3066 	} else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
3067 		/* no coalescing if address space uses sub-pages */
3068 	} else if ((is_submap == FALSE) &&
3069 	    (object == VM_OBJECT_NULL) &&
3070 	    (entry != vm_map_to_entry(map)) &&
3071 	    (entry->vme_end == start) &&
3072 	    (!entry->is_shared) &&
3073 	    (!entry->is_sub_map) &&
3074 	    (!entry->in_transition) &&
3075 	    (!entry->needs_wakeup) &&
3076 	    (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
3077 	    (entry->protection == cur_protection) &&
3078 	    (entry->max_protection == max_protection) &&
3079 	    (entry->inheritance == inheritance) &&
3080 	    ((user_alias == VM_MEMORY_REALLOC) ||
3081 	    (VME_ALIAS(entry) == alias)) &&
3082 	    (entry->no_cache == no_cache) &&
3083 	    (entry->vme_permanent == permanent) &&
3084 	    /* no coalescing for immutable executable mappings */
3085 	    !((entry->protection & VM_PROT_EXECUTE) &&
3086 	    entry->vme_permanent) &&
3087 	    (!entry->superpage_size && !superpage_size) &&
3088 	    /*
3089 	     * No coalescing if not map-aligned, to avoid propagating
3090 	     * that condition any further than needed:
3091 	     */
3092 	    (!entry->map_aligned || !clear_map_aligned) &&
3093 	    (!entry->zero_wired_pages) &&
3094 	    (!entry->used_for_jit && !entry_for_jit) &&
3095 #if __arm64e__
3096 	    (!entry->used_for_tpro && !entry_for_tpro) &&
3097 #endif
3098 	    (!entry->csm_associated) &&
3099 	    (entry->iokit_acct == iokit_acct) &&
3100 	    (!entry->vme_resilient_codesign) &&
3101 	    (!entry->vme_resilient_media) &&
3102 	    (!entry->vme_atomic) &&
3103 	    (entry->vme_no_copy_on_read == no_copy_on_read) &&
3104 
3105 	    ((entry->vme_end - entry->vme_start) + size <=
3106 	    (user_alias == VM_MEMORY_REALLOC ?
3107 	    ANON_CHUNK_SIZE :
3108 	    NO_COALESCE_LIMIT)) &&
3109 
3110 	    (entry->wired_count == 0)) {        /* implies user_wired_count == 0 */
3111 		if (vm_object_coalesce(VME_OBJECT(entry),
3112 		    VM_OBJECT_NULL,
3113 		    VME_OFFSET(entry),
3114 		    (vm_object_offset_t) 0,
3115 		    (vm_map_size_t)(entry->vme_end - entry->vme_start),
3116 		    (vm_map_size_t)(end - entry->vme_end))) {
3117 			/*
3118 			 *	Coalesced the two objects - can extend
3119 			 *	the previous map entry to include the
3120 			 *	new range.
3121 			 */
3122 			map->size += (end - entry->vme_end);
3123 			assert(entry->vme_start < end);
3124 			assert(VM_MAP_PAGE_ALIGNED(end,
3125 			    VM_MAP_PAGE_MASK(map)));
3126 			if (__improbable(vm_debug_events)) {
3127 				DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
3128 			}
3129 			entry->vme_end = end;
3130 			if (map->holelistenabled) {
3131 				vm_map_store_update_first_free(map, entry, TRUE);
3132 			} else {
3133 				vm_map_store_update_first_free(map, map->first_free, TRUE);
3134 			}
3135 			new_mapping_established = TRUE;
3136 			RETURN(KERN_SUCCESS);
3137 		}
3138 	}
3139 
3140 	step = superpage_size ? SUPERPAGE_SIZE : (end - start);
3141 	new_entry = NULL;
3142 
3143 	if (vmk_flags.vmkf_submap_adjust) {
3144 		vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
3145 		offset = start;
3146 	}
3147 
3148 	for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3149 		tmp2_end = tmp2_start + step;
3150 		/*
3151 		 *	Create a new entry
3152 		 *
3153 		 * XXX FBDP
3154 		 * The reserved "page zero" in each process's address space can
3155 		 * be arbitrarily large.  Splitting it into separate objects and
3156 		 * therefore different VM map entries serves no purpose and just
3157 		 * slows down operations on the VM map, so let's not split the
3158 		 * allocation into chunks if the max protection is NONE.  That
3159 		 * memory should never be accessible, so it will never get to the
3160 		 * default pager.
3161 		 */
3162 		tmp_start = tmp2_start;
3163 		if (!is_submap &&
3164 		    object == VM_OBJECT_NULL &&
3165 		    size > chunk_size &&
3166 		    max_protection != VM_PROT_NONE &&
3167 		    superpage_size == 0) {
3168 			tmp_end = tmp_start + chunk_size;
3169 		} else {
3170 			tmp_end = tmp2_end;
3171 		}
3172 		do {
3173 			if (!is_submap &&
3174 			    object != VM_OBJECT_NULL &&
3175 			    object->internal &&
3176 			    offset + (tmp_end - tmp_start) > object->vo_size) {
3177 //				printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3178 				DTRACE_VM5(vm_map_enter_overmap,
3179 				    vm_map_t, map,
3180 				    vm_map_address_t, tmp_start,
3181 				    vm_map_address_t, tmp_end,
3182 				    vm_object_offset_t, offset,
3183 				    vm_object_size_t, object->vo_size);
3184 			}
3185 			new_entry = vm_map_entry_insert(map,
3186 			    entry, tmp_start, tmp_end,
3187 			    object, offset, vmk_flags,
3188 			    needs_copy,
3189 			    cur_protection, max_protection,
3190 			    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3191 			    VM_INHERIT_NONE : inheritance),
3192 			    clear_map_aligned);
3193 
3194 			assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias));
3195 
3196 			if (resilient_codesign) {
3197 				int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3198 				if (!((cur_protection | max_protection) & reject_prot)) {
3199 					new_entry->vme_resilient_codesign = TRUE;
3200 				}
3201 			}
3202 
3203 			if (resilient_media &&
3204 			    (object == VM_OBJECT_NULL ||
3205 			    object->internal)) {
3206 				new_entry->vme_resilient_media = TRUE;
3207 			}
3208 
3209 			assert(!new_entry->iokit_acct);
3210 			if (!is_submap &&
3211 			    object != VM_OBJECT_NULL &&
3212 			    (object->purgable != VM_PURGABLE_DENY ||
3213 			    object->vo_ledger_tag)) {
3214 				assert(new_entry->use_pmap);
3215 				assert(!new_entry->iokit_acct);
3216 				/*
3217 				 * Turn off pmap accounting since
3218 				 * purgeable (or tagged) objects have their
3219 				 * own ledgers.
3220 				 */
3221 				new_entry->use_pmap = FALSE;
3222 			} else if (!is_submap &&
3223 			    iokit_acct &&
3224 			    object != VM_OBJECT_NULL &&
3225 			    object->internal) {
3226 				/* alternate accounting */
3227 				assert(!new_entry->iokit_acct);
3228 				assert(new_entry->use_pmap);
3229 				new_entry->iokit_acct = TRUE;
3230 				new_entry->use_pmap = FALSE;
3231 				DTRACE_VM4(
3232 					vm_map_iokit_mapped_region,
3233 					vm_map_t, map,
3234 					vm_map_offset_t, new_entry->vme_start,
3235 					vm_map_offset_t, new_entry->vme_end,
3236 					int, VME_ALIAS(new_entry));
3237 				vm_map_iokit_mapped_region(
3238 					map,
3239 					(new_entry->vme_end -
3240 					new_entry->vme_start));
3241 			} else if (!is_submap) {
3242 				assert(!new_entry->iokit_acct);
3243 				assert(new_entry->use_pmap);
3244 			}
3245 
3246 			if (is_submap) {
3247 				vm_map_t        submap;
3248 				boolean_t       submap_is_64bit;
3249 				boolean_t       use_pmap;
3250 
3251 				assert(new_entry->is_sub_map);
3252 				assert(!new_entry->use_pmap);
3253 				assert(!new_entry->iokit_acct);
3254 				submap = (vm_map_t) object;
3255 				submap_is_64bit = vm_map_is_64bit(submap);
3256 				use_pmap = vmk_flags.vmkf_nested_pmap;
3257 #ifndef NO_NESTED_PMAP
3258 				if (use_pmap && submap->pmap == NULL) {
3259 					ledger_t ledger = map->pmap->ledger;
3260 					/* we need a sub pmap to nest... */
3261 					submap->pmap = pmap_create_options(ledger, 0,
3262 					    submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3263 					if (submap->pmap == NULL) {
3264 						/* let's proceed without nesting... */
3265 					}
3266 #if defined(__arm64__)
3267 					else {
3268 						pmap_set_nested(submap->pmap);
3269 					}
3270 #endif
3271 				}
3272 				if (use_pmap && submap->pmap != NULL) {
3273 					if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3274 						DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3275 						kr = KERN_FAILURE;
3276 					} else {
3277 						kr = pmap_nest(map->pmap,
3278 						    submap->pmap,
3279 						    tmp_start,
3280 						    tmp_end - tmp_start);
3281 					}
3282 					if (kr != KERN_SUCCESS) {
3283 						printf("vm_map_enter: "
3284 						    "pmap_nest(0x%llx,0x%llx) "
3285 						    "error 0x%x\n",
3286 						    (long long)tmp_start,
3287 						    (long long)tmp_end,
3288 						    kr);
3289 					} else {
3290 						/* we're now nested ! */
3291 						new_entry->use_pmap = TRUE;
3292 						pmap_empty = FALSE;
3293 					}
3294 				}
3295 #endif /* NO_NESTED_PMAP */
3296 			}
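			/*
			 * When nesting succeeds above, the parent map's pmap shares the
			 * submap's translation tables for [tmp_start, tmp_end), so every
			 * map nesting the same submap (e.g. the shared region) reuses a
			 * single set of page tables.  If the sub-pmap cannot be created
			 * or pmap_nest() fails, the mapping still works, only without
			 * that sharing.
			 */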
3297 			entry = new_entry;
3298 
3299 			if (superpage_size) {
3300 				vm_page_t pages, m;
3301 				vm_object_t sp_object;
3302 				vm_object_offset_t sp_offset;
3303 
3304 				VME_OFFSET_SET(entry, 0);
3305 
3306 				/* allocate one superpage */
3307 				kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3308 				if (kr != KERN_SUCCESS) {
3309 					/* deallocate whole range... */
3310 					new_mapping_established = TRUE;
3311 					/* ... but only up to "tmp_end" */
3312 					size -= end - tmp_end;
3313 					RETURN(kr);
3314 				}
3315 
3316 				/* create one vm_object per superpage */
3317 				sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3318 				vm_object_lock(sp_object);
3319 				sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3320 				VM_OBJECT_SET_PHYS_CONTIGUOUS(sp_object, TRUE);
3321 				sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3322 				VME_OBJECT_SET(entry, sp_object, false, 0);
3323 				assert(entry->use_pmap);
3324 
3325 				/* enter the base pages into the object */
3326 				for (sp_offset = 0;
3327 				    sp_offset < SUPERPAGE_SIZE;
3328 				    sp_offset += PAGE_SIZE) {
3329 					m = pages;
3330 					pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3331 					pages = NEXT_PAGE(m);
3332 					*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3333 					vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3334 				}
3335 				vm_object_unlock(sp_object);
3336 			}
3337 		} while (tmp_end != tmp2_end &&
3338 		    (tmp_start = tmp_end) &&
3339 		    (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3340 		    tmp_end + chunk_size : tmp2_end));
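		/*
		 * The do/while above walks the requested range in "chunk_size"
		 * pieces, inserting one map entry per chunk; for superpage
		 * mappings, each pass also allocates and populates one physically
		 * contiguous superpage.
		 */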
3341 	}
3342 
3343 	new_mapping_established = TRUE;
3344 
3345 BailOut:
3346 	assert(map_locked == TRUE);
3347 
3348 	/*
3349 	 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3350 	 * If we have identified and possibly established the new mapping(s),
3351 	 * make sure we did not go beyond the address space limit.
3352 	 */
3353 	if (result == KERN_SUCCESS) {
3354 		if (map->size_limit != RLIM_INFINITY &&
3355 		    map->size > map->size_limit) {
3356 			/*
3357 			 * Establishing the requested mappings would exceed
3358 			 * the process's RLIMIT_AS limit: fail with
3359 			 * KERN_NO_SPACE.
3360 			 */
3361 			result = KERN_NO_SPACE;
3362 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3363 			    proc_selfpid(),
3364 			    (get_bsdtask_info(current_task())
3365 			    ? proc_name_address(get_bsdtask_info(current_task()))
3366 			    : "?"),
3367 			    __FUNCTION__,
3368 			    (uint64_t) map->size,
3369 			    (uint64_t) map->size_limit);
3370 			DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3371 			    vm_map_size_t, map->size,
3372 			    uint64_t, map->size_limit);
3373 			vm_map_enter_RLIMIT_AS_count++;
3374 		} else if (map->data_limit != RLIM_INFINITY &&
3375 		    map->size > map->data_limit) {
3376 			/*
3377 			 * Establishing the requested mappings would exceed
3378 			 * the process's RLIMIT_DATA limit: fail with
3379 			 * KERN_NO_SPACE.
3380 			 */
3381 			result = KERN_NO_SPACE;
3382 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3383 			    proc_selfpid(),
3384 			    (get_bsdtask_info(current_task())
3385 			    ? proc_name_address(get_bsdtask_info(current_task()))
3386 			    : "?"),
3387 			    __FUNCTION__,
3388 			    (uint64_t) map->size,
3389 			    (uint64_t) map->data_limit);
3390 			DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3391 			    vm_map_size_t, map->size,
3392 			    uint64_t, map->data_limit);
3393 			vm_map_enter_RLIMIT_DATA_count++;
3394 		}
3395 	}
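	/*
	 * Illustrative sketch (user-space, not part of this file) of how the
	 * RLIMIT_AS check above can be observed: once the map size would grow
	 * past the limit, the KERN_NO_SPACE result surfaces as a failed
	 * allocation, e.g.:
	 *
	 *	struct rlimit rl = { .rlim_cur = 64ULL << 20, .rlim_max = 64ULL << 20 };
	 *	setrlimit(RLIMIT_AS, &rl);
	 *	void *p = mmap(NULL, (size_t)128 << 20, PROT_READ | PROT_WRITE,
	 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
	 *	// expected to fail (MAP_FAILED / ENOMEM) once the limit is exceeded
	 */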
3396 
3397 	if (result == KERN_SUCCESS) {
3398 		vm_prot_t pager_prot;
3399 		memory_object_t pager;
3400 
3401 #if DEBUG
3402 		if (pmap_empty &&
3403 		    !(vmk_flags.vmkf_no_pmap_check)) {
3404 			assert(pmap_is_empty(map->pmap,
3405 			    *address,
3406 			    *address + size));
3407 		}
3408 #endif /* DEBUG */
3409 
3410 		/*
3411 		 * For "named" VM objects, let the pager know that the
3412 		 * memory object is being mapped.  Some pagers need to keep
3413 		 * track of this, to know when they can reclaim the memory
3414 		 * object, for example.
3415 		 * VM calls memory_object_map() for each mapping (specifying
3416 		 * the protection of each mapping) and calls
3417 		 * memory_object_last_unmap() when all the mappings are gone.
3418 		 */
3419 		pager_prot = max_protection;
3420 		if (needs_copy) {
3421 			/*
3422 			 * Copy-On-Write mapping: won't modify
3423 			 * the memory object.
3424 			 */
3425 			pager_prot &= ~VM_PROT_WRITE;
3426 		}
3427 		if (!is_submap &&
3428 		    object != VM_OBJECT_NULL &&
3429 		    object->named &&
3430 		    object->pager != MEMORY_OBJECT_NULL) {
3431 			vm_object_lock(object);
3432 			pager = object->pager;
3433 			if (object->named &&
3434 			    pager != MEMORY_OBJECT_NULL) {
3435 				assert(object->pager_ready);
3436 				vm_object_mapping_wait(object, THREAD_UNINT);
3437 				vm_object_mapping_begin(object);
3438 				vm_object_unlock(object);
3439 
3440 				kr = memory_object_map(pager, pager_prot);
3441 				assert(kr == KERN_SUCCESS);
3442 
3443 				vm_object_lock(object);
3444 				vm_object_mapping_end(object);
3445 			}
3446 			vm_object_unlock(object);
3447 		}
3448 	}
3449 
3450 	assert(map_locked == TRUE);
3451 
3452 	if (new_mapping_established) {
3453 		/*
3454 		 * If we release the map lock for any reason below,
3455 		 * another thread could deallocate our new mapping,
3456 		 * releasing the caller's reference on "caller_object",
3457 		 * which was transferred to the mapping.
3458 		 * If this was the only reference, the object could be
3459 		 * destroyed.
3460 		 *
3461 		 * We need to take an extra reference on "caller_object"
3462 		 * to keep it alive if we need to return the caller's
3463 		 * reference to the caller in case of failure.
3464 		 */
3465 		if (is_submap) {
3466 			vm_map_reference((vm_map_t)caller_object);
3467 		} else {
3468 			vm_object_reference(caller_object);
3469 		}
3470 	}
3471 
3472 	if (!keep_map_locked) {
3473 		vm_map_unlock(map);
3474 		map_locked = FALSE;
3475 		entry = VM_MAP_ENTRY_NULL;
3476 		new_entry = VM_MAP_ENTRY_NULL;
3477 	}
3478 
3479 	/*
3480 	 * We can't hold the map lock if we enter this block.
3481 	 */
3482 
3483 	if (result == KERN_SUCCESS) {
3484 		/*	Wire down the new entry if the user
3485 		 *	requested all new map entries be wired.
3486 		 */
3487 		if ((map->wiring_required) || (superpage_size)) {
3488 			assert(!keep_map_locked);
3489 			pmap_empty = FALSE; /* pmap won't be empty */
3490 			kr = vm_map_wire_kernel(map, start, end,
3491 			    cur_protection, VM_KERN_MEMORY_MLOCK,
3492 			    TRUE);
3493 			result = kr;
3494 		}
3495 
3496 	}
3497 
3498 	if (result != KERN_SUCCESS) {
3499 		if (new_mapping_established) {
3500 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3501 
3502 			/*
3503 			 * We have to get rid of the new mappings since we
3504 			 * won't make them available to the user.
3505 			 * Try and do that atomically, to minimize the risk
3506 			 * Try to do that atomically, to minimize the risk
3507 			 * that someone else creates new mappings in that range.
3508 			if (!map_locked) {
3509 				vm_map_lock(map);
3510 				map_locked = TRUE;
3511 			}
3512 			remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3513 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3514 			if (permanent) {
3515 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3516 			}
3517 			(void) vm_map_delete(map,
3518 			    *address, *address + size,
3519 			    remove_flags,
3520 			    KMEM_GUARD_NONE, &zap_new_list);
3521 		}
3522 
3523 		if (vm_map_zap_first_entry(&zap_old_list)) {
3524 			vm_map_entry_t entry1, entry2;
3525 
3526 			/*
3527 			 * The new mapping failed.  Attempt to restore
3528 			 * the old mappings, saved in "zap_old_list".
3529 			 */
3530 			if (!map_locked) {
3531 				vm_map_lock(map);
3532 				map_locked = TRUE;
3533 			}
3534 
3535 			/* first check if the coast is still clear */
3536 			start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3537 			end   = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3538 
3539 			if (vm_map_lookup_entry(map, start, &entry1) ||
3540 			    vm_map_lookup_entry(map, end, &entry2) ||
3541 			    entry1 != entry2) {
3542 				/*
3543 				 * Part of that range has already been
3544 				 * re-mapped:  we can't restore the old
3545 				 * mappings...
3546 				 */
3547 				vm_map_enter_restore_failures++;
3548 			} else {
3549 				/*
3550 				 * Transfer the saved map entries from
3551 				 * "zap_old_list" to the original "map",
3552 				 * inserting them all after "entry1".
3553 				 */
3554 				while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3555 					vm_map_size_t entry_size;
3556 
3557 					entry_size = (entry2->vme_end -
3558 					    entry2->vme_start);
3559 					vm_map_store_entry_link(map, entry1, entry2,
3560 					    VM_MAP_KERNEL_FLAGS_NONE);
3561 					map->size += entry_size;
3562 					entry1 = entry2;
3563 				}
3564 				if (map->wiring_required) {
3565 					/*
3566 					 * XXX TODO: we should rewire the
3567 					 * old pages here...
3568 					 */
3569 				}
3570 				vm_map_enter_restore_successes++;
3571 			}
3572 		}
3573 	}
3574 
3575 	/*
3576 	 * The caller is responsible for releasing the lock if it requested to
3577 	 * keep the map locked.
3578 	 */
3579 	if (map_locked && !keep_map_locked) {
3580 		vm_map_unlock(map);
3581 	}
3582 
3583 	vm_map_zap_dispose(&zap_old_list);
3584 	vm_map_zap_dispose(&zap_new_list);
3585 
3586 	if (new_mapping_established) {
3587 		/*
3588 		 * The caller had a reference on "caller_object" and we
3589 		 * transferred that reference to the mapping.
3590 		 * We also took an extra reference on "caller_object" to keep
3591 		 * it alive while the map was unlocked.
3592 		 */
3593 		if (result == KERN_SUCCESS) {
3594 			/*
3595 			 * On success, the caller's reference on the object gets
3596 			 * transferred to the mapping.
3597 			 * Release our extra reference.
3598 			 */
3599 			if (is_submap) {
3600 				vm_map_deallocate((vm_map_t)caller_object);
3601 			} else {
3602 				vm_object_deallocate(caller_object);
3603 			}
3604 		} else {
3605 			/*
3606 			 * On error, the caller expects to still have a
3607 			 * reference on the object it gave us.
3608 			 * Let's use our extra reference for that.
3609 			 */
3610 		}
3611 	}
3612 
3613 	return result;
3614 
3615 #undef  RETURN
3616 }
3617 
3618 #if __arm64__
3619 extern const struct memory_object_pager_ops fourk_pager_ops;
3620 kern_return_t
3621 vm_map_enter_fourk(
3622 	vm_map_t                map,
3623 	vm_map_offset_t         *address,       /* IN/OUT */
3624 	vm_map_size_t           size,
3625 	vm_map_offset_t         mask,
3626 	vm_map_kernel_flags_t   vmk_flags,
3627 	vm_object_t             object,
3628 	vm_object_offset_t      offset,
3629 	boolean_t               needs_copy,
3630 	vm_prot_t               cur_protection,
3631 	vm_prot_t               max_protection,
3632 	vm_inherit_t            inheritance)
3633 {
3634 	vm_map_entry_t          entry, new_entry;
3635 	vm_map_offset_t         start, fourk_start;
3636 	vm_map_offset_t         end, fourk_end;
3637 	vm_map_size_t           fourk_size;
3638 	kern_return_t           result = KERN_SUCCESS;
3639 	boolean_t               map_locked = FALSE;
3640 	boolean_t               pmap_empty = TRUE;
3641 	boolean_t               new_mapping_established = FALSE;
3642 	const bool              keep_map_locked = vmk_flags.vmkf_keep_map_locked;
3643 	const bool              anywhere = !vmk_flags.vmf_fixed;
3644 	const bool              purgable = vmk_flags.vmf_purgeable;
3645 	const bool              overwrite = vmk_flags.vmf_overwrite;
3646 	const bool              is_submap = vmk_flags.vmkf_submap;
3647 	const bool              entry_for_jit = vmk_flags.vmkf_map_jit;
3648 	const unsigned int      superpage_size = vmk_flags.vmf_superpage_size;
3649 	vm_map_offset_t         effective_min_offset, effective_max_offset;
3650 	kern_return_t           kr;
3651 	boolean_t               clear_map_aligned = FALSE;
3652 	memory_object_t         fourk_mem_obj;
3653 	vm_object_t             fourk_object;
3654 	vm_map_offset_t         fourk_pager_offset;
3655 	int                     fourk_pager_index_start, fourk_pager_index_num;
3656 	int                     cur_idx;
3657 	boolean_t               fourk_copy;
3658 	vm_object_t             copy_object;
3659 	vm_object_offset_t      copy_offset;
3660 	VM_MAP_ZAP_DECLARE(zap_list);
3661 
3662 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
3663 		panic("%s:%d", __FUNCTION__, __LINE__);
3664 	}
3665 	fourk_mem_obj = MEMORY_OBJECT_NULL;
3666 	fourk_object = VM_OBJECT_NULL;
3667 
3668 	if (superpage_size) {
3669 		return KERN_NOT_SUPPORTED;
3670 	}
3671 
3672 	if ((cur_protection & VM_PROT_WRITE) &&
3673 	    (cur_protection & VM_PROT_EXECUTE) &&
3674 #if XNU_TARGET_OS_OSX
3675 	    map->pmap != kernel_pmap &&
3676 	    (vm_map_cs_enforcement(map)
3677 #if __arm64__
3678 	    || !VM_MAP_IS_EXOTIC(map)
3679 #endif /* __arm64__ */
3680 	    ) &&
3681 #endif /* XNU_TARGET_OS_OSX */
3682 #if CODE_SIGNING_MONITOR
3683 	    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
3684 #endif
3685 	    !entry_for_jit) {
3686 		DTRACE_VM3(cs_wx,
3687 		    uint64_t, 0,
3688 		    uint64_t, 0,
3689 		    vm_prot_t, cur_protection);
3690 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. "
3691 		    "turning off execute\n",
3692 		    proc_selfpid(),
3693 		    (get_bsdtask_info(current_task())
3694 		    ? proc_name_address(get_bsdtask_info(current_task()))
3695 		    : "?"),
3696 		    __FUNCTION__);
3697 		cur_protection &= ~VM_PROT_EXECUTE;
3698 	}
3699 
3700 	/*
3701 	 * If the task has requested executable lockdown,
3702 	 * deny any new executable mapping.
3703 	 */
3704 	if (map->map_disallow_new_exec == TRUE) {
3705 		if (cur_protection & VM_PROT_EXECUTE) {
3706 			return KERN_PROTECTION_FAILURE;
3707 		}
3708 	}
3709 
3710 	if (is_submap) {
3711 		return KERN_NOT_SUPPORTED;
3712 	}
3713 	if (vmk_flags.vmkf_already) {
3714 		return KERN_NOT_SUPPORTED;
3715 	}
3716 	if (purgable || entry_for_jit) {
3717 		return KERN_NOT_SUPPORTED;
3718 	}
3719 
3720 	effective_min_offset = map->min_offset;
3721 
3722 	if (vmk_flags.vmkf_beyond_max) {
3723 		return KERN_NOT_SUPPORTED;
3724 	} else {
3725 		effective_max_offset = map->max_offset;
3726 	}
3727 
3728 	if (size == 0 ||
3729 	    (offset & FOURK_PAGE_MASK) != 0) {
3730 		*address = 0;
3731 		return KERN_INVALID_ARGUMENT;
3732 	}
3733 
3734 #define RETURN(value)   { result = value; goto BailOut; }
3735 
3736 	assert(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK));
3737 	assert(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK));
3738 
3739 	if (!anywhere && overwrite) {
3740 		return KERN_NOT_SUPPORTED;
3741 	}
3742 
3743 	fourk_start = *address;
3744 	fourk_size = size;
3745 	fourk_end = fourk_start + fourk_size;
3746 
3747 	start = vm_map_trunc_page(*address, VM_MAP_PAGE_MASK(map));
3748 	end = vm_map_round_page(fourk_end, VM_MAP_PAGE_MASK(map));
3749 	size = end - start;
3750 
3751 	if (anywhere) {
3752 		return KERN_NOT_SUPPORTED;
3753 	} else {
3754 		/*
3755 		 *	Verify that:
3756 		 *		the address doesn't itself violate
3757 		 *		the mask requirement.
3758 		 */
3759 
3760 		vm_map_lock(map);
3761 		map_locked = TRUE;
3762 		if ((start & mask) != 0) {
3763 			RETURN(KERN_NO_SPACE);
3764 		}
3765 
3766 		/*
3767 		 *	...	the address is within bounds
3768 		 */
3769 
3770 		end = start + size;
3771 
3772 		if ((start < effective_min_offset) ||
3773 		    (end > effective_max_offset) ||
3774 		    (start >= end)) {
3775 			RETURN(KERN_INVALID_ADDRESS);
3776 		}
3777 
3778 		/*
3779 		 *	...	the starting address isn't allocated
3780 		 */
3781 		if (vm_map_lookup_entry(map, start, &entry)) {
3782 			vm_object_t cur_object, shadow_object;
3783 
3784 			/*
3785 			 * We might already have some 4K mappings
3786 			 * in a 16K page here.
3787 			 */
3788 
3789 			if (entry->vme_end - entry->vme_start
3790 			    != SIXTEENK_PAGE_SIZE) {
3791 				RETURN(KERN_NO_SPACE);
3792 			}
3793 			if (entry->is_sub_map) {
3794 				RETURN(KERN_NO_SPACE);
3795 			}
3796 			if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
3797 				RETURN(KERN_NO_SPACE);
3798 			}
3799 
3800 			/* go all the way down the shadow chain */
3801 			cur_object = VME_OBJECT(entry);
3802 			vm_object_lock(cur_object);
3803 			while (cur_object->shadow != VM_OBJECT_NULL) {
3804 				shadow_object = cur_object->shadow;
3805 				vm_object_lock(shadow_object);
3806 				vm_object_unlock(cur_object);
3807 				cur_object = shadow_object;
3808 				shadow_object = VM_OBJECT_NULL;
3809 			}
3810 			if (cur_object->internal ||
3811 			    cur_object->pager == NULL) {
3812 				vm_object_unlock(cur_object);
3813 				RETURN(KERN_NO_SPACE);
3814 			}
3815 			if (cur_object->pager->mo_pager_ops
3816 			    != &fourk_pager_ops) {
3817 				vm_object_unlock(cur_object);
3818 				RETURN(KERN_NO_SPACE);
3819 			}
3820 			fourk_object = cur_object;
3821 			fourk_mem_obj = fourk_object->pager;
3822 
3823 			/* keep the "4K" object alive */
3824 			vm_object_reference_locked(fourk_object);
3825 			memory_object_reference(fourk_mem_obj);
3826 			vm_object_unlock(fourk_object);
3827 
3828 			/* merge permissions */
3829 			entry->protection |= cur_protection;
3830 			entry->max_protection |= max_protection;
3831 
3832 			if ((entry->protection & VM_PROT_WRITE) &&
3833 			    (entry->protection & VM_PROT_ALLEXEC) &&
3834 			    fourk_binary_compatibility_unsafe &&
3835 			    fourk_binary_compatibility_allow_wx) {
3836 				/* write+execute: need to be "jit" */
3837 				entry->used_for_jit = TRUE;
3838 			}
3839 			goto map_in_fourk_pager;
3840 		}
3841 
3842 		/*
3843 		 *	...	the next region doesn't overlap the
3844 		 *		end point.
3845 		 */
3846 
3847 		if ((entry->vme_next != vm_map_to_entry(map)) &&
3848 		    (entry->vme_next->vme_start < end)) {
3849 			RETURN(KERN_NO_SPACE);
3850 		}
3851 	}
3852 
3853 	/*
3854 	 *	At this point,
3855 	 *		"start" and "end" should define the endpoints of the
3856 	 *			available new range, and
3857 	 *		"entry" should refer to the region before the new
3858 	 *			range, and
3859 	 *
3860 	 *		the map should be locked.
3861 	 */
3862 
3863 	/* create a new "4K" pager */
3864 	fourk_mem_obj = fourk_pager_create();
3865 	fourk_object = fourk_pager_to_vm_object(fourk_mem_obj);
3866 	assert(fourk_object);
3867 
3868 	/* keep the "4K" object alive */
3869 	vm_object_reference(fourk_object);
3870 
3871 	/* create a "copy" object, to map the "4K" object copy-on-write */
3872 	fourk_copy = TRUE;
3873 	result = vm_object_copy_strategically(fourk_object,
3874 	    0,
3875 	    end - start,
3876 	    false,                                   /* forking */
3877 	    &copy_object,
3878 	    &copy_offset,
3879 	    &fourk_copy);
3880 	assert(result == KERN_SUCCESS);
3881 	assert(copy_object != VM_OBJECT_NULL);
3882 	assert(copy_offset == 0);
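	/*
	 * What actually gets inserted in the map below is "copy_object", a
	 * copy-on-write copy of the 4K pager's object: the pager assembles the
	 * 16K-page-sized backing contents from up to four 4K sub-ranges
	 * (populated after "map_in_fourk_pager" below), while modifications go
	 * to the copy object rather than to the pager's object.
	 */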
3883 
3884 	/* map the "4K" pager's copy object */
3885 	new_entry = vm_map_entry_insert(map,
3886 	    entry,
3887 	    vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)),
3888 	    vm_map_round_page(end, VM_MAP_PAGE_MASK(map)),
3889 	    copy_object,
3890 	    0,                      /* offset */
3891 	    vmk_flags,
3892 	    FALSE,                  /* needs_copy */
3893 	    cur_protection, max_protection,
3894 	    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3895 	    VM_INHERIT_NONE : inheritance),
3896 	    clear_map_aligned);
3897 	entry = new_entry;
3898 
3899 #if VM_MAP_DEBUG_FOURK
3900 	if (vm_map_debug_fourk) {
3901 		printf("FOURK_PAGER: map %p [0x%llx:0x%llx] new pager %p\n",
3902 		    map,
3903 		    (uint64_t) entry->vme_start,
3904 		    (uint64_t) entry->vme_end,
3905 		    fourk_mem_obj);
3906 	}
3907 #endif /* VM_MAP_DEBUG_FOURK */
3908 
3909 	new_mapping_established = TRUE;
3910 
3911 map_in_fourk_pager:
3912 	/* "map" the original "object" where it belongs in the "4K" pager */
3913 	fourk_pager_offset = (fourk_start & SIXTEENK_PAGE_MASK);
3914 	fourk_pager_index_start = (int) (fourk_pager_offset / FOURK_PAGE_SIZE);
3915 	if (fourk_size > SIXTEENK_PAGE_SIZE) {
3916 		fourk_pager_index_num = 4;
3917 	} else {
3918 		fourk_pager_index_num = (int) (fourk_size / FOURK_PAGE_SIZE);
3919 	}
3920 	if (fourk_pager_index_start + fourk_pager_index_num > 4) {
3921 		fourk_pager_index_num = 4 - fourk_pager_index_start;
3922 	}
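	/*
	 * Worked example of the index math above, with 16K map pages and 4K
	 * sub-pages: fourk_start ending in 0x3000 and fourk_size == 0x2000 give
	 * fourk_pager_offset == 0x3000, fourk_pager_index_start == 3 and
	 * fourk_pager_index_num == 2, which the clamp reduces to 1 so the loop
	 * below never runs past the fourth 4K slot of the pager.
	 */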
3923 	for (cur_idx = 0;
3924 	    cur_idx < fourk_pager_index_num;
3925 	    cur_idx++) {
3926 		vm_object_t             old_object;
3927 		vm_object_offset_t      old_offset;
3928 
3929 		kr = fourk_pager_populate(fourk_mem_obj,
3930 		    TRUE,                       /* overwrite */
3931 		    fourk_pager_index_start + cur_idx,
3932 		    object,
3933 		    (object
3934 		    ? (offset +
3935 		    (cur_idx * FOURK_PAGE_SIZE))
3936 		    : 0),
3937 		    &old_object,
3938 		    &old_offset);
3939 #if VM_MAP_DEBUG_FOURK
3940 		if (vm_map_debug_fourk) {
3941 			if (old_object == (vm_object_t) -1 &&
3942 			    old_offset == (vm_object_offset_t) -1) {
3943 				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3944 				    "pager [%p:0x%llx] "
3945 				    "populate[%d] "
3946 				    "[object:%p,offset:0x%llx]\n",
3947 				    map,
3948 				    (uint64_t) entry->vme_start,
3949 				    (uint64_t) entry->vme_end,
3950 				    fourk_mem_obj,
3951 				    VME_OFFSET(entry),
3952 				    fourk_pager_index_start + cur_idx,
3953 				    object,
3954 				    (object
3955 				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3956 				    : 0));
3957 			} else {
3958 				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3959 				    "pager [%p:0x%llx] "
3960 				    "populate[%d] [object:%p,offset:0x%llx] "
3961 				    "old [%p:0x%llx]\n",
3962 				    map,
3963 				    (uint64_t) entry->vme_start,
3964 				    (uint64_t) entry->vme_end,
3965 				    fourk_mem_obj,
3966 				    VME_OFFSET(entry),
3967 				    fourk_pager_index_start + cur_idx,
3968 				    object,
3969 				    (object
3970 				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3971 				    : 0),
3972 				    old_object,
3973 				    old_offset);
3974 			}
3975 		}
3976 #endif /* VM_MAP_DEBUG_FOURK */
3977 
3978 		assert(kr == KERN_SUCCESS);
3979 		if (object != old_object &&
3980 		    object != VM_OBJECT_NULL &&
3981 		    object != (vm_object_t) -1) {
3982 			vm_object_reference(object);
3983 		}
3984 		if (object != old_object &&
3985 		    old_object != VM_OBJECT_NULL &&
3986 		    old_object != (vm_object_t) -1) {
3987 			vm_object_deallocate(old_object);
3988 		}
3989 	}
3990 
3991 BailOut:
3992 	assert(map_locked == TRUE);
3993 
3994 	if (result == KERN_SUCCESS) {
3995 		vm_prot_t pager_prot;
3996 		memory_object_t pager;
3997 
3998 #if DEBUG
3999 		if (pmap_empty &&
4000 		    !(vmk_flags.vmkf_no_pmap_check)) {
4001 			assert(pmap_is_empty(map->pmap,
4002 			    *address,
4003 			    *address + size));
4004 		}
4005 #endif /* DEBUG */
4006 
4007 		/*
4008 		 * For "named" VM objects, let the pager know that the
4009 		 * memory object is being mapped.  Some pagers need to keep
4010 		 * track of this, to know when they can reclaim the memory
4011 		 * object, for example.
4012 		 * VM calls memory_object_map() for each mapping (specifying
4013 		 * the protection of each mapping) and calls
4014 		 * memory_object_last_unmap() when all the mappings are gone.
4015 		 */
4016 		pager_prot = max_protection;
4017 		if (needs_copy) {
4018 			/*
4019 			 * Copy-On-Write mapping: won't modify
4020 			 * the memory object.
4021 			 */
4022 			pager_prot &= ~VM_PROT_WRITE;
4023 		}
4024 		if (!is_submap &&
4025 		    object != VM_OBJECT_NULL &&
4026 		    object->named &&
4027 		    object->pager != MEMORY_OBJECT_NULL) {
4028 			vm_object_lock(object);
4029 			pager = object->pager;
4030 			if (object->named &&
4031 			    pager != MEMORY_OBJECT_NULL) {
4032 				assert(object->pager_ready);
4033 				vm_object_mapping_wait(object, THREAD_UNINT);
4034 				vm_object_mapping_begin(object);
4035 				vm_object_unlock(object);
4036 
4037 				kr = memory_object_map(pager, pager_prot);
4038 				assert(kr == KERN_SUCCESS);
4039 
4040 				vm_object_lock(object);
4041 				vm_object_mapping_end(object);
4042 			}
4043 			vm_object_unlock(object);
4044 		}
4045 		if (!is_submap &&
4046 		    fourk_object != VM_OBJECT_NULL &&
4047 		    fourk_object->named &&
4048 		    fourk_object->pager != MEMORY_OBJECT_NULL) {
4049 			vm_object_lock(fourk_object);
4050 			pager = fourk_object->pager;
4051 			if (fourk_object->named &&
4052 			    pager != MEMORY_OBJECT_NULL) {
4053 				assert(fourk_object->pager_ready);
4054 				vm_object_mapping_wait(fourk_object,
4055 				    THREAD_UNINT);
4056 				vm_object_mapping_begin(fourk_object);
4057 				vm_object_unlock(fourk_object);
4058 
4059 				kr = memory_object_map(pager, VM_PROT_READ);
4060 				assert(kr == KERN_SUCCESS);
4061 
4062 				vm_object_lock(fourk_object);
4063 				vm_object_mapping_end(fourk_object);
4064 			}
4065 			vm_object_unlock(fourk_object);
4066 		}
4067 	}
4068 
4069 	if (fourk_object != VM_OBJECT_NULL) {
4070 		vm_object_deallocate(fourk_object);
4071 		fourk_object = VM_OBJECT_NULL;
4072 		memory_object_deallocate(fourk_mem_obj);
4073 		fourk_mem_obj = MEMORY_OBJECT_NULL;
4074 	}
4075 
4076 	assert(map_locked == TRUE);
4077 
4078 	if (!keep_map_locked) {
4079 		vm_map_unlock(map);
4080 		map_locked = FALSE;
4081 	}
4082 
4083 	/*
4084 	 * We can't hold the map lock if we enter this block.
4085 	 */
4086 
4087 	if (result == KERN_SUCCESS) {
4088 		/*	Wire down the new entry if the user
4089 		 *	requested all new map entries be wired.
4090 		 */
4091 		if ((map->wiring_required) || (superpage_size)) {
4092 			assert(!keep_map_locked);
4093 			pmap_empty = FALSE; /* pmap won't be empty */
4094 			kr = vm_map_wire_kernel(map, start, end,
4095 			    new_entry->protection, VM_KERN_MEMORY_MLOCK,
4096 			    TRUE);
4097 			result = kr;
4098 		}
4099 
4100 	}
4101 
4102 	if (result != KERN_SUCCESS) {
4103 		if (new_mapping_established) {
4104 			/*
4105 			 * We have to get rid of the new mappings since we
4106 			 * won't make them available to the user.
4107 			 * Try to do that atomically, to minimize the risk
4108 			 * that someone else creates new mappings in that range.
4109 			 */
4110 
4111 			if (!map_locked) {
4112 				vm_map_lock(map);
4113 				map_locked = TRUE;
4114 			}
4115 			(void)vm_map_delete(map, *address, *address + size,
4116 			    VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_NO_YIELD,
4117 			    KMEM_GUARD_NONE, &zap_list);
4118 		}
4119 	}
4120 
4121 	/*
4122 	 * The caller is responsible for releasing the lock if it requested to
4123 	 * keep the map locked.
4124 	 */
4125 	if (map_locked && !keep_map_locked) {
4126 		vm_map_unlock(map);
4127 	}
4128 
4129 	vm_map_zap_dispose(&zap_list);
4130 
4131 	return result;
4132 
4133 #undef  RETURN
4134 }
4135 #endif /* __arm64__ */
4136 
4137 /*
4138  * Counters for the prefault optimization.
4139  */
4140 int64_t vm_prefault_nb_pages = 0;
4141 int64_t vm_prefault_nb_bailout = 0;
4142 
4143 static kern_return_t
4144 vm_map_enter_mem_object_helper(
4145 	vm_map_t                target_map,
4146 	vm_map_offset_t         *address,
4147 	vm_map_size_t           initial_size,
4148 	vm_map_offset_t         mask,
4149 	vm_map_kernel_flags_t   vmk_flags,
4150 	ipc_port_t              port,
4151 	vm_object_offset_t      offset,
4152 	boolean_t               copy,
4153 	vm_prot_t               cur_protection,
4154 	vm_prot_t               max_protection,
4155 	vm_inherit_t            inheritance,
4156 	upl_page_list_ptr_t     page_list,
4157 	unsigned int            page_list_count)
4158 {
4159 	vm_map_address_t        map_addr;
4160 	vm_map_size_t           map_size;
4161 	vm_object_t             object;
4162 	vm_object_size_t        size;
4163 	kern_return_t           result;
4164 	boolean_t               mask_cur_protection, mask_max_protection;
4165 	boolean_t               kernel_prefault, try_prefault = (page_list_count != 0);
4166 	vm_map_offset_t         offset_in_mapping = 0;
4167 #if __arm64__
4168 	boolean_t               fourk = vmk_flags.vmkf_fourk;
4169 #endif /* __arm64__ */
4170 
4171 	if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4172 		/* XXX TODO4K prefaulting depends on page size... */
4173 		try_prefault = FALSE;
4174 	}
4175 
4176 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
4177 	vm_map_kernel_flags_update_range_id(&vmk_flags, target_map);
4178 
4179 	mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4180 	mask_max_protection = max_protection & VM_PROT_IS_MASK;
4181 	cur_protection &= ~VM_PROT_IS_MASK;
4182 	max_protection &= ~VM_PROT_IS_MASK;
4183 
4184 	/*
4185 	 * Check arguments for validity
4186 	 */
4187 	if ((target_map == VM_MAP_NULL) ||
4188 	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4189 	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4190 	    (inheritance > VM_INHERIT_LAST_VALID) ||
4191 	    (try_prefault && (copy || !page_list)) ||
4192 	    initial_size == 0) {
4193 		return KERN_INVALID_ARGUMENT;
4194 	}
4195 
4196 	if (__improbable((cur_protection & max_protection) != cur_protection)) {
4197 		/* cur is more permissive than max */
4198 		cur_protection &= max_protection;
4199 	}
4200 
4201 #if __arm64__
4202 	if (cur_protection & VM_PROT_EXECUTE) {
4203 		cur_protection |= VM_PROT_READ;
4204 	}
4205 
4206 	if (fourk && VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4207 		/* no "fourk" if map is using a sub-page page size */
4208 		fourk = FALSE;
4209 	}
4210 	if (fourk) {
4211 		map_addr = vm_map_trunc_page(*address, FOURK_PAGE_MASK);
4212 		map_size = vm_map_round_page(initial_size, FOURK_PAGE_MASK);
4213 	} else
4214 #endif /* __arm64__ */
4215 	{
4216 		map_addr = vm_map_trunc_page(*address,
4217 		    VM_MAP_PAGE_MASK(target_map));
4218 		map_size = vm_map_round_page(initial_size,
4219 		    VM_MAP_PAGE_MASK(target_map));
4220 	}
4221 	if (map_size == 0) {
4222 		return KERN_INVALID_ARGUMENT;
4223 	}
4224 	size = vm_object_round_page(initial_size);
4225 
4226 	/*
4227 	 * Find the vm object (if any) corresponding to this port.
4228 	 */
4229 	if (!IP_VALID(port)) {
4230 		object = VM_OBJECT_NULL;
4231 		offset = 0;
4232 		copy = FALSE;
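		/*
		 * An invalid port means an anonymous mapping: vm_map_enter()
		 * below gets VM_OBJECT_NULL and the memory is zero-filled,
		 * allocated lazily at fault time.
		 */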
4233 	} else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4234 		vm_named_entry_t        named_entry;
4235 		vm_object_offset_t      data_offset;
4236 
4237 		named_entry = mach_memory_entry_from_port(port);
4238 
4239 		if (vmk_flags.vmf_return_data_addr ||
4240 		    vmk_flags.vmf_return_4k_data_addr) {
4241 			data_offset = named_entry->data_offset;
4242 			offset += named_entry->data_offset;
4243 		} else {
4244 			data_offset = 0;
4245 		}
4246 
4247 		/* a few checks to make sure user is obeying rules */
4248 		if (mask_max_protection) {
4249 			max_protection &= named_entry->protection;
4250 		}
4251 		if (mask_cur_protection) {
4252 			cur_protection &= named_entry->protection;
4253 		}
4254 		if ((named_entry->protection & max_protection) !=
4255 		    max_protection) {
4256 			return KERN_INVALID_RIGHT;
4257 		}
4258 		if ((named_entry->protection & cur_protection) !=
4259 		    cur_protection) {
4260 			return KERN_INVALID_RIGHT;
4261 		}
4262 		if (offset + size <= offset) {
4263 			/* overflow */
4264 			return KERN_INVALID_ARGUMENT;
4265 		}
4266 		if (named_entry->size < (offset + initial_size)) {
4267 			return KERN_INVALID_ARGUMENT;
4268 		}
4269 
4270 		if (named_entry->is_copy) {
4271 			/* for a vm_map_copy, we can only map it whole */
4272 			if ((size != named_entry->size) &&
4273 			    (vm_map_round_page(size,
4274 			    VM_MAP_PAGE_MASK(target_map)) ==
4275 			    named_entry->size)) {
4276 				/* XXX FBDP use the rounded size... */
4277 				size = vm_map_round_page(
4278 					size,
4279 					VM_MAP_PAGE_MASK(target_map));
4280 			}
4281 		}
4282 
4283 		/* the caller's "offset" parameter is relative to the start of the */
4284 		/* named entry; convert it to an offset in the backing VM object */
4285 		offset = offset + named_entry->offset;
4286 
4287 		if (!VM_MAP_PAGE_ALIGNED(size,
4288 		    VM_MAP_PAGE_MASK(target_map))) {
4289 			/*
4290 			 * Let's not map more than requested;
4291 			 * vm_map_enter() will handle this "not map-aligned"
4292 			 * case.
4293 			 */
4294 			map_size = size;
4295 		}
4296 
4297 		named_entry_lock(named_entry);
4298 		if (named_entry->is_sub_map) {
4299 			vm_map_t                submap;
4300 
4301 			if (vmk_flags.vmf_return_data_addr ||
4302 			    vmk_flags.vmf_return_4k_data_addr) {
4303 				panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4304 			}
4305 
4306 			submap = named_entry->backing.map;
4307 			vm_map_reference(submap);
4308 			named_entry_unlock(named_entry);
4309 
4310 			vmk_flags.vmkf_submap = TRUE;
4311 
4312 			result = vm_map_enter(target_map,
4313 			    &map_addr,
4314 			    map_size,
4315 			    mask,
4316 			    vmk_flags,
4317 			    (vm_object_t)(uintptr_t) submap,
4318 			    offset,
4319 			    copy,
4320 			    cur_protection,
4321 			    max_protection,
4322 			    inheritance);
4323 			if (result != KERN_SUCCESS) {
4324 				vm_map_deallocate(submap);
4325 			} else {
4326 				/*
4327 				 * No need to lock "submap" just to check its
4328 				 * "mapped" flag: that flag is never reset
4329 				 * once it's been set and if we race, we'll
4330 				 * just end up setting it twice, which is OK.
4331 				 */
4332 				if (submap->mapped_in_other_pmaps == FALSE &&
4333 				    vm_map_pmap(submap) != PMAP_NULL &&
4334 				    vm_map_pmap(submap) !=
4335 				    vm_map_pmap(target_map)) {
4336 					/*
4337 					 * This submap is being mapped in a map
4338 					 * that uses a different pmap.
4339 					 * Set its "mapped_in_other_pmaps" flag
4340 					 * to indicate that we now need to
4341 					 * remove mappings from all pmaps rather
4342 					 * than just the submap's pmap.
4343 					 */
4344 					vm_map_lock(submap);
4345 					submap->mapped_in_other_pmaps = TRUE;
4346 					vm_map_unlock(submap);
4347 				}
4348 				*address = map_addr;
4349 			}
4350 			return result;
4351 		} else if (named_entry->is_copy) {
4352 			kern_return_t   kr;
4353 			vm_map_copy_t   copy_map;
4354 			vm_map_entry_t  copy_entry;
4355 			vm_map_offset_t copy_addr;
4356 			vm_map_copy_t   target_copy_map;
4357 			vm_map_offset_t overmap_start, overmap_end;
4358 			vm_map_offset_t trimmed_start;
4359 			vm_map_size_t   target_size;
4360 
4361 			if (!vm_map_kernel_flags_check_vmflags(vmk_flags,
4362 			    (VM_FLAGS_FIXED |
4363 			    VM_FLAGS_ANYWHERE |
4364 			    VM_FLAGS_OVERWRITE |
4365 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4366 			    VM_FLAGS_RETURN_DATA_ADDR))) {
4367 				named_entry_unlock(named_entry);
4368 				return KERN_INVALID_ARGUMENT;
4369 			}
4370 
4371 			copy_map = named_entry->backing.copy;
4372 			assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4373 			if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4374 				/* unsupported type; should not happen */
4375 				printf("vm_map_enter_mem_object: "
4376 				    "memory_entry->backing.copy "
4377 				    "unsupported type 0x%x\n",
4378 				    copy_map->type);
4379 				named_entry_unlock(named_entry);
4380 				return KERN_INVALID_ARGUMENT;
4381 			}
4382 
4383 			if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4384 				DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, offset, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4385 			}
4386 
4387 			if (vmk_flags.vmf_return_data_addr ||
4388 			    vmk_flags.vmf_return_4k_data_addr) {
4389 				offset_in_mapping = offset & VM_MAP_PAGE_MASK(target_map);
4390 				if (vmk_flags.vmf_return_4k_data_addr) {
4391 					offset_in_mapping &= ~((signed)(0xFFF));
4392 				}
4393 			}
4394 
4395 			target_copy_map = VM_MAP_COPY_NULL;
4396 			target_size = copy_map->size;
4397 			overmap_start = 0;
4398 			overmap_end = 0;
4399 			trimmed_start = 0;
4400 			if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4401 				DEBUG4K_ADJUST("adjusting...\n");
4402 				kr = vm_map_copy_adjust_to_target(
4403 					copy_map,
4404 					offset /* includes data_offset */,
4405 					initial_size,
4406 					target_map,
4407 					copy,
4408 					&target_copy_map,
4409 					&overmap_start,
4410 					&overmap_end,
4411 					&trimmed_start);
4412 				if (kr != KERN_SUCCESS) {
4413 					named_entry_unlock(named_entry);
4414 					return kr;
4415 				}
4416 				target_size = target_copy_map->size;
4417 				if (trimmed_start >= data_offset) {
4418 					data_offset = offset & VM_MAP_PAGE_MASK(target_map);
4419 				} else {
4420 					data_offset -= trimmed_start;
4421 				}
4422 			} else {
4423 				/*
4424 				 * Assert that the vm_map_copy is coming from the right
4425 				 * zone and hasn't been forged
4426 				 */
4427 				vm_map_copy_require(copy_map);
4428 				target_copy_map = copy_map;
4429 			}
4430 
4431 			vm_map_kernel_flags_t rsv_flags = vmk_flags;
4432 
4433 			vm_map_kernel_flags_and_vmflags(&rsv_flags,
4434 			    (VM_FLAGS_FIXED |
4435 			    VM_FLAGS_ANYWHERE |
4436 			    VM_FLAGS_OVERWRITE |
4437 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4438 			    VM_FLAGS_RETURN_DATA_ADDR));
4439 
4440 			/* reserve a contiguous range */
4441 			kr = vm_map_enter(target_map,
4442 			    &map_addr,
4443 			    vm_map_round_page(target_size, VM_MAP_PAGE_MASK(target_map)),
4444 			    mask,
4445 			    rsv_flags,
4446 			    VM_OBJECT_NULL,
4447 			    0,
4448 			    FALSE,               /* copy */
4449 			    cur_protection,
4450 			    max_protection,
4451 			    inheritance);
4452 			if (kr != KERN_SUCCESS) {
4453 				DEBUG4K_ERROR("kr 0x%x\n", kr);
4454 				if (target_copy_map != copy_map) {
4455 					vm_map_copy_discard(target_copy_map);
4456 					target_copy_map = VM_MAP_COPY_NULL;
4457 				}
4458 				named_entry_unlock(named_entry);
4459 				return kr;
4460 			}
4461 
4462 			copy_addr = map_addr;
4463 
4464 			for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4465 			    copy_entry != vm_map_copy_to_entry(target_copy_map);
4466 			    copy_entry = copy_entry->vme_next) {
4467 				vm_map_t                copy_submap = VM_MAP_NULL;
4468 				vm_object_t             copy_object = VM_OBJECT_NULL;
4469 				vm_map_size_t           copy_size;
4470 				vm_object_offset_t      copy_offset;
4471 				boolean_t               do_copy = false;
4472 
4473 				if (copy_entry->is_sub_map) {
4474 					copy_submap = VME_SUBMAP(copy_entry);
4475 					copy_object = (vm_object_t)copy_submap;
4476 				} else {
4477 					copy_object = VME_OBJECT(copy_entry);
4478 				}
4479 				copy_offset = VME_OFFSET(copy_entry);
4480 				copy_size = (copy_entry->vme_end -
4481 				    copy_entry->vme_start);
4482 
4483 				/* sanity check */
4484 				if ((copy_addr + copy_size) >
4485 				    (map_addr +
4486 				    overmap_start + overmap_end +
4487 				    named_entry->size /* XXX full size */)) {
4488 					/* over-mapping too much !? */
4489 					kr = KERN_INVALID_ARGUMENT;
4490 					DEBUG4K_ERROR("kr 0x%x\n", kr);
4491 					/* abort */
4492 					break;
4493 				}
4494 
4495 				/* take a reference on the object */
4496 				if (copy_entry->is_sub_map) {
4497 					vm_map_reference(copy_submap);
4498 				} else {
4499 					if (!copy &&
4500 					    copy_object != VM_OBJECT_NULL &&
4501 					    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4502 						bool is_writable;
4503 
4504 						/*
4505 						 * We need to resolve our side of this
4506 						 * "symmetric" copy-on-write now; we
4507 						 * need a new object to map and share,
4508 						 * instead of the current one which
4509 						 * might still be shared with the
4510 						 * original mapping.
4511 						 *
4512 						 * Note: A "vm_map_copy_t" does not
4513 						 * have a lock but we're protected by
4514 						 * the named entry's lock here.
4515 						 */
4516 						// assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4517 						VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4518 						assert(copy_object != VME_OBJECT(copy_entry));
4519 						is_writable = false;
4520 						if (copy_entry->protection & VM_PROT_WRITE) {
4521 							is_writable = true;
4522 #if __arm64e__
4523 						} else if (copy_entry->used_for_tpro) {
4524 							is_writable = true;
4525 #endif /* __arm64e__ */
4526 						}
4527 						if (!copy_entry->needs_copy && is_writable) {
4528 							vm_prot_t prot;
4529 
4530 							prot = copy_entry->protection & ~VM_PROT_WRITE;
4531 							vm_object_pmap_protect(copy_object,
4532 							    copy_offset,
4533 							    copy_size,
4534 							    PMAP_NULL,
4535 							    PAGE_SIZE,
4536 							    0,
4537 							    prot);
4538 						}
4539 						copy_entry->needs_copy = FALSE;
4540 						copy_entry->is_shared = TRUE;
4541 						copy_object = VME_OBJECT(copy_entry);
4542 						copy_offset = VME_OFFSET(copy_entry);
4543 						vm_object_lock(copy_object);
4544 						/* we're about to make a shared mapping of this object */
4545 						copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4546 						VM_OBJECT_SET_TRUE_SHARE(copy_object, TRUE);
4547 						vm_object_unlock(copy_object);
4548 					}
4549 
4550 					if (copy_object != VM_OBJECT_NULL &&
4551 					    copy_object->named &&
4552 					    copy_object->pager != MEMORY_OBJECT_NULL &&
4553 					    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4554 						memory_object_t pager;
4555 						vm_prot_t       pager_prot;
4556 
4557 						/*
4558 						 * For "named" VM objects, let the pager know that the
4559 						 * memory object is being mapped.  Some pagers need to keep
4560 						 * track of this, to know when they can reclaim the memory
4561 						 * object, for example.
4562 						 * VM calls memory_object_map() for each mapping (specifying
4563 						 * the protection of each mapping) and calls
4564 						 * memory_object_last_unmap() when all the mappings are gone.
4565 						 */
4566 						pager_prot = max_protection;
4567 						if (copy) {
4568 							/*
4569 							 * Copy-On-Write mapping: won't modify the
4570 							 * memory object.
4571 							 */
4572 							pager_prot &= ~VM_PROT_WRITE;
4573 						}
4574 						vm_object_lock(copy_object);
4575 						pager = copy_object->pager;
4576 						if (copy_object->named &&
4577 						    pager != MEMORY_OBJECT_NULL &&
4578 						    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4579 							assert(copy_object->pager_ready);
4580 							vm_object_mapping_wait(copy_object, THREAD_UNINT);
4581 							vm_object_mapping_begin(copy_object);
4582 							vm_object_unlock(copy_object);
4583 
4584 							kr = memory_object_map(pager, pager_prot);
4585 							assert(kr == KERN_SUCCESS);
4586 
4587 							vm_object_lock(copy_object);
4588 							vm_object_mapping_end(copy_object);
4589 						}
4590 						vm_object_unlock(copy_object);
4591 					}
4592 
4593 					/*
4594 					 *	Perform the copy if requested
4595 					 */
4596 
4597 					if (copy && copy_object != VM_OBJECT_NULL) {
4598 						vm_object_t             new_object;
4599 						vm_object_offset_t      new_offset;
4600 
4601 						result = vm_object_copy_strategically(copy_object, copy_offset,
4602 						    copy_size,
4603 						    false,                                   /* forking */
4604 						    &new_object, &new_offset,
4605 						    &do_copy);
4606 
4607 
4608 						if (result == KERN_MEMORY_RESTART_COPY) {
4609 							boolean_t success;
4610 							boolean_t src_needs_copy;
4611 
4612 							/*
4613 							 * XXX
4614 							 * We currently ignore src_needs_copy.
4615 							 * This really is the issue of how to make
4616 							 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4617 							 * non-kernel users to use. Solution forthcoming.
4618 							 * In the meantime, since we don't allow non-kernel
4619 							 * memory managers to specify symmetric copy,
4620 							 * we won't run into problems here.
4621 							 */
4622 							new_object = copy_object;
4623 							new_offset = copy_offset;
4624 							success = vm_object_copy_quickly(new_object,
4625 							    new_offset,
4626 							    copy_size,
4627 							    &src_needs_copy,
4628 							    &do_copy);
4629 							assert(success);
4630 							result = KERN_SUCCESS;
4631 						}
4632 						if (result != KERN_SUCCESS) {
4633 							kr = result;
4634 							break;
4635 						}
4636 
4637 						copy_object = new_object;
4638 						copy_offset = new_offset;
4639 						/*
4640 						 * No extra object reference for the mapping:
4641 						 * the mapping should be the only thing keeping
4642 						 * this new object alive.
4643 						 */
4644 					} else {
4645 						/*
4646 						 * We already have the right object
4647 						 * to map.
4648 						 */
4649 						copy_object = VME_OBJECT(copy_entry);
4650 						/* take an extra ref for the mapping below */
4651 						vm_object_reference(copy_object);
4652 					}
4653 				}
4654 
4655 				/*
4656 				 * If the caller does not want a specific
4657 				 * tag for this new mapping:  use
4658 				 * the tag of the original mapping.
4659 				 */
4660 				vm_map_kernel_flags_t vmk_remap_flags = {
4661 					.vmkf_submap = copy_entry->is_sub_map,
4662 				};
4663 
4664 				vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
4665 				    vm_map_kernel_flags_vmflags(vmk_flags),
4666 				    vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4667 
4668 				/* over-map the object into destination */
4669 				vmk_remap_flags.vmf_fixed = true;
4670 				vmk_remap_flags.vmf_overwrite = true;
4671 
4672 				if (!copy && !copy_entry->is_sub_map) {
4673 					/*
4674 					 * copy-on-write should have been
4675 					 * resolved at this point, or we would
4676 					 * end up sharing instead of copying.
4677 					 */
4678 					assert(!copy_entry->needs_copy);
4679 				}
4680 #if XNU_TARGET_OS_OSX
4681 				if (copy_entry->used_for_jit) {
4682 					vmk_remap_flags.vmkf_map_jit = TRUE;
4683 				}
4684 #endif /* XNU_TARGET_OS_OSX */
4685 
4686 				kr = vm_map_enter(target_map,
4687 				    &copy_addr,
4688 				    copy_size,
4689 				    (vm_map_offset_t) 0,
4690 				    vmk_remap_flags,
4691 				    copy_object,
4692 				    copy_offset,
4693 				    ((copy_object == NULL)
4694 				    ? FALSE
4695 				    : (copy || copy_entry->needs_copy)),
4696 				    cur_protection,
4697 				    max_protection,
4698 				    inheritance);
4699 				if (kr != KERN_SUCCESS) {
4700 					DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4701 					if (copy_entry->is_sub_map) {
4702 						vm_map_deallocate(copy_submap);
4703 					} else {
4704 						vm_object_deallocate(copy_object);
4705 					}
4706 					/* abort */
4707 					break;
4708 				}
4709 
4710 				/* next mapping */
4711 				copy_addr += copy_size;
4712 			}
4713 
4714 			if (kr == KERN_SUCCESS) {
4715 				if (vmk_flags.vmf_return_data_addr ||
4716 				    vmk_flags.vmf_return_4k_data_addr) {
4717 					*address = map_addr + offset_in_mapping;
4718 				} else {
4719 					*address = map_addr;
4720 				}
4721 				if (overmap_start) {
4722 					*address += overmap_start;
4723 					DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t) offset_in_mapping, (uint64_t)overmap_start, (uint64_t)*address);
4724 				}
4725 			}
4726 			named_entry_unlock(named_entry);
4727 			if (target_copy_map != copy_map) {
4728 				vm_map_copy_discard(target_copy_map);
4729 				target_copy_map = VM_MAP_COPY_NULL;
4730 			}
4731 
4732 			if (kr != KERN_SUCCESS && !vmk_flags.vmf_overwrite) {
4733 				/* deallocate the contiguous range */
4734 				(void) vm_deallocate(target_map,
4735 				    map_addr,
4736 				    map_size);
4737 			}
4738 
4739 			return kr;
4740 		}
4741 
4742 		if (named_entry->is_object) {
4743 			unsigned int    access;
4744 			unsigned int    wimg_mode;
4745 
4746 			/* we are mapping a VM object */
4747 
4748 			access = named_entry->access;
4749 
4750 			if (vmk_flags.vmf_return_data_addr ||
4751 			    vmk_flags.vmf_return_4k_data_addr) {
4752 				offset_in_mapping = offset - VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4753 				if (vmk_flags.vmf_return_4k_data_addr) {
4754 					offset_in_mapping &= ~((signed)(0xFFF));
4755 				}
4756 				offset = VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4757 				map_size = VM_MAP_ROUND_PAGE((offset + offset_in_mapping + initial_size) - offset, VM_MAP_PAGE_MASK(target_map));
4758 			}
4759 
4760 			object = vm_named_entry_to_vm_object(named_entry);
4761 			assert(object != VM_OBJECT_NULL);
4762 			vm_object_lock(object);
4763 			named_entry_unlock(named_entry);
4764 
4765 			vm_object_reference_locked(object);
4766 
4767 			wimg_mode = object->wimg_bits;
4768 			vm_prot_to_wimg(access, &wimg_mode);
4769 			if (object->wimg_bits != wimg_mode) {
4770 				vm_object_change_wimg_mode(object, wimg_mode);
4771 			}
4772 
4773 			vm_object_unlock(object);
4774 		} else {
4775 			panic("invalid VM named entry %p", named_entry);
4776 		}
4777 	} else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4778 		/*
4779 		 * JMM - This is temporary until we unify named entries
4780 		 * and raw memory objects.
4781 		 *
4782 		 * Detected fake ip_kotype for a memory object.  In
4783 		 * this case, the port isn't really a port at all, but
4784 		 * instead is just a raw memory object.
4785 		 */
4786 		if (vmk_flags.vmf_return_data_addr ||
4787 		    vmk_flags.vmf_return_4k_data_addr) {
4788 			panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4789 		}
4790 
4791 		object = memory_object_to_vm_object((memory_object_t)port);
4792 		if (object == VM_OBJECT_NULL) {
4793 			return KERN_INVALID_OBJECT;
4794 		}
4795 		vm_object_reference(object);
4796 
4797 		/* wait for object (if any) to be ready */
4798 		if (object != VM_OBJECT_NULL) {
4799 			if (is_kernel_object(object)) {
4800 				printf("Warning: Attempt to map kernel object"
4801 				    " by a non-private kernel entity\n");
4802 				return KERN_INVALID_OBJECT;
4803 			}
4804 			if (!object->pager_ready) {
4805 				vm_object_lock(object);
4806 
4807 				while (!object->pager_ready) {
4808 					vm_object_wait(object,
4809 					    VM_OBJECT_EVENT_PAGER_READY,
4810 					    THREAD_UNINT);
4811 					vm_object_lock(object);
4812 				}
4813 				vm_object_unlock(object);
4814 			}
4815 		}
4816 	} else {
4817 		return KERN_INVALID_OBJECT;
4818 	}
4819 
4820 	if (object != VM_OBJECT_NULL &&
4821 	    object->named &&
4822 	    object->pager != MEMORY_OBJECT_NULL &&
4823 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4824 		memory_object_t pager;
4825 		vm_prot_t       pager_prot;
4826 		kern_return_t   kr;
4827 
4828 		/*
4829 		 * For "named" VM objects, let the pager know that the
4830 		 * memory object is being mapped.  Some pagers need to keep
4831 		 * track of this, to know when they can reclaim the memory
4832 		 * object, for example.
4833 		 * VM calls memory_object_map() for each mapping (specifying
4834 		 * the protection of each mapping) and calls
4835 		 * memory_object_last_unmap() when all the mappings are gone.
4836 		 */
4837 		pager_prot = max_protection;
4838 		if (copy) {
4839 			/*
4840 			 * Copy-On-Write mapping: won't modify the
4841 			 * memory object.
4842 			 */
4843 			pager_prot &= ~VM_PROT_WRITE;
4844 		}
4845 		vm_object_lock(object);
4846 		pager = object->pager;
4847 		if (object->named &&
4848 		    pager != MEMORY_OBJECT_NULL &&
4849 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4850 			assert(object->pager_ready);
4851 			vm_object_mapping_wait(object, THREAD_UNINT);
4852 			vm_object_mapping_begin(object);
4853 			vm_object_unlock(object);
4854 
4855 			kr = memory_object_map(pager, pager_prot);
4856 			assert(kr == KERN_SUCCESS);
4857 
4858 			vm_object_lock(object);
4859 			vm_object_mapping_end(object);
4860 		}
4861 		vm_object_unlock(object);
4862 	}
4863 
4864 	/*
4865 	 *	Perform the copy if requested
4866 	 */
4867 
4868 	if (copy) {
4869 		vm_object_t             new_object;
4870 		vm_object_offset_t      new_offset;
4871 
4872 		result = vm_object_copy_strategically(object, offset,
4873 		    map_size,
4874 		    false,                                   /* forking */
4875 		    &new_object, &new_offset,
4876 		    &copy);
4877 
4878 
4879 		if (result == KERN_MEMORY_RESTART_COPY) {
4880 			boolean_t success;
4881 			boolean_t src_needs_copy;
4882 
4883 			/*
4884 			 * XXX
4885 			 * We currently ignore src_needs_copy.
4886 			 * This really is the issue of how to make
4887 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4888 			 * non-kernel users to use. Solution forthcoming.
4889 			 * In the meantime, since we don't allow non-kernel
4890 			 * memory managers to specify symmetric copy,
4891 			 * we won't run into problems here.
4892 			 */
4893 			new_object = object;
4894 			new_offset = offset;
4895 			success = vm_object_copy_quickly(new_object,
4896 			    new_offset,
4897 			    map_size,
4898 			    &src_needs_copy,
4899 			    &copy);
4900 			assert(success);
4901 			result = KERN_SUCCESS;
4902 		}
4903 		/*
4904 		 *	Throw away the reference to the
4905 		 *	original object, as it won't be mapped.
4906 		 */
4907 
4908 		vm_object_deallocate(object);
4909 
4910 		if (result != KERN_SUCCESS) {
4911 			return result;
4912 		}
4913 
4914 		object = new_object;
4915 		offset = new_offset;
4916 	}
4917 
4918 	/*
4919 	 * If non-kernel users want to try to prefault pages, the mapping and the
4920 	 * prefault need to be atomic.
4921 	 */
4922 	kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4923 	vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4924 
4925 #if __arm64__
4926 	if (fourk) {
4927 		/* map this object in a "4K" pager */
4928 		result = vm_map_enter_fourk(target_map,
4929 		    &map_addr,
4930 		    map_size,
4931 		    (vm_map_offset_t) mask,
4932 		    vmk_flags,
4933 		    object,
4934 		    offset,
4935 		    copy,
4936 		    cur_protection,
4937 		    max_protection,
4938 		    inheritance);
4939 	} else
4940 #endif /* __arm64__ */
4941 	{
4942 		result = vm_map_enter(target_map,
4943 		    &map_addr, map_size,
4944 		    (vm_map_offset_t)mask,
4945 		    vmk_flags,
4946 		    object, offset,
4947 		    copy,
4948 		    cur_protection, max_protection,
4949 		    inheritance);
4950 	}
4951 	if (result != KERN_SUCCESS) {
4952 		vm_object_deallocate(object);
4953 	}
4954 
4955 	/*
4956 	 * Try to prefault, and do not forget to release the vm map lock.
4957 	 */
4958 	if (result == KERN_SUCCESS && try_prefault) {
4959 		mach_vm_address_t va = map_addr;
4960 		kern_return_t kr = KERN_SUCCESS;
4961 		unsigned int i = 0;
4962 		int pmap_options;
4963 
4964 		pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4965 		if (object->internal) {
4966 			pmap_options |= PMAP_OPTIONS_INTERNAL;
4967 		}
4968 
4969 		for (i = 0; i < page_list_count; ++i) {
4970 			if (!UPL_VALID_PAGE(page_list, i)) {
4971 				if (kernel_prefault) {
4972 					assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4973 					result = KERN_MEMORY_ERROR;
4974 					break;
4975 				}
4976 			} else {
4977 				/*
4978 				 * If this call fails, we should stop trying
4979 				 * to optimize: the remaining calls are likely
4980 				 * to fail too.
4981 				 *
4982 				 * We do not report an error for such a failure,
4983 				 * though: prefaulting is an optimization, not
4984 				 * something critical.
4985 				 */
4986 				kr = pmap_enter_options(target_map->pmap,
4987 				    va, UPL_PHYS_PAGE(page_list, i),
4988 				    cur_protection, VM_PROT_NONE,
4989 				    0, TRUE, pmap_options, NULL, PMAP_MAPPING_TYPE_INFER);
4990 				if (kr != KERN_SUCCESS) {
4991 					OSIncrementAtomic64(&vm_prefault_nb_bailout);
4992 					if (kernel_prefault) {
4993 						result = kr;
4994 					}
4995 					break;
4996 				}
4997 				OSIncrementAtomic64(&vm_prefault_nb_pages);
4998 			}
4999 
5000 			/* Next virtual address */
5001 			va += PAGE_SIZE;
5002 		}
5003 		if (vmk_flags.vmkf_keep_map_locked) {
5004 			vm_map_unlock(target_map);
5005 		}
5006 	}
5007 
5008 	if (vmk_flags.vmf_return_data_addr ||
5009 	    vmk_flags.vmf_return_4k_data_addr) {
5010 		*address = map_addr + offset_in_mapping;
5011 	} else {
5012 		*address = map_addr;
5013 	}
5014 	return result;
5015 }
5016 
5017 kern_return_t
5018 vm_map_enter_mem_object(
5019 	vm_map_t                target_map,
5020 	vm_map_offset_t         *address,
5021 	vm_map_size_t           initial_size,
5022 	vm_map_offset_t         mask,
5023 	vm_map_kernel_flags_t   vmk_flags,
5024 	ipc_port_t              port,
5025 	vm_object_offset_t      offset,
5026 	boolean_t               copy,
5027 	vm_prot_t               cur_protection,
5028 	vm_prot_t               max_protection,
5029 	vm_inherit_t            inheritance)
5030 {
5031 	kern_return_t ret;
5032 
5033 	/* range_id is set by vm_map_enter_mem_object_helper */
5034 	ret = vm_map_enter_mem_object_helper(target_map,
5035 	    address,
5036 	    initial_size,
5037 	    mask,
5038 	    vmk_flags,
5039 	    port,
5040 	    offset,
5041 	    copy,
5042 	    cur_protection,
5043 	    max_protection,
5044 	    inheritance,
5045 	    NULL,
5046 	    0);
5047 
5048 #if KASAN
5049 	if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
5050 		kasan_notify_address(*address, initial_size);
5051 	}
5052 #endif
5053 
5054 	return ret;
5055 }
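
/*
 * Illustrative sketch only (not part of the original file): a kernel
 * caller mapping a memory entry port into a task's map might look
 * roughly like the following, where "mem_port", "user_map" and "size"
 * are hypothetical names, and the "anywhere" flags initializer is
 * assumed to exist alongside the VM_MAP_KERNEL_FLAGS_FIXED() form used
 * elsewhere in this file:
 *
 *	vm_map_offset_t addr = 0;
 *	kr = vm_map_enter_mem_object(user_map, &addr, size, 0,
 *	    VM_MAP_KERNEL_FLAGS_ANYWHERE(), mem_port, 0, FALSE,
 *	    VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
 */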
5056 
5057 kern_return_t
5058 vm_map_enter_mem_object_prefault(
5059 	vm_map_t                target_map,
5060 	vm_map_offset_t         *address,
5061 	vm_map_size_t           initial_size,
5062 	vm_map_offset_t         mask,
5063 	vm_map_kernel_flags_t   vmk_flags,
5064 	ipc_port_t              port,
5065 	vm_object_offset_t      offset,
5066 	vm_prot_t               cur_protection,
5067 	vm_prot_t               max_protection,
5068 	upl_page_list_ptr_t     page_list,
5069 	unsigned int            page_list_count)
5070 {
5071 	kern_return_t ret;
5072 
5073 	/* range_id is set by vm_map_enter_mem_object_helper */
5074 	ret = vm_map_enter_mem_object_helper(target_map,
5075 	    address,
5076 	    initial_size,
5077 	    mask,
5078 	    vmk_flags,
5079 	    port,
5080 	    offset,
5081 	    FALSE,
5082 	    cur_protection,
5083 	    max_protection,
5084 	    VM_INHERIT_DEFAULT,
5085 	    page_list,
5086 	    page_list_count);
5087 
5088 #if KASAN
5089 	if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
5090 		kasan_notify_address(*address, initial_size);
5091 	}
5092 #endif
5093 
5094 	return ret;
5095 }
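
/*
 * Illustrative note: the prefault variant above always maps with
 * copy == FALSE and VM_INHERIT_DEFAULT, and enters any resident pages
 * described by the UPL page list into the pmap up front so that the
 * first access does not fault.  A hypothetical caller holding a UPL
 * page list "upl_pages" with "upl_count" entries might use it roughly as:
 *
 *	kr = vm_map_enter_mem_object_prefault(map, &addr, size, 0,
 *	    vmk_flags, mem_port, 0, VM_PROT_DEFAULT, VM_PROT_ALL,
 *	    upl_pages, upl_count);
 */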
5096 
5097 
5098 kern_return_t
5099 vm_map_enter_mem_object_control(
5100 	vm_map_t                target_map,
5101 	vm_map_offset_t         *address,
5102 	vm_map_size_t           initial_size,
5103 	vm_map_offset_t         mask,
5104 	vm_map_kernel_flags_t   vmk_flags,
5105 	memory_object_control_t control,
5106 	vm_object_offset_t      offset,
5107 	boolean_t               copy,
5108 	vm_prot_t               cur_protection,
5109 	vm_prot_t               max_protection,
5110 	vm_inherit_t            inheritance)
5111 {
5112 	vm_map_address_t        map_addr;
5113 	vm_map_size_t           map_size;
5114 	vm_object_t             object;
5115 	vm_object_size_t        size;
5116 	kern_return_t           result;
5117 	memory_object_t         pager;
5118 	vm_prot_t               pager_prot;
5119 	kern_return_t           kr;
5120 #if __arm64__
5121 	boolean_t               fourk = vmk_flags.vmkf_fourk;
5122 #endif /* __arm64__ */
5123 
5124 	/*
5125 	 * Check arguments for validity
5126 	 */
5127 	if ((target_map == VM_MAP_NULL) ||
5128 	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
5129 	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
5130 	    (inheritance > VM_INHERIT_LAST_VALID) ||
5131 	    initial_size == 0) {
5132 		return KERN_INVALID_ARGUMENT;
5133 	}
5134 
5135 	if (__improbable((cur_protection & max_protection) != cur_protection)) {
5136 		/* cur is more permissive than max */
5137 		cur_protection &= max_protection;
5138 	}
5139 
5140 #if __arm64__
5141 	if (fourk && VM_MAP_PAGE_MASK(target_map) < PAGE_MASK) {
5142 		fourk = FALSE;
5143 	}
5144 
5145 	if (fourk) {
5146 		map_addr = vm_map_trunc_page(*address,
5147 		    FOURK_PAGE_MASK);
5148 		map_size = vm_map_round_page(initial_size,
5149 		    FOURK_PAGE_MASK);
5150 	} else
5151 #endif /* __arm64__ */
5152 	{
5153 		map_addr = vm_map_trunc_page(*address,
5154 		    VM_MAP_PAGE_MASK(target_map));
5155 		map_size = vm_map_round_page(initial_size,
5156 		    VM_MAP_PAGE_MASK(target_map));
5157 	}
5158 	size = vm_object_round_page(initial_size);
5159 
5160 	object = memory_object_control_to_vm_object(control);
5161 
5162 	if (object == VM_OBJECT_NULL) {
5163 		return KERN_INVALID_OBJECT;
5164 	}
5165 
5166 	if (is_kernel_object(object)) {
5167 		printf("Warning: Attempt to map kernel object"
5168 		    " by a non-private kernel entity\n");
5169 		return KERN_INVALID_OBJECT;
5170 	}
5171 
5172 	vm_object_lock(object);
5173 	object->ref_count++;
5174 
5175 	/*
5176 	 * For "named" VM objects, let the pager know that the
5177 	 * memory object is being mapped.  Some pagers need to keep
5178 	 * track of this, to know when they can reclaim the memory
5179 	 * object, for example.
5180 	 * VM calls memory_object_map() for each mapping (specifying
5181 	 * the protection of each mapping) and calls
5182 	 * memory_object_last_unmap() when all the mappings are gone.
5183 	 */
5184 	pager_prot = max_protection;
5185 	if (copy) {
5186 		pager_prot &= ~VM_PROT_WRITE;
5187 	}
5188 	pager = object->pager;
5189 	if (object->named &&
5190 	    pager != MEMORY_OBJECT_NULL &&
5191 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
5192 		assert(object->pager_ready);
5193 		vm_object_mapping_wait(object, THREAD_UNINT);
5194 		vm_object_mapping_begin(object);
5195 		vm_object_unlock(object);
5196 
5197 		kr = memory_object_map(pager, pager_prot);
5198 		assert(kr == KERN_SUCCESS);
5199 
5200 		vm_object_lock(object);
5201 		vm_object_mapping_end(object);
5202 	}
5203 	vm_object_unlock(object);
5204 
5205 	/*
5206 	 *	Perform the copy if requested
5207 	 */
5208 
5209 	if (copy) {
5210 		vm_object_t             new_object;
5211 		vm_object_offset_t      new_offset;
5212 
5213 		result = vm_object_copy_strategically(object, offset, size,
5214 		    false,                                   /* forking */
5215 		    &new_object, &new_offset,
5216 		    &copy);
5217 
5218 
5219 		if (result == KERN_MEMORY_RESTART_COPY) {
5220 			boolean_t success;
5221 			boolean_t src_needs_copy;
5222 
5223 			/*
5224 			 * XXX
5225 			 * We currently ignore src_needs_copy.
5226 			 * This really is the issue of how to make
5227 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
5228 			 * non-kernel users to use. Solution forthcoming.
5229 			 * In the meantime, since we don't allow non-kernel
5230 			 * memory managers to specify symmetric copy,
5231 			 * we won't run into problems here.
5232 			 */
5233 			new_object = object;
5234 			new_offset = offset;
5235 			success = vm_object_copy_quickly(new_object,
5236 			    new_offset, size,
5237 			    &src_needs_copy,
5238 			    &copy);
5239 			assert(success);
5240 			result = KERN_SUCCESS;
5241 		}
5242 		/*
5243 		 *	Throw away the reference to the
5244 		 *	original object, as it won't be mapped.
5245 		 */
5246 
5247 		vm_object_deallocate(object);
5248 
5249 		if (result != KERN_SUCCESS) {
5250 			return result;
5251 		}
5252 
5253 		object = new_object;
5254 		offset = new_offset;
5255 	}
5256 
5257 #if __arm64__
5258 	if (fourk) {
5259 		result = vm_map_enter_fourk(target_map,
5260 		    &map_addr,
5261 		    map_size,
5262 		    (vm_map_offset_t)mask,
5263 		    vmk_flags,
5264 		    object, offset,
5265 		    copy,
5266 		    cur_protection, max_protection,
5267 		    inheritance);
5268 	} else
5269 #endif /* __arm64__ */
5270 	{
5271 		result = vm_map_enter(target_map,
5272 		    &map_addr, map_size,
5273 		    (vm_map_offset_t)mask,
5274 		    vmk_flags,
5275 		    object, offset,
5276 		    copy,
5277 		    cur_protection, max_protection,
5278 		    inheritance);
5279 	}
5280 	if (result != KERN_SUCCESS) {
5281 		vm_object_deallocate(object);
5282 	}
5283 	*address = map_addr;
5284 
5285 	return result;
5286 }
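
/*
 * Illustrative note: unlike the port-based entry points above, this
 * variant starts from a memory_object_control_t.  A hypothetical
 * pager-side caller already holding "control" could map the object it
 * controls read-only with, roughly:
 *
 *	kr = vm_map_enter_mem_object_control(map, &addr, size, 0,
 *	    vmk_flags, control, 0, FALSE,
 *	    VM_PROT_READ, VM_PROT_READ, VM_INHERIT_NONE);
 */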
5287 
5288 
5289 #if     VM_CPM
5290 
5291 #ifdef MACH_ASSERT
5292 extern pmap_paddr_t     avail_start, avail_end;
5293 #endif
5294 
5295 /*
5296  *	Allocate memory in the specified map, with the caveat that
5297  *	the memory is physically contiguous.  This call may fail
5298  *	if the system can't find sufficient contiguous memory.
5299  *	This call may cause or lead to heart-stopping amounts of
5300  *	paging activity.
5301  *
5302  *	Memory obtained from this call should be freed in the
5303  *	normal way, viz., via vm_deallocate.
5304  */
5305 kern_return_t
5306 vm_map_enter_cpm(
5307 	vm_map_t                map,
5308 	vm_map_offset_t        *addr,
5309 	vm_map_size_t           size,
5310 	vm_map_kernel_flags_t   vmk_flags)
5311 {
5312 	vm_object_t             cpm_obj;
5313 	pmap_t                  pmap;
5314 	vm_page_t               m, pages;
5315 	kern_return_t           kr;
5316 	vm_map_offset_t         va, start, end, offset;
5317 #if     MACH_ASSERT
5318 	vm_map_offset_t         prev_addr = 0;
5319 #endif  /* MACH_ASSERT */
5320 	uint8_t                 object_lock_type = 0;
5321 
5322 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
5323 		/* XXX TODO4K do we need to support this? */
5324 		*addr = 0;
5325 		return KERN_NOT_SUPPORTED;
5326 	}
5327 
5328 	if (size == 0) {
5329 		*addr = 0;
5330 		return KERN_SUCCESS;
5331 	}
5332 	if (vmk_flags.vmf_fixed) {
5333 		*addr = vm_map_trunc_page(*addr,
5334 		    VM_MAP_PAGE_MASK(map));
5335 	} else {
5336 		*addr = vm_map_min(map);
5337 	}
5338 	size = vm_map_round_page(size,
5339 	    VM_MAP_PAGE_MASK(map));
5340 
5341 	/*
5342 	 * LP64todo - cpm_allocate should probably allow
5343 	 * allocations of >4GB, but not with the current
5344 	 * algorithm, so just cast down the size for now.
5345 	 */
5346 	if (size > VM_MAX_ADDRESS) {
5347 		return KERN_RESOURCE_SHORTAGE;
5348 	}
5349 	if ((kr = cpm_allocate(CAST_DOWN(vm_size_t, size),
5350 	    &pages, 0, 0, TRUE, flags)) != KERN_SUCCESS) {
5351 		return kr;
5352 	}
5353 
5354 	cpm_obj = vm_object_allocate((vm_object_size_t)size);
5355 	assert(cpm_obj != VM_OBJECT_NULL);
5356 	assert(cpm_obj->internal);
5357 	assert(cpm_obj->vo_size == (vm_object_size_t)size);
5358 	assert(cpm_obj->can_persist == FALSE);
5359 	assert(cpm_obj->pager_created == FALSE);
5360 	assert(cpm_obj->pageout == FALSE);
5361 	assert(cpm_obj->shadow == VM_OBJECT_NULL);
5362 
5363 	/*
5364 	 *	Insert pages into object.
5365 	 */
5366 	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5367 	vm_object_lock(cpm_obj);
5368 	for (offset = 0; offset < size; offset += PAGE_SIZE) {
5369 		m = pages;
5370 		pages = NEXT_PAGE(m);
5371 		*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
5372 
5373 		assert(!m->vmp_gobbled);
5374 		assert(!m->vmp_wanted);
5375 		assert(!m->vmp_pageout);
5376 		assert(!m->vmp_tabled);
5377 		assert(VM_PAGE_WIRED(m));
5378 		assert(m->vmp_busy);
5379 		assert(VM_PAGE_GET_PHYS_PAGE(m) >= (avail_start >> PAGE_SHIFT) && VM_PAGE_GET_PHYS_PAGE(m) <= (avail_end >> PAGE_SHIFT));
5380 
5381 		m->vmp_busy = FALSE;
5382 		vm_page_insert(m, cpm_obj, offset);
5383 	}
5384 	assert(cpm_obj->resident_page_count == size / PAGE_SIZE);
5385 	vm_object_unlock(cpm_obj);
5386 
5387 	/*
5388 	 *	Hang onto a reference on the object in case a
5389 	 *	multi-threaded application for some reason decides
5390 	 *	to deallocate the portion of the address space into
5391 	 *	which we will insert this object.
5392 	 *
5393 	 *	Unfortunately, we must insert the object now before
5394 	 *	we can talk to the pmap module about which addresses
5395 	 *	must be wired down.  Hence, the race with a multi-
5396 	 *	threaded app.
5397 	 */
5398 	vm_object_reference(cpm_obj);
5399 
5400 	/*
5401 	 *	Insert object into map.
5402 	 */
5403 
5404 	kr = vm_map_enter(
5405 		map,
5406 		addr,
5407 		size,
5408 		(vm_map_offset_t)0,
5409 		vmk_flags,
5410 		cpm_obj,
5411 		(vm_object_offset_t)0,
5412 		FALSE,
5413 		VM_PROT_ALL,
5414 		VM_PROT_ALL,
5415 		VM_INHERIT_DEFAULT);
5416 
5417 	if (kr != KERN_SUCCESS) {
5418 		/*
5419 		 *	A CPM object doesn't have can_persist set,
5420 		 *	so all we have to do is deallocate it to
5421 		 *	free up these pages.
5422 		 */
5423 		assert(cpm_obj->pager_created == FALSE);
5424 		assert(cpm_obj->can_persist == FALSE);
5425 		assert(cpm_obj->pageout == FALSE);
5426 		assert(cpm_obj->shadow == VM_OBJECT_NULL);
5427 		vm_object_deallocate(cpm_obj); /* kill acquired ref */
5428 		vm_object_deallocate(cpm_obj); /* kill creation ref */
5429 	}
5430 
5431 	/*
5432 	 *	Inform the physical mapping system that the
5433 	 *	range of addresses may not fault, so that
5434 	 *	page tables and such can be locked down as well.
5435 	 */
5436 	start = *addr;
5437 	end = start + size;
5438 	pmap = vm_map_pmap(map);
5439 	pmap_pageable(pmap, start, end, FALSE);
5440 
5441 	/*
5442 	 *	Enter each page into the pmap, to avoid faults.
5443 	 *	Note that this loop could be coded more efficiently,
5444 	 *	if the need arose, rather than looking up each page
5445 	 *	again.
5446 	 */
5447 	for (offset = 0, va = start; offset < size;
5448 	    va += PAGE_SIZE, offset += PAGE_SIZE) {
5449 		int type_of_fault;
5450 
5451 		vm_object_lock(cpm_obj);
5452 		m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5453 		assert(m != VM_PAGE_NULL);
5454 
5455 		vm_page_zero_fill(m);
5456 
5457 		type_of_fault = DBG_ZERO_FILL_FAULT;
5458 
5459 		vm_fault_enter(m, pmap, va,
5460 		    PAGE_SIZE, 0,
5461 		    VM_PROT_ALL, VM_PROT_WRITE,
5462 		    VM_PAGE_WIRED(m),
5463 		    FALSE,                             /* change_wiring */
5464 		    VM_KERN_MEMORY_NONE,                             /* tag - not wiring */
5465 		    FALSE,                             /* cs_bypass */
5466 		    0,                                 /* user_tag */
5467 		    0,                             /* pmap_options */
5468 		    NULL,                              /* need_retry */
5469 		    &type_of_fault,
5470 		    &object_lock_type);                 /* Exclusive lock mode. Will remain unchanged.*/
5471 
5472 		vm_object_unlock(cpm_obj);
5473 	}
5474 
5475 #if     MACH_ASSERT
5476 	/*
5477 	 *	Verify ordering in address space.
5478 	 */
5479 	for (offset = 0; offset < size; offset += PAGE_SIZE) {
5480 		vm_object_lock(cpm_obj);
5481 		m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5482 		vm_object_unlock(cpm_obj);
5483 		if (m == VM_PAGE_NULL) {
5484 			panic("vm_allocate_cpm:  obj %p off 0x%llx no page",
5485 			    cpm_obj, (uint64_t)offset);
5486 		}
5487 		assert(m->vmp_tabled);
5488 		assert(!m->vmp_busy);
5489 		assert(!m->vmp_wanted);
5490 		assert(!m->vmp_fictitious);
5491 		assert(!m->vmp_private);
5492 		assert(!m->vmp_absent);
5493 		assert(!m->vmp_cleaning);
5494 		assert(!m->vmp_laundry);
5495 		assert(!m->vmp_precious);
5496 		assert(!m->vmp_clustered);
5497 		if (offset != 0) {
5498 			if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
5499 				printf("start 0x%llx end 0x%llx va 0x%llx\n",
5500 				    (uint64_t)start, (uint64_t)end, (uint64_t)va);
5501 				printf("obj %p off 0x%llx\n", cpm_obj, (uint64_t)offset);
5502 				printf("m %p prev_address 0x%llx\n", m, (uint64_t)prev_addr);
5503 				panic("vm_allocate_cpm:  pages not contig!");
5504 			}
5505 		}
5506 		prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
5507 	}
5508 #endif  /* MACH_ASSERT */
5509 
5510 	vm_object_deallocate(cpm_obj); /* kill extra ref */
5511 
5512 	return kr;
5513 }
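
/*
 * Illustrative sketch (only meaningful on a VM_CPM kernel): allocating
 * one megabyte of physically contiguous, wired memory anywhere in "map"
 * and releasing it through the normal path, as the comment above
 * suggests.  The "anywhere" flags initializer is an assumption here:
 *
 *	vm_map_offset_t cpm_addr = 0;
 *	kr = vm_map_enter_cpm(map, &cpm_addr, 1024 * 1024,
 *	    VM_MAP_KERNEL_FLAGS_ANYWHERE());
 *	...
 *	(void) vm_deallocate(map, cpm_addr, 1024 * 1024);
 */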
5514 
5515 
5516 #else   /* VM_CPM */
5517 
5518 /*
5519  *	Interface is defined in all cases, but unless the kernel
5520  *	is built explicitly for this option, the interface does
5521  *	nothing.
5522  */
5523 
5524 kern_return_t
5525 vm_map_enter_cpm(
5526 	__unused vm_map_t                map,
5527 	__unused vm_map_offset_t        *addr,
5528 	__unused vm_map_size_t           size,
5529 	__unused vm_map_kernel_flags_t   vmk_flags)
5530 {
5531 	return KERN_FAILURE;
5532 }
5533 #endif /* VM_CPM */
5534 
5535 /* Not used without nested pmaps */
5536 #ifndef NO_NESTED_PMAP
5537 /*
5538  * Clip and unnest a portion of a nested submap mapping.
5539  */
5540 
5541 
5542 static void
5543 vm_map_clip_unnest(
5544 	vm_map_t        map,
5545 	vm_map_entry_t  entry,
5546 	vm_map_offset_t start_unnest,
5547 	vm_map_offset_t end_unnest)
5548 {
5549 	vm_map_offset_t old_start_unnest = start_unnest;
5550 	vm_map_offset_t old_end_unnest = end_unnest;
5551 
5552 	assert(entry->is_sub_map);
5553 	assert(VME_SUBMAP(entry) != NULL);
5554 	assert(entry->use_pmap);
5555 
5556 	/*
5557 	 * Query the platform for the optimal unnest range.
5558 	 * DRK: There's some duplication of effort here, since
5559 	 * callers may have adjusted the range to some extent. This
5560 	 * routine was introduced to support 1GiB subtree nesting
5561 	 * for x86 platforms, which can also nest on 2MiB boundaries
5562 	 * depending on size/alignment.
5563 	 */
5564 	if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
5565 		assert(VME_SUBMAP(entry)->is_nested_map);
5566 		assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
5567 		log_unnest_badness(map,
5568 		    old_start_unnest,
5569 		    old_end_unnest,
5570 		    VME_SUBMAP(entry)->is_nested_map,
5571 		    (entry->vme_start +
5572 		    VME_SUBMAP(entry)->lowest_unnestable_start -
5573 		    VME_OFFSET(entry)));
5574 	}
5575 
5576 	if (entry->vme_start > start_unnest ||
5577 	    entry->vme_end < end_unnest) {
5578 		panic("vm_map_clip_unnest(0x%llx,0x%llx): "
5579 		    "bad nested entry: start=0x%llx end=0x%llx\n",
5580 		    (long long)start_unnest, (long long)end_unnest,
5581 		    (long long)entry->vme_start, (long long)entry->vme_end);
5582 	}
5583 
5584 	if (start_unnest > entry->vme_start) {
5585 		_vm_map_clip_start(&map->hdr,
5586 		    entry,
5587 		    start_unnest);
5588 		if (map->holelistenabled) {
5589 			vm_map_store_update_first_free(map, NULL, FALSE);
5590 		} else {
5591 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5592 		}
5593 	}
5594 	if (entry->vme_end > end_unnest) {
5595 		_vm_map_clip_end(&map->hdr,
5596 		    entry,
5597 		    end_unnest);
5598 		if (map->holelistenabled) {
5599 			vm_map_store_update_first_free(map, NULL, FALSE);
5600 		} else {
5601 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5602 		}
5603 	}
5604 
5605 	pmap_unnest(map->pmap,
5606 	    entry->vme_start,
5607 	    entry->vme_end - entry->vme_start);
5608 	if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
5609 		/* clean up parent map/maps */
5610 		vm_map_submap_pmap_clean(
5611 			map, entry->vme_start,
5612 			entry->vme_end,
5613 			VME_SUBMAP(entry),
5614 			VME_OFFSET(entry));
5615 	}
5616 	entry->use_pmap = FALSE;
5617 	if ((map->pmap != kernel_pmap) &&
5618 	    (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
5619 		VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
5620 	}
5621 }
5622 #endif  /* NO_NESTED_PMAP */
5623 
5624 __abortlike
5625 static void
5626 __vm_map_clip_atomic_entry_panic(
5627 	vm_map_t        map,
5628 	vm_map_entry_t  entry,
5629 	vm_map_offset_t where)
5630 {
5631 	panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
5632 	    "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
5633 	    (uint64_t)entry->vme_start,
5634 	    (uint64_t)entry->vme_end,
5635 	    (uint64_t)where);
5636 }
5637 
5638 /*
5639  *	vm_map_clip_start:	[ internal use only ]
5640  *
5641  *	Asserts that the given entry begins at or after
5642  *	the specified address; if necessary,
5643  *	it splits the entry into two.
5644  */
5645 void
5646 vm_map_clip_start(
5647 	vm_map_t        map,
5648 	vm_map_entry_t  entry,
5649 	vm_map_offset_t startaddr)
5650 {
5651 #ifndef NO_NESTED_PMAP
5652 	if (entry->is_sub_map &&
5653 	    entry->use_pmap &&
5654 	    startaddr >= entry->vme_start) {
5655 		vm_map_offset_t start_unnest, end_unnest;
5656 
5657 		/*
5658 		 * Make sure "startaddr" is no longer in a nested range
5659 		 * before we clip.  Unnest only the minimum range the platform
5660 		 * can handle.
5661 		 * vm_map_clip_unnest may perform additional adjustments to
5662 		 * the unnest range.
5663 		 */
5664 		start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
5665 		end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
5666 		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5667 	}
5668 #endif /* NO_NESTED_PMAP */
5669 	if (startaddr > entry->vme_start) {
5670 		if (!entry->is_sub_map &&
5671 		    VME_OBJECT(entry) &&
5672 		    VME_OBJECT(entry)->phys_contiguous) {
5673 			pmap_remove(map->pmap,
5674 			    (addr64_t)(entry->vme_start),
5675 			    (addr64_t)(entry->vme_end));
5676 		}
5677 		if (entry->vme_atomic) {
5678 			__vm_map_clip_atomic_entry_panic(map, entry, startaddr);
5679 		}
5680 
5681 		DTRACE_VM5(
5682 			vm_map_clip_start,
5683 			vm_map_t, map,
5684 			vm_map_offset_t, entry->vme_start,
5685 			vm_map_offset_t, entry->vme_end,
5686 			vm_map_offset_t, startaddr,
5687 			int, VME_ALIAS(entry));
5688 
5689 		_vm_map_clip_start(&map->hdr, entry, startaddr);
5690 		if (map->holelistenabled) {
5691 			vm_map_store_update_first_free(map, NULL, FALSE);
5692 		} else {
5693 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5694 		}
5695 	}
5696 }
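
/*
 * Illustrative caller pattern (mirroring vm_map_protect() and
 * vm_map_inherit() below): with the map locked, clip the boundary
 * entries so they exactly cover [start, end) before mutating them:
 *
 *	vm_map_clip_start(map, entry, start);
 *	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
 *		vm_map_clip_end(map, entry, end);
 *		... update this entry ...
 *		entry = entry->vme_next;
 *	}
 */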
5697 
5698 
5699 #define vm_map_copy_clip_start(copy, entry, startaddr) \
5700 	MACRO_BEGIN \
5701 	if ((startaddr) > (entry)->vme_start) \
5702 	        _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
5703 	MACRO_END
5704 
5705 /*
5706  *	This routine is called only when it is known that
5707  *	the entry must be split.
5708  */
5709 static void
5710 _vm_map_clip_start(
5711 	struct vm_map_header    *map_header,
5712 	vm_map_entry_t          entry,
5713 	vm_map_offset_t         start)
5714 {
5715 	vm_map_entry_t  new_entry;
5716 
5717 	/*
5718 	 *	Split off the front portion --
5719 	 *	note that we must insert the new
5720 	 *	entry BEFORE this one, so that
5721 	 *	this entry has the specified starting
5722 	 *	address.
5723 	 */
5724 
5725 	if (entry->map_aligned) {
5726 		assert(VM_MAP_PAGE_ALIGNED(start,
5727 		    VM_MAP_HDR_PAGE_MASK(map_header)));
5728 	}
5729 
5730 	new_entry = _vm_map_entry_create(map_header);
5731 	vm_map_entry_copy_full(new_entry, entry);
5732 
5733 	new_entry->vme_end = start;
5734 	assert(new_entry->vme_start < new_entry->vme_end);
5735 	VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
5736 	if (__improbable(start >= entry->vme_end)) {
5737 		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start);
5738 	}
5739 	assert(start < entry->vme_end);
5740 	entry->vme_start = start;
5741 
5742 #if VM_BTLOG_TAGS
5743 	if (new_entry->vme_kernel_object) {
5744 		btref_retain(new_entry->vme_tag_btref);
5745 	}
5746 #endif /* VM_BTLOG_TAGS */
5747 
5748 	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);
5749 
5750 	if (entry->is_sub_map) {
5751 		vm_map_reference(VME_SUBMAP(new_entry));
5752 	} else {
5753 		vm_object_reference(VME_OBJECT(new_entry));
5754 	}
5755 }
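
/*
 * Worked example of the split above: clipping an entry that covers
 * [0x1000, 0x5000) with object offset 0 at start == 0x3000 inserts a
 * new entry for [0x1000, 0x3000) at offset 0, while the original entry
 * becomes [0x3000, 0x5000) with its offset advanced by
 * (0x3000 - 0x1000) to 0x2000.  Each resulting entry holds its own
 * reference on the submap or VM object.
 */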
5756 
5757 
5758 /*
5759  *	vm_map_clip_end:	[ internal use only ]
5760  *
5761  *	Asserts that the given entry ends at or before
5762  *	the specified address; if necessary,
5763  *	it splits the entry into two.
5764  */
5765 void
5766 vm_map_clip_end(
5767 	vm_map_t        map,
5768 	vm_map_entry_t  entry,
5769 	vm_map_offset_t endaddr)
5770 {
5771 	if (endaddr > entry->vme_end) {
5772 		/*
5773 		 * Within the scope of this clipping, limit "endaddr" to
5774 		 * the end of this map entry...
5775 		 */
5776 		endaddr = entry->vme_end;
5777 	}
5778 #ifndef NO_NESTED_PMAP
5779 	if (entry->is_sub_map && entry->use_pmap) {
5780 		vm_map_offset_t start_unnest, end_unnest;
5781 
5782 		/*
5783 		 * Make sure the range between the start of this entry and
5784 		 * the new "endaddr" is no longer nested before we clip.
5785 		 * Unnest only the minimum range the platform can handle.
5786 		 * vm_map_clip_unnest may perform additional adjustments to
5787 		 * the unnest range.
5788 		 */
5789 		start_unnest = entry->vme_start;
5790 		end_unnest =
5791 		    (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
5792 		    ~(pmap_shared_region_size_min(map->pmap) - 1);
5793 		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5794 	}
5795 #endif /* NO_NESTED_PMAP */
5796 	if (endaddr < entry->vme_end) {
5797 		if (!entry->is_sub_map &&
5798 		    VME_OBJECT(entry) &&
5799 		    VME_OBJECT(entry)->phys_contiguous) {
5800 			pmap_remove(map->pmap,
5801 			    (addr64_t)(entry->vme_start),
5802 			    (addr64_t)(entry->vme_end));
5803 		}
5804 		if (entry->vme_atomic) {
5805 			__vm_map_clip_atomic_entry_panic(map, entry, endaddr);
5806 		}
5807 		DTRACE_VM5(
5808 			vm_map_clip_end,
5809 			vm_map_t, map,
5810 			vm_map_offset_t, entry->vme_start,
5811 			vm_map_offset_t, entry->vme_end,
5812 			vm_map_offset_t, endaddr,
5813 			int, VME_ALIAS(entry));
5814 
5815 		_vm_map_clip_end(&map->hdr, entry, endaddr);
5816 		if (map->holelistenabled) {
5817 			vm_map_store_update_first_free(map, NULL, FALSE);
5818 		} else {
5819 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5820 		}
5821 	}
5822 }
5823 
5824 
5825 #define vm_map_copy_clip_end(copy, entry, endaddr) \
5826 	MACRO_BEGIN \
5827 	if ((endaddr) < (entry)->vme_end) \
5828 	        _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
5829 	MACRO_END
5830 
5831 /*
5832  *	This routine is called only when it is known that
5833  *	the entry must be split.
5834  */
5835 static void
5836 _vm_map_clip_end(
5837 	struct vm_map_header    *map_header,
5838 	vm_map_entry_t          entry,
5839 	vm_map_offset_t         end)
5840 {
5841 	vm_map_entry_t  new_entry;
5842 
5843 	/*
5844 	 *	Create a new entry and insert it
5845 	 *	AFTER the specified entry
5846 	 */
5847 
5848 	if (entry->map_aligned) {
5849 		assert(VM_MAP_PAGE_ALIGNED(end,
5850 		    VM_MAP_HDR_PAGE_MASK(map_header)));
5851 	}
5852 
5853 	new_entry = _vm_map_entry_create(map_header);
5854 	vm_map_entry_copy_full(new_entry, entry);
5855 
5856 	if (__improbable(end <= entry->vme_start)) {
5857 		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end);
5858 	}
5859 	assert(entry->vme_start < end);
5860 	new_entry->vme_start = entry->vme_end = end;
5861 	VME_OFFSET_SET(new_entry,
5862 	    VME_OFFSET(new_entry) + (end - entry->vme_start));
5863 	assert(new_entry->vme_start < new_entry->vme_end);
5864 
5865 #if VM_BTLOG_TAGS
5866 	if (new_entry->vme_kernel_object) {
5867 		btref_retain(new_entry->vme_tag_btref);
5868 	}
5869 #endif /* VM_BTLOG_TAGS */
5870 
5871 	_vm_map_store_entry_link(map_header, entry, new_entry);
5872 
5873 	if (entry->is_sub_map) {
5874 		vm_map_reference(VME_SUBMAP(new_entry));
5875 	} else {
5876 		vm_object_reference(VME_OBJECT(new_entry));
5877 	}
5878 }
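
/*
 * Worked example of the split above: clipping an entry that covers
 * [0x1000, 0x5000) with object offset 0 at end == 0x3000 shrinks the
 * original entry to [0x1000, 0x3000) and links a new entry for
 * [0x3000, 0x5000) after it, with the new entry's offset advanced by
 * (0x3000 - 0x1000) to 0x2000 and an extra reference taken on the
 * submap or VM object.
 */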
5879 
5880 
5881 /*
5882  *	VM_MAP_RANGE_CHECK:	[ internal use only ]
5883  *
5884  *	Asserts that the starting and ending region
5885  *	addresses fall within the valid range of the map.
5886  */
5887 #define VM_MAP_RANGE_CHECK(map, start, end)     \
5888 	MACRO_BEGIN                             \
5889 	if (start < vm_map_min(map))            \
5890 	        start = vm_map_min(map);        \
5891 	if (end > vm_map_max(map))              \
5892 	        end = vm_map_max(map);          \
5893 	if (start > end)                        \
5894 	        start = end;                    \
5895 	MACRO_END
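
/*
 * Worked example: for a map whose valid range is [0x100000, 0xF00000),
 * VM_MAP_RANGE_CHECK(map, start, end) clamps start == 0x1000 up to
 * 0x100000 and end == 0xFFF0000 down to 0xF00000; if the clamped start
 * ends up beyond the clamped end, start is pulled down to end so the
 * caller sees an empty range rather than an inverted one.
 */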
5896 
5897 /*
5898  *	vm_map_range_check:	[ internal use only ]
5899  *
5900  *	Check that the region defined by the specified start and
5901  *	end addresses is wholly contained within a single map
5902  *	entry or set of adjacent map entries of the specified map,
5903  *	i.e. the specified region contains no unmapped space.
5904  *	If any or all of the region is unmapped, FALSE is returned.
5905  *	Otherwise, TRUE is returned and if the output argument 'entry'
5906  *	is not NULL it points to the map entry containing the start
5907  *	of the region.
5908  *
5909  *	The map is locked for reading on entry and is left locked.
5910  */
5911 static boolean_t
5912 vm_map_range_check(
5913 	vm_map_t                map,
5914 	vm_map_offset_t         start,
5915 	vm_map_offset_t         end,
5916 	vm_map_entry_t          *entry)
5917 {
5918 	vm_map_entry_t          cur;
5919 	vm_map_offset_t         prev;
5920 
5921 	/*
5922 	 *      Basic sanity checks first
5923 	 */
5924 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5925 		return FALSE;
5926 	}
5927 
5928 	/*
5929 	 *      Check first if the region starts within a valid
5930 	 *	mapping for the map.
5931 	 */
5932 	if (!vm_map_lookup_entry(map, start, &cur)) {
5933 		return FALSE;
5934 	}
5935 
5936 	/*
5937 	 *	Optimize for the case that the region is contained
5938 	 *	in a single map entry.
5939 	 */
5940 	if (entry != (vm_map_entry_t *) NULL) {
5941 		*entry = cur;
5942 	}
5943 	if (end <= cur->vme_end) {
5944 		return TRUE;
5945 	}
5946 
5947 	/*
5948 	 *      If the region is not wholly contained within a
5949 	 *      single entry, walk the entries looking for holes.
5950 	 */
5951 	prev = cur->vme_end;
5952 	cur = cur->vme_next;
5953 	while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5954 		if (end <= cur->vme_end) {
5955 			return TRUE;
5956 		}
5957 		prev = cur->vme_end;
5958 		cur = cur->vme_next;
5959 	}
5960 	return FALSE;
5961 }
5962 
5963 /*
5964  *	vm_map_protect:
5965  *
5966  *	Sets the protection of the specified address
5967  *	region in the target map.  If "set_max" is
5968  *	specified, the maximum protection is to be set;
5969  *	otherwise, only the current protection is affected.
5970  */
5971 kern_return_t
5972 vm_map_protect(
5973 	vm_map_t        map,
5974 	vm_map_offset_t start,
5975 	vm_map_offset_t end,
5976 	vm_prot_t       new_prot,
5977 	boolean_t       set_max)
5978 {
5979 	vm_map_entry_t                  current;
5980 	vm_map_offset_t                 prev;
5981 	vm_map_entry_t                  entry;
5982 	vm_prot_t                       new_max;
5983 	int                             pmap_options = 0;
5984 	kern_return_t                   kr;
5985 
5986 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
5987 		return KERN_INVALID_ARGUMENT;
5988 	}
5989 
5990 	if (new_prot & VM_PROT_COPY) {
5991 		vm_map_offset_t         new_start;
5992 		vm_prot_t               cur_prot, max_prot;
5993 		vm_map_kernel_flags_t   kflags;
5994 
5995 		/* LP64todo - see below */
5996 		if (start >= map->max_offset) {
5997 			return KERN_INVALID_ADDRESS;
5998 		}
5999 
6000 		if ((new_prot & VM_PROT_ALLEXEC) &&
6001 		    map->pmap != kernel_pmap &&
6002 		    (vm_map_cs_enforcement(map)
6003 #if XNU_TARGET_OS_OSX && __arm64__
6004 		    || !VM_MAP_IS_EXOTIC(map)
6005 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
6006 		    ) &&
6007 		    VM_MAP_POLICY_WX_FAIL(map)) {
6008 			DTRACE_VM3(cs_wx,
6009 			    uint64_t, (uint64_t) start,
6010 			    uint64_t, (uint64_t) end,
6011 			    vm_prot_t, new_prot);
6012 			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
6013 			    proc_selfpid(),
6014 			    (get_bsdtask_info(current_task())
6015 			    ? proc_name_address(get_bsdtask_info(current_task()))
6016 			    : "?"),
6017 			    __FUNCTION__, __LINE__,
6018 #if DEVELOPMENT || DEBUG
6019 			    (uint64_t)start,
6020 			    (uint64_t)end,
6021 #else /* DEVELOPMENT || DEBUG */
6022 			    (uint64_t)0,
6023 			    (uint64_t)0,
6024 #endif /* DEVELOPMENT || DEBUG */
6025 			    new_prot);
6026 			return KERN_PROTECTION_FAILURE;
6027 		}
6028 
6029 		/*
6030 		 * Let vm_map_remap_extract() know that it will need to:
6031 		 * + make a copy of the mapping
6032 		 * + add VM_PROT_WRITE to the max protections
6033 		 * + remove any protections that are no longer allowed from the
6034 		 *   max protections (to avoid any WRITE/EXECUTE conflict, for
6035 		 *   example).
6036 		 * Note that "max_prot" is an IN/OUT parameter only for this
6037 		 * specific (VM_PROT_COPY) case.  It's usually an OUT parameter
6038 		 * only.
6039 		 */
6040 		max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
6041 		cur_prot = VM_PROT_NONE;
6042 		kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
6043 		kflags.vmkf_remap_prot_copy = true;
6044 		kflags.vmkf_tpro_enforcement_override = !vm_map_tpro_enforcement(map);
6045 		new_start = start;
6046 		kr = vm_map_remap(map,
6047 		    &new_start,
6048 		    end - start,
6049 		    0, /* mask */
6050 		    kflags,
6051 		    map,
6052 		    start,
6053 		    TRUE, /* copy-on-write remapping! */
6054 		    &cur_prot, /* IN/OUT */
6055 		    &max_prot, /* IN/OUT */
6056 		    VM_INHERIT_DEFAULT);
6057 		if (kr != KERN_SUCCESS) {
6058 			return kr;
6059 		}
6060 		new_prot &= ~VM_PROT_COPY;
6061 	}
6062 
6063 	vm_map_lock(map);
6064 
6065 	/* LP64todo - remove this check when vm_map_commpage64()
6066 	 * no longer has to stuff in a map_entry for the commpage
6067 	 * above the map's max_offset.
6068 	 */
6069 	if (start >= map->max_offset) {
6070 		vm_map_unlock(map);
6071 		return KERN_INVALID_ADDRESS;
6072 	}
6073 
6074 	while (1) {
6075 		/*
6076 		 *      Lookup the entry.  If it doesn't start in a valid
6077 		 *	entry, return an error.
6078 		 */
6079 		if (!vm_map_lookup_entry(map, start, &entry)) {
6080 			vm_map_unlock(map);
6081 			return KERN_INVALID_ADDRESS;
6082 		}
6083 
6084 		if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
6085 			start = SUPERPAGE_ROUND_DOWN(start);
6086 			continue;
6087 		}
6088 		break;
6089 	}
6090 	if (entry->superpage_size) {
6091 		end = SUPERPAGE_ROUND_UP(end);
6092 	}
6093 
6094 	/*
6095 	 *	Make a first pass to check for protection and address
6096 	 *	violations.
6097 	 */
6098 
6099 	current = entry;
6100 	prev = current->vme_start;
6101 	while ((current != vm_map_to_entry(map)) &&
6102 	    (current->vme_start < end)) {
6103 		/*
6104 		 * If there is a hole, return an error.
6105 		 */
6106 		if (current->vme_start != prev) {
6107 			vm_map_unlock(map);
6108 			return KERN_INVALID_ADDRESS;
6109 		}
6110 
6111 		new_max = current->max_protection;
6112 
6113 #if defined(__x86_64__)
6114 		/* Allow max mask to include execute prot bits if this map doesn't enforce CS */
6115 		if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
6116 			new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
6117 		}
6118 #elif CODE_SIGNING_MONITOR
6119 		if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
6120 			new_max |= VM_PROT_EXECUTE;
6121 		}
6122 #endif
6123 		if ((new_prot & new_max) != new_prot) {
6124 			vm_map_unlock(map);
6125 			return KERN_PROTECTION_FAILURE;
6126 		}
6127 
6128 		if (current->used_for_jit &&
6129 		    pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
6130 			vm_map_unlock(map);
6131 			return KERN_PROTECTION_FAILURE;
6132 		}
6133 
6134 #if __arm64e__
6135 		/* Disallow remapping hw assisted TPRO mappings */
6136 		if (current->used_for_tpro) {
6137 			vm_map_unlock(map);
6138 			return KERN_PROTECTION_FAILURE;
6139 		}
6140 #endif /* __arm64e__ */
6141 
6142 
6143 		if ((new_prot & VM_PROT_WRITE) &&
6144 		    (new_prot & VM_PROT_ALLEXEC) &&
6145 #if XNU_TARGET_OS_OSX
6146 		    map->pmap != kernel_pmap &&
6147 		    (vm_map_cs_enforcement(map)
6148 #if __arm64__
6149 		    || !VM_MAP_IS_EXOTIC(map)
6150 #endif /* __arm64__ */
6151 		    ) &&
6152 #endif /* XNU_TARGET_OS_OSX */
6153 #if CODE_SIGNING_MONITOR
6154 		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
6155 #endif
6156 		    !(current->used_for_jit)) {
6157 			DTRACE_VM3(cs_wx,
6158 			    uint64_t, (uint64_t) current->vme_start,
6159 			    uint64_t, (uint64_t) current->vme_end,
6160 			    vm_prot_t, new_prot);
6161 			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
6162 			    proc_selfpid(),
6163 			    (get_bsdtask_info(current_task())
6164 			    ? proc_name_address(get_bsdtask_info(current_task()))
6165 			    : "?"),
6166 			    __FUNCTION__, __LINE__,
6167 #if DEVELOPMENT || DEBUG
6168 			    (uint64_t)current->vme_start,
6169 			    (uint64_t)current->vme_end,
6170 #else /* DEVELOPMENT || DEBUG */
6171 			    (uint64_t)0,
6172 			    (uint64_t)0,
6173 #endif /* DEVELOPMENT || DEBUG */
6174 			    new_prot);
6175 			new_prot &= ~VM_PROT_ALLEXEC;
6176 			if (VM_MAP_POLICY_WX_FAIL(map)) {
6177 				vm_map_unlock(map);
6178 				return KERN_PROTECTION_FAILURE;
6179 			}
6180 		}
6181 
6182 		/*
6183 		 * If the task has requested executable lockdown,
6184 		 * deny both:
6185 		 * - adding executable protections OR
6186 		 * - adding write protections to an existing executable mapping.
6187 		 */
6188 		if (map->map_disallow_new_exec == TRUE) {
6189 			if ((new_prot & VM_PROT_ALLEXEC) ||
6190 			    ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
6191 				vm_map_unlock(map);
6192 				return KERN_PROTECTION_FAILURE;
6193 			}
6194 		}
6195 
6196 		prev = current->vme_end;
6197 		current = current->vme_next;
6198 	}
6199 
6200 #if __arm64__
6201 	if (end > prev &&
6202 	    end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
6203 		vm_map_entry_t prev_entry;
6204 
6205 		prev_entry = current->vme_prev;
6206 		if (prev_entry != vm_map_to_entry(map) &&
6207 		    !prev_entry->map_aligned &&
6208 		    (vm_map_round_page(prev_entry->vme_end,
6209 		    VM_MAP_PAGE_MASK(map))
6210 		    == end)) {
6211 			/*
6212 			 * The last entry in our range is not "map-aligned"
6213 			 * but it would have reached all the way to "end"
6214 			 * if it had been map-aligned, so this is not really
6215 			 * a hole in the range and we can proceed.
6216 			 */
6217 			prev = end;
6218 		}
6219 	}
6220 #endif /* __arm64__ */
6221 
6222 	if (end > prev) {
6223 		vm_map_unlock(map);
6224 		return KERN_INVALID_ADDRESS;
6225 	}
6226 
6227 	/*
6228 	 *	Go back and fix up protections.
6229 	 *	Clip to start here if the range starts within
6230 	 *	the entry.
6231 	 */
6232 
6233 	current = entry;
6234 	if (current != vm_map_to_entry(map)) {
6235 		/* clip and unnest if necessary */
6236 		vm_map_clip_start(map, current, start);
6237 	}
6238 
6239 	while ((current != vm_map_to_entry(map)) &&
6240 	    (current->vme_start < end)) {
6241 		vm_prot_t       old_prot;
6242 
6243 		vm_map_clip_end(map, current, end);
6244 
6245 #if DEVELOPMENT || DEBUG
6246 		if (current->csm_associated && vm_log_xnu_user_debug) {
6247 			printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
6248 			    proc_selfpid(),
6249 			    (get_bsdtask_info(current_task())
6250 			    ? proc_name_address(get_bsdtask_info(current_task()))
6251 			    : "?"),
6252 			    __FUNCTION__,
6253 			    (uint64_t)start,
6254 			    (uint64_t)end,
6255 			    new_prot,
6256 			    map, current,
6257 			    current->vme_start,
6258 			    current->vme_end,
6259 			    current->protection,
6260 			    current->max_protection);
6261 		}
6262 #endif /* DEVELOPMENT || DEBUG */
6263 
6264 		if (current->is_sub_map) {
6265 			/* clipping did unnest if needed */
6266 			assert(!current->use_pmap);
6267 		}
6268 
6269 		old_prot = current->protection;
6270 
6271 		if (set_max) {
6272 			current->max_protection = new_prot;
6273 			/* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
6274 			current->protection = (new_prot & old_prot);
6275 		} else {
6276 			current->protection = new_prot;
6277 		}
6278 
6279 #if CODE_SIGNING_MONITOR
6280 		if (!current->vme_xnu_user_debug &&
6281 		    /* a !csm_associated mapping becoming executable */
6282 		    ((!current->csm_associated &&
6283 		    !(old_prot & VM_PROT_EXECUTE) &&
6284 		    (current->protection & VM_PROT_EXECUTE))
6285 		    ||
6286 		    /* a csm_associated mapping becoming writable */
6287 		    (current->csm_associated &&
6288 		    !(old_prot & VM_PROT_WRITE) &&
6289 		    (current->protection & VM_PROT_WRITE)))) {
6290 			/*
6291 			 * This mapping has not already been marked as
6292 			 * "user_debug" and it is either:
6293 			 * 1. not code-signing-monitored and becoming executable
6294 			 * 2. code-signing-monitored and becoming writable,
6295 			 * so inform the CodeSigningMonitor and mark the
6296 			 * mapping as "user_debug" if appropriate.
6297 			 */
6298 			vm_map_kernel_flags_t vmk_flags;
6299 			vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
6300 			/* pretend it's a vm_protect(VM_PROT_COPY)... */
6301 			vmk_flags.vmkf_remap_prot_copy = true;
6302 			kr = vm_map_entry_cs_associate(map, current, vmk_flags);
6303 #if DEVELOPMENT || DEBUG
6304 			if (vm_log_xnu_user_debug) {
6305 				printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
6306 				    proc_selfpid(),
6307 				    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
6308 				    __FUNCTION__, __LINE__,
6309 				    map, current,
6310 				    current->vme_start, current->vme_end,
6311 				    old_prot, current->protection,
6312 				    kr, current->vme_xnu_user_debug);
6313 			}
6314 #endif /* DEVELOPMENT || DEBUG */
6315 		}
6316 #endif /* CODE_SIGNING_MONITOR */
6317 
6318 		/*
6319 		 *	Update physical map if necessary.
6320 		 *	If the request is to turn off write protection,
6321 		 *	we won't do it for real (in pmap). This is because
6322 		 *	it would cause copy-on-write to fail.  We've already
6323 		 *	set the new protection in the map, so if a
6324 		 *	write-protect fault occurred, it will be fixed up
6325 		 *	properly, COW or not.
6326 		 */
6327 		if (current->protection != old_prot) {
6328 			/* Look one level in, since we support nested pmaps */
6329 			/* from mapped submaps which are direct entries */
6330 			/* in our map */
6331 
6332 			vm_prot_t prot;
6333 
6334 			prot = current->protection;
6335 			if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
6336 				prot &= ~VM_PROT_WRITE;
6337 			} else {
6338 				assert(!VME_OBJECT(current)->code_signed);
6339 				assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6340 				if (prot & VM_PROT_WRITE) {
6341 					/*
6342 					 * For write requests on the
6343 					 * compressor, we will ask the
6344 					 * pmap layer to prevent us from
6345 					 * taking a write fault when we
6346 					 * attempt to access the mapping
6347 					 * next.
6348 					 */
6349 					pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
6350 				}
6351 			}
6352 
6353 			if (override_nx(map, VME_ALIAS(current)) && prot) {
6354 				prot |= VM_PROT_EXECUTE;
6355 			}
6356 
6357 #if DEVELOPMENT || DEBUG
6358 			if (!(old_prot & VM_PROT_EXECUTE) &&
6359 			    (prot & VM_PROT_EXECUTE) &&
6360 			    panic_on_unsigned_execute &&
6361 			    (proc_selfcsflags() & CS_KILL)) {
6362 				panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
6363 			}
6364 #endif /* DEVELOPMENT || DEBUG */
6365 
6366 			if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
6367 				if (current->wired_count) {
6368 					panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
6369 					    map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
6370 				}
6371 
6372 				/* If the pmap layer cares about this
6373 				 * protection type, force a fault for
6374 				 * each page so that vm_fault will
6375 				 * repopulate the page with the full
6376 				 * set of protections.
6377 				 */
6378 				/*
6379 				 * TODO: We don't seem to need this,
6380 				 * but this is due to an internal
6381 				 * implementation detail of
6382 				 * pmap_protect.  Do we want to rely
6383 				 * on this?
6384 				 */
6385 				prot = VM_PROT_NONE;
6386 			}
6387 
6388 			if (current->is_sub_map && current->use_pmap) {
6389 				pmap_protect(VME_SUBMAP(current)->pmap,
6390 				    current->vme_start,
6391 				    current->vme_end,
6392 				    prot);
6393 			} else {
6394 				pmap_protect_options(map->pmap,
6395 				    current->vme_start,
6396 				    current->vme_end,
6397 				    prot,
6398 				    pmap_options,
6399 				    NULL);
6400 			}
6401 		}
6402 		current = current->vme_next;
6403 	}
6404 
6405 	current = entry;
6406 	while ((current != vm_map_to_entry(map)) &&
6407 	    (current->vme_start <= end)) {
6408 		vm_map_simplify_entry(map, current);
6409 		current = current->vme_next;
6410 	}
6411 
6412 	vm_map_unlock(map);
6413 	return KERN_SUCCESS;
6414 }
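
/*
 * Illustrative sketch: a caller that wants to make [start, end) in
 * "map" read-only, and additionally prevent it from ever becoming
 * writable again, could lower the current protection first and then
 * the maximum protection:
 *
 *	kr = vm_map_protect(map, start, end, VM_PROT_READ, FALSE);
 *	if (kr == KERN_SUCCESS) {
 *		kr = vm_map_protect(map, start, end, VM_PROT_READ, TRUE);
 *	}
 */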
6415 
6416 /*
6417  *	vm_map_inherit:
6418  *
6419  *	Sets the inheritance of the specified address
6420  *	range in the target map.  Inheritance
6421  *	affects how the map will be shared with
6422  *	child maps at the time of vm_map_fork.
6423  */
6424 kern_return_t
6425 vm_map_inherit(
6426 	vm_map_t        map,
6427 	vm_map_offset_t start,
6428 	vm_map_offset_t end,
6429 	vm_inherit_t    new_inheritance)
6430 {
6431 	vm_map_entry_t  entry;
6432 	vm_map_entry_t  temp_entry;
6433 
6434 	vm_map_lock(map);
6435 
6436 	VM_MAP_RANGE_CHECK(map, start, end);
6437 
6438 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
6439 		vm_map_unlock(map);
6440 		return KERN_INVALID_ADDRESS;
6441 	}
6442 
6443 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
6444 		entry = temp_entry;
6445 	} else {
6446 		temp_entry = temp_entry->vme_next;
6447 		entry = temp_entry;
6448 	}
6449 
6450 	/* first check entire range for submaps which can't support the */
6451 	/* given inheritance. */
6452 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6453 		if (entry->is_sub_map) {
6454 			if (new_inheritance == VM_INHERIT_COPY) {
6455 				vm_map_unlock(map);
6456 				return KERN_INVALID_ARGUMENT;
6457 			}
6458 		}
6459 
6460 		entry = entry->vme_next;
6461 	}
6462 
6463 	entry = temp_entry;
6464 	if (entry != vm_map_to_entry(map)) {
6465 		/* clip and unnest if necessary */
6466 		vm_map_clip_start(map, entry, start);
6467 	}
6468 
6469 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6470 		vm_map_clip_end(map, entry, end);
6471 		if (entry->is_sub_map) {
6472 			/* clip did unnest if needed */
6473 			assert(!entry->use_pmap);
6474 		}
6475 
6476 		entry->inheritance = new_inheritance;
6477 
6478 		entry = entry->vme_next;
6479 	}
6480 
6481 	vm_map_unlock(map);
6482 	return KERN_SUCCESS;
6483 }
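
/*
 * Illustrative sketch: marking a range so that a child map created by
 * vm_map_fork() does not receive it at all:
 *
 *	kr = vm_map_inherit(map, start, end, VM_INHERIT_NONE);
 *
 * As the first loop above shows, VM_INHERIT_COPY is rejected for ranges
 * that contain submap entries.
 */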
6484 
6485 /*
6486  * Update the accounting for the amount of wired memory in this map.  If the user has
6487  * exceeded the defined limits, then we fail.  Wiring on behalf of the kernel never fails.
6488  */
6489 
6490 static kern_return_t
6491 add_wire_counts(
6492 	vm_map_t        map,
6493 	vm_map_entry_t  entry,
6494 	boolean_t       user_wire)
6495 {
6496 	vm_map_size_t   size;
6497 
6498 	bool first_wire = entry->wired_count == 0 && entry->user_wired_count == 0;
6499 
6500 	if (user_wire) {
6501 		unsigned int total_wire_count =  vm_page_wire_count + vm_lopage_free_count;
6502 
6503 		/*
6504 		 * We're wiring memory at the request of the user.  Check if this is the first time the user is wiring
6505 		 * this map entry.
6506 		 */
6507 
6508 		if (entry->user_wired_count == 0) {
6509 			size = entry->vme_end - entry->vme_start;
6510 
6511 			/*
6512 			 * Since this is the first time the user is wiring this map entry, check to see if we're
6513 			 * exceeding the user wire limits.  There is a per map limit which is the smaller of either
6514 			 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value.  There is also
6515 			 * a system-wide limit on the amount of memory all users can wire.  If the user is over either
6516 			 * limit, then we fail.
6517 			 */
6518 
6519 			if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6520 			    size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6521 				if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6522 #if DEVELOPMENT || DEBUG
6523 					if (panic_on_mlock_failure) {
6524 						panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6525 					}
6526 #endif /* DEVELOPMENT || DEBUG */
6527 					os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6528 				} else {
6529 					os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6530 #if DEVELOPMENT || DEBUG
6531 					if (panic_on_mlock_failure) {
6532 						panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6533 					}
6534 #endif /* DEVELOPMENT || DEBUG */
6535 				}
6536 				return KERN_RESOURCE_SHORTAGE;
6537 			}
6538 
6539 			/*
6540 			 * The first time the user wires an entry, we also increment the wired_count and add this to
6541 			 * the total that has been wired in the map.
6542 			 */
6543 
6544 			if (entry->wired_count >= MAX_WIRE_COUNT) {
6545 				return KERN_FAILURE;
6546 			}
6547 
6548 			entry->wired_count++;
6549 			map->user_wire_size += size;
6550 		}
6551 
6552 		if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6553 			return KERN_FAILURE;
6554 		}
6555 
6556 		entry->user_wired_count++;
6557 	} else {
6558 		/*
6559 		 * The kernel's wiring the memory.  Just bump the count and continue.
6560 		 */
6561 
6562 		if (entry->wired_count >= MAX_WIRE_COUNT) {
6563 			panic("vm_map_wire: too many wirings");
6564 		}
6565 
6566 		entry->wired_count++;
6567 	}
6568 
6569 	if (first_wire) {
6570 		vme_btref_consider_and_set(entry, __builtin_frame_address(0));
6571 	}
6572 
6573 	return KERN_SUCCESS;
6574 }
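
/*
 * Worked example of the user-wire limit check above (numbers are
 * hypothetical): with an effective per-map limit of 64 MB (the smaller
 * of the process rlimit and vm_per_task_user_wire_limit), a map that
 * already accounts for 60 MB of user-wired memory fails a new 8 MB
 * request with KERN_RESOURCE_SHORTAGE, since 60 MB + 8 MB exceeds the
 * limit.  The global check against vm_global_user_wire_limit works the
 * same way, but over all wired pages in the system.
 */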
6575 
6576 /*
6577  * Update the memory wiring accounting now that the given map entry is being unwired.
6578  */
6579 
6580 static void
6581 subtract_wire_counts(
6582 	vm_map_t        map,
6583 	vm_map_entry_t  entry,
6584 	boolean_t       user_wire)
6585 {
6586 	if (user_wire) {
6587 		/*
6588 		 * We're unwiring memory at the request of the user.  See if we're removing the last user wire reference.
6589 		 */
6590 
6591 		if (entry->user_wired_count == 1) {
6592 			/*
6593 			 * We're removing the last user wire reference.  Decrement the wired_count and the total
6594 			 * user wired memory for this map.
6595 			 */
6596 
6597 			assert(entry->wired_count >= 1);
6598 			entry->wired_count--;
6599 			map->user_wire_size -= entry->vme_end - entry->vme_start;
6600 		}
6601 
6602 		assert(entry->user_wired_count >= 1);
6603 		entry->user_wired_count--;
6604 	} else {
6605 		/*
6606 		 * The kernel is unwiring the memory.   Just update the count.
6607 		 */
6608 
6609 		assert(entry->wired_count >= 1);
6610 		entry->wired_count--;
6611 	}
6612 
6613 	vme_btref_consider_and_put(entry);
6614 }
6615 
6616 int cs_executable_wire = 0;
6617 
6618 /*
6619  *	vm_map_wire:
6620  *
6621  *	Sets the pageability of the specified address range in the
6622  *	target map as wired.  Regions specified as not pageable require
6623  *	locked-down physical memory and physical page maps.  The
6624  *	access_type variable indicates types of accesses that must not
6625  *	generate page faults.  This is checked against protection of
6626  *	memory being locked-down.
6627  *
6628  *	The map must not be locked, but a reference must remain to the
6629  *	map throughout the call.
6630  */
6631 static kern_return_t
6632 vm_map_wire_nested(
6633 	vm_map_t                map,
6634 	vm_map_offset_t         start,
6635 	vm_map_offset_t         end,
6636 	vm_prot_t               caller_prot,
6637 	vm_tag_t                tag,
6638 	boolean_t               user_wire,
6639 	pmap_t                  map_pmap,
6640 	vm_map_offset_t         pmap_addr,
6641 	ppnum_t                 *physpage_p)
6642 {
6643 	vm_map_entry_t          entry;
6644 	vm_prot_t               access_type;
6645 	struct vm_map_entry     *first_entry, tmp_entry;
6646 	vm_map_t                real_map;
6647 	vm_map_offset_t         s, e;
6648 	kern_return_t           rc;
6649 	boolean_t               need_wakeup;
6650 	boolean_t               main_map = FALSE;
6651 	wait_interrupt_t        interruptible_state;
6652 	thread_t                cur_thread;
6653 	unsigned int            last_timestamp;
6654 	vm_map_size_t           size;
6655 	boolean_t               wire_and_extract;
6656 	vm_prot_t               extra_prots;
6657 
6658 	extra_prots = VM_PROT_COPY;
6659 	extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6660 #if XNU_TARGET_OS_OSX
6661 	if (map->pmap == kernel_pmap ||
6662 	    !vm_map_cs_enforcement(map)) {
6663 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6664 	}
6665 #endif /* XNU_TARGET_OS_OSX */
6666 #if CODE_SIGNING_MONITOR
6667 	if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
6668 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6669 	}
6670 #endif /* CODE_SIGNING_MONITOR */
6671 
6672 	access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6673 
6674 	wire_and_extract = FALSE;
6675 	if (physpage_p != NULL) {
6676 		/*
6677 		 * The caller wants the physical page number of the
6678 		 * wired page.  We return only one physical page number
6679 		 * so this works for only one page at a time.
6680 		 */
6681 		if ((end - start) != PAGE_SIZE) {
6682 			return KERN_INVALID_ARGUMENT;
6683 		}
6684 		wire_and_extract = TRUE;
6685 		*physpage_p = 0;
6686 	}
6687 
6688 	vm_map_lock(map);
6689 	if (map_pmap == NULL) {
6690 		main_map = TRUE;
6691 	}
6692 	last_timestamp = map->timestamp;
6693 
6694 	VM_MAP_RANGE_CHECK(map, start, end);
6695 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6696 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6697 
6698 	if (start == end) {
6699 		/* We wired what the caller asked for, zero pages */
6700 		vm_map_unlock(map);
6701 		return KERN_SUCCESS;
6702 	}
6703 
6704 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
6705 		vm_map_unlock(map);
6706 		return KERN_INVALID_ADDRESS;
6707 	}
6708 
6709 	need_wakeup = FALSE;
6710 	cur_thread = current_thread();
6711 
6712 	s = start;
6713 	rc = KERN_SUCCESS;
6714 
6715 	if (vm_map_lookup_entry(map, s, &first_entry)) {
6716 		entry = first_entry;
6717 		/*
6718 		 * vm_map_clip_start will be done later.
6719 		 * We don't want to unnest any nested submaps here!
6720 		 */
6721 	} else {
6722 		/* Start address is not in map */
6723 		rc = KERN_INVALID_ADDRESS;
6724 		goto done;
6725 	}
6726 
6727 	while ((entry != vm_map_to_entry(map)) && (s < end)) {
6728 		/*
6729 		 * At this point, we have wired from "start" to "s".
6730 		 * We still need to wire from "s" to "end".
6731 		 *
6732 		 * "entry" hasn't been clipped, so it could start before "s"
6733 		 * and/or end after "end".
6734 		 */
6735 
6736 		/* "e" is how far we want to wire in this entry */
6737 		e = entry->vme_end;
6738 		if (e > end) {
6739 			e = end;
6740 		}
6741 
6742 		/*
6743 		 * If another thread is wiring/unwiring this entry then
6744 		 * block after informing the other thread to wake us up.
6745 		 */
6746 		if (entry->in_transition) {
6747 			wait_result_t wait_result;
6748 
6749 			/*
6750 			 * We have not clipped the entry.  Make sure that
6751 			 * the start address is in range so that the lookup
6752 			 * below will succeed.
6753 			 * "s" is the current starting point: we've already
6754 			 * wired from "start" to "s" and we still have
6755 			 * to wire from "s" to "end".
6756 			 */
6757 
6758 			entry->needs_wakeup = TRUE;
6759 
6760 			/*
6761 			 * wake up anybody waiting on entries that we have
6762 			 * already wired.
6763 			 */
6764 			if (need_wakeup) {
6765 				vm_map_entry_wakeup(map);
6766 				need_wakeup = FALSE;
6767 			}
6768 			/*
6769 			 * User wiring is interruptible
6770 			 */
6771 			wait_result = vm_map_entry_wait(map,
6772 			    (user_wire) ? THREAD_ABORTSAFE :
6773 			    THREAD_UNINT);
6774 			if (user_wire && wait_result == THREAD_INTERRUPTED) {
6775 				/*
6776 				 * undo the wirings we have done so far
6777 				 * We do not clear the needs_wakeup flag,
6778 				 * because we cannot tell if we were the
6779 				 * only one waiting.
6780 				 */
6781 				rc = KERN_FAILURE;
6782 				goto done;
6783 			}
6784 
6785 			/*
6786 			 * Cannot avoid a lookup here.  Reset timestamp.
6787 			 */
6788 			last_timestamp = map->timestamp;
6789 
6790 			/*
6791 			 * The entry could have been clipped, look it up again.
6792 			 * Worse that can happen is, it may not exist anymore.
6793 			 * Worst that can happen is that it may not exist anymore.
6794 			if (!vm_map_lookup_entry(map, s, &first_entry)) {
6795 				/*
6796 				 * User: undo everything up to the previous
6797 				 * entry.  Let vm_map_unwire worry about
6798 				 * checking the validity of the range.
6799 				 */
6800 				rc = KERN_FAILURE;
6801 				goto done;
6802 			}
6803 			entry = first_entry;
6804 			continue;
6805 		}
6806 
6807 		if (entry->is_sub_map) {
6808 			vm_map_offset_t sub_start;
6809 			vm_map_offset_t sub_end;
6810 			vm_map_offset_t local_start;
6811 			vm_map_offset_t local_end;
6812 			pmap_t          pmap;
6813 
6814 			if (wire_and_extract) {
6815 				/*
6816 				 * Wiring would result in copy-on-write
6817 				 * which would not be compatible with
6818 				 * the sharing we have with the original
6819 				 * provider of this memory.
6820 				 */
6821 				rc = KERN_INVALID_ARGUMENT;
6822 				goto done;
6823 			}
6824 
6825 			vm_map_clip_start(map, entry, s);
6826 			vm_map_clip_end(map, entry, end);
6827 
6828 			sub_start = VME_OFFSET(entry);
6829 			sub_end = entry->vme_end;
6830 			sub_end += VME_OFFSET(entry) - entry->vme_start;
6831 
6832 			local_end = entry->vme_end;
6833 			if (map_pmap == NULL) {
6834 				vm_object_t             object;
6835 				vm_object_offset_t      offset;
6836 				vm_prot_t               prot;
6837 				boolean_t               wired;
6838 				vm_map_entry_t          local_entry;
6839 				vm_map_version_t         version;
6840 				vm_map_t                lookup_map;
6841 
6842 				if (entry->use_pmap) {
6843 					pmap = VME_SUBMAP(entry)->pmap;
6844 					/* ppc implementation requires that */
6845 					/* a submap's pmap address ranges line */
6846 					/* up with the parent map */
6847 #ifdef notdef
6848 					pmap_addr = sub_start;
6849 #endif
6850 					pmap_addr = s;
6851 				} else {
6852 					pmap = map->pmap;
6853 					pmap_addr = s;
6854 				}
6855 
6856 				if (entry->wired_count) {
6857 					if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6858 						goto done;
6859 					}
6860 
6861 					/*
6862 					 * The map was not unlocked:
6863 					 * no need to goto re-lookup.
6864 					 * Just go directly to next entry.
6865 					 */
6866 					entry = entry->vme_next;
6867 					s = entry->vme_start;
6868 					continue;
6869 				}
6870 
6871 				/* call vm_map_lookup_and_lock_object to */
6872 				/* cause any needs_copy to be */
6873 				/* evaluated */
6874 				local_start = entry->vme_start;
6875 				lookup_map = map;
6876 				vm_map_lock_write_to_read(map);
6877 				rc = vm_map_lookup_and_lock_object(
6878 					&lookup_map, local_start,
6879 					(access_type | extra_prots),
6880 					OBJECT_LOCK_EXCLUSIVE,
6881 					&version, &object,
6882 					&offset, &prot, &wired,
6883 					NULL,
6884 					&real_map, NULL);
6885 				if (rc != KERN_SUCCESS) {
6886 					vm_map_unlock_read(lookup_map);
6887 					assert(map_pmap == NULL);
6888 					vm_map_unwire(map, start,
6889 					    s, user_wire);
6890 					return rc;
6891 				}
6892 				vm_object_unlock(object);
6893 				if (real_map != lookup_map) {
6894 					vm_map_unlock(real_map);
6895 				}
6896 				vm_map_unlock_read(lookup_map);
6897 				vm_map_lock(map);
6898 
6899 				/* we unlocked, so must re-lookup */
6900 				if (!vm_map_lookup_entry(map,
6901 				    local_start,
6902 				    &local_entry)) {
6903 					rc = KERN_FAILURE;
6904 					goto done;
6905 				}
6906 
6907 				/*
6908 				 * entry could have been "simplified",
6909 				 * so re-clip
6910 				 */
6911 				entry = local_entry;
6912 				assert(s == local_start);
6913 				vm_map_clip_start(map, entry, s);
6914 				vm_map_clip_end(map, entry, end);
6915 				/* re-compute "e" */
6916 				e = entry->vme_end;
6917 				if (e > end) {
6918 					e = end;
6919 				}
6920 
6921 				/* did we have a change of type? */
6922 				if (!entry->is_sub_map) {
6923 					last_timestamp = map->timestamp;
6924 					continue;
6925 				}
6926 			} else {
6927 				local_start = entry->vme_start;
6928 				pmap = map_pmap;
6929 			}
6930 
6931 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6932 				goto done;
6933 			}
6934 
6935 			entry->in_transition = TRUE;
6936 
6937 			vm_map_unlock(map);
6938 			rc = vm_map_wire_nested(VME_SUBMAP(entry),
6939 			    sub_start, sub_end,
6940 			    caller_prot, tag,
6941 			    user_wire, pmap, pmap_addr,
6942 			    NULL);
6943 			vm_map_lock(map);
6944 
6945 			/*
6946 			 * Find the entry again.  It could have been clipped
6947 			 * after we unlocked the map.
6948 			 */
6949 			if (!vm_map_lookup_entry(map, local_start,
6950 			    &first_entry)) {
6951 				panic("vm_map_wire: re-lookup failed");
6952 			}
6953 			entry = first_entry;
6954 
6955 			assert(local_start == s);
6956 			/* re-compute "e" */
6957 			e = entry->vme_end;
6958 			if (e > end) {
6959 				e = end;
6960 			}
6961 
6962 			last_timestamp = map->timestamp;
6963 			while ((entry != vm_map_to_entry(map)) &&
6964 			    (entry->vme_start < e)) {
6965 				assert(entry->in_transition);
6966 				entry->in_transition = FALSE;
6967 				if (entry->needs_wakeup) {
6968 					entry->needs_wakeup = FALSE;
6969 					need_wakeup = TRUE;
6970 				}
6971 				if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6972 					subtract_wire_counts(map, entry, user_wire);
6973 				}
6974 				entry = entry->vme_next;
6975 			}
6976 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
6977 				goto done;
6978 			}
6979 
6980 			/* no need to relookup again */
6981 			s = entry->vme_start;
6982 			continue;
6983 		}
6984 
6985 		/*
6986 		 * If this entry is already wired then increment
6987 		 * the appropriate wire reference count.
6988 		 */
6989 		if (entry->wired_count) {
6990 			if ((entry->protection & access_type) != access_type) {
6991 				/* found a protection problem */
6992 
6993 				/*
6994 				 * XXX FBDP
6995 				 * We should always return an error
6996 				 * in this case but since we didn't
6997 				 * enforce it before, let's do
6998 				 * it only for the new "wire_and_extract"
6999 				 * code path for now...
7000 				 */
7001 				if (wire_and_extract) {
7002 					rc = KERN_PROTECTION_FAILURE;
7003 					goto done;
7004 				}
7005 			}
7006 
7007 			/*
7008 			 * entry is already wired down, get our reference
7009 			 * after clipping to our range.
7010 			 */
7011 			vm_map_clip_start(map, entry, s);
7012 			vm_map_clip_end(map, entry, end);
7013 
7014 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
7015 				goto done;
7016 			}
7017 
7018 			if (wire_and_extract) {
7019 				vm_object_t             object;
7020 				vm_object_offset_t      offset;
7021 				vm_page_t               m;
7022 
7023 				/*
7024 				 * We don't have to "wire" the page again
7025 				 * but we still have to "extract" its
7026 				 * physical page number, after some sanity
7027 				 * checks.
7028 				 */
7029 				assert((entry->vme_end - entry->vme_start)
7030 				    == PAGE_SIZE);
7031 				assert(!entry->needs_copy);
7032 				assert(!entry->is_sub_map);
7033 				assert(VME_OBJECT(entry));
7034 				if (((entry->vme_end - entry->vme_start)
7035 				    != PAGE_SIZE) ||
7036 				    entry->needs_copy ||
7037 				    entry->is_sub_map ||
7038 				    VME_OBJECT(entry) == VM_OBJECT_NULL) {
7039 					rc = KERN_INVALID_ARGUMENT;
7040 					goto done;
7041 				}
7042 
7043 				object = VME_OBJECT(entry);
7044 				offset = VME_OFFSET(entry);
7045 				/* need exclusive lock to update m->dirty */
7046 				if (entry->protection & VM_PROT_WRITE) {
7047 					vm_object_lock(object);
7048 				} else {
7049 					vm_object_lock_shared(object);
7050 				}
7051 				m = vm_page_lookup(object, offset);
7052 				assert(m != VM_PAGE_NULL);
7053 				assert(VM_PAGE_WIRED(m));
7054 				if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
7055 					*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
7056 					if (entry->protection & VM_PROT_WRITE) {
7057 						vm_object_lock_assert_exclusive(
7058 							object);
7059 						m->vmp_dirty = TRUE;
7060 					}
7061 				} else {
7062 					/* not already wired !? */
7063 					*physpage_p = 0;
7064 				}
7065 				vm_object_unlock(object);
7066 			}
7067 
7068 			/* map was not unlocked: no need to relookup */
7069 			entry = entry->vme_next;
7070 			s = entry->vme_start;
7071 			continue;
7072 		}
7073 
7074 		/*
7075 		 * Unwired entry or wire request transmitted via submap
7076 		 */
7077 
7078 		/*
7079 		 * Wiring would copy the pages to the shadow object.
7080 		 * The shadow object would not be code-signed so
7081 		 * attempting to execute code from these copied pages
7082 		 * would trigger a code-signing violation.
7083 		 */
7084 
7085 		if ((entry->protection & VM_PROT_EXECUTE)
7086 #if XNU_TARGET_OS_OSX
7087 		    &&
7088 		    map->pmap != kernel_pmap &&
7089 		    (vm_map_cs_enforcement(map)
7090 #if __arm64__
7091 		    || !VM_MAP_IS_EXOTIC(map)
7092 #endif /* __arm64__ */
7093 		    )
7094 #endif /* XNU_TARGET_OS_OSX */
7095 #if CODE_SIGNING_MONITOR
7096 		    &&
7097 		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
7098 #endif
7099 		    ) {
7100 #if MACH_ASSERT
7101 			printf("pid %d[%s] wiring executable range from "
7102 			    "0x%llx to 0x%llx: rejected to preserve "
7103 			    "code-signing\n",
7104 			    proc_selfpid(),
7105 			    (get_bsdtask_info(current_task())
7106 			    ? proc_name_address(get_bsdtask_info(current_task()))
7107 			    : "?"),
7108 			    (uint64_t) entry->vme_start,
7109 			    (uint64_t) entry->vme_end);
7110 #endif /* MACH_ASSERT */
7111 			DTRACE_VM2(cs_executable_wire,
7112 			    uint64_t, (uint64_t)entry->vme_start,
7113 			    uint64_t, (uint64_t)entry->vme_end);
7114 			cs_executable_wire++;
7115 			rc = KERN_PROTECTION_FAILURE;
7116 			goto done;
7117 		}
7118 
7119 		/*
7120 		 * Perform actions of vm_map_lookup that need the write
7121 		 * lock on the map: create a shadow object for a
7122 		 * copy-on-write region, or an object for a zero-fill
7123 		 * region.
7124 		 */
7125 		size = entry->vme_end - entry->vme_start;
7126 		/*
7127 		 * If wiring a copy-on-write page, we need to copy it now
7128 		 * even if we're only (currently) requesting read access.
7129 		 * This is aggressive, but once it's wired we can't move it.
7130 		 */
7131 		if (entry->needs_copy) {
7132 			if (wire_and_extract) {
7133 				/*
7134 				 * We're supposed to share with the original
7135 				 * provider so should not be "needs_copy"
7136 				 */
7137 				rc = KERN_INVALID_ARGUMENT;
7138 				goto done;
7139 			}
7140 
7141 			VME_OBJECT_SHADOW(entry, size,
7142 			    vm_map_always_shadow(map));
7143 			entry->needs_copy = FALSE;
7144 		} else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
7145 			if (wire_and_extract) {
7146 				/*
7147 				 * We're supposed to share with the original
7148 				 * provider so should already have an object.
7149 				 */
7150 				rc = KERN_INVALID_ARGUMENT;
7151 				goto done;
7152 			}
7153 			VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
7154 			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
7155 			assert(entry->use_pmap);
7156 		} else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7157 			if (wire_and_extract) {
7158 				/*
7159 				 * We're supposed to share with the original
7160 				 * provider so should not be COPY_SYMMETRIC.
7161 				 */
7162 				rc = KERN_INVALID_ARGUMENT;
7163 				goto done;
7164 			}
7165 			/*
7166 			 * Force an unrequested "copy-on-write" but only for
7167 			 * the range we're wiring.
7168 			 */
7169 //			printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
7170 			vm_map_clip_start(map, entry, s);
7171 			vm_map_clip_end(map, entry, end);
7172 			/* recompute "size" */
7173 			size = entry->vme_end - entry->vme_start;
7174 			/* make a shadow object */
7175 			vm_object_t orig_object;
7176 			vm_object_offset_t orig_offset;
7177 			orig_object = VME_OBJECT(entry);
7178 			orig_offset = VME_OFFSET(entry);
7179 			VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
7180 			if (VME_OBJECT(entry) != orig_object) {
7181 				/*
7182 				 * This mapping has not been shared (or it would be
7183 				 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
7184 				 * not been copied-on-write (or it would be marked
7185 				 * as "needs_copy" and would have been handled above
7186 				 * and also already write-protected).
7187 				 * We still need to write-protect here to prevent
7188 				 * other threads from modifying these pages while
7189 				 * we're in the process of copying and wiring
7190 				 * the copied pages.
7191 				 * Since the mapping is neither shared nor COWed,
7192 				 * we only need to write-protect the PTEs for this
7193 				 * mapping.
7194 				 */
7195 				vm_object_pmap_protect(orig_object,
7196 				    orig_offset,
7197 				    size,
7198 				    map->pmap,
7199 				    VM_MAP_PAGE_SIZE(map),
7200 				    entry->vme_start,
7201 				    entry->protection & ~VM_PROT_WRITE);
7202 			}
7203 		}
7204 		if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7205 			/*
7206 			 * Make the object COPY_DELAY to get a stable object
7207 			 * to wire.
7208 			 * That should avoid creating long shadow chains while
7209 			 * wiring/unwiring the same range repeatedly.
7210 			 * That also prevents part of the object from being
7211 			 * wired while another part is "needs_copy", which
7212 			 * could result in conflicting rules wrt copy-on-write.
7213 			 */
7214 			vm_object_t object;
7215 
7216 			object = VME_OBJECT(entry);
7217 			vm_object_lock(object);
7218 			if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7219 				assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
7220 				    "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
7221 				    object, (uint64_t)object->vo_size,
7222 				    entry,
7223 				    (uint64_t)entry->vme_start,
7224 				    (uint64_t)entry->vme_end,
7225 				    (uint64_t)VME_OFFSET(entry),
7226 				    (uint64_t)size);
7227 				assertf(object->ref_count == 1,
7228 				    "object %p ref_count %d\n",
7229 				    object, object->ref_count);
7230 				assertf(!entry->needs_copy,
7231 				    "entry %p\n", entry);
7232 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7233 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
7234 			}
7235 			vm_object_unlock(object);
7236 		}
7237 
7238 		vm_map_clip_start(map, entry, s);
7239 		vm_map_clip_end(map, entry, end);
7240 
7241 		/* re-compute "e" */
7242 		e = entry->vme_end;
7243 		if (e > end) {
7244 			e = end;
7245 		}
7246 
7247 		/*
7248 		 * Check for holes and protection mismatch.
7249 		 * Holes: Next entry should be contiguous unless this
7250 		 *	  is the end of the region.
7251 		 * Protection: Access requested must be allowed, unless
7252 		 *	wiring is by protection class
7253 		 */
7254 		if ((entry->vme_end < end) &&
7255 		    ((entry->vme_next == vm_map_to_entry(map)) ||
7256 		    (entry->vme_next->vme_start > entry->vme_end))) {
7257 			/* found a hole */
7258 			rc = KERN_INVALID_ADDRESS;
7259 			goto done;
7260 		}
7261 		if ((entry->protection & access_type) != access_type) {
7262 			/* found a protection problem */
7263 			rc = KERN_PROTECTION_FAILURE;
7264 			goto done;
7265 		}
7266 
7267 		assert(entry->wired_count == 0 && entry->user_wired_count == 0);
7268 
7269 		if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
7270 			goto done;
7271 		}
7272 
7273 		entry->in_transition = TRUE;
7274 
7275 		/*
7276 		 * This entry might get split once we unlock the map.
7277 		 * In vm_fault_wire(), we need the current range as
7278 		 * defined by this entry.  In order for this to work
7279 		 * along with a simultaneous clip operation, we make a
7280 		 * temporary copy of this entry and use that for the
7281 		 * wiring.  Note that the underlying objects do not
7282 		 * change during a clip.
7283 		 */
7284 		tmp_entry = *entry;
7285 
7286 		/*
7287 		 * The in_transition state guarantees that the entry
7288 		 * (or entries for this range, if a split occurred) will be
7289 		 * there when the map lock is acquired for the second time.
7290 		 */
7291 		vm_map_unlock(map);
7292 
7293 		if (!user_wire && cur_thread != THREAD_NULL) {
7294 			interruptible_state = thread_interrupt_level(THREAD_UNINT);
7295 		} else {
7296 			interruptible_state = THREAD_UNINT;
7297 		}
7298 
7299 		if (map_pmap) {
7300 			rc = vm_fault_wire(map,
7301 			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
7302 			    physpage_p);
7303 		} else {
7304 			rc = vm_fault_wire(map,
7305 			    &tmp_entry, caller_prot, tag, map->pmap,
7306 			    tmp_entry.vme_start,
7307 			    physpage_p);
7308 		}
7309 
7310 		if (!user_wire && cur_thread != THREAD_NULL) {
7311 			thread_interrupt_level(interruptible_state);
7312 		}
7313 
7314 		vm_map_lock(map);
7315 
7316 		if (last_timestamp + 1 != map->timestamp) {
7317 			/*
7318 			 * Find the entry again.  It could have been clipped
7319 			 * after we unlocked the map.
7320 			 */
7321 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7322 			    &first_entry)) {
7323 				panic("vm_map_wire: re-lookup failed");
7324 			}
7325 
7326 			entry = first_entry;
7327 		}
7328 
7329 		last_timestamp = map->timestamp;
7330 
7331 		while ((entry != vm_map_to_entry(map)) &&
7332 		    (entry->vme_start < tmp_entry.vme_end)) {
7333 			assert(entry->in_transition);
7334 			entry->in_transition = FALSE;
7335 			if (entry->needs_wakeup) {
7336 				entry->needs_wakeup = FALSE;
7337 				need_wakeup = TRUE;
7338 			}
7339 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
7340 				subtract_wire_counts(map, entry, user_wire);
7341 			}
7342 			entry = entry->vme_next;
7343 		}
7344 
7345 		if (rc != KERN_SUCCESS) {               /* from vm_*_wire */
7346 			goto done;
7347 		}
7348 
7349 		if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
7350 		    (tmp_entry.vme_end != end) &&    /* AND, we are not at the end of the requested range */
7351 		    (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
7352 			/* found a "new" hole */
7353 			s = tmp_entry.vme_end;
7354 			rc = KERN_INVALID_ADDRESS;
7355 			goto done;
7356 		}
7357 
7358 		s = entry->vme_start;
7359 	} /* end while loop through map entries */
7360 
7361 done:
7362 	if (rc == KERN_SUCCESS) {
7363 		/* repair any damage we may have made to the VM map */
7364 		vm_map_simplify_range(map, start, end);
7365 	}
7366 
7367 	vm_map_unlock(map);
7368 
7369 	/*
7370 	 * wake up anybody waiting on entries we wired.
7371 	 */
7372 	if (need_wakeup) {
7373 		vm_map_entry_wakeup(map);
7374 	}
7375 
7376 	if (rc != KERN_SUCCESS) {
7377 		/* undo what has been wired so far */
7378 		vm_map_unwire_nested(map, start, s, user_wire,
7379 		    map_pmap, pmap_addr);
7380 		if (physpage_p) {
7381 			*physpage_p = 0;
7382 		}
7383 	}
7384 
7385 	return rc;
7386 }
7387 
7388 kern_return_t
7389 vm_map_wire_external(
7390 	vm_map_t                map,
7391 	vm_map_offset_t         start,
7392 	vm_map_offset_t         end,
7393 	vm_prot_t               caller_prot,
7394 	boolean_t               user_wire)
7395 {
7396 	kern_return_t   kret;
7397 
7398 	kret = vm_map_wire_nested(map, start, end, caller_prot, vm_tag_bt(),
7399 	    user_wire, (pmap_t)NULL, 0, NULL);
7400 	return kret;
7401 }
7402 
7403 kern_return_t
7404 vm_map_wire_kernel(
7405 	vm_map_t                map,
7406 	vm_map_offset_t         start,
7407 	vm_map_offset_t         end,
7408 	vm_prot_t               caller_prot,
7409 	vm_tag_t                tag,
7410 	boolean_t               user_wire)
7411 {
7412 	kern_return_t   kret;
7413 
7414 	kret = vm_map_wire_nested(map, start, end, caller_prot, tag,
7415 	    user_wire, (pmap_t)NULL, 0, NULL);
7416 	return kret;
7417 }
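/*
 * Note on the two wrappers above: both are thin shims over
 * vm_map_wire_nested() with no nesting pmap.  vm_map_wire_external()
 * derives the accounting tag from the caller's backtrace via vm_tag_bt(),
 * while vm_map_wire_kernel() takes an explicit vm_tag_t.  A hedged sketch
 * of the difference (the tag below is a hypothetical choice):
 *
 *	kr = vm_map_wire_external(map, s, e, VM_PROT_READ, FALSE);
 *	kr = vm_map_wire_kernel(map, s, e, VM_PROT_READ,
 *	    VM_KERN_MEMORY_DIAG, FALSE);    // caller-chosen tag
 */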
7418 
7419 kern_return_t
7420 vm_map_wire_and_extract_external(
7421 	vm_map_t        map,
7422 	vm_map_offset_t start,
7423 	vm_prot_t       caller_prot,
7424 	boolean_t       user_wire,
7425 	ppnum_t         *physpage_p)
7426 {
7427 	kern_return_t   kret;
7428 
7429 	kret = vm_map_wire_nested(map,
7430 	    start,
7431 	    start + VM_MAP_PAGE_SIZE(map),
7432 	    caller_prot,
7433 	    vm_tag_bt(),
7434 	    user_wire,
7435 	    (pmap_t)NULL,
7436 	    0,
7437 	    physpage_p);
7438 	if (kret != KERN_SUCCESS &&
7439 	    physpage_p != NULL) {
7440 		*physpage_p = 0;
7441 	}
7442 	return kret;
7443 }
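/*
 * Hedged usage sketch: vm_map_wire_and_extract_external() wires exactly
 * one VM map page and reports its physical page number.  A hypothetical
 * caller needing the physical address of a single wired page might do:
 *
 *	ppnum_t  pn = 0;
 *	uint64_t paddr;
 *
 *	kr = vm_map_wire_and_extract_external(map, page_addr,
 *	    VM_PROT_READ | VM_PROT_WRITE, FALSE, &pn);
 *	if (kr == KERN_SUCCESS && pn != 0) {
 *		paddr = (uint64_t)pn << PAGE_SHIFT;  // physical address of the page
 *	}
 */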
7444 
7445 /*
7446  *	vm_map_unwire:
7447  *
7448  *	Sets the pageability of the specified address range in the target
7449  *	map as pageable.  Regions specified must have been wired previously.
7450  *
7451  *	The map must not be locked, but a reference must remain to the map
7452  *	throughout the call.
7453  *
7454  *	Kernel will panic on failures.  User unwire ignores holes and
7455  *	unwired and in-transition entries to avoid losing memory by leaving
7456  *	it unwired.
7457  */
7458 static kern_return_t
7459 vm_map_unwire_nested(
7460 	vm_map_t                map,
7461 	vm_map_offset_t         start,
7462 	vm_map_offset_t         end,
7463 	boolean_t               user_wire,
7464 	pmap_t                  map_pmap,
7465 	vm_map_offset_t         pmap_addr)
7466 {
7467 	vm_map_entry_t          entry;
7468 	struct vm_map_entry     *first_entry, tmp_entry;
7469 	boolean_t               need_wakeup;
7470 	boolean_t               main_map = FALSE;
7471 	unsigned int            last_timestamp;
7472 
7473 	vm_map_lock(map);
7474 	if (map_pmap == NULL) {
7475 		main_map = TRUE;
7476 	}
7477 	last_timestamp = map->timestamp;
7478 
7479 	VM_MAP_RANGE_CHECK(map, start, end);
7480 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
7481 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
7482 
7483 	if (start == end) {
7484 		/* We unwired what the caller asked for: zero pages */
7485 		vm_map_unlock(map);
7486 		return KERN_SUCCESS;
7487 	}
7488 
7489 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
7490 		vm_map_unlock(map);
7491 		return KERN_INVALID_ADDRESS;
7492 	}
7493 
7494 	if (vm_map_lookup_entry(map, start, &first_entry)) {
7495 		entry = first_entry;
7496 		/*
7497 		 * vm_map_clip_start will be done later.
7498 		 * We don't want to unnest any nested submaps here!
7499 		 */
7500 	} else {
7501 		if (!user_wire) {
7502 			panic("vm_map_unwire: start not found");
7503 		}
7504 		/*	Start address is not in map. */
7505 		vm_map_unlock(map);
7506 		return KERN_INVALID_ADDRESS;
7507 	}
7508 
7509 	if (entry->superpage_size) {
7510 		/* superpages are always wired */
7511 		vm_map_unlock(map);
7512 		return KERN_INVALID_ADDRESS;
7513 	}
7514 
7515 	need_wakeup = FALSE;
7516 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
7517 		if (entry->in_transition) {
7518 			/*
7519 			 * 1)
7520 			 * Another thread is wiring down this entry. Note
7521 			 * that, were it not for the other thread, we would
7522 			 * be unwiring an unwired entry.  This is not
7523 			 * permitted.  If we wait, we will be unwiring memory
7524 			 * we did not wire.
7525 			 *
7526 			 * 2)
7527 			 * Another thread is unwiring this entry.  We did not
7528 			 * have a reference to it, because if we did, this
7529 			 * entry would not be getting unwired now.
7530 			 */
7531 			if (!user_wire) {
7532 				/*
7533 				 * XXX FBDP
7534 				 * This could happen:  there could be some
7535 				 * overlapping vslock/vsunlock operations
7536 				 * going on.
7537 				 * We should probably just wait and retry,
7538 				 * but then we have to be careful that this
7539 				 * entry could get "simplified" after
7540 				 * "in_transition" gets unset and before
7541 				 * we re-lookup the entry, so we would
7542 				 * have to re-clip the entry to avoid
7543 				 * re-unwiring what we have already unwired...
7544 				 * See vm_map_wire_nested().
7545 				 *
7546 				 * Or we could just ignore "in_transition"
7547 				 * here and proceed to decrement the wired
7548 				 * count(s) on this entry.  That should be fine
7549 				 * as long as "wired_count" doesn't drop all
7550 				 * the way to 0 (and we should panic if THAT
7551 				 * happens).
7552 				 */
7553 				panic("vm_map_unwire: in_transition entry");
7554 			}
7555 
7556 			entry = entry->vme_next;
7557 			continue;
7558 		}
7559 
7560 		if (entry->is_sub_map) {
7561 			vm_map_offset_t sub_start;
7562 			vm_map_offset_t sub_end;
7563 			vm_map_offset_t local_end;
7564 			pmap_t          pmap;
7565 
7566 			vm_map_clip_start(map, entry, start);
7567 			vm_map_clip_end(map, entry, end);
7568 
7569 			sub_start = VME_OFFSET(entry);
7570 			sub_end = entry->vme_end - entry->vme_start;
7571 			sub_end += VME_OFFSET(entry);
7572 			local_end = entry->vme_end;
7573 			if (map_pmap == NULL) {
7574 				if (entry->use_pmap) {
7575 					pmap = VME_SUBMAP(entry)->pmap;
7576 					pmap_addr = sub_start;
7577 				} else {
7578 					pmap = map->pmap;
7579 					pmap_addr = start;
7580 				}
7581 				if (entry->wired_count == 0 ||
7582 				    (user_wire && entry->user_wired_count == 0)) {
7583 					if (!user_wire) {
7584 						panic("vm_map_unwire: entry is unwired");
7585 					}
7586 					entry = entry->vme_next;
7587 					continue;
7588 				}
7589 
7590 				/*
7591 				 * Check for holes
7592 				 * Holes: Next entry should be contiguous unless
7593 				 * this is the end of the region.
7594 				 */
7595 				if (((entry->vme_end < end) &&
7596 				    ((entry->vme_next == vm_map_to_entry(map)) ||
7597 				    (entry->vme_next->vme_start
7598 				    > entry->vme_end)))) {
7599 					if (!user_wire) {
7600 						panic("vm_map_unwire: non-contiguous region");
7601 					}
7602 /*
7603  *                                       entry = entry->vme_next;
7604  *                                       continue;
7605  */
7606 				}
7607 
7608 				subtract_wire_counts(map, entry, user_wire);
7609 
7610 				if (entry->wired_count != 0) {
7611 					entry = entry->vme_next;
7612 					continue;
7613 				}
7614 
7615 				entry->in_transition = TRUE;
7616 				tmp_entry = *entry;/* see comment in vm_map_wire() */
7617 
7618 				/*
7619 				 * We can unlock the map now. The in_transition state
7620 				 * guarantees existance of the entry.
7621 				 * guarantees existence of the entry.
7622 				vm_map_unlock(map);
7623 				vm_map_unwire_nested(VME_SUBMAP(entry),
7624 				    sub_start, sub_end, user_wire, pmap, pmap_addr);
7625 				vm_map_lock(map);
7626 
7627 				if (last_timestamp + 1 != map->timestamp) {
7628 					/*
7629 					 * Find the entry again.  It could have been
7630 					 * clipped or deleted after we unlocked the map.
7631 					 */
7632 					if (!vm_map_lookup_entry(map,
7633 					    tmp_entry.vme_start,
7634 					    &first_entry)) {
7635 						if (!user_wire) {
7636 							panic("vm_map_unwire: re-lookup failed");
7637 						}
7638 						entry = first_entry->vme_next;
7639 					} else {
7640 						entry = first_entry;
7641 					}
7642 				}
7643 				last_timestamp = map->timestamp;
7644 
7645 				/*
7646 				 * clear transition bit for all constituent entries
7647 				 * that were in the original entry (saved in
7648 				 * tmp_entry).  Also check for waiters.
7649 				 */
7650 				while ((entry != vm_map_to_entry(map)) &&
7651 				    (entry->vme_start < tmp_entry.vme_end)) {
7652 					assert(entry->in_transition);
7653 					entry->in_transition = FALSE;
7654 					if (entry->needs_wakeup) {
7655 						entry->needs_wakeup = FALSE;
7656 						need_wakeup = TRUE;
7657 					}
7658 					entry = entry->vme_next;
7659 				}
7660 				continue;
7661 			} else {
7662 				tmp_entry = *entry;
7663 				vm_map_unlock(map);
7664 				vm_map_unwire_nested(VME_SUBMAP(entry),
7665 				    sub_start, sub_end, user_wire, map_pmap,
7666 				    pmap_addr);
7667 				vm_map_lock(map);
7668 
7669 				if (last_timestamp + 1 != map->timestamp) {
7670 					/*
7671 					 * Find the entry again.  It could have been
7672 					 * clipped or deleted after we unlocked the map.
7673 					 */
7674 					if (!vm_map_lookup_entry(map,
7675 					    tmp_entry.vme_start,
7676 					    &first_entry)) {
7677 						if (!user_wire) {
7678 							panic("vm_map_unwire: re-lookup failed");
7679 						}
7680 						entry = first_entry->vme_next;
7681 					} else {
7682 						entry = first_entry;
7683 					}
7684 				}
7685 				last_timestamp = map->timestamp;
7686 			}
7687 		}
7688 
7689 
7690 		if ((entry->wired_count == 0) ||
7691 		    (user_wire && entry->user_wired_count == 0)) {
7692 			if (!user_wire) {
7693 				panic("vm_map_unwire: entry is unwired");
7694 			}
7695 
7696 			entry = entry->vme_next;
7697 			continue;
7698 		}
7699 
7700 		assert(entry->wired_count > 0 &&
7701 		    (!user_wire || entry->user_wired_count > 0));
7702 
7703 		vm_map_clip_start(map, entry, start);
7704 		vm_map_clip_end(map, entry, end);
7705 
7706 		/*
7707 		 * Check for holes
7708 		 * Holes: Next entry should be contiguous unless
7709 		 *	  this is the end of the region.
7710 		 */
7711 		if (((entry->vme_end < end) &&
7712 		    ((entry->vme_next == vm_map_to_entry(map)) ||
7713 		    (entry->vme_next->vme_start > entry->vme_end)))) {
7714 			if (!user_wire) {
7715 				panic("vm_map_unwire: non-contiguous region");
7716 			}
7717 			entry = entry->vme_next;
7718 			continue;
7719 		}
7720 
7721 		subtract_wire_counts(map, entry, user_wire);
7722 
7723 		if (entry->wired_count != 0) {
7724 			entry = entry->vme_next;
7725 			continue;
7726 		}
7727 
7728 		if (entry->zero_wired_pages) {
7729 			entry->zero_wired_pages = FALSE;
7730 		}
7731 
7732 		entry->in_transition = TRUE;
7733 		tmp_entry = *entry;     /* see comment in vm_map_wire() */
7734 
7735 		/*
7736 		 * We can unlock the map now. The in_transition state
7737 		 * guarantees existence of the entry.
7738 		 */
7739 		vm_map_unlock(map);
7740 		if (map_pmap) {
7741 			vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
7742 			    pmap_addr, tmp_entry.vme_end);
7743 		} else {
7744 			vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
7745 			    tmp_entry.vme_start, tmp_entry.vme_end);
7746 		}
7747 		vm_map_lock(map);
7748 
7749 		if (last_timestamp + 1 != map->timestamp) {
7750 			/*
7751 			 * Find the entry again.  It could have been clipped
7752 			 * or deleted after we unlocked the map.
7753 			 */
7754 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7755 			    &first_entry)) {
7756 				if (!user_wire) {
7757 					panic("vm_map_unwire: re-lookup failed");
7758 				}
7759 				entry = first_entry->vme_next;
7760 			} else {
7761 				entry = first_entry;
7762 			}
7763 		}
7764 		last_timestamp = map->timestamp;
7765 
7766 		/*
7767 		 * clear transition bit for all constituent entries that
7768 		 * were in the original entry (saved in tmp_entry).  Also
7769 		 * check for waiters.
7770 		 */
7771 		while ((entry != vm_map_to_entry(map)) &&
7772 		    (entry->vme_start < tmp_entry.vme_end)) {
7773 			assert(entry->in_transition);
7774 			entry->in_transition = FALSE;
7775 			if (entry->needs_wakeup) {
7776 				entry->needs_wakeup = FALSE;
7777 				need_wakeup = TRUE;
7778 			}
7779 			entry = entry->vme_next;
7780 		}
7781 	}
7782 
7783 	/*
7784 	 * We might have fragmented the address space when we wired this
7785 	 * range of addresses.  Attempt to re-coalesce these VM map entries
7786 	 * with their neighbors now that they're no longer wired.
7787 	 * Under some circumstances, address space fragmentation can
7788 	 * prevent VM object shadow chain collapsing, which can cause
7789 	 * swap space leaks.
7790 	 */
7791 	vm_map_simplify_range(map, start, end);
7792 
7793 	vm_map_unlock(map);
7794 	/*
7795 	 * wake up anybody waiting on entries that we have unwired.
7796 	 */
7797 	if (need_wakeup) {
7798 		vm_map_entry_wakeup(map);
7799 	}
7800 	return KERN_SUCCESS;
7801 }
7802 
7803 kern_return_t
7804 vm_map_unwire(
7805 	vm_map_t                map,
7806 	vm_map_offset_t         start,
7807 	vm_map_offset_t         end,
7808 	boolean_t               user_wire)
7809 {
7810 	return vm_map_unwire_nested(map, start, end,
7811 	           user_wire, (pmap_t)NULL, 0);
7812 }
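/*
 * Hedged usage sketch: a wiring obtained through vm_map_wire_kernel() is
 * normally released with a matching vm_map_unwire() over the same range
 * and with the same user_wire flag (variables here are hypothetical):
 *
 *	kr = vm_map_wire_kernel(map, s, e, VM_PROT_READ, tag, FALSE);
 *	...
 *	(void) vm_map_unwire(map, s, e, FALSE);  // kernel unwire: panics on failure
 */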
7813 
7814 
7815 /*
7816  *	vm_map_entry_zap:	[ internal use only ]
7817  *
7818  *	Remove the entry from the target map
7819  *	and put it on a zap list.
7820  */
7821 static void
7822 vm_map_entry_zap(
7823 	vm_map_t                map,
7824 	vm_map_entry_t          entry,
7825 	vm_map_zap_t            zap)
7826 {
7827 	vm_map_offset_t s, e;
7828 
7829 	s = entry->vme_start;
7830 	e = entry->vme_end;
7831 	assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7832 	assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7833 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7834 		assert(page_aligned(s));
7835 		assert(page_aligned(e));
7836 	}
7837 	if (entry->map_aligned == TRUE) {
7838 		assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7839 		assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7840 	}
7841 	assert(entry->wired_count == 0);
7842 	assert(entry->user_wired_count == 0);
7843 	assert(!entry->vme_permanent);
7844 
7845 	vm_map_store_entry_unlink(map, entry, false);
7846 	map->size -= e - s;
7847 
7848 	vm_map_zap_append(zap, entry);
7849 }
7850 
7851 static void
7852 vm_map_submap_pmap_clean(
7853 	vm_map_t        map,
7854 	vm_map_offset_t start,
7855 	vm_map_offset_t end,
7856 	vm_map_t        sub_map,
7857 	vm_map_offset_t offset)
7858 {
7859 	vm_map_offset_t submap_start;
7860 	vm_map_offset_t submap_end;
7861 	vm_map_size_t   remove_size;
7862 	vm_map_entry_t  entry;
7863 
7864 	submap_end = offset + (end - start);
7865 	submap_start = offset;
7866 
7867 	vm_map_lock_read(sub_map);
7868 	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
7869 		remove_size = (entry->vme_end - entry->vme_start);
7870 		if (offset > entry->vme_start) {
7871 			remove_size -= offset - entry->vme_start;
7872 		}
7873 
7874 
7875 		if (submap_end < entry->vme_end) {
7876 			remove_size -=
7877 			    entry->vme_end - submap_end;
7878 		}
7879 		if (entry->is_sub_map) {
7880 			vm_map_submap_pmap_clean(
7881 				sub_map,
7882 				start,
7883 				start + remove_size,
7884 				VME_SUBMAP(entry),
7885 				VME_OFFSET(entry));
7886 		} else {
7887 			if (map->mapped_in_other_pmaps &&
7888 			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7889 			    VME_OBJECT(entry) != NULL) {
7890 				vm_object_pmap_protect_options(
7891 					VME_OBJECT(entry),
7892 					(VME_OFFSET(entry) +
7893 					offset -
7894 					entry->vme_start),
7895 					remove_size,
7896 					PMAP_NULL,
7897 					PAGE_SIZE,
7898 					entry->vme_start,
7899 					VM_PROT_NONE,
7900 					PMAP_OPTIONS_REMOVE);
7901 			} else {
7902 				pmap_remove(map->pmap,
7903 				    (addr64_t)start,
7904 				    (addr64_t)(start + remove_size));
7905 			}
7906 		}
7907 	}
7908 
7909 	entry = entry->vme_next;
7910 
7911 	while ((entry != vm_map_to_entry(sub_map))
7912 	    && (entry->vme_start < submap_end)) {
7913 		remove_size = (entry->vme_end - entry->vme_start);
7914 		if (submap_end < entry->vme_end) {
7915 			remove_size -= entry->vme_end - submap_end;
7916 		}
7917 		if (entry->is_sub_map) {
7918 			vm_map_submap_pmap_clean(
7919 				sub_map,
7920 				(start + entry->vme_start) - offset,
7921 				((start + entry->vme_start) - offset) + remove_size,
7922 				VME_SUBMAP(entry),
7923 				VME_OFFSET(entry));
7924 		} else {
7925 			if (map->mapped_in_other_pmaps &&
7926 			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7927 			    VME_OBJECT(entry) != NULL) {
7928 				vm_object_pmap_protect_options(
7929 					VME_OBJECT(entry),
7930 					VME_OFFSET(entry),
7931 					remove_size,
7932 					PMAP_NULL,
7933 					PAGE_SIZE,
7934 					entry->vme_start,
7935 					VM_PROT_NONE,
7936 					PMAP_OPTIONS_REMOVE);
7937 			} else {
7938 				pmap_remove(map->pmap,
7939 				    (addr64_t)((start + entry->vme_start)
7940 				    - offset),
7941 				    (addr64_t)(((start + entry->vme_start)
7942 				    - offset) + remove_size));
7943 			}
7944 		}
7945 		entry = entry->vme_next;
7946 	}
7947 	vm_map_unlock_read(sub_map);
7948 	return;
7949 }
7950 
7951 /*
7952  *     virt_memory_guard_ast:
7953  *
7954  *     Handle the AST callout for a virtual memory guard.
7955  *	   raise an EXC_GUARD exception and terminate the task
7956  *     if configured to do so.
7957  */
7958 void
7959 virt_memory_guard_ast(
7960 	thread_t thread,
7961 	mach_exception_data_type_t code,
7962 	mach_exception_data_type_t subcode)
7963 {
7964 	task_t task = get_threadtask(thread);
7965 	assert(task != kernel_task);
7966 	assert(task == current_task());
7967 	kern_return_t sync_exception_result;
7968 	uint32_t behavior;
7969 
7970 	behavior = task->task_exc_guard;
7971 
7972 	/* Is delivery enabled */
7973 	if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7974 		return;
7975 	}
7976 
7977 	/* If only once, make sure we're that once */
7978 	while (behavior & TASK_EXC_GUARD_VM_ONCE) {
7979 		uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;
7980 
7981 		if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
7982 			break;
7983 		}
7984 		behavior = task->task_exc_guard;
7985 		if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7986 			return;
7987 		}
7988 	}
7989 
7990 	const bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
7991 	/* Raise exception synchronously and see if handler claimed it */
7992 	sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode, fatal);
7993 
7994 	if (fatal) {
7995 		/*
7996 		 * If Synchronous EXC_GUARD delivery was successful then
7997 		 * kill the process and return, else kill the process
7998 		 * and deliver the exception via EXC_CORPSE_NOTIFY.
7999 		 */
8000 		if (sync_exception_result == KERN_SUCCESS) {
8001 			task_bsdtask_kill(current_task());
8002 		} else {
8003 			exit_with_guard_exception(current_proc(), code, subcode);
8004 		}
8005 	} else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
8006 		/*
8007 		 * If the synchronous EXC_GUARD delivery was not successful,
8008 		 * raise a simulated crash.
8009 		 */
8010 		if (sync_exception_result != KERN_SUCCESS) {
8011 			task_violated_guard(code, subcode, NULL, FALSE);
8012 		}
8013 	}
8014 }
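/*
 * The TASK_EXC_GUARD_VM_ONCE handling above relies on a compare-and-swap
 * loop to atomically clear the "deliver" bit, so that at most one thread
 * raises the guard exception.  A minimal sketch of the same pattern on a
 * hypothetical 32-bit flags word ("obj" and FLAG_DELIVER are made up):
 *
 *	uint32_t flags = obj->flags;
 *	while (flags & FLAG_DELIVER) {
 *		uint32_t new_flags = flags & ~FLAG_DELIVER;
 *		if (OSCompareAndSwap(flags, new_flags, &obj->flags)) {
 *			break;              // we won the race: deliver exactly once
 *		}
 *		flags = obj->flags;         // lost the race: re-read and retry
 *	}
 */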
8015 
8016 /*
8017  *     vm_map_guard_exception:
8018  *
8019  *     Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
8020  *
8021  *     Right now, we do this when we find nothing mapped, or a
8022  *     gap in the mapping when a user address space deallocate
8023  *     was requested. We report the address of the first gap found.
8024  */
8025 static void
8026 vm_map_guard_exception(
8027 	vm_map_offset_t gap_start,
8028 	unsigned reason)
8029 {
8030 	mach_exception_code_t code = 0;
8031 	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
8032 	unsigned int target = 0; /* should we pass in pid associated with map? */
8033 	mach_exception_data_type_t subcode = (uint64_t)gap_start;
8034 	boolean_t fatal = FALSE;
8035 
8036 	task_t task = current_task_early();
8037 
8038 	/* Can't deliver exceptions to a NULL task (early boot) or kernel task */
8039 	if (task == NULL || task == kernel_task) {
8040 		return;
8041 	}
8042 
8043 	EXC_GUARD_ENCODE_TYPE(code, guard_type);
8044 	EXC_GUARD_ENCODE_FLAVOR(code, reason);
8045 	EXC_GUARD_ENCODE_TARGET(code, target);
8046 
8047 	if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
8048 		fatal = TRUE;
8049 	}
8050 	thread_guard_violation(current_thread(), code, subcode, fatal);
8051 }
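/*
 * Sketch of the EXC_GUARD payload built above: the 64-bit "code" packs the
 * guard type, a flavor (the "reason" argument) and a target identifier,
 * while "subcode" carries the offending address.
 *
 *	mach_exception_code_t      code = 0;
 *	mach_exception_data_type_t subcode;
 *
 *	EXC_GUARD_ENCODE_TYPE(code, GUARD_TYPE_VIRT_MEMORY);
 *	EXC_GUARD_ENCODE_FLAVOR(code, reason);   // e.g. a "dealloc gap" reason
 *	EXC_GUARD_ENCODE_TARGET(code, 0);
 *	subcode = (uint64_t)gap_start;
 *	thread_guard_violation(current_thread(), code, subcode, fatal);
 */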
8052 
8053 static kern_return_t
8054 vm_map_delete_submap_recurse(
8055 	vm_map_t submap,
8056 	vm_map_offset_t submap_start,
8057 	vm_map_offset_t submap_end)
8058 {
8059 	vm_map_entry_t submap_entry;
8060 
8061 	/*
8062 	 * Verify that the submap does not contain any "permanent" entries
8063 	 * within the specified range.
8064 	 * We do not care about gaps.
8065 	 */
8066 
8067 	vm_map_lock(submap);
8068 
8069 	if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
8070 		submap_entry = submap_entry->vme_next;
8071 	}
8072 
8073 	for (;
8074 	    submap_entry != vm_map_to_entry(submap) &&
8075 	    submap_entry->vme_start < submap_end;
8076 	    submap_entry = submap_entry->vme_next) {
8077 		if (submap_entry->vme_permanent) {
8078 			/* "permanent" entry -> fail */
8079 			vm_map_unlock(submap);
8080 			return KERN_PROTECTION_FAILURE;
8081 		}
8082 	}
8083 	/* no "permanent" entries in the range -> success */
8084 	vm_map_unlock(submap);
8085 	return KERN_SUCCESS;
8086 }
8087 
8088 __abortlike
8089 static void
8090 __vm_map_delete_misaligned_panic(
8091 	vm_map_t                map,
8092 	vm_map_offset_t         start,
8093 	vm_map_offset_t         end)
8094 {
8095 	panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
8096 	    map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
8097 }
8098 
8099 __abortlike
8100 static void
8101 __vm_map_delete_failed_panic(
8102 	vm_map_t                map,
8103 	vm_map_offset_t         start,
8104 	vm_map_offset_t         end,
8105 	kern_return_t           kr)
8106 {
8107 	panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
8108 	    map, (uint64_t)start, (uint64_t)end, kr);
8109 }
8110 
8111 __abortlike
8112 static void
8113 __vm_map_delete_gap_panic(
8114 	vm_map_t                map,
8115 	vm_map_offset_t         where,
8116 	vm_map_offset_t         start,
8117 	vm_map_offset_t         end)
8118 {
8119 	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
8120 	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
8121 }
8122 
8123 __abortlike
8124 static void
8125 __vm_map_delete_permanent_panic(
8126 	vm_map_t                map,
8127 	vm_map_offset_t         start,
8128 	vm_map_offset_t         end,
8129 	vm_map_entry_t          entry)
8130 {
8131 	panic("vm_map_delete(%p,0x%llx,0x%llx): "
8132 	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
8133 	    map, (uint64_t)start, (uint64_t)end, entry,
8134 	    (uint64_t)entry->vme_start,
8135 	    (uint64_t)entry->vme_end);
8136 }
8137 
8138 __options_decl(vm_map_delete_state_t, uint32_t, {
8139 	VMDS_NONE               = 0x0000,
8140 
8141 	VMDS_FOUND_GAP          = 0x0001,
8142 	VMDS_GAPS_OK            = 0x0002,
8143 
8144 	VMDS_KERNEL_PMAP        = 0x0004,
8145 	VMDS_NEEDS_LOOKUP       = 0x0008,
8146 	VMDS_NEEDS_WAKEUP       = 0x0010,
8147 	VMDS_KERNEL_KMEMPTR     = 0x0020
8148 });
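/*
 * These bits are OR-ed into a single vm_map_delete_state_t as the deletion
 * below progresses.  A hedged illustration of how they combine:
 *
 *	vm_map_delete_state_t state = VMDS_NONE;
 *
 *	if (vm_map_pmap(map) == kernel_pmap) {
 *		state |= VMDS_KERNEL_PMAP;  // gaps and permanent entries panic
 *	}
 *	if (state & VMDS_NEEDS_LOOKUP) {    // set whenever the map lock was dropped
 *		state &= ~VMDS_NEEDS_LOOKUP;
 *		// ... re-lookup the entry at the current offset ...
 *	}
 */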
8149 
8150 /*
8151  *	vm_map_delete:	[ internal use only ]
8152  *
8153  *	Deallocates the given address range from the target map.
8154  *	Removes all user wirings. Unwires one kernel wiring if
8155  *	VM_MAP_REMOVE_KUNWIRE is set.  Waits for kernel wirings to go
8156  *	away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set.  Sleeps
8157  *	interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
8158  *
8159  *
8160  *	When the map is a kernel map, then any error in removing mappings
8161  *	will lead to a panic so that clients do not have to repeat the panic
8162  *	code at each call site.  If VM_MAP_REMOVE_INTERRUPTIBLE
8163  *	is also passed, then KERN_ABORTED will not lead to a panic.
8164  *
8165  *	This routine is called with map locked and leaves map locked.
8166  */
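/*
 * Hedged sketch of how the flags described above might be combined by an
 * internal caller (the "guard" and "zap_list" values are supplied by the
 * vm_map_remove* callers and are hypothetical here):
 *
 *	vmr_flags_t flags = VM_MAP_REMOVE_KUNWIRE |     // drop one kernel wiring
 *	    VM_MAP_REMOVE_INTERRUPTIBLE;                // allow KERN_ABORTED
 *	kmem_return_t kmr;
 *
 *	kmr = vm_map_delete(map, start, end, flags, guard, &zap_list);
 */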
8167 static kmem_return_t
8168 vm_map_delete(
8169 	vm_map_t                map,
8170 	vm_map_offset_t         start,
8171 	vm_map_offset_t         end,
8172 	vmr_flags_t             flags,
8173 	kmem_guard_t            guard,
8174 	vm_map_zap_t            zap_list)
8175 {
8176 	vm_map_entry_t          entry, next;
8177 	int                     interruptible;
8178 	vm_map_offset_t         gap_start = 0;
8179 	vm_map_offset_t         clear_in_transition_end = 0;
8180 	__unused vm_map_offset_t save_start = start;
8181 	__unused vm_map_offset_t save_end = end;
8182 	vm_map_delete_state_t   state = VMDS_NONE;
8183 	kmem_return_t           ret = { };
8184 	vm_map_range_id_t       range_id = 0;
8185 	struct kmem_page_meta  *meta = NULL;
8186 	uint32_t                size_idx, slot_idx;
8187 	struct mach_vm_range    slot;
8188 
8189 	if (vm_map_pmap(map) == kernel_pmap) {
8190 		state |= VMDS_KERNEL_PMAP;
8191 		range_id = kmem_addr_get_range(start, end - start);
8192 		if (kmem_is_ptr_range(range_id)) {
8193 			state |= VMDS_KERNEL_KMEMPTR;
8194 			slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
8195 			    &size_idx, &slot);
8196 		}
8197 	}
8198 
8199 	if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
8200 		state |= VMDS_GAPS_OK;
8201 	}
8202 
8203 	if (map->corpse_source &&
8204 	    !(flags & VM_MAP_REMOVE_TO_OVERWRITE) &&
8205 	    !map->terminated) {
8206 		/*
8207 		 * The map is being used for corpse-related diagnostics.
8208 		 * So skip any entry removal to avoid perturbing the map state.
8209 		 * The cleanup will happen in task_terminate_internal after the
8210 		 * call to task_port_no_senders.
8211 		 */
8212 		goto out;
8213 	}
8214 
8215 	interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
8216 	    THREAD_ABORTSAFE : THREAD_UNINT;
8217 
8218 	if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
8219 	    (start & VM_MAP_PAGE_MASK(map))) {
8220 		__vm_map_delete_misaligned_panic(map, start, end);
8221 	}
8222 
8223 	if ((state & VMDS_GAPS_OK) == 0) {
8224 		/*
8225 		 * If the map isn't terminated then all deletions must have
8226 		 * no gaps, and be within the [min, max) of the map.
8227 		 *
8228 		 * We got here without VM_MAP_RANGE_CHECK() being called,
8229 		 * and hence must validate bounds manually.
8230 		 *
8231 		 * It is worth noting that because vm_deallocate() will
8232 		 * round_page() the deallocation size, it's possible for "end"
8233 		 * to be 0 here due to overflow. We hence must treat it as being
8234 		 * beyond vm_map_max(map).
8235 		 *
8236 		 * Similarly, end < start means some wrap-around happened,
8237 		 * which should cause an error or panic.
8238 		 */
8239 		if (end == 0 || end > vm_map_max(map)) {
8240 			state |= VMDS_FOUND_GAP;
8241 			gap_start = vm_map_max(map);
8242 			if (state & VMDS_KERNEL_PMAP) {
8243 				__vm_map_delete_gap_panic(map,
8244 				    gap_start, start, end);
8245 			}
8246 			goto out;
8247 		}
8248 
8249 		if (end < start) {
8250 			if (state & VMDS_KERNEL_PMAP) {
8251 				__vm_map_delete_gap_panic(map,
8252 				    vm_map_max(map), start, end);
8253 			}
8254 			ret.kmr_return = KERN_INVALID_ARGUMENT;
8255 			goto out;
8256 		}
8257 
8258 		if (start < vm_map_min(map)) {
8259 			state |= VMDS_FOUND_GAP;
8260 			gap_start = start;
8261 			if (state & VMDS_KERNEL_PMAP) {
8262 				__vm_map_delete_gap_panic(map,
8263 				    gap_start, start, end);
8264 			}
8265 			goto out;
8266 		}
8267 	} else {
8268 		/*
8269 		 * If the map is terminated, we must accept start/end
8270 		 * being beyond the boundaries of the map as this is
8271 		 * how some of the mappings like commpage mappings
8272 		 * can be destroyed (they're outside of those bounds).
8273 		 *
8274 		 * end < start is still something we can't cope with,
8275 		 * so just bail.
8276 		 */
8277 		if (end < start) {
8278 			goto out;
8279 		}
8280 	}
8281 
8282 
8283 	/*
8284 	 *	Find the start of the region.
8285 	 *
8286 	 *	If in a superpage, extend the range
8287 	 *	to include the start of the mapping.
8288 	 */
8289 	while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8290 		if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8291 			start = SUPERPAGE_ROUND_DOWN(start);
8292 		} else {
8293 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8294 			break;
8295 		}
8296 	}
8297 
8298 	if (entry->superpage_size) {
8299 		end = SUPERPAGE_ROUND_UP(end);
8300 	}
8301 
8302 	/*
8303 	 *	Step through all entries in this region
8304 	 */
8305 	for (vm_map_offset_t s = start; s < end;) {
8306 		/*
8307 		 * At this point, we have deleted all the memory entries
8308 		 * in [start, s) and are proceeding with the [s, end) range.
8309 		 *
8310 		 * This loop might drop the map lock, and it is possible that
8311 		 * some memory was already reallocated within [start, s)
8312 		 * and we don't want to mess with those entries.
8313 		 *
8314 		 * Some of those entries could even have been re-assembled
8315 		 * with an entry after "s" (in vm_map_simplify_entry()), so
8316 		 * we may have to vm_map_clip_start() again.
8317 		 *
8318 		 * When clear_in_transition_end is set, we had marked
8319 		 * [start, clear_in_transition_end) as "in_transition"
8320 		 * during a previous iteration and we need to clear it.
8321 		 */
8322 
8323 		/*
8324 		 * Step 1: If needed (because we dropped locks),
8325 		 *         lookup the entry again.
8326 		 *
8327 		 *         If we're coming back from unwiring (Step 5),
8328 		 *         we also need to mark the entries as no longer
8329 		 *         in transition after that.
8330 		 */
8331 
8332 		if (state & VMDS_NEEDS_LOOKUP) {
8333 			state &= ~VMDS_NEEDS_LOOKUP;
8334 
8335 			if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8336 				SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8337 			}
8338 
8339 			if (state & VMDS_KERNEL_KMEMPTR) {
8340 				kmem_validate_slot(s, meta, size_idx, slot_idx);
8341 			}
8342 		}
8343 
8344 		if (clear_in_transition_end) {
8345 			for (vm_map_entry_t it = entry;
8346 			    it != vm_map_to_entry(map) &&
8347 			    it->vme_start < clear_in_transition_end;
8348 			    it = it->vme_next) {
8349 				assert(it->in_transition);
8350 				it->in_transition = FALSE;
8351 				if (it->needs_wakeup) {
8352 					it->needs_wakeup = FALSE;
8353 					state |= VMDS_NEEDS_WAKEUP;
8354 				}
8355 			}
8356 
8357 			clear_in_transition_end = 0;
8358 		}
8359 
8360 
8361 		/*
8362 		 * Step 2: Perform various policy checks
8363 		 *         before we do _anything_ to this entry.
8364 		 */
8365 
8366 		if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8367 			if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8368 				/*
8369 				 * Either we found a gap already,
8370 				 * or we are tearing down a map,
8371 				 * keep going.
8372 				 */
8373 			} else if (state & VMDS_KERNEL_PMAP) {
8374 				__vm_map_delete_gap_panic(map, s, start, end);
8375 			} else if (s < end) {
8376 				state |= VMDS_FOUND_GAP;
8377 				gap_start = s;
8378 			}
8379 
8380 			if (entry == vm_map_to_entry(map) ||
8381 			    end <= entry->vme_start) {
8382 				break;
8383 			}
8384 
8385 			s = entry->vme_start;
8386 		}
8387 
8388 		if (state & VMDS_KERNEL_PMAP) {
8389 			/*
8390 			 * In the kernel map and its submaps,
8391 			 * permanent entries never die, even
8392 			 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8393 			 */
8394 			if (entry->vme_permanent) {
8395 				__vm_map_delete_permanent_panic(map, start, end, entry);
8396 			}
8397 
8398 			if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8399 				end = entry->vme_end;
8400 				flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8401 			}
8402 
8403 			/*
8404 			 * In the kernel map and its submaps,
8405 			 * the removal of an atomic/guarded entry is strict.
8406 			 *
8407 			 * An atomic entry is processed only if it was
8408 			 * specifically targeted.
8409 			 *
8410 			 * We might have deleted non-atomic entries before
8411 			 * we reach this point, however...
8412 			 */
8413 			kmem_entry_validate_guard(map, entry,
8414 			    start, end - start, guard);
8415 		}
8416 
8417 		/*
8418 		 * Step 2.1: handle "permanent" and "submap" entries
8419 		 * *before* clipping to avoid triggering some unnecessary
8420 		 * un-nesting of the shared region.
8421 		 */
8422 		if (entry->vme_permanent && entry->is_sub_map) {
8423 //			printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8424 			/*
8425 			 * Un-mapping a "permanent" mapping of a user-space
8426 			 * submap is not allowed unless...
8427 			 */
8428 			if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8429 				/*
8430 				 * a. explicitly requested by the kernel caller.
8431 				 */
8432 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8433 			} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8434 			    developer_mode_state()) {
8435 				/*
8436 				 * b. we're in "developer" mode (for
8437 				 *    breakpoints, dtrace probes, ...).
8438 				 */
8439 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8440 			} else if (map->terminated) {
8441 				/*
8442 				 * c. this is the final address space cleanup.
8443 				 */
8444 //				printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8445 			} else {
8446 				vm_map_offset_t submap_start, submap_end;
8447 				kern_return_t submap_kr;
8448 
8449 				/*
8450 				 * Check if there are any "permanent" mappings
8451 				 * in this range in the submap.
8452 				 */
8453 				if (entry->in_transition) {
8454 					/* can that even happen ? */
8455 					goto in_transition;
8456 				}
8457 				/* compute the clipped range in the submap */
8458 				submap_start = s - entry->vme_start;
8459 				submap_start += VME_OFFSET(entry);
8460 				submap_end = end - entry->vme_start;
8461 				submap_end += VME_OFFSET(entry);
8462 				submap_kr = vm_map_delete_submap_recurse(
8463 					VME_SUBMAP(entry),
8464 					submap_start,
8465 					submap_end);
8466 				if (submap_kr != KERN_SUCCESS) {
8467 					/*
8468 					 * There are some "permanent" mappings
8469 					 * in the submap: we are not allowed
8470 					 * to remove this range.
8471 					 */
8472 					printf("%d[%s] removing permanent submap entry "
8473 					    "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8474 					    proc_selfpid(),
8475 					    (get_bsdtask_info(current_task())
8476 					    ? proc_name_address(get_bsdtask_info(current_task()))
8477 					    : "?"), entry,
8478 					    (uint64_t)entry->vme_start,
8479 					    (uint64_t)entry->vme_end,
8480 					    entry->protection,
8481 					    entry->max_protection);
8482 					DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8483 					    vm_map_entry_t, entry,
8484 					    vm_map_offset_t, entry->vme_start,
8485 					    vm_map_offset_t, entry->vme_end,
8486 					    vm_prot_t, entry->protection,
8487 					    vm_prot_t, entry->max_protection,
8488 					    int, VME_ALIAS(entry));
8489 					ret.kmr_return = KERN_PROTECTION_FAILURE;
8490 					goto out;
8491 				}
8492 				/* no permanent mappings: proceed */
8493 			}
8494 		}
8495 
8496 		/*
8497 		 * Step 3: Perform any clipping needed.
8498 		 *
8499 		 *         After this, "entry" starts at "s", ends before "end"
8500 		 */
8501 
8502 		if (entry->vme_start < s) {
8503 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8504 			    entry->map_aligned &&
8505 			    !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8506 				/*
8507 				 * The entry will no longer be map-aligned
8508 				 * after clipping and the caller said it's OK.
8509 				 */
8510 				entry->map_aligned = FALSE;
8511 			}
8512 			vm_map_clip_start(map, entry, s);
8513 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8514 		}
8515 
8516 		if (end < entry->vme_end) {
8517 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8518 			    entry->map_aligned &&
8519 			    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8520 				/*
8521 				 * The entry will no longer be map-aligned
8522 				 * after clipping and the caller said it's OK.
8523 				 */
8524 				entry->map_aligned = FALSE;
8525 			}
8526 			vm_map_clip_end(map, entry, end);
8527 		}
8528 
8529 		if (entry->vme_permanent && entry->is_sub_map) {
8530 			/*
8531 			 * We already went through step 2.1 which did not deny
8532 			 * the removal of this "permanent" and "is_sub_map"
8533 			 * entry.
8534 			 * Now that we've clipped what we actually want to
8535 			 * delete, undo the "permanent" part to allow the
8536 			 * removal to proceed.
8537 			 */
8538 			DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8539 			    vm_map_entry_t, entry,
8540 			    vm_map_offset_t, entry->vme_start,
8541 			    vm_map_offset_t, entry->vme_end,
8542 			    vm_prot_t, entry->protection,
8543 			    vm_prot_t, entry->max_protection,
8544 			    int, VME_ALIAS(entry));
8545 			entry->vme_permanent = false;
8546 		}
8547 
8548 		assert(s == entry->vme_start);
8549 		assert(entry->vme_end <= end);
8550 
8551 
8552 		/*
8553 		 * Step 4: If the entry is in flux, wait for this to resolve.
8554 		 */
8555 
8556 		if (entry->in_transition) {
8557 			wait_result_t wait_result;
8558 
8559 in_transition:
8560 			/*
8561 			 * Another thread is wiring/unwiring this entry.
8562 			 * Let the other thread know we are waiting.
8563 			 */
8564 
8565 			entry->needs_wakeup = TRUE;
8566 
8567 			/*
8568 			 * wake up anybody waiting on entries that we have
8569 			 * already unwired/deleted.
8570 			 */
8571 			if (state & VMDS_NEEDS_WAKEUP) {
8572 				vm_map_entry_wakeup(map);
8573 				state &= ~VMDS_NEEDS_WAKEUP;
8574 			}
8575 
8576 			wait_result = vm_map_entry_wait(map, interruptible);
8577 
8578 			if (interruptible &&
8579 			    wait_result == THREAD_INTERRUPTED) {
8580 				/*
8581 				 * We do not clear the needs_wakeup flag,
8582 				 * since we cannot tell if we were the only one.
8583 				 */
8584 				ret.kmr_return = KERN_ABORTED;
8585 				return ret;
8586 			}
8587 
8588 			/*
8589 			 * The entry could have been clipped or it
8590 			 * may not exist anymore.  Look it up again.
8591 			 */
8592 			state |= VMDS_NEEDS_LOOKUP;
8593 			continue;
8594 		}
8595 
8596 
8597 		/*
8598 		 * Step 5: Handle wiring
8599 		 */
8600 
8601 		if (entry->wired_count) {
8602 			struct vm_map_entry tmp_entry;
8603 			boolean_t           user_wire;
8604 			unsigned int        last_timestamp;
8605 
8606 			user_wire = entry->user_wired_count > 0;
8607 
8608 			/*
8609 			 *      Remove a kernel wiring if requested
8610 			 */
8611 			if (flags & VM_MAP_REMOVE_KUNWIRE) {
8612 				entry->wired_count--;
8613 				vme_btref_consider_and_put(entry);
8614 			}
8615 
8616 			/*
8617 			 *	Remove all user wirings for proper accounting
8618 			 */
8619 			while (entry->user_wired_count) {
8620 				subtract_wire_counts(map, entry, user_wire);
8621 			}
8622 
8623 			/*
8624 			 * All our DMA I/O operations in IOKit are currently
8625 			 * done by wiring through the map entries of the task
8626 			 * requesting the I/O.
8627 			 *
8628 			 * Because of this, we must always wait for kernel wirings
8629 			 * to go away on the entries before deleting them.
8630 			 *
8631 			 * Any caller who wants to actually remove a kernel wiring
8632 			 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8633 			 * properly remove one wiring instead of blasting through
8634 			 * them all.
8635 			 */
8636 			if (entry->wired_count != 0) {
8637 				assert(map != kernel_map);
8638 				/*
8639 				 * Cannot continue.  Typical case is when
8640 				 * a user thread has physical I/O pending
8641 				 * on this page.  Either wait for the
8642 				 * kernel wiring to go away or return an
8643 				 * error.
8644 				 */
8645 				wait_result_t wait_result;
8646 
8647 				entry->needs_wakeup = TRUE;
8648 				wait_result = vm_map_entry_wait(map,
8649 				    interruptible);
8650 
8651 				if (interruptible &&
8652 				    wait_result == THREAD_INTERRUPTED) {
8653 					/*
8654 					 * We do not clear the
8655 					 * needs_wakeup flag, since we
8656 					 * cannot tell if we were the
8657 					 * only one.
8658 					 */
8659 					ret.kmr_return = KERN_ABORTED;
8660 					return ret;
8661 				}
8662 
8663 
8664 				/*
8665 				 * The entry could have been clipped or
8666 				 * it may not exist anymore.  Look it
8667 				 * up again.
8668 				 */
8669 				state |= VMDS_NEEDS_LOOKUP;
8670 				continue;
8671 			}
8672 
8673 			/*
8674 			 * We can unlock the map now.
8675 			 *
8676 			 * The entry might be split once we unlock the map,
8677 			 * but we need the range as defined by this entry
8678 			 * to be stable. So we must make a local copy.
8679 			 *
8680 			 * The underlying objects do not change during clips,
8681 			 * and the in_transition state guarantees existence
8682 			 * of the entry.
8683 			 */
8684 			last_timestamp = map->timestamp;
8685 			entry->in_transition = TRUE;
8686 			tmp_entry = *entry;
8687 			vm_map_unlock(map);
8688 
8689 			if (tmp_entry.is_sub_map) {
8690 				vm_map_t sub_map;
8691 				vm_map_offset_t sub_start, sub_end;
8692 				pmap_t pmap;
8693 				vm_map_offset_t pmap_addr;
8694 
8695 
8696 				sub_map = VME_SUBMAP(&tmp_entry);
8697 				sub_start = VME_OFFSET(&tmp_entry);
8698 				sub_end = sub_start + (tmp_entry.vme_end -
8699 				    tmp_entry.vme_start);
8700 				if (tmp_entry.use_pmap) {
8701 					pmap = sub_map->pmap;
8702 					pmap_addr = tmp_entry.vme_start;
8703 				} else {
8704 					pmap = map->pmap;
8705 					pmap_addr = tmp_entry.vme_start;
8706 				}
8707 				(void) vm_map_unwire_nested(sub_map,
8708 				    sub_start, sub_end,
8709 				    user_wire,
8710 				    pmap, pmap_addr);
8711 			} else {
8712 				vm_map_offset_t entry_end = tmp_entry.vme_end;
8713 				vm_map_offset_t max_end;
8714 
8715 				if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8716 					max_end = end - VM_MAP_PAGE_SIZE(map);
8717 					if (entry_end > max_end) {
8718 						entry_end = max_end;
8719 					}
8720 				}
8721 
8722 				if (tmp_entry.vme_kernel_object) {
8723 					pmap_protect_options(
8724 						map->pmap,
8725 						tmp_entry.vme_start,
8726 						entry_end,
8727 						VM_PROT_NONE,
8728 						PMAP_OPTIONS_REMOVE,
8729 						NULL);
8730 				}
8731 				vm_fault_unwire(map, &tmp_entry,
8732 				    tmp_entry.vme_kernel_object, map->pmap,
8733 				    tmp_entry.vme_start, entry_end);
8734 			}
8735 
8736 			vm_map_lock(map);
8737 
8738 			/*
8739 			 * Unwiring happened, we can now go back to deleting
8740 			 * them (after we clear the in_transition bit for the range).
8741 			 */
8742 			if (last_timestamp + 1 != map->timestamp) {
8743 				state |= VMDS_NEEDS_LOOKUP;
8744 			}
8745 			clear_in_transition_end = tmp_entry.vme_end;
8746 			continue;
8747 		}
8748 
8749 		assert(entry->wired_count == 0);
8750 		assert(entry->user_wired_count == 0);
8751 
8752 
8753 		/*
8754 		 * Step 6: Entry is unwired and ready for us to delete !
8755 		 */
8756 
8757 		if (!entry->vme_permanent) {
8758 			/*
8759 			 * Typical case: the entry really shouldn't be permanent
8760 			 */
8761 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8762 		    (entry->protection & VM_PROT_EXECUTE) &&
8763 		    developer_mode_state()) {
8764 			/*
8765 			 * Allow debuggers to undo executable mappings
8766 			 * when developer mode is on.
8767 			 */
8768 #if 0
8769 			printf("FBDP %d[%s] removing permanent executable entry "
8770 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8771 			    proc_selfpid(),
8772 			    (current_task()->bsd_info
8773 			    ? proc_name_address(current_task()->bsd_info)
8774 			    : "?"), entry,
8775 			    (uint64_t)entry->vme_start,
8776 			    (uint64_t)entry->vme_end,
8777 			    entry->protection,
8778 			    entry->max_protection);
8779 #endif
8780 			entry->vme_permanent = FALSE;
8781 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8782 #if 0
8783 			printf("FBDP %d[%s] removing permanent entry "
8784 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8785 			    proc_selfpid(),
8786 			    (current_task()->bsd_info
8787 			    ? proc_name_address(current_task()->bsd_info)
8788 			    : "?"), entry,
8789 			    (uint64_t)entry->vme_start,
8790 			    (uint64_t)entry->vme_end,
8791 			    entry->protection,
8792 			    entry->max_protection);
8793 #endif
8794 			entry->vme_permanent = FALSE;
8795 #if CODE_SIGNING_MONITOR
8796 		} else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8797 			entry->vme_permanent = FALSE;
8798 
8799 			printf("%d[%s] %s(0x%llx,0x%llx): "
8800 			    "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8801 			    "prot 0x%x/0x%x\n",
8802 			    proc_selfpid(),
8803 			    (get_bsdtask_info(current_task())
8804 			    ? proc_name_address(get_bsdtask_info(current_task()))
8805 			    : "?"),
8806 			    __FUNCTION__,
8807 			    (uint64_t)start,
8808 			    (uint64_t)end,
8809 			    (uint64_t)entry->vme_start,
8810 			    (uint64_t)entry->vme_end,
8811 			    entry->protection,
8812 			    entry->max_protection);
8813 #endif
8814 		} else {
8815 			DTRACE_VM6(vm_map_delete_permanent,
8816 			    vm_map_entry_t, entry,
8817 			    vm_map_offset_t, entry->vme_start,
8818 			    vm_map_offset_t, entry->vme_end,
8819 			    vm_prot_t, entry->protection,
8820 			    vm_prot_t, entry->max_protection,
8821 			    int, VME_ALIAS(entry));
8822 		}
8823 
8824 		if (entry->is_sub_map) {
8825 			assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8826 			    "map %p (%d) entry %p submap %p (%d)\n",
8827 			    map, VM_MAP_PAGE_SHIFT(map), entry,
8828 			    VME_SUBMAP(entry),
8829 			    VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8830 			if (entry->use_pmap) {
8831 #ifndef NO_NESTED_PMAP
8832 				int pmap_flags;
8833 
8834 				if (map->terminated) {
8835 					/*
8836 					 * This is the final cleanup of the
8837 					 * address space being terminated.
8838 					 * No new mappings are expected and
8839 					 * we don't really need to unnest the
8840 					 * shared region (and lose the "global"
8841 					 * pmap mappings, if applicable).
8842 					 *
8843 					 * Tell the pmap layer that we're
8844 					 * "clean" wrt nesting.
8845 					 */
8846 					pmap_flags = PMAP_UNNEST_CLEAN;
8847 				} else {
8848 					/*
8849 					 * We're unmapping part of the nested
8850 					 * shared region, so we can't keep the
8851 					 * nested pmap.
8852 					 */
8853 					pmap_flags = 0;
8854 				}
8855 				pmap_unnest_options(
8856 					map->pmap,
8857 					(addr64_t)entry->vme_start,
8858 					entry->vme_end - entry->vme_start,
8859 					pmap_flags);
8860 #endif  /* NO_NESTED_PMAP */
8861 				if (map->mapped_in_other_pmaps &&
8862 				    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8863 					/* clean up parent map/maps */
8864 					vm_map_submap_pmap_clean(
8865 						map, entry->vme_start,
8866 						entry->vme_end,
8867 						VME_SUBMAP(entry),
8868 						VME_OFFSET(entry));
8869 				}
8870 			} else {
8871 				vm_map_submap_pmap_clean(
8872 					map, entry->vme_start, entry->vme_end,
8873 					VME_SUBMAP(entry),
8874 					VME_OFFSET(entry));
8875 			}
8876 		} else if (entry->vme_kernel_object ||
8877 		    VME_OBJECT(entry) == compressor_object) {
8878 			/*
8879 			 * nothing to do
8880 			 */
8881 		} else if (map->mapped_in_other_pmaps &&
8882 		    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8883 			vm_object_pmap_protect_options(
8884 				VME_OBJECT(entry), VME_OFFSET(entry),
8885 				entry->vme_end - entry->vme_start,
8886 				PMAP_NULL,
8887 				PAGE_SIZE,
8888 				entry->vme_start,
8889 				VM_PROT_NONE,
8890 				PMAP_OPTIONS_REMOVE);
8891 		} else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8892 		    (state & VMDS_KERNEL_PMAP)) {
8893 			/* Remove translations associated
8894 			 * with this range unless the entry
8895 			 * does not have an object, or
8896 			 * it's the kernel map or a descendant
8897 			 * since the platform could potentially
8898 			 * create "backdoor" mappings invisible
8899 			 * to the VM. It is expected that
8900 			 * objectless, non-kernel ranges
8901 			 * do not have such VM invisible
8902 			 * translations.
8903 			 */
8904 			pmap_remove_options(map->pmap,
8905 			    (addr64_t)entry->vme_start,
8906 			    (addr64_t)entry->vme_end,
8907 			    PMAP_OPTIONS_REMOVE);
8908 		}
8909 
8910 #if DEBUG
8911 		/*
8912 		 * All pmap mappings for this map entry must have been
8913 		 * cleared by now.
8914 		 */
8915 		assert(pmap_is_empty(map->pmap,
8916 		    entry->vme_start,
8917 		    entry->vme_end));
8918 #endif /* DEBUG */
8919 
8920 		if (entry->iokit_acct) {
8921 			/* alternate accounting */
8922 			DTRACE_VM4(vm_map_iokit_unmapped_region,
8923 			    vm_map_t, map,
8924 			    vm_map_offset_t, entry->vme_start,
8925 			    vm_map_offset_t, entry->vme_end,
8926 			    int, VME_ALIAS(entry));
8927 			vm_map_iokit_unmapped_region(map,
8928 			    (entry->vme_end -
8929 			    entry->vme_start));
8930 			entry->iokit_acct = FALSE;
8931 			entry->use_pmap = FALSE;
8932 		}
8933 
8934 		/* move "s" forward */
8935 		s    = entry->vme_end;
8936 		next = entry->vme_next;
8937 		if (!entry->map_aligned) {
8938 			vm_map_offset_t rounded_s;
8939 
8940 			/*
8941 			 * Skip artificial gap due to mis-aligned entry
8942 			 * on devices with a page size smaller than the
8943 			 * map's page size (i.e. 16k task on a 4k device).
8944 			 */
8945 			rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8946 			if (next == vm_map_to_entry(map)) {
8947 				s = rounded_s;
8948 			} else if (s < rounded_s) {
8949 				s = MIN(rounded_s, next->vme_start);
8950 			}
8951 		}
8952 		ret.kmr_size += s - entry->vme_start;
8953 
8954 		if (entry->vme_permanent) {
8955 			/*
8956 			 * A permanent entry cannot be removed, so leave it
8957 			 * in place but remove all access permissions.
8958 			 */
8959 			if (!entry->csm_associated) {
8960 				printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8961 				    __FUNCTION__, __LINE__,
8962 				    proc_selfpid(),
8963 				    (get_bsdtask_info(current_task())
8964 				    ? proc_name_address(get_bsdtask_info(current_task()))
8965 				    : "?"),
8966 				    map,
8967 				    entry,
8968 				    (uint64_t)entry->vme_start,
8969 				    (uint64_t)entry->vme_end,
8970 				    entry->is_sub_map,
8971 				    entry->protection,
8972 				    entry->max_protection);
8973 			}
8974 			DTRACE_VM6(vm_map_delete_permanent_prot_none,
8975 			    vm_map_entry_t, entry,
8976 			    vm_map_offset_t, entry->vme_start,
8977 			    vm_map_offset_t, entry->vme_end,
8978 			    vm_prot_t, entry->protection,
8979 			    vm_prot_t, entry->max_protection,
8980 			    int, VME_ALIAS(entry));
8981 			entry->protection = VM_PROT_NONE;
8982 			entry->max_protection = VM_PROT_NONE;
8983 		} else {
8984 			vm_map_entry_zap(map, entry, zap_list);
8985 		}
8986 
8987 		entry = next;
8988 		next  = VM_MAP_ENTRY_NULL;
8989 
8990 		if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8991 			unsigned int last_timestamp = map->timestamp++;
8992 
8993 			if (lck_rw_lock_yield_exclusive(&map->lock,
8994 			    LCK_RW_YIELD_ANY_WAITER)) {
8995 				if (last_timestamp != map->timestamp + 1) {
8996 					state |= VMDS_NEEDS_LOOKUP;
8997 				}
8998 			} else {
8999 				/* we didn't yield, undo our change */
9000 				map->timestamp--;
9001 			}
9002 		}
9003 	}
9004 
9005 	if (map->wait_for_space) {
9006 		thread_wakeup((event_t) map);
9007 	}
9008 
9009 	if (state & VMDS_NEEDS_WAKEUP) {
9010 		vm_map_entry_wakeup(map);
9011 	}
9012 
9013 out:
9014 	if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
9015 		__vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
9016 	}
9017 
9018 	if (state & VMDS_KERNEL_KMEMPTR) {
9019 		kmem_free_space(start, end, range_id, &slot);
9020 	}
9021 
9022 	if (state & VMDS_FOUND_GAP) {
9023 		DTRACE_VM3(kern_vm_deallocate_gap,
9024 		    vm_map_offset_t, gap_start,
9025 		    vm_map_offset_t, save_start,
9026 		    vm_map_offset_t, save_end);
9027 		if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
9028 			ret.kmr_return = KERN_INVALID_VALUE;
9029 		} else {
9030 			vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
9031 		}
9032 	}
9033 
9034 	return ret;
9035 }
9036 
9037 kmem_return_t
9038 vm_map_remove_and_unlock(
9039 	vm_map_t        map,
9040 	vm_map_offset_t start,
9041 	vm_map_offset_t end,
9042 	vmr_flags_t     flags,
9043 	kmem_guard_t    guard)
9044 {
9045 	kmem_return_t ret;
9046 	VM_MAP_ZAP_DECLARE(zap);
9047 
9048 	ret = vm_map_delete(map, start, end, flags, guard, &zap);
9049 	vm_map_unlock(map);
9050 
9051 	vm_map_zap_dispose(&zap);
9052 
9053 	return ret;
9054 }
9055 
9056 /*
9057  *	vm_map_remove_guard:
9058  *
9059  *	Remove the given address range from the target map.
9060  *	This is the exported form of vm_map_delete.
9061  */
9062 kmem_return_t
9063 vm_map_remove_guard(
9064 	vm_map_t        map,
9065 	vm_map_offset_t start,
9066 	vm_map_offset_t end,
9067 	vmr_flags_t     flags,
9068 	kmem_guard_t    guard)
9069 {
9070 	vm_map_lock(map);
9071 	return vm_map_remove_and_unlock(map, start, end, flags, guard);
9072 }
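/*
 * Illustrative sketch (not part of the original source): a kernel
 * caller that created a guarded allocation passes the same guard back
 * on removal so that vm_map_delete() can validate the atomic entry,
 * along the lines of:
 *
 *	kmem_return_t kmr;
 *
 *	kmr = vm_map_remove_guard(map, addr, addr + size,
 *	    VM_MAP_REMOVE_KUNWIRE, guard);
 *	if (kmr.kmr_return != KERN_SUCCESS) {
 *		... handle failure ...
 *	}
 *
 * "map", "addr", "size" and "guard" are placeholders; real callers
 * normally reach this routine through the kmem_* wrappers.
 */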
9073 
9074 /*
9075  *	vm_map_terminate:
9076  *
9077  *	Clean out a task's map.
9078  */
9079 kern_return_t
9080 vm_map_terminate(
9081 	vm_map_t        map)
9082 {
9083 	vm_map_lock(map);
9084 	map->terminated = TRUE;
9085 	vm_map_disable_hole_optimization(map);
9086 	(void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
9087 	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
9088 	return KERN_SUCCESS;
9089 }
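/*
 * Illustrative sketch (not part of the original source): the expected
 * teardown order is to terminate the map first and only then drop the
 * caller's reference, e.g.:
 *
 *	(void) vm_map_terminate(map);	// removes min_offset..max_offset
 *	vm_map_deallocate(map);		// release the caller's reference
 *
 * The exact call site is an assumption made for illustration.
 */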
9090 
9091 /*
9092  *	Routine:	vm_map_copy_allocate
9093  *
9094  *	Description:
9095  *		Allocates and initializes a map copy object.
9096  */
9097 static vm_map_copy_t
9098 vm_map_copy_allocate(uint16_t type)
9099 {
9100 	vm_map_copy_t new_copy;
9101 
9102 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
9103 	new_copy->type = type;
9104 	if (type == VM_MAP_COPY_ENTRY_LIST) {
9105 		new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
9106 		vm_map_store_init(&new_copy->cpy_hdr);
9107 	}
9108 	return new_copy;
9109 }
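/*
 * Illustrative sketch (not part of the original source): callers that
 * build an entry-list copy object fill in the header fields right
 * after allocation, mirroring what the overwrite code below does:
 *
 *	vm_map_copy_t c;
 *
 *	c = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
 *	c->offset = offset;				// placeholder value
 *	c->size = size;					// placeholder value
 *	c->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
 *	c->cpy_hdr.entries_pageable = TRUE;		// assumption
 *
 * "offset", "size" and "map" stand in for the caller's state.
 */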
9110 
9111 /*
9112  *	Routine:	vm_map_copy_discard
9113  *
9114  *	Description:
9115  *		Dispose of a map copy object (returned by
9116  *		vm_map_copyin).
9117  */
9118 void
9119 vm_map_copy_discard(
9120 	vm_map_copy_t   copy)
9121 {
9122 	if (copy == VM_MAP_COPY_NULL) {
9123 		return;
9124 	}
9125 
9126 	/*
9127 	 * Assert that the vm_map_copy is coming from the right
9128 	 * zone and hasn't been forged
9129 	 */
9130 	vm_map_copy_require(copy);
9131 
9132 	switch (copy->type) {
9133 	case VM_MAP_COPY_ENTRY_LIST:
9134 		while (vm_map_copy_first_entry(copy) !=
9135 		    vm_map_copy_to_entry(copy)) {
9136 			vm_map_entry_t  entry = vm_map_copy_first_entry(copy);
9137 
9138 			vm_map_copy_entry_unlink(copy, entry);
9139 			if (entry->is_sub_map) {
9140 				vm_map_deallocate(VME_SUBMAP(entry));
9141 			} else {
9142 				vm_object_deallocate(VME_OBJECT(entry));
9143 			}
9144 			vm_map_copy_entry_dispose(entry);
9145 		}
9146 		break;
9147 	case VM_MAP_COPY_KERNEL_BUFFER:
9148 
9149 		/*
9150 		 * The data buffer (cpy_kdata) was allocated with
9151 		 * kalloc_data(); the vm_map_copy_t itself comes from
9152 		 * its zone and is freed below with zfree_id().
9153 		 */
9154 		if (copy->size > msg_ool_size_small || copy->offset) {
9155 			panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
9156 			    (long long)copy->size, (long long)copy->offset);
9157 		}
9158 		kfree_data(copy->cpy_kdata, copy->size);
9159 	}
9160 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
9161 }
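/*
 * Illustrative sketch (not part of the original source): a routine
 * that received out-of-line data but fails before consuming it is
 * responsible for discarding the copy object, e.g.:
 *
 *	if (kr != KERN_SUCCESS) {
 *		vm_map_copy_discard(copy);
 *		copy = VM_MAP_COPY_NULL;
 *	}
 *
 * "kr" and "copy" stand in for the caller's locals.
 */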
9162 
9163 #if XNU_PLATFORM_MacOSX
9164 
9165 /*
9166  *	Routine:	vm_map_copy_copy
9167  *
9168  *	Description:
9169  *			Move the information in a map copy object to
9170  *			a new map copy object, leaving the old one
9171  *			empty.
9172  *
9173  *			This is used by kernel routines that need
9174  *			to look at out-of-line data (in copyin form)
9175  *			before deciding whether to return SUCCESS.
9176  *			If the routine returns FAILURE, the original
9177  *			copy object will be deallocated; therefore,
9178  *			these routines must make a copy of the copy
9179  *			object and leave the original empty so that
9180  *			deallocation will not fail.
9181  */
9182 vm_map_copy_t
9183 vm_map_copy_copy(
9184 	vm_map_copy_t   copy)
9185 {
9186 	vm_map_copy_t   new_copy;
9187 
9188 	if (copy == VM_MAP_COPY_NULL) {
9189 		return VM_MAP_COPY_NULL;
9190 	}
9191 
9192 	/*
9193 	 * Assert that the vm_map_copy is coming from the right
9194 	 * zone and hasn't been forged
9195 	 */
9196 	vm_map_copy_require(copy);
9197 
9198 	/*
9199 	 * Allocate a new copy object, and copy the information
9200 	 * from the old one into it.
9201 	 */
9202 
9203 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9204 	memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
9205 #if __has_feature(ptrauth_calls)
9206 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9207 		new_copy->cpy_kdata = copy->cpy_kdata;
9208 	}
9209 #endif
9210 
9211 	if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
9212 		/*
9213 		 * The links in the entry chain must be
9214 		 * changed to point to the new copy object.
9215 		 */
9216 		vm_map_copy_first_entry(copy)->vme_prev
9217 		        = vm_map_copy_to_entry(new_copy);
9218 		vm_map_copy_last_entry(copy)->vme_next
9219 		        = vm_map_copy_to_entry(new_copy);
9220 	}
9221 
9222 	/*
9223 	 * Change the old copy object into one that contains
9224 	 * nothing to be deallocated.
9225 	 */
9226 	bzero(copy, sizeof(struct vm_map_copy));
9227 	copy->type = VM_MAP_COPY_KERNEL_BUFFER;
9228 
9229 	/*
9230 	 * Return the new object.
9231 	 */
9232 	return new_copy;
9233 }
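/*
 * Illustrative sketch (not part of the original source) of the pattern
 * described above: a routine that must peek at copyin data before
 * deciding success keeps a private copy and leaves the original empty,
 * so the caller's later vm_map_copy_discard() of the original is
 * harmless:
 *
 *	vm_map_copy_t mine = vm_map_copy_copy(copy);
 *
 *	if (examine(mine) != KERN_SUCCESS) {	// "examine" is hypothetical
 *		vm_map_copy_discard(mine);
 *		return KERN_FAILURE;		// caller discards "copy", now empty
 *	}
 */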
9234 
9235 #endif /* XNU_PLATFORM_MacOSX */
9236 
9237 static boolean_t
9238 vm_map_entry_is_overwritable(
9239 	vm_map_t        dst_map __unused,
9240 	vm_map_entry_t  entry)
9241 {
9242 	if (!(entry->protection & VM_PROT_WRITE)) {
9243 		/* can't overwrite if not writable */
9244 		return FALSE;
9245 	}
9246 #if !__x86_64__
9247 	if (entry->used_for_jit &&
9248 	    vm_map_cs_enforcement(dst_map) &&
9249 	    !dst_map->cs_debugged) {
9250 		/*
9251 		 * Can't overwrite a JIT region while cs_enforced
9252 		 * and not cs_debugged.
9253 		 */
9254 		return FALSE;
9255 	}
9256 
9257 #if __arm64e__
9258 	/* Do not allow overwriting HW-assisted TPRO entries */
9259 	if (entry->used_for_tpro) {
9260 		return FALSE;
9261 	}
9262 #endif /* __arm64e__ */
9263 
9264 	if (entry->vme_permanent) {
9265 		if (entry->is_sub_map) {
9266 			/*
9267 			 * We can't tell if the submap contains "permanent"
9268 			 * entries within the range targeted by the caller.
9269 			 * The caller will have to check for that with
9270 			 * vm_map_overwrite_submap_recurse() for example.
9271 			 */
9272 		} else {
9273 			/*
9274 			 * Do not allow overwriting of a "permanent"
9275 			 * entry.
9276 			 */
9277 			DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
9278 			    vm_map_entry_t, entry,
9279 			    vm_map_offset_t, entry->vme_start,
9280 			    vm_map_offset_t, entry->vme_end,
9281 			    vm_prot_t, entry->protection,
9282 			    vm_prot_t, entry->max_protection,
9283 			    int, VME_ALIAS(entry));
9284 			return FALSE;
9285 		}
9286 	}
9287 #endif /* !__x86_64__ */
9288 	return TRUE;
9289 }
9290 
9291 static kern_return_t
9292 vm_map_overwrite_submap_recurse(
9293 	vm_map_t        dst_map,
9294 	vm_map_offset_t dst_addr,
9295 	vm_map_size_t   dst_size)
9296 {
9297 	vm_map_offset_t dst_end;
9298 	vm_map_entry_t  tmp_entry;
9299 	vm_map_entry_t  entry;
9300 	kern_return_t   result;
9301 	boolean_t       encountered_sub_map = FALSE;
9302 
9303 
9304 
9305 	/*
9306 	 *	Verify that the destination is all writeable
9307 	 *	initially.  We have to trunc the destination
9308 	 *	address and round the copy size or we'll end up
9309 	 *	splitting entries in strange ways.
9310 	 */
9311 
9312 	dst_end = vm_map_round_page(dst_addr + dst_size,
9313 	    VM_MAP_PAGE_MASK(dst_map));
9314 	vm_map_lock(dst_map);
9315 
9316 start_pass_1:
9317 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9318 		vm_map_unlock(dst_map);
9319 		return KERN_INVALID_ADDRESS;
9320 	}
9321 
9322 	vm_map_clip_start(dst_map,
9323 	    tmp_entry,
9324 	    vm_map_trunc_page(dst_addr,
9325 	    VM_MAP_PAGE_MASK(dst_map)));
9326 	if (tmp_entry->is_sub_map) {
9327 		/* clipping did unnest if needed */
9328 		assert(!tmp_entry->use_pmap);
9329 	}
9330 
9331 	for (entry = tmp_entry;;) {
9332 		vm_map_entry_t  next;
9333 
9334 		next = entry->vme_next;
9335 		while (entry->is_sub_map) {
9336 			vm_map_offset_t sub_start;
9337 			vm_map_offset_t sub_end;
9338 			vm_map_offset_t local_end;
9339 
9340 			if (entry->in_transition) {
9341 				/*
9342 				 * Say that we are waiting, and wait for entry.
9343 				 */
9344 				entry->needs_wakeup = TRUE;
9345 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9346 
9347 				goto start_pass_1;
9348 			}
9349 
9350 			encountered_sub_map = TRUE;
9351 			sub_start = VME_OFFSET(entry);
9352 
9353 			if (entry->vme_end < dst_end) {
9354 				sub_end = entry->vme_end;
9355 			} else {
9356 				sub_end = dst_end;
9357 			}
9358 			sub_end -= entry->vme_start;
9359 			sub_end += VME_OFFSET(entry);
9360 			local_end = entry->vme_end;
9361 			vm_map_unlock(dst_map);
9362 
9363 			result = vm_map_overwrite_submap_recurse(
9364 				VME_SUBMAP(entry),
9365 				sub_start,
9366 				sub_end - sub_start);
9367 
9368 			if (result != KERN_SUCCESS) {
9369 				return result;
9370 			}
9371 			if (dst_end <= entry->vme_end) {
9372 				return KERN_SUCCESS;
9373 			}
9374 			vm_map_lock(dst_map);
9375 			if (!vm_map_lookup_entry(dst_map, local_end,
9376 			    &tmp_entry)) {
9377 				vm_map_unlock(dst_map);
9378 				return KERN_INVALID_ADDRESS;
9379 			}
9380 			entry = tmp_entry;
9381 			next = entry->vme_next;
9382 		}
9383 
9384 		if (!(entry->protection & VM_PROT_WRITE)) {
9385 			vm_map_unlock(dst_map);
9386 			return KERN_PROTECTION_FAILURE;
9387 		}
9388 
9389 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9390 			vm_map_unlock(dst_map);
9391 			return KERN_PROTECTION_FAILURE;
9392 		}
9393 
9394 		/*
9395 		 *	If the entry is in transition, we must wait
9396 		 *	for it to exit that state.  Anything could happen
9397 		 *	when we unlock the map, so start over.
9398 		 */
9399 		if (entry->in_transition) {
9400 			/*
9401 			 * Say that we are waiting, and wait for entry.
9402 			 */
9403 			entry->needs_wakeup = TRUE;
9404 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9405 
9406 			goto start_pass_1;
9407 		}
9408 
9409 /*
9410  *		our range is contained completely within this map entry
9411  */
9412 		if (dst_end <= entry->vme_end) {
9413 			vm_map_unlock(dst_map);
9414 			return KERN_SUCCESS;
9415 		}
9416 /*
9417  *		check that range specified is contiguous region
9418  */
9419 		if ((next == vm_map_to_entry(dst_map)) ||
9420 		    (next->vme_start != entry->vme_end)) {
9421 			vm_map_unlock(dst_map);
9422 			return KERN_INVALID_ADDRESS;
9423 		}
9424 
9425 		/*
9426 		 *	Check for permanent objects in the destination.
9427 		 */
9428 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9429 		    ((!VME_OBJECT(entry)->internal) ||
9430 		    (VME_OBJECT(entry)->true_share))) {
9431 			if (encountered_sub_map) {
9432 				vm_map_unlock(dst_map);
9433 				return KERN_FAILURE;
9434 			}
9435 		}
9436 
9437 
9438 		entry = next;
9439 	}/* for */
9440 	vm_map_unlock(dst_map);
9441 	return KERN_SUCCESS;
9442 }
9443 
9444 /*
9445  *	Routine:	vm_map_copy_overwrite
9446  *
9447  *	Description:
9448  *		Copy the memory described by the map copy
9449  *		object (copy; returned by vm_map_copyin) onto
9450  *		the specified destination region (dst_map, dst_addr).
9451  *		The destination must be writeable.
9452  *
9453  *		Unlike vm_map_copyout, this routine actually
9454  *		writes over previously-mapped memory.  If the
9455  *		previous mapping was to a permanent (user-supplied)
9456  *		memory object, it is preserved.
9457  *
9458  *		The attributes (protection and inheritance) of the
9459  *		destination region are preserved.
9460  *
9461  *		If successful, consumes the copy object.
9462  *		Otherwise, the caller is responsible for it.
9463  *
9464  *	Implementation notes:
9465  *		To overwrite aligned temporary virtual memory, it is
9466  *		sufficient to remove the previous mapping and insert
9467  *		the new copy.  This replacement is done either on
9468  *		the whole region (if no permanent virtual memory
9469  *		objects are embedded in the destination region) or
9470  *		in individual map entries.
9471  *
9472  *		To overwrite permanent virtual memory, it is necessary
9473  *		to copy each page, as the external memory management
9474  *		interface currently does not provide any optimizations.
9475  *
9476  *		Unaligned memory also has to be copied.  It is possible
9477  *		to use 'vm_trickery' to copy the aligned data.  This is
9478  *		not done but not hard to implement.
9479  *
9480  *		Once a page of permanent memory has been overwritten,
9481  *		it is impossible to interrupt this function; otherwise,
9482  *		the call would be neither atomic nor location-independent.
9483  *		The kernel-state portion of a user thread must be
9484  *		interruptible.
9485  *
9486  *		It may be expensive to forward all requests that might
9487  *		overwrite permanent memory (vm_write, vm_copy) to
9488  *		uninterruptible kernel threads.  This routine may be
9489  *		called by interruptible threads; however, success is
9490  *		not guaranteed -- if the request cannot be performed
9491  *		atomically and interruptibly, an error indication is
9492  *		returned.
9493  *
9494  *		Callers of this function must call vm_map_copy_require on
9495  *		previously created vm_map_copy_t or pass a newly created
9496  *		one to ensure that it hasn't been forged.
9497  */
9498 static kern_return_t
9499 vm_map_copy_overwrite_nested(
9500 	vm_map_t                dst_map,
9501 	vm_map_address_t        dst_addr,
9502 	vm_map_copy_t           copy,
9503 	boolean_t               interruptible,
9504 	pmap_t                  pmap,
9505 	boolean_t               discard_on_success)
9506 {
9507 	vm_map_offset_t         dst_end;
9508 	vm_map_entry_t          tmp_entry;
9509 	vm_map_entry_t          entry;
9510 	kern_return_t           kr;
9511 	boolean_t               aligned = TRUE;
9512 	boolean_t               contains_permanent_objects = FALSE;
9513 	boolean_t               encountered_sub_map = FALSE;
9514 	vm_map_offset_t         base_addr;
9515 	vm_map_size_t           copy_size;
9516 	vm_map_size_t           total_size;
9517 	uint16_t                copy_page_shift;
9518 
9519 	/*
9520 	 *	Check for special kernel buffer allocated
9521 	 *	by new_ipc_kmsg_copyin.
9522 	 */
9523 
9524 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9525 		kr = vm_map_copyout_kernel_buffer(
9526 			dst_map, &dst_addr,
9527 			copy, copy->size, TRUE, discard_on_success);
9528 		return kr;
9529 	}
9530 
9531 	/*
9532 	 *      Only works for entry lists at the moment.  Will
9533 	 *	support page lists later.
9534 	 */
9535 
9536 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9537 
9538 	if (copy->size == 0) {
9539 		if (discard_on_success) {
9540 			vm_map_copy_discard(copy);
9541 		}
9542 		return KERN_SUCCESS;
9543 	}
9544 
9545 	copy_page_shift = copy->cpy_hdr.page_shift;
9546 
9547 	/*
9548 	 *	Verify that the destination is all writeable
9549 	 *	initially.  We have to trunc the destination
9550 	 *	address and round the copy size or we'll end up
9551 	 *	splitting entries in strange ways.
9552 	 */
9553 
9554 	if (!VM_MAP_PAGE_ALIGNED(copy->size,
9555 	    VM_MAP_PAGE_MASK(dst_map)) ||
9556 	    !VM_MAP_PAGE_ALIGNED(copy->offset,
9557 	    VM_MAP_PAGE_MASK(dst_map)) ||
9558 	    !VM_MAP_PAGE_ALIGNED(dst_addr,
9559 	    VM_MAP_PAGE_MASK(dst_map)) ||
9560 	    copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9561 		aligned = FALSE;
9562 		dst_end = vm_map_round_page(dst_addr + copy->size,
9563 		    VM_MAP_PAGE_MASK(dst_map));
9564 	} else {
9565 		dst_end = dst_addr + copy->size;
9566 	}
9567 
9568 	vm_map_lock(dst_map);
9569 
9570 	/* LP64todo - remove this check when vm_map_commpage64()
9571 	 * no longer has to stuff in a map_entry for the commpage
9572 	 * above the map's max_offset.
9573 	 */
9574 	if (dst_addr >= dst_map->max_offset) {
9575 		vm_map_unlock(dst_map);
9576 		return KERN_INVALID_ADDRESS;
9577 	}
9578 
9579 start_pass_1:
9580 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9581 		vm_map_unlock(dst_map);
9582 		return KERN_INVALID_ADDRESS;
9583 	}
9584 	vm_map_clip_start(dst_map,
9585 	    tmp_entry,
9586 	    vm_map_trunc_page(dst_addr,
9587 	    VM_MAP_PAGE_MASK(dst_map)));
9588 	for (entry = tmp_entry;;) {
9589 		vm_map_entry_t  next = entry->vme_next;
9590 
9591 		while (entry->is_sub_map) {
9592 			vm_map_offset_t sub_start;
9593 			vm_map_offset_t sub_end;
9594 			vm_map_offset_t local_end;
9595 
9596 			if (entry->in_transition) {
9597 				/*
9598 				 * Say that we are waiting, and wait for entry.
9599 				 */
9600 				entry->needs_wakeup = TRUE;
9601 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9602 
9603 				goto start_pass_1;
9604 			}
9605 
9606 			local_end = entry->vme_end;
9607 			if (!(entry->needs_copy)) {
9608 				/* if needs_copy we are a COW submap */
9609 				/* in such a case we just replace so */
9610 				/* in such a case we just replace, so */
9611 				/* there is no need for the following */
9612 				/* check.                             */
9613 				sub_start = VME_OFFSET(entry);
9614 
9615 				if (entry->vme_end < dst_end) {
9616 					sub_end = entry->vme_end;
9617 				} else {
9618 					sub_end = dst_end;
9619 				}
9620 				sub_end -= entry->vme_start;
9621 				sub_end += VME_OFFSET(entry);
9622 				vm_map_unlock(dst_map);
9623 
9624 				kr = vm_map_overwrite_submap_recurse(
9625 					VME_SUBMAP(entry),
9626 					sub_start,
9627 					sub_end - sub_start);
9628 				if (kr != KERN_SUCCESS) {
9629 					return kr;
9630 				}
9631 				vm_map_lock(dst_map);
9632 			}
9633 
9634 			if (dst_end <= entry->vme_end) {
9635 				goto start_overwrite;
9636 			}
9637 			if (!vm_map_lookup_entry(dst_map, local_end,
9638 			    &entry)) {
9639 				vm_map_unlock(dst_map);
9640 				return KERN_INVALID_ADDRESS;
9641 			}
9642 			next = entry->vme_next;
9643 		}
9644 
9645 		if (!(entry->protection & VM_PROT_WRITE)) {
9646 			vm_map_unlock(dst_map);
9647 			return KERN_PROTECTION_FAILURE;
9648 		}
9649 
9650 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9651 			vm_map_unlock(dst_map);
9652 			return KERN_PROTECTION_FAILURE;
9653 		}
9654 
9655 		/*
9656 		 *	If the entry is in transition, we must wait
9657 		 *	for it to exit that state.  Anything could happen
9658 		 *	when we unlock the map, so start over.
9659 		 */
9660 		if (entry->in_transition) {
9661 			/*
9662 			 * Say that we are waiting, and wait for entry.
9663 			 */
9664 			entry->needs_wakeup = TRUE;
9665 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9666 
9667 			goto start_pass_1;
9668 		}
9669 
9670 /*
9671  *		our range is contained completely within this map entry
9672  */
9673 		if (dst_end <= entry->vme_end) {
9674 			break;
9675 		}
9676 /*
9677  *		check that range specified is contiguous region
9678  */
9679 		if ((next == vm_map_to_entry(dst_map)) ||
9680 		    (next->vme_start != entry->vme_end)) {
9681 			vm_map_unlock(dst_map);
9682 			return KERN_INVALID_ADDRESS;
9683 		}
9684 
9685 
9686 		/*
9687 		 *	Check for permanent objects in the destination.
9688 		 */
9689 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9690 		    ((!VME_OBJECT(entry)->internal) ||
9691 		    (VME_OBJECT(entry)->true_share))) {
9692 			contains_permanent_objects = TRUE;
9693 		}
9694 
9695 		entry = next;
9696 	}/* for */
9697 
9698 start_overwrite:
9699 	/*
9700 	 *	If there are permanent objects in the destination, then
9701 	 *	the copy cannot be interrupted.
9702 	 */
9703 
9704 	if (interruptible && contains_permanent_objects) {
9705 		vm_map_unlock(dst_map);
9706 		return KERN_FAILURE;   /* XXX */
9707 	}
9708 
9709 	/*
9710 	 *
9711 	 *	Make a second pass, overwriting the data
9712 	 *	At the beginning of each loop iteration,
9713 	 *	the next entry to be overwritten is "tmp_entry"
9714 	 *	(initially, the value returned from the lookup above),
9715 	 *	and the starting address expected in that entry
9716 	 *	is "start".
9717 	 */
9718 
9719 	total_size = copy->size;
9720 	if (encountered_sub_map) {
9721 		copy_size = 0;
9722 		/* re-calculate tmp_entry since we've had the map */
9723 		/* unlocked */
9724 		if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9725 			vm_map_unlock(dst_map);
9726 			return KERN_INVALID_ADDRESS;
9727 		}
9728 	} else {
9729 		copy_size = copy->size;
9730 	}
9731 
9732 	base_addr = dst_addr;
9733 	while (TRUE) {
9734 		/* deconstruct the copy object and do it in parts */
9735 		/* only in the sub_map, interruptible case */
9736 		vm_map_entry_t  copy_entry;
9737 		vm_map_entry_t  previous_prev = VM_MAP_ENTRY_NULL;
9738 		vm_map_entry_t  next_copy = VM_MAP_ENTRY_NULL;
9739 		int             nentries;
9740 		int             remaining_entries = 0;
9741 		vm_map_offset_t new_offset = 0;
9742 
9743 		for (entry = tmp_entry; copy_size == 0;) {
9744 			vm_map_entry_t  next;
9745 
9746 			next = entry->vme_next;
9747 
9748 			/* tmp_entry and base address are moved along */
9749 			/* each time we encounter a sub-map.  Otherwise */
9750 			/* entry can outpace tmp_entry, and the copy_size */
9751 			/* may reflect the distance between them */
9752 			/* if the current entry is found to be in transition */
9753 			/* we will start over at the beginning or the last */
9754 			/* encounter of a submap as dictated by base_addr */
9755 			/* we will zero copy_size accordingly. */
9756 			if (entry->in_transition) {
9757 				/*
9758 				 * Say that we are waiting, and wait for entry.
9759 				 */
9760 				entry->needs_wakeup = TRUE;
9761 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9762 
9763 				if (!vm_map_lookup_entry(dst_map, base_addr,
9764 				    &tmp_entry)) {
9765 					vm_map_unlock(dst_map);
9766 					return KERN_INVALID_ADDRESS;
9767 				}
9768 				copy_size = 0;
9769 				entry = tmp_entry;
9770 				continue;
9771 			}
9772 			if (entry->is_sub_map) {
9773 				vm_map_offset_t sub_start;
9774 				vm_map_offset_t sub_end;
9775 				vm_map_offset_t local_end;
9776 
9777 				if (entry->needs_copy) {
9778 					/* if this is a COW submap */
9779 					/* just back the range with an */
9780 					/* anonymous entry */
9781 					assert(!entry->vme_permanent);
9782 					if (entry->vme_end < dst_end) {
9783 						sub_end = entry->vme_end;
9784 					} else {
9785 						sub_end = dst_end;
9786 					}
9787 					if (entry->vme_start < base_addr) {
9788 						sub_start = base_addr;
9789 					} else {
9790 						sub_start = entry->vme_start;
9791 					}
9792 					vm_map_clip_end(
9793 						dst_map, entry, sub_end);
9794 					vm_map_clip_start(
9795 						dst_map, entry, sub_start);
9796 					assert(!entry->use_pmap);
9797 					assert(!entry->iokit_acct);
9798 					entry->use_pmap = TRUE;
9799 					vm_map_deallocate(VME_SUBMAP(entry));
9800 					assert(!entry->vme_permanent);
9801 					VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9802 					VME_OFFSET_SET(entry, 0);
9803 					entry->is_shared = FALSE;
9804 					entry->needs_copy = FALSE;
9805 					entry->protection = VM_PROT_DEFAULT;
9806 					entry->max_protection = VM_PROT_ALL;
9807 					entry->wired_count = 0;
9808 					entry->user_wired_count = 0;
9809 					if (entry->inheritance
9810 					    == VM_INHERIT_SHARE) {
9811 						entry->inheritance = VM_INHERIT_COPY;
9812 					}
9813 					continue;
9814 				}
9815 				/* first take care of any non-sub_map */
9816 				/* entries to send */
9817 				if (base_addr < entry->vme_start) {
9818 					/* stuff to send */
9819 					copy_size =
9820 					    entry->vme_start - base_addr;
9821 					break;
9822 				}
9823 				sub_start = VME_OFFSET(entry);
9824 
9825 				if (entry->vme_end < dst_end) {
9826 					sub_end = entry->vme_end;
9827 				} else {
9828 					sub_end = dst_end;
9829 				}
9830 				sub_end -= entry->vme_start;
9831 				sub_end += VME_OFFSET(entry);
9832 				local_end = entry->vme_end;
9833 				vm_map_unlock(dst_map);
9834 				copy_size = sub_end - sub_start;
9835 
9836 				/* adjust the copy object */
9837 				if (total_size > copy_size) {
9838 					vm_map_size_t   local_size = 0;
9839 					vm_map_size_t   entry_size;
9840 
9841 					nentries = 1;
9842 					new_offset = copy->offset;
9843 					copy_entry = vm_map_copy_first_entry(copy);
9844 					while (copy_entry !=
9845 					    vm_map_copy_to_entry(copy)) {
9846 						entry_size = copy_entry->vme_end -
9847 						    copy_entry->vme_start;
9848 						if ((local_size < copy_size) &&
9849 						    ((local_size + entry_size)
9850 						    >= copy_size)) {
9851 							vm_map_copy_clip_end(copy,
9852 							    copy_entry,
9853 							    copy_entry->vme_start +
9854 							    (copy_size - local_size));
9855 							entry_size = copy_entry->vme_end -
9856 							    copy_entry->vme_start;
9857 							local_size += entry_size;
9858 							new_offset += entry_size;
9859 						}
9860 						if (local_size >= copy_size) {
9861 							next_copy = copy_entry->vme_next;
9862 							copy_entry->vme_next =
9863 							    vm_map_copy_to_entry(copy);
9864 							previous_prev =
9865 							    copy->cpy_hdr.links.prev;
9866 							copy->cpy_hdr.links.prev = copy_entry;
9867 							copy->size = copy_size;
9868 							remaining_entries =
9869 							    copy->cpy_hdr.nentries;
9870 							remaining_entries -= nentries;
9871 							copy->cpy_hdr.nentries = nentries;
9872 							break;
9873 						} else {
9874 							local_size += entry_size;
9875 							new_offset += entry_size;
9876 							nentries++;
9877 						}
9878 						copy_entry = copy_entry->vme_next;
9879 					}
9880 				}
9881 
9882 				if ((entry->use_pmap) && (pmap == NULL)) {
9883 					kr = vm_map_copy_overwrite_nested(
9884 						VME_SUBMAP(entry),
9885 						sub_start,
9886 						copy,
9887 						interruptible,
9888 						VME_SUBMAP(entry)->pmap,
9889 						TRUE);
9890 				} else if (pmap != NULL) {
9891 					kr = vm_map_copy_overwrite_nested(
9892 						VME_SUBMAP(entry),
9893 						sub_start,
9894 						copy,
9895 						interruptible, pmap,
9896 						TRUE);
9897 				} else {
9898 					kr = vm_map_copy_overwrite_nested(
9899 						VME_SUBMAP(entry),
9900 						sub_start,
9901 						copy,
9902 						interruptible,
9903 						dst_map->pmap,
9904 						TRUE);
9905 				}
9906 				if (kr != KERN_SUCCESS) {
9907 					if (next_copy != NULL) {
9908 						copy->cpy_hdr.nentries +=
9909 						    remaining_entries;
9910 						copy->cpy_hdr.links.prev->vme_next =
9911 						    next_copy;
9912 						copy->cpy_hdr.links.prev
9913 						        = previous_prev;
9914 						copy->size = total_size;
9915 					}
9916 					return kr;
9917 				}
9918 				if (dst_end <= local_end) {
9919 					return KERN_SUCCESS;
9920 				}
9921 				/* otherwise copy no longer exists, it was */
9922 				/* destroyed after successful copy_overwrite */
9923 				copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9924 				copy->offset = new_offset;
9925 				copy->cpy_hdr.page_shift = copy_page_shift;
9926 
9927 				total_size -= copy_size;
9928 				copy_size = 0;
9929 				/* put back remainder of copy in container */
9930 				if (next_copy != NULL) {
9931 					copy->cpy_hdr.nentries = remaining_entries;
9932 					copy->cpy_hdr.links.next = next_copy;
9933 					copy->cpy_hdr.links.prev = previous_prev;
9934 					copy->size = total_size;
9935 					next_copy->vme_prev =
9936 					    vm_map_copy_to_entry(copy);
9937 					next_copy = NULL;
9938 				}
9939 				base_addr = local_end;
9940 				vm_map_lock(dst_map);
9941 				if (!vm_map_lookup_entry(dst_map,
9942 				    local_end, &tmp_entry)) {
9943 					vm_map_unlock(dst_map);
9944 					return KERN_INVALID_ADDRESS;
9945 				}
9946 				entry = tmp_entry;
9947 				continue;
9948 			}
9949 			if (dst_end <= entry->vme_end) {
9950 				copy_size = dst_end - base_addr;
9951 				break;
9952 			}
9953 
9954 			if ((next == vm_map_to_entry(dst_map)) ||
9955 			    (next->vme_start != entry->vme_end)) {
9956 				vm_map_unlock(dst_map);
9957 				return KERN_INVALID_ADDRESS;
9958 			}
9959 
9960 			entry = next;
9961 		}/* for */
9962 
9963 		next_copy = NULL;
9964 		nentries = 1;
9965 
9966 		/* adjust the copy object */
9967 		if (total_size > copy_size) {
9968 			vm_map_size_t   local_size = 0;
9969 			vm_map_size_t   entry_size;
9970 
9971 			new_offset = copy->offset;
9972 			copy_entry = vm_map_copy_first_entry(copy);
9973 			while (copy_entry != vm_map_copy_to_entry(copy)) {
9974 				entry_size = copy_entry->vme_end -
9975 				    copy_entry->vme_start;
9976 				if ((local_size < copy_size) &&
9977 				    ((local_size + entry_size)
9978 				    >= copy_size)) {
9979 					vm_map_copy_clip_end(copy, copy_entry,
9980 					    copy_entry->vme_start +
9981 					    (copy_size - local_size));
9982 					entry_size = copy_entry->vme_end -
9983 					    copy_entry->vme_start;
9984 					local_size += entry_size;
9985 					new_offset += entry_size;
9986 				}
9987 				if (local_size >= copy_size) {
9988 					next_copy = copy_entry->vme_next;
9989 					copy_entry->vme_next =
9990 					    vm_map_copy_to_entry(copy);
9991 					previous_prev =
9992 					    copy->cpy_hdr.links.prev;
9993 					copy->cpy_hdr.links.prev = copy_entry;
9994 					copy->size = copy_size;
9995 					remaining_entries =
9996 					    copy->cpy_hdr.nentries;
9997 					remaining_entries -= nentries;
9998 					copy->cpy_hdr.nentries = nentries;
9999 					break;
10000 				} else {
10001 					local_size += entry_size;
10002 					new_offset += entry_size;
10003 					nentries++;
10004 				}
10005 				copy_entry = copy_entry->vme_next;
10006 			}
10007 		}
10008 
10009 		if (aligned) {
10010 			pmap_t  local_pmap;
10011 
10012 			if (pmap) {
10013 				local_pmap = pmap;
10014 			} else {
10015 				local_pmap = dst_map->pmap;
10016 			}
10017 
10018 			if ((kr =  vm_map_copy_overwrite_aligned(
10019 				    dst_map, tmp_entry, copy,
10020 				    base_addr, local_pmap)) != KERN_SUCCESS) {
10021 				if (next_copy != NULL) {
10022 					copy->cpy_hdr.nentries +=
10023 					    remaining_entries;
10024 					copy->cpy_hdr.links.prev->vme_next =
10025 					    next_copy;
10026 					copy->cpy_hdr.links.prev =
10027 					    previous_prev;
10028 					copy->size += copy_size;
10029 				}
10030 				return kr;
10031 			}
10032 			vm_map_unlock(dst_map);
10033 		} else {
10034 			/*
10035 			 * Performance gain:
10036 			 *
10037 			 * if the copy and dst address are misaligned but the same
10038 			 * offset within the page we can copy_not_aligned the
10039 			 * misaligned parts and copy aligned the rest.  If they are
10040 			 * aligned but len is unaligned we simply need to copy
10041 			 * the end bit unaligned.  We'll need to split the misaligned
10042 			 * bits of the region in this case!
10043 			 */
10044 			/* ALWAYS UNLOCKS THE dst_map MAP */
10045 			kr = vm_map_copy_overwrite_unaligned(
10046 				dst_map,
10047 				tmp_entry,
10048 				copy,
10049 				base_addr,
10050 				discard_on_success);
10051 			if (kr != KERN_SUCCESS) {
10052 				if (next_copy != NULL) {
10053 					copy->cpy_hdr.nentries +=
10054 					    remaining_entries;
10055 					copy->cpy_hdr.links.prev->vme_next =
10056 					    next_copy;
10057 					copy->cpy_hdr.links.prev =
10058 					    previous_prev;
10059 					copy->size += copy_size;
10060 				}
10061 				return kr;
10062 			}
10063 		}
10064 		total_size -= copy_size;
10065 		if (total_size == 0) {
10066 			break;
10067 		}
10068 		base_addr += copy_size;
10069 		copy_size = 0;
10070 		copy->offset = new_offset;
10071 		if (next_copy != NULL) {
10072 			copy->cpy_hdr.nentries = remaining_entries;
10073 			copy->cpy_hdr.links.next = next_copy;
10074 			copy->cpy_hdr.links.prev = previous_prev;
10075 			next_copy->vme_prev = vm_map_copy_to_entry(copy);
10076 			copy->size = total_size;
10077 		}
10078 		vm_map_lock(dst_map);
10079 		while (TRUE) {
10080 			if (!vm_map_lookup_entry(dst_map,
10081 			    base_addr, &tmp_entry)) {
10082 				vm_map_unlock(dst_map);
10083 				return KERN_INVALID_ADDRESS;
10084 			}
10085 			if (tmp_entry->in_transition) {
10086 				entry->needs_wakeup = TRUE;
10087 				vm_map_entry_wait(dst_map, THREAD_UNINT);
10088 			} else {
10089 				break;
10090 			}
10091 		}
10092 		vm_map_clip_start(dst_map,
10093 		    tmp_entry,
10094 		    vm_map_trunc_page(base_addr,
10095 		    VM_MAP_PAGE_MASK(dst_map)));
10096 
10097 		entry = tmp_entry;
10098 	} /* while */
10099 
10100 	/*
10101 	 *	Throw away the vm_map_copy object
10102 	 */
10103 	if (discard_on_success) {
10104 		vm_map_copy_discard(copy);
10105 	}
10106 
10107 	return KERN_SUCCESS;
10108 }/* vm_map_copy_overwrite */
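/*
 * Illustrative sketch (not part of the original source): the exported
 * vm_map_copy_overwrite() below is normally paired with a prior
 * copyin, and the copy object is only consumed on success:
 *
 *	vm_map_copy_t copy;
 *
 *	kr = vm_map_copyin(src_map, src_addr, len, FALSE, &copy);
 *	if (kr == KERN_SUCCESS) {
 *		kr = vm_map_copy_overwrite(dst_map, dst_addr, copy, len, FALSE);
 *		if (kr != KERN_SUCCESS) {
 *			vm_map_copy_discard(copy);	// not consumed on failure
 *		}
 *	}
 *
 * Argument values are placeholders; vm_map_copyin()'s signature here is
 * assumed from the references to it in the comments above.
 */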
10109 
10110 kern_return_t
10111 vm_map_copy_overwrite(
10112 	vm_map_t        dst_map,
10113 	vm_map_offset_t dst_addr,
10114 	vm_map_copy_t   copy,
10115 	vm_map_size_t   copy_size,
10116 	boolean_t       interruptible)
10117 {
10118 	vm_map_size_t   head_size, tail_size;
10119 	vm_map_copy_t   head_copy, tail_copy;
10120 	vm_map_offset_t head_addr, tail_addr;
10121 	vm_map_entry_t  entry;
10122 	kern_return_t   kr;
10123 	vm_map_offset_t effective_page_mask, effective_page_size;
10124 	uint16_t        copy_page_shift;
10125 
10126 	head_size = 0;
10127 	tail_size = 0;
10128 	head_copy = NULL;
10129 	tail_copy = NULL;
10130 	head_addr = 0;
10131 	tail_addr = 0;
10132 
10133 	/*
10134 	 *	Check for null copy object.
10135 	 */
10136 	if (copy == VM_MAP_COPY_NULL) {
10137 		return KERN_SUCCESS;
10138 	}
10139 
10140 	if (__improbable(vm_map_range_overflows(dst_map, dst_addr, copy_size))) {
10141 		return KERN_INVALID_ADDRESS;
10142 	}
10143 
10144 	/*
10145 	 * Assert that the vm_map_copy is coming from the right
10146 	 * zone and hasn't been forged
10147 	 */
10148 	vm_map_copy_require(copy);
10149 
10150 	if (interruptible ||
10151 	    copy->type != VM_MAP_COPY_ENTRY_LIST) {
10152 		/*
10153 		 * We can't split the "copy" map if we're interruptible
10154 		 * or if we don't have a "copy" map...
10155 		 */
10156 blunt_copy:
10157 		kr = vm_map_copy_overwrite_nested(dst_map,
10158 		    dst_addr,
10159 		    copy,
10160 		    interruptible,
10161 		    (pmap_t) NULL,
10162 		    TRUE);
10163 		if (kr) {
10164 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), kr /* arg */);
10165 		}
10166 		return kr;
10167 	}
10168 
10169 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
10170 	if (copy_page_shift < PAGE_SHIFT ||
10171 	    VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10172 		goto blunt_copy;
10173 	}
10174 
10175 	if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10176 		effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
10177 	} else {
10178 		effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
10179 		effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
10180 		    effective_page_mask);
10181 	}
10182 	effective_page_size = effective_page_mask + 1;
10183 
10184 	if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
10185 		/*
10186 		 * Too small to bother with optimizing...
10187 		 */
10188 		goto blunt_copy;
10189 	}
10190 
10191 	if ((dst_addr & effective_page_mask) !=
10192 	    (copy->offset & effective_page_mask)) {
10193 		/*
10194 		 * Incompatible mis-alignment of source and destination...
10195 		 */
10196 		goto blunt_copy;
10197 	}
10198 
10199 	/*
10200 	 * Proper alignment or identical mis-alignment at the beginning.
10201 	 * Let's try and do a small unaligned copy first (if needed)
10202 	 * and then an aligned copy for the rest.
10203 	 */
10204 	if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
10205 		head_addr = dst_addr;
10206 		head_size = (effective_page_size -
10207 		    (copy->offset & effective_page_mask));
10208 		head_size = MIN(head_size, copy_size);
10209 	}
10210 	if (!vm_map_page_aligned(copy->offset + copy_size,
10211 	    effective_page_mask)) {
10212 		/*
10213 		 * Mis-alignment at the end.
10214 		 * Do an aligned copy up to the last page and
10215 		 * then an unaligned copy for the remaining bytes.
10216 		 */
10217 		tail_size = ((copy->offset + copy_size) &
10218 		    effective_page_mask);
10219 		tail_size = MIN(tail_size, copy_size);
10220 		tail_addr = dst_addr + copy_size - tail_size;
10221 		assert(tail_addr >= head_addr + head_size);
10222 	}
10223 	assert(head_size + tail_size <= copy_size);
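	/*
	 * Worked example (illustrative comment, not in the original):
	 * with a 16K effective page size (mask 0x3fff),
	 * dst_addr = 0x10100, copy->offset = 0x4100, copy_size = 0x9000:
	 *   head_addr = 0x10100, head_size = 0x4000 - 0x100 = 0x3f00
	 *   tail_size = (0x4100 + 0x9000) & 0x3fff = 0x1100
	 *   tail_addr = 0x10100 + 0x9000 - 0x1100 = 0x18000
	 * leaving the middle 0x4000 bytes to be copied page-aligned.
	 */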
10224 
10225 	if (head_size + tail_size == copy_size) {
10226 		/*
10227 		 * It's all unaligned, no optimization possible...
10228 		 */
10229 		goto blunt_copy;
10230 	}
10231 
10232 	/*
10233 	 * Can't optimize if there are any submaps in the
10234 	 * destination due to the way we free the "copy" map
10235 	 * progressively in vm_map_copy_overwrite_nested()
10236 	 * in that case.
10237 	 */
10238 	vm_map_lock_read(dst_map);
10239 	if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
10240 		vm_map_unlock_read(dst_map);
10241 		goto blunt_copy;
10242 	}
10243 	for (;
10244 	    (entry != vm_map_to_entry(dst_map) &&
10245 	    entry->vme_start < dst_addr + copy_size);
10246 	    entry = entry->vme_next) {
10247 		if (entry->is_sub_map) {
10248 			vm_map_unlock_read(dst_map);
10249 			goto blunt_copy;
10250 		}
10251 	}
10252 	vm_map_unlock_read(dst_map);
10253 
10254 	if (head_size) {
10255 		/*
10256 		 * Unaligned copy of the first "head_size" bytes, to reach
10257 		 * a page boundary.
10258 		 */
10259 
10260 		/*
10261 		 * Extract "head_copy" out of "copy".
10262 		 */
10263 		head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10264 		head_copy->cpy_hdr.entries_pageable =
10265 		    copy->cpy_hdr.entries_pageable;
10266 		head_copy->cpy_hdr.page_shift = copy_page_shift;
10267 
10268 		entry = vm_map_copy_first_entry(copy);
10269 		if (entry->vme_end < copy->offset + head_size) {
10270 			head_size = entry->vme_end - copy->offset;
10271 		}
10272 
10273 		head_copy->offset = copy->offset;
10274 		head_copy->size = head_size;
10275 		copy->offset += head_size;
10276 		copy->size -= head_size;
10277 		copy_size -= head_size;
10278 		assert(copy_size > 0);
10279 
10280 		vm_map_copy_clip_end(copy, entry, copy->offset);
10281 		vm_map_copy_entry_unlink(copy, entry);
10282 		vm_map_copy_entry_link(head_copy,
10283 		    vm_map_copy_to_entry(head_copy),
10284 		    entry);
10285 
10286 		/*
10287 		 * Do the unaligned copy.
10288 		 */
10289 		kr = vm_map_copy_overwrite_nested(dst_map,
10290 		    head_addr,
10291 		    head_copy,
10292 		    interruptible,
10293 		    (pmap_t) NULL,
10294 		    FALSE);
10295 		if (kr != KERN_SUCCESS) {
10296 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_HEAD_NESTED_ERROR), kr /* arg */);
10297 			goto done;
10298 		}
10299 	}
10300 
10301 	if (tail_size) {
10302 		/*
10303 		 * Extract "tail_copy" out of "copy".
10304 		 */
10305 		tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10306 		tail_copy->cpy_hdr.entries_pageable =
10307 		    copy->cpy_hdr.entries_pageable;
10308 		tail_copy->cpy_hdr.page_shift = copy_page_shift;
10309 
10310 		tail_copy->offset = copy->offset + copy_size - tail_size;
10311 		tail_copy->size = tail_size;
10312 
10313 		copy->size -= tail_size;
10314 		copy_size -= tail_size;
10315 		assert(copy_size > 0);
10316 
10317 		entry = vm_map_copy_last_entry(copy);
10318 		vm_map_copy_clip_start(copy, entry, tail_copy->offset);
10319 		entry = vm_map_copy_last_entry(copy);
10320 		vm_map_copy_entry_unlink(copy, entry);
10321 		vm_map_copy_entry_link(tail_copy,
10322 		    vm_map_copy_last_entry(tail_copy),
10323 		    entry);
10324 	}
10325 
10326 	/*
10327 	 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
10328 	 * we want to avoid TOCTOU issues w.r.t copy->size but
10329 	 * we don't need to change vm_map_copy_overwrite_nested()
10330 	 * and all other vm_map_copy_overwrite variants.
10331 	 *
10332 	 * So we assign the original copy_size that was passed into
10333 	 * this routine back to copy.
10334 	 *
10335 	 * This use of local 'copy_size' passed into this routine is
10336 	 * to try and protect against TOCTOU attacks where the kernel
10337 	 * has been exploited. We don't expect this to be an issue
10338 	 * during normal system operation.
10339 	 */
10340 	assertf(copy->size == copy_size,
10341 	    "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
10342 	copy->size = copy_size;
10343 
10344 	/*
10345 	 * Copy most (or possibly all) of the data.
10346 	 */
10347 	kr = vm_map_copy_overwrite_nested(dst_map,
10348 	    dst_addr + head_size,
10349 	    copy,
10350 	    interruptible,
10351 	    (pmap_t) NULL,
10352 	    FALSE);
10353 	if (kr != KERN_SUCCESS) {
10354 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_NESTED_ERROR), kr /* arg */);
10355 		goto done;
10356 	}
10357 
10358 	if (tail_size) {
10359 		kr = vm_map_copy_overwrite_nested(dst_map,
10360 		    tail_addr,
10361 		    tail_copy,
10362 		    interruptible,
10363 		    (pmap_t) NULL,
10364 		    FALSE);
10365 		if (kr) {
10366 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_TAIL_NESTED_ERROR), kr /* arg */);
10367 		}
10368 	}
10369 
10370 done:
10371 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
10372 	if (kr == KERN_SUCCESS) {
10373 		/*
10374 		 * Discard all the copy maps.
10375 		 */
10376 		if (head_copy) {
10377 			vm_map_copy_discard(head_copy);
10378 			head_copy = NULL;
10379 		}
10380 		vm_map_copy_discard(copy);
10381 		if (tail_copy) {
10382 			vm_map_copy_discard(tail_copy);
10383 			tail_copy = NULL;
10384 		}
10385 	} else {
10386 		/*
10387 		 * Re-assemble the original copy map.
10388 		 */
10389 		if (head_copy) {
10390 			entry = vm_map_copy_first_entry(head_copy);
10391 			vm_map_copy_entry_unlink(head_copy, entry);
10392 			vm_map_copy_entry_link(copy,
10393 			    vm_map_copy_to_entry(copy),
10394 			    entry);
10395 			copy->offset -= head_size;
10396 			copy->size += head_size;
10397 			vm_map_copy_discard(head_copy);
10398 			head_copy = NULL;
10399 		}
10400 		if (tail_copy) {
10401 			entry = vm_map_copy_last_entry(tail_copy);
10402 			vm_map_copy_entry_unlink(tail_copy, entry);
10403 			vm_map_copy_entry_link(copy,
10404 			    vm_map_copy_last_entry(copy),
10405 			    entry);
10406 			copy->size += tail_size;
10407 			vm_map_copy_discard(tail_copy);
10408 			tail_copy = NULL;
10409 		}
10410 	}
10411 	return kr;
10412 }
10413 
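/*
 * Editorial sketch (not part of the original source): a worked example of
 * the head/middle/tail split performed by the routine above when source and
 * destination share the same sub-page misalignment.  The helper name and the
 * numbers are hypothetical.
 */
#if 0   /* illustrative only, never compiled */
static void
vm_map_copy_overwrite_split_sketch(void)
{
	vm_map_offset_t page_mask   = 0x3fff;       /* 16KB effective pages */
	vm_map_offset_t copy_offset = 0x1800;       /* copy->offset, mid-page */
	vm_map_size_t   copy_size   = 0x20000;      /* 128KB request */
	vm_map_size_t   head_size, tail_size;

	/* unaligned head: bytes up to the next page boundary (clamped to copy_size) */
	head_size = (page_mask + 1) - (copy_offset & page_mask);        /* 0x2800 */
	/* unaligned tail: bytes past the last full page boundary */
	tail_size = (copy_offset + copy_size) & page_mask;              /* 0x1800 */
	/* aligned middle is handled by the page-aligned path */
	assert(copy_size - head_size - tail_size == 0x1c000);           /* seven 16KB pages */
}
#endif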
10414 
10415 /*
10416  *	Routine: vm_map_copy_overwrite_unaligned	[internal use only]
10417  *
10418  *	Description:
10419  *	Physically copy unaligned data
10420  *
10421  *	Implementation:
10422  *	Unaligned parts of pages have to be physically copied.  We use
10423  *	a modified form of vm_fault_copy (which understands none-aligned
10424  *	a modified form of vm_fault_copy (which understands non-aligned
10425  *	page offsets and sizes) to do the copy.  We attempt to copy as
10426  *	much memory in one go as possible; however, vm_fault_copy copies
10427  *	within one memory object, so we have to find the smallest of "amount
10428  *	left", "source object data size" and "target object data size".  With
10429  *	unaligned data we don't need to split regions, therefore the source
10430  *	(copy) object should be one map entry; the target range, however, may
10431  *	be split over multiple map entries.  In any event we are pessimistic
10432  *	about these assumptions.
10433  *	Callers of this function must call vm_map_copy_require on
10434  *	previously created vm_map_copy_t or pass a newly created
10435  *	one to ensure that it hasn't been forged.
10436  *
10437  *	Assumptions:
10438  *	dst_map is locked on entry and is return locked on success,
10439  *	unlocked on error.
10440  */
10441 
10442 static kern_return_t
10443 vm_map_copy_overwrite_unaligned(
10444 	vm_map_t        dst_map,
10445 	vm_map_entry_t  entry,
10446 	vm_map_copy_t   copy,
10447 	vm_map_offset_t start,
10448 	boolean_t       discard_on_success)
10449 {
10450 	vm_map_entry_t          copy_entry;
10451 	vm_map_entry_t          copy_entry_next;
10452 	vm_map_version_t        version;
10453 	vm_object_t             dst_object;
10454 	vm_object_offset_t      dst_offset;
10455 	vm_object_offset_t      src_offset;
10456 	vm_object_offset_t      entry_offset;
10457 	vm_map_offset_t         entry_end;
10458 	vm_map_size_t           src_size,
10459 	    dst_size,
10460 	    copy_size,
10461 	    amount_left;
10462 	kern_return_t           kr = KERN_SUCCESS;
10463 
10464 
10465 	copy_entry = vm_map_copy_first_entry(copy);
10466 
10467 	vm_map_lock_write_to_read(dst_map);
10468 
10469 	src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
10470 	amount_left = copy->size;
10471 /*
10472  *	unaligned so we never clipped this entry, we need the offset into
10473  *	unaligned, so we never clipped this entry; we need the offset into
10474  *	the vm_object, not just the data.
10475 	while (amount_left > 0) {
10476 		if (entry == vm_map_to_entry(dst_map)) {
10477 			vm_map_unlock_read(dst_map);
10478 			return KERN_INVALID_ADDRESS;
10479 		}
10480 
10481 		/* "start" must be within the current map entry */
10482 		assert((start >= entry->vme_start) && (start < entry->vme_end));
10483 
10484 		/*
10485 		 *	Check protection again
10486 		 */
10487 		if (!(entry->protection & VM_PROT_WRITE)) {
10488 			vm_map_unlock_read(dst_map);
10489 			return KERN_PROTECTION_FAILURE;
10490 		}
10491 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10492 			vm_map_unlock_read(dst_map);
10493 			return KERN_PROTECTION_FAILURE;
10494 		}
10495 
10496 		/*
10497 		 *	If the entry is in transition, we must wait
10498 		 *	for it to exit that state.  Anything could happen
10499 		 *	when we unlock the map, so start over.
10500 		 */
10501 		if (entry->in_transition) {
10502 			/*
10503 			 * Say that we are waiting, and wait for entry.
10504 			 */
10505 			entry->needs_wakeup = TRUE;
10506 			vm_map_entry_wait(dst_map, THREAD_UNINT);
10507 
10508 			goto RetryLookup;
10509 		}
10510 
10511 		dst_offset = start - entry->vme_start;
10512 
10513 		dst_size = entry->vme_end - start;
10514 
10515 		src_size = copy_entry->vme_end -
10516 		    (copy_entry->vme_start + src_offset);
10517 
10518 		if (dst_size < src_size) {
10519 /*
10520  *			we can only copy dst_size bytes before
10521  *			we have to get the next destination entry
10522  */
10523 			copy_size = dst_size;
10524 		} else {
10525 /*
10526  *			we can only copy src_size bytes before
10527  *			we have to get the next source copy entry
10528  */
10529 			copy_size = src_size;
10530 		}
10531 
10532 		if (copy_size > amount_left) {
10533 			copy_size = amount_left;
10534 		}
10535 /*
10536  *		Entry needs copy: create a shadow object for the
10537  *		copy-on-write region.
10538  */
10539 		if (entry->needs_copy) {
10540 			if (vm_map_lock_read_to_write(dst_map)) {
10541 				vm_map_lock_read(dst_map);
10542 				goto RetryLookup;
10543 			}
10544 			VME_OBJECT_SHADOW(entry,
10545 			    (vm_map_size_t)(entry->vme_end
10546 			    - entry->vme_start),
10547 			    vm_map_always_shadow(dst_map));
10548 			entry->needs_copy = FALSE;
10549 			vm_map_lock_write_to_read(dst_map);
10550 		}
10551 		dst_object = VME_OBJECT(entry);
10552 /*
10553  *		unlike with the virtual (aligned) copy, we're going
10554  *		to fault on it, therefore we need a target object.
10555  */
10556 		if (dst_object == VM_OBJECT_NULL) {
10557 			if (vm_map_lock_read_to_write(dst_map)) {
10558 				vm_map_lock_read(dst_map);
10559 				goto RetryLookup;
10560 			}
10561 			dst_object = vm_object_allocate((vm_map_size_t)
10562 			    entry->vme_end - entry->vme_start);
10563 			VME_OBJECT_SET(entry, dst_object, false, 0);
10564 			VME_OFFSET_SET(entry, 0);
10565 			assert(entry->use_pmap);
10566 			vm_map_lock_write_to_read(dst_map);
10567 		}
10568 /*
10569  *		Take an object reference and unlock map. The "entry" may
10570  *		disappear or change when the map is unlocked.
10571  */
10572 		vm_object_reference(dst_object);
10573 		version.main_timestamp = dst_map->timestamp;
10574 		entry_offset = VME_OFFSET(entry);
10575 		entry_end = entry->vme_end;
10576 		vm_map_unlock_read(dst_map);
10577 /*
10578  *		Copy as much as possible in one pass
10579  */
10580 		kr = vm_fault_copy(
10581 			VME_OBJECT(copy_entry),
10582 			VME_OFFSET(copy_entry) + src_offset,
10583 			&copy_size,
10584 			dst_object,
10585 			entry_offset + dst_offset,
10586 			dst_map,
10587 			&version,
10588 			THREAD_UNINT );
10589 
10590 		start += copy_size;
10591 		src_offset += copy_size;
10592 		amount_left -= copy_size;
10593 /*
10594  *		Release the object reference
10595  */
10596 		vm_object_deallocate(dst_object);
10597 /*
10598  *		If a hard error occurred, return it now
10599  */
10600 		if (kr != KERN_SUCCESS) {
10601 			return kr;
10602 		}
10603 
10604 		if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
10605 		    || amount_left == 0) {
10606 /*
10607  *			all done with this copy entry, dispose.
10608  */
10609 			copy_entry_next = copy_entry->vme_next;
10610 
10611 			if (discard_on_success) {
10612 				vm_map_copy_entry_unlink(copy, copy_entry);
10613 				assert(!copy_entry->is_sub_map);
10614 				vm_object_deallocate(VME_OBJECT(copy_entry));
10615 				vm_map_copy_entry_dispose(copy_entry);
10616 			}
10617 
10618 			if (copy_entry_next == vm_map_copy_to_entry(copy) &&
10619 			    amount_left) {
10620 /*
10621  *				not finished copying but ran out of source
10622  */
10623 				return KERN_INVALID_ADDRESS;
10624 			}
10625 
10626 			copy_entry = copy_entry_next;
10627 
10628 			src_offset = 0;
10629 		}
10630 
10631 		if (amount_left == 0) {
10632 			return KERN_SUCCESS;
10633 		}
10634 
10635 		vm_map_lock_read(dst_map);
10636 		if (version.main_timestamp == dst_map->timestamp) {
10637 			if (start == entry_end) {
10638 /*
10639  *				destination region is split.  Use the version
10640  *				information to avoid a lookup in the normal
10641  *				case.
10642  */
10643 				entry = entry->vme_next;
10644 /*
10645  *				should be contiguous. Fail if we encounter
10646  *				a hole in the destination.
10647  */
10648 				if (start != entry->vme_start) {
10649 					vm_map_unlock_read(dst_map);
10650 					return KERN_INVALID_ADDRESS;
10651 				}
10652 			}
10653 		} else {
10654 /*
10655  *			Map version check failed.
10656  *			We must look up the entry because somebody
10657  *			might have changed the map behind our backs.
10658  */
10659 RetryLookup:
10660 			if (!vm_map_lookup_entry(dst_map, start, &entry)) {
10661 				vm_map_unlock_read(dst_map);
10662 				return KERN_INVALID_ADDRESS;
10663 			}
10664 		}
10665 	}/* while */
10666 
10667 	return KERN_SUCCESS;
10668 }/* vm_map_copy_overwrite_unaligned */
10669 
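/*
 * Editorial sketch (not part of the original source): the per-pass sizing
 * rule used by vm_map_copy_overwrite_unaligned() above.  Each pass copies at
 * most up to the end of the current destination entry, the end of the current
 * source copy entry, and the amount still outstanding.  The name is
 * hypothetical.
 */
#if 0   /* illustrative only, never compiled */
static vm_map_size_t
unaligned_pass_size_sketch(
	vm_map_size_t dst_size,         /* bytes left in the destination entry */
	vm_map_size_t src_size,         /* bytes left in the source copy entry */
	vm_map_size_t amount_left)      /* bytes left in the whole request */
{
	vm_map_size_t copy_size = (dst_size < src_size) ? dst_size : src_size;

	if (copy_size > amount_left) {
		copy_size = amount_left;
	}
	return copy_size;
}
#endif
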
10670 /*
10671  *	Routine: vm_map_copy_overwrite_aligned	[internal use only]
10672  *
10673  *	Description:
10674  *	Does all the vm_trickery possible for whole pages.
10675  *
10676  *	Implementation:
10677  *
10678  *	If there are no permanent objects in the destination,
10679  *	and the source and destination map entry zones match,
10680  *	and the destination map entry is not shared,
10681  *	then the map entries can be deleted and replaced
10682  *	with those from the copy.  The following code is the
10683  *	basic idea of what to do, but there are lots of annoying
10684  *	little details about getting protection and inheritance
10685  *	right.  Should add protection, inheritance, and sharing checks
10686  *	to the above pass and make sure that no wiring is involved.
10687  *
10688  *	Callers of this function must call vm_map_copy_require on
10689  *	previously created vm_map_copy_t or pass a newly created
10690  *	one to ensure that it hasn't been forged.
10691  */
10692 
10693 int vm_map_copy_overwrite_aligned_src_not_internal = 0;
10694 int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
10695 int vm_map_copy_overwrite_aligned_src_large = 0;
10696 
10697 static kern_return_t
10698 vm_map_copy_overwrite_aligned(
10699 	vm_map_t        dst_map,
10700 	vm_map_entry_t  tmp_entry,
10701 	vm_map_copy_t   copy,
10702 	vm_map_offset_t start,
10703 	__unused pmap_t pmap)
10704 {
10705 	vm_object_t     object;
10706 	vm_map_entry_t  copy_entry;
10707 	vm_map_size_t   copy_size;
10708 	vm_map_size_t   size;
10709 	vm_map_entry_t  entry;
10710 
10711 	while ((copy_entry = vm_map_copy_first_entry(copy))
10712 	    != vm_map_copy_to_entry(copy)) {
10713 		copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10714 
10715 		entry = tmp_entry;
10716 		if (entry->is_sub_map) {
10717 			/* unnested when clipped earlier */
10718 			assert(!entry->use_pmap);
10719 		}
10720 		if (entry == vm_map_to_entry(dst_map)) {
10721 			vm_map_unlock(dst_map);
10722 			return KERN_INVALID_ADDRESS;
10723 		}
10724 		size = (entry->vme_end - entry->vme_start);
10725 		/*
10726 		 *	Make sure that no holes popped up in the
10727 		 *	address map, and that the protection is
10728 		 *	still valid, in case the map was unlocked
10729 		 *	earlier.
10730 		 */
10731 
10732 		if ((entry->vme_start != start) || ((entry->is_sub_map)
10733 		    && !entry->needs_copy)) {
10734 			vm_map_unlock(dst_map);
10735 			return KERN_INVALID_ADDRESS;
10736 		}
10737 		assert(entry != vm_map_to_entry(dst_map));
10738 
10739 		/*
10740 		 *	Check protection again
10741 		 */
10742 
10743 		if (!(entry->protection & VM_PROT_WRITE)) {
10744 			vm_map_unlock(dst_map);
10745 			return KERN_PROTECTION_FAILURE;
10746 		}
10747 
10748 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10749 			vm_map_unlock(dst_map);
10750 			return KERN_PROTECTION_FAILURE;
10751 		}
10752 
10753 		/*
10754 		 *	If the entry is in transition, we must wait
10755 		 *	for it to exit that state.  Anything could happen
10756 		 *	when we unlock the map, so start over.
10757 		 */
10758 		if (entry->in_transition) {
10759 			/*
10760 			 * Say that we are waiting, and wait for entry.
10761 			 */
10762 			entry->needs_wakeup = TRUE;
10763 			vm_map_entry_wait(dst_map, THREAD_UNINT);
10764 
10765 			goto RetryLookup;
10766 		}
10767 
10768 		/*
10769 		 *	Adjust to source size first
10770 		 */
10771 
10772 		if (copy_size < size) {
10773 			if (entry->map_aligned &&
10774 			    !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10775 			    VM_MAP_PAGE_MASK(dst_map))) {
10776 				/* no longer map-aligned */
10777 				entry->map_aligned = FALSE;
10778 			}
10779 			vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10780 			size = copy_size;
10781 		}
10782 
10783 		/*
10784 		 *	Adjust to destination size
10785 		 */
10786 
10787 		if (size < copy_size) {
10788 			vm_map_copy_clip_end(copy, copy_entry,
10789 			    copy_entry->vme_start + size);
10790 			copy_size = size;
10791 		}
10792 
10793 		assert((entry->vme_end - entry->vme_start) == size);
10794 		assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10795 		assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10796 
10797 		/*
10798 		 *	If the destination contains temporary unshared memory,
10799 		 *	we can perform the copy by throwing it away and
10800 		 *	installing the source data.
10801 		 *
10802 		 *	Exceptions for mappings with special semantics:
10803 		 *	+ "permanent" entries,
10804 		 *	+ JIT regions,
10805 		 *	+ TPRO regions,
10806 		 *      + pmap-specific protection policies,
10807 		 *	+ VM objects with COPY_NONE copy strategy.
10808 		 */
10809 
10810 		object = VME_OBJECT(entry);
10811 		if ((!entry->is_shared &&
10812 		    !entry->vme_permanent &&
10813 		    !entry->used_for_jit &&
10814 #if __arm64e__
10815 		    !entry->used_for_tpro &&
10816 #endif /* __arm64e__ */
10817 		    !(entry->protection & VM_PROT_EXECUTE) &&
10818 		    !pmap_has_prot_policy(dst_map->pmap, entry->translated_allow_execute, entry->protection) &&
10819 		    ((object == VM_OBJECT_NULL) ||
10820 		    (object->internal &&
10821 		    !object->true_share &&
10822 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE))) ||
10823 		    entry->needs_copy) {
10824 			vm_object_t     old_object = VME_OBJECT(entry);
10825 			vm_object_offset_t      old_offset = VME_OFFSET(entry);
10826 			vm_object_offset_t      offset;
10827 
10828 			/*
10829 			 * Ensure that the source and destination aren't
10830 			 * identical
10831 			 */
10832 			if (old_object == VME_OBJECT(copy_entry) &&
10833 			    old_offset == VME_OFFSET(copy_entry)) {
10834 				vm_map_copy_entry_unlink(copy, copy_entry);
10835 				vm_map_copy_entry_dispose(copy_entry);
10836 
10837 				if (old_object != VM_OBJECT_NULL) {
10838 					vm_object_deallocate(old_object);
10839 				}
10840 
10841 				start = tmp_entry->vme_end;
10842 				tmp_entry = tmp_entry->vme_next;
10843 				continue;
10844 			}
10845 
10846 #if XNU_TARGET_OS_OSX
10847 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10848 #define __TRADEOFF1_COPY_SIZE (128 * 1024)      /* 128 KB */
10849 			if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10850 			    VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10851 			    copy_size <= __TRADEOFF1_COPY_SIZE) {
10852 				/*
10853 				 * Virtual vs. Physical copy tradeoff #1.
10854 				 *
10855 				 * Copying only a few pages out of a large
10856 				 * object:  do a physical copy instead of
10857 				 * a virtual copy, to avoid possibly keeping
10858 				 * the entire large object alive because of
10859 				 * those few copy-on-write pages.
10860 				 */
10861 				vm_map_copy_overwrite_aligned_src_large++;
10862 				goto slow_copy;
10863 			}
10864 #endif /* XNU_TARGET_OS_OSX */
10865 
10866 			if ((dst_map->pmap != kernel_pmap) &&
10867 			    (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10868 			    (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10869 				vm_object_t new_object, new_shadow;
10870 
10871 				/*
10872 				 * We're about to map something over a mapping
10873 				 * established by malloc()...
10874 				 */
10875 				new_object = VME_OBJECT(copy_entry);
10876 				if (new_object != VM_OBJECT_NULL) {
10877 					vm_object_lock_shared(new_object);
10878 				}
10879 				while (new_object != VM_OBJECT_NULL &&
10880 #if XNU_TARGET_OS_OSX
10881 				    !new_object->true_share &&
10882 				    new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10883 #endif /* XNU_TARGET_OS_OSX */
10884 				    new_object->internal) {
10885 					new_shadow = new_object->shadow;
10886 					if (new_shadow == VM_OBJECT_NULL) {
10887 						break;
10888 					}
10889 					vm_object_lock_shared(new_shadow);
10890 					vm_object_unlock(new_object);
10891 					new_object = new_shadow;
10892 				}
10893 				if (new_object != VM_OBJECT_NULL) {
10894 					if (!new_object->internal) {
10895 						/*
10896 						 * The new mapping is backed
10897 						 * by an external object.  We
10898 						 * don't want malloc'ed memory
10899 						 * to be replaced with such a
10900 						 * non-anonymous mapping, so
10901 						 * let's go off the optimized
10902 						 * path...
10903 						 */
10904 						vm_map_copy_overwrite_aligned_src_not_internal++;
10905 						vm_object_unlock(new_object);
10906 						goto slow_copy;
10907 					}
10908 #if XNU_TARGET_OS_OSX
10909 					if (new_object->true_share ||
10910 					    new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10911 						/*
10912 						 * Same if there's a "true_share"
10913 						 * object in the shadow chain, or
10914 						 * an object with a non-default
10915 						 * (SYMMETRIC) copy strategy.
10916 						 */
10917 						vm_map_copy_overwrite_aligned_src_not_symmetric++;
10918 						vm_object_unlock(new_object);
10919 						goto slow_copy;
10920 					}
10921 #endif /* XNU_TARGET_OS_OSX */
10922 					vm_object_unlock(new_object);
10923 				}
10924 				/*
10925 				 * The new mapping is still backed by
10926 				 * anonymous (internal) memory, so it's
10927 				 * OK to substitute it for the original
10928 				 * malloc() mapping.
10929 				 */
10930 			}
10931 
10932 			if (old_object != VM_OBJECT_NULL) {
10933 				assert(!entry->vme_permanent);
10934 				if (entry->is_sub_map) {
10935 					if (entry->use_pmap) {
10936 #ifndef NO_NESTED_PMAP
10937 						pmap_unnest(dst_map->pmap,
10938 						    (addr64_t)entry->vme_start,
10939 						    entry->vme_end - entry->vme_start);
10940 #endif  /* NO_NESTED_PMAP */
10941 						if (dst_map->mapped_in_other_pmaps) {
10942 							/* clean up parent */
10943 							/* map/maps */
10944 							vm_map_submap_pmap_clean(
10945 								dst_map, entry->vme_start,
10946 								entry->vme_end,
10947 								VME_SUBMAP(entry),
10948 								VME_OFFSET(entry));
10949 						}
10950 					} else {
10951 						vm_map_submap_pmap_clean(
10952 							dst_map, entry->vme_start,
10953 							entry->vme_end,
10954 							VME_SUBMAP(entry),
10955 							VME_OFFSET(entry));
10956 					}
10957 					vm_map_deallocate(VME_SUBMAP(entry));
10958 				} else {
10959 					if (dst_map->mapped_in_other_pmaps) {
10960 						vm_object_pmap_protect_options(
10961 							VME_OBJECT(entry),
10962 							VME_OFFSET(entry),
10963 							entry->vme_end
10964 							- entry->vme_start,
10965 							PMAP_NULL,
10966 							PAGE_SIZE,
10967 							entry->vme_start,
10968 							VM_PROT_NONE,
10969 							PMAP_OPTIONS_REMOVE);
10970 					} else {
10971 						pmap_remove_options(
10972 							dst_map->pmap,
10973 							(addr64_t)(entry->vme_start),
10974 							(addr64_t)(entry->vme_end),
10975 							PMAP_OPTIONS_REMOVE);
10976 					}
10977 					vm_object_deallocate(old_object);
10978 				}
10979 			}
10980 
10981 			if (entry->iokit_acct) {
10982 				/* keep using iokit accounting */
10983 				entry->use_pmap = FALSE;
10984 			} else {
10985 				/* use pmap accounting */
10986 				entry->use_pmap = TRUE;
10987 			}
10988 			assert(!entry->vme_permanent);
10989 			VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
10990 			object = VME_OBJECT(entry);
10991 			entry->needs_copy = copy_entry->needs_copy;
10992 			entry->wired_count = 0;
10993 			entry->user_wired_count = 0;
10994 			offset = VME_OFFSET(copy_entry);
10995 			VME_OFFSET_SET(entry, offset);
10996 
10997 			vm_map_copy_entry_unlink(copy, copy_entry);
10998 			vm_map_copy_entry_dispose(copy_entry);
10999 
11000 			/*
11001 			 * We could try to push pages into the pmap at this point, BUT
11002 			 * this optimization only saved on average 2 us per page if ALL
11003 			 * the pages in the source were currently mapped
11004 			 * and ALL the pages in the dest were touched; if fewer than
11005 			 * 2/3 of the pages were touched, this optimization actually cost more cycles.
11006 			 * It also puts a lot of pressure on the pmap layer w.r.t. mapping structures.
11007 			 */
11008 
11009 			/*
11010 			 *	Set up for the next iteration.  The map
11011 			 *	has not been unlocked, so the next
11012 			 *	address should be at the end of this
11013 			 *	entry, and the next map entry should be
11014 			 *	the one following it.
11015 			 */
11016 
11017 			start = tmp_entry->vme_end;
11018 			tmp_entry = tmp_entry->vme_next;
11019 		} else {
11020 			vm_map_version_t        version;
11021 			vm_object_t             dst_object;
11022 			vm_object_offset_t      dst_offset;
11023 			kern_return_t           r;
11024 
11025 slow_copy:
11026 			if (entry->needs_copy) {
11027 				VME_OBJECT_SHADOW(entry,
11028 				    (entry->vme_end -
11029 				    entry->vme_start),
11030 				    vm_map_always_shadow(dst_map));
11031 				entry->needs_copy = FALSE;
11032 			}
11033 
11034 			dst_object = VME_OBJECT(entry);
11035 			dst_offset = VME_OFFSET(entry);
11036 
11037 			/*
11038 			 *	Take an object reference, and record
11039 			 *	the map version information so that the
11040 			 *	map can be safely unlocked.
11041 			 */
11042 
11043 			if (dst_object == VM_OBJECT_NULL) {
11044 				/*
11045 				 * We would usually have just taken the
11046 				 * optimized path above if the destination
11047 				 * object has not been allocated yet.  But we
11048 				 * now disable that optimization if the copy
11049 				 * entry's object is not backed by anonymous
11050 				 * memory to avoid replacing malloc'ed
11051 				 * (i.e. re-usable) anonymous memory with a
11052 				 * not-so-anonymous mapping.
11053 				 * So we have to handle this case here and
11054 				 * allocate a new VM object for this map entry.
11055 				 */
11056 				dst_object = vm_object_allocate(
11057 					entry->vme_end - entry->vme_start);
11058 				dst_offset = 0;
11059 				VME_OBJECT_SET(entry, dst_object, false, 0);
11060 				VME_OFFSET_SET(entry, dst_offset);
11061 				assert(entry->use_pmap);
11062 			}
11063 
11064 			vm_object_reference(dst_object);
11065 
11066 			/* account for unlock bumping up timestamp */
11067 			version.main_timestamp = dst_map->timestamp + 1;
11068 
11069 			vm_map_unlock(dst_map);
11070 
11071 			/*
11072 			 *	Copy as much as possible in one pass
11073 			 */
11074 
11075 			copy_size = size;
11076 			r = vm_fault_copy(
11077 				VME_OBJECT(copy_entry),
11078 				VME_OFFSET(copy_entry),
11079 				&copy_size,
11080 				dst_object,
11081 				dst_offset,
11082 				dst_map,
11083 				&version,
11084 				THREAD_UNINT );
11085 
11086 			/*
11087 			 *	Release the object reference
11088 			 */
11089 
11090 			vm_object_deallocate(dst_object);
11091 
11092 			/*
11093 			 *	If a hard error occurred, return it now
11094 			 */
11095 
11096 			if (r != KERN_SUCCESS) {
11097 				return r;
11098 			}
11099 
11100 			if (copy_size != 0) {
11101 				/*
11102 				 *	Dispose of the copied region
11103 				 */
11104 
11105 				vm_map_copy_clip_end(copy, copy_entry,
11106 				    copy_entry->vme_start + copy_size);
11107 				vm_map_copy_entry_unlink(copy, copy_entry);
11108 				vm_object_deallocate(VME_OBJECT(copy_entry));
11109 				vm_map_copy_entry_dispose(copy_entry);
11110 			}
11111 
11112 			/*
11113 			 *	Pick up in the destination map where we left off.
11114 			 *
11115 			 *	Use the version information to avoid a lookup
11116 			 *	in the normal case.
11117 			 */
11118 
11119 			start += copy_size;
11120 			vm_map_lock(dst_map);
11121 			if (version.main_timestamp == dst_map->timestamp &&
11122 			    copy_size != 0) {
11123 				/* We can safely use saved tmp_entry value */
11124 
11125 				if (tmp_entry->map_aligned &&
11126 				    !VM_MAP_PAGE_ALIGNED(
11127 					    start,
11128 					    VM_MAP_PAGE_MASK(dst_map))) {
11129 					/* no longer map-aligned */
11130 					tmp_entry->map_aligned = FALSE;
11131 				}
11132 				vm_map_clip_end(dst_map, tmp_entry, start);
11133 				tmp_entry = tmp_entry->vme_next;
11134 			} else {
11135 				/* Must do lookup of tmp_entry */
11136 
11137 RetryLookup:
11138 				if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
11139 					vm_map_unlock(dst_map);
11140 					return KERN_INVALID_ADDRESS;
11141 				}
11142 				if (tmp_entry->map_aligned &&
11143 				    !VM_MAP_PAGE_ALIGNED(
11144 					    start,
11145 					    VM_MAP_PAGE_MASK(dst_map))) {
11146 					/* no longer map-aligned */
11147 					tmp_entry->map_aligned = FALSE;
11148 				}
11149 				vm_map_clip_start(dst_map, tmp_entry, start);
11150 			}
11151 		}
11152 	}/* while */
11153 
11154 	return KERN_SUCCESS;
11155 }/* vm_map_copy_overwrite_aligned */
11156 
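/*
 * Editorial sketch (not part of the original source): the virtual-vs-physical
 * tradeoff #1 test applied by vm_map_copy_overwrite_aligned() above on
 * XNU_TARGET_OS_OSX.  Copying only a small range (<= 128KB) out of a very
 * large source object (>= 64MB) prefers the slow physical copy so that the
 * large object is not kept alive just for a few copy-on-write pages.  The
 * name is hypothetical.
 */
#if 0   /* illustrative only, never compiled */
static bool
overwrite_aligned_prefers_physical_copy_sketch(
	vm_object_size_t src_object_size,
	vm_map_size_t    copy_size)
{
	return src_object_size >= (64ULL * 1024 * 1024) &&      /* __TRADEOFF1_OBJ_SIZE */
	       copy_size <= (128 * 1024);                       /* __TRADEOFF1_COPY_SIZE */
}
#endif
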
11157 /*
11158  *	Routine: vm_map_copyin_kernel_buffer [internal use only]
11159  *
11160  *	Description:
11161  *		Copy in data to a kernel buffer from space in the
11162  *		source map. The original space may be optionally
11163  *		deallocated.
11164  *
11165  *		If successful, returns a new copy object.
11166  */
11167 static kern_return_t
11168 vm_map_copyin_kernel_buffer(
11169 	vm_map_t        src_map,
11170 	vm_map_offset_t src_addr,
11171 	vm_map_size_t   len,
11172 	boolean_t       src_destroy,
11173 	vm_map_copy_t   *copy_result)
11174 {
11175 	kern_return_t kr;
11176 	vm_map_copy_t copy;
11177 	void *kdata;
11178 
11179 	if (len > msg_ool_size_small) {
11180 		return KERN_INVALID_ARGUMENT;
11181 	}
11182 
11183 	kdata = kalloc_data(len, Z_WAITOK);
11184 	if (kdata == NULL) {
11185 		return KERN_RESOURCE_SHORTAGE;
11186 	}
11187 	kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
11188 	if (kr != KERN_SUCCESS) {
11189 		kfree_data(kdata, len);
11190 		return kr;
11191 	}
11192 
11193 	copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
11194 	copy->cpy_kdata = kdata;
11195 	copy->size = len;
11196 	copy->offset = 0;
11197 
11198 	if (src_destroy) {
11199 		vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
11200 
11201 		if (src_map == kernel_map) {
11202 			flags |= VM_MAP_REMOVE_KUNWIRE;
11203 		}
11204 
11205 		(void)vm_map_remove_guard(src_map,
11206 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11207 		    vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
11208 		    flags, KMEM_GUARD_NONE);
11209 	}
11210 
11211 	*copy_result = copy;
11212 	return KERN_SUCCESS;
11213 }
11214 
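/*
 * Editorial sketch (not part of the original source): the shape of the copy
 * object produced by vm_map_copyin_kernel_buffer() above.  The data lives in
 * a plain kalloc_data() buffer rather than in VM map entries, and the source
 * range may already have been removed.  The name is hypothetical.
 */
#if 0   /* illustrative only, never compiled */
static void
kernel_buffer_copy_shape_sketch(vm_map_copy_t copy, vm_map_size_t len)
{
	assert(copy->type == VM_MAP_COPY_KERNEL_BUFFER);
	assert(copy->size == len);              /* at most msg_ool_size_small */
	assert(copy->offset == 0);
	assert(copy->cpy_kdata != NULL);        /* heap buffer holding the data */
}
#endif
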
11215 /*
11216  *	Routine: vm_map_copyout_kernel_buffer	[internal use only]
11217  *
11218  *	Description:
11219  *		Copy out data from a kernel buffer into space in the
11220  *		destination map. The space may be optionally dynamically
11221  *		allocated.
11222  *
11223  *		If successful, consumes the copy object.
11224  *		Otherwise, the caller is responsible for it.
11225  *
11226  *		Callers of this function must call vm_map_copy_require on
11227  *		previously created vm_map_copy_t or pass a newly created
11228  *		one to ensure that it hasn't been forged.
11229  */
11230 static int vm_map_copyout_kernel_buffer_failures = 0;
11231 static kern_return_t
11232 vm_map_copyout_kernel_buffer(
11233 	vm_map_t                map,
11234 	vm_map_address_t        *addr,  /* IN/OUT */
11235 	vm_map_copy_t           copy,
11236 	vm_map_size_t           copy_size,
11237 	boolean_t               overwrite,
11238 	boolean_t               consume_on_success)
11239 {
11240 	kern_return_t kr = KERN_SUCCESS;
11241 	thread_t thread = current_thread();
11242 
11243 	assert(copy->size == copy_size);
11244 
11245 	/*
11246 	 * check for corrupted vm_map_copy structure
11247 	 */
11248 	if (copy_size > msg_ool_size_small || copy->offset) {
11249 		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
11250 		    (long long)copy->size, (long long)copy->offset);
11251 	}
11252 
11253 	if (!overwrite) {
11254 		/*
11255 		 * Allocate space in the target map for the data
11256 		 */
11257 		vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11258 
11259 		if (map == kernel_map) {
11260 			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
11261 		}
11262 
11263 		*addr = 0;
11264 		kr = vm_map_enter(map,
11265 		    addr,
11266 		    vm_map_round_page(copy_size,
11267 		    VM_MAP_PAGE_MASK(map)),
11268 		    (vm_map_offset_t) 0,
11269 		    vmk_flags,
11270 		    VM_OBJECT_NULL,
11271 		    (vm_object_offset_t) 0,
11272 		    FALSE,
11273 		    VM_PROT_DEFAULT,
11274 		    VM_PROT_ALL,
11275 		    VM_INHERIT_DEFAULT);
11276 		if (kr != KERN_SUCCESS) {
11277 			return kr;
11278 		}
11279 #if KASAN
11280 		if (map->pmap == kernel_pmap) {
11281 			kasan_notify_address(*addr, copy->size);
11282 		}
11283 #endif
11284 	}
11285 
11286 	/*
11287 	 * Copyout the data from the kernel buffer to the target map.
11288 	 */
11289 	if (thread->map == map) {
11290 		/*
11291 		 * If the target map is the current map, just do
11292 		 * the copy.
11293 		 */
11294 		assert((vm_size_t)copy_size == copy_size);
11295 		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11296 			kr = KERN_INVALID_ADDRESS;
11297 		}
11298 	} else {
11299 		vm_map_t oldmap;
11300 
11301 		/*
11302 		 * If the target map is another map, assume the
11303 		 * target's address space identity for the duration
11304 		 * of the copy.
11305 		 */
11306 		vm_map_reference(map);
11307 		oldmap = vm_map_switch(map);
11308 
11309 		assert((vm_size_t)copy_size == copy_size);
11310 		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11311 			vm_map_copyout_kernel_buffer_failures++;
11312 			kr = KERN_INVALID_ADDRESS;
11313 		}
11314 
11315 		(void) vm_map_switch(oldmap);
11316 		vm_map_deallocate(map);
11317 	}
11318 
11319 	if (kr != KERN_SUCCESS) {
11320 		/* the copy failed, clean up */
11321 		if (!overwrite) {
11322 			/*
11323 			 * Deallocate the space we allocated in the target map.
11324 			 */
11325 			(void) vm_map_remove(map,
11326 			    vm_map_trunc_page(*addr,
11327 			    VM_MAP_PAGE_MASK(map)),
11328 			    vm_map_round_page((*addr +
11329 			    vm_map_round_page(copy_size,
11330 			    VM_MAP_PAGE_MASK(map))),
11331 			    VM_MAP_PAGE_MASK(map)));
11332 			*addr = 0;
11333 		}
11334 	} else {
11335 		/* copy was successful, discard the copy structure */
11336 		if (consume_on_success) {
11337 			kfree_data(copy->cpy_kdata, copy_size);
11338 			zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11339 		}
11340 	}
11341 
11342 	return kr;
11343 }
11344 
11345 /*
11346  *	Routine:	vm_map_copy_insert      [internal use only]
11347  *
11348  *	Description:
11349  *		Link a copy chain ("copy") into a map at the
11350  *		specified location (after "where").
11351  *
11352  *		Callers of this function must call vm_map_copy_require on
11353  *		previously created vm_map_copy_t or pass a newly created
11354  *		one to ensure that it hasn't been forged.
11355  *	Side effects:
11356  *		The copy chain is destroyed.
11357  */
11358 static void
11359 vm_map_copy_insert(
11360 	vm_map_t        map,
11361 	vm_map_entry_t  after_where,
11362 	vm_map_copy_t   copy)
11363 {
11364 	vm_map_entry_t  entry;
11365 
11366 	while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11367 		entry = vm_map_copy_first_entry(copy);
11368 		vm_map_copy_entry_unlink(copy, entry);
11369 		vm_map_store_entry_link(map, after_where, entry,
11370 		    VM_MAP_KERNEL_FLAGS_NONE);
11371 		after_where = entry;
11372 	}
11373 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11374 }
11375 
11376 /*
11377  * Callers of this function must call vm_map_copy_require on
11378  * previously created vm_map_copy_t or pass a newly created
11379  * one to ensure that it hasn't been forged.
11380  */
11381 void
11382 vm_map_copy_remap(
11383 	vm_map_t        map,
11384 	vm_map_entry_t  where,
11385 	vm_map_copy_t   copy,
11386 	vm_map_offset_t adjustment,
11387 	vm_prot_t       cur_prot,
11388 	vm_prot_t       max_prot,
11389 	vm_inherit_t    inheritance)
11390 {
11391 	vm_map_entry_t  copy_entry, new_entry;
11392 
11393 	for (copy_entry = vm_map_copy_first_entry(copy);
11394 	    copy_entry != vm_map_copy_to_entry(copy);
11395 	    copy_entry = copy_entry->vme_next) {
11396 		/* get a new VM map entry for the map */
11397 		new_entry = vm_map_entry_create(map);
11398 		/* copy the "copy entry" to the new entry */
11399 		vm_map_entry_copy(map, new_entry, copy_entry);
11400 		/* adjust "start" and "end" */
11401 		new_entry->vme_start += adjustment;
11402 		new_entry->vme_end += adjustment;
11403 		/* clear some attributes */
11404 		new_entry->inheritance = inheritance;
11405 		new_entry->protection = cur_prot;
11406 		new_entry->max_protection = max_prot;
11407 		new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11408 		/* take an extra reference on the entry's "object" */
11409 		if (new_entry->is_sub_map) {
11410 			assert(!new_entry->use_pmap); /* not nested */
11411 			vm_map_reference(VME_SUBMAP(new_entry));
11412 		} else {
11413 			vm_object_reference(VME_OBJECT(new_entry));
11414 		}
11415 		/* insert the new entry in the map */
11416 		vm_map_store_entry_link(map, where, new_entry,
11417 		    VM_MAP_KERNEL_FLAGS_NONE);
11418 		/* continue inserting the "copy entries" after the new entry */
11419 		where = new_entry;
11420 	}
11421 }
11422 
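/*
 * Editorial sketch (not part of the original source): how the two linkers
 * above differ.  vm_map_copy_insert() moves the entries out of the copy and
 * frees the copy header (the copy is consumed); vm_map_copy_remap() clones
 * each entry, takes an extra object/submap reference, and leaves the copy
 * intact.  vm_map_copyout_internal() chooses between them based on
 * consume_on_success.  The name below is hypothetical.
 */
#if 0   /* illustrative only, never compiled */
static void
link_copy_into_map_sketch(
	vm_map_t        map,
	vm_map_entry_t  after_where,
	vm_map_copy_t   copy,
	vm_map_offset_t adjustment,
	boolean_t       consume_on_success)
{
	if (consume_on_success) {
		vm_map_copy_insert(map, after_where, copy);     /* "copy" is gone */
	} else {
		vm_map_copy_remap(map, after_where, copy, adjustment,
		    VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
		/* caller still owns "copy" */
	}
}
#endif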
11423 
11424 /*
11425  * Returns true if *size matches (or is in the range of) copy->size.
11426  * Upon returning true, the *size field is updated with the actual size of the
11427  * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11428  */
11429 boolean_t
11430 vm_map_copy_validate_size(
11431 	vm_map_t                dst_map,
11432 	vm_map_copy_t           copy,
11433 	vm_map_size_t           *size)
11434 {
11435 	if (copy == VM_MAP_COPY_NULL) {
11436 		return FALSE;
11437 	}
11438 
11439 	/*
11440 	 * Assert that the vm_map_copy is coming from the right
11441 	 * zone and hasn't been forged
11442 	 */
11443 	vm_map_copy_require(copy);
11444 
11445 	vm_map_size_t copy_sz = copy->size;
11446 	vm_map_size_t sz = *size;
11447 	switch (copy->type) {
11448 	case VM_MAP_COPY_KERNEL_BUFFER:
11449 		if (sz == copy_sz) {
11450 			return TRUE;
11451 		}
11452 		break;
11453 	case VM_MAP_COPY_ENTRY_LIST:
11454 		/*
11455 		 * potential page-size rounding prevents us from exactly
11456 		 * validating this flavor of vm_map_copy, but we can at least
11457 		 * assert that it's within a range.
11458 		 */
11459 		if (copy_sz >= sz &&
11460 		    copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11461 			*size = copy_sz;
11462 			return TRUE;
11463 		}
11464 		break;
11465 	default:
11466 		break;
11467 	}
11468 	return FALSE;
11469 }
11470 
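/*
 * Editorial sketch (not part of the original source): the intended pairing of
 * vm_map_copy_validate_size() with vm_map_copyout_size().  The caller checks
 * its expected size against the copy object (which may round it up for entry
 * lists) and then uses the validated size for the copyout.  Names are
 * hypothetical.
 */
#if 0   /* illustrative only, never compiled */
static kern_return_t
copyout_with_validated_size_sketch(
	vm_map_t         dst_map,
	vm_map_copy_t    copy,
	vm_map_size_t    expected_size,
	vm_map_address_t *dst_addr)
{
	vm_map_size_t size = expected_size;

	if (!vm_map_copy_validate_size(dst_map, copy, &size)) {
		return KERN_INVALID_ARGUMENT;   /* size mismatch or forged copy */
	}
	/* "size" now holds the copy object's actual size */
	return vm_map_copyout_size(dst_map, dst_addr, copy, size);
}
#endif
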
11471 /*
11472  *	Routine:	vm_map_copyout_size
11473  *
11474  *	Description:
11475  *		Copy out a copy chain ("copy") into newly-allocated
11476  *		space in the destination map. Uses a prevalidated
11477  *		size for the copy object (vm_map_copy_validate_size).
11478  *
11479  *		If successful, consumes the copy object.
11480  *		Otherwise, the caller is responsible for it.
11481  */
11482 kern_return_t
11483 vm_map_copyout_size(
11484 	vm_map_t                dst_map,
11485 	vm_map_address_t        *dst_addr,      /* OUT */
11486 	vm_map_copy_t           copy,
11487 	vm_map_size_t           copy_size)
11488 {
11489 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11490 	           TRUE,                     /* consume_on_success */
11491 	           VM_PROT_DEFAULT,
11492 	           VM_PROT_ALL,
11493 	           VM_INHERIT_DEFAULT);
11494 }
11495 
11496 /*
11497  *	Routine:	vm_map_copyout
11498  *
11499  *	Description:
11500  *		Copy out a copy chain ("copy") into newly-allocated
11501  *		space in the destination map.
11502  *
11503  *		If successful, consumes the copy object.
11504  *		Otherwise, the caller is responsible for it.
11505  */
11506 kern_return_t
11507 vm_map_copyout(
11508 	vm_map_t                dst_map,
11509 	vm_map_address_t        *dst_addr,      /* OUT */
11510 	vm_map_copy_t           copy)
11511 {
11512 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11513 	           TRUE,                     /* consume_on_success */
11514 	           VM_PROT_DEFAULT,
11515 	           VM_PROT_ALL,
11516 	           VM_INHERIT_DEFAULT);
11517 }
11518 
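/*
 * Editorial sketch (not part of the original source): the classic pairing of
 * vm_map_copyin() with vm_map_copyout() to move a range between two maps.
 * On success the copy object is consumed by the copyout; on failure the
 * caller still owns it and must discard it.  Names are hypothetical.
 */
#if 0   /* illustrative only, never compiled */
static kern_return_t
move_range_between_maps_sketch(
	vm_map_t         src_map,
	vm_map_address_t src_addr,
	vm_map_size_t    len,
	vm_map_t         dst_map,
	vm_map_address_t *dst_addr)
{
	vm_map_copy_t copy;
	kern_return_t kr;

	kr = vm_map_copyin(src_map, src_addr, len, FALSE /* src_destroy */, &copy);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	kr = vm_map_copyout(dst_map, dst_addr, copy);
	if (kr != KERN_SUCCESS) {
		vm_map_copy_discard(copy);      /* copyout did not consume it */
	}
	return kr;
}
#endif
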
11519 kern_return_t
11520 vm_map_copyout_internal(
11521 	vm_map_t                dst_map,
11522 	vm_map_address_t        *dst_addr,      /* OUT */
11523 	vm_map_copy_t           copy,
11524 	vm_map_size_t           copy_size,
11525 	boolean_t               consume_on_success,
11526 	vm_prot_t               cur_protection,
11527 	vm_prot_t               max_protection,
11528 	vm_inherit_t            inheritance)
11529 {
11530 	vm_map_size_t           size;
11531 	vm_map_size_t           adjustment;
11532 	vm_map_offset_t         start;
11533 	vm_object_offset_t      vm_copy_start;
11534 	vm_map_entry_t          last;
11535 	vm_map_entry_t          entry;
11536 	vm_map_copy_t           original_copy;
11537 	kern_return_t           kr;
11538 	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11539 
11540 	/*
11541 	 *	Check for null copy object.
11542 	 */
11543 
11544 	if (copy == VM_MAP_COPY_NULL) {
11545 		*dst_addr = 0;
11546 		return KERN_SUCCESS;
11547 	}
11548 
11549 	/*
11550 	 * Assert that the vm_map_copy is coming from the right
11551 	 * zone and hasn't been forged
11552 	 */
11553 	vm_map_copy_require(copy);
11554 
11555 	if (copy->size != copy_size) {
11556 		*dst_addr = 0;
11557 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR), KERN_FAILURE /* arg */);
11558 		return KERN_FAILURE;
11559 	}
11560 
11561 	/*
11562 	 *	Check for special kernel buffer allocated
11563 	 *	by new_ipc_kmsg_copyin.
11564 	 */
11565 
11566 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
11567 		kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr,
11568 		    copy, copy_size, FALSE,
11569 		    consume_on_success);
11570 		if (kr) {
11571 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */);
11572 		}
11573 		return kr;
11574 	}
11575 
11576 	original_copy = copy;
11577 	if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
11578 		vm_map_copy_t target_copy;
11579 		vm_map_offset_t overmap_start, overmap_end, trimmed_start;
11580 
11581 		target_copy = VM_MAP_COPY_NULL;
11582 		DEBUG4K_ADJUST("adjusting...\n");
11583 		kr = vm_map_copy_adjust_to_target(
11584 			copy,
11585 			0, /* offset */
11586 			copy->size, /* size */
11587 			dst_map,
11588 			TRUE, /* copy */
11589 			&target_copy,
11590 			&overmap_start,
11591 			&overmap_end,
11592 			&trimmed_start);
11593 		if (kr != KERN_SUCCESS) {
11594 			DEBUG4K_COPY("adjust failed 0x%x\n", kr);
11595 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */);
11596 			return kr;
11597 		}
11598 		DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
11599 		if (target_copy != copy) {
11600 			copy = target_copy;
11601 		}
11602 		copy_size = copy->size;
11603 	}
11604 
11605 	/*
11606 	 *	Find space for the data
11607 	 */
11608 
11609 	vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
11610 	    VM_MAP_COPY_PAGE_MASK(copy));
11611 	size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
11612 	    VM_MAP_COPY_PAGE_MASK(copy))
11613 	    - vm_copy_start;
11614 
11615 	vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map);
11616 
11617 	vm_map_lock(dst_map);
11618 	kr = vm_map_locate_space(dst_map, size, 0, vmk_flags,
11619 	    &start, &last);
11620 	if (kr != KERN_SUCCESS) {
11621 		vm_map_unlock(dst_map);
11622 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */);
11623 		return kr;
11624 	}
11625 
11626 	adjustment = start - vm_copy_start;
11627 	if (!consume_on_success) {
11628 		/*
11629 		 * We're not allowed to consume "copy", so we'll have to
11630 		 * copy its map entries into the destination map below.
11631 		 * No need to re-allocate map entries from the correct
11632 		 * (pageable or not) zone, since we'll get new map entries
11633 		 * during the transfer.
11634 		 * We'll also adjust the map entries' "start" and "end"
11635 		 * during the transfer, to keep "copy"'s entries consistent
11636 		 * with its "offset".
11637 		 */
11638 		goto after_adjustments;
11639 	}
11640 
11641 	/*
11642 	 *	Since we're going to just drop the map
11643 	 *	entries from the copy into the destination
11644 	 *	map, they must come from the same pool.
11645 	 */
11646 
11647 	if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
11648 		/*
11649 		 * Mismatches occur when dealing with the default
11650 		 * pager.
11651 		 */
11652 		vm_map_entry_t  next, new;
11653 
11654 		/*
11655 		 * Find the zone that the copies were allocated from
11656 		 */
11657 
11658 		entry = vm_map_copy_first_entry(copy);
11659 
11660 		/*
11661 		 * Reinitialize the copy so that vm_map_copy_entry_link
11662 		 * will work.
11663 		 */
11664 		vm_map_store_copy_reset(copy, entry);
11665 		copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;
11666 
11667 		/*
11668 		 * Copy each entry.
11669 		 */
11670 		while (entry != vm_map_copy_to_entry(copy)) {
11671 			new = vm_map_copy_entry_create(copy);
11672 			vm_map_entry_copy_full(new, entry);
11673 			new->vme_no_copy_on_read = FALSE;
11674 			assert(!new->iokit_acct);
11675 			if (new->is_sub_map) {
11676 				/* clr address space specifics */
11677 				new->use_pmap = FALSE;
11678 			}
11679 			vm_map_copy_entry_link(copy,
11680 			    vm_map_copy_last_entry(copy),
11681 			    new);
11682 			next = entry->vme_next;
11683 			vm_map_entry_dispose(entry);
11684 			entry = next;
11685 		}
11686 	}
11687 
11688 	/*
11689 	 *	Adjust the addresses in the copy chain, and
11690 	 *	reset the region attributes.
11691 	 */
11692 
11693 	for (entry = vm_map_copy_first_entry(copy);
11694 	    entry != vm_map_copy_to_entry(copy);
11695 	    entry = entry->vme_next) {
11696 		if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
11697 			/*
11698 			 * We're injecting this copy entry into a map that
11699 			 * has the standard page alignment, so clear
11700 			 * "map_aligned" (which might have been inherited
11701 			 * from the original map entry).
11702 			 */
11703 			entry->map_aligned = FALSE;
11704 		}
11705 
11706 		entry->vme_start += adjustment;
11707 		entry->vme_end += adjustment;
11708 
11709 		if (entry->map_aligned) {
11710 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
11711 			    VM_MAP_PAGE_MASK(dst_map)));
11712 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
11713 			    VM_MAP_PAGE_MASK(dst_map)));
11714 		}
11715 
11716 		entry->inheritance = VM_INHERIT_DEFAULT;
11717 		entry->protection = VM_PROT_DEFAULT;
11718 		entry->max_protection = VM_PROT_ALL;
11719 		entry->behavior = VM_BEHAVIOR_DEFAULT;
11720 
11721 		/*
11722 		 * If the entry is now wired,
11723 		 * map the pages into the destination map.
11724 		 */
11725 		if (entry->wired_count != 0) {
11726 			vm_map_offset_t va;
11727 			vm_object_offset_t       offset;
11728 			vm_object_t object;
11729 			vm_prot_t prot;
11730 			int     type_of_fault;
11731 			uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;
11732 
11733 			/* TODO4K would need to use actual page size */
11734 			assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);
11735 
11736 			object = VME_OBJECT(entry);
11737 			offset = VME_OFFSET(entry);
11738 			va = entry->vme_start;
11739 
11740 			pmap_pageable(dst_map->pmap,
11741 			    entry->vme_start,
11742 			    entry->vme_end,
11743 			    TRUE);
11744 
11745 			while (va < entry->vme_end) {
11746 				vm_page_t       m;
11747 				struct vm_object_fault_info fault_info = {};
11748 
11749 				/*
11750 				 * Look up the page in the object.
11751 				 * Assert that the page will be found in the
11752 				 * top object:
11753 				 * either
11754 				 *	the object was newly created by
11755 				 *	vm_object_copy_slowly, and has
11756 				 *	copies of all of the pages from
11757 				 *	the source object
11758 				 * or
11759 				 *	the object was moved from the old
11760 				 *	map entry; because the old map
11761 				 *	entry was wired, all of the pages
11762 				 *	were in the top-level object.
11763 				 *	(XXX not true if we wire pages for
11764 				 *	 reading)
11765 				 */
11766 				vm_object_lock(object);
11767 
11768 				m = vm_page_lookup(object, offset);
11769 				if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
11770 				    m->vmp_absent) {
11771 					panic("vm_map_copyout: wiring %p", m);
11772 				}
11773 
11774 				prot = entry->protection;
11775 
11776 				if (override_nx(dst_map, VME_ALIAS(entry)) &&
11777 				    prot) {
11778 					prot |= VM_PROT_EXECUTE;
11779 				}
11780 
11781 				type_of_fault = DBG_CACHE_HIT_FAULT;
11782 
11783 				fault_info.user_tag = VME_ALIAS(entry);
11784 				fault_info.pmap_options = 0;
11785 				if (entry->iokit_acct ||
11786 				    (!entry->is_sub_map && !entry->use_pmap)) {
11787 					fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
11788 				}
11789 				if (entry->vme_xnu_user_debug &&
11790 				    !VM_PAGE_OBJECT(m)->code_signed) {
11791 					/*
11792 					 * Modified code-signed executable
11793 					 * region: this page does not belong
11794 					 * to a code-signed VM object, so it
11795 					 * must have been copied and should
11796 					 * therefore be typed XNU_USER_DEBUG
11797 					 * rather than XNU_USER_EXEC.
11798 					 */
11799 					fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
11800 				}
11801 
11802 				vm_fault_enter(m,
11803 				    dst_map->pmap,
11804 				    va,
11805 				    PAGE_SIZE, 0,
11806 				    prot,
11807 				    prot,
11808 				    VM_PAGE_WIRED(m),
11809 				    FALSE,            /* change_wiring */
11810 				    VM_KERN_MEMORY_NONE,            /* tag - not wiring */
11811 				    &fault_info,
11812 				    NULL,             /* need_retry */
11813 				    &type_of_fault,
11814 				    &object_lock_type); /*Exclusive mode lock. Will remain unchanged.*/
11815 
11816 				vm_object_unlock(object);
11817 
11818 				offset += PAGE_SIZE_64;
11819 				va += PAGE_SIZE;
11820 			}
11821 		}
11822 	}
11823 
11824 after_adjustments:
11825 
11826 	/*
11827 	 *	Correct the page alignment for the result
11828 	 */
11829 
11830 	*dst_addr = start + (copy->offset - vm_copy_start);
11831 
11832 #if KASAN
11833 	kasan_notify_address(*dst_addr, size);
11834 #endif
11835 
11836 	/*
11837 	 *	Update the hints and the map size
11838 	 */
11839 
11840 	if (consume_on_success) {
11841 		SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
11842 	} else {
11843 		SAVE_HINT_MAP_WRITE(dst_map, last);
11844 	}
11845 
11846 	dst_map->size += size;
11847 
11848 	/*
11849 	 *	Link in the copy
11850 	 */
11851 
11852 	if (consume_on_success) {
11853 		vm_map_copy_insert(dst_map, last, copy);
11854 		if (copy != original_copy) {
11855 			vm_map_copy_discard(original_copy);
11856 			original_copy = VM_MAP_COPY_NULL;
11857 		}
11858 	} else {
11859 		vm_map_copy_remap(dst_map, last, copy, adjustment,
11860 		    cur_protection, max_protection,
11861 		    inheritance);
11862 		if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
11863 			vm_map_copy_discard(copy);
11864 			copy = original_copy;
11865 		}
11866 	}
11867 
11868 
11869 	vm_map_unlock(dst_map);
11870 
11871 	/*
11872 	 * XXX	If wiring_required, call vm_map_pageable
11873 	 */
11874 
11875 	return KERN_SUCCESS;
11876 }
11877 
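/*
 * Editorial sketch (not part of the original source): a worked example of the
 * address arithmetic in vm_map_copyout_internal() above.  The allocation is
 * page-rounded around copy->offset, and the returned address preserves the
 * sub-page offset.  All numbers are hypothetical (16KB copy page size).
 */
#if 0   /* illustrative only, never compiled */
static void
copyout_address_arithmetic_sketch(void)
{
	vm_object_offset_t copy_offset   = 0x5800;          /* copy->offset */
	vm_map_size_t      copy_size     = 0x8000;          /* copy->size */
	vm_map_offset_t    vm_copy_start = 0x4000;          /* trunc_page(copy->offset) */
	vm_map_size_t      size          = 0xc000;          /* round_page(0xd800) - 0x4000 */
	vm_map_offset_t    start         = 0x140000000ULL;  /* chosen by vm_map_locate_space() */

	/* *dst_addr = start + (copy->offset - vm_copy_start) */
	assert(start + (copy_offset - vm_copy_start) == 0x140001800ULL);
	(void)copy_size;
	(void)size;
}
#endif
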
11878 /*
11879  *	Routine:	vm_map_copyin
11880  *
11881  *	Description:
11882  *		see vm_map_copyin_common.  Exported via Unsupported.exports.
11883  *
11884  */
11885 
11886 #undef vm_map_copyin
11887 
11888 kern_return_t
11889 vm_map_copyin(
11890 	vm_map_t                        src_map,
11891 	vm_map_address_t        src_addr,
11892 	vm_map_size_t           len,
11893 	boolean_t                       src_destroy,
11894 	vm_map_copy_t           *copy_result)   /* OUT */
11895 {
11896 	return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11897 	           FALSE, copy_result, FALSE);
11898 }
11899 
11900 /*
11901  *	Routine:	vm_map_copyin_common
11902  *
11903  *	Description:
11904  *		Copy the specified region (src_addr, len) from the
11905  *		source address space (src_map), possibly removing
11906  *		the region from the source address space (src_destroy).
11907  *
11908  *	Returns:
11909  *		A vm_map_copy_t object (copy_result), suitable for
11910  *		insertion into another address space (using vm_map_copyout),
11911  *		copying over another address space region (using
11912  *		vm_map_copy_overwrite).  If the copy is unused, it
11913  *		should be destroyed (using vm_map_copy_discard).
11914  *
11915  *	In/out conditions:
11916  *		The source map should not be locked on entry.
11917  */
11918 
11919 typedef struct submap_map {
11920 	vm_map_t        parent_map;
11921 	vm_map_offset_t base_start;
11922 	vm_map_offset_t base_end;
11923 	vm_map_size_t   base_len;
11924 	struct submap_map *next;
11925 } submap_map_t;
11926 
11927 kern_return_t
11928 vm_map_copyin_common(
11929 	vm_map_t        src_map,
11930 	vm_map_address_t src_addr,
11931 	vm_map_size_t   len,
11932 	boolean_t       src_destroy,
11933 	__unused boolean_t      src_volatile,
11934 	vm_map_copy_t   *copy_result,   /* OUT */
11935 	boolean_t       use_maxprot)
11936 {
11937 	int flags;
11938 
11939 	flags = 0;
11940 	if (src_destroy) {
11941 		flags |= VM_MAP_COPYIN_SRC_DESTROY;
11942 	}
11943 	if (use_maxprot) {
11944 		flags |= VM_MAP_COPYIN_USE_MAXPROT;
11945 	}
11946 	return vm_map_copyin_internal(src_map,
11947 	           src_addr,
11948 	           len,
11949 	           flags,
11950 	           copy_result);
11951 }
11952 kern_return_t
11953 vm_map_copyin_internal(
11954 	vm_map_t        src_map,
11955 	vm_map_address_t src_addr,
11956 	vm_map_size_t   len,
11957 	int             flags,
11958 	vm_map_copy_t   *copy_result)   /* OUT */
11959 {
11960 	vm_map_entry_t  tmp_entry;      /* Result of last map lookup --
11961 	                                 * in multi-level lookup, this
11962 	                                 * entry contains the actual
11963 	                                 * vm_object/offset.
11964 	                                 */
11965 	vm_map_entry_t  new_entry = VM_MAP_ENTRY_NULL;  /* Map entry for copy */
11966 
11967 	vm_map_offset_t src_start;      /* Start of current entry --
11968 	                                 * where copy is taking place now
11969 	                                 */
11970 	vm_map_offset_t src_end;        /* End of entire region to be
11971 	                                 * copied */
11972 	vm_map_offset_t src_base;
11973 	vm_map_t        base_map = src_map;
11974 	boolean_t       map_share = FALSE;
11975 	submap_map_t    *parent_maps = NULL;
11976 
11977 	vm_map_copy_t   copy;           /* Resulting copy */
11978 	vm_map_address_t copy_addr;
11979 	vm_map_size_t   copy_size;
11980 	boolean_t       src_destroy;
11981 	boolean_t       use_maxprot;
11982 	boolean_t       preserve_purgeable;
11983 	boolean_t       entry_was_shared;
11984 	vm_map_entry_t  saved_src_entry;
11985 
11986 
11987 	if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
11988 		return KERN_INVALID_ARGUMENT;
11989 	}
11990 
11991 #if CONFIG_KERNEL_TAGGING
11992 	if (src_map->pmap == kernel_pmap) {
11993 		src_addr = vm_memtag_canonicalize_address(src_addr);
11994 	}
11995 #endif /* CONFIG_KERNEL_TAGGING */
11996 
11997 	src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
11998 	use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
11999 	preserve_purgeable =
12000 	    (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
12001 
12002 	/*
12003 	 *	Check for copies of zero bytes.
12004 	 */
12005 
12006 	if (len == 0) {
12007 		*copy_result = VM_MAP_COPY_NULL;
12008 		return KERN_SUCCESS;
12009 	}
12010 
12011 	/*
12012 	 *	Check that the end address doesn't overflow
12013 	 */
12014 	if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
12015 		return KERN_INVALID_ADDRESS;
12016 	}
12017 	src_end = src_addr + len;
12018 	if (src_end < src_addr) {
12019 		return KERN_INVALID_ADDRESS;
12020 	}
12021 
12022 	/*
12023 	 *	Compute (page aligned) start and end of region
12024 	 */
12025 	src_start = vm_map_trunc_page(src_addr,
12026 	    VM_MAP_PAGE_MASK(src_map));
12027 	src_end = vm_map_round_page(src_end,
12028 	    VM_MAP_PAGE_MASK(src_map));
12029 	if (src_end < src_addr) {
12030 		return KERN_INVALID_ADDRESS;
12031 	}
12032 
12033 	/*
12034 	 * If the copy is sufficiently small, use a kernel buffer instead
12035 	 * of making a virtual copy.  The theory being that the cost of
12036 	 * setting up VM (and taking C-O-W faults) dominates the copy costs
12037 	 * for small regions.
12038 	 */
12039 	if ((len <= msg_ool_size_small) &&
12040 	    !use_maxprot &&
12041 	    !preserve_purgeable &&
12042 	    !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
12043 	    /*
12044 	     * Since the "msg_ool_size_small" threshold was increased and
12045 	     * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
12046 	     * address space limits, we revert to doing a virtual copy if the
12047 	     * copied range goes beyond those limits.  Otherwise, mach_vm_read()
12048 	     * of the commpage would now fail when it used to work.
12049 	     */
12050 	    (src_start >= vm_map_min(src_map) &&
12051 	    src_start < vm_map_max(src_map) &&
12052 	    src_end >= vm_map_min(src_map) &&
12053 	    src_end < vm_map_max(src_map))) {
12054 		return vm_map_copyin_kernel_buffer(src_map, src_addr, len,
12055 		           src_destroy, copy_result);
12056 	}
12057 
12058 	/*
12059 	 *	Allocate a header element for the list.
12060 	 *
12061 	 *	Use the start and end in the header to
12062 	 *	remember the endpoints prior to rounding.
12063 	 */
12064 
12065 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12066 	copy->cpy_hdr.entries_pageable = TRUE;
12067 	copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
12068 	copy->offset = src_addr;
12069 	copy->size = len;
12070 
12071 	new_entry = vm_map_copy_entry_create(copy);
12072 
12073 #define RETURN(x)                                               \
12074 	MACRO_BEGIN                                             \
12075 	vm_map_unlock(src_map);                                 \
12076 	if(src_map != base_map)                                 \
12077 	        vm_map_deallocate(src_map);                     \
12078 	if (new_entry != VM_MAP_ENTRY_NULL)                     \
12079 	        vm_map_copy_entry_dispose(new_entry);           \
12080 	vm_map_copy_discard(copy);                              \
12081 	{                                                       \
12082 	        submap_map_t	*_ptr;                          \
12083                                                                 \
12084 	        for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
12085 	                parent_maps=parent_maps->next;          \
12086 	                if (_ptr->parent_map != base_map)       \
12087 	                        vm_map_deallocate(_ptr->parent_map);    \
12088 	                kfree_type(submap_map_t, _ptr);         \
12089 	        }                                               \
12090 	}                                                       \
12091 	MACRO_RETURN(x);                                        \
12092 	MACRO_END
12093 
12094 	/*
12095 	 *	Find the beginning of the region.
12096 	 */
12097 
12098 	vm_map_lock(src_map);
12099 
12100 	/*
12101 	 * Lookup the original "src_addr" rather than the truncated
12102 	 * "src_start", in case "src_start" falls in a non-map-aligned
12103 	 * map entry *before* the map entry that contains "src_addr"...
12104 	 */
12105 	if (!vm_map_lookup_entry(src_map, src_addr, &tmp_entry)) {
12106 		RETURN(KERN_INVALID_ADDRESS);
12107 	}
12108 	if (!tmp_entry->is_sub_map) {
12109 		/*
12110 		 * ... but clip to the map-rounded "src_start" rather than
12111 		 * "src_addr" to preserve map-alignment.  We'll adjust the
12112 		 * first copy entry at the end, if needed.
12113 		 */
12114 		vm_map_clip_start(src_map, tmp_entry, src_start);
12115 	}
12116 	if (src_start < tmp_entry->vme_start) {
12117 		/*
12118 		 * Move "src_start" up to the start of the
12119 		 * first map entry to copy.
12120 		 */
12121 		src_start = tmp_entry->vme_start;
12122 	}
12123 	/* set for later submap fix-up */
12124 	copy_addr = src_start;
12125 
12126 	/*
12127 	 *	Go through entries until we get to the end.
12128 	 */
12129 
12130 	while (TRUE) {
12131 		vm_map_entry_t  src_entry = tmp_entry;  /* Top-level entry */
12132 		vm_map_size_t   src_size;               /* Size of source
12133 		                                         * map entry (in both
12134 		                                         * maps)
12135 		                                         */
12136 
12137 		vm_object_t             src_object;     /* Object to copy */
12138 		vm_object_offset_t      src_offset;
12139 
12140 		vm_object_t             new_copy_object;/* vm_object_copy_* result */
12141 
12142 		boolean_t       src_needs_copy;         /* Should source map
12143 		                                         * be made read-only
12144 		                                         * for copy-on-write?
12145 		                                         */
12146 
12147 		boolean_t       new_entry_needs_copy;   /* Will new entry be COW? */
12148 
12149 		boolean_t       was_wired;              /* Was source wired? */
12150 		boolean_t       saved_used_for_jit;     /* Saved used_for_jit. */
12151 		vm_map_version_t version;               /* Version before locks
12152 		                                         * dropped to make copy
12153 		                                         */
12154 		kern_return_t   result;                 /* Return value from
12155 		                                         * copy_strategically.
12156 		                                         */
12157 		while (tmp_entry->is_sub_map) {
12158 			vm_map_size_t submap_len;
12159 			submap_map_t *ptr;
12160 
12161 			ptr = kalloc_type(submap_map_t, Z_WAITOK);
12162 			ptr->next = parent_maps;
12163 			parent_maps = ptr;
12164 			ptr->parent_map = src_map;
12165 			ptr->base_start = src_start;
12166 			ptr->base_end = src_end;
12167 			submap_len = tmp_entry->vme_end - src_start;
12168 			if (submap_len > (src_end - src_start)) {
12169 				submap_len = src_end - src_start;
12170 			}
12171 			ptr->base_len = submap_len;
12172 
12173 			src_start -= tmp_entry->vme_start;
12174 			src_start += VME_OFFSET(tmp_entry);
12175 			src_end = src_start + submap_len;
12176 			src_map = VME_SUBMAP(tmp_entry);
12177 			vm_map_lock(src_map);
12178 			/* keep an outstanding reference for all maps in */
12179 			/* the parent maps tree except the base map */
12180 			vm_map_reference(src_map);
12181 			vm_map_unlock(ptr->parent_map);
12182 			if (!vm_map_lookup_entry(
12183 				    src_map, src_start, &tmp_entry)) {
12184 				RETURN(KERN_INVALID_ADDRESS);
12185 			}
12186 			map_share = TRUE;
12187 			if (!tmp_entry->is_sub_map) {
12188 				vm_map_clip_start(src_map, tmp_entry, src_start);
12189 			}
12190 			src_entry = tmp_entry;
12191 		}
12192 		/* we are now in the lowest level submap... */
12193 
12194 		if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
12195 		    (VME_OBJECT(tmp_entry)->phys_contiguous)) {
12196 		/* This is not supported for now. In future  */
12197 			/* we will need to detect the phys_contig   */
12198 			/* condition and then upgrade copy_slowly   */
12199 			/* to do physical copy from the device mem  */
12200 			/* based object. We can piggy-back off of   */
12201 			/* the was wired boolean to set-up the      */
12202 			/* proper handling */
12203 			RETURN(KERN_PROTECTION_FAILURE);
12204 		}
12205 		/*
12206 		 *	Create a new address map entry to hold the result.
12207 		 *	Fill in the fields from the appropriate source entries.
12208 		 *	We must unlock the source map to do this if we need
12209 		 *	to allocate a map entry.
12210 		 */
12211 		if (new_entry == VM_MAP_ENTRY_NULL) {
12212 			version.main_timestamp = src_map->timestamp;
12213 			vm_map_unlock(src_map);
12214 
12215 			new_entry = vm_map_copy_entry_create(copy);
12216 
12217 			vm_map_lock(src_map);
12218 			if ((version.main_timestamp + 1) != src_map->timestamp) {
12219 				if (!vm_map_lookup_entry(src_map, src_start,
12220 				    &tmp_entry)) {
12221 					RETURN(KERN_INVALID_ADDRESS);
12222 				}
12223 				if (!tmp_entry->is_sub_map) {
12224 					vm_map_clip_start(src_map, tmp_entry, src_start);
12225 				}
12226 				continue; /* restart w/ new tmp_entry */
12227 			}
12228 		}
12229 
12230 		/*
12231 		 *	Verify that the region can be read.
12232 		 */
12233 		if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
12234 		    !use_maxprot) ||
12235 		    (src_entry->max_protection & VM_PROT_READ) == 0) {
12236 			RETURN(KERN_PROTECTION_FAILURE);
12237 		}
12238 
12239 		/*
12240 		 *	Clip against the endpoints of the entire region.
12241 		 */
12242 
12243 		vm_map_clip_end(src_map, src_entry, src_end);
12244 
12245 		src_size = src_entry->vme_end - src_start;
12246 		src_object = VME_OBJECT(src_entry);
12247 		src_offset = VME_OFFSET(src_entry);
12248 		was_wired = (src_entry->wired_count != 0);
12249 
12250 		vm_map_entry_copy(src_map, new_entry, src_entry);
12251 		if (new_entry->is_sub_map) {
12252 			/* clear address space specifics */
12253 			new_entry->use_pmap = FALSE;
12254 		} else {
12255 			/*
12256 			 * We're dealing with a copy-on-write operation,
12257 			 * so the resulting mapping should not inherit the
12258 			 * original mapping's accounting settings.
12259 			 * "iokit_acct" should have been cleared in
12260 			 * vm_map_entry_copy().
12261 			 * "use_pmap" should be reset to its default (TRUE)
12262 			 * so that the new mapping gets accounted for in
12263 			 * the task's memory footprint.
12264 			 */
12265 			assert(!new_entry->iokit_acct);
12266 			new_entry->use_pmap = TRUE;
12267 		}
12268 
12269 		/*
12270 		 *	Attempt non-blocking copy-on-write optimizations.
12271 		 */
12272 
12273 		/*
12274 		 * If we are destroying the source, and the object
12275 		 * is internal, we could move the object reference
12276 		 * from the source to the copy.  The copy is
12277 		 * copy-on-write only if the source is.
12278 		 * We make another reference to the object, because
12279 		 * destroying the source entry will deallocate it.
12280 		 *
12281 		 * This memory transfer has to be atomic (to prevent
12282 		 * the VM object from being shared or copied while
12283 		 * it's being moved here), so we could only do this
12284 		 * if we won't have to unlock the VM map until the
12285 		 * original mapping has been fully removed.
12286 		 */
12287 
12288 RestartCopy:
12289 		if ((src_object == VM_OBJECT_NULL ||
12290 		    (!was_wired && !map_share && !tmp_entry->is_shared
12291 		    && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
12292 		    vm_object_copy_quickly(
12293 			    VME_OBJECT(new_entry),
12294 			    src_offset,
12295 			    src_size,
12296 			    &src_needs_copy,
12297 			    &new_entry_needs_copy)) {
12298 			new_entry->needs_copy = new_entry_needs_copy;
12299 
12300 			/*
12301 			 *	Handle copy-on-write obligations
12302 			 */
12303 
12304 			if (src_needs_copy && !tmp_entry->needs_copy) {
12305 				vm_prot_t prot;
12306 
12307 				prot = src_entry->protection & ~VM_PROT_WRITE;
12308 
12309 				if (override_nx(src_map, VME_ALIAS(src_entry))
12310 				    && prot) {
12311 					prot |= VM_PROT_EXECUTE;
12312 				}
12313 
12314 				vm_object_pmap_protect(
12315 					src_object,
12316 					src_offset,
12317 					src_size,
12318 					(src_entry->is_shared ?
12319 					PMAP_NULL
12320 					: src_map->pmap),
12321 					VM_MAP_PAGE_SIZE(src_map),
12322 					src_entry->vme_start,
12323 					prot);
12324 
12325 				assert(tmp_entry->wired_count == 0);
12326 				tmp_entry->needs_copy = TRUE;
12327 			}
12328 
12329 			/*
12330 			 *	The map has never been unlocked, so it's safe
12331 			 *	to move to the next entry rather than doing
12332 			 *	another lookup.
12333 			 */
12334 
12335 			goto CopySuccessful;
12336 		}
12337 
12338 		entry_was_shared = tmp_entry->is_shared;
12339 
12340 		/*
12341 		 *	Take an object reference, so that we may
12342 		 *	release the map lock(s).
12343 		 */
12344 
12345 		assert(src_object != VM_OBJECT_NULL);
12346 		vm_object_reference(src_object);
12347 
12348 		/*
12349 		 *	Record the timestamp for later verification.
12350 		 *	Unlock the map.
12351 		 */
12352 
12353 		version.main_timestamp = src_map->timestamp;
12354 		vm_map_unlock(src_map); /* Increments timestamp once! */
12355 		saved_src_entry = src_entry;
12356 		tmp_entry = VM_MAP_ENTRY_NULL;
12357 		src_entry = VM_MAP_ENTRY_NULL;
12358 
12359 		/*
12360 		 *	Perform the copy
12361 		 */
12362 
12363 		if (was_wired ||
12364 		    (src_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK &&
12365 		    !(flags & VM_MAP_COPYIN_FORK)) ||
12366 		    (debug4k_no_cow_copyin &&
12367 		    VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
12368 CopySlowly:
12369 			vm_object_lock(src_object);
12370 			result = vm_object_copy_slowly(
12371 				src_object,
12372 				src_offset,
12373 				src_size,
12374 				THREAD_UNINT,
12375 				&new_copy_object);
12376 			/* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
12377 			saved_used_for_jit = new_entry->used_for_jit;
12378 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12379 			new_entry->used_for_jit = saved_used_for_jit;
12380 			VME_OFFSET_SET(new_entry,
12381 			    src_offset - vm_object_trunc_page(src_offset));
12382 			new_entry->needs_copy = FALSE;
12383 		} else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
12384 		    (entry_was_shared || map_share)) {
12385 			vm_object_t new_object;
12386 
12387 			vm_object_lock_shared(src_object);
12388 			new_object = vm_object_copy_delayed(
12389 				src_object,
12390 				src_offset,
12391 				src_size,
12392 				TRUE);
12393 			if (new_object == VM_OBJECT_NULL) {
12394 				goto CopySlowly;
12395 			}
12396 
12397 			VME_OBJECT_SET(new_entry, new_object, false, 0);
12398 			assert(new_entry->wired_count == 0);
12399 			new_entry->needs_copy = TRUE;
12400 			assert(!new_entry->iokit_acct);
12401 			assert(new_object->purgable == VM_PURGABLE_DENY);
12402 			assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
12403 			result = KERN_SUCCESS;
12404 		} else {
12405 			vm_object_offset_t new_offset;
12406 			new_offset = VME_OFFSET(new_entry);
12407 			result = vm_object_copy_strategically(src_object,
12408 			    src_offset,
12409 			    src_size,
12410 			    (flags & VM_MAP_COPYIN_FORK),
12411 			    &new_copy_object,
12412 			    &new_offset,
12413 			    &new_entry_needs_copy);
12414 			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
12415 			saved_used_for_jit = new_entry->used_for_jit;
12416 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12417 			new_entry->used_for_jit = saved_used_for_jit;
12418 			if (new_offset != VME_OFFSET(new_entry)) {
12419 				VME_OFFSET_SET(new_entry, new_offset);
12420 			}
12421 
12422 			new_entry->needs_copy = new_entry_needs_copy;
12423 		}
12424 
12425 		if (result == KERN_SUCCESS &&
12426 		    ((preserve_purgeable &&
12427 		    src_object->purgable != VM_PURGABLE_DENY) ||
12428 		    new_entry->used_for_jit)) {
12429 			/*
12430 			 * Purgeable objects should be COPY_NONE, true share;
12431 			 * this should be propagated to the copy.
12432 			 *
12433 			 * Also force mappings the pmap specially protects to
12434 			 * be COPY_NONE; trying to COW these mappings would
12435 			 * change the effective protections, which could have
12436 			 * side effects if the pmap layer relies on the
12437 			 * specified protections.
12438 			 */
12439 
12440 			vm_object_t     new_object;
12441 
12442 			new_object = VME_OBJECT(new_entry);
12443 			assert(new_object != src_object);
12444 			vm_object_lock(new_object);
12445 			assert(new_object->ref_count == 1);
12446 			assert(new_object->shadow == VM_OBJECT_NULL);
12447 			assert(new_object->vo_copy == VM_OBJECT_NULL);
12448 			assert(new_object->vo_owner == NULL);
12449 
12450 			new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
12451 
12452 			if (preserve_purgeable &&
12453 			    src_object->purgable != VM_PURGABLE_DENY) {
12454 				VM_OBJECT_SET_TRUE_SHARE(new_object, TRUE);
12455 
12456 				/* start as non-volatile with no owner... */
12457 				VM_OBJECT_SET_PURGABLE(new_object, VM_PURGABLE_NONVOLATILE);
12458 				vm_purgeable_nonvolatile_enqueue(new_object, NULL);
12459 				/* ... and move to src_object's purgeable state */
12460 				if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
12461 					int state;
12462 					state = src_object->purgable;
12463 					vm_object_purgable_control(
12464 						new_object,
12465 						VM_PURGABLE_SET_STATE_FROM_KERNEL,
12466 						&state);
12467 				}
12468 				/* no pmap accounting for purgeable objects */
12469 				new_entry->use_pmap = FALSE;
12470 			}
12471 
12472 			vm_object_unlock(new_object);
12473 			new_object = VM_OBJECT_NULL;
12474 		}
12475 
12476 		if (result != KERN_SUCCESS &&
12477 		    result != KERN_MEMORY_RESTART_COPY) {
12478 			vm_map_lock(src_map);
12479 			RETURN(result);
12480 		}
12481 
12482 		/*
12483 		 *	Throw away the extra reference
12484 		 */
12485 
12486 		vm_object_deallocate(src_object);
12487 
12488 		/*
12489 		 *	Verify that the map has not substantially
12490 		 *	changed while the copy was being made.
12491 		 */
12492 
12493 		vm_map_lock(src_map);
12494 
12495 		if ((version.main_timestamp + 1) == src_map->timestamp) {
12496 			/* src_map hasn't changed: src_entry is still valid */
12497 			src_entry = saved_src_entry;
12498 			goto VerificationSuccessful;
12499 		}
12500 
12501 		/*
12502 		 *	Simple version comparison failed.
12503 		 *
12504 		 *	Retry the lookup and verify that the
12505 		 *	same object/offset are still present.
12506 		 *
12507 		 *	[Note: a memory manager that colludes with
12508 		 *	the calling task can detect that we have
12509 		 *	cheated.  While the map was unlocked, the
12510 		 *	mapping could have been changed and restored.]
12511 		 */
12512 
12513 		if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
12514 			if (result != KERN_MEMORY_RESTART_COPY) {
12515 				vm_object_deallocate(VME_OBJECT(new_entry));
12516 				VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
12517 				/* reset accounting state */
12518 				new_entry->iokit_acct = FALSE;
12519 				new_entry->use_pmap = TRUE;
12520 			}
12521 			RETURN(KERN_INVALID_ADDRESS);
12522 		}
12523 
12524 		src_entry = tmp_entry;
12525 		vm_map_clip_start(src_map, src_entry, src_start);
12526 
12527 		if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
12528 		    !use_maxprot) ||
12529 		    ((src_entry->max_protection & VM_PROT_READ) == 0)) {
12530 			goto VerificationFailed;
12531 		}
12532 
12533 		if (src_entry->vme_end < new_entry->vme_end) {
12534 			/*
12535 			 * This entry might have been shortened
12536 			 * (vm_map_clip_end) or been replaced with
12537 			 * an entry that ends closer to "src_start"
12538 			 * than before.
12539 			 * Adjust "new_entry" accordingly; copying
12540 			 * less memory would be correct but we also
12541 			 * redo the copy (see below) if the new entry
12542 			 * no longer points at the same object/offset.
12543 			 */
12544 			assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
12545 			    VM_MAP_COPY_PAGE_MASK(copy)));
12546 			new_entry->vme_end = src_entry->vme_end;
12547 			src_size = new_entry->vme_end - src_start;
12548 		} else if (src_entry->vme_end > new_entry->vme_end) {
12549 			/*
12550 			 * This entry might have been extended
12551 			 * (vm_map_entry_simplify() or coalesce)
12552 			 * or been replaced with an entry that ends farther
12553 			 * from "src_start" than before.
12554 			 *
12555 			 * We've called vm_object_copy_*() only on
12556 			 * the previous <start:end> range, so we can't
12557 			 * just extend new_entry.  We have to re-do
12558 			 * the copy based on the new entry as if it was
12559 			 * pointing at a different object/offset (see
12560 			 * "Verification failed" below).
12561 			 */
12562 		}
12563 
12564 		if ((VME_OBJECT(src_entry) != src_object) ||
12565 		    (VME_OFFSET(src_entry) != src_offset) ||
12566 		    (src_entry->vme_end > new_entry->vme_end)) {
12567 			/*
12568 			 *	Verification failed.
12569 			 *
12570 			 *	Start over with this top-level entry.
12571 			 */
12572 
12573 VerificationFailed:     ;
12574 
12575 			vm_object_deallocate(VME_OBJECT(new_entry));
12576 			tmp_entry = src_entry;
12577 			continue;
12578 		}
12579 
12580 		/*
12581 		 *	Verification succeeded.
12582 		 */
12583 
12584 VerificationSuccessful:;
12585 
12586 		if (result == KERN_MEMORY_RESTART_COPY) {
12587 			goto RestartCopy;
12588 		}
12589 
12590 		/*
12591 		 *	Copy succeeded.
12592 		 */
12593 
12594 CopySuccessful: ;
12595 
12596 		/*
12597 		 *	Link in the new copy entry.
12598 		 */
12599 
12600 		vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
12601 		    new_entry);
12602 
12603 		/*
12604 		 *	Determine whether the entire region
12605 		 *	has been copied.
12606 		 */
12607 		src_base = src_start;
12608 		src_start = new_entry->vme_end;
12609 		new_entry = VM_MAP_ENTRY_NULL;
12610 		while ((src_start >= src_end) && (src_end != 0)) {
12611 			submap_map_t    *ptr;
12612 
12613 			if (src_map == base_map) {
12614 				/* back to the top */
12615 				break;
12616 			}
12617 
12618 			ptr = parent_maps;
12619 			assert(ptr != NULL);
12620 			parent_maps = parent_maps->next;
12621 
12622 			/* fix up the damage we did in that submap */
12623 			vm_map_simplify_range(src_map,
12624 			    src_base,
12625 			    src_end);
12626 
12627 			vm_map_unlock(src_map);
12628 			vm_map_deallocate(src_map);
12629 			vm_map_lock(ptr->parent_map);
12630 			src_map = ptr->parent_map;
12631 			src_base = ptr->base_start;
12632 			src_start = ptr->base_start + ptr->base_len;
12633 			src_end = ptr->base_end;
12634 			if (!vm_map_lookup_entry(src_map,
12635 			    src_start,
12636 			    &tmp_entry) &&
12637 			    (src_end > src_start)) {
12638 				RETURN(KERN_INVALID_ADDRESS);
12639 			}
12640 			kfree_type(submap_map_t, ptr);
12641 			if (parent_maps == NULL) {
12642 				map_share = FALSE;
12643 			}
12644 			src_entry = tmp_entry->vme_prev;
12645 		}
12646 
12647 		if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
12648 		    (src_start >= src_addr + len) &&
12649 		    (src_addr + len != 0)) {
12650 			/*
12651 			 * Stop copying now, even though we haven't reached
12652 			 * "src_end".  We'll adjust the end of the last copy
12653 			 * entry at the end, if needed.
12654 			 *
12655 			 * If src_map's alignment is different from the
12656 			 * system's page-alignment, there could be
12657 			 * extra non-map-aligned map entries between
12658 			 * the original (non-rounded) "src_addr + len"
12659 			 * and the rounded "src_end".
12660 			 * We do not want to copy those map entries since
12661 			 * they're not part of the copied range.
12662 			 */
12663 			break;
12664 		}
12665 
12666 		if ((src_start >= src_end) && (src_end != 0)) {
12667 			break;
12668 		}
12669 
12670 		/*
12671 		 *	Verify that there are no gaps in the region
12672 		 */
12673 
12674 		tmp_entry = src_entry->vme_next;
12675 		if ((tmp_entry->vme_start != src_start) ||
12676 		    (tmp_entry == vm_map_to_entry(src_map))) {
12677 			RETURN(KERN_INVALID_ADDRESS);
12678 		}
12679 	}
12680 
12681 	/*
12682 	 * If the source should be destroyed, do it now, since the
12683 	 * copy was successful.
12684 	 */
12685 	if (src_destroy) {
12686 		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
12687 
12688 		if (src_map == kernel_map) {
12689 			remove_flags |= VM_MAP_REMOVE_KUNWIRE;
12690 		}
12691 		(void)vm_map_remove_and_unlock(src_map,
12692 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
12693 		    src_end,
12694 		    remove_flags,
12695 		    KMEM_GUARD_NONE);
12696 	} else {
12697 		/* fix up the damage we did in the base map */
12698 		vm_map_simplify_range(
12699 			src_map,
12700 			vm_map_trunc_page(src_addr,
12701 			VM_MAP_PAGE_MASK(src_map)),
12702 			vm_map_round_page(src_end,
12703 			VM_MAP_PAGE_MASK(src_map)));
12704 		vm_map_unlock(src_map);
12705 	}
12706 
12707 	tmp_entry = VM_MAP_ENTRY_NULL;
12708 
12709 	if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
12710 	    VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12711 		vm_map_offset_t original_start, original_offset, original_end;
12712 
12713 		assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12714 
12715 		/* adjust alignment of first copy_entry's "vme_start" */
12716 		tmp_entry = vm_map_copy_first_entry(copy);
12717 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12718 			vm_map_offset_t adjustment;
12719 
12720 			original_start = tmp_entry->vme_start;
12721 			original_offset = VME_OFFSET(tmp_entry);
12722 
12723 			/* map-align the start of the first copy entry... */
12724 			adjustment = (tmp_entry->vme_start -
12725 			    vm_map_trunc_page(
12726 				    tmp_entry->vme_start,
12727 				    VM_MAP_PAGE_MASK(src_map)));
12728 			tmp_entry->vme_start -= adjustment;
12729 			VME_OFFSET_SET(tmp_entry,
12730 			    VME_OFFSET(tmp_entry) - adjustment);
12731 			copy_addr -= adjustment;
12732 			assert(tmp_entry->vme_start < tmp_entry->vme_end);
12733 			/* ... adjust for mis-aligned start of copy range */
12734 			adjustment =
12735 			    (vm_map_trunc_page(copy->offset,
12736 			    PAGE_MASK) -
12737 			    vm_map_trunc_page(copy->offset,
12738 			    VM_MAP_PAGE_MASK(src_map)));
12739 			if (adjustment) {
12740 				assert(page_aligned(adjustment));
12741 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12742 				tmp_entry->vme_start += adjustment;
12743 				VME_OFFSET_SET(tmp_entry,
12744 				    (VME_OFFSET(tmp_entry) +
12745 				    adjustment));
12746 				copy_addr += adjustment;
12747 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12748 			}
12749 
12750 			/*
12751 			 * Assert that the adjustments haven't exposed
12752 			 * more than was originally copied...
12753 			 */
12754 			assert(tmp_entry->vme_start >= original_start);
12755 			assert(VME_OFFSET(tmp_entry) >= original_offset);
12756 			/*
12757 			 * ... and that it did not adjust outside of
12758 			 * a single 16K page.
12759 			 */
12760 			assert(vm_map_trunc_page(tmp_entry->vme_start,
12761 			    VM_MAP_PAGE_MASK(src_map)) ==
12762 			    vm_map_trunc_page(original_start,
12763 			    VM_MAP_PAGE_MASK(src_map)));
12764 		}
12765 
12766 		/* adjust alignment of last copy_entry's "vme_end" */
12767 		tmp_entry = vm_map_copy_last_entry(copy);
12768 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12769 			vm_map_offset_t adjustment;
12770 
12771 			original_end = tmp_entry->vme_end;
12772 
12773 			/* map-align the end of the last copy entry... */
12774 			tmp_entry->vme_end =
12775 			    vm_map_round_page(tmp_entry->vme_end,
12776 			    VM_MAP_PAGE_MASK(src_map));
12777 			/* ... adjust for mis-aligned end of copy range */
12778 			adjustment =
12779 			    (vm_map_round_page((copy->offset +
12780 			    copy->size),
12781 			    VM_MAP_PAGE_MASK(src_map)) -
12782 			    vm_map_round_page((copy->offset +
12783 			    copy->size),
12784 			    PAGE_MASK));
12785 			if (adjustment) {
12786 				assert(page_aligned(adjustment));
12787 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12788 				tmp_entry->vme_end -= adjustment;
12789 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12790 			}
12791 
12792 			/*
12793 			 * Assert that the adjustments haven't exposed
12794 			 * more than was originally copied...
12795 			 */
12796 			assert(tmp_entry->vme_end <= original_end);
12797 			/*
12798 			 * ... and that it did not adjust outside of
12799 			 * a single 16K page.
12800 			 */
12801 			assert(vm_map_round_page(tmp_entry->vme_end,
12802 			    VM_MAP_PAGE_MASK(src_map)) ==
12803 			    vm_map_round_page(original_end,
12804 			    VM_MAP_PAGE_MASK(src_map)));
12805 		}
12806 	}
12807 
12808 	/* Fix-up start and end points in copy.  This is necessary */
12809 	/* when the various entries in the copy object were picked */
12810 	/* up from different sub-maps */
12811 
12812 	tmp_entry = vm_map_copy_first_entry(copy);
12813 	copy_size = 0; /* compute actual size */
12814 	while (tmp_entry != vm_map_copy_to_entry(copy)) {
12815 		assert(VM_MAP_PAGE_ALIGNED(
12816 			    copy_addr + (tmp_entry->vme_end -
12817 			    tmp_entry->vme_start),
12818 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12819 		assert(VM_MAP_PAGE_ALIGNED(
12820 			    copy_addr,
12821 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12822 
12823 		/*
12824 		 * The copy_entries will be injected directly into the
12825 		 * destination map and might not be "map aligned" there...
12826 		 */
12827 		tmp_entry->map_aligned = FALSE;
12828 
12829 		tmp_entry->vme_end = copy_addr +
12830 		    (tmp_entry->vme_end - tmp_entry->vme_start);
12831 		tmp_entry->vme_start = copy_addr;
12832 		assert(tmp_entry->vme_start < tmp_entry->vme_end);
12833 		copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12834 		copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12835 		tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12836 	}
12837 
12838 	if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12839 	    copy_size < copy->size) {
12840 		/*
12841 		 * The actual size of the VM map copy is smaller than what
12842 		 * was requested by the caller.  This must be because some
12843 		 * PAGE_SIZE-sized pages are missing at the end of the last
12844 		 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12845 		 * The caller might not have been aware of those missing
12846 		 * pages and might not want to be aware of it, which is
12847 		 * fine as long as they don't try to access (and crash on)
12848 		 * those missing pages.
12849 		 * Let's adjust the size of the "copy", to avoid failing
12850 		 * in vm_map_copyout() or vm_map_copy_overwrite().
12851 		 */
12852 		assert(vm_map_round_page(copy_size,
12853 		    VM_MAP_PAGE_MASK(src_map)) ==
12854 		    vm_map_round_page(copy->size,
12855 		    VM_MAP_PAGE_MASK(src_map)));
12856 		copy->size = copy_size;
12857 	}
12858 
12859 	*copy_result = copy;
12860 	return KERN_SUCCESS;
12861 
12862 #undef  RETURN
12863 }
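/*
 *	Illustrative sketch (not part of the original source): callers
 *	needing flags that vm_map_copyin_common() does not expose, such
 *	as VM_MAP_COPYIN_PRESERVE_PURGEABLE, use the internal entry
 *	point directly.  Variable names below are hypothetical.
 *
 *		kr = vm_map_copyin_internal(src_map, src_addr, len,
 *		    VM_MAP_COPYIN_SRC_DESTROY |
 *		    VM_MAP_COPYIN_PRESERVE_PURGEABLE,
 *		    &copy);
 */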
12864 
12865 kern_return_t
12866 vm_map_copy_extract(
12867 	vm_map_t                src_map,
12868 	vm_map_address_t        src_addr,
12869 	vm_map_size_t           len,
12870 	boolean_t               do_copy,
12871 	vm_map_copy_t           *copy_result,   /* OUT */
12872 	vm_prot_t               *cur_prot,      /* IN/OUT */
12873 	vm_prot_t               *max_prot,      /* IN/OUT */
12874 	vm_inherit_t            inheritance,
12875 	vm_map_kernel_flags_t   vmk_flags)
12876 {
12877 	vm_map_copy_t   copy;
12878 	kern_return_t   kr;
12879 	vm_prot_t required_cur_prot, required_max_prot;
12880 
12881 	/*
12882 	 *	Check for copies of zero bytes.
12883 	 */
12884 
12885 	if (len == 0) {
12886 		*copy_result = VM_MAP_COPY_NULL;
12887 		return KERN_SUCCESS;
12888 	}
12889 
12890 	/*
12891 	 *	Check that the end address doesn't overflow
12892 	 */
12893 	if (src_addr + len < src_addr) {
12894 		return KERN_INVALID_ADDRESS;
12895 	}
12896 	if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
12897 		return KERN_INVALID_ADDRESS;
12898 	}
12899 
12900 	if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12901 		DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12902 	}
12903 
12904 	required_cur_prot = *cur_prot;
12905 	required_max_prot = *max_prot;
12906 
12907 	/*
12908 	 *	Allocate a header element for the list.
12909 	 *
12910 	 *	Use the start and end in the header to
12911 	 *	remember the endpoints prior to rounding.
12912 	 */
12913 
12914 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12915 	copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
12916 	copy->offset = 0;
12917 	copy->size = len;
12918 
12919 	kr = vm_map_remap_extract(src_map,
12920 	    src_addr,
12921 	    len,
12922 	    do_copy,             /* copy */
12923 	    copy,
12924 	    cur_prot,            /* IN/OUT */
12925 	    max_prot,            /* IN/OUT */
12926 	    inheritance,
12927 	    vmk_flags);
12928 	if (kr != KERN_SUCCESS) {
12929 		vm_map_copy_discard(copy);
12930 		return kr;
12931 	}
12932 	if (required_cur_prot != VM_PROT_NONE) {
12933 		assert((*cur_prot & required_cur_prot) == required_cur_prot);
12934 		assert((*max_prot & required_max_prot) == required_max_prot);
12935 	}
12936 
12937 	*copy_result = copy;
12938 	return KERN_SUCCESS;
12939 }
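/*
 *	Illustrative sketch (not part of the original source): "cur_prot"
 *	and "max_prot" are IN/OUT.  On entry they carry the protections
 *	the caller requires; on return they carry the protections that
 *	vm_map_remap_extract() actually granted, which the assertions
 *	above compare against the requirement.  Names are hypothetical.
 *
 *		vm_map_copy_t copy;
 *		vm_prot_t cur_prot = VM_PROT_READ;
 *		vm_prot_t max_prot = VM_PROT_READ;
 *
 *		kr = vm_map_copy_extract(src_map, src_addr, len, FALSE,
 *		    &copy, &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
 *		    VM_MAP_KERNEL_FLAGS_NONE);
 */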
12940 
12941 static void
12942 vm_map_fork_share(
12943 	vm_map_t        old_map,
12944 	vm_map_entry_t  old_entry,
12945 	vm_map_t        new_map)
12946 {
12947 	vm_object_t     object;
12948 	vm_map_entry_t  new_entry;
12949 
12950 	/*
12951 	 *	New sharing code.  New map entry
12952 	 *	references original object.  Internal
12953 	 *	objects use asynchronous copy algorithm for
12954 	 *	future copies.  First make sure we have
12955 	 *	the right object.  If we need a shadow,
12956 	 *	or someone else already has one, then
12957 	 *	make a new shadow and share it.
12958 	 */
12959 
12960 	if (!old_entry->is_sub_map) {
12961 		object = VME_OBJECT(old_entry);
12962 	}
12963 
12964 	if (old_entry->is_sub_map) {
12965 		assert(old_entry->wired_count == 0);
12966 #ifndef NO_NESTED_PMAP
12967 #if !PMAP_FORK_NEST
12968 		if (old_entry->use_pmap) {
12969 			kern_return_t   result;
12970 
12971 			result = pmap_nest(new_map->pmap,
12972 			    (VME_SUBMAP(old_entry))->pmap,
12973 			    (addr64_t)old_entry->vme_start,
12974 			    (uint64_t)(old_entry->vme_end - old_entry->vme_start));
12975 			if (result) {
12976 				panic("vm_map_fork_share: pmap_nest failed!");
12977 			}
12978 		}
12979 #endif /* !PMAP_FORK_NEST */
12980 #endif  /* NO_NESTED_PMAP */
12981 	} else if (object == VM_OBJECT_NULL) {
12982 		object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
12983 		    old_entry->vme_start));
12984 		VME_OFFSET_SET(old_entry, 0);
12985 		VME_OBJECT_SET(old_entry, object, false, 0);
12986 		old_entry->use_pmap = TRUE;
12987 //		assert(!old_entry->needs_copy);
12988 	} else if (object->copy_strategy !=
12989 	    MEMORY_OBJECT_COPY_SYMMETRIC) {
12990 		/*
12991 		 *	We are already using an asymmetric
12992 		 *	copy, and therefore we already have
12993 		 *	the right object.
12994 		 */
12995 
12996 		assert(!old_entry->needs_copy);
12997 	} else if (old_entry->needs_copy ||       /* case 1 */
12998 	    object->shadowed ||                 /* case 2 */
12999 	    (!object->true_share &&             /* case 3 */
13000 	    !old_entry->is_shared &&
13001 	    (object->vo_size >
13002 	    (vm_map_size_t)(old_entry->vme_end -
13003 	    old_entry->vme_start)))) {
13004 		bool is_writable;
13005 
13006 		/*
13007 		 *	We need to create a shadow.
13008 		 *	There are three cases here.
13009 		 *	In the first case, we need to
13010 		 *	complete a deferred symmetrical
13011 		 *	copy that we participated in.
13012 		 *	In the second and third cases,
13013 		 *	we need to create the shadow so
13014 		 *	that changes that we make to the
13015 		 *	object do not interfere with
13016 		 *	any symmetrical copies which
13017 		 *	have occurred (case 2) or which
13018 		 *	might occur (case 3).
13019 		 *
13020 		 *	The first case is when we had
13021 		 *	deferred shadow object creation
13022 		 *	via the entry->needs_copy mechanism.
13023 		 *	This mechanism only works when
13024 		 *	only one entry points to the source
13025 		 *	object, and we are about to create
13026 		 *	a second entry pointing to the
13027 		 *	same object. The problem is that
13028 		 *	there is no way of mapping from
13029 		 *	an object to the entries pointing
13030 		 *	to it. (Deferred shadow creation
13031 		 *	works with one entry because it occurs
13032 		 *	at fault time, and we walk from the
13033 		 *	entry to the object when handling
13034 		 *	the fault.)
13035 		 *
13036 		 *	The second case is when the object
13037 		 *	to be shared has already been copied
13038 		 *	with a symmetric copy, but we point
13039 		 *	directly to the object without
13040 		 *	needs_copy set in our entry. (This
13041 		 *	can happen because different ranges
13042 		 *	of an object can be pointed to by
13043 		 *	different entries. In particular,
13044 		 *	a single entry pointing to an object
13045 		 *	can be split by a call to vm_inherit,
13046 		 *	which, combined with task_create, can
13047 		 *	result in the different entries
13048 		 *	having different needs_copy values.)
13049 		 *	The shadowed flag in the object allows
13050 		 *	us to detect this case. The problem
13051 		 *	with this case is that if this object
13052 		 *	has or will have shadows, then we
13053 		 *	must not perform an asymmetric copy
13054 		 *	of this object, since such a copy
13055 		 *	allows the object to be changed, which
13056 		 *	will break the previous symmetrical
13057 		 *	copies (which rely upon the object
13058 		 *	not changing). In a sense, the shadowed
13059 		 *	flag says "don't change this object".
13060 		 *	We fix this by creating a shadow
13061 		 *	object for this object, and sharing
13062 		 *	that. This works because we are free
13063 		 *	to change the shadow object (and thus
13064 		 *	to use an asymmetric copy strategy);
13065 		 *	this is also semantically correct,
13066 		 *	since this object is temporary, and
13067 		 *	therefore a copy of the object is
13068 		 *	as good as the object itself. (This
13069 		 *	is not true for permanent objects,
13070 		 *	since the pager needs to see changes,
13071 		 *	which won't happen if the changes
13072 		 *	are made to a copy.)
13073 		 *
13074 		 *	The third case is when the object
13075 		 *	to be shared has parts sticking
13076 		 *	outside of the entry we're working
13077 		 *	with, and thus may in the future
13078 		 *	be subject to a symmetrical copy.
13079 		 *	(This is a preemptive version of
13080 		 *	case 2.)
13081 		 */
13082 		VME_OBJECT_SHADOW(old_entry,
13083 		    (vm_map_size_t) (old_entry->vme_end -
13084 		    old_entry->vme_start),
13085 		    vm_map_always_shadow(old_map));
13086 
13087 		/*
13088 		 *	If we're making a shadow for other than
13089 		 *	copy on write reasons, then we have
13090 		 *	to remove write permission.
13091 		 */
13092 
13093 		is_writable = false;
13094 		if (old_entry->protection & VM_PROT_WRITE) {
13095 			is_writable = true;
13096 #if __arm64e__
13097 		} else if (old_entry->used_for_tpro) {
13098 			is_writable = true;
13099 #endif /* __arm64e__ */
13100 		}
13101 		if (!old_entry->needs_copy && is_writable) {
13102 			vm_prot_t prot;
13103 
13104 			if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13105 				panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13106 				    __FUNCTION__, old_map, old_map->pmap,
13107 				    old_entry,
13108 				    (uint64_t)old_entry->vme_start,
13109 				    (uint64_t)old_entry->vme_end,
13110 				    old_entry->protection);
13111 			}
13112 
13113 			prot = old_entry->protection & ~VM_PROT_WRITE;
13114 
13115 			if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13116 				panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13117 				    __FUNCTION__, old_map, old_map->pmap,
13118 				    old_entry,
13119 				    (uint64_t)old_entry->vme_start,
13120 				    (uint64_t)old_entry->vme_end,
13121 				    prot);
13122 			}
13123 
13124 			if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
13125 				prot |= VM_PROT_EXECUTE;
13126 			}
13127 
13128 
13129 			if (old_map->mapped_in_other_pmaps) {
13130 				vm_object_pmap_protect(
13131 					VME_OBJECT(old_entry),
13132 					VME_OFFSET(old_entry),
13133 					(old_entry->vme_end -
13134 					old_entry->vme_start),
13135 					PMAP_NULL,
13136 					PAGE_SIZE,
13137 					old_entry->vme_start,
13138 					prot);
13139 			} else {
13140 				pmap_protect(old_map->pmap,
13141 				    old_entry->vme_start,
13142 				    old_entry->vme_end,
13143 				    prot);
13144 			}
13145 		}
13146 
13147 		old_entry->needs_copy = FALSE;
13148 		object = VME_OBJECT(old_entry);
13149 	}
13150 
13151 
13152 	/*
13153 	 *	If object was using a symmetric copy strategy,
13154 	 *	change its copy strategy to the default
13155 	 *	asymmetric copy strategy, which is copy_delay
13156 	 *	in the non-norma case and copy_call in the
13157 	 *	norma case. Bump the reference count for the
13158 	 *	new entry.
13159 	 */
13160 
13161 	if (old_entry->is_sub_map) {
13162 		vm_map_reference(VME_SUBMAP(old_entry));
13163 	} else {
13164 		vm_object_lock(object);
13165 		vm_object_reference_locked(object);
13166 		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
13167 			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
13168 		}
13169 		vm_object_unlock(object);
13170 	}
13171 
13172 	/*
13173 	 *	Clone the entry, using object ref from above.
13174 	 *	Mark both entries as shared.
13175 	 */
13176 
13177 	new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
13178 	vm_map_entry_copy(old_map, new_entry, old_entry);
13179 	old_entry->is_shared = TRUE;
13180 	new_entry->is_shared = TRUE;
13181 
13182 	/*
13183 	 * We're dealing with a shared mapping, so the resulting mapping
13184 	 * should inherit some of the original mapping's accounting settings.
13185 	 * "iokit_acct" should have been cleared in vm_map_entry_copy().
13186 	 * "use_pmap" should stay the same as before (if it hasn't been reset
13187 	 * to TRUE when we cleared "iokit_acct").
13188 	 */
13189 	assert(!new_entry->iokit_acct);
13190 
13191 	/*
13192 	 *	If the old entry's inheritance is VM_INHERIT_NONE,
13193 	 *	the new entry is for a corpse fork, so remove the
13194 	 *	write permission from the new entry.
13195 	 */
13196 	if (old_entry->inheritance == VM_INHERIT_NONE) {
13197 		new_entry->protection &= ~VM_PROT_WRITE;
13198 		new_entry->max_protection &= ~VM_PROT_WRITE;
13199 	}
13200 
13201 	/*
13202 	 *	Insert the entry into the new map -- we
13203 	 *	know we're inserting at the end of the new
13204 	 *	map.
13205 	 */
13206 
13207 	vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
13208 	    VM_MAP_KERNEL_FLAGS_NONE);
13209 
13210 	/*
13211 	 *	Update the physical map
13212 	 */
13213 
13214 	if (old_entry->is_sub_map) {
13215 		/* Bill Angell pmap support goes here */
13216 	} else {
13217 		pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
13218 		    old_entry->vme_end - old_entry->vme_start,
13219 		    old_entry->vme_start);
13220 	}
13221 }
13222 
13223 static boolean_t
13224 vm_map_fork_copy(
13225 	vm_map_t        old_map,
13226 	vm_map_entry_t  *old_entry_p,
13227 	vm_map_t        new_map,
13228 	int             vm_map_copyin_flags)
13229 {
13230 	vm_map_entry_t old_entry = *old_entry_p;
13231 	vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
13232 	vm_map_offset_t start = old_entry->vme_start;
13233 	vm_map_copy_t copy;
13234 	vm_map_entry_t last = vm_map_last_entry(new_map);
13235 
13236 	vm_map_unlock(old_map);
13237 	/*
13238 	 *	Use maxprot version of copyin because we
13239 	 *	care about whether this memory can ever
13240 	 *	be accessed, not just whether it's accessible
13241 	 *	right now.
13242 	 */
13243 	vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
13244 	if (vm_map_copyin_internal(old_map, start, entry_size,
13245 	    vm_map_copyin_flags, &copy)
13246 	    != KERN_SUCCESS) {
13247 		/*
13248 		 *	The map might have changed while it
13249 		 *	was unlocked, check it again.  Skip
13250 		 *	any blank space or permanently
13251 		 *	unreadable region.
13252 		 */
13253 		vm_map_lock(old_map);
13254 		if (!vm_map_lookup_entry(old_map, start, &last) ||
13255 		    (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
13256 			last = last->vme_next;
13257 		}
13258 		*old_entry_p = last;
13259 
13260 		/*
13261 		 * XXX	For some error returns, want to
13262 		 * XXX	skip to the next element.  Note
13263 		 *	that INVALID_ADDRESS and
13264 		 *	PROTECTION_FAILURE are handled above.
13265 		 */
13266 
13267 		return FALSE;
13268 	}
13269 
13270 	/*
13271 	 * Assert that the vm_map_copy is coming from the right
13272 	 * zone and hasn't been forged
13273 	 */
13274 	vm_map_copy_require(copy);
13275 
13276 	/*
13277 	 *	Insert the copy into the new map
13278 	 */
13279 	vm_map_copy_insert(new_map, last, copy);
13280 
13281 	/*
13282 	 *	Pick up the traversal at the end of
13283 	 *	the copied region.
13284 	 */
13285 
13286 	vm_map_lock(old_map);
13287 	start += entry_size;
13288 	if (!vm_map_lookup_entry(old_map, start, &last)) {
13289 		last = last->vme_next;
13290 	} else {
13291 		if (last->vme_start == start) {
13292 			/*
13293 			 * No need to clip here and we don't
13294 			 * want to cause any unnecessary
13295 			 * unnesting...
13296 			 */
13297 		} else {
13298 			vm_map_clip_start(old_map, last, start);
13299 		}
13300 	}
13301 	*old_entry_p = last;
13302 
13303 	return TRUE;
13304 }
13305 
13306 #if PMAP_FORK_NEST
13307 #define PMAP_FORK_NEST_DEBUG 0
13308 static inline void
13309 vm_map_fork_unnest(
13310 	pmap_t new_pmap,
13311 	vm_map_offset_t pre_nested_start,
13312 	vm_map_offset_t pre_nested_end,
13313 	vm_map_offset_t start,
13314 	vm_map_offset_t end)
13315 {
13316 	kern_return_t kr;
13317 	vm_map_offset_t nesting_mask, start_unnest, end_unnest;
13318 
13319 	assertf(pre_nested_start <= pre_nested_end,
13320 	    "pre_nested start 0x%llx end 0x%llx",
13321 	    (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13322 	assertf(start <= end,
13323 	    "start 0x%llx end 0x%llx",
13324 	    (uint64_t) start, (uint64_t)end);
13325 
13326 	if (pre_nested_start == pre_nested_end) {
13327 		/* nothing was pre-nested: done */
13328 		return;
13329 	}
13330 	if (end <= pre_nested_start) {
13331 		/* fully before pre-nested range: done */
13332 		return;
13333 	}
13334 	if (start >= pre_nested_end) {
13335 		/* fully after pre-nested range: done */
13336 		return;
13337 	}
13338 	/* ignore parts of range outside of pre_nested range */
13339 	if (start < pre_nested_start) {
13340 		start = pre_nested_start;
13341 	}
13342 	if (end > pre_nested_end) {
13343 		end = pre_nested_end;
13344 	}
13345 	nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
13346 	start_unnest = start & ~nesting_mask;
13347 	end_unnest = (end + nesting_mask) & ~nesting_mask;
13348 	kr = pmap_unnest(new_pmap,
13349 	    (addr64_t)start_unnest,
13350 	    (uint64_t)(end_unnest - start_unnest));
13351 #if PMAP_FORK_NEST_DEBUG
13352 	printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
13353 #endif /* PMAP_FORK_NEST_DEBUG */
13354 	assertf(kr == KERN_SUCCESS,
13355 	    "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
13356 	    (uint64_t)start, (uint64_t)end, new_pmap,
13357 	    (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
13358 	    kr);
13359 }
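/*
 *	Worked example (illustrative, assuming a 32MB nesting granule,
 *	i.e. pmap_shared_region_size_min() == 0x2000000): a range
 *	[0x181200000, 0x183400000) that overlaps the pre-nested region
 *	is widened to the covering granule boundaries before pmap_unnest():
 *
 *		nesting_mask = 0x2000000 - 1 = 0x1ffffff
 *		start_unnest = 0x181200000 & ~0x1ffffff              = 0x180000000
 *		end_unnest   = (0x183400000 + 0x1ffffff) & ~0x1ffffff = 0x184000000
 */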
13360 #endif /* PMAP_FORK_NEST */
13361 
13362 void
13363 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13364 {
13365 	new_map->size_limit = old_map->size_limit;
13366 	new_map->data_limit = old_map->data_limit;
13367 	new_map->user_wire_limit = old_map->user_wire_limit;
13368 	new_map->reserved_regions = old_map->reserved_regions;
13369 }
13370 
13371 /*
13372  *	vm_map_fork:
13373  *
13374  *	Create and return a new map based on the old
13375  *	map, according to the inheritance values on the
13376  *	regions in that map and the options.
13377  *
13378  *	The source map must not be locked.
13379  */
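/*
 *	Illustrative sketch (not part of the original source): the fork
 *	path hands the child's ledger and the parent's map to
 *	vm_map_fork() with no options, while corpse generation passes
 *	VM_MAP_FORK_CORPSE_FOOTPRINT (possibly combined with
 *	VM_MAP_FORK_SHARE_IF_INHERIT_NONE and
 *	VM_MAP_FORK_PRESERVE_PURGEABLE).  Names are hypothetical.
 *
 *		new_map = vm_map_fork(child_ledger, parent_map, 0);
 *		if (new_map == VM_MAP_NULL) {
 *			... pmap creation failed or an option was invalid ...
 *		}
 */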
13380 vm_map_t
13381 vm_map_fork(
13382 	ledger_t        ledger,
13383 	vm_map_t        old_map,
13384 	int             options)
13385 {
13386 	pmap_t          new_pmap;
13387 	vm_map_t        new_map;
13388 	vm_map_entry_t  old_entry;
13389 	vm_map_size_t   new_size = 0, entry_size;
13390 	vm_map_entry_t  new_entry;
13391 	boolean_t       src_needs_copy;
13392 	boolean_t       new_entry_needs_copy;
13393 	boolean_t       pmap_is64bit;
13394 	int             vm_map_copyin_flags;
13395 	vm_inherit_t    old_entry_inheritance;
13396 	int             map_create_options;
13397 	kern_return_t   footprint_collect_kr;
13398 
13399 	if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
13400 	    VM_MAP_FORK_PRESERVE_PURGEABLE |
13401 	    VM_MAP_FORK_CORPSE_FOOTPRINT)) {
13402 		/* unsupported option */
13403 		return VM_MAP_NULL;
13404 	}
13405 
13406 	pmap_is64bit =
13407 #if defined(__i386__) || defined(__x86_64__)
13408 	    old_map->pmap->pm_task_map != TASK_MAP_32BIT;
13409 #elif defined(__arm64__)
13410 	    old_map->pmap->is_64bit;
13411 #else
13412 #error Unknown architecture.
13413 #endif
13414 
13415 	unsigned int pmap_flags = 0;
13416 	pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
13417 #if defined(HAS_APPLE_PAC)
13418 	pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
13419 #endif
13420 #if CONFIG_ROSETTA
13421 	pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
13422 #endif
13423 #if PMAP_CREATE_FORCE_4K_PAGES
13424 	if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
13425 	    PAGE_SIZE != FOURK_PAGE_SIZE) {
13426 		pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
13427 	}
13428 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
13429 	new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
13430 	if (new_pmap == NULL) {
13431 		return VM_MAP_NULL;
13432 	}
13433 
13434 	vm_map_reference(old_map);
13435 	vm_map_lock(old_map);
13436 
13437 	map_create_options = 0;
13438 	if (old_map->hdr.entries_pageable) {
13439 		map_create_options |= VM_MAP_CREATE_PAGEABLE;
13440 	}
13441 	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13442 		map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
13443 		footprint_collect_kr = KERN_SUCCESS;
13444 	}
13445 	new_map = vm_map_create_options(new_pmap,
13446 	    old_map->min_offset,
13447 	    old_map->max_offset,
13448 	    map_create_options);
13449 
13450 	/* inherit cs_enforcement */
13451 	vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
13452 
13453 	vm_map_lock(new_map);
13454 	vm_commit_pagezero_status(new_map);
13455 	/* inherit the parent map's page size */
13456 	vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));
13457 
13458 	/* inherit the parent rlimits */
13459 	vm_map_inherit_limits(new_map, old_map);
13460 
13461 #if CONFIG_MAP_RANGES
13462 	/* inherit the parent map's VM ranges */
13463 	vm_map_range_fork(new_map, old_map);
13464 #endif
13465 
13466 #if CODE_SIGNING_MONITOR
13467 	/* Prepare the monitor for the fork */
13468 	csm_fork_prepare(old_map->pmap, new_pmap);
13469 #endif
13470 
13471 #if PMAP_FORK_NEST
13472 	/*
13473 	 * Pre-nest the shared region's pmap.
13474 	 */
13475 	vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
13476 	pmap_fork_nest(old_map->pmap, new_pmap,
13477 	    &pre_nested_start, &pre_nested_end);
13478 #if PMAP_FORK_NEST_DEBUG
13479 	printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13480 #endif /* PMAP_FORK_NEST_DEBUG */
13481 #endif /* PMAP_FORK_NEST */
13482 
13483 	for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
13484 		/*
13485 		 * Abort any corpse collection if the system is shutting down.
13486 		 */
13487 		if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13488 		    get_system_inshutdown()) {
13489 #if PMAP_FORK_NEST
13490 			new_entry = vm_map_last_entry(new_map);
13491 			if (new_entry == vm_map_to_entry(new_map)) {
13492 				/* unnest all that was pre-nested */
13493 				vm_map_fork_unnest(new_pmap,
13494 				    pre_nested_start, pre_nested_end,
13495 				    vm_map_min(new_map), vm_map_max(new_map));
13496 			} else if (new_entry->vme_end < vm_map_max(new_map)) {
13497 				/* unnest hole at the end, if pre-nested */
13498 				vm_map_fork_unnest(new_pmap,
13499 				    pre_nested_start, pre_nested_end,
13500 				    new_entry->vme_end, vm_map_max(new_map));
13501 			}
13502 #endif /* PMAP_FORK_NEST */
13503 			vm_map_corpse_footprint_collect_done(new_map);
13504 			vm_map_unlock(new_map);
13505 			vm_map_unlock(old_map);
13506 			vm_map_deallocate(new_map);
13507 			vm_map_deallocate(old_map);
13508 			printf("Aborting corpse map due to system shutdown\n");
13509 			return VM_MAP_NULL;
13510 		}
13511 
13512 		entry_size = old_entry->vme_end - old_entry->vme_start;
13513 
13514 #if PMAP_FORK_NEST
13515 		/*
13516 		 * Undo any unnecessary pre-nesting.
13517 		 */
13518 		vm_map_offset_t prev_end;
13519 		if (old_entry == vm_map_first_entry(old_map)) {
13520 			prev_end = vm_map_min(old_map);
13521 		} else {
13522 			prev_end = old_entry->vme_prev->vme_end;
13523 		}
13524 		if (prev_end < old_entry->vme_start) {
13525 			/* unnest hole before this entry, if pre-nested */
13526 			vm_map_fork_unnest(new_pmap,
13527 			    pre_nested_start, pre_nested_end,
13528 			    prev_end, old_entry->vme_start);
13529 		}
13530 		if (old_entry->is_sub_map && old_entry->use_pmap) {
13531 			/* keep this entry nested in the child */
13532 #if PMAP_FORK_NEST_DEBUG
13533 			printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
13534 #endif /* PMAP_FORK_NEST_DEBUG */
13535 		} else {
13536 			/* undo nesting for this entry, if pre-nested */
13537 			vm_map_fork_unnest(new_pmap,
13538 			    pre_nested_start, pre_nested_end,
13539 			    old_entry->vme_start, old_entry->vme_end);
13540 		}
13541 #endif /* PMAP_FORK_NEST */
13542 
13543 		old_entry_inheritance = old_entry->inheritance;
13544 		/*
13545 		 * If the caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option,
13546 		 * share VM_INHERIT_NONE entries that are not backed by a
13547 		 * device pager.
13548 		 */
13549 		if (old_entry_inheritance == VM_INHERIT_NONE &&
13550 		    (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
13551 		    (old_entry->protection & VM_PROT_READ) &&
13552 		    !(!old_entry->is_sub_map &&
13553 		    VME_OBJECT(old_entry) != NULL &&
13554 		    VME_OBJECT(old_entry)->pager != NULL &&
13555 		    is_device_pager_ops(
13556 			    VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
13557 			old_entry_inheritance = VM_INHERIT_SHARE;
13558 		}
13559 
13560 		if (old_entry_inheritance != VM_INHERIT_NONE &&
13561 		    (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13562 		    footprint_collect_kr == KERN_SUCCESS) {
13563 			/*
13564 			 * The corpse won't have old_map->pmap to query
13565 			 * footprint information, so collect that data now
13566 			 * and store it in new_map->vmmap_corpse_footprint
13567 			 * for later autopsy.
13568 			 */
13569 			footprint_collect_kr =
13570 			    vm_map_corpse_footprint_collect(old_map,
13571 			    old_entry,
13572 			    new_map);
13573 		}
13574 
13575 		switch (old_entry_inheritance) {
13576 		case VM_INHERIT_NONE:
13577 			break;
13578 
13579 		case VM_INHERIT_SHARE:
13580 			vm_map_fork_share(old_map, old_entry, new_map);
13581 			new_size += entry_size;
13582 			break;
13583 
13584 		case VM_INHERIT_COPY:
13585 
13586 			/*
13587 			 *	Inline the copy_quickly case;
13588 			 *	upon failure, fall back on a call
13589 			 *	to vm_map_fork_copy.
13590 			 */
13591 
13592 			if (old_entry->is_sub_map) {
13593 				break;
13594 			}
13595 			if ((old_entry->wired_count != 0) ||
13596 			    ((VME_OBJECT(old_entry) != NULL) &&
13597 			    (VME_OBJECT(old_entry)->true_share))) {
13598 				goto slow_vm_map_fork_copy;
13599 			}
13600 
13601 			new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
13602 			vm_map_entry_copy(old_map, new_entry, old_entry);
13603 			if (old_entry->vme_permanent) {
13604 				/* inherit "permanent" on fork() */
13605 				new_entry->vme_permanent = TRUE;
13606 			}
13607 
13608 			if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
13609 				new_map->jit_entry_exists = TRUE;
13610 			}
13611 
13612 			if (new_entry->is_sub_map) {
13613 				/* clear address space specifics */
13614 				new_entry->use_pmap = FALSE;
13615 			} else {
13616 				/*
13617 				 * We're dealing with a copy-on-write operation,
13618 				 * so the resulting mapping should not inherit
13619 				 * the original mapping's accounting settings.
13620 				 * "iokit_acct" should have been cleared in
13621 				 * vm_map_entry_copy().
13622 				 * "use_pmap" should be reset to its default
13623 				 * (TRUE) so that the new mapping gets
13624 				 * accounted for in the task's memory footprint.
13625 				 */
13626 				assert(!new_entry->iokit_acct);
13627 				new_entry->use_pmap = TRUE;
13628 			}
13629 
13630 			if (!vm_object_copy_quickly(
13631 				    VME_OBJECT(new_entry),
13632 				    VME_OFFSET(old_entry),
13633 				    (old_entry->vme_end -
13634 				    old_entry->vme_start),
13635 				    &src_needs_copy,
13636 				    &new_entry_needs_copy)) {
13637 				vm_map_entry_dispose(new_entry);
13638 				goto slow_vm_map_fork_copy;
13639 			}
13640 
13641 			/*
13642 			 *	Handle copy-on-write obligations
13643 			 */
13644 
13645 			if (src_needs_copy && !old_entry->needs_copy) {
13646 				vm_prot_t prot;
13647 
13648 				if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13649 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13650 					    __FUNCTION__,
13651 					    old_map, old_map->pmap, old_entry,
13652 					    (uint64_t)old_entry->vme_start,
13653 					    (uint64_t)old_entry->vme_end,
13654 					    old_entry->protection);
13655 				}
13656 
13657 				prot = old_entry->protection & ~VM_PROT_WRITE;
13658 
13659 				if (override_nx(old_map, VME_ALIAS(old_entry))
13660 				    && prot) {
13661 					prot |= VM_PROT_EXECUTE;
13662 				}
13663 
13664 				if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13665 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13666 					    __FUNCTION__,
13667 					    old_map, old_map->pmap, old_entry,
13668 					    (uint64_t)old_entry->vme_start,
13669 					    (uint64_t)old_entry->vme_end,
13670 					    prot);
13671 				}
13672 
13673 				vm_object_pmap_protect(
13674 					VME_OBJECT(old_entry),
13675 					VME_OFFSET(old_entry),
13676 					(old_entry->vme_end -
13677 					old_entry->vme_start),
13678 					((old_entry->is_shared
13679 					|| old_map->mapped_in_other_pmaps)
13680 					? PMAP_NULL :
13681 					old_map->pmap),
13682 					VM_MAP_PAGE_SIZE(old_map),
13683 					old_entry->vme_start,
13684 					prot);
13685 
13686 				assert(old_entry->wired_count == 0);
13687 				old_entry->needs_copy = TRUE;
13688 			}
13689 			new_entry->needs_copy = new_entry_needs_copy;
13690 
13691 			/*
13692 			 *	Insert the entry at the end
13693 			 *	of the map.
13694 			 */
13695 
13696 			vm_map_store_entry_link(new_map,
13697 			    vm_map_last_entry(new_map),
13698 			    new_entry,
13699 			    VM_MAP_KERNEL_FLAGS_NONE);
13700 			new_size += entry_size;
13701 			break;
13702 
13703 slow_vm_map_fork_copy:
13704 			vm_map_copyin_flags = VM_MAP_COPYIN_FORK;
13705 			if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
13706 				vm_map_copyin_flags |=
13707 				    VM_MAP_COPYIN_PRESERVE_PURGEABLE;
13708 			}
13709 			if (vm_map_fork_copy(old_map,
13710 			    &old_entry,
13711 			    new_map,
13712 			    vm_map_copyin_flags)) {
13713 				new_size += entry_size;
13714 			}
13715 			continue;
13716 		}
13717 		old_entry = old_entry->vme_next;
13718 	}
13719 
13720 #if PMAP_FORK_NEST
13721 	new_entry = vm_map_last_entry(new_map);
13722 	if (new_entry == vm_map_to_entry(new_map)) {
13723 		/* unnest all that was pre-nested */
13724 		vm_map_fork_unnest(new_pmap,
13725 		    pre_nested_start, pre_nested_end,
13726 		    vm_map_min(new_map), vm_map_max(new_map));
13727 	} else if (new_entry->vme_end < vm_map_max(new_map)) {
13728 		/* unnest hole at the end, if pre-nested */
13729 		vm_map_fork_unnest(new_pmap,
13730 		    pre_nested_start, pre_nested_end,
13731 		    new_entry->vme_end, vm_map_max(new_map));
13732 	}
13733 #endif /* PMAP_FORK_NEST */
13734 
13735 #if defined(__arm64__)
13736 	pmap_insert_commpage(new_map->pmap);
13737 #endif /* __arm64__ */
13738 
13739 	new_map->size = new_size;
13740 
13741 	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13742 		vm_map_corpse_footprint_collect_done(new_map);
13743 	}
13744 
13745 	/* Propagate JIT entitlement for the pmap layer. */
13746 	if (pmap_get_jit_entitled(old_map->pmap)) {
13747 		/* Tell the pmap that it supports JIT. */
13748 		pmap_set_jit_entitled(new_map->pmap);
13749 	}
13750 
13751 	/* Propagate TPRO settings for the pmap layer */
13752 	if (pmap_get_tpro(old_map->pmap)) {
13753 		/* Tell the pmap that it supports TPRO */
13754 		pmap_set_tpro(new_map->pmap);
13755 	}
13756 
13757 
13758 	vm_map_unlock(new_map);
13759 	vm_map_unlock(old_map);
13760 	vm_map_deallocate(old_map);
13761 
13762 	return new_map;
13763 }
13764 
13765 /*
13766  * vm_map_exec:
13767  *
13768  *      Set up the "new_map" with the proper execution environment according
13769  *	to the type of executable (platform, 64bit, chroot environment).
13770  *	Map the comm page and shared region, etc...
13771  */
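/*
 * Illustrative sketch only: a typical call from the exec path once the new
 * address space has been created.  The surrounding variable names are
 * placeholders for this example, not taken from this file or from the
 * actual exec code:
 *
 *	kern_return_t kr;
 *	kr = vm_map_exec(new_map, task, is_64bit, fsroot,
 *	    cputype, cpusubtype, reslide, is_driverkit, rsr_version);
 *	assert(kr == KERN_SUCCESS);
 */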
13772 kern_return_t
13773 vm_map_exec(
13774 	vm_map_t        new_map,
13775 	task_t          task,
13776 	boolean_t       is64bit,
13777 	void            *fsroot,
13778 	cpu_type_t      cpu,
13779 	cpu_subtype_t   cpu_subtype,
13780 	boolean_t       reslide,
13781 	boolean_t       is_driverkit,
13782 	uint32_t        rsr_version)
13783 {
13784 	SHARED_REGION_TRACE_DEBUG(
13785 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13786 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13787 		(void *)VM_KERNEL_ADDRPERM(new_map),
13788 		(void *)VM_KERNEL_ADDRPERM(task),
13789 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13790 		cpu,
13791 		cpu_subtype));
13792 	(void) vm_commpage_enter(new_map, task, is64bit);
13793 
13794 	(void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13795 
13796 	SHARED_REGION_TRACE_DEBUG(
13797 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13798 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13799 		(void *)VM_KERNEL_ADDRPERM(new_map),
13800 		(void *)VM_KERNEL_ADDRPERM(task),
13801 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13802 		cpu,
13803 		cpu_subtype));
13804 
13805 	/*
13806 	 * Some devices have region(s) of memory that shouldn't get allocated by
13807 	 * user processes. The following code creates dummy vm_map_entry_t's for each
13808 	 * of the regions that need to be reserved, to prevent any allocations in
13809 	 * those regions.
13810 	 */
13811 	kern_return_t kr = KERN_FAILURE;
13812 	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
13813 	vmk_flags.vmkf_beyond_max = true;
13814 
13815 	const struct vm_reserved_region *regions = NULL;
13816 	size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
13817 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13818 
13819 	for (size_t i = 0; i < num_regions; ++i) {
13820 		vm_map_offset_t address = regions[i].vmrr_addr;
13821 
13822 		kr = vm_map_enter(
13823 			new_map,
13824 			&address,
13825 			regions[i].vmrr_size,
13826 			(vm_map_offset_t)0,
13827 			vmk_flags,
13828 			VM_OBJECT_NULL,
13829 			(vm_object_offset_t)0,
13830 			FALSE,
13831 			VM_PROT_NONE,
13832 			VM_PROT_NONE,
13833 			VM_INHERIT_COPY);
13834 
13835 		if (kr != KERN_SUCCESS) {
13836 			panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
13837 		}
13838 	}
13839 
13840 	new_map->reserved_regions = (num_regions ? TRUE : FALSE);
13841 
13842 	return KERN_SUCCESS;
13843 }
13844 
13845 uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
13846 uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
13847 uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
13848 uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
13849 uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
13850 uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
13851 uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
13852 uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
13853 uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
13854 uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
13855 uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
13856 uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
13857 uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13858 /*
13859  *	vm_map_lookup_and_lock_object:
13860  *
13861  *	Finds the VM object, offset, and
13862  *	protection for a given virtual address in the
13863  *	specified map, assuming a page fault of the
13864  *	type specified.
13865  *
13866  *	Returns the (object, offset, protection) for
13867  *	this address, whether it is wired down, and whether
13868  *	this map has the only reference to the data in question.
13869  *	In order to later verify this lookup, a "version"
13870  *	is returned.
13871  *	If contended != NULL, *contended will be set to
13872  *	true iff the thread had to spin or block to acquire
13873  *	an exclusive lock.
13874  *
13875  *	The map MUST be locked by the caller and WILL be
13876  *	locked on exit.  In order to guarantee the
13877  *	existence of the returned object, it is returned
13878  *	locked.
13879  *
13880  *	If a lookup is requested with "write protection"
13881  *	specified, the map may be changed to perform virtual
13882  *	copying operations, although the data referenced will
13883  *	remain the same.
13884  */
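/*
 * Illustrative sketch only, loosely modeled on the page-fault path; the
 * local variable names are placeholders and error handling is elided:
 *
 *	vm_map_lock_read(map);
 *	kr = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
 *	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot,
 *	    &wired, &fault_info, &real_map, NULL);
 *	if (kr != KERN_SUCCESS) {
 *		vm_map_unlock_read(map);	// map is still locked on failure
 *		return kr;
 *	}
 *	// "object" is returned locked; consume (object, offset, prot) here,
 *	// then drop the locks.  "version" can later be handed to
 *	// vm_map_verify() to detect map changes made while unlocked.
 *	vm_object_unlock(object);
 *	vm_map_unlock_read(map);
 *	if (real_map != map) {
 *		vm_map_unlock(real_map);
 *	}
 */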
13885 kern_return_t
13886 vm_map_lookup_and_lock_object(
13887 	vm_map_t                *var_map,       /* IN/OUT */
13888 	vm_map_offset_t         vaddr,
13889 	vm_prot_t               fault_type,
13890 	int                     object_lock_type,
13891 	vm_map_version_t        *out_version,   /* OUT */
13892 	vm_object_t             *object,        /* OUT */
13893 	vm_object_offset_t      *offset,        /* OUT */
13894 	vm_prot_t               *out_prot,      /* OUT */
13895 	boolean_t               *wired,         /* OUT */
13896 	vm_object_fault_info_t  fault_info,     /* OUT */
13897 	vm_map_t                *real_map,      /* OUT */
13898 	bool                    *contended)     /* OUT */
13899 {
13900 	vm_map_entry_t                  entry;
13901 	vm_map_t                        map = *var_map;
13902 	vm_map_t                        old_map = *var_map;
13903 	vm_map_t                        cow_sub_map_parent = VM_MAP_NULL;
13904 	vm_map_offset_t                 cow_parent_vaddr = 0;
13905 	vm_map_offset_t                 old_start = 0;
13906 	vm_map_offset_t                 old_end = 0;
13907 	vm_prot_t                       prot;
13908 	boolean_t                       mask_protections;
13909 	boolean_t                       force_copy;
13910 	boolean_t                       no_force_copy_if_executable;
13911 	boolean_t                       submap_needed_copy;
13912 	vm_prot_t                       original_fault_type;
13913 	vm_map_size_t                   fault_page_mask;
13914 
13915 	/*
13916 	 * VM_PROT_IS_MASK means that the caller wants us to use "fault_type"
13917 	 * as a mask against the mapping's actual protections, not as an
13918 	 * absolute value.
13919 	 */
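	/*
	 * For example (illustrative, not a specific caller): a fault_type of
	 * (VM_PROT_READ | VM_PROT_WRITE | VM_PROT_IS_MASK) requests whichever
	 * of read/write the mapping actually allows, rather than failing when
	 * write access is not permitted; the masking itself happens further
	 * down, once the entry's protections are known.
	 */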
13920 	mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
13921 	force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
13922 	no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
13923 	fault_type &= VM_PROT_ALL;
13924 	original_fault_type = fault_type;
13925 	if (contended) {
13926 		*contended = false;
13927 	}
13928 
13929 	*real_map = map;
13930 
13931 	fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
13932 	vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
13933 
13934 RetryLookup:
13935 	fault_type = original_fault_type;
13936 
13937 	/*
13938 	 *	If the map has an interesting hint, try it before calling
13939 	 *	full blown lookup routine.
13940 	 */
13941 	entry = map->hint;
13942 
13943 	if ((entry == vm_map_to_entry(map)) ||
13944 	    (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
13945 		vm_map_entry_t  tmp_entry;
13946 
13947 		/*
13948 		 *	Entry was either not a valid hint, or the vaddr
13949 		 *	was not contained in the entry, so do a full lookup.
13950 		 */
13951 		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
13952 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13953 				vm_map_unlock(cow_sub_map_parent);
13954 			}
13955 			if ((*real_map != map)
13956 			    && (*real_map != cow_sub_map_parent)) {
13957 				vm_map_unlock(*real_map);
13958 			}
13959 			return KERN_INVALID_ADDRESS;
13960 		}
13961 
13962 		entry = tmp_entry;
13963 	}
13964 	if (map == old_map) {
13965 		old_start = entry->vme_start;
13966 		old_end = entry->vme_end;
13967 	}
13968 
13969 	/*
13970 	 *	Handle submaps.  Drop lock on upper map, submap is
13971 	 *	returned locked.
13972 	 */
13973 
13974 	submap_needed_copy = FALSE;
13975 submap_recurse:
13976 	if (entry->is_sub_map) {
13977 		vm_map_offset_t         local_vaddr;
13978 		vm_map_offset_t         end_delta;
13979 		vm_map_offset_t         start_delta;
13980 		vm_map_offset_t         top_entry_saved_start;
13981 		vm_object_offset_t      top_entry_saved_offset;
13982 		vm_map_entry_t          submap_entry, saved_submap_entry;
13983 		vm_object_offset_t      submap_entry_offset;
13984 		vm_object_size_t        submap_entry_size;
13985 		vm_prot_t               subentry_protection;
13986 		vm_prot_t               subentry_max_protection;
13987 		boolean_t               subentry_no_copy_on_read;
13988 		boolean_t               subentry_permanent;
13989 		boolean_t               subentry_csm_associated;
13990 #if __arm64e__
13991 		boolean_t               subentry_used_for_tpro;
13992 #endif /* __arm64e__ */
13993 		boolean_t               mapped_needs_copy = FALSE;
13994 		vm_map_version_t        version;
13995 
13996 		assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
13997 		    "map %p (%d) entry %p submap %p (%d)\n",
13998 		    map, VM_MAP_PAGE_SHIFT(map), entry,
13999 		    VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
14000 
14001 		local_vaddr = vaddr;
14002 		top_entry_saved_start = entry->vme_start;
14003 		top_entry_saved_offset = VME_OFFSET(entry);
14004 
14005 		if ((entry->use_pmap &&
14006 		    !((fault_type & VM_PROT_WRITE) ||
14007 		    force_copy))) {
14008 			/* if real_map equals map we unlock below */
14009 			if ((*real_map != map) &&
14010 			    (*real_map != cow_sub_map_parent)) {
14011 				vm_map_unlock(*real_map);
14012 			}
14013 			*real_map = VME_SUBMAP(entry);
14014 		}
14015 
14016 		if (entry->needs_copy &&
14017 		    ((fault_type & VM_PROT_WRITE) ||
14018 		    force_copy)) {
14019 			if (!mapped_needs_copy) {
14020 				if (vm_map_lock_read_to_write(map)) {
14021 					vm_map_lock_read(map);
14022 					*real_map = map;
14023 					goto RetryLookup;
14024 				}
14025 				vm_map_lock_read(VME_SUBMAP(entry));
14026 				*var_map = VME_SUBMAP(entry);
14027 				cow_sub_map_parent = map;
14028 				/* reset base to map before cow object */
14029 				/* this is the map which will accept   */
14030 				/* the new cow object */
14031 				old_start = entry->vme_start;
14032 				old_end = entry->vme_end;
14033 				cow_parent_vaddr = vaddr;
14034 				mapped_needs_copy = TRUE;
14035 			} else {
14036 				vm_map_lock_read(VME_SUBMAP(entry));
14037 				*var_map = VME_SUBMAP(entry);
14038 				if ((cow_sub_map_parent != map) &&
14039 				    (*real_map != map)) {
14040 					vm_map_unlock(map);
14041 				}
14042 			}
14043 		} else {
14044 			if (entry->needs_copy) {
14045 				submap_needed_copy = TRUE;
14046 			}
14047 			vm_map_lock_read(VME_SUBMAP(entry));
14048 			*var_map = VME_SUBMAP(entry);
14049 			/* Leave the map locked if it is the  */
14050 			/* target COW sub_map above; otherwise */
14051 			/* just follow the maps down to the    */
14052 			/* object.  Here we unlock, knowing we */
14053 			/* are not revisiting the map.         */
14054 			if ((*real_map != map) && (map != cow_sub_map_parent)) {
14055 				vm_map_unlock_read(map);
14056 			}
14057 		}
14058 
14059 		entry = NULL;
14060 		map = *var_map;
14061 
14062 		/* calculate the offset in the submap for vaddr */
14063 		local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
14064 		assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
14065 		    "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
14066 		    (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);
14067 
14068 RetrySubMap:
14069 		if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
14070 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14071 				vm_map_unlock(cow_sub_map_parent);
14072 			}
14073 			if ((*real_map != map)
14074 			    && (*real_map != cow_sub_map_parent)) {
14075 				vm_map_unlock(*real_map);
14076 			}
14077 			*real_map = map;
14078 			return KERN_INVALID_ADDRESS;
14079 		}
14080 
14081 		/* find the attenuated shadow of the underlying object */
14082 		/* on our target map */
14083 
14084 		/* In English: the submap object may extend beyond the    */
14085 		/* region mapped by the entry, or may only fill a portion */
14086 		/* of it.  For our purposes, we only care if the object   */
14087 		/* doesn't fill.  In that case the area which will        */
14088 		/* ultimately be clipped in the top map only needs        */
14089 		/* to be as big as the portion of the underlying entry    */
14090 		/* which is mapped.                                        */
14091 		start_delta = submap_entry->vme_start > top_entry_saved_offset ?
14092 		    submap_entry->vme_start - top_entry_saved_offset : 0;
14093 
14094 		end_delta =
14095 		    (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
14096 		    submap_entry->vme_end ?
14097 		    0 : (top_entry_saved_offset +
14098 		    (old_end - old_start))
14099 		    - submap_entry->vme_end;
14100 
14101 		old_start += start_delta;
14102 		old_end -= end_delta;
14103 
14104 		if (submap_entry->is_sub_map) {
14105 			entry = submap_entry;
14106 			vaddr = local_vaddr;
14107 			goto submap_recurse;
14108 		}
14109 
14110 		if (((fault_type & VM_PROT_WRITE) ||
14111 		    force_copy)
14112 		    && cow_sub_map_parent) {
14113 			vm_object_t     sub_object, copy_object;
14114 			vm_object_offset_t copy_offset;
14115 			vm_map_offset_t local_start;
14116 			vm_map_offset_t local_end;
14117 			boolean_t       object_copied = FALSE;
14118 			vm_object_offset_t object_copied_offset = 0;
14119 			boolean_t       object_copied_needs_copy = FALSE;
14120 			kern_return_t   kr = KERN_SUCCESS;
14121 
14122 			if (vm_map_lock_read_to_write(map)) {
14123 				vm_map_lock_read(map);
14124 				old_start -= start_delta;
14125 				old_end += end_delta;
14126 				goto RetrySubMap;
14127 			}
14128 
14129 
14130 			sub_object = VME_OBJECT(submap_entry);
14131 			if (sub_object == VM_OBJECT_NULL) {
14132 				sub_object =
14133 				    vm_object_allocate(
14134 					(vm_map_size_t)
14135 					(submap_entry->vme_end -
14136 					submap_entry->vme_start));
14137 				VME_OBJECT_SET(submap_entry, sub_object, false, 0);
14138 				VME_OFFSET_SET(submap_entry, 0);
14139 				assert(!submap_entry->is_sub_map);
14140 				assert(submap_entry->use_pmap);
14141 			}
14142 			local_start =  local_vaddr -
14143 			    (cow_parent_vaddr - old_start);
14144 			local_end = local_vaddr +
14145 			    (old_end - cow_parent_vaddr);
14146 			vm_map_clip_start(map, submap_entry, local_start);
14147 			vm_map_clip_end(map, submap_entry, local_end);
14148 			if (submap_entry->is_sub_map) {
14149 				/* unnesting was done when clipping */
14150 				assert(!submap_entry->use_pmap);
14151 			}
14152 
14153 			/* This is the COW case, let's connect */
14154 			/* an entry in our space to the underlying */
14155 			/* object in the submap, bypassing the  */
14156 			/* submap. */
14157 			submap_entry_offset = VME_OFFSET(submap_entry);
14158 			submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
14159 
14160 			if ((submap_entry->wired_count != 0 ||
14161 			    sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
14162 			    (submap_entry->protection & VM_PROT_EXECUTE) &&
14163 			    no_force_copy_if_executable) {
14164 //				printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
14165 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14166 					vm_map_unlock(cow_sub_map_parent);
14167 				}
14168 				if ((*real_map != map)
14169 				    && (*real_map != cow_sub_map_parent)) {
14170 					vm_map_unlock(*real_map);
14171 				}
14172 				*real_map = map;
14173 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
14174 				vm_map_lock_write_to_read(map);
14175 				kr = KERN_PROTECTION_FAILURE;
14176 				DTRACE_VM4(submap_no_copy_executable,
14177 				    vm_map_t, map,
14178 				    vm_object_offset_t, submap_entry_offset,
14179 				    vm_object_size_t, submap_entry_size,
14180 				    int, kr);
14181 				return kr;
14182 			}
14183 
14184 			if (submap_entry->wired_count != 0) {
14185 				vm_object_reference(sub_object);
14186 
14187 				assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
14188 				    "submap_entry %p offset 0x%llx\n",
14189 				    submap_entry, VME_OFFSET(submap_entry));
14190 
14191 				DTRACE_VM6(submap_copy_slowly,
14192 				    vm_map_t, cow_sub_map_parent,
14193 				    vm_map_offset_t, vaddr,
14194 				    vm_map_t, map,
14195 				    vm_object_size_t, submap_entry_size,
14196 				    int, submap_entry->wired_count,
14197 				    int, sub_object->copy_strategy);
14198 
14199 				saved_submap_entry = submap_entry;
14200 				version.main_timestamp = map->timestamp;
14201 				vm_map_unlock(map); /* Increments timestamp by 1 */
14202 				submap_entry = VM_MAP_ENTRY_NULL;
14203 
14204 				vm_object_lock(sub_object);
14205 				kr = vm_object_copy_slowly(sub_object,
14206 				    submap_entry_offset,
14207 				    submap_entry_size,
14208 				    FALSE,
14209 				    &copy_object);
14210 				object_copied = TRUE;
14211 				object_copied_offset = 0;
14212 				/* 4k: account for extra offset in physical page */
14213 				object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
14214 				object_copied_needs_copy = FALSE;
14215 				vm_object_deallocate(sub_object);
14216 
14217 				vm_map_lock(map);
14218 
14219 				if (kr != KERN_SUCCESS &&
14220 				    kr != KERN_MEMORY_RESTART_COPY) {
14221 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14222 						vm_map_unlock(cow_sub_map_parent);
14223 					}
14224 					if ((*real_map != map)
14225 					    && (*real_map != cow_sub_map_parent)) {
14226 						vm_map_unlock(*real_map);
14227 					}
14228 					*real_map = map;
14229 					vm_object_deallocate(copy_object);
14230 					copy_object = VM_OBJECT_NULL;
14231 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
14232 					vm_map_lock_write_to_read(map);
14233 					DTRACE_VM4(submap_copy_error_slowly,
14234 					    vm_object_t, sub_object,
14235 					    vm_object_offset_t, submap_entry_offset,
14236 					    vm_object_size_t, submap_entry_size,
14237 					    int, kr);
14238 					vm_map_lookup_and_lock_object_copy_slowly_error++;
14239 					return kr;
14240 				}
14241 
14242 				if ((kr == KERN_SUCCESS) &&
14243 				    (version.main_timestamp + 1) == map->timestamp) {
14244 					submap_entry = saved_submap_entry;
14245 				} else {
14246 					saved_submap_entry = NULL;
14247 					old_start -= start_delta;
14248 					old_end += end_delta;
14249 					vm_object_deallocate(copy_object);
14250 					copy_object = VM_OBJECT_NULL;
14251 					vm_map_lock_write_to_read(map);
14252 					vm_map_lookup_and_lock_object_copy_slowly_restart++;
14253 					goto RetrySubMap;
14254 				}
14255 				vm_map_lookup_and_lock_object_copy_slowly_count++;
14256 				vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
14257 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
14258 					vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
14259 				}
14260 			} else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
14261 				submap_entry_offset = VME_OFFSET(submap_entry);
14262 				copy_object = VM_OBJECT_NULL;
14263 				object_copied_offset = submap_entry_offset;
14264 				object_copied_needs_copy = FALSE;
14265 				DTRACE_VM6(submap_copy_strategically,
14266 				    vm_map_t, cow_sub_map_parent,
14267 				    vm_map_offset_t, vaddr,
14268 				    vm_map_t, map,
14269 				    vm_object_size_t, submap_entry_size,
14270 				    int, submap_entry->wired_count,
14271 				    int, sub_object->copy_strategy);
14272 				kr = vm_object_copy_strategically(
14273 					sub_object,
14274 					submap_entry_offset,
14275 					submap_entry->vme_end - submap_entry->vme_start,
14276 					false, /* forking */
14277 					&copy_object,
14278 					&object_copied_offset,
14279 					&object_copied_needs_copy);
14280 				if (kr == KERN_MEMORY_RESTART_COPY) {
14281 					old_start -= start_delta;
14282 					old_end += end_delta;
14283 					vm_object_deallocate(copy_object);
14284 					copy_object = VM_OBJECT_NULL;
14285 					vm_map_lock_write_to_read(map);
14286 					vm_map_lookup_and_lock_object_copy_strategically_restart++;
14287 					goto RetrySubMap;
14288 				}
14289 				if (kr != KERN_SUCCESS) {
14290 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14291 						vm_map_unlock(cow_sub_map_parent);
14292 					}
14293 					if ((*real_map != map)
14294 					    && (*real_map != cow_sub_map_parent)) {
14295 						vm_map_unlock(*real_map);
14296 					}
14297 					*real_map = map;
14298 					vm_object_deallocate(copy_object);
14299 					copy_object = VM_OBJECT_NULL;
14300 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
14301 					vm_map_lock_write_to_read(map);
14302 					DTRACE_VM4(submap_copy_error_strategically,
14303 					    vm_object_t, sub_object,
14304 					    vm_object_offset_t, submap_entry_offset,
14305 					    vm_object_size_t, submap_entry_size,
14306 					    int, kr);
14307 					vm_map_lookup_and_lock_object_copy_strategically_error++;
14308 					return kr;
14309 				}
14310 				assert(copy_object != VM_OBJECT_NULL);
14311 				assert(copy_object != sub_object);
14312 				object_copied = TRUE;
14313 				vm_map_lookup_and_lock_object_copy_strategically_count++;
14314 				vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
14315 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
14316 					vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
14317 				}
14318 			} else {
14319 				/* set up shadow object */
14320 				object_copied = FALSE;
14321 				copy_object = sub_object;
14322 				vm_object_lock(sub_object);
14323 				vm_object_reference_locked(sub_object);
14324 				VM_OBJECT_SET_SHADOWED(sub_object, TRUE);
14325 				vm_object_unlock(sub_object);
14326 
14327 				assert(submap_entry->wired_count == 0);
14328 				submap_entry->needs_copy = TRUE;
14329 
14330 				prot = submap_entry->protection;
14331 				if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14332 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14333 					    __FUNCTION__,
14334 					    map, map->pmap, submap_entry,
14335 					    (uint64_t)submap_entry->vme_start,
14336 					    (uint64_t)submap_entry->vme_end,
14337 					    prot);
14338 				}
14339 				prot = prot & ~VM_PROT_WRITE;
14340 				if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14341 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14342 					    __FUNCTION__,
14343 					    map, map->pmap, submap_entry,
14344 					    (uint64_t)submap_entry->vme_start,
14345 					    (uint64_t)submap_entry->vme_end,
14346 					    prot);
14347 				}
14348 
14349 				if (override_nx(old_map,
14350 				    VME_ALIAS(submap_entry))
14351 				    && prot) {
14352 					prot |= VM_PROT_EXECUTE;
14353 				}
14354 
14355 				vm_object_pmap_protect(
14356 					sub_object,
14357 					VME_OFFSET(submap_entry),
14358 					submap_entry->vme_end -
14359 					submap_entry->vme_start,
14360 					(submap_entry->is_shared
14361 					|| map->mapped_in_other_pmaps) ?
14362 					PMAP_NULL : map->pmap,
14363 					VM_MAP_PAGE_SIZE(map),
14364 					submap_entry->vme_start,
14365 					prot);
14366 				vm_map_lookup_and_lock_object_copy_shadow_count++;
14367 				vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
14368 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
14369 					vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
14370 				}
14371 			}
14372 
14373 			/*
14374 			 * Adjust the fault offset to the submap entry.
14375 			 */
14376 			copy_offset = (local_vaddr -
14377 			    submap_entry->vme_start +
14378 			    VME_OFFSET(submap_entry));
14379 
14380 			/* This works differently from the */
14381 			/* normal submap case. We go back  */
14382 			/* to the parent of the COW map and*/
14383 			/* clip out the target portion of  */
14384 			/* the sub_map, substituting the   */
14385 			/* new copy object.                */
14386 
14387 			subentry_protection = submap_entry->protection;
14388 			subentry_max_protection = submap_entry->max_protection;
14389 			subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
14390 			subentry_permanent = submap_entry->vme_permanent;
14391 			subentry_csm_associated = submap_entry->csm_associated;
14392 #if __arm64e__
14393 			subentry_used_for_tpro = submap_entry->used_for_tpro;
14394 #endif // __arm64e__
14395 			vm_map_unlock(map);
14396 			submap_entry = NULL; /* not valid after map unlock */
14397 
14398 			local_start = old_start;
14399 			local_end = old_end;
14400 			map = cow_sub_map_parent;
14401 			*var_map = cow_sub_map_parent;
14402 			vaddr = cow_parent_vaddr;
14403 			cow_sub_map_parent = NULL;
14404 
14405 			if (!vm_map_lookup_entry(map,
14406 			    vaddr, &entry)) {
14407 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14408 					vm_map_unlock(cow_sub_map_parent);
14409 				}
14410 				if ((*real_map != map)
14411 				    && (*real_map != cow_sub_map_parent)) {
14412 					vm_map_unlock(*real_map);
14413 				}
14414 				*real_map = map;
14415 				vm_object_deallocate(
14416 					copy_object);
14417 				copy_object = VM_OBJECT_NULL;
14418 				vm_map_lock_write_to_read(map);
14419 				DTRACE_VM4(submap_lookup_post_unlock,
14420 				    uint64_t, (uint64_t)entry->vme_start,
14421 				    uint64_t, (uint64_t)entry->vme_end,
14422 				    vm_map_offset_t, vaddr,
14423 				    int, object_copied);
14424 				return KERN_INVALID_ADDRESS;
14425 			}
14426 
14427 			/* clip out the portion of space */
14428 			/* mapped by the sub map which   */
14429 			/* corresponds to the underlying */
14430 			/* object */
14431 
14432 			/*
14433 			 * Clip (and unnest) the smallest nested chunk
14434 			 * possible around the faulting address...
14435 			 */
14436 			local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14437 			local_end = local_start + pmap_shared_region_size_min(map->pmap);
14438 			/*
14439 			 * ... but don't go beyond the "old_start" to "old_end"
14440 			 * range, to avoid spanning over another VM region
14441 			 * with a possibly different VM object and/or offset.
14442 			 */
14443 			if (local_start < old_start) {
14444 				local_start = old_start;
14445 			}
14446 			if (local_end > old_end) {
14447 				local_end = old_end;
14448 			}
14449 			/*
14450 			 * Adjust copy_offset to the start of the range.
14451 			 */
14452 			copy_offset -= (vaddr - local_start);
14453 
14454 			vm_map_clip_start(map, entry, local_start);
14455 			vm_map_clip_end(map, entry, local_end);
14456 			if (entry->is_sub_map) {
14457 				/* unnesting was done when clipping */
14458 				assert(!entry->use_pmap);
14459 			}
14460 
14461 			/* substitute copy object for */
14462 			/* shared map entry           */
14463 			vm_map_deallocate(VME_SUBMAP(entry));
14464 			assert(!entry->iokit_acct);
14465 			entry->use_pmap = TRUE;
14466 			VME_OBJECT_SET(entry, copy_object, false, 0);
14467 
14468 			/* propagate the submap entry's protections */
14469 			if (entry->protection != VM_PROT_READ) {
14470 				/*
14471 				 * Someone has already altered the top entry's
14472 				 * protections via vm_protect(VM_PROT_COPY).
14473 				 * Respect these new values and ignore the
14474 				 * submap entry's protections.
14475 				 */
14476 			} else {
14477 				/*
14478 				 * Regular copy-on-write: propagate the submap
14479 				 * entry's protections to the top map entry.
14480 				 */
14481 				entry->protection |= subentry_protection;
14482 			}
14483 			entry->max_protection |= subentry_max_protection;
14484 			/* propagate some attributes from subentry */
14485 			entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14486 			entry->vme_permanent = subentry_permanent;
14487 			entry->csm_associated = subentry_csm_associated;
14488 #if __arm64e__
14489 			/* propagate TPRO iff the destination map has TPRO enabled */
14490 			if (subentry_used_for_tpro && vm_map_tpro(map)) {
14491 				entry->used_for_tpro = subentry_used_for_tpro;
14492 			}
14493 #endif /* __arm64e__ */
14494 			if ((entry->protection & VM_PROT_WRITE) &&
14495 			    (entry->protection & VM_PROT_EXECUTE) &&
14496 #if XNU_TARGET_OS_OSX
14497 			    map->pmap != kernel_pmap &&
14498 			    (vm_map_cs_enforcement(map)
14499 #if __arm64__
14500 			    || !VM_MAP_IS_EXOTIC(map)
14501 #endif /* __arm64__ */
14502 			    ) &&
14503 #endif /* XNU_TARGET_OS_OSX */
14504 #if CODE_SIGNING_MONITOR
14505 			    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
14506 #endif
14507 			    !(entry->used_for_jit) &&
14508 			    VM_MAP_POLICY_WX_STRIP_X(map)) {
14509 				DTRACE_VM3(cs_wx,
14510 				    uint64_t, (uint64_t)entry->vme_start,
14511 				    uint64_t, (uint64_t)entry->vme_end,
14512 				    vm_prot_t, entry->protection);
14513 				printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
14514 				    proc_selfpid(),
14515 				    (get_bsdtask_info(current_task())
14516 				    ? proc_name_address(get_bsdtask_info(current_task()))
14517 				    : "?"),
14518 				    __FUNCTION__, __LINE__,
14519 #if DEVELOPMENT || DEBUG
14520 				    (uint64_t)entry->vme_start,
14521 				    (uint64_t)entry->vme_end,
14522 #else /* DEVELOPMENT || DEBUG */
14523 				    (uint64_t)0,
14524 				    (uint64_t)0,
14525 #endif /* DEVELOPMENT || DEBUG */
14526 				    entry->protection);
14527 				entry->protection &= ~VM_PROT_EXECUTE;
14528 			}
14529 
14530 			if (object_copied) {
14531 				VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14532 				entry->needs_copy = object_copied_needs_copy;
14533 				entry->is_shared = FALSE;
14534 			} else {
14535 				assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14536 				assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14537 				assert(entry->wired_count == 0);
14538 				VME_OFFSET_SET(entry, copy_offset);
14539 				entry->needs_copy = TRUE;
14540 				if (map != old_map) {
14541 					entry->is_shared = TRUE;
14542 				}
14543 			}
14544 			if (entry->inheritance == VM_INHERIT_SHARE) {
14545 				entry->inheritance = VM_INHERIT_COPY;
14546 			}
14547 
14548 			vm_map_lock_write_to_read(map);
14549 		} else {
14550 			if ((cow_sub_map_parent)
14551 			    && (cow_sub_map_parent != *real_map)
14552 			    && (cow_sub_map_parent != map)) {
14553 				vm_map_unlock(cow_sub_map_parent);
14554 			}
14555 			entry = submap_entry;
14556 			vaddr = local_vaddr;
14557 		}
14558 	}
14559 
14560 	/*
14561 	 *	Check whether this task is allowed to have
14562 	 *	this page.
14563 	 */
14564 
14565 	prot = entry->protection;
14566 
14567 	if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14568 		/*
14569 		 * HACK -- if not a stack, then allow execution
14570 		 */
14571 		prot |= VM_PROT_EXECUTE;
14572 	}
14573 
14574 #if __arm64e__
14575 	/*
14576 	 * If the entry we're dealing with is TPRO and we have a write
14577 	 * fault, inject VM_PROT_WRITE into protections. This allows us
14578 	 * to maintain RO permissions when not marked as TPRO.
14579 	 */
14580 	if (entry->used_for_tpro && (fault_type & VM_PROT_WRITE)) {
14581 		prot |= VM_PROT_WRITE;
14582 	}
14583 #endif /* __arm64e__ */
14584 	if (mask_protections) {
14585 		fault_type &= prot;
14586 		if (fault_type == VM_PROT_NONE) {
14587 			goto protection_failure;
14588 		}
14589 	}
14590 	if (((fault_type & prot) != fault_type)
14591 #if __arm64__
14592 	    /* prefetch abort in execute-only page */
14593 	    && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14594 #elif defined(__x86_64__)
14595 	    /* Consider the UEXEC bit when handling an EXECUTE fault */
14596 	    && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14597 #endif
14598 	    ) {
14599 protection_failure:
14600 		if (*real_map != map) {
14601 			vm_map_unlock(*real_map);
14602 		}
14603 		*real_map = map;
14604 
14605 		if ((fault_type & VM_PROT_EXECUTE) && prot) {
14606 			log_stack_execution_failure((addr64_t)vaddr, prot);
14607 		}
14608 
14609 		DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14610 		DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14611 		/*
14612 		 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14613 		 *
14614 		 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14615 		 */
14616 		return KERN_PROTECTION_FAILURE;
14617 	}
14618 
14619 	/*
14620 	 *	If this page is not pageable, we have to get
14621 	 *	it for all possible accesses.
14622 	 */
14623 
14624 	*wired = (entry->wired_count != 0);
14625 	if (*wired) {
14626 		fault_type = prot;
14627 	}
14628 
14629 	/*
14630 	 *	If the entry was copy-on-write, we either shadow it now or demote access.
14631 	 */
14632 
14633 	if (entry->needs_copy) {
14634 		/*
14635 		 *	If we want to write the page, we may as well
14636 		 *	handle that now since we've got the map locked.
14637 		 *
14638 		 *	If we don't need to write the page, we just
14639 		 *	demote the permissions allowed.
14640 		 */
14641 
14642 		if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14643 			/*
14644 			 *	Make a new object, and place it in the
14645 			 *	object chain.  Note that no new references
14646 			 *	have appeared -- one just moved from the
14647 			 *	map to the new object.
14648 			 */
14649 
14650 			if (vm_map_lock_read_to_write(map)) {
14651 				vm_map_lock_read(map);
14652 				goto RetryLookup;
14653 			}
14654 
14655 			if (VME_OBJECT(entry)->shadowed == FALSE) {
14656 				vm_object_lock(VME_OBJECT(entry));
14657 				VM_OBJECT_SET_SHADOWED(VME_OBJECT(entry), TRUE);
14658 				vm_object_unlock(VME_OBJECT(entry));
14659 			}
14660 			VME_OBJECT_SHADOW(entry,
14661 			    (vm_map_size_t) (entry->vme_end -
14662 			    entry->vme_start),
14663 			    vm_map_always_shadow(map));
14664 			entry->needs_copy = FALSE;
14665 
14666 			vm_map_lock_write_to_read(map);
14667 		}
14668 		if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14669 			/*
14670 			 *	We're attempting to read a copy-on-write
14671 			 *	page -- don't allow writes.
14672 			 */
14673 
14674 			prot &= (~VM_PROT_WRITE);
14675 		}
14676 	}
14677 
14678 	if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14679 		/*
14680 		 * We went through a "needs_copy" submap without triggering
14681 		 * a copy, so granting write access to the page would bypass
14682 		 * that submap's "needs_copy".
14683 		 */
14684 		assert(!(fault_type & VM_PROT_WRITE));
14685 		assert(!*wired);
14686 		assert(!force_copy);
14687 		// printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14688 		prot &= ~VM_PROT_WRITE;
14689 	}
14690 
14691 	/*
14692 	 *	Create an object if necessary.
14693 	 */
14694 	if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14695 		if (vm_map_lock_read_to_write(map)) {
14696 			vm_map_lock_read(map);
14697 			goto RetryLookup;
14698 		}
14699 
14700 		VME_OBJECT_SET(entry,
14701 		    vm_object_allocate(
14702 			    (vm_map_size_t)(entry->vme_end -
14703 			    entry->vme_start)), false, 0);
14704 		VME_OFFSET_SET(entry, 0);
14705 		assert(entry->use_pmap);
14706 		vm_map_lock_write_to_read(map);
14707 	}
14708 
14709 	/*
14710 	 *	Return the object/offset from this entry.  If the entry
14711 	 *	was copy-on-write or empty, it has been fixed up.  Also
14712 	 *	return the protection.
14713 	 */
14714 
14715 	*offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14716 	*object = VME_OBJECT(entry);
14717 	*out_prot = prot;
14718 	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14719 
14720 	if (fault_info) {
14721 		fault_info->interruptible = THREAD_UNINT; /* for now... */
14722 		/* ... the caller will change "interruptible" if needed */
14723 		fault_info->cluster_size = 0;
14724 		fault_info->user_tag = VME_ALIAS(entry);
14725 		fault_info->pmap_options = 0;
14726 		if (entry->iokit_acct ||
14727 		    (!entry->is_sub_map && !entry->use_pmap)) {
14728 			fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14729 		}
14730 		fault_info->behavior = entry->behavior;
14731 		fault_info->lo_offset = VME_OFFSET(entry);
14732 		fault_info->hi_offset =
14733 		    (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14734 		fault_info->no_cache  = entry->no_cache;
14735 		fault_info->stealth = FALSE;
14736 		fault_info->io_sync = FALSE;
14737 		if (entry->used_for_jit ||
14738 #if CODE_SIGNING_MONITOR
14739 		    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
14740 #endif
14741 		    entry->vme_resilient_codesign) {
14742 			fault_info->cs_bypass = TRUE;
14743 		} else {
14744 			fault_info->cs_bypass = FALSE;
14745 		}
14746 		fault_info->csm_associated = FALSE;
14747 #if CODE_SIGNING_MONITOR
14748 		if (entry->csm_associated) {
14749 			/*
14750 			 * The pmap layer will validate this page
14751 			 * before allowing it to be executed from.
14752 			 */
14753 			fault_info->csm_associated = TRUE;
14754 		}
14755 #endif
14756 		fault_info->mark_zf_absent = FALSE;
14757 		fault_info->batch_pmap_op = FALSE;
14758 		fault_info->resilient_media = entry->vme_resilient_media;
14759 		fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
14760 		fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14761 #if __arm64e__
14762 		fault_info->fi_used_for_tpro = entry->used_for_tpro;
14763 #else /* __arm64e__ */
14764 		fault_info->fi_used_for_tpro = FALSE;
14765 #endif
14766 		if (entry->translated_allow_execute) {
14767 			fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14768 		}
14769 	}
14770 
14771 	/*
14772 	 *	Lock the object to prevent it from disappearing
14773 	 */
14774 	if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14775 		if (contended == NULL) {
14776 			vm_object_lock(*object);
14777 		} else {
14778 			*contended = vm_object_lock_check_contended(*object);
14779 		}
14780 	} else {
14781 		vm_object_lock_shared(*object);
14782 	}
14783 
14784 	/*
14785 	 *	Save the version number
14786 	 */
14787 
14788 	out_version->main_timestamp = map->timestamp;
14789 
14790 	return KERN_SUCCESS;
14791 }
14792 
14793 
14794 /*
14795  *	vm_map_verify:
14796  *
14797  *	Verifies that the map in question has not changed
14798  *	since the given version. The map has to be locked
14799  *	("shared" mode is fine) before calling this function
14800  *	and it will be returned locked too.
14801  */
14802 boolean_t
14803 vm_map_verify(
14804 	vm_map_t                map,
14805 	vm_map_version_t        *version)       /* REF */
14806 {
14807 	boolean_t       result;
14808 
14809 	vm_map_lock_assert_held(map);
14810 	result = (map->timestamp == version->main_timestamp);
14811 
14812 	return result;
14813 }
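/*
 * Illustrative sketch only (placeholder names): how a caller typically uses
 * the "version" saved by vm_map_lookup_and_lock_object() after having
 * dropped and re-taken the map lock:
 *
 *	vm_map_lock_read(map);
 *	if (!vm_map_verify(map, &version)) {
 *		// the map changed while unlocked: the cached entry/object
 *		// information may be stale, so redo the lookup
 *		vm_map_unlock_read(map);
 *		goto RetryFault;		// hypothetical retry label
 *	}
 */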
14814 
14815 /*
14816  *	TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
14817  *	Goes away after regular vm_region_recurse function migrates to
14818  *	64 bits
14819  *	vm_region_recurse: A form of vm_region which follows the
14820  *	submaps in a target map
14821  *
14822  */
14823 
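/*
 * Illustrative sketch only: typical user-level iteration over a task's
 * regions via the MIG wrapper (assuming the standard mach_vm_region_recurse()
 * interface); names and the termination handling are placeholders:
 *
 *	mach_vm_address_t addr = 0;
 *	mach_vm_size_t size = 0;
 *	natural_t depth = 0;
 *	vm_region_submap_info_data_64_t info;
 *	mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64;
 *
 *	while (mach_vm_region_recurse(task, &addr, &size, &depth,
 *	    (vm_region_recurse_info_t)&info, &count) == KERN_SUCCESS) {
 *		// [addr, addr + size) is a region at nesting level "depth";
 *		// bump "depth" to descend into a submap, or leave it to skip.
 *		addr += size;
 *		count = VM_REGION_SUBMAP_INFO_COUNT_64;
 *	}
 */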
14824 kern_return_t
14825 vm_map_region_recurse_64(
14826 	vm_map_t                 map,
14827 	vm_map_offset_t *address,               /* IN/OUT */
14828 	vm_map_size_t           *size,                  /* OUT */
14829 	natural_t               *nesting_depth, /* IN/OUT */
14830 	vm_region_submap_info_64_t      submap_info,    /* IN/OUT */
14831 	mach_msg_type_number_t  *count) /* IN/OUT */
14832 {
14833 	mach_msg_type_number_t  original_count;
14834 	vm_region_extended_info_data_t  extended;
14835 	vm_map_entry_t                  tmp_entry;
14836 	vm_map_offset_t                 user_address;
14837 	unsigned int                    user_max_depth;
14838 
14839 	/*
14840 	 * "curr_entry" is the VM map entry preceding or including the
14841 	 * address we're looking for.
14842 	 * "curr_map" is the map or sub-map containing "curr_entry".
14843 	 * "curr_address" is the equivalent of the top map's "user_address"
14844 	 * in the current map.
14845 	 * "curr_offset" is the cumulative offset of "curr_map" in the
14846 	 * target task's address space.
14847 	 * "curr_depth" is the depth of "curr_map" in the chain of
14848 	 * sub-maps.
14849 	 *
14850 	 * "curr_max_below" and "curr_max_above" limit the range (around
14851 	 * "curr_address") we should take into account in the current (sub)map.
14852 	 * They limit the range to what's visible through the map entries
14853 	 * we've traversed from the top map to the current map.
14854 	 *
14855 	 */
14856 	vm_map_entry_t                  curr_entry;
14857 	vm_map_address_t                curr_address;
14858 	vm_map_offset_t                 curr_offset;
14859 	vm_map_t                        curr_map;
14860 	unsigned int                    curr_depth;
14861 	vm_map_offset_t                 curr_max_below, curr_max_above;
14862 	vm_map_offset_t                 curr_skip;
14863 
14864 	/*
14865 	 * "next_" is the same as "curr_" but for the VM region immediately
14866 	 * after the address we're looking for.  We need to keep track of this
14867 	 * too because we want to return info about that region if the
14868 	 * address we're looking for is not mapped.
14869 	 */
14870 	vm_map_entry_t                  next_entry;
14871 	vm_map_offset_t                 next_offset;
14872 	vm_map_offset_t                 next_address;
14873 	vm_map_t                        next_map;
14874 	unsigned int                    next_depth;
14875 	vm_map_offset_t                 next_max_below, next_max_above;
14876 	vm_map_offset_t                 next_skip;
14877 
14878 	boolean_t                       look_for_pages;
14879 	vm_region_submap_short_info_64_t short_info;
14880 	boolean_t                       do_region_footprint;
14881 	int                             effective_page_size, effective_page_shift;
14882 	boolean_t                       submap_needed_copy;
14883 
14884 	if (map == VM_MAP_NULL) {
14885 		/* no address space to work on */
14886 		return KERN_INVALID_ARGUMENT;
14887 	}
14888 
14889 	effective_page_shift = vm_self_region_page_shift(map);
14890 	effective_page_size = (1 << effective_page_shift);
14891 
14892 	if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
14893 		/*
14894 		 * "info" structure is not big enough and
14895 		 * would overflow
14896 		 */
14897 		return KERN_INVALID_ARGUMENT;
14898 	}
14899 
14900 	do_region_footprint = task_self_region_footprint();
14901 	original_count = *count;
14902 
14903 	if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
14904 		*count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
14905 		look_for_pages = FALSE;
14906 		short_info = (vm_region_submap_short_info_64_t) submap_info;
14907 		submap_info = NULL;
14908 	} else {
14909 		look_for_pages = TRUE;
14910 		*count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
14911 		short_info = NULL;
14912 
14913 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
14914 			*count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
14915 		}
14916 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
14917 			*count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
14918 		}
14919 	}
14920 
14921 	user_address = *address;
14922 	user_max_depth = *nesting_depth;
14923 	submap_needed_copy = FALSE;
14924 
14925 	if (not_in_kdp) {
14926 		vm_map_lock_read(map);
14927 	}
14928 
14929 recurse_again:
14930 	curr_entry = NULL;
14931 	curr_map = map;
14932 	curr_address = user_address;
14933 	curr_offset = 0;
14934 	curr_skip = 0;
14935 	curr_depth = 0;
14936 	curr_max_above = ((vm_map_offset_t) -1) - curr_address;
14937 	curr_max_below = curr_address;
14938 
14939 	next_entry = NULL;
14940 	next_map = NULL;
14941 	next_address = 0;
14942 	next_offset = 0;
14943 	next_skip = 0;
14944 	next_depth = 0;
14945 	next_max_above = (vm_map_offset_t) -1;
14946 	next_max_below = (vm_map_offset_t) -1;
14947 
14948 	for (;;) {
14949 		if (vm_map_lookup_entry(curr_map,
14950 		    curr_address,
14951 		    &tmp_entry)) {
14952 			/* tmp_entry contains the address we're looking for */
14953 			curr_entry = tmp_entry;
14954 		} else {
14955 			vm_map_offset_t skip;
14956 			/*
14957 			 * The address is not mapped.  "tmp_entry" is the
14958 			 * map entry preceding the address.  We want the next
14959 			 * one, if it exists.
14960 			 */
14961 			curr_entry = tmp_entry->vme_next;
14962 
14963 			if (curr_entry == vm_map_to_entry(curr_map) ||
14964 			    (curr_entry->vme_start >=
14965 			    curr_address + curr_max_above)) {
14966 				/* no next entry at this level: stop looking */
14967 				if (not_in_kdp) {
14968 					vm_map_unlock_read(curr_map);
14969 				}
14970 				curr_entry = NULL;
14971 				curr_map = NULL;
14972 				curr_skip = 0;
14973 				curr_offset = 0;
14974 				curr_depth = 0;
14975 				curr_max_above = 0;
14976 				curr_max_below = 0;
14977 				break;
14978 			}
14979 
14980 			/* adjust current address and offset */
14981 			skip = curr_entry->vme_start - curr_address;
14982 			curr_address = curr_entry->vme_start;
14983 			curr_skip += skip;
14984 			curr_offset += skip;
14985 			curr_max_above -= skip;
14986 			curr_max_below = 0;
14987 		}
14988 
14989 		/*
14990 		 * Is the next entry at this level closer to the address (or
14991 		 * deeper in the submap chain) than the one we had
14992 		 * so far?
14993 		 */
14994 		tmp_entry = curr_entry->vme_next;
14995 		if (tmp_entry == vm_map_to_entry(curr_map)) {
14996 			/* no next entry at this level */
14997 		} else if (tmp_entry->vme_start >=
14998 		    curr_address + curr_max_above) {
14999 			/*
15000 			 * tmp_entry is beyond the scope of what we mapped of
15001 			 * this submap in the upper level: ignore it.
15002 			 */
15003 		} else if ((next_entry == NULL) ||
15004 		    (tmp_entry->vme_start + curr_offset <=
15005 		    next_entry->vme_start + next_offset)) {
15006 			/*
15007 			 * We didn't have a "next_entry" or this one is
15008 			 * closer to the address we're looking for:
15009 			 * use this "tmp_entry" as the new "next_entry".
15010 			 */
15011 			if (next_entry != NULL) {
15012 				/* unlock the last "next_map" */
15013 				if (next_map != curr_map && not_in_kdp) {
15014 					vm_map_unlock_read(next_map);
15015 				}
15016 			}
15017 			next_entry = tmp_entry;
15018 			next_map = curr_map;
15019 			next_depth = curr_depth;
15020 			next_address = next_entry->vme_start;
15021 			next_skip = curr_skip;
15022 			next_skip += (next_address - curr_address);
15023 			next_offset = curr_offset;
15024 			next_offset += (next_address - curr_address);
15025 			next_max_above = MIN(next_max_above, curr_max_above);
15026 			next_max_above = MIN(next_max_above,
15027 			    next_entry->vme_end - next_address);
15028 			next_max_below = MIN(next_max_below, curr_max_below);
15029 			next_max_below = MIN(next_max_below,
15030 			    next_address - next_entry->vme_start);
15031 		}
15032 
15033 		/*
15034 		 * "curr_max_{above,below}" allow us to keep track of the
15035 		 * portion of the submap that is actually mapped at this level:
15036 		 * the rest of that submap is irrelevant to us, since it's not
15037 		 * mapped here.
15038 		 * The relevant portion of the map starts at
15039 		 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
15040 		 */
15041 		curr_max_above = MIN(curr_max_above,
15042 		    curr_entry->vme_end - curr_address);
15043 		curr_max_below = MIN(curr_max_below,
15044 		    curr_address - curr_entry->vme_start);
15045 
15046 		if (!curr_entry->is_sub_map ||
15047 		    curr_depth >= user_max_depth) {
15048 			/*
15049 			 * We hit a leaf map or we reached the maximum depth
15050 			 * we could, so stop looking.  Keep the current map
15051 			 * locked.
15052 			 */
15053 			break;
15054 		}
15055 
15056 		/*
15057 		 * Get down to the next submap level.
15058 		 */
15059 
15060 		if (curr_entry->needs_copy) {
15061 			/* everything below this is effectively copy-on-write */
15062 			submap_needed_copy = TRUE;
15063 		}
15064 
15065 		/*
15066 		 * Lock the next level and unlock the current level,
15067 		 * unless we need to keep it locked to access the "next_entry"
15068 		 * later.
15069 		 */
15070 		if (not_in_kdp) {
15071 			vm_map_lock_read(VME_SUBMAP(curr_entry));
15072 		}
15073 		if (curr_map == next_map) {
15074 			/* keep "next_map" locked in case we need it */
15075 		} else {
15076 			/* release this map */
15077 			if (not_in_kdp) {
15078 				vm_map_unlock_read(curr_map);
15079 			}
15080 		}
15081 
15082 		/*
15083 		 * Adjust the offset.  "curr_entry" maps the submap
15084 		 * at relative address "curr_entry->vme_start" in the
15085 		 * curr_map but skips the first "VME_OFFSET(curr_entry)"
15086 		 * bytes of the submap.
15087 		 * "curr_offset" always represents the offset of a virtual
15088 		 * address in the curr_map relative to the absolute address
15089 		 * space (i.e. the top-level VM map).
15090 		 */
15091 		curr_offset +=
15092 		    (VME_OFFSET(curr_entry) - curr_entry->vme_start);
15093 		curr_address = user_address + curr_offset;
15094 		/* switch to the submap */
15095 		curr_map = VME_SUBMAP(curr_entry);
15096 		curr_depth++;
15097 		curr_entry = NULL;
15098 	}
15099 
15100 // LP64todo: all the current tools are 32-bit, so this obviously never worked for 64-bit;
15101 // it should probably be a real 32-bit ID rather than a pointer.
15102 // Current users just check for equality.
15103 
15104 	if (curr_entry == NULL) {
15105 		/* no VM region contains the address... */
15106 
15107 		if (do_region_footprint && /* we want footprint numbers */
15108 		    next_entry == NULL && /* & there are no more regions */
15109 		    /* & we haven't already provided our fake region: */
15110 		    user_address <= vm_map_last_entry(map)->vme_end) {
15111 			ledger_amount_t ledger_resident, ledger_compressed;
15112 
15113 			/*
15114 			 * Add a fake memory region to account for
15115 			 * purgeable and/or ledger-tagged memory that
15116 			 * counts towards this task's memory footprint,
15117 			 * i.e. the resident/compressed pages of non-volatile
15118 			 * objects owned by that task.
15119 			 */
15120 			task_ledgers_footprint(map->pmap->ledger,
15121 			    &ledger_resident,
15122 			    &ledger_compressed);
15123 			if (ledger_resident + ledger_compressed == 0) {
15124 				/* no purgeable memory usage to report */
15125 				return KERN_INVALID_ADDRESS;
15126 			}
15127 			/* fake region to show nonvolatile footprint */
15128 			if (look_for_pages) {
15129 				submap_info->protection = VM_PROT_DEFAULT;
15130 				submap_info->max_protection = VM_PROT_DEFAULT;
15131 				submap_info->inheritance = VM_INHERIT_DEFAULT;
15132 				submap_info->offset = 0;
15133 				submap_info->user_tag = -1;
15134 				submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
15135 				submap_info->pages_shared_now_private = 0;
15136 				submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
15137 				submap_info->pages_dirtied = submap_info->pages_resident;
15138 				submap_info->ref_count = 1;
15139 				submap_info->shadow_depth = 0;
15140 				submap_info->external_pager = 0;
15141 				submap_info->share_mode = SM_PRIVATE;
15142 				if (submap_needed_copy) {
15143 					submap_info->share_mode = SM_COW;
15144 				}
15145 				submap_info->is_submap = 0;
15146 				submap_info->behavior = VM_BEHAVIOR_DEFAULT;
15147 				submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15148 				submap_info->user_wired_count = 0;
15149 				submap_info->pages_reusable = 0;
15150 			} else {
15151 				short_info->user_tag = -1;
15152 				short_info->offset = 0;
15153 				short_info->protection = VM_PROT_DEFAULT;
15154 				short_info->inheritance = VM_INHERIT_DEFAULT;
15155 				short_info->max_protection = VM_PROT_DEFAULT;
15156 				short_info->behavior = VM_BEHAVIOR_DEFAULT;
15157 				short_info->user_wired_count = 0;
15158 				short_info->is_submap = 0;
15159 				short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15160 				short_info->external_pager = 0;
15161 				short_info->shadow_depth = 0;
15162 				short_info->share_mode = SM_PRIVATE;
15163 				if (submap_needed_copy) {
15164 					short_info->share_mode = SM_COW;
15165 				}
15166 				short_info->ref_count = 1;
15167 			}
15168 			*nesting_depth = 0;
15169 			*size = (vm_map_size_t) (ledger_resident + ledger_compressed);
15170 //			*address = user_address;
15171 			*address = vm_map_last_entry(map)->vme_end;
15172 			return KERN_SUCCESS;
15173 		}
15174 
15175 		if (next_entry == NULL) {
15176 			/* ... and no VM region follows it either */
15177 			return KERN_INVALID_ADDRESS;
15178 		}
15179 		/* ... gather info about the next VM region */
15180 		curr_entry = next_entry;
15181 		curr_map = next_map;    /* still locked ... */
15182 		curr_address = next_address;
15183 		curr_skip = next_skip;
15184 		curr_offset = next_offset;
15185 		curr_depth = next_depth;
15186 		curr_max_above = next_max_above;
15187 		curr_max_below = next_max_below;
15188 	} else {
15189 		/* we won't need "next_entry" after all */
15190 		if (next_entry != NULL) {
15191 			/* release "next_map" */
15192 			if (next_map != curr_map && not_in_kdp) {
15193 				vm_map_unlock_read(next_map);
15194 			}
15195 		}
15196 	}
15197 	next_entry = NULL;
15198 	next_map = NULL;
15199 	next_offset = 0;
15200 	next_skip = 0;
15201 	next_depth = 0;
15202 	next_max_below = -1;
15203 	next_max_above = -1;
15204 
15205 	if (curr_entry->is_sub_map &&
15206 	    curr_depth < user_max_depth) {
15207 		/*
15208 		 * We're not as deep as we could be:  we must have
15209 		 * gone back up after not finding anything mapped
15210 		 * below the original top-level map entry's range.
15211 		 * Let's move "curr_address" forward and recurse again.
15212 		 */
15213 		user_address = curr_address;
15214 		goto recurse_again;
15215 	}
15216 
15217 	*nesting_depth = curr_depth;
15218 	*size = curr_max_above + curr_max_below;
15219 	*address = user_address + curr_skip - curr_max_below;
15220 
15221 	if (look_for_pages) {
15222 		submap_info->user_tag = VME_ALIAS(curr_entry);
15223 		submap_info->offset = VME_OFFSET(curr_entry);
15224 		submap_info->protection = curr_entry->protection;
15225 		submap_info->inheritance = curr_entry->inheritance;
15226 		submap_info->max_protection = curr_entry->max_protection;
15227 		submap_info->behavior = curr_entry->behavior;
15228 		submap_info->user_wired_count = curr_entry->user_wired_count;
15229 		submap_info->is_submap = curr_entry->is_sub_map;
15230 		if (curr_entry->is_sub_map) {
15231 			submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15232 		} else {
15233 			submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15234 		}
15235 	} else {
15236 		short_info->user_tag = VME_ALIAS(curr_entry);
15237 		short_info->offset = VME_OFFSET(curr_entry);
15238 		short_info->protection = curr_entry->protection;
15239 		short_info->inheritance = curr_entry->inheritance;
15240 		short_info->max_protection = curr_entry->max_protection;
15241 		short_info->behavior = curr_entry->behavior;
15242 		short_info->user_wired_count = curr_entry->user_wired_count;
15243 		short_info->is_submap = curr_entry->is_sub_map;
15244 		if (curr_entry->is_sub_map) {
15245 			short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15246 		} else {
15247 			short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15248 		}
15249 	}
15250 
15251 	extended.pages_resident = 0;
15252 	extended.pages_swapped_out = 0;
15253 	extended.pages_shared_now_private = 0;
15254 	extended.pages_dirtied = 0;
15255 	extended.pages_reusable = 0;
15256 	extended.external_pager = 0;
15257 	extended.shadow_depth = 0;
15258 	extended.share_mode = SM_EMPTY;
15259 	extended.ref_count = 0;
15260 
15261 	if (not_in_kdp) {
15262 		if (!curr_entry->is_sub_map) {
15263 			vm_map_offset_t range_start, range_end;
15264 			range_start = MAX((curr_address - curr_max_below),
15265 			    curr_entry->vme_start);
15266 			range_end = MIN((curr_address + curr_max_above),
15267 			    curr_entry->vme_end);
15268 			vm_map_region_walk(curr_map,
15269 			    range_start,
15270 			    curr_entry,
15271 			    (VME_OFFSET(curr_entry) +
15272 			    (range_start -
15273 			    curr_entry->vme_start)),
15274 			    range_end - range_start,
15275 			    &extended,
15276 			    look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
15277 			if (extended.external_pager &&
15278 			    extended.ref_count == 2 &&
15279 			    extended.share_mode == SM_SHARED) {
15280 				extended.share_mode = SM_PRIVATE;
15281 			}
15282 			if (submap_needed_copy) {
15283 				extended.share_mode = SM_COW;
15284 			}
15285 		} else {
15286 			if (curr_entry->use_pmap) {
15287 				extended.share_mode = SM_TRUESHARED;
15288 			} else {
15289 				extended.share_mode = SM_PRIVATE;
15290 			}
15291 			extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
15292 		}
15293 	}
15294 
15295 	if (look_for_pages) {
15296 		submap_info->pages_resident = extended.pages_resident;
15297 		submap_info->pages_swapped_out = extended.pages_swapped_out;
15298 		submap_info->pages_shared_now_private =
15299 		    extended.pages_shared_now_private;
15300 		submap_info->pages_dirtied = extended.pages_dirtied;
15301 		submap_info->external_pager = extended.external_pager;
15302 		submap_info->shadow_depth = extended.shadow_depth;
15303 		submap_info->share_mode = extended.share_mode;
15304 		submap_info->ref_count = extended.ref_count;
15305 
15306 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15307 			submap_info->pages_reusable = extended.pages_reusable;
15308 		}
15309 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15310 			if (curr_entry->is_sub_map) {
15311 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_SUBMAP(curr_entry));
15312 			} else if (VME_OBJECT(curr_entry)) {
15313 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_OBJECT(curr_entry));
15314 			} else {
15315 				submap_info->object_id_full = 0ull;
15316 			}
15317 		}
15318 	} else {
15319 		short_info->external_pager = extended.external_pager;
15320 		short_info->shadow_depth = extended.shadow_depth;
15321 		short_info->share_mode = extended.share_mode;
15322 		short_info->ref_count = extended.ref_count;
15323 	}
15324 
15325 	if (not_in_kdp) {
15326 		vm_map_unlock_read(curr_map);
15327 	}
15328 
15329 	return KERN_SUCCESS;
15330 }
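
/*
 * Usage sketch: the recursion above backs the mach_vm_region_recurse()
 * user call.  A minimal user-space walk of a task's address space --
 * hypothetical variable names, error handling elided -- looks roughly
 * like this:
 *
 *	mach_vm_address_t addr = 0;
 *	mach_vm_size_t    size;
 *	natural_t         depth = 0;
 *	vm_region_submap_info_data_64_t info;
 *	mach_msg_type_number_t count;
 *
 *	for (;;) {
 *		count = VM_REGION_SUBMAP_INFO_COUNT_64;
 *		if (mach_vm_region_recurse(task, &addr, &size, &depth,
 *		    (vm_region_recurse_info_t)&info, &count) != KERN_SUCCESS) {
 *			break;		// no more regions
 *		}
 *		if (info.is_submap) {
 *			depth++;	// descend into the submap
 *			continue;	// and re-query the same address
 *		}
 *		// [addr, addr + size) is described by "info" at "depth"
 *		addr += size;
 *	}
 *
 * "addr", "size" and "depth" are in/out, mirroring the *address, *size
 * and *nesting_depth parameters handled above.
 */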
15331 
15332 /*
15333  *	vm_region:
15334  *
15335  *	User call to obtain information about a region in
15336  *	a task's address map. Currently, only one flavor is
15337  *	supported.
15338  *
15339  *	XXX The reserved and behavior fields cannot be filled
15340  *	    in until the vm merge from the IK is completed, and
15341  *	    vm_reserve is implemented.
15342  */
15343 
15344 kern_return_t
15345 vm_map_region(
15346 	vm_map_t                 map,
15347 	vm_map_offset_t *address,               /* IN/OUT */
15348 	vm_map_size_t           *size,                  /* OUT */
15349 	vm_region_flavor_t       flavor,                /* IN */
15350 	vm_region_info_t         info,                  /* OUT */
15351 	mach_msg_type_number_t  *count, /* IN/OUT */
15352 	mach_port_t             *object_name)           /* OUT */
15353 {
15354 	vm_map_entry_t          tmp_entry;
15355 	vm_map_entry_t          entry;
15356 	vm_map_offset_t         start;
15357 
15358 	if (map == VM_MAP_NULL) {
15359 		return KERN_INVALID_ARGUMENT;
15360 	}
15361 
15362 	switch (flavor) {
15363 	case VM_REGION_BASIC_INFO:
15364 		/* legacy for old 32-bit objects info */
15365 	{
15366 		vm_region_basic_info_t  basic;
15367 
15368 		if (*count < VM_REGION_BASIC_INFO_COUNT) {
15369 			return KERN_INVALID_ARGUMENT;
15370 		}
15371 
15372 		basic = (vm_region_basic_info_t) info;
15373 		*count = VM_REGION_BASIC_INFO_COUNT;
15374 
15375 		vm_map_lock_read(map);
15376 
15377 		start = *address;
15378 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15379 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15380 				vm_map_unlock_read(map);
15381 				return KERN_INVALID_ADDRESS;
15382 			}
15383 		} else {
15384 			entry = tmp_entry;
15385 		}
15386 
15387 		start = entry->vme_start;
15388 
15389 		basic->offset = (uint32_t)VME_OFFSET(entry);
15390 		basic->protection = entry->protection;
15391 		basic->inheritance = entry->inheritance;
15392 		basic->max_protection = entry->max_protection;
15393 		basic->behavior = entry->behavior;
15394 		basic->user_wired_count = entry->user_wired_count;
15395 		basic->reserved = entry->is_sub_map;
15396 		*address = start;
15397 		*size = (entry->vme_end - start);
15398 
15399 		if (object_name) {
15400 			*object_name = IP_NULL;
15401 		}
15402 		if (entry->is_sub_map) {
15403 			basic->shared = FALSE;
15404 		} else {
15405 			basic->shared = entry->is_shared;
15406 		}
15407 
15408 		vm_map_unlock_read(map);
15409 		return KERN_SUCCESS;
15410 	}
15411 
15412 	case VM_REGION_BASIC_INFO_64:
15413 	{
15414 		vm_region_basic_info_64_t       basic;
15415 
15416 		if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
15417 			return KERN_INVALID_ARGUMENT;
15418 		}
15419 
15420 		basic = (vm_region_basic_info_64_t) info;
15421 		*count = VM_REGION_BASIC_INFO_COUNT_64;
15422 
15423 		vm_map_lock_read(map);
15424 
15425 		start = *address;
15426 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15427 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15428 				vm_map_unlock_read(map);
15429 				return KERN_INVALID_ADDRESS;
15430 			}
15431 		} else {
15432 			entry = tmp_entry;
15433 		}
15434 
15435 		start = entry->vme_start;
15436 
15437 		basic->offset = VME_OFFSET(entry);
15438 		basic->protection = entry->protection;
15439 		basic->inheritance = entry->inheritance;
15440 		basic->max_protection = entry->max_protection;
15441 		basic->behavior = entry->behavior;
15442 		basic->user_wired_count = entry->user_wired_count;
15443 		basic->reserved = entry->is_sub_map;
15444 		*address = start;
15445 		*size = (entry->vme_end - start);
15446 
15447 		if (object_name) {
15448 			*object_name = IP_NULL;
15449 		}
15450 		if (entry->is_sub_map) {
15451 			basic->shared = FALSE;
15452 		} else {
15453 			basic->shared = entry->is_shared;
15454 		}
15455 
15456 		vm_map_unlock_read(map);
15457 		return KERN_SUCCESS;
15458 	}
15459 	case VM_REGION_EXTENDED_INFO:
15460 		if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
15461 			return KERN_INVALID_ARGUMENT;
15462 		}
15463 		OS_FALLTHROUGH;
15464 	case VM_REGION_EXTENDED_INFO__legacy:
15465 		if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
15466 			return KERN_INVALID_ARGUMENT;
15467 		}
15468 
15469 		{
15470 			vm_region_extended_info_t       extended;
15471 			mach_msg_type_number_t original_count;
15472 			int effective_page_size, effective_page_shift;
15473 
15474 			extended = (vm_region_extended_info_t) info;
15475 
15476 			effective_page_shift = vm_self_region_page_shift(map);
15477 			effective_page_size = (1 << effective_page_shift);
15478 
15479 			vm_map_lock_read(map);
15480 
15481 			start = *address;
15482 			if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15483 				if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15484 					vm_map_unlock_read(map);
15485 					return KERN_INVALID_ADDRESS;
15486 				}
15487 			} else {
15488 				entry = tmp_entry;
15489 			}
15490 			start = entry->vme_start;
15491 
15492 			extended->protection = entry->protection;
15493 			extended->user_tag = VME_ALIAS(entry);
15494 			extended->pages_resident = 0;
15495 			extended->pages_swapped_out = 0;
15496 			extended->pages_shared_now_private = 0;
15497 			extended->pages_dirtied = 0;
15498 			extended->external_pager = 0;
15499 			extended->shadow_depth = 0;
15500 
15501 			original_count = *count;
15502 			if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
15503 				*count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
15504 			} else {
15505 				extended->pages_reusable = 0;
15506 				*count = VM_REGION_EXTENDED_INFO_COUNT;
15507 			}
15508 
15509 			vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
15510 
15511 			if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
15512 				extended->share_mode = SM_PRIVATE;
15513 			}
15514 
15515 			if (object_name) {
15516 				*object_name = IP_NULL;
15517 			}
15518 			*address = start;
15519 			*size = (entry->vme_end - start);
15520 
15521 			vm_map_unlock_read(map);
15522 			return KERN_SUCCESS;
15523 		}
15524 	case VM_REGION_TOP_INFO:
15525 	{
15526 		vm_region_top_info_t    top;
15527 
15528 		if (*count < VM_REGION_TOP_INFO_COUNT) {
15529 			return KERN_INVALID_ARGUMENT;
15530 		}
15531 
15532 		top = (vm_region_top_info_t) info;
15533 		*count = VM_REGION_TOP_INFO_COUNT;
15534 
15535 		vm_map_lock_read(map);
15536 
15537 		start = *address;
15538 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15539 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15540 				vm_map_unlock_read(map);
15541 				return KERN_INVALID_ADDRESS;
15542 			}
15543 		} else {
15544 			entry = tmp_entry;
15545 		}
15546 		start = entry->vme_start;
15547 
15548 		top->private_pages_resident = 0;
15549 		top->shared_pages_resident = 0;
15550 
15551 		vm_map_region_top_walk(entry, top);
15552 
15553 		if (object_name) {
15554 			*object_name = IP_NULL;
15555 		}
15556 		*address = start;
15557 		*size = (entry->vme_end - start);
15558 
15559 		vm_map_unlock_read(map);
15560 		return KERN_SUCCESS;
15561 	}
15562 	default:
15563 		return KERN_INVALID_ARGUMENT;
15564 	}
15565 }
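
/*
 * Usage sketch: this routine backs the vm_region()/mach_vm_region() user
 * calls.  Fetching the basic info for the region containing an address,
 * with hypothetical variables and no error handling, looks roughly like:
 *
 *	mach_vm_address_t addr = (mach_vm_address_t)(uintptr_t)ptr;
 *	mach_vm_size_t    size;
 *	vm_region_basic_info_data_64_t info;
 *	mach_msg_type_number_t count = VM_REGION_BASIC_INFO_COUNT_64;
 *	mach_port_t       object_name;	// always returned as IP_NULL here
 *
 *	kern_return_t kr = mach_vm_region(mach_task_self(), &addr, &size,
 *	    VM_REGION_BASIC_INFO_64, (vm_region_info_t)&info,
 *	    &count, &object_name);
 *
 * On success, "addr" is rounded back to the start of the containing map
 * entry and "size" covers that entry, matching the *address and *size
 * updates above.
 */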
15566 
15567 #define OBJ_RESIDENT_COUNT(obj, entry_size)                             \
15568 	MIN((entry_size),                                               \
15569 	    ((obj)->all_reusable ?                                      \
15570 	     (obj)->wired_page_count :                                  \
15571 	     (obj)->resident_page_count - (obj)->reusable_page_count))
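
/*
 * OBJ_RESIDENT_COUNT() clamps an object's countable resident pages to the
 * size of the mapping.  Worked example: for an entry spanning 100 pages of
 * an object with resident_page_count == 80, reusable_page_count == 30 and
 * all_reusable == FALSE, the macro yields MIN(100, 80 - 30) == 50 pages;
 * if the object were all_reusable, only its wired_page_count would count.
 */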
15572 
15573 void
15574 vm_map_region_top_walk(
15575 	vm_map_entry_t             entry,
15576 	vm_region_top_info_t       top)
15577 {
15578 	if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
15579 		top->share_mode = SM_EMPTY;
15580 		top->ref_count = 0;
15581 		top->obj_id = 0;
15582 		return;
15583 	}
15584 
15585 	{
15586 		struct  vm_object *obj, *tmp_obj;
15587 		int             ref_count;
15588 		uint32_t        entry_size;
15589 
15590 		entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);
15591 
15592 		obj = VME_OBJECT(entry);
15593 
15594 		vm_object_lock(obj);
15595 
15596 		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15597 			ref_count--;
15598 		}
15599 
15600 		assert(obj->reusable_page_count <= obj->resident_page_count);
15601 		if (obj->shadow) {
15602 			if (ref_count == 1) {
15603 				top->private_pages_resident =
15604 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15605 			} else {
15606 				top->shared_pages_resident =
15607 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15608 			}
15609 			top->ref_count  = ref_count;
15610 			top->share_mode = SM_COW;
15611 
15612 			while ((tmp_obj = obj->shadow)) {
15613 				vm_object_lock(tmp_obj);
15614 				vm_object_unlock(obj);
15615 				obj = tmp_obj;
15616 
15617 				if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15618 					ref_count--;
15619 				}
15620 
15621 				assert(obj->reusable_page_count <= obj->resident_page_count);
15622 				top->shared_pages_resident +=
15623 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15624 				top->ref_count += ref_count - 1;
15625 			}
15626 		} else {
15627 			if (entry->superpage_size) {
15628 				top->share_mode = SM_LARGE_PAGE;
15629 				top->shared_pages_resident = 0;
15630 				top->private_pages_resident = entry_size;
15631 			} else if (entry->needs_copy) {
15632 				top->share_mode = SM_COW;
15633 				top->shared_pages_resident =
15634 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15635 			} else {
15636 				if (ref_count == 1 ||
15637 				    (ref_count == 2 && obj->named)) {
15638 					top->share_mode = SM_PRIVATE;
15639 					top->private_pages_resident =
15640 					    OBJ_RESIDENT_COUNT(obj,
15641 					    entry_size);
15642 				} else {
15643 					top->share_mode = SM_SHARED;
15644 					top->shared_pages_resident =
15645 					    OBJ_RESIDENT_COUNT(obj,
15646 					    entry_size);
15647 				}
15648 			}
15649 			top->ref_count = ref_count;
15650 		}
15651 
15652 		vm_object_unlock(obj);
15653 
15654 		/* XXX K64: obj_id will be truncated */
15655 		top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRHASH(obj);
15656 	}
15657 }
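
/*
 * Worked example of the shadow-chain accounting above: if an entry maps
 * object A (ref_count 1) which shadows object B (ref_count 3), the walk
 * reports share_mode SM_COW, counts A's pages as private (A's ref_count
 * is 1), adds B's resident pages to shared_pages_resident, and ends with
 * top->ref_count == 1 + (3 - 1) == 3.
 */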
15658 
15659 void
15660 vm_map_region_walk(
15661 	vm_map_t                        map,
15662 	vm_map_offset_t                 va,
15663 	vm_map_entry_t                  entry,
15664 	vm_object_offset_t              offset,
15665 	vm_object_size_t                range,
15666 	vm_region_extended_info_t       extended,
15667 	boolean_t                       look_for_pages,
15668 	mach_msg_type_number_t count)
15669 {
15670 	struct vm_object *obj, *tmp_obj;
15671 	vm_map_offset_t       last_offset;
15672 	int               i;
15673 	int               ref_count;
15674 	struct vm_object        *shadow_object;
15675 	unsigned short          shadow_depth;
15676 	boolean_t         do_region_footprint;
15677 	int                     effective_page_size, effective_page_shift;
15678 	vm_map_offset_t         effective_page_mask;
15679 
15680 	do_region_footprint = task_self_region_footprint();
15681 
15682 	if ((entry->is_sub_map) ||
15683 	    (VME_OBJECT(entry) == 0) ||
15684 	    (VME_OBJECT(entry)->phys_contiguous &&
15685 	    !entry->superpage_size)) {
15686 		extended->share_mode = SM_EMPTY;
15687 		extended->ref_count = 0;
15688 		return;
15689 	}
15690 
15691 	if (entry->superpage_size) {
15692 		extended->shadow_depth = 0;
15693 		extended->share_mode = SM_LARGE_PAGE;
15694 		extended->ref_count = 1;
15695 		extended->external_pager = 0;
15696 
15697 		/* TODO4K: Superpage in 4k mode? */
15698 		extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
15699 		extended->shadow_depth = 0;
15700 		return;
15701 	}
15702 
15703 	effective_page_shift = vm_self_region_page_shift(map);
15704 	effective_page_size = (1 << effective_page_shift);
15705 	effective_page_mask = effective_page_size - 1;
15706 
15707 	offset = vm_map_trunc_page(offset, effective_page_mask);
15708 
15709 	obj = VME_OBJECT(entry);
15710 
15711 	vm_object_lock(obj);
15712 
15713 	if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15714 		ref_count--;
15715 	}
15716 
15717 	if (look_for_pages) {
15718 		for (last_offset = offset + range;
15719 		    offset < last_offset;
15720 		    offset += effective_page_size, va += effective_page_size) {
15721 			if (do_region_footprint) {
15722 				int disp;
15723 
15724 				disp = 0;
15725 				if (map->has_corpse_footprint) {
15726 					/*
15727 					 * Query the page info data we saved
15728 					 * while forking the corpse.
15729 					 */
15730 					vm_map_corpse_footprint_query_page_info(
15731 						map,
15732 						va,
15733 						&disp);
15734 				} else {
15735 					/*
15736 					 * Query the pmap.
15737 					 */
15738 					vm_map_footprint_query_page_info(
15739 						map,
15740 						entry,
15741 						va,
15742 						&disp);
15743 				}
15744 				if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
15745 					extended->pages_resident++;
15746 				}
15747 				if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
15748 					extended->pages_reusable++;
15749 				}
15750 				if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
15751 					extended->pages_dirtied++;
15752 				}
15753 				if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
15754 					extended->pages_swapped_out++;
15755 				}
15756 				continue;
15757 			}
15758 
15759 			vm_map_region_look_for_page(map, va, obj,
15760 			    vm_object_trunc_page(offset), ref_count,
15761 			    0, extended, count);
15762 		}
15763 
15764 		if (do_region_footprint) {
15765 			goto collect_object_info;
15766 		}
15767 	} else {
15768 collect_object_info:
15769 		shadow_object = obj->shadow;
15770 		shadow_depth = 0;
15771 
15772 		if (!(obj->internal)) {
15773 			extended->external_pager = 1;
15774 		}
15775 
15776 		if (shadow_object != VM_OBJECT_NULL) {
15777 			vm_object_lock(shadow_object);
15778 			for (;
15779 			    shadow_object != VM_OBJECT_NULL;
15780 			    shadow_depth++) {
15781 				vm_object_t     next_shadow;
15782 
15783 				if (!(shadow_object->internal)) {
15784 					extended->external_pager = 1;
15785 				}
15786 
15787 				next_shadow = shadow_object->shadow;
15788 				if (next_shadow) {
15789 					vm_object_lock(next_shadow);
15790 				}
15791 				vm_object_unlock(shadow_object);
15792 				shadow_object = next_shadow;
15793 			}
15794 		}
15795 		extended->shadow_depth = shadow_depth;
15796 	}
15797 
15798 	if (extended->shadow_depth || entry->needs_copy) {
15799 		extended->share_mode = SM_COW;
15800 	} else {
15801 		if (ref_count == 1) {
15802 			extended->share_mode = SM_PRIVATE;
15803 		} else {
15804 			if (obj->true_share) {
15805 				extended->share_mode = SM_TRUESHARED;
15806 			} else {
15807 				extended->share_mode = SM_SHARED;
15808 			}
15809 		}
15810 	}
15811 	extended->ref_count = ref_count - extended->shadow_depth;
15812 
15813 	for (i = 0; i < extended->shadow_depth; i++) {
15814 		if ((tmp_obj = obj->shadow) == 0) {
15815 			break;
15816 		}
15817 		vm_object_lock(tmp_obj);
15818 		vm_object_unlock(obj);
15819 
15820 		if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) {
15821 			ref_count--;
15822 		}
15823 
15824 		extended->ref_count += ref_count;
15825 		obj = tmp_obj;
15826 	}
15827 	vm_object_unlock(obj);
15828 
15829 	if (extended->share_mode == SM_SHARED) {
15830 		vm_map_entry_t       cur;
15831 		vm_map_entry_t       last;
15832 		int      my_refs;
15833 
15834 		obj = VME_OBJECT(entry);
15835 		last = vm_map_to_entry(map);
15836 		my_refs = 0;
15837 
15838 		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15839 			ref_count--;
15840 		}
15841 		for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
15842 			my_refs += vm_map_region_count_obj_refs(cur, obj);
15843 		}
15844 
15845 		if (my_refs == ref_count) {
15846 			extended->share_mode = SM_PRIVATE_ALIASED;
15847 		} else if (my_refs > 1) {
15848 			extended->share_mode = SM_SHARED_ALIASED;
15849 		}
15850 	}
15851 }
15852 
15853 
15854 /* object is locked on entry and locked on return */
15855 
15856 
15857 static void
15858 vm_map_region_look_for_page(
15859 	__unused vm_map_t               map,
15860 	__unused vm_map_offset_t        va,
15861 	vm_object_t                     object,
15862 	vm_object_offset_t              offset,
15863 	int                             max_refcnt,
15864 	unsigned short                  depth,
15865 	vm_region_extended_info_t       extended,
15866 	mach_msg_type_number_t count)
15867 {
15868 	vm_page_t       p;
15869 	vm_object_t     shadow;
15870 	int             ref_count;
15871 	vm_object_t     caller_object;
15872 
15873 	shadow = object->shadow;
15874 	caller_object = object;
15875 
15876 
15877 	while (TRUE) {
15878 		if (!(object->internal)) {
15879 			extended->external_pager = 1;
15880 		}
15881 
15882 		if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
15883 			if (shadow && (max_refcnt == 1)) {
15884 				extended->pages_shared_now_private++;
15885 			}
15886 
15887 			if (!p->vmp_fictitious &&
15888 			    (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
15889 				extended->pages_dirtied++;
15890 			} else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
15891 				if (p->vmp_reusable || object->all_reusable) {
15892 					extended->pages_reusable++;
15893 				}
15894 			}
15895 
15896 			extended->pages_resident++;
15897 
15898 			if (object != caller_object) {
15899 				vm_object_unlock(object);
15900 			}
15901 
15902 			return;
15903 		}
15904 		if (object->internal &&
15905 		    object->alive &&
15906 		    !object->terminating &&
15907 		    object->pager_ready) {
15908 			if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset)
15909 			    == VM_EXTERNAL_STATE_EXISTS) {
15910 				/* the pager has that page */
15911 				extended->pages_swapped_out++;
15912 				if (object != caller_object) {
15913 					vm_object_unlock(object);
15914 				}
15915 				return;
15916 			}
15917 		}
15918 
15919 		if (shadow) {
15920 			vm_object_lock(shadow);
15921 
15922 			if ((ref_count = shadow->ref_count) > 1 && shadow->paging_in_progress) {
15923 				ref_count--;
15924 			}
15925 
15926 			if (++depth > extended->shadow_depth) {
15927 				extended->shadow_depth = depth;
15928 			}
15929 
15930 			if (ref_count > max_refcnt) {
15931 				max_refcnt = ref_count;
15932 			}
15933 
15934 			if (object != caller_object) {
15935 				vm_object_unlock(object);
15936 			}
15937 
15938 			offset = offset + object->vo_shadow_offset;
15939 			object = shadow;
15940 			shadow = object->shadow;
15941 			continue;
15942 		}
15943 		if (object != caller_object) {
15944 			vm_object_unlock(object);
15945 		}
15946 		break;
15947 	}
15948 }
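
/*
 * Worked example of the loop above: with a two-level chain where the top
 * object shadows its backing object at vo_shadow_offset 0x1000, a lookup
 * at offset 0x2000 first checks the top object at 0x2000; if the page is
 * neither resident there nor known to the compressor, the walk drops to
 * the shadow object at offset 0x3000, bumping "depth" (and possibly
 * extended->shadow_depth) to 1 before looking again.
 */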
15949 
15950 static int
15951 vm_map_region_count_obj_refs(
15952 	vm_map_entry_t    entry,
15953 	vm_object_t       object)
15954 {
15955 	int ref_count;
15956 	vm_object_t chk_obj;
15957 	vm_object_t tmp_obj;
15958 
15959 	if (entry->is_sub_map || VME_OBJECT(entry) == VM_OBJECT_NULL) {
15960 		return 0;
15961 	}
15962 
15963 	ref_count = 0;
15964 	chk_obj = VME_OBJECT(entry);
15965 	vm_object_lock(chk_obj);
15966 
15967 	while (chk_obj) {
15968 		if (chk_obj == object) {
15969 			ref_count++;
15970 		}
15971 		tmp_obj = chk_obj->shadow;
15972 		if (tmp_obj) {
15973 			vm_object_lock(tmp_obj);
15974 		}
15975 		vm_object_unlock(chk_obj);
15976 
15977 		chk_obj = tmp_obj;
15978 	}
15979 
15980 	return ref_count;
15981 }
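
/*
 * Example: if an entry's object chain is C -> B -> A (C shadowing B,
 * B shadowing A), vm_map_region_count_obj_refs(entry, B) returns 1.
 * vm_map_region_walk() sums these counts over every entry in the map to
 * decide whether an SM_SHARED object is really only aliased within this
 * map (SM_PRIVATE_ALIASED) or shared through several entries
 * (SM_SHARED_ALIASED).
 */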
15982 
15983 
15984 /*
15985  *	Routine:	vm_map_simplify
15986  *
15987  *	Description:
15988  *		Attempt to simplify the map representation in
15989  *		the vicinity of the given starting address.
15990  *	Note:
15991  *		This routine is intended primarily to keep the
15992  *		kernel maps more compact -- they generally don't
15993  *		benefit from the "expand a map entry" technology
15994  *		at allocation time because the adjacent entry
15995  *		is often wired down.
15996  */
15997 void
15998 vm_map_simplify_entry(
15999 	vm_map_t        map,
16000 	vm_map_entry_t  this_entry)
16001 {
16002 	vm_map_entry_t  prev_entry;
16003 
16004 	prev_entry = this_entry->vme_prev;
16005 
16006 	if ((this_entry != vm_map_to_entry(map)) &&
16007 	    (prev_entry != vm_map_to_entry(map)) &&
16008 
16009 	    (prev_entry->vme_end == this_entry->vme_start) &&
16010 
16011 	    (prev_entry->is_sub_map == this_entry->is_sub_map) &&
16012 	    (prev_entry->vme_object_value == this_entry->vme_object_value) &&
16013 	    (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
16014 	    ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
16015 	    prev_entry->vme_start))
16016 	    == VME_OFFSET(this_entry)) &&
16017 
16018 	    (prev_entry->behavior == this_entry->behavior) &&
16019 	    (prev_entry->needs_copy == this_entry->needs_copy) &&
16020 	    (prev_entry->protection == this_entry->protection) &&
16021 	    (prev_entry->max_protection == this_entry->max_protection) &&
16022 	    (prev_entry->inheritance == this_entry->inheritance) &&
16023 	    (prev_entry->use_pmap == this_entry->use_pmap) &&
16024 	    (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
16025 	    (prev_entry->no_cache == this_entry->no_cache) &&
16026 	    (prev_entry->vme_permanent == this_entry->vme_permanent) &&
16027 	    (prev_entry->map_aligned == this_entry->map_aligned) &&
16028 	    (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
16029 	    (prev_entry->used_for_jit == this_entry->used_for_jit) &&
16030 #if __arm64e__
16031 	    (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
16032 #endif
16033 	    (prev_entry->csm_associated == this_entry->csm_associated) &&
16034 	    (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
16035 	    (prev_entry->iokit_acct == this_entry->iokit_acct) &&
16036 	    (prev_entry->vme_resilient_codesign ==
16037 	    this_entry->vme_resilient_codesign) &&
16038 	    (prev_entry->vme_resilient_media ==
16039 	    this_entry->vme_resilient_media) &&
16040 	    (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
16041 	    (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&
16042 
16043 	    (prev_entry->wired_count == this_entry->wired_count) &&
16044 	    (prev_entry->user_wired_count == this_entry->user_wired_count) &&
16045 
16046 	    ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
16047 	    (prev_entry->in_transition == FALSE) &&
16048 	    (this_entry->in_transition == FALSE) &&
16049 	    (prev_entry->needs_wakeup == FALSE) &&
16050 	    (this_entry->needs_wakeup == FALSE) &&
16051 	    (prev_entry->is_shared == this_entry->is_shared) &&
16052 	    (prev_entry->superpage_size == FALSE) &&
16053 	    (this_entry->superpage_size == FALSE)
16054 	    ) {
16055 		if (prev_entry->vme_permanent) {
16056 			assert(this_entry->vme_permanent);
16057 			prev_entry->vme_permanent = false;
16058 		}
16059 		vm_map_store_entry_unlink(map, prev_entry, true);
16060 		assert(prev_entry->vme_start < this_entry->vme_end);
16061 		if (prev_entry->map_aligned) {
16062 			assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
16063 			    VM_MAP_PAGE_MASK(map)));
16064 		}
16065 		this_entry->vme_start = prev_entry->vme_start;
16066 		VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));
16067 
16068 		if (map->holelistenabled) {
16069 			vm_map_store_update_first_free(map, this_entry, TRUE);
16070 		}
16071 
16072 		if (prev_entry->is_sub_map) {
16073 			vm_map_deallocate(VME_SUBMAP(prev_entry));
16074 		} else {
16075 			vm_object_deallocate(VME_OBJECT(prev_entry));
16076 		}
16077 		vm_map_entry_dispose(prev_entry);
16078 		SAVE_HINT_MAP_WRITE(map, this_entry);
16079 	}
16080 }
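
/*
 * Coalescing example: given an object O mapped by two adjacent entries
 *
 *	[0x1000, 0x2000) at VME_OFFSET 0, and
 *	[0x2000, 0x3000) at VME_OFFSET 0x1000,
 *
 * with all other attributes equal, vm_map_simplify_entry() on the second
 * entry folds them into a single entry [0x1000, 0x3000) at offset 0:
 * prev_entry is unlinked, this_entry->vme_start and VME_OFFSET(this_entry)
 * are pulled back, and the extra reference on O (or on the submap) is
 * dropped.
 */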
16081 
16082 void
16083 vm_map_simplify(
16084 	vm_map_t        map,
16085 	vm_map_offset_t start)
16086 {
16087 	vm_map_entry_t  this_entry;
16088 
16089 	vm_map_lock(map);
16090 	if (vm_map_lookup_entry(map, start, &this_entry)) {
16091 		vm_map_simplify_entry(map, this_entry);
16092 		vm_map_simplify_entry(map, this_entry->vme_next);
16093 	}
16094 	vm_map_unlock(map);
16095 }
16096 
16097 static void
16098 vm_map_simplify_range(
16099 	vm_map_t        map,
16100 	vm_map_offset_t start,
16101 	vm_map_offset_t end)
16102 {
16103 	vm_map_entry_t  entry;
16104 
16105 	/*
16106 	 * The map should be locked (for "write") by the caller.
16107 	 */
16108 
16109 	if (start >= end) {
16110 		/* invalid address range */
16111 		return;
16112 	}
16113 
16114 	start = vm_map_trunc_page(start,
16115 	    VM_MAP_PAGE_MASK(map));
16116 	end = vm_map_round_page(end,
16117 	    VM_MAP_PAGE_MASK(map));
16118 
16119 	if (!vm_map_lookup_entry(map, start, &entry)) {
16120 		/* "start" is not mapped and "entry" ends before "start" */
16121 		if (entry == vm_map_to_entry(map)) {
16122 			/* start with first entry in the map */
16123 			entry = vm_map_first_entry(map);
16124 		} else {
16125 			/* start with next entry */
16126 			entry = entry->vme_next;
16127 		}
16128 	}
16129 
16130 	while (entry != vm_map_to_entry(map) &&
16131 	    entry->vme_start <= end) {
16132 		/* try and coalesce "entry" with its previous entry */
16133 		/* try to coalesce "entry" with its previous entry */
16134 		entry = entry->vme_next;
16135 	}
16136 }
16137 
16138 
16139 /*
16140  *	Routine:	vm_map_machine_attribute
16141  *	Purpose:
16142  *		Provide machine-specific attributes to mappings,
16143  *		such as cacheability etc. for machines that provide
16144  *		them.  NUMA architectures and machines with big/strange
16145  *		caches will use this.
16146  *	Note:
16147  *		Responsibilities for locking and checking are handled here,
16148  *		everything else in the pmap module. If any non-volatile
16149  *		information must be kept, the pmap module should handle
16150  *		it itself. [This assumes that attributes do not
16151  *		need to be inherited, which seems ok to me]
16152  */
16153 kern_return_t
16154 vm_map_machine_attribute(
16155 	vm_map_t                        map,
16156 	vm_map_offset_t         start,
16157 	vm_map_offset_t         end,
16158 	vm_machine_attribute_t  attribute,
16159 	vm_machine_attribute_val_t* value)              /* IN/OUT */
16160 {
16161 	kern_return_t   ret;
16162 	vm_map_size_t sync_size;
16163 	vm_map_entry_t entry;
16164 
16165 	if (start < vm_map_min(map) || end > vm_map_max(map)) {
16166 		return KERN_INVALID_ADDRESS;
16167 	}
16168 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16169 		return KERN_INVALID_ADDRESS;
16170 	}
16171 
16172 	/* Figure how much memory we need to flush (in page increments) */
16173 	sync_size = end - start;
16174 
16175 	vm_map_lock(map);
16176 
16177 	if (attribute != MATTR_CACHE) {
16178 		/* If we don't have to find physical addresses, we */
16179 		/* don't have to do an explicit traversal here.    */
16180 		ret = pmap_attribute(map->pmap, start, end - start,
16181 		    attribute, value);
16182 		vm_map_unlock(map);
16183 		return ret;
16184 	}
16185 
16186 	ret = KERN_SUCCESS;                                                                             /* Assume it all worked */
16187 
16188 	while (sync_size) {
16189 		if (vm_map_lookup_entry(map, start, &entry)) {
16190 			vm_map_size_t   sub_size;
16191 			if ((entry->vme_end - start) > sync_size) {
16192 				sub_size = sync_size;
16193 				sync_size = 0;
16194 			} else {
16195 				sub_size = entry->vme_end - start;
16196 				sync_size -= sub_size;
16197 			}
16198 			if (entry->is_sub_map) {
16199 				vm_map_offset_t sub_start;
16200 				vm_map_offset_t sub_end;
16201 
16202 				sub_start = (start - entry->vme_start)
16203 				    + VME_OFFSET(entry);
16204 				sub_end = sub_start + sub_size;
16205 				vm_map_machine_attribute(
16206 					VME_SUBMAP(entry),
16207 					sub_start,
16208 					sub_end,
16209 					attribute, value);
16210 			} else if (VME_OBJECT(entry)) {
16211 				vm_page_t               m;
16212 				vm_object_t             object;
16213 				vm_object_t             base_object;
16214 				vm_object_t             last_object;
16215 				vm_object_offset_t      offset;
16216 				vm_object_offset_t      base_offset;
16217 				vm_map_size_t           range;
16218 				range = sub_size;
16219 				offset = (start - entry->vme_start)
16220 				    + VME_OFFSET(entry);
16221 				offset = vm_object_trunc_page(offset);
16222 				base_offset = offset;
16223 				object = VME_OBJECT(entry);
16224 				base_object = object;
16225 				last_object = NULL;
16226 
16227 				vm_object_lock(object);
16228 
16229 				while (range) {
16230 					m = vm_page_lookup(
16231 						object, offset);
16232 
16233 					if (m && !m->vmp_fictitious) {
16234 						ret =
16235 						    pmap_attribute_cache_sync(
16236 							VM_PAGE_GET_PHYS_PAGE(m),
16237 							PAGE_SIZE,
16238 							attribute, value);
16239 					} else if (object->shadow) {
16240 						offset = offset + object->vo_shadow_offset;
16241 						last_object = object;
16242 						object = object->shadow;
16243 						vm_object_lock(last_object->shadow);
16244 						vm_object_unlock(last_object);
16245 						continue;
16246 					}
16247 					if (range < PAGE_SIZE) {
16248 						range = 0;
16249 					} else {
16250 						range -= PAGE_SIZE;
16251 					}
16252 
16253 					if (base_object != object) {
16254 						vm_object_unlock(object);
16255 						vm_object_lock(base_object);
16256 						object = base_object;
16257 					}
16258 					/* Bump to the next page */
16259 					base_offset += PAGE_SIZE;
16260 					offset = base_offset;
16261 				}
16262 				vm_object_unlock(object);
16263 			}
16264 			start += sub_size;
16265 		} else {
16266 			vm_map_unlock(map);
16267 			return KERN_FAILURE;
16268 		}
16269 	}
16270 
16271 	vm_map_unlock(map);
16272 
16273 	return ret;
16274 }
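
/*
 * Usage sketch: the MATTR_CACHE path above is reachable from user space
 * through vm_machine_attribute(), e.g. (hypothetical "addr"/"len", on
 * hardware whose pmap actually implements it):
 *
 *	vm_machine_attribute_val_t value = MATTR_VAL_CACHE_FLUSH;
 *	kern_return_t kr = vm_machine_attribute(mach_task_self(),
 *	    addr, len, MATTR_CACHE, &value);
 *
 * Each resident page in the range is pushed through
 * pmap_attribute_cache_sync(); architectures with coherent caches may
 * simply return success without doing any per-page work.
 */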
16275 
16276 /*
16277  *	vm_map_behavior_set:
16278  *
16279  *	Sets the paging reference behavior of the specified address
16280  *	range in the target map.  Paging reference behavior affects
16281  *	how pagein operations resulting from faults on the map will be
16282  *	clustered.
16283  */
16284 kern_return_t
16285 vm_map_behavior_set(
16286 	vm_map_t        map,
16287 	vm_map_offset_t start,
16288 	vm_map_offset_t end,
16289 	vm_behavior_t   new_behavior)
16290 {
16291 	vm_map_entry_t  entry;
16292 	vm_map_entry_t  temp_entry;
16293 
16294 	if (start > end ||
16295 	    start < vm_map_min(map) ||
16296 	    end > vm_map_max(map)) {
16297 		return KERN_NO_SPACE;
16298 	}
16299 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16300 		return KERN_INVALID_ADDRESS;
16301 	}
16302 
16303 	switch (new_behavior) {
16304 	/*
16305 	 * This first block of behaviors all set a persistent state on the specified
16306 	 * memory range.  All we have to do here is to record the desired behavior
16307 	 * in the vm_map_entry_t's.
16308 	 */
16309 
16310 	case VM_BEHAVIOR_DEFAULT:
16311 	case VM_BEHAVIOR_RANDOM:
16312 	case VM_BEHAVIOR_SEQUENTIAL:
16313 	case VM_BEHAVIOR_RSEQNTL:
16314 	case VM_BEHAVIOR_ZERO_WIRED_PAGES:
16315 		vm_map_lock(map);
16316 
16317 		/*
16318 		 *	The entire address range must be valid for the map.
16319 		 *      Note that vm_map_range_check() does a
16320 		 *	vm_map_lookup_entry() internally and returns the
16321 		 *	entry containing the start of the address range if
16322 		 *	the entire range is valid.
16323 		 */
16324 		if (vm_map_range_check(map, start, end, &temp_entry)) {
16325 			entry = temp_entry;
16326 			vm_map_clip_start(map, entry, start);
16327 		} else {
16328 			vm_map_unlock(map);
16329 			return KERN_INVALID_ADDRESS;
16330 		}
16331 
16332 		while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
16333 			vm_map_clip_end(map, entry, end);
16334 			if (entry->is_sub_map) {
16335 				assert(!entry->use_pmap);
16336 			}
16337 
16338 			if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16339 				entry->zero_wired_pages = TRUE;
16340 			} else {
16341 				entry->behavior = new_behavior;
16342 			}
16343 			entry = entry->vme_next;
16344 		}
16345 
16346 		vm_map_unlock(map);
16347 		break;
16348 
16349 	/*
16350 	 * The rest of these are different from the above in that they cause
16351 	 * an immediate action to take place as opposed to setting a behavior that
16352 	 * affects future actions.
16353 	 */
16354 
16355 	case VM_BEHAVIOR_WILLNEED:
16356 		return vm_map_willneed(map, start, end);
16357 
16358 	case VM_BEHAVIOR_DONTNEED:
16359 		return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
16360 
16361 	case VM_BEHAVIOR_FREE:
16362 		return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
16363 
16364 	case VM_BEHAVIOR_REUSABLE:
16365 		return vm_map_reusable_pages(map, start, end);
16366 
16367 	case VM_BEHAVIOR_REUSE:
16368 		return vm_map_reuse_pages(map, start, end);
16369 
16370 	case VM_BEHAVIOR_CAN_REUSE:
16371 		return vm_map_can_reuse(map, start, end);
16372 
16373 #if MACH_ASSERT
16374 	case VM_BEHAVIOR_PAGEOUT:
16375 		return vm_map_pageout(map, start, end);
16376 #endif /* MACH_ASSERT */
16377 
16378 	case VM_BEHAVIOR_ZERO:
16379 		return vm_map_zero(map, start, end);
16380 
16381 	default:
16382 		return KERN_INVALID_ARGUMENT;
16383 	}
16384 
16385 	return KERN_SUCCESS;
16386 }
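
/*
 * Usage sketch: vm_map_behavior_set() is reached from user space via
 * madvise(2) (and mach_vm_behavior_set()).  The BSD layer translates the
 * advice roughly as follows: MADV_WILLNEED -> VM_BEHAVIOR_WILLNEED,
 * MADV_DONTNEED -> VM_BEHAVIOR_DONTNEED, MADV_FREE -> VM_BEHAVIOR_FREE,
 * MADV_FREE_REUSABLE -> VM_BEHAVIOR_REUSABLE and MADV_FREE_REUSE ->
 * VM_BEHAVIOR_REUSE, so a typical caller only sees:
 *
 *	if (madvise(buf, size, MADV_FREE_REUSABLE) == -1) {
 *		// advice not taken; the mapping itself is still valid
 *	}
 */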
16387 
16388 
16389 /*
16390  * Internals for madvise(MADV_WILLNEED) system call.
16391  *
16392  * The implementation is to:
16393  * a) read ahead if the mapping corresponds to a mapped regular file, or
16394  * b) fault in the pages (zero-fill, decompress, etc.) if it's an anonymous mapping.
16395  */
16396 
16397 
16398 static kern_return_t
16399 vm_map_willneed(
16400 	vm_map_t        map,
16401 	vm_map_offset_t start,
16402 	vm_map_offset_t end
16403 	)
16404 {
16405 	vm_map_entry_t                  entry;
16406 	vm_object_t                     object;
16407 	memory_object_t                 pager;
16408 	struct vm_object_fault_info     fault_info = {};
16409 	kern_return_t                   kr;
16410 	vm_object_size_t                len;
16411 	vm_object_offset_t              offset;
16412 
16413 	fault_info.interruptible = THREAD_UNINT;        /* ignored value */
16414 	fault_info.behavior      = VM_BEHAVIOR_SEQUENTIAL;
16415 	fault_info.stealth       = TRUE;
16416 
16417 	/*
16418 	 * The MADV_WILLNEED operation doesn't require any changes to the
16419 	 * vm_map_entry_t's, so the read lock is sufficient.
16420 	 */
16421 
16422 	vm_map_lock_read(map);
16423 
16424 	/*
16425 	 * The madvise semantics require that the address range be fully
16426 	 * allocated with no holes.  Otherwise, we're required to return
16427 	 * an error.
16428 	 */
16429 
16430 	if (!vm_map_range_check(map, start, end, &entry)) {
16431 		vm_map_unlock_read(map);
16432 		return KERN_INVALID_ADDRESS;
16433 	}
16434 
16435 	/*
16436 	 * Examine each vm_map_entry_t in the range.
16437 	 */
16438 	for (; entry != vm_map_to_entry(map) && start < end;) {
16439 		/*
16440 		 * The first time through, the start address could be anywhere
16441 		 * within the vm_map_entry we found.  So adjust the offset to
16442 		 * correspond.  After that, the offset will always be zero to
16443 		 * correspond to the beginning of the current vm_map_entry.
16444 		 */
16445 		offset = (start - entry->vme_start) + VME_OFFSET(entry);
16446 
16447 		/*
16448 		 * Set the length so we don't go beyond the end of the
16449 		 * map_entry or beyond the end of the range we were given.
16450 		 * This range could also span multiple map entries, all of which
16451 		 * map different files, so make sure we only do the right amount
16452 		 * of I/O for each object.  Note that it's possible for there
16453 		 * to be multiple map entries all referring to the same object
16454 		 * but with different page permissions, but it's not worth
16455 		 * trying to optimize that case.
16456 		 */
16457 		len = MIN(entry->vme_end - start, end - start);
16458 
16459 		if ((vm_size_t) len != len) {
16460 			/* 32-bit overflow */
16461 			len = (vm_size_t) (0 - PAGE_SIZE);
16462 		}
16463 		fault_info.cluster_size = (vm_size_t) len;
16464 		fault_info.lo_offset    = offset;
16465 		fault_info.hi_offset    = offset + len;
16466 		fault_info.user_tag     = VME_ALIAS(entry);
16467 		fault_info.pmap_options = 0;
16468 		if (entry->iokit_acct ||
16469 		    (!entry->is_sub_map && !entry->use_pmap)) {
16470 			fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
16471 		}
16472 		fault_info.fi_xnu_user_debug = entry->vme_xnu_user_debug;
16473 
16474 		/*
16475 		 * If the entry is a submap OR there's no read permission
16476 		 * to this mapping, then just skip it.
16477 		 */
16478 		if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
16479 			entry = entry->vme_next;
16480 			start = entry->vme_start;
16481 			continue;
16482 		}
16483 
16484 		object = VME_OBJECT(entry);
16485 
16486 		if (object == NULL ||
16487 		    (object && object->internal)) {
16488 			/*
16489 			 * Memory range backed by anonymous memory.
16490 			 */
16491 			vm_size_t region_size = 0, effective_page_size = 0;
16492 			vm_map_offset_t addr = 0, effective_page_mask = 0;
16493 
16494 			region_size = len;
16495 			addr = start;
16496 
16497 			effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
16498 			effective_page_size = effective_page_mask + 1;
16499 
16500 			vm_map_unlock_read(map);
16501 
16502 			while (region_size) {
16503 				vm_pre_fault(
16504 					vm_map_trunc_page(addr, effective_page_mask),
16505 					VM_PROT_READ | VM_PROT_WRITE);
16506 
16507 				region_size -= effective_page_size;
16508 				addr += effective_page_size;
16509 			}
16510 		} else {
16511 			/*
16512 			 * Find the file object backing this map entry.  If there is
16513 			 * none, then we simply ignore the "will need" advice for this
16514 			 * entry and go on to the next one.
16515 			 */
16516 			if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
16517 				entry = entry->vme_next;
16518 				start = entry->vme_start;
16519 				continue;
16520 			}
16521 
16522 			vm_object_paging_begin(object);
16523 			pager = object->pager;
16524 			vm_object_unlock(object);
16525 
16526 			/*
16527 			 * The data_request() could take a long time, so let's
16528 			 * release the map lock to avoid blocking other threads.
16529 			 */
16530 			vm_map_unlock_read(map);
16531 
16532 			/*
16533 			 * Get the data from the object asynchronously.
16534 			 *
16535 			 * Note that memory_object_data_request() places limits on the
16536 			 * amount of I/O it will do.  Regardless of the len we
16537 			 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
16538 			 * silently truncates the len to that size.  This isn't
16539 			 * necessarily bad since madvise shouldn't really be used to
16540 			 * page in unlimited amounts of data.  Other Unix variants
16541 			 * limit the willneed case as well.  If this turns out to be an
16542 			 * issue for developers, then we can always adjust the policy
16543 			 * here and still be backwards compatible since this is all
16544 			 * just "advice".
16545 			 */
16546 			kr = memory_object_data_request(
16547 				pager,
16548 				vm_object_trunc_page(offset) + object->paging_offset,
16549 				0,      /* ignored */
16550 				VM_PROT_READ,
16551 				(memory_object_fault_info_t)&fault_info);
16552 
16553 			vm_object_lock(object);
16554 			vm_object_paging_end(object);
16555 			vm_object_unlock(object);
16556 
16557 			/*
16558 			 * If we couldn't do the I/O for some reason, just give up on
16559 			 * the madvise.  We still return success to the user since
16560 			 * madvise isn't supposed to fail when the advice can't be
16561 			 * taken.
16562 			 */
16563 
16564 			if (kr != KERN_SUCCESS) {
16565 				return KERN_SUCCESS;
16566 			}
16567 		}
16568 
16569 		start += len;
16570 		if (start >= end) {
16571 			/* done */
16572 			return KERN_SUCCESS;
16573 		}
16574 
16575 		/* look up next entry */
16576 		vm_map_lock_read(map);
16577 		if (!vm_map_lookup_entry(map, start, &entry)) {
16578 			/*
16579 			 * There's a new hole in the address range.
16580 			 */
16581 			vm_map_unlock_read(map);
16582 			return KERN_INVALID_ADDRESS;
16583 		}
16584 	}
16585 
16586 	vm_map_unlock_read(map);
16587 	return KERN_SUCCESS;
16588 }
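
/*
 * Example: after mmap()ing a large file, a reader that knows it will scan
 * the whole mapping can prime it with
 *
 *	madvise(base, length, MADV_WILLNEED);
 *
 * For file-backed entries this issues an asynchronous
 * memory_object_data_request() (silently capped per request, as noted
 * above); for anonymous entries it simply pre-faults the pages in.
 */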
16589 
16590 static boolean_t
16591 vm_map_entry_is_reusable(
16592 	vm_map_entry_t entry)
16593 {
16594 	/* Only user map entries */
16595 
16596 	vm_object_t object;
16597 
16598 	if (entry->is_sub_map) {
16599 		return FALSE;
16600 	}
16601 
16602 	switch (VME_ALIAS(entry)) {
16603 	case VM_MEMORY_MALLOC:
16604 	case VM_MEMORY_MALLOC_SMALL:
16605 	case VM_MEMORY_MALLOC_LARGE:
16606 	case VM_MEMORY_REALLOC:
16607 	case VM_MEMORY_MALLOC_TINY:
16608 	case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16609 	case VM_MEMORY_MALLOC_LARGE_REUSED:
16610 		/*
16611 		 * This is a malloc() memory region: check if it's still
16612 		 * in its original state and can be re-used for more
16613 		 * malloc() allocations.
16614 		 */
16615 		break;
16616 	default:
16617 		/*
16618 		 * Not a malloc() memory region: let the caller decide if
16619 		 * it's re-usable.
16620 		 */
16621 		return TRUE;
16622 	}
16623 
16624 	if (/*entry->is_shared ||*/
16625 		entry->is_sub_map ||
16626 		entry->in_transition ||
16627 		entry->protection != VM_PROT_DEFAULT ||
16628 		entry->max_protection != VM_PROT_ALL ||
16629 		entry->inheritance != VM_INHERIT_DEFAULT ||
16630 		entry->no_cache ||
16631 		entry->vme_permanent ||
16632 		entry->superpage_size != FALSE ||
16633 		entry->zero_wired_pages ||
16634 		entry->wired_count != 0 ||
16635 		entry->user_wired_count != 0) {
16636 		return FALSE;
16637 	}
16638 
16639 	object = VME_OBJECT(entry);
16640 	if (object == VM_OBJECT_NULL) {
16641 		return TRUE;
16642 	}
16643 	if (
16644 #if 0
16645 		/*
16646 		 * Let's proceed even if the VM object is potentially
16647 		 * shared.
16648 		 * We check for this later when processing the actual
16649 		 * VM pages, so the contents will be safe if shared.
16650 		 *
16651 		 * But we can still mark this memory region as "reusable" to
16652 		 * acknowledge that the caller did let us know that the memory
16653 		 * could be re-used and should not be penalized for holding
16654 		 * on to it.  This allows its "resident size" to not include
16655 		 * the reusable range.
16656 		 */
16657 		object->ref_count == 1 &&
16658 #endif
16659 		object->vo_copy == VM_OBJECT_NULL &&
16660 		object->shadow == VM_OBJECT_NULL &&
16661 		object->internal &&
16662 		object->purgable == VM_PURGABLE_DENY &&
16663 		object->wimg_bits == VM_WIMG_USE_DEFAULT &&
16664 		!object->code_signed) {
16665 		return TRUE;
16666 	}
16667 	return FALSE;
16668 }
16669 
16670 static kern_return_t
16671 vm_map_reuse_pages(
16672 	vm_map_t        map,
16673 	vm_map_offset_t start,
16674 	vm_map_offset_t end)
16675 {
16676 	vm_map_entry_t                  entry;
16677 	vm_object_t                     object;
16678 	vm_object_offset_t              start_offset, end_offset;
16679 
16680 	/*
16681 	 * The MADV_REUSE operation doesn't require any changes to the
16682 	 * vm_map_entry_t's, so the read lock is sufficient.
16683 	 */
16684 
16685 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16686 		/*
16687 		 * XXX TODO4K
16688 		 * need to figure out what reusable means for a
16689 		 * portion of a native page.
16690 		 */
16691 		return KERN_SUCCESS;
16692 	}
16693 
16694 	vm_map_lock_read(map);
16695 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16696 
16697 	/*
16698 	 * The madvise semantics require that the address range be fully
16699 	 * allocated with no holes.  Otherwise, we're required to return
16700 	 * an error.
16701 	 */
16702 
16703 	if (!vm_map_range_check(map, start, end, &entry)) {
16704 		vm_map_unlock_read(map);
16705 		vm_page_stats_reusable.reuse_pages_failure++;
16706 		return KERN_INVALID_ADDRESS;
16707 	}
16708 
16709 	/*
16710 	 * Examine each vm_map_entry_t in the range.
16711 	 */
16712 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16713 	    entry = entry->vme_next) {
16714 		/*
16715 		 * Sanity check on the VM map entry.
16716 		 */
16717 		if (!vm_map_entry_is_reusable(entry)) {
16718 			vm_map_unlock_read(map);
16719 			vm_page_stats_reusable.reuse_pages_failure++;
16720 			return KERN_INVALID_ADDRESS;
16721 		}
16722 
16723 		/*
16724 		 * The first time through, the start address could be anywhere
16725 		 * within the vm_map_entry we found.  So adjust the offset to
16726 		 * correspond.
16727 		 */
16728 		if (entry->vme_start < start) {
16729 			start_offset = start - entry->vme_start;
16730 		} else {
16731 			start_offset = 0;
16732 		}
16733 		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16734 		start_offset += VME_OFFSET(entry);
16735 		end_offset += VME_OFFSET(entry);
16736 
16737 		object = VME_OBJECT(entry);
16738 		if (object != VM_OBJECT_NULL) {
16739 			vm_object_lock(object);
16740 			vm_object_reuse_pages(object, start_offset, end_offset,
16741 			    TRUE);
16742 			vm_object_unlock(object);
16743 		}
16744 
16745 		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
16746 			/*
16747 			 * XXX
16748 			 * We do not hold the VM map exclusively here.
16749 			 * The "alias" field is not that critical, so it's
16750 			 * safe to update it here, as long as it is the only
16751 			 * one that can be modified while holding the VM map
16752 			 * "shared".
16753 			 */
16754 			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
16755 		}
16756 	}
16757 
16758 	vm_map_unlock_read(map);
16759 	vm_page_stats_reusable.reuse_pages_success++;
16760 	return KERN_SUCCESS;
16761 }
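/*
 * Illustrative sketch (assumption, not part of the original source): user
 * space typically reaches vm_map_reuse_pages() / vm_map_reusable_pages()
 * through madvise(2) on a malloc-tagged region, roughly:
 *
 *	madvise(buf, len, MADV_FREE_REUSABLE);	// mark pages reusable
 *	...
 *	madvise(buf, len, MADV_FREE_REUSE);	// reclaim before touching buf again
 *
 * The user-level constants and the dispatch through vm_behavior_set() are
 * assumptions here; the range and per-entry checks above are what actually
 * gate the operation in the kernel.
 */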
16762 
16763 
16764 static kern_return_t
16765 vm_map_reusable_pages(
16766 	vm_map_t        map,
16767 	vm_map_offset_t start,
16768 	vm_map_offset_t end)
16769 {
16770 	vm_map_entry_t                  entry;
16771 	vm_object_t                     object;
16772 	vm_object_offset_t              start_offset, end_offset;
16773 	vm_map_offset_t                 pmap_offset;
16774 
16775 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16776 		/*
16777 		 * XXX TODO4K
16778 		 * need to figure out what reusable means for a portion
16779 		 * of a native page.
16780 		 */
16781 		return KERN_SUCCESS;
16782 	}
16783 
16784 	/*
16785 	 * The MADV_REUSABLE operation doesn't require any changes to the
16786 	 * vm_map_entry_t's, so the read lock is sufficient.
16787 	 */
16788 
16789 	vm_map_lock_read(map);
16790 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16791 
16792 	/*
16793 	 * The madvise semantics require that the address range be fully
16794 	 * allocated with no holes.  Otherwise, we're required to return
16795 	 * an error.
16796 	 */
16797 
16798 	if (!vm_map_range_check(map, start, end, &entry)) {
16799 		vm_map_unlock_read(map);
16800 		vm_page_stats_reusable.reusable_pages_failure++;
16801 		return KERN_INVALID_ADDRESS;
16802 	}
16803 
16804 	/*
16805 	 * Examine each vm_map_entry_t in the range.
16806 	 */
16807 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16808 	    entry = entry->vme_next) {
16809 		int kill_pages = 0;
16810 		boolean_t reusable_no_write = FALSE;
16811 
16812 		/*
16813 		 * Sanity check on the VM map entry.
16814 		 */
16815 		if (!vm_map_entry_is_reusable(entry)) {
16816 			vm_map_unlock_read(map);
16817 			vm_page_stats_reusable.reusable_pages_failure++;
16818 			return KERN_INVALID_ADDRESS;
16819 		}
16820 
16821 		if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
16822 #if __arm64e__
16823 		    && !entry->used_for_tpro
16824 #endif
16825 		    ) {
16826 			/* not writable: can't discard contents */
16827 			vm_map_unlock_read(map);
16828 			vm_page_stats_reusable.reusable_nonwritable++;
16829 			vm_page_stats_reusable.reusable_pages_failure++;
16830 			return KERN_PROTECTION_FAILURE;
16831 		}
16832 
16833 		/*
16834 		 * The first time through, the start address could be anywhere
16835 		 * within the vm_map_entry we found.  So adjust the offset to
16836 		 * correspond.
16837 		 */
16838 		if (entry->vme_start < start) {
16839 			start_offset = start - entry->vme_start;
16840 			pmap_offset = start;
16841 		} else {
16842 			start_offset = 0;
16843 			pmap_offset = entry->vme_start;
16844 		}
16845 		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16846 		start_offset += VME_OFFSET(entry);
16847 		end_offset += VME_OFFSET(entry);
16848 
16849 		object = VME_OBJECT(entry);
16850 		if (object == VM_OBJECT_NULL) {
16851 			continue;
16852 		}
16853 
16854 		if (entry->protection & VM_PROT_EXECUTE) {
16855 			/*
16856 			 * Executable mappings might be write-protected by
16857 			 * hardware, so do not attempt to write to these pages.
16858 			 */
16859 			reusable_no_write = TRUE;
16860 		}
16861 
16862 		if (entry->vme_xnu_user_debug) {
16863 			/*
16864 			 * User debug pages might be write-protected by hardware,
16865 			 * so do not attempt to write to these pages.
16866 			 */
16867 			reusable_no_write = TRUE;
16868 		}
16869 
16870 		vm_object_lock(object);
16871 		if (((object->ref_count == 1) ||
16872 		    (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
16873 		    object->vo_copy == VM_OBJECT_NULL)) &&
16874 		    object->shadow == VM_OBJECT_NULL &&
16875 		    /*
16876 		     * "iokit_acct" entries are billed for their virtual size
16877 		     * (rather than for their resident pages only), so they
16878 		     * wouldn't benefit from making pages reusable, and it
16879 		     * would be hard to keep track of pages that are both
16880 		     * "iokit_acct" and "reusable" in the pmap stats and
16881 		     * ledgers.
16882 		     */
16883 		    !(entry->iokit_acct ||
16884 		    (!entry->is_sub_map && !entry->use_pmap))) {
16885 			if (object->ref_count != 1) {
16886 				vm_page_stats_reusable.reusable_shared++;
16887 			}
16888 			kill_pages = 1;
16889 		} else {
16890 			kill_pages = -1;
16891 		}
16892 		if (kill_pages != -1) {
16893 			vm_object_deactivate_pages(object,
16894 			    start_offset,
16895 			    end_offset - start_offset,
16896 			    kill_pages,
16897 			    TRUE /*reusable_pages*/,
16898 			    reusable_no_write,
16899 			    map->pmap,
16900 			    pmap_offset);
16901 		} else {
16902 			vm_page_stats_reusable.reusable_pages_shared++;
16903 			DTRACE_VM4(vm_map_reusable_pages_shared,
16904 			    unsigned int, VME_ALIAS(entry),
16905 			    vm_map_t, map,
16906 			    vm_map_entry_t, entry,
16907 			    vm_object_t, object);
16908 		}
16909 		vm_object_unlock(object);
16910 
16911 		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
16912 		    VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
16913 			/*
16914 			 * XXX
16915 			 * We do not hold the VM map exclusively here.
16916 			 * The "alias" field is not that critical, so it's
16917 			 * safe to update it here, as long as it is the only
16918 			 * one that can be modified while holding the VM map
16919 			 * "shared".
16920 			 */
16921 			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
16922 		}
16923 	}
16924 
16925 	vm_map_unlock_read(map);
16926 	vm_page_stats_reusable.reusable_pages_success++;
16927 	return KERN_SUCCESS;
16928 }
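/*
 * Note (descriptive, not part of the original source): the "kill_pages"
 * decision above separates objects whose pages may be discarded outright
 * (single reference, or an asymmetric copy strategy with no pending copy
 * object) from shared symmetric-copy objects; the latter are only counted
 * in vm_page_stats_reusable.reusable_pages_shared and left untouched by
 * vm_object_deactivate_pages().
 */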
16929 
16930 
16931 static kern_return_t
16932 vm_map_can_reuse(
16933 	vm_map_t        map,
16934 	vm_map_offset_t start,
16935 	vm_map_offset_t end)
16936 {
16937 	vm_map_entry_t                  entry;
16938 
16939 	/*
16940 	 * The MADV_CAN_REUSE operation doesn't require any changes to the
16941 	 * vm_map_entry_t's, so the read lock is sufficient.
16942 	 */
16943 
16944 	vm_map_lock_read(map);
16945 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16946 
16947 	/*
16948 	 * The madvise semantics require that the address range be fully
16949 	 * allocated with no holes.  Otherwise, we're required to return
16950 	 * an error.
16951 	 */
16952 
16953 	if (!vm_map_range_check(map, start, end, &entry)) {
16954 		vm_map_unlock_read(map);
16955 		vm_page_stats_reusable.can_reuse_failure++;
16956 		return KERN_INVALID_ADDRESS;
16957 	}
16958 
16959 	/*
16960 	 * Examine each vm_map_entry_t in the range.
16961 	 */
16962 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16963 	    entry = entry->vme_next) {
16964 		/*
16965 		 * Sanity check on the VM map entry.
16966 		 */
16967 		if (!vm_map_entry_is_reusable(entry)) {
16968 			vm_map_unlock_read(map);
16969 			vm_page_stats_reusable.can_reuse_failure++;
16970 			return KERN_INVALID_ADDRESS;
16971 		}
16972 	}
16973 
16974 	vm_map_unlock_read(map);
16975 	vm_page_stats_reusable.can_reuse_success++;
16976 	return KERN_SUCCESS;
16977 }
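/*
 * Note (descriptive, not part of the original source): vm_map_can_reuse()
 * is the dry-run counterpart of the two routines above; it only verifies
 * that every entry in the range would pass vm_map_entry_is_reusable(),
 * without touching any pages, pmap mappings or ledgers.
 */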
16978 
16979 
16980 #if MACH_ASSERT
16981 static kern_return_t
16982 vm_map_pageout(
16983 	vm_map_t        map,
16984 	vm_map_offset_t start,
16985 	vm_map_offset_t end)
16986 {
16987 	vm_map_entry_t                  entry;
16988 
16989 	/*
16990 	 * The MADV_PAGEOUT operation doesn't require any changes to the
16991 	 * vm_map_entry_t's, so the read lock is sufficient.
16992 	 */
16993 
16994 	vm_map_lock_read(map);
16995 
16996 	/*
16997 	 * The madvise semantics require that the address range be fully
16998 	 * allocated with no holes.  Otherwise, we're required to return
16999 	 * an error.
17000 	 */
17001 
17002 	if (!vm_map_range_check(map, start, end, &entry)) {
17003 		vm_map_unlock_read(map);
17004 		return KERN_INVALID_ADDRESS;
17005 	}
17006 
17007 	/*
17008 	 * Examine each vm_map_entry_t in the range.
17009 	 */
17010 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17011 	    entry = entry->vme_next) {
17012 		vm_object_t     object;
17013 
17014 		/*
17015 		 * Sanity check on the VM map entry.
17016 		 */
17017 		if (entry->is_sub_map) {
17018 			vm_map_t submap;
17019 			vm_map_offset_t submap_start;
17020 			vm_map_offset_t submap_end;
17021 			vm_map_entry_t submap_entry;
17022 
17023 			submap = VME_SUBMAP(entry);
17024 			submap_start = VME_OFFSET(entry);
17025 			submap_end = submap_start + (entry->vme_end -
17026 			    entry->vme_start);
17027 
17028 			vm_map_lock_read(submap);
17029 
17030 			if (!vm_map_range_check(submap,
17031 			    submap_start,
17032 			    submap_end,
17033 			    &submap_entry)) {
17034 				vm_map_unlock_read(submap);
17035 				vm_map_unlock_read(map);
17036 				return KERN_INVALID_ADDRESS;
17037 			}
17038 
17039 			if (submap_entry->is_sub_map) {
17040 				vm_map_unlock_read(submap);
17041 				continue;
17042 			}
17043 
17044 			object = VME_OBJECT(submap_entry);
17045 			if (object == VM_OBJECT_NULL || !object->internal) {
17046 				vm_map_unlock_read(submap);
17047 				continue;
17048 			}
17049 
17050 			vm_object_pageout(object);
17051 
17052 			vm_map_unlock_read(submap);
17053 			submap = VM_MAP_NULL;
17054 			submap_entry = VM_MAP_ENTRY_NULL;
17055 			continue;
17056 		}
17057 
17058 		object = VME_OBJECT(entry);
17059 		if (object == VM_OBJECT_NULL || !object->internal) {
17060 			continue;
17061 		}
17062 
17063 		vm_object_pageout(object);
17064 	}
17065 
17066 	vm_map_unlock_read(map);
17067 	return KERN_SUCCESS;
17068 }
17069 #endif /* MACH_ASSERT */
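/*
 * Note (descriptive, not part of the original source): vm_map_pageout() is
 * only built under MACH_ASSERT (development/debug kernels).  Unlike the
 * reuse/reusable paths above, it tolerates submap entries by descending one
 * level into the submap and paging out the backing internal objects directly.
 */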
17070 
17071 /*
17072  * This function determines if the zero operation can be run on the
17073  * respective entry. Additional checks on the object are in
17074  * vm_object_zero_preflight.
17075  */
17076 static kern_return_t
17077 vm_map_zero_entry_preflight(vm_map_entry_t entry)
17078 {
17079 	/*
17080 	 * Zeroing is restricted to writable non-executable entries and non-JIT
17081 	 * regions.
17082 	 */
17083 	if (!(entry->protection & VM_PROT_WRITE) ||
17084 	    (entry->protection & VM_PROT_EXECUTE) ||
17085 	    entry->used_for_jit ||
17086 	    entry->vme_xnu_user_debug) {
17087 		return KERN_PROTECTION_FAILURE;
17088 	}
17089 
17090 	/*
17091 	 * Zeroing for copy on write isn't yet supported. Zeroing is also not
17092 	 * allowed for submaps.
17093 	 */
17094 	if (entry->needs_copy || entry->is_sub_map) {
17095 		return KERN_NO_ACCESS;
17096 	}
17097 
17098 	return KERN_SUCCESS;
17099 }
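/*
 * Note (descriptive, not part of the original source): the two failure codes
 * above differ: KERN_PROTECTION_FAILURE covers ranges the caller may not
 * legitimately overwrite (read-only, executable, JIT or user-debug mappings),
 * while KERN_NO_ACCESS covers ranges the zeroing fast path does not handle
 * yet (copy-on-write or submap entries).
 */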
17100 
17101 /*
17102  * This function translates entry's start and end to offsets in the object
17103  */
17104 static void
17105 vm_map_get_bounds_in_object(
17106 	vm_map_entry_t      entry,
17107 	vm_map_offset_t     start,
17108 	vm_map_offset_t     end,
17109 	vm_map_offset_t    *start_offset,
17110 	vm_map_offset_t    *end_offset)
17111 {
17112 	if (entry->vme_start < start) {
17113 		*start_offset = start - entry->vme_start;
17114 	} else {
17115 		*start_offset = 0;
17116 	}
17117 	*end_offset = MIN(end, entry->vme_end) - entry->vme_start;
17118 	*start_offset += VME_OFFSET(entry);
17119 	*end_offset += VME_OFFSET(entry);
17120 }
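/*
 * Worked example (illustrative, not part of the original source): for an
 * entry with vme_start == 0x10000, vme_end == 0x20000 and VME_OFFSET ==
 * 0x5000, a request for [0x14000, 0x30000) yields
 *
 *	*start_offset = (0x14000 - 0x10000) + 0x5000 = 0x9000
 *	*end_offset   = (0x20000 - 0x10000) + 0x5000 = 0x15000
 *
 * i.e. the requested range clipped to the entry and expressed as offsets
 * into the backing VM object.
 */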
17121 
17122 /*
17123  * This function iterates through the entries in the requested range
17124  * and zeroes any resident pages in the corresponding objects. Compressed
17125  * pages are dropped instead of being faulted in and zeroed.
17126  */
17127 static kern_return_t
17128 vm_map_zero(
17129 	vm_map_t        map,
17130 	vm_map_offset_t start,
17131 	vm_map_offset_t end)
17132 {
17133 	vm_map_entry_t                  entry;
17134 	vm_map_offset_t                 cur = start;
17135 	kern_return_t                   ret;
17136 
17137 	/*
17138 	 * This operation isn't supported where the map page size is less than
17139 	 * the hardware page size. Caller will need to handle error and
17140 	 * explicitly zero memory if needed.
17141 	 */
17142 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17143 		return KERN_NO_ACCESS;
17144 	}
17145 
17146 	/*
17147 	 * The MADV_ZERO operation doesn't require any changes to the
17148 	 * vm_map_entry_t's, so the read lock is sufficient.
17149 	 */
17150 	vm_map_lock_read(map);
17151 	assert(map->pmap != kernel_pmap);       /* protect alias access */
17152 
17153 	/*
17154 	 * The madvise semantics require that the address range be fully
17155 	 * allocated with no holes. Otherwise, we're required to return
17156 	 * an error. This check needs to be redone if the map has changed.
17157 	 */
17158 	if (!vm_map_range_check(map, cur, end, &entry)) {
17159 		vm_map_unlock_read(map);
17160 		return KERN_INVALID_ADDRESS;
17161 	}
17162 
17163 	/*
17164 	 * Examine each vm_map_entry_t in the range.
17165 	 */
17166 	while (entry != vm_map_to_entry(map) && entry->vme_start < end) {
17167 		vm_map_offset_t cur_offset;
17168 		vm_map_offset_t end_offset;
17169 		unsigned int last_timestamp = map->timestamp;
17170 		vm_object_t object = VME_OBJECT(entry);
17171 
17172 		ret = vm_map_zero_entry_preflight(entry);
17173 		if (ret != KERN_SUCCESS) {
17174 			vm_map_unlock_read(map);
17175 			return ret;
17176 		}
17177 
17178 		if (object == VM_OBJECT_NULL) {
17179 			entry = entry->vme_next;
17180 			continue;
17181 		}
17182 
17183 		vm_map_get_bounds_in_object(entry, cur, end, &cur_offset, &end_offset);
17184 		vm_object_lock(object);
17185 		/*
17186 		 * Take a reference on the object as vm_object_zero will drop the object
17187 		 * lock when it encounters a busy page.
17188 		 */
17189 		vm_object_reference_locked(object);
17190 		vm_map_unlock_read(map);
17191 
17192 		ret = vm_object_zero(object, cur_offset, end_offset);
17193 		vm_object_unlock(object);
17194 		vm_object_deallocate(object);
17195 		if (ret != KERN_SUCCESS) {
17196 			return ret;
17197 		}
17198 		/*
17199 		 * Update cur as vm_object_zero has succeeded.
17200 		 */
17201 		cur += (end_offset - cur_offset);
17202 		if (cur == end) {
17203 			return KERN_SUCCESS;
17204 		}
17205 
17206 		/*
17207 		 * If the map timestamp has changed, restart by relooking up cur in the
17208 		 * map
17209 		 */
17210 		vm_map_lock_read(map);
17211 		if (last_timestamp != map->timestamp) {
17212 			/*
17213 			 * Relookup cur in the map
17214 			 */
17215 			if (!vm_map_range_check(map, cur, end, &entry)) {
17216 				vm_map_unlock_read(map);
17217 				return KERN_INVALID_ADDRESS;
17218 			}
17219 			continue;
17220 		}
17221 		/*
17222 		 * If the map hasn't changed proceed with the next entry
17223 		 */
17224 		entry = entry->vme_next;
17225 	}
17226 
17227 	vm_map_unlock_read(map);
17228 	return KERN_SUCCESS;
17229 }
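/*
 * Note (descriptive, not part of the original source): because vm_map_zero()
 * drops the map read lock around each vm_object_zero() call, it records
 * map->timestamp per iteration and, if the map changed while unlocked,
 * re-runs vm_map_range_check() from the updated "cur" address rather than
 * trusting the stale entry pointer.  A hole punched into the range by a
 * concurrent deallocation therefore still surfaces as KERN_INVALID_ADDRESS.
 */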
17230 
17231 
17232 /*
17233  *	Routine:	vm_map_entry_insert
17234  *
17235  *	Description:	This routine inserts a new vm_entry in a locked map.
17236  */
17237 static vm_map_entry_t
17238 vm_map_entry_insert(
17239 	vm_map_t                map,
17240 	vm_map_entry_t          insp_entry,
17241 	vm_map_offset_t         start,
17242 	vm_map_offset_t         end,
17243 	vm_object_t             object,
17244 	vm_object_offset_t      offset,
17245 	vm_map_kernel_flags_t   vmk_flags,
17246 	boolean_t               needs_copy,
17247 	vm_prot_t               cur_protection,
17248 	vm_prot_t               max_protection,
17249 	vm_inherit_t            inheritance,
17250 	boolean_t               clear_map_aligned)
17251 {
17252 	vm_map_entry_t  new_entry;
17253 	boolean_t map_aligned = FALSE;
17254 
17255 	assert(insp_entry != (vm_map_entry_t)0);
17256 	vm_map_lock_assert_exclusive(map);
17257 
17258 #if DEVELOPMENT || DEBUG
17259 	vm_object_offset_t      end_offset = 0;
17260 	assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
17261 #endif /* DEVELOPMENT || DEBUG */
17262 
17263 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
17264 		map_aligned = TRUE;
17265 	}
17266 	if (clear_map_aligned &&
17267 	    (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
17268 	    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
17269 		map_aligned = FALSE;
17270 	}
17271 	if (map_aligned) {
17272 		assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
17273 		assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
17274 	} else {
17275 		assert(page_aligned(start));
17276 		assert(page_aligned(end));
17277 	}
17278 	assert(start < end);
17279 
17280 	new_entry = vm_map_entry_create(map);
17281 
17282 	new_entry->vme_start = start;
17283 	new_entry->vme_end = end;
17284 
17285 	if (vmk_flags.vmkf_submap) {
17286 		new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
17287 		VME_SUBMAP_SET(new_entry, (vm_map_t)object);
17288 	} else {
17289 		VME_OBJECT_SET(new_entry, object, false, 0);
17290 	}
17291 	VME_OFFSET_SET(new_entry, offset);
17292 	VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);
17293 
17294 	new_entry->map_aligned = map_aligned;
17295 	new_entry->needs_copy = needs_copy;
17296 	new_entry->inheritance = inheritance;
17297 	new_entry->protection = cur_protection;
17298 	new_entry->max_protection = max_protection;
17299 	/*
17300 	 * submap: "use_pmap" means "nested".
17301 	 * default: false.
17302 	 *
17303 	 * object: "use_pmap" means "use pmap accounting" for footprint.
17304 	 * default: true.
17305 	 */
17306 	new_entry->use_pmap = !vmk_flags.vmkf_submap;
17307 	new_entry->no_cache = vmk_flags.vmf_no_cache;
17308 	new_entry->vme_permanent = vmk_flags.vmf_permanent;
17309 	new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
17310 	new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
17311 	new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);
17312 
17313 	if (vmk_flags.vmkf_map_jit) {
17314 		if (!(map->jit_entry_exists) ||
17315 		    VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
17316 			new_entry->used_for_jit = TRUE;
17317 			map->jit_entry_exists = TRUE;
17318 		}
17319 	}
17320 
17321 	/*
17322 	 *	Insert the new entry into the list.
17323 	 */
17324 
17325 	vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
17326 	map->size += end - start;
17327 
17328 	/*
17329 	 *	Update the free space hint and the lookup hint.
17330 	 */
17331 
17332 	SAVE_HINT_MAP_WRITE(map, new_entry);
17333 	return new_entry;
17334 }
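/*
 * Note (descriptive, not part of the original source): when vmkf_map_jit is
 * set, only the first JIT entry in the map is tagged used_for_jit unless
 * VM_MAP_POLICY_ALLOW_MULTIPLE_JIT() permits more; later requests are still
 * inserted, just as ordinary (non-JIT) entries.
 */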
17335 
17336 /*
17337  *	Routine:	vm_map_remap_extract
17338  *
17339  *	Description:	This routine returns a vm_entry list from a map.
17340  */
17341 static kern_return_t
17342 vm_map_remap_extract(
17343 	vm_map_t                map,
17344 	vm_map_offset_t         addr,
17345 	vm_map_size_t           size,
17346 	boolean_t               copy,
17347 	vm_map_copy_t           map_copy,
17348 	vm_prot_t               *cur_protection,   /* IN/OUT */
17349 	vm_prot_t               *max_protection,   /* IN/OUT */
17350 	/* What, no behavior? */
17351 	vm_inherit_t            inheritance,
17352 	vm_map_kernel_flags_t   vmk_flags)
17353 {
17354 	struct vm_map_header   *map_header = &map_copy->cpy_hdr;
17355 	kern_return_t           result;
17356 	vm_map_size_t           mapped_size;
17357 	vm_map_size_t           tmp_size;
17358 	vm_map_entry_t          src_entry;     /* result of last map lookup */
17359 	vm_map_entry_t          new_entry;
17360 	vm_object_offset_t      offset;
17361 	vm_map_offset_t         map_address;
17362 	vm_map_offset_t         src_start;     /* start of entry to map */
17363 	vm_map_offset_t         src_end;       /* end of region to be mapped */
17364 	vm_object_t             object;
17365 	vm_map_version_t        version;
17366 	boolean_t               src_needs_copy;
17367 	boolean_t               new_entry_needs_copy;
17368 	vm_map_entry_t          saved_src_entry;
17369 	boolean_t               src_entry_was_wired;
17370 	vm_prot_t               max_prot_for_prot_copy;
17371 	vm_map_offset_t         effective_page_mask;
17372 	bool                    pageable, same_map;
17373 	boolean_t               vm_remap_legacy;
17374 	vm_prot_t               required_cur_prot, required_max_prot;
17375 	vm_object_t             new_copy_object;     /* vm_object_copy_* result */
17376 	boolean_t               saved_used_for_jit;  /* Saved used_for_jit. */
17377 
17378 	pageable = vmk_flags.vmkf_copy_pageable;
17379 	same_map = vmk_flags.vmkf_copy_same_map;
17380 
17381 	effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
17382 
17383 	assert(map != VM_MAP_NULL);
17384 	assert(size != 0);
17385 	assert(size == vm_map_round_page(size, effective_page_mask));
17386 	assert(inheritance == VM_INHERIT_NONE ||
17387 	    inheritance == VM_INHERIT_COPY ||
17388 	    inheritance == VM_INHERIT_SHARE);
17389 	assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17390 	assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17391 	assert((*cur_protection & *max_protection) == *cur_protection);
17392 
17393 	/*
17394 	 *	Compute start and end of region.
17395 	 */
17396 	src_start = vm_map_trunc_page(addr, effective_page_mask);
17397 	src_end = vm_map_round_page(src_start + size, effective_page_mask);
17398 
17399 	/*
17400 	 *	Initialize map_header.
17401 	 */
17402 	map_header->nentries = 0;
17403 	map_header->entries_pageable = pageable;
17404 //	map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
17405 	map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
17406 	map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
17407 	vm_map_store_init(map_header);
17408 
17409 	if (copy && vmk_flags.vmkf_remap_prot_copy) {
17410 		/*
17411 		 * Special case for vm_map_protect(VM_PROT_COPY):
17412 		 * we want to set the new mappings' max protection to the
17413 		 * specified *max_protection...
17414 		 */
17415 		max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
17416 		/* ... but we want to use the vm_remap() legacy mode */
17417 		*max_protection = VM_PROT_NONE;
17418 		*cur_protection = VM_PROT_NONE;
17419 	} else {
17420 		max_prot_for_prot_copy = VM_PROT_NONE;
17421 	}
17422 
17423 	if (*cur_protection == VM_PROT_NONE &&
17424 	    *max_protection == VM_PROT_NONE) {
17425 		/*
17426 		 * vm_remap() legacy mode:
17427 		 * Extract all memory regions in the specified range and
17428 		 * collect the strictest set of protections allowed on the
17429 		 * entire range, so the caller knows what they can do with
17430 		 * the remapped range.
17431 		 * We start with VM_PROT_ALL and we'll remove the protections
17432 		 * missing from each memory region.
17433 		 */
17434 		vm_remap_legacy = TRUE;
17435 		*cur_protection = VM_PROT_ALL;
17436 		*max_protection = VM_PROT_ALL;
17437 		required_cur_prot = VM_PROT_NONE;
17438 		required_max_prot = VM_PROT_NONE;
17439 	} else {
17440 		/*
17441 		 * vm_remap_new() mode:
17442 		 * Extract all memory regions in the specified range and
17443 		 * ensure that they have at least the protections specified
17444 		 * by the caller via *cur_protection and *max_protection.
17445 		 * The resulting mapping should have these protections.
17446 		 */
17447 		vm_remap_legacy = FALSE;
17448 		if (copy) {
17449 			required_cur_prot = VM_PROT_NONE;
17450 			required_max_prot = VM_PROT_READ;
17451 		} else {
17452 			required_cur_prot = *cur_protection;
17453 			required_max_prot = *max_protection;
17454 		}
17455 	}
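	/*
	 * Note (descriptive, not part of the original source): in legacy mode
	 * the caller passes VM_PROT_NONE for both protections and receives the
	 * intersection of the protections found across the range, whereas in
	 * the newer mode the caller states minimum protections up front and
	 * extraction fails with KERN_PROTECTION_FAILURE if any entry falls
	 * short.  Mapping these modes to the mach_vm_remap() and
	 * mach_vm_remap_new() entry points is an assumption here.
	 */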
17456 
17457 	map_address = 0;
17458 	mapped_size = 0;
17459 	result = KERN_SUCCESS;
17460 
17461 	/*
17462 	 *	The specified source virtual space might correspond to
17463 	 *	multiple map entries, need to loop on them.
17464 	 */
17465 	vm_map_lock(map);
17466 
17467 	if (map->pmap == kernel_pmap) {
17468 		map_copy->is_kernel_range = true;
17469 		map_copy->orig_range = kmem_addr_get_range(addr, size);
17470 #if CONFIG_MAP_RANGES
17471 	} else if (map->uses_user_ranges) {
17472 		map_copy->is_user_range = true;
17473 		map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
17474 #endif /* CONFIG_MAP_RANGES */
17475 	}
17476 
17477 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17478 		/*
17479 		 * This address space uses sub-pages so the range might
17480 		 * not be re-mappable in an address space with larger
17481 		 * pages. Re-assemble any broken-up VM map entries to
17482 		 * improve our chances of making it work.
17483 		 */
17484 		vm_map_simplify_range(map, src_start, src_end);
17485 	}
17486 	while (mapped_size != size) {
17487 		vm_map_size_t   entry_size;
17488 
17489 		/*
17490 		 *	Find the beginning of the region.
17491 		 */
17492 		if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
17493 			result = KERN_INVALID_ADDRESS;
17494 			break;
17495 		}
17496 
17497 		if (src_start < src_entry->vme_start ||
17498 		    (mapped_size && src_start != src_entry->vme_start)) {
17499 			result = KERN_INVALID_ADDRESS;
17500 			break;
17501 		}
17502 
17503 		tmp_size = size - mapped_size;
17504 		if (src_end > src_entry->vme_end) {
17505 			tmp_size -= (src_end - src_entry->vme_end);
17506 		}
17507 
17508 		entry_size = (vm_map_size_t)(src_entry->vme_end -
17509 		    src_entry->vme_start);
17510 
17511 		if (src_entry->is_sub_map &&
17512 		    vmk_flags.vmkf_copy_single_object) {
17513 			vm_map_t submap;
17514 			vm_map_offset_t submap_start;
17515 			vm_map_size_t submap_size;
17516 			boolean_t submap_needs_copy;
17517 
17518 			/*
17519 			 * No check for "required protection" on "src_entry"
17520 			 * because the protections that matter are the ones
17521 			 * on the submap's VM map entry, which will be checked
17522 			 * during the call to vm_map_remap_extract() below.
17523 			 */
17524 			submap_size = src_entry->vme_end - src_start;
17525 			if (submap_size > size) {
17526 				submap_size = size;
17527 			}
17528 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17529 			submap = VME_SUBMAP(src_entry);
17530 			if (copy) {
17531 				/*
17532 				 * The caller wants a copy-on-write re-mapping,
17533 				 * so let's extract from the submap accordingly.
17534 				 */
17535 				submap_needs_copy = TRUE;
17536 			} else if (src_entry->needs_copy) {
17537 				/*
17538 				 * The caller wants a shared re-mapping but the
17539 				 * submap is mapped with "needs_copy", so its
17540 				 * contents can't be shared as is. Extract the
17541 				 * contents of the submap as "copy-on-write".
17542 				 * The re-mapping won't be shared with the
17543 				 * original mapping but this is equivalent to
17544 				 * what happened with the original "remap from
17545 				 * submap" code.
17546 				 * The shared region is mapped "needs_copy", for
17547 				 * example.
17548 				 */
17549 				submap_needs_copy = TRUE;
17550 			} else {
17551 				/*
17552 				 * The caller wants a shared re-mapping and
17553 				 * this mapping can be shared (no "needs_copy"),
17554 				 * so let's extract from the submap accordingly.
17555 				 * Kernel submaps are mapped without
17556 				 * "needs_copy", for example.
17557 				 */
17558 				submap_needs_copy = FALSE;
17559 			}
17560 			vm_map_reference(submap);
17561 			vm_map_unlock(map);
17562 			src_entry = NULL;
17563 			if (vm_remap_legacy) {
17564 				*cur_protection = VM_PROT_NONE;
17565 				*max_protection = VM_PROT_NONE;
17566 			}
17567 
17568 			DTRACE_VM7(remap_submap_recurse,
17569 			    vm_map_t, map,
17570 			    vm_map_offset_t, addr,
17571 			    vm_map_size_t, size,
17572 			    boolean_t, copy,
17573 			    vm_map_offset_t, submap_start,
17574 			    vm_map_size_t, submap_size,
17575 			    boolean_t, submap_needs_copy);
17576 
17577 			result = vm_map_remap_extract(submap,
17578 			    submap_start,
17579 			    submap_size,
17580 			    submap_needs_copy,
17581 			    map_copy,
17582 			    cur_protection,
17583 			    max_protection,
17584 			    inheritance,
17585 			    vmk_flags);
17586 			vm_map_deallocate(submap);
17587 
17588 			if (result == KERN_SUCCESS &&
17589 			    submap_needs_copy &&
17590 			    !copy) {
17591 				/*
17592 				 * We were asked for a "shared"
17593 				 * re-mapping but had to ask for a
17594 				 * "copy-on-write" remapping of the
17595 				 * submap's mapping to honor the
17596 				 * submap's "needs_copy".
17597 				 * We now need to resolve that
17598 				 * pending "copy-on-write" to
17599 				 * get something we can share.
17600 				 */
17601 				vm_map_entry_t copy_entry;
17602 				vm_object_offset_t copy_offset;
17603 				vm_map_size_t copy_size;
17604 				vm_object_t copy_object;
17605 				copy_entry = vm_map_copy_first_entry(map_copy);
17606 				copy_size = copy_entry->vme_end - copy_entry->vme_start;
17607 				copy_object = VME_OBJECT(copy_entry);
17608 				copy_offset = VME_OFFSET(copy_entry);
17609 				if (copy_object == VM_OBJECT_NULL) {
17610 					assert(copy_offset == 0);
17611 					assert(!copy_entry->needs_copy);
17612 					if (copy_entry->max_protection == VM_PROT_NONE) {
17613 						assert(copy_entry->protection == VM_PROT_NONE);
17614 						/* nothing to share */
17615 					} else {
17616 						assert(copy_offset == 0);
17617 						copy_object = vm_object_allocate(copy_size);
17618 						VME_OFFSET_SET(copy_entry, 0);
17619 						VME_OBJECT_SET(copy_entry, copy_object, false, 0);
17620 						assert(copy_entry->use_pmap);
17621 					}
17622 				} else if (copy_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17623 					/* already shareable */
17624 					assert(!copy_entry->needs_copy);
17625 				} else if (copy_entry->needs_copy ||
17626 				    copy_object->shadowed ||
17627 				    (object->internal &&
17628 				    !object->true_share &&
17629 				    !copy_entry->is_shared &&
17630 				    copy_object->vo_size > copy_size)) {
17631 					VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
17632 					assert(copy_entry->use_pmap);
17633 					if (copy_entry->needs_copy) {
17634 						/* already write-protected */
17635 					} else {
17636 						vm_prot_t prot;
17637 						prot = copy_entry->protection & ~VM_PROT_WRITE;
17638 						vm_object_pmap_protect(copy_object,
17639 						    copy_offset,
17640 						    copy_size,
17641 						    PMAP_NULL,
17642 						    PAGE_SIZE,
17643 						    0,
17644 						    prot);
17645 					}
17646 					copy_entry->needs_copy = FALSE;
17647 				}
17648 				copy_object = VME_OBJECT(copy_entry);
17649 				copy_offset = VME_OFFSET(copy_entry);
17650 				if (copy_object &&
17651 				    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
17652 					copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
17653 					copy_object->true_share = TRUE;
17654 				}
17655 			}
17656 
17657 			return result;
17658 		}
17659 
17660 		if (src_entry->is_sub_map) {
17661 			/* protections for submap mapping are irrelevant here */
17662 		} else if (((src_entry->protection & required_cur_prot) !=
17663 		    required_cur_prot) ||
17664 		    ((src_entry->max_protection & required_max_prot) !=
17665 		    required_max_prot)) {
17666 			if (vmk_flags.vmkf_copy_single_object &&
17667 			    mapped_size != 0) {
17668 				/*
17669 				 * Single object extraction.
17670 				 * We can't extract more with the required
17671 				 * protection but we've extracted some, so
17672 				 * stop there and declare success.
17673 				 * The caller should check the size of
17674 				 * the copy entry we've extracted.
17675 				 */
17676 				result = KERN_SUCCESS;
17677 			} else {
17678 				/*
17679 				 * VM range extraction.
17680 				 * Required protection is not available
17681 				 * for this part of the range: fail.
17682 				 */
17683 				result = KERN_PROTECTION_FAILURE;
17684 			}
17685 			break;
17686 		}
17687 
17688 		if (src_entry->is_sub_map) {
17689 			vm_map_t submap;
17690 			vm_map_offset_t submap_start;
17691 			vm_map_size_t submap_size;
17692 			vm_map_copy_t submap_copy;
17693 			vm_prot_t submap_curprot, submap_maxprot;
17694 			boolean_t submap_needs_copy;
17695 
17696 			/*
17697 			 * No check for "required protection" on "src_entry"
17698 			 * because the protections that matter are the ones
17699 			 * on the submap's VM map entry, which will be checked
17700 			 * during the call to vm_map_copy_extract() below.
17701 			 */
17702 			object = VM_OBJECT_NULL;
17703 			submap_copy = VM_MAP_COPY_NULL;
17704 
17705 			/* find equivalent range in the submap */
17706 			submap = VME_SUBMAP(src_entry);
17707 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17708 			submap_size = tmp_size;
17709 			if (copy) {
17710 				/*
17711 				 * The caller wants a copy-on-write re-mapping,
17712 				 * so let's extract from the submap accordingly.
17713 				 */
17714 				submap_needs_copy = TRUE;
17715 			} else if (src_entry->needs_copy) {
17716 				/*
17717 				 * The caller wants a shared re-mapping but the
17718 				 * submap is mapped with "needs_copy", so its
17719 				 * contents can't be shared as is. Extract the
17720 				 * contents of the submap as "copy-on-write".
17721 				 * The re-mapping won't be shared with the
17722 				 * original mapping but this is equivalent to
17723 				 * what happened with the original "remap from
17724 				 * submap" code.
17725 				 * The shared region is mapped "needs_copy", for
17726 				 * example.
17727 				 */
17728 				submap_needs_copy = TRUE;
17729 			} else {
17730 				/*
17731 				 * The caller wants a shared re-mapping and
17732 				 * this mapping can be shared (no "needs_copy"),
17733 				 * so let's extract from the submap accordingly.
17734 				 * Kernel submaps are mapped without
17735 				 * "needs_copy", for example.
17736 				 */
17737 				submap_needs_copy = FALSE;
17738 			}
17739 			/* extra ref to keep submap alive */
17740 			vm_map_reference(submap);
17741 
17742 			DTRACE_VM7(remap_submap_recurse,
17743 			    vm_map_t, map,
17744 			    vm_map_offset_t, addr,
17745 			    vm_map_size_t, size,
17746 			    boolean_t, copy,
17747 			    vm_map_offset_t, submap_start,
17748 			    vm_map_size_t, submap_size,
17749 			    boolean_t, submap_needs_copy);
17750 
17751 			/*
17752 			 * The map can be safely unlocked since we
17753 			 * already hold a reference on the submap.
17754 			 *
17755 			 * No timestamp since we don't care if the map
17756 			 * gets modified while we're down in the submap.
17757 			 * We'll resume the extraction at src_start + tmp_size
17758 			 * anyway.
17759 			 */
17760 			vm_map_unlock(map);
17761 			src_entry = NULL; /* not valid once map is unlocked */
17762 
17763 			if (vm_remap_legacy) {
17764 				submap_curprot = VM_PROT_NONE;
17765 				submap_maxprot = VM_PROT_NONE;
17766 				if (max_prot_for_prot_copy) {
17767 					submap_maxprot = max_prot_for_prot_copy;
17768 				}
17769 			} else {
17770 				assert(!max_prot_for_prot_copy);
17771 				submap_curprot = *cur_protection;
17772 				submap_maxprot = *max_protection;
17773 			}
17774 			result = vm_map_copy_extract(submap,
17775 			    submap_start,
17776 			    submap_size,
17777 			    submap_needs_copy,
17778 			    &submap_copy,
17779 			    &submap_curprot,
17780 			    &submap_maxprot,
17781 			    inheritance,
17782 			    vmk_flags);
17783 
17784 			/* release extra ref on submap */
17785 			vm_map_deallocate(submap);
17786 			submap = VM_MAP_NULL;
17787 
17788 			if (result != KERN_SUCCESS) {
17789 				vm_map_lock(map);
17790 				break;
17791 			}
17792 
17793 			/* transfer submap_copy entries to map_header */
17794 			while (vm_map_copy_first_entry(submap_copy) !=
17795 			    vm_map_copy_to_entry(submap_copy)) {
17796 				vm_map_entry_t copy_entry;
17797 				vm_map_size_t copy_entry_size;
17798 
17799 				copy_entry = vm_map_copy_first_entry(submap_copy);
17800 
17801 				/*
17802 				 * Prevent kernel_object from being exposed to
17803 				 * user space.
17804 				 */
17805 				if (__improbable(copy_entry->vme_kernel_object)) {
17806 					printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17807 					    proc_selfpid(),
17808 					    (get_bsdtask_info(current_task())
17809 					    ? proc_name_address(get_bsdtask_info(current_task()))
17810 					    : "?"));
17811 					DTRACE_VM(extract_kernel_only);
17812 					result = KERN_INVALID_RIGHT;
17813 					vm_map_copy_discard(submap_copy);
17814 					submap_copy = VM_MAP_COPY_NULL;
17815 					vm_map_lock(map);
17816 					break;
17817 				}
17818 
17819 #ifdef __arm64e__
17820 				if (vmk_flags.vmkf_tpro_enforcement_override) {
17821 					copy_entry->used_for_tpro = FALSE;
17822 				}
17823 #endif /* __arm64e__ */
17824 
17825 				vm_map_copy_entry_unlink(submap_copy, copy_entry);
17826 				copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
17827 				copy_entry->vme_start = map_address;
17828 				copy_entry->vme_end = map_address + copy_entry_size;
17829 				map_address += copy_entry_size;
17830 				mapped_size += copy_entry_size;
17831 				src_start += copy_entry_size;
17832 				assert(src_start <= src_end);
17833 				_vm_map_store_entry_link(map_header,
17834 				    map_header->links.prev,
17835 				    copy_entry);
17836 			}
17837 			/* done with submap_copy */
17838 			vm_map_copy_discard(submap_copy);
17839 
17840 			if (vm_remap_legacy) {
17841 				*cur_protection &= submap_curprot;
17842 				*max_protection &= submap_maxprot;
17843 			}
17844 
17845 			/* re-acquire the map lock and continue to next entry */
17846 			vm_map_lock(map);
17847 			continue;
17848 		} else {
17849 			object = VME_OBJECT(src_entry);
17850 
17851 			/*
17852 			 * Prevent kernel_object from being exposed to
17853 			 * user space.
17854 			 */
17855 			if (__improbable(is_kernel_object(object))) {
17856 				printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17857 				    proc_selfpid(),
17858 				    (get_bsdtask_info(current_task())
17859 				    ? proc_name_address(get_bsdtask_info(current_task()))
17860 				    : "?"));
17861 				DTRACE_VM(extract_kernel_only);
17862 				result = KERN_INVALID_RIGHT;
17863 				break;
17864 			}
17865 
17866 			if (src_entry->iokit_acct) {
17867 				/*
17868 				 * This entry uses "IOKit accounting".
17869 				 */
17870 			} else if (object != VM_OBJECT_NULL &&
17871 			    (object->purgable != VM_PURGABLE_DENY ||
17872 			    object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
17873 				/*
17874 				 * Purgeable objects have their own accounting:
17875 				 * no pmap accounting for them.
17876 				 */
17877 				assertf(!src_entry->use_pmap,
17878 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17879 				    map,
17880 				    src_entry,
17881 				    (uint64_t)src_entry->vme_start,
17882 				    (uint64_t)src_entry->vme_end,
17883 				    src_entry->protection,
17884 				    src_entry->max_protection,
17885 				    VME_ALIAS(src_entry));
17886 			} else {
17887 				/*
17888 				 * Not IOKit or purgeable:
17889 				 * must be accounted by pmap stats.
17890 				 */
17891 				assertf(src_entry->use_pmap,
17892 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17893 				    map,
17894 				    src_entry,
17895 				    (uint64_t)src_entry->vme_start,
17896 				    (uint64_t)src_entry->vme_end,
17897 				    src_entry->protection,
17898 				    src_entry->max_protection,
17899 				    VME_ALIAS(src_entry));
17900 			}
17901 
17902 			if (object == VM_OBJECT_NULL) {
17903 				assert(!src_entry->needs_copy);
17904 				if (src_entry->max_protection == VM_PROT_NONE) {
17905 					assert(src_entry->protection == VM_PROT_NONE);
17906 					/*
17907 					 * No VM object and no permissions:
17908 					 * this must be a reserved range with
17909 					 * nothing to share or copy.
17910 					 * There could also be all sorts of
17911 					 * pmap shenanigans within that reserved
17912 					 * range, so let's just copy the map
17913 					 * entry as is to remap a similar
17914 					 * reserved range.
17915 					 */
17916 					offset = 0; /* no object => no offset */
17917 					goto copy_src_entry;
17918 				}
17919 				object = vm_object_allocate(entry_size);
17920 				VME_OFFSET_SET(src_entry, 0);
17921 				VME_OBJECT_SET(src_entry, object, false, 0);
17922 				assert(src_entry->use_pmap);
17923 				assert(!map->mapped_in_other_pmaps);
17924 			} else if (src_entry->wired_count ||
17925 			    object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17926 				/*
17927 				 * A wired memory region should not have
17928 				 * any pending copy-on-write and needs to
17929 				 * keep pointing at the VM object that
17930 				 * contains the wired pages.
17931 				 * If we're sharing this memory (copy=false),
17932 				 * we'll share this VM object.
17933 				 * If we're copying this memory (copy=true),
17934 				 * we'll call vm_object_copy_slowly() below
17935 				 * and use the new VM object for the remapping.
17936 				 *
17937 				 * Or, we are already using an asymmetric
17938 				 * copy, and therefore we already have
17939 				 * the right object.
17940 				 */
17941 				assert(!src_entry->needs_copy);
17942 			} else if (src_entry->needs_copy || object->shadowed ||
17943 			    (object->internal && !object->true_share &&
17944 			    !src_entry->is_shared &&
17945 			    object->vo_size > entry_size)) {
17946 				bool is_writable;
17947 
17948 				VME_OBJECT_SHADOW(src_entry, entry_size,
17949 				    vm_map_always_shadow(map));
17950 				assert(src_entry->use_pmap);
17951 
17952 				is_writable = false;
17953 				if (src_entry->protection & VM_PROT_WRITE) {
17954 					is_writable = true;
17955 #if __arm64e__
17956 				} else if (src_entry->used_for_tpro) {
17957 					is_writable = true;
17958 #endif /* __arm64e__ */
17959 				}
17960 				if (!src_entry->needs_copy && is_writable) {
17961 					vm_prot_t prot;
17962 
17963 					if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
17964 						panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
17965 						    __FUNCTION__,
17966 						    map, map->pmap,
17967 						    src_entry,
17968 						    (uint64_t)src_entry->vme_start,
17969 						    (uint64_t)src_entry->vme_end,
17970 						    src_entry->protection);
17971 					}
17972 
17973 					prot = src_entry->protection & ~VM_PROT_WRITE;
17974 
17975 					if (override_nx(map,
17976 					    VME_ALIAS(src_entry))
17977 					    && prot) {
17978 						prot |= VM_PROT_EXECUTE;
17979 					}
17980 
17981 					if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
17982 						panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
17983 						    __FUNCTION__,
17984 						    map, map->pmap,
17985 						    src_entry,
17986 						    (uint64_t)src_entry->vme_start,
17987 						    (uint64_t)src_entry->vme_end,
17988 						    prot);
17989 					}
17990 
17991 					if (map->mapped_in_other_pmaps) {
17992 						vm_object_pmap_protect(
17993 							VME_OBJECT(src_entry),
17994 							VME_OFFSET(src_entry),
17995 							entry_size,
17996 							PMAP_NULL,
17997 							PAGE_SIZE,
17998 							src_entry->vme_start,
17999 							prot);
18000 #if MACH_ASSERT
18001 					} else if (__improbable(map->pmap == PMAP_NULL)) {
18002 						extern boolean_t vm_tests_in_progress;
18003 						assert(vm_tests_in_progress);
18004 						/*
18005 						 * Some VM tests (in vm_tests.c)
18006 						 * sometimes want to use a VM
18007 						 * map without a pmap.
18008 						 * Otherwise, this should never
18009 						 * happen.
18010 						 */
18011 #endif /* MACH_ASSERT */
18012 					} else {
18013 						pmap_protect(vm_map_pmap(map),
18014 						    src_entry->vme_start,
18015 						    src_entry->vme_end,
18016 						    prot);
18017 					}
18018 				}
18019 
18020 				object = VME_OBJECT(src_entry);
18021 				src_entry->needs_copy = FALSE;
18022 			}
18023 
18024 
18025 			vm_object_lock(object);
18026 			vm_object_reference_locked(object); /* object ref. for new entry */
18027 			assert(!src_entry->needs_copy);
18028 			if (object->copy_strategy ==
18029 			    MEMORY_OBJECT_COPY_SYMMETRIC) {
18030 				/*
18031 				 * If we want to share this object (copy==0),
18032 				 * it needs to be COPY_DELAY.
18033 				 * If we want to copy this object (copy==1),
18034 				 * we can't just set "needs_copy" on our side
18035 				 * and expect the other side to do the same
18036 				 * (symmetrically), so we can't let the object
18037 				 * stay COPY_SYMMETRIC.
18038 				 * So we always switch from COPY_SYMMETRIC to
18039 				 * COPY_DELAY.
18040 				 */
18041 				object->copy_strategy =
18042 				    MEMORY_OBJECT_COPY_DELAY;
18043 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
18044 			}
18045 			vm_object_unlock(object);
18046 		}
18047 
18048 		offset = (VME_OFFSET(src_entry) +
18049 		    (src_start - src_entry->vme_start));
18050 
18051 copy_src_entry:
18052 		new_entry = _vm_map_entry_create(map_header);
18053 		vm_map_entry_copy(map, new_entry, src_entry);
18054 		if (new_entry->is_sub_map) {
18055 			/* clr address space specifics */
18056 			new_entry->use_pmap = FALSE;
18057 		} else if (copy) {
18058 			/*
18059 			 * We're dealing with a copy-on-write operation,
18060 			 * so the resulting mapping should not inherit the
18061 			 * original mapping's accounting settings.
18062 			 * "use_pmap" should be reset to its default (TRUE)
18063 			 * so that the new mapping gets accounted for in
18064 			 * the task's memory footprint.
18065 			 */
18066 			new_entry->use_pmap = TRUE;
18067 		}
18068 		/* "iokit_acct" was cleared in vm_map_entry_copy() */
18069 		assert(!new_entry->iokit_acct);
18070 
18071 		new_entry->map_aligned = FALSE;
18072 
18073 		new_entry->vme_start = map_address;
18074 		new_entry->vme_end = map_address + tmp_size;
18075 		assert(new_entry->vme_start < new_entry->vme_end);
18076 		if (copy && vmk_flags.vmkf_remap_prot_copy) {
18077 			/* security: keep "permanent" and "csm_associated" */
18078 			new_entry->vme_permanent = src_entry->vme_permanent;
18079 			new_entry->csm_associated = src_entry->csm_associated;
18080 			/*
18081 			 * Remapping for vm_map_protect(VM_PROT_COPY)
18082 			 * to convert a read-only mapping into a
18083 			 * copy-on-write version of itself but
18084 			 * with write access:
18085 			 * keep the original inheritance but let's not
18086 			 * add VM_PROT_WRITE to the max protection yet
18087 			 * since we want to do more security checks against
18088 			 * the target map.
18089 			 */
18090 			new_entry->inheritance = src_entry->inheritance;
18091 			new_entry->protection &= max_prot_for_prot_copy;
18092 		} else {
18093 			new_entry->inheritance = inheritance;
18094 			if (!vm_remap_legacy) {
18095 				new_entry->protection = *cur_protection;
18096 				new_entry->max_protection = *max_protection;
18097 			}
18098 		}
18099 #ifdef __arm64e__
18100 		if (copy && vmk_flags.vmkf_tpro_enforcement_override) {
18101 			new_entry->used_for_tpro = FALSE;
18102 		}
18103 #endif /* __arm64e__ */
18104 		VME_OFFSET_SET(new_entry, offset);
18105 
18106 		/*
18107 		 * The new region has to be copied now if required.
18108 		 */
18109 RestartCopy:
18110 		if (!copy) {
18111 			if (src_entry->used_for_jit == TRUE) {
18112 				if (same_map) {
18113 				} else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
18114 					/*
18115 					 * Cannot allow an entry describing a JIT
18116 					 * region to be shared across address spaces.
18117 					 */
18118 					result = KERN_INVALID_ARGUMENT;
18119 					vm_object_deallocate(object);
18120 					vm_map_entry_dispose(new_entry);
18121 					new_entry = VM_MAP_ENTRY_NULL;
18122 					break;
18123 				}
18124 			}
18125 
18126 			src_entry->is_shared = TRUE;
18127 			new_entry->is_shared = TRUE;
18128 			if (!(new_entry->is_sub_map)) {
18129 				new_entry->needs_copy = FALSE;
18130 			}
18131 		} else if (src_entry->is_sub_map) {
18132 			/* make this a COW sub_map if not already */
18133 			assert(new_entry->wired_count == 0);
18134 			new_entry->needs_copy = TRUE;
18135 			object = VM_OBJECT_NULL;
18136 		} else if (src_entry->wired_count == 0 &&
18137 		    !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
18138 		    vm_object_copy_quickly(VME_OBJECT(new_entry),
18139 		    VME_OFFSET(new_entry),
18140 		    (new_entry->vme_end -
18141 		    new_entry->vme_start),
18142 		    &src_needs_copy,
18143 		    &new_entry_needs_copy)) {
18144 			new_entry->needs_copy = new_entry_needs_copy;
18145 			new_entry->is_shared = FALSE;
18146 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18147 
18148 			/*
18149 			 * Handle copy_on_write semantics.
18150 			 */
18151 			if (src_needs_copy && !src_entry->needs_copy) {
18152 				vm_prot_t prot;
18153 
18154 				if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18155 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18156 					    __FUNCTION__,
18157 					    map, map->pmap, src_entry,
18158 					    (uint64_t)src_entry->vme_start,
18159 					    (uint64_t)src_entry->vme_end,
18160 					    src_entry->protection);
18161 				}
18162 
18163 				prot = src_entry->protection & ~VM_PROT_WRITE;
18164 
18165 				if (override_nx(map,
18166 				    VME_ALIAS(src_entry))
18167 				    && prot) {
18168 					prot |= VM_PROT_EXECUTE;
18169 				}
18170 
18171 				if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18172 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18173 					    __FUNCTION__,
18174 					    map, map->pmap, src_entry,
18175 					    (uint64_t)src_entry->vme_start,
18176 					    (uint64_t)src_entry->vme_end,
18177 					    prot);
18178 				}
18179 
18180 				vm_object_pmap_protect(object,
18181 				    offset,
18182 				    entry_size,
18183 				    ((src_entry->is_shared
18184 				    || map->mapped_in_other_pmaps) ?
18185 				    PMAP_NULL : map->pmap),
18186 				    VM_MAP_PAGE_SIZE(map),
18187 				    src_entry->vme_start,
18188 				    prot);
18189 
18190 				assert(src_entry->wired_count == 0);
18191 				src_entry->needs_copy = TRUE;
18192 			}
18193 			/*
18194 			 * Throw away the old object reference of the new entry.
18195 			 */
18196 			vm_object_deallocate(object);
18197 		} else {
18198 			new_entry->is_shared = FALSE;
18199 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18200 
18201 			src_entry_was_wired = (src_entry->wired_count > 0);
18202 			saved_src_entry = src_entry;
18203 			src_entry = VM_MAP_ENTRY_NULL;
18204 
18205 			/*
18206 			 * The map can be safely unlocked since we
18207 			 * already hold a reference on the object.
18208 			 *
18209 			 * Record the timestamp of the map for later
18210 			 * verification, and unlock the map.
18211 			 */
18212 			version.main_timestamp = map->timestamp;
18213 			vm_map_unlock(map);     /* Increments timestamp once! */
18214 
18215 			/*
18216 			 * Perform the copy.
18217 			 */
18218 			if (src_entry_was_wired > 0 ||
18219 			    (debug4k_no_cow_copyin &&
18220 			    VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
18221 				vm_object_lock(object);
18222 				result = vm_object_copy_slowly(
18223 					object,
18224 					offset,
18225 					(new_entry->vme_end -
18226 					new_entry->vme_start),
18227 					THREAD_UNINT,
18228 					&new_copy_object);
18229 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18230 				saved_used_for_jit = new_entry->used_for_jit;
18231 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18232 				new_entry->used_for_jit = saved_used_for_jit;
18233 				VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
18234 				new_entry->needs_copy = FALSE;
18235 			} else {
18236 				vm_object_offset_t new_offset;
18237 
18238 				new_offset = VME_OFFSET(new_entry);
18239 				result = vm_object_copy_strategically(
18240 					object,
18241 					offset,
18242 					(new_entry->vme_end -
18243 					new_entry->vme_start),
18244 					false, /* forking */
18245 					&new_copy_object,
18246 					&new_offset,
18247 					&new_entry_needs_copy);
18248 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18249 				saved_used_for_jit = new_entry->used_for_jit;
18250 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18251 				new_entry->used_for_jit = saved_used_for_jit;
18252 				if (new_offset != VME_OFFSET(new_entry)) {
18253 					VME_OFFSET_SET(new_entry, new_offset);
18254 				}
18255 
18256 				new_entry->needs_copy = new_entry_needs_copy;
18257 			}
18258 
18259 			/*
18260 			 * Throw away the old object reference of the new entry.
18261 			 */
18262 			vm_object_deallocate(object);
18263 
18264 			if (result != KERN_SUCCESS &&
18265 			    result != KERN_MEMORY_RESTART_COPY) {
18266 				vm_map_entry_dispose(new_entry);
18267 				vm_map_lock(map);
18268 				break;
18269 			}
18270 
18271 			/*
18272 			 * Verify that the map has not substantially
18273 			 * changed while the copy was being made.
18274 			 */
18275 
18276 			vm_map_lock(map);
18277 			if (version.main_timestamp + 1 != map->timestamp) {
18278 				/*
18279 				 * Simple version comparison failed.
18280 				 *
18281 				 * Retry the lookup and verify that the
18282 				 * same object/offset are still present.
18283 				 */
18284 				saved_src_entry = VM_MAP_ENTRY_NULL;
18285 				vm_object_deallocate(VME_OBJECT(new_entry));
18286 				vm_map_entry_dispose(new_entry);
18287 				if (result == KERN_MEMORY_RESTART_COPY) {
18288 					result = KERN_SUCCESS;
18289 				}
18290 				continue;
18291 			}
18292 			/* map hasn't changed: src_entry is still valid */
18293 			src_entry = saved_src_entry;
18294 			saved_src_entry = VM_MAP_ENTRY_NULL;
18295 
18296 			if (result == KERN_MEMORY_RESTART_COPY) {
18297 				vm_object_reference(object);
18298 				goto RestartCopy;
18299 			}
18300 		}
18301 
18302 		_vm_map_store_entry_link(map_header,
18303 		    map_header->links.prev, new_entry);
18304 
18305 		/* protections for submap mapping are irrelevant here */
18306 		if (vm_remap_legacy && !src_entry->is_sub_map) {
18307 			*cur_protection &= src_entry->protection;
18308 			*max_protection &= src_entry->max_protection;
18309 		}
18310 
18311 		map_address += tmp_size;
18312 		mapped_size += tmp_size;
18313 		src_start += tmp_size;
18314 
18315 		if (vmk_flags.vmkf_copy_single_object) {
18316 			if (mapped_size != size) {
18317 				DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
18318 				    map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
18319 				if (src_entry->vme_next != vm_map_to_entry(map) &&
18320 				    src_entry->vme_next->vme_object_value ==
18321 				    src_entry->vme_object_value) {
18322 					/* XXX TODO4K */
18323 					DEBUG4K_ERROR("could have extended copy to next entry...\n");
18324 				}
18325 			}
18326 			break;
18327 		}
18328 	} /* end while */
18329 
18330 	vm_map_unlock(map);
18331 	if (result != KERN_SUCCESS) {
18332 		/*
18333 		 * Free all allocated elements.
18334 		 */
18335 		for (src_entry = map_header->links.next;
18336 		    src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
18337 		    src_entry = new_entry) {
18338 			new_entry = src_entry->vme_next;
18339 			_vm_map_store_entry_unlink(map_header, src_entry, false);
18340 			if (src_entry->is_sub_map) {
18341 				vm_map_deallocate(VME_SUBMAP(src_entry));
18342 			} else {
18343 				vm_object_deallocate(VME_OBJECT(src_entry));
18344 			}
18345 			vm_map_entry_dispose(src_entry);
18346 		}
18347 	}
18348 	return result;
18349 }
18350 
18351 bool
18352 vm_map_is_exotic(
18353 	vm_map_t map)
18354 {
18355 	return VM_MAP_IS_EXOTIC(map);
18356 }
18357 
18358 bool
18359 vm_map_is_alien(
18360 	vm_map_t map)
18361 {
18362 	return VM_MAP_IS_ALIEN(map);
18363 }
18364 
18365 #if XNU_TARGET_OS_OSX
18366 void
18367 vm_map_mark_alien(
18368 	vm_map_t map)
18369 {
18370 	vm_map_lock(map);
18371 	map->is_alien = true;
18372 	vm_map_unlock(map);
18373 }
18374 
18375 void
18376 vm_map_single_jit(
18377 	vm_map_t map)
18378 {
18379 	vm_map_lock(map);
18380 	map->single_jit = true;
18381 	vm_map_unlock(map);
18382 }
18383 #endif /* XNU_TARGET_OS_OSX */
18384 
18385 
18386 /*
18387  * Callers of this function must call vm_map_copy_require on
18388  * previously created vm_map_copy_t or pass a newly created
18389  * one to ensure that it hasn't been forged.
18390  */
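/*
 * Illustrative caller pattern (a sketch, not a verbatim call site in
 * this file): a caller holding a vm_map_copy_t received from elsewhere
 * would validate it first,
 *
 *	vm_map_copy_require(copy_map);
 *	kr = vm_map_copy_to_physcopy(copy_map, target_map);
 *
 * whereas a copy freshly returned by vm_map_copy_extract() can be
 * passed in directly.
 */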
18391 static kern_return_t
18392 vm_map_copy_to_physcopy(
18393 	vm_map_copy_t   copy_map,
18394 	vm_map_t        target_map)
18395 {
18396 	vm_map_size_t           size;
18397 	vm_map_entry_t          entry;
18398 	vm_map_entry_t          new_entry;
18399 	vm_object_t             new_object;
18400 	unsigned int            pmap_flags;
18401 	pmap_t                  new_pmap;
18402 	vm_map_t                new_map;
18403 	vm_map_address_t        src_start, src_end, src_cur;
18404 	vm_map_address_t        dst_start, dst_end, dst_cur;
18405 	kern_return_t           kr;
18406 	void                    *kbuf;
18407 
18408 	/*
18409 	 * Perform the equivalent of vm_allocate() and memcpy().
18410 	 * Replace the mappings in "copy_map" with the newly allocated mapping.
18411 	 */
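	/*
	 * In outline (matching the steps below):
	 *   1. create a temporary 4K-page pmap and a pageable map on it,
	 *   2. allocate a new VM object and map entry sized for the copy,
	 *   3. vm_map_copyout_internal() "copy_map" into the temporary map
	 *      and vm_map_enter() the new object beside it,
	 *   4. copy the data page by page with copyinmap()/copyoutmap(),
	 *   5. destroy the temporary map and replace "copy_map"'s entries
	 *      with the single new entry at the target map's page size.
	 */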
18412 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18413 
18414 	assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
18415 
18416 	/* create a new pmap to map "copy_map" */
18417 	pmap_flags = 0;
18418 	assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
18419 #if PMAP_CREATE_FORCE_4K_PAGES
18420 	pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
18421 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
18422 	pmap_flags |= PMAP_CREATE_64BIT;
18423 	new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
18424 	if (new_pmap == NULL) {
18425 		return KERN_RESOURCE_SHORTAGE;
18426 	}
18427 
18428 	/* allocate new VM object */
18429 	size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
18430 	new_object = vm_object_allocate(size);
18431 	assert(new_object);
18432 
18433 	/* allocate new VM map entry */
18434 	new_entry = vm_map_copy_entry_create(copy_map);
18435 	assert(new_entry);
18436 
18437 	/* finish initializing new VM map entry */
18438 	new_entry->protection = VM_PROT_DEFAULT;
18439 	new_entry->max_protection = VM_PROT_DEFAULT;
18440 	new_entry->use_pmap = TRUE;
18441 
18442 	/* make new VM map entry point to new VM object */
18443 	new_entry->vme_start = 0;
18444 	new_entry->vme_end = size;
18445 	VME_OBJECT_SET(new_entry, new_object, false, 0);
18446 	VME_OFFSET_SET(new_entry, 0);
18447 
18448 	/* create a new pageable VM map to map "copy_map" */
18449 	new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
18450 	    VM_MAP_CREATE_PAGEABLE);
18451 	assert(new_map);
18452 	vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
18453 
18454 	/* map "copy_map" in the new VM map */
18455 	src_start = 0;
18456 	kr = vm_map_copyout_internal(
18457 		new_map,
18458 		&src_start,
18459 		copy_map,
18460 		copy_map->size,
18461 		FALSE, /* consume_on_success */
18462 		VM_PROT_DEFAULT,
18463 		VM_PROT_DEFAULT,
18464 		VM_INHERIT_DEFAULT);
18465 	assert(kr == KERN_SUCCESS);
18466 	src_end = src_start + copy_map->size;
18467 
18468 	/* map "new_object" in the new VM map */
18469 	vm_object_reference(new_object);
18470 	dst_start = 0;
18471 	kr = vm_map_enter(new_map,
18472 	    &dst_start,
18473 	    size,
18474 	    0,               /* mask */
18475 	    VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
18476 	    new_object,
18477 	    0,               /* offset */
18478 	    FALSE,               /* needs copy */
18479 	    VM_PROT_DEFAULT,
18480 	    VM_PROT_DEFAULT,
18481 	    VM_INHERIT_DEFAULT);
18482 	assert(kr == KERN_SUCCESS);
18483 	dst_end = dst_start + size;
18484 
18485 	/* get a kernel buffer */
18486 	kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
18487 
18488 	/* physically copy "copy_map" mappings to new VM object */
18489 	for (src_cur = src_start, dst_cur = dst_start;
18490 	    src_cur < src_end;
18491 	    src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
18492 		vm_size_t bytes;
18493 
18494 		bytes = PAGE_SIZE;
18495 		if (src_cur + PAGE_SIZE > src_end) {
18496 			/* partial copy for last page */
18497 			bytes = src_end - src_cur;
18498 			assert(bytes > 0 && bytes < PAGE_SIZE);
18499 			/* rest of dst page should be zero-filled */
18500 		}
18501 		/* get bytes from src mapping */
18502 		kr = copyinmap(new_map, src_cur, kbuf, bytes);
18503 		if (kr != KERN_SUCCESS) {
18504 			DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
18505 		}
18506 		/* put bytes in dst mapping */
18507 		assert(dst_cur < dst_end);
18508 		assert(dst_cur + bytes <= dst_end);
18509 		kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
18510 		if (kr != KERN_SUCCESS) {
18511 			DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
18512 		}
18513 	}
18514 
18515 	/* free kernel buffer */
18516 	kfree_data(kbuf, PAGE_SIZE);
18517 
18518 	/* destroy new map */
18519 	vm_map_destroy(new_map);
18520 	new_map = VM_MAP_NULL;
18521 
18522 	/* dispose of the old map entries in "copy_map" */
18523 	while (vm_map_copy_first_entry(copy_map) !=
18524 	    vm_map_copy_to_entry(copy_map)) {
18525 		entry = vm_map_copy_first_entry(copy_map);
18526 		vm_map_copy_entry_unlink(copy_map, entry);
18527 		if (entry->is_sub_map) {
18528 			vm_map_deallocate(VME_SUBMAP(entry));
18529 		} else {
18530 			vm_object_deallocate(VME_OBJECT(entry));
18531 		}
18532 		vm_map_copy_entry_dispose(entry);
18533 	}
18534 
18535 	/* change "copy_map"'s page_size to match "target_map" */
18536 	copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18537 	copy_map->offset = 0;
18538 	copy_map->size = size;
18539 
18540 	/* insert new map entry in "copy_map" */
18541 	assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
18542 	vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
18543 
18544 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18545 	return KERN_SUCCESS;
18546 }
18547 
18548 void
18549 vm_map_copy_adjust_get_target_copy_map(
18550 	vm_map_copy_t   copy_map,
18551 	vm_map_copy_t   *target_copy_map_p);
18552 void
18553 vm_map_copy_adjust_get_target_copy_map(
18554 	vm_map_copy_t   copy_map,
18555 	vm_map_copy_t   *target_copy_map_p)
18556 {
18557 	vm_map_copy_t   target_copy_map;
18558 	vm_map_entry_t  entry, target_entry;
18559 
18560 	if (*target_copy_map_p != VM_MAP_COPY_NULL) {
18561 		/* the caller already has a "target_copy_map": use it */
18562 		return;
18563 	}
18564 
18565 	/* the caller wants us to create a new copy of "copy_map" */
18566 	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18567 	target_copy_map = vm_map_copy_allocate(copy_map->type);
18568 	target_copy_map->offset = copy_map->offset;
18569 	target_copy_map->size = copy_map->size;
18570 	target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
18571 	for (entry = vm_map_copy_first_entry(copy_map);
18572 	    entry != vm_map_copy_to_entry(copy_map);
18573 	    entry = entry->vme_next) {
18574 		target_entry = vm_map_copy_entry_create(target_copy_map);
18575 		vm_map_entry_copy_full(target_entry, entry);
18576 		if (target_entry->is_sub_map) {
18577 			vm_map_reference(VME_SUBMAP(target_entry));
18578 		} else {
18579 			vm_object_reference(VME_OBJECT(target_entry));
18580 		}
18581 		vm_map_copy_entry_link(
18582 			target_copy_map,
18583 			vm_map_copy_last_entry(target_copy_map),
18584 			target_entry);
18585 	}
18586 	entry = VM_MAP_ENTRY_NULL;
18587 	*target_copy_map_p = target_copy_map;
18588 }
18589 
18590 /*
18591  * Callers of this function must call vm_map_copy_require on
18592  * previously created vm_map_copy_t or pass a newly created
18593  * one to ensure that it hasn't been forged.
18594  */
18595 static void
18596 vm_map_copy_trim(
18597 	vm_map_copy_t   copy_map,
18598 	uint16_t        new_page_shift,
18599 	vm_map_offset_t trim_start,
18600 	vm_map_offset_t trim_end)
18601 {
18602 	uint16_t        copy_page_shift;
18603 	vm_map_entry_t  entry, next_entry;
18604 
18605 	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18606 	assert(copy_map->cpy_hdr.nentries > 0);
18607 
18608 	trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
18609 	trim_end += vm_map_copy_first_entry(copy_map)->vme_start;
18610 
18611 	/* use the new page_shift to do the clipping */
18612 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18613 	copy_map->cpy_hdr.page_shift = new_page_shift;
18614 
18615 	for (entry = vm_map_copy_first_entry(copy_map);
18616 	    entry != vm_map_copy_to_entry(copy_map);
18617 	    entry = next_entry) {
18618 		next_entry = entry->vme_next;
18619 		if (entry->vme_end <= trim_start) {
18620 			/* entry fully before trim range: skip */
18621 			continue;
18622 		}
18623 		if (entry->vme_start >= trim_end) {
18624 			/* entry fully after trim range: done */
18625 			break;
18626 		}
18627 		/* clip entry if needed */
18628 		vm_map_copy_clip_start(copy_map, entry, trim_start);
18629 		vm_map_copy_clip_end(copy_map, entry, trim_end);
18630 		/* dispose of entry */
18631 		copy_map->size -= entry->vme_end - entry->vme_start;
18632 		vm_map_copy_entry_unlink(copy_map, entry);
18633 		if (entry->is_sub_map) {
18634 			vm_map_deallocate(VME_SUBMAP(entry));
18635 		} else {
18636 			vm_object_deallocate(VME_OBJECT(entry));
18637 		}
18638 		vm_map_copy_entry_dispose(entry);
18639 		entry = VM_MAP_ENTRY_NULL;
18640 	}
18641 
18642 	/* restore copy_map's original page_shift */
18643 	copy_map->cpy_hdr.page_shift = copy_page_shift;
18644 }
18645 
18646 /*
18647  * Make any necessary adjustments to "copy_map" to allow it to be
18648  * mapped into "target_map".
18649  * If no changes were necessary, "target_copy_map" points to the
18650  * untouched "copy_map".
18651  * If changes are necessary, changes will be made to "target_copy_map".
18652  * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
18653  * copy the original "copy_map" to it before applying the changes.
18654  * The caller should discard "target_copy_map" if it's not the same as
18655  * the original "copy_map".
18656  */
18657 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
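/*
 * Typical usage (a sketch; the local names are illustrative only):
 *
 *	vm_map_copy_t target_copy = VM_MAP_COPY_NULL;
 *	vm_map_offset_t overmap_start, overmap_end, trimmed_start;
 *
 *	kr = vm_map_copy_adjust_to_target(copy_map, offset, size,
 *	    target_map, copy, &target_copy,
 *	    &overmap_start, &overmap_end, &trimmed_start);
 *	...
 *	if (target_copy != copy_map) {
 *		vm_map_copy_discard(target_copy);
 *	}
 *
 * Passing VM_MAP_COPY_NULL lets this routine clone "copy_map" on demand;
 * callers that already have a clone pass it in instead.
 */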
18658 kern_return_t
18659 vm_map_copy_adjust_to_target(
18660 	vm_map_copy_t           src_copy_map,
18661 	vm_map_offset_t         offset,
18662 	vm_map_size_t           size,
18663 	vm_map_t                target_map,
18664 	boolean_t               copy,
18665 	vm_map_copy_t           *target_copy_map_p,
18666 	vm_map_offset_t         *overmap_start_p,
18667 	vm_map_offset_t         *overmap_end_p,
18668 	vm_map_offset_t         *trimmed_start_p)
18669 {
18670 	vm_map_copy_t           copy_map, target_copy_map;
18671 	vm_map_size_t           target_size;
18672 	vm_map_size_t           src_copy_map_size;
18673 	vm_map_size_t           overmap_start, overmap_end;
18674 	int                     misalignments;
18675 	vm_map_entry_t          entry, target_entry;
18676 	vm_map_offset_t         addr_adjustment;
18677 	vm_map_offset_t         new_start, new_end;
18678 	int                     copy_page_mask, target_page_mask;
18679 	uint16_t                copy_page_shift, target_page_shift;
18680 	vm_map_offset_t         trimmed_end;
18681 
18682 	/*
18683 	 * Assert that the vm_map_copy is coming from the right
18684 	 * zone and hasn't been forged
18685 	 */
18686 	vm_map_copy_require(src_copy_map);
18687 	assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18688 
18689 	/*
18690 	 * Start working with "src_copy_map" but we'll switch
18691 	 * to "target_copy_map" as soon as we start making adjustments.
18692 	 */
18693 	copy_map = src_copy_map;
18694 	src_copy_map_size = src_copy_map->size;
18695 
18696 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18697 	copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
18698 	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18699 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
18700 
18701 	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, *target_copy_map_p);
18702 
18703 	target_copy_map = *target_copy_map_p;
18704 	if (target_copy_map != VM_MAP_COPY_NULL) {
18705 		vm_map_copy_require(target_copy_map);
18706 	}
18707 
18708 	if (offset + size > copy_map->size) {
18709 		DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)offset, (uint64_t)size);
18710 		return KERN_INVALID_ARGUMENT;
18711 	}
18712 
18713 	/* trim the end */
18714 	trimmed_end = 0;
18715 	new_end = VM_MAP_ROUND_PAGE(offset + size, target_page_mask);
18716 	if (new_end < copy_map->size) {
18717 		trimmed_end = src_copy_map_size - new_end;
18718 		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
18719 		/* get "target_copy_map" if needed and adjust it */
18720 		vm_map_copy_adjust_get_target_copy_map(copy_map,
18721 		    &target_copy_map);
18722 		copy_map = target_copy_map;
18723 		vm_map_copy_trim(target_copy_map, target_page_shift,
18724 		    new_end, copy_map->size);
18725 	}
18726 
18727 	/* trim the start */
18728 	new_start = VM_MAP_TRUNC_PAGE(offset, target_page_mask);
18729 	if (new_start != 0) {
18730 		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)0, (uint64_t)new_start);
18731 		/* get "target_copy_map" if needed and adjust it */
18732 		vm_map_copy_adjust_get_target_copy_map(copy_map,
18733 		    &target_copy_map);
18734 		copy_map = target_copy_map;
18735 		vm_map_copy_trim(target_copy_map, target_page_shift,
18736 		    0, new_start);
18737 	}
18738 	*trimmed_start_p = new_start;
18739 
18740 	/* target_size starts with what's left after trimming */
18741 	target_size = copy_map->size;
18742 	assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
18743 	    "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
18744 	    (uint64_t)target_size, (uint64_t)src_copy_map_size,
18745 	    (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);
18746 
18747 	/* check for misalignments but don't adjust yet */
18748 	misalignments = 0;
18749 	overmap_start = 0;
18750 	overmap_end = 0;
18751 	if (copy_page_shift < target_page_shift) {
18752 		/*
18753 		 * Remapping from 4K to 16K: check the VM object alignments
18754 		 * throughout the range.
18755 		 * If the start and end of the range are mis-aligned, we can
18756 		 * over-map to re-align, and adjust the "overmap" start/end
18757 		 * and "target_size" of the range accordingly.
18758 		 * If there is any mis-alignment within the range:
18759 		 *     if "copy":
18760 		 *         we can do immediate-copy instead of copy-on-write,
18761 		 *     else:
18762 		 *         no way to remap and share; fail.
18763 		 */
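		/*
		 * Worked example (assuming a 16K target page, mask 0x3fff):
		 * an entry with VME_OFFSET() == 0x1000 has
		 * 0x1000 & 0x3fff != 0, so its start is mis-aligned for the
		 * target map.  If it is the first entry and we're sharing
		 * (!copy), the leading 0x1000 bytes can be over-mapped;
		 * otherwise it counts as a misalignment.
		 */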
18764 		for (entry = vm_map_copy_first_entry(copy_map);
18765 		    entry != vm_map_copy_to_entry(copy_map);
18766 		    entry = entry->vme_next) {
18767 			vm_object_offset_t object_offset_start, object_offset_end;
18768 
18769 			object_offset_start = VME_OFFSET(entry);
18770 			object_offset_end = object_offset_start;
18771 			object_offset_end += entry->vme_end - entry->vme_start;
18772 			if (object_offset_start & target_page_mask) {
18773 				if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
18774 					overmap_start++;
18775 				} else {
18776 					misalignments++;
18777 				}
18778 			}
18779 			if (object_offset_end & target_page_mask) {
18780 				if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
18781 					overmap_end++;
18782 				} else {
18783 					misalignments++;
18784 				}
18785 			}
18786 		}
18787 	}
18788 	entry = VM_MAP_ENTRY_NULL;
18789 
18790 	/* decide how to deal with misalignments */
18791 	assert(overmap_start <= 1);
18792 	assert(overmap_end <= 1);
18793 	if (!overmap_start && !overmap_end && !misalignments) {
18794 		/* copy_map is properly aligned for target_map ... */
18795 		if (*trimmed_start_p) {
18796 			/* ... but we trimmed it, so still need to adjust */
18797 		} else {
18798 			/* ... and we didn't trim anything: we're done */
18799 			if (target_copy_map == VM_MAP_COPY_NULL) {
18800 				target_copy_map = copy_map;
18801 			}
18802 			*target_copy_map_p = target_copy_map;
18803 			*overmap_start_p = 0;
18804 			*overmap_end_p = 0;
18805 			DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18806 			return KERN_SUCCESS;
18807 		}
18808 	} else if (misalignments && !copy) {
18809 		/* can't "share" if misaligned */
18810 		DEBUG4K_ADJUST("unsupported sharing\n");
18811 #if MACH_ASSERT
18812 		if (debug4k_panic_on_misaligned_sharing) {
18813 			panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
18814 		}
18815 #endif /* MACH_ASSERT */
18816 		DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
18817 		return KERN_NOT_SUPPORTED;
18818 	} else {
18819 		/* can't virtual-copy if misaligned (but can physical-copy) */
18820 		DEBUG4K_ADJUST("mis-aligned copying\n");
18821 	}
18822 
18823 	/* get a "target_copy_map" if needed and switch to it */
18824 	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
18825 	copy_map = target_copy_map;
18826 
18827 	if (misalignments && copy) {
18828 		vm_map_size_t target_copy_map_size;
18829 
18830 		/*
18831 		 * Can't do copy-on-write with misaligned mappings.
18832 		 * Replace the mappings with a physical copy of the original
18833 		 * mappings' contents.
18834 		 */
18835 		target_copy_map_size = target_copy_map->size;
18836 		kern_return_t kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
18837 		if (kr != KERN_SUCCESS) {
18838 			return kr;
18839 		}
18840 		*target_copy_map_p = target_copy_map;
18841 		*overmap_start_p = 0;
18842 		*overmap_end_p = target_copy_map->size - target_copy_map_size;
18843 		DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18844 		return KERN_SUCCESS;
18845 	}
18846 
18847 	/* apply the adjustments */
18848 	misalignments = 0;
18849 	overmap_start = 0;
18850 	overmap_end = 0;
18851 	/* remove copy_map->offset, so that everything starts at offset 0 */
18852 	addr_adjustment = copy_map->offset;
18853 	/* also remove whatever we trimmed from the start */
18854 	addr_adjustment += *trimmed_start_p;
18855 	for (target_entry = vm_map_copy_first_entry(target_copy_map);
18856 	    target_entry != vm_map_copy_to_entry(target_copy_map);
18857 	    target_entry = target_entry->vme_next) {
18858 		vm_object_offset_t object_offset_start, object_offset_end;
18859 
18860 		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18861 		object_offset_start = VME_OFFSET(target_entry);
18862 		if (object_offset_start & target_page_mask) {
18863 			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18864 			if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
18865 				/*
18866 				 * start of 1st entry is mis-aligned:
18867 				 * re-adjust by over-mapping.
18868 				 */
18869 				overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
18870 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
18871 				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
18872 			} else {
18873 				misalignments++;
18874 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
18875 				assert(copy);
18876 			}
18877 		}
18878 
18879 		if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
18880 			target_size += overmap_start;
18881 		} else {
18882 			target_entry->vme_start += overmap_start;
18883 		}
18884 		target_entry->vme_end += overmap_start;
18885 
18886 		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
18887 		if (object_offset_end & target_page_mask) {
18888 			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18889 			if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
18890 				/*
18891 				 * end of last entry is mis-aligned: re-adjust by over-mapping.
18892 				 */
18893 				overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
18894 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
18895 				target_entry->vme_end += overmap_end;
18896 				target_size += overmap_end;
18897 			} else {
18898 				misalignments++;
18899 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
18900 				assert(copy);
18901 			}
18902 		}
18903 		target_entry->vme_start -= addr_adjustment;
18904 		target_entry->vme_end -= addr_adjustment;
18905 		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18906 	}
18907 
18908 	target_copy_map->size = target_size;
18909 	target_copy_map->offset += overmap_start;
18910 	target_copy_map->offset -= addr_adjustment;
18911 	target_copy_map->cpy_hdr.page_shift = target_page_shift;
18912 
18913 //	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
18914 //	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
18915 	assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
18916 	assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));
18917 
18918 	*target_copy_map_p = target_copy_map;
18919 	*overmap_start_p = overmap_start;
18920 	*overmap_end_p = overmap_end;
18921 
18922 	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18923 	return KERN_SUCCESS;
18924 }
18925 
18926 kern_return_t
18927 vm_map_range_physical_size(
18928 	vm_map_t         map,
18929 	vm_map_address_t start,
18930 	mach_vm_size_t   size,
18931 	mach_vm_size_t * phys_size)
18932 {
18933 	kern_return_t   kr;
18934 	vm_map_copy_t   copy_map, target_copy_map;
18935 	vm_map_offset_t adjusted_start, adjusted_end;
18936 	vm_map_size_t   adjusted_size;
18937 	vm_prot_t       cur_prot, max_prot;
18938 	vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
18939 	vm_map_kernel_flags_t vmk_flags;
18940 
18941 	if (size == 0) {
18942 		DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
18943 		*phys_size = 0;
18944 		return KERN_SUCCESS;
18945 	}
18946 
18947 	adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
18948 	adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
18949 	if (__improbable(os_add_overflow(start, size, &end) ||
18950 	    adjusted_end <= adjusted_start)) {
18951 		/* wraparound */
18952 		printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
18953 		*phys_size = 0;
18954 		return KERN_INVALID_ARGUMENT;
18955 	}
18956 	if (__improbable(vm_map_range_overflows(map, start, size))) {
18957 		*phys_size = 0;
18958 		return KERN_INVALID_ADDRESS;
18959 	}
18960 	assert(adjusted_end > adjusted_start);
18961 	adjusted_size = adjusted_end - adjusted_start;
18962 	*phys_size = adjusted_size;
18963 	if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
18964 		return KERN_SUCCESS;
18965 	}
18966 	if (start == 0) {
18967 		adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
18968 		adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
18969 		if (__improbable(adjusted_end <= adjusted_start)) {
18970 			/* wraparound */
18971 			printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
18972 			*phys_size = 0;
18973 			return KERN_INVALID_ARGUMENT;
18974 		}
18975 		assert(adjusted_end > adjusted_start);
18976 		adjusted_size = adjusted_end - adjusted_start;
18977 		*phys_size = adjusted_size;
18978 		return KERN_SUCCESS;
18979 	}
18980 
18981 	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
18982 	vmk_flags.vmkf_copy_pageable = TRUE;
18983 	vmk_flags.vmkf_copy_same_map = TRUE;
18984 	assert(adjusted_size != 0);
18985 	cur_prot = VM_PROT_NONE; /* legacy mode */
18986 	max_prot = VM_PROT_NONE; /* legacy mode */
18987 	kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
18988 	    FALSE /* copy */,
18989 	    &copy_map,
18990 	    &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
18991 	    vmk_flags);
18992 	if (kr != KERN_SUCCESS) {
18993 		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
18994 		//assert(0);
18995 		*phys_size = 0;
18996 		return kr;
18997 	}
18998 	assert(copy_map != VM_MAP_COPY_NULL);
18999 	target_copy_map = copy_map;
19000 	DEBUG4K_ADJUST("adjusting...\n");
19001 	kr = vm_map_copy_adjust_to_target(
19002 		copy_map,
19003 		start - adjusted_start, /* offset */
19004 		size, /* size */
19005 		kernel_map,
19006 		FALSE,                          /* copy */
19007 		&target_copy_map,
19008 		&overmap_start,
19009 		&overmap_end,
19010 		&trimmed_start);
19011 	if (kr == KERN_SUCCESS) {
19012 		if (target_copy_map->size != *phys_size) {
19013 			DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
19014 		}
19015 		*phys_size = target_copy_map->size;
19016 	} else {
19017 		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19018 		//assert(0);
19019 		*phys_size = 0;
19020 	}
19021 	vm_map_copy_discard(copy_map);
19022 	copy_map = VM_MAP_COPY_NULL;
19023 
19024 	return kr;
19025 }
19026 
19027 
19028 kern_return_t
19029 memory_entry_check_for_adjustment(
19030 	vm_map_t                        src_map,
19031 	ipc_port_t                      port,
19032 	vm_map_offset_t         *overmap_start,
19033 	vm_map_offset_t         *overmap_end)
19034 {
19035 	kern_return_t kr = KERN_SUCCESS;
19036 	vm_map_copy_t copy_map = VM_MAP_COPY_NULL, target_copy_map = VM_MAP_COPY_NULL;
19037 
19038 	assert(port);
19039 	assertf(ip_kotype(port) == IKOT_NAMED_ENTRY, "Port Type expected: %d...received:%d\n", IKOT_NAMED_ENTRY, ip_kotype(port));
19040 
19041 	vm_named_entry_t        named_entry;
19042 
19043 	named_entry = mach_memory_entry_from_port(port);
19044 	named_entry_lock(named_entry);
19045 	copy_map = named_entry->backing.copy;
19046 	target_copy_map = copy_map;
19047 
19048 	if (src_map && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT) {
19049 		vm_map_offset_t trimmed_start;
19050 
19051 		trimmed_start = 0;
19052 		DEBUG4K_ADJUST("adjusting...\n");
19053 		kr = vm_map_copy_adjust_to_target(
19054 			copy_map,
19055 			0, /* offset */
19056 			copy_map->size, /* size */
19057 			src_map,
19058 			FALSE, /* copy */
19059 			&target_copy_map,
19060 			overmap_start,
19061 			overmap_end,
19062 			&trimmed_start);
19063 		assert(trimmed_start == 0);
19064 	}
19065 	named_entry_unlock(named_entry);
19066 
19067 	return kr;
19068 }
19069 
19070 
19071 /*
19072  *	Routine:	vm_remap
19073  *
19074  *			Map a portion of a task's address space.
19075  *			The mapped region must not overlap more than
19076  *			one VM memory object. Protections and
19077  *			inheritance attributes remain the same
19078  *			as in the original task and are out parameters.
19079  *			Source and target tasks can be identical.
19080  *			Other attributes are identical to those of vm_map().
19081  */
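/*
 * In outline, vm_map_remap():
 *   1. vm_map_copy_extract()s the requested range from "src_map",
 *   2. if the source and target maps use different page sizes,
 *      adjusts the extracted copy with vm_map_copy_adjust_to_target(),
 *   3. allocates (or validates) space in "target_map" with
 *      vm_map_remap_range_allocate(), honoring fixed/anywhere placement
 *      and overwrite semantics,
 *   4. links the copied entries into "target_map" and, if requested,
 *      returns the address of the first byte of data rather than the
 *      base of the first page.
 */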
19082 kern_return_t
19083 vm_map_remap(
19084 	vm_map_t                target_map,
19085 	vm_map_address_t        *address,
19086 	vm_map_size_t           size,
19087 	vm_map_offset_t         mask,
19088 	vm_map_kernel_flags_t   vmk_flags,
19089 	vm_map_t                src_map,
19090 	vm_map_offset_t         memory_address,
19091 	boolean_t               copy,
19092 	vm_prot_t               *cur_protection, /* IN/OUT */
19093 	vm_prot_t               *max_protection, /* IN/OUT */
19094 	vm_inherit_t            inheritance)
19095 {
19096 	kern_return_t           result;
19097 	vm_map_entry_t          entry;
19098 	vm_map_entry_t          insp_entry = VM_MAP_ENTRY_NULL;
19099 	vm_map_entry_t          new_entry;
19100 	vm_map_copy_t           copy_map;
19101 	vm_map_offset_t         offset_in_mapping;
19102 	vm_map_size_t           target_size = 0;
19103 	vm_map_size_t           src_page_mask, target_page_mask;
19104 	vm_map_offset_t         overmap_start, overmap_end, trimmed_start;
19105 	vm_map_offset_t         initial_memory_address;
19106 	vm_map_size_t           initial_size;
19107 	VM_MAP_ZAP_DECLARE(zap_list);
19108 
19109 	if (target_map == VM_MAP_NULL) {
19110 		return KERN_INVALID_ARGUMENT;
19111 	}
19112 
19113 	if (__improbable(vm_map_range_overflows(src_map, memory_address, size))) {
19114 		return KERN_INVALID_ARGUMENT;
19115 	}
19116 
19117 	if (__improbable((*cur_protection & *max_protection) != *cur_protection)) {
19118 		/* cur is more permissive than max */
19119 		return KERN_INVALID_ARGUMENT;
19120 	}
19121 
19122 	initial_memory_address = memory_address;
19123 	initial_size = size;
19124 	src_page_mask = VM_MAP_PAGE_MASK(src_map);
19125 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
19126 
19127 	switch (inheritance) {
19128 	case VM_INHERIT_NONE:
19129 	case VM_INHERIT_COPY:
19130 	case VM_INHERIT_SHARE:
19131 		if (size != 0 && src_map != VM_MAP_NULL) {
19132 			break;
19133 		}
19134 		OS_FALLTHROUGH;
19135 	default:
19136 		return KERN_INVALID_ARGUMENT;
19137 	}
19138 
19139 	if (src_page_mask != target_page_mask) {
19140 		if (copy) {
19141 			DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19142 		} else {
19143 			DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19144 		}
19145 	}
19146 
19147 	/*
19148 	 * If the user is requesting that we return the address of the
19149 	 * first byte of the data (rather than the base of the page),
19150 	 * then we use different rounding semantics: specifically,
19151 	 * we assume that (memory_address, size) describes a region
19152 	 * all of whose pages we must cover, rather than a base to be truncated
19153 	 * down and a size to be added to that base.  So we figure out
19154 	 * the highest page that the requested region includes and make
19155 	 * sure that the size will cover it.
19156 	 *
19157 	 * The key example we're worried about is of the form:
19158 	 *
19159 	 *              memory_address = 0x1ff0, size = 0x20
19160 	 *
19161 	 * With the old semantics, we round down the memory_address to 0x1000
19162 	 * and round up the size to 0x1000, resulting in our covering *only*
19163 	 * page 0x1000.  With the new semantics, we'd realize that the region covers
19164 	 * 0x1ff0-0x2010, and compute a size of 0x2000.  Thus, we cover both page
19165 	 * 0x1000 and page 0x2000 in the region we remap.
19166 	 */
19167 	if (vmk_flags.vmf_return_data_addr) {
19168 		vm_map_offset_t range_start, range_end;
19169 
19170 		range_start = vm_map_trunc_page(memory_address, src_page_mask);
19171 		range_end = vm_map_round_page(memory_address + size, src_page_mask);
19172 		memory_address = range_start;
19173 		size = range_end - range_start;
19174 		offset_in_mapping = initial_memory_address - memory_address;
19175 	} else {
19176 		/*
19177 		 * IMPORTANT:
19178 		 * This legacy code path is broken: for the range mentioned
19179 		 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
19180 		 * two 4k pages, it yields [ memory_address = 0x1000,
19181 		 * size = 0x1000 ], which covers only the first 4k page.
19182 		 * BUT some code unfortunately depends on this bug, so we
19183 		 * can't fix it without breaking something.
19184 		 * New code should get automatically opted into the new
19185 		 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flag.
19186 		 */
19187 		offset_in_mapping = 0;
19188 		memory_address = vm_map_trunc_page(memory_address, src_page_mask);
19189 		size = vm_map_round_page(size, src_page_mask);
19190 		initial_memory_address = memory_address;
19191 		initial_size = size;
19192 	}
19193 
19194 
19195 	if (size == 0) {
19196 		return KERN_INVALID_ARGUMENT;
19197 	}
19198 
19199 	if (vmk_flags.vmf_resilient_media) {
19200 		/* must be copy-on-write to be "media resilient" */
19201 		if (!copy) {
19202 			return KERN_INVALID_ARGUMENT;
19203 		}
19204 	}
19205 
19206 	vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
19207 	vmk_flags.vmkf_copy_same_map = (src_map == target_map);
19208 
19209 	assert(size != 0);
19210 	result = vm_map_copy_extract(src_map,
19211 	    memory_address,
19212 	    size,
19213 	    copy, &copy_map,
19214 	    cur_protection, /* IN/OUT */
19215 	    max_protection, /* IN/OUT */
19216 	    inheritance,
19217 	    vmk_flags);
19218 	if (result != KERN_SUCCESS) {
19219 		return result;
19220 	}
19221 	assert(copy_map != VM_MAP_COPY_NULL);
19222 
19223 	/*
19224 	 * Handle the policy for vm map ranges
19225 	 *
19226 	 * If the maps differ, the target_map policy applies like for vm_map()
19227 	 * For same mapping remaps, we preserve the range.
19228 	 */
19229 	if (vmk_flags.vmkf_copy_same_map) {
19230 		vmk_flags.vmkf_range_id = copy_map->orig_range;
19231 	} else {
19232 		vm_map_kernel_flags_update_range_id(&vmk_flags, target_map);
19233 	}
19234 
19235 	overmap_start = 0;
19236 	overmap_end = 0;
19237 	trimmed_start = 0;
19238 	target_size = size;
19239 	if (src_page_mask != target_page_mask) {
19240 		vm_map_copy_t target_copy_map;
19241 
19242 		target_copy_map = copy_map; /* can modify "copy_map" itself */
19243 		DEBUG4K_ADJUST("adjusting...\n");
19244 		result = vm_map_copy_adjust_to_target(
19245 			copy_map,
19246 			offset_in_mapping, /* offset */
19247 			initial_size,
19248 			target_map,
19249 			copy,
19250 			&target_copy_map,
19251 			&overmap_start,
19252 			&overmap_end,
19253 			&trimmed_start);
19254 		if (result != KERN_SUCCESS) {
19255 			DEBUG4K_COPY("failed to adjust 0x%x\n", result);
19256 			vm_map_copy_discard(copy_map);
19257 			return result;
19258 		}
19259 		if (trimmed_start == 0) {
19260 			/* nothing trimmed: no adjustment needed */
19261 		} else if (trimmed_start >= offset_in_mapping) {
19262 			/* trimmed more than offset_in_mapping: nothing left */
19263 			assert(overmap_start == 0);
19264 			assert(overmap_end == 0);
19265 			offset_in_mapping = 0;
19266 		} else {
19267 			/* trimmed some of offset_in_mapping: adjust */
19268 			assert(overmap_start == 0);
19269 			assert(overmap_end == 0);
19270 			offset_in_mapping -= trimmed_start;
19271 		}
19272 		offset_in_mapping += overmap_start;
19273 		target_size = target_copy_map->size;
19274 	}
19275 
19276 	/*
19277 	 * Allocate/check a range of free virtual address
19278 	 * space for the target
19279 	 */
19280 	*address = vm_map_trunc_page(*address, target_page_mask);
19281 	vm_map_lock(target_map);
19282 	target_size = vm_map_round_page(target_size, target_page_mask);
19283 	result = vm_map_remap_range_allocate(target_map, address,
19284 	    target_size, mask, vmk_flags,
19285 	    &insp_entry, &zap_list);
19286 
19287 	for (entry = vm_map_copy_first_entry(copy_map);
19288 	    entry != vm_map_copy_to_entry(copy_map);
19289 	    entry = new_entry) {
19290 		new_entry = entry->vme_next;
19291 		vm_map_copy_entry_unlink(copy_map, entry);
19292 		if (result == KERN_SUCCESS) {
19293 			if (vmk_flags.vmkf_remap_prot_copy) {
19294 				/*
19295 				 * This vm_map_remap() is for a
19296 				 * vm_protect(VM_PROT_COPY), so the caller
19297 				 * expects to be allowed to add write access
19298 				 * to this new mapping.  This is done by
19299 				 * adding VM_PROT_WRITE to each entry's
19300 				 * max_protection... unless some security
19301 				 * settings disallow it.
19302 				 */
19303 				bool allow_write = false;
19304 				if (entry->vme_permanent) {
19305 					/* immutable mapping... */
19306 					if ((entry->max_protection & VM_PROT_EXECUTE) &&
19307 					    developer_mode_state()) {
19308 						/*
19309 						 * ... but executable and
19310 						 * possibly being debugged,
19311 						 * so let's allow it to become
19312 						 * writable, for breakpoints
19313 						 * and dtrace probes, for
19314 						 * example.
19315 						 */
19316 						allow_write = true;
19317 					} else {
19318 						printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
19319 						    proc_selfpid(),
19320 						    (get_bsdtask_info(current_task())
19321 						    ? proc_name_address(get_bsdtask_info(current_task()))
19322 						    : "?"),
19323 						    (uint64_t)memory_address,
19324 						    (uint64_t)size,
19325 						    entry->protection,
19326 						    entry->max_protection,
19327 						    developer_mode_state());
19328 						DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
19329 						    vm_map_entry_t, entry,
19330 						    vm_map_offset_t, entry->vme_start,
19331 						    vm_map_offset_t, entry->vme_end,
19332 						    vm_prot_t, entry->protection,
19333 						    vm_prot_t, entry->max_protection,
19334 						    int, VME_ALIAS(entry));
19335 					}
19336 				} else {
19337 					allow_write = true;
19338 				}
19339 
19340 				/*
19341 				 * VM_PROT_COPY: allow this mapping to become
19342 				 * writable, unless it was "permanent".
19343 				 */
19344 				if (allow_write) {
19345 					entry->max_protection |= VM_PROT_WRITE;
19346 				}
19347 			}
19348 			if (vmk_flags.vmf_resilient_codesign) {
19349 				/* no codesigning -> read-only access */
19350 				entry->max_protection = VM_PROT_READ;
19351 				entry->protection = VM_PROT_READ;
19352 				entry->vme_resilient_codesign = TRUE;
19353 			}
19354 			entry->vme_start += *address;
19355 			entry->vme_end += *address;
19356 			assert(!entry->map_aligned);
19357 			if (vmk_flags.vmf_resilient_media &&
19358 			    !entry->is_sub_map &&
19359 			    (VME_OBJECT(entry) == VM_OBJECT_NULL ||
19360 			    VME_OBJECT(entry)->internal)) {
19361 				entry->vme_resilient_media = TRUE;
19362 			}
19363 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
19364 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
19365 			assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
19366 			vm_map_store_entry_link(target_map, insp_entry, entry,
19367 			    vmk_flags);
19368 			insp_entry = entry;
19369 		} else {
19370 			if (!entry->is_sub_map) {
19371 				vm_object_deallocate(VME_OBJECT(entry));
19372 			} else {
19373 				vm_map_deallocate(VME_SUBMAP(entry));
19374 			}
19375 			vm_map_copy_entry_dispose(entry);
19376 		}
19377 	}
19378 
19379 	if (vmk_flags.vmf_resilient_codesign) {
19380 		*cur_protection = VM_PROT_READ;
19381 		*max_protection = VM_PROT_READ;
19382 	}
19383 
19384 	if (result == KERN_SUCCESS) {
19385 		target_map->size += target_size;
19386 		SAVE_HINT_MAP_WRITE(target_map, insp_entry);
19387 	}
19388 	vm_map_unlock(target_map);
19389 
19390 	vm_map_zap_dispose(&zap_list);
19391 
19392 	if (result == KERN_SUCCESS && target_map->wiring_required) {
19393 		result = vm_map_wire_kernel(target_map, *address,
19394 		    *address + size, *cur_protection, VM_KERN_MEMORY_MLOCK,
19395 		    TRUE);
19396 	}
19397 
19398 	/*
19399 	 * If requested, return the address of the data pointed to by the
19400 	 * request, rather than the base of the resulting page.
19401 	 */
19402 	if (vmk_flags.vmf_return_data_addr) {
19403 		*address += offset_in_mapping;
19404 	}
19405 
19406 	if (src_page_mask != target_page_mask) {
19407 		DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx  result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)size, copy, target_map, (uint64_t)*address, (uint64_t)offset_in_mapping, result);
19408 	}
19409 	vm_map_copy_discard(copy_map);
19410 	copy_map = VM_MAP_COPY_NULL;
19411 
19412 	return result;
19413 }
19414 
19415 /*
19416  *	Routine:	vm_map_remap_range_allocate
19417  *
19418  *	Description:
19419  *		Allocate a range in the specified virtual address map.
19420  *		Returns the address and the map entry just before the
19421  *		allocated range.
19422  *
19423  *	Map must be locked.
19424  */
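/*
 * For "anywhere" requests this reduces to vm_map_locate_space();
 * for fixed requests it validates the address (mask, map bounds,
 * user/kernel ranges), optionally vm_map_delete()s an existing mapping
 * into "zap_list" when overwriting, and checks that the range is free.
 */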
19425 
19426 static kern_return_t
19427 vm_map_remap_range_allocate(
19428 	vm_map_t                map,
19429 	vm_map_address_t        *address,       /* IN/OUT */
19430 	vm_map_size_t           size,
19431 	vm_map_offset_t         mask,
19432 	vm_map_kernel_flags_t   vmk_flags,
19433 	vm_map_entry_t          *map_entry,     /* OUT */
19434 	vm_map_zap_t            zap_list)
19435 {
19436 	vm_map_entry_t  entry;
19437 	vm_map_offset_t start;
19438 	kern_return_t   kr;
19439 
19440 	start = *address;
19441 
19442 	if (!vmk_flags.vmf_fixed) {
19443 		kr = vm_map_locate_space(map, size, mask, vmk_flags,
19444 		    &start, &entry);
19445 		if (kr != KERN_SUCCESS) {
19446 			return kr;
19447 		}
19448 		*address = start;
19449 	} else {
19450 		vm_map_offset_t effective_min_offset, effective_max_offset;
19451 		vm_map_entry_t  temp_entry;
19452 		vm_map_offset_t end;
19453 
19454 		effective_min_offset = map->min_offset;
19455 		effective_max_offset = map->max_offset;
19456 
19457 		/*
19458 		 *	Verify that:
19459 		 *		the address doesn't itself violate
19460 		 *		the mask requirement.
19461 		 */
19462 
19463 		if ((start & mask) != 0) {
19464 			return KERN_NO_SPACE;
19465 		}
19466 
19467 #if CONFIG_MAP_RANGES
19468 		if (map->uses_user_ranges) {
19469 			struct mach_vm_range r;
19470 
19471 			vm_map_user_range_resolve(map, start, 1, &r);
19472 			if (r.max_address == 0) {
19473 				return KERN_INVALID_ADDRESS;
19474 			}
19475 
19476 			effective_min_offset = r.min_address;
19477 			effective_max_offset = r.max_address;
19478 		}
19479 #endif /* CONFIG_MAP_RANGES */
19480 		if (map == kernel_map) {
19481 			mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
19482 			effective_min_offset = r->min_address;
19483 			effective_max_offset = r->max_address;
19484 		}
19485 
19486 		/*
19487 		 *	...	the address is within bounds
19488 		 */
19489 
19490 		end = start + size;
19491 
19492 		if ((start < effective_min_offset) ||
19493 		    (end > effective_max_offset) ||
19494 		    (start >= end)) {
19495 			return KERN_INVALID_ADDRESS;
19496 		}
19497 
19498 		/*
19499 		 * If we're asked to overwrite whatever was mapped in that
19500 		 * range, first deallocate that range.
19501 		 */
19502 		if (vmk_flags.vmf_overwrite) {
19503 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
19504 
19505 			/*
19506 			 * We use a "zap_list" to avoid having to unlock
19507 			 * the "map" in vm_map_delete(), which would compromise
19508 			 * the atomicity of the "deallocate" and then "remap"
19509 			 * combination.
19510 			 */
19511 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
19512 
19513 			if (vmk_flags.vmkf_overwrite_immutable) {
19514 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
19515 			}
19516 			if (vmk_flags.vmkf_remap_prot_copy) {
19517 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
19518 			}
19519 			kr = vm_map_delete(map, start, end, remove_flags,
19520 			    KMEM_GUARD_NONE, zap_list).kmr_return;
19521 			if (kr != KERN_SUCCESS) {
19522 				/* XXX FBDP restore zap_list? */
19523 				return kr;
19524 			}
19525 		}
19526 
19527 		/*
19528 		 *	...	the starting address isn't allocated
19529 		 */
19530 
19531 		if (vm_map_lookup_entry(map, start, &temp_entry)) {
19532 			return KERN_NO_SPACE;
19533 		}
19534 
19535 		entry = temp_entry;
19536 
19537 		/*
19538 		 *	...	the next region doesn't overlap the
19539 		 *		end point.
19540 		 */
19541 
19542 		if ((entry->vme_next != vm_map_to_entry(map)) &&
19543 		    (entry->vme_next->vme_start < end)) {
19544 			return KERN_NO_SPACE;
19545 		}
19546 	}
19547 	*map_entry = entry;
19548 	return KERN_SUCCESS;
19549 }
19550 
19551 /*
19552  *	vm_map_switch:
19553  *
19554  *	Set the address map for the current thread to the specified map
19555  */
19556 
19557 vm_map_t
19558 vm_map_switch(
19559 	vm_map_t        map)
19560 {
19561 	thread_t        thread = current_thread();
19562 	vm_map_t        oldmap = thread->map;
19563 
19564 
19565 	/*
19566 	 *	Deactivate the current map and activate the requested map
19567 	 */
19568 	mp_disable_preemption();
19569 	PMAP_SWITCH_USER(thread, map, cpu_number());
19570 	mp_enable_preemption();
19571 	return oldmap;
19572 }
19573 
19574 
19575 /*
19576  *	Routine:	vm_map_write_user
19577  *
19578  *	Description:
19579  *		Copy out data from a kernel space into space in the
19580  *		destination map. The space must already exist in the
19581  *		destination map.
19582  *		NOTE:  This routine should only be called by threads
19583  *		which can block on a page fault, i.e. kernel-mode user
19584  *		threads.
19585  *
19586  */
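/*
 * Illustrative use (a sketch; "task" and "ubuf" are hypothetical):
 *
 *	uint32_t value = 0;
 *	kern_return_t kr;
 *
 *	kr = vm_map_write_user(task->map, &value, ubuf, sizeof(value));
 *	if (kr != KERN_SUCCESS) {
 *		// destination not mapped or not writable
 *	}
 *
 * When "map" is not the current map, the routine temporarily switches
 * to it with vm_map_switch() around the copyout().
 */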
19587 kern_return_t
19588 vm_map_write_user(
19589 	vm_map_t                map,
19590 	void                    *src_p,
19591 	vm_map_address_t        dst_addr,
19592 	vm_size_t               size)
19593 {
19594 	kern_return_t   kr = KERN_SUCCESS;
19595 
19596 	if (__improbable(vm_map_range_overflows(map, dst_addr, size))) {
19597 		return KERN_INVALID_ADDRESS;
19598 	}
19599 
19600 	if (current_map() == map) {
19601 		if (copyout(src_p, dst_addr, size)) {
19602 			kr = KERN_INVALID_ADDRESS;
19603 		}
19604 	} else {
19605 		vm_map_t        oldmap;
19606 
19607 		/* take on the identity of the target map while doing */
19608 		/* the transfer */
19609 
19610 		vm_map_reference(map);
19611 		oldmap = vm_map_switch(map);
19612 		if (copyout(src_p, dst_addr, size)) {
19613 			kr = KERN_INVALID_ADDRESS;
19614 		}
19615 		vm_map_switch(oldmap);
19616 		vm_map_deallocate(map);
19617 	}
19618 	return kr;
19619 }
19620 
19621 /*
19622  *	Routine:	vm_map_read_user
19623  *
19624  *	Description:
19625  *		Copy in data from a user space source map into the
19626  *		kernel map. The space must already exist in the
19627  *		kernel map.
19628  *		NOTE:  This routine should only be called by threads
19629  *		which can block on a page fault, i.e. kernel-mode user
19630  *		threads.
19631  *
19632  */
19633 kern_return_t
19634 vm_map_read_user(
19635 	vm_map_t                map,
19636 	vm_map_address_t        src_addr,
19637 	void                    *dst_p,
19638 	vm_size_t               size)
19639 {
19640 	kern_return_t   kr = KERN_SUCCESS;
19641 
19642 	if (__improbable(vm_map_range_overflows(map, src_addr, size))) {
19643 		return KERN_INVALID_ADDRESS;
19644 	}
19645 
19646 	if (current_map() == map) {
19647 		if (copyin(src_addr, dst_p, size)) {
19648 			kr = KERN_INVALID_ADDRESS;
19649 		}
19650 	} else {
19651 		vm_map_t        oldmap;
19652 
19653 		/* take on the identity of the target map while doing */
19654 		/* the transfer */
19655 
19656 		vm_map_reference(map);
19657 		oldmap = vm_map_switch(map);
19658 		if (copyin(src_addr, dst_p, size)) {
19659 			kr = KERN_INVALID_ADDRESS;
19660 		}
19661 		vm_map_switch(oldmap);
19662 		vm_map_deallocate(map);
19663 	}
19664 	return kr;
19665 }
19666 
19667 
19668 /*
19669  *	vm_map_check_protection:
19670  *
19671  *	Assert that the target map allows the specified
19672  *	privilege on the entire address region given.
19673  *	The entire region must be allocated.
19674  */
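/*
 * For example (a sketch), a caller wanting to ensure an entire range is
 * readable and writable before operating on it could use:
 *
 *	if (!vm_map_check_protection(map, start, end,
 *	    VM_PROT_READ | VM_PROT_WRITE)) {
 *		return KERN_PROTECTION_FAILURE;
 *	}
 *
 * Any hole in the range, or any entry lacking the requested protection,
 * makes this return FALSE.
 */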
19675 boolean_t
19676 vm_map_check_protection(vm_map_t map, vm_map_offset_t start,
19677     vm_map_offset_t end, vm_prot_t protection)
19678 {
19679 	vm_map_entry_t entry;
19680 	vm_map_entry_t tmp_entry;
19681 
19682 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
19683 		return FALSE;
19684 	}
19685 
19686 	vm_map_lock(map);
19687 
19688 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
19689 		vm_map_unlock(map);
19690 		return FALSE;
19691 	}
19692 
19693 	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19694 		vm_map_unlock(map);
19695 		return FALSE;
19696 	}
19697 
19698 	entry = tmp_entry;
19699 
19700 	while (start < end) {
19701 		if (entry == vm_map_to_entry(map)) {
19702 			vm_map_unlock(map);
19703 			return FALSE;
19704 		}
19705 
19706 		/*
19707 		 *	No holes allowed!
19708 		 */
19709 
19710 		if (start < entry->vme_start) {
19711 			vm_map_unlock(map);
19712 			return FALSE;
19713 		}
19714 
19715 		/*
19716 		 * Check protection associated with entry.
19717 		 */
19718 
19719 		if ((entry->protection & protection) != protection) {
19720 			vm_map_unlock(map);
19721 			return FALSE;
19722 		}
19723 
19724 		/* go to next entry */
19725 
19726 		start = entry->vme_end;
19727 		entry = entry->vme_next;
19728 	}
19729 	vm_map_unlock(map);
19730 	return TRUE;
19731 }
19732 
19733 kern_return_t
19734 vm_map_purgable_control(
19735 	vm_map_t                map,
19736 	vm_map_offset_t         address,
19737 	vm_purgable_t           control,
19738 	int                     *state)
19739 {
19740 	vm_map_entry_t          entry;
19741 	vm_object_t             object;
19742 	kern_return_t           kr;
19743 	boolean_t               was_nonvolatile;
19744 
19745 	/*
19746 	 * Vet all the input parameters and current type and state of the
19747 	 * underlying object.  Return with an error if anything is amiss.
19748 	 */
19749 	if (map == VM_MAP_NULL) {
19750 		return KERN_INVALID_ARGUMENT;
19751 	}
19752 
19753 	if (control != VM_PURGABLE_SET_STATE &&
19754 	    control != VM_PURGABLE_GET_STATE &&
19755 	    control != VM_PURGABLE_PURGE_ALL &&
19756 	    control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
19757 		return KERN_INVALID_ARGUMENT;
19758 	}
19759 
19760 	if (control == VM_PURGABLE_PURGE_ALL) {
19761 		vm_purgeable_object_purge_all();
19762 		return KERN_SUCCESS;
19763 	}
19764 
19765 	if ((control == VM_PURGABLE_SET_STATE ||
19766 	    control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
19767 	    (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
19768 	    ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
19769 		return KERN_INVALID_ARGUMENT;
19770 	}
19771 
19772 	vm_map_lock_read(map);
19773 
19774 	if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
19775 		/*
19776 		 * Must pass a valid non-submap address.
19777 		 */
19778 		vm_map_unlock_read(map);
19779 		return KERN_INVALID_ADDRESS;
19780 	}
19781 
19782 	if ((entry->protection & VM_PROT_WRITE) == 0 &&
19783 	    control != VM_PURGABLE_GET_STATE) {
19784 		/*
19785 		 * Can't apply purgable controls to something you can't write.
19786 		 */
19787 		vm_map_unlock_read(map);
19788 		return KERN_PROTECTION_FAILURE;
19789 	}
19790 
19791 	object = VME_OBJECT(entry);
19792 	if (object == VM_OBJECT_NULL ||
19793 	    object->purgable == VM_PURGABLE_DENY) {
19794 		/*
19795 		 * Object must already be present and be purgeable.
19796 		 */
19797 		vm_map_unlock_read(map);
19798 		return KERN_INVALID_ARGUMENT;
19799 	}
19800 
19801 	vm_object_lock(object);
19802 
19803 #if 00
19804 	if (VME_OFFSET(entry) != 0 ||
19805 	    entry->vme_end - entry->vme_start != object->vo_size) {
19806 		/*
19807 		 * Can only apply purgable controls to the whole (existing)
19808 		 * object at once.
19809 		 */
19810 		vm_map_unlock_read(map);
19811 		vm_object_unlock(object);
19812 		return KERN_INVALID_ARGUMENT;
19813 	}
19814 #endif
19815 
19816 	assert(!entry->is_sub_map);
19817 	assert(!entry->use_pmap); /* purgeable has its own accounting */
19818 
19819 	vm_map_unlock_read(map);
19820 
19821 	was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);
19822 
19823 	kr = vm_object_purgable_control(object, control, state);
19824 
19825 	if (was_nonvolatile &&
19826 	    object->purgable != VM_PURGABLE_NONVOLATILE &&
19827 	    map->pmap == kernel_pmap) {
19828 #if DEBUG
19829 		object->vo_purgeable_volatilizer = kernel_task;
19830 #endif /* DEBUG */
19831 	}
19832 
19833 	vm_object_unlock(object);
19834 
19835 	return kr;
19836 }
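/*
 * Illustrative sketch (editor's addition): marking a purgeable region
 * volatile and later querying its state through this routine.  "map" and
 * "addr" are assumed to describe an existing purgeable mapping.
 *
 *	kern_return_t   kr;
 *	int             state;
 *
 *	state = VM_PURGABLE_VOLATILE;
 *	kr = vm_map_purgable_control(map, addr, VM_PURGABLE_SET_STATE, &state);
 *
 *	kr = vm_map_purgable_control(map, addr, VM_PURGABLE_GET_STATE, &state);
 *	// "state" now reflects VM_PURGABLE_VOLATILE, or VM_PURGABLE_EMPTY if
 *	// the contents were purged in the meantime.
 */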
19837 
19838 void
19839 vm_map_footprint_query_page_info(
19840 	vm_map_t        map,
19841 	vm_map_entry_t  map_entry,
19842 	vm_map_offset_t curr_s_offset,
19843 	int             *disposition_p)
19844 {
19845 	int             pmap_disp;
19846 	vm_object_t     object = VM_OBJECT_NULL;
19847 	int             disposition;
19848 	int             effective_page_size;
19849 
19850 	vm_map_lock_assert_held(map);
19851 	assert(!map->has_corpse_footprint);
19852 	assert(curr_s_offset >= map_entry->vme_start);
19853 	assert(curr_s_offset < map_entry->vme_end);
19854 
19855 	if (map_entry->is_sub_map) {
19856 		if (!map_entry->use_pmap) {
19857 			/* nested pmap: no footprint */
19858 			*disposition_p = 0;
19859 			return;
19860 		}
19861 	} else {
19862 		object = VME_OBJECT(map_entry);
19863 		if (object == VM_OBJECT_NULL) {
19864 			/* nothing mapped here: no need to ask */
19865 			*disposition_p = 0;
19866 			return;
19867 		}
19868 	}
19869 
19870 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
19871 
19872 	pmap_disp = 0;
19873 
19874 	/*
19875 	 * Query the pmap.
19876 	 */
19877 	pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);
19878 
19879 	/*
19880 	 * Compute this page's disposition.
19881 	 */
19882 	disposition = 0;
19883 
19884 	/* deal with "alternate accounting" first */
19885 	if (!map_entry->is_sub_map &&
19886 	    object->vo_no_footprint) {
19887 		/* does not count in footprint */
19888 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19889 	} else if (!map_entry->is_sub_map &&
19890 	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
19891 	    (object->purgable == VM_PURGABLE_DENY &&
19892 	    object->vo_ledger_tag)) &&
19893 	    VM_OBJECT_OWNER(object) != NULL &&
19894 	    VM_OBJECT_OWNER(object)->map == map) {
19895 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19896 		if ((((curr_s_offset
19897 		    - map_entry->vme_start
19898 		    + VME_OFFSET(map_entry))
19899 		    / effective_page_size) <
19900 		    (object->resident_page_count +
19901 		    vm_compressor_pager_get_count(object->pager)))) {
19902 			/*
19903 			 * Non-volatile purgeable object owned
19904 			 * by this task: report the first
19905 			 * "#resident + #compressed" pages as
19906 			 * "resident" (to show that they
19907 			 * contribute to the footprint) but not
19908 			 * "dirty" (to avoid double-counting
19909 			 * with the fake "non-volatile" region
19910 			 * we'll report at the end of the
19911 			 * address space to account for all
19912 			 * (mapped or not) non-volatile memory
19913 			 * owned by this task.
19914 			 */
19915 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19916 		}
19917 	} else if (!map_entry->is_sub_map &&
19918 	    (object->purgable == VM_PURGABLE_VOLATILE ||
19919 	    object->purgable == VM_PURGABLE_EMPTY) &&
19920 	    VM_OBJECT_OWNER(object) != NULL &&
19921 	    VM_OBJECT_OWNER(object)->map == map) {
19922 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19923 		if ((((curr_s_offset
19924 		    - map_entry->vme_start
19925 		    + VME_OFFSET(map_entry))
19926 		    / effective_page_size) <
19927 		    object->wired_page_count)) {
19928 			/*
19929 			 * Volatile|empty purgeable object owned
19930 			 * by this task: report the first
19931 			 * "#wired" pages as "resident" (to
19932 			 * show that they contribute to the
19933 			 * footprint) but not "dirty" (to avoid
19934 			 * double-counting with the fake
19935 			 * "non-volatile" region we'll report
19936 			 * at the end of the address space to
19937 			 * account for all (mapped or not)
19938 			 * non-volatile memory owned by this
19939 			 * task.
19940 			 */
19941 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19942 		}
19943 	} else if (!map_entry->is_sub_map &&
19944 	    map_entry->iokit_acct &&
19945 	    object->internal &&
19946 	    object->purgable == VM_PURGABLE_DENY) {
19947 		/*
19948 		 * Non-purgeable IOKit memory: phys_footprint
19949 		 * includes the entire virtual mapping.
19950 		 */
19951 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19952 		disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19953 		disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19954 	} else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
19955 	    PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
19956 		/* alternate accounting */
19957 #if __arm64__ && (DEVELOPMENT || DEBUG)
19958 		if (map->pmap->footprint_was_suspended) {
19959 			/*
19960 			 * The assertion below can fail if dyld
19961 			 * suspended footprint accounting
19962 			 * while doing some adjustments to
19963 			 * this page;  the mapping would say
19964 			 * "use pmap accounting" but the page
19965 			 * would be marked "alternate
19966 			 * accounting".
19967 			 */
19968 		} else
19969 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
19970 		{
19971 			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19972 		}
19973 		disposition = 0;
19974 	} else {
19975 		if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
19976 			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19977 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19978 			disposition |= VM_PAGE_QUERY_PAGE_REF;
19979 			if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
19980 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19981 			} else {
19982 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
19983 			}
19984 			if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
19985 				disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
19986 			}
19987 		} else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
19988 			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19989 			disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
19990 		}
19991 	}
19992 
19993 	*disposition_p = disposition;
19994 }
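/*
 * Illustrative sketch (editor's addition): interpreting the disposition
 * bits produced by vm_map_footprint_query_page_info().  "map", "entry" and
 * "offset" are assumptions; the call must be made with the map lock held.
 *
 *	int disp = 0;
 *
 *	vm_map_footprint_query_page_info(map, entry, offset, &disp);
 *	if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
 *		// page counts as resident for footprint purposes
 *	} else if (disp & VM_PAGE_QUERY_PAGE_PAGED_OUT) {
 *		// page is accounted as compressed
 *	}
 */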
19995 
19996 kern_return_t
19997 vm_map_page_query_internal(
19998 	vm_map_t        target_map,
19999 	vm_map_offset_t offset,
20000 	int             *disposition,
20001 	int             *ref_count)
20002 {
20003 	kern_return_t                   kr;
20004 	vm_page_info_basic_data_t       info;
20005 	mach_msg_type_number_t          count;
20006 
20007 	count = VM_PAGE_INFO_BASIC_COUNT;
20008 	kr = vm_map_page_info(target_map,
20009 	    offset,
20010 	    VM_PAGE_INFO_BASIC,
20011 	    (vm_page_info_t) &info,
20012 	    &count);
20013 	if (kr == KERN_SUCCESS) {
20014 		*disposition = info.disposition;
20015 		*ref_count = info.ref_count;
20016 	} else {
20017 		*disposition = 0;
20018 		*ref_count = 0;
20019 	}
20020 
20021 	return kr;
20022 }
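/*
 * Illustrative sketch (editor's addition): querying a single page's
 * disposition through the simplified wrapper above.  "map" and "addr"
 * are assumed names.
 *
 *	int disposition, ref_count;
 *
 *	if (vm_map_page_query_internal(map, addr, &disposition, &ref_count)
 *	    == KERN_SUCCESS &&
 *	    (disposition & VM_PAGE_QUERY_PAGE_PRESENT)) {
 *		// the page is resident
 *	}
 */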
20023 
20024 kern_return_t
20025 vm_map_page_info(
20026 	vm_map_t                map,
20027 	vm_map_offset_t         offset,
20028 	vm_page_info_flavor_t   flavor,
20029 	vm_page_info_t          info,
20030 	mach_msg_type_number_t  *count)
20031 {
20032 	return vm_map_page_range_info_internal(map,
20033 	           offset, /* start of range */
20034 	           (offset + 1), /* this will get rounded in the call to the page boundary */
20035 	           (int)-1, /* effective_page_shift: unspecified */
20036 	           flavor,
20037 	           info,
20038 	           count);
20039 }
20040 
20041 kern_return_t
20042 vm_map_page_range_info_internal(
20043 	vm_map_t                map,
20044 	vm_map_offset_t         start_offset,
20045 	vm_map_offset_t         end_offset,
20046 	int                     effective_page_shift,
20047 	vm_page_info_flavor_t   flavor,
20048 	vm_page_info_t          info,
20049 	mach_msg_type_number_t  *count)
20050 {
20051 	vm_map_entry_t          map_entry = VM_MAP_ENTRY_NULL;
20052 	vm_object_t             object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
20053 	vm_page_t               m = VM_PAGE_NULL;
20054 	kern_return_t           retval = KERN_SUCCESS;
20055 	int                     disposition = 0;
20056 	int                     ref_count = 0;
20057 	int                     depth = 0, info_idx = 0;
20058 	vm_page_info_basic_t    basic_info = 0;
20059 	vm_map_offset_t         offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
20060 	vm_map_offset_t         start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
20061 	boolean_t               do_region_footprint;
20062 	ledger_amount_t         ledger_resident, ledger_compressed;
20063 	int                     effective_page_size;
20064 	vm_map_offset_t         effective_page_mask;
20065 
20066 	switch (flavor) {
20067 	case VM_PAGE_INFO_BASIC:
20068 		if (*count != VM_PAGE_INFO_BASIC_COUNT) {
20069 			/*
20070 			 * The "vm_page_info_basic_data" structure was not
20071 			 * properly padded, so allow the size to be off by
20072 			 * one to maintain backwards binary compatibility...
20073 			 */
20074 			if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
20075 				return KERN_INVALID_ARGUMENT;
20076 			}
20077 		}
20078 		break;
20079 	default:
20080 		return KERN_INVALID_ARGUMENT;
20081 	}
20082 
20083 	if (effective_page_shift == -1) {
20084 		effective_page_shift = vm_self_region_page_shift_safely(map);
20085 		if (effective_page_shift == -1) {
20086 			return KERN_INVALID_ARGUMENT;
20087 		}
20088 	}
20089 	effective_page_size = (1 << effective_page_shift);
20090 	effective_page_mask = effective_page_size - 1;
20091 
20092 	do_region_footprint = task_self_region_footprint();
20093 	disposition = 0;
20094 	ref_count = 0;
20095 	depth = 0;
20096 	info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
20097 	retval = KERN_SUCCESS;
20098 
20099 	if (__improbable(vm_map_range_overflows(map, start_offset, end_offset - start_offset))) {
20100 		return KERN_INVALID_ADDRESS;
20101 	}
20102 
20103 	offset_in_page = start_offset & effective_page_mask;
20104 	start = vm_map_trunc_page(start_offset, effective_page_mask);
20105 	end = vm_map_round_page(end_offset, effective_page_mask);
20106 
20107 	if (end < start) {
20108 		return KERN_INVALID_ARGUMENT;
20109 	}
20110 
20111 	assert((end - start) <= MAX_PAGE_RANGE_QUERY);
20112 
20113 	vm_map_lock_read(map);
20114 
20115 	task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
20116 
20117 	for (curr_s_offset = start; curr_s_offset < end;) {
20118 		/*
20119 		 * New lookup needs reset of these variables.
20120 		 */
20121 		curr_object = object = VM_OBJECT_NULL;
20122 		offset_in_object = 0;
20123 		ref_count = 0;
20124 		depth = 0;
20125 
20126 		if (do_region_footprint &&
20127 		    curr_s_offset >= vm_map_last_entry(map)->vme_end) {
20128 			/*
20129 			 * Request for "footprint" info about a page beyond
20130 			 * the end of address space: this must be for
20131 			 * the fake region vm_map_region_recurse_64()
20132 			 * reported to account for non-volatile purgeable
20133 			 * memory owned by this task.
20134 			 */
20135 			disposition = 0;
20136 
20137 			if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
20138 			    (unsigned) ledger_compressed) {
20139 				/*
20140 				 * We haven't reported all the "non-volatile
20141 				 * compressed" pages yet, so report this fake
20142 				 * page as "compressed".
20143 				 */
20144 				disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20145 			} else {
20146 				/*
20147 				 * We've reported all the non-volatile
20148 				 * compressed pages but not all the non-volatile
20149 				 * pages, so report this fake page as
20150 				 * "resident dirty".
20151 				 */
20152 				disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20153 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20154 				disposition |= VM_PAGE_QUERY_PAGE_REF;
20155 			}
20156 			switch (flavor) {
20157 			case VM_PAGE_INFO_BASIC:
20158 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20159 				basic_info->disposition = disposition;
20160 				basic_info->ref_count = 1;
20161 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20162 				basic_info->offset = 0;
20163 				basic_info->depth = 0;
20164 
20165 				info_idx++;
20166 				break;
20167 			}
20168 			curr_s_offset += effective_page_size;
20169 			continue;
20170 		}
20171 
20172 		/*
20173 		 * First, find the map entry covering "curr_s_offset", going down
20174 		 * submaps if necessary.
20175 		 */
20176 		if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
20177 			/* no entry -> no object -> no page */
20178 
20179 			if (curr_s_offset < vm_map_min(map)) {
20180 				/*
20181 				 * Illegal address that falls below map min.
20182 				 */
20183 				curr_e_offset = MIN(end, vm_map_min(map));
20184 			} else if (curr_s_offset >= vm_map_max(map)) {
20185 				/*
20186 				 * Illegal address that falls on/after map max.
20187 				 */
20188 				curr_e_offset = end;
20189 			} else if (map_entry == vm_map_to_entry(map)) {
20190 				/*
20191 				 * Hit a hole.
20192 				 */
20193 				if (map_entry->vme_next == vm_map_to_entry(map)) {
20194 					/*
20195 					 * Empty map.
20196 					 */
20197 					curr_e_offset = MIN(map->max_offset, end);
20198 				} else {
20199 					/*
20200 					 * Hole at start of the map.
20201 					 */
20202 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20203 				}
20204 			} else {
20205 				if (map_entry->vme_next == vm_map_to_entry(map)) {
20206 					/*
20207 					 * Hole at the end of the map.
20208 					 */
20209 					curr_e_offset = MIN(map->max_offset, end);
20210 				} else {
20211 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20212 				}
20213 			}
20214 
20215 			assert(curr_e_offset >= curr_s_offset);
20216 
20217 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20218 
20219 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20220 
20221 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20222 
20223 			curr_s_offset = curr_e_offset;
20224 
20225 			info_idx += num_pages;
20226 
20227 			continue;
20228 		}
20229 
20230 		/* compute offset from this map entry's start */
20231 		offset_in_object = curr_s_offset - map_entry->vme_start;
20232 
20233 		/* compute offset into this map entry's object (or submap) */
20234 		offset_in_object += VME_OFFSET(map_entry);
20235 
20236 		if (map_entry->is_sub_map) {
20237 			vm_map_t sub_map = VM_MAP_NULL;
20238 			vm_page_info_t submap_info = 0;
20239 			vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
20240 
20241 			range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
20242 
20243 			submap_s_offset = offset_in_object;
20244 			submap_e_offset = submap_s_offset + range_len;
20245 
20246 			sub_map = VME_SUBMAP(map_entry);
20247 
20248 			vm_map_reference(sub_map);
20249 			vm_map_unlock_read(map);
20250 
20251 			submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20252 
20253 			assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
20254 			    "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
20255 
20256 			retval = vm_map_page_range_info_internal(sub_map,
20257 			    submap_s_offset,
20258 			    submap_e_offset,
20259 			    effective_page_shift,
20260 			    VM_PAGE_INFO_BASIC,
20261 			    (vm_page_info_t) submap_info,
20262 			    count);
20263 
20264 			assert(retval == KERN_SUCCESS);
20265 
20266 			vm_map_lock_read(map);
20267 			vm_map_deallocate(sub_map);
20268 
20269 			/* Move the "info" index by the number of pages we inspected.*/
20270 			info_idx += range_len >> effective_page_shift;
20271 
20272 			/* Move our current offset by the size of the range we inspected.*/
20273 			curr_s_offset += range_len;
20274 
20275 			continue;
20276 		}
20277 
20278 		object = VME_OBJECT(map_entry);
20279 
20280 		if (object == VM_OBJECT_NULL) {
20281 			/*
20282 			 * We don't have an object here and, hence,
20283 			 * no pages to inspect. We'll fill up the
20284 			 * info structure appropriately.
20285 			 */
20286 
20287 			curr_e_offset = MIN(map_entry->vme_end, end);
20288 
20289 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20290 
20291 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20292 
20293 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20294 
20295 			curr_s_offset = curr_e_offset;
20296 
20297 			info_idx += num_pages;
20298 
20299 			continue;
20300 		}
20301 
20302 		if (do_region_footprint) {
20303 			disposition = 0;
20304 			if (map->has_corpse_footprint) {
20305 				/*
20306 				 * Query the page info data we saved
20307 				 * while forking the corpse.
20308 				 */
20309 				vm_map_corpse_footprint_query_page_info(
20310 					map,
20311 					curr_s_offset,
20312 					&disposition);
20313 			} else {
20314 				/*
20315 				 * Query the live pmap for footprint info
20316 				 * about this page.
20317 				 */
20318 				vm_map_footprint_query_page_info(
20319 					map,
20320 					map_entry,
20321 					curr_s_offset,
20322 					&disposition);
20323 			}
20324 			switch (flavor) {
20325 			case VM_PAGE_INFO_BASIC:
20326 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20327 				basic_info->disposition = disposition;
20328 				basic_info->ref_count = 1;
20329 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20330 				basic_info->offset = 0;
20331 				basic_info->depth = 0;
20332 
20333 				info_idx++;
20334 				break;
20335 			}
20336 			curr_s_offset += effective_page_size;
20337 			continue;
20338 		}
20339 
20340 		vm_object_reference(object);
20341 		/*
20342 		 * Shared mode -- so we can allow other readers
20343 		 * to grab the lock too.
20344 		 */
20345 		vm_object_lock_shared(object);
20346 
20347 		curr_e_offset = MIN(map_entry->vme_end, end);
20348 
20349 		vm_map_unlock_read(map);
20350 
20351 		map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
20352 
20353 		curr_object = object;
20354 
20355 		for (; curr_s_offset < curr_e_offset;) {
20356 			if (object == curr_object) {
20357 				ref_count = curr_object->ref_count - 1; /* account for our object reference above. */
20358 			} else {
20359 				ref_count = curr_object->ref_count;
20360 			}
20361 
20362 			curr_offset_in_object = offset_in_object;
20363 
20364 			for (;;) {
20365 				m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
20366 
20367 				if (m != VM_PAGE_NULL) {
20368 					disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20369 					break;
20370 				} else {
20371 					if (curr_object->internal &&
20372 					    curr_object->alive &&
20373 					    !curr_object->terminating &&
20374 					    curr_object->pager_ready) {
20375 						if (VM_COMPRESSOR_PAGER_STATE_GET(curr_object, vm_object_trunc_page(curr_offset_in_object))
20376 						    == VM_EXTERNAL_STATE_EXISTS) {
20377 							/* the pager has that page */
20378 							disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20379 							break;
20380 						}
20381 					}
20382 
20383 					/*
20384 					 * Go down the VM object shadow chain until we find the page
20385 					 * we're looking for.
20386 					 */
20387 
20388 					if (curr_object->shadow != VM_OBJECT_NULL) {
20389 						vm_object_t shadow = VM_OBJECT_NULL;
20390 
20391 						curr_offset_in_object += curr_object->vo_shadow_offset;
20392 						shadow = curr_object->shadow;
20393 
20394 						vm_object_lock_shared(shadow);
20395 						vm_object_unlock(curr_object);
20396 
20397 						curr_object = shadow;
20398 						depth++;
20399 						continue;
20400 					} else {
20401 						break;
20402 					}
20403 				}
20404 			}
20405 
20406 			/* The ref_count is not strictly accurate, it measures the number   */
20407 			/* of entities holding a ref on the object, they may not be mapping */
20408 			/* the object or may not be mapping the section holding the         */
20409 /* target page but it's still a ballpark number and though an over- */
20410 			/* count, it picks up the copy-on-write cases                       */
20411 
20412 			/* We could also get a picture of page sharing from pmap_attributes */
20413 /* but this would undercount as only faulted-in mappings would      */
20414 			/* show up.							    */
20415 
20416 			if ((curr_object == object) && curr_object->shadow) {
20417 				disposition |= VM_PAGE_QUERY_PAGE_COPIED;
20418 			}
20419 
20420 			if (!curr_object->internal) {
20421 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20422 			}
20423 
20424 			if (m != VM_PAGE_NULL) {
20425 				if (m->vmp_fictitious) {
20426 					disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
20427 				} else {
20428 					if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
20429 						disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20430 					}
20431 
20432 					if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
20433 						disposition |= VM_PAGE_QUERY_PAGE_REF;
20434 					}
20435 
20436 					if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
20437 						disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
20438 					}
20439 
20440 					/*
20441 					 * XXX TODO4K:
20442 					 * when this routine deals with 4k
20443 					 * pages, check the appropriate CS bit
20444 					 * here.
20445 					 */
20446 					if (m->vmp_cs_validated) {
20447 						disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
20448 					}
20449 					if (m->vmp_cs_tainted) {
20450 						disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
20451 					}
20452 					if (m->vmp_cs_nx) {
20453 						disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
20454 					}
20455 					if (m->vmp_reusable || curr_object->all_reusable) {
20456 						disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20457 					}
20458 				}
20459 			}
20460 
20461 			switch (flavor) {
20462 			case VM_PAGE_INFO_BASIC:
20463 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20464 				basic_info->disposition = disposition;
20465 				basic_info->ref_count = ref_count;
20466 				basic_info->object_id = (vm_object_id_t) (uintptr_t)
20467 				    VM_KERNEL_ADDRHASH(curr_object);
20468 				basic_info->offset =
20469 				    (memory_object_offset_t) curr_offset_in_object + offset_in_page;
20470 				basic_info->depth = depth;
20471 
20472 				info_idx++;
20473 				break;
20474 			}
20475 
20476 			disposition = 0;
20477 			offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
20478 
20479 			/*
20480 			 * Move to next offset in the range and in our object.
20481 			 */
20482 			curr_s_offset += effective_page_size;
20483 			offset_in_object += effective_page_size;
20484 			curr_offset_in_object = offset_in_object;
20485 
20486 			if (curr_object != object) {
20487 				vm_object_unlock(curr_object);
20488 
20489 				curr_object = object;
20490 
20491 				vm_object_lock_shared(curr_object);
20492 			} else {
20493 				vm_object_lock_yield_shared(curr_object);
20494 			}
20495 		}
20496 
20497 		vm_object_unlock(curr_object);
20498 		vm_object_deallocate(curr_object);
20499 
20500 		vm_map_lock_read(map);
20501 	}
20502 
20503 	vm_map_unlock_read(map);
20504 	return retval;
20505 }
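/*
 * Illustrative sketch (editor's addition): callers of
 * vm_map_page_range_info_internal() supply one vm_page_info_basic record
 * per page in the queried range (which must not exceed
 * MAX_PAGE_RANGE_QUERY).  A hypothetical caller, with "npages", "infos",
 * "start" and "end" as assumed local names:
 *
 *	kern_return_t               kr;
 *	mach_msg_type_number_t      count = VM_PAGE_INFO_BASIC_COUNT;
 *	unsigned int                npages;
 *	struct vm_page_info_basic  *infos;
 *
 *	npages = (unsigned int)((end - start) >> VM_MAP_PAGE_SHIFT(map));
 *	infos = kalloc_data(npages * sizeof(*infos), Z_WAITOK);
 *	kr = vm_map_page_range_info_internal(map, start, end,
 *	    -1,                    // effective_page_shift: let it be derived
 *	    VM_PAGE_INFO_BASIC, (vm_page_info_t)infos, &count);
 */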
20506 
20507 /*
20508  *	vm_map_msync
20509  *
20510  *	Synchronises the memory range specified with its backing store
20511  *	image by either flushing or cleaning the contents to the appropriate
20512  *	memory manager engaging in a memory object synchronize dialog with
20513  *	the manager.  The client doesn't return until the manager issues
20514  *	the m_o_s_completed message.  MIG magically converts the user task
20515  *	parameter to the task's address map.
20516  *
20517  *	interpretation of sync_flags
20518  *	VM_SYNC_INVALIDATE	- discard pages, only return precious
20519  *				  pages to manager.
20520  *
20521  *	VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
20522  *				- discard pages, write dirty or precious
20523  *				  pages back to memory manager.
20524  *
20525  *	VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
20526  *				- write dirty or precious pages back to
20527  *				  the memory manager.
20528  *
20529  *	VM_SYNC_CONTIGUOUS	- does everything normally, but if there
20530  *				  is a hole in the region, and we would
20531  *				  have returned KERN_SUCCESS, return
20532  *				  KERN_INVALID_ADDRESS instead.
20533  *
20534  *	NOTE
20535  *	The memory object attributes have not yet been implemented; this
20536  *	function will have to deal with the invalidate attribute.
20537  *
20538  *	RETURNS
20539  *	KERN_INVALID_TASK		Bad task parameter
20540  *	KERN_INVALID_ARGUMENT		both sync and async were specified.
20541  *	KERN_SUCCESS			The usual.
20542  *	KERN_INVALID_ADDRESS		There was a hole in the region.
20543  */
20544 
20545 kern_return_t
20546 vm_map_msync(
20547 	vm_map_t                map,
20548 	vm_map_address_t        address,
20549 	vm_map_size_t           size,
20550 	vm_sync_t               sync_flags)
20551 {
20552 	vm_map_entry_t          entry;
20553 	vm_map_size_t           amount_left;
20554 	vm_object_offset_t      offset;
20555 	vm_object_offset_t      start_offset, end_offset;
20556 	boolean_t               do_sync_req;
20557 	boolean_t               had_hole = FALSE;
20558 	vm_map_offset_t         pmap_offset;
20559 
20560 	if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
20561 	    (sync_flags & VM_SYNC_SYNCHRONOUS)) {
20562 		return KERN_INVALID_ARGUMENT;
20563 	}
20564 
20565 	if (__improbable(vm_map_range_overflows(map, address, size))) {
20566 		return KERN_INVALID_ADDRESS;
20567 	}
20568 
20569 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20570 		DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
20571 	}
20572 
20573 	/*
20574 	 * align address and size on page boundaries
20575 	 */
20576 	size = (vm_map_round_page(address + size,
20577 	    VM_MAP_PAGE_MASK(map)) -
20578 	    vm_map_trunc_page(address,
20579 	    VM_MAP_PAGE_MASK(map)));
20580 	address = vm_map_trunc_page(address,
20581 	    VM_MAP_PAGE_MASK(map));
20582 
20583 	if (map == VM_MAP_NULL) {
20584 		return KERN_INVALID_TASK;
20585 	}
20586 
20587 	if (size == 0) {
20588 		return KERN_SUCCESS;
20589 	}
20590 
20591 	amount_left = size;
20592 
20593 	while (amount_left > 0) {
20594 		vm_object_size_t        flush_size;
20595 		vm_object_t             object;
20596 
20597 		vm_map_lock(map);
20598 		if (!vm_map_lookup_entry(map,
20599 		    address,
20600 		    &entry)) {
20601 			vm_map_size_t   skip;
20602 
20603 			/*
20604 			 * hole in the address map.
20605 			 */
20606 			had_hole = TRUE;
20607 
20608 			if (sync_flags & VM_SYNC_KILLPAGES) {
20609 				/*
20610 				 * For VM_SYNC_KILLPAGES, there should be
20611 				 * no holes in the range, since we couldn't
20612 				 * prevent someone else from allocating in
20613 				 * that hole and we wouldn't want to "kill"
20614 				 * their pages.
20615 				 */
20616 				vm_map_unlock(map);
20617 				break;
20618 			}
20619 
20620 			/*
20621 			 * Check for empty map.
20622 			 */
20623 			if (entry == vm_map_to_entry(map) &&
20624 			    entry->vme_next == entry) {
20625 				vm_map_unlock(map);
20626 				break;
20627 			}
20628 			/*
20629 			 * Check that we don't wrap and that
20630 			 * we have at least one real map entry.
20631 			 */
20632 			if ((map->hdr.nentries == 0) ||
20633 			    (entry->vme_next->vme_start < address)) {
20634 				vm_map_unlock(map);
20635 				break;
20636 			}
20637 			/*
20638 			 * Move up to the next entry if needed
20639 			 */
20640 			skip = (entry->vme_next->vme_start - address);
20641 			if (skip >= amount_left) {
20642 				amount_left = 0;
20643 			} else {
20644 				amount_left -= skip;
20645 			}
20646 			address = entry->vme_next->vme_start;
20647 			vm_map_unlock(map);
20648 			continue;
20649 		}
20650 
20651 		offset = address - entry->vme_start;
20652 		pmap_offset = address;
20653 
20654 		/*
20655 		 * do we have more to flush than is contained in this
20656 		 * entry?
20657 		 */
20658 		if (amount_left + entry->vme_start + offset > entry->vme_end) {
20659 			flush_size = entry->vme_end -
20660 			    (entry->vme_start + offset);
20661 		} else {
20662 			flush_size = amount_left;
20663 		}
20664 		amount_left -= flush_size;
20665 		address += flush_size;
20666 
20667 		if (entry->is_sub_map == TRUE) {
20668 			vm_map_t        local_map;
20669 			vm_map_offset_t local_offset;
20670 
20671 			local_map = VME_SUBMAP(entry);
20672 			local_offset = VME_OFFSET(entry);
20673 			vm_map_reference(local_map);
20674 			vm_map_unlock(map);
20675 			if (vm_map_msync(
20676 				    local_map,
20677 				    local_offset,
20678 				    flush_size,
20679 				    sync_flags) == KERN_INVALID_ADDRESS) {
20680 				had_hole = TRUE;
20681 			}
20682 			vm_map_deallocate(local_map);
20683 			continue;
20684 		}
20685 		object = VME_OBJECT(entry);
20686 
20687 		/*
20688 		 * We can't sync this object if the object has not been
20689 		 * created yet
20690 		 */
20691 		if (object == VM_OBJECT_NULL) {
20692 			vm_map_unlock(map);
20693 			continue;
20694 		}
20695 		offset += VME_OFFSET(entry);
20696 
20697 		vm_object_lock(object);
20698 
20699 		if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
20700 			int kill_pages = 0;
20701 
20702 			if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20703 				/*
20704 				 * This is a destructive operation and so we
20705 				 * err on the side of limiting the range of
20706 				 * the operation.
20707 				 */
20708 				start_offset = vm_object_round_page(offset);
20709 				end_offset = vm_object_trunc_page(offset + flush_size);
20710 
20711 				if (end_offset <= start_offset) {
20712 					vm_object_unlock(object);
20713 					vm_map_unlock(map);
20714 					continue;
20715 				}
20716 
20717 				pmap_offset += start_offset - offset;
20718 			} else {
20719 				start_offset = offset;
20720 				end_offset = offset + flush_size;
20721 			}
20722 
20723 			if (sync_flags & VM_SYNC_KILLPAGES) {
20724 				if (((object->ref_count == 1) ||
20725 				    ((object->copy_strategy !=
20726 				    MEMORY_OBJECT_COPY_SYMMETRIC) &&
20727 				    (object->vo_copy == VM_OBJECT_NULL))) &&
20728 				    (object->shadow == VM_OBJECT_NULL)) {
20729 					if (object->ref_count != 1) {
20730 						vm_page_stats_reusable.free_shared++;
20731 					}
20732 					kill_pages = 1;
20733 				} else {
20734 					kill_pages = -1;
20735 				}
20736 			}
20737 			if (kill_pages != -1) {
20738 				vm_object_deactivate_pages(
20739 					object,
20740 					start_offset,
20741 					(vm_object_size_t) (end_offset - start_offset),
20742 					kill_pages,
20743 					FALSE, /* reusable_pages */
20744 					FALSE, /* reusable_no_write */
20745 					map->pmap,
20746 					pmap_offset);
20747 			}
20748 			vm_object_unlock(object);
20749 			vm_map_unlock(map);
20750 			continue;
20751 		}
20752 		/*
20753 		 * We can't sync this object if there isn't a pager.
20754 		 * Don't bother to sync internal objects, since there can't
20755 		 * be any "permanent" storage for these objects anyway.
20756 		 */
20757 		if ((object->pager == MEMORY_OBJECT_NULL) ||
20758 		    (object->internal) || (object->private)) {
20759 			vm_object_unlock(object);
20760 			vm_map_unlock(map);
20761 			continue;
20762 		}
20763 		/*
20764 		 * keep reference on the object until syncing is done
20765 		 */
20766 		vm_object_reference_locked(object);
20767 		vm_object_unlock(object);
20768 
20769 		vm_map_unlock(map);
20770 
20771 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20772 			start_offset = vm_object_trunc_page(offset);
20773 			end_offset = vm_object_round_page(offset + flush_size);
20774 		} else {
20775 			start_offset = offset;
20776 			end_offset = offset + flush_size;
20777 		}
20778 
20779 		do_sync_req = vm_object_sync(object,
20780 		    start_offset,
20781 		    (end_offset - start_offset),
20782 		    sync_flags & VM_SYNC_INVALIDATE,
20783 		    ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
20784 		    (sync_flags & VM_SYNC_ASYNCHRONOUS)),
20785 		    sync_flags & VM_SYNC_SYNCHRONOUS);
20786 
20787 		if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
20788 			/*
20789 			 * clear out the clustering and read-ahead hints
20790 			 */
20791 			vm_object_lock(object);
20792 
20793 			object->pages_created = 0;
20794 			object->pages_used = 0;
20795 			object->sequential = 0;
20796 			object->last_alloc = 0;
20797 
20798 			vm_object_unlock(object);
20799 		}
20800 		vm_object_deallocate(object);
20801 	} /* while */
20802 
20803 	/* for proper msync() behaviour */
20804 	if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
20805 		return KERN_INVALID_ADDRESS;
20806 	}
20807 
20808 	return KERN_SUCCESS;
20809 }/* vm_msync */
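/*
 * Illustrative sketch (editor's addition): the flag combinations described
 * in the block comment above, as a hypothetical caller might use them.
 * "map", "addr" and "len" are assumed names.
 *
 *	kern_return_t kr;
 *
 *	// write dirty/precious pages back synchronously (msync(MS_SYNC)-like)
 *	kr = vm_map_msync(map, addr, len, VM_SYNC_SYNCHRONOUS);
 *
 *	// flush and discard, failing with KERN_INVALID_ADDRESS on any hole
 *	kr = vm_map_msync(map, addr, len,
 *	    VM_SYNC_INVALIDATE | VM_SYNC_SYNCHRONOUS | VM_SYNC_CONTIGUOUS);
 */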
20810 
20811 void
20812 vm_named_entry_associate_vm_object(
20813 	vm_named_entry_t        named_entry,
20814 	vm_object_t             object,
20815 	vm_object_offset_t      offset,
20816 	vm_object_size_t        size,
20817 	vm_prot_t               prot)
20818 {
20819 	vm_map_copy_t copy;
20820 	vm_map_entry_t copy_entry;
20821 
20822 	assert(!named_entry->is_sub_map);
20823 	assert(!named_entry->is_copy);
20824 	assert(!named_entry->is_object);
20825 	assert(!named_entry->internal);
20826 	assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
20827 
20828 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
20829 	copy->offset = offset;
20830 	copy->size = size;
20831 	copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
20832 
20833 	copy_entry = vm_map_copy_entry_create(copy);
20834 	copy_entry->protection = prot;
20835 	copy_entry->max_protection = prot;
20836 	copy_entry->use_pmap = TRUE;
20837 	copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
20838 	copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
20839 	VME_OBJECT_SET(copy_entry, object, false, 0);
20840 	VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
20841 	vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
20842 
20843 	named_entry->backing.copy = copy;
20844 	named_entry->is_object = TRUE;
20845 	if (object->internal) {
20846 		named_entry->internal = TRUE;
20847 	}
20848 
20849 	DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
20850 	    named_entry, copy, object, offset, size, prot);
20851 }
20852 
20853 vm_object_t
20854 vm_named_entry_to_vm_object(
20855 	vm_named_entry_t named_entry)
20856 {
20857 	vm_map_copy_t   copy;
20858 	vm_map_entry_t  copy_entry;
20859 	vm_object_t     object;
20860 
20861 	assert(!named_entry->is_sub_map);
20862 	assert(!named_entry->is_copy);
20863 	assert(named_entry->is_object);
20864 	copy = named_entry->backing.copy;
20865 	assert(copy != VM_MAP_COPY_NULL);
20866 	/*
20867 	 * Assert that the vm_map_copy is coming from the right
20868 	 * zone and hasn't been forged
20869 	 */
20870 	vm_map_copy_require(copy);
20871 	assert(copy->cpy_hdr.nentries == 1);
20872 	copy_entry = vm_map_copy_first_entry(copy);
20873 	object = VME_OBJECT(copy_entry);
20874 
20875 	DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
20876 
20877 	return object;
20878 }
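/*
 * Illustrative sketch (editor's addition): the association/lookup pair
 * above in a hypothetical memory-entry setup path.  "named_entry",
 * "object", "offset" and "size" are assumed to come from the caller.
 *
 *	vm_named_entry_associate_vm_object(named_entry, object,
 *	    offset, size, VM_PROT_READ | VM_PROT_WRITE);
 *	...
 *	assert(vm_named_entry_to_vm_object(named_entry) == object);
 */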
20879 
20880 /*
20881  *	Routine:	convert_port_entry_to_map
20882  *	Purpose:
20883  *		Convert from a port specifying an entry or a task
20884  *		to a map. Doesn't consume the port ref; produces a map ref,
20885  *		which may be null.  Unlike convert_port_to_map, the
20886  *		port may be task or a named entry backed.
20887  *	Conditions:
20888  *		Nothing locked.
20889  */
20890 
20891 vm_map_t
20892 convert_port_entry_to_map(
20893 	ipc_port_t      port)
20894 {
20895 	vm_map_t map = VM_MAP_NULL;
20896 	vm_named_entry_t named_entry;
20897 
20898 	if (!IP_VALID(port)) {
20899 		return VM_MAP_NULL;
20900 	}
20901 
20902 	if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
20903 		return convert_port_to_map(port);
20904 	}
20905 
20906 	named_entry = mach_memory_entry_from_port(port);
20907 
20908 	if ((named_entry->is_sub_map) &&
20909 	    (named_entry->protection & VM_PROT_WRITE)) {
20910 		map = named_entry->backing.map;
20911 		if (map->pmap != PMAP_NULL) {
20912 			if (map->pmap == kernel_pmap) {
20913 				panic("userspace has access "
20914 				    "to a kernel map %p", map);
20915 			}
20916 			pmap_require(map->pmap);
20917 		}
20918 		vm_map_reference(map);
20919 	}
20920 
20921 	return map;
20922 }
20923 
20924 /*
20925  * Export routines to other components for the things we access locally through
20926  * macros.
20927  */
20928 #undef current_map
20929 vm_map_t
20930 current_map(void)
20931 {
20932 	return current_map_fast();
20933 }
20934 
20935 /*
20936  *	vm_map_reference:
20937  *
20938  *	Takes a reference on the specified map.
20939  */
20940 void
20941 vm_map_reference(
20942 	vm_map_t        map)
20943 {
20944 	if (__probable(map != VM_MAP_NULL)) {
20945 		vm_map_require(map);
20946 		os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
20947 	}
20948 }
20949 
20950 /*
20951  *	vm_map_deallocate:
20952  *
20953  *	Removes a reference from the specified map,
20954  *	destroying it if no references remain.
20955  *	The map should not be locked.
20956  */
20957 void
20958 vm_map_deallocate(
20959 	vm_map_t        map)
20960 {
20961 	if (__probable(map != VM_MAP_NULL)) {
20962 		vm_map_require(map);
20963 		if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
20964 			vm_map_destroy(map);
20965 		}
20966 	}
20967 }
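/*
 * Illustrative sketch (editor's addition): the reference/deallocate pairing
 * used throughout this file when temporarily operating on another task's
 * map.  "target_map" is an assumed name.
 *
 *	vm_map_reference(target_map);   // keep the map alive
 *	...                             // use the map (lock it, look up entries)
 *	vm_map_deallocate(target_map);  // drop the ref; may destroy the map
 */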
20968 
20969 void
20970 vm_map_inspect_deallocate(
20971 	vm_map_inspect_t      map)
20972 {
20973 	vm_map_deallocate((vm_map_t)map);
20974 }
20975 
20976 void
20977 vm_map_read_deallocate(
20978 	vm_map_read_t      map)
20979 {
20980 	vm_map_deallocate((vm_map_t)map);
20981 }
20982 
20983 
20984 void
20985 vm_map_disable_NX(vm_map_t map)
20986 {
20987 	if (map == NULL) {
20988 		return;
20989 	}
20990 	if (map->pmap == NULL) {
20991 		return;
20992 	}
20993 
20994 	pmap_disable_NX(map->pmap);
20995 }
20996 
20997 void
20998 vm_map_disallow_data_exec(vm_map_t map)
20999 {
21000 	if (map == NULL) {
21001 		return;
21002 	}
21003 
21004 	map->map_disallow_data_exec = TRUE;
21005 }
21006 
21007 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
21008  * more descriptive.
21009  */
21010 void
21011 vm_map_set_32bit(vm_map_t map)
21012 {
21013 #if defined(__arm64__)
21014 	map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
21015 #else
21016 	map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
21017 #endif
21018 }
21019 
21020 
21021 void
21022 vm_map_set_64bit(vm_map_t map)
21023 {
21024 #if defined(__arm64__)
21025 	map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
21026 #else
21027 	map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
21028 #endif
21029 }
21030 
21031 /*
21032  * Expand the maximum size of an existing map to the maximum supported.
21033  */
21034 void
21035 vm_map_set_jumbo(vm_map_t map)
21036 {
21037 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
21038 	vm_map_set_max_addr(map, ~0);
21039 #else /* arm64 */
21040 	(void) map;
21041 #endif
21042 }
21043 
21044 /*
21045  * This map has a JIT entitlement
21046  */
21047 void
21048 vm_map_set_jit_entitled(vm_map_t map)
21049 {
21050 #if defined (__arm64__)
21051 	pmap_set_jit_entitled(map->pmap);
21052 #else /* arm64 */
21053 	(void) map;
21054 #endif
21055 }
21056 
21057 /*
21058  * Get the status of this map's TPRO flag
21059  */
21060 boolean_t
21061 vm_map_tpro(vm_map_t map)
21062 {
21063 #if defined (__arm64e__)
21064 	return pmap_get_tpro(map->pmap);
21065 #else /* arm64e */
21066 	(void) map;
21067 	return FALSE;
21068 #endif
21069 }
21070 
21071 /*
21072  * This map has TPRO enabled
21073  */
21074 void
21075 vm_map_set_tpro(vm_map_t map)
21076 {
21077 #if defined (__arm64e__)
21078 	pmap_set_tpro(map->pmap);
21079 #else /* arm64e */
21080 	(void) map;
21081 #endif
21082 }
21083 
21084 /*
21085  * Does this map have TPRO enforcement enabled?
21086  */
21087 boolean_t
21088 vm_map_tpro_enforcement(vm_map_t map)
21089 {
21090 	return map->tpro_enforcement;
21091 }
21092 
21093 /*
21094  * Set TPRO enforcement for this map
21095  */
21096 void
21097 vm_map_set_tpro_enforcement(vm_map_t map)
21098 {
21099 	if (vm_map_tpro(map)) {
21100 		vm_map_lock(map);
21101 		map->tpro_enforcement = TRUE;
21102 		vm_map_unlock(map);
21103 	}
21104 }
21105 
21106 /*
21107  * Enable TPRO on the requested region
21108  *
21109  * Note:
21110  *     This routine is primarily intended to be called during/soon after map
21111  *     creation before the associated task has been released to run. It is only
21112  *     currently safe when we have no resident pages.
21113  */
21114 boolean_t
21115 vm_map_set_tpro_range(
21116 	__unused vm_map_t map,
21117 	__unused vm_map_address_t start,
21118 	__unused vm_map_address_t end)
21119 {
21120 	return TRUE;
21121 }
21122 
21123 /*
21124  * Expand the maximum size of an existing map.
21125  */
21126 void
21127 vm_map_set_max_addr(vm_map_t map, vm_map_offset_t new_max_offset)
21128 {
21129 #if defined(__arm64__)
21130 	vm_map_offset_t max_supported_offset;
21131 	vm_map_offset_t old_max_offset;
21132 
21133 	vm_map_lock(map);
21134 
21135 	old_max_offset = map->max_offset;
21136 	max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), ARM_PMAP_MAX_OFFSET_JUMBO);
21137 
21138 	new_max_offset = trunc_page(new_max_offset);
21139 
21140 	/* The address space cannot be shrunk using this routine. */
21141 	if (old_max_offset >= new_max_offset) {
21142 		vm_map_unlock(map);
21143 		return;
21144 	}
21145 
21146 	if (max_supported_offset < new_max_offset) {
21147 		new_max_offset = max_supported_offset;
21148 	}
21149 
21150 	map->max_offset = new_max_offset;
21151 
21152 	if (map->holelistenabled) {
21153 		if (map->holes_list->prev->vme_end == old_max_offset) {
21154 			/*
21155 			 * There is already a hole at the end of the map; simply make it bigger.
21156 			 */
21157 			map->holes_list->prev->vme_end = map->max_offset;
21158 		} else {
21159 			/*
21160 			 * There is no hole at the end, so we need to create a new hole
21161 			 * for the new empty space we're creating.
21162 			 */
21163 			struct vm_map_links *new_hole;
21164 
21165 			new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
21166 			new_hole->start = old_max_offset;
21167 			new_hole->end = map->max_offset;
21168 			new_hole->prev = map->holes_list->prev;
21169 			new_hole->next = (struct vm_map_entry *)map->holes_list;
21170 			map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
21171 			map->holes_list->prev = (struct vm_map_entry *)new_hole;
21172 		}
21173 	}
21174 
21175 	vm_map_unlock(map);
21176 #else
21177 	(void)map;
21178 	(void)new_max_offset;
21179 #endif
21180 }
21181 
21182 vm_map_offset_t
21183 vm_compute_max_offset(boolean_t is64)
21184 {
21185 #if defined(__arm64__)
21186 	return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
21187 #else
21188 	return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
21189 #endif
21190 }
21191 
21192 void
21193 vm_map_get_max_aslr_slide_section(
21194 	vm_map_t                map __unused,
21195 	int64_t                 *max_sections,
21196 	int64_t                 *section_size)
21197 {
21198 #if defined(__arm64__)
21199 	*max_sections = 3;
21200 	*section_size = ARM_TT_TWIG_SIZE;
21201 #else
21202 	*max_sections = 1;
21203 	*section_size = 0;
21204 #endif
21205 }
21206 
21207 uint64_t
21208 vm_map_get_max_aslr_slide_pages(vm_map_t map)
21209 {
21210 #if defined(__arm64__)
21211 	/* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
21212 	 * limited embedded address space; this is also meant to minimize pmap
21213 	 * memory usage on 16KB page systems.
21214 	 */
21215 	return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
21216 #else
21217 	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21218 #endif
21219 }
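/*
 * Worked example (editor's addition): on an arm64 device with 16KB pages,
 * VM_MAP_PAGE_SHIFT(map) is 14, so the expression above yields
 * 1 << (24 - 14) = 1024 pages, i.e. 1024 * 16KB = 16MB of maximum slide,
 * matching the comment.  With 4KB pages (shift 12) it would be
 * 1 << 12 = 4096 pages, which is still 16MB.
 */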
21220 
21221 uint64_t
21222 vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
21223 {
21224 #if defined(__arm64__)
21225 	/* We limit the loader slide to 4MB, in order to ensure at least 8 bits
21226 	 * of independent entropy on 16KB page systems.
21227 	 */
21228 	return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
21229 #else
21230 	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21231 #endif
21232 }
21233 
21234 boolean_t
21235 vm_map_is_64bit(
21236 	vm_map_t map)
21237 {
21238 	return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
21239 }
21240 
21241 boolean_t
21242 vm_map_has_hard_pagezero(
21243 	vm_map_t        map,
21244 	vm_map_offset_t pagezero_size)
21245 {
21246 	/*
21247 	 * XXX FBDP
21248 	 * We should lock the VM map (for read) here but we can get away
21249 	 * with it for now because there can't really be any race condition:
21250 	 * the VM map's min_offset is changed only when the VM map is created
21251 	 * and when the zero page is established (when the binary gets loaded),
21252 	 * and this routine gets called only when the task terminates and the
21253 	 * VM map is being torn down, and when a new map is created via
21254 	 * load_machfile()/execve().
21255 	 */
21256 	return map->min_offset >= pagezero_size;
21257 }
21258 
21259 /*
21260  * Raise a VM map's maximum offset.
21261  */
21262 kern_return_t
21263 vm_map_raise_max_offset(
21264 	vm_map_t        map,
21265 	vm_map_offset_t new_max_offset)
21266 {
21267 	kern_return_t   ret;
21268 
21269 	vm_map_lock(map);
21270 	ret = KERN_INVALID_ADDRESS;
21271 
21272 	if (new_max_offset >= map->max_offset) {
21273 		if (!vm_map_is_64bit(map)) {
21274 			if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
21275 				map->max_offset = new_max_offset;
21276 				ret = KERN_SUCCESS;
21277 			}
21278 		} else {
21279 			if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
21280 				map->max_offset = new_max_offset;
21281 				ret = KERN_SUCCESS;
21282 			}
21283 		}
21284 	}
21285 
21286 	vm_map_unlock(map);
21287 	return ret;
21288 }
21289 
21290 
21291 /*
21292  * Raise a VM map's minimum offset.
21293  * To strictly enforce "page zero" reservation.
21294  */
21295 kern_return_t
21296 vm_map_raise_min_offset(
21297 	vm_map_t        map,
21298 	vm_map_offset_t new_min_offset)
21299 {
21300 	vm_map_entry_t  first_entry;
21301 
21302 	new_min_offset = vm_map_round_page(new_min_offset,
21303 	    VM_MAP_PAGE_MASK(map));
21304 
21305 	vm_map_lock(map);
21306 
21307 	if (new_min_offset < map->min_offset) {
21308 		/*
21309 		 * Can't move min_offset backwards, as that would expose
21310 		 * a part of the address space that was previously, and for
21311 		 * possibly good reasons, inaccessible.
21312 		 */
21313 		vm_map_unlock(map);
21314 		return KERN_INVALID_ADDRESS;
21315 	}
21316 	if (new_min_offset >= map->max_offset) {
21317 		/* can't go beyond the end of the address space */
21318 		vm_map_unlock(map);
21319 		return KERN_INVALID_ADDRESS;
21320 	}
21321 
21322 	first_entry = vm_map_first_entry(map);
21323 	if (first_entry != vm_map_to_entry(map) &&
21324 	    first_entry->vme_start < new_min_offset) {
21325 		/*
21326 		 * Some memory was already allocated below the new
21327 		 * minimum offset.  It's too late to change it now...
21328 		 */
21329 		vm_map_unlock(map);
21330 		return KERN_NO_SPACE;
21331 	}
21332 
21333 	map->min_offset = new_min_offset;
21334 
21335 	if (map->holelistenabled) {
21336 		assert(map->holes_list);
21337 		map->holes_list->start = new_min_offset;
21338 		assert(new_min_offset < map->holes_list->end);
21339 	}
21340 
21341 	vm_map_unlock(map);
21342 
21343 	return KERN_SUCCESS;
21344 }
21345 
21346 /*
21347  * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
21348  * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
21349  * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
21350  * have to reach over to the BSD data structures.
21351  */
21352 
21353 uint64_t vm_map_set_size_limit_count = 0;
21354 kern_return_t
21355 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
21356 {
21357 	kern_return_t kr;
21358 
21359 	vm_map_lock(map);
21360 	if (new_size_limit < map->size) {
21361 		/* new limit should not be lower than its current size */
21362 		DTRACE_VM2(vm_map_set_size_limit_fail,
21363 		    vm_map_size_t, map->size,
21364 		    uint64_t, new_size_limit);
21365 		kr = KERN_FAILURE;
21366 	} else if (new_size_limit == map->size_limit) {
21367 		/* no change */
21368 		kr = KERN_SUCCESS;
21369 	} else {
21370 		/* set new limit */
21371 		DTRACE_VM2(vm_map_set_size_limit,
21372 		    vm_map_size_t, map->size,
21373 		    uint64_t, new_size_limit);
21374 		if (new_size_limit != RLIM_INFINITY) {
21375 			vm_map_set_size_limit_count++;
21376 		}
21377 		map->size_limit = new_size_limit;
21378 		kr = KERN_SUCCESS;
21379 	}
21380 	vm_map_unlock(map);
21381 	return kr;
21382 }
21383 
21384 uint64_t vm_map_set_data_limit_count = 0;
21385 kern_return_t
21386 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
21387 {
21388 	kern_return_t kr;
21389 
21390 	vm_map_lock(map);
21391 	if (new_data_limit < map->size) {
21392 		/* new limit should not be lower than its current size */
21393 		DTRACE_VM2(vm_map_set_data_limit_fail,
21394 		    vm_map_size_t, map->size,
21395 		    uint64_t, new_data_limit);
21396 		kr = KERN_FAILURE;
21397 	} else if (new_data_limit == map->data_limit) {
21398 		/* no change */
21399 		kr = KERN_SUCCESS;
21400 	} else {
21401 		/* set new limit */
21402 		DTRACE_VM2(vm_map_set_data_limit,
21403 		    vm_map_size_t, map->size,
21404 		    uint64_t, new_data_limit);
21405 		if (new_data_limit != RLIM_INFINITY) {
21406 			vm_map_set_data_limit_count++;
21407 		}
21408 		map->data_limit = new_data_limit;
21409 		kr = KERN_SUCCESS;
21410 	}
21411 	vm_map_unlock(map);
21412 	return kr;
21413 }
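/*
 * Illustrative sketch (editor's addition): how the BSD side might mirror an
 * RLIMIT_AS / RLIMIT_DATA change into the Mach VM map.  "p" and "limit" are
 * assumed names; the actual call sites live in the BSD resource-limit code.
 *
 *	vm_map_t map = get_task_map(proc_task(p));
 *
 *	(void) vm_map_set_size_limit(map, limit);   // RLIMIT_AS analogue
 *	(void) vm_map_set_data_limit(map, limit);   // RLIMIT_DATA analogue
 */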
21414 
21415 void
21416 vm_map_set_user_wire_limit(vm_map_t     map,
21417     vm_size_t    limit)
21418 {
21419 	vm_map_lock(map);
21420 	map->user_wire_limit = limit;
21421 	vm_map_unlock(map);
21422 }
21423 
21424 
21425 void
21426 vm_map_switch_protect(vm_map_t     map,
21427     boolean_t    val)
21428 {
21429 	vm_map_lock(map);
21430 	map->switch_protect = val;
21431 	vm_map_unlock(map);
21432 }
21433 
21434 extern int cs_process_enforcement_enable;
21435 boolean_t
21436 vm_map_cs_enforcement(
21437 	vm_map_t map)
21438 {
21439 	if (cs_process_enforcement_enable) {
21440 		return TRUE;
21441 	}
21442 	return map->cs_enforcement;
21443 }
21444 
21445 kern_return_t
21446 vm_map_cs_wx_enable(
21447 	__unused vm_map_t map)
21448 {
21449 #if CODE_SIGNING_MONITOR
21450 	kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
21451 	if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
21452 		return KERN_SUCCESS;
21453 	}
21454 	return ret;
21455 #else
21456 	/* The VM manages WX memory entirely on its own */
21457 	return KERN_SUCCESS;
21458 #endif
21459 }
21460 
21461 kern_return_t
21462 vm_map_csm_allow_jit(
21463 	__unused vm_map_t map)
21464 {
21465 #if CODE_SIGNING_MONITOR
21466 	return csm_allow_jit_region(vm_map_pmap(map));
21467 #else
21468 	/* No code signing monitor to enforce JIT policy */
21469 	return KERN_SUCCESS;
21470 #endif
21471 }
21472 
21473 void
21474 vm_map_cs_debugged_set(
21475 	vm_map_t map,
21476 	boolean_t val)
21477 {
21478 	vm_map_lock(map);
21479 	map->cs_debugged = val;
21480 	vm_map_unlock(map);
21481 }
21482 
21483 void
21484 vm_map_cs_enforcement_set(
21485 	vm_map_t map,
21486 	boolean_t val)
21487 {
21488 	vm_map_lock(map);
21489 	map->cs_enforcement = val;
21490 	pmap_set_vm_map_cs_enforced(map->pmap, val);
21491 	vm_map_unlock(map);
21492 }
21493 
21494 /*
21495  * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
21496  * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
21497  * bump both counters.
21498  */
21499 void
21500 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
21501 {
21502 	pmap_t pmap = vm_map_pmap(map);
21503 
21504 	ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21505 	ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21506 }
21507 
21508 void
21509 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
21510 {
21511 	pmap_t pmap = vm_map_pmap(map);
21512 
21513 	ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21514 	ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21515 }
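/*
 * Illustrative pairing sketch (not an existing helper; names below are
 * hypothetical): the two routines above are expected to be called with the
 * same byte count at map and unmap time, so that the "iokit_mapped" and
 * "phys_footprint" ledger entries net out to zero once the IOKit mapping
 * is gone.
 *
 *	vm_size_t bytes = mapping_size;              // size of the IOKit mapping
 *	vm_map_iokit_mapped_region(map, bytes);      // when the region is mapped
 *	...
 *	vm_map_iokit_unmapped_region(map, bytes);    // same size when unmapped
 */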
21516 
21517 /* Add (generate) code signature for memory range */
21518 #if CONFIG_DYNAMIC_CODE_SIGNING
21519 kern_return_t
21520 vm_map_sign(vm_map_t map,
21521     vm_map_offset_t start,
21522     vm_map_offset_t end)
21523 {
21524 	vm_map_entry_t entry;
21525 	vm_page_t m;
21526 	vm_object_t object;
21527 
21528 	/*
21529 	 * Vet all the input parameters and current type and state of the
21530 	 * underlying object.  Return with an error if anything is amiss.
21531 	 */
21532 	if (map == VM_MAP_NULL) {
21533 		return KERN_INVALID_ARGUMENT;
21534 	}
21535 
21536 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
21537 		return KERN_INVALID_ADDRESS;
21538 	}
21539 
21540 	vm_map_lock_read(map);
21541 
21542 	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
21543 		/*
21544 		 * Must pass a valid non-submap address.
21545 		 */
21546 		vm_map_unlock_read(map);
21547 		return KERN_INVALID_ADDRESS;
21548 	}
21549 
21550 	if ((entry->vme_start > start) || (entry->vme_end < end)) {
21551 		/*
21552 		 * Map entry doesn't cover the requested range. Not handling
21553 		 * this situation currently.
21554 		 */
21555 		vm_map_unlock_read(map);
21556 		return KERN_INVALID_ARGUMENT;
21557 	}
21558 
21559 	object = VME_OBJECT(entry);
21560 	if (object == VM_OBJECT_NULL) {
21561 		/*
21562 		 * Object must already be present or we can't sign.
21563 		 */
21564 		vm_map_unlock_read(map);
21565 		return KERN_INVALID_ARGUMENT;
21566 	}
21567 
21568 	vm_object_lock(object);
21569 	vm_map_unlock_read(map);
21570 
21571 	while (start < end) {
21572 		uint32_t refmod;
21573 
21574 		m = vm_page_lookup(object,
21575 		    start - entry->vme_start + VME_OFFSET(entry));
21576 		if (m == VM_PAGE_NULL) {
21577 			/* should we try to fault a page here? we can probably
21578 			 * demand it exists and is locked for this request */
21579 			vm_object_unlock(object);
21580 			return KERN_FAILURE;
21581 		}
21582 		/* deal with special page status */
21583 		if (m->vmp_busy ||
21584 		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
21585 			vm_object_unlock(object);
21586 			return KERN_FAILURE;
21587 		}
21588 
21589 		/* Page is OK... now "validate" it */
21590 		/* This is the place where we'll call out to create a code
21591 		 * directory, later */
21592 		/* XXX TODO4K: deal with 4k subpages individually? */
21593 		m->vmp_cs_validated = VMP_CS_ALL_TRUE;
21594 
21595 		/* The page is now "clean" for codesigning purposes. That means
21596 		 * we don't consider it as modified (wpmapped) anymore. But
21597 		 * we'll disconnect the page so we note any future modification
21598 		 * attempts. */
21599 		m->vmp_wpmapped = FALSE;
21600 		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
21601 
21602 		/* Pull the dirty status from the pmap, since we cleared the
21603 		 * wpmapped bit */
21604 		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
21605 			SET_PAGE_DIRTY(m, FALSE);
21606 		}
21607 
21608 		/* On to the next page */
21609 		start += PAGE_SIZE;
21610 	}
21611 	vm_object_unlock(object);
21612 
21613 	return KERN_SUCCESS;
21614 }
21615 #endif
21616 
21617 kern_return_t
21618 vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
21619 {
21620 	vm_map_entry_t  entry = VM_MAP_ENTRY_NULL;
21621 	vm_map_entry_t  next_entry;
21622 	kern_return_t   kr = KERN_SUCCESS;
21623 	VM_MAP_ZAP_DECLARE(zap_list);
21624 
21625 	vm_map_lock(map);
21626 
21627 	for (entry = vm_map_first_entry(map);
21628 	    entry != vm_map_to_entry(map);
21629 	    entry = next_entry) {
21630 		next_entry = entry->vme_next;
21631 
21632 		if (!entry->is_sub_map &&
21633 		    VME_OBJECT(entry) &&
21634 		    (VME_OBJECT(entry)->internal == TRUE) &&
21635 		    (VME_OBJECT(entry)->ref_count == 1)) {
21636 			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
21637 			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);
21638 
21639 			(void)vm_map_delete(map, entry->vme_start,
21640 			    entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
21641 			    KMEM_GUARD_NONE, &zap_list);
21642 		}
21643 	}
21644 
21645 	vm_map_unlock(map);
21646 
21647 	vm_map_zap_dispose(&zap_list);
21648 
21649 	return kr;
21650 }
21651 
21652 
21653 #if DEVELOPMENT || DEBUG
21654 
21655 int
21656 vm_map_disconnect_page_mappings(
21657 	vm_map_t map,
21658 	boolean_t do_unnest)
21659 {
21660 	vm_map_entry_t entry;
21661 	ledger_amount_t byte_count = 0;
21662 
21663 	if (do_unnest == TRUE) {
21664 #ifndef NO_NESTED_PMAP
21665 		vm_map_lock(map);
21666 
21667 		for (entry = vm_map_first_entry(map);
21668 		    entry != vm_map_to_entry(map);
21669 		    entry = entry->vme_next) {
21670 			if (entry->is_sub_map && entry->use_pmap) {
21671 				/*
21672 				 * Make sure the range between the start of this entry and
21673 				 * the end of this entry is no longer nested, so that
21674 				 * we will only remove mappings from the pmap in use by
21675 				 * this task
21676 				 */
21677 				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
21678 			}
21679 		}
21680 		vm_map_unlock(map);
21681 #endif
21682 	}
21683 	vm_map_lock_read(map);
21684 
21685 	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);
21686 
21687 	for (entry = vm_map_first_entry(map);
21688 	    entry != vm_map_to_entry(map);
21689 	    entry = entry->vme_next) {
21690 		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
21691 		    (VME_OBJECT(entry)->phys_contiguous))) {
21692 			continue;
21693 		}
21694 		if (entry->is_sub_map) {
21695 			assert(!entry->use_pmap);
21696 		}
21697 
21698 		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
21699 	}
21700 	vm_map_unlock_read(map);
21701 
21702 	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
21703 }
21704 
21705 kern_return_t
21706 vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
21707 {
21708 	vm_object_t object = NULL;
21709 	vm_object_offset_t offset;
21710 	vm_prot_t prot;
21711 	boolean_t wired;
21712 	vm_map_version_t version;
21713 	vm_map_t real_map;
21714 	int result = KERN_FAILURE;
21715 
21716 	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
21717 	vm_map_lock(map);
21718 
21719 	result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
21720 	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
21721 	    NULL, &real_map, NULL);
21722 	if (object == NULL) {
21723 		result = KERN_MEMORY_ERROR;
21724 	} else if (object->pager) {
21725 		result = vm_compressor_pager_inject_error(object->pager,
21726 		    offset);
21727 	} else {
21728 		result = KERN_MEMORY_PRESENT;
21729 	}
21730 
21731 	if (object != NULL) {
21732 		vm_object_unlock(object);
21733 	}
21734 
21735 	if (real_map != map) {
21736 		vm_map_unlock(real_map);
21737 	}
21738 	vm_map_unlock(map);
21739 
21740 	return result;
21741 }
21742 
21743 #endif
21744 
21745 
21746 #if CONFIG_FREEZE
21747 
21748 
21749 extern struct freezer_context freezer_context_global;
21750 AbsoluteTime c_freezer_last_yield_ts = 0;
21751 
21752 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
21753 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
21754 
21755 kern_return_t
21756 vm_map_freeze(
21757 	task_t       task,
21758 	unsigned int *purgeable_count,
21759 	unsigned int *wired_count,
21760 	unsigned int *clean_count,
21761 	unsigned int *dirty_count,
21762 	unsigned int dirty_budget,
21763 	unsigned int *shared_count,
21764 	int          *freezer_error_code,
21765 	boolean_t    eval_only)
21766 {
21767 	vm_map_entry_t  entry2 = VM_MAP_ENTRY_NULL;
21768 	kern_return_t   kr = KERN_SUCCESS;
21769 	boolean_t       evaluation_phase = TRUE;
21770 	vm_object_t     cur_shared_object = NULL;
21771 	int             cur_shared_obj_ref_cnt = 0;
21772 	unsigned int    dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;
21773 
21774 	*purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;
21775 
21776 	/*
21777 	 * We need the exclusive lock here so that we can
21778 	 * block any page faults or lookups while we are
21779 	 * in the middle of freezing this vm map.
21780 	 */
21781 	vm_map_t map = task->map;
21782 
21783 	vm_map_lock(map);
21784 
21785 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
21786 
21787 	if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
21788 		if (vm_compressor_low_on_space()) {
21789 			*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
21790 		}
21791 
21792 		if (vm_swap_low_on_space()) {
21793 			*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
21794 		}
21795 
21796 		kr = KERN_NO_SPACE;
21797 		goto done;
21798 	}
21799 
21800 	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
21801 		/*
21802 		 * In-memory compressor backing the freezer. No disk.
21803 		 * So no need to do the evaluation phase.
21804 		 */
21805 		evaluation_phase = FALSE;
21806 
21807 		if (eval_only == TRUE) {
21808 			/*
21809 			 * We don't support 'eval_only' mode
21810 			 * in this non-swap config.
21811 			 */
21812 			*freezer_error_code = FREEZER_ERROR_GENERIC;
21813 			kr = KERN_INVALID_ARGUMENT;
21814 			goto done;
21815 		}
21816 
21817 		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
21818 		clock_get_uptime(&c_freezer_last_yield_ts);
21819 	}
21820 again:
21821 
21822 	for (entry2 = vm_map_first_entry(map);
21823 	    entry2 != vm_map_to_entry(map);
21824 	    entry2 = entry2->vme_next) {
21825 		vm_object_t src_object;
21826 
21827 		if (entry2->is_sub_map) {
21828 			continue;
21829 		}
21830 
21831 		src_object = VME_OBJECT(entry2);
21832 		if (!src_object ||
21833 		    src_object->phys_contiguous ||
21834 		    !src_object->internal) {
21835 			continue;
21836 		}
21837 
21838 		/* If eligible, scan the entry, moving eligible pages over to our parent object */
21839 
21840 		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
21841 			/*
21842 			 * We skip purgeable objects during evaluation phase only.
21843 			 * If we decide to freeze this process, we'll explicitly
21844 			 * purge these objects before we go around again with
21845 			 * 'evaluation_phase' set to FALSE.
21846 			 */
21847 
21848 			if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
21849 				/*
21850 				 * We want to purge objects that may not belong to this task but are mapped
21851 				 * in this task alone. Since we already purged this task's purgeable memory
21852 				 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
21853 				 * on this task's purgeable objects. Hence the check for only volatile objects.
21854 				 */
21855 				if (evaluation_phase ||
21856 				    src_object->purgable != VM_PURGABLE_VOLATILE ||
21857 				    src_object->ref_count != 1) {
21858 					continue;
21859 				}
21860 				vm_object_lock(src_object);
21861 				if (src_object->purgable == VM_PURGABLE_VOLATILE &&
21862 				    src_object->ref_count == 1) {
21863 					purgeable_q_t old_queue;
21864 
21865 					/* object should be on a purgeable queue */
21866 					assert(src_object->objq.next != NULL &&
21867 					    src_object->objq.prev != NULL);
21868 					/* move object from its volatile queue to the nonvolatile queue */
21869 					old_queue = vm_purgeable_object_remove(src_object);
21870 					assert(old_queue);
21871 					if (src_object->purgeable_when_ripe) {
21872 						/* remove a token from that volatile queue */
21873 						vm_page_lock_queues();
21874 						vm_purgeable_token_delete_first(old_queue);
21875 						vm_page_unlock_queues();
21876 					}
21877 					/* purge the object */
21878 					vm_object_purge(src_object, 0);
21879 				}
21880 				vm_object_unlock(src_object);
21881 				continue;
21882 			}
21883 
21884 			/*
21885 			 * Pages belonging to this object could be swapped to disk.
21886 			 * Make sure it's not a shared object because we could end
21887 			 * up just bringing it back in again.
21888 			 *
21889 			 * We try to optimize somewhat by checking for objects that are mapped
21890 			 * more than once within our own map. But we don't do full searches,
21891 			 * we just look at the entries following our current entry.
21892 			 */
21893 
21894 			if (src_object->ref_count > 1) {
21895 				if (src_object != cur_shared_object) {
21896 					obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
21897 					dirty_shared_count += obj_pages_snapshot;
21898 
21899 					cur_shared_object = src_object;
21900 					cur_shared_obj_ref_cnt = 1;
21901 					continue;
21902 				} else {
21903 					cur_shared_obj_ref_cnt++;
21904 					if (src_object->ref_count == cur_shared_obj_ref_cnt) {
21905 						/*
21906 						 * Fall through to below and treat this object as private.
21907 						 * So deduct its pages from our shared total and add it to the
21908 						 * private total.
21909 						 */
21910 
21911 						dirty_shared_count -= obj_pages_snapshot;
21912 						dirty_private_count += obj_pages_snapshot;
21913 					} else {
21914 						continue;
21915 					}
21916 				}
21917 			}
21918 
21919 
21920 			if (src_object->ref_count == 1) {
21921 				dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
21922 			}
21923 
21924 			if (evaluation_phase == TRUE) {
21925 				continue;
21926 			}
21927 		}
21928 
21929 		uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
21930 		*wired_count += src_object->wired_page_count;
21931 
21932 		if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
21933 			if (vm_compressor_low_on_space()) {
21934 				*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
21935 			}
21936 
21937 			if (vm_swap_low_on_space()) {
21938 				*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
21939 			}
21940 
21941 			kr = KERN_NO_SPACE;
21942 			break;
21943 		}
21944 		if (paged_out_count >= dirty_budget) {
21945 			break;
21946 		}
21947 		dirty_budget -= paged_out_count;
21948 	}
21949 
21950 	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
21951 	if (evaluation_phase) {
21952 		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;
21953 
21954 		if (dirty_shared_count > shared_pages_threshold) {
21955 			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
21956 			kr = KERN_FAILURE;
21957 			goto done;
21958 		}
21959 
21960 		if (dirty_shared_count &&
21961 		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
21962 			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
21963 			kr = KERN_FAILURE;
21964 			goto done;
21965 		}
21966 
21967 		evaluation_phase = FALSE;
21968 		dirty_shared_count = dirty_private_count = 0;
21969 
21970 		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
21971 		clock_get_uptime(&c_freezer_last_yield_ts);
21972 
21973 		if (eval_only) {
21974 			kr = KERN_SUCCESS;
21975 			goto done;
21976 		}
21977 
21978 		vm_purgeable_purge_task_owned(task);
21979 
21980 		goto again;
21981 	} else {
21982 		kr = KERN_SUCCESS;
21983 	}
21984 
21985 done:
21986 	vm_map_unlock(map);
21987 
21988 	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
21989 		vm_object_compressed_freezer_done();
21990 	}
21991 	return kr;
21992 }
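/*
 * Worked example of the evaluation-phase thresholds above, under assumed
 * tunable values: with 16K pages and
 * memorystatus_freeze_shared_mb_per_process_max = 8, the shared-page
 * threshold is (8 * 1024 * 1024) / 16384 = 512 pages, so a task with 600
 * dirty shared pages is rejected with FREEZER_ERROR_EXCESS_SHARED_MEMORY.
 * A task with 2000 dirty private and 400 dirty shared pages has a
 * private:shared ratio of 5 and is frozen only if
 * memorystatus_freeze_private_shared_pages_ratio is 5 or less.
 */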
21993 
21994 #endif
21995 
21996 /*
21997  * vm_map_entry_should_cow_for_true_share:
21998  *
21999  * Determines if the map entry should be clipped and setup for copy-on-write
22000  * to avoid applying "true_share" to a large VM object when only a subset is
22001  * targeted.
22002  *
22003  * For now, we target only the map entries created for the Objective C
22004  * Garbage Collector, which initially have the following properties:
22005  *	- alias == VM_MEMORY_MALLOC
22006  *      - wired_count == 0
22007  *      - !needs_copy
22008  * and a VM object with:
22009  *      - internal
22010  *      - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
22011  *      - !true_share
22012  *      - vo_size == ANON_CHUNK_SIZE
22013  *
22014  * Only non-kernel map entries.
22015  */
22016 boolean_t
22017 vm_map_entry_should_cow_for_true_share(
22018 	vm_map_entry_t  entry)
22019 {
22020 	vm_object_t     object;
22021 
22022 	if (entry->is_sub_map) {
22023 		/* entry does not point at a VM object */
22024 		return FALSE;
22025 	}
22026 
22027 	if (entry->needs_copy) {
22028 		/* already set for copy_on_write: done! */
22029 		return FALSE;
22030 	}
22031 
22032 	if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
22033 	    VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
22034 		/* not a malloc heap or Obj-C Garbage Collector heap */
22035 		return FALSE;
22036 	}
22037 
22038 	if (entry->wired_count) {
22039 		/* wired: can't change the map entry... */
22040 		vm_counters.should_cow_but_wired++;
22041 		return FALSE;
22042 	}
22043 
22044 	object = VME_OBJECT(entry);
22045 
22046 	if (object == VM_OBJECT_NULL) {
22047 		/* no object yet... */
22048 		return FALSE;
22049 	}
22050 
22051 	if (!object->internal) {
22052 		/* not an internal object */
22053 		return FALSE;
22054 	}
22055 
22056 	if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
22057 		/* not the default copy strategy */
22058 		return FALSE;
22059 	}
22060 
22061 	if (object->true_share) {
22062 		/* already true_share: too late to avoid it */
22063 		return FALSE;
22064 	}
22065 
22066 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
22067 	    object->vo_size != ANON_CHUNK_SIZE) {
22068 		/* ... not an object created for the ObjC Garbage Collector */
22069 		return FALSE;
22070 	}
22071 
22072 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
22073 	    object->vo_size != 2048 * 4096) {
22074 		/* ... not a "MALLOC_SMALL" heap */
22075 		return FALSE;
22076 	}
22077 
22078 	/*
22079 	 * All the criteria match: we have a large object being targeted for "true_share".
22080 	 * To limit the adverse side-effects linked with "true_share", tell the caller to
22081 	 * try and avoid setting up the entire object for "true_share" by clipping the
22082 	 * targeted range and setting it up for copy-on-write.
22083 	 */
22084 	return TRUE;
22085 }
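/*
 * Hedged usage sketch (assumed caller pattern, not a definitive interface):
 * a caller about to set "true_share" on a large object for a small range can
 * consult this predicate and, if it returns TRUE, clip the entry to the
 * targeted range and mark it copy-on-write instead, along these lines:
 *
 *	if (vm_map_entry_should_cow_for_true_share(entry) &&
 *	    VME_OBJECT(entry)->vo_size > end - start) {
 *		vm_map_clip_start(map, entry, vm_map_trunc_page(start, pgmask));
 *		vm_map_clip_end(map, entry, vm_map_round_page(end, pgmask));
 *		entry->needs_copy = TRUE;
 *	}
 */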
22086 
22087 uint64_t vm_map_range_overflows_count = 0;
22088 TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE);
22089 bool
22090 vm_map_range_overflows(
22091 	vm_map_t map,
22092 	vm_map_offset_t addr,
22093 	vm_map_size_t size)
22094 {
22095 	vm_map_offset_t start, end, sum;
22096 	vm_map_offset_t pgmask;
22097 
22098 	if (size == 0) {
22099 		/* empty range -> no overflow */
22100 		return false;
22101 	}
22102 	pgmask = vm_map_page_mask(map);
22103 	start = vm_map_trunc_page_mask(addr, pgmask);
22104 	end = vm_map_round_page_mask(addr + size, pgmask);
22105 	if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) {
22106 		vm_map_range_overflows_count++;
22107 		if (vm_map_range_overflows_log) {
22108 			printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n",
22109 			    proc_selfpid(),
22110 			    proc_best_name(current_proc()),
22111 			    (uint64_t)addr,
22112 			    (uint64_t)size,
22113 			    (uint64_t)pgmask);
22114 		}
22115 		DTRACE_VM4(vm_map_range_overflows,
22116 		    vm_map_t, map,
22117 		    uint32_t, pgmask,
22118 		    uint64_t, (uint64_t)addr,
22119 		    uint64_t, (uint64_t)size);
22120 		return true;
22121 	}
22122 	return false;
22123 }
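/*
 * Worked example, assuming a 4K page mask of 0xFFF:
 *	addr = 0xFFFFFFFFFFFFF000, size = 0x2000
 * addr + size wraps past zero, so os_add_overflow() trips (and the rounded
 * "end" also lands at or below "start"); the range is rejected and the
 * function returns true.  A zero-size range never counts as an overflow.
 */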
22124 
22125 vm_map_offset_t
22126 vm_map_round_page_mask(
22127 	vm_map_offset_t offset,
22128 	vm_map_offset_t mask)
22129 {
22130 	return VM_MAP_ROUND_PAGE(offset, mask);
22131 }
22132 
22133 vm_map_offset_t
22134 vm_map_trunc_page_mask(
22135 	vm_map_offset_t offset,
22136 	vm_map_offset_t mask)
22137 {
22138 	return VM_MAP_TRUNC_PAGE(offset, mask);
22139 }
22140 
22141 boolean_t
22142 vm_map_page_aligned(
22143 	vm_map_offset_t offset,
22144 	vm_map_offset_t mask)
22145 {
22146 	return ((offset) & mask) == 0;
22147 }
22148 
22149 int
22150 vm_map_page_shift(
22151 	vm_map_t map)
22152 {
22153 	return VM_MAP_PAGE_SHIFT(map);
22154 }
22155 
22156 int
22157 vm_map_page_size(
22158 	vm_map_t map)
22159 {
22160 	return VM_MAP_PAGE_SIZE(map);
22161 }
22162 
22163 vm_map_offset_t
22164 vm_map_page_mask(
22165 	vm_map_t map)
22166 {
22167 	return VM_MAP_PAGE_MASK(map);
22168 }
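/*
 * Worked example for the wrappers above: with a 16K map
 * (page shift 14, page size 0x4000, page mask 0x3FFF):
 *	vm_map_trunc_page_mask(0x1005000, 0x3FFF) == 0x1004000
 *	vm_map_round_page_mask(0x1005000, 0x3FFF) == 0x1008000
 *	vm_map_page_aligned(0x1004000, 0x3FFF)    == TRUE
 */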
22169 
22170 kern_return_t
22171 vm_map_set_page_shift(
22172 	vm_map_t        map,
22173 	int             pageshift)
22174 {
22175 	if (map->hdr.nentries != 0) {
22176 		/* too late to change page size */
22177 		return KERN_FAILURE;
22178 	}
22179 
22180 	map->hdr.page_shift = (uint16_t)pageshift;
22181 
22182 	return KERN_SUCCESS;
22183 }
22184 
22185 kern_return_t
22186 vm_map_query_volatile(
22187 	vm_map_t        map,
22188 	mach_vm_size_t  *volatile_virtual_size_p,
22189 	mach_vm_size_t  *volatile_resident_size_p,
22190 	mach_vm_size_t  *volatile_compressed_size_p,
22191 	mach_vm_size_t  *volatile_pmap_size_p,
22192 	mach_vm_size_t  *volatile_compressed_pmap_size_p)
22193 {
22194 	mach_vm_size_t  volatile_virtual_size;
22195 	mach_vm_size_t  volatile_resident_count;
22196 	mach_vm_size_t  volatile_compressed_count;
22197 	mach_vm_size_t  volatile_pmap_count;
22198 	mach_vm_size_t  volatile_compressed_pmap_count;
22199 	mach_vm_size_t  resident_count;
22200 	vm_map_entry_t  entry;
22201 	vm_object_t     object;
22202 
22203 	/* map should be locked by caller */
22204 
22205 	volatile_virtual_size = 0;
22206 	volatile_resident_count = 0;
22207 	volatile_compressed_count = 0;
22208 	volatile_pmap_count = 0;
22209 	volatile_compressed_pmap_count = 0;
22210 
22211 	for (entry = vm_map_first_entry(map);
22212 	    entry != vm_map_to_entry(map);
22213 	    entry = entry->vme_next) {
22214 		mach_vm_size_t  pmap_resident_bytes, pmap_compressed_bytes;
22215 
22216 		if (entry->is_sub_map) {
22217 			continue;
22218 		}
22219 		if (!(entry->protection & VM_PROT_WRITE)) {
22220 			continue;
22221 		}
22222 		object = VME_OBJECT(entry);
22223 		if (object == VM_OBJECT_NULL) {
22224 			continue;
22225 		}
22226 		if (object->purgable != VM_PURGABLE_VOLATILE &&
22227 		    object->purgable != VM_PURGABLE_EMPTY) {
22228 			continue;
22229 		}
22230 		if (VME_OFFSET(entry)) {
22231 			/*
22232 			 * If the map entry has been split and the object now
22233 			 * appears several times in the VM map, we don't want
22234 			 * to count the object's resident_page_count more than
22235 			 * once.  We count it only for the first one, starting
22236 			 * at offset 0 and ignore the other VM map entries.
22237 			 */
22238 			continue;
22239 		}
22240 		resident_count = object->resident_page_count;
22241 		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
22242 			resident_count = 0;
22243 		} else {
22244 			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
22245 		}
22246 
22247 		volatile_virtual_size += entry->vme_end - entry->vme_start;
22248 		volatile_resident_count += resident_count;
22249 		if (object->pager) {
22250 			volatile_compressed_count +=
22251 			    vm_compressor_pager_get_count(object->pager);
22252 		}
22253 		pmap_compressed_bytes = 0;
22254 		pmap_resident_bytes =
22255 		    pmap_query_resident(map->pmap,
22256 		    entry->vme_start,
22257 		    entry->vme_end,
22258 		    &pmap_compressed_bytes);
22259 		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
22260 		volatile_compressed_pmap_count += (pmap_compressed_bytes
22261 		    / PAGE_SIZE);
22262 	}
22263 
22264 	/* map is still locked on return */
22265 
22266 	*volatile_virtual_size_p = volatile_virtual_size;
22267 	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
22268 	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
22269 	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
22270 	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;
22271 
22272 	return KERN_SUCCESS;
22273 }
22274 
22275 void
22276 vm_map_sizes(vm_map_t map,
22277     vm_map_size_t * psize,
22278     vm_map_size_t * pfree,
22279     vm_map_size_t * plargest_free)
22280 {
22281 	vm_map_entry_t  entry;
22282 	vm_map_offset_t prev;
22283 	vm_map_size_t   free, total_free, largest_free;
22284 	boolean_t       end;
22285 
22286 	if (!map) {
22287 		*psize = *pfree = *plargest_free = 0;
22288 		return;
22289 	}
22290 	total_free = largest_free = 0;
22291 
22292 	vm_map_lock_read(map);
22293 	if (psize) {
22294 		*psize = map->max_offset - map->min_offset;
22295 	}
22296 
22297 	prev = map->min_offset;
22298 	for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
22299 		end = (entry == vm_map_to_entry(map));
22300 
22301 		if (end) {
22302 			free = entry->vme_end   - prev;
22303 		} else {
22304 			free = entry->vme_start - prev;
22305 		}
22306 
22307 		total_free += free;
22308 		if (free > largest_free) {
22309 			largest_free = free;
22310 		}
22311 
22312 		if (end) {
22313 			break;
22314 		}
22315 		prev = entry->vme_end;
22316 	}
22317 	vm_map_unlock_read(map);
22318 	if (pfree) {
22319 		*pfree = total_free;
22320 	}
22321 	if (plargest_free) {
22322 		*plargest_free = largest_free;
22323 	}
22324 }
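/*
 * Worked example for the gap walk above, on a hypothetical map with
 * min_offset = 0x1000, max_offset = 0x10000 and entries covering
 * [0x2000, 0x4000) and [0x8000, 0xC000): the gaps are 0x1000, 0x4000 and
 * 0x4000, so *psize = 0xF000, *pfree = 0x9000 and *plargest_free = 0x4000.
 */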
22325 
22326 #if VM_SCAN_FOR_SHADOW_CHAIN
22327 int vm_map_shadow_max(vm_map_t map);
22328 int
22329 vm_map_shadow_max(
22330 	vm_map_t map)
22331 {
22332 	int             shadows, shadows_max;
22333 	vm_map_entry_t  entry;
22334 	vm_object_t     object, next_object;
22335 
22336 	if (map == NULL) {
22337 		return 0;
22338 	}
22339 
22340 	shadows_max = 0;
22341 
22342 	vm_map_lock_read(map);
22343 
22344 	for (entry = vm_map_first_entry(map);
22345 	    entry != vm_map_to_entry(map);
22346 	    entry = entry->vme_next) {
22347 		if (entry->is_sub_map) {
22348 			continue;
22349 		}
22350 		object = VME_OBJECT(entry);
22351 		if (object == NULL) {
22352 			continue;
22353 		}
22354 		vm_object_lock_shared(object);
22355 		for (shadows = 0;
22356 		    object->shadow != NULL;
22357 		    shadows++, object = next_object) {
22358 			next_object = object->shadow;
22359 			vm_object_lock_shared(next_object);
22360 			vm_object_unlock(object);
22361 		}
22362 		vm_object_unlock(object);
22363 		if (shadows > shadows_max) {
22364 			shadows_max = shadows;
22365 		}
22366 	}
22367 
22368 	vm_map_unlock_read(map);
22369 
22370 	return shadows_max;
22371 }
22372 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
22373 
22374 void
22375 vm_commit_pagezero_status(vm_map_t lmap)
22376 {
22377 	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
22378 }
22379 
22380 #if __x86_64__
22381 void
22382 vm_map_set_high_start(
22383 	vm_map_t        map,
22384 	vm_map_offset_t high_start)
22385 {
22386 	map->vmmap_high_start = high_start;
22387 }
22388 #endif /* __x86_64__ */
22389 
22390 #if CODE_SIGNING_MONITOR
22391 
22392 kern_return_t
22393 vm_map_entry_cs_associate(
22394 	vm_map_t                map,
22395 	vm_map_entry_t          entry,
22396 	vm_map_kernel_flags_t   vmk_flags)
22397 {
22398 	vm_object_t cs_object, cs_shadow, backing_object;
22399 	vm_object_offset_t cs_offset, backing_offset;
22400 	void *cs_blobs;
22401 	struct vnode *cs_vnode;
22402 	kern_return_t cs_ret;
22403 
22404 	if (map->pmap == NULL ||
22405 	    entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
22406 	    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
22407 	    VME_OBJECT(entry) == VM_OBJECT_NULL) {
22408 		return KERN_SUCCESS;
22409 	}
22410 
22411 	if (!(entry->protection & VM_PROT_EXECUTE)) {
22412 		/*
22413 		 * This memory region is not executable, so the code-signing
22414 		 * monitor would usually not care about it...
22415 		 */
22416 		if (vmk_flags.vmkf_remap_prot_copy &&
22417 		    (entry->max_protection & VM_PROT_EXECUTE)) {
22418 			/*
22419 			 * ... except if the memory region is being remapped
22420 			 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
22421 			 * which is what a debugger or dtrace would be doing
22422 			 * to prepare to modify an executable page to insert
22423 			 * a breakpoint or activate a probe.
22424 			 * In that case, fall through so that we can mark
22425 			 * this region as being "debugged" and no longer
22426 			 * strictly code-signed.
22427 			 */
22428 		} else {
22429 			/*
22430 			 * Really not executable, so no need to tell the
22431 			 * code-signing monitor.
22432 			 */
22433 			return KERN_SUCCESS;
22434 		}
22435 	}
22436 
22437 	vm_map_lock_assert_exclusive(map);
22438 
22439 	/*
22440 	 * Check for a debug association mapping before we check for used_for_jit. This
22441 	 * allows non-RWX JIT on macOS systems to masquerade their mappings as USER_DEBUG
22442 	 * pages instead of USER_JIT. These non-RWX JIT pages cannot be marked as USER_JIT
22443 	 * since they are mapped with RW or RX permissions, which the page table monitor
22444 	 * denies on USER_JIT pages. Given that, if they're not mapped as USER_DEBUG,
22445 	 * they will be mapped as USER_EXEC, and that will cause another page table monitor
22446 	 * violation when those USER_EXEC pages are mapped as RW.
22447 	 *
22448 	 * Since these pages switch between RW and RX through mprotect, they mimic what
22449 	 * we expect a debugger to do. As the code signing monitor does not enforce mappings
22450 	 * on macOS systems, this works in our favor here and allows us to continue to
22451 	 * support these legacy-programmed applications without sacrificing security on
22452 	 * the page table or the code signing monitor. We don't need to explicitly check
22453 	 * for entry_for_jit here and the mapping permissions. If the initial mapping is
22454 	 * created with RX, then the application must map it as RW in order to first write
22455 	 * to the page (MAP_JIT mappings must be private and anonymous). The switch to
22456 	 * RX will cause vm_map_protect to mark the entry as vmkf_remap_prot_copy.
22457 	 * Similarly, if the mapping was created as RW, and then switched to RX,
22458 	 * vm_map_protect will again mark the entry as a copy, and both these cases
22459 	 * lead to this if-statement being entered.
22460 	 *
22461 	 * For more information: rdar://115313336.
22462 	 */
22463 	if (vmk_flags.vmkf_remap_prot_copy) {
22464 		cs_ret = csm_associate_debug_region(
22465 			map->pmap,
22466 			entry->vme_start,
22467 			entry->vme_end - entry->vme_start);
22468 
22469 		/*
22470 		 * csm_associate_debug_region returns not supported when the code signing
22471 		 * monitor is disabled. This is intentional, since cs_ret is checked towards
22472 		 * the end of the function, and if it is not supported, then we still want the
22473 		 * VM to perform code-signing enforcement on this entry. That said, if we don't
22474 		 * mark this as a xnu_user_debug page when the code-signing monitor is disabled,
22475 		 * then it never gets retyped to XNU_USER_DEBUG frame type, which then causes
22476 		 * an issue with debugging (since it'll be mapped in as XNU_USER_EXEC in some
22477 		 * cases, which will cause a violation when attempted to be mapped as writable).
22478 		 */
22479 		if ((cs_ret == KERN_SUCCESS) || (cs_ret == KERN_NOT_SUPPORTED)) {
22480 			entry->vme_xnu_user_debug = TRUE;
22481 		}
22482 #if DEVELOPMENT || DEBUG
22483 		if (vm_log_xnu_user_debug) {
22484 			printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ]  vme_xnu_user_debug=%d cs_ret %d\n",
22485 			    proc_selfpid(),
22486 			    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
22487 			    __FUNCTION__, __LINE__,
22488 			    map, entry,
22489 			    (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
22490 			    entry->vme_xnu_user_debug,
22491 			    cs_ret);
22492 		}
22493 #endif /* DEVELOPMENT || DEBUG */
22494 		goto done;
22495 	}
22496 
22497 	if (entry->used_for_jit) {
22498 		cs_ret = csm_associate_jit_region(
22499 			map->pmap,
22500 			entry->vme_start,
22501 			entry->vme_end - entry->vme_start);
22502 		goto done;
22503 	}
22504 
22505 	cs_object = VME_OBJECT(entry);
22506 	vm_object_lock_shared(cs_object);
22507 	cs_offset = VME_OFFSET(entry);
22508 
22509 	/* find the VM object backed by the code-signed vnode */
22510 	for (;;) {
22511 		/* go to the bottom of cs_object's shadow chain */
22512 		for (;
22513 		    cs_object->shadow != VM_OBJECT_NULL;
22514 		    cs_object = cs_shadow) {
22515 			cs_shadow = cs_object->shadow;
22516 			cs_offset += cs_object->vo_shadow_offset;
22517 			vm_object_lock_shared(cs_shadow);
22518 			vm_object_unlock(cs_object);
22519 		}
22520 		if (cs_object->internal ||
22521 		    cs_object->pager == MEMORY_OBJECT_NULL) {
22522 			vm_object_unlock(cs_object);
22523 			return KERN_SUCCESS;
22524 		}
22525 
22526 		cs_offset += cs_object->paging_offset;
22527 
22528 		/*
22529 		 * cs_object could be backed by a:
22530 		 *      vnode_pager
22531 		 *	apple_protect_pager
22532 		 *      shared_region_pager
22533 		 *	fourk_pager (multiple backing objects -> fail?)
22534 		 * ask the pager if it has a backing VM object
22535 		 */
22536 		if (!memory_object_backing_object(cs_object->pager,
22537 		    cs_offset,
22538 		    &backing_object,
22539 		    &backing_offset)) {
22540 			/* no backing object: cs_object is it */
22541 			break;
22542 		}
22543 
22544 		/* look down the backing object's shadow chain */
22545 		vm_object_lock_shared(backing_object);
22546 		vm_object_unlock(cs_object);
22547 		cs_object = backing_object;
22548 		cs_offset = backing_offset;
22549 	}
22550 
22551 	cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
22552 	if (cs_vnode == NULL) {
22553 		/* no vnode, no code signatures to associate */
22554 		cs_ret = KERN_SUCCESS;
22555 	} else {
22556 		cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
22557 		    &cs_blobs);
22558 		assert(cs_ret == KERN_SUCCESS);
22559 		cs_ret = cs_associate_blob_with_mapping(map->pmap,
22560 		    entry->vme_start,
22561 		    (entry->vme_end - entry->vme_start),
22562 		    cs_offset,
22563 		    cs_blobs);
22564 	}
22565 	vm_object_unlock(cs_object);
22566 	cs_object = VM_OBJECT_NULL;
22567 
22568 done:
22569 	if (cs_ret == KERN_SUCCESS) {
22570 		DTRACE_VM2(vm_map_entry_cs_associate_success,
22571 		    vm_map_offset_t, entry->vme_start,
22572 		    vm_map_offset_t, entry->vme_end);
22573 		if (vm_map_executable_immutable) {
22574 			/*
22575 			 * Prevent this executable
22576 			 * mapping from being unmapped
22577 			 * or modified.
22578 			 */
22579 			entry->vme_permanent = TRUE;
22580 		}
22581 		/*
22582 		 * pmap says it will validate the
22583 		 * code-signing validity of pages
22584 		 * faulted in via this mapping, so
22585 		 * this map entry should be marked so
22586 		 * that vm_fault() bypasses code-signing
22587 		 * validation for faults coming through
22588 		 * this mapping.
22589 		 */
22590 		entry->csm_associated = TRUE;
22591 	} else if (cs_ret == KERN_NOT_SUPPORTED) {
22592 		/*
22593 		 * pmap won't check the code-signing
22594 		 * validity of pages faulted in via
22595 		 * this mapping, so VM should keep
22596 		 * doing it.
22597 		 */
22598 		DTRACE_VM3(vm_map_entry_cs_associate_off,
22599 		    vm_map_offset_t, entry->vme_start,
22600 		    vm_map_offset_t, entry->vme_end,
22601 		    int, cs_ret);
22602 	} else {
22603 		/*
22604 		 * A real error: do not allow
22605 		 * execution in this mapping.
22606 		 */
22607 		DTRACE_VM3(vm_map_entry_cs_associate_failure,
22608 		    vm_map_offset_t, entry->vme_start,
22609 		    vm_map_offset_t, entry->vme_end,
22610 		    int, cs_ret);
22611 		if (vmk_flags.vmkf_overwrite_immutable) {
22612 			/*
22613 			 * We can get here when we remap an apple_protect pager
22614 			 * on top of an already cs_associated executable mapping
22615 			 * with the same code signatures, so we don't want to
22616 			 * lose VM_PROT_EXECUTE in that case...
22617 			 */
22618 		} else {
22619 			entry->protection &= ~VM_PROT_ALLEXEC;
22620 			entry->max_protection &= ~VM_PROT_ALLEXEC;
22621 		}
22622 	}
22623 
22624 	return cs_ret;
22625 }
22626 
22627 #endif /* CODE_SIGNING_MONITOR */
22628 
22629 inline bool
22630 vm_map_is_corpse_source(vm_map_t map)
22631 {
22632 	bool status = false;
22633 	if (map) {
22634 		vm_map_lock_read(map);
22635 		status = map->corpse_source;
22636 		vm_map_unlock_read(map);
22637 	}
22638 	return status;
22639 }
22640 
22641 inline void
22642 vm_map_set_corpse_source(vm_map_t map)
22643 {
22644 	if (map) {
22645 		vm_map_lock(map);
22646 		map->corpse_source = true;
22647 		vm_map_unlock(map);
22648 	}
22649 }
22650 
22651 inline void
22652 vm_map_unset_corpse_source(vm_map_t map)
22653 {
22654 	if (map) {
22655 		vm_map_lock(map);
22656 		map->corpse_source = false;
22657 		vm_map_unlock(map);
22658 	}
22659 }
22660 /*
22661  * FORKED CORPSE FOOTPRINT
22662  *
22663  * A forked corpse gets a copy of the original VM map but its pmap is mostly
22664  * empty since it never ran and never got to fault in any pages.
22665  * Collecting footprint info (via "sysctl vm.self_region_footprint") for
22666  * a forked corpse would therefore return very little information.
22667  *
22668  * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
22669  * to vm_map_fork() to collect footprint information from the original VM map
22670  * and its pmap, and store it in the forked corpse's VM map.  That information
22671  * is stored in place of the VM map's "hole list" since we'll never need to
22672  * lookup for holes in the corpse's map.
22673  *
22674  * The corpse's footprint info looks like this:
22675  *
22676  * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
22677  * as follows:
22678  *                     +---------------------------------------+
22679  *            header-> | cf_size                               |
22680  *                     +-------------------+-------------------+
22681  *                     | cf_last_region    | cf_last_zeroes    |
22682  *                     +-------------------+-------------------+
22683  *           region1-> | cfr_vaddr                             |
22684  *                     +-------------------+-------------------+
22685  *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
22686  *                     +---------------------------------------+
22687  *                     | d4 | d5 | ...                         |
22688  *                     +---------------------------------------+
22689  *                     | ...                                   |
22690  *                     +-------------------+-------------------+
22691  *                     | dy | dz | na | na | cfr_vaddr...      | <-region2
22692  *                     +-------------------+-------------------+
22693  *                     | cfr_vaddr (ctd)   | cfr_num_pages     |
22694  *                     +---------------------------------------+
22695  *                     | d0 | d1 ...                           |
22696  *                     +---------------------------------------+
22697  *                       ...
22698  *                     +---------------------------------------+
22699  *       last region-> | cfr_vaddr                             |
22700  *                     +---------------------------------------+
22701  *                     + cfr_num_pages     | d0 | d1 | d2 | d3 |
22702  *                     +---------------------------------------+
22703  *                       ...
22704  *                     +---------------------------------------+
22705  *                     | dx | dy | dz | na | na | na | na | na |
22706  *                     +---------------------------------------+
22707  *
22708  * where:
22709  *      cf_size:	total size of the buffer (rounded to page size)
22710  *      cf_last_region:	offset in the buffer of the last "region" sub-header
22711  *	cf_last_zeroes: number of trailing "zero" dispositions at the end
22712  *			of last region
22713  *	cfr_vaddr:	virtual address of the start of the covered "region"
22714  *	cfr_num_pages:	number of pages in the covered "region"
22715  *	d*:		disposition of the page at that virtual address
22716  * Regions in the buffer are word-aligned.
22717  *
22718  * We estimate the size of the buffer based on the number of memory regions
22719  * and the virtual size of the address space.  While copying each memory region
22720  * during vm_map_fork(), we also collect the footprint info for that region
22721  * and store it in the buffer, packing it as much as possible (coalescing
22722  * contiguous memory regions to avoid having too many region headers and
22723  * avoiding long streaks of "zero" page dispositions by splitting footprint
22724  * "regions", so the number of regions in the footprint buffer might not match
22725  * the number of memory regions in the address space.
22726  *
22727  * We also have to copy the original task's "nonvolatile" ledgers since that's
22728  * part of the footprint and will need to be reported to any tool asking for
22729  * the footprint information of the forked corpse.
22730  */
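/*
 * Rough sizing example, under assumed figures (the exact struct sizes and
 * caps appear below): for an address space with 1,000 map entries and 512MB
 * of mapped virtual size at 16K pages, the initial estimate is roughly
 *	header (~16 bytes)
 *	+ 1,000 * (region header + 3 bytes of alignment slack)
 *	+ (512MB / 16K) * sizeof(cf_disp_t) = 32,768 bytes of dispositions
 * i.e. on the order of 48KB before page rounding, well under the
 * per-platform cap.
 */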
22731 
22732 uint64_t vm_map_corpse_footprint_count = 0;
22733 uint64_t vm_map_corpse_footprint_size_avg = 0;
22734 uint64_t vm_map_corpse_footprint_size_max = 0;
22735 uint64_t vm_map_corpse_footprint_full = 0;
22736 uint64_t vm_map_corpse_footprint_no_buf = 0;
22737 
22738 struct vm_map_corpse_footprint_header {
22739 	vm_size_t       cf_size;        /* allocated buffer size */
22740 	uint32_t        cf_last_region; /* offset of last region in buffer */
22741 	union {
22742 		uint32_t cfu_last_zeroes; /* during creation:
22743 		                           * number of "zero" dispositions at
22744 		                           * end of last region */
22745 		uint32_t cfu_hint_region; /* during lookup:
22746 		                           * offset of last looked up region */
22747 #define cf_last_zeroes cfu.cfu_last_zeroes
22748 #define cf_hint_region cfu.cfu_hint_region
22749 	} cfu;
22750 };
22751 typedef uint8_t cf_disp_t;
22752 struct vm_map_corpse_footprint_region {
22753 	vm_map_offset_t cfr_vaddr;      /* region start virtual address */
22754 	uint32_t        cfr_num_pages;  /* number of pages in this "region" */
22755 	cf_disp_t   cfr_disposition[0]; /* disposition of each page */
22756 } __attribute__((packed));
22757 
22758 static cf_disp_t
22759 vm_page_disposition_to_cf_disp(
22760 	int disposition)
22761 {
22762 	assert(sizeof(cf_disp_t) == 1);
22763 	/* relocate bits that don't fit in a "uint8_t" */
22764 	if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
22765 		disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
22766 	}
22767 	/* cast gets rid of extra bits */
22768 	return (cf_disp_t) disposition;
22769 }
22770 
22771 static int
22772 vm_page_cf_disp_to_disposition(
22773 	cf_disp_t cf_disp)
22774 {
22775 	int disposition;
22776 
22777 	assert(sizeof(cf_disp_t) == 1);
22778 	disposition = (int) cf_disp;
22779 	/* move relocated bits back in place */
22780 	if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
22781 		disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
22782 		disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
22783 	}
22784 	return disposition;
22785 }
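/*
 * Illustrative round trip: VM_PAGE_QUERY_PAGE_REUSABLE does not fit in a
 * uint8_t, so it is parked in the VM_PAGE_QUERY_PAGE_FICTITIOUS bit while
 * stored and moved back on the way out:
 *
 *	cf_disp_t d = vm_page_disposition_to_cf_disp(
 *		VM_PAGE_QUERY_PAGE_PRESENT | VM_PAGE_QUERY_PAGE_REUSABLE);
 *	int disp = vm_page_cf_disp_to_disposition(d);
 *	assert(disp == (VM_PAGE_QUERY_PAGE_PRESENT |
 *	    VM_PAGE_QUERY_PAGE_REUSABLE));
 */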
22786 
22787 /*
22788  * vm_map_corpse_footprint_new_region:
22789  *      closes the current footprint "region" and creates a new one
22790  *
22791  * Returns NULL if there's not enough space in the buffer for a new region.
22792  */
22793 static struct vm_map_corpse_footprint_region *
22794 vm_map_corpse_footprint_new_region(
22795 	struct vm_map_corpse_footprint_header *footprint_header)
22796 {
22797 	uintptr_t       footprint_edge;
22798 	uint32_t        new_region_offset;
22799 	struct vm_map_corpse_footprint_region *footprint_region;
22800 	struct vm_map_corpse_footprint_region *new_footprint_region;
22801 
22802 	footprint_edge = ((uintptr_t)footprint_header +
22803 	    footprint_header->cf_size);
22804 	footprint_region = ((struct vm_map_corpse_footprint_region *)
22805 	    ((char *)footprint_header +
22806 	    footprint_header->cf_last_region));
22807 	assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
22808 	    footprint_edge);
22809 
22810 	/* get rid of trailing zeroes in the last region */
22811 	assert(footprint_region->cfr_num_pages >=
22812 	    footprint_header->cf_last_zeroes);
22813 	footprint_region->cfr_num_pages -=
22814 	    footprint_header->cf_last_zeroes;
22815 	footprint_header->cf_last_zeroes = 0;
22816 
22817 	/* reuse this region if it's now empty */
22818 	if (footprint_region->cfr_num_pages == 0) {
22819 		return footprint_region;
22820 	}
22821 
22822 	/* compute offset of new region */
22823 	new_region_offset = footprint_header->cf_last_region;
22824 	new_region_offset += sizeof(*footprint_region);
22825 	new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
22826 	new_region_offset = roundup(new_region_offset, sizeof(int));
22827 
22828 	/* check if we're going over the edge */
22829 	if (((uintptr_t)footprint_header +
22830 	    new_region_offset +
22831 	    sizeof(*footprint_region)) >=
22832 	    footprint_edge) {
22833 		/* over the edge: no new region */
22834 		return NULL;
22835 	}
22836 
22837 	/* adjust offset of last region in header */
22838 	footprint_header->cf_last_region = new_region_offset;
22839 
22840 	new_footprint_region = (struct vm_map_corpse_footprint_region *)
22841 	    ((char *)footprint_header +
22842 	    footprint_header->cf_last_region);
22843 	new_footprint_region->cfr_vaddr = 0;
22844 	new_footprint_region->cfr_num_pages = 0;
22845 	/* caller needs to initialize new region */
22846 
22847 	return new_footprint_region;
22848 }
22849 
22850 /*
22851  * vm_map_corpse_footprint_collect:
22852  *	collect footprint information for "old_entry" in "old_map" and
22853  *	stores it in "new_map"'s vmmap_footprint_info.
22854  */
22855 kern_return_t
22856 vm_map_corpse_footprint_collect(
22857 	vm_map_t        old_map,
22858 	vm_map_entry_t  old_entry,
22859 	vm_map_t        new_map)
22860 {
22861 	vm_map_offset_t va;
22862 	kern_return_t   kr;
22863 	struct vm_map_corpse_footprint_header *footprint_header;
22864 	struct vm_map_corpse_footprint_region *footprint_region;
22865 	struct vm_map_corpse_footprint_region *new_footprint_region;
22866 	cf_disp_t       *next_disp_p;
22867 	uintptr_t       footprint_edge;
22868 	uint32_t        num_pages_tmp;
22869 	int             effective_page_size;
22870 
22871 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
22872 
22873 	va = old_entry->vme_start;
22874 
22875 	vm_map_lock_assert_exclusive(old_map);
22876 	vm_map_lock_assert_exclusive(new_map);
22877 
22878 	assert(new_map->has_corpse_footprint);
22879 	assert(!old_map->has_corpse_footprint);
22880 	if (!new_map->has_corpse_footprint ||
22881 	    old_map->has_corpse_footprint) {
22882 		/*
22883 		 * This can only transfer footprint info from a
22884 		 * map with a live pmap to a map with a corpse footprint.
22885 		 */
22886 		return KERN_NOT_SUPPORTED;
22887 	}
22888 
22889 	if (new_map->vmmap_corpse_footprint == NULL) {
22890 		vm_offset_t     buf;
22891 		vm_size_t       buf_size;
22892 
22893 		buf = 0;
22894 		buf_size = (sizeof(*footprint_header) +
22895 		    (old_map->hdr.nentries
22896 		    *
22897 		    (sizeof(*footprint_region) +
22898 		    +3))            /* potential alignment for each region */
22899 		    +
22900 		    ((old_map->size / effective_page_size)
22901 		    *
22902 		    sizeof(cf_disp_t)));      /* disposition for each page */
22903 //		printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
22904 		buf_size = round_page(buf_size);
22905 
22906 		/* limit buffer to 1 page to validate overflow detection */
22907 //		buf_size = PAGE_SIZE;
22908 
22909 		/* limit size to a somewhat sane amount */
22910 #if XNU_TARGET_OS_OSX
22911 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (8*1024*1024)   /* 8MB */
22912 #else /* XNU_TARGET_OS_OSX */
22913 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (256*1024)      /* 256KB */
22914 #endif /* XNU_TARGET_OS_OSX */
22915 		if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
22916 			buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
22917 		}
22918 
22919 		/*
22920 		 * Allocate the pageable buffer (with a trailing guard page).
22921 		 * It will be zero-filled on demand.
22922 		 */
22923 		kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
22924 		    KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
22925 		    VM_KERN_MEMORY_DIAG);
22926 		if (kr != KERN_SUCCESS) {
22927 			vm_map_corpse_footprint_no_buf++;
22928 			return kr;
22929 		}
22930 
22931 		/* initialize header and 1st region */
22932 		footprint_header = (struct vm_map_corpse_footprint_header *)buf;
22933 		new_map->vmmap_corpse_footprint = footprint_header;
22934 
22935 		footprint_header->cf_size = buf_size;
22936 		footprint_header->cf_last_region =
22937 		    sizeof(*footprint_header);
22938 		footprint_header->cf_last_zeroes = 0;
22939 
22940 		footprint_region = (struct vm_map_corpse_footprint_region *)
22941 		    ((char *)footprint_header +
22942 		    footprint_header->cf_last_region);
22943 		footprint_region->cfr_vaddr = 0;
22944 		footprint_region->cfr_num_pages = 0;
22945 	} else {
22946 		/* retrieve header and last region */
22947 		footprint_header = (struct vm_map_corpse_footprint_header *)
22948 		    new_map->vmmap_corpse_footprint;
22949 		footprint_region = (struct vm_map_corpse_footprint_region *)
22950 		    ((char *)footprint_header +
22951 		    footprint_header->cf_last_region);
22952 	}
22953 	footprint_edge = ((uintptr_t)footprint_header +
22954 	    footprint_header->cf_size);
22955 
22956 	if ((footprint_region->cfr_vaddr +
22957 	    (((vm_map_offset_t)footprint_region->cfr_num_pages) *
22958 	    effective_page_size))
22959 	    != old_entry->vme_start) {
22960 		uint64_t num_pages_delta, num_pages_delta_size;
22961 		uint32_t region_offset_delta_size;
22962 
22963 		/*
22964 		 * Not the next contiguous virtual address:
22965 		 * start a new region or store "zero" dispositions for
22966 		 * the missing pages?
22967 		 */
22968 		/* size of gap in actual page dispositions */
22969 		num_pages_delta = ((old_entry->vme_start -
22970 		    footprint_region->cfr_vaddr) / effective_page_size)
22971 		    - footprint_region->cfr_num_pages;
22972 		num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
22973 		/* size of gap as a new footprint region header */
22974 		region_offset_delta_size =
22975 		    (sizeof(*footprint_region) +
22976 		    roundup(((footprint_region->cfr_num_pages -
22977 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
22978 		    sizeof(int)) -
22979 		    ((footprint_region->cfr_num_pages -
22980 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
22981 //		printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
22982 		if (region_offset_delta_size < num_pages_delta_size ||
22983 		    os_add3_overflow(footprint_region->cfr_num_pages,
22984 		    (uint32_t) num_pages_delta,
22985 		    1,
22986 		    &num_pages_tmp)) {
22987 			/*
22988 			 * Storing data for this gap would take more space
22989 			 * than inserting a new footprint region header:
22990 			 * let's start a new region and save space. If it's a
22991 			 * tie, let's avoid using a new region, since that
22992 			 * would require more region hops to find the right
22993 			 * range during lookups.
22994 			 *
22995 			 * If the current region's cfr_num_pages would overflow
22996 			 * if we added "zero" page dispositions for the gap,
22997 			 * no choice but to start a new region.
22998 			 */
22999 //			printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
23000 			new_footprint_region =
23001 			    vm_map_corpse_footprint_new_region(footprint_header);
23002 			/* check that we're not going over the edge */
23003 			if (new_footprint_region == NULL) {
23004 				goto over_the_edge;
23005 			}
23006 			footprint_region = new_footprint_region;
23007 			/* initialize new region as empty */
23008 			footprint_region->cfr_vaddr = old_entry->vme_start;
23009 			footprint_region->cfr_num_pages = 0;
23010 		} else {
23011 			/*
23012 			 * Store "zero" page dispositions for the missing
23013 			 * pages.
23014 			 */
23015 //			printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
23016 			for (; num_pages_delta > 0; num_pages_delta--) {
23017 				next_disp_p = (cf_disp_t *)
23018 				    ((uintptr_t) footprint_region +
23019 				    sizeof(*footprint_region));
23020 				next_disp_p += footprint_region->cfr_num_pages;
23021 				/* check that we're not going over the edge */
23022 				if ((uintptr_t)next_disp_p >= footprint_edge) {
23023 					goto over_the_edge;
23024 				}
23025 				/* store "zero" disposition for this gap page */
23026 				footprint_region->cfr_num_pages++;
23027 				*next_disp_p = (cf_disp_t) 0;
23028 				footprint_header->cf_last_zeroes++;
23029 			}
23030 		}
23031 	}
23032 
23033 	for (va = old_entry->vme_start;
23034 	    va < old_entry->vme_end;
23035 	    va += effective_page_size) {
23036 		int             disposition;
23037 		cf_disp_t       cf_disp;
23038 
23039 		vm_map_footprint_query_page_info(old_map,
23040 		    old_entry,
23041 		    va,
23042 		    &disposition);
23043 		cf_disp = vm_page_disposition_to_cf_disp(disposition);
23044 
23045 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
23046 
23047 		if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
23048 			/*
23049 			 * Ignore "zero" dispositions at start of
23050 			 * region: just move start of region.
23051 			 */
23052 			footprint_region->cfr_vaddr += effective_page_size;
23053 			continue;
23054 		}
23055 
23056 		/* would region's cfr_num_pages overflow? */
23057 		if (os_add_overflow(footprint_region->cfr_num_pages, 1,
23058 		    &num_pages_tmp)) {
23059 			/* overflow: create a new region */
23060 			new_footprint_region =
23061 			    vm_map_corpse_footprint_new_region(
23062 				footprint_header);
23063 			if (new_footprint_region == NULL) {
23064 				goto over_the_edge;
23065 			}
23066 			footprint_region = new_footprint_region;
23067 			footprint_region->cfr_vaddr = va;
23068 			footprint_region->cfr_num_pages = 0;
23069 		}
23070 
23071 		next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
23072 		    sizeof(*footprint_region));
23073 		next_disp_p += footprint_region->cfr_num_pages;
23074 		/* check that we're not going over the edge */
23075 		if ((uintptr_t)next_disp_p >= footprint_edge) {
23076 			goto over_the_edge;
23077 		}
23078 		/* store this disposition */
23079 		*next_disp_p = cf_disp;
23080 		footprint_region->cfr_num_pages++;
23081 
23082 		if (cf_disp != 0) {
23083 			/* non-zero disp: break the current zero streak */
23084 			footprint_header->cf_last_zeroes = 0;
23085 			/* done */
23086 			continue;
23087 		}
23088 
23089 		/* zero disp: add to the current streak of zeroes */
23090 		footprint_header->cf_last_zeroes++;
23091 		if ((footprint_header->cf_last_zeroes +
23092 		    roundup(((footprint_region->cfr_num_pages -
23093 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
23094 		    (sizeof(int) - 1),
23095 		    sizeof(int))) <
23096 		    (sizeof(*footprint_header))) {
23097 			/*
23098 			 * There are not enough trailing "zero" dispositions
23099 			 * (+ the extra padding we would need for the previous
23100 			 * region); creating a new region would not save space
23101 			 * at this point, so let's keep this "zero" disposition
23102 			 * in this region and reconsider later.
23103 			 */
23104 			continue;
23105 		}
23106 		/*
23107 		 * Create a new region to avoid having too many consecutive
23108 		 * "zero" dispositions.
23109 		 */
23110 		new_footprint_region =
23111 		    vm_map_corpse_footprint_new_region(footprint_header);
23112 		if (new_footprint_region == NULL) {
23113 			goto over_the_edge;
23114 		}
23115 		footprint_region = new_footprint_region;
23116 		/* initialize the new region as empty ... */
23117 		footprint_region->cfr_num_pages = 0;
23118 		/* ... and skip this "zero" disp */
23119 		footprint_region->cfr_vaddr = va + effective_page_size;
23120 	}
23121 
23122 	return KERN_SUCCESS;
23123 
23124 over_the_edge:
23125 //	printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
23126 	vm_map_corpse_footprint_full++;
23127 	return KERN_RESOURCE_SHORTAGE;
23128 }
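
/*
 * Illustrative sketch (not part of this file's build): a standalone,
 * userspace mirror of the "gap" heuristic used in the collection loop
 * above, where a hole in the virtual addresses is either padded with
 * "zero" dispositions or closed by starting a new region, whichever is
 * smaller.  The struct shape, 2-byte disposition size and names below
 * (sketch_region, sketch_should_start_new_region) are simplified
 * assumptions, not the kernel's actual layout.
 */
#if 0 /* illustrative sketch only -- never compiled */
#include <stdint.h>
#include <stdio.h>

typedef uint16_t sketch_disp_t;

struct sketch_region {
	uint64_t vaddr;       /* start of region, like cfr_vaddr */
	uint32_t num_pages;   /* dispositions stored, like cfr_num_pages */
};

/* Returns 1 when a new region header is cheaper than zero-filling the gap. */
static int
sketch_should_start_new_region(uint64_t gap_pages, uint32_t trailing_zeroes,
    uint32_t num_pages)
{
	uint64_t zero_fill_size = gap_pages * sizeof(sketch_disp_t);
	/* cost of a new header, minus padding the old region owes anyway */
	uint64_t live = (num_pages - trailing_zeroes) * sizeof(sketch_disp_t);
	uint64_t header_cost = sizeof(struct sketch_region) +
	    ((live + sizeof(int) - 1) & ~(uint64_t)(sizeof(int) - 1)) - live;

	return header_cost < zero_fill_size;
}

int
main(void)
{
	/* a 3-page hole: padding it (6 bytes) beats paying for a new header */
	printf("%d\n", sketch_should_start_new_region(3, 0, 100));
	return 0;
}
#endif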
23129 
23130 /*
23131  * vm_map_corpse_footprint_collect_done:
23132  *	completes the footprint collection by getting rid of any remaining
23133  *	trailing "zero" dispositions and trimming the unused part of the
23134  *	kernel buffer
23135  */
23136 void
23137 vm_map_corpse_footprint_collect_done(
23138 	vm_map_t        new_map)
23139 {
23140 	struct vm_map_corpse_footprint_header *footprint_header;
23141 	struct vm_map_corpse_footprint_region *footprint_region;
23142 	vm_size_t       buf_size, actual_size;
23143 	kern_return_t   kr;
23144 
23145 	assert(new_map->has_corpse_footprint);
23146 	if (!new_map->has_corpse_footprint ||
23147 	    new_map->vmmap_corpse_footprint == NULL) {
23148 		return;
23149 	}
23150 
23151 	footprint_header = (struct vm_map_corpse_footprint_header *)
23152 	    new_map->vmmap_corpse_footprint;
23153 	buf_size = footprint_header->cf_size;
23154 
23155 	footprint_region = (struct vm_map_corpse_footprint_region *)
23156 	    ((char *)footprint_header +
23157 	    footprint_header->cf_last_region);
23158 
23159 	/* get rid of trailing zeroes in last region */
23160 	assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
23161 	footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
23162 	footprint_header->cf_last_zeroes = 0;
23163 
23164 	actual_size = (vm_size_t)(footprint_header->cf_last_region +
23165 	    sizeof(*footprint_region) +
23166 	    (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));
23167 
23168 //	printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
23169 	vm_map_corpse_footprint_size_avg =
23170 	    (((vm_map_corpse_footprint_size_avg *
23171 	    vm_map_corpse_footprint_count) +
23172 	    actual_size) /
23173 	    (vm_map_corpse_footprint_count + 1));
23174 	vm_map_corpse_footprint_count++;
23175 	if (actual_size > vm_map_corpse_footprint_size_max) {
23176 		vm_map_corpse_footprint_size_max = actual_size;
23177 	}
23178 
23179 	actual_size = round_page(actual_size);
23180 	if (buf_size > actual_size) {
23181 		kr = vm_deallocate(kernel_map,
23182 		    ((vm_address_t)footprint_header +
23183 		    actual_size +
23184 		    PAGE_SIZE),                 /* trailing guard page */
23185 		    (buf_size - actual_size));
23186 		assertf(kr == KERN_SUCCESS,
23187 		    "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
23188 		    footprint_header,
23189 		    (uint64_t) buf_size,
23190 		    (uint64_t) actual_size,
23191 		    kr);
23192 		kr = vm_protect(kernel_map,
23193 		    ((vm_address_t)footprint_header +
23194 		    actual_size),
23195 		    PAGE_SIZE,
23196 		    FALSE,             /* set_maximum */
23197 		    VM_PROT_NONE);
23198 		assertf(kr == KERN_SUCCESS,
23199 		    "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
23200 		    footprint_header,
23201 		    (uint64_t) buf_size,
23202 		    (uint64_t) actual_size,
23203 		    kr);
23204 	}
23205 
23206 	footprint_header->cf_size = actual_size;
23207 }
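
/*
 * Illustrative sketch (not part of this file's build): a worked example
 * of the trim math above, under assumed sizes.  With a 24-byte offset to
 * the last region, a 12-byte region header and 1000 two-byte
 * dispositions, the footprint occupies 24 + 12 + 2000 = 2036 bytes,
 * which round_page() turns into a single page; everything beyond that
 * page in the original allocation is handed back.  All numbers here are
 * assumptions for demonstration only.
 */
#if 0 /* illustrative sketch only -- never compiled */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t page_size = 16384;            /* assumed page size */
	uint64_t last_region_offset = 24;      /* assumed cf_last_region */
	uint64_t region_header = 12;           /* assumed region header size */
	uint64_t dispositions = 1000 * 2;      /* cfr_num_pages * 2 bytes */
	uint64_t actual = last_region_offset + region_header + dispositions;
	uint64_t rounded = (actual + page_size - 1) & ~(page_size - 1);

	printf("actual 0x%llx rounded 0x%llx\n",
	    (unsigned long long)actual, (unsigned long long)rounded);
	return 0;
}
#endif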
23208 
23209 /*
23210  * vm_map_corpse_footprint_query_page_info:
23211  *	retrieves the disposition of the page at virtual address "vaddr"
23212  *	in the forked corpse's VM map
23213  *
23214  * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
23215  */
23216 kern_return_t
23217 vm_map_corpse_footprint_query_page_info(
23218 	vm_map_t        map,
23219 	vm_map_offset_t va,
23220 	int             *disposition_p)
23221 {
23222 	struct vm_map_corpse_footprint_header *footprint_header;
23223 	struct vm_map_corpse_footprint_region *footprint_region;
23224 	uint32_t        footprint_region_offset;
23225 	vm_map_offset_t region_start, region_end;
23226 	int             disp_idx;
23227 	kern_return_t   kr;
23228 	int             effective_page_size;
23229 	cf_disp_t       cf_disp;
23230 
23231 	if (!map->has_corpse_footprint) {
23232 		*disposition_p = 0;
23233 		kr = KERN_INVALID_ARGUMENT;
23234 		goto done;
23235 	}
23236 
23237 	footprint_header = map->vmmap_corpse_footprint;
23238 	if (footprint_header == NULL) {
23239 		*disposition_p = 0;
23240 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23241 		kr = KERN_INVALID_ARGUMENT;
23242 		goto done;
23243 	}
23244 
23245 	/* start looking at the hint ("cf_hint_region") */
23246 	footprint_region_offset = footprint_header->cf_hint_region;
23247 
23248 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
23249 
23250 lookup_again:
23251 	if (footprint_region_offset < sizeof(*footprint_header)) {
23252 		/* hint too low: start from 1st region */
23253 		footprint_region_offset = sizeof(*footprint_header);
23254 	}
23255 	if (footprint_region_offset >= footprint_header->cf_last_region) {
23256 		/* hint too high: re-start from 1st region */
23257 		footprint_region_offset = sizeof(*footprint_header);
23258 	}
23259 	footprint_region = (struct vm_map_corpse_footprint_region *)
23260 	    ((char *)footprint_header + footprint_region_offset);
23261 	region_start = footprint_region->cfr_vaddr;
23262 	region_end = (region_start +
23263 	    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23264 	    effective_page_size));
23265 	if (va < region_start &&
23266 	    footprint_region_offset != sizeof(*footprint_header)) {
23267 		/* our range starts before the hint region */
23268 
23269 		/* reset the hint (in a racy way...) */
23270 		footprint_header->cf_hint_region = sizeof(*footprint_header);
23271 		/* lookup "va" again from 1st region */
23272 		footprint_region_offset = sizeof(*footprint_header);
23273 		goto lookup_again;
23274 	}
23275 
23276 	while (va >= region_end) {
23277 		if (footprint_region_offset >= footprint_header->cf_last_region) {
23278 			break;
23279 		}
23280 		/* skip the region's header */
23281 		footprint_region_offset += sizeof(*footprint_region);
23282 		/* skip the region's page dispositions */
23283 		footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
23284 		/* align to next word boundary */
23285 		footprint_region_offset =
23286 		    roundup(footprint_region_offset,
23287 		    sizeof(int));
23288 		footprint_region = (struct vm_map_corpse_footprint_region *)
23289 		    ((char *)footprint_header + footprint_region_offset);
23290 		region_start = footprint_region->cfr_vaddr;
23291 		region_end = (region_start +
23292 		    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23293 		    effective_page_size));
23294 	}
23295 	if (va < region_start || va >= region_end) {
23296 		/* page not found */
23297 		*disposition_p = 0;
23298 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23299 		kr = KERN_SUCCESS;
23300 		goto done;
23301 	}
23302 
23303 	/* "va" found: set the lookup hint for next lookup (in a racy way...) */
23304 	footprint_header->cf_hint_region = footprint_region_offset;
23305 
23306 	/* get page disposition for "va" in this region */
23307 	disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
23308 	cf_disp = footprint_region->cfr_disposition[disp_idx];
23309 	*disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
23310 	kr = KERN_SUCCESS;
23311 done:
23312 //	if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23313 	/* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
23314 	DTRACE_VM4(footprint_query_page_info,
23315 	    vm_map_t, map,
23316 	    vm_map_offset_t, va,
23317 	    int, *disposition_p,
23318 	    kern_return_t, kr);
23319 
23320 	return kr;
23321 }
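
/*
 * Illustrative sketch (not part of this file's build): a minimal example
 * of how a caller might walk a corpse map and pull the recorded
 * disposition for each page, assuming it already holds a corpse
 * vm_map_t and knows the range it cares about.  The helper name is
 * hypothetical; error handling is reduced to skipping pages the
 * footprint does not cover.
 */
#if 0 /* illustrative sketch only -- never compiled */
static void
sketch_dump_corpse_dispositions(vm_map_t corpse_map,
    vm_map_offset_t start, vm_map_offset_t end)
{
	int effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(corpse_map));

	for (vm_map_offset_t va = start; va < end; va += effective_page_size) {
		int disposition = 0;

		if (vm_map_corpse_footprint_query_page_info(corpse_map, va,
		    &disposition) != KERN_SUCCESS) {
			continue;       /* no footprint data for this page */
		}
		printf("va 0x%llx disposition 0x%x\n",
		    (unsigned long long)va, disposition);
	}
}
#endif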
23322 
23323 void
23324 vm_map_corpse_footprint_destroy(
23325 	vm_map_t        map)
23326 {
23327 	if (map->has_corpse_footprint &&
23328 	    map->vmmap_corpse_footprint != 0) {
23329 		struct vm_map_corpse_footprint_header *footprint_header;
23330 		vm_size_t buf_size;
23331 		kern_return_t kr;
23332 
23333 		footprint_header = map->vmmap_corpse_footprint;
23334 		buf_size = footprint_header->cf_size;
23335 		kr = vm_deallocate(kernel_map,
23336 		    (vm_offset_t) map->vmmap_corpse_footprint,
23337 		    ((vm_size_t) buf_size
23338 		    + PAGE_SIZE));                 /* trailing guard page */
23339 		assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
23340 		map->vmmap_corpse_footprint = 0;
23341 		map->has_corpse_footprint = FALSE;
23342 	}
23343 }
23344 
23345 /*
23346  * vm_map_copy_footprint_ledgers:
23347  *	copies any ledger that's relevant to the memory footprint of "old_task"
23348  *	into the forked corpse's task ("new_task")
23349  */
23350 void
23351 vm_map_copy_footprint_ledgers(
23352 	task_t  old_task,
23353 	task_t  new_task)
23354 {
23355 	vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
23356 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
23357 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
23358 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
23359 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
23360 	vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
23361 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
23362 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
23363 	vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
23364 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
23365 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
23366 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
23367 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
23368 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
23369 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
23370 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
23371 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
23372 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
23373 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
23374 	vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
23375 }
23376 
23377 /*
23378  * vm_map_copy_ledger:
23379  *	copy a single ledger from "old_task" to "new_task"
23380  */
23381 void
23382 vm_map_copy_ledger(
23383 	task_t  old_task,
23384 	task_t  new_task,
23385 	int     ledger_entry)
23386 {
23387 	ledger_amount_t old_balance, new_balance, delta;
23388 
23389 	assert(new_task->map->has_corpse_footprint);
23390 	if (!new_task->map->has_corpse_footprint) {
23391 		return;
23392 	}
23393 
23394 	/* turn off sanity checks for the ledger we're about to mess with */
23395 	ledger_disable_panic_on_negative(new_task->ledger,
23396 	    ledger_entry);
23397 
23398 	/* adjust "new_task" to match "old_task" */
23399 	ledger_get_balance(old_task->ledger,
23400 	    ledger_entry,
23401 	    &old_balance);
23402 	ledger_get_balance(new_task->ledger,
23403 	    ledger_entry,
23404 	    &new_balance);
23405 	if (new_balance == old_balance) {
23406 		/* new == old: done */
23407 	} else if (new_balance > old_balance) {
23408 		/* new > old ==> new -= new - old */
23409 		delta = new_balance - old_balance;
23410 		ledger_debit(new_task->ledger,
23411 		    ledger_entry,
23412 		    delta);
23413 	} else {
23414 		/* new < old ==> new += old - new */
23415 		delta = old_balance - new_balance;
23416 		ledger_credit(new_task->ledger,
23417 		    ledger_entry,
23418 		    delta);
23419 	}
23420 }
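
/*
 * Illustrative sketch (not part of this file's build): the debit/credit
 * logic above simply drives the new task's balance to match the old
 * task's.  A standalone mirror of that arithmetic with plain integers;
 * the helper name is hypothetical.
 */
#if 0 /* illustrative sketch only -- never compiled */
#include <stdint.h>
#include <assert.h>

static int64_t
sketch_match_balance(int64_t old_balance, int64_t new_balance)
{
	if (new_balance > old_balance) {
		new_balance -= (new_balance - old_balance);   /* debit */
	} else if (new_balance < old_balance) {
		new_balance += (old_balance - new_balance);   /* credit */
	}
	return new_balance;
}

int
main(void)
{
	assert(sketch_match_balance(100, 250) == 100);
	assert(sketch_match_balance(100, 30) == 100);
	return 0;
}
#endif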
23421 
23422 /*
23423  * vm_map_get_pmap:
23424  * returns the pmap associated with the vm_map
23425  */
23426 pmap_t
23427 vm_map_get_pmap(vm_map_t map)
23428 {
23429 	return vm_map_pmap(map);
23430 }
23431 
23432 #if CONFIG_MAP_RANGES
23433 static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];
23434 
23435 static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
23436 static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
23437 
23438 /*
23439  * vm_map_range_map_init:
23440  *  initializes the VM range ID map to enable index lookup
23441  *  of user VM ranges based on VM tag from userspace.
23442  */
23443 static void
23444 vm_map_range_map_init(void)
23445 {
23446 	/*
23447 	 * VM_MEMORY_MALLOC{,_NANO} are skipped on purpose:
23448 	 * - the former is malloc metadata which should be kept separate
23449 	 * - the latter has its own ranges
23450 	 */
23451 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
23452 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
23453 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
23454 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
23455 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
23456 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
23457 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
23458 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_TCMALLOC);
23459 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LIBNETWORK);
23460 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOACCELERATOR);
23461 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOSURFACE);
23462 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IMAGEIO);
23463 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREGRAPHICS);
23464 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_CORESERVICES);
23465 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREDATA);
23466 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LAYERKIT);
23467 }
23468 
23469 static struct mach_vm_range
23470 vm_map_range_random_uniform(
23471 	vm_map_size_t           req_size,
23472 	vm_map_offset_t         min_addr,
23473 	vm_map_offset_t         max_addr,
23474 	vm_map_offset_t         offmask)
23475 {
23476 	vm_map_offset_t random_addr;
23477 	struct mach_vm_range alloc;
23478 
23479 	req_size = (req_size + offmask) & ~offmask;
23480 	min_addr = (min_addr + offmask) & ~offmask;
23481 	max_addr = max_addr & ~offmask;
23482 
23483 	read_random(&random_addr, sizeof(random_addr));
23484 	random_addr %= (max_addr - req_size - min_addr);
23485 	random_addr &= ~offmask;
23486 
23487 	alloc.min_address = min_addr + random_addr;
23488 	alloc.max_address = min_addr + random_addr + req_size;
23489 	return alloc;
23490 }
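
/*
 * Illustrative sketch (not part of this file's build): a standalone
 * mirror of the placement math above.  The request size and bounds are
 * aligned to (offmask + 1), a random offset is drawn in
 * [0, max - size - min) and truncated to the same alignment, and the
 * resulting range is returned.  read_random() is replaced by a
 * caller-supplied value; all names and numbers are assumptions.
 */
#if 0 /* illustrative sketch only -- never compiled */
#include <stdint.h>
#include <stdio.h>

struct sketch_range { uint64_t min_address, max_address; };

static struct sketch_range
sketch_place(uint64_t req_size, uint64_t min_addr, uint64_t max_addr,
    uint64_t offmask, uint64_t random_value)
{
	struct sketch_range r;

	req_size = (req_size + offmask) & ~offmask;
	min_addr = (min_addr + offmask) & ~offmask;
	max_addr = max_addr & ~offmask;

	random_value %= (max_addr - req_size - min_addr);
	random_value &= ~offmask;

	r.min_address = min_addr + random_value;
	r.max_address = min_addr + random_value + req_size;
	return r;
}

int
main(void)
{
	/* 1T request in [0x61T, 0x7fT), 64G-1 offmask, arbitrary entropy */
	struct sketch_range r = sketch_place(1ull << 40, 0x61ull << 40,
	    0x7full << 40, (1ull << 36) - 1, 0x123456789abcdefull);

	printf("[0x%llx, 0x%llx)\n",
	    (unsigned long long)r.min_address,
	    (unsigned long long)r.max_address);
	return 0;
}
#endif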
23491 
23492 static vm_map_offset_t
23493 vm_map_range_offmask(void)
23494 {
23495 	uint32_t pte_depth;
23496 
23497 	/*
23498 	 * PTE optimizations
23499 	 *
23500 	 *
23501 	 * 16k pages systems
23502 	 * ~~~~~~~~~~~~~~~~~
23503 	 *
23504 	 * A single L1 (sub-)page covers the address space.
23505 	 * - L2 pages cover 64G,
23506 	 * - L3 pages cover 32M.
23507 	 *
23508 	 * On embedded, the dynamic VA range is 64G and uses a single L2 page.
23509 	 * As a result, we really only need to align the ranges to 32M to avoid
23510 	 * partial L3 pages.
23511 	 *
23512 	 * On macOS, the usage of L2 pages will increase, so as a result we will
23513 	 * want to align ranges to 64G in order to utilize them fully.
23514 	 *
23515 	 *
23516 	 * 4k pages systems
23517 	 * ~~~~~~~~~~~~~~~~
23518 	 *
23519 	 * A single L0 (sub-)page covers the address space.
23520 	 * - L1 pages cover 512G,
23521 	 * - L2 pages cover 1G,
23522 	 * - L3 pages cover 2M.
23523 	 *
23524 	 * The long tail of processes on a system will tend to have a VA usage
23525 	 * (ignoring the shared regions) in the 100s of MB order of magnitude.
23526 	 * This is achievable with a single L1 and a few L2s without
23527 	 * randomization.
23528 	 *
23529 	 * However once randomization is introduced, the system will immediately
23530 	 * need several L1s and many more L2s. As a result:
23531 	 *
23532 	 * - on embedded devices, the cost of these extra pages isn't
23533 	 *   sustainable, and we just disable the feature entirely,
23534 	 *
23535 	 * - on macOS we align ranges to a 512G boundary so that the extra L1
23536 	 *   pages can be used to their full potential.
23537 	 */
23538 
23539 	/*
23540 	 * note, this function assumes _non exotic mappings_
23541 	 * which is why it uses the native kernel's PAGE_SHIFT.
23542 	 */
23543 #if XNU_PLATFORM_MacOSX
23544 	pte_depth = PAGE_SHIFT > 12 ? 2 : 3;
23545 #else /* !XNU_PLATFORM_MacOSX */
23546 	pte_depth = PAGE_SHIFT > 12 ? 1 : 0;
23547 #endif /* !XNU_PLATFORM_MacOSX */
23548 
23549 	if (pte_depth == 0) {
23550 		return 0;
23551 	}
23552 
23553 	return (1ull << ((PAGE_SHIFT - 3) * pte_depth + PAGE_SHIFT)) - 1;
23554 }
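
/*
 * Illustrative sketch (not part of this file's build): worked values for
 * the offmask formula above.  With (PAGE_SHIFT - 3) index bits per page
 * table level:
 *   16K pages, depth 1: (1 << (11*1 + 14)) - 1 = 32M  - 1  (embedded)
 *   16K pages, depth 2: (1 << (11*2 + 14)) - 1 = 64G  - 1  (macOS)
 *    4K pages, depth 3: (1 << ( 9*3 + 12)) - 1 = 512G - 1  (macOS)
 *    4K pages, depth 0: 0                                   (disabled)
 * The helper name is hypothetical; the formula is the one used above.
 */
#if 0 /* illustrative sketch only -- never compiled */
#include <stdint.h>
#include <stdio.h>

static uint64_t
sketch_offmask(unsigned page_shift, unsigned pte_depth)
{
	if (pte_depth == 0) {
		return 0;
	}
	return (1ull << ((page_shift - 3) * pte_depth + page_shift)) - 1;
}

int
main(void)
{
	printf("0x%llx 0x%llx 0x%llx\n",
	    (unsigned long long)sketch_offmask(14, 1),
	    (unsigned long long)sketch_offmask(14, 2),
	    (unsigned long long)sketch_offmask(12, 3));
	return 0;
}
#endif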
23555 
23556 /*
23557  * vm_map_range_configure:
23558  *	configures the user vm_map ranges by increasing the maximum VA range of
23559  *  the map and carving out a range at the end of VA space (searching backwards
23560  *  in the newly expanded map).
23561  */
23562 kern_return_t
23563 vm_map_range_configure(vm_map_t map)
23564 {
23565 	const vm_map_offset_t offmask = vm_map_range_offmask();
23566 	struct mach_vm_range data_range;
23567 	vm_map_offset_t default_end;
23568 	kern_return_t kr;
23569 
23570 	if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) {
23571 		/*
23572 		 * No point doing vm ranges in a 32bit address space.
23573 		 */
23574 		return KERN_NOT_SUPPORTED;
23575 	}
23576 
23577 	/* Should not be applying ranges to kernel map or kernel map submaps */
23578 	assert(vm_map_pmap(map) != kernel_pmap);
23579 
23580 #if XNU_PLATFORM_MacOSX
23581 
23582 	/*
23583 	 * on macOS, the address space is a massive 47 bits (128T),
23584 	 * with several carve outs that processes can't use:
23585 	 * - the shared region
23586 	 * - the commpage region
23587 	 * - the GPU carve out (if applicable)
23588 	 *
23589 	 * and when nano-malloc is in use it desires memory at the 96T mark.
23590 	 *
23591 	 * However, their location is architecture dependent:
23592 	 * - On intel, the shared region and commpage are
23593 	 *   at the very end of the usable address space (above +127T),
23594 	 *   and there is no GPU carve out, and pthread wants to place
23595 	 *   threads at the 112T mark (0x70T).
23596 	 *
23597 	 * - On arm64, these are in the same spot as on embedded devices:
23598 	 *   o shared region:   [ 6G,  10G)  [ will likely grow over time ]
23599 	 *   o commpage region: [63G,  64G)
23600 	 *   o GPU carve out:   [64G, 448G)
23601 	 *
23602 	 * This is convenient because the mappings at the end of the address
23603 	 * space (when they exist) are made by the kernel.
23604 	 *
23605 	 * The policy is to allocate a random 1T for the data heap
23606 	 * in the end of the address-space in the:
23607 	 * - [0x71, 0x7f) range on Intel (to leave space for pthread stacks)
23608 	 * - [0x61, 0x7f) range on Apple silicon Macs (to leave space for Nano malloc).
23609 	 */
23610 
23611 	/* see NANOZONE_SIGNATURE in libmalloc */
23612 #if __x86_64__
23613 	default_end = 0x71ull << 40;
23614 #else
23615 	default_end = 0x61ull << 40;
23616 #endif
23617 	data_range  = vm_map_range_random_uniform(1ull << 40,
23618 	        default_end, 0x7full << 40, offmask);
23619 
23620 #else /* !XNU_PLATFORM_MacOSX */
23621 
23622 	/*
23623 	 * Embedded devices:
23624 	 *
23625 	 *   The default VA Size scales with the device physical memory.
23626 	 *
23627 	 *   Out of that:
23628 	 *   - the "zero" page typically uses 4G + some slide
23629 	 *   - the shared region uses SHARED_REGION_SIZE bytes (4G)
23630 	 *
23631 	 *   Without the use of jumbo or any adjustment to the address space,
23632 	 *   a default VM map typically looks like this:
23633 	 *
23634 	 *       0G -->╒════════════╕
23635 	 *             │  pagezero  │
23636 	 *             │  + slide   │
23637 	 *      ~4G -->╞════════════╡<-- vm_map_min(map)
23638 	 *             │            │
23639 	 *       6G -->├────────────┤
23640 	 *             │   shared   │
23641 	 *             │   region   │
23642 	 *      10G -->├────────────┤
23643 	 *             │            │
23644 	 *   max_va -->├────────────┤<-- vm_map_max(map)
23645 	 *             │            │
23646 	 *             ╎   jumbo    ╎
23647 	 *             ╎            ╎
23648 	 *             │            │
23649 	 *      63G -->╞════════════╡<-- MACH_VM_MAX_ADDRESS
23650 	 *             │  commpage  │
23651 	 *      64G -->├────────────┤<-- MACH_VM_MIN_GPU_CARVEOUT_ADDRESS
23652 	 *             │            │
23653 	 *             ╎    GPU     ╎
23654 	 *             ╎  carveout  ╎
23655 	 *             │            │
23656 	 *     448G -->├────────────┤<-- MACH_VM_MAX_GPU_CARVEOUT_ADDRESS
23657 	 *             │            │
23658 	 *             ╎            ╎
23659 	 *             ╎            ╎
23660 	 *             │            │
23661 	 *     512G -->╘════════════╛<-- (1ull << ARM_16K_TT_L1_SHIFT)
23662 	 *
23663 	 *   When this drawing was made, "max_va" was smaller than
23664 	 *   ARM64_MAX_OFFSET_DEVICE_LARGE (~15.5G), leaving shy of
23665 	 *   12G of address space for the zero-page, slide, files,
23666 	 *   binaries, heap ...
23667 	 *
23668 	 *   We will want to make a "heap/data" carve out inside
23669 	 *   the jumbo range of half of that usable space, assuming
23670 	 *   that this is less than a fourth of the jumbo range.
23671 	 *
23672 	 *   The assert below intends to catch when max_va grows
23673 	 *   too large for this heuristic.
23674 	 */
23675 
23676 	vm_map_lock_read(map);
23677 	default_end = vm_map_max(map);
23678 	vm_map_unlock_read(map);
23679 
23680 	/*
23681 	 * Check that we're not already jumbo'd,
23682 	 * or our address space was somehow modified.
23683 	 *
23684 	 * If so we cannot guarantee that we can set up the ranges
23685 	 * safely without interfering with the existing map.
23686 	 */
23687 	if (default_end > vm_compute_max_offset(true)) {
23688 		return KERN_NO_SPACE;
23689 	}
23690 
23691 	if (pmap_max_offset(true, ARM_PMAP_MAX_OFFSET_DEFAULT)) {
23692 		/*
23693 		 * an override boot-arg was set, disable user-ranges
23694 		 *
23695 		 * XXX: this is problematic because it means these boot-args
23696 		 *      no longer test the behavior changing the value
23697 		 *      of ARM64_MAX_OFFSET_DEVICE_* would have.
23698 		 */
23699 		return KERN_NOT_SUPPORTED;
23700 	}
23701 
23702 	/* expand the default VM space to the largest possible address */
23703 	vm_map_set_jumbo(map);
23704 
23705 	assert3u(7 * GiB(10) / 2, <=, vm_map_max(map) - default_end);
23706 	data_range = vm_map_range_random_uniform(GiB(10),
23707 	    default_end + PAGE_SIZE, vm_map_max(map), offmask);
23708 
23709 #endif /* !XNU_PLATFORM_MacOSX */
23710 
23711 	/*
23712 	 * Poke holes so that ASAN or people listing regions
23713 	 * do not think this space is free.
23714 	 */
23715 
23716 	if (default_end != data_range.min_address) {
23717 		kr = vm_map_enter(map, &default_end,
23718 		    data_range.min_address - default_end,
23719 		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
23720 		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
23721 		assert(kr == KERN_SUCCESS);
23722 	}
23723 
23724 	if (data_range.max_address != vm_map_max(map)) {
23725 		vm_map_entry_t entry;
23726 		vm_size_t size;
23727 
23728 		vm_map_lock_read(map);
23729 		vm_map_lookup_entry_or_next(map, data_range.max_address, &entry);
23730 		if (entry == vm_map_to_entry(map)) {
23731 			size = vm_map_max(map) - data_range.max_address;
23732 		} else {
23733 			size = entry->vme_start - data_range.max_address;
23734 		}
23735 		vm_map_unlock_read(map);
23736 
23737 		kr = vm_map_enter(map, &data_range.max_address, size,
23738 		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
23739 		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
23740 		assert(kr == KERN_SUCCESS);
23741 	}
23742 
23743 	vm_map_lock(map);
23744 	map->default_range.min_address = vm_map_min(map);
23745 	map->default_range.max_address = default_end;
23746 	map->data_range = data_range;
23747 	map->uses_user_ranges = true;
23748 	vm_map_unlock(map);
23749 
23750 	return KERN_SUCCESS;
23751 }
23752 
23753 /*
23754  * vm_map_range_fork:
23755  *	clones the array of ranges from old_map to new_map in support
23756  *  of a VM map fork.
23757  */
23758 void
23759 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
23760 {
23761 	if (!old_map->uses_user_ranges) {
23762 		/* nothing to do */
23763 		return;
23764 	}
23765 
23766 	new_map->default_range = old_map->default_range;
23767 	new_map->data_range = old_map->data_range;
23768 
23769 	if (old_map->extra_ranges_count) {
23770 		vm_map_user_range_t otable, ntable;
23771 		uint16_t count;
23772 
23773 		otable = old_map->extra_ranges;
23774 		count  = old_map->extra_ranges_count;
23775 		ntable = kalloc_data(count * sizeof(struct vm_map_user_range),
23776 		    Z_WAITOK | Z_ZERO | Z_NOFAIL);
23777 		memcpy(ntable, otable,
23778 		    count * sizeof(struct vm_map_user_range));
23779 
23780 		new_map->extra_ranges_count = count;
23781 		new_map->extra_ranges = ntable;
23782 	}
23783 
23784 	new_map->uses_user_ranges = true;
23785 }
23786 
23787 /*
23788  * vm_map_get_user_range:
23789  *	copy the VM user range for the given VM map and range ID.
23790  */
23791 kern_return_t
23792 vm_map_get_user_range(
23793 	vm_map_t                map,
23794 	vm_map_range_id_t       range_id,
23795 	mach_vm_range_t         range)
23796 {
23797 	if (map == NULL || !map->uses_user_ranges || range == NULL) {
23798 		return KERN_INVALID_ARGUMENT;
23799 	}
23800 
23801 	switch (range_id) {
23802 	case UMEM_RANGE_ID_DEFAULT:
23803 		*range = map->default_range;
23804 		return KERN_SUCCESS;
23805 
23806 	case UMEM_RANGE_ID_HEAP:
23807 		*range = map->data_range;
23808 		return KERN_SUCCESS;
23809 
23810 	default:
23811 		return KERN_INVALID_ARGUMENT;
23812 	}
23813 }
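
/*
 * Illustrative sketch (not part of this file's build): a minimal example
 * of querying the configured ranges of a map, assuming the caller
 * already holds a vm_map_t that went through vm_map_range_configure().
 * The helper name is hypothetical.
 */
#if 0 /* illustrative sketch only -- never compiled */
static void
sketch_print_ranges(vm_map_t map)
{
	struct mach_vm_range r;

	if (vm_map_get_user_range(map, UMEM_RANGE_ID_HEAP, &r) == KERN_SUCCESS) {
		printf("heap range    [0x%llx, 0x%llx)\n",
		    (unsigned long long)r.min_address,
		    (unsigned long long)r.max_address);
	}
	if (vm_map_get_user_range(map, UMEM_RANGE_ID_DEFAULT, &r) == KERN_SUCCESS) {
		printf("default range [0x%llx, 0x%llx)\n",
		    (unsigned long long)r.min_address,
		    (unsigned long long)r.max_address);
	}
}
#endif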
23814 
23815 static vm_map_range_id_t
23816 vm_map_user_range_resolve(
23817 	vm_map_t                map,
23818 	mach_vm_address_t       addr,
23819 	mach_vm_size_t          size,
23820 	mach_vm_range_t         range)
23821 {
23822 	struct mach_vm_range tmp;
23823 
23824 	vm_map_lock_assert_held(map);
23825 
23826 	static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
23827 	static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
23828 
23829 	if (mach_vm_range_contains(&map->default_range, addr, size)) {
23830 		if (range) {
23831 			*range = map->default_range;
23832 		}
23833 		return UMEM_RANGE_ID_DEFAULT;
23834 	}
23835 
23836 	if (mach_vm_range_contains(&map->data_range, addr, size)) {
23837 		if (range) {
23838 			*range = map->data_range;
23839 		}
23840 		return UMEM_RANGE_ID_HEAP;
23841 	}
23842 
23843 	for (size_t i = 0; i < map->extra_ranges_count; i++) {
23844 		vm_map_user_range_t r = &map->extra_ranges[i];
23845 
23846 		tmp.min_address = r->vmur_min_address;
23847 		tmp.max_address = r->vmur_max_address;
23848 
23849 		if (mach_vm_range_contains(&tmp, addr, size)) {
23850 			if (range) {
23851 				*range = tmp;
23852 			}
23853 			return r->vmur_range_id;
23854 		}
23855 	}
23856 
23857 	if (range) {
23858 		range->min_address = range->max_address = 0;
23859 	}
23860 	return UMEM_RANGE_ID_DEFAULT;
23861 }
23862 
23863 static int
23864 vm_map_user_range_cmp(const void *e1, const void *e2)
23865 {
23866 	const struct vm_map_user_range *r1 = e1;
23867 	const struct vm_map_user_range *r2 = e2;
23868 
23869 	if (r1->vmur_min_address != r2->vmur_min_address) {
23870 		return r1->vmur_min_address < r2->vmur_min_address ? -1 : 1;
23871 	}
23872 
23873 	return 0;
23874 }
23875 
23876 static int
23877 mach_vm_range_recipe_v1_cmp(const void *e1, const void *e2)
23878 {
23879 	const mach_vm_range_recipe_v1_t *r1 = e1;
23880 	const mach_vm_range_recipe_v1_t *r2 = e2;
23881 
23882 	if (r1->range.min_address != r2->range.min_address) {
23883 		return r1->range.min_address < r2->range.min_address ? -1 : 1;
23884 	}
23885 
23886 	return 0;
23887 }
23888 
23889 /*!
23890  * @function mach_vm_range_create_v1()
23891  *
23892  * @brief
23893  * Handle the backend for mach_vm_range_create() for the
23894  * MACH_VM_RANGE_FLAVOR_V1 flavor.
23895  *
23896  * @description
23897 	 * This call allows creating "ranges" in the map of a task
23898  * that have special semantics/policies around placement of
23899  * new allocations (in the vm_map_locate_space() sense).
23900  *
23901  * @returns
23902  * - KERN_SUCCESS on success
23903  * - KERN_INVALID_ARGUMENT for incorrect arguments
23904  * - KERN_NO_SPACE if the maximum amount of ranges would be exceeded
23905  * - KERN_MEMORY_PRESENT if any of the requested ranges
23906  *   overlaps with existing ranges or allocations in the map.
23907  */
23908 static kern_return_t
23909 mach_vm_range_create_v1(
23910 	vm_map_t                map,
23911 	mach_vm_range_recipe_v1_t *recipe,
23912 	uint32_t                new_count)
23913 {
23914 	const vm_offset_t mask = VM_MAP_PAGE_MASK(map);
23915 	vm_map_user_range_t table;
23916 	kern_return_t kr = KERN_SUCCESS;
23917 	uint16_t count;
23918 
23919 	struct mach_vm_range void1 = {
23920 		.min_address = map->default_range.max_address,
23921 		.max_address = map->data_range.min_address,
23922 	};
23923 	struct mach_vm_range void2 = {
23924 		.min_address = map->data_range.max_address,
23925 		.max_address = vm_map_max(map),
23926 	};
23927 
23928 	qsort(recipe, new_count, sizeof(mach_vm_range_recipe_v1_t),
23929 	    mach_vm_range_recipe_v1_cmp);
23930 
23931 	/*
23932 	 * Step 1: Validate that the recipes have no intersections.
23933 	 */
23934 
23935 	for (size_t i = 0; i < new_count; i++) {
23936 		mach_vm_range_t r = &recipe[i].range;
23937 		mach_vm_size_t s;
23938 
23939 		if (recipe[i].flags) {
23940 			return KERN_INVALID_ARGUMENT;
23941 		}
23942 
23943 		static_assert(UMEM_RANGE_ID_FIXED == MACH_VM_RANGE_FIXED);
23944 		switch (recipe[i].range_tag) {
23945 		case MACH_VM_RANGE_FIXED:
23946 			break;
23947 		default:
23948 			return KERN_INVALID_ARGUMENT;
23949 		}
23950 
23951 		if (!VM_MAP_PAGE_ALIGNED(r->min_address, mask) ||
23952 		    !VM_MAP_PAGE_ALIGNED(r->max_address, mask) ||
23953 		    r->min_address >= r->max_address) {
23954 			return KERN_INVALID_ARGUMENT;
23955 		}
23956 
23957 		s = mach_vm_range_size(r);
23958 		if (!mach_vm_range_contains(&void1, r->min_address, s) &&
23959 		    !mach_vm_range_contains(&void2, r->min_address, s)) {
23960 			return KERN_INVALID_ARGUMENT;
23961 		}
23962 
23963 		if (i > 0 && recipe[i - 1].range.max_address >
23964 		    recipe[i].range.min_address) {
23965 			return KERN_INVALID_ARGUMENT;
23966 		}
23967 	}
23968 
23969 	vm_map_lock(map);
23970 
23971 	table = map->extra_ranges;
23972 	count = map->extra_ranges_count;
23973 
23974 	if (count + new_count > VM_MAP_EXTRA_RANGES_MAX) {
23975 		kr = KERN_NO_SPACE;
23976 		goto out_unlock;
23977 	}
23978 
23979 	/*
23980 	 * Step 2: Check that there is no intersection with existing ranges.
23981 	 */
23982 
23983 	for (size_t i = 0, j = 0; i < new_count && j < count;) {
23984 		mach_vm_range_t     r1 = &recipe[i].range;
23985 		vm_map_user_range_t r2 = &table[j];
23986 
23987 		if (r1->max_address <= r2->vmur_min_address) {
23988 			i++;
23989 		} else if (r2->vmur_max_address <= r1->min_address) {
23990 			j++;
23991 		} else {
23992 			kr = KERN_MEMORY_PRESENT;
23993 			goto out_unlock;
23994 		}
23995 	}
23996 
23997 	/*
23998 	 * Step 3: commit the new ranges.
23999 	 */
24000 
24001 	static_assert(VM_MAP_EXTRA_RANGES_MAX * sizeof(struct vm_map_user_range) <=
24002 	    KALLOC_SAFE_ALLOC_SIZE);
24003 
24004 	table = krealloc_data(table,
24005 	    count * sizeof(struct vm_map_user_range),
24006 	    (count + new_count) * sizeof(struct vm_map_user_range),
24007 	    Z_ZERO | Z_WAITOK | Z_NOFAIL);
24008 
24009 	for (size_t i = 0; i < new_count; i++) {
24010 		static_assert(MACH_VM_MAX_ADDRESS < (1ull << 56));
24011 
24012 		table[count + i] = (struct vm_map_user_range){
24013 			.vmur_min_address = recipe[i].range.min_address,
24014 			.vmur_max_address = recipe[i].range.max_address,
24015 			.vmur_range_id    = (vm_map_range_id_t)recipe[i].range_tag,
24016 		};
24017 	}
24018 
24019 	qsort(table, count + new_count,
24020 	    sizeof(struct vm_map_user_range), vm_map_user_range_cmp);
24021 
24022 	map->extra_ranges_count += new_count;
24023 	map->extra_ranges = table;
24024 
24025 out_unlock:
24026 	vm_map_unlock(map);
24027 
24028 	if (kr == KERN_SUCCESS) {
24029 		for (size_t i = 0; i < new_count; i++) {
24030 			vm_map_kernel_flags_t vmk_flags = {
24031 				.vmf_fixed = true,
24032 				.vmf_overwrite = true,
24033 				.vmkf_overwrite_immutable = true,
24034 				.vm_tag = recipe[i].vm_tag,
24035 			};
24036 			__assert_only kern_return_t kr2;
24037 
24038 			kr2 = vm_map_enter(map, &recipe[i].range.min_address,
24039 			    mach_vm_range_size(&recipe[i].range),
24040 			    0, vmk_flags, VM_OBJECT_NULL, 0, FALSE,
24041 			    VM_PROT_NONE, VM_PROT_ALL,
24042 			    VM_INHERIT_DEFAULT);
24043 			assert(kr2 == KERN_SUCCESS);
24044 		}
24045 	}
24046 	return kr;
24047 }
24048 
24049 kern_return_t
24050 mach_vm_range_create(
24051 	vm_map_t                map,
24052 	mach_vm_range_flavor_t  flavor,
24053 	mach_vm_range_recipes_raw_t recipe,
24054 	natural_t               size)
24055 {
24056 	if (map != current_map()) {
24057 		return KERN_INVALID_ARGUMENT;
24058 	}
24059 
24060 	if (!map->uses_user_ranges) {
24061 		return KERN_NOT_SUPPORTED;
24062 	}
24063 
24064 	if (size == 0) {
24065 		return KERN_SUCCESS;
24066 	}
24067 
24068 	if (flavor == MACH_VM_RANGE_FLAVOR_V1) {
24069 		mach_vm_range_recipe_v1_t *array;
24070 
24071 		if (size % sizeof(mach_vm_range_recipe_v1_t)) {
24072 			return KERN_INVALID_ARGUMENT;
24073 		}
24074 
24075 		size /= sizeof(mach_vm_range_recipe_v1_t);
24076 		if (size > VM_MAP_EXTRA_RANGES_MAX) {
24077 			return KERN_NO_SPACE;
24078 		}
24079 
24080 		array = (mach_vm_range_recipe_v1_t *)recipe;
24081 		return mach_vm_range_create_v1(map, array, size);
24082 	}
24083 
24084 	return KERN_INVALID_ARGUMENT;
24085 }
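
/*
 * Illustrative sketch (not part of this file's build): feeding a
 * MACH_VM_RANGE_FLAVOR_V1 recipe to mach_vm_range_create(), using the
 * recipe fields referenced above (range, range_tag, vm_tag, flags).
 * The helper name and addresses are placeholders; the range must be
 * page aligned and fall outside the default/data ranges or the call
 * fails with KERN_INVALID_ARGUMENT / KERN_MEMORY_PRESENT.
 */
#if 0 /* illustrative sketch only -- never compiled */
static kern_return_t
sketch_create_fixed_range(vm_map_t map,
    mach_vm_address_t start, mach_vm_size_t size)
{
	mach_vm_range_recipe_v1_t recipe = {
		.range_tag = MACH_VM_RANGE_FIXED,
		.range = {
			.min_address = start,
			.max_address = start + size,
		},
	};

	return mach_vm_range_create(map, MACH_VM_RANGE_FLAVOR_V1,
	           (mach_vm_range_recipes_raw_t)&recipe, sizeof(recipe));
}
#endif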
24086 
24087 #else /* !CONFIG_MAP_RANGES */
24088 
24089 kern_return_t
24090 mach_vm_range_create(
24091 	vm_map_t                map,
24092 	mach_vm_range_flavor_t  flavor,
24093 	mach_vm_range_recipes_raw_t recipe,
24094 	natural_t               size)
24095 {
24096 #pragma unused(map, flavor, recipe, size)
24097 	return KERN_NOT_SUPPORTED;
24098 }
24099 
24100 #endif /* !CONFIG_MAP_RANGES */
24101 
24102 void
24103 vm_map_kernel_flags_update_range_id(vm_map_kernel_flags_t *vmkf, vm_map_t map)
24104 {
24105 	if (map == kernel_map) {
24106 		if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
24107 			vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
24108 		}
24109 #if CONFIG_MAP_RANGES
24110 	} else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
24111 	    vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT &&
24112 	    bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
24113 		vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
24114 #endif /* CONFIG_MAP_RANGES */
24115 	}
24116 }
24117 
24118 /*
24119  * vm_map_entry_has_device_pager:
24120  * Check if the vm map entry specified by the virtual address has a device pager.
24121  * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
24122  */
24123 boolean_t
24124 vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
24125 {
24126 	vm_map_entry_t entry;
24127 	vm_object_t object;
24128 	boolean_t result;
24129 
24130 	if (map == NULL) {
24131 		return FALSE;
24132 	}
24133 
24134 	vm_map_lock(map);
24135 	while (TRUE) {
24136 		if (!vm_map_lookup_entry(map, vaddr, &entry)) {
24137 			result = FALSE;
24138 			break;
24139 		}
24140 		if (entry->is_sub_map) {
24141 			// Check the submap
24142 			vm_map_t submap = VME_SUBMAP(entry);
24143 			assert(submap != NULL);
24144 			vm_map_lock(submap);
24145 			vm_map_unlock(map);
24146 			map = submap;
24147 			continue;
24148 		}
24149 		object = VME_OBJECT(entry);
24150 		if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
24151 			result = TRUE;
24152 			break;
24153 		}
24154 		result = FALSE;
24155 		break;
24156 	}
24157 
24158 	vm_map_unlock(map);
24159 	return result;
24160 }
24161 
24162 
24163 #if MACH_ASSERT
24164 
24165 extern int pmap_ledgers_panic;
24166 extern int pmap_ledgers_panic_leeway;
24167 
24168 #define LEDGER_DRIFT(__LEDGER)                    \
24169 	int             __LEDGER##_over;          \
24170 	ledger_amount_t __LEDGER##_over_total;    \
24171 	ledger_amount_t __LEDGER##_over_max;      \
24172 	int             __LEDGER##_under;         \
24173 	ledger_amount_t __LEDGER##_under_total;   \
24174 	ledger_amount_t __LEDGER##_under_max
24175 
24176 struct {
24177 	uint64_t        num_pmaps_checked;
24178 
24179 	LEDGER_DRIFT(phys_footprint);
24180 	LEDGER_DRIFT(internal);
24181 	LEDGER_DRIFT(internal_compressed);
24182 	LEDGER_DRIFT(external);
24183 	LEDGER_DRIFT(reusable);
24184 	LEDGER_DRIFT(iokit_mapped);
24185 	LEDGER_DRIFT(alternate_accounting);
24186 	LEDGER_DRIFT(alternate_accounting_compressed);
24187 	LEDGER_DRIFT(page_table);
24188 	LEDGER_DRIFT(purgeable_volatile);
24189 	LEDGER_DRIFT(purgeable_nonvolatile);
24190 	LEDGER_DRIFT(purgeable_volatile_compressed);
24191 	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
24192 	LEDGER_DRIFT(tagged_nofootprint);
24193 	LEDGER_DRIFT(tagged_footprint);
24194 	LEDGER_DRIFT(tagged_nofootprint_compressed);
24195 	LEDGER_DRIFT(tagged_footprint_compressed);
24196 	LEDGER_DRIFT(network_volatile);
24197 	LEDGER_DRIFT(network_nonvolatile);
24198 	LEDGER_DRIFT(network_volatile_compressed);
24199 	LEDGER_DRIFT(network_nonvolatile_compressed);
24200 	LEDGER_DRIFT(media_nofootprint);
24201 	LEDGER_DRIFT(media_footprint);
24202 	LEDGER_DRIFT(media_nofootprint_compressed);
24203 	LEDGER_DRIFT(media_footprint_compressed);
24204 	LEDGER_DRIFT(graphics_nofootprint);
24205 	LEDGER_DRIFT(graphics_footprint);
24206 	LEDGER_DRIFT(graphics_nofootprint_compressed);
24207 	LEDGER_DRIFT(graphics_footprint_compressed);
24208 	LEDGER_DRIFT(neural_nofootprint);
24209 	LEDGER_DRIFT(neural_footprint);
24210 	LEDGER_DRIFT(neural_nofootprint_compressed);
24211 	LEDGER_DRIFT(neural_footprint_compressed);
24212 } pmap_ledgers_drift;
24213 
24214 void
24215 vm_map_pmap_check_ledgers(
24216 	pmap_t          pmap,
24217 	ledger_t        ledger,
24218 	int             pid,
24219 	char            *procname)
24220 {
24221 	ledger_amount_t bal;
24222 	boolean_t       do_panic;
24223 
24224 	do_panic = FALSE;
24225 
24226 	pmap_ledgers_drift.num_pmaps_checked++;
24227 
24228 #define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
24229 MACRO_BEGIN                                                             \
24230 	int panic_on_negative = TRUE;                                   \
24231 	ledger_get_balance(ledger,                                      \
24232 	                   task_ledgers.__LEDGER,                       \
24233 	                   &bal);                                       \
24234 	ledger_get_panic_on_negative(ledger,                            \
24235 	                             task_ledgers.__LEDGER,             \
24236 	                             &panic_on_negative);               \
24237 	if (bal != 0) {                                                 \
24238 	        if (panic_on_negative ||                                \
24239 	            (pmap_ledgers_panic &&                              \
24240 	             pmap_ledgers_panic_leeway > 0 &&                   \
24241 	             (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||  \
24242 	              bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
24243 	                do_panic = TRUE;                                \
24244 	        }                                                       \
24245 	        printf("LEDGER BALANCE proc %d (%s) "                   \
24246 	               "\"%s\" = %lld\n",                               \
24247 	               pid, procname, #__LEDGER, bal);                  \
24248 	        if (bal > 0) {                                          \
24249 	                pmap_ledgers_drift.__LEDGER##_over++;           \
24250 	                pmap_ledgers_drift.__LEDGER##_over_total += bal; \
24251 	                if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
24252 	                        pmap_ledgers_drift.__LEDGER##_over_max = bal; \
24253 	                }                                               \
24254 	        } else if (bal < 0) {                                   \
24255 	                pmap_ledgers_drift.__LEDGER##_under++;          \
24256 	                pmap_ledgers_drift.__LEDGER##_under_total += bal; \
24257 	                if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
24258 	                        pmap_ledgers_drift.__LEDGER##_under_max = bal; \
24259 	                }                                               \
24260 	        }                                                       \
24261 	}                                                               \
24262 MACRO_END
24263 
24264 	LEDGER_CHECK_BALANCE(phys_footprint);
24265 	LEDGER_CHECK_BALANCE(internal);
24266 	LEDGER_CHECK_BALANCE(internal_compressed);
24267 	LEDGER_CHECK_BALANCE(external);
24268 	LEDGER_CHECK_BALANCE(reusable);
24269 	LEDGER_CHECK_BALANCE(iokit_mapped);
24270 	LEDGER_CHECK_BALANCE(alternate_accounting);
24271 	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
24272 	LEDGER_CHECK_BALANCE(page_table);
24273 	LEDGER_CHECK_BALANCE(purgeable_volatile);
24274 	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
24275 	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
24276 	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
24277 	LEDGER_CHECK_BALANCE(tagged_nofootprint);
24278 	LEDGER_CHECK_BALANCE(tagged_footprint);
24279 	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
24280 	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
24281 	LEDGER_CHECK_BALANCE(network_volatile);
24282 	LEDGER_CHECK_BALANCE(network_nonvolatile);
24283 	LEDGER_CHECK_BALANCE(network_volatile_compressed);
24284 	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
24285 	LEDGER_CHECK_BALANCE(media_nofootprint);
24286 	LEDGER_CHECK_BALANCE(media_footprint);
24287 	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
24288 	LEDGER_CHECK_BALANCE(media_footprint_compressed);
24289 	LEDGER_CHECK_BALANCE(graphics_nofootprint);
24290 	LEDGER_CHECK_BALANCE(graphics_footprint);
24291 	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
24292 	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
24293 	LEDGER_CHECK_BALANCE(neural_nofootprint);
24294 	LEDGER_CHECK_BALANCE(neural_footprint);
24295 	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
24296 	LEDGER_CHECK_BALANCE(neural_footprint_compressed);
24297 
24298 	if (do_panic) {
24299 		if (pmap_ledgers_panic) {
24300 			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
24301 			    pmap, pid, procname);
24302 		} else {
24303 			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
24304 			    pmap, pid, procname);
24305 		}
24306 	}
24307 }
24308 
24309 void
24310 vm_map_pmap_set_process(
24311 	vm_map_t map,
24312 	int pid,
24313 	char *procname)
24314 {
24315 	pmap_set_process(vm_map_pmap(map), pid, procname);
24316 }
24317 
24318 #endif /* MACH_ASSERT */
24319