1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_map.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * Virtual memory mapping module.
64 */
65
66 #include "vm/vm_map.h"
67 #include <mach/vm_types.h>
68 #include <mach_assert.h>
69
70 #include <vm/vm_options.h>
71
72 #include <libkern/OSAtomic.h>
73
74 #include <mach/kern_return.h>
75 #include <mach/port.h>
76 #include <mach/vm_attributes.h>
77 #include <mach/vm_param.h>
78 #include <mach/vm_behavior.h>
79 #include <mach/vm_statistics.h>
80 #include <mach/memory_object.h>
81 #include <mach/mach_vm.h>
82 #include <machine/cpu_capabilities.h>
83 #include <mach/sdt.h>
84
85 #include <kern/assert.h>
86 #include <kern/backtrace.h>
87 #include <kern/counter.h>
88 #include <kern/exc_guard.h>
89 #include <kern/kalloc.h>
90 #include <kern/zalloc_internal.h>
91
92 #include <vm/cpm.h>
93 #include <vm/vm_compressor.h>
94 #include <vm/vm_compressor_pager.h>
95 #include <vm/vm_init.h>
96 #include <vm/vm_fault.h>
97 #include <vm/vm_map_internal.h>
98 #include <vm/vm_object.h>
99 #include <vm/vm_page.h>
100 #include <vm/vm_pageout.h>
101 #include <vm/pmap.h>
102 #include <vm/vm_kern.h>
103 #include <ipc/ipc_port.h>
104 #include <kern/sched_prim.h>
105 #include <kern/misc_protos.h>
106
107 #include <mach/vm_map_server.h>
108 #include <mach/mach_host_server.h>
109 #include <vm/vm_protos.h>
110 #include <vm/vm_purgeable_internal.h>
111 #include <vm/vm_reclaim_internal.h>
112
113 #include <vm/vm_protos.h>
114 #include <vm/vm_shared_region.h>
115 #include <vm/vm_map_store.h>
116
117 #include <san/kasan.h>
118
119 #include <sys/resource.h>
120 #include <sys/codesign.h>
121 #include <sys/code_signing.h>
122 #include <sys/mman.h>
123 #include <sys/reboot.h>
124 #include <sys/kdebug_triage.h>
125
126 #include <libkern/section_keywords.h>
127
128 #if DEVELOPMENT || DEBUG
129 extern int proc_selfcsflags(void);
130 int vm_log_xnu_user_debug = 0;
131 int panic_on_unsigned_execute = 0;
132 int panic_on_mlock_failure = 0;
133 #endif /* DEVELOPMENT || DEBUG */
134
135 #if MACH_ASSERT
136 int debug4k_filter = 0;
137 char debug4k_proc_name[1024] = "";
138 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
139 int debug4k_panic_on_misaligned_sharing = 0;
140 const char *debug4k_category_name[] = {
141 "error", /* 0 */
142 "life", /* 1 */
143 "load", /* 2 */
144 "fault", /* 3 */
145 "copy", /* 4 */
146 "share", /* 5 */
147 "adjust", /* 6 */
148 "pmap", /* 7 */
149 "mementry", /* 8 */
150 "iokit", /* 9 */
151 "upl", /* 10 */
152 "exc", /* 11 */
153 "vfs" /* 12 */
154 };
155 #endif /* MACH_ASSERT */
156 int debug4k_no_cow_copyin = 0;
157
158
159 #if __arm64__
160 extern const int fourk_binary_compatibility_unsafe;
161 extern const int fourk_binary_compatibility_allow_wx;
162 #endif /* __arm64__ */
163 extern int proc_selfpid(void);
164 extern char *proc_name_address(void *p);
165
166 #if VM_MAP_DEBUG_APPLE_PROTECT
167 int vm_map_debug_apple_protect = 0;
168 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
169 #if VM_MAP_DEBUG_FOURK
170 int vm_map_debug_fourk = 0;
171 #endif /* VM_MAP_DEBUG_FOURK */
172
173 #if DEBUG || DEVELOPMENT
174 static TUNABLE(bool, vm_map_executable_immutable,
175 "vm_map_executable_immutable", true);
176 #else
177 #define vm_map_executable_immutable true
178 #endif
179
180 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
181
182 extern u_int32_t random(void); /* from <libkern/libkern.h> */
183 /* Internal prototypes
184 */
185
186 typedef struct vm_map_zap {
187 vm_map_entry_t vmz_head;
188 vm_map_entry_t *vmz_tail;
189 } *vm_map_zap_t;
190
191 #define VM_MAP_ZAP_DECLARE(zap) \
192 struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
193
194 static vm_map_entry_t vm_map_entry_insert(
195 vm_map_t map,
196 vm_map_entry_t insp_entry,
197 vm_map_offset_t start,
198 vm_map_offset_t end,
199 vm_object_t object,
200 vm_object_offset_t offset,
201 vm_map_kernel_flags_t vmk_flags,
202 boolean_t needs_copy,
203 vm_prot_t cur_protection,
204 vm_prot_t max_protection,
205 vm_inherit_t inheritance,
206 boolean_t clear_map_aligned);
207
208 static void vm_map_simplify_range(
209 vm_map_t map,
210 vm_map_offset_t start,
211 vm_map_offset_t end); /* forward */
212
213 static boolean_t vm_map_range_check(
214 vm_map_t map,
215 vm_map_offset_t start,
216 vm_map_offset_t end,
217 vm_map_entry_t *entry);
218
219 static void vm_map_submap_pmap_clean(
220 vm_map_t map,
221 vm_map_offset_t start,
222 vm_map_offset_t end,
223 vm_map_t sub_map,
224 vm_map_offset_t offset);
225
226 static void vm_map_pmap_enter(
227 vm_map_t map,
228 vm_map_offset_t addr,
229 vm_map_offset_t end_addr,
230 vm_object_t object,
231 vm_object_offset_t offset,
232 vm_prot_t protection);
233
234 static void _vm_map_clip_end(
235 struct vm_map_header *map_header,
236 vm_map_entry_t entry,
237 vm_map_offset_t end);
238
239 static void _vm_map_clip_start(
240 struct vm_map_header *map_header,
241 vm_map_entry_t entry,
242 vm_map_offset_t start);
243
244 static kmem_return_t vm_map_delete(
245 vm_map_t map,
246 vm_map_offset_t start,
247 vm_map_offset_t end,
248 vmr_flags_t flags,
249 kmem_guard_t guard,
250 vm_map_zap_t zap);
251
252 static void vm_map_copy_insert(
253 vm_map_t map,
254 vm_map_entry_t after_where,
255 vm_map_copy_t copy);
256
257 static kern_return_t vm_map_copy_overwrite_unaligned(
258 vm_map_t dst_map,
259 vm_map_entry_t entry,
260 vm_map_copy_t copy,
261 vm_map_address_t start,
262 boolean_t discard_on_success);
263
264 static kern_return_t vm_map_copy_overwrite_aligned(
265 vm_map_t dst_map,
266 vm_map_entry_t tmp_entry,
267 vm_map_copy_t copy,
268 vm_map_offset_t start,
269 pmap_t pmap);
270
271 static kern_return_t vm_map_copyin_kernel_buffer(
272 vm_map_t src_map,
273 vm_map_address_t src_addr,
274 vm_map_size_t len,
275 boolean_t src_destroy,
276 vm_map_copy_t *copy_result); /* OUT */
277
278 static kern_return_t vm_map_copyout_kernel_buffer(
279 vm_map_t map,
280 vm_map_address_t *addr, /* IN/OUT */
281 vm_map_copy_t copy,
282 vm_map_size_t copy_size,
283 boolean_t overwrite,
284 boolean_t consume_on_success);
285
286 static void vm_map_fork_share(
287 vm_map_t old_map,
288 vm_map_entry_t old_entry,
289 vm_map_t new_map);
290
291 static boolean_t vm_map_fork_copy(
292 vm_map_t old_map,
293 vm_map_entry_t *old_entry_p,
294 vm_map_t new_map,
295 int vm_map_copyin_flags);
296
297 static kern_return_t vm_map_wire_nested(
298 vm_map_t map,
299 vm_map_offset_t start,
300 vm_map_offset_t end,
301 vm_prot_t caller_prot,
302 vm_tag_t tag,
303 boolean_t user_wire,
304 pmap_t map_pmap,
305 vm_map_offset_t pmap_addr,
306 ppnum_t *physpage_p);
307
308 static kern_return_t vm_map_unwire_nested(
309 vm_map_t map,
310 vm_map_offset_t start,
311 vm_map_offset_t end,
312 boolean_t user_wire,
313 pmap_t map_pmap,
314 vm_map_offset_t pmap_addr);
315
316 static kern_return_t vm_map_overwrite_submap_recurse(
317 vm_map_t dst_map,
318 vm_map_offset_t dst_addr,
319 vm_map_size_t dst_size);
320
321 static kern_return_t vm_map_copy_overwrite_nested(
322 vm_map_t dst_map,
323 vm_map_offset_t dst_addr,
324 vm_map_copy_t copy,
325 boolean_t interruptible,
326 pmap_t pmap,
327 boolean_t discard_on_success);
328
329 static kern_return_t vm_map_remap_extract(
330 vm_map_t map,
331 vm_map_offset_t addr,
332 vm_map_size_t size,
333 boolean_t copy,
334 vm_map_copy_t map_copy,
335 vm_prot_t *cur_protection,
336 vm_prot_t *max_protection,
337 vm_inherit_t inheritance,
338 vm_map_kernel_flags_t vmk_flags);
339
340 static kern_return_t vm_map_remap_range_allocate(
341 vm_map_t map,
342 vm_map_address_t *address,
343 vm_map_size_t size,
344 vm_map_offset_t mask,
345 vm_map_kernel_flags_t vmk_flags,
346 vm_map_entry_t *map_entry,
347 vm_map_zap_t zap_list);
348
349 static void vm_map_region_look_for_page(
350 vm_map_t map,
351 vm_map_offset_t va,
352 vm_object_t object,
353 vm_object_offset_t offset,
354 int max_refcnt,
355 unsigned short depth,
356 vm_region_extended_info_t extended,
357 mach_msg_type_number_t count);
358
359 static int vm_map_region_count_obj_refs(
360 vm_map_entry_t entry,
361 vm_object_t object);
362
363
364 static kern_return_t vm_map_willneed(
365 vm_map_t map,
366 vm_map_offset_t start,
367 vm_map_offset_t end);
368
369 static kern_return_t vm_map_reuse_pages(
370 vm_map_t map,
371 vm_map_offset_t start,
372 vm_map_offset_t end);
373
374 static kern_return_t vm_map_reusable_pages(
375 vm_map_t map,
376 vm_map_offset_t start,
377 vm_map_offset_t end);
378
379 static kern_return_t vm_map_can_reuse(
380 vm_map_t map,
381 vm_map_offset_t start,
382 vm_map_offset_t end);
383
384 static kern_return_t vm_map_random_address_for_size(
385 vm_map_t map,
386 vm_map_offset_t *address,
387 vm_map_size_t size,
388 vm_map_kernel_flags_t vmk_flags);
389
390
391 #if CONFIG_MAP_RANGES
392
393 static vm_map_range_id_t vm_map_user_range_resolve(
394 vm_map_t map,
395 mach_vm_address_t addr,
396 mach_vm_address_t size,
397 mach_vm_range_t range);
398
399 #endif /* CONFIG_MAP_RANGES */
400 #if MACH_ASSERT
401 static kern_return_t vm_map_pageout(
402 vm_map_t map,
403 vm_map_offset_t start,
404 vm_map_offset_t end);
405 #endif /* MACH_ASSERT */
406
407 kern_return_t vm_map_corpse_footprint_collect(
408 vm_map_t old_map,
409 vm_map_entry_t old_entry,
410 vm_map_t new_map);
411 void vm_map_corpse_footprint_collect_done(
412 vm_map_t new_map);
413 void vm_map_corpse_footprint_destroy(
414 vm_map_t map);
415 kern_return_t vm_map_corpse_footprint_query_page_info(
416 vm_map_t map,
417 vm_map_offset_t va,
418 int *disposition_p);
419 void vm_map_footprint_query_page_info(
420 vm_map_t map,
421 vm_map_entry_t map_entry,
422 vm_map_offset_t curr_s_offset,
423 int *disposition_p);
424
425 #if CONFIG_MAP_RANGES
426 static void vm_map_range_map_init(void);
427 #endif /* CONFIG_MAP_RANGES */
428
429 pid_t find_largest_process_vm_map_entries(void);
430
431 extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code,
432 mach_exception_data_type_t subcode);
433
434 /*
435 * Macros to copy a vm_map_entry. We must be careful to correctly
436 * manage the wired page count. vm_map_entry_copy() creates a new
437 * map entry to the same memory - the wired count in the new entry
438 * must be set to zero. vm_map_entry_copy_full() creates a new
439 * entry that is identical to the old entry. This preserves the
440 * wire count; it's used for map splitting and zone changing in
441 * vm_map_copyout.
442 */
443
/*
 * vm_map_entry_copy_csm_assoc:
 *
 * Reset the code-signing-monitor association and the per-entry
 * user-debug flag on a freshly copied map entry (called from
 * vm_map_entry_copy()): this state must not propagate to a new
 * mapping of the same memory.
 */
static inline void
vm_map_entry_copy_csm_assoc(
	vm_map_t map __unused,
	vm_map_entry_t new __unused,
	vm_map_entry_t old __unused)
{
#if CODE_SIGNING_MONITOR
	/* when code signing monitor is enabled, we want to reset on copy */
	new->csm_associated = FALSE;
#else
	/* when code signing monitor is not enabled, assert as a sanity check */
	assert(new->csm_associated == FALSE);
#endif
#if DEVELOPMENT || DEBUG
	/*
	 * When vm_log_xnu_user_debug is set, log that the copied entry had
	 * vme_xnu_user_debug set and is about to have it cleared.
	 */
	if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
		printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] resetting vme_xnu_user_debug\n",
		    proc_selfpid(),
		    (get_bsdtask_info(current_task())
		    ? proc_name_address(get_bsdtask_info(current_task()))
		    : "?"),
		    __FUNCTION__, __LINE__,
		    map, new, new->vme_start, new->vme_end);
	}
#endif /* DEVELOPMENT || DEBUG */
	/* the debug flag never follows the copy */
	new->vme_xnu_user_debug = FALSE;
}
470
471 /*
472 * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
473 * But for security reasons on some platforms, we don't want the
474 * new mapping to be "used for jit", so we reset the flag here.
475 */
476 static inline void
vm_map_entry_copy_code_signing(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old __unused)477 vm_map_entry_copy_code_signing(
478 vm_map_t map,
479 vm_map_entry_t new,
480 vm_map_entry_t old __unused)
481 {
482 if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
483 assert(new->used_for_jit == old->used_for_jit);
484 } else {
485 new->used_for_jit = FALSE;
486 }
487 }
488
/*
 * vm_map_entry_copy_full:
 *
 * Bitwise copy of an entire map entry, preserving the wired counts.
 * (Used for map splitting and zone changing in vm_map_copyout; see
 * the block comment above vm_map_entry_copy_csm_assoc().)
 */
static inline void
vm_map_entry_copy_full(
	vm_map_entry_t new,
	vm_map_entry_t old)
{
#if MAP_ENTRY_CREATION_DEBUG
	/*
	 * "*new = *old" below duplicates old's creation-backtrace ref into
	 * new: drop the ref new currently holds, take an extra one on old's.
	 */
	btref_put(new->vme_creation_bt);
	btref_retain(old->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	/* same reference juggling for the insertion backtrace */
	btref_put(new->vme_insertion_bt);
	btref_retain(old->vme_insertion_bt);
#endif
	*new = *old;
}
504
/*
 * vm_map_entry_copy:
 *
 * Copy "old" into "new" to create a second mapping of the same
 * memory: the new entry must start with a zero wired count, and
 * every per-mapping flag that should not propagate to a copy is
 * reset here.  (See the block comment above vm_map_entry_copy_full.)
 */
static inline void
vm_map_entry_copy(
	vm_map_t map,
	vm_map_entry_t new,
	vm_map_entry_t old)
{
	vm_map_entry_copy_full(new, old);

	/* the copy is a fresh, unwired, independent mapping */
	new->is_shared = FALSE;
	new->needs_wakeup = FALSE;
	new->in_transition = FALSE;
	new->wired_count = 0;
	new->user_wired_count = 0;
	new->vme_permanent = FALSE;
	vm_map_entry_copy_code_signing(map, new, old);
	vm_map_entry_copy_csm_assoc(map, new, old);
	if (new->iokit_acct) {
		/* IOKit accounting does not follow the copy */
		assertf(!new->use_pmap, "old %p new %p\n", old, new);
		new->iokit_acct = FALSE;
		new->use_pmap = TRUE;
	}
	new->vme_resilient_codesign = FALSE;
	new->vme_resilient_media = FALSE;
	new->vme_atomic = FALSE;
	new->vme_no_copy_on_read = FALSE;
}
531
532 /*
533 * Normal lock_read_to_write() returns FALSE/0 on failure.
534 * These functions evaluate to zero on success and non-zero value on failure.
535 */
536 __attribute__((always_inline))
537 int
vm_map_lock_read_to_write(vm_map_t map)538 vm_map_lock_read_to_write(vm_map_t map)
539 {
540 if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
541 DTRACE_VM(vm_map_lock_upgrade);
542 return 0;
543 }
544 return 1;
545 }
546
547 __attribute__((always_inline))
548 boolean_t
vm_map_try_lock(vm_map_t map)549 vm_map_try_lock(vm_map_t map)
550 {
551 if (lck_rw_try_lock_exclusive(&(map)->lock)) {
552 DTRACE_VM(vm_map_lock_w);
553 return TRUE;
554 }
555 return FALSE;
556 }
557
558 __attribute__((always_inline))
559 boolean_t
vm_map_try_lock_read(vm_map_t map)560 vm_map_try_lock_read(vm_map_t map)
561 {
562 if (lck_rw_try_lock_shared(&(map)->lock)) {
563 DTRACE_VM(vm_map_lock_r);
564 return TRUE;
565 }
566 return FALSE;
567 }
568
569 /*!
570 * @function kdp_vm_map_is_acquired_exclusive
571 *
572 * @abstract
573 * Checks if vm map is acquired exclusive.
574 *
575 * @discussion
576 * NOT SAFE: To be used only by kernel debugger.
577 *
578 * @param map map to check
579 *
580 * @returns TRUE if the map is acquired exclusively.
581 */
boolean_t
kdp_vm_map_is_acquired_exclusive(vm_map_t map)
{
	/* debugger context only: peeks at lock state with no synchronization */
	return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
}
587
588 /*
589 * Routines to get the page size the caller should
590 * use while inspecting the target address space.
591 * Use the "_safely" variant if the caller is dealing with a user-provided
592 * array whose size depends on the page size, to avoid any overflow or
593 * underflow of a user-allocated buffer.
594 */
595 int
vm_self_region_page_shift_safely(vm_map_t target_map)596 vm_self_region_page_shift_safely(
597 vm_map_t target_map)
598 {
599 int effective_page_shift = 0;
600
601 if (PAGE_SIZE == (4096)) {
602 /* x86_64 and 4k watches: always use 4k */
603 return PAGE_SHIFT;
604 }
605 /* did caller provide an explicit page size for this thread to use? */
606 effective_page_shift = thread_self_region_page_shift();
607 if (effective_page_shift) {
608 /* use the explicitly-provided page size */
609 return effective_page_shift;
610 }
611 /* no explicit page size: use the caller's page size... */
612 effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
613 if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
614 /* page size match: safe to use */
615 return effective_page_shift;
616 }
617 /* page size mismatch */
618 return -1;
619 }
620 int
vm_self_region_page_shift(vm_map_t target_map)621 vm_self_region_page_shift(
622 vm_map_t target_map)
623 {
624 int effective_page_shift;
625
626 effective_page_shift = vm_self_region_page_shift_safely(target_map);
627 if (effective_page_shift == -1) {
628 /* no safe value but OK to guess for caller */
629 effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
630 VM_MAP_PAGE_SHIFT(target_map));
631 }
632 return effective_page_shift;
633 }
634
635
636 /*
637 * Decide if we want to allow processes to execute from their data or stack areas.
638 * override_nx() returns true if we do. Data/stack execution can be enabled independently
639 * for 32 and 64 bit processes. Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
640 * or allow_stack_exec to enable data execution for that type of data area for that particular
641 * ABI (or both by or'ing the flags together). These are initialized in the architecture
642 * specific pmap files since the default behavior varies according to architecture. The
643 * main reason it varies is because of the need to provide binary compatibility with old
644 * applications that were written before these restrictions came into being. In the old
645 * days, an app could execute anything it could read, but this has slowly been tightened
646 * up over time. The default behavior is:
647 *
648 * 32-bit PPC apps may execute from both stack and data areas
 * 32-bit Intel apps may execute from data areas but not stack
650 * 64-bit PPC/Intel apps may not execute from either data or stack
651 *
652 * An application on any architecture may override these defaults by explicitly
653 * adding PROT_EXEC permission to the page in question with the mprotect(2)
654 * system call. This code here just determines what happens when an app tries to
655 * execute from a page that lacks execute permission.
656 *
657 * Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
658 * default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
659 * a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
660 * execution from data areas for a particular binary even if the arch normally permits it. As
661 * a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
662 * to support some complicated use cases, notably browsers with out-of-process plugins that
663 * are not all NX-safe.
664 */
665
666 extern int allow_data_exec, allow_stack_exec;
667
668 int
override_nx(vm_map_t map,uint32_t user_tag)669 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
670 {
671 int current_abi;
672
673 if (map->pmap == kernel_pmap) {
674 return FALSE;
675 }
676
677 /*
678 * Determine if the app is running in 32 or 64 bit mode.
679 */
680
681 if (vm_map_is_64bit(map)) {
682 current_abi = VM_ABI_64;
683 } else {
684 current_abi = VM_ABI_32;
685 }
686
687 /*
688 * Determine if we should allow the execution based on whether it's a
689 * stack or data area and the current architecture.
690 */
691
692 if (user_tag == VM_MEMORY_STACK) {
693 return allow_stack_exec & current_abi;
694 }
695
696 return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
697 }
698
699
700 /*
701 * Virtual memory maps provide for the mapping, protection,
702 * and sharing of virtual memory objects. In addition,
703 * this module provides for an efficient virtual copy of
704 * memory from one map to another.
705 *
706 * Synchronization is required prior to most operations.
707 *
708 * Maps consist of an ordered doubly-linked list of simple
709 * entries; a single hint is used to speed up lookups.
710 *
711 * Sharing maps have been deleted from this version of Mach.
712 * All shared objects are now mapped directly into the respective
713 * maps. This requires a change in the copy on write strategy;
714 * the asymmetric (delayed) strategy is used for shared temporary
715 * objects instead of the symmetric (shadow) strategy. All maps
716 * are now "top level" maps (either task map, kernel map or submap
717 * of the kernel map).
718 *
 *	Since portions of maps are specified by start/end addresses,
720 * which may not align with existing map entries, all
721 * routines merely "clip" entries to these start/end values.
722 * [That is, an entry is split into two, bordering at a
723 * start or end value.] Note that these clippings may not
724 * always be necessary (as the two resulting entries are then
725 * not changed); however, the clipping is done for convenience.
726 * No attempt is currently made to "glue back together" two
727 * abutting entries.
728 *
729 * The symmetric (shadow) copy strategy implements virtual copy
730 * by copying VM object references from one map to
731 * another, and then marking both regions as copy-on-write.
732 * It is important to note that only one writeable reference
733 * to a VM object region exists in any map when this strategy
734 * is used -- this means that shadow object creation can be
 *	delayed until a write operation occurs. The asymmetric (delayed)
736 * strategy allows multiple maps to have writeable references to
737 * the same region of a vm object, and hence cannot delay creating
738 * its copy objects. See vm_object_copy_quickly() in vm_object.c.
739 * Copying of permanent objects is completely different; see
740 * vm_object_copy_strategically() in vm_object.c.
741 */
742
743 ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);
744
745 #define VM_MAP_ZONE_NAME "maps"
746 #define VM_MAP_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
747
748 #define VM_MAP_ENTRY_ZONE_NAME "VM map entries"
749 #define VM_MAP_ENTRY_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
750
751 #define VM_MAP_HOLES_ZONE_NAME "VM map holes"
752 #define VM_MAP_HOLES_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
753
754 /*
755 * Asserts that a vm_map_copy object is coming from the
756 * vm_map_copy_zone to ensure that it isn't a fake constructed
757 * anywhere else.
758 */
void
vm_map_copy_require(struct vm_map_copy *copy)
{
	/* panics if "copy" was not allocated from the vm_map_copy zone */
	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
}
764
765 /*
766 * vm_map_require:
767 *
768 * Ensures that the argument is memory allocated from the genuine
769 * vm map zone. (See zone_id_require_allow_foreign).
770 */
void
vm_map_require(vm_map_t map)
{
	/* panics if "map" was not allocated from the genuine vm map zone */
	zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
}
776
777 #define VM_MAP_EARLY_COUNT_MAX 16
778 static __startup_data vm_offset_t map_data;
779 static __startup_data vm_size_t map_data_size;
780 static __startup_data vm_offset_t kentry_data;
781 static __startup_data vm_size_t kentry_data_size;
782 static __startup_data vm_offset_t map_holes_data;
783 static __startup_data vm_size_t map_holes_data_size;
784 static __startup_data vm_map_t *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
785 static __startup_data uint32_t early_map_count;
786
787 #if XNU_TARGET_OS_OSX
788 #define NO_COALESCE_LIMIT ((1024 * 128) - 1)
789 #else /* XNU_TARGET_OS_OSX */
790 #define NO_COALESCE_LIMIT 0
791 #endif /* XNU_TARGET_OS_OSX */
792
793 /* Skip acquiring locks if we're in the midst of a kernel core dump */
794 unsigned int not_in_kdp = 1;
795
796 unsigned int vm_map_set_cache_attr_count = 0;
797
/*
 * vm_map_set_cache_attr:
 *
 * Mark the VM object mapped at "va" in "map" as having its cache
 * attributes explicitly managed (object->set_cache_attr).
 *
 * Returns KERN_INVALID_ARGUMENT if "va" is unmapped, backed by a
 * submap, or has no VM object; KERN_SUCCESS otherwise.
 */
kern_return_t
vm_map_set_cache_attr(
	vm_map_t map,
	vm_map_offset_t va)
{
	vm_map_entry_t map_entry;
	vm_object_t object;
	kern_return_t kr = KERN_SUCCESS;

	/* read lock suffices: we only flip a flag under the object lock */
	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, va, &map_entry) ||
	    map_entry->is_sub_map) {
		/*
		 * that memory is not properly mapped
		 */
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}
	object = VME_OBJECT(map_entry);

	if (object == VM_OBJECT_NULL) {
		/*
		 * there should be a VM object here at this point
		 */
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}
	vm_object_lock(object);
	object->set_cache_attr = TRUE;
	vm_object_unlock(object);

	/* statistics only; not guaranteed exact under concurrency */
	vm_map_set_cache_attr_count++;
done:
	vm_map_unlock_read(map);

	return kr;
}
836
837
838 #if CONFIG_CODE_DECRYPTION
839 /*
840 * vm_map_apple_protected:
841 * This remaps the requested part of the object with an object backed by
842 * the decrypting pager.
843 * crypt_info contains entry points and session data for the crypt module.
844 * The crypt_info block will be copied by vm_map_apple_protected. The data structures
845 * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
846 */
847 kern_return_t
vm_map_apple_protected(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_object_offset_t crypto_backing_offset,struct pager_crypt_info * crypt_info,uint32_t cryptid)848 vm_map_apple_protected(
849 vm_map_t map,
850 vm_map_offset_t start,
851 vm_map_offset_t end,
852 vm_object_offset_t crypto_backing_offset,
853 struct pager_crypt_info *crypt_info,
854 uint32_t cryptid)
855 {
856 boolean_t map_locked;
857 kern_return_t kr;
858 vm_map_entry_t map_entry;
859 struct vm_map_entry tmp_entry;
860 memory_object_t unprotected_mem_obj;
861 vm_object_t protected_object;
862 vm_map_offset_t map_addr;
863 vm_map_offset_t start_aligned, end_aligned;
864 vm_object_offset_t crypto_start, crypto_end;
865 boolean_t cache_pager;
866
867 map_locked = FALSE;
868 unprotected_mem_obj = MEMORY_OBJECT_NULL;
869
870 start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
871 end_aligned = vm_map_round_page(end, PAGE_MASK_64);
872 start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
873 end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));
874
875 #if __arm64__
876 /*
877 * "start" and "end" might be 4K-aligned but not 16K-aligned,
878 * so we might have to loop and establish up to 3 mappings:
879 *
880 * + the first 16K-page, which might overlap with the previous
881 * 4K-aligned mapping,
882 * + the center,
883 * + the last 16K-page, which might overlap with the next
884 * 4K-aligned mapping.
885 * Each of these mapping might be backed by a vnode pager (if
886 * properly page-aligned) or a "fourk_pager", itself backed by a
887 * vnode pager (if 4K-aligned but not page-aligned).
888 */
889 #endif /* __arm64__ */
890
891 map_addr = start_aligned;
892 for (map_addr = start_aligned;
893 map_addr < end;
894 map_addr = tmp_entry.vme_end) {
895 vm_map_lock(map);
896 map_locked = TRUE;
897
898 /* lookup the protected VM object */
899 if (!vm_map_lookup_entry(map,
900 map_addr,
901 &map_entry) ||
902 map_entry->is_sub_map ||
903 VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
904 /* that memory is not properly mapped */
905 kr = KERN_INVALID_ARGUMENT;
906 goto done;
907 }
908
		/* ensure mapped memory is mapped as executable,
		 * except for the model decryption flow */
911 if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
912 !(map_entry->protection & VM_PROT_EXECUTE)) {
913 kr = KERN_INVALID_ARGUMENT;
914 goto done;
915 }
916
917 /* get the protected object to be decrypted */
918 protected_object = VME_OBJECT(map_entry);
919 if (protected_object == VM_OBJECT_NULL) {
920 /* there should be a VM object here at this point */
921 kr = KERN_INVALID_ARGUMENT;
922 goto done;
923 }
924 /* ensure protected object stays alive while map is unlocked */
925 vm_object_reference(protected_object);
926
927 /* limit the map entry to the area we want to cover */
928 vm_map_clip_start(map, map_entry, start_aligned);
929 vm_map_clip_end(map, map_entry, end_aligned);
930
931 tmp_entry = *map_entry;
932 map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
933 vm_map_unlock(map);
934 map_locked = FALSE;
935
936 /*
937 * This map entry might be only partially encrypted
938 * (if not fully "page-aligned").
939 */
940 crypto_start = 0;
941 crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
942 if (tmp_entry.vme_start < start) {
943 if (tmp_entry.vme_start != start_aligned) {
944 kr = KERN_INVALID_ADDRESS;
945 }
946 crypto_start += (start - tmp_entry.vme_start);
947 }
948 if (tmp_entry.vme_end > end) {
949 if (tmp_entry.vme_end != end_aligned) {
950 kr = KERN_INVALID_ADDRESS;
951 }
952 crypto_end -= (tmp_entry.vme_end - end);
953 }
954
955 /*
956 * This "extra backing offset" is needed to get the decryption
957 * routine to use the right key. It adjusts for the possibly
958 * relative offset of an interposed "4K" pager...
959 */
960 if (crypto_backing_offset == (vm_object_offset_t) -1) {
961 crypto_backing_offset = VME_OFFSET(&tmp_entry);
962 }
963
964 cache_pager = TRUE;
965 #if XNU_TARGET_OS_OSX
966 if (vm_map_is_alien(map)) {
967 cache_pager = FALSE;
968 }
969 #endif /* XNU_TARGET_OS_OSX */
970
971 /*
972 * Lookup (and create if necessary) the protected memory object
973 * matching that VM object.
974 * If successful, this also grabs a reference on the memory object,
975 * to guarantee that it doesn't go away before we get a chance to map
976 * it.
977 */
978 unprotected_mem_obj = apple_protect_pager_setup(
979 protected_object,
980 VME_OFFSET(&tmp_entry),
981 crypto_backing_offset,
982 crypt_info,
983 crypto_start,
984 crypto_end,
985 cache_pager);
986
987 /* release extra ref on protected object */
988 vm_object_deallocate(protected_object);
989
990 if (unprotected_mem_obj == NULL) {
991 kr = KERN_FAILURE;
992 goto done;
993 }
994
995 /* can overwrite an immutable mapping */
996 vm_map_kernel_flags_t vmk_flags = {
997 .vmf_fixed = true,
998 .vmf_overwrite = true,
999 .vmkf_overwrite_immutable = true,
1000 };
1001 #if __arm64__
1002 if (tmp_entry.used_for_jit &&
1003 (VM_MAP_PAGE_SHIFT(map) != FOURK_PAGE_SHIFT ||
1004 PAGE_SHIFT != FOURK_PAGE_SHIFT) &&
1005 fourk_binary_compatibility_unsafe &&
1006 fourk_binary_compatibility_allow_wx) {
1007 printf("** FOURK_COMPAT [%d]: "
1008 "allowing write+execute at 0x%llx\n",
1009 proc_selfpid(), tmp_entry.vme_start);
1010 vmk_flags.vmkf_map_jit = TRUE;
1011 }
1012 #endif /* __arm64__ */
1013
1014 /* map this memory object in place of the current one */
1015 map_addr = tmp_entry.vme_start;
1016 kr = vm_map_enter_mem_object(map,
1017 &map_addr,
1018 (tmp_entry.vme_end -
1019 tmp_entry.vme_start),
1020 (mach_vm_offset_t) 0,
1021 vmk_flags,
1022 (ipc_port_t)(uintptr_t) unprotected_mem_obj,
1023 0,
1024 TRUE,
1025 tmp_entry.protection,
1026 tmp_entry.max_protection,
1027 tmp_entry.inheritance);
1028 assertf(kr == KERN_SUCCESS,
1029 "kr = 0x%x\n", kr);
1030 assertf(map_addr == tmp_entry.vme_start,
1031 "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
1032 (uint64_t)map_addr,
1033 (uint64_t) tmp_entry.vme_start,
1034 &tmp_entry);
1035
1036 #if VM_MAP_DEBUG_APPLE_PROTECT
1037 if (vm_map_debug_apple_protect) {
1038 printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
1039 " backing:[object:%p,offset:0x%llx,"
1040 "crypto_backing_offset:0x%llx,"
1041 "crypto_start:0x%llx,crypto_end:0x%llx]\n",
1042 map,
1043 (uint64_t) map_addr,
1044 (uint64_t) (map_addr + (tmp_entry.vme_end -
1045 tmp_entry.vme_start)),
1046 unprotected_mem_obj,
1047 protected_object,
1048 VME_OFFSET(&tmp_entry),
1049 crypto_backing_offset,
1050 crypto_start,
1051 crypto_end);
1052 }
1053 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1054
1055 /*
1056 * Release the reference obtained by
1057 * apple_protect_pager_setup().
1058 * The mapping (if it succeeded) is now holding a reference on
1059 * the memory object.
1060 */
1061 memory_object_deallocate(unprotected_mem_obj);
1062 unprotected_mem_obj = MEMORY_OBJECT_NULL;
1063
1064 /* continue with next map entry */
1065 crypto_backing_offset += (tmp_entry.vme_end -
1066 tmp_entry.vme_start);
1067 crypto_backing_offset -= crypto_start;
1068 }
1069 kr = KERN_SUCCESS;
1070
1071 done:
1072 if (map_locked) {
1073 vm_map_unlock(map);
1074 }
1075 return kr;
1076 }
1077 #endif /* CONFIG_CODE_DECRYPTION */
1078
1079
/* Lock group and attributes shared by all VM map locks. */
LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);

/*
 * Boot-arg overridable policy knob ("malloc_no_cow"); defaults off on macOS
 * and on everywhere else.  When set, vm_map_init() populates
 * vm_memory_malloc_no_cow_mask with the VM_MEMORY_MALLOC* tags it applies to.
 */
#if XNU_TARGET_OS_OSX
int malloc_no_cow = 0;
#else /* XNU_TARGET_OS_OSX */
int malloc_no_cow = 1;
#endif /* XNU_TARGET_OS_OSX */
/* Bitmask of VM_MEMORY_* tags covered by the malloc_no_cow policy. */
uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
#if DEBUG
/* Enabled via "vm_check_map_sanity" boot-arg; see vm_map_init(). */
int vm_check_map_sanity = 0;
#endif
1093
1094 /*
1095 * vm_map_init:
1096 *
1097 * Initialize the vm_map module. Must be called before
1098 * any other vm_map routines.
1099 *
1100 * Map and entry structures are allocated from zones -- we must
1101 * initialize those zones.
1102 *
1103 * There are three zones of interest:
1104 *
1105 * vm_map_zone: used to allocate maps.
1106 * vm_map_entry_zone: used to allocate map entries.
1107 *
1108 * LP32:
1109 * vm_map_entry_reserved_zone: fallback zone for kernel map entries
1110 *
1111 * The kernel allocates map entries from a special zone that is initially
1112 * "crammed" with memory. It would be difficult (perhaps impossible) for
1113 * the kernel to allocate more memory to a entry zone when it became
1114 * empty since the very act of allocating memory implies the creation
1115 * of a new entry.
1116 */
1117 __startup_func
1118 void
vm_map_init(void)1119 vm_map_init(void)
1120 {
1121
1122 #if MACH_ASSERT
1123 PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1124 sizeof(debug4k_filter));
1125 #endif /* MACH_ASSERT */
1126
1127 zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1128 VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1129
1130 /*
1131 * Don't quarantine because we always need elements available
1132 * Disallow GC on this zone... to aid the GC.
1133 */
1134 zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1135 sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1136 ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1137 z->z_elems_rsv = (uint16_t)(32 *
1138 (ml_early_cpu_max_number() + 1));
1139 });
1140
1141 zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1142 sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1143 ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1144 z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
1145 });
1146
1147 zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1148 ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);
1149
1150 /*
1151 * Add the stolen memory to zones, adjust zone size and stolen counts.
1152 */
1153 zone_cram_early(vm_map_zone, map_data, map_data_size);
1154 zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1155 zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1156 printf("VM boostrap: %d maps, %d entries and %d holes available\n",
1157 zone_count_free(vm_map_zone),
1158 zone_count_free(vm_map_entry_zone),
1159 zone_count_free(vm_map_holes_zone));
1160
1161 /*
1162 * Since these are covered by zones, remove them from stolen page accounting.
1163 */
1164 VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1165
1166 #if VM_MAP_DEBUG_APPLE_PROTECT
1167 PE_parse_boot_argn("vm_map_debug_apple_protect",
1168 &vm_map_debug_apple_protect,
1169 sizeof(vm_map_debug_apple_protect));
1170 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1171 #if VM_MAP_DEBUG_APPLE_FOURK
1172 PE_parse_boot_argn("vm_map_debug_fourk",
1173 &vm_map_debug_fourk,
1174 sizeof(vm_map_debug_fourk));
1175 #endif /* VM_MAP_DEBUG_FOURK */
1176
1177 PE_parse_boot_argn("malloc_no_cow",
1178 &malloc_no_cow,
1179 sizeof(malloc_no_cow));
1180 if (malloc_no_cow) {
1181 vm_memory_malloc_no_cow_mask = 0ULL;
1182 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1183 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1184 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1185 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1186 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1187 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1188 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1189 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1190 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1191 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1192 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1193 PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1194 &vm_memory_malloc_no_cow_mask,
1195 sizeof(vm_memory_malloc_no_cow_mask));
1196 }
1197
1198 #if CONFIG_MAP_RANGES
1199 vm_map_range_map_init();
1200 #endif /* CONFIG_MAP_RANGES */
1201
1202 #if DEBUG
1203 PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1204 if (vm_check_map_sanity) {
1205 kprintf("VM sanity checking enabled\n");
1206 } else {
1207 kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1208 }
1209 #endif /* DEBUG */
1210
1211 #if DEVELOPMENT || DEBUG
1212 PE_parse_boot_argn("panic_on_unsigned_execute",
1213 &panic_on_unsigned_execute,
1214 sizeof(panic_on_unsigned_execute));
1215 PE_parse_boot_argn("panic_on_mlock_failure",
1216 &panic_on_mlock_failure,
1217 sizeof(panic_on_mlock_failure));
1218 #endif /* DEVELOPMENT || DEBUG */
1219 }
1220
__startup_func
static void
vm_map_steal_memory(void)
{
	/*
	 * We need to reserve enough memory to support bootstrapping VM maps
	 * and the zone subsystem.
	 *
	 * The VM Maps that need to function before zones can support them
	 * are the ones registered with vm_map_will_allocate_early_map(),
	 * which are:
	 * - the kernel map
	 * - the various submaps used by zones (pgz, meta, ...)
	 *
	 * We also need enough entries and holes to support them
	 * until zone_metadata_init() is called, which is when
	 * the zone allocator becomes capable of expanding dynamically.
	 *
	 * We need:
	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
	 * - To allow for 3-4 entries per map, but the kernel map
	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
	 *   to describe the submaps, so double it (and make it 8x too)
	 * - To allow for holes between entries,
	 *   hence needs the same budget as entries
	 */
	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
	    VM_MAP_EARLY_COUNT_MAX);

	/* 8x: see budget rationale above */
	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	/* holes get the same budget as entries */
	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	/*
	 * Steal a contiguous range of memory so that a simple range check
	 * can validate early addresses being freed/crammed to these
	 * zones
	 */
	map_data = zone_early_mem_init(map_data_size + kentry_data_size +
	    map_holes_data_size);
	kentry_data = map_data + map_data_size;
	map_holes_data = kentry_data + kentry_data_size;
}
STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1270
1271 __startup_func
1272 static void
vm_kernel_boostraped(void)1273 vm_kernel_boostraped(void)
1274 {
1275 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
1276 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
1277 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);
1278
1279 printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1280 zone_count_free(vm_map_zone),
1281 zone_count_free(vm_map_entry_zone),
1282 zone_count_free(vm_map_holes_zone));
1283 }
1284 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1285
1286 void
vm_map_disable_hole_optimization(vm_map_t map)1287 vm_map_disable_hole_optimization(vm_map_t map)
1288 {
1289 vm_map_entry_t head_entry, hole_entry, next_hole_entry;
1290
1291 if (map->holelistenabled) {
1292 head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1293
1294 while (hole_entry != NULL) {
1295 next_hole_entry = hole_entry->vme_next;
1296
1297 hole_entry->vme_next = NULL;
1298 hole_entry->vme_prev = NULL;
1299 zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1300
1301 if (next_hole_entry == head_entry) {
1302 hole_entry = NULL;
1303 } else {
1304 hole_entry = next_hole_entry;
1305 }
1306 }
1307
1308 map->holes_list = NULL;
1309 map->holelistenabled = FALSE;
1310
1311 map->first_free = vm_map_first_entry(map);
1312 SAVE_HINT_HOLE_WRITE(map, NULL);
1313 }
1314 }
1315
1316 boolean_t
vm_kernel_map_is_kernel(vm_map_t map)1317 vm_kernel_map_is_kernel(vm_map_t map)
1318 {
1319 return map->pmap == kernel_pmap;
1320 }
1321
1322 /*
1323 * vm_map_create:
1324 *
1325 * Creates and returns a new empty VM map with
1326 * the given physical map structure, and having
1327 * the given lower and upper address bounds.
1328 */
1329
1330 extern vm_map_t vm_map_create_external(
1331 pmap_t pmap,
1332 vm_map_offset_t min_off,
1333 vm_map_offset_t max_off,
1334 boolean_t pageable);
1335
1336 vm_map_t
vm_map_create_external(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,boolean_t pageable)1337 vm_map_create_external(
1338 pmap_t pmap,
1339 vm_map_offset_t min,
1340 vm_map_offset_t max,
1341 boolean_t pageable)
1342 {
1343 vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1344
1345 if (pageable) {
1346 options |= VM_MAP_CREATE_PAGEABLE;
1347 }
1348 return vm_map_create_options(pmap, min, max, options);
1349 }
1350
1351 __startup_func
1352 void
vm_map_will_allocate_early_map(vm_map_t * owner)1353 vm_map_will_allocate_early_map(vm_map_t *owner)
1354 {
1355 if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1356 panic("VM_MAP_EARLY_COUNT_MAX is too low");
1357 }
1358
1359 early_map_owners[early_map_count++] = owner;
1360 }
1361
1362 __startup_func
1363 void
vm_map_relocate_early_maps(vm_offset_t delta)1364 vm_map_relocate_early_maps(vm_offset_t delta)
1365 {
1366 for (uint32_t i = 0; i < early_map_count; i++) {
1367 vm_address_t addr = (vm_address_t)*early_map_owners[i];
1368
1369 *early_map_owners[i] = (vm_map_t)(addr + delta);
1370 }
1371
1372 early_map_count = ~0u;
1373 }
1374
1375 /*
1376 * Routine: vm_map_relocate_early_elem
1377 *
1378 * Purpose:
1379 * Early zone elements are allocated in a temporary part
1380 * of the address space.
1381 *
1382 * Once the zones live in their final place, the early
1383 * VM maps, map entries and map holes need to be relocated.
1384 *
1385 * It involves rewriting any vm_map_t, vm_map_entry_t or
1386 * pointers to vm_map_links. Other pointers to other types
1387 * are fine.
1388 *
1389 * Fortunately, pointers to those types are self-contained
1390 * in those zones, _except_ for pointers to VM maps,
1391 * which are tracked during early boot and fixed with
1392 * vm_map_relocate_early_maps().
1393 */
1394 __startup_func
1395 void
vm_map_relocate_early_elem(uint32_t zone_id,vm_offset_t new_addr,vm_offset_t delta)1396 vm_map_relocate_early_elem(
1397 uint32_t zone_id,
1398 vm_offset_t new_addr,
1399 vm_offset_t delta)
1400 {
1401 #define relocate(type_t, field) ({ \
1402 typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field; \
1403 if (*__field) { \
1404 *__field = (typeof(*__field))((vm_offset_t)*__field + delta); \
1405 } \
1406 })
1407
1408 switch (zone_id) {
1409 case ZONE_ID_VM_MAP:
1410 case ZONE_ID_VM_MAP_ENTRY:
1411 case ZONE_ID_VM_MAP_HOLES:
1412 break;
1413
1414 default:
1415 panic("Unexpected zone ID %d", zone_id);
1416 }
1417
1418 if (zone_id == ZONE_ID_VM_MAP) {
1419 relocate(vm_map_t, hdr.links.prev);
1420 relocate(vm_map_t, hdr.links.next);
1421 ((vm_map_t)new_addr)->pmap = kernel_pmap;
1422 #ifdef VM_MAP_STORE_USE_RB
1423 relocate(vm_map_t, hdr.rb_head_store.rbh_root);
1424 #endif /* VM_MAP_STORE_USE_RB */
1425 relocate(vm_map_t, hint);
1426 relocate(vm_map_t, hole_hint);
1427 relocate(vm_map_t, first_free);
1428 return;
1429 }
1430
1431 relocate(struct vm_map_links *, prev);
1432 relocate(struct vm_map_links *, next);
1433
1434 if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
1435 #ifdef VM_MAP_STORE_USE_RB
1436 relocate(vm_map_entry_t, store.entry.rbe_left);
1437 relocate(vm_map_entry_t, store.entry.rbe_right);
1438 relocate(vm_map_entry_t, store.entry.rbe_parent);
1439 #endif /* VM_MAP_STORE_USE_RB */
1440 if (((vm_map_entry_t)new_addr)->is_sub_map) {
1441 /* no object to relocate because we haven't made any */
1442 ((vm_map_entry_t)new_addr)->vme_submap +=
1443 delta >> VME_SUBMAP_SHIFT;
1444 }
1445 #if MAP_ENTRY_CREATION_DEBUG
1446 relocate(vm_map_entry_t, vme_creation_maphdr);
1447 #endif /* MAP_ENTRY_CREATION_DEBUG */
1448 }
1449
1450 #undef relocate
1451 }
1452
/*
 * vm_map_create_options:
 *
 * Allocate and initialize an empty VM map over [min, max) backed by
 * "pmap", honoring the given creation options.  Returns the new map
 * with a single reference held by the caller.
 */
vm_map_t
vm_map_create_options(
	pmap_t pmap,
	vm_map_offset_t min,
	vm_map_offset_t max,
	vm_map_create_options_t options)
{
	vm_map_t result;

#if DEBUG || DEVELOPMENT
	/*
	 * Before zalloc is up, every map must have been pre-registered
	 * via vm_map_will_allocate_early_map() and must use the kernel pmap.
	 */
	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
		if (early_map_count != ~0u && early_map_count !=
		    zone_count_allocated(vm_map_zone) + 1) {
			panic("allocating %dth early map, owner not known",
			    zone_count_allocated(vm_map_zone) + 1);
		}
		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
			panic("allocating %dth early map for non kernel pmap",
			    early_map_count);
		}
	}
#endif /* DEBUG || DEVELOPMENT */

	result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);

	vm_map_store_init(&result->hdr);
	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
	vm_map_set_page_shift(result, PAGE_SHIFT);

	result->size_limit = RLIM_INFINITY;             /* default unlimited */
	result->data_limit = RLIM_INFINITY;             /* default unlimited */
	result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
	result->pmap = pmap;
	result->min_offset = min;
	result->max_offset = max;
	/* empty map: hints start at the sentinel entry */
	result->first_free = vm_map_to_entry(result);
	result->hint = vm_map_to_entry(result);

	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
		assert(pmap == kernel_pmap);
		result->never_faults = true;
	}

	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
		result->has_corpse_footprint = true;
	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
		struct vm_map_links *hole_entry;

		/* one hole initially covering the whole map */
		hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
		hole_entry->start = min;
#if defined(__arm64__)
		hole_entry->end = result->max_offset;
#else
		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
#endif
		result->holes_list = result->hole_hint = hole_entry;
		/* circular, single-element hole list */
		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
		result->holelistenabled = true;
	}

	vm_map_lock_init(result);

	return result;
}
1519
1520 /*
1521 * Adjusts a submap that was made by kmem_suballoc()
1522 * before it knew where it would be mapped,
1523 * so that it has the right min/max offsets.
1524 *
1525 * We do not need to hold any locks:
1526 * only the caller knows about this map,
1527 * and it is not published on any entry yet.
1528 */
1529 static void
vm_map_adjust_offsets(vm_map_t map,vm_map_offset_t min_off,vm_map_offset_t max_off)1530 vm_map_adjust_offsets(
1531 vm_map_t map,
1532 vm_map_offset_t min_off,
1533 vm_map_offset_t max_off)
1534 {
1535 assert(map->min_offset == 0);
1536 assert(map->max_offset == max_off - min_off);
1537 assert(map->hdr.nentries == 0);
1538 assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1539
1540 map->min_offset = min_off;
1541 map->max_offset = max_off;
1542
1543 if (map->holelistenabled) {
1544 struct vm_map_links *hole = map->holes_list;
1545
1546 hole->start = min_off;
1547 #if defined(__arm64__)
1548 hole->end = max_off;
1549 #else
1550 hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1551 #endif
1552 }
1553 }
1554
1555
1556 vm_map_size_t
vm_map_adjusted_size(vm_map_t map)1557 vm_map_adjusted_size(vm_map_t map)
1558 {
1559 const struct vm_reserved_region *regions = NULL;
1560 size_t num_regions = 0;
1561 mach_vm_size_t reserved_size = 0, map_size = 0;
1562
1563 if (map == NULL || (map->size == 0)) {
1564 return 0;
1565 }
1566
1567 map_size = map->size;
1568
1569 if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1570 /*
1571 * No special reserved regions or not an exotic map or the task
1572 * is terminating and these special regions might have already
1573 * been deallocated.
1574 */
1575 return map_size;
1576 }
1577
1578 num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), ®ions);
1579 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1580
1581 while (num_regions) {
1582 reserved_size += regions[--num_regions].vmrr_size;
1583 }
1584
1585 /*
1586 * There are a few places where the map is being switched out due to
1587 * 'termination' without that bit being set (e.g. exec and corpse purging).
1588 * In those cases, we could have the map's regions being deallocated on
1589 * a core while some accounting process is trying to get the map's size.
1590 * So this assert can't be enabled till all those places are uniform in
1591 * their use of the 'map->terminated' bit.
1592 *
1593 * assert(map_size >= reserved_size);
1594 */
1595
1596 return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1597 }
1598
1599 /*
1600 * vm_map_entry_create: [ internal use only ]
1601 *
1602 * Allocates a VM map entry for insertion in the
1603 * given map (or map copy). No fields are filled.
1604 *
1605 * The VM entry will be zero initialized, except for:
1606 * - behavior set to VM_BEHAVIOR_DEFAULT
1607 * - inheritance set to VM_INHERIT_DEFAULT
1608 */
1609 #define vm_map_entry_create(map) _vm_map_entry_create(&(map)->hdr)
1610
1611 #define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)
1612
1613 static vm_map_entry_t
_vm_map_entry_create(struct vm_map_header * map_header __unused)1614 _vm_map_entry_create(
1615 struct vm_map_header *map_header __unused)
1616 {
1617 vm_map_entry_t entry = NULL;
1618
1619 entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);
1620
1621 /*
1622 * Help the compiler with what we know to be true,
1623 * so that the further bitfields inits have good codegen.
1624 *
1625 * See rdar://87041299
1626 */
1627 __builtin_assume(entry->vme_object_value == 0);
1628 __builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
1629 __builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);
1630
1631 static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
1632 "VME_ALIAS_MASK covers tags");
1633
1634 static_assert(VM_BEHAVIOR_DEFAULT == 0,
1635 "can skip zeroing of the behavior field");
1636 entry->inheritance = VM_INHERIT_DEFAULT;
1637
1638 #if MAP_ENTRY_CREATION_DEBUG
1639 entry->vme_creation_maphdr = map_header;
1640 entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
1641 BTREF_GET_NOWAIT);
1642 #endif
1643 return entry;
1644 }
1645
1646 /*
1647 * vm_map_entry_dispose: [ internal use only ]
1648 *
1649 * Inverse of vm_map_entry_create.
1650 *
1651 * write map lock held so no need to
1652 * do anything special to insure correctness
1653 * of the stores
1654 */
1655 static void
vm_map_entry_dispose(vm_map_entry_t entry)1656 vm_map_entry_dispose(
1657 vm_map_entry_t entry)
1658 {
1659 #if MAP_ENTRY_CREATION_DEBUG
1660 btref_put(entry->vme_creation_bt);
1661 #endif
1662 #if MAP_ENTRY_INSERTION_DEBUG
1663 btref_put(entry->vme_insertion_bt);
1664 #endif
1665 zfree(vm_map_entry_zone, entry);
1666 }
1667
1668 #define vm_map_copy_entry_dispose(copy_entry) \
1669 vm_map_entry_dispose(copy_entry)
1670
1671 static vm_map_entry_t
vm_map_zap_first_entry(vm_map_zap_t list)1672 vm_map_zap_first_entry(
1673 vm_map_zap_t list)
1674 {
1675 return list->vmz_head;
1676 }
1677
1678 static vm_map_entry_t
vm_map_zap_last_entry(vm_map_zap_t list)1679 vm_map_zap_last_entry(
1680 vm_map_zap_t list)
1681 {
1682 assert(vm_map_zap_first_entry(list));
1683 return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
1684 }
1685
1686 static void
vm_map_zap_append(vm_map_zap_t list,vm_map_entry_t entry)1687 vm_map_zap_append(
1688 vm_map_zap_t list,
1689 vm_map_entry_t entry)
1690 {
1691 entry->vme_next = VM_MAP_ENTRY_NULL;
1692 *list->vmz_tail = entry;
1693 list->vmz_tail = &entry->vme_next;
1694 }
1695
1696 static vm_map_entry_t
vm_map_zap_pop(vm_map_zap_t list)1697 vm_map_zap_pop(
1698 vm_map_zap_t list)
1699 {
1700 vm_map_entry_t head = list->vmz_head;
1701
1702 if (head != VM_MAP_ENTRY_NULL &&
1703 (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1704 list->vmz_tail = &list->vmz_head;
1705 }
1706
1707 return head;
1708 }
1709
1710 static void
vm_map_zap_dispose(vm_map_zap_t list)1711 vm_map_zap_dispose(
1712 vm_map_zap_t list)
1713 {
1714 vm_map_entry_t entry;
1715
1716 while ((entry = vm_map_zap_pop(list))) {
1717 if (entry->is_sub_map) {
1718 vm_map_deallocate(VME_SUBMAP(entry));
1719 } else {
1720 vm_object_deallocate(VME_OBJECT(entry));
1721 }
1722
1723 vm_map_entry_dispose(entry);
1724 }
1725 }
1726
#if MACH_ASSERT
/* Opt-in debug knob: when FALSE the check is skipped entirely. */
static boolean_t first_free_check = FALSE;
boolean_t
first_free_is_valid(
	vm_map_t map)
{
	/* trust the store unless checking was explicitly enabled */
	return first_free_check ? first_free_is_valid_store(map) : TRUE;
}
#endif /* MACH_ASSERT */
1740
1741
/* Link/unlink an entry into a map copy's header (shares the map-entry store). */
#define vm_map_copy_entry_link(copy, after_where, entry)                \
	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))

#define vm_map_copy_entry_unlink(copy, entry)                           \
	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1747
1748 /*
1749 * vm_map_destroy:
1750 *
1751 * Actually destroy a map.
1752 */
1753 void
vm_map_destroy(vm_map_t map)1754 vm_map_destroy(
1755 vm_map_t map)
1756 {
1757 /* final cleanup: this is not allowed to fail */
1758 vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;
1759
1760 VM_MAP_ZAP_DECLARE(zap);
1761
1762 vm_map_lock(map);
1763
1764 map->terminated = true;
1765 /* clean up regular map entries */
1766 (void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
1767 KMEM_GUARD_NONE, &zap);
1768 /* clean up leftover special mappings (commpage, GPU carveout, etc...) */
1769 (void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
1770 KMEM_GUARD_NONE, &zap);
1771
1772 vm_map_disable_hole_optimization(map);
1773 vm_map_corpse_footprint_destroy(map);
1774
1775 vm_map_unlock(map);
1776
1777 vm_map_zap_dispose(&zap);
1778
1779 assert(map->hdr.nentries == 0);
1780
1781 if (map->pmap) {
1782 pmap_destroy(map->pmap);
1783 }
1784
1785 lck_rw_destroy(&map->lock, &vm_map_lck_grp);
1786
1787 zfree_id(ZONE_ID_VM_MAP, map);
1788 }
1789
1790 /*
1791 * Returns pid of the task with the largest number of VM map entries.
1792 * Used in the zone-map-exhaustion jetsam path.
1793 */
1794 pid_t
find_largest_process_vm_map_entries(void)1795 find_largest_process_vm_map_entries(void)
1796 {
1797 pid_t victim_pid = -1;
1798 int max_vm_map_entries = 0;
1799 task_t task = TASK_NULL;
1800 queue_head_t *task_list = &tasks;
1801
1802 lck_mtx_lock(&tasks_threads_lock);
1803 queue_iterate(task_list, task, task_t, tasks) {
1804 if (task == kernel_task || !task->active) {
1805 continue;
1806 }
1807
1808 vm_map_t task_map = task->map;
1809 if (task_map != VM_MAP_NULL) {
1810 int task_vm_map_entries = task_map->hdr.nentries;
1811 if (task_vm_map_entries > max_vm_map_entries) {
1812 max_vm_map_entries = task_vm_map_entries;
1813 victim_pid = pid_from_task(task);
1814 }
1815 }
1816 }
1817 lck_mtx_unlock(&tasks_threads_lock);
1818
1819 printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
1820 return victim_pid;
1821 }
1822
1823
1824 /*
1825 * vm_map_lookup_entry: [ internal use only ]
1826 *
1827 * Calls into the vm map store layer to find the map
1828 * entry containing (or immediately preceding) the
1829 * specified address in the given map; the entry is returned
1830 * in the "entry" parameter. The boolean
1831 * result indicates whether the address is
1832 * actually contained in the map.
1833 */
1834 boolean_t
vm_map_lookup_entry(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)1835 vm_map_lookup_entry(
1836 vm_map_t map,
1837 vm_map_offset_t address,
1838 vm_map_entry_t *entry) /* OUT */
1839 {
1840 #if CONFIG_KERNEL_TBI
1841 if (VM_KERNEL_ADDRESS(address)) {
1842 address = VM_KERNEL_STRIP_UPTR(address);
1843 }
1844 #endif /* CONFIG_KERNEL_TBI */
1845 #if CONFIG_PROB_GZALLOC
1846 if (map->pmap == kernel_pmap) {
1847 assertf(!pgz_owned(address),
1848 "it is the responsibility of callers to unguard PGZ addresses");
1849 }
1850 #endif /* CONFIG_PROB_GZALLOC */
1851 return vm_map_store_lookup_entry( map, address, entry );
1852 }
1853
1854 boolean_t
vm_map_lookup_entry_or_next(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)1855 vm_map_lookup_entry_or_next(
1856 vm_map_t map,
1857 vm_map_offset_t address,
1858 vm_map_entry_t *entry) /* OUT */
1859 {
1860 if (vm_map_lookup_entry(map, address, entry)) {
1861 return true;
1862 }
1863
1864 *entry = (*entry)->vme_next;
1865 return false;
1866 }
1867
#if CONFIG_PROB_GZALLOC
/*
 * Variant of vm_map_lookup_entry() that skips the PGZ ownership assert,
 * for callers that legitimately look up PGZ-guarded addresses.
 */
boolean_t
vm_map_lookup_entry_allow_pgz(
	vm_map_t map,
	vm_map_offset_t address,
	vm_map_entry_t *entry) /* OUT */
{
#if CONFIG_KERNEL_TBI
	/* strip the TBI tag byte so tagged kernel pointers compare correctly */
	if (VM_KERNEL_ADDRESS(address)) {
		address = VM_KERNEL_STRIP_UPTR(address);
	}
#endif /* CONFIG_KERNEL_TBI */
	return vm_map_store_lookup_entry( map, address, entry );
}
#endif /* CONFIG_PROB_GZALLOC */
1883
1884 /*
1885 * Routine: vm_map_range_invalid_panic
1886 * Purpose:
1887 * Panic on detection of an invalid range id.
1888 */
1889 __abortlike
1890 static void
vm_map_range_invalid_panic(vm_map_t map,vm_map_range_id_t range_id)1891 vm_map_range_invalid_panic(
1892 vm_map_t map,
1893 vm_map_range_id_t range_id)
1894 {
1895 panic("invalid range ID (%u) for map %p", range_id, map);
1896 }
1897
1898 /*
1899 * Routine: vm_map_get_range
1900 * Purpose:
1901 * Adjust bounds based on security policy.
1902 */
1903 static struct mach_vm_range
vm_map_get_range(vm_map_t map,vm_map_address_t * address,vm_map_kernel_flags_t * vmk_flags,vm_map_size_t size,bool * is_ptr)1904 vm_map_get_range(
1905 vm_map_t map,
1906 vm_map_address_t *address,
1907 vm_map_kernel_flags_t *vmk_flags,
1908 vm_map_size_t size,
1909 bool *is_ptr)
1910 {
1911 struct mach_vm_range effective_range = {};
1912 vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;
1913
1914 if (map == kernel_map) {
1915 effective_range = kmem_ranges[range_id];
1916
1917 if (startup_phase >= STARTUP_SUB_KMEM) {
1918 /*
1919 * Hint provided by caller is zeroed as the range is restricted to a
1920 * subset of the entire kernel_map VA, which could put the hint outside
1921 * the range, causing vm_map_store_find_space to fail.
1922 */
1923 *address = 0ull;
1924 /*
1925 * Ensure that range_id passed in by the caller is within meaningful
1926 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
1927 * to fail as the corresponding range is invalid. Range id larger than
1928 * KMEM_RANGE_ID_MAX will lead to an OOB access.
1929 */
1930 if ((range_id == KMEM_RANGE_ID_NONE) ||
1931 (range_id > KMEM_RANGE_ID_MAX)) {
1932 vm_map_range_invalid_panic(map, range_id);
1933 }
1934
1935 /*
1936 * Pointer ranges use kmem_locate_space to do allocations.
1937 *
1938 * Non pointer fronts look like [ Small | Large | Permanent ]
1939 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
1940 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
1941 * use the entire range.
1942 */
1943 if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
1944 *is_ptr = true;
1945 } else if (size >= KMEM_SMALLMAP_THRESHOLD) {
1946 effective_range = kmem_large_ranges[range_id];
1947 }
1948 }
1949 #if CONFIG_MAP_RANGES
1950 } else if (map->uses_user_ranges) {
1951 if (range_id > UMEM_RANGE_ID_MAX) {
1952 vm_map_range_invalid_panic(map, range_id);
1953 }
1954
1955 effective_range = map->user_range[range_id];
1956 #endif /* CONFIG_MAP_RANGES */
1957 } else {
1958 /*
1959 * If minimum is 0, bump it up by PAGE_SIZE. We want to limit
1960 * allocations of PAGEZERO to explicit requests since its
1961 * normal use is to catch dereferences of NULL and many
1962 * applications also treat pointers with a value of 0 as
1963 * special and suddenly having address 0 contain useable
1964 * memory would tend to confuse those applications.
1965 */
1966 effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
1967 effective_range.max_address = map->max_offset;
1968 }
1969
1970 return effective_range;
1971 }
1972
/*
 *	Routine:	vm_map_locate_space
 *	Purpose:
 *		Finds a range in the specified virtual address map,
 *		returning the start of that range,
 *		as well as the entry right before it.
 *
 *		The map must be locked by the caller.  It stays locked,
 *		except that a "wait_for_space" map may be unlocked and
 *		relocked while blocking for room to appear.
 *
 *	In/out:
 *		start_inout	on entry: caller's hint address (0 for "none");
 *				on success: the start of the located range.
 *		entry_out	on success: the entry preceding the range
 *				(may be left unset when the request is
 *				delegated to kmem_locate_space()).
 *
 *	Returns:
 *		KERN_SUCCESS		a range was found.
 *		KERN_NO_SPACE		no hole large enough in the
 *					effective range.
 *		KERN_INVALID_ARGUMENT	second JIT entry on a map that
 *					only allows one.
 */
kern_return_t
vm_map_locate_space(
	vm_map_t                map,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_offset_t         *start_inout,
	vm_map_entry_t          *entry_out)
{
	struct mach_vm_range    effective_range = {};
	vm_map_size_t           guard_offset;
	vm_map_offset_t         hint, limit;
	vm_map_entry_t          entry;
	bool                    is_kmem_ptr_range = false;

	/*
	 * Only supported by vm_map_enter() with a fixed address.
	 */
	assert(!vmk_flags.vmkf_beyond_max);

	if (__improbable(map->wait_for_space)) {
		/*
		 * support for "wait_for_space" is minimal,
		 * its only consumer is the ipc_kernel_copy_map.
		 */
		assert(!map->holelistenabled &&
		    !vmk_flags.vmkf_last_free &&
		    !vmk_flags.vmkf_keep_map_locked &&
		    !vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmf_random_addr &&
		    *start_inout <= map->min_offset);
	} else if (vmk_flags.vmkf_last_free) {
		/* top-down search is incompatible with JIT/random placement */
		assert(!vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmf_random_addr);
	}

	if (vmk_flags.vmkf_guard_before) {
		/*
		 * One leading guard page is carved out of "size":
		 * the hole we search for still covers guard + payload.
		 */
		guard_offset = VM_MAP_PAGE_SIZE(map);
		assert(size > guard_offset);
		size -= guard_offset;
	} else {
		assert(size != 0);
		guard_offset = 0;
	}

	/*
	 * Validate range_id from flags and get associated range
	 */
	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
	    &is_kmem_ptr_range);

	/*
	 * kmem pointer ranges have their own allocator; delegate and
	 * return its verdict directly.
	 */
	if (is_kmem_ptr_range) {
		return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
		           vmk_flags.vmkf_last_free, start_inout, entry_out);
	}

#if XNU_TARGET_OS_OSX
	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
		/* constrain the search below 4GB for 32-bit-VA clients */
		assert(map != kernel_map);
		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
	}
#endif /* XNU_TARGET_OS_OSX */

again:
	if (vmk_flags.vmkf_last_free) {
		/*
		 * Top-down search: scan downward from the hint (or the
		 * top of the range) toward the range's minimum.
		 */
		hint = *start_inout;

		if (hint == 0 || hint > effective_range.max_address) {
			hint = effective_range.max_address;
		}
		if (hint <= effective_range.min_address) {
			return KERN_NO_SPACE;
		}
		limit = effective_range.min_address;
	} else {
		hint = *start_inout;

		if (vmk_flags.vmkf_map_jit) {
			if (map->jit_entry_exists &&
			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
				return KERN_INVALID_ARGUMENT;
			}
			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
				vmk_flags.vmf_random_addr = true;
			}
		}

		if (vmk_flags.vmf_random_addr) {
			/* pick a randomized hint (ASLR-style placement) */
			kern_return_t kr;

			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
			if (kr != KERN_SUCCESS) {
				return kr;
			}
		}
#if XNU_TARGET_OS_OSX
		else if ((hint == 0 || hint == vm_map_min(map)) &&
		    !map->disable_vmentry_reuse &&
		    map->vmmap_high_start != 0) {
			/* honor the map's configured high-start floor */
			hint = map->vmmap_high_start;
		}
#endif /* XNU_TARGET_OS_OSX */

		/* clamp the hint into the effective range */
		if (hint < effective_range.min_address) {
			hint = effective_range.min_address;
		}
		if (effective_range.max_address <= hint) {
			return KERN_NO_SPACE;
		}

		limit = effective_range.max_address;
	}
	entry = vm_map_store_find_space(map,
	    hint, limit, vmk_flags.vmkf_last_free,
	    guard_offset, size, mask,
	    start_inout);

	if (__improbable(entry == NULL)) {
		/*
		 * No hole found.  A "wait_for_space" map blocks until
		 * someone releases space (wakeup is posted on the map
		 * address) and retries, but only if the request could
		 * ever fit in the range at all.
		 */
		if (map->wait_for_space &&
		    guard_offset + size <=
		    effective_range.max_address - effective_range.min_address) {
			assert_wait((event_t)map, THREAD_ABORTSAFE);
			vm_map_unlock(map);
			thread_block(THREAD_CONTINUE_NULL);
			vm_map_lock(map);
			goto again;
		}
		return KERN_NO_SPACE;
	}

	if (entry_out) {
		*entry_out = entry;
	}
	return KERN_SUCCESS;
}
2115
2116
/*
 *	Routine:	vm_map_find_space
 *	Purpose:
 *		Allocate a range in the specified virtual address map,
 *		returning the entry allocated for that range.
 *		Used by kmem_alloc, etc.
 *
 *	The map must be NOT be locked. It will be returned locked
 *	on KERN_SUCCESS, unlocked on failure.
 *
 *	If an entry is allocated, the object/offset fields
 *	are initialized to zero.
 *
 *	hint_address is taken by value and used as the search hint;
 *	the located start is written to *o_entry's vme_start instead
 *	of being returned through the hint.
 */
kern_return_t
vm_map_find_space(
	vm_map_t                map,
	vm_map_offset_t         hint_address,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_entry_t          *o_entry)       /* OUT */
{
	vm_map_entry_t  new_entry, entry;
	kern_return_t   kr;

	if (size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * Create and pre-initialize the entry before taking the map
	 * lock, keeping the locked section as short as possible.
	 */
	new_entry = vm_map_entry_create(map);
	new_entry->use_pmap = true;
	new_entry->protection = VM_PROT_DEFAULT;
	new_entry->max_protection = VM_PROT_ALL;

	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
		/* map's page size differs from the kernel's page size */
		new_entry->map_aligned = true;
	}
	if (vmk_flags.vmf_permanent) {
		new_entry->vme_permanent = true;
	}

	vm_map_lock(map);

	kr = vm_map_locate_space(map, size, mask, vmk_flags,
	    &hint_address, &entry);
	if (kr != KERN_SUCCESS) {
		/* failure contract: return with the map unlocked */
		vm_map_unlock(map);
		vm_map_entry_dispose(new_entry);
		return kr;
	}
	/* hint_address now holds the start of the located hole */
	new_entry->vme_start = hint_address;
	new_entry->vme_end = hint_address + size;

	/*
	 * At this point,
	 *
	 * - new_entry's "vme_start" and "vme_end" should define
	 *   the endpoints of the available new range,
	 *
	 * - and "entry" should refer to the region before
	 *   the new range,
	 *
	 * - and the map should still be locked.
	 */

	assert(page_aligned(new_entry->vme_start));
	assert(page_aligned(new_entry->vme_end));
	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));

	/*
	 *	Insert the new entry into the list
	 */

	vm_map_store_entry_link(map, entry, new_entry,
	    VM_MAP_KERNEL_FLAGS_NONE);
	map->size += size;

	/*
	 *	Update the lookup hint
	 */
	SAVE_HINT_MAP_WRITE(map, new_entry);

	/* success: map is returned locked, per the contract above */
	*o_entry = new_entry;
	return KERN_SUCCESS;
}
2203
/*
 * Debug knobs for vm_map_pmap_enter():
 *  - vm_map_pmap_enter_print enables per-page diagnostic printf()s.
 *  - vm_map_pmap_enter_enable gates the pre-faulting behavior
 *    (checked by callers; usage not visible in this chunk).
 */
int vm_map_pmap_enter_print = FALSE;
int vm_map_pmap_enter_enable = FALSE;
2206
/*
 *	Routine:	vm_map_pmap_enter [internal only]
 *
 *	Description:
 *		Force pages from the specified object to be entered into
 *		the pmap at the specified address if they are present.
 *		As soon as a page not found in the object the scan ends.
 *
 *	Returns:
 *		Nothing.
 *
 *	In/out conditions:
 *		The source map should not be locked on entry.
 */
__unused static void
vm_map_pmap_enter(
	vm_map_t                map,
	vm_map_offset_t         addr,
	vm_map_offset_t         end_addr,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_prot_t               protection)
{
	int                     type_of_fault;
	kern_return_t           kr;
	struct vm_object_fault_info fault_info = {};

	if (map->pmap == 0) {
		/* nothing to enter mappings into */
		return;
	}

	/* only supported when the map's page size matches the kernel's */
	assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);

	while (addr < end_addr) {
		vm_page_t       m;


		/*
		 * TODO:
		 * From vm_map_enter(), we come into this function without the map
		 * lock held or the object lock held.
		 * We haven't taken a reference on the object either.
		 * We should do a proper lookup on the map to make sure
		 * that things are sane before we go locking objects that
		 * could have been deallocated from under us.
		 */

		vm_object_lock(object);

		m = vm_page_lookup(object, offset);

		/*
		 * Stop at the first page that is absent or not safely
		 * mappable (busy, fictitious, error/restart/absent).
		 */
		if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
			vm_object_unlock(object);
			return;
		}

		if (vm_map_pmap_enter_print) {
			printf("vm_map_pmap_enter:");
			printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
			    map, (unsigned long long)addr, object, (unsigned long long)offset);
		}
		type_of_fault = DBG_CACHE_HIT_FAULT;
		/*
		 * Best-effort: "kr" is assigned but deliberately not
		 * checked; a failed enter just means no pre-faulted
		 * mapping for this page.
		 */
		kr = vm_fault_enter(m, map->pmap,
		    addr,
		    PAGE_SIZE, 0,
		    protection, protection,
		    VM_PAGE_WIRED(m),
		    FALSE,                     /* change_wiring */
		    VM_KERN_MEMORY_NONE,       /* tag - not wiring */
		    &fault_info,
		    NULL,                      /* need_retry */
		    &type_of_fault);

		vm_object_unlock(object);

		/* advance one page at a time */
		offset += PAGE_SIZE_64;
		addr += PAGE_SIZE;
	}
}
2287
/* Upper bound on random-placement attempts before giving up. */
#define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
/*
 * Pick a random, page-aligned address within the map's effective
 * range that is the start of a hole at least "size" bytes long.
 *
 * On success, *address is set to the chosen address and KERN_SUCCESS
 * is returned; KERN_NO_SPACE is returned when "size" does not fit in
 * the range or no suitable hole was found within the try budget.
 *
 * NOTE(review): the caller presumably holds the map lock, since
 * vm_map_lookup_entry() is called on the map — confirm at call sites.
 */
static kern_return_t
vm_map_random_address_for_size(
	vm_map_t        map,
	vm_map_offset_t *address,
	vm_map_size_t   size,
	vm_map_kernel_flags_t vmk_flags)
{
	kern_return_t   kr = KERN_SUCCESS;
	int             tries = 0;
	vm_map_offset_t random_addr = 0;
	vm_map_offset_t hole_end;

	vm_map_entry_t  next_entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t  prev_entry = VM_MAP_ENTRY_NULL;
	vm_map_size_t   vm_hole_size = 0;
	vm_map_size_t   addr_space_size;
	bool            is_kmem_ptr;
	struct mach_vm_range effective_range;

	effective_range = vm_map_get_range(map, address, &vmk_flags, size,
	    &is_kmem_ptr);

	addr_space_size = effective_range.max_address - effective_range.min_address;
	if (size >= addr_space_size) {
		return KERN_NO_SPACE;
	}
	/* shrink the window so addr + size never exceeds the range */
	addr_space_size -= size;

	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));

	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		/* early_random() until the zalloc subsystem is up */
		if (startup_phase < STARTUP_SUB_ZALLOC) {
			random_addr = (vm_map_offset_t)early_random();
		} else {
			random_addr = (vm_map_offset_t)random();
		}
		/* scale to page granularity, then fold into the window */
		random_addr <<= VM_MAP_PAGE_SHIFT(map);
		random_addr = vm_map_trunc_page(
			effective_range.min_address + (random_addr % addr_space_size),
			VM_MAP_PAGE_MASK(map));

#if CONFIG_PROB_GZALLOC
		/*
		 * Avoid PGZ-owned addresses in the kernel map.
		 * NOTE(review): this "continue" bypasses the tries++
		 * below, so PGZ collisions don't consume the try
		 * budget — presumably intentional; confirm.
		 */
		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
			continue;
		}
#endif /* CONFIG_PROB_GZALLOC */

		/* FALSE => random_addr is not inside an existing entry */
		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
			if (prev_entry == vm_map_to_entry(map)) {
				next_entry = vm_map_first_entry(map);
			} else {
				next_entry = prev_entry->vme_next;
			}
			if (next_entry == vm_map_to_entry(map)) {
				/* hole extends to the end of the map */
				hole_end = vm_map_max(map);
			} else {
				hole_end = next_entry->vme_start;
			}
			vm_hole_size = hole_end - random_addr;
			if (vm_hole_size >= size) {
				/* found a hole big enough: done */
				*address = random_addr;
				break;
			}
		}
		tries++;
	}

	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		kr = KERN_NO_SPACE;
	}
	return kr;
}
2361
2362 static boolean_t
vm_memory_malloc_no_cow(int alias)2363 vm_memory_malloc_no_cow(
2364 int alias)
2365 {
2366 uint64_t alias_mask;
2367
2368 if (alias > 63) {
2369 return FALSE;
2370 }
2371
2372 alias_mask = 1ULL << alias;
2373 if (alias_mask & vm_memory_malloc_no_cow_mask) {
2374 return TRUE;
2375 }
2376 return FALSE;
2377 }
2378
/*
 * Counters named after the RLIMIT_AS / RLIMIT_DATA resource limits —
 * presumably incremented when vm_map_enter() trips those limits;
 * usage is not visible in this chunk, confirm before relying on it.
 */
uint64_t vm_map_enter_RLIMIT_AS_count = 0;
uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2381 /*
2382 * Routine: vm_map_enter
2383 *
2384 * Description:
2385 * Allocate a range in the specified virtual address map.
2386 * The resulting range will refer to memory defined by
2387 * the given memory object and offset into that object.
2388 *
2389 * Arguments are as defined in the vm_map call.
2390 */
/*
 * Counters for restoring previously-zapped mappings after a failed
 * overwrite mapping (see the zap_old_list handling in vm_map_enter);
 * update sites are outside this chunk.
 */
static unsigned int vm_map_enter_restore_successes = 0;
static unsigned int vm_map_enter_restore_failures = 0;
2393 kern_return_t
vm_map_enter(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)2394 vm_map_enter(
2395 vm_map_t map,
2396 vm_map_offset_t *address, /* IN/OUT */
2397 vm_map_size_t size,
2398 vm_map_offset_t mask,
2399 vm_map_kernel_flags_t vmk_flags,
2400 vm_object_t object,
2401 vm_object_offset_t offset,
2402 boolean_t needs_copy,
2403 vm_prot_t cur_protection,
2404 vm_prot_t max_protection,
2405 vm_inherit_t inheritance)
2406 {
2407 vm_map_entry_t entry, new_entry;
2408 vm_map_offset_t start, tmp_start, tmp_offset;
2409 vm_map_offset_t end, tmp_end;
2410 vm_map_offset_t tmp2_start, tmp2_end;
2411 vm_map_offset_t step;
2412 kern_return_t result = KERN_SUCCESS;
2413 bool map_locked = FALSE;
2414 bool pmap_empty = TRUE;
2415 bool new_mapping_established = FALSE;
2416 const bool keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2417 const bool anywhere = !vmk_flags.vmf_fixed;
2418 const bool purgable = vmk_flags.vmf_purgeable;
2419 const bool overwrite = vmk_flags.vmf_overwrite;
2420 const bool no_cache = vmk_flags.vmf_no_cache;
2421 const bool is_submap = vmk_flags.vmkf_submap;
2422 const bool permanent = vmk_flags.vmf_permanent;
2423 const bool no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2424 const bool entry_for_jit = vmk_flags.vmkf_map_jit;
2425 const bool iokit_acct = vmk_flags.vmkf_iokit_acct;
2426 const bool resilient_codesign = vmk_flags.vmf_resilient_codesign;
2427 const bool resilient_media = vmk_flags.vmf_resilient_media;
2428 const bool entry_for_tpro = vmk_flags.vmf_tpro;
2429 const unsigned int superpage_size = vmk_flags.vmf_superpage_size;
2430 const vm_tag_t alias = vmk_flags.vm_tag;
2431 vm_tag_t user_alias;
2432 kern_return_t kr;
2433 bool clear_map_aligned = FALSE;
2434 vm_map_size_t chunk_size = 0;
2435 vm_object_t caller_object;
2436 VM_MAP_ZAP_DECLARE(zap_old_list);
2437 VM_MAP_ZAP_DECLARE(zap_new_list);
2438
2439 caller_object = object;
2440
2441 assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2442
2443 if (vmk_flags.vmf_4gb_chunk) {
2444 #if defined(__LP64__)
2445 chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2446 #else /* __LP64__ */
2447 chunk_size = ANON_CHUNK_SIZE;
2448 #endif /* __LP64__ */
2449 } else {
2450 chunk_size = ANON_CHUNK_SIZE;
2451 }
2452
2453
2454
2455 if (superpage_size) {
2456 switch (superpage_size) {
2457 /*
2458 * Note that the current implementation only supports
2459 * a single size for superpages, SUPERPAGE_SIZE, per
2460 * architecture. As soon as more sizes are supposed
2461 * to be supported, SUPERPAGE_SIZE has to be replaced
2462 * with a lookup of the size depending on superpage_size.
2463 */
2464 #ifdef __x86_64__
2465 case SUPERPAGE_SIZE_ANY:
2466 /* handle it like 2 MB and round up to page size */
2467 size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2468 OS_FALLTHROUGH;
2469 case SUPERPAGE_SIZE_2MB:
2470 break;
2471 #endif
2472 default:
2473 return KERN_INVALID_ARGUMENT;
2474 }
2475 mask = SUPERPAGE_SIZE - 1;
2476 if (size & (SUPERPAGE_SIZE - 1)) {
2477 return KERN_INVALID_ARGUMENT;
2478 }
2479 inheritance = VM_INHERIT_NONE; /* fork() children won't inherit superpages */
2480 }
2481
2482
2483 if ((cur_protection & VM_PROT_WRITE) &&
2484 (cur_protection & VM_PROT_EXECUTE) &&
2485 #if XNU_TARGET_OS_OSX
2486 map->pmap != kernel_pmap &&
2487 (cs_process_global_enforcement() ||
2488 (vmk_flags.vmkf_cs_enforcement_override
2489 ? vmk_flags.vmkf_cs_enforcement
2490 : (vm_map_cs_enforcement(map)
2491 #if __arm64__
2492 || !VM_MAP_IS_EXOTIC(map)
2493 #endif /* __arm64__ */
2494 ))) &&
2495 #endif /* XNU_TARGET_OS_OSX */
2496 #if CODE_SIGNING_MONITOR
2497 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2498 #endif
2499 (VM_MAP_POLICY_WX_FAIL(map) ||
2500 VM_MAP_POLICY_WX_STRIP_X(map)) &&
2501 !entry_for_jit) {
2502 boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2503
2504 DTRACE_VM3(cs_wx,
2505 uint64_t, 0,
2506 uint64_t, 0,
2507 vm_prot_t, cur_protection);
2508 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2509 proc_selfpid(),
2510 (get_bsdtask_info(current_task())
2511 ? proc_name_address(get_bsdtask_info(current_task()))
2512 : "?"),
2513 __FUNCTION__,
2514 (vm_protect_wx_fail ? "failing" : "turning off execute"));
2515 cur_protection &= ~VM_PROT_EXECUTE;
2516 if (vm_protect_wx_fail) {
2517 return KERN_PROTECTION_FAILURE;
2518 }
2519 }
2520
2521 /*
2522 * If the task has requested executable lockdown,
2523 * deny any new executable mapping.
2524 */
2525 if (map->map_disallow_new_exec == TRUE) {
2526 if (cur_protection & VM_PROT_EXECUTE) {
2527 return KERN_PROTECTION_FAILURE;
2528 }
2529 }
2530
2531 if (resilient_codesign) {
2532 assert(!is_submap);
2533 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2534 if ((cur_protection | max_protection) & reject_prot) {
2535 return KERN_PROTECTION_FAILURE;
2536 }
2537 }
2538
2539 if (resilient_media) {
2540 assert(!is_submap);
2541 // assert(!needs_copy);
2542 if (object != VM_OBJECT_NULL &&
2543 !object->internal) {
2544 /*
2545 * This mapping is directly backed by an external
2546 * memory manager (e.g. a vnode pager for a file):
2547 * we would not have any safe place to inject
2548 * a zero-filled page if an actual page is not
2549 * available, without possibly impacting the actual
2550 * contents of the mapped object (e.g. the file),
2551 * so we can't provide any media resiliency here.
2552 */
2553 return KERN_INVALID_ARGUMENT;
2554 }
2555 }
2556
2557 if (is_submap) {
2558 vm_map_t submap;
2559 if (purgable) {
2560 /* submaps can not be purgeable */
2561 return KERN_INVALID_ARGUMENT;
2562 }
2563 if (object == VM_OBJECT_NULL) {
2564 /* submaps can not be created lazily */
2565 return KERN_INVALID_ARGUMENT;
2566 }
2567 submap = (vm_map_t) object;
2568 if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
2569 /* page size mismatch */
2570 return KERN_INVALID_ARGUMENT;
2571 }
2572 }
2573 if (vmk_flags.vmkf_already) {
2574 /*
2575 * VM_FLAGS_ALREADY says that it's OK if the same mapping
2576 * is already present. For it to be meaningul, the requested
2577 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
2578 * we shouldn't try and remove what was mapped there first
2579 * (!VM_FLAGS_OVERWRITE).
2580 */
2581 if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
2582 return KERN_INVALID_ARGUMENT;
2583 }
2584 }
2585
2586 if (size == 0 ||
2587 (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
2588 *address = 0;
2589 return KERN_INVALID_ARGUMENT;
2590 }
2591
2592 if (map->pmap == kernel_pmap) {
2593 user_alias = VM_KERN_MEMORY_NONE;
2594 } else {
2595 user_alias = alias;
2596 }
2597
2598 if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
2599 chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
2600 }
2601
2602 #define RETURN(value) { result = value; goto BailOut; }
2603
2604 assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
2605 assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
2606 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
2607 assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
2608 assertf(page_aligned(size), "0x%llx", (uint64_t)size);
2609 }
2610
2611 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2612 !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
2613 /*
2614 * In most cases, the caller rounds the size up to the
2615 * map's page size.
2616 * If we get a size that is explicitly not map-aligned here,
2617 * we'll have to respect the caller's wish and mark the
2618 * mapping as "not map-aligned" to avoid tripping the
2619 * map alignment checks later.
2620 */
2621 clear_map_aligned = TRUE;
2622 }
2623 if (!anywhere &&
2624 VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2625 !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
2626 /*
2627 * We've been asked to map at a fixed address and that
2628 * address is not aligned to the map's specific alignment.
2629 * The caller should know what it's doing (i.e. most likely
2630 * mapping some fragmented copy map, transferring memory from
2631 * a VM map with a different alignment), so clear map_aligned
2632 * for this new VM map entry and proceed.
2633 */
2634 clear_map_aligned = TRUE;
2635 }
2636
2637 /*
2638 * Only zero-fill objects are allowed to be purgable.
2639 * LP64todo - limit purgable objects to 32-bits for now
2640 */
2641 if (purgable &&
2642 (offset != 0 ||
2643 (object != VM_OBJECT_NULL &&
2644 (object->vo_size != size ||
2645 object->purgable == VM_PURGABLE_DENY))
2646 #if __LP64__
2647 || size > ANON_MAX_SIZE
2648 #endif
2649 )) {
2650 return KERN_INVALID_ARGUMENT;
2651 }
2652
2653 start = *address;
2654
2655 if (anywhere) {
2656 vm_map_lock(map);
2657 map_locked = TRUE;
2658
2659 result = vm_map_locate_space(map, size, mask, vmk_flags,
2660 &start, &entry);
2661 if (result != KERN_SUCCESS) {
2662 goto BailOut;
2663 }
2664
2665 *address = start;
2666 end = start + size;
2667 assert(VM_MAP_PAGE_ALIGNED(*address,
2668 VM_MAP_PAGE_MASK(map)));
2669 } else {
2670 vm_map_offset_t effective_min_offset, effective_max_offset;
2671
2672 effective_min_offset = map->min_offset;
2673 effective_max_offset = map->max_offset;
2674
2675 if (vmk_flags.vmkf_beyond_max) {
2676 /*
2677 * Allow an insertion beyond the map's max offset.
2678 */
2679 effective_max_offset = 0x00000000FFFFF000ULL;
2680 if (vm_map_is_64bit(map)) {
2681 effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2682 }
2683 #if XNU_TARGET_OS_OSX
2684 } else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2685 effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2686 #endif /* XNU_TARGET_OS_OSX */
2687 }
2688
2689 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2690 !overwrite &&
2691 user_alias == VM_MEMORY_REALLOC) {
2692 /*
2693 * Force realloc() to switch to a new allocation,
2694 * to prevent 4k-fragmented virtual ranges.
2695 */
2696 // DEBUG4K_ERROR("no realloc in place");
2697 return KERN_NO_SPACE;
2698 }
2699
2700 /*
2701 * Verify that:
2702 * the address doesn't itself violate
2703 * the mask requirement.
2704 */
2705
2706 vm_map_lock(map);
2707 map_locked = TRUE;
2708 if ((start & mask) != 0) {
2709 RETURN(KERN_NO_SPACE);
2710 }
2711
2712 #if CONFIG_MAP_RANGES
2713 if (map->uses_user_ranges) {
2714 struct mach_vm_range r;
2715
2716 vm_map_user_range_resolve(map, start, 1, &r);
2717 if (r.max_address == 0) {
2718 RETURN(KERN_INVALID_ADDRESS);
2719 }
2720 effective_min_offset = r.min_address;
2721 effective_max_offset = r.max_address;
2722 }
2723 #endif /* CONFIG_MAP_RANGES */
2724
2725 if ((startup_phase >= STARTUP_SUB_KMEM) && !is_submap &&
2726 (map == kernel_map)) {
2727 mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
2728 effective_min_offset = r->min_address;
2729 effective_max_offset = r->max_address;
2730 }
2731
2732 /*
2733 * ... the address is within bounds
2734 */
2735
2736 end = start + size;
2737
2738 if ((start < effective_min_offset) ||
2739 (end > effective_max_offset) ||
2740 (start >= end)) {
2741 RETURN(KERN_INVALID_ADDRESS);
2742 }
2743
2744 if (overwrite) {
2745 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
2746 kern_return_t remove_kr;
2747
2748 /*
2749 * Fixed mapping and "overwrite" flag: attempt to
2750 * remove all existing mappings in the specified
2751 * address range, saving them in our "zap_old_list".
2752 *
2753 * This avoids releasing the VM map lock in
2754 * vm_map_entry_delete() and allows atomicity
2755 * when we want to replace some mappings with a new one.
2756 * It also allows us to restore the old VM mappings if the
2757 * new mapping fails.
2758 */
2759 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2760
2761 if (vmk_flags.vmkf_overwrite_immutable) {
2762 /* we can overwrite immutable mappings */
2763 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2764 }
2765 if (vmk_flags.vmkf_remap_prot_copy) {
2766 remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2767 }
2768 remove_kr = vm_map_delete(map, start, end, remove_flags,
2769 KMEM_GUARD_NONE, &zap_old_list).kmr_return;
2770 if (remove_kr) {
2771 /* XXX FBDP restore zap_old_list? */
2772 RETURN(remove_kr);
2773 }
2774 }
2775
2776 /*
2777 * ... the starting address isn't allocated
2778 */
2779
2780 if (vm_map_lookup_entry(map, start, &entry)) {
2781 if (!(vmk_flags.vmkf_already)) {
2782 RETURN(KERN_NO_SPACE);
2783 }
2784 /*
2785 * Check if what's already there is what we want.
2786 */
2787 tmp_start = start;
2788 tmp_offset = offset;
2789 if (entry->vme_start < start) {
2790 tmp_start -= start - entry->vme_start;
2791 tmp_offset -= start - entry->vme_start;
2792 }
2793 for (; entry->vme_start < end;
2794 entry = entry->vme_next) {
2795 /*
2796 * Check if the mapping's attributes
2797 * match the existing map entry.
2798 */
2799 if (entry == vm_map_to_entry(map) ||
2800 entry->vme_start != tmp_start ||
2801 entry->is_sub_map != is_submap ||
2802 VME_OFFSET(entry) != tmp_offset ||
2803 entry->needs_copy != needs_copy ||
2804 entry->protection != cur_protection ||
2805 entry->max_protection != max_protection ||
2806 entry->inheritance != inheritance ||
2807 entry->iokit_acct != iokit_acct ||
2808 VME_ALIAS(entry) != alias) {
2809 /* not the same mapping ! */
2810 RETURN(KERN_NO_SPACE);
2811 }
2812 /*
2813 * Check if the same object is being mapped.
2814 */
2815 if (is_submap) {
2816 if (VME_SUBMAP(entry) !=
2817 (vm_map_t) object) {
2818 /* not the same submap */
2819 RETURN(KERN_NO_SPACE);
2820 }
2821 } else {
2822 if (VME_OBJECT(entry) != object) {
2823 /* not the same VM object... */
2824 vm_object_t obj2;
2825
2826 obj2 = VME_OBJECT(entry);
2827 if ((obj2 == VM_OBJECT_NULL ||
2828 obj2->internal) &&
2829 (object == VM_OBJECT_NULL ||
2830 object->internal)) {
2831 /*
2832 * ... but both are
2833 * anonymous memory,
2834 * so equivalent.
2835 */
2836 } else {
2837 RETURN(KERN_NO_SPACE);
2838 }
2839 }
2840 }
2841
2842 tmp_offset += entry->vme_end - entry->vme_start;
2843 tmp_start += entry->vme_end - entry->vme_start;
2844 if (entry->vme_end >= end) {
2845 /* reached the end of our mapping */
2846 break;
2847 }
2848 }
2849 /* it all matches: let's use what's already there ! */
2850 RETURN(KERN_MEMORY_PRESENT);
2851 }
2852
2853 /*
2854 * ... the next region doesn't overlap the
2855 * end point.
2856 */
2857
2858 if ((entry->vme_next != vm_map_to_entry(map)) &&
2859 (entry->vme_next->vme_start < end)) {
2860 RETURN(KERN_NO_SPACE);
2861 }
2862 }
2863
2864 /*
2865 * At this point,
2866 * "start" and "end" should define the endpoints of the
2867 * available new range, and
2868 * "entry" should refer to the region before the new
2869 * range, and
2870 *
2871 * the map should be locked.
2872 */
2873
2874 /*
2875 * See whether we can avoid creating a new entry (and object) by
2876 * extending one of our neighbors. [So far, we only attempt to
2877 * extend from below.] Note that we can never extend/join
2878 * purgable objects because they need to remain distinct
2879 * entities in order to implement their "volatile object"
2880 * semantics.
2881 */
2882
2883 if (purgable ||
2884 entry_for_jit ||
2885 entry_for_tpro ||
2886 vm_memory_malloc_no_cow(user_alias)) {
2887 if (object == VM_OBJECT_NULL) {
2888 object = vm_object_allocate(size);
2889 object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2890 object->true_share = FALSE;
2891 if (purgable) {
2892 task_t owner;
2893 object->purgable = VM_PURGABLE_NONVOLATILE;
2894 if (map->pmap == kernel_pmap) {
2895 /*
2896 * Purgeable mappings made in a kernel
2897 * map are "owned" by the kernel itself
2898 * rather than the current user task
2899 * because they're likely to be used by
2900 * more than this user task (see
2901 * execargs_purgeable_allocate(), for
2902 * example).
2903 */
2904 owner = kernel_task;
2905 } else {
2906 owner = current_task();
2907 }
2908 assert(object->vo_owner == NULL);
2909 assert(object->resident_page_count == 0);
2910 assert(object->wired_page_count == 0);
2911 vm_object_lock(object);
2912 vm_purgeable_nonvolatile_enqueue(object, owner);
2913 vm_object_unlock(object);
2914 }
2915 offset = (vm_object_offset_t)0;
2916 }
2917 } else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
2918 /* no coalescing if address space uses sub-pages */
2919 } else if ((is_submap == FALSE) &&
2920 (object == VM_OBJECT_NULL) &&
2921 (entry != vm_map_to_entry(map)) &&
2922 (entry->vme_end == start) &&
2923 (!entry->is_shared) &&
2924 (!entry->is_sub_map) &&
2925 (!entry->in_transition) &&
2926 (!entry->needs_wakeup) &&
2927 (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
2928 (entry->protection == cur_protection) &&
2929 (entry->max_protection == max_protection) &&
2930 (entry->inheritance == inheritance) &&
2931 ((user_alias == VM_MEMORY_REALLOC) ||
2932 (VME_ALIAS(entry) == alias)) &&
2933 (entry->no_cache == no_cache) &&
2934 (entry->vme_permanent == permanent) &&
2935 /* no coalescing for immutable executable mappings */
2936 !((entry->protection & VM_PROT_EXECUTE) &&
2937 entry->vme_permanent) &&
2938 (!entry->superpage_size && !superpage_size) &&
2939 /*
2940 * No coalescing if not map-aligned, to avoid propagating
2941 * that condition any further than needed:
2942 */
2943 (!entry->map_aligned || !clear_map_aligned) &&
2944 (!entry->zero_wired_pages) &&
2945 (!entry->used_for_jit && !entry_for_jit) &&
2946 #if __arm64e__
2947 (!entry->used_for_tpro && !entry_for_tpro) &&
2948 #endif
2949 (!entry->csm_associated) &&
2950 (entry->iokit_acct == iokit_acct) &&
2951 (!entry->vme_resilient_codesign) &&
2952 (!entry->vme_resilient_media) &&
2953 (!entry->vme_atomic) &&
2954 (entry->vme_no_copy_on_read == no_copy_on_read) &&
2955
2956 ((entry->vme_end - entry->vme_start) + size <=
2957 (user_alias == VM_MEMORY_REALLOC ?
2958 ANON_CHUNK_SIZE :
2959 NO_COALESCE_LIMIT)) &&
2960
2961 (entry->wired_count == 0)) { /* implies user_wired_count == 0 */
2962 if (vm_object_coalesce(VME_OBJECT(entry),
2963 VM_OBJECT_NULL,
2964 VME_OFFSET(entry),
2965 (vm_object_offset_t) 0,
2966 (vm_map_size_t)(entry->vme_end - entry->vme_start),
2967 (vm_map_size_t)(end - entry->vme_end))) {
2968 /*
2969 * Coalesced the two objects - can extend
2970 * the previous map entry to include the
2971 * new range.
2972 */
2973 map->size += (end - entry->vme_end);
2974 assert(entry->vme_start < end);
2975 assert(VM_MAP_PAGE_ALIGNED(end,
2976 VM_MAP_PAGE_MASK(map)));
2977 if (__improbable(vm_debug_events)) {
2978 DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
2979 }
2980 entry->vme_end = end;
2981 if (map->holelistenabled) {
2982 vm_map_store_update_first_free(map, entry, TRUE);
2983 } else {
2984 vm_map_store_update_first_free(map, map->first_free, TRUE);
2985 }
2986 new_mapping_established = TRUE;
2987 RETURN(KERN_SUCCESS);
2988 }
2989 }
2990
2991 step = superpage_size ? SUPERPAGE_SIZE : (end - start);
2992 new_entry = NULL;
2993
2994 if (vmk_flags.vmkf_submap_adjust) {
2995 vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
2996 offset = start;
2997 }
2998
2999 for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3000 tmp2_end = tmp2_start + step;
3001 /*
3002 * Create a new entry
3003 *
3004 * XXX FBDP
3005 * The reserved "page zero" in each process's address space can
3006 * be arbitrarily large. Splitting it into separate objects and
3007 * therefore different VM map entries serves no purpose and just
3008 * slows down operations on the VM map, so let's not split the
3009 * allocation into chunks if the max protection is NONE. That
3010 * memory should never be accessible, so it will never get to the
3011 * default pager.
3012 */
3013 tmp_start = tmp2_start;
3014 if (!is_submap &&
3015 object == VM_OBJECT_NULL &&
3016 size > chunk_size &&
3017 max_protection != VM_PROT_NONE &&
3018 superpage_size == 0) {
3019 tmp_end = tmp_start + chunk_size;
3020 } else {
3021 tmp_end = tmp2_end;
3022 }
3023 do {
3024 if (!is_submap &&
3025 object != VM_OBJECT_NULL &&
3026 object->internal &&
3027 offset + (tmp_end - tmp_start) > object->vo_size) {
3028 // printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3029 DTRACE_VM5(vm_map_enter_overmap,
3030 vm_map_t, map,
3031 vm_map_address_t, tmp_start,
3032 vm_map_address_t, tmp_end,
3033 vm_object_offset_t, offset,
3034 vm_object_size_t, object->vo_size);
3035 }
3036 new_entry = vm_map_entry_insert(map,
3037 entry, tmp_start, tmp_end,
3038 object, offset, vmk_flags,
3039 needs_copy,
3040 cur_protection, max_protection,
3041 (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3042 VM_INHERIT_NONE : inheritance),
3043 clear_map_aligned);
3044
3045 assert((object != kernel_object) || (VM_KERN_MEMORY_NONE != alias));
3046
3047 if (resilient_codesign) {
3048 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3049 if (!((cur_protection | max_protection) & reject_prot)) {
3050 new_entry->vme_resilient_codesign = TRUE;
3051 }
3052 }
3053
3054 if (resilient_media &&
3055 (object == VM_OBJECT_NULL ||
3056 object->internal)) {
3057 new_entry->vme_resilient_media = TRUE;
3058 }
3059
3060 assert(!new_entry->iokit_acct);
3061 if (!is_submap &&
3062 object != VM_OBJECT_NULL &&
3063 (object->purgable != VM_PURGABLE_DENY ||
3064 object->vo_ledger_tag)) {
3065 assert(new_entry->use_pmap);
3066 assert(!new_entry->iokit_acct);
3067 /*
3068 * Turn off pmap accounting since
3069 * purgeable (or tagged) objects have their
3070 * own ledgers.
3071 */
3072 new_entry->use_pmap = FALSE;
3073 } else if (!is_submap &&
3074 iokit_acct &&
3075 object != VM_OBJECT_NULL &&
3076 object->internal) {
3077 /* alternate accounting */
3078 assert(!new_entry->iokit_acct);
3079 assert(new_entry->use_pmap);
3080 new_entry->iokit_acct = TRUE;
3081 new_entry->use_pmap = FALSE;
3082 DTRACE_VM4(
3083 vm_map_iokit_mapped_region,
3084 vm_map_t, map,
3085 vm_map_offset_t, new_entry->vme_start,
3086 vm_map_offset_t, new_entry->vme_end,
3087 int, VME_ALIAS(new_entry));
3088 vm_map_iokit_mapped_region(
3089 map,
3090 (new_entry->vme_end -
3091 new_entry->vme_start));
3092 } else if (!is_submap) {
3093 assert(!new_entry->iokit_acct);
3094 assert(new_entry->use_pmap);
3095 }
3096
3097 if (is_submap) {
3098 vm_map_t submap;
3099 boolean_t submap_is_64bit;
3100 boolean_t use_pmap;
3101
3102 assert(new_entry->is_sub_map);
3103 assert(!new_entry->use_pmap);
3104 assert(!new_entry->iokit_acct);
3105 submap = (vm_map_t) object;
3106 submap_is_64bit = vm_map_is_64bit(submap);
3107 use_pmap = vmk_flags.vmkf_nested_pmap;
3108 #ifndef NO_NESTED_PMAP
3109 if (use_pmap && submap->pmap == NULL) {
3110 ledger_t ledger = map->pmap->ledger;
3111 /* we need a sub pmap to nest... */
3112 submap->pmap = pmap_create_options(ledger, 0,
3113 submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3114 if (submap->pmap == NULL) {
3115 /* let's proceed without nesting... */
3116 }
3117 #if defined(__arm64__)
3118 else {
3119 pmap_set_nested(submap->pmap);
3120 }
3121 #endif
3122 }
3123 if (use_pmap && submap->pmap != NULL) {
3124 if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3125 DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3126 kr = KERN_FAILURE;
3127 } else {
3128 kr = pmap_nest(map->pmap,
3129 submap->pmap,
3130 tmp_start,
3131 tmp_end - tmp_start);
3132 }
3133 if (kr != KERN_SUCCESS) {
3134 printf("vm_map_enter: "
3135 "pmap_nest(0x%llx,0x%llx) "
3136 "error 0x%x\n",
3137 (long long)tmp_start,
3138 (long long)tmp_end,
3139 kr);
3140 } else {
3141 /* we're now nested ! */
3142 new_entry->use_pmap = TRUE;
3143 pmap_empty = FALSE;
3144 }
3145 }
3146 #endif /* NO_NESTED_PMAP */
3147 }
3148 entry = new_entry;
3149
3150 if (superpage_size) {
3151 vm_page_t pages, m;
3152 vm_object_t sp_object;
3153 vm_object_offset_t sp_offset;
3154
3155 VME_OFFSET_SET(entry, 0);
3156
3157 /* allocate one superpage */
3158 kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3159 if (kr != KERN_SUCCESS) {
3160 /* deallocate whole range... */
3161 new_mapping_established = TRUE;
3162 /* ... but only up to "tmp_end" */
3163 size -= end - tmp_end;
3164 RETURN(kr);
3165 }
3166
3167 /* create one vm_object per superpage */
3168 sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3169 sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3170 sp_object->phys_contiguous = TRUE;
3171 sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3172 VME_OBJECT_SET(entry, sp_object, false, 0);
3173 assert(entry->use_pmap);
3174
3175 /* enter the base pages into the object */
3176 vm_object_lock(sp_object);
3177 for (sp_offset = 0;
3178 sp_offset < SUPERPAGE_SIZE;
3179 sp_offset += PAGE_SIZE) {
3180 m = pages;
3181 pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3182 pages = NEXT_PAGE(m);
3183 *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3184 vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3185 }
3186 vm_object_unlock(sp_object);
3187 }
3188 } while (tmp_end != tmp2_end &&
3189 (tmp_start = tmp_end) &&
3190 (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3191 tmp_end + chunk_size : tmp2_end));
3192 }
3193
3194 new_mapping_established = TRUE;
3195
3196 BailOut:
3197 assert(map_locked == TRUE);
3198
3199 /*
3200 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3201 * If we have identified and possibly established the new mapping(s),
3202 * make sure we did not go beyond the address space limit.
3203 */
3204 if (result == KERN_SUCCESS) {
3205 if (map->size_limit != RLIM_INFINITY &&
3206 map->size > map->size_limit) {
3207 /*
3208 * Establishing the requested mappings would exceed
3209 * the process's RLIMIT_AS limit: fail with
3210 * KERN_NO_SPACE.
3211 */
3212 result = KERN_NO_SPACE;
3213 printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3214 proc_selfpid(),
3215 (get_bsdtask_info(current_task())
3216 ? proc_name_address(get_bsdtask_info(current_task()))
3217 : "?"),
3218 __FUNCTION__,
3219 (uint64_t) map->size,
3220 (uint64_t) map->size_limit);
3221 DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3222 vm_map_size_t, map->size,
3223 uint64_t, map->size_limit);
3224 vm_map_enter_RLIMIT_AS_count++;
3225 } else if (map->data_limit != RLIM_INFINITY &&
3226 map->size > map->data_limit) {
3227 /*
3228 * Establishing the requested mappings would exceed
3229 * the process's RLIMIT_DATA limit: fail with
3230 * KERN_NO_SPACE.
3231 */
3232 result = KERN_NO_SPACE;
3233 printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3234 proc_selfpid(),
3235 (get_bsdtask_info(current_task())
3236 ? proc_name_address(get_bsdtask_info(current_task()))
3237 : "?"),
3238 __FUNCTION__,
3239 (uint64_t) map->size,
3240 (uint64_t) map->data_limit);
3241 DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3242 vm_map_size_t, map->size,
3243 uint64_t, map->data_limit);
3244 vm_map_enter_RLIMIT_DATA_count++;
3245 }
3246 }
3247
3248 if (result == KERN_SUCCESS) {
3249 vm_prot_t pager_prot;
3250 memory_object_t pager;
3251
3252 #if DEBUG
3253 if (pmap_empty &&
3254 !(vmk_flags.vmkf_no_pmap_check)) {
3255 assert(pmap_is_empty(map->pmap,
3256 *address,
3257 *address + size));
3258 }
3259 #endif /* DEBUG */
3260
3261 /*
3262 * For "named" VM objects, let the pager know that the
3263 * memory object is being mapped. Some pagers need to keep
3264 * track of this, to know when they can reclaim the memory
3265 * object, for example.
3266 * VM calls memory_object_map() for each mapping (specifying
3267 * the protection of each mapping) and calls
3268 * memory_object_last_unmap() when all the mappings are gone.
3269 */
3270 pager_prot = max_protection;
3271 if (needs_copy) {
3272 /*
3273 * Copy-On-Write mapping: won't modify
3274 * the memory object.
3275 */
3276 pager_prot &= ~VM_PROT_WRITE;
3277 }
3278 if (!is_submap &&
3279 object != VM_OBJECT_NULL &&
3280 object->named &&
3281 object->pager != MEMORY_OBJECT_NULL) {
3282 vm_object_lock(object);
3283 pager = object->pager;
3284 if (object->named &&
3285 pager != MEMORY_OBJECT_NULL) {
3286 assert(object->pager_ready);
3287 vm_object_mapping_wait(object, THREAD_UNINT);
3288 vm_object_mapping_begin(object);
3289 vm_object_unlock(object);
3290
3291 kr = memory_object_map(pager, pager_prot);
3292 assert(kr == KERN_SUCCESS);
3293
3294 vm_object_lock(object);
3295 vm_object_mapping_end(object);
3296 }
3297 vm_object_unlock(object);
3298 }
3299 }
3300
3301 assert(map_locked == TRUE);
3302
3303 if (new_mapping_established) {
3304 /*
3305 * If we release the map lock for any reason below,
3306 * another thread could deallocate our new mapping,
3307 * releasing the caller's reference on "caller_object",
3308 * which was transferred to the mapping.
3309 * If this was the only reference, the object could be
3310 * destroyed.
3311 *
3312 * We need to take an extra reference on "caller_object"
3313 * to keep it alive if we need to return the caller's
3314 * reference to the caller in case of failure.
3315 */
3316 if (is_submap) {
3317 vm_map_reference((vm_map_t)caller_object);
3318 } else {
3319 vm_object_reference(caller_object);
3320 }
3321 }
3322
3323 if (!keep_map_locked) {
3324 vm_map_unlock(map);
3325 map_locked = FALSE;
3326 entry = VM_MAP_ENTRY_NULL;
3327 new_entry = VM_MAP_ENTRY_NULL;
3328 }
3329
3330 /*
3331 * We can't hold the map lock if we enter this block.
3332 */
3333
3334 if (result == KERN_SUCCESS) {
3335 /* Wire down the new entry if the user
3336 * requested all new map entries be wired.
3337 */
3338 if ((map->wiring_required) || (superpage_size)) {
3339 assert(!keep_map_locked);
3340 pmap_empty = FALSE; /* pmap won't be empty */
3341 kr = vm_map_wire_kernel(map, start, end,
3342 cur_protection, VM_KERN_MEMORY_MLOCK,
3343 TRUE);
3344 result = kr;
3345 }
3346
3347 }
3348
3349 if (result != KERN_SUCCESS) {
3350 if (new_mapping_established) {
3351 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3352
3353 /*
3354 * We have to get rid of the new mappings since we
3355 * won't make them available to the user.
3356 * Try and do that atomically, to minimize the risk
3357 * that someone else create new mappings that range.
3358 */
3359 if (!map_locked) {
3360 vm_map_lock(map);
3361 map_locked = TRUE;
3362 }
3363 remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3364 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3365 if (permanent) {
3366 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3367 }
3368 (void) vm_map_delete(map,
3369 *address, *address + size,
3370 remove_flags,
3371 KMEM_GUARD_NONE, &zap_new_list);
3372 }
3373
3374 if (vm_map_zap_first_entry(&zap_old_list)) {
3375 vm_map_entry_t entry1, entry2;
3376
3377 /*
3378 * The new mapping failed. Attempt to restore
3379 * the old mappings, saved in the "zap_old_map".
3380 */
3381 if (!map_locked) {
3382 vm_map_lock(map);
3383 map_locked = TRUE;
3384 }
3385
3386 /* first check if the coast is still clear */
3387 start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3388 end = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3389
3390 if (vm_map_lookup_entry(map, start, &entry1) ||
3391 vm_map_lookup_entry(map, end, &entry2) ||
3392 entry1 != entry2) {
3393 /*
3394 * Part of that range has already been
3395 * re-mapped: we can't restore the old
3396 * mappings...
3397 */
3398 vm_map_enter_restore_failures++;
3399 } else {
3400 /*
3401 * Transfer the saved map entries from
3402 * "zap_old_map" to the original "map",
3403 * inserting them all after "entry1".
3404 */
3405 while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3406 vm_map_size_t entry_size;
3407
3408 entry_size = (entry2->vme_end -
3409 entry2->vme_start);
3410 vm_map_store_entry_link(map, entry1, entry2,
3411 VM_MAP_KERNEL_FLAGS_NONE);
3412 map->size += entry_size;
3413 entry1 = entry2;
3414 }
3415 if (map->wiring_required) {
3416 /*
3417 * XXX TODO: we should rewire the
3418 * old pages here...
3419 */
3420 }
3421 vm_map_enter_restore_successes++;
3422 }
3423 }
3424 }
3425
3426 /*
3427 * The caller is responsible for releasing the lock if it requested to
3428 * keep the map locked.
3429 */
3430 if (map_locked && !keep_map_locked) {
3431 vm_map_unlock(map);
3432 }
3433
3434 vm_map_zap_dispose(&zap_old_list);
3435 vm_map_zap_dispose(&zap_new_list);
3436
3437 if (new_mapping_established) {
3438 /*
3439 * The caller had a reference on "caller_object" and we
3440 * transferred that reference to the mapping.
3441 * We also took an extra reference on "caller_object" to keep
3442 * it alive while the map was unlocked.
3443 */
3444 if (result == KERN_SUCCESS) {
3445 /*
3446 * On success, the caller's reference on the object gets
3447 * tranferred to the mapping.
3448 * Release our extra reference.
3449 */
3450 if (is_submap) {
3451 vm_map_deallocate((vm_map_t)caller_object);
3452 } else {
3453 vm_object_deallocate(caller_object);
3454 }
3455 } else {
3456 /*
3457 * On error, the caller expects to still have a
3458 * reference on the object it gave us.
3459 * Let's use our extra reference for that.
3460 */
3461 }
3462 }
3463
3464 return result;
3465
3466 #undef RETURN
3467 }
3468
3469 #if __arm64__
3470 extern const struct memory_object_pager_ops fourk_pager_ops;
/*
 * vm_map_enter_fourk:
 *
 * Establish a mapping at 4K granularity inside a map whose native page
 * size is larger (16K pages on arm64).  The range is backed by a "fourk"
 * pager, which multiplexes up to four 4K-aligned sub-mappings
 * (indices 0..3, see fourk_pager_populate() below) within one native
 * 16K page.  If the target 16K page is already covered by a fourk
 * pager mapping, the new 4K range is merged into it; otherwise a new
 * fourk pager and a copy-on-write object on top of it are created and
 * inserted into the map.
 *
 * On success the resulting address is returned through "*address".
 * Only the "fixed address, no overwrite" case is supported; anywhere,
 * submap, purgeable, JIT, superpage and "already" requests are all
 * rejected with KERN_NOT_SUPPORTED.
 */
kern_return_t
vm_map_enter_fourk(
	vm_map_t                map,
	vm_map_offset_t         *address,       /* IN/OUT */
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_object_t             object,
	vm_object_offset_t      offset,
	boolean_t               needs_copy,
	vm_prot_t               cur_protection,
	vm_prot_t               max_protection,
	vm_inherit_t            inheritance)
{
	vm_map_entry_t          entry, new_entry;
	vm_map_offset_t         start, fourk_start;
	vm_map_offset_t         end, fourk_end;
	vm_map_size_t           fourk_size;
	kern_return_t           result = KERN_SUCCESS;
	boolean_t               map_locked = FALSE;
	boolean_t               pmap_empty = TRUE;
	boolean_t               new_mapping_established = FALSE;
	const bool              keep_map_locked = vmk_flags.vmkf_keep_map_locked;
	const bool              anywhere = !vmk_flags.vmf_fixed;
	const bool              purgable = vmk_flags.vmf_purgeable;
	const bool              overwrite = vmk_flags.vmf_overwrite;
	const bool              is_submap = vmk_flags.vmkf_submap;
	const bool              entry_for_jit = vmk_flags.vmkf_map_jit;
	const unsigned int      superpage_size = vmk_flags.vmf_superpage_size;
	vm_map_offset_t         effective_min_offset, effective_max_offset;
	kern_return_t           kr;
	boolean_t               clear_map_aligned = FALSE;
	memory_object_t         fourk_mem_obj;
	vm_object_t             fourk_object;
	vm_map_offset_t         fourk_pager_offset;
	int                     fourk_pager_index_start, fourk_pager_index_num;
	int                     cur_idx;
	boolean_t               fourk_copy;
	vm_object_t             copy_object;
	vm_object_offset_t      copy_offset;
	VM_MAP_ZAP_DECLARE(zap_list);

	/*
	 * This path only makes sense when the map's native page size is
	 * larger than the 4K sub-page size being emulated.
	 */
	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
		panic("%s:%d", __FUNCTION__, __LINE__);
	}
	fourk_mem_obj = MEMORY_OBJECT_NULL;
	fourk_object = VM_OBJECT_NULL;

	if (superpage_size) {
		return KERN_NOT_SUPPORTED;
	}

	/*
	 * W^X enforcement: refuse write+execute mappings unless this is a
	 * JIT region or the address space is exempt.  Matches the policy
	 * in vm_map_enter(): demote by stripping execute, with a DTrace
	 * probe and console log for diagnosis.
	 */
	if ((cur_protection & VM_PROT_WRITE) &&
	    (cur_protection & VM_PROT_EXECUTE) &&
#if XNU_TARGET_OS_OSX
	    map->pmap != kernel_pmap &&
	    (vm_map_cs_enforcement(map)
#if __arm64__
	    || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
	    ) &&
#endif /* XNU_TARGET_OS_OSX */
#if CODE_SIGNING_MONITOR
	    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
#endif
	    !entry_for_jit) {
		DTRACE_VM3(cs_wx,
		    uint64_t, 0,
		    uint64_t, 0,
		    vm_prot_t, cur_protection);
		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. "
		    "turning off execute\n",
		    proc_selfpid(),
		    (get_bsdtask_info(current_task())
		    ? proc_name_address(get_bsdtask_info(current_task()))
		    : "?"),
		    __FUNCTION__);
		cur_protection &= ~VM_PROT_EXECUTE;
	}

	/*
	 * If the task has requested executable lockdown,
	 * deny any new executable mapping.
	 */
	if (map->map_disallow_new_exec == TRUE) {
		if (cur_protection & VM_PROT_EXECUTE) {
			return KERN_PROTECTION_FAILURE;
		}
	}

	/* the fourk path doesn't support these mapping flavors */
	if (is_submap) {
		return KERN_NOT_SUPPORTED;
	}
	if (vmk_flags.vmkf_already) {
		return KERN_NOT_SUPPORTED;
	}
	if (purgable || entry_for_jit) {
		return KERN_NOT_SUPPORTED;
	}

	effective_min_offset = map->min_offset;

	if (vmk_flags.vmkf_beyond_max) {
		return KERN_NOT_SUPPORTED;
	} else {
		effective_max_offset = map->max_offset;
	}

	/* caller must supply a non-empty, 4K-aligned request */
	if (size == 0 ||
	    (offset & FOURK_PAGE_MASK) != 0) {
		*address = 0;
		return KERN_INVALID_ARGUMENT;
	}

#define RETURN(value)   { result = value; goto BailOut; }

	assert(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK));
	assert(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK));

	if (!anywhere && overwrite) {
		return KERN_NOT_SUPPORTED;
	}

	/*
	 * Remember the caller's 4K-granular range, then widen to the
	 * map's native page granularity: that's the span the map entry
	 * will actually cover.
	 */
	fourk_start = *address;
	fourk_size = size;
	fourk_end = fourk_start + fourk_size;

	start = vm_map_trunc_page(*address, VM_MAP_PAGE_MASK(map));
	end = vm_map_round_page(fourk_end, VM_MAP_PAGE_MASK(map));
	size = end - start;

	if (anywhere) {
		return KERN_NOT_SUPPORTED;
	} else {
		/*
		 * Verify that:
		 * the address doesn't itself violate
		 * the mask requirement.
		 */

		vm_map_lock(map);
		map_locked = TRUE;
		if ((start & mask) != 0) {
			RETURN(KERN_NO_SPACE);
		}

		/*
		 * ... the address is within bounds
		 */

		end = start + size;

		if ((start < effective_min_offset) ||
		    (end > effective_max_offset) ||
		    (start >= end)) {
			RETURN(KERN_INVALID_ADDRESS);
		}

		/*
		 * ... the starting address isn't allocated
		 */
		if (vm_map_lookup_entry(map, start, &entry)) {
			vm_object_t cur_object, shadow_object;

			/*
			 * We might already some 4K mappings
			 * in a 16K page here.
			 */

			/*
			 * Only a single-native-page entry backed by an
			 * existing fourk pager can absorb more 4K
			 * sub-mappings; anything else is a collision.
			 */
			if (entry->vme_end - entry->vme_start
			    != SIXTEENK_PAGE_SIZE) {
				RETURN(KERN_NO_SPACE);
			}
			if (entry->is_sub_map) {
				RETURN(KERN_NO_SPACE);
			}
			if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
				RETURN(KERN_NO_SPACE);
			}

			/* go all the way down the shadow chain */
			/* (hand-over-hand locking: lock child before
			 * unlocking parent, so the chain can't be torn
			 * down under us) */
			cur_object = VME_OBJECT(entry);
			vm_object_lock(cur_object);
			while (cur_object->shadow != VM_OBJECT_NULL) {
				shadow_object = cur_object->shadow;
				vm_object_lock(shadow_object);
				vm_object_unlock(cur_object);
				cur_object = shadow_object;
				shadow_object = VM_OBJECT_NULL;
			}
			if (cur_object->internal ||
			    cur_object->pager == NULL) {
				vm_object_unlock(cur_object);
				RETURN(KERN_NO_SPACE);
			}
			if (cur_object->pager->mo_pager_ops
			    != &fourk_pager_ops) {
				vm_object_unlock(cur_object);
				RETURN(KERN_NO_SPACE);
			}
			fourk_object = cur_object;
			fourk_mem_obj = fourk_object->pager;

			/* keep the "4K" object alive */
			vm_object_reference_locked(fourk_object);
			memory_object_reference(fourk_mem_obj);
			vm_object_unlock(fourk_object);

			/* merge permissions */
			entry->protection |= cur_protection;
			entry->max_protection |= max_protection;

			if ((entry->protection & VM_PROT_WRITE) &&
			    (entry->protection & VM_PROT_ALLEXEC) &&
			    fourk_binary_compatibility_unsafe &&
			    fourk_binary_compatibility_allow_wx) {
				/* write+execute: need to be "jit" */
				entry->used_for_jit = TRUE;
			}
			/* reuse the existing pager: skip entry creation */
			goto map_in_fourk_pager;
		}

		/*
		 * ... the next region doesn't overlap the
		 * end point.
		 */

		if ((entry->vme_next != vm_map_to_entry(map)) &&
		    (entry->vme_next->vme_start < end)) {
			RETURN(KERN_NO_SPACE);
		}
	}

	/*
	 * At this point,
	 * "start" and "end" should define the endpoints of the
	 * available new range, and
	 * "entry" should refer to the region before the new
	 * range, and
	 *
	 * the map should be locked.
	 */

	/* create a new "4K" pager */
	fourk_mem_obj = fourk_pager_create();
	fourk_object = fourk_pager_to_vm_object(fourk_mem_obj);
	assert(fourk_object);

	/* keep the "4" object alive */
	vm_object_reference(fourk_object);

	/* create a "copy" object, to map the "4K" object copy-on-write */
	fourk_copy = TRUE;
	result = vm_object_copy_strategically(fourk_object,
	    0,
	    end - start,
	    false,              /* forking */
	    &copy_object,
	    &copy_offset,
	    &fourk_copy);
	assert(result == KERN_SUCCESS);
	assert(copy_object != VM_OBJECT_NULL);
	assert(copy_offset == 0);

	/* map the "4K" pager's copy object */
	new_entry = vm_map_entry_insert(map,
	    entry,
	    vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)),
	    vm_map_round_page(end, VM_MAP_PAGE_MASK(map)),
	    copy_object,
	    0,                          /* offset */
	    vmk_flags,
	    FALSE,                      /* needs_copy */
	    cur_protection, max_protection,
	    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
	    VM_INHERIT_NONE : inheritance),
	    clear_map_aligned);
	entry = new_entry;

#if VM_MAP_DEBUG_FOURK
	if (vm_map_debug_fourk) {
		printf("FOURK_PAGER: map %p [0x%llx:0x%llx] new pager %p\n",
		    map,
		    (uint64_t) entry->vme_start,
		    (uint64_t) entry->vme_end,
		    fourk_mem_obj);
	}
#endif /* VM_MAP_DEBUG_FOURK */

	new_mapping_established = TRUE;

map_in_fourk_pager:
	/* "map" the original "object" where it belongs in the "4K" pager */
	fourk_pager_offset = (fourk_start & SIXTEENK_PAGE_MASK);
	fourk_pager_index_start = (int) (fourk_pager_offset / FOURK_PAGE_SIZE);
	if (fourk_size > SIXTEENK_PAGE_SIZE) {
		fourk_pager_index_num = 4;
	} else {
		fourk_pager_index_num = (int) (fourk_size / FOURK_PAGE_SIZE);
	}
	/* clamp to the 4 sub-page slots of one 16K page */
	if (fourk_pager_index_start + fourk_pager_index_num > 4) {
		fourk_pager_index_num = 4 - fourk_pager_index_start;
	}
	for (cur_idx = 0;
	    cur_idx < fourk_pager_index_num;
	    cur_idx++) {
		vm_object_t old_object;
		vm_object_offset_t old_offset;

		/*
		 * Install "object"/"offset" in sub-page slot
		 * "fourk_pager_index_start + cur_idx", displacing any
		 * previous backing ("old_object"/"old_offset").
		 */
		kr = fourk_pager_populate(fourk_mem_obj,
		    TRUE,                      /* overwrite */
		    fourk_pager_index_start + cur_idx,
		    object,
		    (object
		    ? (offset +
		    (cur_idx * FOURK_PAGE_SIZE))
		    : 0),
		    &old_object,
		    &old_offset);
#if VM_MAP_DEBUG_FOURK
		if (vm_map_debug_fourk) {
			if (old_object == (vm_object_t) -1 &&
			    old_offset == (vm_object_offset_t) -1) {
				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
				    "pager [%p:0x%llx] "
				    "populate[%d] "
				    "[object:%p,offset:0x%llx]\n",
				    map,
				    (uint64_t) entry->vme_start,
				    (uint64_t) entry->vme_end,
				    fourk_mem_obj,
				    VME_OFFSET(entry),
				    fourk_pager_index_start + cur_idx,
				    object,
				    (object
				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
				    : 0));
			} else {
				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
				    "pager [%p:0x%llx] "
				    "populate[%d] [object:%p,offset:0x%llx] "
				    "old [%p:0x%llx]\n",
				    map,
				    (uint64_t) entry->vme_start,
				    (uint64_t) entry->vme_end,
				    fourk_mem_obj,
				    VME_OFFSET(entry),
				    fourk_pager_index_start + cur_idx,
				    object,
				    (object
				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
				    : 0),
				    old_object,
				    old_offset);
			}
		}
#endif /* VM_MAP_DEBUG_FOURK */

		assert(kr == KERN_SUCCESS);
		/*
		 * Reference accounting for the slot swap: the pager now
		 * holds a reference on "object" (if real), and the
		 * displaced "old_object" loses the one it had.
		 * (vm_object_t)-1 is the pager's "no change" sentinel.
		 */
		if (object != old_object &&
		    object != VM_OBJECT_NULL &&
		    object != (vm_object_t) -1) {
			vm_object_reference(object);
		}
		if (object != old_object &&
		    old_object != VM_OBJECT_NULL &&
		    old_object != (vm_object_t) -1) {
			vm_object_deallocate(old_object);
		}
	}

BailOut:
	assert(map_locked == TRUE);

	if (result == KERN_SUCCESS) {
		vm_prot_t pager_prot;
		memory_object_t pager;

#if DEBUG
		if (pmap_empty &&
		    !(vmk_flags.vmkf_no_pmap_check)) {
			assert(pmap_is_empty(map->pmap,
			    *address,
			    *address + size));
		}
#endif /* DEBUG */

		/*
		 * For "named" VM objects, let the pager know that the
		 * memory object is being mapped.  Some pagers need to keep
		 * track of this, to know when they can reclaim the memory
		 * object, for example.
		 * VM calls memory_object_map() for each mapping (specifying
		 * the protection of each mapping) and calls
		 * memory_object_last_unmap() when all the mappings are gone.
		 */
		pager_prot = max_protection;
		if (needs_copy) {
			/*
			 * Copy-On-Write mapping: won't modify
			 * the memory object.
			 */
			pager_prot &= ~VM_PROT_WRITE;
		}
		if (!is_submap &&
		    object != VM_OBJECT_NULL &&
		    object->named &&
		    object->pager != MEMORY_OBJECT_NULL) {
			vm_object_lock(object);
			pager = object->pager;
			if (object->named &&
			    pager != MEMORY_OBJECT_NULL) {
				assert(object->pager_ready);
				vm_object_mapping_wait(object, THREAD_UNINT);
				vm_object_mapping_begin(object);
				vm_object_unlock(object);

				kr = memory_object_map(pager, pager_prot);
				assert(kr == KERN_SUCCESS);

				vm_object_lock(object);
				vm_object_mapping_end(object);
			}
			vm_object_unlock(object);
		}
		/* same notification for the fourk object's own pager */
		if (!is_submap &&
		    fourk_object != VM_OBJECT_NULL &&
		    fourk_object->named &&
		    fourk_object->pager != MEMORY_OBJECT_NULL) {
			vm_object_lock(fourk_object);
			pager = fourk_object->pager;
			if (fourk_object->named &&
			    pager != MEMORY_OBJECT_NULL) {
				assert(fourk_object->pager_ready);
				vm_object_mapping_wait(fourk_object,
				    THREAD_UNINT);
				vm_object_mapping_begin(fourk_object);
				vm_object_unlock(fourk_object);

				kr = memory_object_map(pager, VM_PROT_READ);
				assert(kr == KERN_SUCCESS);

				vm_object_lock(fourk_object);
				vm_object_mapping_end(fourk_object);
			}
			vm_object_unlock(fourk_object);
		}
	}

	/*
	 * Drop the references we took above to keep the fourk object and
	 * its pager alive: the mapping (or nothing, on failure) now owns
	 * them.
	 */
	if (fourk_object != VM_OBJECT_NULL) {
		vm_object_deallocate(fourk_object);
		fourk_object = VM_OBJECT_NULL;
		memory_object_deallocate(fourk_mem_obj);
		fourk_mem_obj = MEMORY_OBJECT_NULL;
	}

	assert(map_locked == TRUE);

	if (!keep_map_locked) {
		vm_map_unlock(map);
		map_locked = FALSE;
	}

	/*
	 * We can't hold the map lock if we enter this block.
	 */

	if (result == KERN_SUCCESS) {
		/* Wire down the new entry if the user
		 * requested all new map entries be wired.
		 */
		if ((map->wiring_required) || (superpage_size)) {
			assert(!keep_map_locked);
			pmap_empty = FALSE; /* pmap won't be empty */
			kr = vm_map_wire_kernel(map, start, end,
			    new_entry->protection, VM_KERN_MEMORY_MLOCK,
			    TRUE);
			result = kr;
		}
	}

	if (result != KERN_SUCCESS) {
		if (new_mapping_established) {
			/*
			 * We have to get rid of the new mappings since we
			 * won't make them available to the user.
			 * Try and do that atomically, to minimize the risk
			 * that someone else create new mappings that range.
			 */

			if (!map_locked) {
				vm_map_lock(map);
				map_locked = TRUE;
			}
			(void)vm_map_delete(map, *address, *address + size,
			    VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_NO_YIELD,
			    KMEM_GUARD_NONE, &zap_list);
		}
	}

	/*
	 * The caller is responsible for releasing the lock if it requested to
	 * keep the map locked.
	 */
	if (map_locked && !keep_map_locked) {
		vm_map_unlock(map);
	}

	/* free any entries torn out during failure cleanup */
	vm_map_zap_dispose(&zap_list);

	return result;

#undef RETURN
}
3985 #endif /* __arm64__ */
3986
3987 /*
3988 * Counters for the prefault optimization.
3989 */
int64_t vm_prefault_nb_pages = 0;   /* pages entered via the prefault path */
int64_t vm_prefault_nb_bailout = 0; /* prefault attempts abandoned early */
3992
3993 static kern_return_t
vm_map_enter_mem_object_helper(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_t offset,boolean_t copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance,upl_page_list_ptr_t page_list,unsigned int page_list_count)3994 vm_map_enter_mem_object_helper(
3995 vm_map_t target_map,
3996 vm_map_offset_t *address,
3997 vm_map_size_t initial_size,
3998 vm_map_offset_t mask,
3999 vm_map_kernel_flags_t vmk_flags,
4000 ipc_port_t port,
4001 vm_object_offset_t offset,
4002 boolean_t copy,
4003 vm_prot_t cur_protection,
4004 vm_prot_t max_protection,
4005 vm_inherit_t inheritance,
4006 upl_page_list_ptr_t page_list,
4007 unsigned int page_list_count)
4008 {
4009 vm_map_address_t map_addr;
4010 vm_map_size_t map_size;
4011 vm_object_t object;
4012 vm_object_size_t size;
4013 kern_return_t result;
4014 boolean_t mask_cur_protection, mask_max_protection;
4015 boolean_t kernel_prefault, try_prefault = (page_list_count != 0);
4016 vm_map_offset_t offset_in_mapping = 0;
4017 #if __arm64__
4018 boolean_t fourk = vmk_flags.vmkf_fourk;
4019 #endif /* __arm64__ */
4020
4021 if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4022 /* XXX TODO4K prefaulting depends on page size... */
4023 try_prefault = FALSE;
4024 }
4025
4026 assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
4027 vm_map_kernel_flags_update_range_id(&vmk_flags, target_map);
4028
4029 mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4030 mask_max_protection = max_protection & VM_PROT_IS_MASK;
4031 cur_protection &= ~VM_PROT_IS_MASK;
4032 max_protection &= ~VM_PROT_IS_MASK;
4033
4034 /*
4035 * Check arguments for validity
4036 */
4037 if ((target_map == VM_MAP_NULL) ||
4038 (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4039 (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4040 (inheritance > VM_INHERIT_LAST_VALID) ||
4041 (try_prefault && (copy || !page_list)) ||
4042 initial_size == 0) {
4043 return KERN_INVALID_ARGUMENT;
4044 }
4045
4046 #if __arm64__
4047 if (cur_protection & VM_PROT_EXECUTE) {
4048 cur_protection |= VM_PROT_READ;
4049 }
4050
4051 if (fourk && VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4052 /* no "fourk" if map is using a sub-page page size */
4053 fourk = FALSE;
4054 }
4055 if (fourk) {
4056 map_addr = vm_map_trunc_page(*address, FOURK_PAGE_MASK);
4057 map_size = vm_map_round_page(initial_size, FOURK_PAGE_MASK);
4058 } else
4059 #endif /* __arm64__ */
4060 {
4061 map_addr = vm_map_trunc_page(*address,
4062 VM_MAP_PAGE_MASK(target_map));
4063 map_size = vm_map_round_page(initial_size,
4064 VM_MAP_PAGE_MASK(target_map));
4065 }
4066 if (map_size == 0) {
4067 return KERN_INVALID_ARGUMENT;
4068 }
4069 size = vm_object_round_page(initial_size);
4070
4071 /*
4072 * Find the vm object (if any) corresponding to this port.
4073 */
4074 if (!IP_VALID(port)) {
4075 object = VM_OBJECT_NULL;
4076 offset = 0;
4077 copy = FALSE;
4078 } else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4079 vm_named_entry_t named_entry;
4080 vm_object_offset_t data_offset;
4081
4082 named_entry = mach_memory_entry_from_port(port);
4083
4084 if (vmk_flags.vmf_return_data_addr ||
4085 vmk_flags.vmf_return_4k_data_addr) {
4086 data_offset = named_entry->data_offset;
4087 offset += named_entry->data_offset;
4088 } else {
4089 data_offset = 0;
4090 }
4091
4092 /* a few checks to make sure user is obeying rules */
4093 if (mask_max_protection) {
4094 max_protection &= named_entry->protection;
4095 }
4096 if (mask_cur_protection) {
4097 cur_protection &= named_entry->protection;
4098 }
4099 if ((named_entry->protection & max_protection) !=
4100 max_protection) {
4101 return KERN_INVALID_RIGHT;
4102 }
4103 if ((named_entry->protection & cur_protection) !=
4104 cur_protection) {
4105 return KERN_INVALID_RIGHT;
4106 }
4107 if (offset + size <= offset) {
4108 /* overflow */
4109 return KERN_INVALID_ARGUMENT;
4110 }
4111 if (named_entry->size < (offset + initial_size)) {
4112 return KERN_INVALID_ARGUMENT;
4113 }
4114
4115 if (named_entry->is_copy) {
4116 /* for a vm_map_copy, we can only map it whole */
4117 if ((size != named_entry->size) &&
4118 (vm_map_round_page(size,
4119 VM_MAP_PAGE_MASK(target_map)) ==
4120 named_entry->size)) {
4121 /* XXX FBDP use the rounded size... */
4122 size = vm_map_round_page(
4123 size,
4124 VM_MAP_PAGE_MASK(target_map));
4125 }
4126 }
4127
4128 /* the callers parameter offset is defined to be the */
4129 /* offset from beginning of named entry offset in object */
4130 offset = offset + named_entry->offset;
4131
4132 if (!VM_MAP_PAGE_ALIGNED(size,
4133 VM_MAP_PAGE_MASK(target_map))) {
4134 /*
4135 * Let's not map more than requested;
4136 * vm_map_enter() will handle this "not map-aligned"
4137 * case.
4138 */
4139 map_size = size;
4140 }
4141
4142 named_entry_lock(named_entry);
4143 if (named_entry->is_sub_map) {
4144 vm_map_t submap;
4145
4146 if (vmk_flags.vmf_return_data_addr ||
4147 vmk_flags.vmf_return_4k_data_addr) {
4148 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4149 }
4150
4151 submap = named_entry->backing.map;
4152 vm_map_reference(submap);
4153 named_entry_unlock(named_entry);
4154
4155 vmk_flags.vmkf_submap = TRUE;
4156
4157 result = vm_map_enter(target_map,
4158 &map_addr,
4159 map_size,
4160 mask,
4161 vmk_flags,
4162 (vm_object_t)(uintptr_t) submap,
4163 offset,
4164 copy,
4165 cur_protection,
4166 max_protection,
4167 inheritance);
4168 if (result != KERN_SUCCESS) {
4169 vm_map_deallocate(submap);
4170 } else {
4171 /*
4172 * No need to lock "submap" just to check its
4173 * "mapped" flag: that flag is never reset
4174 * once it's been set and if we race, we'll
4175 * just end up setting it twice, which is OK.
4176 */
4177 if (submap->mapped_in_other_pmaps == FALSE &&
4178 vm_map_pmap(submap) != PMAP_NULL &&
4179 vm_map_pmap(submap) !=
4180 vm_map_pmap(target_map)) {
4181 /*
4182 * This submap is being mapped in a map
4183 * that uses a different pmap.
4184 * Set its "mapped_in_other_pmaps" flag
4185 * to indicate that we now need to
4186 * remove mappings from all pmaps rather
4187 * than just the submap's pmap.
4188 */
4189 vm_map_lock(submap);
4190 submap->mapped_in_other_pmaps = TRUE;
4191 vm_map_unlock(submap);
4192 }
4193 *address = map_addr;
4194 }
4195 return result;
4196 } else if (named_entry->is_copy) {
4197 kern_return_t kr;
4198 vm_map_copy_t copy_map;
4199 vm_map_entry_t copy_entry;
4200 vm_map_offset_t copy_addr;
4201 vm_map_copy_t target_copy_map;
4202 vm_map_offset_t overmap_start, overmap_end;
4203 vm_map_offset_t trimmed_start;
4204 vm_map_size_t target_size;
4205
4206 if (!vm_map_kernel_flags_check_vmflags(vmk_flags,
4207 (VM_FLAGS_FIXED |
4208 VM_FLAGS_ANYWHERE |
4209 VM_FLAGS_OVERWRITE |
4210 VM_FLAGS_RETURN_4K_DATA_ADDR |
4211 VM_FLAGS_RETURN_DATA_ADDR))) {
4212 named_entry_unlock(named_entry);
4213 return KERN_INVALID_ARGUMENT;
4214 }
4215
4216 copy_map = named_entry->backing.copy;
4217 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4218 if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4219 /* unsupported type; should not happen */
4220 printf("vm_map_enter_mem_object: "
4221 "memory_entry->backing.copy "
4222 "unsupported type 0x%x\n",
4223 copy_map->type);
4224 named_entry_unlock(named_entry);
4225 return KERN_INVALID_ARGUMENT;
4226 }
4227
4228 if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4229 DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, offset, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4230 }
4231
4232 if (vmk_flags.vmf_return_data_addr ||
4233 vmk_flags.vmf_return_4k_data_addr) {
4234 offset_in_mapping = offset & VM_MAP_PAGE_MASK(target_map);
4235 if (vmk_flags.vmf_return_4k_data_addr) {
4236 offset_in_mapping &= ~((signed)(0xFFF));
4237 }
4238 }
4239
4240 target_copy_map = VM_MAP_COPY_NULL;
4241 target_size = copy_map->size;
4242 overmap_start = 0;
4243 overmap_end = 0;
4244 trimmed_start = 0;
4245 if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4246 DEBUG4K_ADJUST("adjusting...\n");
4247 kr = vm_map_copy_adjust_to_target(
4248 copy_map,
4249 offset /* includes data_offset */,
4250 initial_size,
4251 target_map,
4252 copy,
4253 &target_copy_map,
4254 &overmap_start,
4255 &overmap_end,
4256 &trimmed_start);
4257 if (kr != KERN_SUCCESS) {
4258 named_entry_unlock(named_entry);
4259 return kr;
4260 }
4261 target_size = target_copy_map->size;
4262 if (trimmed_start >= data_offset) {
4263 data_offset = offset & VM_MAP_PAGE_MASK(target_map);
4264 } else {
4265 data_offset -= trimmed_start;
4266 }
4267 } else {
4268 /*
4269 * Assert that the vm_map_copy is coming from the right
4270 * zone and hasn't been forged
4271 */
4272 vm_map_copy_require(copy_map);
4273 target_copy_map = copy_map;
4274 }
4275
4276 vm_map_kernel_flags_t rsv_flags = vmk_flags;
4277
4278 vm_map_kernel_flags_and_vmflags(&rsv_flags,
4279 (VM_FLAGS_FIXED |
4280 VM_FLAGS_ANYWHERE |
4281 VM_FLAGS_OVERWRITE |
4282 VM_FLAGS_RETURN_4K_DATA_ADDR |
4283 VM_FLAGS_RETURN_DATA_ADDR));
4284
4285 /* reserve a contiguous range */
4286 kr = vm_map_enter(target_map,
4287 &map_addr,
4288 vm_map_round_page(target_size, VM_MAP_PAGE_MASK(target_map)),
4289 mask,
4290 rsv_flags,
4291 VM_OBJECT_NULL,
4292 0,
4293 FALSE, /* copy */
4294 cur_protection,
4295 max_protection,
4296 inheritance);
4297 if (kr != KERN_SUCCESS) {
4298 DEBUG4K_ERROR("kr 0x%x\n", kr);
4299 if (target_copy_map != copy_map) {
4300 vm_map_copy_discard(target_copy_map);
4301 target_copy_map = VM_MAP_COPY_NULL;
4302 }
4303 named_entry_unlock(named_entry);
4304 return kr;
4305 }
4306
4307 copy_addr = map_addr;
4308
4309 for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4310 copy_entry != vm_map_copy_to_entry(target_copy_map);
4311 copy_entry = copy_entry->vme_next) {
4312 vm_map_t copy_submap = VM_MAP_NULL;
4313 vm_object_t copy_object = VM_OBJECT_NULL;
4314 vm_map_size_t copy_size;
4315 vm_object_offset_t copy_offset;
4316 boolean_t do_copy = false;
4317
4318 if (copy_entry->is_sub_map) {
4319 copy_submap = VME_SUBMAP(copy_entry);
4320 copy_object = (vm_object_t)copy_submap;
4321 } else {
4322 copy_object = VME_OBJECT(copy_entry);
4323 }
4324 copy_offset = VME_OFFSET(copy_entry);
4325 copy_size = (copy_entry->vme_end -
4326 copy_entry->vme_start);
4327
4328 /* sanity check */
4329 if ((copy_addr + copy_size) >
4330 (map_addr +
4331 overmap_start + overmap_end +
4332 named_entry->size /* XXX full size */)) {
4333 /* over-mapping too much !? */
4334 kr = KERN_INVALID_ARGUMENT;
4335 DEBUG4K_ERROR("kr 0x%x\n", kr);
4336 /* abort */
4337 break;
4338 }
4339
4340 /* take a reference on the object */
4341 if (copy_entry->is_sub_map) {
4342 vm_map_reference(copy_submap);
4343 } else {
4344 if (!copy &&
4345 copy_object != VM_OBJECT_NULL &&
4346 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4347 /*
4348 * We need to resolve our side of this
4349 * "symmetric" copy-on-write now; we
4350 * need a new object to map and share,
4351 * instead of the current one which
4352 * might still be shared with the
4353 * original mapping.
4354 *
4355 * Note: A "vm_map_copy_t" does not
4356 * have a lock but we're protected by
4357 * the named entry's lock here.
4358 */
4359 // assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4360 VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4361 assert(copy_object != VME_OBJECT(copy_entry));
4362 if (!copy_entry->needs_copy &&
4363 copy_entry->protection & VM_PROT_WRITE) {
4364 vm_prot_t prot;
4365
4366 prot = copy_entry->protection & ~VM_PROT_WRITE;
4367 vm_object_pmap_protect(copy_object,
4368 copy_offset,
4369 copy_size,
4370 PMAP_NULL,
4371 PAGE_SIZE,
4372 0,
4373 prot);
4374 }
4375 copy_entry->needs_copy = FALSE;
4376 copy_entry->is_shared = TRUE;
4377 copy_object = VME_OBJECT(copy_entry);
4378 copy_offset = VME_OFFSET(copy_entry);
4379 vm_object_lock(copy_object);
4380 /* we're about to make a shared mapping of this object */
4381 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4382 copy_object->true_share = TRUE;
4383 vm_object_unlock(copy_object);
4384 }
4385
4386 if (copy_object != VM_OBJECT_NULL &&
4387 copy_object->named &&
4388 copy_object->pager != MEMORY_OBJECT_NULL &&
4389 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4390 memory_object_t pager;
4391 vm_prot_t pager_prot;
4392
4393 /*
4394 * For "named" VM objects, let the pager know that the
4395 * memory object is being mapped. Some pagers need to keep
4396 * track of this, to know when they can reclaim the memory
4397 * object, for example.
4398 * VM calls memory_object_map() for each mapping (specifying
4399 * the protection of each mapping) and calls
4400 * memory_object_last_unmap() when all the mappings are gone.
4401 */
4402 pager_prot = max_protection;
4403 if (copy) {
4404 /*
4405 * Copy-On-Write mapping: won't modify the
4406 * memory object.
4407 */
4408 pager_prot &= ~VM_PROT_WRITE;
4409 }
4410 vm_object_lock(copy_object);
4411 pager = copy_object->pager;
4412 if (copy_object->named &&
4413 pager != MEMORY_OBJECT_NULL &&
4414 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4415 assert(copy_object->pager_ready);
4416 vm_object_mapping_wait(copy_object, THREAD_UNINT);
4417 vm_object_mapping_begin(copy_object);
4418 vm_object_unlock(copy_object);
4419
4420 kr = memory_object_map(pager, pager_prot);
4421 assert(kr == KERN_SUCCESS);
4422
4423 vm_object_lock(copy_object);
4424 vm_object_mapping_end(copy_object);
4425 }
4426 vm_object_unlock(copy_object);
4427 }
4428
4429 /*
4430 * Perform the copy if requested
4431 */
4432
4433 if (copy && copy_object != VM_OBJECT_NULL) {
4434 vm_object_t new_object;
4435 vm_object_offset_t new_offset;
4436
4437 result = vm_object_copy_strategically(copy_object, copy_offset,
4438 copy_size,
4439 &new_object, &new_offset,
4440 &do_copy);
4441
4442
4443 if (result == KERN_MEMORY_RESTART_COPY) {
4444 boolean_t success;
4445 boolean_t src_needs_copy;
4446
4447 /*
4448 * XXX
4449 * We currently ignore src_needs_copy.
4450 * This really is the issue of how to make
4451 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4452 * non-kernel users to use. Solution forthcoming.
4453 * In the meantime, since we don't allow non-kernel
4454 * memory managers to specify symmetric copy,
4455 * we won't run into problems here.
4456 */
4457 new_object = copy_object;
4458 new_offset = copy_offset;
4459 success = vm_object_copy_quickly(new_object,
4460 new_offset,
4461 copy_size,
4462 &src_needs_copy,
4463 &do_copy);
4464 assert(success);
4465 result = KERN_SUCCESS;
4466 }
4467 if (result != KERN_SUCCESS) {
4468 kr = result;
4469 break;
4470 }
4471
4472 copy_object = new_object;
4473 copy_offset = new_offset;
4474 /*
4475 * No extra object reference for the mapping:
4476 * the mapping should be the only thing keeping
4477 * this new object alive.
4478 */
4479 } else {
4480 /*
4481 * We already have the right object
4482 * to map.
4483 */
4484 copy_object = VME_OBJECT(copy_entry);
4485 /* take an extra ref for the mapping below */
4486 vm_object_reference(copy_object);
4487 }
4488 }
4489
4490 /*
4491 * If the caller does not want a specific
4492 * tag for this new mapping: use
4493 * the tag of the original mapping.
4494 */
4495 vm_map_kernel_flags_t vmk_remap_flags = {
4496 .vmkf_submap = copy_entry->is_sub_map,
4497 };
4498
4499 vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
4500 vm_map_kernel_flags_vmflags(vmk_flags),
4501 vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4502
4503 /* over-map the object into destination */
4504 vmk_remap_flags.vmf_fixed = true;
4505 vmk_remap_flags.vmf_overwrite = true;
4506
4507 if (!copy && !copy_entry->is_sub_map) {
4508 /*
4509 * copy-on-write should have been
4510 * resolved at this point, or we would
4511 * end up sharing instead of copying.
4512 */
4513 assert(!copy_entry->needs_copy);
4514 }
4515 #if XNU_TARGET_OS_OSX
4516 if (copy_entry->used_for_jit) {
4517 vmk_remap_flags.vmkf_map_jit = TRUE;
4518 }
4519 #endif /* XNU_TARGET_OS_OSX */
4520
4521 kr = vm_map_enter(target_map,
4522 ©_addr,
4523 copy_size,
4524 (vm_map_offset_t) 0,
4525 vmk_remap_flags,
4526 copy_object,
4527 copy_offset,
4528 ((copy_object == NULL)
4529 ? FALSE
4530 : (copy || copy_entry->needs_copy)),
4531 cur_protection,
4532 max_protection,
4533 inheritance);
4534 if (kr != KERN_SUCCESS) {
4535 DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4536 if (copy_entry->is_sub_map) {
4537 vm_map_deallocate(copy_submap);
4538 } else {
4539 vm_object_deallocate(copy_object);
4540 }
4541 /* abort */
4542 break;
4543 }
4544
4545 /* next mapping */
4546 copy_addr += copy_size;
4547 }
4548
4549 if (kr == KERN_SUCCESS) {
4550 if (vmk_flags.vmf_return_data_addr ||
4551 vmk_flags.vmf_return_4k_data_addr) {
4552 *address = map_addr + offset_in_mapping;
4553 } else {
4554 *address = map_addr;
4555 }
4556 if (overmap_start) {
4557 *address += overmap_start;
4558 DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t) offset_in_mapping, (uint64_t)overmap_start, (uint64_t)*address);
4559 }
4560 }
4561 named_entry_unlock(named_entry);
4562 if (target_copy_map != copy_map) {
4563 vm_map_copy_discard(target_copy_map);
4564 target_copy_map = VM_MAP_COPY_NULL;
4565 }
4566
4567 if (kr != KERN_SUCCESS && !vmk_flags.vmf_overwrite) {
4568 /* deallocate the contiguous range */
4569 (void) vm_deallocate(target_map,
4570 map_addr,
4571 map_size);
4572 }
4573
4574 return kr;
4575 }
4576
4577 if (named_entry->is_object) {
4578 unsigned int access;
4579 unsigned int wimg_mode;
4580
4581 /* we are mapping a VM object */
4582
4583 access = named_entry->access;
4584
4585 if (vmk_flags.vmf_return_data_addr ||
4586 vmk_flags.vmf_return_4k_data_addr) {
4587 offset_in_mapping = offset - VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4588 if (vmk_flags.vmf_return_4k_data_addr) {
4589 offset_in_mapping &= ~((signed)(0xFFF));
4590 }
4591 offset = VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4592 map_size = VM_MAP_ROUND_PAGE((offset + offset_in_mapping + initial_size) - offset, VM_MAP_PAGE_MASK(target_map));
4593 }
4594
4595 object = vm_named_entry_to_vm_object(named_entry);
4596 assert(object != VM_OBJECT_NULL);
4597 vm_object_lock(object);
4598 named_entry_unlock(named_entry);
4599
4600 vm_object_reference_locked(object);
4601
4602 wimg_mode = object->wimg_bits;
4603 vm_prot_to_wimg(access, &wimg_mode);
4604 if (object->wimg_bits != wimg_mode) {
4605 vm_object_change_wimg_mode(object, wimg_mode);
4606 }
4607
4608 vm_object_unlock(object);
4609 } else {
4610 panic("invalid VM named entry %p", named_entry);
4611 }
4612 } else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4613 /*
4614 * JMM - This is temporary until we unify named entries
4615 * and raw memory objects.
4616 *
4617 * Detected fake ip_kotype for a memory object. In
4618 * this case, the port isn't really a port at all, but
4619 * instead is just a raw memory object.
4620 */
4621 if (vmk_flags.vmf_return_data_addr ||
4622 vmk_flags.vmf_return_4k_data_addr) {
4623 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4624 }
4625
4626 object = memory_object_to_vm_object((memory_object_t)port);
4627 if (object == VM_OBJECT_NULL) {
4628 return KERN_INVALID_OBJECT;
4629 }
4630 vm_object_reference(object);
4631
4632 /* wait for object (if any) to be ready */
4633 if (object != VM_OBJECT_NULL) {
4634 if (object == kernel_object) {
4635 printf("Warning: Attempt to map kernel object"
4636 " by a non-private kernel entity\n");
4637 return KERN_INVALID_OBJECT;
4638 }
4639 if (!object->pager_ready) {
4640 vm_object_lock(object);
4641
4642 while (!object->pager_ready) {
4643 vm_object_wait(object,
4644 VM_OBJECT_EVENT_PAGER_READY,
4645 THREAD_UNINT);
4646 vm_object_lock(object);
4647 }
4648 vm_object_unlock(object);
4649 }
4650 }
4651 } else {
4652 return KERN_INVALID_OBJECT;
4653 }
4654
4655 if (object != VM_OBJECT_NULL &&
4656 object->named &&
4657 object->pager != MEMORY_OBJECT_NULL &&
4658 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4659 memory_object_t pager;
4660 vm_prot_t pager_prot;
4661 kern_return_t kr;
4662
4663 /*
4664 * For "named" VM objects, let the pager know that the
4665 * memory object is being mapped. Some pagers need to keep
4666 * track of this, to know when they can reclaim the memory
4667 * object, for example.
4668 * VM calls memory_object_map() for each mapping (specifying
4669 * the protection of each mapping) and calls
4670 * memory_object_last_unmap() when all the mappings are gone.
4671 */
4672 pager_prot = max_protection;
4673 if (copy) {
4674 /*
4675 * Copy-On-Write mapping: won't modify the
4676 * memory object.
4677 */
4678 pager_prot &= ~VM_PROT_WRITE;
4679 }
4680 vm_object_lock(object);
4681 pager = object->pager;
4682 if (object->named &&
4683 pager != MEMORY_OBJECT_NULL &&
4684 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4685 assert(object->pager_ready);
4686 vm_object_mapping_wait(object, THREAD_UNINT);
4687 vm_object_mapping_begin(object);
4688 vm_object_unlock(object);
4689
4690 kr = memory_object_map(pager, pager_prot);
4691 assert(kr == KERN_SUCCESS);
4692
4693 vm_object_lock(object);
4694 vm_object_mapping_end(object);
4695 }
4696 vm_object_unlock(object);
4697 }
4698
4699 /*
4700 * Perform the copy if requested
4701 */
4702
4703 if (copy) {
4704 vm_object_t new_object;
4705 vm_object_offset_t new_offset;
4706
4707 result = vm_object_copy_strategically(object, offset,
4708 map_size,
4709 &new_object, &new_offset,
4710 ©);
4711
4712
4713 if (result == KERN_MEMORY_RESTART_COPY) {
4714 boolean_t success;
4715 boolean_t src_needs_copy;
4716
4717 /*
4718 * XXX
4719 * We currently ignore src_needs_copy.
4720 * This really is the issue of how to make
4721 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4722 * non-kernel users to use. Solution forthcoming.
4723 * In the meantime, since we don't allow non-kernel
4724 * memory managers to specify symmetric copy,
4725 * we won't run into problems here.
4726 */
4727 new_object = object;
4728 new_offset = offset;
4729 success = vm_object_copy_quickly(new_object,
4730 new_offset,
4731 map_size,
4732 &src_needs_copy,
4733 ©);
4734 assert(success);
4735 result = KERN_SUCCESS;
4736 }
4737 /*
4738 * Throw away the reference to the
4739 * original object, as it won't be mapped.
4740 */
4741
4742 vm_object_deallocate(object);
4743
4744 if (result != KERN_SUCCESS) {
4745 return result;
4746 }
4747
4748 object = new_object;
4749 offset = new_offset;
4750 }
4751
4752 /*
4753 * If non-kernel users want to try to prefault pages, the mapping and prefault
4754 * needs to be atomic.
4755 */
4756 kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4757 vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4758
4759 #if __arm64__
4760 if (fourk) {
4761 /* map this object in a "4K" pager */
4762 result = vm_map_enter_fourk(target_map,
4763 &map_addr,
4764 map_size,
4765 (vm_map_offset_t) mask,
4766 vmk_flags,
4767 object,
4768 offset,
4769 copy,
4770 cur_protection,
4771 max_protection,
4772 inheritance);
4773 } else
4774 #endif /* __arm64__ */
4775 {
4776 result = vm_map_enter(target_map,
4777 &map_addr, map_size,
4778 (vm_map_offset_t)mask,
4779 vmk_flags,
4780 object, offset,
4781 copy,
4782 cur_protection, max_protection,
4783 inheritance);
4784 }
4785 if (result != KERN_SUCCESS) {
4786 vm_object_deallocate(object);
4787 }
4788
4789 /*
4790 * Try to prefault, and do not forget to release the vm map lock.
4791 */
4792 if (result == KERN_SUCCESS && try_prefault) {
4793 mach_vm_address_t va = map_addr;
4794 kern_return_t kr = KERN_SUCCESS;
4795 unsigned int i = 0;
4796 int pmap_options;
4797
4798 pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4799 if (object->internal) {
4800 pmap_options |= PMAP_OPTIONS_INTERNAL;
4801 }
4802
4803 for (i = 0; i < page_list_count; ++i) {
4804 if (!UPL_VALID_PAGE(page_list, i)) {
4805 if (kernel_prefault) {
4806 assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4807 result = KERN_MEMORY_ERROR;
4808 break;
4809 }
4810 } else {
4811 /*
4812 * If this function call failed, we should stop
4813 * trying to optimize, other calls are likely
4814 * going to fail too.
4815 *
4816 * We are not gonna report an error for such
4817 * failure though. That's an optimization, not
4818 * something critical.
4819 */
4820 kr = pmap_enter_options(target_map->pmap,
4821 va, UPL_PHYS_PAGE(page_list, i),
4822 cur_protection, VM_PROT_NONE,
4823 0, TRUE, pmap_options, NULL);
4824 if (kr != KERN_SUCCESS) {
4825 OSIncrementAtomic64(&vm_prefault_nb_bailout);
4826 if (kernel_prefault) {
4827 result = kr;
4828 }
4829 break;
4830 }
4831 OSIncrementAtomic64(&vm_prefault_nb_pages);
4832 }
4833
4834 /* Next virtual address */
4835 va += PAGE_SIZE;
4836 }
4837 if (vmk_flags.vmkf_keep_map_locked) {
4838 vm_map_unlock(target_map);
4839 }
4840 }
4841
4842 if (vmk_flags.vmf_return_data_addr ||
4843 vmk_flags.vmf_return_4k_data_addr) {
4844 *address = map_addr + offset_in_mapping;
4845 } else {
4846 *address = map_addr;
4847 }
4848 return result;
4849 }
4850
4851 kern_return_t
vm_map_enter_mem_object(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_t offset,boolean_t copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)4852 vm_map_enter_mem_object(
4853 vm_map_t target_map,
4854 vm_map_offset_t *address,
4855 vm_map_size_t initial_size,
4856 vm_map_offset_t mask,
4857 vm_map_kernel_flags_t vmk_flags,
4858 ipc_port_t port,
4859 vm_object_offset_t offset,
4860 boolean_t copy,
4861 vm_prot_t cur_protection,
4862 vm_prot_t max_protection,
4863 vm_inherit_t inheritance)
4864 {
4865 kern_return_t ret;
4866
4867 /* range_id is set by vm_map_enter_mem_object_helper */
4868 ret = vm_map_enter_mem_object_helper(target_map,
4869 address,
4870 initial_size,
4871 mask,
4872 vmk_flags,
4873 port,
4874 offset,
4875 copy,
4876 cur_protection,
4877 max_protection,
4878 inheritance,
4879 NULL,
4880 0);
4881
4882 #if KASAN
4883 if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4884 kasan_notify_address(*address, initial_size);
4885 }
4886 #endif
4887
4888 return ret;
4889 }
4890
4891 kern_return_t
vm_map_enter_mem_object_prefault(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_t offset,vm_prot_t cur_protection,vm_prot_t max_protection,upl_page_list_ptr_t page_list,unsigned int page_list_count)4892 vm_map_enter_mem_object_prefault(
4893 vm_map_t target_map,
4894 vm_map_offset_t *address,
4895 vm_map_size_t initial_size,
4896 vm_map_offset_t mask,
4897 vm_map_kernel_flags_t vmk_flags,
4898 ipc_port_t port,
4899 vm_object_offset_t offset,
4900 vm_prot_t cur_protection,
4901 vm_prot_t max_protection,
4902 upl_page_list_ptr_t page_list,
4903 unsigned int page_list_count)
4904 {
4905 kern_return_t ret;
4906
4907 /* range_id is set by vm_map_enter_mem_object_helper */
4908 ret = vm_map_enter_mem_object_helper(target_map,
4909 address,
4910 initial_size,
4911 mask,
4912 vmk_flags,
4913 port,
4914 offset,
4915 FALSE,
4916 cur_protection,
4917 max_protection,
4918 VM_INHERIT_DEFAULT,
4919 page_list,
4920 page_list_count);
4921
4922 #if KASAN
4923 if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4924 kasan_notify_address(*address, initial_size);
4925 }
4926 #endif
4927
4928 return ret;
4929 }
4930
4931
/*
 * vm_map_enter_mem_object_control:
 *
 * Map the VM object backing the memory object control "control" into
 * "target_map" at "*address" (which is updated with the actual mapping
 * address on return).  Counterpart of vm_map_enter_mem_object() for
 * callers that hold a memory_object_control_t rather than a port.
 *
 * If "copy" is TRUE, the object is copied (strategically) and the copy
 * is mapped instead of the original; otherwise the object is mapped
 * shared.  Returns KERN_SUCCESS or a Mach error code.
 */
kern_return_t
vm_map_enter_mem_object_control(
	vm_map_t                target_map,
	vm_map_offset_t         *address,
	vm_map_size_t           initial_size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	memory_object_control_t control,
	vm_object_offset_t      offset,
	boolean_t               copy,
	vm_prot_t               cur_protection,
	vm_prot_t               max_protection,
	vm_inherit_t            inheritance)
{
	vm_map_address_t        map_addr;
	vm_map_size_t           map_size;
	vm_object_t             object;
	vm_object_size_t        size;
	kern_return_t           result;
	memory_object_t         pager;
	vm_prot_t               pager_prot;
	kern_return_t           kr;
#if __arm64__
	/* map this object through a 4K pager on 16K-page targets */
	boolean_t               fourk = vmk_flags.vmkf_fourk;
#endif /* __arm64__ */

	/*
	 * Check arguments for validity
	 */
	if ((target_map == VM_MAP_NULL) ||
	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
	    (inheritance > VM_INHERIT_LAST_VALID) ||
	    initial_size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

#if __arm64__
	/* "4K" mapping is pointless if the map already uses 4K pages */
	if (fourk && VM_MAP_PAGE_MASK(target_map) < PAGE_MASK) {
		fourk = FALSE;
	}

	if (fourk) {
		map_addr = vm_map_trunc_page(*address,
		    FOURK_PAGE_MASK);
		map_size = vm_map_round_page(initial_size,
		    FOURK_PAGE_MASK);
	} else
#endif /* __arm64__ */
	{
		/* align the requested range to the target map's page size */
		map_addr = vm_map_trunc_page(*address,
		    VM_MAP_PAGE_MASK(target_map));
		map_size = vm_map_round_page(initial_size,
		    VM_MAP_PAGE_MASK(target_map));
	}
	/* object-granularity size, used for the copy path below */
	size = vm_object_round_page(initial_size);

	object = memory_object_control_to_vm_object(control);

	if (object == VM_OBJECT_NULL) {
		return KERN_INVALID_OBJECT;
	}

	if (object == kernel_object) {
		printf("Warning: Attempt to map kernel object"
		    " by a non-private kernel entity\n");
		return KERN_INVALID_OBJECT;
	}

	/*
	 * Take a reference on the object for the mapping below.
	 * NOTE(review): raw ref_count++ under the object lock rather than
	 * vm_object_reference_locked() — presumably equivalent here; confirm.
	 */
	vm_object_lock(object);
	object->ref_count++;

	/*
	 * For "named" VM objects, let the pager know that the
	 * memory object is being mapped. Some pagers need to keep
	 * track of this, to know when they can reclaim the memory
	 * object, for example.
	 * VM calls memory_object_map() for each mapping (specifying
	 * the protection of each mapping) and calls
	 * memory_object_last_unmap() when all the mappings are gone.
	 */
	pager_prot = max_protection;
	if (copy) {
		/* copy-on-write mapping: won't modify the memory object */
		pager_prot &= ~VM_PROT_WRITE;
	}
	pager = object->pager;
	if (object->named &&
	    pager != MEMORY_OBJECT_NULL &&
	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
		assert(object->pager_ready);
		/* serialize with other mappers, then drop the lock for the upcall */
		vm_object_mapping_wait(object, THREAD_UNINT);
		vm_object_mapping_begin(object);
		vm_object_unlock(object);

		kr = memory_object_map(pager, pager_prot);
		assert(kr == KERN_SUCCESS);

		vm_object_lock(object);
		vm_object_mapping_end(object);
	}
	vm_object_unlock(object);

	/*
	 * Perform the copy if requested
	 */

	if (copy) {
		vm_object_t new_object;
		vm_object_offset_t new_offset;

		result = vm_object_copy_strategically(object, offset, size,
		    &new_object, &new_offset,
		    &copy);


		if (result == KERN_MEMORY_RESTART_COPY) {
			boolean_t success;
			boolean_t src_needs_copy;

			/*
			 * XXX
			 * We currently ignore src_needs_copy.
			 * This really is the issue of how to make
			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
			 * non-kernel users to use. Solution forthcoming.
			 * In the meantime, since we don't allow non-kernel
			 * memory managers to specify symmetric copy,
			 * we won't run into problems here.
			 */
			new_object = object;
			new_offset = offset;
			success = vm_object_copy_quickly(new_object,
			    new_offset, size,
			    &src_needs_copy,
			    &copy);
			assert(success);
			result = KERN_SUCCESS;
		}
		/*
		 * Throw away the reference to the
		 * original object, as it won't be mapped.
		 */

		vm_object_deallocate(object);

		if (result != KERN_SUCCESS) {
			return result;
		}

		/* map the copy instead of the original */
		object = new_object;
		offset = new_offset;
	}

#if __arm64__
	if (fourk) {
		result = vm_map_enter_fourk(target_map,
		    &map_addr,
		    map_size,
		    (vm_map_offset_t)mask,
		    vmk_flags,
		    object, offset,
		    copy,
		    cur_protection, max_protection,
		    inheritance);
	} else
#endif /* __arm64__ */
	{
		result = vm_map_enter(target_map,
		    &map_addr, map_size,
		    (vm_map_offset_t)mask,
		    vmk_flags,
		    object, offset,
		    copy,
		    cur_protection, max_protection,
		    inheritance);
	}
	if (result != KERN_SUCCESS) {
		/* mapping failed: drop the reference taken above */
		vm_object_deallocate(object);
	}
	*address = map_addr;

	return result;
}
5115
5116
5117 #if VM_CPM
5118
5119 #ifdef MACH_ASSERT
5120 extern pmap_paddr_t avail_start, avail_end;
5121 #endif
5122
5123 /*
5124 * Allocate memory in the specified map, with the caveat that
5125 * the memory is physically contiguous. This call may fail
5126 * if the system can't find sufficient contiguous memory.
5127 * This call may cause or lead to heart-stopping amounts of
5128 * paging activity.
5129 *
5130 * Memory obtained from this call should be freed in the
5131 * normal way, viz., via vm_deallocate.
5132 */
5133 kern_return_t
vm_map_enter_cpm(vm_map_t map,vm_map_offset_t * addr,vm_map_size_t size,vm_map_kernel_flags_t vmk_flags)5134 vm_map_enter_cpm(
5135 vm_map_t map,
5136 vm_map_offset_t *addr,
5137 vm_map_size_t size,
5138 vm_map_kernel_flags_t vmk_flags)
5139 {
5140 vm_object_t cpm_obj;
5141 pmap_t pmap;
5142 vm_page_t m, pages;
5143 kern_return_t kr;
5144 vm_map_offset_t va, start, end, offset;
5145 #if MACH_ASSERT
5146 vm_map_offset_t prev_addr = 0;
5147 #endif /* MACH_ASSERT */
5148
5149 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
5150 /* XXX TODO4K do we need to support this? */
5151 *addr = 0;
5152 return KERN_NOT_SUPPORTED;
5153 }
5154
5155 if (size == 0) {
5156 *addr = 0;
5157 return KERN_SUCCESS;
5158 }
5159 if (vmk_flags.vmf_fixed) {
5160 *addr = vm_map_trunc_page(*addr,
5161 VM_MAP_PAGE_MASK(map));
5162 } else {
5163 *addr = vm_map_min(map);
5164 }
5165 size = vm_map_round_page(size,
5166 VM_MAP_PAGE_MASK(map));
5167
5168 /*
5169 * LP64todo - cpm_allocate should probably allow
5170 * allocations of >4GB, but not with the current
5171 * algorithm, so just cast down the size for now.
5172 */
5173 if (size > VM_MAX_ADDRESS) {
5174 return KERN_RESOURCE_SHORTAGE;
5175 }
5176 if ((kr = cpm_allocate(CAST_DOWN(vm_size_t, size),
5177 &pages, 0, 0, TRUE, flags)) != KERN_SUCCESS) {
5178 return kr;
5179 }
5180
5181 cpm_obj = vm_object_allocate((vm_object_size_t)size);
5182 assert(cpm_obj != VM_OBJECT_NULL);
5183 assert(cpm_obj->internal);
5184 assert(cpm_obj->vo_size == (vm_object_size_t)size);
5185 assert(cpm_obj->can_persist == FALSE);
5186 assert(cpm_obj->pager_created == FALSE);
5187 assert(cpm_obj->pageout == FALSE);
5188 assert(cpm_obj->shadow == VM_OBJECT_NULL);
5189
5190 /*
5191 * Insert pages into object.
5192 */
5193
5194 vm_object_lock(cpm_obj);
5195 for (offset = 0; offset < size; offset += PAGE_SIZE) {
5196 m = pages;
5197 pages = NEXT_PAGE(m);
5198 *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
5199
5200 assert(!m->vmp_gobbled);
5201 assert(!m->vmp_wanted);
5202 assert(!m->vmp_pageout);
5203 assert(!m->vmp_tabled);
5204 assert(VM_PAGE_WIRED(m));
5205 assert(m->vmp_busy);
5206 assert(VM_PAGE_GET_PHYS_PAGE(m) >= (avail_start >> PAGE_SHIFT) && VM_PAGE_GET_PHYS_PAGE(m) <= (avail_end >> PAGE_SHIFT));
5207
5208 m->vmp_busy = FALSE;
5209 vm_page_insert(m, cpm_obj, offset);
5210 }
5211 assert(cpm_obj->resident_page_count == size / PAGE_SIZE);
5212 vm_object_unlock(cpm_obj);
5213
5214 /*
5215 * Hang onto a reference on the object in case a
5216 * multi-threaded application for some reason decides
5217 * to deallocate the portion of the address space into
5218 * which we will insert this object.
5219 *
5220 * Unfortunately, we must insert the object now before
5221 * we can talk to the pmap module about which addresses
5222 * must be wired down. Hence, the race with a multi-
5223 * threaded app.
5224 */
5225 vm_object_reference(cpm_obj);
5226
5227 /*
5228 * Insert object into map.
5229 */
5230
5231 kr = vm_map_enter(
5232 map,
5233 addr,
5234 size,
5235 (vm_map_offset_t)0,
5236 vmk_flags,
5237 cpm_obj,
5238 (vm_object_offset_t)0,
5239 FALSE,
5240 VM_PROT_ALL,
5241 VM_PROT_ALL,
5242 VM_INHERIT_DEFAULT);
5243
5244 if (kr != KERN_SUCCESS) {
5245 /*
5246 * A CPM object doesn't have can_persist set,
5247 * so all we have to do is deallocate it to
5248 * free up these pages.
5249 */
5250 assert(cpm_obj->pager_created == FALSE);
5251 assert(cpm_obj->can_persist == FALSE);
5252 assert(cpm_obj->pageout == FALSE);
5253 assert(cpm_obj->shadow == VM_OBJECT_NULL);
5254 vm_object_deallocate(cpm_obj); /* kill acquired ref */
5255 vm_object_deallocate(cpm_obj); /* kill creation ref */
5256 }
5257
5258 /*
5259 * Inform the physical mapping system that the
5260 * range of addresses may not fault, so that
5261 * page tables and such can be locked down as well.
5262 */
5263 start = *addr;
5264 end = start + size;
5265 pmap = vm_map_pmap(map);
5266 pmap_pageable(pmap, start, end, FALSE);
5267
5268 /*
5269 * Enter each page into the pmap, to avoid faults.
5270 * Note that this loop could be coded more efficiently,
5271 * if the need arose, rather than looking up each page
5272 * again.
5273 */
5274 for (offset = 0, va = start; offset < size;
5275 va += PAGE_SIZE, offset += PAGE_SIZE) {
5276 int type_of_fault;
5277
5278 vm_object_lock(cpm_obj);
5279 m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5280 assert(m != VM_PAGE_NULL);
5281
5282 vm_page_zero_fill(m);
5283
5284 type_of_fault = DBG_ZERO_FILL_FAULT;
5285
5286 vm_fault_enter(m, pmap, va,
5287 PAGE_SIZE, 0,
5288 VM_PROT_ALL, VM_PROT_WRITE,
5289 VM_PAGE_WIRED(m),
5290 FALSE, /* change_wiring */
5291 VM_KERN_MEMORY_NONE, /* tag - not wiring */
5292 FALSE, /* cs_bypass */
5293 0, /* user_tag */
5294 0, /* pmap_options */
5295 NULL, /* need_retry */
5296 &type_of_fault);
5297
5298 vm_object_unlock(cpm_obj);
5299 }
5300
5301 #if MACH_ASSERT
5302 /*
5303 * Verify ordering in address space.
5304 */
5305 for (offset = 0; offset < size; offset += PAGE_SIZE) {
5306 vm_object_lock(cpm_obj);
5307 m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5308 vm_object_unlock(cpm_obj);
5309 if (m == VM_PAGE_NULL) {
5310 panic("vm_allocate_cpm: obj %p off 0x%llx no page",
5311 cpm_obj, (uint64_t)offset);
5312 }
5313 assert(m->vmp_tabled);
5314 assert(!m->vmp_busy);
5315 assert(!m->vmp_wanted);
5316 assert(!m->vmp_fictitious);
5317 assert(!m->vmp_private);
5318 assert(!m->vmp_absent);
5319 assert(!m->vmp_cleaning);
5320 assert(!m->vmp_laundry);
5321 assert(!m->vmp_precious);
5322 assert(!m->vmp_clustered);
5323 if (offset != 0) {
5324 if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
5325 printf("start 0x%llx end 0x%llx va 0x%llx\n",
5326 (uint64_t)start, (uint64_t)end, (uint64_t)va);
5327 printf("obj %p off 0x%llx\n", cpm_obj, (uint64_t)offset);
5328 printf("m %p prev_address 0x%llx\n", m, (uint64_t)prev_addr);
5329 panic("vm_allocate_cpm: pages not contig!");
5330 }
5331 }
5332 prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
5333 }
5334 #endif /* MACH_ASSERT */
5335
5336 vm_object_deallocate(cpm_obj); /* kill extra ref */
5337
5338 return kr;
5339 }
5340
5341
5342 #else /* VM_CPM */
5343
5344 /*
5345 * Interface is defined in all cases, but unless the kernel
5346 * is built explicitly for this option, the interface does
5347 * nothing.
5348 */
5349
kern_return_t
vm_map_enter_cpm(
    __unused vm_map_t               map,
    __unused vm_map_offset_t        *addr,
    __unused vm_map_size_t          size,
    __unused vm_map_kernel_flags_t  vmk_flags)
{
    /*
     * Stub for kernels built without VM_CPM: contiguous physical
     * memory allocation is not supported, so always fail.
     */
    return KERN_FAILURE;
}
5359 #endif /* VM_CPM */
5360
5361 /* Not used without nested pmaps */
5362 #ifndef NO_NESTED_PMAP
5363 /*
5364 * Clip and unnest a portion of a nested submap mapping.
5365 */
5366
5367
static void
vm_map_clip_unnest(
    vm_map_t        map,
    vm_map_entry_t  entry,
    vm_map_offset_t start_unnest,
    vm_map_offset_t end_unnest)
{
    /* Remember the caller's requested range for diagnostics below. */
    vm_map_offset_t old_start_unnest = start_unnest;
    vm_map_offset_t old_end_unnest = end_unnest;

    /* Only a nested (use_pmap) submap entry may be unnested. */
    assert(entry->is_sub_map);
    assert(VME_SUBMAP(entry) != NULL);
    assert(entry->use_pmap);

    /*
     * Query the platform for the optimal unnest range.
     * DRK: There's some duplication of effort here, since
     * callers may have adjusted the range to some extent.  This
     * routine was introduced to support 1GiB subtree nesting
     * for x86 platforms, which can also nest on 2MiB boundaries
     * depending on size/alignment.
     */
    if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
        assert(VME_SUBMAP(entry)->is_nested_map);
        assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
        log_unnest_badness(map,
            old_start_unnest,
            old_end_unnest,
            VME_SUBMAP(entry)->is_nested_map,
            (entry->vme_start +
            VME_SUBMAP(entry)->lowest_unnestable_start -
            VME_OFFSET(entry)));
    }

    /* The (possibly adjusted) unnest range must lie within the entry. */
    if (entry->vme_start > start_unnest ||
        entry->vme_end < end_unnest) {
        panic("vm_map_clip_unnest(0x%llx,0x%llx): "
            "bad nested entry: start=0x%llx end=0x%llx\n",
            (long long)start_unnest, (long long)end_unnest,
            (long long)entry->vme_start, (long long)entry->vme_end);
    }

    /*
     * Clip the entry down to the unnest range, keeping the map's
     * first-free hint coherent after each structural change.
     */
    if (start_unnest > entry->vme_start) {
        _vm_map_clip_start(&map->hdr,
            entry,
            start_unnest);
        if (map->holelistenabled) {
            vm_map_store_update_first_free(map, NULL, FALSE);
        } else {
            vm_map_store_update_first_free(map, map->first_free, FALSE);
        }
    }
    if (entry->vme_end > end_unnest) {
        _vm_map_clip_end(&map->hdr,
            entry,
            end_unnest);
        if (map->holelistenabled) {
            vm_map_store_update_first_free(map, NULL, FALSE);
        } else {
            vm_map_store_update_first_free(map, map->first_free, FALSE);
        }
    }

    /* Detach the nested pmap over this entry's (clipped) range. */
    pmap_unnest(map->pmap,
        entry->vme_start,
        entry->vme_end - entry->vme_start);
    if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
        /* clean up parent map/maps */
        vm_map_submap_pmap_clean(
            map, entry->vme_start,
            entry->vme_end,
            VME_SUBMAP(entry),
            VME_OFFSET(entry));
    }
    /* The entry is no longer backed by a shared nested pmap. */
    entry->use_pmap = FALSE;
    if ((map->pmap != kernel_pmap) &&
        (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
        VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
    }
}
5448 #endif /* NO_NESTED_PMAP */
5449
/*
 * Panic helper used by vm_map_clip_{start,end}() when a clip would
 * split a map entry marked vme_atomic (which must never be split).
 */
__abortlike
static void
__vm_map_clip_atomic_entry_panic(
    vm_map_t        map,
    vm_map_entry_t  entry,
    vm_map_offset_t where)
{
    panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
        "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
        (uint64_t)entry->vme_start,
        (uint64_t)entry->vme_end,
        (uint64_t)where);
}
5463
5464 /*
5465 * vm_map_clip_start: [ internal use only ]
5466 *
5467 * Asserts that the given entry begins at or after
5468 * the specified address; if necessary,
5469 * it splits the entry into two.
5470 */
void
vm_map_clip_start(
    vm_map_t        map,
    vm_map_entry_t  entry,
    vm_map_offset_t startaddr)
{
#ifndef NO_NESTED_PMAP
    if (entry->is_sub_map &&
        entry->use_pmap &&
        startaddr >= entry->vme_start) {
        vm_map_offset_t start_unnest, end_unnest;

        /*
         * Make sure "startaddr" is no longer in a nested range
         * before we clip.  Unnest only the minimum range the platform
         * can handle.
         * vm_map_clip_unnest may perform additional adjustments to
         * the unnest range.
         */
        start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
        end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
        vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
    }
#endif /* NO_NESTED_PMAP */
    if (startaddr > entry->vme_start) {
        /*
         * A physically contiguous object can't tolerate a partial
         * mapping: drop all of its pmap mappings before clipping.
         */
        if (!entry->is_sub_map &&
            VME_OBJECT(entry) &&
            VME_OBJECT(entry)->phys_contiguous) {
            pmap_remove(map->pmap,
                (addr64_t)(entry->vme_start),
                (addr64_t)(entry->vme_end));
        }
        /* Atomic entries must never be split. */
        if (entry->vme_atomic) {
            __vm_map_clip_atomic_entry_panic(map, entry, startaddr);
        }

        DTRACE_VM5(
            vm_map_clip_start,
            vm_map_t, map,
            vm_map_offset_t, entry->vme_start,
            vm_map_offset_t, entry->vme_end,
            vm_map_offset_t, startaddr,
            int, VME_ALIAS(entry));

        _vm_map_clip_start(&map->hdr, entry, startaddr);
        /* Keep the map's first-free hint coherent after the split. */
        if (map->holelistenabled) {
            vm_map_store_update_first_free(map, NULL, FALSE);
        } else {
            vm_map_store_update_first_free(map, map->first_free, FALSE);
        }
    }
}
5523
5524
/*
 * Clip the start of "entry" in "copy"'s entry list to "startaddr".
 * No-op unless "startaddr" falls strictly inside the entry.
 */
#define vm_map_copy_clip_start(copy, entry, startaddr) \
	MACRO_BEGIN \
	if ((startaddr) > (entry)->vme_start) \
	        _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
	MACRO_END
5530
5531 /*
5532 * This routine is called only when it is known that
5533 * the entry must be split.
5534 */
static void
_vm_map_clip_start(
    struct vm_map_header    *map_header,
    vm_map_entry_t          entry,
    vm_map_offset_t         start)
{
    vm_map_entry_t  new_entry;

    /*
     * Split off the front portion --
     * note that we must insert the new
     * entry BEFORE this one, so that
     * this entry has the specified starting
     * address.
     */

    if (entry->map_aligned) {
        assert(VM_MAP_PAGE_ALIGNED(start,
            VM_MAP_HDR_PAGE_MASK(map_header)));
    }

    /* Clone the entry; the clone initially covers the full range. */
    new_entry = _vm_map_entry_create(map_header);
    vm_map_entry_copy_full(new_entry, entry);

    /* New entry keeps [vme_start, start); original becomes [start, vme_end). */
    new_entry->vme_end = start;
    assert(new_entry->vme_start < new_entry->vme_end);
    /* Advance the original's backing offset past the split point. */
    VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
    assert(start < entry->vme_end);
    entry->vme_start = start;

    _vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);

    /* Both halves now reference the same submap/object: take a ref. */
    if (entry->is_sub_map) {
        vm_map_reference(VME_SUBMAP(new_entry));
    } else {
        vm_object_reference(VME_OBJECT(new_entry));
    }
}
5573
5574
5575 /*
5576 * vm_map_clip_end: [ internal use only ]
5577 *
5578 * Asserts that the given entry ends at or before
5579 * the specified address; if necessary,
5580 * it splits the entry into two.
5581 */
void
vm_map_clip_end(
    vm_map_t        map,
    vm_map_entry_t  entry,
    vm_map_offset_t endaddr)
{
    if (endaddr > entry->vme_end) {
        /*
         * Within the scope of this clipping, limit "endaddr" to
         * the end of this map entry...
         */
        endaddr = entry->vme_end;
    }
#ifndef NO_NESTED_PMAP
    if (entry->is_sub_map && entry->use_pmap) {
        vm_map_offset_t start_unnest, end_unnest;

        /*
         * Make sure the range between the start of this entry and
         * the new "endaddr" is no longer nested before we clip.
         * Unnest only the minimum range the platform can handle.
         * vm_map_clip_unnest may perform additional adjustments to
         * the unnest range.
         */
        start_unnest = entry->vme_start;
        end_unnest =
            (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
            ~(pmap_shared_region_size_min(map->pmap) - 1);
        vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
    }
#endif /* NO_NESTED_PMAP */
    if (endaddr < entry->vme_end) {
        /*
         * A physically contiguous object can't tolerate a partial
         * mapping: drop all of its pmap mappings before clipping.
         */
        if (!entry->is_sub_map &&
            VME_OBJECT(entry) &&
            VME_OBJECT(entry)->phys_contiguous) {
            pmap_remove(map->pmap,
                (addr64_t)(entry->vme_start),
                (addr64_t)(entry->vme_end));
        }
        /* Atomic entries must never be split. */
        if (entry->vme_atomic) {
            __vm_map_clip_atomic_entry_panic(map, entry, endaddr);
        }
        DTRACE_VM5(
            vm_map_clip_end,
            vm_map_t, map,
            vm_map_offset_t, entry->vme_start,
            vm_map_offset_t, entry->vme_end,
            vm_map_offset_t, endaddr,
            int, VME_ALIAS(entry));

        _vm_map_clip_end(&map->hdr, entry, endaddr);
        /* Keep the map's first-free hint coherent after the split. */
        if (map->holelistenabled) {
            vm_map_store_update_first_free(map, NULL, FALSE);
        } else {
            vm_map_store_update_first_free(map, map->first_free, FALSE);
        }
    }
}
5640
5641
/*
 * Clip the end of "entry" in "copy"'s entry list to "endaddr".
 * No-op unless "endaddr" falls strictly inside the entry.
 */
#define vm_map_copy_clip_end(copy, entry, endaddr) \
	MACRO_BEGIN \
	if ((endaddr) < (entry)->vme_end) \
	        _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
	MACRO_END
5647
5648 /*
5649 * This routine is called only when it is known that
5650 * the entry must be split.
5651 */
static void
_vm_map_clip_end(
    struct vm_map_header    *map_header,
    vm_map_entry_t          entry,
    vm_map_offset_t         end)
{
    vm_map_entry_t  new_entry;

    /*
     * Create a new entry and insert it
     * AFTER the specified entry
     */

    if (entry->map_aligned) {
        assert(VM_MAP_PAGE_ALIGNED(end,
            VM_MAP_HDR_PAGE_MASK(map_header)));
    }

    /* Clone the entry; the clone initially covers the full range. */
    new_entry = _vm_map_entry_create(map_header);
    vm_map_entry_copy_full(new_entry, entry);

    /* Original keeps [vme_start, end); new entry covers [end, old vme_end). */
    assert(entry->vme_start < end);
    new_entry->vme_start = entry->vme_end = end;
    VME_OFFSET_SET(new_entry,
        VME_OFFSET(new_entry) + (end - entry->vme_start));
    assert(new_entry->vme_start < new_entry->vme_end);

    _vm_map_store_entry_link(map_header, entry, new_entry);

    /* Both halves now reference the same submap/object: take a ref. */
    if (entry->is_sub_map) {
        vm_map_reference(VME_SUBMAP(new_entry));
    } else {
        vm_object_reference(VME_OBJECT(new_entry));
    }
}
5687
5688
5689 /*
5690 * VM_MAP_RANGE_CHECK: [ internal use only ]
5691 *
5692 * Asserts that the starting and ending region
5693 * addresses fall within the valid range of the map.
5694 */
/*
 * Clamps "start" and "end" in place to the map's valid range.
 * May produce an empty range (start == end) but never an inverted
 * one.  Both "start" and "end" must be assignable lvalues.
 */
#define VM_MAP_RANGE_CHECK(map, start, end) \
	MACRO_BEGIN \
	if (start < vm_map_min(map)) \
	        start = vm_map_min(map); \
	if (end > vm_map_max(map)) \
	        end = vm_map_max(map); \
	if (start > end) \
	        start = end; \
	MACRO_END
5704
5705 /*
5706 * vm_map_range_check: [ internal use only ]
5707 *
5708 * Check that the region defined by the specified start and
5709 * end addresses are wholly contained within a single map
 * entry or set of adjacent map entries of the specified map,
5711 * i.e. the specified region contains no unmapped space.
5712 * If any or all of the region is unmapped, FALSE is returned.
5713 * Otherwise, TRUE is returned and if the output argument 'entry'
5714 * is not NULL it points to the map entry containing the start
5715 * of the region.
5716 *
5717 * The map is locked for reading on entry and is left locked.
5718 */
5719 static boolean_t
vm_map_range_check(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_entry_t * entry)5720 vm_map_range_check(
5721 vm_map_t map,
5722 vm_map_offset_t start,
5723 vm_map_offset_t end,
5724 vm_map_entry_t *entry)
5725 {
5726 vm_map_entry_t cur;
5727 vm_map_offset_t prev;
5728
5729 /*
5730 * Basic sanity checks first
5731 */
5732 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5733 return FALSE;
5734 }
5735
5736 /*
5737 * Check first if the region starts within a valid
5738 * mapping for the map.
5739 */
5740 if (!vm_map_lookup_entry(map, start, &cur)) {
5741 return FALSE;
5742 }
5743
5744 /*
5745 * Optimize for the case that the region is contained
5746 * in a single map entry.
5747 */
5748 if (entry != (vm_map_entry_t *) NULL) {
5749 *entry = cur;
5750 }
5751 if (end <= cur->vme_end) {
5752 return TRUE;
5753 }
5754
5755 /*
5756 * If the region is not wholly contained within a
5757 * single entry, walk the entries looking for holes.
5758 */
5759 prev = cur->vme_end;
5760 cur = cur->vme_next;
5761 while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5762 if (end <= cur->vme_end) {
5763 return TRUE;
5764 }
5765 prev = cur->vme_end;
5766 cur = cur->vme_next;
5767 }
5768 return FALSE;
5769 }
5770
5771 /*
5772 * vm_map_protect:
5773 *
5774 * Sets the protection of the specified address
5775 * region in the target map. If "set_max" is
5776 * specified, the maximum protection is to be set;
5777 * otherwise, only the current protection is affected.
5778 */
kern_return_t
vm_map_protect(
    vm_map_t        map,
    vm_map_offset_t start,
    vm_map_offset_t end,
    vm_prot_t       new_prot,
    boolean_t       set_max)
{
    vm_map_entry_t  current;
    vm_map_offset_t prev;
    vm_map_entry_t  entry;
    vm_prot_t       new_max;
    int             pmap_options = 0;
    kern_return_t   kr;

    if (new_prot & VM_PROT_COPY) {
        vm_map_offset_t         new_start;
        vm_prot_t               cur_prot, max_prot;
        vm_map_kernel_flags_t   kflags;

        /* LP64todo - see below */
        if (start >= map->max_offset) {
            return KERN_INVALID_ADDRESS;
        }

        /*
         * W^X policy: refuse a writable+executable request up front
         * when the platform policy says such requests must fail.
         */
        if ((new_prot & VM_PROT_ALLEXEC) &&
            map->pmap != kernel_pmap &&
            (vm_map_cs_enforcement(map)
#if XNU_TARGET_OS_OSX && __arm64__
            || !VM_MAP_IS_EXOTIC(map)
#endif /* XNU_TARGET_OS_OSX && __arm64__ */
            ) &&
            VM_MAP_POLICY_WX_FAIL(map)) {
            DTRACE_VM3(cs_wx,
                uint64_t, (uint64_t) start,
                uint64_t, (uint64_t) end,
                vm_prot_t, new_prot);
            printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
                proc_selfpid(),
                (get_bsdtask_info(current_task())
                ? proc_name_address(get_bsdtask_info(current_task()))
                : "?"),
                __FUNCTION__, __LINE__,
#if DEVELOPMENT || DEBUG
                (uint64_t)start,
                (uint64_t)end,
#else /* DEVELOPMENT || DEBUG */
                (uint64_t)0,
                (uint64_t)0,
#endif /* DEVELOPMENT || DEBUG */
                new_prot);
            return KERN_PROTECTION_FAILURE;
        }

        /*
         * Let vm_map_remap_extract() know that it will need to:
         * + make a copy of the mapping
         * + add VM_PROT_WRITE to the max protections
         * + remove any protections that are no longer allowed from the
         *   max protections (to avoid any WRITE/EXECUTE conflict, for
         *   example).
         * Note that "max_prot" is an IN/OUT parameter only for this
         * specific (VM_PROT_COPY) case.  It's usually an OUT parameter
         * only.
         */
        max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
        cur_prot = VM_PROT_NONE;
        kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
        kflags.vmkf_remap_prot_copy = true;
        new_start = start;
        kr = vm_map_remap(map,
            &new_start,
            end - start,
            0, /* mask */
            kflags,
            map,
            start,
            TRUE, /* copy-on-write remapping! */
            &cur_prot, /* IN/OUT */
            &max_prot, /* IN/OUT */
            VM_INHERIT_DEFAULT);
        if (kr != KERN_SUCCESS) {
            return kr;
        }
        new_prot &= ~VM_PROT_COPY;
    }

    vm_map_lock(map);

    /* LP64todo - remove this check when vm_map_commpage64()
     * no longer has to stuff in a map_entry for the commpage
     * above the map's max_offset.
     */
    if (start >= map->max_offset) {
        vm_map_unlock(map);
        return KERN_INVALID_ADDRESS;
    }

    while (1) {
        /*
         * Lookup the entry.  If it doesn't start in a valid
         * entry, return an error.
         */
        if (!vm_map_lookup_entry(map, start, &entry)) {
            vm_map_unlock(map);
            return KERN_INVALID_ADDRESS;
        }

        if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
            start = SUPERPAGE_ROUND_DOWN(start);
            continue;
        }
        break;
    }
    if (entry->superpage_size) {
        end = SUPERPAGE_ROUND_UP(end);
    }

    /*
     * Make a first pass to check for protection and address
     * violations.
     */

    current = entry;
    prev = current->vme_start;
    while ((current != vm_map_to_entry(map)) &&
        (current->vme_start < end)) {
        /*
         * If there is a hole, return an error.
         */
        if (current->vme_start != prev) {
            vm_map_unlock(map);
            return KERN_INVALID_ADDRESS;
        }

        new_max = current->max_protection;

#if defined(__x86_64__)
        /* Allow max mask to include execute prot bits if this map doesn't enforce CS */
        if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
            new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
        }
#elif CODE_SIGNING_MONITOR
        if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
            new_max |= VM_PROT_EXECUTE;
        }
#endif
        /* The requested protections must not exceed the entry's max. */
        if ((new_prot & new_max) != new_prot) {
            vm_map_unlock(map);
            return KERN_PROTECTION_FAILURE;
        }

        /*
         * JIT mappings whose protections are dictated by pmap policy
         * cannot be changed here.
         */
        if (current->used_for_jit &&
            pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
            vm_map_unlock(map);
            return KERN_PROTECTION_FAILURE;
        }

#if __arm64e__
        /* Disallow remapping hw assisted TPRO mappings */
        if (current->used_for_tpro) {
            vm_map_unlock(map);
            return KERN_PROTECTION_FAILURE;
        }
#endif /* __arm64e__ */


        /*
         * W^X check per entry: strip execute from the request (and
         * optionally fail outright, per policy) unless the entry is
         * a JIT region or otherwise exempt.
         */
        if ((new_prot & VM_PROT_WRITE) &&
            (new_prot & VM_PROT_ALLEXEC) &&
#if XNU_TARGET_OS_OSX
            map->pmap != kernel_pmap &&
            (vm_map_cs_enforcement(map)
#if __arm64__
            || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
            ) &&
#endif /* XNU_TARGET_OS_OSX */
#if CODE_SIGNING_MONITOR
            (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
#endif
            !(current->used_for_jit)) {
            DTRACE_VM3(cs_wx,
                uint64_t, (uint64_t) current->vme_start,
                uint64_t, (uint64_t) current->vme_end,
                vm_prot_t, new_prot);
            printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
                proc_selfpid(),
                (get_bsdtask_info(current_task())
                ? proc_name_address(get_bsdtask_info(current_task()))
                : "?"),
                __FUNCTION__, __LINE__,
#if DEVELOPMENT || DEBUG
                (uint64_t)current->vme_start,
                (uint64_t)current->vme_end,
#else /* DEVELOPMENT || DEBUG */
                (uint64_t)0,
                (uint64_t)0,
#endif /* DEVELOPMENT || DEBUG */
                new_prot);
            new_prot &= ~VM_PROT_ALLEXEC;
            if (VM_MAP_POLICY_WX_FAIL(map)) {
                vm_map_unlock(map);
                return KERN_PROTECTION_FAILURE;
            }
        }

        /*
         * If the task has requested executable lockdown,
         * deny both:
         * - adding executable protections OR
         * - adding write protections to an existing executable mapping.
         */
        if (map->map_disallow_new_exec == TRUE) {
            if ((new_prot & VM_PROT_ALLEXEC) ||
                ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
                vm_map_unlock(map);
                return KERN_PROTECTION_FAILURE;
            }
        }

        prev = current->vme_end;
        current = current->vme_next;
    }

#if __arm64__
    if (end > prev &&
        end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
        vm_map_entry_t prev_entry;

        prev_entry = current->vme_prev;
        if (prev_entry != vm_map_to_entry(map) &&
            !prev_entry->map_aligned &&
            (vm_map_round_page(prev_entry->vme_end,
            VM_MAP_PAGE_MASK(map))
            == end)) {
            /*
             * The last entry in our range is not "map-aligned"
             * but it would have reached all the way to "end"
             * if it had been map-aligned, so this is not really
             * a hole in the range and we can proceed.
             */
            prev = end;
        }
    }
#endif /* __arm64__ */

    /* The walked entries must have covered the range up to "end". */
    if (end > prev) {
        vm_map_unlock(map);
        return KERN_INVALID_ADDRESS;
    }

    /*
     * Go back and fix up protections.
     * Clip to start here if the range starts within
     * the entry.
     */

    current = entry;
    if (current != vm_map_to_entry(map)) {
        /* clip and unnest if necessary */
        vm_map_clip_start(map, current, start);
    }

    while ((current != vm_map_to_entry(map)) &&
        (current->vme_start < end)) {
        vm_prot_t old_prot;

        vm_map_clip_end(map, current, end);

#if DEVELOPMENT || DEBUG
        if (current->csm_associated && vm_log_xnu_user_debug) {
            printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
                proc_selfpid(),
                (get_bsdtask_info(current_task())
                ? proc_name_address(get_bsdtask_info(current_task()))
                : "?"),
                __FUNCTION__,
                (uint64_t)start,
                (uint64_t)end,
                new_prot,
                map, current,
                current->vme_start,
                current->vme_end,
                current->protection,
                current->max_protection);
        }
#endif /* DEVELOPMENT || DEBUG */

        if (current->is_sub_map) {
            /* clipping did unnest if needed */
            assert(!current->use_pmap);
        }

        old_prot = current->protection;

        if (set_max) {
            current->max_protection = new_prot;
            /* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
            current->protection = (new_prot & old_prot);
        } else {
            current->protection = new_prot;
        }

#if CODE_SIGNING_MONITOR
        if (!current->vme_xnu_user_debug &&
            /* a !csm_associated mapping becoming executable */
            ((!current->csm_associated &&
            !(old_prot & VM_PROT_EXECUTE) &&
            (current->protection & VM_PROT_EXECUTE))
            ||
            /* a csm_associated mapping becoming writable */
            (current->csm_associated &&
            !(old_prot & VM_PROT_WRITE) &&
            (current->protection & VM_PROT_WRITE)))) {
            /*
             * This mapping has not already been marked as
             * "user_debug" and it is either:
             * 1. not code-signing-monitored and becoming executable
             * 2. code-signing-monitored and becoming writable,
             * so inform the CodeSigningMonitor and mark the
             * mapping as "user_debug" if appropriate.
             */
            vm_map_kernel_flags_t vmk_flags;
            vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
            /* pretend it's a vm_protect(VM_PROT_COPY)... */
            vmk_flags.vmkf_remap_prot_copy = true;
            kr = vm_map_entry_cs_associate(map, current, vmk_flags);
#if DEVELOPMENT || DEBUG
            if (vm_log_xnu_user_debug) {
                printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
                    proc_selfpid(),
                    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
                    __FUNCTION__, __LINE__,
                    map, current,
                    current->vme_start, current->vme_end,
                    old_prot, current->protection,
                    kr, current->vme_xnu_user_debug);
            }
#endif /* DEVELOPMENT || DEBUG */
        }
#endif /* CODE_SIGNING_MONITOR */

        /*
         * Update physical map if necessary.
         * If the request is to turn off write protection,
         * we won't do it for real (in pmap).  This is because
         * it would cause copy-on-write to fail.  We've already
         * set, the new protection in the map, so if a
         * write-protect fault occurred, it will be fixed up
         * properly, COW or not.
         */
        if (current->protection != old_prot) {
            /* Look one level in: we support nested pmaps */
            /* from mapped submaps which are direct entries */
            /* in our map */

            vm_prot_t prot;

            prot = current->protection;
            if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
                prot &= ~VM_PROT_WRITE;
            } else {
                assert(!VME_OBJECT(current)->code_signed);
                assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
                if (prot & VM_PROT_WRITE) {
                    /*
                     * For write requests on the
                     * compressor, we will ask the
                     * pmap layer to prevent us from
                     * taking a write fault when we
                     * attempt to access the mapping
                     * next.
                     */
                    pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
                }
            }

            if (override_nx(map, VME_ALIAS(current)) && prot) {
                prot |= VM_PROT_EXECUTE;
            }

#if DEVELOPMENT || DEBUG
            if (!(old_prot & VM_PROT_EXECUTE) &&
                (prot & VM_PROT_EXECUTE) &&
                panic_on_unsigned_execute &&
                (proc_selfcsflags() & CS_KILL)) {
                panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
            }
#endif /* DEVELOPMENT || DEBUG */

            if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
                if (current->wired_count) {
                    panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
                        map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
                }

                /* If the pmap layer cares about this
                 * protection type, force a fault for
                 * each page so that vm_fault will
                 * repopulate the page with the full
                 * set of protections.
                 */
                /*
                 * TODO: We don't seem to need this,
                 * but this is due to an internal
                 * implementation detail of
                 * pmap_protect.  Do we want to rely
                 * on this?
                 */
                prot = VM_PROT_NONE;
            }

            if (current->is_sub_map && current->use_pmap) {
                pmap_protect(VME_SUBMAP(current)->pmap,
                    current->vme_start,
                    current->vme_end,
                    prot);
            } else {
                pmap_protect_options(map->pmap,
                    current->vme_start,
                    current->vme_end,
                    prot,
                    pmap_options,
                    NULL);
            }
        }
        current = current->vme_next;
    }

    /* Try to coalesce entries left adjacent by the clipping above. */
    current = entry;
    while ((current != vm_map_to_entry(map)) &&
        (current->vme_start <= end)) {
        vm_map_simplify_entry(map, current);
        current = current->vme_next;
    }

    vm_map_unlock(map);
    return KERN_SUCCESS;
}
6218
6219 /*
6220 * vm_map_inherit:
6221 *
6222 * Sets the inheritance of the specified address
6223 * range in the target map. Inheritance
6224 * affects how the map will be shared with
6225 * child maps at the time of vm_map_fork.
6226 */
kern_return_t
vm_map_inherit(
    vm_map_t        map,
    vm_map_offset_t start,
    vm_map_offset_t end,
    vm_inherit_t    new_inheritance)
{
    vm_map_entry_t  entry;
    vm_map_entry_t  temp_entry;

    vm_map_lock(map);

    VM_MAP_RANGE_CHECK(map, start, end);

    if (vm_map_lookup_entry(map, start, &temp_entry)) {
        entry = temp_entry;
    } else {
        /*
         * "start" is in a hole: the lookup left temp_entry at the
         * entry preceding the hole, so begin with the next one.
         */
        temp_entry = temp_entry->vme_next;
        entry = temp_entry;
    }

    /* first check entire range for submaps which can't support the */
    /* given inheritance. */
    while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
        if (entry->is_sub_map) {
            if (new_inheritance == VM_INHERIT_COPY) {
                vm_map_unlock(map);
                return KERN_INVALID_ARGUMENT;
            }
        }

        entry = entry->vme_next;
    }

    entry = temp_entry;
    if (entry != vm_map_to_entry(map)) {
        /* clip and unnest if necessary */
        vm_map_clip_start(map, entry, start);
    }

    /* Second pass: clip to the range and apply the new inheritance. */
    while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
        vm_map_clip_end(map, entry, end);
        if (entry->is_sub_map) {
            /* clip did unnest if needed */
            assert(!entry->use_pmap);
        }

        entry->inheritance = new_inheritance;

        entry = entry->vme_next;
    }

    vm_map_unlock(map);
    return KERN_SUCCESS;
}
6282
6283 /*
6284 * Update the accounting for the amount of wired memory in this map. If the user has
6285 * exceeded the defined limits, then we fail. Wiring on behalf of the kernel never fails.
6286 */
6287
6288 static kern_return_t
add_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6289 add_wire_counts(
6290 vm_map_t map,
6291 vm_map_entry_t entry,
6292 boolean_t user_wire)
6293 {
6294 vm_map_size_t size;
6295
6296 if (user_wire) {
6297 unsigned int total_wire_count = vm_page_wire_count + vm_lopage_free_count;
6298
6299 /*
6300 * We're wiring memory at the request of the user. Check if this is the first time the user is wiring
6301 * this map entry.
6302 */
6303
6304 if (entry->user_wired_count == 0) {
6305 size = entry->vme_end - entry->vme_start;
6306
6307 /*
6308 * Since this is the first time the user is wiring this map entry, check to see if we're
6309 * exceeding the user wire limits. There is a per map limit which is the smaller of either
6310 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value. There is also
6311 * a system-wide limit on the amount of memory all users can wire. If the user is over either
6312 * limit, then we fail.
6313 */
6314
6315 if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6316 size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6317 if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6318 #if DEVELOPMENT || DEBUG
6319 if (panic_on_mlock_failure) {
6320 panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6321 }
6322 #endif /* DEVELOPMENT || DEBUG */
6323 os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6324 } else {
6325 os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6326 #if DEVELOPMENT || DEBUG
6327 if (panic_on_mlock_failure) {
6328 panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6329 }
6330 #endif /* DEVELOPMENT || DEBUG */
6331 }
6332 return KERN_RESOURCE_SHORTAGE;
6333 }
6334
6335 /*
6336 * The first time the user wires an entry, we also increment the wired_count and add this to
6337 * the total that has been wired in the map.
6338 */
6339
6340 if (entry->wired_count >= MAX_WIRE_COUNT) {
6341 return KERN_FAILURE;
6342 }
6343
6344 entry->wired_count++;
6345 map->user_wire_size += size;
6346 }
6347
6348 if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6349 return KERN_FAILURE;
6350 }
6351
6352 entry->user_wired_count++;
6353 } else {
6354 /*
6355 * The kernel's wiring the memory. Just bump the count and continue.
6356 */
6357
6358 if (entry->wired_count >= MAX_WIRE_COUNT) {
6359 panic("vm_map_wire: too many wirings");
6360 }
6361
6362 entry->wired_count++;
6363 }
6364
6365 return KERN_SUCCESS;
6366 }
6367
6368 /*
6369 * Update the memory wiring accounting now that the given map entry is being unwired.
6370 */
6371
6372 static void
subtract_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6373 subtract_wire_counts(
6374 vm_map_t map,
6375 vm_map_entry_t entry,
6376 boolean_t user_wire)
6377 {
6378 if (user_wire) {
6379 /*
6380 * We're unwiring memory at the request of the user. See if we're removing the last user wire reference.
6381 */
6382
6383 if (entry->user_wired_count == 1) {
6384 /*
6385 * We're removing the last user wire reference. Decrement the wired_count and the total
6386 * user wired memory for this map.
6387 */
6388
6389 assert(entry->wired_count >= 1);
6390 entry->wired_count--;
6391 map->user_wire_size -= entry->vme_end - entry->vme_start;
6392 }
6393
6394 assert(entry->user_wired_count >= 1);
6395 entry->user_wired_count--;
6396 } else {
6397 /*
6398 * The kernel is unwiring the memory. Just update the count.
6399 */
6400
6401 assert(entry->wired_count >= 1);
6402 entry->wired_count--;
6403 }
6404 }
6405
6406 int cs_executable_wire = 0;
6407
6408 /*
6409 * vm_map_wire:
6410 *
6411 * Sets the pageability of the specified address range in the
6412 * target map as wired. Regions specified as not pageable require
6413 * locked-down physical memory and physical page maps. The
6414 * access_type variable indicates types of accesses that must not
6415 * generate page faults. This is checked against protection of
6416 * memory being locked-down.
6417 *
6418 * The map must not be locked, but a reference must remain to the
6419 * map throughout the call.
6420 */
6421 static kern_return_t
vm_map_wire_nested(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire,pmap_t map_pmap,vm_map_offset_t pmap_addr,ppnum_t * physpage_p)6422 vm_map_wire_nested(
6423 vm_map_t map,
6424 vm_map_offset_t start,
6425 vm_map_offset_t end,
6426 vm_prot_t caller_prot,
6427 vm_tag_t tag,
6428 boolean_t user_wire,
6429 pmap_t map_pmap,
6430 vm_map_offset_t pmap_addr,
6431 ppnum_t *physpage_p)
6432 {
6433 vm_map_entry_t entry;
6434 vm_prot_t access_type;
6435 struct vm_map_entry *first_entry, tmp_entry;
6436 vm_map_t real_map;
6437 vm_map_offset_t s, e;
6438 kern_return_t rc;
6439 boolean_t need_wakeup;
6440 boolean_t main_map = FALSE;
6441 wait_interrupt_t interruptible_state;
6442 thread_t cur_thread;
6443 unsigned int last_timestamp;
6444 vm_map_size_t size;
6445 boolean_t wire_and_extract;
6446 vm_prot_t extra_prots;
6447
6448 extra_prots = VM_PROT_COPY;
6449 extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6450 #if XNU_TARGET_OS_OSX
6451 if (map->pmap == kernel_pmap ||
6452 !vm_map_cs_enforcement(map)) {
6453 extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6454 }
6455 #endif /* XNU_TARGET_OS_OSX */
6456 #if CODE_SIGNING_MONITOR
6457 if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
6458 extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6459 }
6460 #endif /* CODE_SIGNING_MONITOR */
6461
6462 access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6463
6464 wire_and_extract = FALSE;
6465 if (physpage_p != NULL) {
6466 /*
6467 * The caller wants the physical page number of the
6468 * wired page. We return only one physical page number
6469 * so this works for only one page at a time.
6470 */
6471 if ((end - start) != PAGE_SIZE) {
6472 return KERN_INVALID_ARGUMENT;
6473 }
6474 wire_and_extract = TRUE;
6475 *physpage_p = 0;
6476 }
6477
6478 vm_map_lock(map);
6479 if (map_pmap == NULL) {
6480 main_map = TRUE;
6481 }
6482 last_timestamp = map->timestamp;
6483
6484 VM_MAP_RANGE_CHECK(map, start, end);
6485 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6486 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6487
6488 if (start == end) {
6489 /* We wired what the caller asked for, zero pages */
6490 vm_map_unlock(map);
6491 return KERN_SUCCESS;
6492 }
6493
6494 need_wakeup = FALSE;
6495 cur_thread = current_thread();
6496
6497 s = start;
6498 rc = KERN_SUCCESS;
6499
6500 if (vm_map_lookup_entry(map, s, &first_entry)) {
6501 entry = first_entry;
6502 /*
6503 * vm_map_clip_start will be done later.
6504 * We don't want to unnest any nested submaps here !
6505 */
6506 } else {
6507 /* Start address is not in map */
6508 rc = KERN_INVALID_ADDRESS;
6509 goto done;
6510 }
6511
6512 while ((entry != vm_map_to_entry(map)) && (s < end)) {
6513 /*
6514 * At this point, we have wired from "start" to "s".
6515 * We still need to wire from "s" to "end".
6516 *
6517 * "entry" hasn't been clipped, so it could start before "s"
6518 * and/or end after "end".
6519 */
6520
6521 /* "e" is how far we want to wire in this entry */
6522 e = entry->vme_end;
6523 if (e > end) {
6524 e = end;
6525 }
6526
6527 /*
6528 * If another thread is wiring/unwiring this entry then
6529 * block after informing other thread to wake us up.
6530 */
6531 if (entry->in_transition) {
6532 wait_result_t wait_result;
6533
6534 /*
6535 * We have not clipped the entry. Make sure that
6536 * the start address is in range so that the lookup
6537 * below will succeed.
6538 * "s" is the current starting point: we've already
6539 * wired from "start" to "s" and we still have
6540 * to wire from "s" to "end".
6541 */
6542
6543 entry->needs_wakeup = TRUE;
6544
6545 /*
6546 * wake up anybody waiting on entries that we have
6547 * already wired.
6548 */
6549 if (need_wakeup) {
6550 vm_map_entry_wakeup(map);
6551 need_wakeup = FALSE;
6552 }
6553 /*
6554 * User wiring is interruptible
6555 */
6556 wait_result = vm_map_entry_wait(map,
6557 (user_wire) ? THREAD_ABORTSAFE :
6558 THREAD_UNINT);
6559 if (user_wire && wait_result == THREAD_INTERRUPTED) {
6560 /*
6561 * undo the wirings we have done so far
6562 * We do not clear the needs_wakeup flag,
6563 * because we cannot tell if we were the
6564 * only one waiting.
6565 */
6566 rc = KERN_FAILURE;
6567 goto done;
6568 }
6569
6570 /*
6571 * Cannot avoid a lookup here. reset timestamp.
6572 */
6573 last_timestamp = map->timestamp;
6574
6575 /*
6576 * The entry could have been clipped, look it up again.
6577 * Worse that can happen is, it may not exist anymore.
6578 */
6579 if (!vm_map_lookup_entry(map, s, &first_entry)) {
6580 /*
6581 * User: undo everything upto the previous
6582 * entry. let vm_map_unwire worry about
6583 * checking the validity of the range.
6584 */
6585 rc = KERN_FAILURE;
6586 goto done;
6587 }
6588 entry = first_entry;
6589 continue;
6590 }
6591
6592 if (entry->is_sub_map) {
6593 vm_map_offset_t sub_start;
6594 vm_map_offset_t sub_end;
6595 vm_map_offset_t local_start;
6596 vm_map_offset_t local_end;
6597 pmap_t pmap;
6598
6599 if (wire_and_extract) {
6600 /*
6601 * Wiring would result in copy-on-write
6602 * which would not be compatible with
6603 * the sharing we have with the original
6604 * provider of this memory.
6605 */
6606 rc = KERN_INVALID_ARGUMENT;
6607 goto done;
6608 }
6609
6610 vm_map_clip_start(map, entry, s);
6611 vm_map_clip_end(map, entry, end);
6612
6613 sub_start = VME_OFFSET(entry);
6614 sub_end = entry->vme_end;
6615 sub_end += VME_OFFSET(entry) - entry->vme_start;
6616
6617 local_end = entry->vme_end;
6618 if (map_pmap == NULL) {
6619 vm_object_t object;
6620 vm_object_offset_t offset;
6621 vm_prot_t prot;
6622 boolean_t wired;
6623 vm_map_entry_t local_entry;
6624 vm_map_version_t version;
6625 vm_map_t lookup_map;
6626
6627 if (entry->use_pmap) {
6628 pmap = VME_SUBMAP(entry)->pmap;
6629 /* ppc implementation requires that */
6630 /* submaps pmap address ranges line */
6631 /* up with parent map */
6632 #ifdef notdef
6633 pmap_addr = sub_start;
6634 #endif
6635 pmap_addr = s;
6636 } else {
6637 pmap = map->pmap;
6638 pmap_addr = s;
6639 }
6640
6641 if (entry->wired_count) {
6642 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6643 goto done;
6644 }
6645
6646 /*
6647 * The map was not unlocked:
6648 * no need to goto re-lookup.
6649 * Just go directly to next entry.
6650 */
6651 entry = entry->vme_next;
6652 s = entry->vme_start;
6653 continue;
6654 }
6655
6656 /* call vm_map_lookup_and_lock_object to */
6657 /* cause any needs copy to be */
6658 /* evaluated */
6659 local_start = entry->vme_start;
6660 lookup_map = map;
6661 vm_map_lock_write_to_read(map);
6662 rc = vm_map_lookup_and_lock_object(
6663 &lookup_map, local_start,
6664 (access_type | extra_prots),
6665 OBJECT_LOCK_EXCLUSIVE,
6666 &version, &object,
6667 &offset, &prot, &wired,
6668 NULL,
6669 &real_map, NULL);
6670 if (rc != KERN_SUCCESS) {
6671 vm_map_unlock_read(lookup_map);
6672 assert(map_pmap == NULL);
6673 vm_map_unwire(map, start,
6674 s, user_wire);
6675 return rc;
6676 }
6677 vm_object_unlock(object);
6678 if (real_map != lookup_map) {
6679 vm_map_unlock(real_map);
6680 }
6681 vm_map_unlock_read(lookup_map);
6682 vm_map_lock(map);
6683
6684 /* we unlocked, so must re-lookup */
6685 if (!vm_map_lookup_entry(map,
6686 local_start,
6687 &local_entry)) {
6688 rc = KERN_FAILURE;
6689 goto done;
6690 }
6691
6692 /*
6693 * entry could have been "simplified",
6694 * so re-clip
6695 */
6696 entry = local_entry;
6697 assert(s == local_start);
6698 vm_map_clip_start(map, entry, s);
6699 vm_map_clip_end(map, entry, end);
6700 /* re-compute "e" */
6701 e = entry->vme_end;
6702 if (e > end) {
6703 e = end;
6704 }
6705
6706 /* did we have a change of type? */
6707 if (!entry->is_sub_map) {
6708 last_timestamp = map->timestamp;
6709 continue;
6710 }
6711 } else {
6712 local_start = entry->vme_start;
6713 pmap = map_pmap;
6714 }
6715
6716 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6717 goto done;
6718 }
6719
6720 entry->in_transition = TRUE;
6721
6722 vm_map_unlock(map);
6723 rc = vm_map_wire_nested(VME_SUBMAP(entry),
6724 sub_start, sub_end,
6725 caller_prot, tag,
6726 user_wire, pmap, pmap_addr,
6727 NULL);
6728 vm_map_lock(map);
6729
6730 /*
6731 * Find the entry again. It could have been clipped
6732 * after we unlocked the map.
6733 */
6734 if (!vm_map_lookup_entry(map, local_start,
6735 &first_entry)) {
6736 panic("vm_map_wire: re-lookup failed");
6737 }
6738 entry = first_entry;
6739
6740 assert(local_start == s);
6741 /* re-compute "e" */
6742 e = entry->vme_end;
6743 if (e > end) {
6744 e = end;
6745 }
6746
6747 last_timestamp = map->timestamp;
6748 while ((entry != vm_map_to_entry(map)) &&
6749 (entry->vme_start < e)) {
6750 assert(entry->in_transition);
6751 entry->in_transition = FALSE;
6752 if (entry->needs_wakeup) {
6753 entry->needs_wakeup = FALSE;
6754 need_wakeup = TRUE;
6755 }
6756 if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6757 subtract_wire_counts(map, entry, user_wire);
6758 }
6759 entry = entry->vme_next;
6760 }
6761 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
6762 goto done;
6763 }
6764
6765 /* no need to relookup again */
6766 s = entry->vme_start;
6767 continue;
6768 }
6769
6770 /*
6771 * If this entry is already wired then increment
6772 * the appropriate wire reference count.
6773 */
6774 if (entry->wired_count) {
6775 if ((entry->protection & access_type) != access_type) {
6776 /* found a protection problem */
6777
6778 /*
6779 * XXX FBDP
6780 * We should always return an error
6781 * in this case but since we didn't
6782 * enforce it before, let's do
6783 * it only for the new "wire_and_extract"
6784 * code path for now...
6785 */
6786 if (wire_and_extract) {
6787 rc = KERN_PROTECTION_FAILURE;
6788 goto done;
6789 }
6790 }
6791
6792 /*
6793 * entry is already wired down, get our reference
6794 * after clipping to our range.
6795 */
6796 vm_map_clip_start(map, entry, s);
6797 vm_map_clip_end(map, entry, end);
6798
6799 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6800 goto done;
6801 }
6802
6803 if (wire_and_extract) {
6804 vm_object_t object;
6805 vm_object_offset_t offset;
6806 vm_page_t m;
6807
6808 /*
6809 * We don't have to "wire" the page again
6810 * bit we still have to "extract" its
6811 * physical page number, after some sanity
6812 * checks.
6813 */
6814 assert((entry->vme_end - entry->vme_start)
6815 == PAGE_SIZE);
6816 assert(!entry->needs_copy);
6817 assert(!entry->is_sub_map);
6818 assert(VME_OBJECT(entry));
6819 if (((entry->vme_end - entry->vme_start)
6820 != PAGE_SIZE) ||
6821 entry->needs_copy ||
6822 entry->is_sub_map ||
6823 VME_OBJECT(entry) == VM_OBJECT_NULL) {
6824 rc = KERN_INVALID_ARGUMENT;
6825 goto done;
6826 }
6827
6828 object = VME_OBJECT(entry);
6829 offset = VME_OFFSET(entry);
6830 /* need exclusive lock to update m->dirty */
6831 if (entry->protection & VM_PROT_WRITE) {
6832 vm_object_lock(object);
6833 } else {
6834 vm_object_lock_shared(object);
6835 }
6836 m = vm_page_lookup(object, offset);
6837 assert(m != VM_PAGE_NULL);
6838 assert(VM_PAGE_WIRED(m));
6839 if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
6840 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6841 if (entry->protection & VM_PROT_WRITE) {
6842 vm_object_lock_assert_exclusive(
6843 object);
6844 m->vmp_dirty = TRUE;
6845 }
6846 } else {
6847 /* not already wired !? */
6848 *physpage_p = 0;
6849 }
6850 vm_object_unlock(object);
6851 }
6852
6853 /* map was not unlocked: no need to relookup */
6854 entry = entry->vme_next;
6855 s = entry->vme_start;
6856 continue;
6857 }
6858
6859 /*
6860 * Unwired entry or wire request transmitted via submap
6861 */
6862
6863 /*
6864 * Wiring would copy the pages to the shadow object.
6865 * The shadow object would not be code-signed so
6866 * attempting to execute code from these copied pages
6867 * would trigger a code-signing violation.
6868 */
6869
6870 if ((entry->protection & VM_PROT_EXECUTE)
6871 #if XNU_TARGET_OS_OSX
6872 &&
6873 map->pmap != kernel_pmap &&
6874 (vm_map_cs_enforcement(map)
6875 #if __arm64__
6876 || !VM_MAP_IS_EXOTIC(map)
6877 #endif /* __arm64__ */
6878 )
6879 #endif /* XNU_TARGET_OS_OSX */
6880 #if CODE_SIGNING_MONITOR
6881 &&
6882 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
6883 #endif
6884 ) {
6885 #if MACH_ASSERT
6886 printf("pid %d[%s] wiring executable range from "
6887 "0x%llx to 0x%llx: rejected to preserve "
6888 "code-signing\n",
6889 proc_selfpid(),
6890 (get_bsdtask_info(current_task())
6891 ? proc_name_address(get_bsdtask_info(current_task()))
6892 : "?"),
6893 (uint64_t) entry->vme_start,
6894 (uint64_t) entry->vme_end);
6895 #endif /* MACH_ASSERT */
6896 DTRACE_VM2(cs_executable_wire,
6897 uint64_t, (uint64_t)entry->vme_start,
6898 uint64_t, (uint64_t)entry->vme_end);
6899 cs_executable_wire++;
6900 rc = KERN_PROTECTION_FAILURE;
6901 goto done;
6902 }
6903
6904 /*
6905 * Perform actions of vm_map_lookup that need the write
6906 * lock on the map: create a shadow object for a
6907 * copy-on-write region, or an object for a zero-fill
6908 * region.
6909 */
6910 size = entry->vme_end - entry->vme_start;
6911 /*
6912 * If wiring a copy-on-write page, we need to copy it now
6913 * even if we're only (currently) requesting read access.
6914 * This is aggressive, but once it's wired we can't move it.
6915 */
6916 if (entry->needs_copy) {
6917 if (wire_and_extract) {
6918 /*
6919 * We're supposed to share with the original
6920 * provider so should not be "needs_copy"
6921 */
6922 rc = KERN_INVALID_ARGUMENT;
6923 goto done;
6924 }
6925
6926 VME_OBJECT_SHADOW(entry, size,
6927 vm_map_always_shadow(map));
6928 entry->needs_copy = FALSE;
6929 } else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6930 if (wire_and_extract) {
6931 /*
6932 * We're supposed to share with the original
6933 * provider so should already have an object.
6934 */
6935 rc = KERN_INVALID_ARGUMENT;
6936 goto done;
6937 }
6938 VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
6939 VME_OFFSET_SET(entry, (vm_object_offset_t)0);
6940 assert(entry->use_pmap);
6941 } else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6942 if (wire_and_extract) {
6943 /*
6944 * We're supposed to share with the original
6945 * provider so should not be COPY_SYMMETRIC.
6946 */
6947 rc = KERN_INVALID_ARGUMENT;
6948 goto done;
6949 }
6950 /*
6951 * Force an unrequested "copy-on-write" but only for
6952 * the range we're wiring.
6953 */
6954 // printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
6955 vm_map_clip_start(map, entry, s);
6956 vm_map_clip_end(map, entry, end);
6957 /* recompute "size" */
6958 size = entry->vme_end - entry->vme_start;
6959 /* make a shadow object */
6960 vm_object_t orig_object;
6961 vm_object_offset_t orig_offset;
6962 orig_object = VME_OBJECT(entry);
6963 orig_offset = VME_OFFSET(entry);
6964 VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
6965 if (VME_OBJECT(entry) != orig_object) {
6966 /*
6967 * This mapping has not been shared (or it would be
6968 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
6969 * not been copied-on-write (or it would be marked
6970 * as "needs_copy" and would have been handled above
6971 * and also already write-protected).
6972 * We still need to write-protect here to prevent
6973 * other threads from modifying these pages while
6974 * we're in the process of copying and wiring
6975 * the copied pages.
6976 * Since the mapping is neither shared nor COWed,
6977 * we only need to write-protect the PTEs for this
6978 * mapping.
6979 */
6980 vm_object_pmap_protect(orig_object,
6981 orig_offset,
6982 size,
6983 map->pmap,
6984 VM_MAP_PAGE_SIZE(map),
6985 entry->vme_start,
6986 entry->protection & ~VM_PROT_WRITE);
6987 }
6988 }
6989 if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6990 /*
6991 * Make the object COPY_DELAY to get a stable object
6992 * to wire.
6993 * That should avoid creating long shadow chains while
6994 * wiring/unwiring the same range repeatedly.
6995 * That also prevents part of the object from being
6996 * wired while another part is "needs_copy", which
6997 * could result in conflicting rules wrt copy-on-write.
6998 */
6999 vm_object_t object;
7000
7001 object = VME_OBJECT(entry);
7002 vm_object_lock(object);
7003 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7004 assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
7005 "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
7006 object, (uint64_t)object->vo_size,
7007 entry,
7008 (uint64_t)entry->vme_start,
7009 (uint64_t)entry->vme_end,
7010 (uint64_t)VME_OFFSET(entry),
7011 (uint64_t)size);
7012 assertf(object->ref_count == 1,
7013 "object %p ref_count %d\n",
7014 object, object->ref_count);
7015 assertf(!entry->needs_copy,
7016 "entry %p\n", entry);
7017 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7018 object->true_share = TRUE;
7019 }
7020 vm_object_unlock(object);
7021 }
7022
7023 vm_map_clip_start(map, entry, s);
7024 vm_map_clip_end(map, entry, end);
7025
7026 /* re-compute "e" */
7027 e = entry->vme_end;
7028 if (e > end) {
7029 e = end;
7030 }
7031
7032 /*
7033 * Check for holes and protection mismatch.
7034 * Holes: Next entry should be contiguous unless this
7035 * is the end of the region.
7036 * Protection: Access requested must be allowed, unless
7037 * wiring is by protection class
7038 */
7039 if ((entry->vme_end < end) &&
7040 ((entry->vme_next == vm_map_to_entry(map)) ||
7041 (entry->vme_next->vme_start > entry->vme_end))) {
7042 /* found a hole */
7043 rc = KERN_INVALID_ADDRESS;
7044 goto done;
7045 }
7046 if ((entry->protection & access_type) != access_type) {
7047 /* found a protection problem */
7048 rc = KERN_PROTECTION_FAILURE;
7049 goto done;
7050 }
7051
7052 assert(entry->wired_count == 0 && entry->user_wired_count == 0);
7053
7054 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
7055 goto done;
7056 }
7057
7058 entry->in_transition = TRUE;
7059
7060 /*
7061 * This entry might get split once we unlock the map.
7062 * In vm_fault_wire(), we need the current range as
7063 * defined by this entry. In order for this to work
7064 * along with a simultaneous clip operation, we make a
7065 * temporary copy of this entry and use that for the
7066 * wiring. Note that the underlying objects do not
7067 * change during a clip.
7068 */
7069 tmp_entry = *entry;
7070
7071 /*
7072 * The in_transition state guarentees that the entry
7073 * (or entries for this range, if split occured) will be
7074 * there when the map lock is acquired for the second time.
7075 */
7076 vm_map_unlock(map);
7077
7078 if (!user_wire && cur_thread != THREAD_NULL) {
7079 interruptible_state = thread_interrupt_level(THREAD_UNINT);
7080 } else {
7081 interruptible_state = THREAD_UNINT;
7082 }
7083
7084 if (map_pmap) {
7085 rc = vm_fault_wire(map,
7086 &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
7087 physpage_p);
7088 } else {
7089 rc = vm_fault_wire(map,
7090 &tmp_entry, caller_prot, tag, map->pmap,
7091 tmp_entry.vme_start,
7092 physpage_p);
7093 }
7094
7095 if (!user_wire && cur_thread != THREAD_NULL) {
7096 thread_interrupt_level(interruptible_state);
7097 }
7098
7099 vm_map_lock(map);
7100
7101 if (last_timestamp + 1 != map->timestamp) {
7102 /*
7103 * Find the entry again. It could have been clipped
7104 * after we unlocked the map.
7105 */
7106 if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7107 &first_entry)) {
7108 panic("vm_map_wire: re-lookup failed");
7109 }
7110
7111 entry = first_entry;
7112 }
7113
7114 last_timestamp = map->timestamp;
7115
7116 while ((entry != vm_map_to_entry(map)) &&
7117 (entry->vme_start < tmp_entry.vme_end)) {
7118 assert(entry->in_transition);
7119 entry->in_transition = FALSE;
7120 if (entry->needs_wakeup) {
7121 entry->needs_wakeup = FALSE;
7122 need_wakeup = TRUE;
7123 }
7124 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
7125 subtract_wire_counts(map, entry, user_wire);
7126 }
7127 entry = entry->vme_next;
7128 }
7129
7130 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
7131 goto done;
7132 }
7133
7134 if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
7135 (tmp_entry.vme_end != end) && /* AND, we are not at the end of the requested range */
7136 (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
7137 /* found a "new" hole */
7138 s = tmp_entry.vme_end;
7139 rc = KERN_INVALID_ADDRESS;
7140 goto done;
7141 }
7142
7143 s = entry->vme_start;
7144 } /* end while loop through map entries */
7145
7146 done:
7147 if (rc == KERN_SUCCESS) {
7148 /* repair any damage we may have made to the VM map */
7149 vm_map_simplify_range(map, start, end);
7150 }
7151
7152 vm_map_unlock(map);
7153
7154 /*
7155 * wake up anybody waiting on entries we wired.
7156 */
7157 if (need_wakeup) {
7158 vm_map_entry_wakeup(map);
7159 }
7160
7161 if (rc != KERN_SUCCESS) {
7162 /* undo what has been wired so far */
7163 vm_map_unwire_nested(map, start, s, user_wire,
7164 map_pmap, pmap_addr);
7165 if (physpage_p) {
7166 *physpage_p = 0;
7167 }
7168 }
7169
7170 return rc;
7171 }
7172
7173 kern_return_t
vm_map_wire_external(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,boolean_t user_wire)7174 vm_map_wire_external(
7175 vm_map_t map,
7176 vm_map_offset_t start,
7177 vm_map_offset_t end,
7178 vm_prot_t caller_prot,
7179 boolean_t user_wire)
7180 {
7181 kern_return_t kret;
7182
7183 kret = vm_map_wire_nested(map, start, end, caller_prot, vm_tag_bt(),
7184 user_wire, (pmap_t)NULL, 0, NULL);
7185 return kret;
7186 }
7187
7188 kern_return_t
vm_map_wire_kernel(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire)7189 vm_map_wire_kernel(
7190 vm_map_t map,
7191 vm_map_offset_t start,
7192 vm_map_offset_t end,
7193 vm_prot_t caller_prot,
7194 vm_tag_t tag,
7195 boolean_t user_wire)
7196 {
7197 kern_return_t kret;
7198
7199 kret = vm_map_wire_nested(map, start, end, caller_prot, tag,
7200 user_wire, (pmap_t)NULL, 0, NULL);
7201 return kret;
7202 }
7203
7204 kern_return_t
vm_map_wire_and_extract_external(vm_map_t map,vm_map_offset_t start,vm_prot_t caller_prot,boolean_t user_wire,ppnum_t * physpage_p)7205 vm_map_wire_and_extract_external(
7206 vm_map_t map,
7207 vm_map_offset_t start,
7208 vm_prot_t caller_prot,
7209 boolean_t user_wire,
7210 ppnum_t *physpage_p)
7211 {
7212 kern_return_t kret;
7213
7214 kret = vm_map_wire_nested(map,
7215 start,
7216 start + VM_MAP_PAGE_SIZE(map),
7217 caller_prot,
7218 vm_tag_bt(),
7219 user_wire,
7220 (pmap_t)NULL,
7221 0,
7222 physpage_p);
7223 if (kret != KERN_SUCCESS &&
7224 physpage_p != NULL) {
7225 *physpage_p = 0;
7226 }
7227 return kret;
7228 }
7229
7230 /*
7231 * vm_map_unwire:
7232 *
7233 * Sets the pageability of the specified address range in the target
7234 * as pageable. Regions specified must have been wired previously.
7235 *
7236 * The map must not be locked, but a reference must remain to the map
7237 * throughout the call.
7238 *
7239 * Kernel will panic on failures. User unwire ignores holes and
7240 * unwired and intransition entries to avoid losing memory by leaving
7241 * it unwired.
7242 */
7243 static kern_return_t
vm_map_unwire_nested(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,boolean_t user_wire,pmap_t map_pmap,vm_map_offset_t pmap_addr)7244 vm_map_unwire_nested(
7245 vm_map_t map,
7246 vm_map_offset_t start,
7247 vm_map_offset_t end,
7248 boolean_t user_wire,
7249 pmap_t map_pmap,
7250 vm_map_offset_t pmap_addr)
7251 {
7252 vm_map_entry_t entry;
7253 struct vm_map_entry *first_entry, tmp_entry;
7254 boolean_t need_wakeup;
7255 boolean_t main_map = FALSE;
7256 unsigned int last_timestamp;
7257
7258 vm_map_lock(map);
7259 if (map_pmap == NULL) {
7260 main_map = TRUE;
7261 }
7262 last_timestamp = map->timestamp;
7263
7264 VM_MAP_RANGE_CHECK(map, start, end);
7265 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
7266 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
7267
7268 if (start == end) {
7269 /* We unwired what the caller asked for: zero pages */
7270 vm_map_unlock(map);
7271 return KERN_SUCCESS;
7272 }
7273
7274 if (vm_map_lookup_entry(map, start, &first_entry)) {
7275 entry = first_entry;
7276 /*
7277 * vm_map_clip_start will be done later.
7278 * We don't want to unnest any nested sub maps here !
7279 */
7280 } else {
7281 if (!user_wire) {
7282 panic("vm_map_unwire: start not found");
7283 }
7284 /* Start address is not in map. */
7285 vm_map_unlock(map);
7286 return KERN_INVALID_ADDRESS;
7287 }
7288
7289 if (entry->superpage_size) {
7290 /* superpages are always wired */
7291 vm_map_unlock(map);
7292 return KERN_INVALID_ADDRESS;
7293 }
7294
7295 need_wakeup = FALSE;
7296 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
7297 if (entry->in_transition) {
7298 /*
7299 * 1)
7300 * Another thread is wiring down this entry. Note
7301 * that if it is not for the other thread we would
7302 * be unwiring an unwired entry. This is not
7303 * permitted. If we wait, we will be unwiring memory
7304 * we did not wire.
7305 *
7306 * 2)
7307 * Another thread is unwiring this entry. We did not
7308 * have a reference to it, because if we did, this
7309 * entry will not be getting unwired now.
7310 */
7311 if (!user_wire) {
7312 /*
7313 * XXX FBDP
7314 * This could happen: there could be some
7315 * overlapping vslock/vsunlock operations
7316 * going on.
7317 * We should probably just wait and retry,
7318 * but then we have to be careful that this
7319 * entry could get "simplified" after
7320 * "in_transition" gets unset and before
7321 * we re-lookup the entry, so we would
7322 * have to re-clip the entry to avoid
7323 * re-unwiring what we have already unwired...
7324 * See vm_map_wire_nested().
7325 *
7326 * Or we could just ignore "in_transition"
7327 * here and proceed to decement the wired
7328 * count(s) on this entry. That should be fine
7329 * as long as "wired_count" doesn't drop all
7330 * the way to 0 (and we should panic if THAT
7331 * happens).
7332 */
7333 panic("vm_map_unwire: in_transition entry");
7334 }
7335
7336 entry = entry->vme_next;
7337 continue;
7338 }
7339
7340 if (entry->is_sub_map) {
7341 vm_map_offset_t sub_start;
7342 vm_map_offset_t sub_end;
7343 vm_map_offset_t local_end;
7344 pmap_t pmap;
7345
7346 vm_map_clip_start(map, entry, start);
7347 vm_map_clip_end(map, entry, end);
7348
7349 sub_start = VME_OFFSET(entry);
7350 sub_end = entry->vme_end - entry->vme_start;
7351 sub_end += VME_OFFSET(entry);
7352 local_end = entry->vme_end;
7353 if (map_pmap == NULL) {
7354 if (entry->use_pmap) {
7355 pmap = VME_SUBMAP(entry)->pmap;
7356 pmap_addr = sub_start;
7357 } else {
7358 pmap = map->pmap;
7359 pmap_addr = start;
7360 }
7361 if (entry->wired_count == 0 ||
7362 (user_wire && entry->user_wired_count == 0)) {
7363 if (!user_wire) {
7364 panic("vm_map_unwire: entry is unwired");
7365 }
7366 entry = entry->vme_next;
7367 continue;
7368 }
7369
7370 /*
7371 * Check for holes
7372 * Holes: Next entry should be contiguous unless
7373 * this is the end of the region.
7374 */
7375 if (((entry->vme_end < end) &&
7376 ((entry->vme_next == vm_map_to_entry(map)) ||
7377 (entry->vme_next->vme_start
7378 > entry->vme_end)))) {
7379 if (!user_wire) {
7380 panic("vm_map_unwire: non-contiguous region");
7381 }
7382 /*
7383 * entry = entry->vme_next;
7384 * continue;
7385 */
7386 }
7387
7388 subtract_wire_counts(map, entry, user_wire);
7389
7390 if (entry->wired_count != 0) {
7391 entry = entry->vme_next;
7392 continue;
7393 }
7394
7395 entry->in_transition = TRUE;
7396 tmp_entry = *entry;/* see comment in vm_map_wire() */
7397
7398 /*
7399 * We can unlock the map now. The in_transition state
7400 * guarantees existance of the entry.
7401 */
7402 vm_map_unlock(map);
7403 vm_map_unwire_nested(VME_SUBMAP(entry),
7404 sub_start, sub_end, user_wire, pmap, pmap_addr);
7405 vm_map_lock(map);
7406
7407 if (last_timestamp + 1 != map->timestamp) {
7408 /*
7409 * Find the entry again. It could have been
7410 * clipped or deleted after we unlocked the map.
7411 */
7412 if (!vm_map_lookup_entry(map,
7413 tmp_entry.vme_start,
7414 &first_entry)) {
7415 if (!user_wire) {
7416 panic("vm_map_unwire: re-lookup failed");
7417 }
7418 entry = first_entry->vme_next;
7419 } else {
7420 entry = first_entry;
7421 }
7422 }
7423 last_timestamp = map->timestamp;
7424
7425 /*
7426 * clear transition bit for all constituent entries
7427 * that were in the original entry (saved in
7428 * tmp_entry). Also check for waiters.
7429 */
7430 while ((entry != vm_map_to_entry(map)) &&
7431 (entry->vme_start < tmp_entry.vme_end)) {
7432 assert(entry->in_transition);
7433 entry->in_transition = FALSE;
7434 if (entry->needs_wakeup) {
7435 entry->needs_wakeup = FALSE;
7436 need_wakeup = TRUE;
7437 }
7438 entry = entry->vme_next;
7439 }
7440 continue;
7441 } else {
7442 tmp_entry = *entry;
7443 vm_map_unlock(map);
7444 vm_map_unwire_nested(VME_SUBMAP(entry),
7445 sub_start, sub_end, user_wire, map_pmap,
7446 pmap_addr);
7447 vm_map_lock(map);
7448
7449 if (last_timestamp + 1 != map->timestamp) {
7450 /*
7451 * Find the entry again. It could have been
7452 * clipped or deleted after we unlocked the map.
7453 */
7454 if (!vm_map_lookup_entry(map,
7455 tmp_entry.vme_start,
7456 &first_entry)) {
7457 if (!user_wire) {
7458 panic("vm_map_unwire: re-lookup failed");
7459 }
7460 entry = first_entry->vme_next;
7461 } else {
7462 entry = first_entry;
7463 }
7464 }
7465 last_timestamp = map->timestamp;
7466 }
7467 }
7468
7469
7470 if ((entry->wired_count == 0) ||
7471 (user_wire && entry->user_wired_count == 0)) {
7472 if (!user_wire) {
7473 panic("vm_map_unwire: entry is unwired");
7474 }
7475
7476 entry = entry->vme_next;
7477 continue;
7478 }
7479
7480 assert(entry->wired_count > 0 &&
7481 (!user_wire || entry->user_wired_count > 0));
7482
7483 vm_map_clip_start(map, entry, start);
7484 vm_map_clip_end(map, entry, end);
7485
7486 /*
7487 * Check for holes
7488 * Holes: Next entry should be contiguous unless
7489 * this is the end of the region.
7490 */
7491 if (((entry->vme_end < end) &&
7492 ((entry->vme_next == vm_map_to_entry(map)) ||
7493 (entry->vme_next->vme_start > entry->vme_end)))) {
7494 if (!user_wire) {
7495 panic("vm_map_unwire: non-contiguous region");
7496 }
7497 entry = entry->vme_next;
7498 continue;
7499 }
7500
7501 subtract_wire_counts(map, entry, user_wire);
7502
7503 if (entry->wired_count != 0) {
7504 entry = entry->vme_next;
7505 continue;
7506 }
7507
7508 if (entry->zero_wired_pages) {
7509 entry->zero_wired_pages = FALSE;
7510 }
7511
7512 entry->in_transition = TRUE;
7513 tmp_entry = *entry; /* see comment in vm_map_wire() */
7514
7515 /*
7516 * We can unlock the map now. The in_transition state
7517 * guarantees existance of the entry.
7518 */
7519 vm_map_unlock(map);
7520 if (map_pmap) {
7521 vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
7522 pmap_addr, tmp_entry.vme_end);
7523 } else {
7524 vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
7525 tmp_entry.vme_start, tmp_entry.vme_end);
7526 }
7527 vm_map_lock(map);
7528
7529 if (last_timestamp + 1 != map->timestamp) {
7530 /*
7531 * Find the entry again. It could have been clipped
7532 * or deleted after we unlocked the map.
7533 */
7534 if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7535 &first_entry)) {
7536 if (!user_wire) {
7537 panic("vm_map_unwire: re-lookup failed");
7538 }
7539 entry = first_entry->vme_next;
7540 } else {
7541 entry = first_entry;
7542 }
7543 }
7544 last_timestamp = map->timestamp;
7545
7546 /*
7547 * clear transition bit for all constituent entries that
7548 * were in the original entry (saved in tmp_entry). Also
7549 * check for waiters.
7550 */
7551 while ((entry != vm_map_to_entry(map)) &&
7552 (entry->vme_start < tmp_entry.vme_end)) {
7553 assert(entry->in_transition);
7554 entry->in_transition = FALSE;
7555 if (entry->needs_wakeup) {
7556 entry->needs_wakeup = FALSE;
7557 need_wakeup = TRUE;
7558 }
7559 entry = entry->vme_next;
7560 }
7561 }
7562
7563 /*
7564 * We might have fragmented the address space when we wired this
7565 * range of addresses. Attempt to re-coalesce these VM map entries
7566 * with their neighbors now that they're no longer wired.
7567 * Under some circumstances, address space fragmentation can
7568 * prevent VM object shadow chain collapsing, which can cause
7569 * swap space leaks.
7570 */
7571 vm_map_simplify_range(map, start, end);
7572
7573 vm_map_unlock(map);
7574 /*
7575 * wake up anybody waiting on entries that we have unwired.
7576 */
7577 if (need_wakeup) {
7578 vm_map_entry_wakeup(map);
7579 }
7580 return KERN_SUCCESS;
7581 }
7582
7583 kern_return_t
vm_map_unwire(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,boolean_t user_wire)7584 vm_map_unwire(
7585 vm_map_t map,
7586 vm_map_offset_t start,
7587 vm_map_offset_t end,
7588 boolean_t user_wire)
7589 {
7590 return vm_map_unwire_nested(map, start, end,
7591 user_wire, (pmap_t)NULL, 0);
7592 }
7593
7594
7595 /*
7596 * vm_map_entry_zap: [ internal use only ]
7597 *
7598 * Remove the entry from the target map
7599 * and put it on a zap list.
7600 */
7601 static void
vm_map_entry_zap(vm_map_t map,vm_map_entry_t entry,vm_map_zap_t zap)7602 vm_map_entry_zap(
7603 vm_map_t map,
7604 vm_map_entry_t entry,
7605 vm_map_zap_t zap)
7606 {
7607 vm_map_offset_t s, e;
7608
7609 s = entry->vme_start;
7610 e = entry->vme_end;
7611 assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7612 assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7613 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7614 assert(page_aligned(s));
7615 assert(page_aligned(e));
7616 }
7617 if (entry->map_aligned == TRUE) {
7618 assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7619 assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7620 }
7621 assert(entry->wired_count == 0);
7622 assert(entry->user_wired_count == 0);
7623 assert(!entry->vme_permanent);
7624
7625 vm_map_store_entry_unlink(map, entry, false);
7626 map->size -= e - s;
7627
7628 vm_map_zap_append(zap, entry);
7629 }
7630
/*
 *	vm_map_submap_pmap_clean:
 *
 *	Remove the physical mappings established through "sub_map"
 *	for the range [start, end) of "map", where "offset" is the
 *	address in the submap corresponding to "start".
 *
 *	For each submap entry overlapping the range: recurse into
 *	nested submaps; otherwise either flush the backing object's
 *	mappings from all pmaps (when "map" is nested in other pmaps
 *	and still referenced) or remove the range from map->pmap
 *	directly.
 *
 *	Called with "map" locked by the caller; takes the submap
 *	lock for read internally.
 */
static void
vm_map_submap_pmap_clean(
	vm_map_t map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_t sub_map,
	vm_map_offset_t offset)
{
	vm_map_offset_t submap_start;
	vm_map_offset_t submap_end;
	vm_map_size_t remove_size;
	vm_map_entry_t entry;

	/* translate [start, end) into the submap's address space */
	submap_end = offset + (end - start);
	submap_start = offset;

	vm_map_lock_read(sub_map);
	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
		/*
		 * First entry: may straddle "offset" and/or "submap_end",
		 * so trim the size to the overlapping portion only.
		 */
		remove_size = (entry->vme_end - entry->vme_start);
		if (offset > entry->vme_start) {
			remove_size -= offset - entry->vme_start;
		}


		if (submap_end < entry->vme_end) {
			remove_size -=
			    entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			/* nested submap: clean its overlapping portion too */
			vm_map_submap_pmap_clean(
				sub_map,
				start,
				start + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				/*
				 * "map" is nested in other pmaps: flush the
				 * object's pages from every pmap they appear
				 * in (PMAP_NULL + PMAP_OPTIONS_REMOVE).
				 */
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					(VME_OFFSET(entry) +
					offset -
					entry->vme_start),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				/* only mapped here: remove from our pmap directly */
				pmap_remove(map->pmap,
				    (addr64_t)start,
				    (addr64_t)(start + remove_size));
			}
		}
	}

	/*
	 * Advance past the looked-up entry; if the lookup failed,
	 * "entry" is the entry preceding "offset", so this moves to
	 * the first entry that could overlap the range.
	 */
	entry = entry->vme_next;

	while ((entry != vm_map_to_entry(sub_map))
	    && (entry->vme_start < submap_end)) {
		/* subsequent entries start inside the range; trim only the tail */
		remove_size = (entry->vme_end - entry->vme_start);
		if (submap_end < entry->vme_end) {
			remove_size -= entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			/* translate the entry's start back into "map" addresses */
			vm_map_submap_pmap_clean(
				sub_map,
				(start + entry->vme_start) - offset,
				((start + entry->vme_start) - offset) + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				/* flush from all pmaps, as above */
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					VME_OFFSET(entry),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				pmap_remove(map->pmap,
				    (addr64_t)((start + entry->vme_start)
				    - offset),
				    (addr64_t)(((start + entry->vme_start)
				    - offset) + remove_size));
			}
		}
		entry = entry->vme_next;
	}
	vm_map_unlock_read(sub_map);
	return;
}
7730
/*
 * virt_memory_guard_ast:
 *
 * Handle the AST callout for a virtual memory guard:
 * raise an EXC_GUARD exception and terminate the task
 * if configured to do so (task_exc_guard policy bits).
 */
void
virt_memory_guard_ast(
	thread_t thread,
	mach_exception_data_type_t code,
	mach_exception_data_type_t subcode)
{
	task_t task = get_threadtask(thread);
	assert(task != kernel_task);
	assert(task == current_task());
	kern_return_t sync_exception_result;
	uint32_t behavior;

	behavior = task->task_exc_guard;

	/* Is delivery enabled */
	if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
		return;
	}

	/*
	 * If only once, make sure we're that once: atomically clear the
	 * DELIVER bit via CAS.  On a CAS race, re-read the policy and
	 * bail if another thread already consumed the one delivery.
	 */
	while (behavior & TASK_EXC_GUARD_VM_ONCE) {
		uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;

		if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
			break;
		}
		behavior = task->task_exc_guard;
		if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
			return;
		}
	}

	/* Raise exception synchronously and see if handler claimed it */
	sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode);

	if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
		/*
		 * If Synchronous EXC_GUARD delivery was successful then
		 * kill the process and return, else kill the process
		 * and deliver the exception via EXC_CORPSE_NOTIFY.
		 */
		if (sync_exception_result == KERN_SUCCESS) {
			task_bsdtask_kill(current_task());
		} else {
			exit_with_guard_exception(current_proc(), code, subcode);
		}
	} else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
		/*
		 * If the synchronous EXC_GUARD delivery was not successful,
		 * raise a simulated crash (corpse) instead; the task
		 * itself keeps running in this non-fatal mode.
		 */
		if (sync_exception_result != KERN_SUCCESS) {
			task_violated_guard(code, subcode, NULL, FALSE);
		}
	}
}
7794
7795 /*
7796 * vm_map_guard_exception:
7797 *
7798 * Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
7799 *
7800 * Right now, we do this when we find nothing mapped, or a
7801 * gap in the mapping when a user address space deallocate
7802 * was requested. We report the address of the first gap found.
7803 */
7804 static void
vm_map_guard_exception(vm_map_offset_t gap_start,unsigned reason)7805 vm_map_guard_exception(
7806 vm_map_offset_t gap_start,
7807 unsigned reason)
7808 {
7809 mach_exception_code_t code = 0;
7810 unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7811 unsigned int target = 0; /* should we pass in pid associated with map? */
7812 mach_exception_data_type_t subcode = (uint64_t)gap_start;
7813 boolean_t fatal = FALSE;
7814
7815 task_t task = current_task_early();
7816
7817 /* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7818 if (task == NULL || task == kernel_task) {
7819 return;
7820 }
7821
7822 EXC_GUARD_ENCODE_TYPE(code, guard_type);
7823 EXC_GUARD_ENCODE_FLAVOR(code, reason);
7824 EXC_GUARD_ENCODE_TARGET(code, target);
7825
7826 if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7827 fatal = TRUE;
7828 }
7829 thread_guard_violation(current_thread(), code, subcode, fatal);
7830 }
7831
7832 static kern_return_t
vm_map_delete_submap_recurse(vm_map_t submap,vm_map_offset_t submap_start,vm_map_offset_t submap_end)7833 vm_map_delete_submap_recurse(
7834 vm_map_t submap,
7835 vm_map_offset_t submap_start,
7836 vm_map_offset_t submap_end)
7837 {
7838 vm_map_entry_t submap_entry;
7839
7840 /*
7841 * Verify that the submap does not contain any "permanent" entries
7842 * within the specified range.
7843 * We do not care about gaps.
7844 */
7845
7846 vm_map_lock(submap);
7847
7848 if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
7849 submap_entry = submap_entry->vme_next;
7850 }
7851
7852 for (;
7853 submap_entry != vm_map_to_entry(submap) &&
7854 submap_entry->vme_start < submap_end;
7855 submap_entry = submap_entry->vme_next) {
7856 if (submap_entry->vme_permanent) {
7857 /* "permanent" entry -> fail */
7858 vm_map_unlock(submap);
7859 return KERN_PROTECTION_FAILURE;
7860 }
7861 }
7862 /* no "permanent" entries in the range -> success */
7863 vm_map_unlock(submap);
7864 return KERN_SUCCESS;
7865 }
7866
/*
 * Out-of-line panic for vm_map_delete(): "start" is not aligned
 * to the map's page size.  Marked __abortlike (does not return).
 */
__abortlike
static void
__vm_map_delete_misaligned_panic(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
	    map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
}
7877
/*
 * Out-of-line panic for vm_map_delete(): the deletion failed with
 * "kr" in a context where failure is not tolerated.
 * Marked __abortlike (does not return).
 */
__abortlike
static void
__vm_map_delete_failed_panic(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	kern_return_t   kr)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
	    map, (uint64_t)start, (uint64_t)end, kr);
}
7889
/*
 * Out-of-line panic for vm_map_delete(): a gap (no map entry) was
 * found at "where" inside [start, end) while deleting from a map
 * that does not tolerate gaps.  Marked __abortlike (does not return).
 */
__abortlike
static void
__vm_map_delete_gap_panic(
	vm_map_t        map,
	vm_map_offset_t where,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
}
7901
/*
 * Out-of-line panic for vm_map_delete(): an attempt was made to
 * remove a "permanent" VM map entry in a context where that is
 * forbidden.  Marked __abortlike (does not return).
 */
__abortlike
static void
__vm_map_delete_permanent_panic(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_entry_t  entry)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): "
	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
	    map, (uint64_t)start, (uint64_t)end, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end);
}
7916
/*
 * Internal state flags threaded through vm_map_delete()'s main loop.
 */
__options_decl(vm_map_delete_state_t, uint32_t, {
	VMDS_NONE               = 0x0000,

	VMDS_FOUND_GAP          = 0x0001, /* a hole was found in the requested range */
	VMDS_GAPS_OK            = 0x0002, /* gaps tolerated (map terminated or unreferenced) */

	VMDS_KERNEL_PMAP        = 0x0004, /* map uses the kernel pmap: strict checks/panics */
	VMDS_NEEDS_LOOKUP       = 0x0008, /* map lock was dropped; re-lookup the entry */
	VMDS_NEEDS_WAKEUP       = 0x0010, /* waiters were seen; wake them before returning */
	VMDS_KERNEL_KMEMPTR     = 0x0020  /* range belongs to a kmem pointer range (slot validation) */
});
7928
7929 /*
7930 * vm_map_delete: [ internal use only ]
7931 *
7932 * Deallocates the given address range from the target map.
7933 * Removes all user wirings. Unwires one kernel wiring if
7934 * VM_MAP_REMOVE_KUNWIRE is set. Waits for kernel wirings to go
7935 * away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set. Sleeps
7936 * interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
7937 *
7938 *
7939 * When the map is a kernel map, then any error in removing mappings
7940 * will lead to a panic so that clients do not have to repeat the panic
7941 * code at each call site. If VM_MAP_REMOVE_INTERRUPTIBLE
7942 * is also passed, then KERN_ABORTED will not lead to a panic.
7943 *
7944 * This routine is called with map locked and leaves map locked.
7945 */
7946 static kmem_return_t
vm_map_delete(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard,vm_map_zap_t zap_list)7947 vm_map_delete(
7948 vm_map_t map,
7949 vm_map_offset_t start,
7950 vm_map_offset_t end,
7951 vmr_flags_t flags,
7952 kmem_guard_t guard,
7953 vm_map_zap_t zap_list)
7954 {
7955 vm_map_entry_t entry, next;
7956 int interruptible;
7957 vm_map_offset_t gap_start = 0;
7958 vm_map_offset_t clear_in_transition_end = 0;
7959 __unused vm_map_offset_t save_start = start;
7960 __unused vm_map_offset_t save_end = end;
7961 vm_map_delete_state_t state = VMDS_NONE;
7962 kmem_return_t ret = { };
7963 vm_map_range_id_t range_id = 0;
7964 struct kmem_page_meta *meta = NULL;
7965 uint32_t size_idx, slot_idx;
7966 struct mach_vm_range slot;
7967
7968 if (vm_map_pmap(map) == kernel_pmap) {
7969 state |= VMDS_KERNEL_PMAP;
7970 range_id = kmem_addr_get_range(start, end - start);
7971 if (kmem_is_ptr_range(range_id)) {
7972 state |= VMDS_KERNEL_KMEMPTR;
7973 slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
7974 &size_idx, &slot);
7975 }
7976 }
7977
7978 if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
7979 state |= VMDS_GAPS_OK;
7980 }
7981
7982 interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
7983 THREAD_ABORTSAFE : THREAD_UNINT;
7984
7985 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
7986 (start & VM_MAP_PAGE_MASK(map))) {
7987 __vm_map_delete_misaligned_panic(map, start, end);
7988 }
7989
7990 if ((state & VMDS_GAPS_OK) == 0) {
7991 /*
7992 * If the map isn't terminated then all deletions must have
7993 * no gaps, and be within the [min, max) of the map.
7994 *
7995 * We got here without VM_MAP_RANGE_CHECK() being called,
7996 * and hence must validate bounds manually.
7997 *
7998 * It is worth noting that because vm_deallocate() will
7999 * round_page() the deallocation size, it's possible for "end"
8000 * to be 0 here due to overflow. We hence must treat it as being
8001 * beyond vm_map_max(map).
8002 *
8003 * Similarly, end < start means some wrap around happend,
8004 * which should cause an error or panic.
8005 */
8006 if (end == 0 || end > vm_map_max(map)) {
8007 state |= VMDS_FOUND_GAP;
8008 gap_start = vm_map_max(map);
8009 if (state & VMDS_KERNEL_PMAP) {
8010 __vm_map_delete_gap_panic(map,
8011 gap_start, start, end);
8012 }
8013 goto out;
8014 }
8015
8016 if (end < start) {
8017 if (state & VMDS_KERNEL_PMAP) {
8018 __vm_map_delete_gap_panic(map,
8019 vm_map_max(map), start, end);
8020 }
8021 ret.kmr_return = KERN_INVALID_ARGUMENT;
8022 goto out;
8023 }
8024
8025 if (start < vm_map_min(map)) {
8026 state |= VMDS_FOUND_GAP;
8027 gap_start = start;
8028 if (state & VMDS_KERNEL_PMAP) {
8029 __vm_map_delete_gap_panic(map,
8030 gap_start, start, end);
8031 }
8032 goto out;
8033 }
8034 } else {
8035 /*
8036 * If the map is terminated, we must accept start/end
8037 * being beyond the boundaries of the map as this is
8038 * how some of the mappings like commpage mappings
8039 * can be destroyed (they're outside of those bounds).
8040 *
8041 * end < start is still something we can't cope with,
8042 * so just bail.
8043 */
8044 if (end < start) {
8045 goto out;
8046 }
8047 }
8048
8049
8050 /*
8051 * Find the start of the region.
8052 *
8053 * If in a superpage, extend the range
8054 * to include the start of the mapping.
8055 */
8056 while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8057 if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8058 start = SUPERPAGE_ROUND_DOWN(start);
8059 } else {
8060 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8061 break;
8062 }
8063 }
8064
8065 if (entry->superpage_size) {
8066 end = SUPERPAGE_ROUND_UP(end);
8067 }
8068
8069 /*
8070 * Step through all entries in this region
8071 */
8072 for (vm_map_offset_t s = start; s < end;) {
8073 /*
8074 * At this point, we have deleted all the memory entries
8075 * in [start, s) and are proceeding with the [s, end) range.
8076 *
8077 * This loop might drop the map lock, and it is possible that
8078 * some memory was already reallocated within [start, s)
8079 * and we don't want to mess with those entries.
8080 *
8081 * Some of those entries could even have been re-assembled
8082 * with an entry after "s" (in vm_map_simplify_entry()), so
8083 * we may have to vm_map_clip_start() again.
8084 *
8085 * When clear_in_transition_end is set, the we had marked
8086 * [start, clear_in_transition_end) as "in_transition"
8087 * during a previous iteration and we need to clear it.
8088 */
8089
8090 /*
8091 * Step 1: If needed (because we dropped locks),
8092 * lookup the entry again.
8093 *
8094 * If we're coming back from unwiring (Step 5),
8095 * we also need to mark the entries as no longer
8096 * in transition after that.
8097 */
8098
8099 if (state & VMDS_NEEDS_LOOKUP) {
8100 state &= ~VMDS_NEEDS_LOOKUP;
8101
8102 if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8103 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8104 }
8105
8106 if (state & VMDS_KERNEL_KMEMPTR) {
8107 kmem_validate_slot(s, meta, size_idx, slot_idx);
8108 }
8109 }
8110
8111 if (clear_in_transition_end) {
8112 for (vm_map_entry_t it = entry;
8113 it != vm_map_to_entry(map) &&
8114 it->vme_start < clear_in_transition_end;
8115 it = it->vme_next) {
8116 assert(it->in_transition);
8117 it->in_transition = FALSE;
8118 if (it->needs_wakeup) {
8119 it->needs_wakeup = FALSE;
8120 state |= VMDS_NEEDS_WAKEUP;
8121 }
8122 }
8123
8124 clear_in_transition_end = 0;
8125 }
8126
8127
8128 /*
8129 * Step 2: Perform various policy checks
8130 * before we do _anything_ to this entry.
8131 */
8132
8133 if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8134 if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8135 /*
8136 * Either we found a gap already,
8137 * or we are tearing down a map,
8138 * keep going.
8139 */
8140 } else if (state & VMDS_KERNEL_PMAP) {
8141 __vm_map_delete_gap_panic(map, s, start, end);
8142 } else if (s < end) {
8143 state |= VMDS_FOUND_GAP;
8144 gap_start = s;
8145 }
8146
8147 if (entry == vm_map_to_entry(map) ||
8148 end <= entry->vme_start) {
8149 break;
8150 }
8151
8152 s = entry->vme_start;
8153 }
8154
8155 if (state & VMDS_KERNEL_PMAP) {
8156 /*
8157 * In the kernel map and its submaps,
8158 * permanent entries never die, even
8159 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8160 */
8161 if (entry->vme_permanent) {
8162 __vm_map_delete_permanent_panic(map, start, end, entry);
8163 }
8164
8165 if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8166 end = entry->vme_end;
8167 flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8168 }
8169
8170 /*
8171 * In the kernel map and its submaps,
8172 * the removal of an atomic/guarded entry is strict.
8173 *
8174 * An atomic entry is processed only if it was
8175 * specifically targeted.
8176 *
8177 * We might have deleted non-atomic entries before
8178 * we reach this this point however...
8179 */
8180 kmem_entry_validate_guard(map, entry,
8181 start, end - start, guard);
8182 }
8183
8184 /*
8185 * Step 2.1: handle "permanent" and "submap" entries
8186 * *before* clipping to avoid triggering some unnecessary
8187 * un-nesting of the shared region.
8188 */
8189 if (entry->vme_permanent && entry->is_sub_map) {
8190 // printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8191 /*
8192 * Un-mapping a "permanent" mapping of a user-space
8193 * submap is not allowed unless...
8194 */
8195 if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8196 /*
8197 * a. explicitly requested by the kernel caller.
8198 */
8199 // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8200 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8201 developer_mode_state()) {
8202 /*
8203 * b. we're in "developer" mode (for
8204 * breakpoints, dtrace probes, ...).
8205 */
8206 // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8207 } else if (map->terminated) {
8208 /*
8209 * c. this is the final address space cleanup.
8210 */
8211 // printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8212 } else {
8213 vm_map_offset_t submap_start, submap_end;
8214 kern_return_t submap_kr;
8215
8216 /*
8217 * Check if there are any "permanent" mappings
8218 * in this range in the submap.
8219 */
8220 if (entry->in_transition) {
8221 /* can that even happen ? */
8222 goto in_transition;
8223 }
8224 /* compute the clipped range in the submap */
8225 submap_start = s - entry->vme_start;
8226 submap_start += VME_OFFSET(entry);
8227 submap_end = end - entry->vme_start;
8228 submap_end += VME_OFFSET(entry);
8229 submap_kr = vm_map_delete_submap_recurse(
8230 VME_SUBMAP(entry),
8231 submap_start,
8232 submap_end);
8233 if (submap_kr != KERN_SUCCESS) {
8234 /*
8235 * There are some "permanent" mappings
8236 * in the submap: we are not allowed
8237 * to remove this range.
8238 */
8239 printf("%d[%s] removing permanent submap entry "
8240 "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8241 proc_selfpid(),
8242 (get_bsdtask_info(current_task())
8243 ? proc_name_address(get_bsdtask_info(current_task()))
8244 : "?"), entry,
8245 (uint64_t)entry->vme_start,
8246 (uint64_t)entry->vme_end,
8247 entry->protection,
8248 entry->max_protection);
8249 DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8250 vm_map_entry_t, entry,
8251 vm_map_offset_t, entry->vme_start,
8252 vm_map_offset_t, entry->vme_end,
8253 vm_prot_t, entry->protection,
8254 vm_prot_t, entry->max_protection,
8255 int, VME_ALIAS(entry));
8256 ret.kmr_return = KERN_PROTECTION_FAILURE;
8257 goto out;
8258 }
8259 /* no permanent mappings: proceed */
8260 }
8261 }
8262
8263 /*
8264 * Step 3: Perform any clipping needed.
8265 *
8266 * After this, "entry" starts at "s", ends before "end"
8267 */
8268
8269 if (entry->vme_start < s) {
8270 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8271 entry->map_aligned &&
8272 !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8273 /*
8274 * The entry will no longer be map-aligned
8275 * after clipping and the caller said it's OK.
8276 */
8277 entry->map_aligned = FALSE;
8278 }
8279 vm_map_clip_start(map, entry, s);
8280 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8281 }
8282
8283 if (end < entry->vme_end) {
8284 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8285 entry->map_aligned &&
8286 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8287 /*
8288 * The entry will no longer be map-aligned
8289 * after clipping and the caller said it's OK.
8290 */
8291 entry->map_aligned = FALSE;
8292 }
8293 vm_map_clip_end(map, entry, end);
8294 }
8295
8296 if (entry->vme_permanent && entry->is_sub_map) {
8297 /*
8298 * We already went through step 2.1 which did not deny
8299 * the removal of this "permanent" and "is_sub_map"
8300 * entry.
8301 * Now that we've clipped what we actually want to
8302 * delete, undo the "permanent" part to allow the
8303 * removal to proceed.
8304 */
8305 DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8306 vm_map_entry_t, entry,
8307 vm_map_offset_t, entry->vme_start,
8308 vm_map_offset_t, entry->vme_end,
8309 vm_prot_t, entry->protection,
8310 vm_prot_t, entry->max_protection,
8311 int, VME_ALIAS(entry));
8312 entry->vme_permanent = false;
8313 }
8314
8315 assert(s == entry->vme_start);
8316 assert(entry->vme_end <= end);
8317
8318
8319 /*
8320 * Step 4: If the entry is in flux, wait for this to resolve.
8321 */
8322
8323 if (entry->in_transition) {
8324 wait_result_t wait_result;
8325
8326 in_transition:
8327 /*
8328 * Another thread is wiring/unwiring this entry.
8329 * Let the other thread know we are waiting.
8330 */
8331
8332 entry->needs_wakeup = TRUE;
8333
8334 /*
8335 * wake up anybody waiting on entries that we have
8336 * already unwired/deleted.
8337 */
8338 if (state & VMDS_NEEDS_WAKEUP) {
8339 vm_map_entry_wakeup(map);
8340 state &= ~VMDS_NEEDS_WAKEUP;
8341 }
8342
8343 wait_result = vm_map_entry_wait(map, interruptible);
8344
8345 if (interruptible &&
8346 wait_result == THREAD_INTERRUPTED) {
8347 /*
8348 * We do not clear the needs_wakeup flag,
8349 * since we cannot tell if we were the only one.
8350 */
8351 ret.kmr_return = KERN_ABORTED;
8352 return ret;
8353 }
8354
8355 /*
8356 * The entry could have been clipped or it
8357 * may not exist anymore. Look it up again.
8358 */
8359 state |= VMDS_NEEDS_LOOKUP;
8360 continue;
8361 }
8362
8363
8364 /*
8365 * Step 5: Handle wiring
8366 */
8367
8368 if (entry->wired_count) {
8369 struct vm_map_entry tmp_entry;
8370 boolean_t user_wire;
8371 unsigned int last_timestamp;
8372
8373 user_wire = entry->user_wired_count > 0;
8374
8375 /*
8376 * Remove a kernel wiring if requested
8377 */
8378 if (flags & VM_MAP_REMOVE_KUNWIRE) {
8379 entry->wired_count--;
8380 }
8381
8382 /*
8383 * Remove all user wirings for proper accounting
8384 */
8385 while (entry->user_wired_count) {
8386 subtract_wire_counts(map, entry, user_wire);
8387 }
8388
8389 /*
8390 * All our DMA I/O operations in IOKit are currently
8391 * done by wiring through the map entries of the task
8392 * requesting the I/O.
8393 *
8394 * Because of this, we must always wait for kernel wirings
8395 * to go away on the entries before deleting them.
8396 *
8397 * Any caller who wants to actually remove a kernel wiring
8398 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8399 * properly remove one wiring instead of blasting through
8400 * them all.
8401 */
8402 if (entry->wired_count != 0) {
8403 assert(map != kernel_map);
8404 /*
8405 * Cannot continue. Typical case is when
8406 * a user thread has physical io pending on
8407 * on this page. Either wait for the
8408 * kernel wiring to go away or return an
8409 * error.
8410 */
8411 wait_result_t wait_result;
8412
8413 entry->needs_wakeup = TRUE;
8414 wait_result = vm_map_entry_wait(map,
8415 interruptible);
8416
8417 if (interruptible &&
8418 wait_result == THREAD_INTERRUPTED) {
8419 /*
8420 * We do not clear the
8421 * needs_wakeup flag, since we
8422 * cannot tell if we were the
8423 * only one.
8424 */
8425 ret.kmr_return = KERN_ABORTED;
8426 return ret;
8427 }
8428
8429
8430 /*
8431 * The entry could have been clipped or
8432 * it may not exist anymore. Look it
8433 * up again.
8434 */
8435 state |= VMDS_NEEDS_LOOKUP;
8436 continue;
8437 }
8438
8439 /*
8440 * We can unlock the map now.
8441 *
8442 * The entry might be split once we unlock the map,
8443 * but we need the range as defined by this entry
8444 * to be stable. So we must make a local copy.
8445 *
8446 * The underlying objects do not change during clips,
8447 * and the in_transition state guarentees existence
8448 * of the entry.
8449 */
8450 last_timestamp = map->timestamp;
8451 entry->in_transition = TRUE;
8452 tmp_entry = *entry;
8453 vm_map_unlock(map);
8454
8455 if (tmp_entry.is_sub_map) {
8456 vm_map_t sub_map;
8457 vm_map_offset_t sub_start, sub_end;
8458 pmap_t pmap;
8459 vm_map_offset_t pmap_addr;
8460
8461
8462 sub_map = VME_SUBMAP(&tmp_entry);
8463 sub_start = VME_OFFSET(&tmp_entry);
8464 sub_end = sub_start + (tmp_entry.vme_end -
8465 tmp_entry.vme_start);
8466 if (tmp_entry.use_pmap) {
8467 pmap = sub_map->pmap;
8468 pmap_addr = tmp_entry.vme_start;
8469 } else {
8470 pmap = map->pmap;
8471 pmap_addr = tmp_entry.vme_start;
8472 }
8473 (void) vm_map_unwire_nested(sub_map,
8474 sub_start, sub_end,
8475 user_wire,
8476 pmap, pmap_addr);
8477 } else {
8478 vm_map_offset_t entry_end = tmp_entry.vme_end;
8479 vm_map_offset_t max_end;
8480
8481 if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8482 max_end = end - VM_MAP_PAGE_SIZE(map);
8483 if (entry_end > max_end) {
8484 entry_end = max_end;
8485 }
8486 }
8487
8488 if (tmp_entry.vme_kernel_object) {
8489 pmap_protect_options(
8490 map->pmap,
8491 tmp_entry.vme_start,
8492 entry_end,
8493 VM_PROT_NONE,
8494 PMAP_OPTIONS_REMOVE,
8495 NULL);
8496 }
8497 vm_fault_unwire(map, &tmp_entry,
8498 tmp_entry.vme_kernel_object, map->pmap,
8499 tmp_entry.vme_start, entry_end);
8500 }
8501
8502 vm_map_lock(map);
8503
8504 /*
8505 * Unwiring happened, we can now go back to deleting
8506 * them (after we clear the in_transition bit for the range).
8507 */
8508 if (last_timestamp + 1 != map->timestamp) {
8509 state |= VMDS_NEEDS_LOOKUP;
8510 }
8511 clear_in_transition_end = tmp_entry.vme_end;
8512 continue;
8513 }
8514
8515 assert(entry->wired_count == 0);
8516 assert(entry->user_wired_count == 0);
8517
8518
8519 /*
8520 * Step 6: Entry is unwired and ready for us to delete !
8521 */
8522
8523 if (!entry->vme_permanent) {
8524 /*
8525 * Typical case: the entry really shouldn't be permanent
8526 */
8527 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8528 (entry->protection & VM_PROT_EXECUTE) &&
8529 developer_mode_state()) {
8530 /*
8531 * Allow debuggers to undo executable mappings
8532 * when developer mode is on.
8533 */
8534 #if 0
8535 printf("FBDP %d[%s] removing permanent executable entry "
8536 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8537 proc_selfpid(),
8538 (current_task()->bsd_info
8539 ? proc_name_address(current_task()->bsd_info)
8540 : "?"), entry,
8541 (uint64_t)entry->vme_start,
8542 (uint64_t)entry->vme_end,
8543 entry->protection,
8544 entry->max_protection);
8545 #endif
8546 entry->vme_permanent = FALSE;
8547 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8548 #if 0
8549 printf("FBDP %d[%s] removing permanent entry "
8550 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8551 proc_selfpid(),
8552 (current_task()->bsd_info
8553 ? proc_name_address(current_task()->bsd_info)
8554 : "?"), entry,
8555 (uint64_t)entry->vme_start,
8556 (uint64_t)entry->vme_end,
8557 entry->protection,
8558 entry->max_protection);
8559 #endif
8560 entry->vme_permanent = FALSE;
8561 #if CODE_SIGNING_MONITOR
8562 } else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8563 entry->vme_permanent = FALSE;
8564
8565 printf("%d[%s] %s(0x%llx,0x%llx): "
8566 "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8567 "prot 0x%x/0x%x\n",
8568 proc_selfpid(),
8569 (get_bsdtask_info(current_task())
8570 ? proc_name_address(get_bsdtask_info(current_task()))
8571 : "?"),
8572 __FUNCTION__,
8573 (uint64_t)start,
8574 (uint64_t)end,
8575 (uint64_t)entry->vme_start,
8576 (uint64_t)entry->vme_end,
8577 entry->protection,
8578 entry->max_protection);
8579 #endif
8580 } else {
8581 DTRACE_VM6(vm_map_delete_permanent,
8582 vm_map_entry_t, entry,
8583 vm_map_offset_t, entry->vme_start,
8584 vm_map_offset_t, entry->vme_end,
8585 vm_prot_t, entry->protection,
8586 vm_prot_t, entry->max_protection,
8587 int, VME_ALIAS(entry));
8588 }
8589
8590 if (entry->is_sub_map) {
8591 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8592 "map %p (%d) entry %p submap %p (%d)\n",
8593 map, VM_MAP_PAGE_SHIFT(map), entry,
8594 VME_SUBMAP(entry),
8595 VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8596 if (entry->use_pmap) {
8597 #ifndef NO_NESTED_PMAP
8598 int pmap_flags;
8599
8600 if (map->terminated) {
8601 /*
8602 * This is the final cleanup of the
8603 * address space being terminated.
8604 * No new mappings are expected and
8605 * we don't really need to unnest the
8606 * shared region (and lose the "global"
8607 * pmap mappings, if applicable).
8608 *
8609 * Tell the pmap layer that we're
8610 * "clean" wrt nesting.
8611 */
8612 pmap_flags = PMAP_UNNEST_CLEAN;
8613 } else {
8614 /*
8615 * We're unmapping part of the nested
8616 * shared region, so we can't keep the
8617 * nested pmap.
8618 */
8619 pmap_flags = 0;
8620 }
8621 pmap_unnest_options(
8622 map->pmap,
8623 (addr64_t)entry->vme_start,
8624 entry->vme_end - entry->vme_start,
8625 pmap_flags);
8626 #endif /* NO_NESTED_PMAP */
8627 if (map->mapped_in_other_pmaps &&
8628 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8629 /* clean up parent map/maps */
8630 vm_map_submap_pmap_clean(
8631 map, entry->vme_start,
8632 entry->vme_end,
8633 VME_SUBMAP(entry),
8634 VME_OFFSET(entry));
8635 }
8636 } else {
8637 vm_map_submap_pmap_clean(
8638 map, entry->vme_start, entry->vme_end,
8639 VME_SUBMAP(entry),
8640 VME_OFFSET(entry));
8641 }
8642 } else if (entry->vme_kernel_object ||
8643 VME_OBJECT(entry) == compressor_object) {
8644 /*
8645 * nothing to do
8646 */
8647 } else if (map->mapped_in_other_pmaps &&
8648 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8649 vm_object_pmap_protect_options(
8650 VME_OBJECT(entry), VME_OFFSET(entry),
8651 entry->vme_end - entry->vme_start,
8652 PMAP_NULL,
8653 PAGE_SIZE,
8654 entry->vme_start,
8655 VM_PROT_NONE,
8656 PMAP_OPTIONS_REMOVE);
8657 } else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8658 (state & VMDS_KERNEL_PMAP)) {
8659 /* Remove translations associated
8660 * with this range unless the entry
8661 * does not have an object, or
8662 * it's the kernel map or a descendant
8663 * since the platform could potentially
8664 * create "backdoor" mappings invisible
8665 * to the VM. It is expected that
8666 * objectless, non-kernel ranges
8667 * do not have such VM invisible
8668 * translations.
8669 */
8670 pmap_remove_options(map->pmap,
8671 (addr64_t)entry->vme_start,
8672 (addr64_t)entry->vme_end,
8673 PMAP_OPTIONS_REMOVE);
8674 }
8675
8676 #if DEBUG
8677 /*
8678 * All pmap mappings for this map entry must have been
8679 * cleared by now.
8680 */
8681 assert(pmap_is_empty(map->pmap,
8682 entry->vme_start,
8683 entry->vme_end));
8684 #endif /* DEBUG */
8685
8686 if (entry->iokit_acct) {
8687 /* alternate accounting */
8688 DTRACE_VM4(vm_map_iokit_unmapped_region,
8689 vm_map_t, map,
8690 vm_map_offset_t, entry->vme_start,
8691 vm_map_offset_t, entry->vme_end,
8692 int, VME_ALIAS(entry));
8693 vm_map_iokit_unmapped_region(map,
8694 (entry->vme_end -
8695 entry->vme_start));
8696 entry->iokit_acct = FALSE;
8697 entry->use_pmap = FALSE;
8698 }
8699
8700 /* move "s" forward */
8701 s = entry->vme_end;
8702 next = entry->vme_next;
8703 if (!entry->map_aligned) {
8704 vm_map_offset_t rounded_s;
8705
8706 /*
8707 * Skip artificial gap due to mis-aligned entry
8708 * on devices with a page size smaller than the
8709 * map's page size (i.e. 16k task on a 4k device).
8710 */
8711 rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8712 if (next == vm_map_to_entry(map)) {
8713 s = rounded_s;
8714 } else if (s < rounded_s) {
8715 s = MIN(rounded_s, next->vme_start);
8716 }
8717 }
8718 ret.kmr_size += s - entry->vme_start;
8719
8720 if (entry->vme_permanent) {
8721 /*
8722 * A permanent entry can not be removed, so leave it
8723 * in place but remove all access permissions.
8724 */
8725 if (!entry->csm_associated) {
8726 printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8727 __FUNCTION__, __LINE__,
8728 proc_selfpid(),
8729 (get_bsdtask_info(current_task())
8730 ? proc_name_address(get_bsdtask_info(current_task()))
8731 : "?"),
8732 map,
8733 entry,
8734 (uint64_t)entry->vme_start,
8735 (uint64_t)entry->vme_end,
8736 entry->is_sub_map,
8737 entry->protection,
8738 entry->max_protection);
8739 }
8740 DTRACE_VM6(vm_map_delete_permanent_prot_none,
8741 vm_map_entry_t, entry,
8742 vm_map_offset_t, entry->vme_start,
8743 vm_map_offset_t, entry->vme_end,
8744 vm_prot_t, entry->protection,
8745 vm_prot_t, entry->max_protection,
8746 int, VME_ALIAS(entry));
8747 entry->protection = VM_PROT_NONE;
8748 entry->max_protection = VM_PROT_NONE;
8749 } else {
8750 vm_map_entry_zap(map, entry, zap_list);
8751 }
8752
8753 entry = next;
8754 next = VM_MAP_ENTRY_NULL;
8755
8756 if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8757 unsigned int last_timestamp = map->timestamp++;
8758
8759 if (lck_rw_lock_yield_exclusive(&map->lock,
8760 LCK_RW_YIELD_ANY_WAITER)) {
8761 if (last_timestamp != map->timestamp + 1) {
8762 state |= VMDS_NEEDS_LOOKUP;
8763 }
8764 } else {
8765 /* we didn't yield, undo our change */
8766 map->timestamp--;
8767 }
8768 }
8769 }
8770
8771 if (map->wait_for_space) {
8772 thread_wakeup((event_t) map);
8773 }
8774
8775 if (state & VMDS_NEEDS_WAKEUP) {
8776 vm_map_entry_wakeup(map);
8777 }
8778
8779 out:
8780 if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
8781 __vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8782 }
8783
8784 if (state & VMDS_KERNEL_KMEMPTR) {
8785 kmem_free_space(start, end, range_id, &slot);
8786 }
8787
8788 if (state & VMDS_FOUND_GAP) {
8789 DTRACE_VM3(kern_vm_deallocate_gap,
8790 vm_map_offset_t, gap_start,
8791 vm_map_offset_t, save_start,
8792 vm_map_offset_t, save_end);
8793 if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
8794 ret.kmr_return = KERN_INVALID_VALUE;
8795 } else {
8796 vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
8797 }
8798 }
8799
8800 return ret;
8801 }
8802
8803 kmem_return_t
vm_map_remove_and_unlock(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8804 vm_map_remove_and_unlock(
8805 vm_map_t map,
8806 vm_map_offset_t start,
8807 vm_map_offset_t end,
8808 vmr_flags_t flags,
8809 kmem_guard_t guard)
8810 {
8811 kmem_return_t ret;
8812 VM_MAP_ZAP_DECLARE(zap);
8813
8814 ret = vm_map_delete(map, start, end, flags, guard, &zap);
8815 vm_map_unlock(map);
8816
8817 vm_map_zap_dispose(&zap);
8818
8819 return ret;
8820 }
8821
8822 /*
8823 * vm_map_remove_guard:
8824 *
8825 * Remove the given address range from the target map.
8826 * This is the exported form of vm_map_delete.
8827 */
8828 kmem_return_t
vm_map_remove_guard(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8829 vm_map_remove_guard(
8830 vm_map_t map,
8831 vm_map_offset_t start,
8832 vm_map_offset_t end,
8833 vmr_flags_t flags,
8834 kmem_guard_t guard)
8835 {
8836 vm_map_lock(map);
8837 return vm_map_remove_and_unlock(map, start, end, flags, guard);
8838 }
8839
8840 /*
8841 * vm_map_terminate:
8842 *
8843 * Clean out a task's map.
8844 */
kern_return_t
vm_map_terminate(
	vm_map_t        map)
{
	vm_map_lock(map);
	/*
	 * Mark the map as terminated while the lock is held so that
	 * concurrent operations (e.g. vm_map_delete) can relax their
	 * handling of "permanent" entries and nested pmaps.
	 */
	map->terminated = TRUE;
	/* hole-list maintenance is pointless for a dying map */
	vm_map_disable_hole_optimization(map);
	/*
	 * Remove everything; vm_map_remove_and_unlock() drops the
	 * map lock.  The result is ignored: this is final cleanup.
	 */
	(void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
	return KERN_SUCCESS;
}
8856
8857 /*
8858 * Routine: vm_map_copy_allocate
8859 *
8860 * Description:
8861 * Allocates and initializes a map copy object.
8862 */
8863 static vm_map_copy_t
vm_map_copy_allocate(uint16_t type)8864 vm_map_copy_allocate(uint16_t type)
8865 {
8866 vm_map_copy_t new_copy;
8867
8868 new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
8869 new_copy->type = type;
8870 if (type == VM_MAP_COPY_ENTRY_LIST) {
8871 new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
8872 vm_map_store_init(&new_copy->cpy_hdr);
8873 }
8874 return new_copy;
8875 }
8876
8877 /*
8878 * Routine: vm_map_copy_discard
8879 *
8880 * Description:
8881 * Dispose of a map copy object (returned by
8882 * vm_map_copyin).
8883 */
8884 void
vm_map_copy_discard(vm_map_copy_t copy)8885 vm_map_copy_discard(
8886 vm_map_copy_t copy)
8887 {
8888 if (copy == VM_MAP_COPY_NULL) {
8889 return;
8890 }
8891
8892 /*
8893 * Assert that the vm_map_copy is coming from the right
8894 * zone and hasn't been forged
8895 */
8896 vm_map_copy_require(copy);
8897
8898 switch (copy->type) {
8899 case VM_MAP_COPY_ENTRY_LIST:
8900 while (vm_map_copy_first_entry(copy) !=
8901 vm_map_copy_to_entry(copy)) {
8902 vm_map_entry_t entry = vm_map_copy_first_entry(copy);
8903
8904 vm_map_copy_entry_unlink(copy, entry);
8905 if (entry->is_sub_map) {
8906 vm_map_deallocate(VME_SUBMAP(entry));
8907 } else {
8908 vm_object_deallocate(VME_OBJECT(entry));
8909 }
8910 vm_map_copy_entry_dispose(entry);
8911 }
8912 break;
8913 case VM_MAP_COPY_KERNEL_BUFFER:
8914
8915 /*
8916 * The vm_map_copy_t and possibly the data buffer were
8917 * allocated by a single call to kalloc_data(), i.e. the
8918 * vm_map_copy_t was not allocated out of the zone.
8919 */
8920 if (copy->size > msg_ool_size_small || copy->offset) {
8921 panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
8922 (long long)copy->size, (long long)copy->offset);
8923 }
8924 kfree_data(copy->cpy_kdata, copy->size);
8925 }
8926 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
8927 }
8928
8929 #if XNU_PLATFORM_MacOSX
8930
8931 /*
8932 * Routine: vm_map_copy_copy
8933 *
8934 * Description:
8935 * Move the information in a map copy object to
8936 * a new map copy object, leaving the old one
8937 * empty.
8938 *
8939 * This is used by kernel routines that need
8940 * to look at out-of-line data (in copyin form)
8941 * before deciding whether to return SUCCESS.
8942 * If the routine returns FAILURE, the original
8943 * copy object will be deallocated; therefore,
8944 * these routines must make a copy of the copy
8945 * object and leave the original empty so that
8946 * deallocation will not fail.
8947 */
vm_map_copy_t
vm_map_copy_copy(
	vm_map_copy_t   copy)
{
	vm_map_copy_t   new_copy;

	if (copy == VM_MAP_COPY_NULL) {
		return VM_MAP_COPY_NULL;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/*
	 * Allocate a new copy object, and copy the information
	 * from the old one into it.
	 */

	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
#if __has_feature(ptrauth_calls)
	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
		/*
		 * cpy_kdata is a signed pointer: the raw memcpy above
		 * copied a signature tied to the old storage location,
		 * so re-assign it through the compiler to re-sign it
		 * for its new address.
		 */
		new_copy->cpy_kdata = copy->cpy_kdata;
	}
#endif

	if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
		/*
		 * The links in the entry chain must be
		 * changed to point to the new copy object.
		 */
		vm_map_copy_first_entry(copy)->vme_prev
		        = vm_map_copy_to_entry(new_copy);
		vm_map_copy_last_entry(copy)->vme_next
		        = vm_map_copy_to_entry(new_copy);
	}

	/*
	 * Change the old copy object into one that contains
	 * nothing to be deallocated: a zero-sized KERNEL_BUFFER
	 * copy, so a later vm_map_copy_discard() on the original
	 * is a harmless no-op (kfree_data of NULL/0).
	 */
	bzero(copy, sizeof(struct vm_map_copy));
	copy->type = VM_MAP_COPY_KERNEL_BUFFER;

	/*
	 * Return the new object.
	 */
	return new_copy;
}
9000
9001 #endif /* XNU_PLATFORM_MacOSX */
9002
9003 static boolean_t
vm_map_entry_is_overwritable(vm_map_t dst_map __unused,vm_map_entry_t entry)9004 vm_map_entry_is_overwritable(
9005 vm_map_t dst_map __unused,
9006 vm_map_entry_t entry)
9007 {
9008 if (!(entry->protection & VM_PROT_WRITE)) {
9009 /* can't overwrite if not writable */
9010 return FALSE;
9011 }
9012 #if !__x86_64__
9013 if (entry->used_for_jit &&
9014 vm_map_cs_enforcement(dst_map) &&
9015 !dst_map->cs_debugged) {
9016 /*
9017 * Can't overwrite a JIT region while cs_enforced
9018 * and not cs_debugged.
9019 */
9020 return FALSE;
9021 }
9022
9023 #if __arm64e__
9024 /* Do not allow overwrite HW assisted TPRO entries */
9025 if (entry->used_for_tpro) {
9026 return FALSE;
9027 }
9028 #endif /* __arm64e__ */
9029
9030 if (entry->vme_permanent) {
9031 if (entry->is_sub_map) {
9032 /*
9033 * We can't tell if the submap contains "permanent"
9034 * entries within the range targeted by the caller.
9035 * The caller will have to check for that with
9036 * vm_map_overwrite_submap_recurse() for example.
9037 */
9038 } else {
9039 /*
9040 * Do not allow overwriting of a "permanent"
9041 * entry.
9042 */
9043 DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
9044 vm_map_entry_t, entry,
9045 vm_map_offset_t, entry->vme_start,
9046 vm_map_offset_t, entry->vme_end,
9047 vm_prot_t, entry->protection,
9048 vm_prot_t, entry->max_protection,
9049 int, VME_ALIAS(entry));
9050 return FALSE;
9051 }
9052 }
9053 #endif /* !__x86_64__ */
9054 return TRUE;
9055 }
9056
/*
 * Verify that [dst_addr, dst_addr + dst_size) in "dst_map" can be
 * overwritten: every entry in the range must be writable, pass
 * vm_map_entry_is_overwritable(), and the range must be contiguous.
 * Submap entries are validated by recursing into the submap.
 * Called with "dst_map" unlocked; returns with it unlocked.
 */
static kern_return_t
vm_map_overwrite_submap_recurse(
	vm_map_t        dst_map,
	vm_map_offset_t dst_addr,
	vm_map_size_t   dst_size)
{
	vm_map_offset_t dst_end;
	vm_map_entry_t  tmp_entry;
	vm_map_entry_t  entry;
	kern_return_t   result;
	boolean_t       encountered_sub_map = FALSE;



	/*
	 * Verify that the destination is all writeable
	 * initially.  We have to trunc the destination
	 * address and round the copy size or we'll end up
	 * splitting entries in strange ways.
	 */

	dst_end = vm_map_round_page(dst_addr + dst_size,
	    VM_MAP_PAGE_MASK(dst_map));
	vm_map_lock(dst_map);

start_pass_1:
	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
		vm_map_unlock(dst_map);
		return KERN_INVALID_ADDRESS;
	}

	vm_map_clip_start(dst_map,
	    tmp_entry,
	    vm_map_trunc_page(dst_addr,
	    VM_MAP_PAGE_MASK(dst_map)));
	if (tmp_entry->is_sub_map) {
		/* clipping did unnest if needed */
		assert(!tmp_entry->use_pmap);
	}

	for (entry = tmp_entry;;) {
		vm_map_entry_t  next;

		next = entry->vme_next;
		while (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;

			if (entry->in_transition) {
				/*
				 * Say that we are waiting, and wait for entry.
				 * The wait drops the map lock, so anything may
				 * have changed: restart the whole pass.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);

				goto start_pass_1;
			}

			encountered_sub_map = TRUE;
			sub_start = VME_OFFSET(entry);

			/* clamp the sub-range to dst_end, then rebase it
			 * into the submap's address space via VME_OFFSET */
			if (entry->vme_end < dst_end) {
				sub_end = entry->vme_end;
			} else {
				sub_end = dst_end;
			}
			sub_end -= entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			vm_map_unlock(dst_map);

			result = vm_map_overwrite_submap_recurse(
				VME_SUBMAP(entry),
				sub_start,
				sub_end - sub_start);

			/* dst_map is unlocked on both failure returns below */
			if (result != KERN_SUCCESS) {
				return result;
			}
			/*
			 * NOTE(review): "entry" is examined here without the
			 * map lock held (dropped before the recursion above);
			 * presumably safe because entry boundaries are stable
			 * enough for this check -- TODO confirm.
			 */
			if (dst_end <= entry->vme_end) {
				return KERN_SUCCESS;
			}
			vm_map_lock(dst_map);
			/* re-lookup past the submap entry after relocking */
			if (!vm_map_lookup_entry(dst_map, local_end,
			    &tmp_entry)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}
			entry = tmp_entry;
			next = entry->vme_next;
		}

		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * If the entry is in transition, we must wait
		 * for it to exit that state.  Anything could happen
		 * when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto start_pass_1;
		}

		/*
		 * our range is contained completely within this map entry
		 */
		if (dst_end <= entry->vme_end) {
			vm_map_unlock(dst_map);
			return KERN_SUCCESS;
		}
		/*
		 * check that range specified is contiguous region
		 */
		if ((next == vm_map_to_entry(dst_map)) ||
		    (next->vme_start != entry->vme_end)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/*
		 * Check for permanent objects in the destination.
		 * A non-internal or truly-shared object can't simply be
		 * replaced when submaps were encountered along the way.
		 */
		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
		    ((!VME_OBJECT(entry)->internal) ||
		    (VME_OBJECT(entry)->true_share))) {
			if (encountered_sub_map) {
				vm_map_unlock(dst_map);
				return KERN_FAILURE;
			}
		}


		entry = next;
	}/* for */
	/* not reached: the loop only exits via the returns above */
	vm_map_unlock(dst_map);
	return KERN_SUCCESS;
}
9209
9210 /*
9211 * Routine: vm_map_copy_overwrite
9212 *
9213 * Description:
9214 * Copy the memory described by the map copy
9215 * object (copy; returned by vm_map_copyin) onto
9216 * the specified destination region (dst_map, dst_addr).
9217 * The destination must be writeable.
9218 *
9219 * Unlike vm_map_copyout, this routine actually
9220 * writes over previously-mapped memory. If the
9221 * previous mapping was to a permanent (user-supplied)
9222 * memory object, it is preserved.
9223 *
9224 * The attributes (protection and inheritance) of the
9225 * destination region are preserved.
9226 *
9227 * If successful, consumes the copy object.
9228 * Otherwise, the caller is responsible for it.
9229 *
9230 * Implementation notes:
9231 * To overwrite aligned temporary virtual memory, it is
9232 * sufficient to remove the previous mapping and insert
9233 * the new copy. This replacement is done either on
9234 * the whole region (if no permanent virtual memory
9235 * objects are embedded in the destination region) or
9236 * in individual map entries.
9237 *
9238 * To overwrite permanent virtual memory , it is necessary
9239 * to copy each page, as the external memory management
9240 * interface currently does not provide any optimizations.
9241 *
9242 * Unaligned memory also has to be copied. It is possible
9243 * to use 'vm_trickery' to copy the aligned data. This is
9244 * not done but not hard to implement.
9245 *
9246 * Once a page of permanent memory has been overwritten,
9247 * it is impossible to interrupt this function; otherwise,
9248 * the call would be neither atomic nor location-independent.
9249 * The kernel-state portion of a user thread must be
9250 * interruptible.
9251 *
9252 * It may be expensive to forward all requests that might
9253 * overwrite permanent memory (vm_write, vm_copy) to
9254 * uninterruptible kernel threads. This routine may be
9255 * called by interruptible threads; however, success is
9256 * not guaranteed -- if the request cannot be performed
9257 * atomically and interruptibly, an error indication is
9258 * returned.
9259 *
9260 * Callers of this function must call vm_map_copy_require on
9261 * previously created vm_map_copy_t or pass a newly created
9262 * one to ensure that it hasn't been forged.
9263 */
9264
9265 static kern_return_t
vm_map_copy_overwrite_nested(vm_map_t dst_map,vm_map_address_t dst_addr,vm_map_copy_t copy,boolean_t interruptible,pmap_t pmap,boolean_t discard_on_success)9266 vm_map_copy_overwrite_nested(
9267 vm_map_t dst_map,
9268 vm_map_address_t dst_addr,
9269 vm_map_copy_t copy,
9270 boolean_t interruptible,
9271 pmap_t pmap,
9272 boolean_t discard_on_success)
9273 {
9274 vm_map_offset_t dst_end;
9275 vm_map_entry_t tmp_entry;
9276 vm_map_entry_t entry;
9277 kern_return_t kr;
9278 boolean_t aligned = TRUE;
9279 boolean_t contains_permanent_objects = FALSE;
9280 boolean_t encountered_sub_map = FALSE;
9281 vm_map_offset_t base_addr;
9282 vm_map_size_t copy_size;
9283 vm_map_size_t total_size;
9284 uint16_t copy_page_shift;
9285
9286 /*
9287 * Check for special kernel buffer allocated
9288 * by new_ipc_kmsg_copyin.
9289 */
9290
9291 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9292 return vm_map_copyout_kernel_buffer(
9293 dst_map, &dst_addr,
9294 copy, copy->size, TRUE, discard_on_success);
9295 }
9296
9297 /*
9298 * Only works for entry lists at the moment. Will
9299 * support page lists later.
9300 */
9301
9302 assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9303
9304 if (copy->size == 0) {
9305 if (discard_on_success) {
9306 vm_map_copy_discard(copy);
9307 }
9308 return KERN_SUCCESS;
9309 }
9310
9311 copy_page_shift = copy->cpy_hdr.page_shift;
9312
9313 /*
9314 * Verify that the destination is all writeable
9315 * initially. We have to trunc the destination
9316 * address and round the copy size or we'll end up
9317 * splitting entries in strange ways.
9318 */
9319
9320 if (!VM_MAP_PAGE_ALIGNED(copy->size,
9321 VM_MAP_PAGE_MASK(dst_map)) ||
9322 !VM_MAP_PAGE_ALIGNED(copy->offset,
9323 VM_MAP_PAGE_MASK(dst_map)) ||
9324 !VM_MAP_PAGE_ALIGNED(dst_addr,
9325 VM_MAP_PAGE_MASK(dst_map)) ||
9326 copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9327 aligned = FALSE;
9328 dst_end = vm_map_round_page(dst_addr + copy->size,
9329 VM_MAP_PAGE_MASK(dst_map));
9330 } else {
9331 dst_end = dst_addr + copy->size;
9332 }
9333
9334 vm_map_lock(dst_map);
9335
9336 /* LP64todo - remove this check when vm_map_commpage64()
9337 * no longer has to stuff in a map_entry for the commpage
9338 * above the map's max_offset.
9339 */
9340 if (dst_addr >= dst_map->max_offset) {
9341 vm_map_unlock(dst_map);
9342 return KERN_INVALID_ADDRESS;
9343 }
9344
9345 start_pass_1:
9346 if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9347 vm_map_unlock(dst_map);
9348 return KERN_INVALID_ADDRESS;
9349 }
9350 vm_map_clip_start(dst_map,
9351 tmp_entry,
9352 vm_map_trunc_page(dst_addr,
9353 VM_MAP_PAGE_MASK(dst_map)));
9354 for (entry = tmp_entry;;) {
9355 vm_map_entry_t next = entry->vme_next;
9356
9357 while (entry->is_sub_map) {
9358 vm_map_offset_t sub_start;
9359 vm_map_offset_t sub_end;
9360 vm_map_offset_t local_end;
9361
9362 if (entry->in_transition) {
9363 /*
9364 * Say that we are waiting, and wait for entry.
9365 */
9366 entry->needs_wakeup = TRUE;
9367 vm_map_entry_wait(dst_map, THREAD_UNINT);
9368
9369 goto start_pass_1;
9370 }
9371
9372 local_end = entry->vme_end;
9373 if (!(entry->needs_copy)) {
9374 /* if needs_copy we are a COW submap */
9375 /* in such a case we just replace so */
9376 /* there is no need for the follow- */
9377 /* ing check. */
9378 encountered_sub_map = TRUE;
9379 sub_start = VME_OFFSET(entry);
9380
9381 if (entry->vme_end < dst_end) {
9382 sub_end = entry->vme_end;
9383 } else {
9384 sub_end = dst_end;
9385 }
9386 sub_end -= entry->vme_start;
9387 sub_end += VME_OFFSET(entry);
9388 vm_map_unlock(dst_map);
9389
9390 kr = vm_map_overwrite_submap_recurse(
9391 VME_SUBMAP(entry),
9392 sub_start,
9393 sub_end - sub_start);
9394 if (kr != KERN_SUCCESS) {
9395 return kr;
9396 }
9397 vm_map_lock(dst_map);
9398 }
9399
9400 if (dst_end <= entry->vme_end) {
9401 goto start_overwrite;
9402 }
9403 if (!vm_map_lookup_entry(dst_map, local_end,
9404 &entry)) {
9405 vm_map_unlock(dst_map);
9406 return KERN_INVALID_ADDRESS;
9407 }
9408 next = entry->vme_next;
9409 }
9410
9411 if (!(entry->protection & VM_PROT_WRITE)) {
9412 vm_map_unlock(dst_map);
9413 return KERN_PROTECTION_FAILURE;
9414 }
9415
9416 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9417 vm_map_unlock(dst_map);
9418 return KERN_PROTECTION_FAILURE;
9419 }
9420
9421 /*
9422 * If the entry is in transition, we must wait
9423 * for it to exit that state. Anything could happen
9424 * when we unlock the map, so start over.
9425 */
9426 if (entry->in_transition) {
9427 /*
9428 * Say that we are waiting, and wait for entry.
9429 */
9430 entry->needs_wakeup = TRUE;
9431 vm_map_entry_wait(dst_map, THREAD_UNINT);
9432
9433 goto start_pass_1;
9434 }
9435
9436 /*
9437 * our range is contained completely within this map entry
9438 */
9439 if (dst_end <= entry->vme_end) {
9440 break;
9441 }
9442 /*
9443 * check that range specified is contiguous region
9444 */
9445 if ((next == vm_map_to_entry(dst_map)) ||
9446 (next->vme_start != entry->vme_end)) {
9447 vm_map_unlock(dst_map);
9448 return KERN_INVALID_ADDRESS;
9449 }
9450
9451
9452 /*
9453 * Check for permanent objects in the destination.
9454 */
9455 if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9456 ((!VME_OBJECT(entry)->internal) ||
9457 (VME_OBJECT(entry)->true_share))) {
9458 contains_permanent_objects = TRUE;
9459 }
9460
9461 entry = next;
9462 }/* for */
9463
9464 start_overwrite:
9465 /*
9466 * If there are permanent objects in the destination, then
9467 * the copy cannot be interrupted.
9468 */
9469
9470 if (interruptible && contains_permanent_objects) {
9471 vm_map_unlock(dst_map);
9472 return KERN_FAILURE; /* XXX */
9473 }
9474
9475 /*
9476 *
9477 * Make a second pass, overwriting the data
9478 * At the beginning of each loop iteration,
9479 * the next entry to be overwritten is "tmp_entry"
9480 * (initially, the value returned from the lookup above),
9481 * and the starting address expected in that entry
9482 * is "start".
9483 */
9484
9485 total_size = copy->size;
9486 if (encountered_sub_map) {
9487 copy_size = 0;
9488 /* re-calculate tmp_entry since we've had the map */
9489 /* unlocked */
9490 if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9491 vm_map_unlock(dst_map);
9492 return KERN_INVALID_ADDRESS;
9493 }
9494 } else {
9495 copy_size = copy->size;
9496 }
9497
9498 base_addr = dst_addr;
9499 while (TRUE) {
9500 /* deconstruct the copy object and do in parts */
		/* only in sub_map, interruptible case */
9502 vm_map_entry_t copy_entry;
9503 vm_map_entry_t previous_prev = VM_MAP_ENTRY_NULL;
9504 vm_map_entry_t next_copy = VM_MAP_ENTRY_NULL;
9505 int nentries;
9506 int remaining_entries = 0;
9507 vm_map_offset_t new_offset = 0;
9508
9509 for (entry = tmp_entry; copy_size == 0;) {
9510 vm_map_entry_t next;
9511
9512 next = entry->vme_next;
9513
9514 /* tmp_entry and base address are moved along */
9515 /* each time we encounter a sub-map. Otherwise */
			/* entry can outpace tmp_entry, and the copy_size */
9517 /* may reflect the distance between them */
9518 /* if the current entry is found to be in transition */
9519 /* we will start over at the beginning or the last */
9520 /* encounter of a submap as dictated by base_addr */
9521 /* we will zero copy_size accordingly. */
9522 if (entry->in_transition) {
9523 /*
9524 * Say that we are waiting, and wait for entry.
9525 */
9526 entry->needs_wakeup = TRUE;
9527 vm_map_entry_wait(dst_map, THREAD_UNINT);
9528
9529 if (!vm_map_lookup_entry(dst_map, base_addr,
9530 &tmp_entry)) {
9531 vm_map_unlock(dst_map);
9532 return KERN_INVALID_ADDRESS;
9533 }
9534 copy_size = 0;
9535 entry = tmp_entry;
9536 continue;
9537 }
9538 if (entry->is_sub_map) {
9539 vm_map_offset_t sub_start;
9540 vm_map_offset_t sub_end;
9541 vm_map_offset_t local_end;
9542
9543 if (entry->needs_copy) {
9544 /* if this is a COW submap */
9545 /* just back the range with a */
9546 /* anonymous entry */
9547 assert(!entry->vme_permanent);
9548 if (entry->vme_end < dst_end) {
9549 sub_end = entry->vme_end;
9550 } else {
9551 sub_end = dst_end;
9552 }
9553 if (entry->vme_start < base_addr) {
9554 sub_start = base_addr;
9555 } else {
9556 sub_start = entry->vme_start;
9557 }
9558 vm_map_clip_end(
9559 dst_map, entry, sub_end);
9560 vm_map_clip_start(
9561 dst_map, entry, sub_start);
9562 assert(!entry->use_pmap);
9563 assert(!entry->iokit_acct);
9564 entry->use_pmap = TRUE;
9565 vm_map_deallocate(VME_SUBMAP(entry));
9566 assert(!entry->vme_permanent);
9567 VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9568 VME_OFFSET_SET(entry, 0);
9569 entry->is_shared = FALSE;
9570 entry->needs_copy = FALSE;
9571 entry->protection = VM_PROT_DEFAULT;
9572 entry->max_protection = VM_PROT_ALL;
9573 entry->wired_count = 0;
9574 entry->user_wired_count = 0;
9575 if (entry->inheritance
9576 == VM_INHERIT_SHARE) {
9577 entry->inheritance = VM_INHERIT_COPY;
9578 }
9579 continue;
9580 }
9581 /* first take care of any non-sub_map */
9582 /* entries to send */
9583 if (base_addr < entry->vme_start) {
9584 /* stuff to send */
9585 copy_size =
9586 entry->vme_start - base_addr;
9587 break;
9588 }
9589 sub_start = VME_OFFSET(entry);
9590
9591 if (entry->vme_end < dst_end) {
9592 sub_end = entry->vme_end;
9593 } else {
9594 sub_end = dst_end;
9595 }
9596 sub_end -= entry->vme_start;
9597 sub_end += VME_OFFSET(entry);
9598 local_end = entry->vme_end;
9599 vm_map_unlock(dst_map);
9600 copy_size = sub_end - sub_start;
9601
9602 /* adjust the copy object */
9603 if (total_size > copy_size) {
9604 vm_map_size_t local_size = 0;
9605 vm_map_size_t entry_size;
9606
9607 nentries = 1;
9608 new_offset = copy->offset;
9609 copy_entry = vm_map_copy_first_entry(copy);
9610 while (copy_entry !=
9611 vm_map_copy_to_entry(copy)) {
9612 entry_size = copy_entry->vme_end -
9613 copy_entry->vme_start;
9614 if ((local_size < copy_size) &&
9615 ((local_size + entry_size)
9616 >= copy_size)) {
9617 vm_map_copy_clip_end(copy,
9618 copy_entry,
9619 copy_entry->vme_start +
9620 (copy_size - local_size));
9621 entry_size = copy_entry->vme_end -
9622 copy_entry->vme_start;
9623 local_size += entry_size;
9624 new_offset += entry_size;
9625 }
9626 if (local_size >= copy_size) {
9627 next_copy = copy_entry->vme_next;
9628 copy_entry->vme_next =
9629 vm_map_copy_to_entry(copy);
9630 previous_prev =
9631 copy->cpy_hdr.links.prev;
9632 copy->cpy_hdr.links.prev = copy_entry;
9633 copy->size = copy_size;
9634 remaining_entries =
9635 copy->cpy_hdr.nentries;
9636 remaining_entries -= nentries;
9637 copy->cpy_hdr.nentries = nentries;
9638 break;
9639 } else {
9640 local_size += entry_size;
9641 new_offset += entry_size;
9642 nentries++;
9643 }
9644 copy_entry = copy_entry->vme_next;
9645 }
9646 }
9647
9648 if ((entry->use_pmap) && (pmap == NULL)) {
9649 kr = vm_map_copy_overwrite_nested(
9650 VME_SUBMAP(entry),
9651 sub_start,
9652 copy,
9653 interruptible,
9654 VME_SUBMAP(entry)->pmap,
9655 TRUE);
9656 } else if (pmap != NULL) {
9657 kr = vm_map_copy_overwrite_nested(
9658 VME_SUBMAP(entry),
9659 sub_start,
9660 copy,
9661 interruptible, pmap,
9662 TRUE);
9663 } else {
9664 kr = vm_map_copy_overwrite_nested(
9665 VME_SUBMAP(entry),
9666 sub_start,
9667 copy,
9668 interruptible,
9669 dst_map->pmap,
9670 TRUE);
9671 }
9672 if (kr != KERN_SUCCESS) {
9673 if (next_copy != NULL) {
9674 copy->cpy_hdr.nentries +=
9675 remaining_entries;
9676 copy->cpy_hdr.links.prev->vme_next =
9677 next_copy;
9678 copy->cpy_hdr.links.prev
9679 = previous_prev;
9680 copy->size = total_size;
9681 }
9682 return kr;
9683 }
9684 if (dst_end <= local_end) {
9685 return KERN_SUCCESS;
9686 }
9687 /* otherwise copy no longer exists, it was */
9688 /* destroyed after successful copy_overwrite */
9689 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9690 copy->offset = new_offset;
9691 copy->cpy_hdr.page_shift = copy_page_shift;
9692
9693 total_size -= copy_size;
9694 copy_size = 0;
9695 /* put back remainder of copy in container */
9696 if (next_copy != NULL) {
9697 copy->cpy_hdr.nentries = remaining_entries;
9698 copy->cpy_hdr.links.next = next_copy;
9699 copy->cpy_hdr.links.prev = previous_prev;
9700 copy->size = total_size;
9701 next_copy->vme_prev =
9702 vm_map_copy_to_entry(copy);
9703 next_copy = NULL;
9704 }
9705 base_addr = local_end;
9706 vm_map_lock(dst_map);
9707 if (!vm_map_lookup_entry(dst_map,
9708 local_end, &tmp_entry)) {
9709 vm_map_unlock(dst_map);
9710 return KERN_INVALID_ADDRESS;
9711 }
9712 entry = tmp_entry;
9713 continue;
9714 }
9715 if (dst_end <= entry->vme_end) {
9716 copy_size = dst_end - base_addr;
9717 break;
9718 }
9719
9720 if ((next == vm_map_to_entry(dst_map)) ||
9721 (next->vme_start != entry->vme_end)) {
9722 vm_map_unlock(dst_map);
9723 return KERN_INVALID_ADDRESS;
9724 }
9725
9726 entry = next;
9727 }/* for */
9728
9729 next_copy = NULL;
9730 nentries = 1;
9731
9732 /* adjust the copy object */
9733 if (total_size > copy_size) {
9734 vm_map_size_t local_size = 0;
9735 vm_map_size_t entry_size;
9736
9737 new_offset = copy->offset;
9738 copy_entry = vm_map_copy_first_entry(copy);
9739 while (copy_entry != vm_map_copy_to_entry(copy)) {
9740 entry_size = copy_entry->vme_end -
9741 copy_entry->vme_start;
9742 if ((local_size < copy_size) &&
9743 ((local_size + entry_size)
9744 >= copy_size)) {
9745 vm_map_copy_clip_end(copy, copy_entry,
9746 copy_entry->vme_start +
9747 (copy_size - local_size));
9748 entry_size = copy_entry->vme_end -
9749 copy_entry->vme_start;
9750 local_size += entry_size;
9751 new_offset += entry_size;
9752 }
9753 if (local_size >= copy_size) {
9754 next_copy = copy_entry->vme_next;
9755 copy_entry->vme_next =
9756 vm_map_copy_to_entry(copy);
9757 previous_prev =
9758 copy->cpy_hdr.links.prev;
9759 copy->cpy_hdr.links.prev = copy_entry;
9760 copy->size = copy_size;
9761 remaining_entries =
9762 copy->cpy_hdr.nentries;
9763 remaining_entries -= nentries;
9764 copy->cpy_hdr.nentries = nentries;
9765 break;
9766 } else {
9767 local_size += entry_size;
9768 new_offset += entry_size;
9769 nentries++;
9770 }
9771 copy_entry = copy_entry->vme_next;
9772 }
9773 }
9774
9775 if (aligned) {
9776 pmap_t local_pmap;
9777
9778 if (pmap) {
9779 local_pmap = pmap;
9780 } else {
9781 local_pmap = dst_map->pmap;
9782 }
9783
9784 if ((kr = vm_map_copy_overwrite_aligned(
9785 dst_map, tmp_entry, copy,
9786 base_addr, local_pmap)) != KERN_SUCCESS) {
9787 if (next_copy != NULL) {
9788 copy->cpy_hdr.nentries +=
9789 remaining_entries;
9790 copy->cpy_hdr.links.prev->vme_next =
9791 next_copy;
9792 copy->cpy_hdr.links.prev =
9793 previous_prev;
9794 copy->size += copy_size;
9795 }
9796 return kr;
9797 }
9798 vm_map_unlock(dst_map);
9799 } else {
9800 /*
9801 * Performance gain:
9802 *
9803 * if the copy and dst address are misaligned but the same
9804 * offset within the page we can copy_not_aligned the
9805 * misaligned parts and copy aligned the rest. If they are
9806 * aligned but len is unaligned we simply need to copy
9807 * the end bit unaligned. We'll need to split the misaligned
9808 * bits of the region in this case !
9809 */
9810 /* ALWAYS UNLOCKS THE dst_map MAP */
9811 kr = vm_map_copy_overwrite_unaligned(
9812 dst_map,
9813 tmp_entry,
9814 copy,
9815 base_addr,
9816 discard_on_success);
9817 if (kr != KERN_SUCCESS) {
9818 if (next_copy != NULL) {
9819 copy->cpy_hdr.nentries +=
9820 remaining_entries;
9821 copy->cpy_hdr.links.prev->vme_next =
9822 next_copy;
9823 copy->cpy_hdr.links.prev =
9824 previous_prev;
9825 copy->size += copy_size;
9826 }
9827 return kr;
9828 }
9829 }
9830 total_size -= copy_size;
9831 if (total_size == 0) {
9832 break;
9833 }
9834 base_addr += copy_size;
9835 copy_size = 0;
9836 copy->offset = new_offset;
9837 if (next_copy != NULL) {
9838 copy->cpy_hdr.nentries = remaining_entries;
9839 copy->cpy_hdr.links.next = next_copy;
9840 copy->cpy_hdr.links.prev = previous_prev;
9841 next_copy->vme_prev = vm_map_copy_to_entry(copy);
9842 copy->size = total_size;
9843 }
9844 vm_map_lock(dst_map);
9845 while (TRUE) {
9846 if (!vm_map_lookup_entry(dst_map,
9847 base_addr, &tmp_entry)) {
9848 vm_map_unlock(dst_map);
9849 return KERN_INVALID_ADDRESS;
9850 }
9851 if (tmp_entry->in_transition) {
9852 entry->needs_wakeup = TRUE;
9853 vm_map_entry_wait(dst_map, THREAD_UNINT);
9854 } else {
9855 break;
9856 }
9857 }
9858 vm_map_clip_start(dst_map,
9859 tmp_entry,
9860 vm_map_trunc_page(base_addr,
9861 VM_MAP_PAGE_MASK(dst_map)));
9862
9863 entry = tmp_entry;
9864 } /* while */
9865
9866 /*
9867 * Throw away the vm_map_copy object
9868 */
9869 if (discard_on_success) {
9870 vm_map_copy_discard(copy);
9871 }
9872
9873 return KERN_SUCCESS;
9874 }/* vm_map_copy_overwrite */
9875
9876 kern_return_t
vm_map_copy_overwrite(vm_map_t dst_map,vm_map_offset_t dst_addr,vm_map_copy_t copy,vm_map_size_t copy_size,boolean_t interruptible)9877 vm_map_copy_overwrite(
9878 vm_map_t dst_map,
9879 vm_map_offset_t dst_addr,
9880 vm_map_copy_t copy,
9881 vm_map_size_t copy_size,
9882 boolean_t interruptible)
9883 {
9884 vm_map_size_t head_size, tail_size;
9885 vm_map_copy_t head_copy, tail_copy;
9886 vm_map_offset_t head_addr, tail_addr;
9887 vm_map_entry_t entry;
9888 kern_return_t kr;
9889 vm_map_offset_t effective_page_mask, effective_page_size;
9890 uint16_t copy_page_shift;
9891
9892 head_size = 0;
9893 tail_size = 0;
9894 head_copy = NULL;
9895 tail_copy = NULL;
9896 head_addr = 0;
9897 tail_addr = 0;
9898
9899 /*
9900 * Check for null copy object.
9901 */
9902 if (copy == VM_MAP_COPY_NULL) {
9903 return KERN_SUCCESS;
9904 }
9905
9906 /*
9907 * Assert that the vm_map_copy is coming from the right
9908 * zone and hasn't been forged
9909 */
9910 vm_map_copy_require(copy);
9911
9912 if (interruptible ||
9913 copy->type != VM_MAP_COPY_ENTRY_LIST) {
9914 /*
9915 * We can't split the "copy" map if we're interruptible
9916 * or if we don't have a "copy" map...
9917 */
9918 blunt_copy:
9919 return vm_map_copy_overwrite_nested(dst_map,
9920 dst_addr,
9921 copy,
9922 interruptible,
9923 (pmap_t) NULL,
9924 TRUE);
9925 }
9926
9927 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
9928 if (copy_page_shift < PAGE_SHIFT ||
9929 VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
9930 goto blunt_copy;
9931 }
9932
9933 if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
9934 effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
9935 } else {
9936 effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
9937 effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
9938 effective_page_mask);
9939 }
9940 effective_page_size = effective_page_mask + 1;
9941
9942 if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
9943 /*
9944 * Too small to bother with optimizing...
9945 */
9946 goto blunt_copy;
9947 }
9948
9949 if ((dst_addr & effective_page_mask) !=
9950 (copy->offset & effective_page_mask)) {
9951 /*
9952 * Incompatible mis-alignment of source and destination...
9953 */
9954 goto blunt_copy;
9955 }
9956
9957 /*
9958 * Proper alignment or identical mis-alignment at the beginning.
9959 * Let's try and do a small unaligned copy first (if needed)
9960 * and then an aligned copy for the rest.
9961 */
9962 if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
9963 head_addr = dst_addr;
9964 head_size = (effective_page_size -
9965 (copy->offset & effective_page_mask));
9966 head_size = MIN(head_size, copy_size);
9967 }
9968 if (!vm_map_page_aligned(copy->offset + copy_size,
9969 effective_page_mask)) {
9970 /*
9971 * Mis-alignment at the end.
9972 * Do an aligned copy up to the last page and
9973 * then an unaligned copy for the remaining bytes.
9974 */
9975 tail_size = ((copy->offset + copy_size) &
9976 effective_page_mask);
9977 tail_size = MIN(tail_size, copy_size);
9978 tail_addr = dst_addr + copy_size - tail_size;
9979 assert(tail_addr >= head_addr + head_size);
9980 }
9981 assert(head_size + tail_size <= copy_size);
9982
9983 if (head_size + tail_size == copy_size) {
9984 /*
9985 * It's all unaligned, no optimization possible...
9986 */
9987 goto blunt_copy;
9988 }
9989
9990 /*
9991 * Can't optimize if there are any submaps in the
9992 * destination due to the way we free the "copy" map
9993 * progressively in vm_map_copy_overwrite_nested()
9994 * in that case.
9995 */
9996 vm_map_lock_read(dst_map);
9997 if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
9998 vm_map_unlock_read(dst_map);
9999 goto blunt_copy;
10000 }
10001 for (;
10002 (entry != vm_map_to_entry(dst_map) &&
10003 entry->vme_start < dst_addr + copy_size);
10004 entry = entry->vme_next) {
10005 if (entry->is_sub_map) {
10006 vm_map_unlock_read(dst_map);
10007 goto blunt_copy;
10008 }
10009 }
10010 vm_map_unlock_read(dst_map);
10011
10012 if (head_size) {
10013 /*
10014 * Unaligned copy of the first "head_size" bytes, to reach
10015 * a page boundary.
10016 */
10017
10018 /*
10019 * Extract "head_copy" out of "copy".
10020 */
10021 head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10022 head_copy->cpy_hdr.entries_pageable =
10023 copy->cpy_hdr.entries_pageable;
10024 head_copy->cpy_hdr.page_shift = copy_page_shift;
10025
10026 entry = vm_map_copy_first_entry(copy);
10027 if (entry->vme_end < copy->offset + head_size) {
10028 head_size = entry->vme_end - copy->offset;
10029 }
10030
10031 head_copy->offset = copy->offset;
10032 head_copy->size = head_size;
10033 copy->offset += head_size;
10034 copy->size -= head_size;
10035 copy_size -= head_size;
10036 assert(copy_size > 0);
10037
10038 vm_map_copy_clip_end(copy, entry, copy->offset);
10039 vm_map_copy_entry_unlink(copy, entry);
10040 vm_map_copy_entry_link(head_copy,
10041 vm_map_copy_to_entry(head_copy),
10042 entry);
10043
10044 /*
10045 * Do the unaligned copy.
10046 */
10047 kr = vm_map_copy_overwrite_nested(dst_map,
10048 head_addr,
10049 head_copy,
10050 interruptible,
10051 (pmap_t) NULL,
10052 FALSE);
10053 if (kr != KERN_SUCCESS) {
10054 goto done;
10055 }
10056 }
10057
10058 if (tail_size) {
10059 /*
10060 * Extract "tail_copy" out of "copy".
10061 */
10062 tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10063 tail_copy->cpy_hdr.entries_pageable =
10064 copy->cpy_hdr.entries_pageable;
10065 tail_copy->cpy_hdr.page_shift = copy_page_shift;
10066
10067 tail_copy->offset = copy->offset + copy_size - tail_size;
10068 tail_copy->size = tail_size;
10069
10070 copy->size -= tail_size;
10071 copy_size -= tail_size;
10072 assert(copy_size > 0);
10073
10074 entry = vm_map_copy_last_entry(copy);
10075 vm_map_copy_clip_start(copy, entry, tail_copy->offset);
10076 entry = vm_map_copy_last_entry(copy);
10077 vm_map_copy_entry_unlink(copy, entry);
10078 vm_map_copy_entry_link(tail_copy,
10079 vm_map_copy_last_entry(tail_copy),
10080 entry);
10081 }
10082
10083 /*
10084 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
10085 * we want to avoid TOCTOU issues w.r.t copy->size but
10086 * we don't need to change vm_map_copy_overwrite_nested()
10087 * and all other vm_map_copy_overwrite variants.
10088 *
10089 * So we assign the original copy_size that was passed into
10090 * this routine back to copy.
10091 *
10092 * This use of local 'copy_size' passed into this routine is
10093 * to try and protect against TOCTOU attacks where the kernel
10094 * has been exploited. We don't expect this to be an issue
10095 * during normal system operation.
10096 */
10097 assertf(copy->size == copy_size,
10098 "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
10099 copy->size = copy_size;
10100
10101 /*
10102 * Copy most (or possibly all) of the data.
10103 */
10104 kr = vm_map_copy_overwrite_nested(dst_map,
10105 dst_addr + head_size,
10106 copy,
10107 interruptible,
10108 (pmap_t) NULL,
10109 FALSE);
10110 if (kr != KERN_SUCCESS) {
10111 goto done;
10112 }
10113
10114 if (tail_size) {
10115 kr = vm_map_copy_overwrite_nested(dst_map,
10116 tail_addr,
10117 tail_copy,
10118 interruptible,
10119 (pmap_t) NULL,
10120 FALSE);
10121 }
10122
10123 done:
10124 assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
10125 if (kr == KERN_SUCCESS) {
10126 /*
10127 * Discard all the copy maps.
10128 */
10129 if (head_copy) {
10130 vm_map_copy_discard(head_copy);
10131 head_copy = NULL;
10132 }
10133 vm_map_copy_discard(copy);
10134 if (tail_copy) {
10135 vm_map_copy_discard(tail_copy);
10136 tail_copy = NULL;
10137 }
10138 } else {
10139 /*
10140 * Re-assemble the original copy map.
10141 */
10142 if (head_copy) {
10143 entry = vm_map_copy_first_entry(head_copy);
10144 vm_map_copy_entry_unlink(head_copy, entry);
10145 vm_map_copy_entry_link(copy,
10146 vm_map_copy_to_entry(copy),
10147 entry);
10148 copy->offset -= head_size;
10149 copy->size += head_size;
10150 vm_map_copy_discard(head_copy);
10151 head_copy = NULL;
10152 }
10153 if (tail_copy) {
10154 entry = vm_map_copy_last_entry(tail_copy);
10155 vm_map_copy_entry_unlink(tail_copy, entry);
10156 vm_map_copy_entry_link(copy,
10157 vm_map_copy_last_entry(copy),
10158 entry);
10159 copy->size += tail_size;
10160 vm_map_copy_discard(tail_copy);
10161 tail_copy = NULL;
10162 }
10163 }
10164 return kr;
10165 }
10166
10167
10168 /*
10169 * Routine: vm_map_copy_overwrite_unaligned [internal use only]
10170 *
 * Description:
10172 * Physically copy unaligned data
10173 *
10174 * Implementation:
 *	Unaligned parts of pages have to be physically copied. We use
 *	a modified form of vm_fault_copy (which understands non-aligned
 *	page offsets and sizes) to do the copy. We attempt to copy as
 *	much memory in one go as possible, however vm_fault_copy copies
 *	within 1 memory object so we have to find the smaller of "amount left"
10180 * "source object data size" and "target object data size". With
10181 * unaligned data we don't need to split regions, therefore the source
10182 * (copy) object should be one map entry, the target range may be split
10183 * over multiple map entries however. In any event we are pessimistic
10184 * about these assumptions.
10185 *
10186 * Callers of this function must call vm_map_copy_require on
10187 * previously created vm_map_copy_t or pass a newly created
10188 * one to ensure that it hasn't been forged.
10189 *
10190 * Assumptions:
 *	dst_map is locked on entry and is returned locked on success,
 *	unlocked on error.
10193 */
10194
10195 static kern_return_t
vm_map_copy_overwrite_unaligned(vm_map_t dst_map,vm_map_entry_t entry,vm_map_copy_t copy,vm_map_offset_t start,boolean_t discard_on_success)10196 vm_map_copy_overwrite_unaligned(
10197 vm_map_t dst_map,
10198 vm_map_entry_t entry,
10199 vm_map_copy_t copy,
10200 vm_map_offset_t start,
10201 boolean_t discard_on_success)
10202 {
10203 vm_map_entry_t copy_entry;
10204 vm_map_entry_t copy_entry_next;
10205 vm_map_version_t version;
10206 vm_object_t dst_object;
10207 vm_object_offset_t dst_offset;
10208 vm_object_offset_t src_offset;
10209 vm_object_offset_t entry_offset;
10210 vm_map_offset_t entry_end;
10211 vm_map_size_t src_size,
10212 dst_size,
10213 copy_size,
10214 amount_left;
10215 kern_return_t kr = KERN_SUCCESS;
10216
10217
10218 copy_entry = vm_map_copy_first_entry(copy);
10219
10220 vm_map_lock_write_to_read(dst_map);
10221
10222 src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
10223 amount_left = copy->size;
10224 /*
10225 * unaligned so we never clipped this entry, we need the offset into
10226 * the vm_object not just the data.
10227 */
10228 while (amount_left > 0) {
10229 if (entry == vm_map_to_entry(dst_map)) {
10230 vm_map_unlock_read(dst_map);
10231 return KERN_INVALID_ADDRESS;
10232 }
10233
10234 /* "start" must be within the current map entry */
10235 assert((start >= entry->vme_start) && (start < entry->vme_end));
10236
10237 /*
10238 * Check protection again
10239 */
10240 if (!(entry->protection & VM_PROT_WRITE)) {
10241 vm_map_unlock_read(dst_map);
10242 return KERN_PROTECTION_FAILURE;
10243 }
10244 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10245 vm_map_unlock_read(dst_map);
10246 return KERN_PROTECTION_FAILURE;
10247 }
10248
10249 dst_offset = start - entry->vme_start;
10250
10251 dst_size = entry->vme_end - start;
10252
10253 src_size = copy_entry->vme_end -
10254 (copy_entry->vme_start + src_offset);
10255
10256 if (dst_size < src_size) {
10257 /*
10258 * we can only copy dst_size bytes before
10259 * we have to get the next destination entry
10260 */
10261 copy_size = dst_size;
10262 } else {
10263 /*
10264 * we can only copy src_size bytes before
10265 * we have to get the next source copy entry
10266 */
10267 copy_size = src_size;
10268 }
10269
10270 if (copy_size > amount_left) {
10271 copy_size = amount_left;
10272 }
10273 /*
10274 * Entry needs copy, create a shadow shadow object for
10275 * Copy on write region.
10276 */
10277 if (entry->needs_copy) {
10278 if (vm_map_lock_read_to_write(dst_map)) {
10279 vm_map_lock_read(dst_map);
10280 goto RetryLookup;
10281 }
10282 VME_OBJECT_SHADOW(entry,
10283 (vm_map_size_t)(entry->vme_end
10284 - entry->vme_start),
10285 vm_map_always_shadow(dst_map));
10286 entry->needs_copy = FALSE;
10287 vm_map_lock_write_to_read(dst_map);
10288 }
10289 dst_object = VME_OBJECT(entry);
10290 /*
10291 * unlike with the virtual (aligned) copy we're going
10292 * to fault on it therefore we need a target object.
10293 */
10294 if (dst_object == VM_OBJECT_NULL) {
10295 if (vm_map_lock_read_to_write(dst_map)) {
10296 vm_map_lock_read(dst_map);
10297 goto RetryLookup;
10298 }
10299 dst_object = vm_object_allocate((vm_map_size_t)
10300 entry->vme_end - entry->vme_start);
10301 VME_OBJECT_SET(entry, dst_object, false, 0);
10302 VME_OFFSET_SET(entry, 0);
10303 assert(entry->use_pmap);
10304 vm_map_lock_write_to_read(dst_map);
10305 }
10306 /*
10307 * Take an object reference and unlock map. The "entry" may
10308 * disappear or change when the map is unlocked.
10309 */
10310 vm_object_reference(dst_object);
10311 version.main_timestamp = dst_map->timestamp;
10312 entry_offset = VME_OFFSET(entry);
10313 entry_end = entry->vme_end;
10314 vm_map_unlock_read(dst_map);
10315 /*
10316 * Copy as much as possible in one pass
10317 */
10318 kr = vm_fault_copy(
10319 VME_OBJECT(copy_entry),
10320 VME_OFFSET(copy_entry) + src_offset,
10321 ©_size,
10322 dst_object,
10323 entry_offset + dst_offset,
10324 dst_map,
10325 &version,
10326 THREAD_UNINT );
10327
10328 start += copy_size;
10329 src_offset += copy_size;
10330 amount_left -= copy_size;
10331 /*
10332 * Release the object reference
10333 */
10334 vm_object_deallocate(dst_object);
10335 /*
10336 * If a hard error occurred, return it now
10337 */
10338 if (kr != KERN_SUCCESS) {
10339 return kr;
10340 }
10341
10342 if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
10343 || amount_left == 0) {
10344 /*
10345 * all done with this copy entry, dispose.
10346 */
10347 copy_entry_next = copy_entry->vme_next;
10348
10349 if (discard_on_success) {
10350 vm_map_copy_entry_unlink(copy, copy_entry);
10351 assert(!copy_entry->is_sub_map);
10352 vm_object_deallocate(VME_OBJECT(copy_entry));
10353 vm_map_copy_entry_dispose(copy_entry);
10354 }
10355
10356 if (copy_entry_next == vm_map_copy_to_entry(copy) &&
10357 amount_left) {
10358 /*
10359 * not finished copying but run out of source
10360 */
10361 return KERN_INVALID_ADDRESS;
10362 }
10363
10364 copy_entry = copy_entry_next;
10365
10366 src_offset = 0;
10367 }
10368
10369 if (amount_left == 0) {
10370 return KERN_SUCCESS;
10371 }
10372
10373 vm_map_lock_read(dst_map);
10374 if (version.main_timestamp == dst_map->timestamp) {
10375 if (start == entry_end) {
10376 /*
10377 * destination region is split. Use the version
10378 * information to avoid a lookup in the normal
10379 * case.
10380 */
10381 entry = entry->vme_next;
10382 /*
10383 * should be contiguous. Fail if we encounter
10384 * a hole in the destination.
10385 */
10386 if (start != entry->vme_start) {
10387 vm_map_unlock_read(dst_map);
10388 return KERN_INVALID_ADDRESS;
10389 }
10390 }
10391 } else {
10392 /*
10393 * Map version check failed.
10394 * we must lookup the entry because somebody
10395 * might have changed the map behind our backs.
10396 */
10397 RetryLookup:
10398 if (!vm_map_lookup_entry(dst_map, start, &entry)) {
10399 vm_map_unlock_read(dst_map);
10400 return KERN_INVALID_ADDRESS;
10401 }
10402 }
10403 }/* while */
10404
10405 return KERN_SUCCESS;
10406 }/* vm_map_copy_overwrite_unaligned */
10407
10408 /*
10409 * Routine: vm_map_copy_overwrite_aligned [internal use only]
10410 *
10411 * Description:
10412 * Does all the vm_trickery possible for whole pages.
10413 *
10414 * Implementation:
10415 *
10416 * If there are no permanent objects in the destination,
10417 * and the source and destination map entry zones match,
10418 * and the destination map entry is not shared,
10419 * then the map entries can be deleted and replaced
10420 * with those from the copy. The following code is the
10421 * basic idea of what to do, but there are lots of annoying
10422 * little details about getting protection and inheritance
10423 * right. Should add protection, inheritance, and sharing checks
10424 * to the above pass and make sure that no wiring is involved.
10425 *
10426 * Callers of this function must call vm_map_copy_require on
10427 * previously created vm_map_copy_t or pass a newly created
10428 * one to ensure that it hasn't been forged.
10429 */
10430
/*
 * Diagnostic counters for vm_map_copy_overwrite_aligned(): each counts
 * how often the optimized "replace the destination map entry" path was
 * abandoned in favor of the physical-copy "slow_copy" path, broken down
 * by the reason for bailing out:
 *  - src_not_internal: source mapping backed by an external
 *    (non-anonymous) object,
 *  - src_not_symmetric: source object is "true_share" or has a
 *    non-SYMMETRIC copy strategy,
 *  - src_large: small copy out of a very large source object (physical
 *    copy avoids keeping the whole object alive).
 */
int vm_map_copy_overwrite_aligned_src_not_internal = 0;
int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
int vm_map_copy_overwrite_aligned_src_large = 0;
10434
10435 static kern_return_t
vm_map_copy_overwrite_aligned(vm_map_t dst_map,vm_map_entry_t tmp_entry,vm_map_copy_t copy,vm_map_offset_t start,__unused pmap_t pmap)10436 vm_map_copy_overwrite_aligned(
10437 vm_map_t dst_map,
10438 vm_map_entry_t tmp_entry,
10439 vm_map_copy_t copy,
10440 vm_map_offset_t start,
10441 __unused pmap_t pmap)
10442 {
10443 vm_object_t object;
10444 vm_map_entry_t copy_entry;
10445 vm_map_size_t copy_size;
10446 vm_map_size_t size;
10447 vm_map_entry_t entry;
10448
10449 while ((copy_entry = vm_map_copy_first_entry(copy))
10450 != vm_map_copy_to_entry(copy)) {
10451 copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10452
10453 entry = tmp_entry;
10454 if (entry->is_sub_map) {
10455 /* unnested when clipped earlier */
10456 assert(!entry->use_pmap);
10457 }
10458 if (entry == vm_map_to_entry(dst_map)) {
10459 vm_map_unlock(dst_map);
10460 return KERN_INVALID_ADDRESS;
10461 }
10462 size = (entry->vme_end - entry->vme_start);
10463 /*
10464 * Make sure that no holes popped up in the
10465 * address map, and that the protection is
10466 * still valid, in case the map was unlocked
10467 * earlier.
10468 */
10469
10470 if ((entry->vme_start != start) || ((entry->is_sub_map)
10471 && !entry->needs_copy)) {
10472 vm_map_unlock(dst_map);
10473 return KERN_INVALID_ADDRESS;
10474 }
10475 assert(entry != vm_map_to_entry(dst_map));
10476
10477 /*
10478 * Check protection again
10479 */
10480
10481 if (!(entry->protection & VM_PROT_WRITE)) {
10482 vm_map_unlock(dst_map);
10483 return KERN_PROTECTION_FAILURE;
10484 }
10485
10486 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10487 vm_map_unlock(dst_map);
10488 return KERN_PROTECTION_FAILURE;
10489 }
10490
10491 /*
10492 * Adjust to source size first
10493 */
10494
10495 if (copy_size < size) {
10496 if (entry->map_aligned &&
10497 !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10498 VM_MAP_PAGE_MASK(dst_map))) {
10499 /* no longer map-aligned */
10500 entry->map_aligned = FALSE;
10501 }
10502 vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10503 size = copy_size;
10504 }
10505
10506 /*
10507 * Adjust to destination size
10508 */
10509
10510 if (size < copy_size) {
10511 vm_map_copy_clip_end(copy, copy_entry,
10512 copy_entry->vme_start + size);
10513 copy_size = size;
10514 }
10515
10516 assert((entry->vme_end - entry->vme_start) == size);
10517 assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10518 assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10519
10520 /*
10521 * If the destination contains temporary unshared memory,
10522 * we can perform the copy by throwing it away and
10523 * installing the source data.
10524 */
10525
10526 object = VME_OBJECT(entry);
10527 if ((!entry->is_shared &&
10528 ((object == VM_OBJECT_NULL) ||
10529 (object->internal && !object->true_share))) ||
10530 entry->needs_copy) {
10531 vm_object_t old_object = VME_OBJECT(entry);
10532 vm_object_offset_t old_offset = VME_OFFSET(entry);
10533 vm_object_offset_t offset;
10534
10535 /*
10536 * Ensure that the source and destination aren't
10537 * identical
10538 */
10539 if (old_object == VME_OBJECT(copy_entry) &&
10540 old_offset == VME_OFFSET(copy_entry)) {
10541 vm_map_copy_entry_unlink(copy, copy_entry);
10542 vm_map_copy_entry_dispose(copy_entry);
10543
10544 if (old_object != VM_OBJECT_NULL) {
10545 vm_object_deallocate(old_object);
10546 }
10547
10548 start = tmp_entry->vme_end;
10549 tmp_entry = tmp_entry->vme_next;
10550 continue;
10551 }
10552
10553 #if XNU_TARGET_OS_OSX
10554 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10555 #define __TRADEOFF1_COPY_SIZE (128 * 1024) /* 128 KB */
10556 if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10557 VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10558 copy_size <= __TRADEOFF1_COPY_SIZE) {
10559 /*
10560 * Virtual vs. Physical copy tradeoff #1.
10561 *
10562 * Copying only a few pages out of a large
10563 * object: do a physical copy instead of
10564 * a virtual copy, to avoid possibly keeping
10565 * the entire large object alive because of
10566 * those few copy-on-write pages.
10567 */
10568 vm_map_copy_overwrite_aligned_src_large++;
10569 goto slow_copy;
10570 }
10571 #endif /* XNU_TARGET_OS_OSX */
10572
10573 if ((dst_map->pmap != kernel_pmap) &&
10574 (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10575 (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10576 vm_object_t new_object, new_shadow;
10577
10578 /*
10579 * We're about to map something over a mapping
10580 * established by malloc()...
10581 */
10582 new_object = VME_OBJECT(copy_entry);
10583 if (new_object != VM_OBJECT_NULL) {
10584 vm_object_lock_shared(new_object);
10585 }
10586 while (new_object != VM_OBJECT_NULL &&
10587 #if XNU_TARGET_OS_OSX
10588 !new_object->true_share &&
10589 new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10590 #endif /* XNU_TARGET_OS_OSX */
10591 new_object->internal) {
10592 new_shadow = new_object->shadow;
10593 if (new_shadow == VM_OBJECT_NULL) {
10594 break;
10595 }
10596 vm_object_lock_shared(new_shadow);
10597 vm_object_unlock(new_object);
10598 new_object = new_shadow;
10599 }
10600 if (new_object != VM_OBJECT_NULL) {
10601 if (!new_object->internal) {
10602 /*
10603 * The new mapping is backed
10604 * by an external object. We
10605 * don't want malloc'ed memory
10606 * to be replaced with such a
10607 * non-anonymous mapping, so
10608 * let's go off the optimized
10609 * path...
10610 */
10611 vm_map_copy_overwrite_aligned_src_not_internal++;
10612 vm_object_unlock(new_object);
10613 goto slow_copy;
10614 }
10615 #if XNU_TARGET_OS_OSX
10616 if (new_object->true_share ||
10617 new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10618 /*
10619 * Same if there's a "true_share"
10620 * object in the shadow chain, or
10621 * an object with a non-default
10622 * (SYMMETRIC) copy strategy.
10623 */
10624 vm_map_copy_overwrite_aligned_src_not_symmetric++;
10625 vm_object_unlock(new_object);
10626 goto slow_copy;
10627 }
10628 #endif /* XNU_TARGET_OS_OSX */
10629 vm_object_unlock(new_object);
10630 }
10631 /*
10632 * The new mapping is still backed by
10633 * anonymous (internal) memory, so it's
10634 * OK to substitute it for the original
10635 * malloc() mapping.
10636 */
10637 }
10638
10639 if (old_object != VM_OBJECT_NULL) {
10640 assert(!entry->vme_permanent);
10641 if (entry->is_sub_map) {
10642 if (entry->use_pmap) {
10643 #ifndef NO_NESTED_PMAP
10644 pmap_unnest(dst_map->pmap,
10645 (addr64_t)entry->vme_start,
10646 entry->vme_end - entry->vme_start);
10647 #endif /* NO_NESTED_PMAP */
10648 if (dst_map->mapped_in_other_pmaps) {
10649 /* clean up parent */
10650 /* map/maps */
10651 vm_map_submap_pmap_clean(
10652 dst_map, entry->vme_start,
10653 entry->vme_end,
10654 VME_SUBMAP(entry),
10655 VME_OFFSET(entry));
10656 }
10657 } else {
10658 vm_map_submap_pmap_clean(
10659 dst_map, entry->vme_start,
10660 entry->vme_end,
10661 VME_SUBMAP(entry),
10662 VME_OFFSET(entry));
10663 }
10664 vm_map_deallocate(VME_SUBMAP(entry));
10665 } else {
10666 if (dst_map->mapped_in_other_pmaps) {
10667 vm_object_pmap_protect_options(
10668 VME_OBJECT(entry),
10669 VME_OFFSET(entry),
10670 entry->vme_end
10671 - entry->vme_start,
10672 PMAP_NULL,
10673 PAGE_SIZE,
10674 entry->vme_start,
10675 VM_PROT_NONE,
10676 PMAP_OPTIONS_REMOVE);
10677 } else {
10678 pmap_remove_options(
10679 dst_map->pmap,
10680 (addr64_t)(entry->vme_start),
10681 (addr64_t)(entry->vme_end),
10682 PMAP_OPTIONS_REMOVE);
10683 }
10684 vm_object_deallocate(old_object);
10685 }
10686 }
10687
10688 if (entry->iokit_acct) {
10689 /* keep using iokit accounting */
10690 entry->use_pmap = FALSE;
10691 } else {
10692 /* use pmap accounting */
10693 entry->use_pmap = TRUE;
10694 }
10695 assert(!entry->vme_permanent);
10696 VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
10697 object = VME_OBJECT(entry);
10698 entry->needs_copy = copy_entry->needs_copy;
10699 entry->wired_count = 0;
10700 entry->user_wired_count = 0;
10701 offset = VME_OFFSET(copy_entry);
10702 VME_OFFSET_SET(entry, offset);
10703
10704 vm_map_copy_entry_unlink(copy, copy_entry);
10705 vm_map_copy_entry_dispose(copy_entry);
10706
10707 /*
10708 * we could try to push pages into the pmap at this point, BUT
10709 * this optimization only saved on average 2 us per page if ALL
10710 * the pages in the source were currently mapped
10711 * and ALL the pages in the dest were touched, if there were fewer
10712 * than 2/3 of the pages touched, this optimization actually cost more cycles
10713 * it also puts a lot of pressure on the pmap layer w/r to mapping structures
10714 */
10715
10716 /*
10717 * Set up for the next iteration. The map
10718 * has not been unlocked, so the next
10719 * address should be at the end of this
10720 * entry, and the next map entry should be
10721 * the one following it.
10722 */
10723
10724 start = tmp_entry->vme_end;
10725 tmp_entry = tmp_entry->vme_next;
10726 } else {
10727 vm_map_version_t version;
10728 vm_object_t dst_object;
10729 vm_object_offset_t dst_offset;
10730 kern_return_t r;
10731
10732 slow_copy:
10733 if (entry->needs_copy) {
10734 VME_OBJECT_SHADOW(entry,
10735 (entry->vme_end -
10736 entry->vme_start),
10737 vm_map_always_shadow(dst_map));
10738 entry->needs_copy = FALSE;
10739 }
10740
10741 dst_object = VME_OBJECT(entry);
10742 dst_offset = VME_OFFSET(entry);
10743
10744 /*
10745 * Take an object reference, and record
10746 * the map version information so that the
10747 * map can be safely unlocked.
10748 */
10749
10750 if (dst_object == VM_OBJECT_NULL) {
10751 /*
10752 * We would usually have just taken the
10753 * optimized path above if the destination
10754 * object has not been allocated yet. But we
10755 * now disable that optimization if the copy
10756 * entry's object is not backed by anonymous
10757 * memory to avoid replacing malloc'ed
10758 * (i.e. re-usable) anonymous memory with a
10759 * not-so-anonymous mapping.
10760 * So we have to handle this case here and
10761 * allocate a new VM object for this map entry.
10762 */
10763 dst_object = vm_object_allocate(
10764 entry->vme_end - entry->vme_start);
10765 dst_offset = 0;
10766 VME_OBJECT_SET(entry, dst_object, false, 0);
10767 VME_OFFSET_SET(entry, dst_offset);
10768 assert(entry->use_pmap);
10769 }
10770
10771 vm_object_reference(dst_object);
10772
10773 /* account for unlock bumping up timestamp */
10774 version.main_timestamp = dst_map->timestamp + 1;
10775
10776 vm_map_unlock(dst_map);
10777
10778 /*
10779 * Copy as much as possible in one pass
10780 */
10781
10782 copy_size = size;
10783 r = vm_fault_copy(
10784 VME_OBJECT(copy_entry),
10785 VME_OFFSET(copy_entry),
&copy_size,
10787 dst_object,
10788 dst_offset,
10789 dst_map,
10790 &version,
10791 THREAD_UNINT );
10792
10793 /*
10794 * Release the object reference
10795 */
10796
10797 vm_object_deallocate(dst_object);
10798
10799 /*
10800 * If a hard error occurred, return it now
10801 */
10802
10803 if (r != KERN_SUCCESS) {
10804 return r;
10805 }
10806
10807 if (copy_size != 0) {
10808 /*
10809 * Dispose of the copied region
10810 */
10811
10812 vm_map_copy_clip_end(copy, copy_entry,
10813 copy_entry->vme_start + copy_size);
10814 vm_map_copy_entry_unlink(copy, copy_entry);
10815 vm_object_deallocate(VME_OBJECT(copy_entry));
10816 vm_map_copy_entry_dispose(copy_entry);
10817 }
10818
10819 /*
10820 * Pick up in the destination map where we left off.
10821 *
10822 * Use the version information to avoid a lookup
10823 * in the normal case.
10824 */
10825
10826 start += copy_size;
10827 vm_map_lock(dst_map);
10828 if (version.main_timestamp == dst_map->timestamp &&
10829 copy_size != 0) {
10830 /* We can safely use saved tmp_entry value */
10831
10832 if (tmp_entry->map_aligned &&
10833 !VM_MAP_PAGE_ALIGNED(
10834 start,
10835 VM_MAP_PAGE_MASK(dst_map))) {
10836 /* no longer map-aligned */
10837 tmp_entry->map_aligned = FALSE;
10838 }
10839 vm_map_clip_end(dst_map, tmp_entry, start);
10840 tmp_entry = tmp_entry->vme_next;
10841 } else {
10842 /* Must do lookup of tmp_entry */
10843
10844 if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
10845 vm_map_unlock(dst_map);
10846 return KERN_INVALID_ADDRESS;
10847 }
10848 if (tmp_entry->map_aligned &&
10849 !VM_MAP_PAGE_ALIGNED(
10850 start,
10851 VM_MAP_PAGE_MASK(dst_map))) {
10852 /* no longer map-aligned */
10853 tmp_entry->map_aligned = FALSE;
10854 }
10855 vm_map_clip_start(dst_map, tmp_entry, start);
10856 }
10857 }
10858 }/* while */
10859
10860 return KERN_SUCCESS;
10861 }/* vm_map_copy_overwrite_aligned */
10862
10863 /*
10864 * Routine: vm_map_copyin_kernel_buffer [internal use only]
10865 *
10866 * Description:
10867 * Copy in data to a kernel buffer from space in the
10868 * source map. The original space may be optionally
10869 * deallocated.
10870 *
10871 * If successful, returns a new copy object.
10872 */
10873 static kern_return_t
vm_map_copyin_kernel_buffer(vm_map_t src_map,vm_map_offset_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)10874 vm_map_copyin_kernel_buffer(
10875 vm_map_t src_map,
10876 vm_map_offset_t src_addr,
10877 vm_map_size_t len,
10878 boolean_t src_destroy,
10879 vm_map_copy_t *copy_result)
10880 {
10881 kern_return_t kr;
10882 vm_map_copy_t copy;
10883 void *kdata;
10884
10885 if (len > msg_ool_size_small) {
10886 return KERN_INVALID_ARGUMENT;
10887 }
10888
10889 kdata = kalloc_data(len, Z_WAITOK);
10890 if (kdata == NULL) {
10891 return KERN_RESOURCE_SHORTAGE;
10892 }
10893 kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
10894 if (kr != KERN_SUCCESS) {
10895 kfree_data(kdata, len);
10896 return kr;
10897 }
10898
10899 copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
10900 copy->cpy_kdata = kdata;
10901 copy->size = len;
10902 copy->offset = 0;
10903
10904 if (src_destroy) {
10905 vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
10906
10907 if (src_map == kernel_map) {
10908 flags |= VM_MAP_REMOVE_KUNWIRE;
10909 }
10910
10911 (void)vm_map_remove_guard(src_map,
10912 vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
10913 vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
10914 flags, KMEM_GUARD_NONE);
10915 }
10916
10917 *copy_result = copy;
10918 return KERN_SUCCESS;
10919 }
10920
10921 /*
10922 * Routine: vm_map_copyout_kernel_buffer [internal use only]
10923 *
10924 * Description:
10925 * Copy out data from a kernel buffer into space in the
 * destination map. The space may be optionally dynamically
10927 * allocated.
10928 *
10929 * If successful, consumes the copy object.
10930 * Otherwise, the caller is responsible for it.
10931 *
10932 * Callers of this function must call vm_map_copy_require on
10933 * previously created vm_map_copy_t or pass a newly created
10934 * one to ensure that it hasn't been forged.
10935 */
/* Diagnostic counter: number of cross-map copyout() failures observed. */
static int vm_map_copyout_kernel_buffer_failures = 0;
static kern_return_t
vm_map_copyout_kernel_buffer(
	vm_map_t        map,
	vm_map_address_t        *addr,  /* IN/OUT */
	vm_map_copy_t   copy,
	vm_map_size_t   copy_size,
	boolean_t       overwrite,
	boolean_t       consume_on_success)
{
	kern_return_t kr = KERN_SUCCESS;
	thread_t thread = current_thread();

	assert(copy->size == copy_size);

	/*
	 * check for corrupted vm_map_copy structure:
	 * a kernel-buffer copy must fit the small-OOL limit and
	 * always carries a zero offset.
	 */
	if (copy_size > msg_ool_size_small || copy->offset) {
		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
		    (long long)copy->size, (long long)copy->offset);
	}

	if (!overwrite) {
		/*
		 * Allocate space in the target map for the data
		 */
		vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();

		if (map == kernel_map) {
			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
		}

		*addr = 0;
		kr = vm_map_enter(map,
		    addr,
		    vm_map_round_page(copy_size,
		    VM_MAP_PAGE_MASK(map)),
		    (vm_map_offset_t) 0,
		    vmk_flags,
		    VM_OBJECT_NULL,
		    (vm_object_offset_t) 0,
		    FALSE,
		    VM_PROT_DEFAULT,
		    VM_PROT_ALL,
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
#if KASAN
		if (map->pmap == kernel_pmap) {
			kasan_notify_address(*addr, copy->size);
		}
#endif
	}

	/*
	 * Copyout the data from the kernel buffer to the target map.
	 */
	if (thread->map == map) {
		/*
		 * If the target map is the current map, just do
		 * the copy.
		 */
		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			kr = KERN_INVALID_ADDRESS;
		}
	} else {
		vm_map_t oldmap;

		/*
		 * If the target map is another map, assume the
		 * target's address space identity for the duration
		 * of the copy.
		 */
		vm_map_reference(map);
		oldmap = vm_map_switch(map);

		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			vm_map_copyout_kernel_buffer_failures++;
			kr = KERN_INVALID_ADDRESS;
		}

		/* restore the original address space identity */
		(void) vm_map_switch(oldmap);
		vm_map_deallocate(map);
	}

	if (kr != KERN_SUCCESS) {
		/* the copy failed, clean up */
		if (!overwrite) {
			/*
			 * Deallocate the space we allocated in the target map
			 * (page-rounded, to mirror the vm_map_enter above).
			 */
			(void) vm_map_remove(map,
			    vm_map_trunc_page(*addr,
			    VM_MAP_PAGE_MASK(map)),
			    vm_map_round_page((*addr +
			    vm_map_round_page(copy_size,
			    VM_MAP_PAGE_MASK(map))),
			    VM_MAP_PAGE_MASK(map)));
			*addr = 0;
		}
	} else {
		/* copy was successful, discard the copy structure */
		if (consume_on_success) {
			kfree_data(copy->cpy_kdata, copy_size);
			zfree_id(ZONE_ID_VM_MAP_COPY, copy);
		}
	}

	return kr;
}
11050
11051 /*
11052 * Routine: vm_map_copy_insert [internal use only]
11053 *
11054 * Description:
11055 * Link a copy chain ("copy") into a map at the
11056 * specified location (after "where").
11057 *
11058 * Callers of this function must call vm_map_copy_require on
11059 * previously created vm_map_copy_t or pass a newly created
11060 * one to ensure that it hasn't been forged.
11061 * Side effects:
11062 * The copy chain is destroyed.
11063 */
11064 static void
vm_map_copy_insert(vm_map_t map,vm_map_entry_t after_where,vm_map_copy_t copy)11065 vm_map_copy_insert(
11066 vm_map_t map,
11067 vm_map_entry_t after_where,
11068 vm_map_copy_t copy)
11069 {
11070 vm_map_entry_t entry;
11071
11072 while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11073 entry = vm_map_copy_first_entry(copy);
11074 vm_map_copy_entry_unlink(copy, entry);
11075 vm_map_store_entry_link(map, after_where, entry,
11076 VM_MAP_KERNEL_FLAGS_NONE);
11077 after_where = entry;
11078 }
11079 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11080 }
11081
11082 /*
11083 * Callers of this function must call vm_map_copy_require on
11084 * previously created vm_map_copy_t or pass a newly created
11085 * one to ensure that it hasn't been forged.
11086 */
11087 void
vm_map_copy_remap(vm_map_t map,vm_map_entry_t where,vm_map_copy_t copy,vm_map_offset_t adjustment,vm_prot_t cur_prot,vm_prot_t max_prot,vm_inherit_t inheritance)11088 vm_map_copy_remap(
11089 vm_map_t map,
11090 vm_map_entry_t where,
11091 vm_map_copy_t copy,
11092 vm_map_offset_t adjustment,
11093 vm_prot_t cur_prot,
11094 vm_prot_t max_prot,
11095 vm_inherit_t inheritance)
11096 {
11097 vm_map_entry_t copy_entry, new_entry;
11098
11099 for (copy_entry = vm_map_copy_first_entry(copy);
11100 copy_entry != vm_map_copy_to_entry(copy);
11101 copy_entry = copy_entry->vme_next) {
11102 /* get a new VM map entry for the map */
11103 new_entry = vm_map_entry_create(map);
11104 /* copy the "copy entry" to the new entry */
11105 vm_map_entry_copy(map, new_entry, copy_entry);
11106 /* adjust "start" and "end" */
11107 new_entry->vme_start += adjustment;
11108 new_entry->vme_end += adjustment;
11109 /* clear some attributes */
11110 new_entry->inheritance = inheritance;
11111 new_entry->protection = cur_prot;
11112 new_entry->max_protection = max_prot;
11113 new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11114 /* take an extra reference on the entry's "object" */
11115 if (new_entry->is_sub_map) {
11116 assert(!new_entry->use_pmap); /* not nested */
11117 vm_map_reference(VME_SUBMAP(new_entry));
11118 } else {
11119 vm_object_reference(VME_OBJECT(new_entry));
11120 }
11121 /* insert the new entry in the map */
11122 vm_map_store_entry_link(map, where, new_entry,
11123 VM_MAP_KERNEL_FLAGS_NONE);
11124 /* continue inserting the "copy entries" after the new entry */
11125 where = new_entry;
11126 }
11127 }
11128
11129
11130 /*
11131 * Returns true if *size matches (or is in the range of) copy->size.
11132 * Upon returning true, the *size field is updated with the actual size of the
11133 * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11134 */
11135 boolean_t
vm_map_copy_validate_size(vm_map_t dst_map,vm_map_copy_t copy,vm_map_size_t * size)11136 vm_map_copy_validate_size(
11137 vm_map_t dst_map,
11138 vm_map_copy_t copy,
11139 vm_map_size_t *size)
11140 {
11141 if (copy == VM_MAP_COPY_NULL) {
11142 return FALSE;
11143 }
11144
11145 /*
11146 * Assert that the vm_map_copy is coming from the right
11147 * zone and hasn't been forged
11148 */
11149 vm_map_copy_require(copy);
11150
11151 vm_map_size_t copy_sz = copy->size;
11152 vm_map_size_t sz = *size;
11153 switch (copy->type) {
11154 case VM_MAP_COPY_KERNEL_BUFFER:
11155 if (sz == copy_sz) {
11156 return TRUE;
11157 }
11158 break;
11159 case VM_MAP_COPY_ENTRY_LIST:
11160 /*
11161 * potential page-size rounding prevents us from exactly
11162 * validating this flavor of vm_map_copy, but we can at least
11163 * assert that it's within a range.
11164 */
11165 if (copy_sz >= sz &&
11166 copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11167 *size = copy_sz;
11168 return TRUE;
11169 }
11170 break;
11171 default:
11172 break;
11173 }
11174 return FALSE;
11175 }
11176
11177 /*
11178 * Routine: vm_map_copyout_size
11179 *
11180 * Description:
11181 * Copy out a copy chain ("copy") into newly-allocated
11182 * space in the destination map. Uses a prevalidated
11183 * size for the copy object (vm_map_copy_validate_size).
11184 *
11185 * If successful, consumes the copy object.
11186 * Otherwise, the caller is responsible for it.
11187 */
kern_return_t
vm_map_copyout_size(
	vm_map_t        dst_map,
	vm_map_address_t        *dst_addr,      /* OUT */
	vm_map_copy_t   copy,
	vm_map_size_t   copy_size)
{
	/*
	 * Thin wrapper: "copy_size" is expected to have been
	 * prevalidated (see vm_map_copy_validate_size); delegate to
	 * the internal routine with default protections/inheritance,
	 * consuming "copy" on success.
	 */
	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
	           TRUE, /* consume_on_success */
	           VM_PROT_DEFAULT,
	           VM_PROT_ALL,
	           VM_INHERIT_DEFAULT);
}
11201
11202 /*
11203 * Routine: vm_map_copyout
11204 *
11205 * Description:
11206 * Copy out a copy chain ("copy") into newly-allocated
11207 * space in the destination map.
11208 *
11209 * If successful, consumes the copy object.
11210 * Otherwise, the caller is responsible for it.
11211 */
kern_return_t
vm_map_copyout(
	vm_map_t        dst_map,
	vm_map_address_t        *dst_addr,      /* OUT */
	vm_map_copy_t   copy)
{
	/*
	 * Delegate to the internal routine with default
	 * protections/inheritance.  "copy" may be VM_MAP_COPY_NULL,
	 * hence the guard when reading its size.
	 */
	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
	           TRUE, /* consume_on_success */
	           VM_PROT_DEFAULT,
	           VM_PROT_ALL,
	           VM_INHERIT_DEFAULT);
}
11224
/*
 * Routine:	vm_map_copyout_internal
 *
 * Description:
 *	Copy out a copy chain ("copy") into newly-allocated space in
 *	the destination map, applying the given protections and
 *	inheritance.  If "consume_on_success" is TRUE, the copy object
 *	is consumed on success; otherwise its entries are duplicated
 *	into the map and "copy" remains the caller's responsibility.
 *	On failure, the caller keeps responsibility for "copy".
 */
kern_return_t
vm_map_copyout_internal(
	vm_map_t        dst_map,
	vm_map_address_t        *dst_addr,      /* OUT */
	vm_map_copy_t   copy,
	vm_map_size_t   copy_size,
	boolean_t       consume_on_success,
	vm_prot_t       cur_protection,
	vm_prot_t       max_protection,
	vm_inherit_t    inheritance)
{
	vm_map_size_t   size;
	vm_map_size_t   adjustment;
	vm_map_offset_t start;
	vm_object_offset_t      vm_copy_start;
	vm_map_entry_t  last;
	vm_map_entry_t  entry;
	vm_map_copy_t   original_copy;
	kern_return_t   kr;
	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();

	/*
	 * Check for null copy object.
	 */

	if (copy == VM_MAP_COPY_NULL) {
		*dst_addr = 0;
		return KERN_SUCCESS;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/* the caller-supplied size must match the copy object exactly */
	if (copy->size != copy_size) {
		*dst_addr = 0;
		return KERN_FAILURE;
	}

	/*
	 * Check for special kernel buffer allocated
	 * by new_ipc_kmsg_copyin.
	 */

	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
		return vm_map_copyout_kernel_buffer(dst_map, dst_addr,
		           copy, copy_size, FALSE,
		           consume_on_success);
	}

	original_copy = copy;
	if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
		/*
		 * Page-size mismatch between the copy and the destination
		 * map: re-slice the copy to the target map's page size.
		 */
		vm_map_copy_t target_copy;
		vm_map_offset_t overmap_start, overmap_end, trimmed_start;

		target_copy = VM_MAP_COPY_NULL;
		DEBUG4K_ADJUST("adjusting...\n");
		kr = vm_map_copy_adjust_to_target(
			copy,
			0, /* offset */
			copy->size, /* size */
			dst_map,
			TRUE, /* copy */
			&target_copy,
			&overmap_start,
			&overmap_end,
			&trimmed_start);
		if (kr != KERN_SUCCESS) {
			DEBUG4K_COPY("adjust failed 0x%x\n", kr);
			return kr;
		}
		DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
		if (target_copy != copy) {
			copy = target_copy;
		}
		copy_size = copy->size;
	}

	/*
	 * Find space for the data
	 */

	vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
	        VM_MAP_COPY_PAGE_MASK(copy));
	size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
	        VM_MAP_COPY_PAGE_MASK(copy))
	    - vm_copy_start;

	vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map);

	vm_map_lock(dst_map);
	kr = vm_map_locate_space(dst_map, size, 0, vmk_flags,
	        &start, &last);
	if (kr != KERN_SUCCESS) {
		vm_map_unlock(dst_map);
		return kr;
	}

	/* delta between the copy's own addresses and the chosen range */
	adjustment = start - vm_copy_start;
	if (!consume_on_success) {
		/*
		 * We're not allowed to consume "copy", so we'll have to
		 * copy its map entries into the destination map below.
		 * No need to re-allocate map entries from the correct
		 * (pageable or not) zone, since we'll get new map entries
		 * during the transfer.
		 * We'll also adjust the map entries's "start" and "end"
		 * during the transfer, to keep "copy"'s entries consistent
		 * with its "offset".
		 */
		goto after_adjustments;
	}

	/*
	 * Since we're going to just drop the map
	 * entries from the copy into the destination
	 * map, they must come from the same pool.
	 */

	if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
		/*
		 * Mismatches occur when dealing with the default
		 * pager.
		 */
		vm_map_entry_t next, new;

		/*
		 * Find the zone that the copies were allocated from
		 */

		entry = vm_map_copy_first_entry(copy);

		/*
		 * Reinitialize the copy so that vm_map_copy_entry_link
		 * will work.
		 */
		vm_map_store_copy_reset(copy, entry);
		copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;

		/*
		 * Copy each entry: re-create every entry in the right zone
		 * and dispose of the originals.
		 */
		while (entry != vm_map_copy_to_entry(copy)) {
			new = vm_map_copy_entry_create(copy);
			vm_map_entry_copy_full(new, entry);
			new->vme_no_copy_on_read = FALSE;
			assert(!new->iokit_acct);
			if (new->is_sub_map) {
				/* clr address space specifics */
				new->use_pmap = FALSE;
			}
			vm_map_copy_entry_link(copy,
			    vm_map_copy_last_entry(copy),
			    new);
			next = entry->vme_next;
			vm_map_entry_dispose(entry);
			entry = next;
		}
	}

	/*
	 * Adjust the addresses in the copy chain, and
	 * reset the region attributes.
	 */

	for (entry = vm_map_copy_first_entry(copy);
	    entry != vm_map_copy_to_entry(copy);
	    entry = entry->vme_next) {
		if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
			/*
			 * We're injecting this copy entry into a map that
			 * has the standard page alignment, so clear
			 * "map_aligned" (which might have been inherited
			 * from the original map entry).
			 */
			entry->map_aligned = FALSE;
		}

		entry->vme_start += adjustment;
		entry->vme_end += adjustment;

		if (entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
			    VM_MAP_PAGE_MASK(dst_map)));
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
			    VM_MAP_PAGE_MASK(dst_map)));
		}

		entry->inheritance = VM_INHERIT_DEFAULT;
		entry->protection = VM_PROT_DEFAULT;
		entry->max_protection = VM_PROT_ALL;
		entry->behavior = VM_BEHAVIOR_DEFAULT;

		/*
		 * If the entry is now wired,
		 * map the pages into the destination map.
		 */
		if (entry->wired_count != 0) {
			vm_map_offset_t va;
			vm_object_offset_t offset;
			vm_object_t object;
			vm_prot_t prot;
			int type_of_fault;

			/* TODO4K would need to use actual page size */
			assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);

			object = VME_OBJECT(entry);
			offset = VME_OFFSET(entry);
			va = entry->vme_start;

			pmap_pageable(dst_map->pmap,
			    entry->vme_start,
			    entry->vme_end,
			    TRUE);

			while (va < entry->vme_end) {
				vm_page_t m;
				struct vm_object_fault_info fault_info = {};

				/*
				 * Look up the page in the object.
				 * Assert that the page will be found in the
				 * top object:
				 * either
				 *	the object was newly created by
				 *	vm_object_copy_slowly, and has
				 *	copies of all of the pages from
				 *	the source object
				 * or
				 *	the object was moved from the old
				 *	map entry; because the old map
				 *	entry was wired, all of the pages
				 *	were in the top-level object.
				 *	(XXX not true if we wire pages for
				 *	 reading)
				 */
				vm_object_lock(object);

				m = vm_page_lookup(object, offset);
				if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
				    m->vmp_absent) {
					panic("vm_map_copyout: wiring %p", m);
				}

				prot = entry->protection;

				if (override_nx(dst_map, VME_ALIAS(entry)) &&
				    prot) {
					prot |= VM_PROT_EXECUTE;
				}

				type_of_fault = DBG_CACHE_HIT_FAULT;

				fault_info.user_tag = VME_ALIAS(entry);
				fault_info.pmap_options = 0;
				if (entry->iokit_acct ||
				    (!entry->is_sub_map && !entry->use_pmap)) {
					fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
				}
				if (entry->vme_xnu_user_debug &&
				    !VM_PAGE_OBJECT(m)->code_signed) {
					/*
					 * Modified code-signed executable
					 * region: this page does not belong
					 * to a code-signed VM object, so it
					 * must have been copied and should
					 * therefore be typed XNU_USER_DEBUG
					 * rather than XNU_USER_EXEC.
					 */
					fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
				}

				vm_fault_enter(m,
				    dst_map->pmap,
				    va,
				    PAGE_SIZE, 0,
				    prot,
				    prot,
				    VM_PAGE_WIRED(m),
				    FALSE, /* change_wiring */
				    VM_KERN_MEMORY_NONE, /* tag - not wiring */
				    &fault_info,
				    NULL, /* need_retry */
				    &type_of_fault);

				vm_object_unlock(object);

				offset += PAGE_SIZE_64;
				va += PAGE_SIZE;
			}
		}
	}

after_adjustments:

	/*
	 * Correct the page alignment for the result
	 */

	*dst_addr = start + (copy->offset - vm_copy_start);

#if KASAN
	kasan_notify_address(*dst_addr, size);
#endif

	/*
	 * Update the hints and the map size
	 */

	if (consume_on_success) {
		SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
	} else {
		SAVE_HINT_MAP_WRITE(dst_map, last);
	}

	dst_map->size += size;

	/*
	 * Link in the copy
	 */

	if (consume_on_success) {
		vm_map_copy_insert(dst_map, last, copy);
		if (copy != original_copy) {
			/* the page-size adjustment made a new copy object */
			vm_map_copy_discard(original_copy);
			original_copy = VM_MAP_COPY_NULL;
		}
	} else {
		vm_map_copy_remap(dst_map, last, copy, adjustment,
		    cur_protection, max_protection,
		    inheritance);
		if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
			/* drop the adjusted copy; the caller keeps the original */
			vm_map_copy_discard(copy);
			copy = original_copy;
		}
	}


	vm_map_unlock(dst_map);

	/*
	 * XXX	If wiring_required, call vm_map_pageable
	 */

	return KERN_SUCCESS;
}
11574
11575 /*
11576 * Routine: vm_map_copyin
11577 *
11578 * Description:
11579 * see vm_map_copyin_common. Exported via Unsupported.exports.
11580 *
11581 */
11582
11583 #undef vm_map_copyin
11584
kern_return_t
vm_map_copyin(
	vm_map_t        src_map,
	vm_map_address_t        src_addr,
	vm_map_size_t   len,
	boolean_t       src_destroy,
	vm_map_copy_t   *copy_result)   /* OUT */
{
	/*
	 * Legacy entry point (exported via Unsupported.exports):
	 * equivalent to vm_map_copyin_common() with "src_volatile"
	 * and "use_maxprot" both FALSE.
	 */
	return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
	           FALSE, copy_result, FALSE);
}
11596
11597 /*
11598 * Routine: vm_map_copyin_common
11599 *
11600 * Description:
11601 * Copy the specified region (src_addr, len) from the
11602 * source address space (src_map), possibly removing
11603 * the region from the source address space (src_destroy).
11604 *
11605 * Returns:
11606 * A vm_map_copy_t object (copy_result), suitable for
11607 * insertion into another address space (using vm_map_copyout),
11608 * copying over another address space region (using
11609 * vm_map_copy_overwrite). If the copy is unused, it
11610 * should be destroyed (using vm_map_copy_discard).
11611 *
11612 * In/out conditions:
11613 * The source map should not be locked on entry.
11614 */
11615
/*
 * Per-level record kept while vm_map_copyin_internal() descends
 * into nested submaps; forms a singly-linked stack of the maps
 * above the one currently being walked.
 */
typedef struct submap_map {
	vm_map_t        parent_map;     /* map we descended from */
	vm_map_offset_t base_start;     /* range start in the parent map */
	vm_map_offset_t base_end;       /* range end in the parent map */
	vm_map_size_t   base_len;       /* length of the range */
	struct submap_map *next;        /* next (outer) level, or NULL */
} submap_map_t;
11623
11624 kern_return_t
vm_map_copyin_common(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t src_destroy,__unused boolean_t src_volatile,vm_map_copy_t * copy_result,boolean_t use_maxprot)11625 vm_map_copyin_common(
11626 vm_map_t src_map,
11627 vm_map_address_t src_addr,
11628 vm_map_size_t len,
11629 boolean_t src_destroy,
11630 __unused boolean_t src_volatile,
11631 vm_map_copy_t *copy_result, /* OUT */
11632 boolean_t use_maxprot)
11633 {
11634 int flags;
11635
11636 flags = 0;
11637 if (src_destroy) {
11638 flags |= VM_MAP_COPYIN_SRC_DESTROY;
11639 }
11640 if (use_maxprot) {
11641 flags |= VM_MAP_COPYIN_USE_MAXPROT;
11642 }
11643 return vm_map_copyin_internal(src_map,
11644 src_addr,
11645 len,
11646 flags,
11647 copy_result);
11648 }
11649 kern_return_t
vm_map_copyin_internal(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,int flags,vm_map_copy_t * copy_result)11650 vm_map_copyin_internal(
11651 vm_map_t src_map,
11652 vm_map_address_t src_addr,
11653 vm_map_size_t len,
11654 int flags,
11655 vm_map_copy_t *copy_result) /* OUT */
11656 {
11657 vm_map_entry_t tmp_entry; /* Result of last map lookup --
11658 * in multi-level lookup, this
11659 * entry contains the actual
11660 * vm_object/offset.
11661 */
11662 vm_map_entry_t new_entry = VM_MAP_ENTRY_NULL; /* Map entry for copy */
11663
11664 vm_map_offset_t src_start; /* Start of current entry --
11665 * where copy is taking place now
11666 */
11667 vm_map_offset_t src_end; /* End of entire region to be
11668 * copied */
11669 vm_map_offset_t src_base;
11670 vm_map_t base_map = src_map;
11671 boolean_t map_share = FALSE;
11672 submap_map_t *parent_maps = NULL;
11673
11674 vm_map_copy_t copy; /* Resulting copy */
11675 vm_map_address_t copy_addr;
11676 vm_map_size_t copy_size;
11677 boolean_t src_destroy;
11678 boolean_t use_maxprot;
11679 boolean_t preserve_purgeable;
11680 boolean_t entry_was_shared;
11681 vm_map_entry_t saved_src_entry;
11682
11683 if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
11684 return KERN_INVALID_ARGUMENT;
11685 }
11686
11687 #if CONFIG_KERNEL_TBI
11688 if (src_map->pmap == kernel_pmap) {
11689 src_addr = VM_KERNEL_TBI_FILL(src_addr);
11690 }
11691 #endif /* CONFIG_KERNEL_TBI && KASAN_TBI */
11692
11693 src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
11694 use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
11695 preserve_purgeable =
11696 (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
11697
11698 /*
11699 * Check for copies of zero bytes.
11700 */
11701
11702 if (len == 0) {
11703 *copy_result = VM_MAP_COPY_NULL;
11704 return KERN_SUCCESS;
11705 }
11706
11707 /*
11708 * Check that the end address doesn't overflow
11709 */
11710 src_end = src_addr + len;
11711 if (src_end < src_addr) {
11712 return KERN_INVALID_ADDRESS;
11713 }
11714
11715 /*
11716 * Compute (page aligned) start and end of region
11717 */
11718 src_start = vm_map_trunc_page(src_addr,
11719 VM_MAP_PAGE_MASK(src_map));
11720 src_end = vm_map_round_page(src_end,
11721 VM_MAP_PAGE_MASK(src_map));
11722
11723 /*
11724 * If the copy is sufficiently small, use a kernel buffer instead
11725 * of making a virtual copy. The theory being that the cost of
11726 * setting up VM (and taking C-O-W faults) dominates the copy costs
11727 * for small regions.
11728 */
11729 if ((len <= msg_ool_size_small) &&
11730 !use_maxprot &&
11731 !preserve_purgeable &&
11732 !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
11733 /*
11734 * Since the "msg_ool_size_small" threshold was increased and
11735 * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
11736 * address space limits, we revert to doing a virtual copy if the
11737 * copied range goes beyond those limits. Otherwise, mach_vm_read()
11738 * of the commpage would now fail when it used to work.
11739 */
11740 (src_start >= vm_map_min(src_map) &&
11741 src_start < vm_map_max(src_map) &&
11742 src_end >= vm_map_min(src_map) &&
11743 src_end < vm_map_max(src_map))) {
11744 return vm_map_copyin_kernel_buffer(src_map, src_addr, len,
11745 src_destroy, copy_result);
11746 }
11747
11748 /*
11749 * Allocate a header element for the list.
11750 *
11751 * Use the start and end in the header to
11752 * remember the endpoints prior to rounding.
11753 */
11754
11755 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
11756 copy->cpy_hdr.entries_pageable = TRUE;
11757 copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
11758 copy->offset = src_addr;
11759 copy->size = len;
11760
11761 new_entry = vm_map_copy_entry_create(copy);
11762
11763 #define RETURN(x) \
11764 MACRO_BEGIN \
11765 vm_map_unlock(src_map); \
11766 if(src_map != base_map) \
11767 vm_map_deallocate(src_map); \
11768 if (new_entry != VM_MAP_ENTRY_NULL) \
11769 vm_map_copy_entry_dispose(new_entry); \
11770 vm_map_copy_discard(copy); \
11771 { \
11772 submap_map_t *_ptr; \
11773 \
11774 for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
11775 parent_maps=parent_maps->next; \
11776 if (_ptr->parent_map != base_map) \
11777 vm_map_deallocate(_ptr->parent_map); \
11778 kfree_type(submap_map_t, _ptr); \
11779 } \
11780 } \
11781 MACRO_RETURN(x); \
11782 MACRO_END
11783
11784 /*
11785 * Find the beginning of the region.
11786 */
11787
11788 vm_map_lock(src_map);
11789
11790 /*
11791 * Lookup the original "src_addr" rather than the truncated
11792 * "src_start", in case "src_start" falls in a non-map-aligned
11793 * map entry *before* the map entry that contains "src_addr"...
11794 */
11795 if (!vm_map_lookup_entry(src_map, src_addr, &tmp_entry)) {
11796 RETURN(KERN_INVALID_ADDRESS);
11797 }
11798 if (!tmp_entry->is_sub_map) {
11799 /*
11800 * ... but clip to the map-rounded "src_start" rather than
11801 * "src_addr" to preserve map-alignment. We'll adjust the
11802 * first copy entry at the end, if needed.
11803 */
11804 vm_map_clip_start(src_map, tmp_entry, src_start);
11805 }
11806 if (src_start < tmp_entry->vme_start) {
11807 /*
11808 * Move "src_start" up to the start of the
11809 * first map entry to copy.
11810 */
11811 src_start = tmp_entry->vme_start;
11812 }
11813 /* set for later submap fix-up */
11814 copy_addr = src_start;
11815
11816 /*
11817 * Go through entries until we get to the end.
11818 */
11819
11820 while (TRUE) {
11821 vm_map_entry_t src_entry = tmp_entry; /* Top-level entry */
11822 vm_map_size_t src_size; /* Size of source
11823 * map entry (in both
11824 * maps)
11825 */
11826
11827 vm_object_t src_object; /* Object to copy */
11828 vm_object_offset_t src_offset;
11829
11830 vm_object_t new_copy_object;/* vm_object_copy_* result */
11831
11832 boolean_t src_needs_copy; /* Should source map
11833 * be made read-only
11834 * for copy-on-write?
11835 */
11836
11837 boolean_t new_entry_needs_copy; /* Will new entry be COW? */
11838
11839 boolean_t was_wired; /* Was source wired? */
11840 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
11841 #if __arm64e__
11842 boolean_t saved_used_for_tpro; /* Saved used_for_tpro */
11843 #endif
11844 vm_map_version_t version; /* Version before locks
11845 * dropped to make copy
11846 */
11847 kern_return_t result; /* Return value from
11848 * copy_strategically.
11849 */
11850 while (tmp_entry->is_sub_map) {
11851 vm_map_size_t submap_len;
11852 submap_map_t *ptr;
11853
11854 ptr = kalloc_type(submap_map_t, Z_WAITOK);
11855 ptr->next = parent_maps;
11856 parent_maps = ptr;
11857 ptr->parent_map = src_map;
11858 ptr->base_start = src_start;
11859 ptr->base_end = src_end;
11860 submap_len = tmp_entry->vme_end - src_start;
11861 if (submap_len > (src_end - src_start)) {
11862 submap_len = src_end - src_start;
11863 }
11864 ptr->base_len = submap_len;
11865
11866 src_start -= tmp_entry->vme_start;
11867 src_start += VME_OFFSET(tmp_entry);
11868 src_end = src_start + submap_len;
11869 src_map = VME_SUBMAP(tmp_entry);
11870 vm_map_lock(src_map);
11871 /* keep an outstanding reference for all maps in */
11872 /* the parents tree except the base map */
11873 vm_map_reference(src_map);
11874 vm_map_unlock(ptr->parent_map);
11875 if (!vm_map_lookup_entry(
11876 src_map, src_start, &tmp_entry)) {
11877 RETURN(KERN_INVALID_ADDRESS);
11878 }
11879 map_share = TRUE;
11880 if (!tmp_entry->is_sub_map) {
11881 vm_map_clip_start(src_map, tmp_entry, src_start);
11882 }
11883 src_entry = tmp_entry;
11884 }
11885 /* we are now in the lowest level submap... */
11886
11887 if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
11888 (VME_OBJECT(tmp_entry)->phys_contiguous)) {
		/* This is not supported for now. In future */
11890 /* we will need to detect the phys_contig */
11891 /* condition and then upgrade copy_slowly */
11892 /* to do physical copy from the device mem */
11893 /* based object. We can piggy-back off of */
11894 /* the was wired boolean to set-up the */
11895 /* proper handling */
11896 RETURN(KERN_PROTECTION_FAILURE);
11897 }
11898 /*
11899 * Create a new address map entry to hold the result.
11900 * Fill in the fields from the appropriate source entries.
11901 * We must unlock the source map to do this if we need
11902 * to allocate a map entry.
11903 */
11904 if (new_entry == VM_MAP_ENTRY_NULL) {
11905 version.main_timestamp = src_map->timestamp;
11906 vm_map_unlock(src_map);
11907
11908 new_entry = vm_map_copy_entry_create(copy);
11909
11910 vm_map_lock(src_map);
11911 if ((version.main_timestamp + 1) != src_map->timestamp) {
11912 if (!vm_map_lookup_entry(src_map, src_start,
11913 &tmp_entry)) {
11914 RETURN(KERN_INVALID_ADDRESS);
11915 }
11916 if (!tmp_entry->is_sub_map) {
11917 vm_map_clip_start(src_map, tmp_entry, src_start);
11918 }
11919 continue; /* restart w/ new tmp_entry */
11920 }
11921 }
11922
11923 /*
11924 * Verify that the region can be read.
11925 */
11926 if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
11927 !use_maxprot) ||
11928 (src_entry->max_protection & VM_PROT_READ) == 0) {
11929 RETURN(KERN_PROTECTION_FAILURE);
11930 }
11931
11932 /*
11933 * Clip against the endpoints of the entire region.
11934 */
11935
11936 vm_map_clip_end(src_map, src_entry, src_end);
11937
11938 src_size = src_entry->vme_end - src_start;
11939 src_object = VME_OBJECT(src_entry);
11940 src_offset = VME_OFFSET(src_entry);
11941 was_wired = (src_entry->wired_count != 0);
11942
11943 vm_map_entry_copy(src_map, new_entry, src_entry);
11944 if (new_entry->is_sub_map) {
11945 /* clr address space specifics */
11946 new_entry->use_pmap = FALSE;
11947 } else {
11948 /*
11949 * We're dealing with a copy-on-write operation,
11950 * so the resulting mapping should not inherit the
11951 * original mapping's accounting settings.
11952 * "iokit_acct" should have been cleared in
11953 * vm_map_entry_copy().
11954 * "use_pmap" should be reset to its default (TRUE)
11955 * so that the new mapping gets accounted for in
11956 * the task's memory footprint.
11957 */
11958 assert(!new_entry->iokit_acct);
11959 new_entry->use_pmap = TRUE;
11960 }
11961
11962 /*
11963 * Attempt non-blocking copy-on-write optimizations.
11964 */
11965
11966 /*
11967 * If we are destroying the source, and the object
11968 * is internal, we could move the object reference
11969 * from the source to the copy. The copy is
11970 * copy-on-write only if the source is.
11971 * We make another reference to the object, because
11972 * destroying the source entry will deallocate it.
11973 *
11974 * This memory transfer has to be atomic, (to prevent
11975 * the VM object from being shared or copied while
11976 * it's being moved here), so we could only do this
11977 * if we won't have to unlock the VM map until the
11978 * original mapping has been fully removed.
11979 */
11980
11981 RestartCopy:
11982 if ((src_object == VM_OBJECT_NULL ||
11983 (!was_wired && !map_share && !tmp_entry->is_shared
11984 && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
11985 vm_object_copy_quickly(
11986 VME_OBJECT(new_entry),
11987 src_offset,
11988 src_size,
11989 &src_needs_copy,
11990 &new_entry_needs_copy)) {
11991 new_entry->needs_copy = new_entry_needs_copy;
11992
11993 /*
11994 * Handle copy-on-write obligations
11995 */
11996
11997 if (src_needs_copy && !tmp_entry->needs_copy) {
11998 vm_prot_t prot;
11999
12000 prot = src_entry->protection & ~VM_PROT_WRITE;
12001
12002 if (override_nx(src_map, VME_ALIAS(src_entry))
12003 && prot) {
12004 prot |= VM_PROT_EXECUTE;
12005 }
12006
12007 vm_object_pmap_protect(
12008 src_object,
12009 src_offset,
12010 src_size,
12011 (src_entry->is_shared ?
12012 PMAP_NULL
12013 : src_map->pmap),
12014 VM_MAP_PAGE_SIZE(src_map),
12015 src_entry->vme_start,
12016 prot);
12017
12018 assert(tmp_entry->wired_count == 0);
12019 tmp_entry->needs_copy = TRUE;
12020 }
12021
12022 /*
12023 * The map has never been unlocked, so it's safe
12024 * to move to the next entry rather than doing
12025 * another lookup.
12026 */
12027
12028 goto CopySuccessful;
12029 }
12030
12031 entry_was_shared = tmp_entry->is_shared;
12032
12033 /*
12034 * Take an object reference, so that we may
12035 * release the map lock(s).
12036 */
12037
12038 assert(src_object != VM_OBJECT_NULL);
12039 vm_object_reference(src_object);
12040
12041 /*
12042 * Record the timestamp for later verification.
12043 * Unlock the map.
12044 */
12045
12046 version.main_timestamp = src_map->timestamp;
12047 vm_map_unlock(src_map); /* Increments timestamp once! */
12048 saved_src_entry = src_entry;
12049 tmp_entry = VM_MAP_ENTRY_NULL;
12050 src_entry = VM_MAP_ENTRY_NULL;
12051
12052 /*
12053 * Perform the copy
12054 */
12055
12056 if (was_wired ||
12057 (debug4k_no_cow_copyin &&
12058 VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
12059 CopySlowly:
12060 vm_object_lock(src_object);
12061 result = vm_object_copy_slowly(
12062 src_object,
12063 src_offset,
12064 src_size,
12065 THREAD_UNINT,
12066 &new_copy_object);
12067 /* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
12068 saved_used_for_jit = new_entry->used_for_jit;
12069 #if __arm64e__
12070 saved_used_for_tpro = new_entry->used_for_tpro;
12071 #endif
12072 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12073 new_entry->used_for_jit = saved_used_for_jit;
12074 #if __arm64e__
12075 new_entry->used_for_tpro = saved_used_for_tpro;
12076 #endif
12077 VME_OFFSET_SET(new_entry,
12078 src_offset - vm_object_trunc_page(src_offset));
12079 new_entry->needs_copy = FALSE;
12080 } else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
12081 (entry_was_shared || map_share)) {
12082 vm_object_t new_object;
12083
12084 vm_object_lock_shared(src_object);
12085 new_object = vm_object_copy_delayed(
12086 src_object,
12087 src_offset,
12088 src_size,
12089 TRUE);
12090 if (new_object == VM_OBJECT_NULL) {
12091 goto CopySlowly;
12092 }
12093
12094 VME_OBJECT_SET(new_entry, new_object, false, 0);
12095 assert(new_entry->wired_count == 0);
12096 new_entry->needs_copy = TRUE;
12097 assert(!new_entry->iokit_acct);
12098 assert(new_object->purgable == VM_PURGABLE_DENY);
12099 assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
12100 result = KERN_SUCCESS;
12101 } else {
12102 vm_object_offset_t new_offset;
12103 new_offset = VME_OFFSET(new_entry);
12104 result = vm_object_copy_strategically(src_object,
12105 src_offset,
12106 src_size,
12107 &new_copy_object,
12108 &new_offset,
12109 &new_entry_needs_copy);
12110 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
12111 saved_used_for_jit = new_entry->used_for_jit;
12112 #if __arm64e__
12113 saved_used_for_tpro = new_entry->used_for_tpro;
12114 #endif
12115 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12116 new_entry->used_for_jit = saved_used_for_jit;
12117 #if __arm64e__
12118 new_entry->used_for_tpro = saved_used_for_tpro;
12119 #endif
12120 if (new_offset != VME_OFFSET(new_entry)) {
12121 VME_OFFSET_SET(new_entry, new_offset);
12122 }
12123
12124 new_entry->needs_copy = new_entry_needs_copy;
12125 }
12126
12127 if (result == KERN_SUCCESS &&
12128 ((preserve_purgeable &&
12129 src_object->purgable != VM_PURGABLE_DENY) ||
12130 new_entry->used_for_jit
12131 #if __arm64e__
12132 || new_entry->used_for_tpro
12133 #endif
12134 )) {
12135 /*
12136 * Purgeable objects should be COPY_NONE, true share;
			 * this should be propagated to the copy.
12138 *
12139 * Also force mappings the pmap specially protects to
12140 * be COPY_NONE; trying to COW these mappings would
12141 * change the effective protections, which could have
12142 * side effects if the pmap layer relies on the
12143 * specified protections.
12144 */
12145
12146 vm_object_t new_object;
12147
12148 new_object = VME_OBJECT(new_entry);
12149 assert(new_object != src_object);
12150 vm_object_lock(new_object);
12151 assert(new_object->ref_count == 1);
12152 assert(new_object->shadow == VM_OBJECT_NULL);
12153 assert(new_object->copy == VM_OBJECT_NULL);
12154 assert(new_object->vo_owner == NULL);
12155
12156 new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
12157
12158 if (preserve_purgeable &&
12159 src_object->purgable != VM_PURGABLE_DENY) {
12160 new_object->true_share = TRUE;
12161
12162 /* start as non-volatile with no owner... */
12163 new_object->purgable = VM_PURGABLE_NONVOLATILE;
12164 vm_purgeable_nonvolatile_enqueue(new_object, NULL);
12165 /* ... and move to src_object's purgeable state */
12166 if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
12167 int state;
12168 state = src_object->purgable;
12169 vm_object_purgable_control(
12170 new_object,
12171 VM_PURGABLE_SET_STATE_FROM_KERNEL,
12172 &state);
12173 }
12174 /* no pmap accounting for purgeable objects */
12175 new_entry->use_pmap = FALSE;
12176 }
12177
12178 vm_object_unlock(new_object);
12179 new_object = VM_OBJECT_NULL;
12180 }
12181
12182 if (result != KERN_SUCCESS &&
12183 result != KERN_MEMORY_RESTART_COPY) {
12184 vm_map_lock(src_map);
12185 RETURN(result);
12186 }
12187
12188 /*
12189 * Throw away the extra reference
12190 */
12191
12192 vm_object_deallocate(src_object);
12193
12194 /*
12195 * Verify that the map has not substantially
12196 * changed while the copy was being made.
12197 */
12198
12199 vm_map_lock(src_map);
12200
12201 if ((version.main_timestamp + 1) == src_map->timestamp) {
12202 /* src_map hasn't changed: src_entry is still valid */
12203 src_entry = saved_src_entry;
12204 goto VerificationSuccessful;
12205 }
12206
12207 /*
12208 * Simple version comparison failed.
12209 *
12210 * Retry the lookup and verify that the
12211 * same object/offset are still present.
12212 *
12213 * [Note: a memory manager that colludes with
12214 * the calling task can detect that we have
12215 * cheated. While the map was unlocked, the
12216 * mapping could have been changed and restored.]
12217 */
12218
12219 if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
12220 if (result != KERN_MEMORY_RESTART_COPY) {
12221 vm_object_deallocate(VME_OBJECT(new_entry));
12222 VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
12223 /* reset accounting state */
12224 new_entry->iokit_acct = FALSE;
12225 new_entry->use_pmap = TRUE;
12226 }
12227 RETURN(KERN_INVALID_ADDRESS);
12228 }
12229
12230 src_entry = tmp_entry;
12231 vm_map_clip_start(src_map, src_entry, src_start);
12232
12233 if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
12234 !use_maxprot) ||
12235 ((src_entry->max_protection & VM_PROT_READ) == 0)) {
12236 goto VerificationFailed;
12237 }
12238
12239 if (src_entry->vme_end < new_entry->vme_end) {
12240 /*
12241 * This entry might have been shortened
12242 * (vm_map_clip_end) or been replaced with
12243 * an entry that ends closer to "src_start"
12244 * than before.
12245 * Adjust "new_entry" accordingly; copying
12246 * less memory would be correct but we also
12247 * redo the copy (see below) if the new entry
12248 * no longer points at the same object/offset.
12249 */
12250 assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
12251 VM_MAP_COPY_PAGE_MASK(copy)));
12252 new_entry->vme_end = src_entry->vme_end;
12253 src_size = new_entry->vme_end - src_start;
12254 } else if (src_entry->vme_end > new_entry->vme_end) {
12255 /*
12256 * This entry might have been extended
12257 * (vm_map_entry_simplify() or coalesce)
12258 * or been replaced with an entry that ends farther
12259 * from "src_start" than before.
12260 *
12261 * We've called vm_object_copy_*() only on
12262 * the previous <start:end> range, so we can't
12263 * just extend new_entry. We have to re-do
12264 * the copy based on the new entry as if it was
12265 * pointing at a different object/offset (see
12266 * "Verification failed" below).
12267 */
12268 }
12269
12270 if ((VME_OBJECT(src_entry) != src_object) ||
12271 (VME_OFFSET(src_entry) != src_offset) ||
12272 (src_entry->vme_end > new_entry->vme_end)) {
12273 /*
12274 * Verification failed.
12275 *
12276 * Start over with this top-level entry.
12277 */
12278
12279 VerificationFailed: ;
12280
12281 vm_object_deallocate(VME_OBJECT(new_entry));
12282 tmp_entry = src_entry;
12283 continue;
12284 }
12285
12286 /*
12287 * Verification succeeded.
12288 */
12289
12290 VerificationSuccessful:;
12291
12292 if (result == KERN_MEMORY_RESTART_COPY) {
12293 goto RestartCopy;
12294 }
12295
12296 /*
12297 * Copy succeeded.
12298 */
12299
12300 CopySuccessful: ;
12301
12302 /*
12303 * Link in the new copy entry.
12304 */
12305
12306 vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
12307 new_entry);
12308
12309 /*
12310 * Determine whether the entire region
12311 * has been copied.
12312 */
12313 src_base = src_start;
12314 src_start = new_entry->vme_end;
12315 new_entry = VM_MAP_ENTRY_NULL;
12316 while ((src_start >= src_end) && (src_end != 0)) {
12317 submap_map_t *ptr;
12318
12319 if (src_map == base_map) {
12320 /* back to the top */
12321 break;
12322 }
12323
12324 ptr = parent_maps;
12325 assert(ptr != NULL);
12326 parent_maps = parent_maps->next;
12327
12328 /* fix up the damage we did in that submap */
12329 vm_map_simplify_range(src_map,
12330 src_base,
12331 src_end);
12332
12333 vm_map_unlock(src_map);
12334 vm_map_deallocate(src_map);
12335 vm_map_lock(ptr->parent_map);
12336 src_map = ptr->parent_map;
12337 src_base = ptr->base_start;
12338 src_start = ptr->base_start + ptr->base_len;
12339 src_end = ptr->base_end;
12340 if (!vm_map_lookup_entry(src_map,
12341 src_start,
12342 &tmp_entry) &&
12343 (src_end > src_start)) {
12344 RETURN(KERN_INVALID_ADDRESS);
12345 }
12346 kfree_type(submap_map_t, ptr);
12347 if (parent_maps == NULL) {
12348 map_share = FALSE;
12349 }
12350 src_entry = tmp_entry->vme_prev;
12351 }
12352
12353 if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
12354 (src_start >= src_addr + len) &&
12355 (src_addr + len != 0)) {
12356 /*
12357 * Stop copying now, even though we haven't reached
12358 * "src_end". We'll adjust the end of the last copy
12359 * entry at the end, if needed.
12360 *
			 * If src_map's alignment is different from the
12362 * system's page-alignment, there could be
12363 * extra non-map-aligned map entries between
12364 * the original (non-rounded) "src_addr + len"
12365 * and the rounded "src_end".
12366 * We do not want to copy those map entries since
12367 * they're not part of the copied range.
12368 */
12369 break;
12370 }
12371
12372 if ((src_start >= src_end) && (src_end != 0)) {
12373 break;
12374 }
12375
12376 /*
12377 * Verify that there are no gaps in the region
12378 */
12379
12380 tmp_entry = src_entry->vme_next;
12381 if ((tmp_entry->vme_start != src_start) ||
12382 (tmp_entry == vm_map_to_entry(src_map))) {
12383 RETURN(KERN_INVALID_ADDRESS);
12384 }
12385 }
12386
12387 /*
12388 * If the source should be destroyed, do it now, since the
12389 * copy was successful.
12390 */
12391 if (src_destroy) {
12392 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
12393
12394 if (src_map == kernel_map) {
12395 remove_flags |= VM_MAP_REMOVE_KUNWIRE;
12396 }
12397 (void)vm_map_remove_and_unlock(src_map,
12398 vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
12399 src_end,
12400 remove_flags,
12401 KMEM_GUARD_NONE);
12402 } else {
12403 /* fix up the damage we did in the base map */
12404 vm_map_simplify_range(
12405 src_map,
12406 vm_map_trunc_page(src_addr,
12407 VM_MAP_PAGE_MASK(src_map)),
12408 vm_map_round_page(src_end,
12409 VM_MAP_PAGE_MASK(src_map)));
12410 vm_map_unlock(src_map);
12411 }
12412
12413 tmp_entry = VM_MAP_ENTRY_NULL;
12414
12415 if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
12416 VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12417 vm_map_offset_t original_start, original_offset, original_end;
12418
12419 assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12420
12421 /* adjust alignment of first copy_entry's "vme_start" */
12422 tmp_entry = vm_map_copy_first_entry(copy);
12423 if (tmp_entry != vm_map_copy_to_entry(copy)) {
12424 vm_map_offset_t adjustment;
12425
12426 original_start = tmp_entry->vme_start;
12427 original_offset = VME_OFFSET(tmp_entry);
12428
12429 /* map-align the start of the first copy entry... */
12430 adjustment = (tmp_entry->vme_start -
12431 vm_map_trunc_page(
12432 tmp_entry->vme_start,
12433 VM_MAP_PAGE_MASK(src_map)));
12434 tmp_entry->vme_start -= adjustment;
12435 VME_OFFSET_SET(tmp_entry,
12436 VME_OFFSET(tmp_entry) - adjustment);
12437 copy_addr -= adjustment;
12438 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12439 /* ... adjust for mis-aligned start of copy range */
12440 adjustment =
12441 (vm_map_trunc_page(copy->offset,
12442 PAGE_MASK) -
12443 vm_map_trunc_page(copy->offset,
12444 VM_MAP_PAGE_MASK(src_map)));
12445 if (adjustment) {
12446 assert(page_aligned(adjustment));
12447 assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12448 tmp_entry->vme_start += adjustment;
12449 VME_OFFSET_SET(tmp_entry,
12450 (VME_OFFSET(tmp_entry) +
12451 adjustment));
12452 copy_addr += adjustment;
12453 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12454 }
12455
12456 /*
12457 * Assert that the adjustments haven't exposed
12458 * more than was originally copied...
12459 */
12460 assert(tmp_entry->vme_start >= original_start);
12461 assert(VME_OFFSET(tmp_entry) >= original_offset);
12462 /*
12463 * ... and that it did not adjust outside of a
12464 * a single 16K page.
12465 */
12466 assert(vm_map_trunc_page(tmp_entry->vme_start,
12467 VM_MAP_PAGE_MASK(src_map)) ==
12468 vm_map_trunc_page(original_start,
12469 VM_MAP_PAGE_MASK(src_map)));
12470 }
12471
12472 /* adjust alignment of last copy_entry's "vme_end" */
12473 tmp_entry = vm_map_copy_last_entry(copy);
12474 if (tmp_entry != vm_map_copy_to_entry(copy)) {
12475 vm_map_offset_t adjustment;
12476
12477 original_end = tmp_entry->vme_end;
12478
12479 /* map-align the end of the last copy entry... */
12480 tmp_entry->vme_end =
12481 vm_map_round_page(tmp_entry->vme_end,
12482 VM_MAP_PAGE_MASK(src_map));
12483 /* ... adjust for mis-aligned end of copy range */
12484 adjustment =
12485 (vm_map_round_page((copy->offset +
12486 copy->size),
12487 VM_MAP_PAGE_MASK(src_map)) -
12488 vm_map_round_page((copy->offset +
12489 copy->size),
12490 PAGE_MASK));
12491 if (adjustment) {
12492 assert(page_aligned(adjustment));
12493 assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12494 tmp_entry->vme_end -= adjustment;
12495 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12496 }
12497
12498 /*
12499 * Assert that the adjustments haven't exposed
12500 * more than was originally copied...
12501 */
12502 assert(tmp_entry->vme_end <= original_end);
12503 /*
12504 * ... and that it did not adjust outside of a
12505 * a single 16K page.
12506 */
12507 assert(vm_map_round_page(tmp_entry->vme_end,
12508 VM_MAP_PAGE_MASK(src_map)) ==
12509 vm_map_round_page(original_end,
12510 VM_MAP_PAGE_MASK(src_map)));
12511 }
12512 }
12513
12514 /* Fix-up start and end points in copy. This is necessary */
12515 /* when the various entries in the copy object were picked */
12516 /* up from different sub-maps */
12517
12518 tmp_entry = vm_map_copy_first_entry(copy);
12519 copy_size = 0; /* compute actual size */
12520 while (tmp_entry != vm_map_copy_to_entry(copy)) {
12521 assert(VM_MAP_PAGE_ALIGNED(
12522 copy_addr + (tmp_entry->vme_end -
12523 tmp_entry->vme_start),
12524 MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12525 assert(VM_MAP_PAGE_ALIGNED(
12526 copy_addr,
12527 MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12528
12529 /*
12530 * The copy_entries will be injected directly into the
12531 * destination map and might not be "map aligned" there...
12532 */
12533 tmp_entry->map_aligned = FALSE;
12534
12535 tmp_entry->vme_end = copy_addr +
12536 (tmp_entry->vme_end - tmp_entry->vme_start);
12537 tmp_entry->vme_start = copy_addr;
12538 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12539 copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12540 copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12541 tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12542 }
12543
12544 if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12545 copy_size < copy->size) {
12546 /*
12547 * The actual size of the VM map copy is smaller than what
12548 * was requested by the caller. This must be because some
12549 * PAGE_SIZE-sized pages are missing at the end of the last
12550 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12551 * The caller might not have been aware of those missing
12552 * pages and might not want to be aware of it, which is
12553 * fine as long as they don't try to access (and crash on)
12554 * those missing pages.
12555 * Let's adjust the size of the "copy", to avoid failing
12556 * in vm_map_copyout() or vm_map_copy_overwrite().
12557 */
12558 assert(vm_map_round_page(copy_size,
12559 VM_MAP_PAGE_MASK(src_map)) ==
12560 vm_map_round_page(copy->size,
12561 VM_MAP_PAGE_MASK(src_map)));
12562 copy->size = copy_size;
12563 }
12564
12565 *copy_result = copy;
12566 return KERN_SUCCESS;
12567
12568 #undef RETURN
12569 }
12570
12571 kern_return_t
vm_map_copy_extract(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t do_copy,vm_map_copy_t * copy_result,vm_prot_t * cur_prot,vm_prot_t * max_prot,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)12572 vm_map_copy_extract(
12573 vm_map_t src_map,
12574 vm_map_address_t src_addr,
12575 vm_map_size_t len,
12576 boolean_t do_copy,
12577 vm_map_copy_t *copy_result, /* OUT */
12578 vm_prot_t *cur_prot, /* IN/OUT */
12579 vm_prot_t *max_prot, /* IN/OUT */
12580 vm_inherit_t inheritance,
12581 vm_map_kernel_flags_t vmk_flags)
12582 {
12583 vm_map_copy_t copy;
12584 kern_return_t kr;
12585 vm_prot_t required_cur_prot, required_max_prot;
12586
12587 /*
12588 * Check for copies of zero bytes.
12589 */
12590
12591 if (len == 0) {
12592 *copy_result = VM_MAP_COPY_NULL;
12593 return KERN_SUCCESS;
12594 }
12595
12596 /*
12597 * Check that the end address doesn't overflow
12598 */
12599 if (src_addr + len < src_addr) {
12600 return KERN_INVALID_ADDRESS;
12601 }
12602
12603 if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12604 DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12605 }
12606
12607 required_cur_prot = *cur_prot;
12608 required_max_prot = *max_prot;
12609
12610 /*
12611 * Allocate a header element for the list.
12612 *
12613 * Use the start and end in the header to
12614 * remember the endpoints prior to rounding.
12615 */
12616
12617 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12618 copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
12619 copy->offset = 0;
12620 copy->size = len;
12621
12622 kr = vm_map_remap_extract(src_map,
12623 src_addr,
12624 len,
12625 do_copy, /* copy */
12626 copy,
12627 cur_prot, /* IN/OUT */
12628 max_prot, /* IN/OUT */
12629 inheritance,
12630 vmk_flags);
12631 if (kr != KERN_SUCCESS) {
12632 vm_map_copy_discard(copy);
12633 return kr;
12634 }
12635 if (required_cur_prot != VM_PROT_NONE) {
12636 assert((*cur_prot & required_cur_prot) == required_cur_prot);
12637 assert((*max_prot & required_max_prot) == required_max_prot);
12638 }
12639
12640 *copy_result = copy;
12641 return KERN_SUCCESS;
12642 }
12643
12644 static void
vm_map_fork_share(vm_map_t old_map,vm_map_entry_t old_entry,vm_map_t new_map)12645 vm_map_fork_share(
12646 vm_map_t old_map,
12647 vm_map_entry_t old_entry,
12648 vm_map_t new_map)
12649 {
12650 vm_object_t object;
12651 vm_map_entry_t new_entry;
12652
12653 /*
12654 * New sharing code. New map entry
12655 * references original object. Internal
12656 * objects use asynchronous copy algorithm for
12657 * future copies. First make sure we have
12658 * the right object. If we need a shadow,
12659 * or someone else already has one, then
12660 * make a new shadow and share it.
12661 */
12662
12663 if (!old_entry->is_sub_map) {
12664 object = VME_OBJECT(old_entry);
12665 }
12666
12667 if (old_entry->is_sub_map) {
12668 assert(old_entry->wired_count == 0);
12669 #ifndef NO_NESTED_PMAP
12670 #if !PMAP_FORK_NEST
12671 if (old_entry->use_pmap) {
12672 kern_return_t result;
12673
12674 result = pmap_nest(new_map->pmap,
12675 (VME_SUBMAP(old_entry))->pmap,
12676 (addr64_t)old_entry->vme_start,
12677 (uint64_t)(old_entry->vme_end - old_entry->vme_start));
12678 if (result) {
12679 panic("vm_map_fork_share: pmap_nest failed!");
12680 }
12681 }
12682 #endif /* !PMAP_FORK_NEST */
12683 #endif /* NO_NESTED_PMAP */
12684 } else if (object == VM_OBJECT_NULL) {
12685 object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
12686 old_entry->vme_start));
12687 VME_OFFSET_SET(old_entry, 0);
12688 VME_OBJECT_SET(old_entry, object, false, 0);
12689 old_entry->use_pmap = TRUE;
12690 // assert(!old_entry->needs_copy);
12691 } else if (object->copy_strategy !=
12692 MEMORY_OBJECT_COPY_SYMMETRIC) {
12693 /*
12694 * We are already using an asymmetric
12695 * copy, and therefore we already have
12696 * the right object.
12697 */
12698
12699 assert(!old_entry->needs_copy);
12700 } else if (old_entry->needs_copy || /* case 1 */
12701 object->shadowed || /* case 2 */
12702 (!object->true_share && /* case 3 */
12703 !old_entry->is_shared &&
12704 (object->vo_size >
12705 (vm_map_size_t)(old_entry->vme_end -
12706 old_entry->vme_start)))) {
12707 /*
12708 * We need to create a shadow.
12709 * There are three cases here.
12710 * In the first case, we need to
12711 * complete a deferred symmetrical
12712 * copy that we participated in.
12713 * In the second and third cases,
12714 * we need to create the shadow so
12715 * that changes that we make to the
12716 * object do not interfere with
12717 * any symmetrical copies which
12718 * have occured (case 2) or which
12719 * might occur (case 3).
12720 *
12721 * The first case is when we had
12722 * deferred shadow object creation
12723 * via the entry->needs_copy mechanism.
12724 * This mechanism only works when
12725 * only one entry points to the source
12726 * object, and we are about to create
12727 * a second entry pointing to the
12728 * same object. The problem is that
12729 * there is no way of mapping from
12730 * an object to the entries pointing
12731 * to it. (Deferred shadow creation
12732 * works with one entry because occurs
12733 * at fault time, and we walk from the
12734 * entry to the object when handling
12735 * the fault.)
12736 *
12737 * The second case is when the object
12738 * to be shared has already been copied
12739 * with a symmetric copy, but we point
12740 * directly to the object without
12741 * needs_copy set in our entry. (This
12742 * can happen because different ranges
12743 * of an object can be pointed to by
12744 * different entries. In particular,
12745 * a single entry pointing to an object
12746 * can be split by a call to vm_inherit,
12747 * which, combined with task_create, can
12748 * result in the different entries
12749 * having different needs_copy values.)
12750 * The shadowed flag in the object allows
12751 * us to detect this case. The problem
12752 * with this case is that if this object
12753 * has or will have shadows, then we
12754 * must not perform an asymmetric copy
12755 * of this object, since such a copy
12756 * allows the object to be changed, which
12757 * will break the previous symmetrical
12758 * copies (which rely upon the object
12759 * not changing). In a sense, the shadowed
12760 * flag says "don't change this object".
12761 * We fix this by creating a shadow
12762 * object for this object, and sharing
12763 * that. This works because we are free
12764 * to change the shadow object (and thus
12765 * to use an asymmetric copy strategy);
12766 * this is also semantically correct,
12767 * since this object is temporary, and
12768 * therefore a copy of the object is
12769 * as good as the object itself. (This
12770 * is not true for permanent objects,
12771 * since the pager needs to see changes,
12772 * which won't happen if the changes
12773 * are made to a copy.)
12774 *
12775 * The third case is when the object
12776 * to be shared has parts sticking
12777 * outside of the entry we're working
12778 * with, and thus may in the future
12779 * be subject to a symmetrical copy.
12780 * (This is a preemptive version of
12781 * case 2.)
12782 */
12783 VME_OBJECT_SHADOW(old_entry,
12784 (vm_map_size_t) (old_entry->vme_end -
12785 old_entry->vme_start),
12786 vm_map_always_shadow(old_map));
12787
12788 /*
12789 * If we're making a shadow for other than
12790 * copy on write reasons, then we have
12791 * to remove write permission.
12792 */
12793
12794 if (!old_entry->needs_copy &&
12795 (old_entry->protection & VM_PROT_WRITE)) {
12796 vm_prot_t prot;
12797
12798 assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));
12799
12800 prot = old_entry->protection & ~VM_PROT_WRITE;
12801
12802 assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));
12803
12804 if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
12805 prot |= VM_PROT_EXECUTE;
12806 }
12807
12808
12809 if (old_map->mapped_in_other_pmaps) {
12810 vm_object_pmap_protect(
12811 VME_OBJECT(old_entry),
12812 VME_OFFSET(old_entry),
12813 (old_entry->vme_end -
12814 old_entry->vme_start),
12815 PMAP_NULL,
12816 PAGE_SIZE,
12817 old_entry->vme_start,
12818 prot);
12819 } else {
12820 pmap_protect(old_map->pmap,
12821 old_entry->vme_start,
12822 old_entry->vme_end,
12823 prot);
12824 }
12825 }
12826
12827 old_entry->needs_copy = FALSE;
12828 object = VME_OBJECT(old_entry);
12829 }
12830
12831
12832 /*
12833 * If object was using a symmetric copy strategy,
12834 * change its copy strategy to the default
12835 * asymmetric copy strategy, which is copy_delay
12836 * in the non-norma case and copy_call in the
12837 * norma case. Bump the reference count for the
12838 * new entry.
12839 */
12840
12841 if (old_entry->is_sub_map) {
12842 vm_map_reference(VME_SUBMAP(old_entry));
12843 } else {
12844 vm_object_lock(object);
12845 vm_object_reference_locked(object);
12846 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
12847 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
12848 }
12849 vm_object_unlock(object);
12850 }
12851
12852 /*
12853 * Clone the entry, using object ref from above.
12854 * Mark both entries as shared.
12855 */
12856
12857 new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
12858 vm_map_entry_copy(old_map, new_entry, old_entry);
12859 old_entry->is_shared = TRUE;
12860 new_entry->is_shared = TRUE;
12861
12862 /*
12863 * We're dealing with a shared mapping, so the resulting mapping
12864 * should inherit some of the original mapping's accounting settings.
12865 * "iokit_acct" should have been cleared in vm_map_entry_copy().
12866 * "use_pmap" should stay the same as before (if it hasn't been reset
12867 * to TRUE when we cleared "iokit_acct").
12868 */
12869 assert(!new_entry->iokit_acct);
12870
12871 /*
12872 * If old entry's inheritence is VM_INHERIT_NONE,
12873 * the new entry is for corpse fork, remove the
12874 * write permission from the new entry.
12875 */
12876 if (old_entry->inheritance == VM_INHERIT_NONE) {
12877 new_entry->protection &= ~VM_PROT_WRITE;
12878 new_entry->max_protection &= ~VM_PROT_WRITE;
12879 }
12880
12881 /*
12882 * Insert the entry into the new map -- we
12883 * know we're inserting at the end of the new
12884 * map.
12885 */
12886
12887 vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
12888 VM_MAP_KERNEL_FLAGS_NONE);
12889
12890 /*
12891 * Update the physical map
12892 */
12893
12894 if (old_entry->is_sub_map) {
12895 /* Bill Angell pmap support goes here */
12896 } else {
12897 pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
12898 old_entry->vme_end - old_entry->vme_start,
12899 old_entry->vme_start);
12900 }
12901 }
12902
12903 static boolean_t
vm_map_fork_copy(vm_map_t old_map,vm_map_entry_t * old_entry_p,vm_map_t new_map,int vm_map_copyin_flags)12904 vm_map_fork_copy(
12905 vm_map_t old_map,
12906 vm_map_entry_t *old_entry_p,
12907 vm_map_t new_map,
12908 int vm_map_copyin_flags)
12909 {
12910 vm_map_entry_t old_entry = *old_entry_p;
12911 vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
12912 vm_map_offset_t start = old_entry->vme_start;
12913 vm_map_copy_t copy;
12914 vm_map_entry_t last = vm_map_last_entry(new_map);
12915
12916 vm_map_unlock(old_map);
12917 /*
12918 * Use maxprot version of copyin because we
12919 * care about whether this memory can ever
12920 * be accessed, not just whether it's accessible
12921 * right now.
12922 */
12923 vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
12924 if (vm_map_copyin_internal(old_map, start, entry_size,
12925 vm_map_copyin_flags, ©)
12926 != KERN_SUCCESS) {
12927 /*
12928 * The map might have changed while it
12929 * was unlocked, check it again. Skip
12930 * any blank space or permanently
12931 * unreadable region.
12932 */
12933 vm_map_lock(old_map);
12934 if (!vm_map_lookup_entry(old_map, start, &last) ||
12935 (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
12936 last = last->vme_next;
12937 }
12938 *old_entry_p = last;
12939
12940 /*
12941 * XXX For some error returns, want to
12942 * XXX skip to the next element. Note
12943 * that INVALID_ADDRESS and
12944 * PROTECTION_FAILURE are handled above.
12945 */
12946
12947 return FALSE;
12948 }
12949
12950 /*
12951 * Assert that the vm_map_copy is coming from the right
12952 * zone and hasn't been forged
12953 */
12954 vm_map_copy_require(copy);
12955
12956 /*
12957 * Insert the copy into the new map
12958 */
12959 vm_map_copy_insert(new_map, last, copy);
12960
12961 /*
12962 * Pick up the traversal at the end of
12963 * the copied region.
12964 */
12965
12966 vm_map_lock(old_map);
12967 start += entry_size;
12968 if (!vm_map_lookup_entry(old_map, start, &last)) {
12969 last = last->vme_next;
12970 } else {
12971 if (last->vme_start == start) {
12972 /*
12973 * No need to clip here and we don't
12974 * want to cause any unnecessary
12975 * unnesting...
12976 */
12977 } else {
12978 vm_map_clip_start(old_map, last, start);
12979 }
12980 }
12981 *old_entry_p = last;
12982
12983 return TRUE;
12984 }
12985
12986 #if PMAP_FORK_NEST
12987 #define PMAP_FORK_NEST_DEBUG 0
12988 static inline void
vm_map_fork_unnest(pmap_t new_pmap,vm_map_offset_t pre_nested_start,vm_map_offset_t pre_nested_end,vm_map_offset_t start,vm_map_offset_t end)12989 vm_map_fork_unnest(
12990 pmap_t new_pmap,
12991 vm_map_offset_t pre_nested_start,
12992 vm_map_offset_t pre_nested_end,
12993 vm_map_offset_t start,
12994 vm_map_offset_t end)
12995 {
12996 kern_return_t kr;
12997 vm_map_offset_t nesting_mask, start_unnest, end_unnest;
12998
12999 assertf(pre_nested_start <= pre_nested_end,
13000 "pre_nested start 0x%llx end 0x%llx",
13001 (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13002 assertf(start <= end,
13003 "start 0x%llx end 0x%llx",
13004 (uint64_t) start, (uint64_t)end);
13005
13006 if (pre_nested_start == pre_nested_end) {
13007 /* nothing was pre-nested: done */
13008 return;
13009 }
13010 if (end <= pre_nested_start) {
13011 /* fully before pre-nested range: done */
13012 return;
13013 }
13014 if (start >= pre_nested_end) {
13015 /* fully after pre-nested range: done */
13016 return;
13017 }
13018 /* ignore parts of range outside of pre_nested range */
13019 if (start < pre_nested_start) {
13020 start = pre_nested_start;
13021 }
13022 if (end > pre_nested_end) {
13023 end = pre_nested_end;
13024 }
13025 nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
13026 start_unnest = start & ~nesting_mask;
13027 end_unnest = (end + nesting_mask) & ~nesting_mask;
13028 kr = pmap_unnest(new_pmap,
13029 (addr64_t)start_unnest,
13030 (uint64_t)(end_unnest - start_unnest));
13031 #if PMAP_FORK_NEST_DEBUG
13032 printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
13033 #endif /* PMAP_FORK_NEST_DEBUG */
13034 assertf(kr == KERN_SUCCESS,
13035 "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
13036 (uint64_t)start, (uint64_t)end, new_pmap,
13037 (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
13038 kr);
13039 }
13040 #endif /* PMAP_FORK_NEST */
13041
13042 void
vm_map_inherit_limits(vm_map_t new_map,const struct _vm_map * old_map)13043 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13044 {
13045 new_map->size_limit = old_map->size_limit;
13046 new_map->data_limit = old_map->data_limit;
13047 new_map->user_wire_limit = old_map->user_wire_limit;
13048 new_map->reserved_regions = old_map->reserved_regions;
13049 }
13050
/*
 *	vm_map_fork:
 *
 *	Create and return a new map based on the old
 *	map, according to the inheritance values on the
 *	regions in that map and the options.
 *
 *	Options:
 *	  VM_MAP_FORK_SHARE_IF_INHERIT_NONE: treat readable
 *	    VM_INHERIT_NONE entries (not device-pager backed)
 *	    as VM_INHERIT_SHARE.
 *	  VM_MAP_FORK_PRESERVE_PURGEABLE: preserve purgeability
 *	    on the slow copy path.
 *	  VM_MAP_FORK_CORPSE_FOOTPRINT: also collect footprint
 *	    data for later corpse autopsy.
 *
 *	The source map must not be locked.
 *
 *	Returns VM_MAP_NULL on unsupported options, pmap creation
 *	failure, or when a corpse fork is aborted by system shutdown.
 */
vm_map_t
vm_map_fork(
	ledger_t        ledger,
	vm_map_t        old_map,
	int             options)
{
	pmap_t          new_pmap;
	vm_map_t        new_map;
	vm_map_entry_t  old_entry;
	vm_map_size_t   new_size = 0, entry_size;
	vm_map_entry_t  new_entry;
	boolean_t       src_needs_copy;
	boolean_t       new_entry_needs_copy;
	boolean_t       pmap_is64bit;
	int             vm_map_copyin_flags;
	vm_inherit_t    old_entry_inheritance;
	int             map_create_options;
	kern_return_t   footprint_collect_kr;

	if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
	    VM_MAP_FORK_PRESERVE_PURGEABLE |
	    VM_MAP_FORK_CORPSE_FOOTPRINT)) {
		/* unsupported option */
		return VM_MAP_NULL;
	}

	/* determine the child's address-space width from the parent pmap */
	pmap_is64bit =
#if defined(__i386__) || defined(__x86_64__)
	    old_map->pmap->pm_task_map != TASK_MAP_32BIT;
#elif defined(__arm64__)
	    old_map->pmap->is_64bit;
#else
#error Unknown architecture.
#endif

	/* the child pmap inherits the parent's JOP/Rosetta/page-size traits */
	unsigned int pmap_flags = 0;
	pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
#if defined(HAS_APPLE_PAC)
	pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
#endif
#if CONFIG_ROSETTA
	pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
#endif
#if PMAP_CREATE_FORCE_4K_PAGES
	if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
	    PAGE_SIZE != FOURK_PAGE_SIZE) {
		pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
	}
#endif /* PMAP_CREATE_FORCE_4K_PAGES */
	new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
	if (new_pmap == NULL) {
		return VM_MAP_NULL;
	}

	vm_map_reference(old_map);
	vm_map_lock(old_map);

	map_create_options = 0;
	if (old_map->hdr.entries_pageable) {
		map_create_options |= VM_MAP_CREATE_PAGEABLE;
	}
	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
		map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
		/*
		 * NOTE: footprint_collect_kr is only ever read below under
		 * the same VM_MAP_FORK_CORPSE_FOOTPRINT test, so this is
		 * the only initialization it needs.
		 */
		footprint_collect_kr = KERN_SUCCESS;
	}
	new_map = vm_map_create_options(new_pmap,
	    old_map->min_offset,
	    old_map->max_offset,
	    map_create_options);

	/* inherit cs_enforcement */
	vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);

	vm_map_lock(new_map);
	vm_commit_pagezero_status(new_map);
	/* inherit the parent map's page size */
	vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));

	/* inherit the parent rlimits */
	vm_map_inherit_limits(new_map, old_map);

#if CONFIG_MAP_RANGES
	/* inherit the parent map's VM ranges */
	vm_map_range_fork(new_map, old_map);
#endif

#if CODE_SIGNING_MONITOR
	/* Prepare the monitor for the fork */
	csm_fork_prepare(old_map->pmap, new_pmap);
#endif

#if PMAP_FORK_NEST
	/*
	 * Pre-nest the shared region's pmap.
	 * Any portions that turn out not to be needed are undone entry
	 * by entry via vm_map_fork_unnest() in the loop below.
	 */
	vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
	pmap_fork_nest(old_map->pmap, new_pmap,
	    &pre_nested_start, &pre_nested_end);
#if PMAP_FORK_NEST_DEBUG
	printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
#endif /* PMAP_FORK_NEST_DEBUG */
#endif /* PMAP_FORK_NEST */

	/*
	 * Walk the parent map's entries; both maps remain locked for the
	 * duration of the walk (vm_map_fork_copy temporarily drops and
	 * re-takes the old map's lock).
	 */
	for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
		/*
		 * Abort any corpse collection if the system is shutting down.
		 */
		if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
		    get_system_inshutdown()) {
#if PMAP_FORK_NEST
			/* undo whatever pre-nesting remains past the last copied entry */
			new_entry = vm_map_last_entry(new_map);
			if (new_entry == vm_map_to_entry(new_map)) {
				/* unnest all that was pre-nested */
				vm_map_fork_unnest(new_pmap,
				    pre_nested_start, pre_nested_end,
				    vm_map_min(new_map), vm_map_max(new_map));
			} else if (new_entry->vme_end < vm_map_max(new_map)) {
				/* unnest hole at the end, if pre-nested */
				vm_map_fork_unnest(new_pmap,
				    pre_nested_start, pre_nested_end,
				    new_entry->vme_end, vm_map_max(new_map));
			}
#endif /* PMAP_FORK_NEST */
			vm_map_corpse_footprint_collect_done(new_map);
			vm_map_unlock(new_map);
			vm_map_unlock(old_map);
			vm_map_deallocate(new_map);
			vm_map_deallocate(old_map);
			printf("Aborting corpse map due to system shutdown\n");
			return VM_MAP_NULL;
		}

		entry_size = old_entry->vme_end - old_entry->vme_start;

#if PMAP_FORK_NEST
		/*
		 * Undo any unnecessary pre-nesting.
		 */
		vm_map_offset_t prev_end;
		if (old_entry == vm_map_first_entry(old_map)) {
			prev_end = vm_map_min(old_map);
		} else {
			prev_end = old_entry->vme_prev->vme_end;
		}
		if (prev_end < old_entry->vme_start) {
			/* unnest hole before this entry, if pre-nested */
			vm_map_fork_unnest(new_pmap,
			    pre_nested_start, pre_nested_end,
			    prev_end, old_entry->vme_start);
		}
		if (old_entry->is_sub_map && old_entry->use_pmap) {
			/* keep this entry nested in the child */
#if PMAP_FORK_NEST_DEBUG
			printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
#endif /* PMAP_FORK_NEST_DEBUG */
		} else {
			/* undo nesting for this entry, if pre-nested */
			vm_map_fork_unnest(new_pmap,
			    pre_nested_start, pre_nested_end,
			    old_entry->vme_start, old_entry->vme_end);
		}
#endif /* PMAP_FORK_NEST */

		old_entry_inheritance = old_entry->inheritance;
		/*
		 * If caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option
		 * share VM_INHERIT_NONE entries that are not backed by a
		 * device pager.  (Device-pager-backed entries are excluded
		 * by the negated condition below.)
		 */
		if (old_entry_inheritance == VM_INHERIT_NONE &&
		    (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
		    (old_entry->protection & VM_PROT_READ) &&
		    !(!old_entry->is_sub_map &&
		    VME_OBJECT(old_entry) != NULL &&
		    VME_OBJECT(old_entry)->pager != NULL &&
		    is_device_pager_ops(
			    VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
			old_entry_inheritance = VM_INHERIT_SHARE;
		}

		if (old_entry_inheritance != VM_INHERIT_NONE &&
		    (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
		    footprint_collect_kr == KERN_SUCCESS) {
			/*
			 * The corpse won't have old_map->pmap to query
			 * footprint information, so collect that data now
			 * and store it in new_map->vmmap_corpse_footprint
			 * for later autopsy.
			 */
			footprint_collect_kr =
			    vm_map_corpse_footprint_collect(old_map,
			    old_entry,
			    new_map);
		}

		switch (old_entry_inheritance) {
		case VM_INHERIT_NONE:
			/* entry is simply not propagated to the child */
			break;

		case VM_INHERIT_SHARE:
			vm_map_fork_share(old_map, old_entry, new_map);
			new_size += entry_size;
			break;

		case VM_INHERIT_COPY:

			/*
			 * Inline the copy_quickly case;
			 * upon failure, fall back on call
			 * to vm_map_fork_copy.
			 */

			if (old_entry->is_sub_map) {
				/* submaps are not copied to the child */
				break;
			}
			if ((old_entry->wired_count != 0) ||
			    ((VME_OBJECT(old_entry) != NULL) &&
			    (VME_OBJECT(old_entry)->true_share))) {
				/* wired or truly-shared memory: take the slow path */
				goto slow_vm_map_fork_copy;
			}

			new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
			vm_map_entry_copy(old_map, new_entry, old_entry);
			if (old_entry->vme_permanent) {
				/* inherit "permanent" on fork() */
				new_entry->vme_permanent = TRUE;
			}

			if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
				new_map->jit_entry_exists = TRUE;
			}

			if (new_entry->is_sub_map) {
				/* clear address space specifics */
				new_entry->use_pmap = FALSE;
			} else {
				/*
				 * We're dealing with a copy-on-write operation,
				 * so the resulting mapping should not inherit
				 * the original mapping's accounting settings.
				 * "iokit_acct" should have been cleared in
				 * vm_map_entry_copy().
				 * "use_pmap" should be reset to its default
				 * (TRUE) so that the new mapping gets
				 * accounted for in the task's memory footprint.
				 */
				assert(!new_entry->iokit_acct);
				new_entry->use_pmap = TRUE;
			}

			if (!vm_object_copy_quickly(
				    VME_OBJECT(new_entry),
				    VME_OFFSET(old_entry),
				    (old_entry->vme_end -
				    old_entry->vme_start),
				    &src_needs_copy,
				    &new_entry_needs_copy)) {
				/* quick copy refused: discard the entry and go slow */
				vm_map_entry_dispose(new_entry);
				goto slow_vm_map_fork_copy;
			}

			/*
			 * Handle copy-on-write obligations
			 */

			if (src_needs_copy && !old_entry->needs_copy) {
				vm_prot_t prot;

				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));

				/* write-protect the parent's pages to trigger COW */
				prot = old_entry->protection & ~VM_PROT_WRITE;

				if (override_nx(old_map, VME_ALIAS(old_entry))
				    && prot) {
					prot |= VM_PROT_EXECUTE;
				}

				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));

				vm_object_pmap_protect(
					VME_OBJECT(old_entry),
					VME_OFFSET(old_entry),
					(old_entry->vme_end -
					old_entry->vme_start),
					((old_entry->is_shared
					|| old_map->mapped_in_other_pmaps)
					? PMAP_NULL :
					old_map->pmap),
					VM_MAP_PAGE_SIZE(old_map),
					old_entry->vme_start,
					prot);

				assert(old_entry->wired_count == 0);
				old_entry->needs_copy = TRUE;
			}
			new_entry->needs_copy = new_entry_needs_copy;

			/*
			 * Insert the entry at the end
			 * of the map.
			 */

			vm_map_store_entry_link(new_map,
			    vm_map_last_entry(new_map),
			    new_entry,
			    VM_MAP_KERNEL_FLAGS_NONE);
			new_size += entry_size;
			break;

slow_vm_map_fork_copy:
			vm_map_copyin_flags = 0;
			if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
				vm_map_copyin_flags |=
				    VM_MAP_COPYIN_PRESERVE_PURGEABLE;
			}
			if (vm_map_fork_copy(old_map,
			    &old_entry,
			    new_map,
			    vm_map_copyin_flags)) {
				new_size += entry_size;
			}
			/* vm_map_fork_copy() already advanced old_entry */
			continue;
		}
		old_entry = old_entry->vme_next;
	}

#if PMAP_FORK_NEST
	/* undo any pre-nesting left past the last entry of the child map */
	new_entry = vm_map_last_entry(new_map);
	if (new_entry == vm_map_to_entry(new_map)) {
		/* unnest all that was pre-nested */
		vm_map_fork_unnest(new_pmap,
		    pre_nested_start, pre_nested_end,
		    vm_map_min(new_map), vm_map_max(new_map));
	} else if (new_entry->vme_end < vm_map_max(new_map)) {
		/* unnest hole at the end, if pre-nested */
		vm_map_fork_unnest(new_pmap,
		    pre_nested_start, pre_nested_end,
		    new_entry->vme_end, vm_map_max(new_map));
	}
#endif /* PMAP_FORK_NEST */

#if defined(__arm64__)
	pmap_insert_commpage(new_map->pmap);
#endif /* __arm64__ */

	new_map->size = new_size;

	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
		vm_map_corpse_footprint_collect_done(new_map);
	}

	/* Propagate JIT entitlement for the pmap layer. */
	if (pmap_get_jit_entitled(old_map->pmap)) {
		/* Tell the pmap that it supports JIT. */
		pmap_set_jit_entitled(new_map->pmap);
	}

	/* Propagate TPRO settings for the pmap layer */
	if (pmap_get_tpro(old_map->pmap)) {
		/* Tell the pmap that it supports TPRO */
		pmap_set_tpro(new_map->pmap);
	}

	vm_map_unlock(new_map);
	vm_map_unlock(old_map);
	/* drop the reference taken at the top; caller owns new_map */
	vm_map_deallocate(old_map);

	return new_map;
}
13429
13430 /*
13431 * vm_map_exec:
13432 *
13433 * Setup the "new_map" with the proper execution environment according
13434 * to the type of executable (platform, 64bit, chroot environment).
13435 * Map the comm page and shared region, etc...
13436 */
13437 kern_return_t
vm_map_exec(vm_map_t new_map,task_t task,boolean_t is64bit,void * fsroot,cpu_type_t cpu,cpu_subtype_t cpu_subtype,boolean_t reslide,boolean_t is_driverkit,uint32_t rsr_version)13438 vm_map_exec(
13439 vm_map_t new_map,
13440 task_t task,
13441 boolean_t is64bit,
13442 void *fsroot,
13443 cpu_type_t cpu,
13444 cpu_subtype_t cpu_subtype,
13445 boolean_t reslide,
13446 boolean_t is_driverkit,
13447 uint32_t rsr_version)
13448 {
13449 SHARED_REGION_TRACE_DEBUG(
13450 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13451 (void *)VM_KERNEL_ADDRPERM(current_task()),
13452 (void *)VM_KERNEL_ADDRPERM(new_map),
13453 (void *)VM_KERNEL_ADDRPERM(task),
13454 (void *)VM_KERNEL_ADDRPERM(fsroot),
13455 cpu,
13456 cpu_subtype));
13457 (void) vm_commpage_enter(new_map, task, is64bit);
13458
13459 (void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13460
13461 SHARED_REGION_TRACE_DEBUG(
13462 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13463 (void *)VM_KERNEL_ADDRPERM(current_task()),
13464 (void *)VM_KERNEL_ADDRPERM(new_map),
13465 (void *)VM_KERNEL_ADDRPERM(task),
13466 (void *)VM_KERNEL_ADDRPERM(fsroot),
13467 cpu,
13468 cpu_subtype));
13469
13470 /*
13471 * Some devices have region(s) of memory that shouldn't get allocated by
13472 * user processes. The following code creates dummy vm_map_entry_t's for each
13473 * of the regions that needs to be reserved to prevent any allocations in
13474 * those regions.
13475 */
13476 kern_return_t kr = KERN_FAILURE;
13477 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
13478 vmk_flags.vmkf_beyond_max = true;
13479
13480 const struct vm_reserved_region *regions = NULL;
13481 size_t num_regions = ml_get_vm_reserved_regions(is64bit, ®ions);
13482 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13483
13484 for (size_t i = 0; i < num_regions; ++i) {
13485 vm_map_offset_t address = regions[i].vmrr_addr;
13486
13487 kr = vm_map_enter(
13488 new_map,
13489 &address,
13490 regions[i].vmrr_size,
13491 (vm_map_offset_t)0,
13492 vmk_flags,
13493 VM_OBJECT_NULL,
13494 (vm_object_offset_t)0,
13495 FALSE,
13496 VM_PROT_NONE,
13497 VM_PROT_NONE,
13498 VM_INHERIT_COPY);
13499
13500 if (kr != KERN_SUCCESS) {
13501 panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
13502 }
13503 }
13504
13505 new_map->reserved_regions = (num_regions ? TRUE : FALSE);
13506
13507 return KERN_SUCCESS;
13508 }
13509
/*
 * Statistics counters for the copy-on-write paths of
 * vm_map_lookup_and_lock_object(): per-strategy call counts, cumulative
 * and maximum sizes, plus restart/error tallies.
 * NOTE(review): the updating code is elsewhere in this file; names
 * suggest "slowly" / "strategically" / "shadow" match the object copy
 * strategies — confirm against the fault path before relying on this.
 */
uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13523 /*
13524 * vm_map_lookup_and_lock_object:
13525 *
13526 * Finds the VM object, offset, and
13527 * protection for a given virtual address in the
13528 * specified map, assuming a page fault of the
13529 * type specified.
13530 *
13531 * Returns the (object, offset, protection) for
13532 * this address, whether it is wired down, and whether
13533 * this map has the only reference to the data in question.
13534 * In order to later verify this lookup, a "version"
13535 * is returned.
13536 * If contended != NULL, *contended will be set to
13537 * true iff the thread had to spin or block to acquire
13538 * an exclusive lock.
13539 *
13540 * The map MUST be locked by the caller and WILL be
13541 * locked on exit. In order to guarantee the
13542 * existence of the returned object, it is returned
13543 * locked.
13544 *
13545 * If a lookup is requested with "write protection"
13546 * specified, the map may be changed to perform virtual
13547 * copying operations, although the data referenced will
13548 * remain the same.
13549 */
13550 kern_return_t
vm_map_lookup_and_lock_object(vm_map_t * var_map,vm_map_offset_t vaddr,vm_prot_t fault_type,int object_lock_type,vm_map_version_t * out_version,vm_object_t * object,vm_object_offset_t * offset,vm_prot_t * out_prot,boolean_t * wired,vm_object_fault_info_t fault_info,vm_map_t * real_map,bool * contended)13551 vm_map_lookup_and_lock_object(
13552 vm_map_t *var_map, /* IN/OUT */
13553 vm_map_offset_t vaddr,
13554 vm_prot_t fault_type,
13555 int object_lock_type,
13556 vm_map_version_t *out_version, /* OUT */
13557 vm_object_t *object, /* OUT */
13558 vm_object_offset_t *offset, /* OUT */
13559 vm_prot_t *out_prot, /* OUT */
13560 boolean_t *wired, /* OUT */
13561 vm_object_fault_info_t fault_info, /* OUT */
13562 vm_map_t *real_map, /* OUT */
13563 bool *contended) /* OUT */
13564 {
13565 vm_map_entry_t entry;
13566 vm_map_t map = *var_map;
13567 vm_map_t old_map = *var_map;
13568 vm_map_t cow_sub_map_parent = VM_MAP_NULL;
13569 vm_map_offset_t cow_parent_vaddr = 0;
13570 vm_map_offset_t old_start = 0;
13571 vm_map_offset_t old_end = 0;
13572 vm_prot_t prot;
13573 boolean_t mask_protections;
13574 boolean_t force_copy;
13575 boolean_t no_force_copy_if_executable;
13576 boolean_t submap_needed_copy;
13577 vm_prot_t original_fault_type;
13578 vm_map_size_t fault_page_mask;
13579
13580 /*
13581 * VM_PROT_MASK means that the caller wants us to use "fault_type"
13582 * as a mask against the mapping's actual protections, not as an
13583 * absolute value.
13584 */
13585 mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
13586 force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
13587 no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
13588 fault_type &= VM_PROT_ALL;
13589 original_fault_type = fault_type;
13590 if (contended) {
13591 *contended = false;
13592 }
13593
13594 *real_map = map;
13595
13596 fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
13597 vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
13598
13599 RetryLookup:
13600 fault_type = original_fault_type;
13601
13602 /*
13603 * If the map has an interesting hint, try it before calling
13604 * full blown lookup routine.
13605 */
13606 entry = map->hint;
13607
13608 if ((entry == vm_map_to_entry(map)) ||
13609 (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
13610 vm_map_entry_t tmp_entry;
13611
13612 /*
13613 * Entry was either not a valid hint, or the vaddr
13614 * was not contained in the entry, so do a full lookup.
13615 */
13616 if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
13617 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13618 vm_map_unlock(cow_sub_map_parent);
13619 }
13620 if ((*real_map != map)
13621 && (*real_map != cow_sub_map_parent)) {
13622 vm_map_unlock(*real_map);
13623 }
13624 return KERN_INVALID_ADDRESS;
13625 }
13626
13627 entry = tmp_entry;
13628 }
13629 if (map == old_map) {
13630 old_start = entry->vme_start;
13631 old_end = entry->vme_end;
13632 }
13633
13634 /*
13635 * Handle submaps. Drop lock on upper map, submap is
13636 * returned locked.
13637 */
13638
13639 submap_needed_copy = FALSE;
13640 submap_recurse:
13641 if (entry->is_sub_map) {
13642 vm_map_offset_t local_vaddr;
13643 vm_map_offset_t end_delta;
13644 vm_map_offset_t start_delta;
13645 vm_map_offset_t top_entry_saved_start;
13646 vm_object_offset_t top_entry_saved_offset;
13647 vm_map_entry_t submap_entry, saved_submap_entry;
13648 vm_object_offset_t submap_entry_offset;
13649 vm_object_size_t submap_entry_size;
13650 vm_prot_t subentry_protection;
13651 vm_prot_t subentry_max_protection;
13652 boolean_t subentry_no_copy_on_read;
13653 boolean_t subentry_permanent;
13654 boolean_t subentry_csm_associated;
13655 boolean_t mapped_needs_copy = FALSE;
13656 vm_map_version_t version;
13657
13658 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
13659 "map %p (%d) entry %p submap %p (%d)\n",
13660 map, VM_MAP_PAGE_SHIFT(map), entry,
13661 VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
13662
13663 local_vaddr = vaddr;
13664 top_entry_saved_start = entry->vme_start;
13665 top_entry_saved_offset = VME_OFFSET(entry);
13666
13667 if ((entry->use_pmap &&
13668 !((fault_type & VM_PROT_WRITE) ||
13669 force_copy))) {
13670 /* if real_map equals map we unlock below */
13671 if ((*real_map != map) &&
13672 (*real_map != cow_sub_map_parent)) {
13673 vm_map_unlock(*real_map);
13674 }
13675 *real_map = VME_SUBMAP(entry);
13676 }
13677
13678 if (entry->needs_copy &&
13679 ((fault_type & VM_PROT_WRITE) ||
13680 force_copy)) {
13681 if (!mapped_needs_copy) {
13682 if (vm_map_lock_read_to_write(map)) {
13683 vm_map_lock_read(map);
13684 *real_map = map;
13685 goto RetryLookup;
13686 }
13687 vm_map_lock_read(VME_SUBMAP(entry));
13688 *var_map = VME_SUBMAP(entry);
13689 cow_sub_map_parent = map;
13690 /* reset base to map before cow object */
13691 /* this is the map which will accept */
13692 /* the new cow object */
13693 old_start = entry->vme_start;
13694 old_end = entry->vme_end;
13695 cow_parent_vaddr = vaddr;
13696 mapped_needs_copy = TRUE;
13697 } else {
13698 vm_map_lock_read(VME_SUBMAP(entry));
13699 *var_map = VME_SUBMAP(entry);
13700 if ((cow_sub_map_parent != map) &&
13701 (*real_map != map)) {
13702 vm_map_unlock(map);
13703 }
13704 }
13705 } else {
13706 if (entry->needs_copy) {
13707 submap_needed_copy = TRUE;
13708 }
13709 vm_map_lock_read(VME_SUBMAP(entry));
13710 *var_map = VME_SUBMAP(entry);
13711 /* leave map locked if it is a target */
13712 /* cow sub_map above otherwise, just */
13713 /* follow the maps down to the object */
13714 /* here we unlock knowing we are not */
13715 /* revisiting the map. */
13716 if ((*real_map != map) && (map != cow_sub_map_parent)) {
13717 vm_map_unlock_read(map);
13718 }
13719 }
13720
13721 entry = NULL;
13722 map = *var_map;
13723
13724 /* calculate the offset in the submap for vaddr */
13725 local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
13726 assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
13727 "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
13728 (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);
13729
13730 RetrySubMap:
13731 if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
13732 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13733 vm_map_unlock(cow_sub_map_parent);
13734 }
13735 if ((*real_map != map)
13736 && (*real_map != cow_sub_map_parent)) {
13737 vm_map_unlock(*real_map);
13738 }
13739 *real_map = map;
13740 return KERN_INVALID_ADDRESS;
13741 }
13742
13743 /* find the attenuated shadow of the underlying object */
13744 /* on our target map */
13745
13746 /* in english the submap object may extend beyond the */
13747 /* region mapped by the entry or, may only fill a portion */
13748 /* of it. For our purposes, we only care if the object */
13749 /* doesn't fill. In this case the area which will */
13750 /* ultimately be clipped in the top map will only need */
13751 /* to be as big as the portion of the underlying entry */
13752 /* which is mapped */
13753 start_delta = submap_entry->vme_start > top_entry_saved_offset ?
13754 submap_entry->vme_start - top_entry_saved_offset : 0;
13755
13756 end_delta =
13757 (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
13758 submap_entry->vme_end ?
13759 0 : (top_entry_saved_offset +
13760 (old_end - old_start))
13761 - submap_entry->vme_end;
13762
13763 old_start += start_delta;
13764 old_end -= end_delta;
13765
13766 if (submap_entry->is_sub_map) {
13767 entry = submap_entry;
13768 vaddr = local_vaddr;
13769 goto submap_recurse;
13770 }
13771
13772 if (((fault_type & VM_PROT_WRITE) ||
13773 force_copy)
13774 && cow_sub_map_parent) {
13775 vm_object_t sub_object, copy_object;
13776 vm_object_offset_t copy_offset;
13777 vm_map_offset_t local_start;
13778 vm_map_offset_t local_end;
13779 boolean_t object_copied = FALSE;
13780 vm_object_offset_t object_copied_offset = 0;
13781 boolean_t object_copied_needs_copy = FALSE;
13782 kern_return_t kr = KERN_SUCCESS;
13783
13784 if (vm_map_lock_read_to_write(map)) {
13785 vm_map_lock_read(map);
13786 old_start -= start_delta;
13787 old_end += end_delta;
13788 goto RetrySubMap;
13789 }
13790
13791
13792 sub_object = VME_OBJECT(submap_entry);
13793 if (sub_object == VM_OBJECT_NULL) {
13794 sub_object =
13795 vm_object_allocate(
13796 (vm_map_size_t)
13797 (submap_entry->vme_end -
13798 submap_entry->vme_start));
13799 VME_OBJECT_SET(submap_entry, sub_object, false, 0);
13800 VME_OFFSET_SET(submap_entry, 0);
13801 assert(!submap_entry->is_sub_map);
13802 assert(submap_entry->use_pmap);
13803 }
13804 local_start = local_vaddr -
13805 (cow_parent_vaddr - old_start);
13806 local_end = local_vaddr +
13807 (old_end - cow_parent_vaddr);
13808 vm_map_clip_start(map, submap_entry, local_start);
13809 vm_map_clip_end(map, submap_entry, local_end);
13810 if (submap_entry->is_sub_map) {
13811 /* unnesting was done when clipping */
13812 assert(!submap_entry->use_pmap);
13813 }
13814
13815 /* This is the COW case, lets connect */
13816 /* an entry in our space to the underlying */
13817 /* object in the submap, bypassing the */
13818 /* submap. */
13819 submap_entry_offset = VME_OFFSET(submap_entry);
13820 submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
13821
13822 if ((submap_entry->wired_count != 0 ||
13823 sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
13824 (submap_entry->protection & VM_PROT_EXECUTE) &&
13825 no_force_copy_if_executable) {
13826 // printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
13827 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13828 vm_map_unlock(cow_sub_map_parent);
13829 }
13830 if ((*real_map != map)
13831 && (*real_map != cow_sub_map_parent)) {
13832 vm_map_unlock(*real_map);
13833 }
13834 *real_map = map;
13835 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
13836 vm_map_lock_write_to_read(map);
13837 kr = KERN_PROTECTION_FAILURE;
13838 DTRACE_VM4(submap_no_copy_executable,
13839 vm_map_t, map,
13840 vm_object_offset_t, submap_entry_offset,
13841 vm_object_size_t, submap_entry_size,
13842 int, kr);
13843 return kr;
13844 }
13845
13846 if (submap_entry->wired_count != 0) {
13847 vm_object_reference(sub_object);
13848
13849 assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
13850 "submap_entry %p offset 0x%llx\n",
13851 submap_entry, VME_OFFSET(submap_entry));
13852
13853 DTRACE_VM6(submap_copy_slowly,
13854 vm_map_t, cow_sub_map_parent,
13855 vm_map_offset_t, vaddr,
13856 vm_map_t, map,
13857 vm_object_size_t, submap_entry_size,
13858 int, submap_entry->wired_count,
13859 int, sub_object->copy_strategy);
13860
13861 saved_submap_entry = submap_entry;
13862 version.main_timestamp = map->timestamp;
13863 vm_map_unlock(map); /* Increments timestamp by 1 */
13864 submap_entry = VM_MAP_ENTRY_NULL;
13865
13866 vm_object_lock(sub_object);
13867 kr = vm_object_copy_slowly(sub_object,
13868 submap_entry_offset,
13869 submap_entry_size,
13870 FALSE,
13871 ©_object);
13872 object_copied = TRUE;
13873 object_copied_offset = 0;
13874 /* 4k: account for extra offset in physical page */
13875 object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
13876 object_copied_needs_copy = FALSE;
13877 vm_object_deallocate(sub_object);
13878
13879 vm_map_lock(map);
13880
13881 if (kr != KERN_SUCCESS &&
13882 kr != KERN_MEMORY_RESTART_COPY) {
13883 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13884 vm_map_unlock(cow_sub_map_parent);
13885 }
13886 if ((*real_map != map)
13887 && (*real_map != cow_sub_map_parent)) {
13888 vm_map_unlock(*real_map);
13889 }
13890 *real_map = map;
13891 vm_object_deallocate(copy_object);
13892 copy_object = VM_OBJECT_NULL;
13893 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
13894 vm_map_lock_write_to_read(map);
13895 DTRACE_VM4(submap_copy_error_slowly,
13896 vm_object_t, sub_object,
13897 vm_object_offset_t, submap_entry_offset,
13898 vm_object_size_t, submap_entry_size,
13899 int, kr);
13900 vm_map_lookup_and_lock_object_copy_slowly_error++;
13901 return kr;
13902 }
13903
13904 if ((kr == KERN_SUCCESS) &&
13905 (version.main_timestamp + 1) == map->timestamp) {
13906 submap_entry = saved_submap_entry;
13907 } else {
13908 saved_submap_entry = NULL;
13909 old_start -= start_delta;
13910 old_end += end_delta;
13911 vm_object_deallocate(copy_object);
13912 copy_object = VM_OBJECT_NULL;
13913 vm_map_lock_write_to_read(map);
13914 vm_map_lookup_and_lock_object_copy_slowly_restart++;
13915 goto RetrySubMap;
13916 }
13917 vm_map_lookup_and_lock_object_copy_slowly_count++;
13918 vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
13919 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
13920 vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
13921 }
13922 } else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
13923 submap_entry_offset = VME_OFFSET(submap_entry);
13924 copy_object = VM_OBJECT_NULL;
13925 object_copied_offset = submap_entry_offset;
13926 object_copied_needs_copy = FALSE;
13927 DTRACE_VM6(submap_copy_strategically,
13928 vm_map_t, cow_sub_map_parent,
13929 vm_map_offset_t, vaddr,
13930 vm_map_t, map,
13931 vm_object_size_t, submap_entry_size,
13932 int, submap_entry->wired_count,
13933 int, sub_object->copy_strategy);
13934 kr = vm_object_copy_strategically(
13935 sub_object,
13936 submap_entry_offset,
13937 submap_entry->vme_end - submap_entry->vme_start,
13938 ©_object,
13939 &object_copied_offset,
13940 &object_copied_needs_copy);
13941 if (kr == KERN_MEMORY_RESTART_COPY) {
13942 old_start -= start_delta;
13943 old_end += end_delta;
13944 vm_object_deallocate(copy_object);
13945 copy_object = VM_OBJECT_NULL;
13946 vm_map_lock_write_to_read(map);
13947 vm_map_lookup_and_lock_object_copy_strategically_restart++;
13948 goto RetrySubMap;
13949 }
13950 if (kr != KERN_SUCCESS) {
13951 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13952 vm_map_unlock(cow_sub_map_parent);
13953 }
13954 if ((*real_map != map)
13955 && (*real_map != cow_sub_map_parent)) {
13956 vm_map_unlock(*real_map);
13957 }
13958 *real_map = map;
13959 vm_object_deallocate(copy_object);
13960 copy_object = VM_OBJECT_NULL;
13961 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
13962 vm_map_lock_write_to_read(map);
13963 DTRACE_VM4(submap_copy_error_strategically,
13964 vm_object_t, sub_object,
13965 vm_object_offset_t, submap_entry_offset,
13966 vm_object_size_t, submap_entry_size,
13967 int, kr);
13968 vm_map_lookup_and_lock_object_copy_strategically_error++;
13969 return kr;
13970 }
13971 assert(copy_object != VM_OBJECT_NULL);
13972 assert(copy_object != sub_object);
13973 object_copied = TRUE;
13974 vm_map_lookup_and_lock_object_copy_strategically_count++;
13975 vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
13976 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
13977 vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
13978 }
13979 } else {
13980 /* set up shadow object */
13981 object_copied = FALSE;
13982 copy_object = sub_object;
13983 vm_object_lock(sub_object);
13984 vm_object_reference_locked(sub_object);
13985 sub_object->shadowed = TRUE;
13986 vm_object_unlock(sub_object);
13987
13988 assert(submap_entry->wired_count == 0);
13989 submap_entry->needs_copy = TRUE;
13990
13991 prot = submap_entry->protection;
13992 assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
13993 prot = prot & ~VM_PROT_WRITE;
13994 assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
13995
13996 if (override_nx(old_map,
13997 VME_ALIAS(submap_entry))
13998 && prot) {
13999 prot |= VM_PROT_EXECUTE;
14000 }
14001
14002 vm_object_pmap_protect(
14003 sub_object,
14004 VME_OFFSET(submap_entry),
14005 submap_entry->vme_end -
14006 submap_entry->vme_start,
14007 (submap_entry->is_shared
14008 || map->mapped_in_other_pmaps) ?
14009 PMAP_NULL : map->pmap,
14010 VM_MAP_PAGE_SIZE(map),
14011 submap_entry->vme_start,
14012 prot);
14013 vm_map_lookup_and_lock_object_copy_shadow_count++;
14014 vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
14015 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
14016 vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
14017 }
14018 }
14019
14020 /*
14021 * Adjust the fault offset to the submap entry.
14022 */
14023 copy_offset = (local_vaddr -
14024 submap_entry->vme_start +
14025 VME_OFFSET(submap_entry));
14026
14027 			/* This works differently than the */
14028 /* normal submap case. We go back */
14029 /* to the parent of the cow map and*/
14030 /* clip out the target portion of */
14031 /* the sub_map, substituting the */
14032 /* new copy object, */
14033
14034 subentry_protection = submap_entry->protection;
14035 subentry_max_protection = submap_entry->max_protection;
14036 subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
14037 subentry_permanent = submap_entry->vme_permanent;
14038 subentry_csm_associated = submap_entry->csm_associated;
14039
14040 vm_map_unlock(map);
14041 submap_entry = NULL; /* not valid after map unlock */
14042
14043 local_start = old_start;
14044 local_end = old_end;
14045 map = cow_sub_map_parent;
14046 *var_map = cow_sub_map_parent;
14047 vaddr = cow_parent_vaddr;
14048 cow_sub_map_parent = NULL;
14049
14050 if (!vm_map_lookup_entry(map,
14051 vaddr, &entry)) {
14052 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14053 vm_map_unlock(cow_sub_map_parent);
14054 }
14055 if ((*real_map != map)
14056 && (*real_map != cow_sub_map_parent)) {
14057 vm_map_unlock(*real_map);
14058 }
14059 *real_map = map;
14060 vm_object_deallocate(
14061 copy_object);
14062 copy_object = VM_OBJECT_NULL;
14063 vm_map_lock_write_to_read(map);
14064 DTRACE_VM4(submap_lookup_post_unlock,
14065 uint64_t, (uint64_t)entry->vme_start,
14066 uint64_t, (uint64_t)entry->vme_end,
14067 vm_map_offset_t, vaddr,
14068 int, object_copied);
14069 return KERN_INVALID_ADDRESS;
14070 }
14071
14072 /* clip out the portion of space */
14073 /* mapped by the sub map which */
14074 /* corresponds to the underlying */
14075 /* object */
14076
14077 /*
14078 * Clip (and unnest) the smallest nested chunk
14079 * possible around the faulting address...
14080 */
14081 local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14082 local_end = local_start + pmap_shared_region_size_min(map->pmap);
14083 /*
14084 * ... but don't go beyond the "old_start" to "old_end"
14085 * range, to avoid spanning over another VM region
14086 * with a possibly different VM object and/or offset.
14087 */
14088 if (local_start < old_start) {
14089 local_start = old_start;
14090 }
14091 if (local_end > old_end) {
14092 local_end = old_end;
14093 }
14094 /*
14095 * Adjust copy_offset to the start of the range.
14096 */
14097 copy_offset -= (vaddr - local_start);
14098
14099 vm_map_clip_start(map, entry, local_start);
14100 vm_map_clip_end(map, entry, local_end);
14101 if (entry->is_sub_map) {
14102 /* unnesting was done when clipping */
14103 assert(!entry->use_pmap);
14104 }
14105
14106 /* substitute copy object for */
14107 /* shared map entry */
14108 vm_map_deallocate(VME_SUBMAP(entry));
14109 assert(!entry->iokit_acct);
14110 entry->use_pmap = TRUE;
14111 VME_OBJECT_SET(entry, copy_object, false, 0);
14112
14113 /* propagate the submap entry's protections */
14114 if (entry->protection != VM_PROT_READ) {
14115 /*
14116 * Someone has already altered the top entry's
14117 * protections via vm_protect(VM_PROT_COPY).
14118 * Respect these new values and ignore the
14119 * submap entry's protections.
14120 */
14121 } else {
14122 /*
14123 * Regular copy-on-write: propagate the submap
14124 * entry's protections to the top map entry.
14125 */
14126 entry->protection |= subentry_protection;
14127 }
14128 entry->max_protection |= subentry_max_protection;
14129 /* propagate some attributes from subentry */
14130 entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14131 entry->vme_permanent = subentry_permanent;
14132 entry->csm_associated = subentry_csm_associated;
14133
14134 if ((entry->protection & VM_PROT_WRITE) &&
14135 (entry->protection & VM_PROT_EXECUTE) &&
14136 #if XNU_TARGET_OS_OSX
14137 map->pmap != kernel_pmap &&
14138 (vm_map_cs_enforcement(map)
14139 #if __arm64__
14140 || !VM_MAP_IS_EXOTIC(map)
14141 #endif /* __arm64__ */
14142 ) &&
14143 #endif /* XNU_TARGET_OS_OSX */
14144 #if CODE_SIGNING_MONITOR
14145 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
14146 #endif
14147 !(entry->used_for_jit) &&
14148 VM_MAP_POLICY_WX_STRIP_X(map)) {
14149 DTRACE_VM3(cs_wx,
14150 uint64_t, (uint64_t)entry->vme_start,
14151 uint64_t, (uint64_t)entry->vme_end,
14152 vm_prot_t, entry->protection);
14153 printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
14154 proc_selfpid(),
14155 (get_bsdtask_info(current_task())
14156 ? proc_name_address(get_bsdtask_info(current_task()))
14157 : "?"),
14158 __FUNCTION__, __LINE__,
14159 #if DEVELOPMENT || DEBUG
14160 (uint64_t)entry->vme_start,
14161 (uint64_t)entry->vme_end,
14162 #else /* DEVELOPMENT || DEBUG */
14163 (uint64_t)0,
14164 (uint64_t)0,
14165 #endif /* DEVELOPMENT || DEBUG */
14166 entry->protection);
14167 entry->protection &= ~VM_PROT_EXECUTE;
14168 }
14169
14170 if (object_copied) {
14171 VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14172 entry->needs_copy = object_copied_needs_copy;
14173 entry->is_shared = FALSE;
14174 } else {
14175 assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14176 assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14177 assert(entry->wired_count == 0);
14178 VME_OFFSET_SET(entry, copy_offset);
14179 entry->needs_copy = TRUE;
14180 if (map != old_map) {
14181 entry->is_shared = TRUE;
14182 }
14183 }
14184 if (entry->inheritance == VM_INHERIT_SHARE) {
14185 entry->inheritance = VM_INHERIT_COPY;
14186 }
14187
14188 vm_map_lock_write_to_read(map);
14189 } else {
14190 if ((cow_sub_map_parent)
14191 && (cow_sub_map_parent != *real_map)
14192 && (cow_sub_map_parent != map)) {
14193 vm_map_unlock(cow_sub_map_parent);
14194 }
14195 entry = submap_entry;
14196 vaddr = local_vaddr;
14197 }
14198 }
14199
14200 /*
14201 * Check whether this task is allowed to have
14202 * this page.
14203 */
14204
14205 prot = entry->protection;
14206
14207 if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14208 /*
14209 * HACK -- if not a stack, then allow execution
14210 */
14211 prot |= VM_PROT_EXECUTE;
14212 }
14213
14214 if (mask_protections) {
14215 fault_type &= prot;
14216 if (fault_type == VM_PROT_NONE) {
14217 goto protection_failure;
14218 }
14219 }
14220 if (((fault_type & prot) != fault_type)
14221 #if __arm64__
14222 /* prefetch abort in execute-only page */
14223 && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14224 #elif defined(__x86_64__)
14225 /* Consider the UEXEC bit when handling an EXECUTE fault */
14226 && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14227 #endif
14228 ) {
14229 protection_failure:
14230 if (*real_map != map) {
14231 vm_map_unlock(*real_map);
14232 }
14233 *real_map = map;
14234
14235 if ((fault_type & VM_PROT_EXECUTE) && prot) {
14236 log_stack_execution_failure((addr64_t)vaddr, prot);
14237 }
14238
14239 DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14240 DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14241 /*
14242 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14243 *
14244 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14245 */
14246 return KERN_PROTECTION_FAILURE;
14247 }
14248
14249 /*
14250 * If this page is not pageable, we have to get
14251 * it for all possible accesses.
14252 */
14253
14254 *wired = (entry->wired_count != 0);
14255 if (*wired) {
14256 fault_type = prot;
14257 }
14258
14259 /*
14260 * If the entry was copy-on-write, we either ...
14261 */
14262
14263 if (entry->needs_copy) {
14264 /*
14265 * If we want to write the page, we may as well
14266 * handle that now since we've got the map locked.
14267 *
14268 * If we don't need to write the page, we just
14269 * demote the permissions allowed.
14270 */
14271
14272 if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14273 /*
14274 * Make a new object, and place it in the
14275 * object chain. Note that no new references
14276 * have appeared -- one just moved from the
14277 * map to the new object.
14278 */
14279
14280 if (vm_map_lock_read_to_write(map)) {
14281 vm_map_lock_read(map);
14282 goto RetryLookup;
14283 }
14284
14285 if (VME_OBJECT(entry)->shadowed == FALSE) {
14286 vm_object_lock(VME_OBJECT(entry));
14287 VME_OBJECT(entry)->shadowed = TRUE;
14288 vm_object_unlock(VME_OBJECT(entry));
14289 }
14290 VME_OBJECT_SHADOW(entry,
14291 (vm_map_size_t) (entry->vme_end -
14292 entry->vme_start),
14293 vm_map_always_shadow(map));
14294 entry->needs_copy = FALSE;
14295
14296 vm_map_lock_write_to_read(map);
14297 }
14298 if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14299 /*
14300 * We're attempting to read a copy-on-write
14301 * page -- don't allow writes.
14302 */
14303
14304 prot &= (~VM_PROT_WRITE);
14305 }
14306 }
14307
14308 if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14309 /*
14310 * We went through a "needs_copy" submap without triggering
14311 * a copy, so granting write access to the page would bypass
14312 * that submap's "needs_copy".
14313 */
14314 assert(!(fault_type & VM_PROT_WRITE));
14315 assert(!*wired);
14316 assert(!force_copy);
14317 // printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14318 prot &= ~VM_PROT_WRITE;
14319 }
14320
14321 /*
14322 * Create an object if necessary.
14323 */
14324 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14325 if (vm_map_lock_read_to_write(map)) {
14326 vm_map_lock_read(map);
14327 goto RetryLookup;
14328 }
14329
14330 VME_OBJECT_SET(entry,
14331 vm_object_allocate(
14332 (vm_map_size_t)(entry->vme_end -
14333 entry->vme_start)), false, 0);
14334 VME_OFFSET_SET(entry, 0);
14335 assert(entry->use_pmap);
14336 vm_map_lock_write_to_read(map);
14337 }
14338
14339 /*
14340 * Return the object/offset from this entry. If the entry
14341 * was copy-on-write or empty, it has been fixed up. Also
14342 * return the protection.
14343 */
14344
14345 *offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14346 *object = VME_OBJECT(entry);
14347 *out_prot = prot;
14348 KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14349
14350 if (fault_info) {
14351 fault_info->interruptible = THREAD_UNINT; /* for now... */
14352 /* ... the caller will change "interruptible" if needed */
14353 fault_info->cluster_size = 0;
14354 fault_info->user_tag = VME_ALIAS(entry);
14355 fault_info->pmap_options = 0;
14356 if (entry->iokit_acct ||
14357 (!entry->is_sub_map && !entry->use_pmap)) {
14358 fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14359 }
14360 fault_info->behavior = entry->behavior;
14361 fault_info->lo_offset = VME_OFFSET(entry);
14362 fault_info->hi_offset =
14363 (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14364 fault_info->no_cache = entry->no_cache;
14365 fault_info->stealth = FALSE;
14366 fault_info->io_sync = FALSE;
14367 if (entry->used_for_jit ||
14368 #if CODE_SIGNING_MONITOR
14369 (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
14370 #endif
14371 entry->vme_resilient_codesign) {
14372 fault_info->cs_bypass = TRUE;
14373 } else {
14374 fault_info->cs_bypass = FALSE;
14375 }
14376 fault_info->csm_associated = FALSE;
14377 #if CODE_SIGNING_MONITOR
14378 if (entry->csm_associated) {
14379 /*
14380 * The pmap layer will validate this page
14381 * before allowing it to be executed from.
14382 */
14383 fault_info->csm_associated = TRUE;
14384 }
14385 #endif
14386 fault_info->mark_zf_absent = FALSE;
14387 fault_info->batch_pmap_op = FALSE;
14388 fault_info->resilient_media = entry->vme_resilient_media;
14389 fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
14390 fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14391 if (entry->translated_allow_execute) {
14392 fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14393 }
14394 }
14395
14396 /*
14397 * Lock the object to prevent it from disappearing
14398 */
14399 if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14400 if (contended == NULL) {
14401 vm_object_lock(*object);
14402 } else {
14403 *contended = vm_object_lock_check_contended(*object);
14404 }
14405 } else {
14406 vm_object_lock_shared(*object);
14407 }
14408
14409 /*
14410 * Save the version number
14411 */
14412
14413 out_version->main_timestamp = map->timestamp;
14414
14415 return KERN_SUCCESS;
14416 }
14417
14418
14419 /*
14420 * vm_map_verify:
14421 *
14422 * Verifies that the map in question has not changed
14423 * since the given version. The map has to be locked
14424 * ("shared" mode is fine) before calling this function
14425 * and it will be returned locked too.
14426 */
14427 boolean_t
vm_map_verify(vm_map_t map,vm_map_version_t * version)14428 vm_map_verify(
14429 vm_map_t map,
14430 vm_map_version_t *version) /* REF */
14431 {
14432 boolean_t result;
14433
14434 vm_map_lock_assert_held(map);
14435 result = (map->timestamp == version->main_timestamp);
14436
14437 return result;
14438 }
14439
14440 /*
14441 * TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
14442 * Goes away after regular vm_region_recurse function migrates to
14443 * 64 bits
14444 * vm_region_recurse: A form of vm_region which follows the
14445 * submaps in a target map
14446 *
14447 */
14448
14449 kern_return_t
vm_map_region_recurse_64(vm_map_t map,vm_map_offset_t * address,vm_map_size_t * size,natural_t * nesting_depth,vm_region_submap_info_64_t submap_info,mach_msg_type_number_t * count)14450 vm_map_region_recurse_64(
14451 vm_map_t map,
14452 vm_map_offset_t *address, /* IN/OUT */
14453 vm_map_size_t *size, /* OUT */
14454 natural_t *nesting_depth, /* IN/OUT */
14455 vm_region_submap_info_64_t submap_info, /* IN/OUT */
14456 mach_msg_type_number_t *count) /* IN/OUT */
14457 {
14458 mach_msg_type_number_t original_count;
14459 vm_region_extended_info_data_t extended;
14460 vm_map_entry_t tmp_entry;
14461 vm_map_offset_t user_address;
14462 unsigned int user_max_depth;
14463
14464 /*
14465 * "curr_entry" is the VM map entry preceding or including the
14466 * address we're looking for.
14467 * "curr_map" is the map or sub-map containing "curr_entry".
14468 * "curr_address" is the equivalent of the top map's "user_address"
14469 * in the current map.
14470 * "curr_offset" is the cumulated offset of "curr_map" in the
14471 * target task's address space.
14472 * "curr_depth" is the depth of "curr_map" in the chain of
14473 * sub-maps.
14474 *
14475 * "curr_max_below" and "curr_max_above" limit the range (around
14476 * "curr_address") we should take into account in the current (sub)map.
14477 * They limit the range to what's visible through the map entries
14478 * we've traversed from the top map to the current map.
14479 *
14480 */
14481 vm_map_entry_t curr_entry;
14482 vm_map_address_t curr_address;
14483 vm_map_offset_t curr_offset;
14484 vm_map_t curr_map;
14485 unsigned int curr_depth;
14486 vm_map_offset_t curr_max_below, curr_max_above;
14487 vm_map_offset_t curr_skip;
14488
14489 /*
14490 * "next_" is the same as "curr_" but for the VM region immediately
14491 * after the address we're looking for. We need to keep track of this
14492 * too because we want to return info about that region if the
14493 * address we're looking for is not mapped.
14494 */
14495 vm_map_entry_t next_entry;
14496 vm_map_offset_t next_offset;
14497 vm_map_offset_t next_address;
14498 vm_map_t next_map;
14499 unsigned int next_depth;
14500 vm_map_offset_t next_max_below, next_max_above;
14501 vm_map_offset_t next_skip;
14502
14503 boolean_t look_for_pages;
14504 vm_region_submap_short_info_64_t short_info;
14505 boolean_t do_region_footprint;
14506 int effective_page_size, effective_page_shift;
14507 boolean_t submap_needed_copy;
14508
14509 if (map == VM_MAP_NULL) {
14510 /* no address space to work on */
14511 return KERN_INVALID_ARGUMENT;
14512 }
14513
14514 effective_page_shift = vm_self_region_page_shift(map);
14515 effective_page_size = (1 << effective_page_shift);
14516
14517 if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
14518 /*
14519 * "info" structure is not big enough and
14520 * would overflow
14521 */
14522 return KERN_INVALID_ARGUMENT;
14523 }
14524
14525 do_region_footprint = task_self_region_footprint();
14526 original_count = *count;
14527
14528 if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
14529 *count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
14530 look_for_pages = FALSE;
14531 short_info = (vm_region_submap_short_info_64_t) submap_info;
14532 submap_info = NULL;
14533 } else {
14534 look_for_pages = TRUE;
14535 *count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
14536 short_info = NULL;
14537
14538 if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
14539 *count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
14540 }
14541 if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
14542 *count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
14543 }
14544 }
14545
14546 user_address = *address;
14547 user_max_depth = *nesting_depth;
14548 submap_needed_copy = FALSE;
14549
14550 if (not_in_kdp) {
14551 vm_map_lock_read(map);
14552 }
14553
14554 recurse_again:
14555 curr_entry = NULL;
14556 curr_map = map;
14557 curr_address = user_address;
14558 curr_offset = 0;
14559 curr_skip = 0;
14560 curr_depth = 0;
14561 curr_max_above = ((vm_map_offset_t) -1) - curr_address;
14562 curr_max_below = curr_address;
14563
14564 next_entry = NULL;
14565 next_map = NULL;
14566 next_address = 0;
14567 next_offset = 0;
14568 next_skip = 0;
14569 next_depth = 0;
14570 next_max_above = (vm_map_offset_t) -1;
14571 next_max_below = (vm_map_offset_t) -1;
14572
14573 for (;;) {
14574 if (vm_map_lookup_entry(curr_map,
14575 curr_address,
14576 &tmp_entry)) {
14577 /* tmp_entry contains the address we're looking for */
14578 curr_entry = tmp_entry;
14579 } else {
14580 vm_map_offset_t skip;
14581 /*
14582 * The address is not mapped. "tmp_entry" is the
14583 * map entry preceding the address. We want the next
14584 * one, if it exists.
14585 */
14586 curr_entry = tmp_entry->vme_next;
14587
14588 if (curr_entry == vm_map_to_entry(curr_map) ||
14589 (curr_entry->vme_start >=
14590 curr_address + curr_max_above)) {
14591 /* no next entry at this level: stop looking */
14592 if (not_in_kdp) {
14593 vm_map_unlock_read(curr_map);
14594 }
14595 curr_entry = NULL;
14596 curr_map = NULL;
14597 curr_skip = 0;
14598 curr_offset = 0;
14599 curr_depth = 0;
14600 curr_max_above = 0;
14601 curr_max_below = 0;
14602 break;
14603 }
14604
14605 /* adjust current address and offset */
14606 skip = curr_entry->vme_start - curr_address;
14607 curr_address = curr_entry->vme_start;
14608 curr_skip += skip;
14609 curr_offset += skip;
14610 curr_max_above -= skip;
14611 curr_max_below = 0;
14612 }
14613
14614 /*
14615 * Is the next entry at this level closer to the address (or
14616 * deeper in the submap chain) than the one we had
14617 * so far ?
14618 */
14619 tmp_entry = curr_entry->vme_next;
14620 if (tmp_entry == vm_map_to_entry(curr_map)) {
14621 /* no next entry at this level */
14622 } else if (tmp_entry->vme_start >=
14623 curr_address + curr_max_above) {
14624 /*
14625 * tmp_entry is beyond the scope of what we mapped of
14626 * this submap in the upper level: ignore it.
14627 */
14628 } else if ((next_entry == NULL) ||
14629 (tmp_entry->vme_start + curr_offset <=
14630 next_entry->vme_start + next_offset)) {
14631 /*
14632 * We didn't have a "next_entry" or this one is
14633 * closer to the address we're looking for:
14634 * use this "tmp_entry" as the new "next_entry".
14635 */
14636 if (next_entry != NULL) {
14637 /* unlock the last "next_map" */
14638 if (next_map != curr_map && not_in_kdp) {
14639 vm_map_unlock_read(next_map);
14640 }
14641 }
14642 next_entry = tmp_entry;
14643 next_map = curr_map;
14644 next_depth = curr_depth;
14645 next_address = next_entry->vme_start;
14646 next_skip = curr_skip;
14647 next_skip += (next_address - curr_address);
14648 next_offset = curr_offset;
14649 next_offset += (next_address - curr_address);
14650 next_max_above = MIN(next_max_above, curr_max_above);
14651 next_max_above = MIN(next_max_above,
14652 next_entry->vme_end - next_address);
14653 next_max_below = MIN(next_max_below, curr_max_below);
14654 next_max_below = MIN(next_max_below,
14655 next_address - next_entry->vme_start);
14656 }
14657
14658 /*
14659 * "curr_max_{above,below}" allow us to keep track of the
14660 * portion of the submap that is actually mapped at this level:
14661 * the rest of that submap is irrelevant to us, since it's not
14662 * mapped here.
14663 * The relevant portion of the map starts at
14664 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
14665 */
14666 curr_max_above = MIN(curr_max_above,
14667 curr_entry->vme_end - curr_address);
14668 curr_max_below = MIN(curr_max_below,
14669 curr_address - curr_entry->vme_start);
14670
14671 if (!curr_entry->is_sub_map ||
14672 curr_depth >= user_max_depth) {
14673 /*
14674 * We hit a leaf map or we reached the maximum depth
14675 * we could, so stop looking. Keep the current map
14676 * locked.
14677 */
14678 break;
14679 }
14680
14681 /*
14682 * Get down to the next submap level.
14683 */
14684
14685 if (curr_entry->needs_copy) {
14686 /* everything below this is effectively copy-on-write */
14687 submap_needed_copy = TRUE;
14688 }
14689
14690 /*
14691 * Lock the next level and unlock the current level,
14692 * unless we need to keep it locked to access the "next_entry"
14693 * later.
14694 */
14695 if (not_in_kdp) {
14696 vm_map_lock_read(VME_SUBMAP(curr_entry));
14697 }
14698 if (curr_map == next_map) {
14699 /* keep "next_map" locked in case we need it */
14700 } else {
14701 /* release this map */
14702 if (not_in_kdp) {
14703 vm_map_unlock_read(curr_map);
14704 }
14705 }
14706
14707 /*
14708 * Adjust the offset. "curr_entry" maps the submap
14709 * at relative address "curr_entry->vme_start" in the
14710 * curr_map but skips the first "VME_OFFSET(curr_entry)"
14711 * bytes of the submap.
14712 * "curr_offset" always represents the offset of a virtual
14713 * address in the curr_map relative to the absolute address
14714 * space (i.e. the top-level VM map).
14715 */
14716 curr_offset +=
14717 (VME_OFFSET(curr_entry) - curr_entry->vme_start);
14718 curr_address = user_address + curr_offset;
14719 /* switch to the submap */
14720 curr_map = VME_SUBMAP(curr_entry);
14721 curr_depth++;
14722 curr_entry = NULL;
14723 }
14724
14725 // LP64todo: all the current tools are 32bit, obviously never worked for 64b
14726 // so probably should be a real 32b ID vs. ptr.
14727 // Current users just check for equality
14728
14729 if (curr_entry == NULL) {
14730 /* no VM region contains the address... */
14731
14732 if (do_region_footprint && /* we want footprint numbers */
14733 next_entry == NULL && /* & there are no more regions */
14734 /* & we haven't already provided our fake region: */
14735 user_address <= vm_map_last_entry(map)->vme_end) {
14736 ledger_amount_t ledger_resident, ledger_compressed;
14737
14738 /*
14739 * Add a fake memory region to account for
14740 * purgeable and/or ledger-tagged memory that
14741 * counts towards this task's memory footprint,
14742 * i.e. the resident/compressed pages of non-volatile
14743 * objects owned by that task.
14744 */
14745 task_ledgers_footprint(map->pmap->ledger,
14746 &ledger_resident,
14747 &ledger_compressed);
14748 if (ledger_resident + ledger_compressed == 0) {
14749 /* no purgeable memory usage to report */
14750 return KERN_INVALID_ADDRESS;
14751 }
14752 /* fake region to show nonvolatile footprint */
14753 if (look_for_pages) {
14754 submap_info->protection = VM_PROT_DEFAULT;
14755 submap_info->max_protection = VM_PROT_DEFAULT;
14756 submap_info->inheritance = VM_INHERIT_DEFAULT;
14757 submap_info->offset = 0;
14758 submap_info->user_tag = -1;
14759 submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
14760 submap_info->pages_shared_now_private = 0;
14761 submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
14762 submap_info->pages_dirtied = submap_info->pages_resident;
14763 submap_info->ref_count = 1;
14764 submap_info->shadow_depth = 0;
14765 submap_info->external_pager = 0;
14766 submap_info->share_mode = SM_PRIVATE;
14767 if (submap_needed_copy) {
14768 submap_info->share_mode = SM_COW;
14769 }
14770 submap_info->is_submap = 0;
14771 submap_info->behavior = VM_BEHAVIOR_DEFAULT;
14772 submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
14773 submap_info->user_wired_count = 0;
14774 submap_info->pages_reusable = 0;
14775 } else {
14776 short_info->user_tag = -1;
14777 short_info->offset = 0;
14778 short_info->protection = VM_PROT_DEFAULT;
14779 short_info->inheritance = VM_INHERIT_DEFAULT;
14780 short_info->max_protection = VM_PROT_DEFAULT;
14781 short_info->behavior = VM_BEHAVIOR_DEFAULT;
14782 short_info->user_wired_count = 0;
14783 short_info->is_submap = 0;
14784 short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
14785 short_info->external_pager = 0;
14786 short_info->shadow_depth = 0;
14787 short_info->share_mode = SM_PRIVATE;
14788 if (submap_needed_copy) {
14789 short_info->share_mode = SM_COW;
14790 }
14791 short_info->ref_count = 1;
14792 }
14793 *nesting_depth = 0;
14794 *size = (vm_map_size_t) (ledger_resident + ledger_compressed);
14795 // *address = user_address;
14796 *address = vm_map_last_entry(map)->vme_end;
14797 return KERN_SUCCESS;
14798 }
14799
14800 if (next_entry == NULL) {
14801 /* ... and no VM region follows it either */
14802 return KERN_INVALID_ADDRESS;
14803 }
14804 /* ... gather info about the next VM region */
14805 curr_entry = next_entry;
14806 curr_map = next_map; /* still locked ... */
14807 curr_address = next_address;
14808 curr_skip = next_skip;
14809 curr_offset = next_offset;
14810 curr_depth = next_depth;
14811 curr_max_above = next_max_above;
14812 curr_max_below = next_max_below;
14813 } else {
14814 /* we won't need "next_entry" after all */
14815 if (next_entry != NULL) {
14816 /* release "next_map" */
14817 if (next_map != curr_map && not_in_kdp) {
14818 vm_map_unlock_read(next_map);
14819 }
14820 }
14821 }
14822 next_entry = NULL;
14823 next_map = NULL;
14824 next_offset = 0;
14825 next_skip = 0;
14826 next_depth = 0;
14827 next_max_below = -1;
14828 next_max_above = -1;
14829
14830 if (curr_entry->is_sub_map &&
14831 curr_depth < user_max_depth) {
14832 /*
14833 * We're not as deep as we could be: we must have
14834 * gone back up after not finding anything mapped
14835 * below the original top-level map entry's.
14836 * Let's move "curr_address" forward and recurse again.
14837 */
14838 user_address = curr_address;
14839 goto recurse_again;
14840 }
14841
14842 *nesting_depth = curr_depth;
14843 *size = curr_max_above + curr_max_below;
14844 *address = user_address + curr_skip - curr_max_below;
14845
14846 if (look_for_pages) {
14847 submap_info->user_tag = VME_ALIAS(curr_entry);
14848 submap_info->offset = VME_OFFSET(curr_entry);
14849 submap_info->protection = curr_entry->protection;
14850 submap_info->inheritance = curr_entry->inheritance;
14851 submap_info->max_protection = curr_entry->max_protection;
14852 submap_info->behavior = curr_entry->behavior;
14853 submap_info->user_wired_count = curr_entry->user_wired_count;
14854 submap_info->is_submap = curr_entry->is_sub_map;
14855 if (curr_entry->is_sub_map) {
14856 submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
14857 } else {
14858 submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
14859 }
14860 } else {
14861 short_info->user_tag = VME_ALIAS(curr_entry);
14862 short_info->offset = VME_OFFSET(curr_entry);
14863 short_info->protection = curr_entry->protection;
14864 short_info->inheritance = curr_entry->inheritance;
14865 short_info->max_protection = curr_entry->max_protection;
14866 short_info->behavior = curr_entry->behavior;
14867 short_info->user_wired_count = curr_entry->user_wired_count;
14868 short_info->is_submap = curr_entry->is_sub_map;
14869 if (curr_entry->is_sub_map) {
14870 short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
14871 } else {
14872 short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
14873 }
14874 }
14875
14876 extended.pages_resident = 0;
14877 extended.pages_swapped_out = 0;
14878 extended.pages_shared_now_private = 0;
14879 extended.pages_dirtied = 0;
14880 extended.pages_reusable = 0;
14881 extended.external_pager = 0;
14882 extended.shadow_depth = 0;
14883 extended.share_mode = SM_EMPTY;
14884 extended.ref_count = 0;
14885
14886 if (not_in_kdp) {
14887 if (!curr_entry->is_sub_map) {
14888 vm_map_offset_t range_start, range_end;
14889 range_start = MAX((curr_address - curr_max_below),
14890 curr_entry->vme_start);
14891 range_end = MIN((curr_address + curr_max_above),
14892 curr_entry->vme_end);
14893 vm_map_region_walk(curr_map,
14894 range_start,
14895 curr_entry,
14896 (VME_OFFSET(curr_entry) +
14897 (range_start -
14898 curr_entry->vme_start)),
14899 range_end - range_start,
14900 &extended,
14901 look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
14902 if (extended.external_pager &&
14903 extended.ref_count == 2 &&
14904 extended.share_mode == SM_SHARED) {
14905 extended.share_mode = SM_PRIVATE;
14906 }
14907 if (submap_needed_copy) {
14908 extended.share_mode = SM_COW;
14909 }
14910 } else {
14911 if (curr_entry->use_pmap) {
14912 extended.share_mode = SM_TRUESHARED;
14913 } else {
14914 extended.share_mode = SM_PRIVATE;
14915 }
14916 extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
14917 }
14918 }
14919
14920 if (look_for_pages) {
14921 submap_info->pages_resident = extended.pages_resident;
14922 submap_info->pages_swapped_out = extended.pages_swapped_out;
14923 submap_info->pages_shared_now_private =
14924 extended.pages_shared_now_private;
14925 submap_info->pages_dirtied = extended.pages_dirtied;
14926 submap_info->external_pager = extended.external_pager;
14927 submap_info->shadow_depth = extended.shadow_depth;
14928 submap_info->share_mode = extended.share_mode;
14929 submap_info->ref_count = extended.ref_count;
14930
14931 if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
14932 submap_info->pages_reusable = extended.pages_reusable;
14933 }
14934 if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
14935 if (curr_entry->is_sub_map) {
14936 submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRPERM(VME_SUBMAP(curr_entry));
14937 } else if (VME_OBJECT(curr_entry)) {
14938 submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRPERM(VME_OBJECT(curr_entry));
14939 } else {
14940 submap_info->object_id_full = 0ull;
14941 }
14942 }
14943 } else {
14944 short_info->external_pager = extended.external_pager;
14945 short_info->shadow_depth = extended.shadow_depth;
14946 short_info->share_mode = extended.share_mode;
14947 short_info->ref_count = extended.ref_count;
14948 }
14949
14950 if (not_in_kdp) {
14951 vm_map_unlock_read(curr_map);
14952 }
14953
14954 return KERN_SUCCESS;
14955 }
14956
14957 /*
14958 * vm_region:
14959 *
14960 * User call to obtain information about a region in
14961 * a task's address map. Currently, only one flavor is
14962 * supported.
14963 *
14964 * XXX The reserved and behavior fields cannot be filled
14965 * in until the vm merge from the IK is completed, and
14966 * vm_reserve is implemented.
14967 */
14968
kern_return_t
vm_map_region(
	vm_map_t map,
	vm_map_offset_t *address,               /* IN/OUT */
	vm_map_size_t *size,                    /* OUT */
	vm_region_flavor_t flavor,              /* IN */
	vm_region_info_t info,                  /* OUT */
	mach_msg_type_number_t *count,          /* IN/OUT */
	mach_port_t *object_name)               /* OUT */
{
	vm_map_entry_t tmp_entry;
	vm_map_entry_t entry;
	vm_map_offset_t start;

	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * Dispatch on the requested info flavor.  Each case validates the
	 * caller-provided buffer size ("*count"), takes the map lock for
	 * read, locates the first map entry at or after "*address", fills
	 * in the flavor-specific info, and reports the region's actual
	 * [start, start+size) back through "*address"/"*size".
	 */
	switch (flavor) {
	case VM_REGION_BASIC_INFO:
		/* legacy for old 32-bit objects info */
	{
		vm_region_basic_info_t basic;

		if (*count < VM_REGION_BASIC_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}

		basic = (vm_region_basic_info_t) info;
		*count = VM_REGION_BASIC_INFO_COUNT;

		vm_map_lock_read(map);

		start = *address;
		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			/* "start" is in a hole: report the next region, if any */
			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}
		} else {
			entry = tmp_entry;
		}

		start = entry->vme_start;

		/* legacy 32-bit flavor: offset is deliberately truncated */
		basic->offset = (uint32_t)VME_OFFSET(entry);
		basic->protection = entry->protection;
		basic->inheritance = entry->inheritance;
		basic->max_protection = entry->max_protection;
		basic->behavior = entry->behavior;
		basic->user_wired_count = entry->user_wired_count;
		basic->reserved = entry->is_sub_map;
		*address = start;
		*size = (entry->vme_end - start);

		if (object_name) {
			*object_name = IP_NULL;
		}
		if (entry->is_sub_map) {
			/* sharing is not meaningful for a submap entry */
			basic->shared = FALSE;
		} else {
			basic->shared = entry->is_shared;
		}

		vm_map_unlock_read(map);
		return KERN_SUCCESS;
	}

	case VM_REGION_BASIC_INFO_64:
	{
		vm_region_basic_info_64_t basic;

		if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
			return KERN_INVALID_ARGUMENT;
		}

		basic = (vm_region_basic_info_64_t) info;
		*count = VM_REGION_BASIC_INFO_COUNT_64;

		vm_map_lock_read(map);

		start = *address;
		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			/* "start" is in a hole: report the next region, if any */
			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}
		} else {
			entry = tmp_entry;
		}

		start = entry->vme_start;

		/* 64-bit flavor: full object offset, no truncation */
		basic->offset = VME_OFFSET(entry);
		basic->protection = entry->protection;
		basic->inheritance = entry->inheritance;
		basic->max_protection = entry->max_protection;
		basic->behavior = entry->behavior;
		basic->user_wired_count = entry->user_wired_count;
		basic->reserved = entry->is_sub_map;
		*address = start;
		*size = (entry->vme_end - start);

		if (object_name) {
			*object_name = IP_NULL;
		}
		if (entry->is_sub_map) {
			/* sharing is not meaningful for a submap entry */
			basic->shared = FALSE;
		} else {
			basic->shared = entry->is_shared;
		}

		vm_map_unlock_read(map);
		return KERN_SUCCESS;
	}
	case VM_REGION_EXTENDED_INFO:
		if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}
		/*
		 * Fall through: the modern struct is a superset of the
		 * legacy one, so both flavors share the same code path
		 * and are distinguished again by "flavor" below.
		 */
		OS_FALLTHROUGH;
	case VM_REGION_EXTENDED_INFO__legacy:
		if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
			return KERN_INVALID_ARGUMENT;
		}

		{
			vm_region_extended_info_t extended;
			mach_msg_type_number_t original_count;
			int effective_page_size, effective_page_shift;

			extended = (vm_region_extended_info_t) info;

			effective_page_shift = vm_self_region_page_shift(map);
			effective_page_size = (1 << effective_page_shift);

			vm_map_lock_read(map);

			start = *address;
			if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
				/* "start" is in a hole: report the next region, if any */
				if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
					vm_map_unlock_read(map);
					return KERN_INVALID_ADDRESS;
				}
			} else {
				entry = tmp_entry;
			}
			start = entry->vme_start;

			extended->protection = entry->protection;
			extended->user_tag = VME_ALIAS(entry);
			extended->pages_resident = 0;
			extended->pages_swapped_out = 0;
			extended->pages_shared_now_private = 0;
			extended->pages_dirtied = 0;
			extended->external_pager = 0;
			extended->shadow_depth = 0;

			original_count = *count;
			if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
				*count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
			} else {
				/* "pages_reusable" only exists in the modern struct */
				extended->pages_reusable = 0;
				*count = VM_REGION_EXTENDED_INFO_COUNT;
			}

			vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);

			/*
			 * A pager-backed object with exactly two references
			 * (ours + the pager's) isn't really shared.
			 */
			if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
				extended->share_mode = SM_PRIVATE;
			}

			if (object_name) {
				*object_name = IP_NULL;
			}
			*address = start;
			*size = (entry->vme_end - start);

			vm_map_unlock_read(map);
			return KERN_SUCCESS;
		}
	case VM_REGION_TOP_INFO:
	{
		vm_region_top_info_t top;

		if (*count < VM_REGION_TOP_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}

		top = (vm_region_top_info_t) info;
		*count = VM_REGION_TOP_INFO_COUNT;

		vm_map_lock_read(map);

		start = *address;
		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			/* "start" is in a hole: report the next region, if any */
			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}
		} else {
			entry = tmp_entry;
		}
		start = entry->vme_start;

		top->private_pages_resident = 0;
		top->shared_pages_resident = 0;

		vm_map_region_top_walk(entry, top);

		if (object_name) {
			*object_name = IP_NULL;
		}
		*address = start;
		*size = (entry->vme_end - start);

		vm_map_unlock_read(map);
		return KERN_SUCCESS;
	}
	default:
		return KERN_INVALID_ARGUMENT;
	}
}
15191
/*
 * Number of resident pages of "obj" to charge to a mapping that spans
 * "entry_size" pages: reusable pages are excluded, except when the whole
 * object is marked all_reusable, in which case only its wired pages
 * count.  The result is clamped to the size of the mapping.
 */
#define OBJ_RESIDENT_COUNT(obj, entry_size)                       \
	MIN((entry_size),                                         \
	((obj)->all_reusable ?                                    \
	(obj)->wired_page_count :                                 \
	(obj)->resident_page_count - (obj)->reusable_page_count))
15197
/*
 * Fill "top" with VM_REGION_TOP_INFO data for "entry": private/shared
 * resident page counts, share mode, reference count and an obfuscated
 * id for the entry's top-level VM object.
 */
void
vm_map_region_top_walk(
	vm_map_entry_t entry,
	vm_region_top_info_t top)
{
	if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
		/* submaps and unbacked entries have nothing to report */
		top->share_mode = SM_EMPTY;
		top->ref_count = 0;
		top->obj_id = 0;
		return;
	}

	{
		struct vm_object *obj, *tmp_obj;
		int ref_count;
		uint32_t entry_size;

		/* size of the mapping, in pages */
		entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);

		obj = VME_OBJECT(entry);

		vm_object_lock(obj);

		/* don't count a reference held only for in-flight paging */
		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
			ref_count--;
		}

		assert(obj->reusable_page_count <= obj->resident_page_count);
		if (obj->shadow) {
			/*
			 * Copy-on-write chain: the top object's pages are
			 * private iff we hold its only reference; everything
			 * further down the shadow chain is shared.
			 */
			if (ref_count == 1) {
				top->private_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			}
			top->ref_count = ref_count;
			top->share_mode = SM_COW;

			/*
			 * Walk down the shadow chain with hand-over-hand
			 * locking, accumulating shared resident pages.
			 */
			while ((tmp_obj = obj->shadow)) {
				vm_object_lock(tmp_obj);
				vm_object_unlock(obj);
				obj = tmp_obj;

				if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
					ref_count--;
				}

				assert(obj->reusable_page_count <= obj->resident_page_count);
				top->shared_pages_resident +=
				    OBJ_RESIDENT_COUNT(obj, entry_size);
				/* -1: don't count the shadow reference itself */
				top->ref_count += ref_count - 1;
			}
		} else {
			if (entry->superpage_size) {
				/* superpages are always fully resident and private */
				top->share_mode = SM_LARGE_PAGE;
				top->shared_pages_resident = 0;
				top->private_pages_resident = entry_size;
			} else if (entry->needs_copy) {
				top->share_mode = SM_COW;
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				/*
				 * ref_count == 2 with a named object still
				 * counts as private: the extra reference is
				 * the naming reference, not another mapping.
				 */
				if (ref_count == 1 ||
				    (ref_count == 2 && obj->named)) {
					top->share_mode = SM_PRIVATE;
					top->private_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				} else {
					top->share_mode = SM_SHARED;
					top->shared_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				}
			}
			top->ref_count = ref_count;
		}
		/* XXX K64: obj_id will be truncated */
		top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRPERM(obj);

		vm_object_unlock(obj);
	}
}
15282
/*
 * Gather extended region info for the range [va, va+range) mapped by
 * "entry": resident/dirty/swapped/reusable page counts (when
 * "look_for_pages" is set), shadow chain depth, share mode and
 * reference counts.  "offset" is the offset of "va" within the entry's
 * VM object.  "count" tells us which version of the extended-info
 * struct the caller provided (the legacy one lacks "pages_reusable").
 */
void
vm_map_region_walk(
	vm_map_t map,
	vm_map_offset_t va,
	vm_map_entry_t entry,
	vm_object_offset_t offset,
	vm_object_size_t range,
	vm_region_extended_info_t extended,
	boolean_t look_for_pages,
	mach_msg_type_number_t count)
{
	struct vm_object *obj, *tmp_obj;
	vm_map_offset_t last_offset;
	int i;
	int ref_count;
	struct vm_object *shadow_object;
	unsigned short shadow_depth;
	boolean_t do_region_footprint;
	int effective_page_size, effective_page_shift;
	vm_map_offset_t effective_page_mask;

	do_region_footprint = task_self_region_footprint();

	if ((entry->is_sub_map) ||
	    (VME_OBJECT(entry) == 0) ||
	    (VME_OBJECT(entry)->phys_contiguous &&
	    !entry->superpage_size)) {
		/* submaps, unbacked and phys-contiguous entries: nothing to count */
		extended->share_mode = SM_EMPTY;
		extended->ref_count = 0;
		return;
	}

	if (entry->superpage_size) {
		/* superpages are fully resident, never shadowed or paged */
		extended->shadow_depth = 0;
		extended->share_mode = SM_LARGE_PAGE;
		extended->ref_count = 1;
		extended->external_pager = 0;

		/* TODO4K: Superpage in 4k mode? */
		extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
		extended->shadow_depth = 0;
		return;
	}

	effective_page_shift = vm_self_region_page_shift(map);
	effective_page_size = (1 << effective_page_shift);
	effective_page_mask = effective_page_size - 1;

	offset = vm_map_trunc_page(offset, effective_page_mask);

	obj = VME_OBJECT(entry);

	vm_object_lock(obj);

	/* don't count a reference held only for in-flight paging */
	if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
		ref_count--;
	}

	if (look_for_pages) {
		/* scan the range one page at a time */
		for (last_offset = offset + range;
		    offset < last_offset;
		    offset += effective_page_size, va += effective_page_size) {
			if (do_region_footprint) {
				int disp;

				/*
				 * Footprint mode: ask the corpse snapshot or
				 * the pmap about this page's disposition
				 * instead of walking the object chain.
				 */
				disp = 0;
				if (map->has_corpse_footprint) {
					/*
					 * Query the page info data we saved
					 * while forking the corpse.
					 */
					vm_map_corpse_footprint_query_page_info(
						map,
						va,
						&disp);
				} else {
					/*
					 * Query the pmap.
					 */
					vm_map_footprint_query_page_info(
						map,
						entry,
						va,
						&disp);
				}
				if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
					extended->pages_resident++;
				}
				if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
					extended->pages_reusable++;
				}
				if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
					extended->pages_dirtied++;
				}
				if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
					extended->pages_swapped_out++;
				}
				continue;
			}

			vm_map_region_look_for_page(map, va, obj,
			    vm_object_trunc_page(offset), ref_count,
			    0, extended, count);
		}

		if (do_region_footprint) {
			/* footprint mode skipped the per-page object walk:
			 * still need the object-level info below */
			goto collect_object_info;
		}
	} else {
collect_object_info:
		/*
		 * Object-level info only: measure the shadow chain depth
		 * and note any external (pager-backed) object along it.
		 */
		shadow_object = obj->shadow;
		shadow_depth = 0;

		if (!(obj->internal)) {
			extended->external_pager = 1;
		}

		if (shadow_object != VM_OBJECT_NULL) {
			vm_object_lock(shadow_object);
			for (;
			    shadow_object != VM_OBJECT_NULL;
			    shadow_depth++) {
				vm_object_t next_shadow;

				if (!(shadow_object->internal)) {
					extended->external_pager = 1;
				}

				/* hand-over-hand locking down the chain */
				next_shadow = shadow_object->shadow;
				if (next_shadow) {
					vm_object_lock(next_shadow);
				}
				vm_object_unlock(shadow_object);
				shadow_object = next_shadow;
			}
		}
		extended->shadow_depth = shadow_depth;
	}

	if (extended->shadow_depth || entry->needs_copy) {
		extended->share_mode = SM_COW;
	} else {
		if (ref_count == 1) {
			extended->share_mode = SM_PRIVATE;
		} else {
			if (obj->true_share) {
				extended->share_mode = SM_TRUESHARED;
			} else {
				extended->share_mode = SM_SHARED;
			}
		}
	}
	extended->ref_count = ref_count - extended->shadow_depth;

	/* accumulate the reference counts of the shadow chain */
	for (i = 0; i < extended->shadow_depth; i++) {
		if ((tmp_obj = obj->shadow) == 0) {
			break;
		}
		vm_object_lock(tmp_obj);
		vm_object_unlock(obj);

		if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) {
			ref_count--;
		}

		extended->ref_count += ref_count;
		obj = tmp_obj;
	}
	vm_object_unlock(obj);

	if (extended->share_mode == SM_SHARED) {
		/*
		 * Check whether all the references on the object actually
		 * come from this very map: if so, it's aliased within one
		 * address space rather than shared across tasks.
		 */
		vm_map_entry_t cur;
		vm_map_entry_t last;
		int my_refs;

		obj = VME_OBJECT(entry);
		last = vm_map_to_entry(map);
		my_refs = 0;

		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
			ref_count--;
		}
		for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
			my_refs += vm_map_region_count_obj_refs(cur, obj);
		}

		if (my_refs == ref_count) {
			extended->share_mode = SM_PRIVATE_ALIASED;
		} else if (my_refs > 1) {
			extended->share_mode = SM_SHARED_ALIASED;
		}
	}
}
15476
15477
15478 /* object is locked on entry and locked on return */
15479
15480
/*
 * Look for the page at "offset" in "object" and down its shadow chain,
 * updating the extended-info counters (resident, dirtied, reusable,
 * swapped out, shared-now-private, shadow depth) for the first place
 * the page is found.  "object" is locked on entry and remains locked
 * on return; any other object visited along the chain is unlocked
 * before returning.
 */
static void
vm_map_region_look_for_page(
	__unused vm_map_t map,
	__unused vm_map_offset_t va,
	vm_object_t object,
	vm_object_offset_t offset,
	int max_refcnt,
	unsigned short depth,
	vm_region_extended_info_t extended,
	mach_msg_type_number_t count)
{
	vm_page_t p;
	vm_object_t shadow;
	int ref_count;
	vm_object_t caller_object;

	shadow = object->shadow;
	caller_object = object;


	while (TRUE) {
		if (!(object->internal)) {
			extended->external_pager = 1;
		}

		if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
			/*
			 * A shadowed page in a chain where every object so
			 * far had a single reference would become private
			 * on the first write.
			 */
			if (shadow && (max_refcnt == 1)) {
				extended->pages_shared_now_private++;
			}

			if (!p->vmp_fictitious &&
			    (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
				extended->pages_dirtied++;
			} else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
				/* modern struct only: report reusable pages */
				if (p->vmp_reusable || object->all_reusable) {
					extended->pages_reusable++;
				}
			}

			extended->pages_resident++;

			if (object != caller_object) {
				vm_object_unlock(object);
			}

			return;
		}
		if (object->internal &&
		    object->alive &&
		    !object->terminating &&
		    object->pager_ready) {
			if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset)
			    == VM_EXTERNAL_STATE_EXISTS) {
				/* the pager has that page */
				extended->pages_swapped_out++;
				if (object != caller_object) {
					vm_object_unlock(object);
				}
				return;
			}
		}

		if (shadow) {
			/* descend into the shadow, hand-over-hand locking */
			vm_object_lock(shadow);

			if ((ref_count = shadow->ref_count) > 1 && shadow->paging_in_progress) {
				ref_count--;
			}

			if (++depth > extended->shadow_depth) {
				extended->shadow_depth = depth;
			}

			if (ref_count > max_refcnt) {
				max_refcnt = ref_count;
			}

			if (object != caller_object) {
				vm_object_unlock(object);
			}

			/* translate the offset into the shadow's address space */
			offset = offset + object->vo_shadow_offset;
			object = shadow;
			shadow = object->shadow;
			continue;
		}
		/* page not found anywhere in the chain */
		if (object != caller_object) {
			vm_object_unlock(object);
		}
		break;
	}
}
15573
15574 static int
vm_map_region_count_obj_refs(vm_map_entry_t entry,vm_object_t object)15575 vm_map_region_count_obj_refs(
15576 vm_map_entry_t entry,
15577 vm_object_t object)
15578 {
15579 int ref_count;
15580 vm_object_t chk_obj;
15581 vm_object_t tmp_obj;
15582
15583 if (entry->is_sub_map || VME_OBJECT(entry) == VM_OBJECT_NULL) {
15584 return 0;
15585 }
15586
15587 ref_count = 0;
15588 chk_obj = VME_OBJECT(entry);
15589 vm_object_lock(chk_obj);
15590
15591 while (chk_obj) {
15592 if (chk_obj == object) {
15593 ref_count++;
15594 }
15595 tmp_obj = chk_obj->shadow;
15596 if (tmp_obj) {
15597 vm_object_lock(tmp_obj);
15598 }
15599 vm_object_unlock(chk_obj);
15600
15601 chk_obj = tmp_obj;
15602 }
15603
15604 return ref_count;
15605 }
15606
15607
15608 /*
15609 * Routine: vm_map_simplify
15610 *
15611 * Description:
15612 * Attempt to simplify the map representation in
15613 * the vicinity of the given starting address.
15614 * Note:
15615 * This routine is intended primarily to keep the
15616 * kernel maps more compact -- they generally don't
15617 * benefit from the "expand a map entry" technology
15618 * at allocation time because the adjacent entry
15619 * is often wired down.
15620 */
/*
 * Try to coalesce "this_entry" with its predecessor: if the two entries
 * are adjacent, map the same object contiguously and agree on every
 * attribute that affects behavior, the predecessor is absorbed into
 * "this_entry" and disposed of.  The map must be locked for write.
 */
void
vm_map_simplify_entry(
	vm_map_t map,
	vm_map_entry_t this_entry)
{
	vm_map_entry_t prev_entry;

	prev_entry = this_entry->vme_prev;

	/*
	 * The two entries must be mergeable in every respect: adjacency,
	 * same backing object at contiguous offsets, and identical
	 * protection/inheritance/wiring/flags.  Any mismatch means the
	 * entries must stay distinct.
	 */
	if ((this_entry != vm_map_to_entry(map)) &&
	    (prev_entry != vm_map_to_entry(map)) &&

	    (prev_entry->vme_end == this_entry->vme_start) &&

	    (prev_entry->is_sub_map == this_entry->is_sub_map) &&
	    (prev_entry->vme_object_value == this_entry->vme_object_value) &&
	    (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
	    ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
	    prev_entry->vme_start))
	    == VME_OFFSET(this_entry)) &&

	    (prev_entry->behavior == this_entry->behavior) &&
	    (prev_entry->needs_copy == this_entry->needs_copy) &&
	    (prev_entry->protection == this_entry->protection) &&
	    (prev_entry->max_protection == this_entry->max_protection) &&
	    (prev_entry->inheritance == this_entry->inheritance) &&
	    (prev_entry->use_pmap == this_entry->use_pmap) &&
	    (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
	    (prev_entry->no_cache == this_entry->no_cache) &&
	    (prev_entry->vme_permanent == this_entry->vme_permanent) &&
	    (prev_entry->map_aligned == this_entry->map_aligned) &&
	    (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
	    (prev_entry->used_for_jit == this_entry->used_for_jit) &&
#if __arm64e__
	    (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
#endif
	    (prev_entry->csm_associated == this_entry->csm_associated) &&
	    (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
	    (prev_entry->iokit_acct == this_entry->iokit_acct) &&
	    (prev_entry->vme_resilient_codesign ==
	    this_entry->vme_resilient_codesign) &&
	    (prev_entry->vme_resilient_media ==
	    this_entry->vme_resilient_media) &&
	    (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
	    (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&

	    (prev_entry->wired_count == this_entry->wired_count) &&
	    (prev_entry->user_wired_count == this_entry->user_wired_count) &&

	    ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
	    (prev_entry->in_transition == FALSE) &&
	    (this_entry->in_transition == FALSE) &&
	    (prev_entry->needs_wakeup == FALSE) &&
	    (this_entry->needs_wakeup == FALSE) &&
	    (prev_entry->is_shared == this_entry->is_shared) &&
	    (prev_entry->superpage_size == FALSE) &&
	    (this_entry->superpage_size == FALSE)
	    ) {
		if (prev_entry->vme_permanent) {
			/* clear the flag so the unlink below is allowed;
			 * "this_entry" keeps the permanent marking */
			assert(this_entry->vme_permanent);
			prev_entry->vme_permanent = false;
		}
		vm_map_store_entry_unlink(map, prev_entry, true);
		assert(prev_entry->vme_start < this_entry->vme_end);
		if (prev_entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
			    VM_MAP_PAGE_MASK(map)));
		}
		/* grow "this_entry" backwards over the absorbed range */
		this_entry->vme_start = prev_entry->vme_start;
		VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));

		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, this_entry, TRUE);
		}

		/* drop the absorbed entry's reference on its submap/object */
		if (prev_entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(prev_entry));
		} else {
			vm_object_deallocate(VME_OBJECT(prev_entry));
		}
		vm_map_entry_dispose(prev_entry);
		SAVE_HINT_MAP_WRITE(map, this_entry);
	}
}
15705
15706 void
vm_map_simplify(vm_map_t map,vm_map_offset_t start)15707 vm_map_simplify(
15708 vm_map_t map,
15709 vm_map_offset_t start)
15710 {
15711 vm_map_entry_t this_entry;
15712
15713 vm_map_lock(map);
15714 if (vm_map_lookup_entry(map, start, &this_entry)) {
15715 vm_map_simplify_entry(map, this_entry);
15716 vm_map_simplify_entry(map, this_entry->vme_next);
15717 }
15718 vm_map_unlock(map);
15719 }
15720
15721 static void
vm_map_simplify_range(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)15722 vm_map_simplify_range(
15723 vm_map_t map,
15724 vm_map_offset_t start,
15725 vm_map_offset_t end)
15726 {
15727 vm_map_entry_t entry;
15728
15729 /*
15730 * The map should be locked (for "write") by the caller.
15731 */
15732
15733 if (start >= end) {
15734 /* invalid address range */
15735 return;
15736 }
15737
15738 start = vm_map_trunc_page(start,
15739 VM_MAP_PAGE_MASK(map));
15740 end = vm_map_round_page(end,
15741 VM_MAP_PAGE_MASK(map));
15742
15743 if (!vm_map_lookup_entry(map, start, &entry)) {
15744 /* "start" is not mapped and "entry" ends before "start" */
15745 if (entry == vm_map_to_entry(map)) {
15746 /* start with first entry in the map */
15747 entry = vm_map_first_entry(map);
15748 } else {
15749 /* start with next entry */
15750 entry = entry->vme_next;
15751 }
15752 }
15753
15754 while (entry != vm_map_to_entry(map) &&
15755 entry->vme_start <= end) {
15756 /* try and coalesce "entry" with its previous entry */
15757 vm_map_simplify_entry(map, entry);
15758 entry = entry->vme_next;
15759 }
15760 }
15761
15762
/*
 * Routine:	vm_map_machine_attribute
 * Purpose:
 *	Provide machine-specific attributes to mappings,
 *	such as cachability etc. for machines that provide
 *	them.  NUMA architectures and machines with big/strange
 *	caches will use this.
 * Note:
 *	Responsibilities for locking and checking are handled here,
 *	everything else in the pmap module. If any non-volatile
 *	information must be kept, the pmap module should handle
 *	it itself. [This assumes that attributes do not
 *	need to be inherited, which seems ok to me]
 */
kern_return_t
vm_map_machine_attribute(
	vm_map_t                        map,
	vm_map_offset_t                 start,
	vm_map_offset_t                 end,
	vm_machine_attribute_t          attribute,
	vm_machine_attribute_val_t*     value)  /* IN/OUT */
{
	kern_return_t   ret;
	vm_map_size_t   sync_size;
	vm_map_entry_t  entry;

	if (start < vm_map_min(map) || end > vm_map_max(map)) {
		return KERN_INVALID_ADDRESS;
	}

	/* Figure how much memory we need to flush (in page increments) */
	sync_size = end - start;

	vm_map_lock(map);

	if (attribute != MATTR_CACHE) {
		/* If we don't have to find physical addresses, we */
		/* don't have to do an explicit traversal here.    */
		ret = pmap_attribute(map->pmap, start, end - start,
		    attribute, value);
		vm_map_unlock(map);
		return ret;
	}

	ret = KERN_SUCCESS;     /* Assume it all worked */

	/*
	 * MATTR_CACHE needs the physical page behind each mapping, so
	 * walk the entries covering [start, end) one at a time.
	 */
	while (sync_size) {
		if (vm_map_lookup_entry(map, start, &entry)) {
			vm_map_size_t   sub_size;
			/* clip this pass to the end of the current entry */
			if ((entry->vme_end - start) > sync_size) {
				sub_size = sync_size;
				sync_size = 0;
			} else {
				sub_size = entry->vme_end - start;
				sync_size -= sub_size;
			}
			if (entry->is_sub_map) {
				vm_map_offset_t sub_start;
				vm_map_offset_t sub_end;

				/* recurse into the submap for this portion */
				sub_start = (start - entry->vme_start)
				    + VME_OFFSET(entry);
				sub_end = sub_start + sub_size;
				vm_map_machine_attribute(
					VME_SUBMAP(entry),
					sub_start,
					sub_end,
					attribute, value);
			} else if (VME_OBJECT(entry)) {
				vm_page_t               m;
				vm_object_t             object;
				vm_object_t             base_object;
				vm_object_t             last_object;
				vm_object_offset_t      offset;
				vm_object_offset_t      base_offset;
				vm_map_size_t           range;
				range = sub_size;
				offset = (start - entry->vme_start)
				    + VME_OFFSET(entry);
				offset = vm_object_trunc_page(offset);
				base_offset = offset;
				object = VME_OBJECT(entry);
				base_object = object;
				last_object = NULL;

				vm_object_lock(object);

				while (range) {
					m = vm_page_lookup(
						object, offset);

					if (m && !m->vmp_fictitious) {
						/* resident page: sync its cache */
						ret =
						    pmap_attribute_cache_sync(
							VM_PAGE_GET_PHYS_PAGE(m),
							PAGE_SIZE,
							attribute, value);
					} else if (object->shadow) {
						/*
						 * Not resident here: descend the
						 * shadow chain, handing the object
						 * lock down as we go.
						 */
						offset = offset + object->vo_shadow_offset;
						last_object = object;
						object = object->shadow;
						vm_object_lock(last_object->shadow);
						vm_object_unlock(last_object);
						continue;
					}
					if (range < PAGE_SIZE) {
						range = 0;
					} else {
						range -= PAGE_SIZE;
					}

					if (base_object != object) {
						/* climb back out of the shadow chain */
						vm_object_unlock(object);
						vm_object_lock(base_object);
						object = base_object;
					}
					/* Bump to the next page */
					base_offset += PAGE_SIZE;
					offset = base_offset;
				}
				vm_object_unlock(object);
			}
			start += sub_size;
		} else {
			/* hole in the address range */
			vm_map_unlock(map);
			return KERN_FAILURE;
		}
	}

	vm_map_unlock(map);

	return ret;
}
15896
15897 /*
15898 * vm_map_behavior_set:
15899 *
15900 * Sets the paging reference behavior of the specified address
15901 * range in the target map. Paging reference behavior affects
15902 * how pagein operations resulting from faults on the map will be
15903 * clustered.
15904 */
15905 kern_return_t
vm_map_behavior_set(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_behavior_t new_behavior)15906 vm_map_behavior_set(
15907 vm_map_t map,
15908 vm_map_offset_t start,
15909 vm_map_offset_t end,
15910 vm_behavior_t new_behavior)
15911 {
15912 vm_map_entry_t entry;
15913 vm_map_entry_t temp_entry;
15914
15915 if (start > end ||
15916 start < vm_map_min(map) ||
15917 end > vm_map_max(map)) {
15918 return KERN_NO_SPACE;
15919 }
15920
15921 switch (new_behavior) {
15922 /*
15923 * This first block of behaviors all set a persistent state on the specified
15924 * memory range. All we have to do here is to record the desired behavior
15925 * in the vm_map_entry_t's.
15926 */
15927
15928 case VM_BEHAVIOR_DEFAULT:
15929 case VM_BEHAVIOR_RANDOM:
15930 case VM_BEHAVIOR_SEQUENTIAL:
15931 case VM_BEHAVIOR_RSEQNTL:
15932 case VM_BEHAVIOR_ZERO_WIRED_PAGES:
15933 vm_map_lock(map);
15934
15935 /*
15936 * The entire address range must be valid for the map.
15937 * Note that vm_map_range_check() does a
15938 * vm_map_lookup_entry() internally and returns the
15939 * entry containing the start of the address range if
15940 * the entire range is valid.
15941 */
15942 if (vm_map_range_check(map, start, end, &temp_entry)) {
15943 entry = temp_entry;
15944 vm_map_clip_start(map, entry, start);
15945 } else {
15946 vm_map_unlock(map);
15947 return KERN_INVALID_ADDRESS;
15948 }
15949
15950 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
15951 vm_map_clip_end(map, entry, end);
15952 if (entry->is_sub_map) {
15953 assert(!entry->use_pmap);
15954 }
15955
15956 if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
15957 entry->zero_wired_pages = TRUE;
15958 } else {
15959 entry->behavior = new_behavior;
15960 }
15961 entry = entry->vme_next;
15962 }
15963
15964 vm_map_unlock(map);
15965 break;
15966
15967 /*
15968 * The rest of these are different from the above in that they cause
15969 * an immediate action to take place as opposed to setting a behavior that
15970 * affects future actions.
15971 */
15972
15973 case VM_BEHAVIOR_WILLNEED:
15974 return vm_map_willneed(map, start, end);
15975
15976 case VM_BEHAVIOR_DONTNEED:
15977 return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
15978
15979 case VM_BEHAVIOR_FREE:
15980 return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
15981
15982 case VM_BEHAVIOR_REUSABLE:
15983 return vm_map_reusable_pages(map, start, end);
15984
15985 case VM_BEHAVIOR_REUSE:
15986 return vm_map_reuse_pages(map, start, end);
15987
15988 case VM_BEHAVIOR_CAN_REUSE:
15989 return vm_map_can_reuse(map, start, end);
15990
15991 #if MACH_ASSERT
15992 case VM_BEHAVIOR_PAGEOUT:
15993 return vm_map_pageout(map, start, end);
15994 #endif /* MACH_ASSERT */
15995
15996 default:
15997 return KERN_INVALID_ARGUMENT;
15998 }
15999
16000 return KERN_SUCCESS;
16001 }
16002
16003
/*
 * Internals for madvise(MADV_WILLNEED) system call.
 *
 * The implementation is to do:-
 * a) read-ahead if the mapping corresponds to a mapped regular file
 * b) or, fault in the pages (zero-fill, decompress etc) if it's an anonymous mapping
 */


static kern_return_t
vm_map_willneed(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end
	)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	memory_object_t                 pager;
	struct vm_object_fault_info     fault_info = {};
	kern_return_t                   kr;
	vm_object_size_t                len;
	vm_object_offset_t              offset;

	fault_info.interruptible = THREAD_UNINT;        /* ignored value */
	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
	fault_info.stealth = TRUE;

	/*
	 * The MADV_WILLNEED operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && start < end;) {
		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.  After that, the offset will always be zero to
		 * correspond to the beginning of the current vm_map_entry.
		 */
		offset = (start - entry->vme_start) + VME_OFFSET(entry);

		/*
		 * Set the length so we don't go beyond the end of the
		 * map_entry or beyond the end of the range we were given.
		 * This range could span also multiple map entries all of which
		 * map different files, so make sure we only do the right amount
		 * of I/O for each object.  Note that it's possible for there
		 * to be multiple map entries all referring to the same object
		 * but with different page permissions, but it's not worth
		 * trying to optimize that case.
		 */
		len = MIN(entry->vme_end - start, end - start);

		if ((vm_size_t) len != len) {
			/* 32-bit overflow */
			len = (vm_size_t) (0 - PAGE_SIZE);
		}
		fault_info.cluster_size = (vm_size_t) len;
		fault_info.lo_offset = offset;
		fault_info.hi_offset = offset + len;
		fault_info.user_tag = VME_ALIAS(entry);
		fault_info.pmap_options = 0;
		if (entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap)) {
			/* entry uses alternate accounting */
			fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
		}
		fault_info.fi_xnu_user_debug = entry->vme_xnu_user_debug;

		/*
		 * If the entry is a submap OR there's no read permission
		 * to this mapping, then just skip it.
		 */
		if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
			entry = entry->vme_next;
			start = entry->vme_start;
			continue;
		}

		object = VME_OBJECT(entry);

		if (object == NULL ||
		    (object && object->internal)) {
			/*
			 * Memory range backed by anonymous memory.
			 */
			vm_size_t region_size = 0, effective_page_size = 0;
			vm_map_offset_t addr = 0, effective_page_mask = 0;

			region_size = len;
			addr = start;

			effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
			effective_page_size = effective_page_mask + 1;

			/* drop the map lock: the pre-faults can block */
			vm_map_unlock_read(map);

			while (region_size) {
				/* fault in one page at a time (zero-fill, decompress, ...) */
				vm_pre_fault(
					vm_map_trunc_page(addr, effective_page_mask),
					VM_PROT_READ | VM_PROT_WRITE);

				region_size -= effective_page_size;
				addr += effective_page_size;
			}
		} else {
			/*
			 * Find the file object backing this map entry.  If there is
			 * none, then we simply ignore the "will need" advice for this
			 * entry and go on to the next one.
			 */
			if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
				entry = entry->vme_next;
				start = entry->vme_start;
				continue;
			}

			vm_object_paging_begin(object);
			pager = object->pager;
			vm_object_unlock(object);

			/*
			 * The data_request() could take a long time, so let's
			 * release the map lock to avoid blocking other threads.
			 */
			vm_map_unlock_read(map);

			/*
			 * Get the data from the object asynchronously.
			 *
			 * Note that memory_object_data_request() places limits on the
			 * amount of I/O it will do.  Regardless of the len we
			 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
			 * silently truncates the len to that size.  This isn't
			 * necessarily bad since madvise shouldn't really be used to
			 * page in unlimited amounts of data.  Other Unix variants
			 * limit the willneed case as well.  If this turns out to be an
			 * issue for developers, then we can always adjust the policy
			 * here and still be backwards compatible since this is all
			 * just "advice".
			 */
			kr = memory_object_data_request(
				pager,
				vm_object_trunc_page(offset) + object->paging_offset,
				0,      /* ignored */
				VM_PROT_READ,
				(memory_object_fault_info_t)&fault_info);

			vm_object_lock(object);
			vm_object_paging_end(object);
			vm_object_unlock(object);

			/*
			 * If we couldn't do the I/O for some reason, just give up on
			 * the madvise.  We still return success to the user since
			 * madvise isn't supposed to fail when the advice can't be
			 * taken.
			 */

			if (kr != KERN_SUCCESS) {
				/* map lock was already released above */
				return KERN_SUCCESS;
			}
		}

		start += len;
		if (start >= end) {
			/* done */
			return KERN_SUCCESS;
		}

		/* look up next entry (map lock was dropped in both branches above) */
		vm_map_lock_read(map);
		if (!vm_map_lookup_entry(map, start, &entry)) {
			/*
			 * There's a new hole in the address range.
			 */
			vm_map_unlock_read(map);
			return KERN_INVALID_ADDRESS;
		}
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
16204
16205 static boolean_t
vm_map_entry_is_reusable(vm_map_entry_t entry)16206 vm_map_entry_is_reusable(
16207 vm_map_entry_t entry)
16208 {
16209 /* Only user map entries */
16210
16211 vm_object_t object;
16212
16213 if (entry->is_sub_map) {
16214 return FALSE;
16215 }
16216
16217 switch (VME_ALIAS(entry)) {
16218 case VM_MEMORY_MALLOC:
16219 case VM_MEMORY_MALLOC_SMALL:
16220 case VM_MEMORY_MALLOC_LARGE:
16221 case VM_MEMORY_REALLOC:
16222 case VM_MEMORY_MALLOC_TINY:
16223 case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16224 case VM_MEMORY_MALLOC_LARGE_REUSED:
16225 /*
16226 * This is a malloc() memory region: check if it's still
16227 * in its original state and can be re-used for more
16228 * malloc() allocations.
16229 */
16230 break;
16231 default:
16232 /*
16233 * Not a malloc() memory region: let the caller decide if
16234 * it's re-usable.
16235 */
16236 return TRUE;
16237 }
16238
16239 if (/*entry->is_shared ||*/
16240 entry->is_sub_map ||
16241 entry->in_transition ||
16242 entry->protection != VM_PROT_DEFAULT ||
16243 entry->max_protection != VM_PROT_ALL ||
16244 entry->inheritance != VM_INHERIT_DEFAULT ||
16245 entry->no_cache ||
16246 entry->vme_permanent ||
16247 entry->superpage_size != FALSE ||
16248 entry->zero_wired_pages ||
16249 entry->wired_count != 0 ||
16250 entry->user_wired_count != 0) {
16251 return FALSE;
16252 }
16253
16254 object = VME_OBJECT(entry);
16255 if (object == VM_OBJECT_NULL) {
16256 return TRUE;
16257 }
16258 if (
16259 #if 0
16260 /*
16261 * Let's proceed even if the VM object is potentially
16262 * shared.
16263 * We check for this later when processing the actual
16264 * VM pages, so the contents will be safe if shared.
16265 *
16266 * But we can still mark this memory region as "reusable" to
16267 * acknowledge that the caller did let us know that the memory
16268 * could be re-used and should not be penalized for holding
16269 * on to it. This allows its "resident size" to not include
16270 * the reusable range.
16271 */
16272 object->ref_count == 1 &&
16273 #endif
16274 object->wired_page_count == 0 &&
16275 object->copy == VM_OBJECT_NULL &&
16276 object->shadow == VM_OBJECT_NULL &&
16277 object->internal &&
16278 object->purgable == VM_PURGABLE_DENY &&
16279 object->wimg_bits == VM_WIMG_USE_DEFAULT &&
16280 !object->code_signed) {
16281 return TRUE;
16282 }
16283 return FALSE;
16284 }
16285
16286 static kern_return_t
vm_map_reuse_pages(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16287 vm_map_reuse_pages(
16288 vm_map_t map,
16289 vm_map_offset_t start,
16290 vm_map_offset_t end)
16291 {
16292 vm_map_entry_t entry;
16293 vm_object_t object;
16294 vm_object_offset_t start_offset, end_offset;
16295
16296 /*
16297 * The MADV_REUSE operation doesn't require any changes to the
16298 * vm_map_entry_t's, so the read lock is sufficient.
16299 */
16300
16301 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16302 /*
16303 * XXX TODO4K
16304 * need to figure out what reusable means for a
16305 * portion of a native page.
16306 */
16307 return KERN_SUCCESS;
16308 }
16309
16310 vm_map_lock_read(map);
16311 assert(map->pmap != kernel_pmap); /* protect alias access */
16312
16313 /*
16314 * The madvise semantics require that the address range be fully
16315 * allocated with no holes. Otherwise, we're required to return
16316 * an error.
16317 */
16318
16319 if (!vm_map_range_check(map, start, end, &entry)) {
16320 vm_map_unlock_read(map);
16321 vm_page_stats_reusable.reuse_pages_failure++;
16322 return KERN_INVALID_ADDRESS;
16323 }
16324
16325 /*
16326 * Examine each vm_map_entry_t in the range.
16327 */
16328 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16329 entry = entry->vme_next) {
16330 /*
16331 * Sanity check on the VM map entry.
16332 */
16333 if (!vm_map_entry_is_reusable(entry)) {
16334 vm_map_unlock_read(map);
16335 vm_page_stats_reusable.reuse_pages_failure++;
16336 return KERN_INVALID_ADDRESS;
16337 }
16338
16339 /*
16340 * The first time through, the start address could be anywhere
16341 * within the vm_map_entry we found. So adjust the offset to
16342 * correspond.
16343 */
16344 if (entry->vme_start < start) {
16345 start_offset = start - entry->vme_start;
16346 } else {
16347 start_offset = 0;
16348 }
16349 end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16350 start_offset += VME_OFFSET(entry);
16351 end_offset += VME_OFFSET(entry);
16352
16353 object = VME_OBJECT(entry);
16354 if (object != VM_OBJECT_NULL) {
16355 vm_object_lock(object);
16356 vm_object_reuse_pages(object, start_offset, end_offset,
16357 TRUE);
16358 vm_object_unlock(object);
16359 }
16360
16361 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
16362 /*
16363 * XXX
16364 * We do not hold the VM map exclusively here.
16365 * The "alias" field is not that critical, so it's
16366 * safe to update it here, as long as it is the only
16367 * one that can be modified while holding the VM map
16368 * "shared".
16369 */
16370 VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
16371 }
16372 }
16373
16374 vm_map_unlock_read(map);
16375 vm_page_stats_reusable.reuse_pages_success++;
16376 return KERN_SUCCESS;
16377 }
16378
16379
/*
 * vm_map_reusable_pages:
 *
 * Internals for the MADV_REUSABLE advice: mark the pages backing
 * [start, end) as "reusable" so they can be reclaimed cheaply,
 * without tearing down the mappings themselves.
 */
static kern_return_t
vm_map_reusable_pages(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t          entry;
	vm_object_t             object;
	vm_object_offset_t      start_offset, end_offset;
	vm_map_offset_t         pmap_offset;

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		/*
		 * XXX TODO4K
		 * need to figure out what reusable means for a portion
		 * of a native page.
		 */
		return KERN_SUCCESS;
	}

	/*
	 * The MADV_REUSABLE operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap);       /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		vm_page_stats_reusable.reusable_pages_failure++;
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		/* kill_pages: 1 => contents may be discarded, -1 => must not */
		int kill_pages = 0;
		boolean_t reusable_no_write = FALSE;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (!vm_map_entry_is_reusable(entry)) {
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_INVALID_ADDRESS;
		}

		if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
#if __arm64e__
		    && !entry->used_for_tpro
#endif
		    ) {
			/* not writable: can't discard contents */
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_nonwritable++;
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.
		 */
		if (entry->vme_start < start) {
			start_offset = start - entry->vme_start;
			pmap_offset = start;
		} else {
			start_offset = 0;
			pmap_offset = entry->vme_start;
		}
		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
		start_offset += VME_OFFSET(entry);
		end_offset += VME_OFFSET(entry);

		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			/* nothing mapped here: nothing to mark reusable */
			continue;
		}

		if (entry->protection & VM_PROT_EXECUTE) {
			/*
			 * Executable mappings might be write-protected by
			 * hardware, so do not attempt to write to these pages.
			 */
			reusable_no_write = TRUE;
		}

		vm_object_lock(object);
		if (((object->ref_count == 1) ||
		    (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
		    object->copy == VM_OBJECT_NULL)) &&
		    object->shadow == VM_OBJECT_NULL &&
		    /*
		     * "iokit_acct" entries are billed for their virtual size
		     * (rather than for their resident pages only), so they
		     * wouldn't benefit from making pages reusable, and it
		     * would be hard to keep track of pages that are both
		     * "iokit_acct" and "reusable" in the pmap stats and
		     * ledgers.
		     */
		    !(entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap))) {
			if (object->ref_count != 1) {
				vm_page_stats_reusable.reusable_shared++;
			}
			kill_pages = 1;
		} else {
			kill_pages = -1;
		}
		if (kill_pages != -1) {
			/* deactivate the pages and mark them reusable */
			vm_object_deactivate_pages(object,
			    start_offset,
			    end_offset - start_offset,
			    kill_pages,
			    TRUE /*reusable_pages*/,
			    reusable_no_write,
			    map->pmap,
			    pmap_offset);
		} else {
			/* object is shared: leave the pages alone */
			vm_page_stats_reusable.reusable_pages_shared++;
			DTRACE_VM4(vm_map_reusable_pages_shared,
			    unsigned int, VME_ALIAS(entry),
			    vm_map_t, map,
			    vm_map_entry_t, entry,
			    vm_object_t, object);
		}
		vm_object_unlock(object);

		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
		    VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
			/*
			 * XXX
			 * We do not hold the VM map exclusively here.
			 * The "alias" field is not that critical, so it's
			 * safe to update it here, as long as it is the only
			 * one that can be modified while holding the VM map
			 * "shared".
			 */
			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
		}
	}

	vm_map_unlock_read(map);
	vm_page_stats_reusable.reusable_pages_success++;
	return KERN_SUCCESS;
}
16537
16538
16539 static kern_return_t
vm_map_can_reuse(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16540 vm_map_can_reuse(
16541 vm_map_t map,
16542 vm_map_offset_t start,
16543 vm_map_offset_t end)
16544 {
16545 vm_map_entry_t entry;
16546
16547 /*
16548 * The MADV_REUSABLE operation doesn't require any changes to the
16549 * vm_map_entry_t's, so the read lock is sufficient.
16550 */
16551
16552 vm_map_lock_read(map);
16553 assert(map->pmap != kernel_pmap); /* protect alias access */
16554
16555 /*
16556 * The madvise semantics require that the address range be fully
16557 * allocated with no holes. Otherwise, we're required to return
16558 * an error.
16559 */
16560
16561 if (!vm_map_range_check(map, start, end, &entry)) {
16562 vm_map_unlock_read(map);
16563 vm_page_stats_reusable.can_reuse_failure++;
16564 return KERN_INVALID_ADDRESS;
16565 }
16566
16567 /*
16568 * Examine each vm_map_entry_t in the range.
16569 */
16570 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16571 entry = entry->vme_next) {
16572 /*
16573 * Sanity check on the VM map entry.
16574 */
16575 if (!vm_map_entry_is_reusable(entry)) {
16576 vm_map_unlock_read(map);
16577 vm_page_stats_reusable.can_reuse_failure++;
16578 return KERN_INVALID_ADDRESS;
16579 }
16580 }
16581
16582 vm_map_unlock_read(map);
16583 vm_page_stats_reusable.can_reuse_success++;
16584 return KERN_SUCCESS;
16585 }
16586
16587
#if MACH_ASSERT
/*
 * vm_map_pageout:
 *
 * Internals for the (MACH_ASSERT-only) MADV_PAGEOUT advice: ask the VM
 * to page out the internal objects backing [start, end).
 */
static kern_return_t
vm_map_pageout(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t  entry;

	/*
	 * The MADV_PAGEOUT operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		vm_object_t     object;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (entry->is_sub_map) {
			vm_map_t        submap;
			vm_map_offset_t submap_start;
			vm_map_offset_t submap_end;
			vm_map_entry_t  submap_entry;

			submap = VME_SUBMAP(entry);
			submap_start = VME_OFFSET(entry);
			submap_end = submap_start + (entry->vme_end -
			    entry->vme_start);

			vm_map_lock_read(submap);

			/* the corresponding submap range must be fully mapped too */
			if (!vm_map_range_check(submap,
			    submap_start,
			    submap_end,
			    &submap_entry)) {
				vm_map_unlock_read(submap);
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}

			/*
			 * NOTE(review): only the first entry of the submap
			 * range is examined and paged out below.
			 */
			if (submap_entry->is_sub_map) {
				/* nested submap: skip */
				vm_map_unlock_read(submap);
				continue;
			}

			object = VME_OBJECT(submap_entry);
			if (object == VM_OBJECT_NULL || !object->internal) {
				/* only internal objects are paged out */
				vm_map_unlock_read(submap);
				continue;
			}

			vm_object_pageout(object);

			vm_map_unlock_read(submap);
			submap = VM_MAP_NULL;
			submap_entry = VM_MAP_ENTRY_NULL;
			continue;
		}

		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL || !object->internal) {
			/* only internal objects are paged out */
			continue;
		}

		vm_object_pageout(object);
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
#endif /* MACH_ASSERT */
16678
16679
16680 /*
16681 * Routine: vm_map_entry_insert
16682 *
16683 * Description: This routine inserts a new vm_entry in a locked map.
16684 */
16685 static vm_map_entry_t
vm_map_entry_insert(vm_map_t map,vm_map_entry_t insp_entry,vm_map_offset_t start,vm_map_offset_t end,vm_object_t object,vm_object_offset_t offset,vm_map_kernel_flags_t vmk_flags,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance,boolean_t clear_map_aligned)16686 vm_map_entry_insert(
16687 vm_map_t map,
16688 vm_map_entry_t insp_entry,
16689 vm_map_offset_t start,
16690 vm_map_offset_t end,
16691 vm_object_t object,
16692 vm_object_offset_t offset,
16693 vm_map_kernel_flags_t vmk_flags,
16694 boolean_t needs_copy,
16695 vm_prot_t cur_protection,
16696 vm_prot_t max_protection,
16697 vm_inherit_t inheritance,
16698 boolean_t clear_map_aligned)
16699 {
16700 vm_map_entry_t new_entry;
16701 boolean_t map_aligned = FALSE;
16702
16703 assert(insp_entry != (vm_map_entry_t)0);
16704 vm_map_lock_assert_exclusive(map);
16705
16706 #if DEVELOPMENT || DEBUG
16707 vm_object_offset_t end_offset = 0;
16708 assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
16709 #endif /* DEVELOPMENT || DEBUG */
16710
16711 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
16712 map_aligned = TRUE;
16713 }
16714 if (clear_map_aligned &&
16715 (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
16716 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
16717 map_aligned = FALSE;
16718 }
16719 if (map_aligned) {
16720 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
16721 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
16722 } else {
16723 assert(page_aligned(start));
16724 assert(page_aligned(end));
16725 }
16726 assert(start < end);
16727
16728 new_entry = vm_map_entry_create(map);
16729
16730 new_entry->vme_start = start;
16731 new_entry->vme_end = end;
16732
16733 if (vmk_flags.vmkf_submap) {
16734 new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
16735 VME_SUBMAP_SET(new_entry, (vm_map_t)object);
16736 } else {
16737 VME_OBJECT_SET(new_entry, object, false, 0);
16738 }
16739 VME_OFFSET_SET(new_entry, offset);
16740 VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);
16741
16742 new_entry->map_aligned = map_aligned;
16743 new_entry->needs_copy = needs_copy;
16744 new_entry->inheritance = inheritance;
16745 new_entry->protection = cur_protection;
16746 new_entry->max_protection = max_protection;
16747 /*
16748 * submap: "use_pmap" means "nested".
16749 * default: false.
16750 *
16751 * object: "use_pmap" means "use pmap accounting" for footprint.
16752 * default: true.
16753 */
16754 new_entry->use_pmap = !vmk_flags.vmkf_submap;
16755 new_entry->no_cache = vmk_flags.vmf_no_cache;
16756 new_entry->vme_permanent = vmk_flags.vmf_permanent;
16757 new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
16758 new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
16759 new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);
16760
16761 if (vmk_flags.vmkf_map_jit) {
16762 if (!(map->jit_entry_exists) ||
16763 VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
16764 new_entry->used_for_jit = TRUE;
16765 map->jit_entry_exists = TRUE;
16766 }
16767 }
16768
16769 /*
16770 * Insert the new entry into the list.
16771 */
16772
16773 vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
16774 map->size += end - start;
16775
16776 /*
16777 * Update the free space hint and the lookup hint.
16778 */
16779
16780 SAVE_HINT_MAP_WRITE(map, new_entry);
16781 return new_entry;
16782 }
16783
/*
 *	Routine:	vm_map_remap_extract
 *
 *	Description:	This routine returns a vm_entry list from a map.
 *
 *	Extracts the VM map entries covering [addr, addr + size) of "map"
 *	into "map_copy"'s entry header, either sharing the underlying VM
 *	objects (copy == FALSE) or setting them up for copy-on-write
 *	(copy == TRUE).  Submaps found in the range are recursed into.
 *
 *	*cur_protection / *max_protection are IN/OUT:
 *	- vm_remap() "legacy" mode (both VM_PROT_NONE on input): on return
 *	  they hold the strictest protections found across the range;
 *	- vm_remap_new() mode (any other input): they are treated as
 *	  required protections and become the new mappings' protections.
 *
 *	Returns KERN_SUCCESS, or an error; on error all entries assembled
 *	so far in "map_copy" are freed before returning.
 */
static kern_return_t
vm_map_remap_extract(
	vm_map_t map,
	vm_map_offset_t addr,
	vm_map_size_t size,
	boolean_t copy,
	vm_map_copy_t map_copy,
	vm_prot_t *cur_protection,       /* IN/OUT */
	vm_prot_t *max_protection,       /* IN/OUT */
	/* What, no behavior? */
	vm_inherit_t inheritance,
	vm_map_kernel_flags_t vmk_flags)
{
	struct vm_map_header *map_header = &map_copy->cpy_hdr;
	kern_return_t result;
	vm_map_size_t mapped_size;       /* bytes extracted so far */
	vm_map_size_t tmp_size;          /* size of chunk taken from current entry */
	vm_map_entry_t src_entry;        /* result of last map lookup */
	vm_map_entry_t new_entry;
	vm_object_offset_t offset;
	vm_map_offset_t map_address;     /* next start address in the copy (0-based) */
	vm_map_offset_t src_start;       /* start of entry to map */
	vm_map_offset_t src_end;         /* end of region to be mapped */
	vm_object_t object;
	vm_map_version_t version;        /* map timestamp across unlocked copies */
	boolean_t src_needs_copy;
	boolean_t new_entry_needs_copy;
	vm_map_entry_t saved_src_entry;
	boolean_t src_entry_was_wired;
	vm_prot_t max_prot_for_prot_copy;
	vm_map_offset_t effective_page_mask;
	bool pageable, same_map;
	boolean_t vm_remap_legacy;
	vm_prot_t required_cur_prot, required_max_prot;
	vm_object_t new_copy_object;     /* vm_object_copy_* result */
	boolean_t saved_used_for_jit;    /* Saved used_for_jit. */
#if __arm64e__
	boolean_t saved_used_for_tpro;   /* Saved used_for_tpro. */
#endif

	pageable = vmk_flags.vmkf_copy_pageable;
	same_map = vmk_flags.vmkf_copy_same_map;

	/* use the smaller of the kernel's and the map's page granularity */
	effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));

	assert(map != VM_MAP_NULL);
	assert(size != 0);
	assert(size == vm_map_round_page(size, effective_page_mask));
	assert(inheritance == VM_INHERIT_NONE ||
	    inheritance == VM_INHERIT_COPY ||
	    inheritance == VM_INHERIT_SHARE);
	assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
	assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
	assert((*cur_protection & *max_protection) == *cur_protection);

	/*
	 * Compute start and end of region.
	 */
	src_start = vm_map_trunc_page(addr, effective_page_mask);
	src_end = vm_map_round_page(src_start + size, effective_page_mask);

	/*
	 * Initialize map_header.
	 */
	map_header->nentries = 0;
	map_header->entries_pageable = pageable;
//	map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
	map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
	map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
	vm_map_store_init(map_header);

	if (copy && vmk_flags.vmkf_remap_prot_copy) {
		/*
		 * Special case for vm_map_protect(VM_PROT_COPY):
		 * we want to set the new mappings' max protection to the
		 * specified *max_protection...
		 */
		max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
		/* ... but we want to use the vm_remap() legacy mode */
		*max_protection = VM_PROT_NONE;
		*cur_protection = VM_PROT_NONE;
	} else {
		max_prot_for_prot_copy = VM_PROT_NONE;
	}

	if (*cur_protection == VM_PROT_NONE &&
	    *max_protection == VM_PROT_NONE) {
		/*
		 * vm_remap() legacy mode:
		 * Extract all memory regions in the specified range and
		 * collect the strictest set of protections allowed on the
		 * entire range, so the caller knows what they can do with
		 * the remapped range.
		 * We start with VM_PROT_ALL and we'll remove the protections
		 * missing from each memory region.
		 */
		vm_remap_legacy = TRUE;
		*cur_protection = VM_PROT_ALL;
		*max_protection = VM_PROT_ALL;
		required_cur_prot = VM_PROT_NONE;
		required_max_prot = VM_PROT_NONE;
	} else {
		/*
		 * vm_remap_new() mode:
		 * Extract all memory regions in the specified range and
		 * ensure that they have at least the protections specified
		 * by the caller via *cur_protection and *max_protection.
		 * The resulting mapping should have these protections.
		 */
		vm_remap_legacy = FALSE;
		if (copy) {
			required_cur_prot = VM_PROT_NONE;
			required_max_prot = VM_PROT_READ;
		} else {
			required_cur_prot = *cur_protection;
			required_max_prot = *max_protection;
		}
	}

	map_address = 0;
	mapped_size = 0;
	result = KERN_SUCCESS;

	/*
	 * The specified source virtual space might correspond to
	 * multiple map entries, need to loop on them.
	 *
	 * The map lock is dropped and re-taken below around submap
	 * recursion and slow object copies; "src_entry" is only valid
	 * while the lock is held.
	 */
	vm_map_lock(map);

	/* record which kernel/user range the source belongs to, if any */
	if (map->pmap == kernel_pmap) {
		map_copy->is_kernel_range = true;
		map_copy->orig_range = kmem_addr_get_range(addr, size);
#if CONFIG_MAP_RANGES
	} else if (map->uses_user_ranges) {
		map_copy->is_user_range = true;
		map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
#endif /* CONFIG_MAP_RANGES */
	}

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		/*
		 * This address space uses sub-pages so the range might
		 * not be re-mappable in an address space with larger
		 * pages. Re-assemble any broken-up VM map entries to
		 * improve our chances of making it work.
		 */
		vm_map_simplify_range(map, src_start, src_end);
	}
	while (mapped_size != size) {
		vm_map_size_t entry_size;

		/*
		 * Find the beginning of the region.
		 */
		if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
			result = KERN_INVALID_ADDRESS;
			break;
		}

		/*
		 * After the first chunk, each entry must start exactly
		 * where the previous one ended: no holes in the range.
		 */
		if (src_start < src_entry->vme_start ||
		    (mapped_size && src_start != src_entry->vme_start)) {
			result = KERN_INVALID_ADDRESS;
			break;
		}

		/* clip the chunk to what this entry actually covers */
		tmp_size = size - mapped_size;
		if (src_end > src_entry->vme_end) {
			tmp_size -= (src_end - src_entry->vme_end);
		}

		entry_size = (vm_map_size_t)(src_entry->vme_end -
		    src_entry->vme_start);

		if (src_entry->is_sub_map &&
		    vmk_flags.vmkf_copy_single_object) {
			vm_map_t submap;
			vm_map_offset_t submap_start;
			vm_map_size_t submap_size;
			boolean_t submap_needs_copy;

			/*
			 * No check for "required protection" on "src_entry"
			 * because the protections that matter are the ones
			 * on the submap's VM map entry, which will be checked
			 * during the call to vm_map_remap_extract() below.
			 */
			submap_size = src_entry->vme_end - src_start;
			if (submap_size > size) {
				submap_size = size;
			}
			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
			submap = VME_SUBMAP(src_entry);
			if (copy) {
				/*
				 * The caller wants a copy-on-write re-mapping,
				 * so let's extract from the submap accordingly.
				 */
				submap_needs_copy = TRUE;
			} else if (src_entry->needs_copy) {
				/*
				 * The caller wants a shared re-mapping but the
				 * submap is mapped with "needs_copy", so its
				 * contents can't be shared as is. Extract the
				 * contents of the submap as "copy-on-write".
				 * The re-mapping won't be shared with the
				 * original mapping but this is equivalent to
				 * what happened with the original "remap from
				 * submap" code.
				 * The shared region is mapped "needs_copy", for
				 * example.
				 */
				submap_needs_copy = TRUE;
			} else {
				/*
				 * The caller wants a shared re-mapping and
				 * this mapping can be shared (no "needs_copy"),
				 * so let's extract from the submap accordingly.
				 * Kernel submaps are mapped without
				 * "needs_copy", for example.
				 */
				submap_needs_copy = FALSE;
			}
			/* take a ref so the submap survives the unlock */
			vm_map_reference(submap);
			vm_map_unlock(map);
			src_entry = NULL; /* not valid once map is unlocked */
			if (vm_remap_legacy) {
				*cur_protection = VM_PROT_NONE;
				*max_protection = VM_PROT_NONE;
			}

			DTRACE_VM7(remap_submap_recurse,
			    vm_map_t, map,
			    vm_map_offset_t, addr,
			    vm_map_size_t, size,
			    boolean_t, copy,
			    vm_map_offset_t, submap_start,
			    vm_map_size_t, submap_size,
			    boolean_t, submap_needs_copy);

			/* single-object extraction: recurse and return directly */
			result = vm_map_remap_extract(submap,
			    submap_start,
			    submap_size,
			    submap_needs_copy,
			    map_copy,
			    cur_protection,
			    max_protection,
			    inheritance,
			    vmk_flags);
			vm_map_deallocate(submap);
			return result;
		}

		if (src_entry->is_sub_map) {
			/* protections for submap mapping are irrelevant here */
		} else if (((src_entry->protection & required_cur_prot) !=
		    required_cur_prot) ||
		    ((src_entry->max_protection & required_max_prot) !=
		    required_max_prot)) {
			if (vmk_flags.vmkf_copy_single_object &&
			    mapped_size != 0) {
				/*
				 * Single object extraction.
				 * We can't extract more with the required
				 * protection but we've extracted some, so
				 * stop there and declare success.
				 * The caller should check the size of
				 * the copy entry we've extracted.
				 */
				result = KERN_SUCCESS;
			} else {
				/*
				 * VM range extraction.
				 * Required protection is not available
				 * for this part of the range: fail.
				 */
				result = KERN_PROTECTION_FAILURE;
			}
			break;
		}

		if (src_entry->is_sub_map) {
			vm_map_t submap;
			vm_map_offset_t submap_start;
			vm_map_size_t submap_size;
			vm_map_copy_t submap_copy;
			vm_prot_t submap_curprot, submap_maxprot;
			boolean_t submap_needs_copy;

			/*
			 * No check for "required protection" on "src_entry"
			 * because the protections that matter are the ones
			 * on the submap's VM map entry, which will be checked
			 * during the call to vm_map_copy_extract() below.
			 */
			object = VM_OBJECT_NULL;
			submap_copy = VM_MAP_COPY_NULL;

			/* find equivalent range in the submap */
			submap = VME_SUBMAP(src_entry);
			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
			submap_size = tmp_size;
			if (copy) {
				/*
				 * The caller wants a copy-on-write re-mapping,
				 * so let's extract from the submap accordingly.
				 */
				submap_needs_copy = TRUE;
			} else if (src_entry->needs_copy) {
				/*
				 * The caller wants a shared re-mapping but the
				 * submap is mapped with "needs_copy", so its
				 * contents can't be shared as is. Extract the
				 * contents of the submap as "copy-on-write".
				 * The re-mapping won't be shared with the
				 * original mapping but this is equivalent to
				 * what happened with the original "remap from
				 * submap" code.
				 * The shared region is mapped "needs_copy", for
				 * example.
				 */
				submap_needs_copy = TRUE;
			} else {
				/*
				 * The caller wants a shared re-mapping and
				 * this mapping can be shared (no "needs_copy"),
				 * so let's extract from the submap accordingly.
				 * Kernel submaps are mapped without
				 * "needs_copy", for example.
				 */
				submap_needs_copy = FALSE;
			}
			/* extra ref to keep submap alive */
			vm_map_reference(submap);

			DTRACE_VM7(remap_submap_recurse,
			    vm_map_t, map,
			    vm_map_offset_t, addr,
			    vm_map_size_t, size,
			    boolean_t, copy,
			    vm_map_offset_t, submap_start,
			    vm_map_size_t, submap_size,
			    boolean_t, submap_needs_copy);

			/*
			 * The map can be safely unlocked since we
			 * already hold a reference on the submap.
			 *
			 * No timestamp since we don't care if the map
			 * gets modified while we're down in the submap.
			 * We'll resume the extraction at src_start + tmp_size
			 * anyway.
			 */
			vm_map_unlock(map);
			src_entry = NULL; /* not valid once map is unlocked */

			if (vm_remap_legacy) {
				submap_curprot = VM_PROT_NONE;
				submap_maxprot = VM_PROT_NONE;
				if (max_prot_for_prot_copy) {
					submap_maxprot = max_prot_for_prot_copy;
				}
			} else {
				assert(!max_prot_for_prot_copy);
				submap_curprot = *cur_protection;
				submap_maxprot = *max_protection;
			}
			result = vm_map_copy_extract(submap,
			    submap_start,
			    submap_size,
			    submap_needs_copy,
			    &submap_copy,
			    &submap_curprot,
			    &submap_maxprot,
			    inheritance,
			    vmk_flags);

			/* release extra ref on submap */
			vm_map_deallocate(submap);
			submap = VM_MAP_NULL;

			if (result != KERN_SUCCESS) {
				vm_map_lock(map);
				break;
			}

			/* transfer submap_copy entries to map_header */
			while (vm_map_copy_first_entry(submap_copy) !=
			    vm_map_copy_to_entry(submap_copy)) {
				vm_map_entry_t copy_entry;
				vm_map_size_t copy_entry_size;

				copy_entry = vm_map_copy_first_entry(submap_copy);

				/*
				 * Prevent kernel_object from being exposed to
				 * user space.
				 */
				if (__improbable(copy_entry->vme_kernel_object)) {
					printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
					    proc_selfpid(),
					    (get_bsdtask_info(current_task())
					    ? proc_name_address(get_bsdtask_info(current_task()))
					    : "?"));
					DTRACE_VM(extract_kernel_only);
					result = KERN_INVALID_RIGHT;
					vm_map_copy_discard(submap_copy);
					submap_copy = VM_MAP_COPY_NULL;
					vm_map_lock(map);
					break;
				}

				vm_map_copy_entry_unlink(submap_copy, copy_entry);
				copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
				copy_entry->vme_start = map_address;
				copy_entry->vme_end = map_address + copy_entry_size;
				map_address += copy_entry_size;
				mapped_size += copy_entry_size;
				src_start += copy_entry_size;
				assert(src_start <= src_end);
				_vm_map_store_entry_link(map_header,
				    map_header->links.prev,
				    copy_entry);
			}
			/* done with submap_copy */
			/*
			 * NOTE(review): on the kernel_object error path above,
			 * "submap_copy" was already discarded (and NULLed) and
			 * the map lock re-taken before the inner "break"; the
			 * vm_map_lock() below would then be a second
			 * acquisition, and "continue" would resume extraction
			 * even though "result" holds an error — verify this
			 * path (the map lock is presumably not recursive).
			 */
			vm_map_copy_discard(submap_copy);

			if (vm_remap_legacy) {
				/* narrow to the strictest protections seen */
				*cur_protection &= submap_curprot;
				*max_protection &= submap_maxprot;
			}

			/* re-acquire the map lock and continue to next entry */
			vm_map_lock(map);
			continue;
		} else {
			object = VME_OBJECT(src_entry);

			/*
			 * Prevent kernel_object from being exposed to
			 * user space.
			 */
			if (__improbable(object == kernel_object)) {
				printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
				    proc_selfpid(),
				    (get_bsdtask_info(current_task())
				    ? proc_name_address(get_bsdtask_info(current_task()))
				    : "?"));
				DTRACE_VM(extract_kernel_only);
				result = KERN_INVALID_RIGHT;
				break;
			}

			if (src_entry->iokit_acct) {
				/*
				 * This entry uses "IOKit accounting".
				 */
			} else if (object != VM_OBJECT_NULL &&
			    (object->purgable != VM_PURGABLE_DENY ||
			    object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
				/*
				 * Purgeable objects have their own accounting:
				 * no pmap accounting for them.
				 */
				assertf(!src_entry->use_pmap,
				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
				    map,
				    src_entry,
				    (uint64_t)src_entry->vme_start,
				    (uint64_t)src_entry->vme_end,
				    src_entry->protection,
				    src_entry->max_protection,
				    VME_ALIAS(src_entry));
			} else {
				/*
				 * Not IOKit or purgeable:
				 * must be accounted by pmap stats.
				 */
				assertf(src_entry->use_pmap,
				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
				    map,
				    src_entry,
				    (uint64_t)src_entry->vme_start,
				    (uint64_t)src_entry->vme_end,
				    src_entry->protection,
				    src_entry->max_protection,
				    VME_ALIAS(src_entry));
			}

			if (object == VM_OBJECT_NULL) {
				assert(!src_entry->needs_copy);
				if (src_entry->max_protection == VM_PROT_NONE) {
					assert(src_entry->protection == VM_PROT_NONE);
					/*
					 * No VM object and no permissions:
					 * this must be a reserved range with
					 * nothing to share or copy.
					 * There could also be all sorts of
					 * pmap shenanigans within that reserved
					 * range, so let's just copy the map
					 * entry as is to remap a similar
					 * reserved range.
					 */
					offset = 0; /* no object => no offset */
					goto copy_src_entry;
				}
				/* lazily materialize a backing object */
				object = vm_object_allocate(entry_size);
				VME_OFFSET_SET(src_entry, 0);
				VME_OBJECT_SET(src_entry, object, false, 0);
				assert(src_entry->use_pmap);
				assert(!map->mapped_in_other_pmaps);
			} else if (src_entry->wired_count ||
			    object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
				/*
				 * A wired memory region should not have
				 * any pending copy-on-write and needs to
				 * keep pointing at the VM object that
				 * contains the wired pages.
				 * If we're sharing this memory (copy=false),
				 * we'll share this VM object.
				 * If we're copying this memory (copy=true),
				 * we'll call vm_object_copy_slowly() below
				 * and use the new VM object for the remapping.
				 *
				 * Or, we are already using an asymmetric
				 * copy, and therefore we already have
				 * the right object.
				 */
				assert(!src_entry->needs_copy);
			} else if (src_entry->needs_copy || object->shadowed ||
			    (object->internal && !object->true_share &&
			    !src_entry->is_shared &&
			    object->vo_size > entry_size)) {
				/* resolve pending COW by interposing a shadow object */
				VME_OBJECT_SHADOW(src_entry, entry_size,
				    vm_map_always_shadow(map));
				assert(src_entry->use_pmap);

				if (!src_entry->needs_copy &&
				    (src_entry->protection & VM_PROT_WRITE)) {
					vm_prot_t prot;

					assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));

					/* write-protect so future writes trigger the COW fault */
					prot = src_entry->protection & ~VM_PROT_WRITE;

					if (override_nx(map,
					    VME_ALIAS(src_entry))
					    && prot) {
						prot |= VM_PROT_EXECUTE;
					}

					assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));

					if (map->mapped_in_other_pmaps) {
						vm_object_pmap_protect(
							VME_OBJECT(src_entry),
							VME_OFFSET(src_entry),
							entry_size,
							PMAP_NULL,
							PAGE_SIZE,
							src_entry->vme_start,
							prot);
#if MACH_ASSERT
					} else if (__improbable(map->pmap == PMAP_NULL)) {
						extern boolean_t vm_tests_in_progress;
						assert(vm_tests_in_progress);
						/*
						 * Some VM tests (in vm_tests.c)
						 * sometimes want to use a VM
						 * map without a pmap.
						 * Otherwise, this should never
						 * happen.
						 */
#endif /* MACH_ASSERT */
					} else {
						pmap_protect(vm_map_pmap(map),
						    src_entry->vme_start,
						    src_entry->vme_end,
						    prot);
					}
				}

				object = VME_OBJECT(src_entry);
				src_entry->needs_copy = FALSE;
			}


			vm_object_lock(object);
			vm_object_reference_locked(object); /* object ref. for new entry */
			assert(!src_entry->needs_copy);
			if (object->copy_strategy ==
			    MEMORY_OBJECT_COPY_SYMMETRIC) {
				/*
				 * If we want to share this object (copy==0),
				 * it needs to be COPY_DELAY.
				 * If we want to copy this object (copy==1),
				 * we can't just set "needs_copy" on our side
				 * and expect the other side to do the same
				 * (symmetrically), so we can't let the object
				 * stay COPY_SYMMETRIC.
				 * So we always switch from COPY_SYMMETRIC to
				 * COPY_DELAY.
				 */
				object->copy_strategy =
				    MEMORY_OBJECT_COPY_DELAY;
				object->true_share = TRUE;
			}
			vm_object_unlock(object);
		}

		/* offset of this chunk within the source object */
		offset = (VME_OFFSET(src_entry) +
		    (src_start - src_entry->vme_start));

copy_src_entry:
		new_entry = _vm_map_entry_create(map_header);
		vm_map_entry_copy(map, new_entry, src_entry);
		if (new_entry->is_sub_map) {
			/* clr address space specifics */
			new_entry->use_pmap = FALSE;
		} else if (copy) {
			/*
			 * We're dealing with a copy-on-write operation,
			 * so the resulting mapping should not inherit the
			 * original mapping's accounting settings.
			 * "use_pmap" should be reset to its default (TRUE)
			 * so that the new mapping gets accounted for in
			 * the task's memory footprint.
			 */
			new_entry->use_pmap = TRUE;
		}
		/* "iokit_acct" was cleared in vm_map_entry_copy() */
		assert(!new_entry->iokit_acct);

		new_entry->map_aligned = FALSE;

		new_entry->vme_start = map_address;
		new_entry->vme_end = map_address + tmp_size;
		assert(new_entry->vme_start < new_entry->vme_end);
		if (copy && vmk_flags.vmkf_remap_prot_copy) {
			/* security: keep "permanent" and "csm_associated" */
			new_entry->vme_permanent = src_entry->vme_permanent;
			new_entry->csm_associated = src_entry->csm_associated;
			/*
			 * Remapping for vm_map_protect(VM_PROT_COPY)
			 * to convert a read-only mapping into a
			 * copy-on-write version of itself but
			 * with write access:
			 * keep the original inheritance but let's not
			 * add VM_PROT_WRITE to the max protection yet
			 * since we want to do more security checks against
			 * the target map.
			 */
			new_entry->inheritance = src_entry->inheritance;
			new_entry->protection &= max_prot_for_prot_copy;
		} else {
			new_entry->inheritance = inheritance;
			if (!vm_remap_legacy) {
				new_entry->protection = *cur_protection;
				new_entry->max_protection = *max_protection;
			}
		}
		VME_OFFSET_SET(new_entry, offset);

		/*
		 * The new region has to be copied now if required.
		 */
RestartCopy:
		if (!copy) {
			if (src_entry->used_for_jit == TRUE) {
				if (same_map) {
					/* sharing JIT within the same map is fine */
				} else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
					/*
					 * Cannot allow an entry describing a JIT
					 * region to be shared across address spaces.
					 */
					result = KERN_INVALID_ARGUMENT;
					vm_object_deallocate(object);
					vm_map_entry_dispose(new_entry);
					new_entry = VM_MAP_ENTRY_NULL;
					break;
				}
			}

			src_entry->is_shared = TRUE;
			new_entry->is_shared = TRUE;
			if (!(new_entry->is_sub_map)) {
				new_entry->needs_copy = FALSE;
			}
		} else if (src_entry->is_sub_map) {
			/* make this a COW sub_map if not already */
			assert(new_entry->wired_count == 0);
			new_entry->needs_copy = TRUE;
			object = VM_OBJECT_NULL;
		} else if (src_entry->wired_count == 0 &&
		    !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
		    vm_object_copy_quickly(VME_OBJECT(new_entry),
		    VME_OFFSET(new_entry),
		    (new_entry->vme_end -
		    new_entry->vme_start),
		    &src_needs_copy,
		    &new_entry_needs_copy)) {
			/* fast symmetric COW copy succeeded */
			new_entry->needs_copy = new_entry_needs_copy;
			new_entry->is_shared = FALSE;
			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);

			/*
			 * Handle copy_on_write semantics.
			 */
			if (src_needs_copy && !src_entry->needs_copy) {
				vm_prot_t prot;

				assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));

				prot = src_entry->protection & ~VM_PROT_WRITE;

				if (override_nx(map,
				    VME_ALIAS(src_entry))
				    && prot) {
					prot |= VM_PROT_EXECUTE;
				}

				assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));

				vm_object_pmap_protect(object,
				    offset,
				    entry_size,
				    ((src_entry->is_shared
				    || map->mapped_in_other_pmaps) ?
				    PMAP_NULL : map->pmap),
				    VM_MAP_PAGE_SIZE(map),
				    src_entry->vme_start,
				    prot);

				assert(src_entry->wired_count == 0);
				src_entry->needs_copy = TRUE;
			}
			/*
			 * Throw away the old object reference of the new entry.
			 */
			vm_object_deallocate(object);
		} else {
			/* slow path: wired pages or quick copy refused */
			new_entry->is_shared = FALSE;
			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);

			src_entry_was_wired = (src_entry->wired_count > 0);
			saved_src_entry = src_entry;
			src_entry = VM_MAP_ENTRY_NULL;

			/*
			 * The map can be safely unlocked since we
			 * already hold a reference on the object.
			 *
			 * Record the timestamp of the map for later
			 * verification, and unlock the map.
			 */
			version.main_timestamp = map->timestamp;
			vm_map_unlock(map);     /* Increments timestamp once! */

			/*
			 * Perform the copy.
			 */
			if (src_entry_was_wired > 0 ||
			    (debug4k_no_cow_copyin &&
			    VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
				vm_object_lock(object);
				result = vm_object_copy_slowly(
					object,
					offset,
					(new_entry->vme_end -
					new_entry->vme_start),
					THREAD_UNINT,
					&new_copy_object);
				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
				saved_used_for_jit = new_entry->used_for_jit;
#if __arm64e__
				saved_used_for_tpro = new_entry->used_for_tpro;
#endif
				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
				new_entry->used_for_jit = saved_used_for_jit;
#if __arm64e__
				new_entry->used_for_tpro = saved_used_for_tpro;
#endif
				VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
				new_entry->needs_copy = FALSE;
			} else {
				vm_object_offset_t new_offset;

				new_offset = VME_OFFSET(new_entry);
				result = vm_object_copy_strategically(
					object,
					offset,
					(new_entry->vme_end -
					new_entry->vme_start),
					&new_copy_object,
					&new_offset,
					&new_entry_needs_copy);
				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
				saved_used_for_jit = new_entry->used_for_jit;
#if __arm64e__
				saved_used_for_tpro = new_entry->used_for_tpro;
#endif
				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
				new_entry->used_for_jit = saved_used_for_jit;
#if __arm64e__
				new_entry->used_for_tpro = saved_used_for_tpro;
#endif
				if (new_offset != VME_OFFSET(new_entry)) {
					VME_OFFSET_SET(new_entry, new_offset);
				}

				new_entry->needs_copy = new_entry_needs_copy;
			}

			/*
			 * Throw away the old object reference of the new entry.
			 */
			vm_object_deallocate(object);

			if (result != KERN_SUCCESS &&
			    result != KERN_MEMORY_RESTART_COPY) {
				vm_map_entry_dispose(new_entry);
				vm_map_lock(map);
				break;
			}

			/*
			 * Verify that the map has not substantially
			 * changed while the copy was being made.
			 */

			vm_map_lock(map);
			if (version.main_timestamp + 1 != map->timestamp) {
				/*
				 * Simple version comparison failed.
				 *
				 * Retry the lookup and verify that the
				 * same object/offset are still present.
				 */
				saved_src_entry = VM_MAP_ENTRY_NULL;
				vm_object_deallocate(VME_OBJECT(new_entry));
				vm_map_entry_dispose(new_entry);
				if (result == KERN_MEMORY_RESTART_COPY) {
					result = KERN_SUCCESS;
				}
				continue;
			}
			/* map hasn't changed: src_entry is still valid */
			src_entry = saved_src_entry;
			saved_src_entry = VM_MAP_ENTRY_NULL;

			if (result == KERN_MEMORY_RESTART_COPY) {
				vm_object_reference(object);
				goto RestartCopy;
			}
		}

		_vm_map_store_entry_link(map_header,
		    map_header->links.prev, new_entry);

		/* protections for submap mapping are irrelevant here */
		if (vm_remap_legacy && !src_entry->is_sub_map) {
			*cur_protection &= src_entry->protection;
			*max_protection &= src_entry->max_protection;
		}

		/* advance to the next chunk */
		map_address += tmp_size;
		mapped_size += tmp_size;
		src_start += tmp_size;

		if (vmk_flags.vmkf_copy_single_object) {
			if (mapped_size != size) {
				DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
				    map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
				if (src_entry->vme_next != vm_map_to_entry(map) &&
				    src_entry->vme_next->vme_object_value ==
				    src_entry->vme_object_value) {
					/* XXX TODO4K */
					DEBUG4K_ERROR("could have extended copy to next entry...\n");
				}
			}
			break;
		}
	} /* end while */

	vm_map_unlock(map);
	if (result != KERN_SUCCESS) {
		/*
		 * Free all allocated elements.
		 */
		for (src_entry = map_header->links.next;
		    src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
		    src_entry = new_entry) {
			new_entry = src_entry->vme_next;
			_vm_map_store_entry_unlink(map_header, src_entry, false);
			if (src_entry->is_sub_map) {
				vm_map_deallocate(VME_SUBMAP(src_entry));
			} else {
				vm_object_deallocate(VME_OBJECT(src_entry));
			}
			vm_map_entry_dispose(src_entry);
		}
	}
	return result;
}
17692
17693 bool
vm_map_is_exotic(vm_map_t map)17694 vm_map_is_exotic(
17695 vm_map_t map)
17696 {
17697 return VM_MAP_IS_EXOTIC(map);
17698 }
17699
17700 bool
vm_map_is_alien(vm_map_t map)17701 vm_map_is_alien(
17702 vm_map_t map)
17703 {
17704 return VM_MAP_IS_ALIEN(map);
17705 }
17706
#if XNU_TARGET_OS_OSX
/*
 * Set the map's "is_alien" flag (presumably marking a foreign/
 * translated address space — see VM_MAP_IS_ALIEN; confirm with
 * callers).  The write is done under the exclusive map lock so
 * concurrent readers of the map's flags see a consistent state.
 * The flag is only ever set here, never cleared.
 */
void
vm_map_mark_alien(
	vm_map_t map)
{
	vm_map_lock(map);
	map->is_alien = true;
	vm_map_unlock(map);
}
17716
/*
 * Set the map's "single_jit" flag (presumably restricting the task
 * to a single JIT region — TODO confirm against the JIT policy
 * checks that read it).  The write is done under the exclusive map
 * lock; the flag is only ever set here, never cleared.
 */
void
vm_map_single_jit(
	vm_map_t map)
{
	vm_map_lock(map);
	map->single_jit = true;
	vm_map_unlock(map);
}
#endif /* XNU_TARGET_OS_OSX */
17726
17727 /*
17728 * Callers of this function must call vm_map_copy_require on
17729 * previously created vm_map_copy_t or pass a newly created
17730 * one to ensure that it hasn't been forged.
17731 */
static kern_return_t
vm_map_copy_to_physcopy(
	vm_map_copy_t   copy_map,
	vm_map_t        target_map)
{
	vm_map_size_t           size;
	vm_map_entry_t          entry;
	vm_map_entry_t          new_entry;
	vm_object_t             new_object;
	unsigned int            pmap_flags;
	pmap_t                  new_pmap;
	vm_map_t                new_map;
	vm_map_address_t        src_start, src_end, src_cur;
	vm_map_address_t        dst_start, dst_end, dst_cur;
	kern_return_t           kr;
	void                    *kbuf;

	/*
	 * Perform the equivalent of vm_allocate() and memcpy().
	 * Replace the mappings in "copy_map" with the newly allocated mapping.
	 *
	 * This is used when "copy_map" (at a smaller page size, e.g. 4K) is
	 * mis-aligned for "target_map" (e.g. 16K) and copy-on-write remapping
	 * is therefore impossible: the contents are physically copied into a
	 * single new VM object that is aligned for "target_map".
	 */
	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);

	/*
	 * NOTE(review): this compares a page *shift* against a page *mask*,
	 * which looks like it was meant to be VM_MAP_PAGE_SHIFT(target_map);
	 * as written it is effectively always true — confirm intent.
	 */
	assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));

	/* create a new pmap to map "copy_map" */
	pmap_flags = 0;
	/* the copy being converted is expected to use 4K pages */
	assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
#if PMAP_CREATE_FORCE_4K_PAGES
	pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
#endif /* PMAP_CREATE_FORCE_4K_PAGES */
	pmap_flags |= PMAP_CREATE_64BIT;
	new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
	if (new_pmap == NULL) {
		return KERN_RESOURCE_SHORTAGE;
	}

	/* allocate new VM object, rounded up to the kernel page size */
	size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
	new_object = vm_object_allocate(size);
	assert(new_object);

	/* allocate new VM map entry (in "copy_map"'s entry zone) */
	new_entry = vm_map_copy_entry_create(copy_map);
	assert(new_entry);

	/* finish initializing new VM map entry */
	new_entry->protection = VM_PROT_DEFAULT;
	new_entry->max_protection = VM_PROT_DEFAULT;
	new_entry->use_pmap = TRUE;

	/* make new VM map entry point to new VM object */
	new_entry->vme_start = 0;
	new_entry->vme_end = size;
	/* this consumes the initial reference on "new_object" */
	VME_OBJECT_SET(new_entry, new_object, false, 0);
	VME_OFFSET_SET(new_entry, 0);

	/* create a new pageable VM map to map "copy_map" */
	new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
	    VM_MAP_CREATE_PAGEABLE);
	assert(new_map);
	/* the temporary map uses the copy's (small) page size */
	vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);

	/* map "copy_map" in the new VM map (not consumed: we reuse it below) */
	src_start = 0;
	kr = vm_map_copyout_internal(
		new_map,
		&src_start,
		copy_map,
		copy_map->size,
		FALSE,             /* consume_on_success */
		VM_PROT_DEFAULT,
		VM_PROT_DEFAULT,
		VM_INHERIT_DEFAULT);
	/* NOTE(review): failure is only caught on DEBUG builds */
	assert(kr == KERN_SUCCESS);
	src_end = src_start + copy_map->size;

	/* map "new_object" in the new VM map */
	/* extra reference: vm_map_enter() consumes one on success */
	vm_object_reference(new_object);
	dst_start = 0;
	kr = vm_map_enter(new_map,
	    &dst_start,
	    size,
	    0,                 /* mask */
	    VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
	    new_object,
	    0,                 /* offset */
	    FALSE,             /* needs copy */
	    VM_PROT_DEFAULT,
	    VM_PROT_DEFAULT,
	    VM_INHERIT_DEFAULT);
	assert(kr == KERN_SUCCESS);
	dst_end = dst_start + size;

	/* get a kernel buffer (one page, used as a bounce buffer) */
	kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);

	/* physically copy "copy_map" mappings to new VM object */
	for (src_cur = src_start, dst_cur = dst_start;
	    src_cur < src_end;
	    src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
		vm_size_t bytes;

		bytes = PAGE_SIZE;
		if (src_cur + PAGE_SIZE > src_end) {
			/* partial copy for last page */
			bytes = src_end - src_cur;
			assert(bytes > 0 && bytes < PAGE_SIZE);
			/* rest of dst page should be zero-filled */
		}
		/* get bytes from src mapping */
		kr = copyinmap(new_map, src_cur, kbuf, bytes);
		if (kr != KERN_SUCCESS) {
			/* failure is logged but not propagated (best effort) */
			DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
		}
		/* put bytes in dst mapping */
		assert(dst_cur < dst_end);
		assert(dst_cur + bytes <= dst_end);
		kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
		if (kr != KERN_SUCCESS) {
			DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
		}
	}

	/* free kernel buffer */
	kfree_data(kbuf, PAGE_SIZE);

	/* destroy new map (also releases "new_pmap" and the temp mappings) */
	vm_map_destroy(new_map);
	new_map = VM_MAP_NULL;

	/* dispose of the old map entries in "copy_map" */
	while (vm_map_copy_first_entry(copy_map) !=
	    vm_map_copy_to_entry(copy_map)) {
		entry = vm_map_copy_first_entry(copy_map);
		vm_map_copy_entry_unlink(copy_map, entry);
		if (entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(entry));
		} else {
			vm_object_deallocate(VME_OBJECT(entry));
		}
		vm_map_copy_entry_dispose(entry);
	}

	/* change "copy_map"'s page_size to match "target_map" */
	copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
	copy_map->offset = 0;
	copy_map->size = size;

	/* insert new map entry in "copy_map" (now its only entry) */
	assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
	vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);

	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
	return KERN_SUCCESS;
}
17888
17889 void
17890 vm_map_copy_adjust_get_target_copy_map(
17891 vm_map_copy_t copy_map,
17892 vm_map_copy_t *target_copy_map_p);
17893 void
vm_map_copy_adjust_get_target_copy_map(vm_map_copy_t copy_map,vm_map_copy_t * target_copy_map_p)17894 vm_map_copy_adjust_get_target_copy_map(
17895 vm_map_copy_t copy_map,
17896 vm_map_copy_t *target_copy_map_p)
17897 {
17898 vm_map_copy_t target_copy_map;
17899 vm_map_entry_t entry, target_entry;
17900
17901 if (*target_copy_map_p != VM_MAP_COPY_NULL) {
17902 /* the caller already has a "target_copy_map": use it */
17903 return;
17904 }
17905
17906 /* the caller wants us to create a new copy of "copy_map" */
17907 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
17908 target_copy_map = vm_map_copy_allocate(copy_map->type);
17909 target_copy_map->offset = copy_map->offset;
17910 target_copy_map->size = copy_map->size;
17911 target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
17912 for (entry = vm_map_copy_first_entry(copy_map);
17913 entry != vm_map_copy_to_entry(copy_map);
17914 entry = entry->vme_next) {
17915 target_entry = vm_map_copy_entry_create(target_copy_map);
17916 vm_map_entry_copy_full(target_entry, entry);
17917 if (target_entry->is_sub_map) {
17918 vm_map_reference(VME_SUBMAP(target_entry));
17919 } else {
17920 vm_object_reference(VME_OBJECT(target_entry));
17921 }
17922 vm_map_copy_entry_link(
17923 target_copy_map,
17924 vm_map_copy_last_entry(target_copy_map),
17925 target_entry);
17926 }
17927 entry = VM_MAP_ENTRY_NULL;
17928 *target_copy_map_p = target_copy_map;
17929 }
17930
17931 /*
17932 * Callers of this function must call vm_map_copy_require on
17933 * previously created vm_map_copy_t or pass a newly created
17934 * one to ensure that it hasn't been forged.
17935 */
static void
vm_map_copy_trim(
	vm_map_copy_t   copy_map,
	uint16_t        new_page_shift,
	vm_map_offset_t trim_start,
	vm_map_offset_t trim_end)
{
	uint16_t        copy_page_shift;
	vm_map_entry_t  entry, next_entry;

	/*
	 * Remove the [trim_start, trim_end) range from "copy_map",
	 * clipping entries at "new_page_shift" granularity.
	 * "trim_start"/"trim_end" are offsets relative to the start of the
	 * first entry (converted to absolute addresses below).
	 * "copy_map"'s size is reduced accordingly, but its original
	 * page_shift is restored before returning.
	 */
	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
	assert(copy_map->cpy_hdr.nentries > 0);

	/* convert relative offsets to absolute entry addresses */
	trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
	trim_end += vm_map_copy_first_entry(copy_map)->vme_start;

	/* use the new page_shift to do the clipping */
	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
	copy_map->cpy_hdr.page_shift = new_page_shift;

	for (entry = vm_map_copy_first_entry(copy_map);
	    entry != vm_map_copy_to_entry(copy_map);
	    entry = next_entry) {
		/* capture the successor before possibly unlinking "entry" */
		next_entry = entry->vme_next;
		if (entry->vme_end <= trim_start) {
			/* entry fully before trim range: skip */
			continue;
		}
		if (entry->vme_start >= trim_end) {
			/* entry fully after trim range: done */
			break;
		}
		/* clip entry if needed (so we only remove the overlap) */
		vm_map_copy_clip_start(copy_map, entry, trim_start);
		vm_map_copy_clip_end(copy_map, entry, trim_end);
		/* dispose of entry and drop its submap/object reference */
		copy_map->size -= entry->vme_end - entry->vme_start;
		vm_map_copy_entry_unlink(copy_map, entry);
		if (entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(entry));
		} else {
			vm_object_deallocate(VME_OBJECT(entry));
		}
		vm_map_copy_entry_dispose(entry);
		entry = VM_MAP_ENTRY_NULL;
	}

	/* restore copy_map's original page_shift */
	copy_map->cpy_hdr.page_shift = copy_page_shift;
}
17986
17987 /*
17988 * Make any necessary adjustments to "copy_map" to allow it to be
17989 * mapped into "target_map".
17990 * If no changes were necessary, "target_copy_map" points to the
17991 * untouched "copy_map".
17992 * If changes are necessary, changes will be made to "target_copy_map".
17993 * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
17994 * copy the original "copy_map" to it before applying the changes.
17995 * The caller should discard "target_copy_map" if it's not the same as
17996 * the original "copy_map".
17997 */
17998 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
kern_return_t
vm_map_copy_adjust_to_target(
	vm_map_copy_t   src_copy_map,
	vm_map_offset_t offset,         /* offset of interest within src_copy_map */
	vm_map_size_t   size,           /* size of the range of interest */
	vm_map_t        target_map,     /* map whose page size we must satisfy */
	boolean_t       copy,           /* TRUE: copying, FALSE: sharing */
	vm_map_copy_t   *target_copy_map_p,     /* IN/OUT: adjusted copy map */
	vm_map_offset_t *overmap_start_p,       /* OUT: extra bytes mapped at start */
	vm_map_offset_t *overmap_end_p,         /* OUT: extra bytes mapped at end */
	vm_map_offset_t *trimmed_start_p)       /* OUT: bytes trimmed from start */
{
	vm_map_copy_t   copy_map, target_copy_map;
	vm_map_size_t   target_size;
	vm_map_size_t   src_copy_map_size;
	vm_map_size_t   overmap_start, overmap_end;
	int             misalignments;
	vm_map_entry_t  entry, target_entry;
	vm_map_offset_t addr_adjustment;
	vm_map_offset_t new_start, new_end;
	int             copy_page_mask, target_page_mask;
	uint16_t        copy_page_shift, target_page_shift;
	vm_map_offset_t trimmed_end;

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(src_copy_map);
	assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);

	/*
	 * Start working with "src_copy_map" but we'll switch
	 * to "target_copy_map" as soon as we start making adjustments.
	 */
	copy_map = src_copy_map;
	src_copy_map_size = src_copy_map->size;

	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
	copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
	target_page_mask = VM_MAP_PAGE_MASK(target_map);

	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, *target_copy_map_p);

	target_copy_map = *target_copy_map_p;
	if (target_copy_map != VM_MAP_COPY_NULL) {
		vm_map_copy_require(target_copy_map);
	}

	/* the range of interest must fit within the copy map */
	if (offset + size > copy_map->size) {
		DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)offset, (uint64_t)size);
		return KERN_INVALID_ARGUMENT;
	}

	/* trim the end (anything past the range, rounded to target pages) */
	trimmed_end = 0;
	new_end = VM_MAP_ROUND_PAGE(offset + size, target_page_mask);
	if (new_end < copy_map->size) {
		trimmed_end = src_copy_map_size - new_end;
		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
		/* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    new_end, copy_map->size);
	}

	/* trim the start (anything before the range, truncated to target pages) */
	new_start = VM_MAP_TRUNC_PAGE(offset, target_page_mask);
	if (new_start != 0) {
		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)0, (uint64_t)new_start);
		/* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    0, new_start);
	}
	*trimmed_start_p = new_start;

	/* target_size starts with what's left after trimming */
	target_size = copy_map->size;
	assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
	    "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
	    (uint64_t)target_size, (uint64_t)src_copy_map_size,
	    (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);

	/* check for misalignments but don't adjust yet */
	misalignments = 0;
	overmap_start = 0;
	overmap_end = 0;
	if (copy_page_shift < target_page_shift) {
		/*
		 * Remapping from 4K to 16K: check the VM object alignments
		 * throughout the range.
		 * If the start and end of the range are mis-aligned, we can
		 * over-map to re-align, and adjust the "overmap" start/end
		 * and "target_size" of the range accordingly.
		 * If there is any mis-alignment within the range:
		 *	if "copy":
		 *		we can do immediate-copy instead of copy-on-write,
		 *	else:
		 *		no way to remap and share; fail.
		 */
		for (entry = vm_map_copy_first_entry(copy_map);
		    entry != vm_map_copy_to_entry(copy_map);
		    entry = entry->vme_next) {
			vm_object_offset_t object_offset_start, object_offset_end;

			object_offset_start = VME_OFFSET(entry);
			object_offset_end = object_offset_start;
			object_offset_end += entry->vme_end - entry->vme_start;
			if (object_offset_start & target_page_mask) {
				/* first entry can be fixed by over-mapping, sharing only */
				if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
					overmap_start++;
				} else {
					misalignments++;
				}
			}
			if (object_offset_end & target_page_mask) {
				/* last entry can be fixed by over-mapping, sharing only */
				if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
					overmap_end++;
				} else {
					misalignments++;
				}
			}
		}
	}
	entry = VM_MAP_ENTRY_NULL;

	/* decide how to deal with misalignments */
	assert(overmap_start <= 1);
	assert(overmap_end <= 1);
	if (!overmap_start && !overmap_end && !misalignments) {
		/* copy_map is properly aligned for target_map ... */
		if (*trimmed_start_p) {
			/* ... but we trimmed it, so still need to adjust */
		} else {
			/* ... and we didn't trim anything: we're done */
			if (target_copy_map == VM_MAP_COPY_NULL) {
				target_copy_map = copy_map;
			}
			*target_copy_map_p = target_copy_map;
			*overmap_start_p = 0;
			*overmap_end_p = 0;
			DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
			return KERN_SUCCESS;
		}
	} else if (misalignments && !copy) {
		/* can't "share" if misaligned */
		DEBUG4K_ADJUST("unsupported sharing\n");
#if MACH_ASSERT
		if (debug4k_panic_on_misaligned_sharing) {
			panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
		}
#endif /* MACH_ASSERT */
		DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
		return KERN_NOT_SUPPORTED;
	} else {
		/* can't virtual-copy if misaligned (but can physical-copy) */
		DEBUG4K_ADJUST("mis-aligned copying\n");
	}

	/* get a "target_copy_map" if needed and switch to it */
	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
	copy_map = target_copy_map;

	if (misalignments && copy) {
		vm_map_size_t target_copy_map_size;

		/*
		 * Can't do copy-on-write with misaligned mappings.
		 * Replace the mappings with a physical copy of the original
		 * mappings' contents.
		 */
		target_copy_map_size = target_copy_map->size;
		kern_return_t kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		*target_copy_map_p = target_copy_map;
		*overmap_start_p = 0;
		/* physcopy rounded the size up: report the growth as overmap_end */
		*overmap_end_p = target_copy_map->size - target_copy_map_size;
		DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
		return KERN_SUCCESS;
	}

	/* apply the adjustments */
	misalignments = 0;
	overmap_start = 0;
	overmap_end = 0;
	/* remove copy_map->offset, so that everything starts at offset 0 */
	addr_adjustment = copy_map->offset;
	/* also remove whatever we trimmed from the start */
	addr_adjustment += *trimmed_start_p;
	for (target_entry = vm_map_copy_first_entry(target_copy_map);
	    target_entry != vm_map_copy_to_entry(target_copy_map);
	    target_entry = target_entry->vme_next) {
		vm_object_offset_t object_offset_start, object_offset_end;

		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
		object_offset_start = VME_OFFSET(target_entry);
		if (object_offset_start & target_page_mask) {
			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
			if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
				/*
				 * start of 1st entry is mis-aligned:
				 * re-adjust by over-mapping.
				 */
				overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
				/* pull the object offset back to a target-page boundary */
				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
			} else {
				misalignments++;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
				/* interior misalignment only tolerated when copying */
				assert(copy);
			}
		}

		if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
			target_size += overmap_start;
		} else {
			/* shift later entries by the same amount the first entry grew */
			target_entry->vme_start += overmap_start;
		}
		target_entry->vme_end += overmap_start;

		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
		if (object_offset_end & target_page_mask) {
			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
			if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
				/*
				 * end of last entry is mis-aligned: re-adjust by over-mapping.
				 */
				overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
				target_entry->vme_end += overmap_end;
				target_size += overmap_end;
			} else {
				misalignments++;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
				assert(copy);
			}
		}
		/* rebase the entry so the copy map starts at address 0 */
		target_entry->vme_start -= addr_adjustment;
		target_entry->vme_end -= addr_adjustment;
		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
	}

	target_copy_map->size = target_size;
	target_copy_map->offset += overmap_start;
	target_copy_map->offset -= addr_adjustment;
	target_copy_map->cpy_hdr.page_shift = target_page_shift;

//	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
//	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
	assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
	assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));

	*target_copy_map_p = target_copy_map;
	*overmap_start_p = overmap_start;
	*overmap_end_p = overmap_end;

	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
	return KERN_SUCCESS;
}
18266
18267 kern_return_t
vm_map_range_physical_size(vm_map_t map,vm_map_address_t start,mach_vm_size_t size,mach_vm_size_t * phys_size)18268 vm_map_range_physical_size(
18269 vm_map_t map,
18270 vm_map_address_t start,
18271 mach_vm_size_t size,
18272 mach_vm_size_t * phys_size)
18273 {
18274 kern_return_t kr;
18275 vm_map_copy_t copy_map, target_copy_map;
18276 vm_map_offset_t adjusted_start, adjusted_end;
18277 vm_map_size_t adjusted_size;
18278 vm_prot_t cur_prot, max_prot;
18279 vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
18280 vm_map_kernel_flags_t vmk_flags;
18281
18282 if (size == 0) {
18283 DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
18284 *phys_size = 0;
18285 return KERN_SUCCESS;
18286 }
18287
18288 adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
18289 adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
18290 if (__improbable(os_add_overflow(start, size, &end) ||
18291 adjusted_end <= adjusted_start)) {
18292 /* wraparound */
18293 printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
18294 *phys_size = 0;
18295 return KERN_INVALID_ARGUMENT;
18296 }
18297 assert(adjusted_end > adjusted_start);
18298 adjusted_size = adjusted_end - adjusted_start;
18299 *phys_size = adjusted_size;
18300 if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
18301 return KERN_SUCCESS;
18302 }
18303 if (start == 0) {
18304 adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
18305 adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
18306 if (__improbable(adjusted_end <= adjusted_start)) {
18307 /* wraparound */
18308 printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
18309 *phys_size = 0;
18310 return KERN_INVALID_ARGUMENT;
18311 }
18312 assert(adjusted_end > adjusted_start);
18313 adjusted_size = adjusted_end - adjusted_start;
18314 *phys_size = adjusted_size;
18315 return KERN_SUCCESS;
18316 }
18317
18318 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
18319 vmk_flags.vmkf_copy_pageable = TRUE;
18320 vmk_flags.vmkf_copy_same_map = TRUE;
18321 assert(adjusted_size != 0);
18322 cur_prot = VM_PROT_NONE; /* legacy mode */
18323 max_prot = VM_PROT_NONE; /* legacy mode */
18324 kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
18325 FALSE /* copy */,
18326 ©_map,
18327 &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
18328 vmk_flags);
18329 if (kr != KERN_SUCCESS) {
18330 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
18331 //assert(0);
18332 *phys_size = 0;
18333 return kr;
18334 }
18335 assert(copy_map != VM_MAP_COPY_NULL);
18336 target_copy_map = copy_map;
18337 DEBUG4K_ADJUST("adjusting...\n");
18338 kr = vm_map_copy_adjust_to_target(
18339 copy_map,
18340 start - adjusted_start, /* offset */
18341 size, /* size */
18342 kernel_map,
18343 FALSE, /* copy */
18344 &target_copy_map,
18345 &overmap_start,
18346 &overmap_end,
18347 &trimmed_start);
18348 if (kr == KERN_SUCCESS) {
18349 if (target_copy_map->size != *phys_size) {
18350 DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
18351 }
18352 *phys_size = target_copy_map->size;
18353 } else {
18354 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
18355 //assert(0);
18356 *phys_size = 0;
18357 }
18358 vm_map_copy_discard(copy_map);
18359 copy_map = VM_MAP_COPY_NULL;
18360
18361 return kr;
18362 }
18363
18364
/*
 * Check whether the named-entry "port"'s backing copy map would need
 * page-size adjustment (over-mapping) to be mapped into "src_map".
 * On return, *overmap_start / *overmap_end report any over-mapping
 * that vm_map_copy_adjust_to_target() computed; they are only written
 * when src_map's page size is smaller than the kernel's.
 */
kern_return_t
memory_entry_check_for_adjustment(
	vm_map_t        src_map,
	ipc_port_t      port,
	vm_map_offset_t *overmap_start,
	vm_map_offset_t *overmap_end)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_copy_t copy_map = VM_MAP_COPY_NULL, target_copy_map = VM_MAP_COPY_NULL;

	assert(port);
	assertf(ip_kotype(port) == IKOT_NAMED_ENTRY, "Port Type expected: %d...received:%d\n", IKOT_NAMED_ENTRY, ip_kotype(port));

	vm_named_entry_t        named_entry;

	named_entry = mach_memory_entry_from_port(port);
	/* hold the named entry locked while inspecting its backing copy */
	named_entry_lock(named_entry);
	copy_map = named_entry->backing.copy;
	target_copy_map = copy_map;

	if (src_map && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT) {
		vm_map_offset_t trimmed_start;

		trimmed_start = 0;
		DEBUG4K_ADJUST("adjusting...\n");
		kr = vm_map_copy_adjust_to_target(
			copy_map,
			0,                      /* offset */
			copy_map->size,         /* size */
			src_map,
			FALSE,                  /* copy */
			&target_copy_map,
			overmap_start,
			overmap_end,
			&trimmed_start);
		/* whole-range adjustment should never trim the start */
		assert(trimmed_start == 0);
		/*
		 * NOTE(review): if the adjustment allocated a new
		 * "target_copy_map" (different from "copy_map"), it is not
		 * released here — presumably only kr and the overmap sizes
		 * matter to callers; verify this is not a leak.
		 */
	}
	named_entry_unlock(named_entry);

	return kr;
}
18406
18407
18408 /*
18409 * Routine: vm_remap
18410 *
18411 * Map portion of a task's address space.
18412 * Mapped region must not overlap more than
18413 * one vm memory object. Protections and
18414 * inheritance attributes remain the same
18415 * as in the original task and are out parameters.
18416 * Source and Target task can be identical
18417 * Other attributes are identical as for vm_map()
18418 */
18419 kern_return_t
vm_map_remap(vm_map_t target_map,vm_map_address_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_t src_map,vm_map_offset_t memory_address,boolean_t copy,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance)18420 vm_map_remap(
18421 vm_map_t target_map,
18422 vm_map_address_t *address,
18423 vm_map_size_t size,
18424 vm_map_offset_t mask,
18425 vm_map_kernel_flags_t vmk_flags,
18426 vm_map_t src_map,
18427 vm_map_offset_t memory_address,
18428 boolean_t copy,
18429 vm_prot_t *cur_protection, /* IN/OUT */
18430 vm_prot_t *max_protection, /* IN/OUT */
18431 vm_inherit_t inheritance)
18432 {
18433 kern_return_t result;
18434 vm_map_entry_t entry;
18435 vm_map_entry_t insp_entry = VM_MAP_ENTRY_NULL;
18436 vm_map_entry_t new_entry;
18437 vm_map_copy_t copy_map;
18438 vm_map_offset_t offset_in_mapping;
18439 vm_map_size_t target_size = 0;
18440 vm_map_size_t src_page_mask, target_page_mask;
18441 vm_map_offset_t overmap_start, overmap_end, trimmed_start;
18442 vm_map_offset_t initial_memory_address;
18443 vm_map_size_t initial_size;
18444 VM_MAP_ZAP_DECLARE(zap_list);
18445
18446 if (target_map == VM_MAP_NULL) {
18447 return KERN_INVALID_ARGUMENT;
18448 }
18449
18450 initial_memory_address = memory_address;
18451 initial_size = size;
18452 src_page_mask = VM_MAP_PAGE_MASK(src_map);
18453 target_page_mask = VM_MAP_PAGE_MASK(target_map);
18454
18455 switch (inheritance) {
18456 case VM_INHERIT_NONE:
18457 case VM_INHERIT_COPY:
18458 case VM_INHERIT_SHARE:
18459 if (size != 0 && src_map != VM_MAP_NULL) {
18460 break;
18461 }
18462 OS_FALLTHROUGH;
18463 default:
18464 return KERN_INVALID_ARGUMENT;
18465 }
18466
18467 if (src_page_mask != target_page_mask) {
18468 if (copy) {
18469 DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
18470 } else {
18471 DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
18472 }
18473 }
18474
18475 /*
18476 * If the user is requesting that we return the address of the
18477 * first byte of the data (rather than the base of the page),
18478 * then we use different rounding semantics: specifically,
18479 * we assume that (memory_address, size) describes a region
18480 * all of whose pages we must cover, rather than a base to be truncated
18481 * down and a size to be added to that base. So we figure out
18482 * the highest page that the requested region includes and make
18483 * sure that the size will cover it.
18484 *
18485 * The key example we're worried about it is of the form:
18486 *
18487 * memory_address = 0x1ff0, size = 0x20
18488 *
18489 * With the old semantics, we round down the memory_address to 0x1000
18490 * and round up the size to 0x1000, resulting in our covering *only*
18491 * page 0x1000. With the new semantics, we'd realize that the region covers
18492 * 0x1ff0-0x2010, and compute a size of 0x2000. Thus, we cover both page
18493 * 0x1000 and page 0x2000 in the region we remap.
18494 */
18495 if (vmk_flags.vmf_return_data_addr) {
18496 vm_map_offset_t range_start, range_end;
18497
18498 range_start = vm_map_trunc_page(memory_address, src_page_mask);
18499 range_end = vm_map_round_page(memory_address + size, src_page_mask);
18500 memory_address = range_start;
18501 size = range_end - range_start;
18502 offset_in_mapping = initial_memory_address - memory_address;
18503 } else {
18504 /*
18505 * IMPORTANT:
18506 * This legacy code path is broken: for the range mentioned
18507 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
18508 * two 4k pages, it yields [ memory_address = 0x1000,
18509 * size = 0x1000 ], which covers only the first 4k page.
18510 * BUT some code unfortunately depends on this bug, so we
18511 * can't fix it without breaking something.
18512 * New code should get automatically opted in the new
18513 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flags.
18514 */
18515 offset_in_mapping = 0;
18516 memory_address = vm_map_trunc_page(memory_address, src_page_mask);
18517 size = vm_map_round_page(size, src_page_mask);
18518 initial_memory_address = memory_address;
18519 initial_size = size;
18520 }
18521
18522
18523 if (size == 0) {
18524 return KERN_INVALID_ARGUMENT;
18525 }
18526
18527 if (vmk_flags.vmf_resilient_media) {
18528 /* must be copy-on-write to be "media resilient" */
18529 if (!copy) {
18530 return KERN_INVALID_ARGUMENT;
18531 }
18532 }
18533
18534 vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
18535 vmk_flags.vmkf_copy_same_map = (src_map == target_map);
18536
18537 assert(size != 0);
18538 result = vm_map_copy_extract(src_map,
18539 memory_address,
18540 size,
18541 copy, ©_map,
18542 cur_protection, /* IN/OUT */
18543 max_protection, /* IN/OUT */
18544 inheritance,
18545 vmk_flags);
18546 if (result != KERN_SUCCESS) {
18547 return result;
18548 }
18549 assert(copy_map != VM_MAP_COPY_NULL);
18550
18551 /*
18552 * Handle the policy for vm map ranges
18553 *
18554 * If the maps differ, the target_map policy applies like for vm_map()
18555 * For same mapping remaps, we preserve the range.
18556 */
18557 if (vmk_flags.vmkf_copy_same_map) {
18558 vmk_flags.vmkf_range_id = copy_map->orig_range;
18559 } else {
18560 vm_map_kernel_flags_update_range_id(&vmk_flags, target_map);
18561 }
18562
18563 overmap_start = 0;
18564 overmap_end = 0;
18565 trimmed_start = 0;
18566 target_size = size;
18567 if (src_page_mask != target_page_mask) {
18568 vm_map_copy_t target_copy_map;
18569
18570 target_copy_map = copy_map; /* can modify "copy_map" itself */
18571 DEBUG4K_ADJUST("adjusting...\n");
18572 result = vm_map_copy_adjust_to_target(
18573 copy_map,
18574 offset_in_mapping, /* offset */
18575 initial_size,
18576 target_map,
18577 copy,
18578 &target_copy_map,
18579 &overmap_start,
18580 &overmap_end,
18581 &trimmed_start);
18582 if (result != KERN_SUCCESS) {
18583 DEBUG4K_COPY("failed to adjust 0x%x\n", result);
18584 vm_map_copy_discard(copy_map);
18585 return result;
18586 }
18587 if (trimmed_start == 0) {
18588 /* nothing trimmed: no adjustment needed */
18589 } else if (trimmed_start >= offset_in_mapping) {
18590 /* trimmed more than offset_in_mapping: nothing left */
18591 assert(overmap_start == 0);
18592 assert(overmap_end == 0);
18593 offset_in_mapping = 0;
18594 } else {
18595 /* trimmed some of offset_in_mapping: adjust */
18596 assert(overmap_start == 0);
18597 assert(overmap_end == 0);
18598 offset_in_mapping -= trimmed_start;
18599 }
18600 offset_in_mapping += overmap_start;
18601 target_size = target_copy_map->size;
18602 }
18603
18604 /*
18605 * Allocate/check a range of free virtual address
18606 * space for the target
18607 */
18608 *address = vm_map_trunc_page(*address, target_page_mask);
18609 vm_map_lock(target_map);
18610 target_size = vm_map_round_page(target_size, target_page_mask);
18611 result = vm_map_remap_range_allocate(target_map, address,
18612 target_size, mask, vmk_flags,
18613 &insp_entry, &zap_list);
18614
18615 for (entry = vm_map_copy_first_entry(copy_map);
18616 entry != vm_map_copy_to_entry(copy_map);
18617 entry = new_entry) {
18618 new_entry = entry->vme_next;
18619 vm_map_copy_entry_unlink(copy_map, entry);
18620 if (result == KERN_SUCCESS) {
18621 if (vmk_flags.vmkf_remap_prot_copy) {
18622 /*
18623 * This vm_map_remap() is for a
18624 * vm_protect(VM_PROT_COPY), so the caller
18625 * expects to be allowed to add write access
18626 * to this new mapping. This is done by
18627 * adding VM_PROT_WRITE to each entry's
18628 * max_protection... unless some security
18629 * settings disallow it.
18630 */
18631 bool allow_write = false;
18632 if (entry->vme_permanent) {
18633 /* immutable mapping... */
18634 if ((entry->max_protection & VM_PROT_EXECUTE) &&
18635 developer_mode_state()) {
18636 /*
18637 * ... but executable and
18638 * possibly being debugged,
18639 * so let's allow it to become
18640 * writable, for breakpoints
18641 * and dtrace probes, for
18642 * example.
18643 */
18644 allow_write = true;
18645 } else {
18646 printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
18647 proc_selfpid(),
18648 (get_bsdtask_info(current_task())
18649 ? proc_name_address(get_bsdtask_info(current_task()))
18650 : "?"),
18651 (uint64_t)memory_address,
18652 (uint64_t)size,
18653 entry->protection,
18654 entry->max_protection,
18655 developer_mode_state());
18656 DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
18657 vm_map_entry_t, entry,
18658 vm_map_offset_t, entry->vme_start,
18659 vm_map_offset_t, entry->vme_end,
18660 vm_prot_t, entry->protection,
18661 vm_prot_t, entry->max_protection,
18662 int, VME_ALIAS(entry));
18663 }
18664 } else {
18665 allow_write = true;
18666 }
18667
18668 /*
18669 * VM_PROT_COPY: allow this mapping to become
18670 * writable, unless it was "permanent".
18671 */
18672 if (allow_write) {
18673 entry->max_protection |= VM_PROT_WRITE;
18674 }
18675 }
18676 if (vmk_flags.vmf_resilient_codesign) {
18677 /* no codesigning -> read-only access */
18678 entry->max_protection = VM_PROT_READ;
18679 entry->protection = VM_PROT_READ;
18680 entry->vme_resilient_codesign = TRUE;
18681 }
18682 entry->vme_start += *address;
18683 entry->vme_end += *address;
18684 assert(!entry->map_aligned);
18685 if (vmk_flags.vmf_resilient_media &&
18686 !entry->is_sub_map &&
18687 (VME_OBJECT(entry) == VM_OBJECT_NULL ||
18688 VME_OBJECT(entry)->internal)) {
18689 entry->vme_resilient_media = TRUE;
18690 }
18691 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
18692 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
18693 assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
18694 vm_map_store_entry_link(target_map, insp_entry, entry,
18695 vmk_flags);
18696 insp_entry = entry;
18697 } else {
18698 if (!entry->is_sub_map) {
18699 vm_object_deallocate(VME_OBJECT(entry));
18700 } else {
18701 vm_map_deallocate(VME_SUBMAP(entry));
18702 }
18703 vm_map_copy_entry_dispose(entry);
18704 }
18705 }
18706
18707 if (vmk_flags.vmf_resilient_codesign) {
18708 *cur_protection = VM_PROT_READ;
18709 *max_protection = VM_PROT_READ;
18710 }
18711
18712 if (result == KERN_SUCCESS) {
18713 target_map->size += target_size;
18714 SAVE_HINT_MAP_WRITE(target_map, insp_entry);
18715 }
18716 vm_map_unlock(target_map);
18717
18718 vm_map_zap_dispose(&zap_list);
18719
18720 if (result == KERN_SUCCESS && target_map->wiring_required) {
18721 result = vm_map_wire_kernel(target_map, *address,
18722 *address + size, *cur_protection, VM_KERN_MEMORY_MLOCK,
18723 TRUE);
18724 }
18725
18726 /*
18727 * If requested, return the address of the data pointed to by the
18728 * request, rather than the base of the resulting page.
18729 */
18730 if (vmk_flags.vmf_return_data_addr) {
18731 *address += offset_in_mapping;
18732 }
18733
18734 if (src_page_mask != target_page_mask) {
18735 DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)size, copy, target_map, (uint64_t)*address, (uint64_t)offset_in_mapping, result);
18736 }
18737 vm_map_copy_discard(copy_map);
18738 copy_map = VM_MAP_COPY_NULL;
18739
18740 return result;
18741 }
18742
18743 /*
18744 * Routine: vm_map_remap_range_allocate
18745 *
18746 * Description:
18747 * Allocate a range in the specified virtual address map.
18748 * returns the address and the map entry just before the allocated
18749 * range
18750 *
18751 * Map must be locked.
18752 */
18753
18754 static kern_return_t
vm_map_remap_range_allocate(vm_map_t map,vm_map_address_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_entry_t * map_entry,vm_map_zap_t zap_list)18755 vm_map_remap_range_allocate(
18756 vm_map_t map,
18757 vm_map_address_t *address, /* IN/OUT */
18758 vm_map_size_t size,
18759 vm_map_offset_t mask,
18760 vm_map_kernel_flags_t vmk_flags,
18761 vm_map_entry_t *map_entry, /* OUT */
18762 vm_map_zap_t zap_list)
18763 {
18764 vm_map_entry_t entry;
18765 vm_map_offset_t start;
18766 kern_return_t kr;
18767
18768 start = *address;
18769
18770 if (!vmk_flags.vmf_fixed) {
18771 kr = vm_map_locate_space(map, size, mask, vmk_flags,
18772 &start, &entry);
18773 if (kr != KERN_SUCCESS) {
18774 return kr;
18775 }
18776 *address = start;
18777 } else {
18778 vm_map_offset_t effective_min_offset, effective_max_offset;
18779 vm_map_entry_t temp_entry;
18780 vm_map_offset_t end;
18781
18782 effective_min_offset = map->min_offset;
18783 effective_max_offset = map->max_offset;
18784
18785 /*
18786 * Verify that:
18787 * the address doesn't itself violate
18788 * the mask requirement.
18789 */
18790
18791 if ((start & mask) != 0) {
18792 return KERN_NO_SPACE;
18793 }
18794
18795 #if CONFIG_MAP_RANGES
18796 if (map->uses_user_ranges) {
18797 struct mach_vm_range r;
18798
18799 vm_map_user_range_resolve(map, start, 1, &r);
18800 if (r.max_address == 0) {
18801 return KERN_INVALID_ADDRESS;
18802 }
18803
18804 effective_min_offset = r.min_address;
18805 effective_max_offset = r.max_address;
18806 }
18807 #endif /* CONFIG_MAP_RANGES */
18808 if (map == kernel_map) {
18809 mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
18810 effective_min_offset = r->min_address;
18811 effective_min_offset = r->max_address;
18812 }
18813
18814 /*
18815 * ... the address is within bounds
18816 */
18817
18818 end = start + size;
18819
18820 if ((start < effective_min_offset) ||
18821 (end > effective_max_offset) ||
18822 (start >= end)) {
18823 return KERN_INVALID_ADDRESS;
18824 }
18825
18826 /*
18827 * If we're asked to overwrite whatever was mapped in that
18828 * range, first deallocate that range.
18829 */
18830 if (vmk_flags.vmf_overwrite) {
18831 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
18832
18833 /*
18834 * We use a "zap_list" to avoid having to unlock
18835 * the "map" in vm_map_delete(), which would compromise
18836 * the atomicity of the "deallocate" and then "remap"
18837 * combination.
18838 */
18839 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
18840
18841 if (vmk_flags.vmkf_overwrite_immutable) {
18842 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
18843 }
18844 if (vmk_flags.vmkf_remap_prot_copy) {
18845 remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
18846 }
18847 kr = vm_map_delete(map, start, end, remove_flags,
18848 KMEM_GUARD_NONE, zap_list).kmr_return;
18849 if (kr != KERN_SUCCESS) {
18850 /* XXX FBDP restore zap_list? */
18851 return kr;
18852 }
18853 }
18854
18855 /*
18856 * ... the starting address isn't allocated
18857 */
18858
18859 if (vm_map_lookup_entry(map, start, &temp_entry)) {
18860 return KERN_NO_SPACE;
18861 }
18862
18863 entry = temp_entry;
18864
18865 /*
18866 * ... the next region doesn't overlap the
18867 * end point.
18868 */
18869
18870 if ((entry->vme_next != vm_map_to_entry(map)) &&
18871 (entry->vme_next->vme_start < end)) {
18872 return KERN_NO_SPACE;
18873 }
18874 }
18875 *map_entry = entry;
18876 return KERN_SUCCESS;
18877 }
18878
18879 /*
18880 * vm_map_switch:
18881 *
18882 * Set the address map for the current thread to the specified map
18883 */
18884
18885 vm_map_t
vm_map_switch(vm_map_t map)18886 vm_map_switch(
18887 vm_map_t map)
18888 {
18889 thread_t thread = current_thread();
18890 vm_map_t oldmap = thread->map;
18891
18892
18893 /*
18894 * Deactivate the current map and activate the requested map
18895 */
18896 mp_disable_preemption();
18897 PMAP_SWITCH_USER(thread, map, cpu_number());
18898 mp_enable_preemption();
18899 return oldmap;
18900 }
18901
18902
18903 /*
18904 * Routine: vm_map_write_user
18905 *
18906 * Description:
18907 * Copy out data from a kernel space into space in the
18908 * destination map. The space must already exist in the
18909 * destination map.
18910 * NOTE: This routine should only be called by threads
18911 * which can block on a page fault. i.e. kernel mode user
18912 * threads.
18913 *
18914 */
18915 kern_return_t
vm_map_write_user(vm_map_t map,void * src_p,vm_map_address_t dst_addr,vm_size_t size)18916 vm_map_write_user(
18917 vm_map_t map,
18918 void *src_p,
18919 vm_map_address_t dst_addr,
18920 vm_size_t size)
18921 {
18922 kern_return_t kr = KERN_SUCCESS;
18923
18924 if (current_map() == map) {
18925 if (copyout(src_p, dst_addr, size)) {
18926 kr = KERN_INVALID_ADDRESS;
18927 }
18928 } else {
18929 vm_map_t oldmap;
18930
18931 /* take on the identity of the target map while doing */
18932 /* the transfer */
18933
18934 vm_map_reference(map);
18935 oldmap = vm_map_switch(map);
18936 if (copyout(src_p, dst_addr, size)) {
18937 kr = KERN_INVALID_ADDRESS;
18938 }
18939 vm_map_switch(oldmap);
18940 vm_map_deallocate(map);
18941 }
18942 return kr;
18943 }
18944
18945 /*
18946 * Routine: vm_map_read_user
18947 *
18948 * Description:
18949 * Copy in data from a user space source map into the
18950 * kernel map. The space must already exist in the
18951 * kernel map.
18952 * NOTE: This routine should only be called by threads
18953 * which can block on a page fault. i.e. kernel mode user
18954 * threads.
18955 *
18956 */
18957 kern_return_t
vm_map_read_user(vm_map_t map,vm_map_address_t src_addr,void * dst_p,vm_size_t size)18958 vm_map_read_user(
18959 vm_map_t map,
18960 vm_map_address_t src_addr,
18961 void *dst_p,
18962 vm_size_t size)
18963 {
18964 kern_return_t kr = KERN_SUCCESS;
18965
18966 if (current_map() == map) {
18967 if (copyin(src_addr, dst_p, size)) {
18968 kr = KERN_INVALID_ADDRESS;
18969 }
18970 } else {
18971 vm_map_t oldmap;
18972
18973 /* take on the identity of the target map while doing */
18974 /* the transfer */
18975
18976 vm_map_reference(map);
18977 oldmap = vm_map_switch(map);
18978 if (copyin(src_addr, dst_p, size)) {
18979 kr = KERN_INVALID_ADDRESS;
18980 }
18981 vm_map_switch(oldmap);
18982 vm_map_deallocate(map);
18983 }
18984 return kr;
18985 }
18986
18987
18988 /*
18989 * vm_map_check_protection:
18990 *
18991 * Assert that the target map allows the specified
18992 * privilege on the entire address region given.
18993 * The entire region must be allocated.
18994 */
18995 boolean_t
vm_map_check_protection(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t protection)18996 vm_map_check_protection(vm_map_t map, vm_map_offset_t start,
18997 vm_map_offset_t end, vm_prot_t protection)
18998 {
18999 vm_map_entry_t entry;
19000 vm_map_entry_t tmp_entry;
19001
19002 vm_map_lock(map);
19003
19004 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
19005 vm_map_unlock(map);
19006 return FALSE;
19007 }
19008
19009 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19010 vm_map_unlock(map);
19011 return FALSE;
19012 }
19013
19014 entry = tmp_entry;
19015
19016 while (start < end) {
19017 if (entry == vm_map_to_entry(map)) {
19018 vm_map_unlock(map);
19019 return FALSE;
19020 }
19021
19022 /*
19023 * No holes allowed!
19024 */
19025
19026 if (start < entry->vme_start) {
19027 vm_map_unlock(map);
19028 return FALSE;
19029 }
19030
19031 /*
19032 * Check protection associated with entry.
19033 */
19034
19035 if ((entry->protection & protection) != protection) {
19036 vm_map_unlock(map);
19037 return FALSE;
19038 }
19039
19040 /* go to next entry */
19041
19042 start = entry->vme_end;
19043 entry = entry->vme_next;
19044 }
19045 vm_map_unlock(map);
19046 return TRUE;
19047 }
19048
/*
 *	vm_map_purgable_control:
 *
 *	Apply a purgeability operation ("control": set state, get state,
 *	set state from kernel, or purge-all) to the VM object mapped at
 *	"address" in "map".  "state" is IN for the set operations and OUT
 *	for get.  Returns KERN_SUCCESS, or an error if the parameters,
 *	the mapping, or the object's current state are unsuitable.
 */
kern_return_t
vm_map_purgable_control(
	vm_map_t                map,
	vm_map_offset_t         address,
	vm_purgable_t           control,
	int                     *state)
{
	vm_map_entry_t          entry;
	vm_object_t             object;
	kern_return_t           kr;
	boolean_t               was_nonvolatile;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlying object. Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (control != VM_PURGABLE_SET_STATE &&
	    control != VM_PURGABLE_GET_STATE &&
	    control != VM_PURGABLE_PURGE_ALL &&
	    control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (control == VM_PURGABLE_PURGE_ALL) {
		/* purge-all operates globally; no address lookup needed */
		vm_purgeable_object_purge_all();
		return KERN_SUCCESS;
	}

	/* for set operations, the requested state must use only valid bits */
	if ((control == VM_PURGABLE_SET_STATE ||
	    control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
	    (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
	    ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
		return KERN_INVALID_ARGUMENT;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->protection & VM_PROT_WRITE) == 0 &&
	    control != VM_PURGABLE_GET_STATE) {
		/*
		 * Can't apply purgable controls to something you can't write.
		 */
		vm_map_unlock_read(map);
		return KERN_PROTECTION_FAILURE;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL ||
	    object->purgable == VM_PURGABLE_DENY) {
		/*
		 * Object must already be present and be purgeable.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	vm_object_lock(object);

#if 00
	if (VME_OFFSET(entry) != 0 ||
	    entry->vme_end - entry->vme_start != object->vo_size) {
		/*
		 * Can only apply purgable controls to the whole (existing)
		 * object at once.
		 */
		vm_map_unlock_read(map);
		vm_object_unlock(object);
		return KERN_INVALID_ARGUMENT;
	}
#endif

	assert(!entry->is_sub_map);
	assert(!entry->use_pmap); /* purgeable has its own accounting */

	/* object lock is now held, so the map read lock can be dropped */
	vm_map_unlock_read(map);

	was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);

	kr = vm_object_purgable_control(object, control, state);

	if (was_nonvolatile &&
	    object->purgable != VM_PURGABLE_NONVOLATILE &&
	    map->pmap == kernel_pmap) {
#if DEBUG
		/* remember who volatilized this object, for debugging */
		object->vo_purgeable_volatilizer = kernel_task;
#endif /* DEBUG */
	}

	vm_object_unlock(object);

	return kr;
}
19153
/*
 *	vm_map_footprint_query_page_info:
 *
 *	Compute the footprint disposition (VM_PAGE_QUERY_* flags) of the
 *	page at "curr_s_offset" within "map_entry", combining what the
 *	pmap reports with the accounting attributes of the backing VM
 *	object (purgeability, ownership, IOKit accounting, "alternate
 *	accounting").  The result is returned via "disposition_p".
 *
 *	The map must be locked and must not rely on corpse footprint data.
 */
void
vm_map_footprint_query_page_info(
	vm_map_t        map,
	vm_map_entry_t  map_entry,
	vm_map_offset_t curr_s_offset,
	int             *disposition_p)
{
	int             pmap_disp;
	vm_object_t     object = VM_OBJECT_NULL;
	int             disposition;
	int             effective_page_size;

	vm_map_lock_assert_held(map);
	assert(!map->has_corpse_footprint);
	assert(curr_s_offset >= map_entry->vme_start);
	assert(curr_s_offset < map_entry->vme_end);

	if (map_entry->is_sub_map) {
		if (!map_entry->use_pmap) {
			/* nested pmap: no footprint */
			*disposition_p = 0;
			return;
		}
	} else {
		object = VME_OBJECT(map_entry);
		if (object == VM_OBJECT_NULL) {
			/* nothing mapped here: no need to ask */
			*disposition_p = 0;
			return;
		}
	}

	/* page size to use for object-offset arithmetic below */
	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

	pmap_disp = 0;

	/*
	 * Query the pmap.
	 */
	pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);

	/*
	 * Compute this page's disposition.
	 */
	disposition = 0;

	/* deal with "alternate accounting" first */
	if (!map_entry->is_sub_map &&
	    object->vo_no_footprint) {
		/* does not count in footprint */
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
	} else if (!map_entry->is_sub_map &&
	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
	    (object->purgable == VM_PURGABLE_DENY &&
	    object->vo_ledger_tag)) &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    (object->resident_page_count +
		    vm_compressor_pager_get_count(object->pager)))) {
			/*
			 * Non-volatile purgeable object owned
			 * by this task: report the first
			 * "#resident + #compressed" pages as
			 * "resident" (to show that they
			 * contribute to the footprint) but not
			 * "dirty" (to avoid double-counting
			 * with the fake "non-volatile" region
			 * we'll report at the end of the
			 * address space to account for all
			 * (mapped or not) non-volatile memory
			 * owned by this task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    (object->purgable == VM_PURGABLE_VOLATILE ||
	    object->purgable == VM_PURGABLE_EMPTY) &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    object->wired_page_count)) {
			/*
			 * Volatile|empty purgeable object owned
			 * by this task: report the first
			 * "#wired" pages as "resident" (to
			 * show that they contribute to the
			 * footprint) but not "dirty" (to avoid
			 * double-counting with the fake
			 * "non-volatile" region we'll report
			 * at the end of the address space to
			 * account for all (mapped or not)
			 * non-volatile memory owned by this
			 * task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    map_entry->iokit_acct &&
	    object->internal &&
	    object->purgable == VM_PURGABLE_DENY) {
		/*
		 * Non-purgeable IOKit memory: phys_footprint
		 * includes the entire virtual mapping.
		 */
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
	} else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
	    PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
		/* alternate accounting */
#if __arm64__ && (DEVELOPMENT || DEBUG)
		if (map->pmap->footprint_was_suspended) {
			/*
			 * The assertion below can fail if dyld
			 * suspended footprint accounting
			 * while doing some adjustments to
			 * this page; the mapping would say
			 * "use pmap accounting" but the page
			 * would be marked "alternate
			 * accounting".
			 */
		} else
#endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
		{
			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		}
		/* alternate accounting: page does not count here */
		disposition = 0;
	} else {
		/* normal case: trust what the pmap reported */
		if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
			disposition |= VM_PAGE_QUERY_PAGE_REF;
			if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
			} else {
				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
			}
			if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
				disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
			}
		} else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
			disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
		}
	}

	*disposition_p = disposition;
}
19311
19312 kern_return_t
vm_map_page_query_internal(vm_map_t target_map,vm_map_offset_t offset,int * disposition,int * ref_count)19313 vm_map_page_query_internal(
19314 vm_map_t target_map,
19315 vm_map_offset_t offset,
19316 int *disposition,
19317 int *ref_count)
19318 {
19319 kern_return_t kr;
19320 vm_page_info_basic_data_t info;
19321 mach_msg_type_number_t count;
19322
19323 count = VM_PAGE_INFO_BASIC_COUNT;
19324 kr = vm_map_page_info(target_map,
19325 offset,
19326 VM_PAGE_INFO_BASIC,
19327 (vm_page_info_t) &info,
19328 &count);
19329 if (kr == KERN_SUCCESS) {
19330 *disposition = info.disposition;
19331 *ref_count = info.ref_count;
19332 } else {
19333 *disposition = 0;
19334 *ref_count = 0;
19335 }
19336
19337 return kr;
19338 }
19339
19340 kern_return_t
vm_map_page_info(vm_map_t map,vm_map_offset_t offset,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)19341 vm_map_page_info(
19342 vm_map_t map,
19343 vm_map_offset_t offset,
19344 vm_page_info_flavor_t flavor,
19345 vm_page_info_t info,
19346 mach_msg_type_number_t *count)
19347 {
19348 return vm_map_page_range_info_internal(map,
19349 offset, /* start of range */
19350 (offset + 1), /* this will get rounded in the call to the page boundary */
19351 (int)-1, /* effective_page_shift: unspecified */
19352 flavor,
19353 info,
19354 count);
19355 }
19356
19357 kern_return_t
vm_map_page_range_info_internal(vm_map_t map,vm_map_offset_t start_offset,vm_map_offset_t end_offset,int effective_page_shift,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)19358 vm_map_page_range_info_internal(
19359 vm_map_t map,
19360 vm_map_offset_t start_offset,
19361 vm_map_offset_t end_offset,
19362 int effective_page_shift,
19363 vm_page_info_flavor_t flavor,
19364 vm_page_info_t info,
19365 mach_msg_type_number_t *count)
19366 {
19367 vm_map_entry_t map_entry = VM_MAP_ENTRY_NULL;
19368 vm_object_t object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
19369 vm_page_t m = VM_PAGE_NULL;
19370 kern_return_t retval = KERN_SUCCESS;
19371 int disposition = 0;
19372 int ref_count = 0;
19373 int depth = 0, info_idx = 0;
19374 vm_page_info_basic_t basic_info = 0;
19375 vm_map_offset_t offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
19376 vm_map_offset_t start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
19377 boolean_t do_region_footprint;
19378 ledger_amount_t ledger_resident, ledger_compressed;
19379 int effective_page_size;
19380 vm_map_offset_t effective_page_mask;
19381
19382 switch (flavor) {
19383 case VM_PAGE_INFO_BASIC:
19384 if (*count != VM_PAGE_INFO_BASIC_COUNT) {
19385 /*
19386 * The "vm_page_info_basic_data" structure was not
19387 * properly padded, so allow the size to be off by
19388 * one to maintain backwards binary compatibility...
19389 */
19390 if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
19391 return KERN_INVALID_ARGUMENT;
19392 }
19393 }
19394 break;
19395 default:
19396 return KERN_INVALID_ARGUMENT;
19397 }
19398
19399 if (effective_page_shift == -1) {
19400 effective_page_shift = vm_self_region_page_shift_safely(map);
19401 if (effective_page_shift == -1) {
19402 return KERN_INVALID_ARGUMENT;
19403 }
19404 }
19405 effective_page_size = (1 << effective_page_shift);
19406 effective_page_mask = effective_page_size - 1;
19407
19408 do_region_footprint = task_self_region_footprint();
19409 disposition = 0;
19410 ref_count = 0;
19411 depth = 0;
19412 info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
19413 retval = KERN_SUCCESS;
19414
19415 offset_in_page = start_offset & effective_page_mask;
19416 start = vm_map_trunc_page(start_offset, effective_page_mask);
19417 end = vm_map_round_page(end_offset, effective_page_mask);
19418
19419 if (end < start) {
19420 return KERN_INVALID_ARGUMENT;
19421 }
19422
19423 assert((end - start) <= MAX_PAGE_RANGE_QUERY);
19424
19425 vm_map_lock_read(map);
19426
19427 task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
19428
19429 for (curr_s_offset = start; curr_s_offset < end;) {
19430 /*
19431 * New lookup needs reset of these variables.
19432 */
19433 curr_object = object = VM_OBJECT_NULL;
19434 offset_in_object = 0;
19435 ref_count = 0;
19436 depth = 0;
19437
19438 if (do_region_footprint &&
19439 curr_s_offset >= vm_map_last_entry(map)->vme_end) {
19440 /*
19441 * Request for "footprint" info about a page beyond
19442 * the end of address space: this must be for
19443 * the fake region vm_map_region_recurse_64()
19444 * reported to account for non-volatile purgeable
19445 * memory owned by this task.
19446 */
19447 disposition = 0;
19448
19449 if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
19450 (unsigned) ledger_compressed) {
19451 /*
19452 * We haven't reported all the "non-volatile
19453 * compressed" pages yet, so report this fake
19454 * page as "compressed".
19455 */
19456 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
19457 } else {
19458 /*
19459 * We've reported all the non-volatile
19460 * compressed page but not all the non-volatile
19461 * pages , so report this fake page as
19462 * "resident dirty".
19463 */
19464 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19465 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19466 disposition |= VM_PAGE_QUERY_PAGE_REF;
19467 }
19468 switch (flavor) {
19469 case VM_PAGE_INFO_BASIC:
19470 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19471 basic_info->disposition = disposition;
19472 basic_info->ref_count = 1;
19473 basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
19474 basic_info->offset = 0;
19475 basic_info->depth = 0;
19476
19477 info_idx++;
19478 break;
19479 }
19480 curr_s_offset += effective_page_size;
19481 continue;
19482 }
19483
19484 /*
19485 * First, find the map entry covering "curr_s_offset", going down
19486 * submaps if necessary.
19487 */
19488 if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
19489 /* no entry -> no object -> no page */
19490
19491 if (curr_s_offset < vm_map_min(map)) {
19492 /*
19493 * Illegal address that falls below map min.
19494 */
19495 curr_e_offset = MIN(end, vm_map_min(map));
19496 } else if (curr_s_offset >= vm_map_max(map)) {
19497 /*
19498 * Illegal address that falls on/after map max.
19499 */
19500 curr_e_offset = end;
19501 } else if (map_entry == vm_map_to_entry(map)) {
19502 /*
19503 * Hit a hole.
19504 */
19505 if (map_entry->vme_next == vm_map_to_entry(map)) {
19506 /*
19507 * Empty map.
19508 */
19509 curr_e_offset = MIN(map->max_offset, end);
19510 } else {
19511 /*
19512 * Hole at start of the map.
19513 */
19514 curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
19515 }
19516 } else {
19517 if (map_entry->vme_next == vm_map_to_entry(map)) {
19518 /*
19519 * Hole at the end of the map.
19520 */
19521 curr_e_offset = MIN(map->max_offset, end);
19522 } else {
19523 curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
19524 }
19525 }
19526
19527 assert(curr_e_offset >= curr_s_offset);
19528
19529 uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
19530
19531 void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19532
19533 bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
19534
19535 curr_s_offset = curr_e_offset;
19536
19537 info_idx += num_pages;
19538
19539 continue;
19540 }
19541
19542 /* compute offset from this map entry's start */
19543 offset_in_object = curr_s_offset - map_entry->vme_start;
19544
19545 /* compute offset into this map entry's object (or submap) */
19546 offset_in_object += VME_OFFSET(map_entry);
19547
19548 if (map_entry->is_sub_map) {
19549 vm_map_t sub_map = VM_MAP_NULL;
19550 vm_page_info_t submap_info = 0;
19551 vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
19552
19553 range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
19554
19555 submap_s_offset = offset_in_object;
19556 submap_e_offset = submap_s_offset + range_len;
19557
19558 sub_map = VME_SUBMAP(map_entry);
19559
19560 vm_map_reference(sub_map);
19561 vm_map_unlock_read(map);
19562
19563 submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19564
19565 assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
19566 "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
19567
19568 retval = vm_map_page_range_info_internal(sub_map,
19569 submap_s_offset,
19570 submap_e_offset,
19571 effective_page_shift,
19572 VM_PAGE_INFO_BASIC,
19573 (vm_page_info_t) submap_info,
19574 count);
19575
19576 assert(retval == KERN_SUCCESS);
19577
19578 vm_map_lock_read(map);
19579 vm_map_deallocate(sub_map);
19580
19581 /* Move the "info" index by the number of pages we inspected.*/
19582 info_idx += range_len >> effective_page_shift;
19583
19584 /* Move our current offset by the size of the range we inspected.*/
19585 curr_s_offset += range_len;
19586
19587 continue;
19588 }
19589
19590 object = VME_OBJECT(map_entry);
19591
19592 if (object == VM_OBJECT_NULL) {
19593 /*
19594 * We don't have an object here and, hence,
19595 * no pages to inspect. We'll fill up the
19596 * info structure appropriately.
19597 */
19598
19599 curr_e_offset = MIN(map_entry->vme_end, end);
19600
19601 uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
19602
19603 void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19604
19605 bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
19606
19607 curr_s_offset = curr_e_offset;
19608
19609 info_idx += num_pages;
19610
19611 continue;
19612 }
19613
19614 if (do_region_footprint) {
19615 disposition = 0;
19616 if (map->has_corpse_footprint) {
19617 /*
19618 * Query the page info data we saved
19619 * while forking the corpse.
19620 */
19621 vm_map_corpse_footprint_query_page_info(
19622 map,
19623 curr_s_offset,
19624 &disposition);
19625 } else {
19626 /*
19627 * Query the live pmap for footprint info
19628 * about this page.
19629 */
19630 vm_map_footprint_query_page_info(
19631 map,
19632 map_entry,
19633 curr_s_offset,
19634 &disposition);
19635 }
19636 switch (flavor) {
19637 case VM_PAGE_INFO_BASIC:
19638 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19639 basic_info->disposition = disposition;
19640 basic_info->ref_count = 1;
19641 basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
19642 basic_info->offset = 0;
19643 basic_info->depth = 0;
19644
19645 info_idx++;
19646 break;
19647 }
19648 curr_s_offset += effective_page_size;
19649 continue;
19650 }
19651
19652 vm_object_reference(object);
19653 /*
19654 * Shared mode -- so we can allow other readers
19655 * to grab the lock too.
19656 */
19657 vm_object_lock_shared(object);
19658
19659 curr_e_offset = MIN(map_entry->vme_end, end);
19660
19661 vm_map_unlock_read(map);
19662
19663 map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
19664
19665 curr_object = object;
19666
19667 for (; curr_s_offset < curr_e_offset;) {
19668 if (object == curr_object) {
19669 ref_count = curr_object->ref_count - 1; /* account for our object reference above. */
19670 } else {
19671 ref_count = curr_object->ref_count;
19672 }
19673
19674 curr_offset_in_object = offset_in_object;
19675
19676 for (;;) {
19677 m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
19678
19679 if (m != VM_PAGE_NULL) {
19680 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19681 break;
19682 } else {
19683 if (curr_object->internal &&
19684 curr_object->alive &&
19685 !curr_object->terminating &&
19686 curr_object->pager_ready) {
19687 if (VM_COMPRESSOR_PAGER_STATE_GET(curr_object, vm_object_trunc_page(curr_offset_in_object))
19688 == VM_EXTERNAL_STATE_EXISTS) {
19689 /* the pager has that page */
19690 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
19691 break;
19692 }
19693 }
19694
19695 /*
19696 * Go down the VM object shadow chain until we find the page
19697 * we're looking for.
19698 */
19699
19700 if (curr_object->shadow != VM_OBJECT_NULL) {
19701 vm_object_t shadow = VM_OBJECT_NULL;
19702
19703 curr_offset_in_object += curr_object->vo_shadow_offset;
19704 shadow = curr_object->shadow;
19705
19706 vm_object_lock_shared(shadow);
19707 vm_object_unlock(curr_object);
19708
19709 curr_object = shadow;
19710 depth++;
19711 continue;
19712 } else {
19713 break;
19714 }
19715 }
19716 }
19717
19718 /* The ref_count is not strictly accurate, it measures the number */
19719 /* of entities holding a ref on the object, they may not be mapping */
19720 /* the object or may not be mapping the section holding the */
19721 /* target page but its still a ball park number and though an over- */
19722 /* count, it picks up the copy-on-write cases */
19723
19724 /* We could also get a picture of page sharing from pmap_attributes */
19725 /* but this would under count as only faulted-in mappings would */
19726 /* show up. */
19727
19728 if ((curr_object == object) && curr_object->shadow) {
19729 disposition |= VM_PAGE_QUERY_PAGE_COPIED;
19730 }
19731
19732 if (!curr_object->internal) {
19733 disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
19734 }
19735
19736 if (m != VM_PAGE_NULL) {
19737 if (m->vmp_fictitious) {
19738 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
19739 } else {
19740 if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
19741 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19742 }
19743
19744 if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
19745 disposition |= VM_PAGE_QUERY_PAGE_REF;
19746 }
19747
19748 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
19749 disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
19750 }
19751
19752 /*
19753 * XXX TODO4K:
19754 * when this routine deals with 4k
19755 * pages, check the appropriate CS bit
19756 * here.
19757 */
19758 if (m->vmp_cs_validated) {
19759 disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
19760 }
19761 if (m->vmp_cs_tainted) {
19762 disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
19763 }
19764 if (m->vmp_cs_nx) {
19765 disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
19766 }
19767 if (m->vmp_reusable || curr_object->all_reusable) {
19768 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
19769 }
19770 }
19771 }
19772
19773 switch (flavor) {
19774 case VM_PAGE_INFO_BASIC:
19775 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19776 basic_info->disposition = disposition;
19777 basic_info->ref_count = ref_count;
19778 basic_info->object_id = (vm_object_id_t) (uintptr_t)
19779 VM_KERNEL_ADDRPERM(curr_object);
19780 basic_info->offset =
19781 (memory_object_offset_t) curr_offset_in_object + offset_in_page;
19782 basic_info->depth = depth;
19783
19784 info_idx++;
19785 break;
19786 }
19787
19788 disposition = 0;
19789 offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
19790
19791 /*
19792 * Move to next offset in the range and in our object.
19793 */
19794 curr_s_offset += effective_page_size;
19795 offset_in_object += effective_page_size;
19796 curr_offset_in_object = offset_in_object;
19797
19798 if (curr_object != object) {
19799 vm_object_unlock(curr_object);
19800
19801 curr_object = object;
19802
19803 vm_object_lock_shared(curr_object);
19804 } else {
19805 vm_object_lock_yield_shared(curr_object);
19806 }
19807 }
19808
19809 vm_object_unlock(curr_object);
19810 vm_object_deallocate(curr_object);
19811
19812 vm_map_lock_read(map);
19813 }
19814
19815 vm_map_unlock_read(map);
19816 return retval;
19817 }
19818
19819 /*
19820 * vm_map_msync
19821 *
19822 * Synchronises the memory range specified with its backing store
19823 * image by either flushing or cleaning the contents to the appropriate
19824 * memory manager engaging in a memory object synchronize dialog with
19825 * the manager. The client doesn't return until the manager issues
19826 * m_o_s_completed message. MIG Magically converts user task parameter
19827 * to the task's address map.
19828 *
19829 * interpretation of sync_flags
19830 * VM_SYNC_INVALIDATE - discard pages, only return precious
19831 * pages to manager.
19832 *
19833 * VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
19834 * - discard pages, write dirty or precious
19835 * pages back to memory manager.
19836 *
19837 * VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
19838 * - write dirty or precious pages back to
19839 * the memory manager.
19840 *
19841 * VM_SYNC_CONTIGUOUS - does everything normally, but if there
19842 * is a hole in the region, and we would
19843 * have returned KERN_SUCCESS, return
19844 * KERN_INVALID_ADDRESS instead.
19845 *
19846 * NOTE
19847 * The memory object attributes have not yet been implemented, this
19848 * function will have to deal with the invalidate attribute
19849 *
19850 * RETURNS
19851 * KERN_INVALID_TASK Bad task parameter
19852 * KERN_INVALID_ARGUMENT both sync and async were specified.
19853 * KERN_SUCCESS The usual.
19854 * KERN_INVALID_ADDRESS There was a hole in the region.
19855 */
19856
19857 kern_return_t
vm_map_msync(vm_map_t map,vm_map_address_t address,vm_map_size_t size,vm_sync_t sync_flags)19858 vm_map_msync(
19859 vm_map_t map,
19860 vm_map_address_t address,
19861 vm_map_size_t size,
19862 vm_sync_t sync_flags)
19863 {
19864 vm_map_entry_t entry;
19865 vm_map_size_t amount_left;
19866 vm_object_offset_t offset;
19867 vm_object_offset_t start_offset, end_offset;
19868 boolean_t do_sync_req;
19869 boolean_t had_hole = FALSE;
19870 vm_map_offset_t pmap_offset;
19871
19872 if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
19873 (sync_flags & VM_SYNC_SYNCHRONOUS)) {
19874 return KERN_INVALID_ARGUMENT;
19875 }
19876
19877 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
19878 DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
19879 }
19880
19881 /*
19882 * align address and size on page boundaries
19883 */
19884 size = (vm_map_round_page(address + size,
19885 VM_MAP_PAGE_MASK(map)) -
19886 vm_map_trunc_page(address,
19887 VM_MAP_PAGE_MASK(map)));
19888 address = vm_map_trunc_page(address,
19889 VM_MAP_PAGE_MASK(map));
19890
19891 if (map == VM_MAP_NULL) {
19892 return KERN_INVALID_TASK;
19893 }
19894
19895 if (size == 0) {
19896 return KERN_SUCCESS;
19897 }
19898
19899 amount_left = size;
19900
19901 while (amount_left > 0) {
19902 vm_object_size_t flush_size;
19903 vm_object_t object;
19904
19905 vm_map_lock(map);
19906 if (!vm_map_lookup_entry(map,
19907 address,
19908 &entry)) {
19909 vm_map_size_t skip;
19910
19911 /*
19912 * hole in the address map.
19913 */
19914 had_hole = TRUE;
19915
19916 if (sync_flags & VM_SYNC_KILLPAGES) {
19917 /*
19918 * For VM_SYNC_KILLPAGES, there should be
19919 * no holes in the range, since we couldn't
19920 * prevent someone else from allocating in
19921 * that hole and we wouldn't want to "kill"
19922 * their pages.
19923 */
19924 vm_map_unlock(map);
19925 break;
19926 }
19927
19928 /*
19929 * Check for empty map.
19930 */
19931 if (entry == vm_map_to_entry(map) &&
19932 entry->vme_next == entry) {
19933 vm_map_unlock(map);
19934 break;
19935 }
19936 /*
19937 * Check that we don't wrap and that
19938 * we have at least one real map entry.
19939 */
19940 if ((map->hdr.nentries == 0) ||
19941 (entry->vme_next->vme_start < address)) {
19942 vm_map_unlock(map);
19943 break;
19944 }
19945 /*
19946 * Move up to the next entry if needed
19947 */
19948 skip = (entry->vme_next->vme_start - address);
19949 if (skip >= amount_left) {
19950 amount_left = 0;
19951 } else {
19952 amount_left -= skip;
19953 }
19954 address = entry->vme_next->vme_start;
19955 vm_map_unlock(map);
19956 continue;
19957 }
19958
19959 offset = address - entry->vme_start;
19960 pmap_offset = address;
19961
19962 /*
19963 * do we have more to flush than is contained in this
19964 * entry ?
19965 */
19966 if (amount_left + entry->vme_start + offset > entry->vme_end) {
19967 flush_size = entry->vme_end -
19968 (entry->vme_start + offset);
19969 } else {
19970 flush_size = amount_left;
19971 }
19972 amount_left -= flush_size;
19973 address += flush_size;
19974
19975 if (entry->is_sub_map == TRUE) {
19976 vm_map_t local_map;
19977 vm_map_offset_t local_offset;
19978
19979 local_map = VME_SUBMAP(entry);
19980 local_offset = VME_OFFSET(entry);
19981 vm_map_reference(local_map);
19982 vm_map_unlock(map);
19983 if (vm_map_msync(
19984 local_map,
19985 local_offset,
19986 flush_size,
19987 sync_flags) == KERN_INVALID_ADDRESS) {
19988 had_hole = TRUE;
19989 }
19990 vm_map_deallocate(local_map);
19991 continue;
19992 }
19993 object = VME_OBJECT(entry);
19994
19995 /*
19996 * We can't sync this object if the object has not been
19997 * created yet
19998 */
19999 if (object == VM_OBJECT_NULL) {
20000 vm_map_unlock(map);
20001 continue;
20002 }
20003 offset += VME_OFFSET(entry);
20004
20005 vm_object_lock(object);
20006
20007 if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
20008 int kill_pages = 0;
20009
20010 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20011 /*
20012 * This is a destructive operation and so we
20013 * err on the side of limiting the range of
20014 * the operation.
20015 */
20016 start_offset = vm_object_round_page(offset);
20017 end_offset = vm_object_trunc_page(offset + flush_size);
20018
20019 if (end_offset <= start_offset) {
20020 vm_object_unlock(object);
20021 vm_map_unlock(map);
20022 continue;
20023 }
20024
20025 pmap_offset += start_offset - offset;
20026 } else {
20027 start_offset = offset;
20028 end_offset = offset + flush_size;
20029 }
20030
20031 if (sync_flags & VM_SYNC_KILLPAGES) {
20032 if (((object->ref_count == 1) ||
20033 ((object->copy_strategy !=
20034 MEMORY_OBJECT_COPY_SYMMETRIC) &&
20035 (object->copy == VM_OBJECT_NULL))) &&
20036 (object->shadow == VM_OBJECT_NULL)) {
20037 if (object->ref_count != 1) {
20038 vm_page_stats_reusable.free_shared++;
20039 }
20040 kill_pages = 1;
20041 } else {
20042 kill_pages = -1;
20043 }
20044 }
20045 if (kill_pages != -1) {
20046 vm_object_deactivate_pages(
20047 object,
20048 start_offset,
20049 (vm_object_size_t) (end_offset - start_offset),
20050 kill_pages,
20051 FALSE, /* reusable_pages */
20052 FALSE, /* reusable_no_write */
20053 map->pmap,
20054 pmap_offset);
20055 }
20056 vm_object_unlock(object);
20057 vm_map_unlock(map);
20058 continue;
20059 }
20060 /*
20061 * We can't sync this object if there isn't a pager.
20062 * Don't bother to sync internal objects, since there can't
20063 * be any "permanent" storage for these objects anyway.
20064 */
20065 if ((object->pager == MEMORY_OBJECT_NULL) ||
20066 (object->internal) || (object->private)) {
20067 vm_object_unlock(object);
20068 vm_map_unlock(map);
20069 continue;
20070 }
20071 /*
20072 * keep reference on the object until syncing is done
20073 */
20074 vm_object_reference_locked(object);
20075 vm_object_unlock(object);
20076
20077 vm_map_unlock(map);
20078
20079 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20080 start_offset = vm_object_trunc_page(offset);
20081 end_offset = vm_object_round_page(offset + flush_size);
20082 } else {
20083 start_offset = offset;
20084 end_offset = offset + flush_size;
20085 }
20086
20087 do_sync_req = vm_object_sync(object,
20088 start_offset,
20089 (end_offset - start_offset),
20090 sync_flags & VM_SYNC_INVALIDATE,
20091 ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
20092 (sync_flags & VM_SYNC_ASYNCHRONOUS)),
20093 sync_flags & VM_SYNC_SYNCHRONOUS);
20094
20095 if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
20096 /*
20097 * clear out the clustering and read-ahead hints
20098 */
20099 vm_object_lock(object);
20100
20101 object->pages_created = 0;
20102 object->pages_used = 0;
20103 object->sequential = 0;
20104 object->last_alloc = 0;
20105
20106 vm_object_unlock(object);
20107 }
20108 vm_object_deallocate(object);
20109 } /* while */
20110
20111 /* for proper msync() behaviour */
20112 if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
20113 return KERN_INVALID_ADDRESS;
20114 }
20115
20116 return KERN_SUCCESS;
20117 }/* vm_msync */
20118
/*
 * Back an (as yet unbacked) named entry with "object", exposing the
 * range [offset, offset + size) with protection "prot".  The object is
 * wrapped in a single-entry vm_map_copy, so the named-entry machinery
 * can treat object-backed and copy-backed entries uniformly.
 * Consumes nothing; the copy takes over the caller's object reference
 * implicitly via VME_OBJECT_SET (see callers for ownership).
 */
void
vm_named_entry_associate_vm_object(
	vm_named_entry_t        named_entry,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_object_size_t        size,
	vm_prot_t               prot)
{
	vm_map_copy_t copy;
	vm_map_entry_t copy_entry;

	/* the named entry must not be backed by anything yet */
	assert(!named_entry->is_sub_map);
	assert(!named_entry->is_copy);
	assert(!named_entry->is_object);
	assert(!named_entry->internal);
	assert(named_entry->backing.copy == VM_MAP_COPY_NULL);

	/* build a one-entry copy object describing the range */
	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
	copy->offset = offset;
	copy->size = size;
	copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;

	copy_entry = vm_map_copy_entry_create(copy);
	copy_entry->protection = prot;
	copy_entry->max_protection = prot;
	copy_entry->use_pmap = TRUE;
	/* entry bounds are page-aligned; the sub-page offset is kept in VME_OFFSET */
	copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
	copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
	VME_OBJECT_SET(copy_entry, object, false, 0);
	VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
	vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);

	/* publish the backing and mirror the object's "internal" attribute */
	named_entry->backing.copy = copy;
	named_entry->is_object = TRUE;
	if (object->internal) {
		named_entry->internal = TRUE;
	}

	DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
	    named_entry, copy, object, offset, size, prot);
}
20160
/*
 * Return the VM object backing "named_entry".
 * The named entry must be object-backed, i.e. wrapped in a one-entry
 * vm_map_copy by vm_named_entry_associate_vm_object().
 * No reference is taken on the returned object.
 */
vm_object_t
vm_named_entry_to_vm_object(
	vm_named_entry_t        named_entry)
{
	vm_map_copy_t   copy;
	vm_map_entry_t  copy_entry;
	vm_object_t     object;

	assert(!named_entry->is_sub_map);
	assert(!named_entry->is_copy);
	assert(named_entry->is_object);
	copy = named_entry->backing.copy;
	assert(copy != VM_MAP_COPY_NULL);
	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);
	/* object-backed named entries always wrap exactly one entry */
	assert(copy->cpy_hdr.nentries == 1);
	copy_entry = vm_map_copy_first_entry(copy);
	object = VME_OBJECT(copy_entry);

	DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);

	return object;
}
20187
20188 /*
20189 * Routine: convert_port_entry_to_map
20190 * Purpose:
20191 * Convert from a port specifying an entry or a task
20192 * to a map. Doesn't consume the port ref; produces a map ref,
20193 * which may be null. Unlike convert_port_to_map, the
20194 * port may be task or a named entry backed.
20195 * Conditions:
20196 * Nothing locked.
20197 */
20198
vm_map_t
convert_port_entry_to_map(
	ipc_port_t      port)
{
	vm_map_t map = VM_MAP_NULL;
	vm_named_entry_t        named_entry;

	if (!IP_VALID(port)) {
		return VM_MAP_NULL;
	}

	/* a plain task port: defer to the regular conversion */
	if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
		return convert_port_to_map(port);
	}

	named_entry = mach_memory_entry_from_port(port);

	/*
	 * Only a writable submap-backed named entry yields a map;
	 * anything else falls through and returns VM_MAP_NULL.
	 */
	if ((named_entry->is_sub_map) &&
	    (named_entry->protection & VM_PROT_WRITE)) {
		map = named_entry->backing.map;
		if (map->pmap != PMAP_NULL) {
			/* defense in depth: no userspace handle may reach the kernel map */
			if (map->pmap == kernel_pmap) {
				panic("userspace has access "
				    "to a kernel map %p", map);
			}
			/* catch forged/corrupted pmap pointers */
			pmap_require(map->pmap);
		}
		/* produce the map reference the caller expects */
		vm_map_reference(map);
	}

	return map;
}
20231
20232 /*
20233 * Export routines to other components for the things we access locally through
20234 * macros.
20235 */
20236 #undef current_map
/* Out-of-line version of the current_map() macro (see #undef above). */
vm_map_t
current_map(void)
{
	return current_map_fast();
}
20242
20243 /*
20244 * vm_map_reference:
20245 *
20246 * Takes a reference on the specified map.
20247 */
void
vm_map_reference(
	vm_map_t        map)
{
	if (__probable(map != VM_MAP_NULL)) {
		vm_map_require(map);    /* catch forged/freed map pointers */
		os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
	}
	/* VM_MAP_NULL is silently ignored */
}
20257
20258 /*
20259 * vm_map_deallocate:
20260 *
20261 * Removes a reference from the specified map,
20262 * destroying it if no references remain.
20263 * The map should not be locked.
20264 */
void
vm_map_deallocate(
	vm_map_t        map)
{
	if (__probable(map != VM_MAP_NULL)) {
		vm_map_require(map);    /* catch forged/freed map pointers */
		/* destroy the map when the last reference is dropped */
		if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
			vm_map_destroy(map);
		}
	}
}
20276
/* Release a reference held through an inspect-only map handle. */
void
vm_map_inspect_deallocate(
	vm_map_inspect_t        map)
{
	vm_map_deallocate((vm_map_t)map);
}
20283
/* Release a reference held through a read-only map handle. */
void
vm_map_read_deallocate(
	vm_map_read_t   map)
{
	vm_map_deallocate((vm_map_t)map);
}
20290
20291
20292 void
vm_map_disable_NX(vm_map_t map)20293 vm_map_disable_NX(vm_map_t map)
20294 {
20295 if (map == NULL) {
20296 return;
20297 }
20298 if (map->pmap == NULL) {
20299 return;
20300 }
20301
20302 pmap_disable_NX(map->pmap);
20303 }
20304
/* Mark "map" so that data pages may not be executed; NULL is ignored. */
void
vm_map_disallow_data_exec(vm_map_t map)
{
	if (map == NULL) {
		return;
	}

	map->map_disallow_data_exec = TRUE;
}
20314
20315 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
20316 * more descriptive.
20317 */
/* Cap the map's addressable range at the 32-bit maximum. */
void
vm_map_set_32bit(vm_map_t map)
{
#if defined(__arm64__)
	/* arm64: the device-specific pmap decides the actual 32-bit ceiling */
	map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
20327
20328
/* Cap the map's addressable range at the 64-bit maximum. */
void
vm_map_set_64bit(vm_map_t map)
{
#if defined(__arm64__)
	/* arm64: the device-specific pmap decides the actual 64-bit ceiling */
	map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
#endif
}
20338
20339 /*
20340 * Expand the maximum size of an existing map to the maximum supported.
20341 */
void
vm_map_set_jumbo(vm_map_t map)
{
#if defined (__arm64__) && !XNU_TARGET_OS_OSX
	/* ~0 is clamped to the pmap's jumbo maximum by vm_map_set_max_addr() */
	vm_map_set_max_addr(map, ~0);
#else /* arm64 */
	(void) map;     /* no-op everywhere else */
#endif
}
20351
20352 /*
20353 * This map has a JIT entitlement
20354 */
void
vm_map_set_jit_entitled(vm_map_t map)
{
#if defined (__arm64__)
	/* record the JIT entitlement at the pmap layer */
	pmap_set_jit_entitled(map->pmap);
#else /* arm64 */
	(void) map;     /* no-op on non-arm64 */
#endif
}
20364
20365 /*
20366 * Get status of this maps TPRO flag
20367 */
boolean_t
vm_map_tpro(vm_map_t map)
{
#if defined (__arm64e__)
	/* TPRO state lives in the pmap */
	return pmap_get_tpro(map->pmap);
#else /* arm64e */
	(void) map;
	return false;   /* TPRO only exists on arm64e */
#endif
}
20378
20379 /*
20380 * This map has TPRO enabled
20381 */
void
vm_map_set_tpro(vm_map_t map)
{
#if defined (__arm64e__)
	/* enable TPRO at the pmap layer */
	pmap_set_tpro(map->pmap);
#else /* arm64e */
	(void) map;     /* no-op where TPRO is unsupported */
#endif
}
20391
20392 /*
20393 * Expand the maximum size of an existing map.
20394 */
/*
 * Grow map->max_offset to (at most) new_max_offset, clamped to the
 * pmap's jumbo limit.  Growing only; shrink requests are ignored.
 * Also extends or creates the trailing hole-list entry so allocations
 * can use the newly exposed space.  arm64 only; no-op elsewhere.
 */
void
vm_map_set_max_addr(vm_map_t map, vm_map_offset_t new_max_offset)
{
#if defined(__arm64__)
	vm_map_offset_t max_supported_offset;
	vm_map_offset_t old_max_offset;

	vm_map_lock(map);

	old_max_offset = map->max_offset;
	max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), ARM_PMAP_MAX_OFFSET_JUMBO);

	new_max_offset = trunc_page(new_max_offset);

	/* The address space cannot be shrunk using this routine. */
	if (old_max_offset >= new_max_offset) {
		vm_map_unlock(map);
		return;
	}

	/* clamp to what the pmap can actually address */
	if (max_supported_offset < new_max_offset) {
		new_max_offset = max_supported_offset;
	}

	map->max_offset = new_max_offset;

	if (map->holelistenabled) {
		if (map->holes_list->prev->vme_end == old_max_offset) {
			/*
			 * There is already a hole at the end of the map; simply make it bigger.
			 */
			map->holes_list->prev->vme_end = map->max_offset;
		} else {
			/*
			 * There is no hole at the end, so we need to create a new hole
			 * for the new empty space we're creating.
			 */
			struct vm_map_links *new_hole;

			new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
			new_hole->start = old_max_offset;
			new_hole->end = map->max_offset;
			/* link as the new tail of the circular hole list */
			new_hole->prev = map->holes_list->prev;
			new_hole->next = (struct vm_map_entry *)map->holes_list;
			map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
			map->holes_list->prev = (struct vm_map_entry *)new_hole;
		}
	}

	vm_map_unlock(map);
#else
	(void)map;
	(void)new_max_offset;
#endif
}
20450
/*
 * Highest user address for a new map: device-specific on arm64,
 * otherwise the fixed 32-/64-bit architectural maximum.
 */
vm_map_offset_t
vm_compute_max_offset(boolean_t is64)
{
#if defined(__arm64__)
	return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
20460
/*
 * Report how ASLR slide entropy is sectioned for this platform:
 * number of sections and the size of each.
 */
void
vm_map_get_max_aslr_slide_section(
	vm_map_t                map __unused,
	int64_t                 *max_sections,
	int64_t                 *section_size)
{
#if defined(__arm64__)
	/* arm64: three sections, each one translation-table twig in size */
	*max_sections = 3;
	*section_size = ARM_TT_TWIG_SIZE;
#else
	/* other platforms: a single unsized section */
	*max_sections = 1;
	*section_size = 0;
#endif
}
20475
/* Maximum ASLR slide for this map, expressed in map pages. */
uint64_t
vm_map_get_max_aslr_slide_pages(vm_map_t map)
{
#if defined(__arm64__)
	/* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
	 * limited embedded address space; this is also meant to minimize pmap
	 * memory usage on 16KB page systems.
	 */
	return 1 << (24 - VM_MAP_PAGE_SHIFT(map));      /* 2^24 bytes / page size */
#else
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
20489
/* Maximum dyld (loader) ASLR slide for this map, in map pages. */
uint64_t
vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
{
#if defined(__arm64__)
	/* We limit the loader slide to 4MB, in order to ensure at least 8 bits
	 * of independent entropy on 16KB page systems.
	 */
	return 1 << (22 - VM_MAP_PAGE_SHIFT(map));      /* 2^22 bytes / page size */
#else
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
20502
/* A map is 64-bit iff its addressable range extends past the 32-bit max. */
boolean_t
vm_map_is_64bit(
	vm_map_t map)
{
	return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
}
20509
/*
 * TRUE if the map's minimum offset guarantees that nothing can be
 * mapped below "pagezero_size", i.e. the zero page is hard-enforced.
 */
boolean_t
vm_map_has_hard_pagezero(
	vm_map_t        map,
	vm_map_offset_t pagezero_size)
{
	/*
	 * XXX FBDP
	 * We should lock the VM map (for read) here but we can get away
	 * with it for now because there can't really be any race condition:
	 * the VM map's min_offset is changed only when the VM map is created
	 * and when the zero page is established (when the binary gets loaded),
	 * and this routine gets called only when the task terminates and the
	 * VM map is being torn down, and when a new map is created via
	 * load_machfile()/execve().
	 */
	return map->min_offset >= pagezero_size;
}
20527
20528 /*
20529 * Raise a VM map's maximun offset.
20530 */
20531 kern_return_t
vm_map_raise_max_offset(vm_map_t map,vm_map_offset_t new_max_offset)20532 vm_map_raise_max_offset(
20533 vm_map_t map,
20534 vm_map_offset_t new_max_offset)
20535 {
20536 kern_return_t ret;
20537
20538 vm_map_lock(map);
20539 ret = KERN_INVALID_ADDRESS;
20540
20541 if (new_max_offset >= map->max_offset) {
20542 if (!vm_map_is_64bit(map)) {
20543 if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
20544 map->max_offset = new_max_offset;
20545 ret = KERN_SUCCESS;
20546 }
20547 } else {
20548 if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
20549 map->max_offset = new_max_offset;
20550 ret = KERN_SUCCESS;
20551 }
20552 }
20553 }
20554
20555 vm_map_unlock(map);
20556 return ret;
20557 }
20558
20559
20560 /*
20561 * Raise a VM map's minimum offset.
20562 * To strictly enforce "page zero" reservation.
20563 */
kern_return_t
vm_map_raise_min_offset(
	vm_map_t        map,
	vm_map_offset_t new_min_offset)
{
	vm_map_entry_t  first_entry;

	/* round up so the reserved region covers whole map pages */
	new_min_offset = vm_map_round_page(new_min_offset,
	    VM_MAP_PAGE_MASK(map));

	vm_map_lock(map);

	if (new_min_offset < map->min_offset) {
		/*
		 * Can't move min_offset backwards, as that would expose
		 * a part of the address space that was previously, and for
		 * possibly good reasons, inaccessible.
		 */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}
	if (new_min_offset >= map->max_offset) {
		/* can't go beyond the end of the address space */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	first_entry = vm_map_first_entry(map);
	if (first_entry != vm_map_to_entry(map) &&
	    first_entry->vme_start < new_min_offset) {
		/*
		 * Some memory was already allocated below the new
		 * minimum offset. It's too late to change it now...
		 */
		vm_map_unlock(map);
		return KERN_NO_SPACE;
	}

	map->min_offset = new_min_offset;

	/* keep the first hole consistent with the new lower bound */
	if (map->holelistenabled) {
		assert(map->holes_list);
		map->holes_list->start = new_min_offset;
		assert(new_min_offset < map->holes_list->end);
	}

	vm_map_unlock(map);

	return KERN_SUCCESS;
}
20614
20615 /*
20616 * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
20617 * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
20618 * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
20619 * have to reach over to the BSD data structures.
20620 */
20621
/* count of maps given a finite size limit (diagnostics only) */
uint64_t vm_map_set_size_limit_count = 0;

/*
 * Set the map's copy of RLIMIT_AS.  Fails (KERN_FAILURE) if the new
 * limit is below the map's current size.
 */
kern_return_t
vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
{
	kern_return_t   kr;

	vm_map_lock(map);
	if (new_size_limit < map->size) {
		/* new limit should not be lower than its current size */
		DTRACE_VM2(vm_map_set_size_limit_fail,
		    vm_map_size_t, map->size,
		    uint64_t, new_size_limit);
		kr = KERN_FAILURE;
	} else if (new_size_limit == map->size_limit) {
		/* no change */
		kr = KERN_SUCCESS;
	} else {
		/* set new limit */
		DTRACE_VM2(vm_map_set_size_limit,
		    vm_map_size_t, map->size,
		    uint64_t, new_size_limit);
		if (new_size_limit != RLIM_INFINITY) {
			vm_map_set_size_limit_count++;
		}
		map->size_limit = new_size_limit;
		kr = KERN_SUCCESS;
	}
	vm_map_unlock(map);
	return kr;
}
20652
20653 uint64_t vm_map_set_data_limit_count = 0;
20654 kern_return_t
vm_map_set_data_limit(vm_map_t map,uint64_t new_data_limit)20655 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
20656 {
20657 kern_return_t kr;
20658
20659 vm_map_lock(map);
20660 if (new_data_limit < map->size) {
20661 /* new limit should not be lower than its current size */
20662 DTRACE_VM2(vm_map_set_data_limit_fail,
20663 vm_map_size_t, map->size,
20664 uint64_t, new_data_limit);
20665 kr = KERN_FAILURE;
20666 } else if (new_data_limit == map->data_limit) {
20667 /* no change */
20668 kr = KERN_SUCCESS;
20669 } else {
20670 /* set new limit */
20671 DTRACE_VM2(vm_map_set_data_limit,
20672 vm_map_size_t, map->size,
20673 uint64_t, new_data_limit);
20674 if (new_data_limit != RLIM_INFINITY) {
20675 vm_map_set_data_limit_count++;
20676 }
20677 map->data_limit = new_data_limit;
20678 kr = KERN_SUCCESS;
20679 }
20680 vm_map_unlock(map);
20681 return kr;
20682 }
20683
20684 void
vm_map_set_user_wire_limit(vm_map_t map,vm_size_t limit)20685 vm_map_set_user_wire_limit(vm_map_t map,
20686 vm_size_t limit)
20687 {
20688 vm_map_lock(map);
20689 map->user_wire_limit = limit;
20690 vm_map_unlock(map);
20691 }
20692
20693
20694 void
vm_map_switch_protect(vm_map_t map,boolean_t val)20695 vm_map_switch_protect(vm_map_t map,
20696 boolean_t val)
20697 {
20698 vm_map_lock(map);
20699 map->switch_protect = val;
20700 vm_map_unlock(map);
20701 }
20702
20703 extern int cs_process_enforcement_enable;
20704 boolean_t
vm_map_cs_enforcement(vm_map_t map)20705 vm_map_cs_enforcement(
20706 vm_map_t map)
20707 {
20708 if (cs_process_enforcement_enable) {
20709 return TRUE;
20710 }
20711 return map->cs_enforcement;
20712 }
20713
20714 kern_return_t
vm_map_cs_wx_enable(__unused vm_map_t map)20715 vm_map_cs_wx_enable(
20716 __unused vm_map_t map)
20717 {
20718 #if CODE_SIGNING_MONITOR
20719 kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
20720 if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
20721 return KERN_SUCCESS;
20722 }
20723 return ret;
20724 #else
20725 /* The VM manages WX memory entirely on its own */
20726 return true;
20727 #endif
20728 }
20729
20730 void
vm_map_cs_debugged_set(vm_map_t map,boolean_t val)20731 vm_map_cs_debugged_set(
20732 vm_map_t map,
20733 boolean_t val)
20734 {
20735 vm_map_lock(map);
20736 map->cs_debugged = val;
20737 vm_map_unlock(map);
20738 }
20739
20740 void
vm_map_cs_enforcement_set(vm_map_t map,boolean_t val)20741 vm_map_cs_enforcement_set(
20742 vm_map_t map,
20743 boolean_t val)
20744 {
20745 vm_map_lock(map);
20746 map->cs_enforcement = val;
20747 pmap_set_vm_map_cs_enforced(map->pmap, val);
20748 vm_map_unlock(map);
20749 }
20750
20751 /*
20752 * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
20753 * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
20754 * bump both counters.
20755 */
20756 void
vm_map_iokit_mapped_region(vm_map_t map,vm_size_t bytes)20757 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
20758 {
20759 pmap_t pmap = vm_map_pmap(map);
20760
20761 ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
20762 ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
20763 }
20764
20765 void
vm_map_iokit_unmapped_region(vm_map_t map,vm_size_t bytes)20766 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
20767 {
20768 pmap_t pmap = vm_map_pmap(map);
20769
20770 ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
20771 ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
20772 }
20773
/* Add (generate) code signature for memory range */
#if CONFIG_DYNAMIC_CODE_SIGNING
/*
 * vm_map_sign:
 *	Mark every resident page in [start, end) of "map" as code-signing
 *	validated, and disconnect each page from all pmaps so that any
 *	future modification attempt is observed.
 *
 *	Returns:
 *	  KERN_INVALID_ARGUMENT - NULL map, range not covered by a single
 *	                          map entry, or entry has no VM object yet
 *	  KERN_INVALID_ADDRESS  - "start" unmapped or maps a submap
 *	  KERN_FAILURE          - a page in the range is absent, busy, or
 *	                          in an error/transient state
 */
kern_return_t
vm_map_sign(vm_map_t map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t entry;
	vm_page_t m;
	vm_object_t object;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlying object. Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->vme_start > start) || (entry->vme_end < end)) {
		/*
		 * Map entry doesn't cover the requested range. Not handling
		 * this situation currently.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL) {
		/*
		 * Object must already be present or we can't sign.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * Take the object lock before dropping the map lock, so the object
	 * cannot disappear underneath us.
	 * NOTE(review): "entry" is still dereferenced in the loop below
	 * after the map lock is released — presumably safe while the object
	 * lock is held, but worth confirming.
	 */
	vm_object_lock(object);
	vm_map_unlock_read(map);

	while (start < end) {
		uint32_t refmod;

		m = vm_page_lookup(object,
		    start - entry->vme_start + VME_OFFSET(entry));
		if (m == VM_PAGE_NULL) {
			/* should we try to fault a page here? we can probably
			 * demand it exists and is locked for this request */
			vm_object_unlock(object);
			return KERN_FAILURE;
		}
		/* deal with special page status */
		if (m->vmp_busy ||
		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
			vm_object_unlock(object);
			return KERN_FAILURE;
		}

		/* Page is OK... now "validate" it */
		/* This is the place where we'll call out to create a code
		 * directory, later */
		/* XXX TODO4K: deal with 4k subpages individually? */
		m->vmp_cs_validated = VMP_CS_ALL_TRUE;

		/* The page is now "clean" for codesigning purposes. That means
		 * we don't consider it as modified (wpmapped) anymore. But
		 * we'll disconnect the page so we note any future modification
		 * attempts. */
		m->vmp_wpmapped = FALSE;
		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

		/* Pull the dirty status from the pmap, since we cleared the
		 * wpmapped bit */
		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
			SET_PAGE_DIRTY(m, FALSE);
		}

		/* On to the next page */
		start += PAGE_SIZE;
	}
	vm_object_unlock(object);

	return KERN_SUCCESS;
}
#endif
20869
/*
 * vm_map_partial_reap:
 *	Delete every entry in "map" whose backing VM object is anonymous
 *	(internal) and mapped nowhere else (ref_count == 1), accumulating
 *	the reclaimed resident and compressed page counts into the two
 *	out-parameters.
 *
 *	NOTE(review): the counters are accumulated with "+=", so callers
 *	appear to be responsible for zeroing *reclaimed_resident and
 *	*reclaimed_compressed beforehand — confirm at the call sites.
 *	Always returns KERN_SUCCESS.
 */
kern_return_t
vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
{
	vm_map_entry_t entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t next_entry;
	kern_return_t kr = KERN_SUCCESS;
	VM_MAP_ZAP_DECLARE(zap_list);

	vm_map_lock(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = next_entry) {
		/* capture the successor first: vm_map_delete() unlinks "entry" */
		next_entry = entry->vme_next;

		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    (VME_OBJECT(entry)->internal == TRUE) &&
		    (VME_OBJECT(entry)->ref_count == 1)) {
			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);

			/* deleted entries collect on zap_list; freed below */
			(void)vm_map_delete(map, entry->vme_start,
			    entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
			    KMEM_GUARD_NONE, &zap_list);
		}
	}

	vm_map_unlock(map);

	/* dispose of the zapped entries outside the map lock */
	vm_map_zap_dispose(&zap_list);

	return kr;
}
20904
20905
20906 #if DEVELOPMENT || DEBUG
20907
/*
 * vm_map_disconnect_page_mappings:
 *	(DEVELOPMENT || DEBUG only)  Remove the pmap mappings for every
 *	object-backed, non-physically-contiguous entry in "map".  When
 *	"do_unnest" is set, shared nested submaps are un-nested first so
 *	that only this task's pmap loses its mappings.
 *
 *	Returns the task's resident footprint in map pages, sampled from
 *	the phys_mem ledger before the mappings are torn down.
 */
int
vm_map_disconnect_page_mappings(
	vm_map_t map,
	boolean_t do_unnest)
{
	vm_map_entry_t entry;
	ledger_amount_t byte_count = 0;

	if (do_unnest == TRUE) {
#ifndef NO_NESTED_PMAP
		vm_map_lock(map);

		for (entry = vm_map_first_entry(map);
		    entry != vm_map_to_entry(map);
		    entry = entry->vme_next) {
			if (entry->is_sub_map && entry->use_pmap) {
				/*
				 * Make sure the range between the start of this entry and
				 * the end of this entry is no longer nested, so that
				 * we will only remove mappings from the pmap in use by this
				 * task
				 */
				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
			}
		}
		vm_map_unlock(map);
#endif
	}
	vm_map_lock_read(map);

	/* sample the resident byte count before disconnecting anything */
	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		/* skip object-less entries and physically contiguous objects */
		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
		    (VME_OBJECT(entry)->phys_contiguous))) {
			continue;
		}
		if (entry->is_sub_map) {
			/* any nested submap should have been un-nested above */
			assert(!entry->use_pmap);
		}

		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
	}
	vm_map_unlock_read(map);

	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
}
20957
/*
 * vm_map_inject_error:
 *	(DEVELOPMENT || DEBUG only)  Inject a compressor error for the page
 *	mapped at "vaddr" so that a later fault on it fails.
 *
 *	Returns KERN_MEMORY_ERROR if no object backs the address,
 *	KERN_MEMORY_PRESENT if the object has no pager (page not in the
 *	compressor), otherwise the result of the compressor-pager injection.
 */
kern_return_t
vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
{
	vm_object_t object = NULL;
	vm_object_offset_t offset;
	vm_prot_t prot;
	boolean_t wired;
	vm_map_version_t version;
	vm_map_t real_map;
	int result = KERN_FAILURE;

	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
	vm_map_lock(map);

	/* on success, returns with "object" locked exclusively */
	result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
	    NULL, &real_map, NULL);
	if (object == NULL) {
		result = KERN_MEMORY_ERROR;
	} else if (object->pager) {
		/* the page lives in the compressor: inject the error there */
		result = vm_compressor_pager_inject_error(object->pager,
		    offset);
	} else {
		/* resident with no pager backing: nothing to corrupt */
		result = KERN_MEMORY_PRESENT;
	}

	if (object != NULL) {
		vm_object_unlock(object);
	}

	/*
	 * NOTE(review): assumes vm_map_lookup_and_lock_object() always sets
	 * "real_map" (even on failure) — confirm; the lookup may have
	 * descended into a submap, whose lock must be dropped separately.
	 */
	if (real_map != map) {
		vm_map_unlock(real_map);
	}
	vm_map_unlock(map);

	return result;
}
20995
20996 #endif
20997
20998
20999 #if CONFIG_FREEZE
21000
21001
21002 extern struct freezer_context freezer_context_global;
21003 AbsoluteTime c_freezer_last_yield_ts = 0;
21004
21005 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
21006 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
21007
/*
 * vm_map_freeze:
 *	Compress ("freeze") the anonymous, non-contiguous pages of "task"'s
 *	map, up to "dirty_budget" pages.
 *
 *	With swap-backed freezing (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE), the
 *	map is walked twice: a first "evaluation" pass tallies private vs.
 *	shared dirty pages and can veto the freeze (too much shared memory,
 *	or a private/shared ratio below the memorystatus threshold); only
 *	then does a second pass actually page objects out.  If "eval_only"
 *	is TRUE, the walk stops after the evaluation pass.
 *
 *	Outputs: all the count out-parameters are zeroed on entry.
 *	*shared_count is reported in MB despite its name (see the
 *	PAGE_SIZE_64 / 1MB conversion below); *wired_count accumulates
 *	wired pages of the objects visited in the pageout pass.  On
 *	failure, *freezer_error_code says why.  purgeable_count,
 *	clean_count and dirty_count are zeroed but never incremented here.
 */
kern_return_t
vm_map_freeze(
	task_t task,
	unsigned int *purgeable_count,
	unsigned int *wired_count,
	unsigned int *clean_count,
	unsigned int *dirty_count,
	unsigned int dirty_budget,
	unsigned int *shared_count,
	int *freezer_error_code,
	boolean_t eval_only)
{
	vm_map_entry_t entry2 = VM_MAP_ENTRY_NULL;
	kern_return_t kr = KERN_SUCCESS;
	boolean_t evaluation_phase = TRUE;
	vm_object_t cur_shared_object = NULL;
	int cur_shared_obj_ref_cnt = 0;
	unsigned int dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;

	*purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;

	/*
	 * We need the exclusive lock here so that we can
	 * block any page faults or lookups while we are
	 * in the middle of freezing this vm map.
	 */
	vm_map_t map = task->map;

	vm_map_lock(map);

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	/* bail out early if there is nowhere to put compressed pages */
	if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
		if (vm_compressor_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
		}

		if (vm_swap_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
		}

		kr = KERN_NO_SPACE;
		goto done;
	}

	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
		/*
		 * In-memory compressor backing the freezer. No disk.
		 * So no need to do the evaluation phase.
		 */
		evaluation_phase = FALSE;

		if (eval_only == TRUE) {
			/*
			 * We don't support 'eval_only' mode
			 * in this non-swap config.
			 */
			*freezer_error_code = FREEZER_ERROR_GENERIC;
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);
	}
again:

	for (entry2 = vm_map_first_entry(map);
	    entry2 != vm_map_to_entry(map);
	    entry2 = entry2->vme_next) {
		vm_object_t src_object;

		if (entry2->is_sub_map) {
			continue;
		}

		/* only anonymous, non-physically-contiguous objects freeze */
		src_object = VME_OBJECT(entry2);
		if (!src_object ||
		    src_object->phys_contiguous ||
		    !src_object->internal) {
			continue;
		}

		/* If eligible, scan the entry, moving eligible pages over to our parent object */

		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
			/*
			 * We skip purgeable objects during evaluation phase only.
			 * If we decide to freeze this process, we'll explicitly
			 * purge these objects before we go around again with
			 * 'evaluation_phase' set to FALSE.
			 */

			if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
				/*
				 * We want to purge objects that may not belong to this task but are mapped
				 * in this task alone. Since we already purged this task's purgeable memory
				 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
				 * on this task's purgeable objects. Hence the check for only volatile objects.
				 */
				if (evaluation_phase == FALSE &&
				    (src_object->purgable == VM_PURGABLE_VOLATILE) &&
				    (src_object->ref_count == 1)) {
					vm_object_lock(src_object);
					vm_object_purge(src_object, 0);
					vm_object_unlock(src_object);
				}
				continue;
			}

			/*
			 * Pages belonging to this object could be swapped to disk.
			 * Make sure it's not a shared object because we could end
			 * up just bringing it back in again.
			 *
			 * We try to optimize somewhat by checking for objects that are mapped
			 * more than once within our own map. But we don't do full searches,
			 * we just look at the entries following our current entry.
			 */

			if (src_object->ref_count > 1) {
				if (src_object != cur_shared_object) {
					/* first sighting: count it as shared for now */
					obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
					dirty_shared_count += obj_pages_snapshot;

					cur_shared_object = src_object;
					cur_shared_obj_ref_cnt = 1;
					continue;
				} else {
					cur_shared_obj_ref_cnt++;
					if (src_object->ref_count == cur_shared_obj_ref_cnt) {
						/*
						 * Fall through to below and treat this object as private.
						 * So deduct its pages from our shared total and add it to the
						 * private total.
						 */

						dirty_shared_count -= obj_pages_snapshot;
						dirty_private_count += obj_pages_snapshot;
					} else {
						continue;
					}
				}
			}


			if (src_object->ref_count == 1) {
				dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
			}

			/* evaluation pass only tallies; no pageout yet */
			if (evaluation_phase == TRUE) {
				continue;
			}
		}

		uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
		*wired_count += src_object->wired_page_count;

		/* re-check space after each object's pageout */
		if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
			if (vm_compressor_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
			}

			if (vm_swap_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
			}

			kr = KERN_NO_SPACE;
			break;
		}
		if (paged_out_count >= dirty_budget) {
			break;
		}
		dirty_budget -= paged_out_count;
	}

	/* pages -> MB (note: the name says "count" but the unit is MB) */
	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
	if (evaluation_phase) {
		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;

		if (dirty_shared_count > shared_pages_threshold) {
			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
			kr = KERN_FAILURE;
			goto done;
		}

		if (dirty_shared_count &&
		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
			kr = KERN_FAILURE;
			goto done;
		}

		/* evaluation passed: switch to the real freeze pass */
		evaluation_phase = FALSE;
		dirty_shared_count = dirty_private_count = 0;

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);

		if (eval_only) {
			kr = KERN_SUCCESS;
			goto done;
		}

		/* purge our own volatile memory before freezing for real */
		vm_purgeable_purge_task_owned(task);

		goto again;
	} else {
		kr = KERN_SUCCESS;
	}

done:
	vm_map_unlock(map);

	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
		vm_object_compressed_freezer_done();
	}
	return kr;
}
21227
21228 #endif
21229
21230 /*
21231 * vm_map_entry_should_cow_for_true_share:
21232 *
21233 * Determines if the map entry should be clipped and setup for copy-on-write
21234 * to avoid applying "true_share" to a large VM object when only a subset is
21235 * targeted.
21236 *
21237 * For now, we target only the map entries created for the Objective C
21238 * Garbage Collector, which initially have the following properties:
21239 * - alias == VM_MEMORY_MALLOC
21240 * - wired_count == 0
21241 * - !needs_copy
21242 * and a VM object with:
21243 * - internal
21244 * - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
21245 * - !true_share
21246 * - vo_size == ANON_CHUNK_SIZE
21247 *
21248 * Only non-kernel map entries.
21249 */
21250 boolean_t
vm_map_entry_should_cow_for_true_share(vm_map_entry_t entry)21251 vm_map_entry_should_cow_for_true_share(
21252 vm_map_entry_t entry)
21253 {
21254 vm_object_t object;
21255
21256 if (entry->is_sub_map) {
21257 /* entry does not point at a VM object */
21258 return FALSE;
21259 }
21260
21261 if (entry->needs_copy) {
21262 /* already set for copy_on_write: done! */
21263 return FALSE;
21264 }
21265
21266 if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
21267 VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
21268 /* not a malloc heap or Obj-C Garbage Collector heap */
21269 return FALSE;
21270 }
21271
21272 if (entry->wired_count) {
21273 /* wired: can't change the map entry... */
21274 vm_counters.should_cow_but_wired++;
21275 return FALSE;
21276 }
21277
21278 object = VME_OBJECT(entry);
21279
21280 if (object == VM_OBJECT_NULL) {
21281 /* no object yet... */
21282 return FALSE;
21283 }
21284
21285 if (!object->internal) {
21286 /* not an internal object */
21287 return FALSE;
21288 }
21289
21290 if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
21291 /* not the default copy strategy */
21292 return FALSE;
21293 }
21294
21295 if (object->true_share) {
21296 /* already true_share: too late to avoid it */
21297 return FALSE;
21298 }
21299
21300 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
21301 object->vo_size != ANON_CHUNK_SIZE) {
21302 /* ... not an object created for the ObjC Garbage Collector */
21303 return FALSE;
21304 }
21305
21306 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
21307 object->vo_size != 2048 * 4096) {
21308 /* ... not a "MALLOC_SMALL" heap */
21309 return FALSE;
21310 }
21311
21312 /*
21313 * All the criteria match: we have a large object being targeted for "true_share".
21314 * To limit the adverse side-effects linked with "true_share", tell the caller to
21315 * try and avoid setting up the entire object for "true_share" by clipping the
21316 * targeted range and setting it up for copy-on-write.
21317 */
21318 return TRUE;
21319 }
21320
21321 vm_map_offset_t
vm_map_round_page_mask(vm_map_offset_t offset,vm_map_offset_t mask)21322 vm_map_round_page_mask(
21323 vm_map_offset_t offset,
21324 vm_map_offset_t mask)
21325 {
21326 return VM_MAP_ROUND_PAGE(offset, mask);
21327 }
21328
21329 vm_map_offset_t
vm_map_trunc_page_mask(vm_map_offset_t offset,vm_map_offset_t mask)21330 vm_map_trunc_page_mask(
21331 vm_map_offset_t offset,
21332 vm_map_offset_t mask)
21333 {
21334 return VM_MAP_TRUNC_PAGE(offset, mask);
21335 }
21336
21337 boolean_t
vm_map_page_aligned(vm_map_offset_t offset,vm_map_offset_t mask)21338 vm_map_page_aligned(
21339 vm_map_offset_t offset,
21340 vm_map_offset_t mask)
21341 {
21342 return ((offset) & mask) == 0;
21343 }
21344
21345 int
vm_map_page_shift(vm_map_t map)21346 vm_map_page_shift(
21347 vm_map_t map)
21348 {
21349 return VM_MAP_PAGE_SHIFT(map);
21350 }
21351
21352 int
vm_map_page_size(vm_map_t map)21353 vm_map_page_size(
21354 vm_map_t map)
21355 {
21356 return VM_MAP_PAGE_SIZE(map);
21357 }
21358
21359 vm_map_offset_t
vm_map_page_mask(vm_map_t map)21360 vm_map_page_mask(
21361 vm_map_t map)
21362 {
21363 return VM_MAP_PAGE_MASK(map);
21364 }
21365
21366 kern_return_t
vm_map_set_page_shift(vm_map_t map,int pageshift)21367 vm_map_set_page_shift(
21368 vm_map_t map,
21369 int pageshift)
21370 {
21371 if (map->hdr.nentries != 0) {
21372 /* too late to change page size */
21373 return KERN_FAILURE;
21374 }
21375
21376 map->hdr.page_shift = (uint16_t)pageshift;
21377
21378 return KERN_SUCCESS;
21379 }
21380
/*
 * vm_map_query_volatile:
 *	Tally the virtual, resident, compressed and pmap-level sizes of the
 *	writable volatile/empty purgeable objects mapped in "map".  Each
 *	object is counted once, via its offset-0 mapping only.
 *
 *	The caller must hold the map lock; it stays held on return.
 *	Always returns KERN_SUCCESS.
 */
kern_return_t
vm_map_query_volatile(
	vm_map_t map,
	mach_vm_size_t *volatile_virtual_size_p,
	mach_vm_size_t *volatile_resident_size_p,
	mach_vm_size_t *volatile_compressed_size_p,
	mach_vm_size_t *volatile_pmap_size_p,
	mach_vm_size_t *volatile_compressed_pmap_size_p)
{
	mach_vm_size_t volatile_virtual_size;
	mach_vm_size_t volatile_resident_count;
	mach_vm_size_t volatile_compressed_count;
	mach_vm_size_t volatile_pmap_count;
	mach_vm_size_t volatile_compressed_pmap_count;
	mach_vm_size_t resident_count;
	vm_map_entry_t entry;
	vm_object_t object;

	/* map should be locked by caller */

	volatile_virtual_size = 0;
	volatile_resident_count = 0;
	volatile_compressed_count = 0;
	volatile_pmap_count = 0;
	volatile_compressed_pmap_count = 0;

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		mach_vm_size_t pmap_resident_bytes, pmap_compressed_bytes;

		if (entry->is_sub_map) {
			continue;
		}
		/* only writable mappings count as volatile footprint */
		if (!(entry->protection & VM_PROT_WRITE)) {
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			continue;
		}
		if (object->purgable != VM_PURGABLE_VOLATILE &&
		    object->purgable != VM_PURGABLE_EMPTY) {
			continue;
		}
		if (VME_OFFSET(entry)) {
			/*
			 * If the map entry has been split and the object now
			 * appears several times in the VM map, we don't want
			 * to count the object's resident_page_count more than
			 * once. We count it only for the first one, starting
			 * at offset 0 and ignore the other VM map entries.
			 */
			continue;
		}
		resident_count = object->resident_page_count;
		/*
		 * NOTE(review): VME_OFFSET(entry) is always 0 here (non-zero
		 * offsets were skipped above), so this adjustment is
		 * currently a no-op kept for safety.
		 */
		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
			resident_count = 0;
		} else {
			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
		}

		volatile_virtual_size += entry->vme_end - entry->vme_start;
		volatile_resident_count += resident_count;
		if (object->pager) {
			volatile_compressed_count +=
			    vm_compressor_pager_get_count(object->pager);
		}
		/* ask the pmap how much of this range it has mapped/compressed */
		pmap_compressed_bytes = 0;
		pmap_resident_bytes =
		    pmap_query_resident(map->pmap,
		    entry->vme_start,
		    entry->vme_end,
		    &pmap_compressed_bytes);
		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
		volatile_compressed_pmap_count += (pmap_compressed_bytes
		    / PAGE_SIZE);
	}

	/* map is still locked on return */

	*volatile_virtual_size_p = volatile_virtual_size;
	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;

	return KERN_SUCCESS;
}
21470
21471 void
vm_map_sizes(vm_map_t map,vm_map_size_t * psize,vm_map_size_t * pfree,vm_map_size_t * plargest_free)21472 vm_map_sizes(vm_map_t map,
21473 vm_map_size_t * psize,
21474 vm_map_size_t * pfree,
21475 vm_map_size_t * plargest_free)
21476 {
21477 vm_map_entry_t entry;
21478 vm_map_offset_t prev;
21479 vm_map_size_t free, total_free, largest_free;
21480 boolean_t end;
21481
21482 if (!map) {
21483 *psize = *pfree = *plargest_free = 0;
21484 return;
21485 }
21486 total_free = largest_free = 0;
21487
21488 vm_map_lock_read(map);
21489 if (psize) {
21490 *psize = map->max_offset - map->min_offset;
21491 }
21492
21493 prev = map->min_offset;
21494 for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
21495 end = (entry == vm_map_to_entry(map));
21496
21497 if (end) {
21498 free = entry->vme_end - prev;
21499 } else {
21500 free = entry->vme_start - prev;
21501 }
21502
21503 total_free += free;
21504 if (free > largest_free) {
21505 largest_free = free;
21506 }
21507
21508 if (end) {
21509 break;
21510 }
21511 prev = entry->vme_end;
21512 }
21513 vm_map_unlock_read(map);
21514 if (pfree) {
21515 *pfree = total_free;
21516 }
21517 if (plargest_free) {
21518 *plargest_free = largest_free;
21519 }
21520 }
21521
21522 #if VM_SCAN_FOR_SHADOW_CHAIN
int vm_map_shadow_max(vm_map_t map);
/*
 * vm_map_shadow_max:
 *	Return the length of the longest VM object shadow chain found
 *	behind any entry of "map" (0 for a NULL map or one with no
 *	shadowed objects).  Diagnostic helper.
 */
int
vm_map_shadow_max(
	vm_map_t map)
{
	int shadows, shadows_max;
	vm_map_entry_t entry;
	vm_object_t object, next_object;

	if (map == NULL) {
		return 0;
	}

	shadows_max = 0;

	vm_map_lock_read(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == NULL) {
			continue;
		}
		vm_object_lock_shared(object);
		/*
		 * Walk down the shadow chain with hand-over-hand shared
		 * locking: take the next object's lock before releasing the
		 * current one, so the chain can't change underneath us.
		 */
		for (shadows = 0;
		    object->shadow != NULL;
		    shadows++, object = next_object) {
			next_object = object->shadow;
			vm_object_lock_shared(next_object);
			vm_object_unlock(object);
		}
		vm_object_unlock(object);
		if (shadows > shadows_max) {
			shadows_max = shadows;
		}
	}

	vm_map_unlock_read(map);

	return shadows_max;
}
21568 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
21569
21570 void
vm_commit_pagezero_status(vm_map_t lmap)21571 vm_commit_pagezero_status(vm_map_t lmap)
21572 {
21573 pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
21574 }
21575
21576 #if XNU_TARGET_OS_OSX
21577 void
vm_map_set_high_start(vm_map_t map,vm_map_offset_t high_start)21578 vm_map_set_high_start(
21579 vm_map_t map,
21580 vm_map_offset_t high_start)
21581 {
21582 map->vmmap_high_start = high_start;
21583 }
21584 #endif /* XNU_TARGET_OS_OSX */
21585
21586 #if CODE_SIGNING_MONITOR
21587
21588 kern_return_t
vm_map_entry_cs_associate(vm_map_t map,vm_map_entry_t entry,vm_map_kernel_flags_t vmk_flags)21589 vm_map_entry_cs_associate(
21590 vm_map_t map,
21591 vm_map_entry_t entry,
21592 vm_map_kernel_flags_t vmk_flags)
21593 {
21594 vm_object_t cs_object, cs_shadow, backing_object;
21595 vm_object_offset_t cs_offset, backing_offset;
21596 void *cs_blobs;
21597 struct vnode *cs_vnode;
21598 kern_return_t cs_ret;
21599
21600 if (map->pmap == NULL ||
21601 entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
21602 (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
21603 VME_OBJECT(entry) == VM_OBJECT_NULL) {
21604 return KERN_SUCCESS;
21605 }
21606
21607 if (!(entry->protection & VM_PROT_EXECUTE)) {
21608 /*
21609 * This memory region is not executable, so the code-signing
21610 * monitor would usually not care about it...
21611 */
21612 if (vmk_flags.vmkf_remap_prot_copy &&
21613 (entry->max_protection & VM_PROT_EXECUTE)) {
21614 /*
21615 * ... except if the memory region is being remapped
21616 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
21617 * which is what a debugger or dtrace would be doing
21618 * to prepare to modify an executable page to insert
21619 * a breakpoint or activate a probe.
21620 * In that case, fall through so that we can mark
21621 * this region as being "debugged" and no longer
21622 * strictly code-signed.
21623 */
21624 } else {
21625 /*
21626 * Really not executable, so no need to tell the
21627 * code-signing monitor.
21628 */
21629 return KERN_SUCCESS;
21630 }
21631 }
21632
21633 vm_map_lock_assert_exclusive(map);
21634
21635 if (entry->used_for_jit) {
21636 cs_ret = csm_associate_jit_region(
21637 map->pmap,
21638 entry->vme_start,
21639 entry->vme_end - entry->vme_start);
21640 goto done;
21641 }
21642
21643 if (vmk_flags.vmkf_remap_prot_copy) {
21644 cs_ret = csm_associate_debug_region(
21645 map->pmap,
21646 entry->vme_start,
21647 entry->vme_end - entry->vme_start);
21648 if (cs_ret == KERN_SUCCESS) {
21649 entry->vme_xnu_user_debug = TRUE;
21650 }
21651 #if DEVELOPMENT || DEBUG
21652 if (vm_log_xnu_user_debug) {
21653 printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug=%d cs_ret %d\n",
21654 proc_selfpid(),
21655 (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
21656 __FUNCTION__, __LINE__,
21657 map, entry,
21658 (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
21659 entry->vme_xnu_user_debug,
21660 cs_ret);
21661 }
21662 #endif /* DEVELOPMENT || DEBUG */
21663 goto done;
21664 }
21665
21666 cs_object = VME_OBJECT(entry);
21667 vm_object_lock_shared(cs_object);
21668 cs_offset = VME_OFFSET(entry);
21669
21670 /* find the VM object backed by the code-signed vnode */
21671 for (;;) {
21672 /* go to the bottom of cs_object's shadow chain */
21673 for (;
21674 cs_object->shadow != VM_OBJECT_NULL;
21675 cs_object = cs_shadow) {
21676 cs_shadow = cs_object->shadow;
21677 cs_offset += cs_object->vo_shadow_offset;
21678 vm_object_lock_shared(cs_shadow);
21679 vm_object_unlock(cs_object);
21680 }
21681 if (cs_object->internal ||
21682 cs_object->pager == MEMORY_OBJECT_NULL) {
21683 vm_object_unlock(cs_object);
21684 return KERN_SUCCESS;
21685 }
21686
21687 cs_offset += cs_object->paging_offset;
21688
21689 /*
21690 * cs_object could be backed by a:
21691 * vnode_pager
21692 * apple_protect_pager
21693 * shared_region_pager
21694 * fourk_pager (multiple backing objects -> fail?)
21695 * ask the pager if it has a backing VM object
21696 */
21697 if (!memory_object_backing_object(cs_object->pager,
21698 cs_offset,
21699 &backing_object,
21700 &backing_offset)) {
21701 /* no backing object: cs_object is it */
21702 break;
21703 }
21704
21705 /* look down the backing object's shadow chain */
21706 vm_object_lock_shared(backing_object);
21707 vm_object_unlock(cs_object);
21708 cs_object = backing_object;
21709 cs_offset = backing_offset;
21710 }
21711
21712 cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
21713 if (cs_vnode == NULL) {
21714 /* no vnode, no code signatures to associate */
21715 cs_ret = KERN_SUCCESS;
21716 } else {
21717 cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
21718 &cs_blobs);
21719 assert(cs_ret == KERN_SUCCESS);
21720 cs_ret = cs_associate_blob_with_mapping(map->pmap,
21721 entry->vme_start,
21722 (entry->vme_end - entry->vme_start),
21723 cs_offset,
21724 cs_blobs);
21725 }
21726 vm_object_unlock(cs_object);
21727 cs_object = VM_OBJECT_NULL;
21728
21729 done:
21730 if (cs_ret == KERN_SUCCESS) {
21731 DTRACE_VM2(vm_map_entry_cs_associate_success,
21732 vm_map_offset_t, entry->vme_start,
21733 vm_map_offset_t, entry->vme_end);
21734 if (vm_map_executable_immutable) {
21735 /*
21736 * Prevent this executable
21737 * mapping from being unmapped
21738 * or modified.
21739 */
21740 entry->vme_permanent = TRUE;
21741 }
21742 /*
21743 * pmap says it will validate the
21744 * code-signing validity of pages
21745 * faulted in via this mapping, so
21746 * this map entry should be marked so
21747 * that vm_fault() bypasses code-signing
21748 * validation for faults coming through
21749 * this mapping.
21750 */
21751 entry->csm_associated = TRUE;
21752 } else if (cs_ret == KERN_NOT_SUPPORTED) {
21753 /*
21754 * pmap won't check the code-signing
21755 * validity of pages faulted in via
21756 * this mapping, so VM should keep
21757 * doing it.
21758 */
21759 DTRACE_VM3(vm_map_entry_cs_associate_off,
21760 vm_map_offset_t, entry->vme_start,
21761 vm_map_offset_t, entry->vme_end,
21762 int, cs_ret);
21763 } else {
21764 /*
21765 * A real error: do not allow
21766 * execution in this mapping.
21767 */
21768 DTRACE_VM3(vm_map_entry_cs_associate_failure,
21769 vm_map_offset_t, entry->vme_start,
21770 vm_map_offset_t, entry->vme_end,
21771 int, cs_ret);
21772 if (vmk_flags.vmkf_overwrite_immutable) {
21773 /*
21774 * We can get here when we remap an apple_protect pager
21775 * on top of an already cs_associated executable mapping
21776 * with the same code signatures, so we don't want to
21777 * lose VM_PROT_EXECUTE in that case...
21778 */
21779 } else {
21780 entry->protection &= ~VM_PROT_ALLEXEC;
21781 entry->max_protection &= ~VM_PROT_ALLEXEC;
21782 }
21783 }
21784
21785 return cs_ret;
21786 }
21787
21788 #endif /* CODE_SIGNING_MONITOR */
21789
21790 /*
21791 * FORKED CORPSE FOOTPRINT
21792 *
21793 * A forked corpse gets a copy of the original VM map but its pmap is mostly
21794 * empty since it never ran and never got to fault in any pages.
21795 * Collecting footprint info (via "sysctl vm.self_region_footprint") for
21796 * a forked corpse would therefore return very little information.
21797 *
21798 * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
21799 * to vm_map_fork() to collect footprint information from the original VM map
21800 * and its pmap, and store it in the forked corpse's VM map. That information
21801 * is stored in place of the VM map's "hole list" since we'll never need to
21802 * lookup for holes in the corpse's map.
21803 *
21804 * The corpse's footprint info looks like this:
21805 *
21806 * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
21807 * as follows:
21808 * +---------------------------------------+
21809 * header-> | cf_size |
21810 * +-------------------+-------------------+
21811 * | cf_last_region | cf_last_zeroes |
21812 * +-------------------+-------------------+
21813 * region1-> | cfr_vaddr |
21814 * +-------------------+-------------------+
21815 * | cfr_num_pages | d0 | d1 | d2 | d3 |
21816 * +---------------------------------------+
21817 * | d4 | d5 | ... |
21818 * +---------------------------------------+
21819 * | ... |
21820 * +-------------------+-------------------+
21821 * | dy | dz | na | na | cfr_vaddr... | <-region2
21822 * +-------------------+-------------------+
21823 * | cfr_vaddr (ctd) | cfr_num_pages |
21824 * +---------------------------------------+
21825 * | d0 | d1 ... |
21826 * +---------------------------------------+
21827 * ...
21828 * +---------------------------------------+
21829 * last region-> | cfr_vaddr |
21830 * +---------------------------------------+
21831 * + cfr_num_pages | d0 | d1 | d2 | d3 |
21832 * +---------------------------------------+
21833 * ...
21834 * +---------------------------------------+
21835 * | dx | dy | dz | na | na | na | na | na |
21836 * +---------------------------------------+
21837 *
21838 * where:
21839 * cf_size: total size of the buffer (rounded to page size)
21840 * cf_last_region: offset in the buffer of the last "region" sub-header
21841 * cf_last_zeroes: number of trailing "zero" dispositions at the end
21842 * of last region
21843 * cfr_vaddr: virtual address of the start of the covered "region"
21844 * cfr_num_pages: number of pages in the covered "region"
21845 * d*: disposition of the page at that virtual address
21846 * Regions in the buffer are word-aligned.
21847 *
21848 * We estimate the size of the buffer based on the number of memory regions
21849 * and the virtual size of the address space. While copying each memory region
21850 * during vm_map_fork(), we also collect the footprint info for that region
21851 * and store it in the buffer, packing it as much as possible (coalescing
21852 * contiguous memory regions to avoid having too many region headers and
21853 * avoiding long streaks of "zero" page dispositions by splitting footprint
21854 * "regions", so the number of regions in the footprint buffer might not match
21855 * the number of memory regions in the address space.
21856 *
21857 * We also have to copy the original task's "nonvolatile" ledgers since that's
21858 * part of the footprint and will need to be reported to any tool asking for
21859 * the footprint information of the forked corpse.
21860 */
21861
/* statistics about corpse footprint buffers (see layout comment above) */
uint64_t vm_map_corpse_footprint_count = 0;     /* footprints collected so far */
uint64_t vm_map_corpse_footprint_size_avg = 0;  /* running average buffer size */
uint64_t vm_map_corpse_footprint_size_max = 0;  /* largest footprint buffer seen */
uint64_t vm_map_corpse_footprint_full = 0;      /* collections that ran out of buffer space */
uint64_t vm_map_corpse_footprint_no_buf = 0;    /* collections that failed to allocate a buffer */

/*
 * Header at the very start of the corpse footprint buffer
 * (pointed to by vm_map->vmmap_corpse_footprint).
 * The "cfu" union is used as "cf_last_zeroes" while the footprint is being
 * collected and re-used as "cf_hint_region" once lookups start.
 */
struct vm_map_corpse_footprint_header {
	vm_size_t cf_size;      /* allocated buffer size */
	uint32_t cf_last_region; /* offset of last region in buffer */
	union {
		uint32_t cfu_last_zeroes; /* during creation:
		                           * number of "zero" dispositions at
		                           * end of last region */
		uint32_t cfu_hint_region; /* during lookup:
		                           * offset of last looked up region */
#define cf_last_zeroes cfu.cfu_last_zeroes
#define cf_hint_region cfu.cfu_hint_region
	} cfu;
};
/* one-byte page disposition, as stored in a footprint region */
typedef uint8_t cf_disp_t;
/*
 * Variable-length region record: header followed by one cf_disp_t per page.
 * "packed" so consecutive regions can be laid out with only the explicit
 * word alignment applied when a new region is started.
 */
struct vm_map_corpse_footprint_region {
	vm_map_offset_t cfr_vaddr; /* region start virtual address */
	uint32_t cfr_num_pages; /* number of pages in this "region" */
	cf_disp_t cfr_disposition[0]; /* disposition of each page */
} __attribute__((packed));
21887
21888 static cf_disp_t
vm_page_disposition_to_cf_disp(int disposition)21889 vm_page_disposition_to_cf_disp(
21890 int disposition)
21891 {
21892 assert(sizeof(cf_disp_t) == 1);
21893 /* relocate bits that don't fit in a "uint8_t" */
21894 if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
21895 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
21896 }
21897 /* cast gets rid of extra bits */
21898 return (cf_disp_t) disposition;
21899 }
21900
21901 static int
vm_page_cf_disp_to_disposition(cf_disp_t cf_disp)21902 vm_page_cf_disp_to_disposition(
21903 cf_disp_t cf_disp)
21904 {
21905 int disposition;
21906
21907 assert(sizeof(cf_disp_t) == 1);
21908 disposition = (int) cf_disp;
21909 /* move relocated bits back in place */
21910 if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
21911 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
21912 disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
21913 }
21914 return disposition;
21915 }
21916
21917 /*
21918 * vm_map_corpse_footprint_new_region:
21919 * closes the current footprint "region" and creates a new one
21920 *
21921 * Returns NULL if there's not enough space in the buffer for a new region.
21922 */
21923 static struct vm_map_corpse_footprint_region *
vm_map_corpse_footprint_new_region(struct vm_map_corpse_footprint_header * footprint_header)21924 vm_map_corpse_footprint_new_region(
21925 struct vm_map_corpse_footprint_header *footprint_header)
21926 {
21927 uintptr_t footprint_edge;
21928 uint32_t new_region_offset;
21929 struct vm_map_corpse_footprint_region *footprint_region;
21930 struct vm_map_corpse_footprint_region *new_footprint_region;
21931
21932 footprint_edge = ((uintptr_t)footprint_header +
21933 footprint_header->cf_size);
21934 footprint_region = ((struct vm_map_corpse_footprint_region *)
21935 ((char *)footprint_header +
21936 footprint_header->cf_last_region));
21937 assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
21938 footprint_edge);
21939
21940 /* get rid of trailing zeroes in the last region */
21941 assert(footprint_region->cfr_num_pages >=
21942 footprint_header->cf_last_zeroes);
21943 footprint_region->cfr_num_pages -=
21944 footprint_header->cf_last_zeroes;
21945 footprint_header->cf_last_zeroes = 0;
21946
21947 /* reuse this region if it's now empty */
21948 if (footprint_region->cfr_num_pages == 0) {
21949 return footprint_region;
21950 }
21951
21952 /* compute offset of new region */
21953 new_region_offset = footprint_header->cf_last_region;
21954 new_region_offset += sizeof(*footprint_region);
21955 new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
21956 new_region_offset = roundup(new_region_offset, sizeof(int));
21957
21958 /* check if we're going over the edge */
21959 if (((uintptr_t)footprint_header +
21960 new_region_offset +
21961 sizeof(*footprint_region)) >=
21962 footprint_edge) {
21963 /* over the edge: no new region */
21964 return NULL;
21965 }
21966
21967 /* adjust offset of last region in header */
21968 footprint_header->cf_last_region = new_region_offset;
21969
21970 new_footprint_region = (struct vm_map_corpse_footprint_region *)
21971 ((char *)footprint_header +
21972 footprint_header->cf_last_region);
21973 new_footprint_region->cfr_vaddr = 0;
21974 new_footprint_region->cfr_num_pages = 0;
21975 /* caller needs to initialize new region */
21976
21977 return new_footprint_region;
21978 }
21979
21980 /*
21981 * vm_map_corpse_footprint_collect:
21982 * collect footprint information for "old_entry" in "old_map" and
21983 * stores it in "new_map"'s vmmap_footprint_info.
21984 */
21985 kern_return_t
vm_map_corpse_footprint_collect(vm_map_t old_map,vm_map_entry_t old_entry,vm_map_t new_map)21986 vm_map_corpse_footprint_collect(
21987 vm_map_t old_map,
21988 vm_map_entry_t old_entry,
21989 vm_map_t new_map)
21990 {
21991 vm_map_offset_t va;
21992 kern_return_t kr;
21993 struct vm_map_corpse_footprint_header *footprint_header;
21994 struct vm_map_corpse_footprint_region *footprint_region;
21995 struct vm_map_corpse_footprint_region *new_footprint_region;
21996 cf_disp_t *next_disp_p;
21997 uintptr_t footprint_edge;
21998 uint32_t num_pages_tmp;
21999 int effective_page_size;
22000
22001 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
22002
22003 va = old_entry->vme_start;
22004
22005 vm_map_lock_assert_exclusive(old_map);
22006 vm_map_lock_assert_exclusive(new_map);
22007
22008 assert(new_map->has_corpse_footprint);
22009 assert(!old_map->has_corpse_footprint);
22010 if (!new_map->has_corpse_footprint ||
22011 old_map->has_corpse_footprint) {
22012 /*
22013 * This can only transfer footprint info from a
22014 * map with a live pmap to a map with a corpse footprint.
22015 */
22016 return KERN_NOT_SUPPORTED;
22017 }
22018
22019 if (new_map->vmmap_corpse_footprint == NULL) {
22020 vm_offset_t buf;
22021 vm_size_t buf_size;
22022
22023 buf = 0;
22024 buf_size = (sizeof(*footprint_header) +
22025 (old_map->hdr.nentries
22026 *
22027 (sizeof(*footprint_region) +
22028 +3)) /* potential alignment for each region */
22029 +
22030 ((old_map->size / effective_page_size)
22031 *
22032 sizeof(cf_disp_t))); /* disposition for each page */
22033 // printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
22034 buf_size = round_page(buf_size);
22035
22036 /* limit buffer to 1 page to validate overflow detection */
22037 // buf_size = PAGE_SIZE;
22038
22039 /* limit size to a somewhat sane amount */
22040 #if XNU_TARGET_OS_OSX
22041 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (8*1024*1024) /* 8MB */
22042 #else /* XNU_TARGET_OS_OSX */
22043 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (256*1024) /* 256KB */
22044 #endif /* XNU_TARGET_OS_OSX */
22045 if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
22046 buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
22047 }
22048
22049 /*
22050 * Allocate the pageable buffer (with a trailing guard page).
22051 * It will be zero-filled on demand.
22052 */
22053 kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
22054 KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
22055 VM_KERN_MEMORY_DIAG);
22056 if (kr != KERN_SUCCESS) {
22057 vm_map_corpse_footprint_no_buf++;
22058 return kr;
22059 }
22060
22061 /* initialize header and 1st region */
22062 footprint_header = (struct vm_map_corpse_footprint_header *)buf;
22063 new_map->vmmap_corpse_footprint = footprint_header;
22064
22065 footprint_header->cf_size = buf_size;
22066 footprint_header->cf_last_region =
22067 sizeof(*footprint_header);
22068 footprint_header->cf_last_zeroes = 0;
22069
22070 footprint_region = (struct vm_map_corpse_footprint_region *)
22071 ((char *)footprint_header +
22072 footprint_header->cf_last_region);
22073 footprint_region->cfr_vaddr = 0;
22074 footprint_region->cfr_num_pages = 0;
22075 } else {
22076 /* retrieve header and last region */
22077 footprint_header = (struct vm_map_corpse_footprint_header *)
22078 new_map->vmmap_corpse_footprint;
22079 footprint_region = (struct vm_map_corpse_footprint_region *)
22080 ((char *)footprint_header +
22081 footprint_header->cf_last_region);
22082 }
22083 footprint_edge = ((uintptr_t)footprint_header +
22084 footprint_header->cf_size);
22085
22086 if ((footprint_region->cfr_vaddr +
22087 (((vm_map_offset_t)footprint_region->cfr_num_pages) *
22088 effective_page_size))
22089 != old_entry->vme_start) {
22090 uint64_t num_pages_delta, num_pages_delta_size;
22091 uint32_t region_offset_delta_size;
22092
22093 /*
22094 * Not the next contiguous virtual address:
22095 * start a new region or store "zero" dispositions for
22096 * the missing pages?
22097 */
22098 /* size of gap in actual page dispositions */
22099 num_pages_delta = ((old_entry->vme_start -
22100 footprint_region->cfr_vaddr) / effective_page_size)
22101 - footprint_region->cfr_num_pages;
22102 num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
22103 /* size of gap as a new footprint region header */
22104 region_offset_delta_size =
22105 (sizeof(*footprint_region) +
22106 roundup(((footprint_region->cfr_num_pages -
22107 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
22108 sizeof(int)) -
22109 ((footprint_region->cfr_num_pages -
22110 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
22111 // printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
22112 if (region_offset_delta_size < num_pages_delta_size ||
22113 os_add3_overflow(footprint_region->cfr_num_pages,
22114 (uint32_t) num_pages_delta,
22115 1,
22116 &num_pages_tmp)) {
22117 /*
22118 * Storing data for this gap would take more space
22119 * than inserting a new footprint region header:
22120 * let's start a new region and save space. If it's a
22121 * tie, let's avoid using a new region, since that
22122 * would require more region hops to find the right
22123 * range during lookups.
22124 *
22125 * If the current region's cfr_num_pages would overflow
22126 * if we added "zero" page dispositions for the gap,
22127 * no choice but to start a new region.
22128 */
22129 // printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
22130 new_footprint_region =
22131 vm_map_corpse_footprint_new_region(footprint_header);
22132 /* check that we're not going over the edge */
22133 if (new_footprint_region == NULL) {
22134 goto over_the_edge;
22135 }
22136 footprint_region = new_footprint_region;
22137 /* initialize new region as empty */
22138 footprint_region->cfr_vaddr = old_entry->vme_start;
22139 footprint_region->cfr_num_pages = 0;
22140 } else {
22141 /*
22142 * Store "zero" page dispositions for the missing
22143 * pages.
22144 */
22145 // printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
22146 for (; num_pages_delta > 0; num_pages_delta--) {
22147 next_disp_p = (cf_disp_t *)
22148 ((uintptr_t) footprint_region +
22149 sizeof(*footprint_region));
22150 next_disp_p += footprint_region->cfr_num_pages;
22151 /* check that we're not going over the edge */
22152 if ((uintptr_t)next_disp_p >= footprint_edge) {
22153 goto over_the_edge;
22154 }
22155 /* store "zero" disposition for this gap page */
22156 footprint_region->cfr_num_pages++;
22157 *next_disp_p = (cf_disp_t) 0;
22158 footprint_header->cf_last_zeroes++;
22159 }
22160 }
22161 }
22162
22163 for (va = old_entry->vme_start;
22164 va < old_entry->vme_end;
22165 va += effective_page_size) {
22166 int disposition;
22167 cf_disp_t cf_disp;
22168
22169 vm_map_footprint_query_page_info(old_map,
22170 old_entry,
22171 va,
22172 &disposition);
22173 cf_disp = vm_page_disposition_to_cf_disp(disposition);
22174
22175 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
22176
22177 if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
22178 /*
22179 * Ignore "zero" dispositions at start of
22180 * region: just move start of region.
22181 */
22182 footprint_region->cfr_vaddr += effective_page_size;
22183 continue;
22184 }
22185
22186 /* would region's cfr_num_pages overflow? */
22187 if (os_add_overflow(footprint_region->cfr_num_pages, 1,
22188 &num_pages_tmp)) {
22189 /* overflow: create a new region */
22190 new_footprint_region =
22191 vm_map_corpse_footprint_new_region(
22192 footprint_header);
22193 if (new_footprint_region == NULL) {
22194 goto over_the_edge;
22195 }
22196 footprint_region = new_footprint_region;
22197 footprint_region->cfr_vaddr = va;
22198 footprint_region->cfr_num_pages = 0;
22199 }
22200
22201 next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
22202 sizeof(*footprint_region));
22203 next_disp_p += footprint_region->cfr_num_pages;
22204 /* check that we're not going over the edge */
22205 if ((uintptr_t)next_disp_p >= footprint_edge) {
22206 goto over_the_edge;
22207 }
22208 /* store this dispostion */
22209 *next_disp_p = cf_disp;
22210 footprint_region->cfr_num_pages++;
22211
22212 if (cf_disp != 0) {
22213 /* non-zero disp: break the current zero streak */
22214 footprint_header->cf_last_zeroes = 0;
22215 /* done */
22216 continue;
22217 }
22218
22219 /* zero disp: add to the current streak of zeroes */
22220 footprint_header->cf_last_zeroes++;
22221 if ((footprint_header->cf_last_zeroes +
22222 roundup(((footprint_region->cfr_num_pages -
22223 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
22224 (sizeof(int) - 1),
22225 sizeof(int))) <
22226 (sizeof(*footprint_header))) {
22227 /*
22228 * There are not enough trailing "zero" dispositions
22229 * (+ the extra padding we would need for the previous
22230 * region); creating a new region would not save space
22231 * at this point, so let's keep this "zero" disposition
22232 * in this region and reconsider later.
22233 */
22234 continue;
22235 }
22236 /*
22237 * Create a new region to avoid having too many consecutive
22238 * "zero" dispositions.
22239 */
22240 new_footprint_region =
22241 vm_map_corpse_footprint_new_region(footprint_header);
22242 if (new_footprint_region == NULL) {
22243 goto over_the_edge;
22244 }
22245 footprint_region = new_footprint_region;
22246 /* initialize the new region as empty ... */
22247 footprint_region->cfr_num_pages = 0;
22248 /* ... and skip this "zero" disp */
22249 footprint_region->cfr_vaddr = va + effective_page_size;
22250 }
22251
22252 return KERN_SUCCESS;
22253
22254 over_the_edge:
22255 // printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
22256 vm_map_corpse_footprint_full++;
22257 return KERN_RESOURCE_SHORTAGE;
22258 }
22259
22260 /*
22261 * vm_map_corpse_footprint_collect_done:
22262 * completes the footprint collection by getting rid of any remaining
22263 * trailing "zero" dispositions and trimming the unused part of the
22264 * kernel buffer
22265 */
22266 void
vm_map_corpse_footprint_collect_done(vm_map_t new_map)22267 vm_map_corpse_footprint_collect_done(
22268 vm_map_t new_map)
22269 {
22270 struct vm_map_corpse_footprint_header *footprint_header;
22271 struct vm_map_corpse_footprint_region *footprint_region;
22272 vm_size_t buf_size, actual_size;
22273 kern_return_t kr;
22274
22275 assert(new_map->has_corpse_footprint);
22276 if (!new_map->has_corpse_footprint ||
22277 new_map->vmmap_corpse_footprint == NULL) {
22278 return;
22279 }
22280
22281 footprint_header = (struct vm_map_corpse_footprint_header *)
22282 new_map->vmmap_corpse_footprint;
22283 buf_size = footprint_header->cf_size;
22284
22285 footprint_region = (struct vm_map_corpse_footprint_region *)
22286 ((char *)footprint_header +
22287 footprint_header->cf_last_region);
22288
22289 /* get rid of trailing zeroes in last region */
22290 assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
22291 footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
22292 footprint_header->cf_last_zeroes = 0;
22293
22294 actual_size = (vm_size_t)(footprint_header->cf_last_region +
22295 sizeof(*footprint_region) +
22296 (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));
22297
22298 // printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
22299 vm_map_corpse_footprint_size_avg =
22300 (((vm_map_corpse_footprint_size_avg *
22301 vm_map_corpse_footprint_count) +
22302 actual_size) /
22303 (vm_map_corpse_footprint_count + 1));
22304 vm_map_corpse_footprint_count++;
22305 if (actual_size > vm_map_corpse_footprint_size_max) {
22306 vm_map_corpse_footprint_size_max = actual_size;
22307 }
22308
22309 actual_size = round_page(actual_size);
22310 if (buf_size > actual_size) {
22311 kr = vm_deallocate(kernel_map,
22312 ((vm_address_t)footprint_header +
22313 actual_size +
22314 PAGE_SIZE), /* trailing guard page */
22315 (buf_size - actual_size));
22316 assertf(kr == KERN_SUCCESS,
22317 "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
22318 footprint_header,
22319 (uint64_t) buf_size,
22320 (uint64_t) actual_size,
22321 kr);
22322 kr = vm_protect(kernel_map,
22323 ((vm_address_t)footprint_header +
22324 actual_size),
22325 PAGE_SIZE,
22326 FALSE, /* set_maximum */
22327 VM_PROT_NONE);
22328 assertf(kr == KERN_SUCCESS,
22329 "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
22330 footprint_header,
22331 (uint64_t) buf_size,
22332 (uint64_t) actual_size,
22333 kr);
22334 }
22335
22336 footprint_header->cf_size = actual_size;
22337 }
22338
22339 /*
22340 * vm_map_corpse_footprint_query_page_info:
22341 * retrieves the disposition of the page at virtual address "vaddr"
22342 * in the forked corpse's VM map
22343 *
22344 * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
22345 */
22346 kern_return_t
vm_map_corpse_footprint_query_page_info(vm_map_t map,vm_map_offset_t va,int * disposition_p)22347 vm_map_corpse_footprint_query_page_info(
22348 vm_map_t map,
22349 vm_map_offset_t va,
22350 int *disposition_p)
22351 {
22352 struct vm_map_corpse_footprint_header *footprint_header;
22353 struct vm_map_corpse_footprint_region *footprint_region;
22354 uint32_t footprint_region_offset;
22355 vm_map_offset_t region_start, region_end;
22356 int disp_idx;
22357 kern_return_t kr;
22358 int effective_page_size;
22359 cf_disp_t cf_disp;
22360
22361 if (!map->has_corpse_footprint) {
22362 *disposition_p = 0;
22363 kr = KERN_INVALID_ARGUMENT;
22364 goto done;
22365 }
22366
22367 footprint_header = map->vmmap_corpse_footprint;
22368 if (footprint_header == NULL) {
22369 *disposition_p = 0;
22370 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
22371 kr = KERN_INVALID_ARGUMENT;
22372 goto done;
22373 }
22374
22375 /* start looking at the hint ("cf_hint_region") */
22376 footprint_region_offset = footprint_header->cf_hint_region;
22377
22378 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
22379
22380 lookup_again:
22381 if (footprint_region_offset < sizeof(*footprint_header)) {
22382 /* hint too low: start from 1st region */
22383 footprint_region_offset = sizeof(*footprint_header);
22384 }
22385 if (footprint_region_offset >= footprint_header->cf_last_region) {
22386 /* hint too high: re-start from 1st region */
22387 footprint_region_offset = sizeof(*footprint_header);
22388 }
22389 footprint_region = (struct vm_map_corpse_footprint_region *)
22390 ((char *)footprint_header + footprint_region_offset);
22391 region_start = footprint_region->cfr_vaddr;
22392 region_end = (region_start +
22393 ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
22394 effective_page_size));
22395 if (va < region_start &&
22396 footprint_region_offset != sizeof(*footprint_header)) {
22397 /* our range starts before the hint region */
22398
22399 /* reset the hint (in a racy way...) */
22400 footprint_header->cf_hint_region = sizeof(*footprint_header);
22401 /* lookup "va" again from 1st region */
22402 footprint_region_offset = sizeof(*footprint_header);
22403 goto lookup_again;
22404 }
22405
22406 while (va >= region_end) {
22407 if (footprint_region_offset >= footprint_header->cf_last_region) {
22408 break;
22409 }
22410 /* skip the region's header */
22411 footprint_region_offset += sizeof(*footprint_region);
22412 /* skip the region's page dispositions */
22413 footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
22414 /* align to next word boundary */
22415 footprint_region_offset =
22416 roundup(footprint_region_offset,
22417 sizeof(int));
22418 footprint_region = (struct vm_map_corpse_footprint_region *)
22419 ((char *)footprint_header + footprint_region_offset);
22420 region_start = footprint_region->cfr_vaddr;
22421 region_end = (region_start +
22422 ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
22423 effective_page_size));
22424 }
22425 if (va < region_start || va >= region_end) {
22426 /* page not found */
22427 *disposition_p = 0;
22428 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
22429 kr = KERN_SUCCESS;
22430 goto done;
22431 }
22432
22433 /* "va" found: set the lookup hint for next lookup (in a racy way...) */
22434 footprint_header->cf_hint_region = footprint_region_offset;
22435
22436 /* get page disposition for "va" in this region */
22437 disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
22438 cf_disp = footprint_region->cfr_disposition[disp_idx];
22439 *disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
22440 kr = KERN_SUCCESS;
22441 done:
22442 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
22443 /* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
22444 DTRACE_VM4(footprint_query_page_info,
22445 vm_map_t, map,
22446 vm_map_offset_t, va,
22447 int, *disposition_p,
22448 kern_return_t, kr);
22449
22450 return kr;
22451 }
22452
22453 void
vm_map_corpse_footprint_destroy(vm_map_t map)22454 vm_map_corpse_footprint_destroy(
22455 vm_map_t map)
22456 {
22457 if (map->has_corpse_footprint &&
22458 map->vmmap_corpse_footprint != 0) {
22459 struct vm_map_corpse_footprint_header *footprint_header;
22460 vm_size_t buf_size;
22461 kern_return_t kr;
22462
22463 footprint_header = map->vmmap_corpse_footprint;
22464 buf_size = footprint_header->cf_size;
22465 kr = vm_deallocate(kernel_map,
22466 (vm_offset_t) map->vmmap_corpse_footprint,
22467 ((vm_size_t) buf_size
22468 + PAGE_SIZE)); /* trailing guard page */
22469 assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
22470 map->vmmap_corpse_footprint = 0;
22471 map->has_corpse_footprint = FALSE;
22472 }
22473 }
22474
22475 /*
22476 * vm_map_copy_footprint_ledgers:
22477 * copies any ledger that's relevant to the memory footprint of "old_task"
22478 * into the forked corpse's task ("new_task")
22479 */
22480 void
vm_map_copy_footprint_ledgers(task_t old_task,task_t new_task)22481 vm_map_copy_footprint_ledgers(
22482 task_t old_task,
22483 task_t new_task)
22484 {
22485 vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
22486 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
22487 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
22488 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
22489 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
22490 vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
22491 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
22492 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
22493 vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
22494 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
22495 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
22496 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
22497 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
22498 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
22499 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
22500 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
22501 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
22502 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
22503 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
22504 vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
22505 }
22506
22507 /*
22508 * vm_map_copy_ledger:
22509 * copy a single ledger from "old_task" to "new_task"
22510 */
22511 void
vm_map_copy_ledger(task_t old_task,task_t new_task,int ledger_entry)22512 vm_map_copy_ledger(
22513 task_t old_task,
22514 task_t new_task,
22515 int ledger_entry)
22516 {
22517 ledger_amount_t old_balance, new_balance, delta;
22518
22519 assert(new_task->map->has_corpse_footprint);
22520 if (!new_task->map->has_corpse_footprint) {
22521 return;
22522 }
22523
22524 /* turn off sanity checks for the ledger we're about to mess with */
22525 ledger_disable_panic_on_negative(new_task->ledger,
22526 ledger_entry);
22527
22528 /* adjust "new_task" to match "old_task" */
22529 ledger_get_balance(old_task->ledger,
22530 ledger_entry,
22531 &old_balance);
22532 ledger_get_balance(new_task->ledger,
22533 ledger_entry,
22534 &new_balance);
22535 if (new_balance == old_balance) {
22536 /* new == old: done */
22537 } else if (new_balance > old_balance) {
22538 /* new > old ==> new -= new - old */
22539 delta = new_balance - old_balance;
22540 ledger_debit(new_task->ledger,
22541 ledger_entry,
22542 delta);
22543 } else {
22544 /* new < old ==> new += old - new */
22545 delta = old_balance - new_balance;
22546 ledger_credit(new_task->ledger,
22547 ledger_entry,
22548 delta);
22549 }
22550 }
22551
22552 /*
22553 * vm_map_get_pmap:
22554 * returns the pmap associated with the vm_map
22555 */
22556 pmap_t
vm_map_get_pmap(vm_map_t map)22557 vm_map_get_pmap(vm_map_t map)
22558 {
22559 return vm_map_pmap(map);
22560 }
22561
22562 #if CONFIG_MAP_RANGES
/*
 * Bitmap, indexed by VM_MEMORY_* tag, of the user tags whose allocations
 * get steered into the UMEM_RANGE_ID_HEAP range
 * (consulted by vm_map_kernel_flags_update_range_id()).
 */
static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];
22564
22565 /*
22566 * vm_map_range_map_init:
22567 * initializes the VM range ID map to enable index lookup
22568 * of user VM ranges based on VM tag from userspace.
22569 */
22570 static void
vm_map_range_map_init(void)22571 vm_map_range_map_init(void)
22572 {
22573 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC);
22574 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
22575 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
22576 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
22577 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
22578 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
22579 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
22580 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
22581 }
22582
22583 /*
22584 * vm_map_range_configure:
22585 * configures the user vm_map ranges by increasing the maximum VA range of
22586 * the map and carving out a range at the end of VA space (searching backwards
22587 * in the newly expanded map).
22588 */
22589 kern_return_t
vm_map_range_configure(vm_map_t map)22590 vm_map_range_configure(vm_map_t map)
22591 {
22592 vm_map_size_t addr_space_size;
22593 vm_map_offset_t start, end, saved_max, random_addr;
22594 kern_return_t kr;
22595
22596 /* Should not be applying ranges to kernel map or kernel map submaps */
22597 assert(map != kernel_map);
22598 assert(vm_map_pmap(map) != kernel_pmap);
22599
22600 /* save the existing max offset */
22601 vm_map_lock_read(map);
22602 saved_max = vm_map_max(map);
22603 vm_map_unlock_read(map);
22604
22605 /*
22606 * Check that we're not already jumbo'd. If so we cannot guarantee that
22607 * we can set up the ranges safely without interfering with the existing
22608 * map.
22609 */
22610 if (saved_max > vm_compute_max_offset(vm_map_is_64bit(map))) {
22611 return KERN_NO_SPACE;
22612 }
22613
22614 /* expand the default VM space to the largest possible address */
22615 vm_map_set_jumbo(map);
22616
22617 vm_map_lock(map);
22618 addr_space_size = vm_map_max(map) - saved_max;
22619
22620 if (addr_space_size <= VM_MAP_USER_RANGE_MAX) {
22621 vm_map_unlock(map);
22622 return KERN_NO_SPACE;
22623 }
22624
22625 addr_space_size -= VM_MAP_USER_RANGE_MAX;
22626 random_addr = (vm_map_offset_t)random();
22627 random_addr <<= VM_MAP_PAGE_SHIFT(map);
22628 random_addr %= addr_space_size;
22629
22630 /*
22631 * round off the start so we begin on a L2 TT boundary and ensure we have
22632 * at least a ARM_TT_L2_SIZE sized hole between existing map range and
22633 * new range(s).
22634 */
22635 start = vm_map_round_page(saved_max + random_addr + 1, ARM_TT_L2_OFFMASK);
22636 end = MIN(vm_map_max(map), start + VM_MAP_USER_RANGE_MAX);
22637 assert(start > saved_max);
22638 assert(end <= vm_map_max(map));
22639
22640 /* default range covers the "normal" heap range */
22641 map->user_range[UMEM_RANGE_ID_DEFAULT].min_address = vm_map_min(map);
22642 map->user_range[UMEM_RANGE_ID_DEFAULT].max_address = saved_max;
22643
22644 /* heap range covers the new extended range */
22645 map->user_range[UMEM_RANGE_ID_HEAP].min_address = start;
22646 map->user_range[UMEM_RANGE_ID_HEAP].max_address = end;
22647
22648 vm_map_unlock(map);
22649
22650 /*
22651 * Poke holes so that ASAN or people listing regions
22652 * do not think this space is free.
22653 */
22654
22655 if (start != saved_max) {
22656 kr = vm_map_enter(map, &saved_max, start - saved_max,
22657 0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
22658 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
22659 assert(kr == KERN_SUCCESS);
22660 }
22661
22662 if (end != vm_map_max(map)) {
22663 kr = vm_map_enter(map, &end, vm_map_max(map) - end,
22664 0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
22665 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
22666 assert(kr == KERN_SUCCESS);
22667 }
22668
22669 vm_map_lock(map);
22670
22671 map->uses_user_ranges = true;
22672
22673 vm_map_unlock(map);
22674
22675 return KERN_SUCCESS;
22676 }
22677
22678 /*
22679 * vm_map_range_fork:
22680 * clones the array of ranges from old_map to new_map in support
22681 * of a VM map fork.
22682 */
22683 void
vm_map_range_fork(vm_map_t new_map,vm_map_t old_map)22684 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
22685 {
22686 if (!old_map->uses_user_ranges) {
22687 /* nothing to do */
22688 return;
22689 }
22690
22691 for (size_t i = 0; i < UMEM_RANGE_COUNT; i++) {
22692 new_map->user_range[i] = old_map->user_range[i];
22693 }
22694
22695 new_map->uses_user_ranges = true;
22696 }
22697
22698 /*
22699 * vm_map_get_user_range:
22700 * copy the VM user range for the given VM map and range ID.
22701 */
22702 kern_return_t
vm_map_get_user_range(vm_map_t map,vm_map_range_id_t range_id,mach_vm_range_t range)22703 vm_map_get_user_range(
22704 vm_map_t map,
22705 vm_map_range_id_t range_id,
22706 mach_vm_range_t range)
22707 {
22708 if (map == NULL ||
22709 !map->uses_user_ranges ||
22710 range_id > UMEM_RANGE_ID_MAX ||
22711 range == NULL) {
22712 return KERN_INVALID_ARGUMENT;
22713 }
22714
22715 *range = map->user_range[range_id];
22716 return KERN_SUCCESS;
22717 }
22718
22719 static vm_map_range_id_t
vm_map_user_range_resolve(vm_map_t map,mach_vm_address_t addr,mach_vm_size_t size,mach_vm_range_t range)22720 vm_map_user_range_resolve(
22721 vm_map_t map,
22722 mach_vm_address_t addr,
22723 mach_vm_size_t size,
22724 mach_vm_range_t range)
22725 {
22726 vm_map_lock_assert_held(map);
22727
22728 for (vm_map_range_id_t i = 0; i < UMEM_RANGE_COUNT; i++) {
22729 mach_vm_range_t r = &map->user_range[i];
22730
22731 if (mach_vm_range_contains(r, addr, size)) {
22732 if (range) {
22733 *range = *r;
22734 }
22735 return i;
22736 }
22737 }
22738
22739 if (range) {
22740 range->min_address = range->max_address = 0;
22741 }
22742 return UMEM_RANGE_ID_DEFAULT;
22743 }
22744
22745 #endif /* CONFIG_MAP_RANGES */
22746
/*
 * vm_map_kernel_flags_update_range_id:
 *	pick a default range ID for an allocation based on the target map:
 *	kernel-map allocations with no explicit range fall into the data
 *	range; user-map allocations whose VM tag is in the heap-tag bitmap
 *	are steered into the user heap range (CONFIG_MAP_RANGES only).
 */
void
vm_map_kernel_flags_update_range_id(vm_map_kernel_flags_t *vmkf, vm_map_t map)
{
	if (map == kernel_map) {
		/* kernel map: unspecified range defaults to the data range */
		if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
			vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
		}
#if CONFIG_MAP_RANGES
	} else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
	    vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT &&
	    bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
		/* user map: known malloc-family tags go to the heap range */
		vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
#endif /* CONFIG_MAP_RANGES */
	}
}
22762
22763 /*
22764 * vm_map_entry_has_device_pager:
22765 * Check if the vm map entry specified by the virtual address has a device pager.
22766 * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
22767 */
22768 boolean_t
vm_map_entry_has_device_pager(vm_map_t map,vm_map_offset_t vaddr)22769 vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
22770 {
22771 vm_map_entry_t entry;
22772 vm_object_t object;
22773 boolean_t result;
22774
22775 if (map == NULL) {
22776 return FALSE;
22777 }
22778
22779 vm_map_lock(map);
22780 while (TRUE) {
22781 if (!vm_map_lookup_entry(map, vaddr, &entry)) {
22782 result = FALSE;
22783 break;
22784 }
22785 if (entry->is_sub_map) {
22786 // Check the submap
22787 vm_map_t submap = VME_SUBMAP(entry);
22788 assert(submap != NULL);
22789 vm_map_lock(submap);
22790 vm_map_unlock(map);
22791 map = submap;
22792 continue;
22793 }
22794 object = VME_OBJECT(entry);
22795 if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
22796 result = TRUE;
22797 break;
22798 }
22799 result = FALSE;
22800 break;
22801 }
22802
22803 vm_map_unlock(map);
22804 return result;
22805 }
22806
22807
22808 #if MACH_ASSERT
22809
22810 extern int pmap_ledgers_panic;
22811 extern int pmap_ledgers_panic_leeway;
22812
/*
 * LEDGER_DRIFT:
 *	declares the drift counters kept for one ledger: how many pmaps were
 *	observed over (positive balance) or under (negative balance), the
 *	accumulated totals, and the worst single balance seen each way.
 */
#define LEDGER_DRIFT(__LEDGER)                  \
	int __LEDGER##_over;                    \
	ledger_amount_t __LEDGER##_over_total;  \
	ledger_amount_t __LEDGER##_over_max;    \
	int __LEDGER##_under;                   \
	ledger_amount_t __LEDGER##_under_total; \
	ledger_amount_t __LEDGER##_under_max
22820
/*
 * Global diagnostic statistics accumulated by vm_map_pmap_check_ledgers():
 * one LEDGER_DRIFT counter set per tracked task ledger, plus the number of
 * pmaps checked.  MACH_ASSERT builds only.
 */
struct {
	uint64_t num_pmaps_checked;

	LEDGER_DRIFT(phys_footprint);
	LEDGER_DRIFT(internal);
	LEDGER_DRIFT(internal_compressed);
	LEDGER_DRIFT(external);
	LEDGER_DRIFT(reusable);
	LEDGER_DRIFT(iokit_mapped);
	LEDGER_DRIFT(alternate_accounting);
	LEDGER_DRIFT(alternate_accounting_compressed);
	LEDGER_DRIFT(page_table);
	LEDGER_DRIFT(purgeable_volatile);
	LEDGER_DRIFT(purgeable_nonvolatile);
	LEDGER_DRIFT(purgeable_volatile_compressed);
	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
	LEDGER_DRIFT(tagged_nofootprint);
	LEDGER_DRIFT(tagged_footprint);
	LEDGER_DRIFT(tagged_nofootprint_compressed);
	LEDGER_DRIFT(tagged_footprint_compressed);
	LEDGER_DRIFT(network_volatile);
	LEDGER_DRIFT(network_nonvolatile);
	LEDGER_DRIFT(network_volatile_compressed);
	LEDGER_DRIFT(network_nonvolatile_compressed);
	LEDGER_DRIFT(media_nofootprint);
	LEDGER_DRIFT(media_footprint);
	LEDGER_DRIFT(media_nofootprint_compressed);
	LEDGER_DRIFT(media_footprint_compressed);
	LEDGER_DRIFT(graphics_nofootprint);
	LEDGER_DRIFT(graphics_footprint);
	LEDGER_DRIFT(graphics_nofootprint_compressed);
	LEDGER_DRIFT(graphics_footprint_compressed);
	LEDGER_DRIFT(neural_nofootprint);
	LEDGER_DRIFT(neural_footprint);
	LEDGER_DRIFT(neural_nofootprint_compressed);
	LEDGER_DRIFT(neural_footprint_compressed);
} pmap_ledgers_drift;
22858
/*
 * vm_map_pmap_check_ledgers:
 *	checks each pmap-maintained task ledger for a non-zero residual
 *	balance, logs every imbalance, folds it into the global
 *	pmap_ledgers_drift statistics, and panics (or just prints, depending
 *	on the pmap_ledgers_panic boot-arg) if any ledger warrants it.
 *	The panic message references pmap_destroy(), so this is expected to
 *	run when a pmap is being torn down.
 */
void
vm_map_pmap_check_ledgers(
	pmap_t pmap,
	ledger_t ledger,
	int pid,
	char *procname)
{
	ledger_amount_t bal;
	boolean_t do_panic;

	do_panic = FALSE;

	pmap_ledgers_drift.num_pmaps_checked++;

/*
 * Check one ledger: read its balance, decide whether it should trigger a
 * panic (either the ledger itself is flagged panic-on-negative, or the
 * balance exceeds the configured leeway), log it, and update the drift
 * over/under counters and maxima.
 */
#define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
MACRO_BEGIN                                                             \
	int panic_on_negative = TRUE;                                   \
	ledger_get_balance(ledger,                                      \
	    task_ledgers.__LEDGER,                                      \
	    &bal);                                                      \
	ledger_get_panic_on_negative(ledger,                            \
	    task_ledgers.__LEDGER,                                      \
	    &panic_on_negative);                                        \
	if (bal != 0) {                                                 \
		if (panic_on_negative ||                                \
		    (pmap_ledgers_panic &&                              \
		    pmap_ledgers_panic_leeway > 0 &&                    \
		    (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||   \
		    bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
			do_panic = TRUE;                                \
		}                                                       \
		printf("LEDGER BALANCE proc %d (%s) "                   \
		    "\"%s\" = %lld\n",                                  \
		    pid, procname, #__LEDGER, bal);                     \
		if (bal > 0) {                                          \
			pmap_ledgers_drift.__LEDGER##_over++;           \
			pmap_ledgers_drift.__LEDGER##_over_total += bal; \
			if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
				pmap_ledgers_drift.__LEDGER##_over_max = bal; \
			}                                               \
		} else if (bal < 0) {                                   \
			pmap_ledgers_drift.__LEDGER##_under++;          \
			pmap_ledgers_drift.__LEDGER##_under_total += bal; \
			if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
				pmap_ledgers_drift.__LEDGER##_under_max = bal; \
			}                                               \
		}                                                       \
	}                                                               \
MACRO_END

	/* one check per ledger tracked in pmap_ledgers_drift */
	LEDGER_CHECK_BALANCE(phys_footprint);
	LEDGER_CHECK_BALANCE(internal);
	LEDGER_CHECK_BALANCE(internal_compressed);
	LEDGER_CHECK_BALANCE(external);
	LEDGER_CHECK_BALANCE(reusable);
	LEDGER_CHECK_BALANCE(iokit_mapped);
	LEDGER_CHECK_BALANCE(alternate_accounting);
	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
	LEDGER_CHECK_BALANCE(page_table);
	LEDGER_CHECK_BALANCE(purgeable_volatile);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(tagged_nofootprint);
	LEDGER_CHECK_BALANCE(tagged_footprint);
	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
	LEDGER_CHECK_BALANCE(network_volatile);
	LEDGER_CHECK_BALANCE(network_nonvolatile);
	LEDGER_CHECK_BALANCE(network_volatile_compressed);
	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(media_nofootprint);
	LEDGER_CHECK_BALANCE(media_footprint);
	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(media_footprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_nofootprint);
	LEDGER_CHECK_BALANCE(graphics_footprint);
	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
	LEDGER_CHECK_BALANCE(neural_nofootprint);
	LEDGER_CHECK_BALANCE(neural_footprint);
	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(neural_footprint_compressed);

	if (do_panic) {
		if (pmap_ledgers_panic) {
			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
			    pmap, pid, procname);
		} else {
			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
			    pmap, pid, procname);
		}
	}
}
22953
22954 void
vm_map_pmap_set_process(vm_map_t map,int pid,char * procname)22955 vm_map_pmap_set_process(
22956 vm_map_t map,
22957 int pid,
22958 char *procname)
22959 {
22960 pmap_set_process(vm_map_pmap(map), pid, procname);
22961 }
22962
22963 #endif /* MACH_ASSERT */
22964