1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_map.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * Virtual memory mapping module.
64 */
65
66 #include <mach/vm_types.h>
67 #include <mach_assert.h>
68
69 #include <vm/vm_options.h>
70
71 #include <libkern/OSAtomic.h>
72
73 #include <mach/kern_return.h>
74 #include <mach/port.h>
75 #include <mach/vm_attributes.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_behavior.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/memory_object.h>
80 #include <mach/mach_vm.h>
81 #include <machine/cpu_capabilities.h>
82 #include <mach/sdt.h>
83
84 #include <kern/assert.h>
85 #include <kern/backtrace.h>
86 #include <kern/counter.h>
87 #include <kern/exc_guard.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90
91 #include <vm/cpm.h>
92 #include <vm/vm_compressor.h>
93 #include <vm/vm_compressor_pager.h>
94 #include <vm/vm_init.h>
95 #include <vm/vm_fault.h>
96 #include <vm/vm_map_internal.h>
97 #include <vm/vm_object.h>
98 #include <vm/vm_page.h>
99 #include <vm/vm_pageout.h>
100 #include <vm/pmap.h>
101 #include <vm/vm_kern.h>
102 #include <ipc/ipc_port.h>
103 #include <kern/sched_prim.h>
104 #include <kern/misc_protos.h>
105
106 #include <mach/vm_map_server.h>
107 #include <mach/mach_host_server.h>
108 #include <vm/vm_memtag.h>
109 #include <vm/vm_protos.h>
110 #include <vm/vm_purgeable_internal.h>
111 #include <vm/vm_reclaim_internal.h>
112
113 #include <vm/vm_protos.h>
114 #include <vm/vm_shared_region.h>
115 #include <vm/vm_map_store.h>
116
117 #include <san/kasan.h>
118
119 #include <sys/resource.h>
120 #include <sys/random.h>
121 #include <sys/codesign.h>
122 #include <sys/code_signing.h>
123 #include <sys/mman.h>
124 #include <sys/reboot.h>
125 #include <sys/kdebug_triage.h>
126
127 #include <libkern/section_keywords.h>
128
129 #if DEVELOPMENT || DEBUG
130 extern int proc_selfcsflags(void);
131 int vm_log_xnu_user_debug = 0;
132 int panic_on_unsigned_execute = 0;
133 int panic_on_mlock_failure = 0;
134 #endif /* DEVELOPMENT || DEBUG */
135
136 #if MACH_ASSERT
137 int debug4k_filter = 0;
138 char debug4k_proc_name[1024] = "";
139 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
140 int debug4k_panic_on_misaligned_sharing = 0;
141 const char *debug4k_category_name[] = {
142 "error", /* 0 */
143 "life", /* 1 */
144 "load", /* 2 */
145 "fault", /* 3 */
146 "copy", /* 4 */
147 "share", /* 5 */
148 "adjust", /* 6 */
149 "pmap", /* 7 */
150 "mementry", /* 8 */
151 "iokit", /* 9 */
152 "upl", /* 10 */
153 "exc", /* 11 */
154 "vfs" /* 12 */
155 };
156 #endif /* MACH_ASSERT */
157 int debug4k_no_cow_copyin = 0;
158
159
160 #if __arm64__
161 extern const int fourk_binary_compatibility_unsafe;
162 extern const int fourk_binary_compatibility_allow_wx;
163 #endif /* __arm64__ */
164 extern void qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *));
165 extern int proc_selfpid(void);
166 extern char *proc_name_address(void *p);
167 extern char *proc_best_name(struct proc *p);
168
169 #if VM_MAP_DEBUG_APPLE_PROTECT
170 int vm_map_debug_apple_protect = 0;
171 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
172 #if VM_MAP_DEBUG_FOURK
173 int vm_map_debug_fourk = 0;
174 #endif /* VM_MAP_DEBUG_FOURK */
175
176 #if DEBUG || DEVELOPMENT
177 static TUNABLE(bool, vm_map_executable_immutable,
178 "vm_map_executable_immutable", true);
179 #else
180 #define vm_map_executable_immutable true
181 #endif
182
183 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
184
185 extern u_int32_t random(void); /* from <libkern/libkern.h> */
186 /* Internal prototypes
187 */
188
189 typedef struct vm_map_zap {
190 vm_map_entry_t vmz_head;
191 vm_map_entry_t *vmz_tail;
192 } *vm_map_zap_t;
193
194 #define VM_MAP_ZAP_DECLARE(zap) \
195 struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
196
197 static vm_map_entry_t vm_map_entry_insert(
198 vm_map_t map,
199 vm_map_entry_t insp_entry,
200 vm_map_offset_t start,
201 vm_map_offset_t end,
202 vm_object_t object,
203 vm_object_offset_t offset,
204 vm_map_kernel_flags_t vmk_flags,
205 boolean_t needs_copy,
206 vm_prot_t cur_protection,
207 vm_prot_t max_protection,
208 vm_inherit_t inheritance,
209 boolean_t clear_map_aligned);
210
211 static void vm_map_simplify_range(
212 vm_map_t map,
213 vm_map_offset_t start,
214 vm_map_offset_t end); /* forward */
215
216 static boolean_t vm_map_range_check(
217 vm_map_t map,
218 vm_map_offset_t start,
219 vm_map_offset_t end,
220 vm_map_entry_t *entry);
221
222 static void vm_map_submap_pmap_clean(
223 vm_map_t map,
224 vm_map_offset_t start,
225 vm_map_offset_t end,
226 vm_map_t sub_map,
227 vm_map_offset_t offset);
228
229 static void vm_map_pmap_enter(
230 vm_map_t map,
231 vm_map_offset_t addr,
232 vm_map_offset_t end_addr,
233 vm_object_t object,
234 vm_object_offset_t offset,
235 vm_prot_t protection);
236
237 static void _vm_map_clip_end(
238 struct vm_map_header *map_header,
239 vm_map_entry_t entry,
240 vm_map_offset_t end);
241
242 static void _vm_map_clip_start(
243 struct vm_map_header *map_header,
244 vm_map_entry_t entry,
245 vm_map_offset_t start);
246
247 static kmem_return_t vm_map_delete(
248 vm_map_t map,
249 vm_map_offset_t start,
250 vm_map_offset_t end,
251 vmr_flags_t flags,
252 kmem_guard_t guard,
253 vm_map_zap_t zap);
254
255 static void vm_map_copy_insert(
256 vm_map_t map,
257 vm_map_entry_t after_where,
258 vm_map_copy_t copy);
259
260 static kern_return_t vm_map_copy_overwrite_unaligned(
261 vm_map_t dst_map,
262 vm_map_entry_t entry,
263 vm_map_copy_t copy,
264 vm_map_address_t start,
265 boolean_t discard_on_success);
266
267 static kern_return_t vm_map_copy_overwrite_aligned(
268 vm_map_t dst_map,
269 vm_map_entry_t tmp_entry,
270 vm_map_copy_t copy,
271 vm_map_offset_t start,
272 pmap_t pmap);
273
274 static kern_return_t vm_map_copyin_kernel_buffer(
275 vm_map_t src_map,
276 vm_map_address_t src_addr,
277 vm_map_size_t len,
278 boolean_t src_destroy,
279 vm_map_copy_t *copy_result); /* OUT */
280
281 static kern_return_t vm_map_copyout_kernel_buffer(
282 vm_map_t map,
283 vm_map_address_t *addr, /* IN/OUT */
284 vm_map_copy_t copy,
285 vm_map_size_t copy_size,
286 boolean_t overwrite,
287 boolean_t consume_on_success);
288
289 static void vm_map_fork_share(
290 vm_map_t old_map,
291 vm_map_entry_t old_entry,
292 vm_map_t new_map);
293
294 static boolean_t vm_map_fork_copy(
295 vm_map_t old_map,
296 vm_map_entry_t *old_entry_p,
297 vm_map_t new_map,
298 int vm_map_copyin_flags);
299
300 static kern_return_t vm_map_wire_nested(
301 vm_map_t map,
302 vm_map_offset_t start,
303 vm_map_offset_t end,
304 vm_prot_t caller_prot,
305 vm_tag_t tag,
306 boolean_t user_wire,
307 pmap_t map_pmap,
308 vm_map_offset_t pmap_addr,
309 ppnum_t *physpage_p);
310
311 static kern_return_t vm_map_unwire_nested(
312 vm_map_t map,
313 vm_map_offset_t start,
314 vm_map_offset_t end,
315 boolean_t user_wire,
316 pmap_t map_pmap,
317 vm_map_offset_t pmap_addr);
318
319 static kern_return_t vm_map_overwrite_submap_recurse(
320 vm_map_t dst_map,
321 vm_map_offset_t dst_addr,
322 vm_map_size_t dst_size);
323
324 static kern_return_t vm_map_copy_overwrite_nested(
325 vm_map_t dst_map,
326 vm_map_offset_t dst_addr,
327 vm_map_copy_t copy,
328 boolean_t interruptible,
329 pmap_t pmap,
330 boolean_t discard_on_success);
331
332 static kern_return_t vm_map_remap_extract(
333 vm_map_t map,
334 vm_map_offset_t addr,
335 vm_map_size_t size,
336 boolean_t copy,
337 vm_map_copy_t map_copy,
338 vm_prot_t *cur_protection,
339 vm_prot_t *max_protection,
340 vm_inherit_t inheritance,
341 vm_map_kernel_flags_t vmk_flags);
342
343 static kern_return_t vm_map_remap_range_allocate(
344 vm_map_t map,
345 vm_map_address_t *address,
346 vm_map_size_t size,
347 vm_map_offset_t mask,
348 vm_map_kernel_flags_t vmk_flags,
349 vm_map_entry_t *map_entry,
350 vm_map_zap_t zap_list);
351
352 static void vm_map_region_look_for_page(
353 vm_map_t map,
354 vm_map_offset_t va,
355 vm_object_t object,
356 vm_object_offset_t offset,
357 int max_refcnt,
358 unsigned short depth,
359 vm_region_extended_info_t extended,
360 mach_msg_type_number_t count);
361
362 static int vm_map_region_count_obj_refs(
363 vm_map_entry_t entry,
364 vm_object_t object);
365
366
367 static kern_return_t vm_map_willneed(
368 vm_map_t map,
369 vm_map_offset_t start,
370 vm_map_offset_t end);
371
372 static kern_return_t vm_map_reuse_pages(
373 vm_map_t map,
374 vm_map_offset_t start,
375 vm_map_offset_t end);
376
377 static kern_return_t vm_map_reusable_pages(
378 vm_map_t map,
379 vm_map_offset_t start,
380 vm_map_offset_t end);
381
382 static kern_return_t vm_map_can_reuse(
383 vm_map_t map,
384 vm_map_offset_t start,
385 vm_map_offset_t end);
386
387 static kern_return_t vm_map_random_address_for_size(
388 vm_map_t map,
389 vm_map_offset_t *address,
390 vm_map_size_t size,
391 vm_map_kernel_flags_t vmk_flags);
392
393
394 #if CONFIG_MAP_RANGES
395
396 static vm_map_range_id_t vm_map_user_range_resolve(
397 vm_map_t map,
398 mach_vm_address_t addr,
399 mach_vm_address_t size,
400 mach_vm_range_t range);
401
402 #endif /* CONFIG_MAP_RANGES */
403 #if MACH_ASSERT
404 static kern_return_t vm_map_pageout(
405 vm_map_t map,
406 vm_map_offset_t start,
407 vm_map_offset_t end);
408 #endif /* MACH_ASSERT */
409
410 kern_return_t vm_map_corpse_footprint_collect(
411 vm_map_t old_map,
412 vm_map_entry_t old_entry,
413 vm_map_t new_map);
414 void vm_map_corpse_footprint_collect_done(
415 vm_map_t new_map);
416 void vm_map_corpse_footprint_destroy(
417 vm_map_t map);
418 kern_return_t vm_map_corpse_footprint_query_page_info(
419 vm_map_t map,
420 vm_map_offset_t va,
421 int *disposition_p);
422 void vm_map_footprint_query_page_info(
423 vm_map_t map,
424 vm_map_entry_t map_entry,
425 vm_map_offset_t curr_s_offset,
426 int *disposition_p);
427
428 #if CONFIG_MAP_RANGES
429 static void vm_map_range_map_init(void);
430 #endif /* CONFIG_MAP_RANGES */
431
432 pid_t find_largest_process_vm_map_entries(void);
433
434 extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code,
435 mach_exception_data_type_t subcode);
436
437 /*
438 * Macros to copy a vm_map_entry. We must be careful to correctly
439 * manage the wired page count. vm_map_entry_copy() creates a new
440 * map entry to the same memory - the wired count in the new entry
441 * must be set to zero. vm_map_entry_copy_full() creates a new
442 * entry that is identical to the old entry. This preserves the
443 * wire count; it's used for map splitting and zone changing in
444 * vm_map_copyout.
445 */
446
/*
 * vm_map_entry_copy_csm_assoc:
 *	Reset code-signing-monitor-related state on a freshly copied map
 *	entry (NEW), and clear its "vme_xnu_user_debug" flag.  On
 *	DEVELOPMENT/DEBUG kernels the reset of vme_xnu_user_debug is
 *	logged when "vm_log_xnu_user_debug" is enabled.
 */
static inline void
vm_map_entry_copy_csm_assoc(
	vm_map_t map __unused,
	vm_map_entry_t new __unused,
	vm_map_entry_t old __unused)
{
#if CODE_SIGNING_MONITOR
	/* when code signing monitor is enabled, we want to reset on copy */
	new->csm_associated = FALSE;
#else
	/* when code signing monitor is not enabled, assert as a sanity check */
	assert(new->csm_associated == FALSE);
#endif
#if DEVELOPMENT || DEBUG
	/* log the reset so debugging sessions can trace lost debug flags */
	if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
		printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] resetting vme_xnu_user_debug\n",
		    proc_selfpid(),
		    (get_bsdtask_info(current_task())
		    ? proc_name_address(get_bsdtask_info(current_task()))
		    : "?"),
		    __FUNCTION__, __LINE__,
		    map, new, new->vme_start, new->vme_end);
	}
#endif /* DEVELOPMENT || DEBUG */
	new->vme_xnu_user_debug = FALSE;
}
473
474 /*
475 * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
476 * But for security reasons on some platforms, we don't want the
477 * new mapping to be "used for jit", so we reset the flag here.
478 */
479 static inline void
vm_map_entry_copy_code_signing(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old __unused)480 vm_map_entry_copy_code_signing(
481 vm_map_t map,
482 vm_map_entry_t new,
483 vm_map_entry_t old __unused)
484 {
485 if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
486 assert(new->used_for_jit == old->used_for_jit);
487 } else {
488 new->used_for_jit = FALSE;
489 }
490 }
491
/*
 * vm_map_entry_copy_full:
 *	Make NEW an identical copy of OLD (including the wired count)
 *	via a bitwise structure copy.  Before the copy, the refcounted
 *	backtrace records are fixed up: NEW's current refs are dropped
 *	and OLD's are retained, so that after "*new = *old" both entries
 *	hold valid references to the same backtrace records.
 */
static inline void
vm_map_entry_copy_full(
	vm_map_entry_t new,
	vm_map_entry_t old)
{
#if MAP_ENTRY_CREATION_DEBUG
	/* drop NEW's creation backtrace ref; take one on OLD's (copied below) */
	btref_put(new->vme_creation_bt);
	btref_retain(old->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	/* same dance for the insertion backtrace record */
	btref_put(new->vme_insertion_bt);
	btref_retain(old->vme_insertion_bt);
#endif
#if VM_BTLOG_TAGS
	/* Discard the btref that might be in the new entry */
	if (new->vme_kernel_object) {
		btref_put(new->vme_tag_btref);
	}
	/* Retain the btref in the old entry to account for its copy */
	if (old->vme_kernel_object) {
		btref_retain(old->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */
	*new = *old;
}
517
518 static inline void
vm_map_entry_copy(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old)519 vm_map_entry_copy(
520 vm_map_t map,
521 vm_map_entry_t new,
522 vm_map_entry_t old)
523 {
524 vm_map_entry_copy_full(new, old);
525
526 new->is_shared = FALSE;
527 new->needs_wakeup = FALSE;
528 new->in_transition = FALSE;
529 new->wired_count = 0;
530 new->user_wired_count = 0;
531 new->vme_permanent = FALSE;
532 vm_map_entry_copy_code_signing(map, new, old);
533 vm_map_entry_copy_csm_assoc(map, new, old);
534 if (new->iokit_acct) {
535 assertf(!new->use_pmap, "old %p new %p\n", old, new);
536 new->iokit_acct = FALSE;
537 new->use_pmap = TRUE;
538 }
539 new->vme_resilient_codesign = FALSE;
540 new->vme_resilient_media = FALSE;
541 new->vme_atomic = FALSE;
542 new->vme_no_copy_on_read = FALSE;
543 }
544
545 /*
546 * Normal lock_read_to_write() returns FALSE/0 on failure.
547 * These functions evaluate to zero on success and non-zero value on failure.
548 */
549 __attribute__((always_inline))
550 int
vm_map_lock_read_to_write(vm_map_t map)551 vm_map_lock_read_to_write(vm_map_t map)
552 {
553 if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
554 DTRACE_VM(vm_map_lock_upgrade);
555 return 0;
556 }
557 return 1;
558 }
559
560 __attribute__((always_inline))
561 boolean_t
vm_map_try_lock(vm_map_t map)562 vm_map_try_lock(vm_map_t map)
563 {
564 if (lck_rw_try_lock_exclusive(&(map)->lock)) {
565 DTRACE_VM(vm_map_lock_w);
566 return TRUE;
567 }
568 return FALSE;
569 }
570
571 __attribute__((always_inline))
572 boolean_t
vm_map_try_lock_read(vm_map_t map)573 vm_map_try_lock_read(vm_map_t map)
574 {
575 if (lck_rw_try_lock_shared(&(map)->lock)) {
576 DTRACE_VM(vm_map_lock_r);
577 return TRUE;
578 }
579 return FALSE;
580 }
581
582 /*!
583 * @function kdp_vm_map_is_acquired_exclusive
584 *
585 * @abstract
586 * Checks if vm map is acquired exclusive.
587 *
588 * @discussion
589 * NOT SAFE: To be used only by kernel debugger.
590 *
591 * @param map map to check
592 *
593 * @returns TRUE if the map is acquired exclusively.
594 */
595 boolean_t
kdp_vm_map_is_acquired_exclusive(vm_map_t map)596 kdp_vm_map_is_acquired_exclusive(vm_map_t map)
597 {
598 return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
599 }
600
601 /*
602 * Routines to get the page size the caller should
603 * use while inspecting the target address space.
604 * Use the "_safely" variant if the caller is dealing with a user-provided
605 * array whose size depends on the page size, to avoid any overflow or
606 * underflow of a user-allocated buffer.
607 */
608 int
vm_self_region_page_shift_safely(vm_map_t target_map)609 vm_self_region_page_shift_safely(
610 vm_map_t target_map)
611 {
612 int effective_page_shift = 0;
613
614 if (PAGE_SIZE == (4096)) {
615 /* x86_64 and 4k watches: always use 4k */
616 return PAGE_SHIFT;
617 }
618 /* did caller provide an explicit page size for this thread to use? */
619 effective_page_shift = thread_self_region_page_shift();
620 if (effective_page_shift) {
621 /* use the explicitly-provided page size */
622 return effective_page_shift;
623 }
624 /* no explicit page size: use the caller's page size... */
625 effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
626 if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
627 /* page size match: safe to use */
628 return effective_page_shift;
629 }
630 /* page size mismatch */
631 return -1;
632 }
633 int
vm_self_region_page_shift(vm_map_t target_map)634 vm_self_region_page_shift(
635 vm_map_t target_map)
636 {
637 int effective_page_shift;
638
639 effective_page_shift = vm_self_region_page_shift_safely(target_map);
640 if (effective_page_shift == -1) {
641 /* no safe value but OK to guess for caller */
642 effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
643 VM_MAP_PAGE_SHIFT(target_map));
644 }
645 return effective_page_shift;
646 }
647
648
649 /*
650 * Decide if we want to allow processes to execute from their data or stack areas.
651 * override_nx() returns true if we do. Data/stack execution can be enabled independently
652 * for 32 and 64 bit processes. Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
653 * or allow_stack_exec to enable data execution for that type of data area for that particular
654 * ABI (or both by or'ing the flags together). These are initialized in the architecture
655 * specific pmap files since the default behavior varies according to architecture. The
656 * main reason it varies is because of the need to provide binary compatibility with old
657 * applications that were written before these restrictions came into being. In the old
658 * days, an app could execute anything it could read, but this has slowly been tightened
659 * up over time. The default behavior is:
660 *
661 * 32-bit PPC apps may execute from both stack and data areas
 * 32-bit Intel apps may execute from data areas but not stack
663 * 64-bit PPC/Intel apps may not execute from either data or stack
664 *
665 * An application on any architecture may override these defaults by explicitly
666 * adding PROT_EXEC permission to the page in question with the mprotect(2)
667 * system call. This code here just determines what happens when an app tries to
668 * execute from a page that lacks execute permission.
669 *
670 * Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
671 * default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
672 * a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
673 * execution from data areas for a particular binary even if the arch normally permits it. As
674 * a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
675 * to support some complicated use cases, notably browsers with out-of-process plugins that
676 * are not all NX-safe.
677 */
678
679 extern int allow_data_exec, allow_stack_exec;
680
681 int
override_nx(vm_map_t map,uint32_t user_tag)682 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
683 {
684 int current_abi;
685
686 if (map->pmap == kernel_pmap) {
687 return FALSE;
688 }
689
690 /*
691 * Determine if the app is running in 32 or 64 bit mode.
692 */
693
694 if (vm_map_is_64bit(map)) {
695 current_abi = VM_ABI_64;
696 } else {
697 current_abi = VM_ABI_32;
698 }
699
700 /*
701 * Determine if we should allow the execution based on whether it's a
702 * stack or data area and the current architecture.
703 */
704
705 if (user_tag == VM_MEMORY_STACK) {
706 return allow_stack_exec & current_abi;
707 }
708
709 return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
710 }
711
712
713 /*
714 * Virtual memory maps provide for the mapping, protection,
715 * and sharing of virtual memory objects. In addition,
716 * this module provides for an efficient virtual copy of
717 * memory from one map to another.
718 *
719 * Synchronization is required prior to most operations.
720 *
721 * Maps consist of an ordered doubly-linked list of simple
722 * entries; a single hint is used to speed up lookups.
723 *
724 * Sharing maps have been deleted from this version of Mach.
725 * All shared objects are now mapped directly into the respective
726 * maps. This requires a change in the copy on write strategy;
727 * the asymmetric (delayed) strategy is used for shared temporary
728 * objects instead of the symmetric (shadow) strategy. All maps
729 * are now "top level" maps (either task map, kernel map or submap
730 * of the kernel map).
731 *
 * Since portions of maps are specified by start/end addresses,
733 * which may not align with existing map entries, all
734 * routines merely "clip" entries to these start/end values.
735 * [That is, an entry is split into two, bordering at a
736 * start or end value.] Note that these clippings may not
737 * always be necessary (as the two resulting entries are then
738 * not changed); however, the clipping is done for convenience.
739 * No attempt is currently made to "glue back together" two
740 * abutting entries.
741 *
742 * The symmetric (shadow) copy strategy implements virtual copy
743 * by copying VM object references from one map to
744 * another, and then marking both regions as copy-on-write.
745 * It is important to note that only one writeable reference
746 * to a VM object region exists in any map when this strategy
747 * is used -- this means that shadow object creation can be
 * delayed until a write operation occurs. The asymmetric (delayed)
749 * strategy allows multiple maps to have writeable references to
750 * the same region of a vm object, and hence cannot delay creating
751 * its copy objects. See vm_object_copy_quickly() in vm_object.c.
752 * Copying of permanent objects is completely different; see
753 * vm_object_copy_strategically() in vm_object.c.
754 */
755
756 ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);
757
758 #define VM_MAP_ZONE_NAME "maps"
759 #define VM_MAP_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
760
761 #define VM_MAP_ENTRY_ZONE_NAME "VM map entries"
762 #define VM_MAP_ENTRY_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
763
764 #define VM_MAP_HOLES_ZONE_NAME "VM map holes"
765 #define VM_MAP_HOLES_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
766
767 /*
768 * Asserts that a vm_map_copy object is coming from the
769 * vm_map_copy_zone to ensure that it isn't a fake constructed
770 * anywhere else.
771 */
void
vm_map_copy_require(struct vm_map_copy *copy)
{
	/* panics if "copy" was not allocated from the vm_map_copy zone */
	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
}
777
778 /*
779 * vm_map_require:
780 *
781 * Ensures that the argument is memory allocated from the genuine
782 * vm map zone. (See zone_id_require_allow_foreign).
783 */
void
vm_map_require(vm_map_t map)
{
	/* panics if "map" was not allocated from the vm_map zone */
	zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
}
789
790 #define VM_MAP_EARLY_COUNT_MAX 16
791 static __startup_data vm_offset_t map_data;
792 static __startup_data vm_size_t map_data_size;
793 static __startup_data vm_offset_t kentry_data;
794 static __startup_data vm_size_t kentry_data_size;
795 static __startup_data vm_offset_t map_holes_data;
796 static __startup_data vm_size_t map_holes_data_size;
797 static __startup_data vm_map_t *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
798 static __startup_data uint32_t early_map_count;
799
800 #if XNU_TARGET_OS_OSX
801 #define NO_COALESCE_LIMIT ((1024 * 128) - 1)
802 #else /* XNU_TARGET_OS_OSX */
803 #define NO_COALESCE_LIMIT 0
804 #endif /* XNU_TARGET_OS_OSX */
805
806 /* Skip acquiring locks if we're in the midst of a kernel core dump */
807 unsigned int not_in_kdp = 1;
808
809 unsigned int vm_map_set_cache_attr_count = 0;
810
811 kern_return_t
vm_map_set_cache_attr(vm_map_t map,vm_map_offset_t va)812 vm_map_set_cache_attr(
813 vm_map_t map,
814 vm_map_offset_t va)
815 {
816 vm_map_entry_t map_entry;
817 vm_object_t object;
818 kern_return_t kr = KERN_SUCCESS;
819
820 vm_map_lock_read(map);
821
822 if (!vm_map_lookup_entry(map, va, &map_entry) ||
823 map_entry->is_sub_map) {
824 /*
825 * that memory is not properly mapped
826 */
827 kr = KERN_INVALID_ARGUMENT;
828 goto done;
829 }
830 object = VME_OBJECT(map_entry);
831
832 if (object == VM_OBJECT_NULL) {
833 /*
834 * there should be a VM object here at this point
835 */
836 kr = KERN_INVALID_ARGUMENT;
837 goto done;
838 }
839 vm_object_lock(object);
840 object->set_cache_attr = TRUE;
841 vm_object_unlock(object);
842
843 vm_map_set_cache_attr_count++;
844 done:
845 vm_map_unlock_read(map);
846
847 return kr;
848 }
849
850
851 #if CONFIG_CODE_DECRYPTION
852 /*
853 * vm_map_apple_protected:
854 * This remaps the requested part of the object with an object backed by
855 * the decrypting pager.
856 * crypt_info contains entry points and session data for the crypt module.
857 * The crypt_info block will be copied by vm_map_apple_protected. The data structures
858 * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
859 */
860 kern_return_t
vm_map_apple_protected(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_object_offset_t crypto_backing_offset,struct pager_crypt_info * crypt_info,uint32_t cryptid)861 vm_map_apple_protected(
862 vm_map_t map,
863 vm_map_offset_t start,
864 vm_map_offset_t end,
865 vm_object_offset_t crypto_backing_offset,
866 struct pager_crypt_info *crypt_info,
867 uint32_t cryptid)
868 {
869 boolean_t map_locked;
870 kern_return_t kr;
871 vm_map_entry_t map_entry;
872 struct vm_map_entry tmp_entry;
873 memory_object_t unprotected_mem_obj;
874 vm_object_t protected_object;
875 vm_map_offset_t map_addr;
876 vm_map_offset_t start_aligned, end_aligned;
877 vm_object_offset_t crypto_start, crypto_end;
878 boolean_t cache_pager;
879
880 map_locked = FALSE;
881 unprotected_mem_obj = MEMORY_OBJECT_NULL;
882
883 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
884 return KERN_INVALID_ADDRESS;
885 }
886 start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
887 end_aligned = vm_map_round_page(end, PAGE_MASK_64);
888 start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
889 end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));
890
891 #if __arm64__
892 /*
893 * "start" and "end" might be 4K-aligned but not 16K-aligned,
894 * so we might have to loop and establish up to 3 mappings:
895 *
896 * + the first 16K-page, which might overlap with the previous
897 * 4K-aligned mapping,
898 * + the center,
899 * + the last 16K-page, which might overlap with the next
900 * 4K-aligned mapping.
901 * Each of these mapping might be backed by a vnode pager (if
902 * properly page-aligned) or a "fourk_pager", itself backed by a
903 * vnode pager (if 4K-aligned but not page-aligned).
904 */
905 #endif /* __arm64__ */
906
907 map_addr = start_aligned;
908 for (map_addr = start_aligned;
909 map_addr < end;
910 map_addr = tmp_entry.vme_end) {
911 vm_map_lock(map);
912 map_locked = TRUE;
913
914 /* lookup the protected VM object */
915 if (!vm_map_lookup_entry(map,
916 map_addr,
917 &map_entry) ||
918 map_entry->is_sub_map ||
919 VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
920 /* that memory is not properly mapped */
921 kr = KERN_INVALID_ARGUMENT;
922 goto done;
923 }
924
925 /* ensure mapped memory is mapped as executable except
926 * except for model decryption flow */
927 if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
928 !(map_entry->protection & VM_PROT_EXECUTE)) {
929 kr = KERN_INVALID_ARGUMENT;
930 goto done;
931 }
932
933 /* get the protected object to be decrypted */
934 protected_object = VME_OBJECT(map_entry);
935 if (protected_object == VM_OBJECT_NULL) {
936 /* there should be a VM object here at this point */
937 kr = KERN_INVALID_ARGUMENT;
938 goto done;
939 }
940 /* ensure protected object stays alive while map is unlocked */
941 vm_object_reference(protected_object);
942
943 /* limit the map entry to the area we want to cover */
944 vm_map_clip_start(map, map_entry, start_aligned);
945 vm_map_clip_end(map, map_entry, end_aligned);
946
947 tmp_entry = *map_entry;
948 map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
949 vm_map_unlock(map);
950 map_locked = FALSE;
951
952 /*
953 * This map entry might be only partially encrypted
954 * (if not fully "page-aligned").
955 */
956 crypto_start = 0;
957 crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
958 if (tmp_entry.vme_start < start) {
959 if (tmp_entry.vme_start != start_aligned) {
960 kr = KERN_INVALID_ADDRESS;
961 }
962 crypto_start += (start - tmp_entry.vme_start);
963 }
964 if (tmp_entry.vme_end > end) {
965 if (tmp_entry.vme_end != end_aligned) {
966 kr = KERN_INVALID_ADDRESS;
967 }
968 crypto_end -= (tmp_entry.vme_end - end);
969 }
970
971 /*
972 * This "extra backing offset" is needed to get the decryption
973 * routine to use the right key. It adjusts for the possibly
974 * relative offset of an interposed "4K" pager...
975 */
976 if (crypto_backing_offset == (vm_object_offset_t) -1) {
977 crypto_backing_offset = VME_OFFSET(&tmp_entry);
978 }
979
980 cache_pager = TRUE;
981 #if XNU_TARGET_OS_OSX
982 if (vm_map_is_alien(map)) {
983 cache_pager = FALSE;
984 }
985 #endif /* XNU_TARGET_OS_OSX */
986
987 /*
988 * Lookup (and create if necessary) the protected memory object
989 * matching that VM object.
990 * If successful, this also grabs a reference on the memory object,
991 * to guarantee that it doesn't go away before we get a chance to map
992 * it.
993 */
994 unprotected_mem_obj = apple_protect_pager_setup(
995 protected_object,
996 VME_OFFSET(&tmp_entry),
997 crypto_backing_offset,
998 crypt_info,
999 crypto_start,
1000 crypto_end,
1001 cache_pager);
1002
1003 /* release extra ref on protected object */
1004 vm_object_deallocate(protected_object);
1005
1006 if (unprotected_mem_obj == NULL) {
1007 kr = KERN_FAILURE;
1008 goto done;
1009 }
1010
1011 /* can overwrite an immutable mapping */
1012 vm_map_kernel_flags_t vmk_flags = {
1013 .vmf_fixed = true,
1014 .vmf_overwrite = true,
1015 .vmkf_overwrite_immutable = true,
1016 };
1017 #if __arm64__
1018 if (tmp_entry.used_for_jit &&
1019 (VM_MAP_PAGE_SHIFT(map) != FOURK_PAGE_SHIFT ||
1020 PAGE_SHIFT != FOURK_PAGE_SHIFT) &&
1021 fourk_binary_compatibility_unsafe &&
1022 fourk_binary_compatibility_allow_wx) {
1023 printf("** FOURK_COMPAT [%d]: "
1024 "allowing write+execute at 0x%llx\n",
1025 proc_selfpid(), tmp_entry.vme_start);
1026 vmk_flags.vmkf_map_jit = TRUE;
1027 }
1028 #endif /* __arm64__ */
1029
1030 /* map this memory object in place of the current one */
1031 map_addr = tmp_entry.vme_start;
1032 kr = vm_map_enter_mem_object(map,
1033 &map_addr,
1034 (tmp_entry.vme_end -
1035 tmp_entry.vme_start),
1036 (mach_vm_offset_t) 0,
1037 vmk_flags,
1038 (ipc_port_t)(uintptr_t) unprotected_mem_obj,
1039 0,
1040 TRUE,
1041 tmp_entry.protection,
1042 tmp_entry.max_protection,
1043 tmp_entry.inheritance);
1044 assertf(kr == KERN_SUCCESS,
1045 "kr = 0x%x\n", kr);
1046 assertf(map_addr == tmp_entry.vme_start,
1047 "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
1048 (uint64_t)map_addr,
1049 (uint64_t) tmp_entry.vme_start,
1050 &tmp_entry);
1051
1052 #if VM_MAP_DEBUG_APPLE_PROTECT
1053 if (vm_map_debug_apple_protect) {
1054 printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
1055 " backing:[object:%p,offset:0x%llx,"
1056 "crypto_backing_offset:0x%llx,"
1057 "crypto_start:0x%llx,crypto_end:0x%llx]\n",
1058 map,
1059 (uint64_t) map_addr,
1060 (uint64_t) (map_addr + (tmp_entry.vme_end -
1061 tmp_entry.vme_start)),
1062 unprotected_mem_obj,
1063 protected_object,
1064 VME_OFFSET(&tmp_entry),
1065 crypto_backing_offset,
1066 crypto_start,
1067 crypto_end);
1068 }
1069 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1070
1071 /*
1072 * Release the reference obtained by
1073 * apple_protect_pager_setup().
1074 * The mapping (if it succeeded) is now holding a reference on
1075 * the memory object.
1076 */
1077 memory_object_deallocate(unprotected_mem_obj);
1078 unprotected_mem_obj = MEMORY_OBJECT_NULL;
1079
1080 /* continue with next map entry */
1081 crypto_backing_offset += (tmp_entry.vme_end -
1082 tmp_entry.vme_start);
1083 crypto_backing_offset -= crypto_start;
1084 }
1085 kr = KERN_SUCCESS;
1086
1087 done:
1088 if (map_locked) {
1089 vm_map_unlock(map);
1090 }
1091 return kr;
1092 }
1093 #endif /* CONFIG_CODE_DECRYPTION */
1094
1095
/* Lock group and attributes shared by all VM map locks in this file. */
LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);

/* Platform defaults for the malloc no-copy-on-write tunables below. */
#if XNU_TARGET_OS_OSX
#define MALLOC_NO_COW_DEFAULT 1
#define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 1
#else /* XNU_TARGET_OS_OSX */
#define MALLOC_NO_COW_DEFAULT 1
#define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 0
#endif /* XNU_TARGET_OS_OSX */
/* Boot-arg tunables controlling copy-on-write behavior of malloc memory. */
TUNABLE(int, malloc_no_cow, "malloc_no_cow", MALLOC_NO_COW_DEFAULT);
TUNABLE(int, malloc_no_cow_except_fork, "malloc_no_cow_except_fork", MALLOC_NO_COW_EXCEPT_FORK_DEFAULT);
/* Bitmask of VM_MEMORY_MALLOC* tags exempted from COW; built in vm_map_init(). */
uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
#if DEBUG
/* Boot-arg gated switch for extra map sanity checking (see vm_map_init). */
int vm_check_map_sanity = 0;
#endif
1113
1114 /*
1115 * vm_map_init:
1116 *
1117 * Initialize the vm_map module. Must be called before
1118 * any other vm_map routines.
1119 *
1120 * Map and entry structures are allocated from zones -- we must
1121 * initialize those zones.
1122 *
1123 * There are three zones of interest:
1124 *
1125 * vm_map_zone: used to allocate maps.
1126 * vm_map_entry_zone: used to allocate map entries.
1127 *
1128 * LP32:
1129 * vm_map_entry_reserved_zone: fallback zone for kernel map entries
1130 *
1131 * The kernel allocates map entries from a special zone that is initially
1132 * "crammed" with memory. It would be difficult (perhaps impossible) for
1133 * the kernel to allocate more memory to a entry zone when it became
1134 * empty since the very act of allocating memory implies the creation
1135 * of a new entry.
1136 */
1137 __startup_func
1138 void
vm_map_init(void)1139 vm_map_init(void)
1140 {
1141
1142 #if MACH_ASSERT
1143 PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1144 sizeof(debug4k_filter));
1145 #endif /* MACH_ASSERT */
1146
1147 zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1148 VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1149
1150 /*
1151 * Don't quarantine because we always need elements available
1152 * Disallow GC on this zone... to aid the GC.
1153 */
1154 zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1155 sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1156 ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1157 z->z_elems_rsv = (uint16_t)(32 *
1158 (ml_early_cpu_max_number() + 1));
1159 });
1160
1161 zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1162 sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1163 ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1164 z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
1165 });
1166
1167 zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1168 ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);
1169
1170 /*
1171 * Add the stolen memory to zones, adjust zone size and stolen counts.
1172 */
1173 zone_cram_early(vm_map_zone, map_data, map_data_size);
1174 zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1175 zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1176 printf("VM boostrap: %d maps, %d entries and %d holes available\n",
1177 zone_count_free(vm_map_zone),
1178 zone_count_free(vm_map_entry_zone),
1179 zone_count_free(vm_map_holes_zone));
1180
1181 /*
1182 * Since these are covered by zones, remove them from stolen page accounting.
1183 */
1184 VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1185
1186 #if VM_MAP_DEBUG_APPLE_PROTECT
1187 PE_parse_boot_argn("vm_map_debug_apple_protect",
1188 &vm_map_debug_apple_protect,
1189 sizeof(vm_map_debug_apple_protect));
1190 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1191 #if VM_MAP_DEBUG_APPLE_FOURK
1192 PE_parse_boot_argn("vm_map_debug_fourk",
1193 &vm_map_debug_fourk,
1194 sizeof(vm_map_debug_fourk));
1195 #endif /* VM_MAP_DEBUG_FOURK */
1196
1197 if (malloc_no_cow) {
1198 vm_memory_malloc_no_cow_mask = 0ULL;
1199 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1200 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1201 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1202 #if XNU_TARGET_OS_OSX
1203 /*
1204 * On macOS, keep copy-on-write for MALLOC_LARGE because
1205 * realloc() may use vm_copy() to transfer the old contents
1206 * to the new location.
1207 */
1208 #else /* XNU_TARGET_OS_OSX */
1209 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1210 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1211 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1212 #endif /* XNU_TARGET_OS_OSX */
1213 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1214 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1215 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1216 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1217 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1218 PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1219 &vm_memory_malloc_no_cow_mask,
1220 sizeof(vm_memory_malloc_no_cow_mask));
1221 }
1222
1223 #if CONFIG_MAP_RANGES
1224 vm_map_range_map_init();
1225 #endif /* CONFIG_MAP_RANGES */
1226
1227 #if DEBUG
1228 PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1229 if (vm_check_map_sanity) {
1230 kprintf("VM sanity checking enabled\n");
1231 } else {
1232 kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1233 }
1234 #endif /* DEBUG */
1235
1236 #if DEVELOPMENT || DEBUG
1237 PE_parse_boot_argn("panic_on_unsigned_execute",
1238 &panic_on_unsigned_execute,
1239 sizeof(panic_on_unsigned_execute));
1240 PE_parse_boot_argn("panic_on_mlock_failure",
1241 &panic_on_mlock_failure,
1242 sizeof(panic_on_mlock_failure));
1243 #endif /* DEVELOPMENT || DEBUG */
1244 }
1245
__startup_func
static void
vm_map_steal_memory(void)
{
	/*
	 * We need to reserve enough memory to support bootstrapping VM maps
	 * and the zone subsystem.
	 *
	 * The VM Maps that need to function before zones can support them
	 * are the ones registered with vm_map_will_allocate_early_map(),
	 * which are:
	 * - the kernel map
	 * - the various submaps used by zones (pgz, meta, ...)
	 *
	 * We also need enough entries and holes to support them
	 * until zone_metadata_init() is called, which is when
	 * the zone allocator becomes capable of expanding dynamically.
	 *
	 * We need:
	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
	 * - To allow for 3-4 entries per map, but the kernel map
	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
	 *   to describe the submaps, so double it (and make it 8x too)
	 * - To allow for holes between entries,
	 *   hence needs the same budget as entries
	 */
	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
	    VM_MAP_EARLY_COUNT_MAX);

	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	/*
	 * Steal a contiguous range of memory so that a simple range check
	 * can validate early addresses being freed/crammed to these
	 * zones
	 */
	map_data = zone_early_mem_init(map_data_size + kentry_data_size +
	    map_holes_data_size);
	/* the three sub-ranges are carved back-to-back out of map_data */
	kentry_data = map_data + map_data_size;
	map_holes_data = kentry_data + kentry_data_size;
}
STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1295
__startup_func
static void
vm_kernel_boostraped(void)
{
	/*
	 * Runs at STARTUP(ZALLOC, RANK_SECOND): the zone allocator is now
	 * fully functional, so caching can be enabled on the early VM zones.
	 *
	 * NOTE(review): "boostraped" is a long-standing typo of
	 * "bootstrapped"; left as-is because the STARTUP registration
	 * below (and possibly code outside this view) references the name.
	 */
	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);

	printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
	    zone_count_free(vm_map_zone),
	    zone_count_free(vm_map_entry_zone),
	    zone_count_free(vm_map_holes_zone));
}
STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1310
1311 void
vm_map_disable_hole_optimization(vm_map_t map)1312 vm_map_disable_hole_optimization(vm_map_t map)
1313 {
1314 vm_map_entry_t head_entry, hole_entry, next_hole_entry;
1315
1316 if (map->holelistenabled) {
1317 head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1318
1319 while (hole_entry != NULL) {
1320 next_hole_entry = hole_entry->vme_next;
1321
1322 hole_entry->vme_next = NULL;
1323 hole_entry->vme_prev = NULL;
1324 zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1325
1326 if (next_hole_entry == head_entry) {
1327 hole_entry = NULL;
1328 } else {
1329 hole_entry = next_hole_entry;
1330 }
1331 }
1332
1333 map->holes_list = NULL;
1334 map->holelistenabled = FALSE;
1335
1336 map->first_free = vm_map_first_entry(map);
1337 SAVE_HINT_HOLE_WRITE(map, NULL);
1338 }
1339 }
1340
1341 boolean_t
vm_kernel_map_is_kernel(vm_map_t map)1342 vm_kernel_map_is_kernel(vm_map_t map)
1343 {
1344 return map->pmap == kernel_pmap;
1345 }
1346
1347 /*
1348 * vm_map_create:
1349 *
1350 * Creates and returns a new empty VM map with
1351 * the given physical map structure, and having
1352 * the given lower and upper address bounds.
1353 */
1354
1355 extern vm_map_t vm_map_create_external(
1356 pmap_t pmap,
1357 vm_map_offset_t min_off,
1358 vm_map_offset_t max_off,
1359 boolean_t pageable);
1360
1361 vm_map_t
vm_map_create_external(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,boolean_t pageable)1362 vm_map_create_external(
1363 pmap_t pmap,
1364 vm_map_offset_t min,
1365 vm_map_offset_t max,
1366 boolean_t pageable)
1367 {
1368 vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1369
1370 if (pageable) {
1371 options |= VM_MAP_CREATE_PAGEABLE;
1372 }
1373 return vm_map_create_options(pmap, min, max, options);
1374 }
1375
1376 __startup_func
1377 void
vm_map_will_allocate_early_map(vm_map_t * owner)1378 vm_map_will_allocate_early_map(vm_map_t *owner)
1379 {
1380 if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1381 panic("VM_MAP_EARLY_COUNT_MAX is too low");
1382 }
1383
1384 early_map_owners[early_map_count++] = owner;
1385 }
1386
1387 __startup_func
1388 void
vm_map_relocate_early_maps(vm_offset_t delta)1389 vm_map_relocate_early_maps(vm_offset_t delta)
1390 {
1391 for (uint32_t i = 0; i < early_map_count; i++) {
1392 vm_address_t addr = (vm_address_t)*early_map_owners[i];
1393
1394 *early_map_owners[i] = (vm_map_t)(addr + delta);
1395 }
1396
1397 early_map_count = ~0u;
1398 }
1399
/*
 * Routine: vm_map_relocate_early_elem
 *
 * Purpose:
 *	Early zone elements are allocated in a temporary part
 *	of the address space.
 *
 *	Once the zones live in their final place, the early
 *	VM maps, map entries and map holes need to be relocated.
 *
 *	It involves rewriting any vm_map_t, vm_map_entry_t or
 *	pointers to vm_map_links.  Other pointers to other types
 *	are fine.
 *
 *	Fortunately, pointers to those types are self-contained
 *	in those zones, _except_ for pointers to VM maps,
 *	which are tracked during early boot and fixed with
 *	vm_map_relocate_early_maps().
 */
__startup_func
void
vm_map_relocate_early_elem(
	uint32_t        zone_id,
	vm_offset_t     new_addr,
	vm_offset_t     delta)
{
	/*
	 * Slide pointer-typed `field` of the element now living at
	 * `new_addr` by `delta`, leaving NULL pointers untouched.
	 */
#define relocate(type_t, field) ({ \
	typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field; \
	if (*__field) { \
	        *__field = (typeof(*__field))((vm_offset_t)*__field + delta); \
	} \
})

	/* Only the three early VM zones may contain relocatable elements. */
	switch (zone_id) {
	case ZONE_ID_VM_MAP:
	case ZONE_ID_VM_MAP_ENTRY:
	case ZONE_ID_VM_MAP_HOLES:
		break;

	default:
		panic("Unexpected zone ID %d", zone_id);
	}

	if (zone_id == ZONE_ID_VM_MAP) {
		relocate(vm_map_t, hdr.links.prev);
		relocate(vm_map_t, hdr.links.next);
		/* early maps all use the kernel pmap; set it unconditionally */
		((vm_map_t)new_addr)->pmap = kernel_pmap;
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_t, hdr.rb_head_store.rbh_root);
#endif /* VM_MAP_STORE_USE_RB */
		relocate(vm_map_t, hint);
		relocate(vm_map_t, hole_hint);
		relocate(vm_map_t, first_free);
		return;
	}

	/* map entries and holes both begin with prev/next links */
	relocate(struct vm_map_links *, prev);
	relocate(struct vm_map_links *, next);

	if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_entry_t, store.entry.rbe_left);
		relocate(vm_map_entry_t, store.entry.rbe_right);
		relocate(vm_map_entry_t, store.entry.rbe_parent);
#endif /* VM_MAP_STORE_USE_RB */
		if (((vm_map_entry_t)new_addr)->is_sub_map) {
			/* no object to relocate because we haven't made any */
			((vm_map_entry_t)new_addr)->vme_submap +=
			    delta >> VME_SUBMAP_SHIFT;
		}
#if MAP_ENTRY_CREATION_DEBUG
		relocate(vm_map_entry_t, vme_creation_maphdr);
#endif /* MAP_ENTRY_CREATION_DEBUG */
	}

#undef relocate
}
1477
/*
 * Allocates and initializes a new vm_map with the given pmap, bounds
 * and creation options.  Returns the map with one reference held.
 */
vm_map_t
vm_map_create_options(
	pmap_t                  pmap,
	vm_map_offset_t         min,
	vm_map_offset_t         max,
	vm_map_create_options_t options)
{
	vm_map_t result;

#if DEBUG || DEVELOPMENT
	/*
	 * Early-boot sanity: every map allocated before zalloc is up must
	 * have been announced via vm_map_will_allocate_early_map(), and
	 * must belong to the kernel pmap.
	 */
	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
		if (early_map_count != ~0u && early_map_count !=
		    zone_count_allocated(vm_map_zone) + 1) {
			panic("allocating %dth early map, owner not known",
			    zone_count_allocated(vm_map_zone) + 1);
		}
		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
			panic("allocating %dth early map for non kernel pmap",
			    early_map_count);
		}
	}
#endif /* DEBUG || DEVELOPMENT */

	result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);

	vm_map_store_init(&result->hdr);
	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
	vm_map_set_page_shift(result, PAGE_SHIFT);

	result->size_limit = RLIM_INFINITY;             /* default unlimited */
	result->data_limit = RLIM_INFINITY;             /* default unlimited */
	result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
	result->pmap = pmap;
	result->min_offset = min;
	result->max_offset = max;
	result->first_free = vm_map_to_entry(result);
	result->hint = vm_map_to_entry(result);

	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
		assert(pmap == kernel_pmap);
		result->never_faults = true;
	}

	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
		result->has_corpse_footprint = true;
	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
		struct vm_map_links *hole_entry;

		/* seed the hole list with a single hole spanning the map */
		hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
		hole_entry->start = min;
#if defined(__arm64__)
		hole_entry->end = result->max_offset;
#else
		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
#endif
		result->holes_list = result->hole_hint = hole_entry;
		/* the hole list is circular; a single hole links to itself */
		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
		result->holelistenabled = true;
	}

	vm_map_lock_init(result);

	return result;
}
1544
1545 /*
1546 * Adjusts a submap that was made by kmem_suballoc()
1547 * before it knew where it would be mapped,
1548 * so that it has the right min/max offsets.
1549 *
1550 * We do not need to hold any locks:
1551 * only the caller knows about this map,
1552 * and it is not published on any entry yet.
1553 */
1554 static void
vm_map_adjust_offsets(vm_map_t map,vm_map_offset_t min_off,vm_map_offset_t max_off)1555 vm_map_adjust_offsets(
1556 vm_map_t map,
1557 vm_map_offset_t min_off,
1558 vm_map_offset_t max_off)
1559 {
1560 assert(map->min_offset == 0);
1561 assert(map->max_offset == max_off - min_off);
1562 assert(map->hdr.nentries == 0);
1563 assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1564
1565 map->min_offset = min_off;
1566 map->max_offset = max_off;
1567
1568 if (map->holelistenabled) {
1569 struct vm_map_links *hole = map->holes_list;
1570
1571 hole->start = min_off;
1572 #if defined(__arm64__)
1573 hole->end = max_off;
1574 #else
1575 hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1576 #endif
1577 }
1578 }
1579
1580
1581 vm_map_size_t
vm_map_adjusted_size(vm_map_t map)1582 vm_map_adjusted_size(vm_map_t map)
1583 {
1584 const struct vm_reserved_region *regions = NULL;
1585 size_t num_regions = 0;
1586 mach_vm_size_t reserved_size = 0, map_size = 0;
1587
1588 if (map == NULL || (map->size == 0)) {
1589 return 0;
1590 }
1591
1592 map_size = map->size;
1593
1594 if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1595 /*
1596 * No special reserved regions or not an exotic map or the task
1597 * is terminating and these special regions might have already
1598 * been deallocated.
1599 */
1600 return map_size;
1601 }
1602
1603 num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), ®ions);
1604 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1605
1606 while (num_regions) {
1607 reserved_size += regions[--num_regions].vmrr_size;
1608 }
1609
1610 /*
1611 * There are a few places where the map is being switched out due to
1612 * 'termination' without that bit being set (e.g. exec and corpse purging).
1613 * In those cases, we could have the map's regions being deallocated on
1614 * a core while some accounting process is trying to get the map's size.
1615 * So this assert can't be enabled till all those places are uniform in
1616 * their use of the 'map->terminated' bit.
1617 *
1618 * assert(map_size >= reserved_size);
1619 */
1620
1621 return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1622 }
1623
/*
 * vm_map_entry_create:	[ internal use only ]
 *
 * Allocates a VM map entry for insertion in the
 * given map (or map copy).  No fields are filled.
 *
 * The VM entry will be zero initialized, except for:
 * - behavior set to VM_BEHAVIOR_DEFAULT
 * - inheritance set to VM_INHERIT_DEFAULT
 */
#define vm_map_entry_create(map) _vm_map_entry_create(&(map)->hdr)

#define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)

static vm_map_entry_t
_vm_map_entry_create(
	struct vm_map_header *map_header __unused)
{
	vm_map_entry_t entry = NULL;

	/* Z_ZERO gives a fully zeroed entry; only non-zero fields are set below. */
	entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);

	/*
	 * Help the compiler with what we know to be true,
	 * so that the further bitfields inits have good codegen.
	 *
	 * See rdar://87041299
	 */
	__builtin_assume(entry->vme_object_value == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);

	static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
	    "VME_ALIAS_MASK covers tags");

	/* VM_BEHAVIOR_DEFAULT == 0 is already set by Z_ZERO */
	static_assert(VM_BEHAVIOR_DEFAULT == 0,
	    "can skip zeroing of the behavior field");
	entry->inheritance = VM_INHERIT_DEFAULT;

#if MAP_ENTRY_CREATION_DEBUG
	entry->vme_creation_maphdr = map_header;
	/* record the creator's backtrace without blocking (NOWAIT) */
	entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
	    BTREF_GET_NOWAIT);
#endif
	return entry;
}
1670
/*
 * vm_map_entry_dispose:	[ internal use only ]
 *
 * Inverse of vm_map_entry_create.
 *
 * write map lock held so no need to
 * do anything special to insure correctness
 * of the stores
 */
static void
vm_map_entry_dispose(
	vm_map_entry_t  entry)
{
	/* release any debug backtrace references held by this entry */
#if VM_BTLOG_TAGS
	if (entry->vme_kernel_object) {
		btref_put(entry->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */
#if MAP_ENTRY_CREATION_DEBUG
	btref_put(entry->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	btref_put(entry->vme_insertion_bt);
#endif
	zfree(vm_map_entry_zone, entry);
}

#define vm_map_copy_entry_dispose(copy_entry) \
	vm_map_entry_dispose(copy_entry)
1700
1701 static vm_map_entry_t
vm_map_zap_first_entry(vm_map_zap_t list)1702 vm_map_zap_first_entry(
1703 vm_map_zap_t list)
1704 {
1705 return list->vmz_head;
1706 }
1707
static vm_map_entry_t
vm_map_zap_last_entry(
	vm_map_zap_t    list)
{
	/*
	 * vmz_tail points at the last entry's vme_next field; recover the
	 * enclosing entry.  Only meaningful on a non-empty list.
	 */
	assert(vm_map_zap_first_entry(list));
	return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
}
1715
1716 static void
vm_map_zap_append(vm_map_zap_t list,vm_map_entry_t entry)1717 vm_map_zap_append(
1718 vm_map_zap_t list,
1719 vm_map_entry_t entry)
1720 {
1721 entry->vme_next = VM_MAP_ENTRY_NULL;
1722 *list->vmz_tail = entry;
1723 list->vmz_tail = &entry->vme_next;
1724 }
1725
1726 static vm_map_entry_t
vm_map_zap_pop(vm_map_zap_t list)1727 vm_map_zap_pop(
1728 vm_map_zap_t list)
1729 {
1730 vm_map_entry_t head = list->vmz_head;
1731
1732 if (head != VM_MAP_ENTRY_NULL &&
1733 (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1734 list->vmz_tail = &list->vmz_head;
1735 }
1736
1737 return head;
1738 }
1739
1740 static void
vm_map_zap_dispose(vm_map_zap_t list)1741 vm_map_zap_dispose(
1742 vm_map_zap_t list)
1743 {
1744 vm_map_entry_t entry;
1745
1746 while ((entry = vm_map_zap_pop(list))) {
1747 if (entry->is_sub_map) {
1748 vm_map_deallocate(VME_SUBMAP(entry));
1749 } else {
1750 vm_object_deallocate(VME_OBJECT(entry));
1751 }
1752
1753 vm_map_entry_dispose(entry);
1754 }
1755 }
1756
#if MACH_ASSERT
static boolean_t first_free_check = FALSE;
boolean_t
first_free_is_valid(
	vm_map_t        map)
{
	/* Validation is opt-in; without the flag, always report valid. */
	return first_free_check ? first_free_is_valid_store(map) : TRUE;
}
#endif /* MACH_ASSERT */
1770
1771
/* Link/unlink helpers operating on a vm_map_copy's entry list. */
#define vm_map_copy_entry_link(copy, after_where, entry)                \
	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))

#define vm_map_copy_entry_unlink(copy, entry)                           \
	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)

/*
 * vm_map_destroy:
 *
 * Actually destroy a map.
 */
void
vm_map_destroy(
	vm_map_t        map)
{
	/* final cleanup: this is not allowed to fail */
	vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;

	VM_MAP_ZAP_DECLARE(zap);

	vm_map_lock(map);

	/* mark teardown in progress for accounting paths */
	map->terminated = true;
	/* clean up regular map entries */
	(void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
	    KMEM_GUARD_NONE, &zap);
	/* clean up leftover special mappings (commpage, GPU carveout, etc...) */
	(void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
	    KMEM_GUARD_NONE, &zap);

	vm_map_disable_hole_optimization(map);
	vm_map_corpse_footprint_destroy(map);

	vm_map_unlock(map);

	/* drop object/submap refs and free entries outside the map lock */
	vm_map_zap_dispose(&zap);

	assert(map->hdr.nentries == 0);

	if (map->pmap) {
		pmap_destroy(map->pmap);
	}

	lck_rw_destroy(&map->lock, &vm_map_lck_grp);

#if CONFIG_MAP_RANGES
	kfree_data(map->extra_ranges,
	    map->extra_ranges_count * sizeof(struct vm_map_user_range));
#endif

	zfree_id(ZONE_ID_VM_MAP, map);
}
1824
1825 /*
1826 * Returns pid of the task with the largest number of VM map entries.
1827 * Used in the zone-map-exhaustion jetsam path.
1828 */
1829 pid_t
find_largest_process_vm_map_entries(void)1830 find_largest_process_vm_map_entries(void)
1831 {
1832 pid_t victim_pid = -1;
1833 int max_vm_map_entries = 0;
1834 task_t task = TASK_NULL;
1835 queue_head_t *task_list = &tasks;
1836
1837 lck_mtx_lock(&tasks_threads_lock);
1838 queue_iterate(task_list, task, task_t, tasks) {
1839 if (task == kernel_task || !task->active) {
1840 continue;
1841 }
1842
1843 vm_map_t task_map = task->map;
1844 if (task_map != VM_MAP_NULL) {
1845 int task_vm_map_entries = task_map->hdr.nentries;
1846 if (task_vm_map_entries > max_vm_map_entries) {
1847 max_vm_map_entries = task_vm_map_entries;
1848 victim_pid = pid_from_task(task);
1849 }
1850 }
1851 }
1852 lck_mtx_unlock(&tasks_threads_lock);
1853
1854 printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
1855 return victim_pid;
1856 }
1857
1858
1859 /*
1860 * vm_map_lookup_entry: [ internal use only ]
1861 *
1862 * Calls into the vm map store layer to find the map
1863 * entry containing (or immediately preceding) the
1864 * specified address in the given map; the entry is returned
1865 * in the "entry" parameter. The boolean
1866 * result indicates whether the address is
1867 * actually contained in the map.
1868 */
1869 boolean_t
vm_map_lookup_entry(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)1870 vm_map_lookup_entry(
1871 vm_map_t map,
1872 vm_map_offset_t address,
1873 vm_map_entry_t *entry) /* OUT */
1874 {
1875 if (VM_KERNEL_ADDRESS(address)) {
1876 address = VM_KERNEL_STRIP_UPTR(address);
1877 }
1878 #if CONFIG_PROB_GZALLOC
1879 if (map->pmap == kernel_pmap) {
1880 assertf(!pgz_owned(address),
1881 "it is the responsibility of callers to unguard PGZ addresses");
1882 }
1883 #endif /* CONFIG_PROB_GZALLOC */
1884 return vm_map_store_lookup_entry( map, address, entry );
1885 }
1886
1887 boolean_t
vm_map_lookup_entry_or_next(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)1888 vm_map_lookup_entry_or_next(
1889 vm_map_t map,
1890 vm_map_offset_t address,
1891 vm_map_entry_t *entry) /* OUT */
1892 {
1893 if (vm_map_lookup_entry(map, address, entry)) {
1894 return true;
1895 }
1896
1897 *entry = (*entry)->vme_next;
1898 return false;
1899 }
1900
#if CONFIG_PROB_GZALLOC
boolean_t
vm_map_lookup_entry_allow_pgz(
	vm_map_t        map,
	vm_map_offset_t address,
	vm_map_entry_t  *entry)         /* OUT */
{
	vm_map_offset_t addr = address;

	/* strip upper pointer bits from kernel addresses before the lookup */
	if (VM_KERNEL_ADDRESS(addr)) {
		addr = VM_KERNEL_STRIP_UPTR(addr);
	}
	return vm_map_store_lookup_entry( map, addr, entry );
}
#endif /* CONFIG_PROB_GZALLOC */
1914
/*
 * Routine: vm_map_range_invalid_panic
 * Purpose:
 *	Panic on detection of an invalid range id.
 */
__abortlike
static void
vm_map_range_invalid_panic(
	vm_map_t                map,
	vm_map_range_id_t       range_id)
{
	/* __abortlike: callers treat this as non-returning */
	panic("invalid range ID (%u) for map %p", range_id, map);
}
1928
/*
 * Routine: vm_map_get_range
 * Purpose:
 *	Adjust bounds based on security policy.
 *
 *	Returns the mach_vm_range the allocation must fall in, based on
 *	the map (kernel vs user), the caller's range id, and the size.
 *	May zero *address (kernel maps) and set *is_ptr for pointer ranges.
 */
static struct mach_vm_range
vm_map_get_range(
	vm_map_t                map,
	vm_map_address_t        *address,
	vm_map_kernel_flags_t   *vmk_flags,
	vm_map_size_t           size,
	bool                    *is_ptr)
{
	struct mach_vm_range effective_range = {};
	vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;

	if (map == kernel_map) {
		effective_range = kmem_ranges[range_id];

		if (startup_phase >= STARTUP_SUB_KMEM) {
			/*
			 * Hint provided by caller is zeroed as the range is restricted to a
			 * subset of the entire kernel_map VA, which could put the hint outside
			 * the range, causing vm_map_store_find_space to fail.
			 */
			*address = 0ull;
			/*
			 * Ensure that range_id passed in by the caller is within meaningful
			 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
			 * to fail as the corresponding range is invalid. Range id larger than
			 * KMEM_RANGE_ID_MAX will lead to an OOB access.
			 */
			if ((range_id == KMEM_RANGE_ID_NONE) ||
			    (range_id > KMEM_RANGE_ID_MAX)) {
				vm_map_range_invalid_panic(map, range_id);
			}

			/*
			 * Pointer ranges use kmem_locate_space to do allocations.
			 *
			 * Non pointer fronts look like [ Small | Large | Permanent ]
			 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
			 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
			 * use the entire range.
			 */
			if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
				*is_ptr = true;
			} else if (size >= KMEM_SMALLMAP_THRESHOLD) {
				effective_range = kmem_large_ranges[range_id];
			}
		}
#if CONFIG_MAP_RANGES
	} else if (map->uses_user_ranges) {
		/* user maps with ranges enabled: select the policy range */
		switch (range_id) {
		case UMEM_RANGE_ID_DEFAULT:
			effective_range = map->default_range;
			break;
		case UMEM_RANGE_ID_HEAP:
			effective_range = map->data_range;
			break;
		case UMEM_RANGE_ID_FIXED:
			/*
			 * anywhere allocations with an address in "FIXED"
			 * makes no sense, leave the range empty
			 */
			break;

		default:
			vm_map_range_invalid_panic(map, range_id);
		}
#endif /* CONFIG_MAP_RANGES */
	} else {
		/*
		 * If minimum is 0, bump it up by PAGE_SIZE. We want to limit
		 * allocations of PAGEZERO to explicit requests since its
		 * normal use is to catch dereferences of NULL and many
		 * applications also treat pointers with a value of 0 as
		 * special and suddenly having address 0 contain useable
		 * memory would tend to confuse those applications.
		 */
		effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
		effective_range.max_address = map->max_offset;
	}

	return effective_range;
}
2015
/*
 * Routine:	vm_map_locate_space
 * Purpose:
 *	Finds a range in the specified virtual address map,
 *	returning the start of that range,
 *	as well as the entry right before it.
 *
 *	On entry, *start_inout is the caller's placement hint; on
 *	KERN_SUCCESS it holds the chosen start address and *entry_out
 *	(if non-NULL) the entry immediately preceding the new range.
 *	The map must be locked by the caller; it may be temporarily
 *	unlocked if the map has "wait_for_space" set and space is short.
 */
kern_return_t
vm_map_locate_space(
	vm_map_t                map,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_offset_t        *start_inout,
	vm_map_entry_t         *entry_out)
{
	struct mach_vm_range effective_range = {};
	vm_map_size_t guard_offset;
	vm_map_offset_t hint, limit;
	vm_map_entry_t entry;
	bool is_kmem_ptr_range = false;

	/*
	 * Only supported by vm_map_enter() with a fixed address.
	 */
	assert(!vmk_flags.vmkf_beyond_max);

	if (__improbable(map->wait_for_space)) {
		/*
		 * support for "wait_for_space" is minimal,
		 * its only consumer is the ipc_kernel_copy_map.
		 */
		assert(!map->holelistenabled &&
		    !vmk_flags.vmkf_last_free &&
		    !vmk_flags.vmkf_keep_map_locked &&
		    !vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmf_random_addr &&
		    *start_inout <= map->min_offset);
	} else if (vmk_flags.vmkf_last_free) {
		/* top-down search is incompatible with JIT / randomized placement */
		assert(!vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmf_random_addr);
	}

	if (vmk_flags.vmkf_guard_before) {
		/*
		 * Reserve one map page of guard in front of the mapping:
		 * search for (size - guard) and pass the guard separately
		 * to vm_map_store_find_space below.
		 */
		guard_offset = VM_MAP_PAGE_SIZE(map);
		assert(size > guard_offset);
		size -= guard_offset;
	} else {
		assert(size != 0);
		guard_offset = 0;
	}

	/*
	 * Validate range_id from flags and get associated range
	 */
	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
	    &is_kmem_ptr_range);

	/* kernel pointer fronts are allocated by the kmem subsystem instead */
	if (is_kmem_ptr_range) {
		return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
		           vmk_flags.vmkf_last_free, start_inout, entry_out);
	}

#if XNU_TARGET_OS_OSX
	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
		/* clamp the search below 4GB for 32-bit-VA mappings */
		assert(map != kernel_map);
		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
	}
#endif /* XNU_TARGET_OS_OSX */

again:
	if (vmk_flags.vmkf_last_free) {
		/* top-down: search from the hint (or range max) down to range min */
		hint = *start_inout;

		if (hint == 0 || hint > effective_range.max_address) {
			hint = effective_range.max_address;
		}
		if (hint <= effective_range.min_address) {
			return KERN_NO_SPACE;
		}
		limit = effective_range.min_address;
	} else {
		/* bottom-up: search from the hint up to range max */
		hint = *start_inout;

		if (vmk_flags.vmkf_map_jit) {
			if (map->jit_entry_exists &&
			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
				return KERN_INVALID_ARGUMENT;
			}
			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
				vmk_flags.vmf_random_addr = true;
			}
		}

		if (vmk_flags.vmf_random_addr) {
			kern_return_t kr;

			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
			if (kr != KERN_SUCCESS) {
				return kr;
			}
		}
#if __x86_64__
		else if ((hint == 0 || hint == vm_map_min(map)) &&
		    !map->disable_vmentry_reuse &&
		    map->vmmap_high_start != 0) {
			/* default hint: start above the reserved low region */
			hint = map->vmmap_high_start;
		}
#endif /* __x86_64__ */

		/* clamp the hint into the effective range */
		if (hint < effective_range.min_address) {
			hint = effective_range.min_address;
		}
		if (effective_range.max_address <= hint) {
			return KERN_NO_SPACE;
		}

		limit = effective_range.max_address;
	}
	entry = vm_map_store_find_space(map,
	    hint, limit, vmk_flags.vmkf_last_free,
	    guard_offset, size, mask,
	    start_inout);

	if (__improbable(entry == NULL)) {
		/*
		 * No hole found: if the map supports "wait_for_space" and the
		 * request could ever fit, drop the lock, block until someone
		 * frees space (they wakeup (event_t)map), and retry.
		 */
		if (map->wait_for_space &&
		    guard_offset + size <=
		    effective_range.max_address - effective_range.min_address) {
			assert_wait((event_t)map, THREAD_ABORTSAFE);
			vm_map_unlock(map);
			thread_block(THREAD_CONTINUE_NULL);
			vm_map_lock(map);
			goto again;
		}
		return KERN_NO_SPACE;
	}

	if (entry_out) {
		*entry_out = entry;
	}
	return KERN_SUCCESS;
}
2158
2159
/*
 * Routine:	vm_map_find_space
 * Purpose:
 *	Allocate a range in the specified virtual address map,
 *	returning the entry allocated for that range.
 *	Used by kmem_alloc, etc.
 *
 *	The map must NOT be locked. It will be returned locked
 *	on KERN_SUCCESS, unlocked on failure.
 *
 *	If an entry is allocated, the object/offset fields
 *	are initialized to zero.
 */
kern_return_t
vm_map_find_space(
	vm_map_t                map,
	vm_map_offset_t         hint_address,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_entry_t         *o_entry) /* OUT */
{
	vm_map_entry_t new_entry, entry;
	kern_return_t kr;

	if (size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * Pre-create the entry before taking the map lock so the zone
	 * allocation happens outside the critical section.
	 */
	new_entry = vm_map_entry_create(map);
	new_entry->use_pmap = true;
	new_entry->protection = VM_PROT_DEFAULT;
	new_entry->max_protection = VM_PROT_ALL;

	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
		/* sub-page maps require map-aligned entries */
		new_entry->map_aligned = true;
	}
	if (vmk_flags.vmf_permanent) {
		new_entry->vme_permanent = true;
	}

	vm_map_lock(map);

	kr = vm_map_locate_space(map, size, mask, vmk_flags,
	    &hint_address, &entry);
	if (kr != KERN_SUCCESS) {
		/* failure contract: return unlocked, and free the unused entry */
		vm_map_unlock(map);
		vm_map_entry_dispose(new_entry);
		return kr;
	}
	/* hint_address now holds the start chosen by vm_map_locate_space */
	new_entry->vme_start = hint_address;
	new_entry->vme_end = hint_address + size;

	/*
	 * At this point,
	 *
	 * - new_entry's "vme_start" and "vme_end" should define
	 *   the endpoints of the available new range,
	 *
	 * - and "entry" should refer to the region before
	 *   the new range,
	 *
	 * - and the map should still be locked.
	 */

	assert(page_aligned(new_entry->vme_start));
	assert(page_aligned(new_entry->vme_end));
	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));

	/*
	 *	Insert the new entry into the list
	 */

	vm_map_store_entry_link(map, entry, new_entry,
	    VM_MAP_KERNEL_FLAGS_NONE);
	map->size += size;

	/*
	 *	Update the lookup hint
	 */
	SAVE_HINT_MAP_WRITE(map, new_entry);

	/* success contract: map is returned LOCKED */
	*o_entry = new_entry;
	return KERN_SUCCESS;
}
2246
2247 int vm_map_pmap_enter_print = FALSE;
2248 int vm_map_pmap_enter_enable = FALSE;
2249
/*
 *	Routine:	vm_map_pmap_enter [internal only]
 *
 *	Description:
 *		Force pages from the specified object to be entered into
 *		the pmap at the specified address if they are present.
 *		As soon as a page not found in the object the scan ends.
 *
 *		This is a best-effort pre-fault: any page that is busy,
 *		fictitious, absent or in error simply stops the scan.
 *
 *	Returns:
 *		Nothing.
 *
 *	In/out conditions:
 *		The source map should not be locked on entry.
 */
__unused static void
vm_map_pmap_enter(
	vm_map_t                map,
	vm_map_offset_t         addr,
	vm_map_offset_t         end_addr,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_prot_t               protection)
{
	int                     type_of_fault;
	kern_return_t           kr;
	uint8_t                 object_lock_type = 0;
	struct vm_object_fault_info fault_info = {};

	/* nothing to enter into if the map has no pmap */
	if (map->pmap == 0) {
		return;
	}

	/* only supported for maps using the native page size */
	assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);

	/* walk the range one page at a time */
	while (addr < end_addr) {
		vm_page_t       m;


		/*
		 * TODO:
		 * From vm_map_enter(), we come into this function without the map
		 * lock held or the object lock held.
		 * We haven't taken a reference on the object either.
		 * We should do a proper lookup on the map to make sure
		 * that things are sane before we go locking objects that
		 * could have been deallocated from under us.
		 */

		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
		vm_object_lock(object);

		m = vm_page_lookup(object, offset);

		/* stop at the first page that is missing or not safely mappable */
		if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
			vm_object_unlock(object);
			return;
		}

		if (vm_map_pmap_enter_print) {
			printf("vm_map_pmap_enter:");
			printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
			    map, (unsigned long long)addr, object, (unsigned long long)offset);
		}
		/* page is resident, so account this as a cache hit */
		type_of_fault = DBG_CACHE_HIT_FAULT;
		kr = vm_fault_enter(m, map->pmap,
		    addr,
		    PAGE_SIZE, 0,
		    protection, protection,
		    VM_PAGE_WIRED(m),
		    FALSE,                             /* change_wiring */
		    VM_KERN_MEMORY_NONE,               /* tag - not wiring */
		    &fault_info,
		    NULL,                              /* need_retry */
		    &type_of_fault,
		    &object_lock_type);                /* Exclusive lock mode. Will remain unchanged.*/
		/* NOTE(review): kr is not checked; entry failures are silently ignored */

		vm_object_unlock(object);

		offset += PAGE_SIZE_64;
		addr += PAGE_SIZE;
	}
}
2333
#define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
/*
 * Pick a random, page-aligned start address for a mapping of "size" bytes
 * inside the effective range of "map" (see vm_map_get_range).  Up to
 * MAX_TRIES_TO_GET_RANDOM_ADDRESS candidates are drawn; the first one that
 * falls in an unallocated hole large enough for the request is stored in
 * *address.  Returns KERN_NO_SPACE when the range is too small or no
 * suitable candidate was found.  The map lock is expected held by the
 * caller (vm_map_lookup_entry is used directly).
 */
static kern_return_t
vm_map_random_address_for_size(
	vm_map_t        map,
	vm_map_offset_t *address,
	vm_map_size_t   size,
	vm_map_kernel_flags_t vmk_flags)
{
	kern_return_t   kr = KERN_SUCCESS;
	int             tries = 0;
	vm_map_offset_t random_addr = 0;
	vm_map_offset_t hole_end;

	vm_map_entry_t  next_entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t  prev_entry = VM_MAP_ENTRY_NULL;
	vm_map_size_t   vm_hole_size = 0;
	vm_map_size_t   addr_space_size;
	bool            is_kmem_ptr;
	struct mach_vm_range effective_range;

	effective_range = vm_map_get_range(map, address, &vmk_flags, size,
	    &is_kmem_ptr);

	/* the candidate window is the range minus the allocation size */
	addr_space_size = effective_range.max_address - effective_range.min_address;
	if (size >= addr_space_size) {
		return KERN_NO_SPACE;
	}
	addr_space_size -= size;

	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));

	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		if (startup_phase < STARTUP_SUB_ZALLOC) {
			/* random() is not available until zalloc is up */
			random_addr = (vm_map_offset_t)early_random();
		} else {
			random_addr = (vm_map_offset_t)random();
		}
		/* scale to page granularity, then fold into the window */
		random_addr <<= VM_MAP_PAGE_SHIFT(map);
		random_addr = vm_map_trunc_page(
			effective_range.min_address + (random_addr % addr_space_size),
			VM_MAP_PAGE_MASK(map));

#if CONFIG_PROB_GZALLOC
		/*
		 * Avoid addresses owned by probabilistic guard zalloc.
		 * NOTE(review): this "continue" bypasses the tries++ below,
		 * so such candidates don't count against the retry budget —
		 * confirm this is intentional.
		 */
		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
			continue;
		}
#endif /* CONFIG_PROB_GZALLOC */

		/* lookup fails => random_addr is not inside an existing entry */
		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
			if (prev_entry == vm_map_to_entry(map)) {
				next_entry = vm_map_first_entry(map);
			} else {
				next_entry = prev_entry->vme_next;
			}
			/* measure the hole from random_addr to the next entry */
			if (next_entry == vm_map_to_entry(map)) {
				hole_end = vm_map_max(map);
			} else {
				hole_end = next_entry->vme_start;
			}
			vm_hole_size = hole_end - random_addr;
			if (vm_hole_size >= size) {
				*address = random_addr;
				break;
			}
		}
		tries++;
	}

	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		kr = KERN_NO_SPACE;
	}
	return kr;
}
2407
2408 static boolean_t
vm_memory_malloc_no_cow(int alias)2409 vm_memory_malloc_no_cow(
2410 int alias)
2411 {
2412 uint64_t alias_mask;
2413
2414 if (!malloc_no_cow) {
2415 return FALSE;
2416 }
2417 if (alias > 63) {
2418 return FALSE;
2419 }
2420 alias_mask = 1ULL << alias;
2421 if (alias_mask & vm_memory_malloc_no_cow_mask) {
2422 return TRUE;
2423 }
2424 return FALSE;
2425 }
2426
2427 uint64_t vm_map_enter_RLIMIT_AS_count = 0;
2428 uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2429 /*
2430 * Routine: vm_map_enter
2431 *
2432 * Description:
2433 * Allocate a range in the specified virtual address map.
2434 * The resulting range will refer to memory defined by
2435 * the given memory object and offset into that object.
2436 *
2437 * Arguments are as defined in the vm_map call.
2438 */
2439 static unsigned int vm_map_enter_restore_successes = 0;
2440 static unsigned int vm_map_enter_restore_failures = 0;
2441 kern_return_t
vm_map_enter(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)2442 vm_map_enter(
2443 vm_map_t map,
2444 vm_map_offset_t *address, /* IN/OUT */
2445 vm_map_size_t size,
2446 vm_map_offset_t mask,
2447 vm_map_kernel_flags_t vmk_flags,
2448 vm_object_t object,
2449 vm_object_offset_t offset,
2450 boolean_t needs_copy,
2451 vm_prot_t cur_protection,
2452 vm_prot_t max_protection,
2453 vm_inherit_t inheritance)
2454 {
2455 vm_map_entry_t entry, new_entry;
2456 vm_map_offset_t start, tmp_start, tmp_offset;
2457 vm_map_offset_t end, tmp_end;
2458 vm_map_offset_t tmp2_start, tmp2_end;
2459 vm_map_offset_t step;
2460 kern_return_t result = KERN_SUCCESS;
2461 bool map_locked = FALSE;
2462 bool pmap_empty = TRUE;
2463 bool new_mapping_established = FALSE;
2464 const bool keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2465 const bool anywhere = !vmk_flags.vmf_fixed;
2466 const bool purgable = vmk_flags.vmf_purgeable;
2467 const bool overwrite = vmk_flags.vmf_overwrite;
2468 const bool no_cache = vmk_flags.vmf_no_cache;
2469 const bool is_submap = vmk_flags.vmkf_submap;
2470 const bool permanent = vmk_flags.vmf_permanent;
2471 const bool no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2472 const bool entry_for_jit = vmk_flags.vmkf_map_jit;
2473 const bool iokit_acct = vmk_flags.vmkf_iokit_acct;
2474 const bool resilient_codesign = vmk_flags.vmf_resilient_codesign;
2475 const bool resilient_media = vmk_flags.vmf_resilient_media;
2476 const bool entry_for_tpro = vmk_flags.vmf_tpro;
2477 const unsigned int superpage_size = vmk_flags.vmf_superpage_size;
2478 const vm_tag_t alias = vmk_flags.vm_tag;
2479 vm_tag_t user_alias;
2480 kern_return_t kr;
2481 bool clear_map_aligned = FALSE;
2482 vm_map_size_t chunk_size = 0;
2483 vm_object_t caller_object;
2484 VM_MAP_ZAP_DECLARE(zap_old_list);
2485 VM_MAP_ZAP_DECLARE(zap_new_list);
2486
2487 caller_object = object;
2488
2489 assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2490
2491 if (vmk_flags.vmf_4gb_chunk) {
2492 #if defined(__LP64__)
2493 chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2494 #else /* __LP64__ */
2495 chunk_size = ANON_CHUNK_SIZE;
2496 #endif /* __LP64__ */
2497 } else {
2498 chunk_size = ANON_CHUNK_SIZE;
2499 }
2500
2501
2502
2503 if (superpage_size) {
2504 switch (superpage_size) {
2505 /*
2506 * Note that the current implementation only supports
2507 * a single size for superpages, SUPERPAGE_SIZE, per
2508 * architecture. As soon as more sizes are supposed
2509 * to be supported, SUPERPAGE_SIZE has to be replaced
2510 * with a lookup of the size depending on superpage_size.
2511 */
2512 #ifdef __x86_64__
2513 case SUPERPAGE_SIZE_ANY:
2514 /* handle it like 2 MB and round up to page size */
2515 size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2516 OS_FALLTHROUGH;
2517 case SUPERPAGE_SIZE_2MB:
2518 break;
2519 #endif
2520 default:
2521 return KERN_INVALID_ARGUMENT;
2522 }
2523 mask = SUPERPAGE_SIZE - 1;
2524 if (size & (SUPERPAGE_SIZE - 1)) {
2525 return KERN_INVALID_ARGUMENT;
2526 }
2527 inheritance = VM_INHERIT_NONE; /* fork() children won't inherit superpages */
2528 }
2529
2530
2531 if ((cur_protection & VM_PROT_WRITE) &&
2532 (cur_protection & VM_PROT_EXECUTE) &&
2533 #if XNU_TARGET_OS_OSX
2534 map->pmap != kernel_pmap &&
2535 (cs_process_global_enforcement() ||
2536 (vmk_flags.vmkf_cs_enforcement_override
2537 ? vmk_flags.vmkf_cs_enforcement
2538 : (vm_map_cs_enforcement(map)
2539 #if __arm64__
2540 || !VM_MAP_IS_EXOTIC(map)
2541 #endif /* __arm64__ */
2542 ))) &&
2543 #endif /* XNU_TARGET_OS_OSX */
2544 #if CODE_SIGNING_MONITOR
2545 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2546 #endif
2547 (VM_MAP_POLICY_WX_FAIL(map) ||
2548 VM_MAP_POLICY_WX_STRIP_X(map)) &&
2549 !entry_for_jit) {
2550 boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2551
2552 DTRACE_VM3(cs_wx,
2553 uint64_t, 0,
2554 uint64_t, 0,
2555 vm_prot_t, cur_protection);
2556 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2557 proc_selfpid(),
2558 (get_bsdtask_info(current_task())
2559 ? proc_name_address(get_bsdtask_info(current_task()))
2560 : "?"),
2561 __FUNCTION__,
2562 (vm_protect_wx_fail ? "failing" : "turning off execute"));
2563 cur_protection &= ~VM_PROT_EXECUTE;
2564 if (vm_protect_wx_fail) {
2565 return KERN_PROTECTION_FAILURE;
2566 }
2567 }
2568
2569 /*
2570 * If the task has requested executable lockdown,
2571 * deny any new executable mapping.
2572 */
2573 if (map->map_disallow_new_exec == TRUE) {
2574 if (cur_protection & VM_PROT_EXECUTE) {
2575 return KERN_PROTECTION_FAILURE;
2576 }
2577 }
2578
2579 if (resilient_codesign) {
2580 assert(!is_submap);
2581 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2582 if ((cur_protection | max_protection) & reject_prot) {
2583 return KERN_PROTECTION_FAILURE;
2584 }
2585 }
2586
2587 if (resilient_media) {
2588 assert(!is_submap);
2589 // assert(!needs_copy);
2590 if (object != VM_OBJECT_NULL &&
2591 !object->internal) {
2592 /*
2593 * This mapping is directly backed by an external
2594 * memory manager (e.g. a vnode pager for a file):
2595 * we would not have any safe place to inject
2596 * a zero-filled page if an actual page is not
2597 * available, without possibly impacting the actual
2598 * contents of the mapped object (e.g. the file),
2599 * so we can't provide any media resiliency here.
2600 */
2601 return KERN_INVALID_ARGUMENT;
2602 }
2603 }
2604
2605 if (entry_for_tpro) {
2606 /*
2607 * TPRO overrides the effective permissions of the region
2608 * and explicitly maps as RW. Ensure we have been passed
2609 * the expected permissions. We accept `cur_protections`
2610 * RO as that will be handled on fault.
2611 */
2612 if (!(max_protection & VM_PROT_READ) ||
2613 !(max_protection & VM_PROT_WRITE) ||
2614 !(cur_protection & VM_PROT_READ)) {
2615 return KERN_PROTECTION_FAILURE;
2616 }
2617
2618 /*
2619 * We can now downgrade the cur_protection to RO. This is a mild lie
2620 * to the VM layer. But TPRO will be responsible for toggling the
2621 * protections between RO/RW
2622 */
2623 cur_protection = VM_PROT_READ;
2624 }
2625
2626 if (is_submap) {
2627 vm_map_t submap;
2628 if (purgable) {
2629 /* submaps can not be purgeable */
2630 return KERN_INVALID_ARGUMENT;
2631 }
2632 if (object == VM_OBJECT_NULL) {
2633 /* submaps can not be created lazily */
2634 return KERN_INVALID_ARGUMENT;
2635 }
2636 submap = (vm_map_t) object;
2637 if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
2638 /* page size mismatch */
2639 return KERN_INVALID_ARGUMENT;
2640 }
2641 }
2642 if (vmk_flags.vmkf_already) {
2643 /*
2644 * VM_FLAGS_ALREADY says that it's OK if the same mapping
2645 * is already present. For it to be meaningul, the requested
2646 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
2647 * we shouldn't try and remove what was mapped there first
2648 * (!VM_FLAGS_OVERWRITE).
2649 */
2650 if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
2651 return KERN_INVALID_ARGUMENT;
2652 }
2653 }
2654
2655 if (size == 0 ||
2656 (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
2657 *address = 0;
2658 return KERN_INVALID_ARGUMENT;
2659 }
2660
2661 if (map->pmap == kernel_pmap) {
2662 user_alias = VM_KERN_MEMORY_NONE;
2663 } else {
2664 user_alias = alias;
2665 }
2666
2667 if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
2668 chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
2669 }
2670
2671 #define RETURN(value) { result = value; goto BailOut; }
2672
2673 assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
2674 assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
2675 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
2676 assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
2677 assertf(page_aligned(size), "0x%llx", (uint64_t)size);
2678 }
2679
2680 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2681 !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
2682 /*
2683 * In most cases, the caller rounds the size up to the
2684 * map's page size.
2685 * If we get a size that is explicitly not map-aligned here,
2686 * we'll have to respect the caller's wish and mark the
2687 * mapping as "not map-aligned" to avoid tripping the
2688 * map alignment checks later.
2689 */
2690 clear_map_aligned = TRUE;
2691 }
2692 if (!anywhere &&
2693 VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2694 !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
2695 /*
2696 * We've been asked to map at a fixed address and that
2697 * address is not aligned to the map's specific alignment.
2698 * The caller should know what it's doing (i.e. most likely
2699 * mapping some fragmented copy map, transferring memory from
2700 * a VM map with a different alignment), so clear map_aligned
2701 * for this new VM map entry and proceed.
2702 */
2703 clear_map_aligned = TRUE;
2704 }
2705
2706 /*
2707 * Only zero-fill objects are allowed to be purgable.
2708 * LP64todo - limit purgable objects to 32-bits for now
2709 */
2710 if (purgable &&
2711 (offset != 0 ||
2712 (object != VM_OBJECT_NULL &&
2713 (object->vo_size != size ||
2714 object->purgable == VM_PURGABLE_DENY))
2715 #if __LP64__
2716 || size > ANON_MAX_SIZE
2717 #endif
2718 )) {
2719 return KERN_INVALID_ARGUMENT;
2720 }
2721
2722 start = *address;
2723
2724 if (anywhere) {
2725 vm_map_lock(map);
2726 map_locked = TRUE;
2727
2728 result = vm_map_locate_space(map, size, mask, vmk_flags,
2729 &start, &entry);
2730 if (result != KERN_SUCCESS) {
2731 goto BailOut;
2732 }
2733
2734 *address = start;
2735 end = start + size;
2736 assert(VM_MAP_PAGE_ALIGNED(*address,
2737 VM_MAP_PAGE_MASK(map)));
2738 } else {
2739 vm_map_offset_t effective_min_offset, effective_max_offset;
2740
2741 effective_min_offset = map->min_offset;
2742 effective_max_offset = map->max_offset;
2743
2744 if (vmk_flags.vmkf_beyond_max) {
2745 /*
2746 * Allow an insertion beyond the map's max offset.
2747 */
2748 effective_max_offset = 0x00000000FFFFF000ULL;
2749 if (vm_map_is_64bit(map)) {
2750 effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2751 }
2752 #if XNU_TARGET_OS_OSX
2753 } else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2754 effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2755 #endif /* XNU_TARGET_OS_OSX */
2756 }
2757
2758 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2759 !overwrite &&
2760 user_alias == VM_MEMORY_REALLOC) {
2761 /*
2762 * Force realloc() to switch to a new allocation,
2763 * to prevent 4k-fragmented virtual ranges.
2764 */
2765 // DEBUG4K_ERROR("no realloc in place");
2766 return KERN_NO_SPACE;
2767 }
2768
2769 /*
2770 * Verify that:
2771 * the address doesn't itself violate
2772 * the mask requirement.
2773 */
2774
2775 vm_map_lock(map);
2776 map_locked = TRUE;
2777 if ((start & mask) != 0) {
2778 RETURN(KERN_NO_SPACE);
2779 }
2780
2781 #if CONFIG_MAP_RANGES
2782 if (map->uses_user_ranges) {
2783 struct mach_vm_range r;
2784
2785 vm_map_user_range_resolve(map, start, 1, &r);
2786 if (r.max_address == 0) {
2787 RETURN(KERN_INVALID_ADDRESS);
2788 }
2789 effective_min_offset = r.min_address;
2790 effective_max_offset = r.max_address;
2791 }
2792 #endif /* CONFIG_MAP_RANGES */
2793
2794 if ((startup_phase >= STARTUP_SUB_KMEM) && !is_submap &&
2795 (map == kernel_map)) {
2796 mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
2797 effective_min_offset = r->min_address;
2798 effective_max_offset = r->max_address;
2799 }
2800
2801 /*
2802 * ... the address is within bounds
2803 */
2804
2805 end = start + size;
2806
2807 if ((start < effective_min_offset) ||
2808 (end > effective_max_offset) ||
2809 (start >= end)) {
2810 RETURN(KERN_INVALID_ADDRESS);
2811 }
2812
2813 if (overwrite) {
2814 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE;
2815 kern_return_t remove_kr;
2816
2817 /*
2818 * Fixed mapping and "overwrite" flag: attempt to
2819 * remove all existing mappings in the specified
2820 * address range, saving them in our "zap_old_list".
2821 *
2822 * This avoids releasing the VM map lock in
2823 * vm_map_entry_delete() and allows atomicity
2824 * when we want to replace some mappings with a new one.
2825 * It also allows us to restore the old VM mappings if the
2826 * new mapping fails.
2827 */
2828 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2829
2830 if (vmk_flags.vmkf_overwrite_immutable) {
2831 /* we can overwrite immutable mappings */
2832 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2833 }
2834 if (vmk_flags.vmkf_remap_prot_copy) {
2835 remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2836 }
2837 remove_kr = vm_map_delete(map, start, end, remove_flags,
2838 KMEM_GUARD_NONE, &zap_old_list).kmr_return;
2839 if (remove_kr) {
2840 /* XXX FBDP restore zap_old_list? */
2841 RETURN(remove_kr);
2842 }
2843 }
2844
2845 /*
2846 * ... the starting address isn't allocated
2847 */
2848
2849 if (vm_map_lookup_entry(map, start, &entry)) {
2850 if (!(vmk_flags.vmkf_already)) {
2851 RETURN(KERN_NO_SPACE);
2852 }
2853 /*
2854 * Check if what's already there is what we want.
2855 */
2856 tmp_start = start;
2857 tmp_offset = offset;
2858 if (entry->vme_start < start) {
2859 tmp_start -= start - entry->vme_start;
2860 tmp_offset -= start - entry->vme_start;
2861 }
2862 for (; entry->vme_start < end;
2863 entry = entry->vme_next) {
2864 /*
2865 * Check if the mapping's attributes
2866 * match the existing map entry.
2867 */
2868 if (entry == vm_map_to_entry(map) ||
2869 entry->vme_start != tmp_start ||
2870 entry->is_sub_map != is_submap ||
2871 VME_OFFSET(entry) != tmp_offset ||
2872 entry->needs_copy != needs_copy ||
2873 entry->protection != cur_protection ||
2874 entry->max_protection != max_protection ||
2875 entry->inheritance != inheritance ||
2876 entry->iokit_acct != iokit_acct ||
2877 VME_ALIAS(entry) != alias) {
2878 /* not the same mapping ! */
2879 RETURN(KERN_NO_SPACE);
2880 }
2881 /*
2882 * Check if the same object is being mapped.
2883 */
2884 if (is_submap) {
2885 if (VME_SUBMAP(entry) !=
2886 (vm_map_t) object) {
2887 /* not the same submap */
2888 RETURN(KERN_NO_SPACE);
2889 }
2890 } else {
2891 if (VME_OBJECT(entry) != object) {
2892 /* not the same VM object... */
2893 vm_object_t obj2;
2894
2895 obj2 = VME_OBJECT(entry);
2896 if ((obj2 == VM_OBJECT_NULL ||
2897 obj2->internal) &&
2898 (object == VM_OBJECT_NULL ||
2899 object->internal)) {
2900 /*
2901 * ... but both are
2902 * anonymous memory,
2903 * so equivalent.
2904 */
2905 } else {
2906 RETURN(KERN_NO_SPACE);
2907 }
2908 }
2909 }
2910
2911 tmp_offset += entry->vme_end - entry->vme_start;
2912 tmp_start += entry->vme_end - entry->vme_start;
2913 if (entry->vme_end >= end) {
2914 /* reached the end of our mapping */
2915 break;
2916 }
2917 }
2918 /* it all matches: let's use what's already there ! */
2919 RETURN(KERN_MEMORY_PRESENT);
2920 }
2921
2922 /*
2923 * ... the next region doesn't overlap the
2924 * end point.
2925 */
2926
2927 if ((entry->vme_next != vm_map_to_entry(map)) &&
2928 (entry->vme_next->vme_start < end)) {
2929 RETURN(KERN_NO_SPACE);
2930 }
2931 }
2932
2933 /*
2934 * At this point,
2935 * "start" and "end" should define the endpoints of the
2936 * available new range, and
2937 * "entry" should refer to the region before the new
2938 * range, and
2939 *
2940 * the map should be locked.
2941 */
2942
2943 /*
2944 * See whether we can avoid creating a new entry (and object) by
2945 * extending one of our neighbors. [So far, we only attempt to
2946 * extend from below.] Note that we can never extend/join
2947 * purgable objects because they need to remain distinct
2948 * entities in order to implement their "volatile object"
2949 * semantics.
2950 */
2951
2952 if (purgable ||
2953 entry_for_jit ||
2954 entry_for_tpro ||
2955 vm_memory_malloc_no_cow(user_alias)) {
2956 if (object == VM_OBJECT_NULL) {
2957 object = vm_object_allocate(size);
2958 object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2959 object->true_share = FALSE;
2960 if (malloc_no_cow_except_fork &&
2961 !purgable &&
2962 !entry_for_jit &&
2963 !entry_for_tpro &&
2964 vm_memory_malloc_no_cow(user_alias)) {
2965 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY_FORK;
2966 object->true_share = TRUE;
2967 }
2968 if (purgable) {
2969 task_t owner;
2970 object->purgable = VM_PURGABLE_NONVOLATILE;
2971 if (map->pmap == kernel_pmap) {
2972 /*
2973 * Purgeable mappings made in a kernel
2974 * map are "owned" by the kernel itself
2975 * rather than the current user task
2976 * because they're likely to be used by
2977 * more than this user task (see
2978 * execargs_purgeable_allocate(), for
2979 * example).
2980 */
2981 owner = kernel_task;
2982 } else {
2983 owner = current_task();
2984 }
2985 assert(object->vo_owner == NULL);
2986 assert(object->resident_page_count == 0);
2987 assert(object->wired_page_count == 0);
2988 vm_object_lock(object);
2989 vm_purgeable_nonvolatile_enqueue(object, owner);
2990 vm_object_unlock(object);
2991 }
2992 offset = (vm_object_offset_t)0;
2993 }
2994 } else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
2995 /* no coalescing if address space uses sub-pages */
2996 } else if ((is_submap == FALSE) &&
2997 (object == VM_OBJECT_NULL) &&
2998 (entry != vm_map_to_entry(map)) &&
2999 (entry->vme_end == start) &&
3000 (!entry->is_shared) &&
3001 (!entry->is_sub_map) &&
3002 (!entry->in_transition) &&
3003 (!entry->needs_wakeup) &&
3004 (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
3005 (entry->protection == cur_protection) &&
3006 (entry->max_protection == max_protection) &&
3007 (entry->inheritance == inheritance) &&
3008 ((user_alias == VM_MEMORY_REALLOC) ||
3009 (VME_ALIAS(entry) == alias)) &&
3010 (entry->no_cache == no_cache) &&
3011 (entry->vme_permanent == permanent) &&
3012 /* no coalescing for immutable executable mappings */
3013 !((entry->protection & VM_PROT_EXECUTE) &&
3014 entry->vme_permanent) &&
3015 (!entry->superpage_size && !superpage_size) &&
3016 /*
3017 * No coalescing if not map-aligned, to avoid propagating
3018 * that condition any further than needed:
3019 */
3020 (!entry->map_aligned || !clear_map_aligned) &&
3021 (!entry->zero_wired_pages) &&
3022 (!entry->used_for_jit && !entry_for_jit) &&
3023 #if __arm64e__
3024 (!entry->used_for_tpro && !entry_for_tpro) &&
3025 #endif
3026 (!entry->csm_associated) &&
3027 (entry->iokit_acct == iokit_acct) &&
3028 (!entry->vme_resilient_codesign) &&
3029 (!entry->vme_resilient_media) &&
3030 (!entry->vme_atomic) &&
3031 (entry->vme_no_copy_on_read == no_copy_on_read) &&
3032
3033 ((entry->vme_end - entry->vme_start) + size <=
3034 (user_alias == VM_MEMORY_REALLOC ?
3035 ANON_CHUNK_SIZE :
3036 NO_COALESCE_LIMIT)) &&
3037
3038 (entry->wired_count == 0)) { /* implies user_wired_count == 0 */
3039 if (vm_object_coalesce(VME_OBJECT(entry),
3040 VM_OBJECT_NULL,
3041 VME_OFFSET(entry),
3042 (vm_object_offset_t) 0,
3043 (vm_map_size_t)(entry->vme_end - entry->vme_start),
3044 (vm_map_size_t)(end - entry->vme_end))) {
3045 /*
3046 * Coalesced the two objects - can extend
3047 * the previous map entry to include the
3048 * new range.
3049 */
3050 map->size += (end - entry->vme_end);
3051 assert(entry->vme_start < end);
3052 assert(VM_MAP_PAGE_ALIGNED(end,
3053 VM_MAP_PAGE_MASK(map)));
3054 if (__improbable(vm_debug_events)) {
3055 DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
3056 }
3057 entry->vme_end = end;
3058 if (map->holelistenabled) {
3059 vm_map_store_update_first_free(map, entry, TRUE);
3060 } else {
3061 vm_map_store_update_first_free(map, map->first_free, TRUE);
3062 }
3063 new_mapping_established = TRUE;
3064 RETURN(KERN_SUCCESS);
3065 }
3066 }
3067
3068 step = superpage_size ? SUPERPAGE_SIZE : (end - start);
3069 new_entry = NULL;
3070
3071 if (vmk_flags.vmkf_submap_adjust) {
3072 vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
3073 offset = start;
3074 }
3075
3076 for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3077 tmp2_end = tmp2_start + step;
3078 /*
3079 * Create a new entry
3080 *
3081 * XXX FBDP
3082 * The reserved "page zero" in each process's address space can
3083 * be arbitrarily large. Splitting it into separate objects and
3084 * therefore different VM map entries serves no purpose and just
3085 * slows down operations on the VM map, so let's not split the
3086 * allocation into chunks if the max protection is NONE. That
3087 * memory should never be accessible, so it will never get to the
3088 * default pager.
3089 */
3090 tmp_start = tmp2_start;
3091 if (!is_submap &&
3092 object == VM_OBJECT_NULL &&
3093 size > chunk_size &&
3094 max_protection != VM_PROT_NONE &&
3095 superpage_size == 0) {
3096 tmp_end = tmp_start + chunk_size;
3097 } else {
3098 tmp_end = tmp2_end;
3099 }
3100 do {
3101 if (!is_submap &&
3102 object != VM_OBJECT_NULL &&
3103 object->internal &&
3104 offset + (tmp_end - tmp_start) > object->vo_size) {
3105 // printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3106 DTRACE_VM5(vm_map_enter_overmap,
3107 vm_map_t, map,
3108 vm_map_address_t, tmp_start,
3109 vm_map_address_t, tmp_end,
3110 vm_object_offset_t, offset,
3111 vm_object_size_t, object->vo_size);
3112 }
3113 new_entry = vm_map_entry_insert(map,
3114 entry, tmp_start, tmp_end,
3115 object, offset, vmk_flags,
3116 needs_copy,
3117 cur_protection, max_protection,
3118 (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3119 VM_INHERIT_NONE : inheritance),
3120 clear_map_aligned);
3121
3122 assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias));
3123
3124 if (resilient_codesign) {
3125 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3126 if (!((cur_protection | max_protection) & reject_prot)) {
3127 new_entry->vme_resilient_codesign = TRUE;
3128 }
3129 }
3130
3131 if (resilient_media &&
3132 (object == VM_OBJECT_NULL ||
3133 object->internal)) {
3134 new_entry->vme_resilient_media = TRUE;
3135 }
3136
3137 assert(!new_entry->iokit_acct);
3138 if (!is_submap &&
3139 object != VM_OBJECT_NULL &&
3140 (object->purgable != VM_PURGABLE_DENY ||
3141 object->vo_ledger_tag)) {
3142 assert(new_entry->use_pmap);
3143 assert(!new_entry->iokit_acct);
3144 /*
3145 * Turn off pmap accounting since
3146 * purgeable (or tagged) objects have their
3147 * own ledgers.
3148 */
3149 new_entry->use_pmap = FALSE;
3150 } else if (!is_submap &&
3151 iokit_acct &&
3152 object != VM_OBJECT_NULL &&
3153 object->internal) {
3154 /* alternate accounting */
3155 assert(!new_entry->iokit_acct);
3156 assert(new_entry->use_pmap);
3157 new_entry->iokit_acct = TRUE;
3158 new_entry->use_pmap = FALSE;
3159 DTRACE_VM4(
3160 vm_map_iokit_mapped_region,
3161 vm_map_t, map,
3162 vm_map_offset_t, new_entry->vme_start,
3163 vm_map_offset_t, new_entry->vme_end,
3164 int, VME_ALIAS(new_entry));
3165 vm_map_iokit_mapped_region(
3166 map,
3167 (new_entry->vme_end -
3168 new_entry->vme_start));
3169 } else if (!is_submap) {
3170 assert(!new_entry->iokit_acct);
3171 assert(new_entry->use_pmap);
3172 }
3173
3174 if (is_submap) {
3175 vm_map_t submap;
3176 boolean_t submap_is_64bit;
3177 boolean_t use_pmap;
3178
3179 assert(new_entry->is_sub_map);
3180 assert(!new_entry->use_pmap);
3181 assert(!new_entry->iokit_acct);
3182 submap = (vm_map_t) object;
3183 submap_is_64bit = vm_map_is_64bit(submap);
3184 use_pmap = vmk_flags.vmkf_nested_pmap;
3185 #ifndef NO_NESTED_PMAP
3186 if (use_pmap && submap->pmap == NULL) {
3187 ledger_t ledger = map->pmap->ledger;
3188 /* we need a sub pmap to nest... */
3189 submap->pmap = pmap_create_options(ledger, 0,
3190 submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3191 if (submap->pmap == NULL) {
3192 /* let's proceed without nesting... */
3193 }
3194 #if defined(__arm64__)
3195 else {
3196 pmap_set_nested(submap->pmap);
3197 }
3198 #endif
3199 }
3200 if (use_pmap && submap->pmap != NULL) {
3201 if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3202 DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3203 kr = KERN_FAILURE;
3204 } else {
3205 kr = pmap_nest(map->pmap,
3206 submap->pmap,
3207 tmp_start,
3208 tmp_end - tmp_start);
3209 }
3210 if (kr != KERN_SUCCESS) {
3211 printf("vm_map_enter: "
3212 "pmap_nest(0x%llx,0x%llx) "
3213 "error 0x%x\n",
3214 (long long)tmp_start,
3215 (long long)tmp_end,
3216 kr);
3217 } else {
3218 /* we're now nested ! */
3219 new_entry->use_pmap = TRUE;
3220 pmap_empty = FALSE;
3221 }
3222 }
3223 #endif /* NO_NESTED_PMAP */
3224 }
3225 entry = new_entry;
3226
3227 if (superpage_size) {
3228 vm_page_t pages, m;
3229 vm_object_t sp_object;
3230 vm_object_offset_t sp_offset;
3231
3232 VME_OFFSET_SET(entry, 0);
3233
3234 /* allocate one superpage */
3235 kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3236 if (kr != KERN_SUCCESS) {
3237 /* deallocate whole range... */
3238 new_mapping_established = TRUE;
3239 /* ... but only up to "tmp_end" */
3240 size -= end - tmp_end;
3241 RETURN(kr);
3242 }
3243
3244 /* create one vm_object per superpage */
3245 sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3246 sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3247 sp_object->phys_contiguous = TRUE;
3248 sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3249 VME_OBJECT_SET(entry, sp_object, false, 0);
3250 assert(entry->use_pmap);
3251
3252 /* enter the base pages into the object */
3253 vm_object_lock(sp_object);
3254 for (sp_offset = 0;
3255 sp_offset < SUPERPAGE_SIZE;
3256 sp_offset += PAGE_SIZE) {
3257 m = pages;
3258 pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3259 pages = NEXT_PAGE(m);
3260 *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3261 vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3262 }
3263 vm_object_unlock(sp_object);
3264 }
3265 } while (tmp_end != tmp2_end &&
3266 (tmp_start = tmp_end) &&
3267 (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3268 tmp_end + chunk_size : tmp2_end));
3269 }
3270
3271 new_mapping_established = TRUE;
3272
3273 BailOut:
3274 assert(map_locked == TRUE);
3275
3276 /*
3277 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3278 * If we have identified and possibly established the new mapping(s),
3279 * make sure we did not go beyond the address space limit.
3280 */
3281 if (result == KERN_SUCCESS) {
3282 if (map->size_limit != RLIM_INFINITY &&
3283 map->size > map->size_limit) {
3284 /*
3285 * Establishing the requested mappings would exceed
3286 * the process's RLIMIT_AS limit: fail with
3287 * KERN_NO_SPACE.
3288 */
3289 result = KERN_NO_SPACE;
3290 printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3291 proc_selfpid(),
3292 (get_bsdtask_info(current_task())
3293 ? proc_name_address(get_bsdtask_info(current_task()))
3294 : "?"),
3295 __FUNCTION__,
3296 (uint64_t) map->size,
3297 (uint64_t) map->size_limit);
3298 DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3299 vm_map_size_t, map->size,
3300 uint64_t, map->size_limit);
3301 vm_map_enter_RLIMIT_AS_count++;
3302 } else if (map->data_limit != RLIM_INFINITY &&
3303 map->size > map->data_limit) {
3304 /*
3305 * Establishing the requested mappings would exceed
3306 * the process's RLIMIT_DATA limit: fail with
3307 * KERN_NO_SPACE.
3308 */
3309 result = KERN_NO_SPACE;
3310 printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3311 proc_selfpid(),
3312 (get_bsdtask_info(current_task())
3313 ? proc_name_address(get_bsdtask_info(current_task()))
3314 : "?"),
3315 __FUNCTION__,
3316 (uint64_t) map->size,
3317 (uint64_t) map->data_limit);
3318 DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3319 vm_map_size_t, map->size,
3320 uint64_t, map->data_limit);
3321 vm_map_enter_RLIMIT_DATA_count++;
3322 }
3323 }
3324
3325 if (result == KERN_SUCCESS) {
3326 vm_prot_t pager_prot;
3327 memory_object_t pager;
3328
3329 #if DEBUG
3330 if (pmap_empty &&
3331 !(vmk_flags.vmkf_no_pmap_check)) {
3332 assert(pmap_is_empty(map->pmap,
3333 *address,
3334 *address + size));
3335 }
3336 #endif /* DEBUG */
3337
3338 /*
3339 * For "named" VM objects, let the pager know that the
3340 * memory object is being mapped. Some pagers need to keep
3341 * track of this, to know when they can reclaim the memory
3342 * object, for example.
3343 * VM calls memory_object_map() for each mapping (specifying
3344 * the protection of each mapping) and calls
3345 * memory_object_last_unmap() when all the mappings are gone.
3346 */
3347 pager_prot = max_protection;
3348 if (needs_copy) {
3349 /*
3350 * Copy-On-Write mapping: won't modify
3351 * the memory object.
3352 */
3353 pager_prot &= ~VM_PROT_WRITE;
3354 }
3355 if (!is_submap &&
3356 object != VM_OBJECT_NULL &&
3357 object->named &&
3358 object->pager != MEMORY_OBJECT_NULL) {
3359 vm_object_lock(object);
3360 pager = object->pager;
3361 if (object->named &&
3362 pager != MEMORY_OBJECT_NULL) {
3363 assert(object->pager_ready);
3364 vm_object_mapping_wait(object, THREAD_UNINT);
3365 vm_object_mapping_begin(object);
3366 vm_object_unlock(object);
3367
3368 kr = memory_object_map(pager, pager_prot);
3369 assert(kr == KERN_SUCCESS);
3370
3371 vm_object_lock(object);
3372 vm_object_mapping_end(object);
3373 }
3374 vm_object_unlock(object);
3375 }
3376 }
3377
3378 assert(map_locked == TRUE);
3379
3380 if (new_mapping_established) {
3381 /*
3382 * If we release the map lock for any reason below,
3383 * another thread could deallocate our new mapping,
3384 * releasing the caller's reference on "caller_object",
3385 * which was transferred to the mapping.
3386 * If this was the only reference, the object could be
3387 * destroyed.
3388 *
3389 * We need to take an extra reference on "caller_object"
3390 * to keep it alive if we need to return the caller's
3391 * reference to the caller in case of failure.
3392 */
3393 if (is_submap) {
3394 vm_map_reference((vm_map_t)caller_object);
3395 } else {
3396 vm_object_reference(caller_object);
3397 }
3398 }
3399
3400 if (!keep_map_locked) {
3401 vm_map_unlock(map);
3402 map_locked = FALSE;
3403 entry = VM_MAP_ENTRY_NULL;
3404 new_entry = VM_MAP_ENTRY_NULL;
3405 }
3406
3407 /*
3408 * We can't hold the map lock if we enter this block.
3409 */
3410
3411 if (result == KERN_SUCCESS) {
3412 /* Wire down the new entry if the user
3413 * requested all new map entries be wired.
3414 */
3415 if ((map->wiring_required) || (superpage_size)) {
3416 assert(!keep_map_locked);
3417 pmap_empty = FALSE; /* pmap won't be empty */
3418 kr = vm_map_wire_kernel(map, start, end,
3419 cur_protection, VM_KERN_MEMORY_MLOCK,
3420 TRUE);
3421 result = kr;
3422 }
3423
3424 }
3425
3426 if (result != KERN_SUCCESS) {
3427 if (new_mapping_established) {
3428 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3429
3430 /*
3431 * We have to get rid of the new mappings since we
3432 * won't make them available to the user.
3433 * Try and do that atomically, to minimize the risk
3434 * that someone else create new mappings that range.
3435 */
3436 if (!map_locked) {
3437 vm_map_lock(map);
3438 map_locked = TRUE;
3439 }
3440 remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3441 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3442 if (permanent) {
3443 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3444 }
3445 (void) vm_map_delete(map,
3446 *address, *address + size,
3447 remove_flags,
3448 KMEM_GUARD_NONE, &zap_new_list);
3449 }
3450
3451 if (vm_map_zap_first_entry(&zap_old_list)) {
3452 vm_map_entry_t entry1, entry2;
3453
3454 /*
3455 * The new mapping failed. Attempt to restore
3456 * the old mappings, saved in the "zap_old_map".
3457 */
3458 if (!map_locked) {
3459 vm_map_lock(map);
3460 map_locked = TRUE;
3461 }
3462
3463 /* first check if the coast is still clear */
3464 start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3465 end = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3466
3467 if (vm_map_lookup_entry(map, start, &entry1) ||
3468 vm_map_lookup_entry(map, end, &entry2) ||
3469 entry1 != entry2) {
3470 /*
3471 * Part of that range has already been
3472 * re-mapped: we can't restore the old
3473 * mappings...
3474 */
3475 vm_map_enter_restore_failures++;
3476 } else {
3477 /*
3478 * Transfer the saved map entries from
3479 * "zap_old_map" to the original "map",
3480 * inserting them all after "entry1".
3481 */
3482 while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3483 vm_map_size_t entry_size;
3484
3485 entry_size = (entry2->vme_end -
3486 entry2->vme_start);
3487 vm_map_store_entry_link(map, entry1, entry2,
3488 VM_MAP_KERNEL_FLAGS_NONE);
3489 map->size += entry_size;
3490 entry1 = entry2;
3491 }
3492 if (map->wiring_required) {
3493 /*
3494 * XXX TODO: we should rewire the
3495 * old pages here...
3496 */
3497 }
3498 vm_map_enter_restore_successes++;
3499 }
3500 }
3501 }
3502
3503 /*
3504 * The caller is responsible for releasing the lock if it requested to
3505 * keep the map locked.
3506 */
3507 if (map_locked && !keep_map_locked) {
3508 vm_map_unlock(map);
3509 }
3510
3511 vm_map_zap_dispose(&zap_old_list);
3512 vm_map_zap_dispose(&zap_new_list);
3513
3514 if (new_mapping_established) {
3515 /*
3516 * The caller had a reference on "caller_object" and we
3517 * transferred that reference to the mapping.
3518 * We also took an extra reference on "caller_object" to keep
3519 * it alive while the map was unlocked.
3520 */
3521 if (result == KERN_SUCCESS) {
3522 /*
3523 * On success, the caller's reference on the object gets
3524 * tranferred to the mapping.
3525 * Release our extra reference.
3526 */
3527 if (is_submap) {
3528 vm_map_deallocate((vm_map_t)caller_object);
3529 } else {
3530 vm_object_deallocate(caller_object);
3531 }
3532 } else {
3533 /*
3534 * On error, the caller expects to still have a
3535 * reference on the object it gave us.
3536 * Let's use our extra reference for that.
3537 */
3538 }
3539 }
3540
3541 return result;
3542
3543 #undef RETURN
3544 }
3545
#if __arm64__
extern const struct memory_object_pager_ops fourk_pager_ops;
/*
 * vm_map_enter_fourk:
 *
 * Establish a mapping at 4K ("fourk") granularity inside a map whose
 * native page size is larger (16K).  The caller's "object" is not mapped
 * directly; instead it is populated into the per-4K-slot table of a
 * "4K pager" (fourk_pager) object, and a copy-on-write copy of that
 * pager's VM object is what actually gets inserted into the map.
 *
 * Restrictions: only fixed-address (not "anywhere") mappings, no
 * overwrite, no submaps, no superpages, no purgeable memory, no JIT.
 *
 * "address" is IN/OUT: requested 4K-aligned address on input, actual
 * mapping address on output.
 *
 * Returns KERN_SUCCESS or a Mach error code.  On failure, any mappings
 * established here are deleted again.  If vmkf_keep_map_locked was set,
 * the map is returned locked to the caller.
 */
kern_return_t
vm_map_enter_fourk(
	vm_map_t                map,
	vm_map_offset_t         *address,       /* IN/OUT */
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_object_t             object,
	vm_object_offset_t      offset,
	boolean_t               needs_copy,
	vm_prot_t               cur_protection,
	vm_prot_t               max_protection,
	vm_inherit_t            inheritance)
{
	vm_map_entry_t          entry, new_entry;
	vm_map_offset_t         start, fourk_start;
	vm_map_offset_t         end, fourk_end;
	vm_map_size_t           fourk_size;
	kern_return_t           result = KERN_SUCCESS;
	boolean_t               map_locked = FALSE;
	boolean_t               pmap_empty = TRUE;
	boolean_t               new_mapping_established = FALSE;
	const bool              keep_map_locked = vmk_flags.vmkf_keep_map_locked;
	const bool              anywhere = !vmk_flags.vmf_fixed;
	const bool              purgable = vmk_flags.vmf_purgeable;
	const bool              overwrite = vmk_flags.vmf_overwrite;
	const bool              is_submap = vmk_flags.vmkf_submap;
	const bool              entry_for_jit = vmk_flags.vmkf_map_jit;
	const unsigned int      superpage_size = vmk_flags.vmf_superpage_size;
	vm_map_offset_t         effective_min_offset, effective_max_offset;
	kern_return_t           kr;
	boolean_t               clear_map_aligned = FALSE;
	memory_object_t         fourk_mem_obj;
	vm_object_t             fourk_object;
	vm_map_offset_t         fourk_pager_offset;
	int                     fourk_pager_index_start, fourk_pager_index_num;
	int                     cur_idx;
	boolean_t               fourk_copy;
	vm_object_t             copy_object;
	vm_object_offset_t      copy_offset;
	VM_MAP_ZAP_DECLARE(zap_list);

	/* "fourk" only makes sense when the map's page size exceeds 4K */
	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
		panic("%s:%d", __FUNCTION__, __LINE__);
	}
	fourk_mem_obj = MEMORY_OBJECT_NULL;
	fourk_object = VM_OBJECT_NULL;

	if (superpage_size) {
		return KERN_NOT_SUPPORTED;
	}

	/*
	 * W^X enforcement: strip EXECUTE from write+execute requests unless
	 * the mapping is explicitly for JIT (or the platform/map exempts it).
	 */
	if ((cur_protection & VM_PROT_WRITE) &&
	    (cur_protection & VM_PROT_EXECUTE) &&
#if XNU_TARGET_OS_OSX
	    map->pmap != kernel_pmap &&
	    (vm_map_cs_enforcement(map)
#if __arm64__
	    || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
	    ) &&
#endif /* XNU_TARGET_OS_OSX */
#if CODE_SIGNING_MONITOR
	    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
#endif
	    !entry_for_jit) {
		DTRACE_VM3(cs_wx,
		    uint64_t, 0,
		    uint64_t, 0,
		    vm_prot_t, cur_protection);
		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. "
		    "turning off execute\n",
		    proc_selfpid(),
		    (get_bsdtask_info(current_task())
		    ? proc_name_address(get_bsdtask_info(current_task()))
		    : "?"),
		    __FUNCTION__);
		cur_protection &= ~VM_PROT_EXECUTE;
	}

	/*
	 * If the task has requested executable lockdown,
	 * deny any new executable mapping.
	 */
	if (map->map_disallow_new_exec == TRUE) {
		if (cur_protection & VM_PROT_EXECUTE) {
			return KERN_PROTECTION_FAILURE;
		}
	}

	/* features this path does not (and need not) support */
	if (is_submap) {
		return KERN_NOT_SUPPORTED;
	}
	if (vmk_flags.vmkf_already) {
		return KERN_NOT_SUPPORTED;
	}
	if (purgable || entry_for_jit) {
		return KERN_NOT_SUPPORTED;
	}

	effective_min_offset = map->min_offset;

	if (vmk_flags.vmkf_beyond_max) {
		return KERN_NOT_SUPPORTED;
	} else {
		effective_max_offset = map->max_offset;
	}

	if (size == 0 ||
	    (offset & FOURK_PAGE_MASK) != 0) {
		*address = 0;
		return KERN_INVALID_ARGUMENT;
	}

#define RETURN(value) { result = value; goto BailOut; }

	assert(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK));
	assert(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK));

	if (!anywhere && overwrite) {
		return KERN_NOT_SUPPORTED;
	}

	/* remember the 4K-granular request before rounding to map pages */
	fourk_start = *address;
	fourk_size = size;
	fourk_end = fourk_start + fourk_size;

	start = vm_map_trunc_page(*address, VM_MAP_PAGE_MASK(map));
	end = vm_map_round_page(fourk_end, VM_MAP_PAGE_MASK(map));
	size = end - start;

	if (anywhere) {
		return KERN_NOT_SUPPORTED;
	} else {
		/*
		 * Verify that:
		 * the address doesn't itself violate
		 * the mask requirement.
		 */

		vm_map_lock(map);
		map_locked = TRUE;
		if ((start & mask) != 0) {
			RETURN(KERN_NO_SPACE);
		}

		/*
		 * ... the address is within bounds
		 */

		end = start + size;

		if ((start < effective_min_offset) ||
		    (end > effective_max_offset) ||
		    (start >= end)) {
			RETURN(KERN_INVALID_ADDRESS);
		}

		/*
		 * ... the starting address isn't allocated
		 */
		if (vm_map_lookup_entry(map, start, &entry)) {
			vm_object_t cur_object, shadow_object;

			/*
			 * We might already some 4K mappings
			 * in a 16K page here.
			 */

			if (entry->vme_end - entry->vme_start
			    != SIXTEENK_PAGE_SIZE) {
				RETURN(KERN_NO_SPACE);
			}
			if (entry->is_sub_map) {
				RETURN(KERN_NO_SPACE);
			}
			if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
				RETURN(KERN_NO_SPACE);
			}

			/* go all the way down the shadow chain */
			cur_object = VME_OBJECT(entry);
			vm_object_lock(cur_object);
			while (cur_object->shadow != VM_OBJECT_NULL) {
				shadow_object = cur_object->shadow;
				vm_object_lock(shadow_object);
				vm_object_unlock(cur_object);
				cur_object = shadow_object;
				shadow_object = VM_OBJECT_NULL;
			}
			if (cur_object->internal ||
			    cur_object->pager == NULL) {
				vm_object_unlock(cur_object);
				RETURN(KERN_NO_SPACE);
			}
			/* only reuse the existing entry if it's a 4K pager */
			if (cur_object->pager->mo_pager_ops
			    != &fourk_pager_ops) {
				vm_object_unlock(cur_object);
				RETURN(KERN_NO_SPACE);
			}
			fourk_object = cur_object;
			fourk_mem_obj = fourk_object->pager;

			/* keep the "4K" object alive */
			vm_object_reference_locked(fourk_object);
			memory_object_reference(fourk_mem_obj);
			vm_object_unlock(fourk_object);

			/* merge permissions */
			entry->protection |= cur_protection;
			entry->max_protection |= max_protection;

			if ((entry->protection & VM_PROT_WRITE) &&
			    (entry->protection & VM_PROT_ALLEXEC) &&
			    fourk_binary_compatibility_unsafe &&
			    fourk_binary_compatibility_allow_wx) {
				/* write+execute: need to be "jit" */
				entry->used_for_jit = TRUE;
			}
			goto map_in_fourk_pager;
		}

		/*
		 * ... the next region doesn't overlap the
		 * end point.
		 */

		if ((entry->vme_next != vm_map_to_entry(map)) &&
		    (entry->vme_next->vme_start < end)) {
			RETURN(KERN_NO_SPACE);
		}
	}

	/*
	 * At this point,
	 * "start" and "end" should define the endpoints of the
	 * available new range, and
	 * "entry" should refer to the region before the new
	 * range, and
	 *
	 * the map should be locked.
	 */

	/* create a new "4K" pager */
	fourk_mem_obj = fourk_pager_create();
	fourk_object = fourk_pager_to_vm_object(fourk_mem_obj);
	assert(fourk_object);

	/* keep the "4" object alive */
	vm_object_reference(fourk_object);

	/* create a "copy" object, to map the "4K" object copy-on-write */
	fourk_copy = TRUE;
	result = vm_object_copy_strategically(fourk_object,
	    0,
	    end - start,
	    false,                      /* forking */
	    &copy_object,
	    &copy_offset,
	    &fourk_copy);
	assert(result == KERN_SUCCESS);
	assert(copy_object != VM_OBJECT_NULL);
	assert(copy_offset == 0);

	/* map the "4K" pager's copy object */
	new_entry = vm_map_entry_insert(map,
	    entry,
	    vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)),
	    vm_map_round_page(end, VM_MAP_PAGE_MASK(map)),
	    copy_object,
	    0,                         /* offset */
	    vmk_flags,
	    FALSE,                     /* needs_copy */
	    cur_protection, max_protection,
	    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
	    VM_INHERIT_NONE : inheritance),
	    clear_map_aligned);
	entry = new_entry;

#if VM_MAP_DEBUG_FOURK
	if (vm_map_debug_fourk) {
		printf("FOURK_PAGER: map %p [0x%llx:0x%llx] new pager %p\n",
		    map,
		    (uint64_t) entry->vme_start,
		    (uint64_t) entry->vme_end,
		    fourk_mem_obj);
	}
#endif /* VM_MAP_DEBUG_FOURK */

	new_mapping_established = TRUE;

map_in_fourk_pager:
	/* "map" the original "object" where it belongs in the "4K" pager */
	fourk_pager_offset = (fourk_start & SIXTEENK_PAGE_MASK);
	fourk_pager_index_start = (int) (fourk_pager_offset / FOURK_PAGE_SIZE);
	if (fourk_size > SIXTEENK_PAGE_SIZE) {
		fourk_pager_index_num = 4;
	} else {
		fourk_pager_index_num = (int) (fourk_size / FOURK_PAGE_SIZE);
	}
	/* clamp to the 4 slots of one 16K page */
	if (fourk_pager_index_start + fourk_pager_index_num > 4) {
		fourk_pager_index_num = 4 - fourk_pager_index_start;
	}
	for (cur_idx = 0;
	    cur_idx < fourk_pager_index_num;
	    cur_idx++) {
		vm_object_t             old_object;
		vm_object_offset_t      old_offset;

		kr = fourk_pager_populate(fourk_mem_obj,
		    TRUE,               /* overwrite */
		    fourk_pager_index_start + cur_idx,
		    object,
		    (object
		    ? (offset +
		    (cur_idx * FOURK_PAGE_SIZE))
		    : 0),
		    &old_object,
		    &old_offset);
#if VM_MAP_DEBUG_FOURK
		if (vm_map_debug_fourk) {
			if (old_object == (vm_object_t) -1 &&
			    old_offset == (vm_object_offset_t) -1) {
				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
				    "pager [%p:0x%llx] "
				    "populate[%d] "
				    "[object:%p,offset:0x%llx]\n",
				    map,
				    (uint64_t) entry->vme_start,
				    (uint64_t) entry->vme_end,
				    fourk_mem_obj,
				    VME_OFFSET(entry),
				    fourk_pager_index_start + cur_idx,
				    object,
				    (object
				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
				    : 0));
			} else {
				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
				    "pager [%p:0x%llx] "
				    "populate[%d] [object:%p,offset:0x%llx] "
				    "old [%p:0x%llx]\n",
				    map,
				    (uint64_t) entry->vme_start,
				    (uint64_t) entry->vme_end,
				    fourk_mem_obj,
				    VME_OFFSET(entry),
				    fourk_pager_index_start + cur_idx,
				    object,
				    (object
				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
				    : 0),
				    old_object,
				    old_offset);
			}
		}
#endif /* VM_MAP_DEBUG_FOURK */

		assert(kr == KERN_SUCCESS);
		/*
		 * Reference accounting for the slot swap: take a ref on
		 * the newly-installed object, drop the displaced one's.
		 */
		if (object != old_object &&
		    object != VM_OBJECT_NULL &&
		    object != (vm_object_t) -1) {
			vm_object_reference(object);
		}
		if (object != old_object &&
		    old_object != VM_OBJECT_NULL &&
		    old_object != (vm_object_t) -1) {
			vm_object_deallocate(old_object);
		}
	}

BailOut:
	assert(map_locked == TRUE);

	if (result == KERN_SUCCESS) {
		vm_prot_t pager_prot;
		memory_object_t pager;

#if DEBUG
		if (pmap_empty &&
		    !(vmk_flags.vmkf_no_pmap_check)) {
			assert(pmap_is_empty(map->pmap,
			    *address,
			    *address + size));
		}
#endif /* DEBUG */

		/*
		 * For "named" VM objects, let the pager know that the
		 * memory object is being mapped.  Some pagers need to keep
		 * track of this, to know when they can reclaim the memory
		 * object, for example.
		 * VM calls memory_object_map() for each mapping (specifying
		 * the protection of each mapping) and calls
		 * memory_object_last_unmap() when all the mappings are gone.
		 */
		pager_prot = max_protection;
		if (needs_copy) {
			/*
			 * Copy-On-Write mapping: won't modify
			 * the memory object.
			 */
			pager_prot &= ~VM_PROT_WRITE;
		}
		if (!is_submap &&
		    object != VM_OBJECT_NULL &&
		    object->named &&
		    object->pager != MEMORY_OBJECT_NULL) {
			vm_object_lock(object);
			pager = object->pager;
			if (object->named &&
			    pager != MEMORY_OBJECT_NULL) {
				assert(object->pager_ready);
				vm_object_mapping_wait(object, THREAD_UNINT);
				vm_object_mapping_begin(object);
				vm_object_unlock(object);

				kr = memory_object_map(pager, pager_prot);
				assert(kr == KERN_SUCCESS);

				vm_object_lock(object);
				vm_object_mapping_end(object);
			}
			vm_object_unlock(object);
		}
		/* same notification for the "4K" pager's object */
		if (!is_submap &&
		    fourk_object != VM_OBJECT_NULL &&
		    fourk_object->named &&
		    fourk_object->pager != MEMORY_OBJECT_NULL) {
			vm_object_lock(fourk_object);
			pager = fourk_object->pager;
			if (fourk_object->named &&
			    pager != MEMORY_OBJECT_NULL) {
				assert(fourk_object->pager_ready);
				vm_object_mapping_wait(fourk_object,
				    THREAD_UNINT);
				vm_object_mapping_begin(fourk_object);
				vm_object_unlock(fourk_object);

				kr = memory_object_map(pager, VM_PROT_READ);
				assert(kr == KERN_SUCCESS);

				vm_object_lock(fourk_object);
				vm_object_mapping_end(fourk_object);
			}
			vm_object_unlock(fourk_object);
		}
	}

	/* drop the references we took to keep the "4K" object alive */
	if (fourk_object != VM_OBJECT_NULL) {
		vm_object_deallocate(fourk_object);
		fourk_object = VM_OBJECT_NULL;
		memory_object_deallocate(fourk_mem_obj);
		fourk_mem_obj = MEMORY_OBJECT_NULL;
	}

	assert(map_locked == TRUE);

	if (!keep_map_locked) {
		vm_map_unlock(map);
		map_locked = FALSE;
	}

	/*
	 * We can't hold the map lock if we enter this block.
	 */

	if (result == KERN_SUCCESS) {
		/* Wire down the new entry if the user
		 * requested all new map entries be wired.
		 */
		if ((map->wiring_required) || (superpage_size)) {
			assert(!keep_map_locked);
			pmap_empty = FALSE; /* pmap won't be empty */
			kr = vm_map_wire_kernel(map, start, end,
			    new_entry->protection, VM_KERN_MEMORY_MLOCK,
			    TRUE);
			result = kr;
		}
	}

	if (result != KERN_SUCCESS) {
		if (new_mapping_established) {
			/*
			 * We have to get rid of the new mappings since we
			 * won't make them available to the user.
			 * Try and do that atomically, to minimize the risk
			 * that someone else create new mappings that range.
			 */

			if (!map_locked) {
				vm_map_lock(map);
				map_locked = TRUE;
			}
			(void)vm_map_delete(map, *address, *address + size,
			    VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_NO_YIELD,
			    KMEM_GUARD_NONE, &zap_list);
		}
	}

	/*
	 * The caller is responsible for releasing the lock if it requested to
	 * keep the map locked.
	 */
	if (map_locked && !keep_map_locked) {
		vm_map_unlock(map);
	}

	vm_map_zap_dispose(&zap_list);

	return result;

#undef RETURN
}
#endif /* __arm64__ */
4064
4065 /*
4066 * Counters for the prefault optimization.
4067 */
4068 int64_t vm_prefault_nb_pages = 0;
4069 int64_t vm_prefault_nb_bailout = 0;
4070
/*
 * vm_map_enter_mem_object_helper:
 *
 * Common back-end for vm_map_enter_mem_object() and
 * vm_map_enter_mem_object_prefault().  Establishes a mapping in
 * "target_map" for the memory represented by "port":
 *   - !IP_VALID(port):       anonymous memory (no backing object),
 *   - IKOT_NAMED_ENTRY:      a named entry, which may back a sub-map,
 *                            a vm_map_copy, or a VM object,
 *   - IKOT_MEMORY_OBJECT:    a "fake" port that is really a raw
 *                            memory object.
 * If "copy" is TRUE, the mapping gets its own copy of the backing
 * memory (copy-on-write resolved via the object's copy strategy)
 * instead of sharing it.  If "page_list_count" is non-zero, the
 * caller also wants the listed physical pages pre-entered into the
 * pmap ("prefault") right after the mapping is made; for non-kernel
 * maps that is done with the map lock held to keep map + prefault
 * atomic.
 *
 * On success, *address is set to the mapped address, adjusted by
 * offset_in_mapping when VM_FLAGS_RETURN_DATA_ADDR /
 * VM_FLAGS_RETURN_4K_DATA_ADDR was requested.
 */
static kern_return_t
vm_map_enter_mem_object_helper(
	vm_map_t target_map,
	vm_map_offset_t *address,
	vm_map_size_t initial_size,
	vm_map_offset_t mask,
	vm_map_kernel_flags_t vmk_flags,
	ipc_port_t port,
	vm_object_offset_t offset,
	boolean_t copy,
	vm_prot_t cur_protection,
	vm_prot_t max_protection,
	vm_inherit_t inheritance,
	upl_page_list_ptr_t page_list,
	unsigned int page_list_count)
{
	vm_map_address_t map_addr;
	vm_map_size_t map_size;
	vm_object_t object;
	vm_object_size_t size;
	kern_return_t result;
	boolean_t mask_cur_protection, mask_max_protection;
	boolean_t kernel_prefault, try_prefault = (page_list_count != 0);
	vm_map_offset_t offset_in_mapping = 0;
#if __arm64__
	boolean_t fourk = vmk_flags.vmkf_fourk;
#endif /* __arm64__ */

	if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
		/* XXX TODO4K prefaulting depends on page size... */
		try_prefault = FALSE;
	}

	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
	vm_map_kernel_flags_update_range_id(&vmk_flags, target_map);

	/* extract, then strip, the "use as mask" modifier on the protections */
	mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
	mask_max_protection = max_protection & VM_PROT_IS_MASK;
	cur_protection &= ~VM_PROT_IS_MASK;
	max_protection &= ~VM_PROT_IS_MASK;

	/*
	 * Check arguments for validity
	 */
	if ((target_map == VM_MAP_NULL) ||
	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
	    (inheritance > VM_INHERIT_LAST_VALID) ||
	    (try_prefault && (copy || !page_list)) ||
	    initial_size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

#if __arm64__
	if (cur_protection & VM_PROT_EXECUTE) {
		/* executable implies readable */
		cur_protection |= VM_PROT_READ;
	}

	if (fourk && VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
		/* no "fourk" if map is using a sub-page page size */
		fourk = FALSE;
	}
	if (fourk) {
		map_addr = vm_map_trunc_page(*address, FOURK_PAGE_MASK);
		map_size = vm_map_round_page(initial_size, FOURK_PAGE_MASK);
	} else
#endif /* __arm64__ */
	{
		map_addr = vm_map_trunc_page(*address,
		    VM_MAP_PAGE_MASK(target_map));
		map_size = vm_map_round_page(initial_size,
		    VM_MAP_PAGE_MASK(target_map));
	}
	if (map_size == 0) {
		/* rounding overflowed */
		return KERN_INVALID_ARGUMENT;
	}
	size = vm_object_round_page(initial_size);

	/*
	 * Find the vm object (if any) corresponding to this port.
	 */
	if (!IP_VALID(port)) {
		/* anonymous memory: no backing object, nothing to copy */
		object = VM_OBJECT_NULL;
		offset = 0;
		copy = FALSE;
	} else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
		vm_named_entry_t named_entry;
		vm_object_offset_t data_offset;

		named_entry = mach_memory_entry_from_port(port);

		if (vmk_flags.vmf_return_data_addr ||
		    vmk_flags.vmf_return_4k_data_addr) {
			data_offset = named_entry->data_offset;
			offset += named_entry->data_offset;
		} else {
			data_offset = 0;
		}

		/* a few checks to make sure user is obeying rules */
		if (mask_max_protection) {
			max_protection &= named_entry->protection;
		}
		if (mask_cur_protection) {
			cur_protection &= named_entry->protection;
		}
		if ((named_entry->protection & max_protection) !=
		    max_protection) {
			/* asking for more than the entry allows */
			return KERN_INVALID_RIGHT;
		}
		if ((named_entry->protection & cur_protection) !=
		    cur_protection) {
			return KERN_INVALID_RIGHT;
		}
		if (offset + size <= offset) {
			/* overflow */
			return KERN_INVALID_ARGUMENT;
		}
		if (named_entry->size < (offset + initial_size)) {
			/* mapping would run past the end of the entry */
			return KERN_INVALID_ARGUMENT;
		}

		if (named_entry->is_copy) {
			/* for a vm_map_copy, we can only map it whole */
			if ((size != named_entry->size) &&
			    (vm_map_round_page(size,
			    VM_MAP_PAGE_MASK(target_map)) ==
			    named_entry->size)) {
				/* XXX FBDP use the rounded size... */
				size = vm_map_round_page(
					size,
					VM_MAP_PAGE_MASK(target_map));
			}
		}

		/* the callers parameter offset is defined to be the */
		/* offset from beginning of named entry offset in object */
		offset = offset + named_entry->offset;

		if (!VM_MAP_PAGE_ALIGNED(size,
		    VM_MAP_PAGE_MASK(target_map))) {
			/*
			 * Let's not map more than requested;
			 * vm_map_enter() will handle this "not map-aligned"
			 * case.
			 */
			map_size = size;
		}

		named_entry_lock(named_entry);
		if (named_entry->is_sub_map) {
			vm_map_t submap;

			if (vmk_flags.vmf_return_data_addr ||
			    vmk_flags.vmf_return_4k_data_addr) {
				panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
			}

			/* take a map reference before dropping the entry lock */
			submap = named_entry->backing.map;
			vm_map_reference(submap);
			named_entry_unlock(named_entry);

			vmk_flags.vmkf_submap = TRUE;

			result = vm_map_enter(target_map,
			    &map_addr,
			    map_size,
			    mask,
			    vmk_flags,
			    (vm_object_t)(uintptr_t) submap,
			    offset,
			    copy,
			    cur_protection,
			    max_protection,
			    inheritance);
			if (result != KERN_SUCCESS) {
				vm_map_deallocate(submap);
			} else {
				/*
				 * No need to lock "submap" just to check its
				 * "mapped" flag: that flag is never reset
				 * once it's been set and if we race, we'll
				 * just end up setting it twice, which is OK.
				 */
				if (submap->mapped_in_other_pmaps == FALSE &&
				    vm_map_pmap(submap) != PMAP_NULL &&
				    vm_map_pmap(submap) !=
				    vm_map_pmap(target_map)) {
					/*
					 * This submap is being mapped in a map
					 * that uses a different pmap.
					 * Set its "mapped_in_other_pmaps" flag
					 * to indicate that we now need to
					 * remove mappings from all pmaps rather
					 * than just the submap's pmap.
					 */
					vm_map_lock(submap);
					submap->mapped_in_other_pmaps = TRUE;
					vm_map_unlock(submap);
				}
				*address = map_addr;
			}
			return result;
		} else if (named_entry->is_copy) {
			kern_return_t kr;
			vm_map_copy_t copy_map;
			vm_map_entry_t copy_entry;
			vm_map_offset_t copy_addr;
			vm_map_copy_t target_copy_map;
			vm_map_offset_t overmap_start, overmap_end;
			vm_map_offset_t trimmed_start;
			vm_map_size_t target_size;

			/* only a restricted set of vm flags makes sense here */
			if (!vm_map_kernel_flags_check_vmflags(vmk_flags,
			    (VM_FLAGS_FIXED |
			    VM_FLAGS_ANYWHERE |
			    VM_FLAGS_OVERWRITE |
			    VM_FLAGS_RETURN_4K_DATA_ADDR |
			    VM_FLAGS_RETURN_DATA_ADDR))) {
				named_entry_unlock(named_entry);
				return KERN_INVALID_ARGUMENT;
			}

			copy_map = named_entry->backing.copy;
			assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
			if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
				/* unsupported type; should not happen */
				printf("vm_map_enter_mem_object: "
				    "memory_entry->backing.copy "
				    "unsupported type 0x%x\n",
				    copy_map->type);
				named_entry_unlock(named_entry);
				return KERN_INVALID_ARGUMENT;
			}

			if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
				DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, offset, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
			}

			if (vmk_flags.vmf_return_data_addr ||
			    vmk_flags.vmf_return_4k_data_addr) {
				offset_in_mapping = offset & VM_MAP_PAGE_MASK(target_map);
				if (vmk_flags.vmf_return_4k_data_addr) {
					/* truncate to a 4K boundary */
					offset_in_mapping &= ~((signed)(0xFFF));
				}
			}

			target_copy_map = VM_MAP_COPY_NULL;
			target_size = copy_map->size;
			overmap_start = 0;
			overmap_end = 0;
			trimmed_start = 0;
			if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
				/*
				 * Page size mismatch between the copy map and
				 * the target map: build an adjusted copy map.
				 */
				DEBUG4K_ADJUST("adjusting...\n");
				kr = vm_map_copy_adjust_to_target(
					copy_map,
					offset /* includes data_offset */,
					initial_size,
					target_map,
					copy,
					&target_copy_map,
					&overmap_start,
					&overmap_end,
					&trimmed_start);
				if (kr != KERN_SUCCESS) {
					named_entry_unlock(named_entry);
					return kr;
				}
				target_size = target_copy_map->size;
				if (trimmed_start >= data_offset) {
					data_offset = offset & VM_MAP_PAGE_MASK(target_map);
				} else {
					data_offset -= trimmed_start;
				}
			} else {
				/*
				 * Assert that the vm_map_copy is coming from the right
				 * zone and hasn't been forged
				 */
				vm_map_copy_require(copy_map);
				target_copy_map = copy_map;
			}

			vm_map_kernel_flags_t rsv_flags = vmk_flags;

			vm_map_kernel_flags_and_vmflags(&rsv_flags,
			    (VM_FLAGS_FIXED |
			    VM_FLAGS_ANYWHERE |
			    VM_FLAGS_OVERWRITE |
			    VM_FLAGS_RETURN_4K_DATA_ADDR |
			    VM_FLAGS_RETURN_DATA_ADDR));

			/* reserve a contiguous range */
			kr = vm_map_enter(target_map,
			    &map_addr,
			    vm_map_round_page(target_size, VM_MAP_PAGE_MASK(target_map)),
			    mask,
			    rsv_flags,
			    VM_OBJECT_NULL,
			    0,
			    FALSE,               /* copy */
			    cur_protection,
			    max_protection,
			    inheritance);
			if (kr != KERN_SUCCESS) {
				DEBUG4K_ERROR("kr 0x%x\n", kr);
				if (target_copy_map != copy_map) {
					vm_map_copy_discard(target_copy_map);
					target_copy_map = VM_MAP_COPY_NULL;
				}
				named_entry_unlock(named_entry);
				return kr;
			}

			copy_addr = map_addr;

			/*
			 * Over-map each entry of the copy map into the
			 * reserved range, one vm_map_enter() at a time.
			 */
			for (copy_entry = vm_map_copy_first_entry(target_copy_map);
			    copy_entry != vm_map_copy_to_entry(target_copy_map);
			    copy_entry = copy_entry->vme_next) {
				vm_map_t copy_submap = VM_MAP_NULL;
				vm_object_t copy_object = VM_OBJECT_NULL;
				vm_map_size_t copy_size;
				vm_object_offset_t copy_offset;
				boolean_t do_copy = false;

				if (copy_entry->is_sub_map) {
					copy_submap = VME_SUBMAP(copy_entry);
					copy_object = (vm_object_t)copy_submap;
				} else {
					copy_object = VME_OBJECT(copy_entry);
				}
				copy_offset = VME_OFFSET(copy_entry);
				copy_size = (copy_entry->vme_end -
				    copy_entry->vme_start);

				/* sanity check */
				if ((copy_addr + copy_size) >
				    (map_addr +
				    overmap_start + overmap_end +
				    named_entry->size /* XXX full size */)) {
					/* over-mapping too much !? */
					kr = KERN_INVALID_ARGUMENT;
					DEBUG4K_ERROR("kr 0x%x\n", kr);
					/* abort */
					break;
				}

				/* take a reference on the object */
				if (copy_entry->is_sub_map) {
					vm_map_reference(copy_submap);
				} else {
					if (!copy &&
					    copy_object != VM_OBJECT_NULL &&
					    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
						bool is_writable;

						/*
						 * We need to resolve our side of this
						 * "symmetric" copy-on-write now; we
						 * need a new object to map and share,
						 * instead of the current one which
						 * might still be shared with the
						 * original mapping.
						 *
						 * Note: A "vm_map_copy_t" does not
						 * have a lock but we're protected by
						 * the named entry's lock here.
						 */
						// assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
						VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
						assert(copy_object != VME_OBJECT(copy_entry));
						is_writable = false;
						if (copy_entry->protection & VM_PROT_WRITE) {
							is_writable = true;
#if __arm64e__
						} else if (copy_entry->used_for_tpro) {
							is_writable = true;
#endif /* __arm64e__ */
						}
						if (!copy_entry->needs_copy && is_writable) {
							vm_prot_t prot;

							/* revoke write access to force future CoW faults */
							prot = copy_entry->protection & ~VM_PROT_WRITE;
							vm_object_pmap_protect(copy_object,
							    copy_offset,
							    copy_size,
							    PMAP_NULL,
							    PAGE_SIZE,
							    0,
							    prot);
						}
						copy_entry->needs_copy = FALSE;
						copy_entry->is_shared = TRUE;
						copy_object = VME_OBJECT(copy_entry);
						copy_offset = VME_OFFSET(copy_entry);
						vm_object_lock(copy_object);
						/* we're about to make a shared mapping of this object */
						copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
						copy_object->true_share = TRUE;
						vm_object_unlock(copy_object);
					}

					if (copy_object != VM_OBJECT_NULL &&
					    copy_object->named &&
					    copy_object->pager != MEMORY_OBJECT_NULL &&
					    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
						memory_object_t pager;
						vm_prot_t pager_prot;

						/*
						 * For "named" VM objects, let the pager know that the
						 * memory object is being mapped. Some pagers need to keep
						 * track of this, to know when they can reclaim the memory
						 * object, for example.
						 * VM calls memory_object_map() for each mapping (specifying
						 * the protection of each mapping) and calls
						 * memory_object_last_unmap() when all the mappings are gone.
						 */
						pager_prot = max_protection;
						if (copy) {
							/*
							 * Copy-On-Write mapping: won't modify the
							 * memory object.
							 */
							pager_prot &= ~VM_PROT_WRITE;
						}
						vm_object_lock(copy_object);
						pager = copy_object->pager;
						if (copy_object->named &&
						    pager != MEMORY_OBJECT_NULL &&
						    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
							assert(copy_object->pager_ready);
							vm_object_mapping_wait(copy_object, THREAD_UNINT);
							vm_object_mapping_begin(copy_object);
							vm_object_unlock(copy_object);

							kr = memory_object_map(pager, pager_prot);
							assert(kr == KERN_SUCCESS);

							vm_object_lock(copy_object);
							vm_object_mapping_end(copy_object);
						}
						vm_object_unlock(copy_object);
					}

					/*
					 * Perform the copy if requested
					 */

					if (copy && copy_object != VM_OBJECT_NULL) {
						vm_object_t new_object;
						vm_object_offset_t new_offset;

						result = vm_object_copy_strategically(copy_object, copy_offset,
						    copy_size,
						    false,               /* forking */
						    &new_object, &new_offset,
						    &do_copy);


						if (result == KERN_MEMORY_RESTART_COPY) {
							boolean_t success;
							boolean_t src_needs_copy;

							/*
							 * XXX
							 * We currently ignore src_needs_copy.
							 * This really is the issue of how to make
							 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
							 * non-kernel users to use. Solution forthcoming.
							 * In the meantime, since we don't allow non-kernel
							 * memory managers to specify symmetric copy,
							 * we won't run into problems here.
							 */
							new_object = copy_object;
							new_offset = copy_offset;
							success = vm_object_copy_quickly(new_object,
							    new_offset,
							    copy_size,
							    &src_needs_copy,
							    &do_copy);
							assert(success);
							result = KERN_SUCCESS;
						}
						if (result != KERN_SUCCESS) {
							kr = result;
							break;
						}

						copy_object = new_object;
						copy_offset = new_offset;
						/*
						 * No extra object reference for the mapping:
						 * the mapping should be the only thing keeping
						 * this new object alive.
						 */
					} else {
						/*
						 * We already have the right object
						 * to map.
						 */
						copy_object = VME_OBJECT(copy_entry);
						/* take an extra ref for the mapping below */
						vm_object_reference(copy_object);
					}
				}

				/*
				 * If the caller does not want a specific
				 * tag for this new mapping: use
				 * the tag of the original mapping.
				 */
				vm_map_kernel_flags_t vmk_remap_flags = {
					.vmkf_submap = copy_entry->is_sub_map,
				};

				vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
				    vm_map_kernel_flags_vmflags(vmk_flags),
				    vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));

				/* over-map the object into destination */
				vmk_remap_flags.vmf_fixed = true;
				vmk_remap_flags.vmf_overwrite = true;

				if (!copy && !copy_entry->is_sub_map) {
					/*
					 * copy-on-write should have been
					 * resolved at this point, or we would
					 * end up sharing instead of copying.
					 */
					assert(!copy_entry->needs_copy);
				}
#if XNU_TARGET_OS_OSX
				if (copy_entry->used_for_jit) {
					vmk_remap_flags.vmkf_map_jit = TRUE;
				}
#endif /* XNU_TARGET_OS_OSX */

				kr = vm_map_enter(target_map,
				    &copy_addr,
				    copy_size,
				    (vm_map_offset_t) 0,
				    vmk_remap_flags,
				    copy_object,
				    copy_offset,
				    ((copy_object == NULL)
				    ? FALSE
				    : (copy || copy_entry->needs_copy)),
				    cur_protection,
				    max_protection,
				    inheritance);
				if (kr != KERN_SUCCESS) {
					DEBUG4K_SHARE("failed kr 0x%x\n", kr);
					/* drop the reference taken above for this entry */
					if (copy_entry->is_sub_map) {
						vm_map_deallocate(copy_submap);
					} else {
						vm_object_deallocate(copy_object);
					}
					/* abort */
					break;
				}

				/* next mapping */
				copy_addr += copy_size;
			}

			if (kr == KERN_SUCCESS) {
				if (vmk_flags.vmf_return_data_addr ||
				    vmk_flags.vmf_return_4k_data_addr) {
					*address = map_addr + offset_in_mapping;
				} else {
					*address = map_addr;
				}
				if (overmap_start) {
					*address += overmap_start;
					DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t) offset_in_mapping, (uint64_t)overmap_start, (uint64_t)*address);
				}
			}
			named_entry_unlock(named_entry);
			if (target_copy_map != copy_map) {
				/* discard the adjusted copy map we built above */
				vm_map_copy_discard(target_copy_map);
				target_copy_map = VM_MAP_COPY_NULL;
			}

			if (kr != KERN_SUCCESS && !vmk_flags.vmf_overwrite) {
				/* deallocate the contiguous range */
				(void) vm_deallocate(target_map,
				    map_addr,
				    map_size);
			}

			return kr;
		}

		if (named_entry->is_object) {
			unsigned int access;
			unsigned int wimg_mode;

			/* we are mapping a VM object */

			access = named_entry->access;

			if (vmk_flags.vmf_return_data_addr ||
			    vmk_flags.vmf_return_4k_data_addr) {
				offset_in_mapping = offset - VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
				if (vmk_flags.vmf_return_4k_data_addr) {
					offset_in_mapping &= ~((signed)(0xFFF));
				}
				offset = VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
				map_size = VM_MAP_ROUND_PAGE((offset + offset_in_mapping + initial_size) - offset, VM_MAP_PAGE_MASK(target_map));
			}

			object = vm_named_entry_to_vm_object(named_entry);
			assert(object != VM_OBJECT_NULL);
			vm_object_lock(object);
			named_entry_unlock(named_entry);

			/* keep the object alive for the mapping below */
			vm_object_reference_locked(object);

			/* apply the named entry's cache attributes to the object */
			wimg_mode = object->wimg_bits;
			vm_prot_to_wimg(access, &wimg_mode);
			if (object->wimg_bits != wimg_mode) {
				vm_object_change_wimg_mode(object, wimg_mode);
			}

			vm_object_unlock(object);
		} else {
			panic("invalid VM named entry %p", named_entry);
		}
	} else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
		/*
		 * JMM - This is temporary until we unify named entries
		 * and raw memory objects.
		 *
		 * Detected fake ip_kotype for a memory object. In
		 * this case, the port isn't really a port at all, but
		 * instead is just a raw memory object.
		 */
		if (vmk_flags.vmf_return_data_addr ||
		    vmk_flags.vmf_return_4k_data_addr) {
			panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
		}

		object = memory_object_to_vm_object((memory_object_t)port);
		if (object == VM_OBJECT_NULL) {
			return KERN_INVALID_OBJECT;
		}
		vm_object_reference(object);

		/* wait for object (if any) to be ready */
		if (object != VM_OBJECT_NULL) {
			if (is_kernel_object(object)) {
				printf("Warning: Attempt to map kernel object"
				    " by a non-private kernel entity\n");
				/*
				 * NOTE(review): the reference taken just above is
				 * not released on this error path — looks like a
				 * leak on a should-never-happen path; confirm.
				 */
				return KERN_INVALID_OBJECT;
			}
			if (!object->pager_ready) {
				/* block until the pager has finished initializing */
				vm_object_lock(object);

				while (!object->pager_ready) {
					vm_object_wait(object,
					    VM_OBJECT_EVENT_PAGER_READY,
					    THREAD_UNINT);
					vm_object_lock(object);
				}
				vm_object_unlock(object);
			}
		}
	} else {
		return KERN_INVALID_OBJECT;
	}

	if (object != VM_OBJECT_NULL &&
	    object->named &&
	    object->pager != MEMORY_OBJECT_NULL &&
	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
		memory_object_t pager;
		vm_prot_t pager_prot;
		kern_return_t kr;

		/*
		 * For "named" VM objects, let the pager know that the
		 * memory object is being mapped. Some pagers need to keep
		 * track of this, to know when they can reclaim the memory
		 * object, for example.
		 * VM calls memory_object_map() for each mapping (specifying
		 * the protection of each mapping) and calls
		 * memory_object_last_unmap() when all the mappings are gone.
		 */
		pager_prot = max_protection;
		if (copy) {
			/*
			 * Copy-On-Write mapping: won't modify the
			 * memory object.
			 */
			pager_prot &= ~VM_PROT_WRITE;
		}
		vm_object_lock(object);
		pager = object->pager;
		/* re-check under the object lock before notifying the pager */
		if (object->named &&
		    pager != MEMORY_OBJECT_NULL &&
		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
			assert(object->pager_ready);
			vm_object_mapping_wait(object, THREAD_UNINT);
			vm_object_mapping_begin(object);
			vm_object_unlock(object);

			kr = memory_object_map(pager, pager_prot);
			assert(kr == KERN_SUCCESS);

			vm_object_lock(object);
			vm_object_mapping_end(object);
		}
		vm_object_unlock(object);
	}

	/*
	 * Perform the copy if requested
	 */

	if (copy) {
		vm_object_t new_object;
		vm_object_offset_t new_offset;

		result = vm_object_copy_strategically(object, offset,
		    map_size,
		    false,               /* forking */
		    &new_object, &new_offset,
		    &copy);


		if (result == KERN_MEMORY_RESTART_COPY) {
			boolean_t success;
			boolean_t src_needs_copy;

			/*
			 * XXX
			 * We currently ignore src_needs_copy.
			 * This really is the issue of how to make
			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
			 * non-kernel users to use. Solution forthcoming.
			 * In the meantime, since we don't allow non-kernel
			 * memory managers to specify symmetric copy,
			 * we won't run into problems here.
			 */
			new_object = object;
			new_offset = offset;
			success = vm_object_copy_quickly(new_object,
			    new_offset,
			    map_size,
			    &src_needs_copy,
			    &copy);
			assert(success);
			result = KERN_SUCCESS;
		}
		/*
		 * Throw away the reference to the
		 * original object, as it won't be mapped.
		 */

		vm_object_deallocate(object);

		if (result != KERN_SUCCESS) {
			return result;
		}

		object = new_object;
		offset = new_offset;
	}

	/*
	 * If non-kernel users want to try to prefault pages, the mapping and prefault
	 * needs to be atomic.
	 */
	kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
	vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);

#if __arm64__
	if (fourk) {
		/* map this object in a "4K" pager */
		result = vm_map_enter_fourk(target_map,
		    &map_addr,
		    map_size,
		    (vm_map_offset_t) mask,
		    vmk_flags,
		    object,
		    offset,
		    copy,
		    cur_protection,
		    max_protection,
		    inheritance);
	} else
#endif /* __arm64__ */
	{
		result = vm_map_enter(target_map,
		    &map_addr, map_size,
		    (vm_map_offset_t)mask,
		    vmk_flags,
		    object, offset,
		    copy,
		    cur_protection, max_protection,
		    inheritance);
	}
	if (result != KERN_SUCCESS) {
		/* drop the mapping's reference on failure */
		vm_object_deallocate(object);
	}

	/*
	 * Try to prefault, and do not forget to release the vm map lock.
	 */
	if (result == KERN_SUCCESS && try_prefault) {
		mach_vm_address_t va = map_addr;
		kern_return_t kr = KERN_SUCCESS;
		unsigned int i = 0;
		int pmap_options;

		pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
		if (object->internal) {
			pmap_options |= PMAP_OPTIONS_INTERNAL;
		}

		for (i = 0; i < page_list_count; ++i) {
			if (!UPL_VALID_PAGE(page_list, i)) {
				if (kernel_prefault) {
					/* kernel prefault requires every page to be valid */
					assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
					result = KERN_MEMORY_ERROR;
					break;
				}
			} else {
				/*
				 * If this function call failed, we should stop
				 * trying to optimize, other calls are likely
				 * going to fail too.
				 *
				 * We are not gonna report an error for such
				 * failure though. That's an optimization, not
				 * something critical.
				 */
				kr = pmap_enter_options(target_map->pmap,
				    va, UPL_PHYS_PAGE(page_list, i),
				    cur_protection, VM_PROT_NONE,
				    0, TRUE, pmap_options, NULL, PMAP_MAPPING_TYPE_INFER);
				if (kr != KERN_SUCCESS) {
					OSIncrementAtomic64(&vm_prefault_nb_bailout);
					if (kernel_prefault) {
						result = kr;
					}
					break;
				}
				OSIncrementAtomic64(&vm_prefault_nb_pages);
			}

			/* Next virtual address */
			va += PAGE_SIZE;
		}
		if (vmk_flags.vmkf_keep_map_locked) {
			/* vm_map_enter() left the map locked for us */
			vm_map_unlock(target_map);
		}
	}

	if (vmk_flags.vmf_return_data_addr ||
	    vmk_flags.vmf_return_4k_data_addr) {
		*address = map_addr + offset_in_mapping;
	} else {
		*address = map_addr;
	}
	return result;
}
4939
4940 kern_return_t
vm_map_enter_mem_object(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_t offset,boolean_t copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)4941 vm_map_enter_mem_object(
4942 vm_map_t target_map,
4943 vm_map_offset_t *address,
4944 vm_map_size_t initial_size,
4945 vm_map_offset_t mask,
4946 vm_map_kernel_flags_t vmk_flags,
4947 ipc_port_t port,
4948 vm_object_offset_t offset,
4949 boolean_t copy,
4950 vm_prot_t cur_protection,
4951 vm_prot_t max_protection,
4952 vm_inherit_t inheritance)
4953 {
4954 kern_return_t ret;
4955
4956 /* range_id is set by vm_map_enter_mem_object_helper */
4957 ret = vm_map_enter_mem_object_helper(target_map,
4958 address,
4959 initial_size,
4960 mask,
4961 vmk_flags,
4962 port,
4963 offset,
4964 copy,
4965 cur_protection,
4966 max_protection,
4967 inheritance,
4968 NULL,
4969 0);
4970
4971 #if KASAN
4972 if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4973 kasan_notify_address(*address, initial_size);
4974 }
4975 #endif
4976
4977 return ret;
4978 }
4979
4980 kern_return_t
vm_map_enter_mem_object_prefault(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_t offset,vm_prot_t cur_protection,vm_prot_t max_protection,upl_page_list_ptr_t page_list,unsigned int page_list_count)4981 vm_map_enter_mem_object_prefault(
4982 vm_map_t target_map,
4983 vm_map_offset_t *address,
4984 vm_map_size_t initial_size,
4985 vm_map_offset_t mask,
4986 vm_map_kernel_flags_t vmk_flags,
4987 ipc_port_t port,
4988 vm_object_offset_t offset,
4989 vm_prot_t cur_protection,
4990 vm_prot_t max_protection,
4991 upl_page_list_ptr_t page_list,
4992 unsigned int page_list_count)
4993 {
4994 kern_return_t ret;
4995
4996 /* range_id is set by vm_map_enter_mem_object_helper */
4997 ret = vm_map_enter_mem_object_helper(target_map,
4998 address,
4999 initial_size,
5000 mask,
5001 vmk_flags,
5002 port,
5003 offset,
5004 FALSE,
5005 cur_protection,
5006 max_protection,
5007 VM_INHERIT_DEFAULT,
5008 page_list,
5009 page_list_count);
5010
5011 #if KASAN
5012 if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
5013 kasan_notify_address(*address, initial_size);
5014 }
5015 #endif
5016
5017 return ret;
5018 }
5019
5020
/*
 * vm_map_enter_mem_object_control:
 *
 * Map the VM object behind a memory object control handle into
 * "target_map".  Similar to vm_map_enter_mem_object() but starts
 * from a memory_object_control_t instead of an ipc_port_t, so there
 * is no named-entry or sub-map handling here: the control resolves
 * directly to a VM object.  If "copy" is TRUE, the mapping gets its
 * own copy of the object (per the object's copy strategy); otherwise
 * the object is mapped shared.
 *
 * On success, *address is set to the address of the new mapping.
 */
kern_return_t
vm_map_enter_mem_object_control(
	vm_map_t target_map,
	vm_map_offset_t *address,
	vm_map_size_t initial_size,
	vm_map_offset_t mask,
	vm_map_kernel_flags_t vmk_flags,
	memory_object_control_t control,
	vm_object_offset_t offset,
	boolean_t copy,
	vm_prot_t cur_protection,
	vm_prot_t max_protection,
	vm_inherit_t inheritance)
{
	vm_map_address_t map_addr;
	vm_map_size_t map_size;
	vm_object_t object;
	vm_object_size_t size;
	kern_return_t result;
	memory_object_t pager;
	vm_prot_t pager_prot;
	kern_return_t kr;
#if __arm64__
	boolean_t fourk = vmk_flags.vmkf_fourk;
#endif /* __arm64__ */

	/*
	 * Check arguments for validity
	 */
	if ((target_map == VM_MAP_NULL) ||
	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
	    (inheritance > VM_INHERIT_LAST_VALID) ||
	    initial_size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

#if __arm64__
	if (fourk && VM_MAP_PAGE_MASK(target_map) < PAGE_MASK) {
		/* no "fourk" if map is using a sub-page page size */
		fourk = FALSE;
	}

	if (fourk) {
		map_addr = vm_map_trunc_page(*address,
		    FOURK_PAGE_MASK);
		map_size = vm_map_round_page(initial_size,
		    FOURK_PAGE_MASK);
	} else
#endif /* __arm64__ */
	{
		map_addr = vm_map_trunc_page(*address,
		    VM_MAP_PAGE_MASK(target_map));
		map_size = vm_map_round_page(initial_size,
		    VM_MAP_PAGE_MASK(target_map));
	}
	size = vm_object_round_page(initial_size);

	object = memory_object_control_to_vm_object(control);

	if (object == VM_OBJECT_NULL) {
		return KERN_INVALID_OBJECT;
	}

	if (is_kernel_object(object)) {
		printf("Warning: Attempt to map kernel object"
		    " by a non-private kernel entity\n");
		return KERN_INVALID_OBJECT;
	}

	vm_object_lock(object);
	/*
	 * Take a reference for the mapping (object lock held).
	 * NOTE(review): bumps ref_count directly instead of calling
	 * vm_object_reference_locked() as the other paths do — confirm
	 * this is intentional.
	 */
	object->ref_count++;

	/*
	 * For "named" VM objects, let the pager know that the
	 * memory object is being mapped. Some pagers need to keep
	 * track of this, to know when they can reclaim the memory
	 * object, for example.
	 * VM calls memory_object_map() for each mapping (specifying
	 * the protection of each mapping) and calls
	 * memory_object_last_unmap() when all the mappings are gone.
	 */
	pager_prot = max_protection;
	if (copy) {
		/* Copy-On-Write mapping: won't modify the memory object */
		pager_prot &= ~VM_PROT_WRITE;
	}
	pager = object->pager;
	if (object->named &&
	    pager != MEMORY_OBJECT_NULL &&
	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
		assert(object->pager_ready);
		vm_object_mapping_wait(object, THREAD_UNINT);
		vm_object_mapping_begin(object);
		vm_object_unlock(object);

		kr = memory_object_map(pager, pager_prot);
		assert(kr == KERN_SUCCESS);

		vm_object_lock(object);
		vm_object_mapping_end(object);
	}
	vm_object_unlock(object);

	/*
	 * Perform the copy if requested
	 */

	if (copy) {
		vm_object_t new_object;
		vm_object_offset_t new_offset;

		result = vm_object_copy_strategically(object, offset, size,
		    false,               /* forking */
		    &new_object, &new_offset,
		    &copy);


		if (result == KERN_MEMORY_RESTART_COPY) {
			boolean_t success;
			boolean_t src_needs_copy;

			/*
			 * XXX
			 * We currently ignore src_needs_copy.
			 * This really is the issue of how to make
			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
			 * non-kernel users to use. Solution forthcoming.
			 * In the meantime, since we don't allow non-kernel
			 * memory managers to specify symmetric copy,
			 * we won't run into problems here.
			 */
			new_object = object;
			new_offset = offset;
			success = vm_object_copy_quickly(new_object,
			    new_offset, size,
			    &src_needs_copy,
			    &copy);
			assert(success);
			result = KERN_SUCCESS;
		}
		/*
		 * Throw away the reference to the
		 * original object, as it won't be mapped.
		 */

		vm_object_deallocate(object);

		if (result != KERN_SUCCESS) {
			return result;
		}

		object = new_object;
		offset = new_offset;
	}

#if __arm64__
	if (fourk) {
		/* map this object in a "4K" pager */
		result = vm_map_enter_fourk(target_map,
		    &map_addr,
		    map_size,
		    (vm_map_offset_t)mask,
		    vmk_flags,
		    object, offset,
		    copy,
		    cur_protection, max_protection,
		    inheritance);
	} else
#endif /* __arm64__ */
	{
		result = vm_map_enter(target_map,
		    &map_addr, map_size,
		    (vm_map_offset_t)mask,
		    vmk_flags,
		    object, offset,
		    copy,
		    cur_protection, max_protection,
		    inheritance);
	}
	if (result != KERN_SUCCESS) {
		/* drop the mapping's reference on failure */
		vm_object_deallocate(object);
	}
	*address = map_addr;

	return result;
}
5205
5206
5207 #if VM_CPM
5208
5209 #ifdef MACH_ASSERT
5210 extern pmap_paddr_t avail_start, avail_end;
5211 #endif
5212
5213 /*
5214 * Allocate memory in the specified map, with the caveat that
5215 * the memory is physically contiguous. This call may fail
5216 * if the system can't find sufficient contiguous memory.
5217 * This call may cause or lead to heart-stopping amounts of
5218 * paging activity.
5219 *
5220 * Memory obtained from this call should be freed in the
5221 * normal way, viz., via vm_deallocate.
5222 */
5223 kern_return_t
vm_map_enter_cpm(vm_map_t map,vm_map_offset_t * addr,vm_map_size_t size,vm_map_kernel_flags_t vmk_flags)5224 vm_map_enter_cpm(
5225 vm_map_t map,
5226 vm_map_offset_t *addr,
5227 vm_map_size_t size,
5228 vm_map_kernel_flags_t vmk_flags)
5229 {
5230 vm_object_t cpm_obj;
5231 pmap_t pmap;
5232 vm_page_t m, pages;
5233 kern_return_t kr;
5234 vm_map_offset_t va, start, end, offset;
5235 #if MACH_ASSERT
5236 vm_map_offset_t prev_addr = 0;
5237 #endif /* MACH_ASSERT */
5238 uint8_t object_lock_type = 0;
5239
5240 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
5241 /* XXX TODO4K do we need to support this? */
5242 *addr = 0;
5243 return KERN_NOT_SUPPORTED;
5244 }
5245
5246 if (size == 0) {
5247 *addr = 0;
5248 return KERN_SUCCESS;
5249 }
5250 if (vmk_flags.vmf_fixed) {
5251 *addr = vm_map_trunc_page(*addr,
5252 VM_MAP_PAGE_MASK(map));
5253 } else {
5254 *addr = vm_map_min(map);
5255 }
5256 size = vm_map_round_page(size,
5257 VM_MAP_PAGE_MASK(map));
5258
5259 /*
5260 * LP64todo - cpm_allocate should probably allow
5261 * allocations of >4GB, but not with the current
5262 * algorithm, so just cast down the size for now.
5263 */
5264 if (size > VM_MAX_ADDRESS) {
5265 return KERN_RESOURCE_SHORTAGE;
5266 }
5267 if ((kr = cpm_allocate(CAST_DOWN(vm_size_t, size),
5268 &pages, 0, 0, TRUE, flags)) != KERN_SUCCESS) {
5269 return kr;
5270 }
5271
5272 cpm_obj = vm_object_allocate((vm_object_size_t)size);
5273 assert(cpm_obj != VM_OBJECT_NULL);
5274 assert(cpm_obj->internal);
5275 assert(cpm_obj->vo_size == (vm_object_size_t)size);
5276 assert(cpm_obj->can_persist == FALSE);
5277 assert(cpm_obj->pager_created == FALSE);
5278 assert(cpm_obj->pageout == FALSE);
5279 assert(cpm_obj->shadow == VM_OBJECT_NULL);
5280
5281 /*
5282 * Insert pages into object.
5283 */
5284 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5285 vm_object_lock(cpm_obj);
5286 for (offset = 0; offset < size; offset += PAGE_SIZE) {
5287 m = pages;
5288 pages = NEXT_PAGE(m);
5289 *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
5290
5291 assert(!m->vmp_gobbled);
5292 assert(!m->vmp_wanted);
5293 assert(!m->vmp_pageout);
5294 assert(!m->vmp_tabled);
5295 assert(VM_PAGE_WIRED(m));
5296 assert(m->vmp_busy);
5297 assert(VM_PAGE_GET_PHYS_PAGE(m) >= (avail_start >> PAGE_SHIFT) && VM_PAGE_GET_PHYS_PAGE(m) <= (avail_end >> PAGE_SHIFT));
5298
5299 m->vmp_busy = FALSE;
5300 vm_page_insert(m, cpm_obj, offset);
5301 }
5302 assert(cpm_obj->resident_page_count == size / PAGE_SIZE);
5303 vm_object_unlock(cpm_obj);
5304
5305 /*
5306 * Hang onto a reference on the object in case a
5307 * multi-threaded application for some reason decides
5308 * to deallocate the portion of the address space into
5309 * which we will insert this object.
5310 *
5311 * Unfortunately, we must insert the object now before
5312 * we can talk to the pmap module about which addresses
5313 * must be wired down. Hence, the race with a multi-
5314 * threaded app.
5315 */
5316 vm_object_reference(cpm_obj);
5317
5318 /*
5319 * Insert object into map.
5320 */
5321
5322 kr = vm_map_enter(
5323 map,
5324 addr,
5325 size,
5326 (vm_map_offset_t)0,
5327 vmk_flags,
5328 cpm_obj,
5329 (vm_object_offset_t)0,
5330 FALSE,
5331 VM_PROT_ALL,
5332 VM_PROT_ALL,
5333 VM_INHERIT_DEFAULT);
5334
5335 if (kr != KERN_SUCCESS) {
5336 /*
5337 * A CPM object doesn't have can_persist set,
5338 * so all we have to do is deallocate it to
5339 * free up these pages.
5340 */
5341 assert(cpm_obj->pager_created == FALSE);
5342 assert(cpm_obj->can_persist == FALSE);
5343 assert(cpm_obj->pageout == FALSE);
5344 assert(cpm_obj->shadow == VM_OBJECT_NULL);
5345 vm_object_deallocate(cpm_obj); /* kill acquired ref */
5346 vm_object_deallocate(cpm_obj); /* kill creation ref */
5347 }
5348
5349 /*
5350 * Inform the physical mapping system that the
5351 * range of addresses may not fault, so that
5352 * page tables and such can be locked down as well.
5353 */
5354 start = *addr;
5355 end = start + size;
5356 pmap = vm_map_pmap(map);
5357 pmap_pageable(pmap, start, end, FALSE);
5358
5359 /*
5360 * Enter each page into the pmap, to avoid faults.
5361 * Note that this loop could be coded more efficiently,
5362 * if the need arose, rather than looking up each page
5363 * again.
5364 */
5365 for (offset = 0, va = start; offset < size;
5366 va += PAGE_SIZE, offset += PAGE_SIZE) {
5367 int type_of_fault;
5368
5369 vm_object_lock(cpm_obj);
5370 m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5371 assert(m != VM_PAGE_NULL);
5372
5373 vm_page_zero_fill(m);
5374
5375 type_of_fault = DBG_ZERO_FILL_FAULT;
5376
5377 vm_fault_enter(m, pmap, va,
5378 PAGE_SIZE, 0,
5379 VM_PROT_ALL, VM_PROT_WRITE,
5380 VM_PAGE_WIRED(m),
5381 FALSE, /* change_wiring */
5382 VM_KERN_MEMORY_NONE, /* tag - not wiring */
5383 FALSE, /* cs_bypass */
5384 0, /* user_tag */
5385 0, /* pmap_options */
5386 NULL, /* need_retry */
5387 &type_of_fault,
5388 &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/
5389
5390 vm_object_unlock(cpm_obj);
5391 }
5392
5393 #if MACH_ASSERT
5394 /*
5395 * Verify ordering in address space.
5396 */
5397 for (offset = 0; offset < size; offset += PAGE_SIZE) {
5398 vm_object_lock(cpm_obj);
5399 m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5400 vm_object_unlock(cpm_obj);
5401 if (m == VM_PAGE_NULL) {
5402 panic("vm_allocate_cpm: obj %p off 0x%llx no page",
5403 cpm_obj, (uint64_t)offset);
5404 }
5405 assert(m->vmp_tabled);
5406 assert(!m->vmp_busy);
5407 assert(!m->vmp_wanted);
5408 assert(!m->vmp_fictitious);
5409 assert(!m->vmp_private);
5410 assert(!m->vmp_absent);
5411 assert(!m->vmp_cleaning);
5412 assert(!m->vmp_laundry);
5413 assert(!m->vmp_precious);
5414 assert(!m->vmp_clustered);
5415 if (offset != 0) {
5416 if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
5417 printf("start 0x%llx end 0x%llx va 0x%llx\n",
5418 (uint64_t)start, (uint64_t)end, (uint64_t)va);
5419 printf("obj %p off 0x%llx\n", cpm_obj, (uint64_t)offset);
5420 printf("m %p prev_address 0x%llx\n", m, (uint64_t)prev_addr);
5421 panic("vm_allocate_cpm: pages not contig!");
5422 }
5423 }
5424 prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
5425 }
5426 #endif /* MACH_ASSERT */
5427
5428 vm_object_deallocate(cpm_obj); /* kill extra ref */
5429
5430 return kr;
5431 }
5432
5433
5434 #else /* VM_CPM */
5435
5436 /*
5437 * Interface is defined in all cases, but unless the kernel
5438 * is built explicitly for this option, the interface does
5439 * nothing.
5440 */
5441
kern_return_t
vm_map_enter_cpm(
	__unused vm_map_t       map,
	__unused vm_map_offset_t        *addr,
	__unused vm_map_size_t  size,
	__unused vm_map_kernel_flags_t  vmk_flags)
{
	/* Contiguous physical memory (VM_CPM) support not compiled in. */
	return KERN_FAILURE;
}
5451 #endif /* VM_CPM */
5452
5453 /* Not used without nested pmaps */
5454 #ifndef NO_NESTED_PMAP
5455 /*
5456 * Clip and unnest a portion of a nested submap mapping.
5457 */
5458
5459
static void
vm_map_clip_unnest(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t start_unnest,
	vm_map_offset_t end_unnest)
{
	/* remember the caller's original range for diagnostics below */
	vm_map_offset_t old_start_unnest = start_unnest;
	vm_map_offset_t old_end_unnest = end_unnest;

	/* only a nested submap entry (one with its own pmap) can be unnested */
	assert(entry->is_sub_map);
	assert(VME_SUBMAP(entry) != NULL);
	assert(entry->use_pmap);

	/*
	 * Query the platform for the optimal unnest range.
	 * DRK: There's some duplication of effort here, since
	 * callers may have adjusted the range to some extent. This
	 * routine was introduced to support 1GiB subtree nesting
	 * for x86 platforms, which can also nest on 2MiB boundaries
	 * depending on size/alignment.
	 */
	if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
		/* the pmap layer widened the range: log the sub-optimal unnest */
		assert(VME_SUBMAP(entry)->is_nested_map);
		assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
		log_unnest_badness(map,
		    old_start_unnest,
		    old_end_unnest,
		    VME_SUBMAP(entry)->is_nested_map,
		    (entry->vme_start +
		    VME_SUBMAP(entry)->lowest_unnestable_start -
		    VME_OFFSET(entry)));
	}

	/* the (possibly adjusted) unnest range must lie within the entry */
	if (entry->vme_start > start_unnest ||
	    entry->vme_end < end_unnest) {
		panic("vm_map_clip_unnest(0x%llx,0x%llx): "
		    "bad nested entry: start=0x%llx end=0x%llx\n",
		    (long long)start_unnest, (long long)end_unnest,
		    (long long)entry->vme_start, (long long)entry->vme_end);
	}

	/*
	 * Clip the entry down to exactly [start_unnest, end_unnest),
	 * refreshing the map's first-free hint after each cut.
	 */
	if (start_unnest > entry->vme_start) {
		_vm_map_clip_start(&map->hdr,
		    entry,
		    start_unnest);
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
	if (entry->vme_end > end_unnest) {
		_vm_map_clip_end(&map->hdr,
		    entry,
		    end_unnest);
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}

	/* detach the shared/nested pmap for the clipped range */
	pmap_unnest(map->pmap,
	    entry->vme_start,
	    entry->vme_end - entry->vme_start);
	if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
		/* clean up parent map/maps */
		vm_map_submap_pmap_clean(
			map, entry->vme_start,
			entry->vme_end,
			VME_SUBMAP(entry),
			VME_OFFSET(entry));
	}
	/* from here on, this entry maps through the parent map's pmap */
	entry->use_pmap = FALSE;
	if ((map->pmap != kernel_pmap) &&
	    (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
		VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
	}
}
5540 #endif /* NO_NESTED_PMAP */
5541
5542 __abortlike
5543 static void
__vm_map_clip_atomic_entry_panic(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t where)5544 __vm_map_clip_atomic_entry_panic(
5545 vm_map_t map,
5546 vm_map_entry_t entry,
5547 vm_map_offset_t where)
5548 {
5549 panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
5550 "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
5551 (uint64_t)entry->vme_start,
5552 (uint64_t)entry->vme_end,
5553 (uint64_t)where);
5554 }
5555
5556 /*
5557 * vm_map_clip_start: [ internal use only ]
5558 *
5559 * Asserts that the given entry begins at or after
5560 * the specified address; if necessary,
5561 * it splits the entry into two.
5562 */
void
vm_map_clip_start(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t startaddr)
{
#ifndef NO_NESTED_PMAP
	if (entry->is_sub_map &&
	    entry->use_pmap &&
	    startaddr >= entry->vme_start) {
		vm_map_offset_t start_unnest, end_unnest;

		/*
		 * Make sure "startaddr" is no longer in a nested range
		 * before we clip. Unnest only the minimum range the platform
		 * can handle.
		 * vm_map_clip_unnest may perform additional adjustments to
		 * the unnest range.
		 */
		start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
		end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
	}
#endif /* NO_NESTED_PMAP */
	if (startaddr > entry->vme_start) {
		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    VME_OBJECT(entry)->phys_contiguous) {
			/*
			 * Physically contiguous objects may be mapped with
			 * large pmap blocks: drop the whole range's pmap
			 * mappings and let them fault back in after the clip.
			 */
			pmap_remove(map->pmap,
			    (addr64_t)(entry->vme_start),
			    (addr64_t)(entry->vme_end));
		}
		if (entry->vme_atomic) {
			/* atomic entries must never be split */
			__vm_map_clip_atomic_entry_panic(map, entry, startaddr);
		}

		DTRACE_VM5(
			vm_map_clip_start,
			vm_map_t, map,
			vm_map_offset_t, entry->vme_start,
			vm_map_offset_t, entry->vme_end,
			vm_map_offset_t, startaddr,
			int, VME_ALIAS(entry));

		_vm_map_clip_start(&map->hdr, entry, startaddr);
		/* keep the map's first-free hint consistent after the split */
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
}
5615
5616
/*
 * vm_map_copy_clip_start: clip "entry" in a vm_map_copy's entry list so
 * that it starts at "startaddr"; no-op if it already starts at or after
 * that address.  (Copy entries never require unnesting.)
 */
#define vm_map_copy_clip_start(copy, entry, startaddr) \
	MACRO_BEGIN \
	if ((startaddr) > (entry)->vme_start) \
	        _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
	MACRO_END
5622
5623 /*
5624 * This routine is called only when it is known that
5625 * the entry must be split.
5626 */
static void
_vm_map_clip_start(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         start)
{
	vm_map_entry_t  new_entry;

	/*
	 * Split off the front portion --
	 * note that we must insert the new
	 * entry BEFORE this one, so that
	 * this entry has the specified starting
	 * address.
	 */

	if (entry->map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(start,
		    VM_MAP_HDR_PAGE_MASK(map_header)));
	}

	/* clone the entry; the clone becomes the front portion [vme_start, start) */
	new_entry = _vm_map_entry_create(map_header);
	vm_map_entry_copy_full(new_entry, entry);

	new_entry->vme_end = start;
	assert(new_entry->vme_start < new_entry->vme_end);
	/* advance the original entry's object offset past the front portion */
	VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
	if (__improbable(start >= entry->vme_end)) {
		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start);
	}
	assert(start < entry->vme_end);
	entry->vme_start = start;

#if VM_BTLOG_TAGS
	if (new_entry->vme_kernel_object) {
		/* both halves now hold the allocation backtrace reference */
		btref_retain(new_entry->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */

	/* link the front portion in just before the original entry */
	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);

	/* two entries now reference the same submap/object: take an extra ref */
	if (entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(new_entry));
	} else {
		vm_object_reference(VME_OBJECT(new_entry));
	}
}
5674
5675
5676 /*
5677 * vm_map_clip_end: [ internal use only ]
5678 *
5679 * Asserts that the given entry ends at or before
5680 * the specified address; if necessary,
5681 * it splits the entry into two.
5682 */
void
vm_map_clip_end(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t endaddr)
{
	if (endaddr > entry->vme_end) {
		/*
		 * Within the scope of this clipping, limit "endaddr" to
		 * the end of this map entry...
		 */
		endaddr = entry->vme_end;
	}
#ifndef NO_NESTED_PMAP
	if (entry->is_sub_map && entry->use_pmap) {
		vm_map_offset_t start_unnest, end_unnest;

		/*
		 * Make sure the range between the start of this entry and
		 * the new "endaddr" is no longer nested before we clip.
		 * Unnest only the minimum range the platform can handle.
		 * vm_map_clip_unnest may perform additional adjustments to
		 * the unnest range.
		 */
		start_unnest = entry->vme_start;
		end_unnest =
		    (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
		    ~(pmap_shared_region_size_min(map->pmap) - 1);
		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
	}
#endif /* NO_NESTED_PMAP */
	if (endaddr < entry->vme_end) {
		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    VME_OBJECT(entry)->phys_contiguous) {
			/*
			 * Physically contiguous objects may be mapped with
			 * large pmap blocks: drop the whole range's pmap
			 * mappings and let them fault back in after the clip.
			 */
			pmap_remove(map->pmap,
			    (addr64_t)(entry->vme_start),
			    (addr64_t)(entry->vme_end));
		}
		if (entry->vme_atomic) {
			/* atomic entries must never be split */
			__vm_map_clip_atomic_entry_panic(map, entry, endaddr);
		}
		DTRACE_VM5(
			vm_map_clip_end,
			vm_map_t, map,
			vm_map_offset_t, entry->vme_start,
			vm_map_offset_t, entry->vme_end,
			vm_map_offset_t, endaddr,
			int, VME_ALIAS(entry));

		_vm_map_clip_end(&map->hdr, entry, endaddr);
		/* keep the map's first-free hint consistent after the split */
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
}
5741
5742
/*
 * vm_map_copy_clip_end: clip "entry" in a vm_map_copy's entry list so
 * that it ends at "endaddr"; no-op if it already ends at or before
 * that address.  (Copy entries never require unnesting.)
 */
#define vm_map_copy_clip_end(copy, entry, endaddr) \
	MACRO_BEGIN \
	if ((endaddr) < (entry)->vme_end) \
	        _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
	MACRO_END
5748
5749 /*
5750 * This routine is called only when it is known that
5751 * the entry must be split.
5752 */
static void
_vm_map_clip_end(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         end)
{
	vm_map_entry_t  new_entry;

	/*
	 * Create a new entry and insert it
	 * AFTER the specified entry
	 */

	if (entry->map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(end,
		    VM_MAP_HDR_PAGE_MASK(map_header)));
	}

	/* clone the entry; the clone becomes the tail portion [end, vme_end) */
	new_entry = _vm_map_entry_create(map_header);
	vm_map_entry_copy_full(new_entry, entry);

	if (__improbable(end <= entry->vme_start)) {
		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end);
	}
	assert(entry->vme_start < end);
	new_entry->vme_start = entry->vme_end = end;
	/* advance the tail's object offset past the portion kept by "entry" */
	VME_OFFSET_SET(new_entry,
	    VME_OFFSET(new_entry) + (end - entry->vme_start));
	assert(new_entry->vme_start < new_entry->vme_end);

#if VM_BTLOG_TAGS
	if (new_entry->vme_kernel_object) {
		/* both halves now hold the allocation backtrace reference */
		btref_retain(new_entry->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */

	/* link the tail portion in just after the original entry */
	_vm_map_store_entry_link(map_header, entry, new_entry);

	/* two entries now reference the same submap/object: take an extra ref */
	if (entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(new_entry));
	} else {
		vm_object_reference(VME_OBJECT(new_entry));
	}
}
5797
5798
5799 /*
5800 * VM_MAP_RANGE_CHECK: [ internal use only ]
5801 *
5802 * Asserts that the starting and ending region
5803 * addresses fall within the valid range of the map.
5804 */
/*
 * Clamps "start"/"end" (lvalues) into the map's valid range; a fully
 * out-of-range or inverted request degenerates to an empty range
 * (start == end) rather than reporting an error.
 */
#define VM_MAP_RANGE_CHECK(map, start, end) \
	MACRO_BEGIN \
	if (start < vm_map_min(map)) \
	        start = vm_map_min(map); \
	if (end > vm_map_max(map)) \
	        end = vm_map_max(map); \
	if (start > end) \
	        start = end; \
	MACRO_END
5814
5815 /*
5816 * vm_map_range_check: [ internal use only ]
5817 *
5818 * Check that the region defined by the specified start and
5819 * end addresses are wholly contained within a single map
 * entry or set of adjacent map entries of the specified map,
5821 * i.e. the specified region contains no unmapped space.
5822 * If any or all of the region is unmapped, FALSE is returned.
5823 * Otherwise, TRUE is returned and if the output argument 'entry'
5824 * is not NULL it points to the map entry containing the start
5825 * of the region.
5826 *
5827 * The map is locked for reading on entry and is left locked.
5828 */
5829 static boolean_t
vm_map_range_check(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_entry_t * entry)5830 vm_map_range_check(
5831 vm_map_t map,
5832 vm_map_offset_t start,
5833 vm_map_offset_t end,
5834 vm_map_entry_t *entry)
5835 {
5836 vm_map_entry_t cur;
5837 vm_map_offset_t prev;
5838
5839 /*
5840 * Basic sanity checks first
5841 */
5842 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5843 return FALSE;
5844 }
5845
5846 /*
5847 * Check first if the region starts within a valid
5848 * mapping for the map.
5849 */
5850 if (!vm_map_lookup_entry(map, start, &cur)) {
5851 return FALSE;
5852 }
5853
5854 /*
5855 * Optimize for the case that the region is contained
5856 * in a single map entry.
5857 */
5858 if (entry != (vm_map_entry_t *) NULL) {
5859 *entry = cur;
5860 }
5861 if (end <= cur->vme_end) {
5862 return TRUE;
5863 }
5864
5865 /*
5866 * If the region is not wholly contained within a
5867 * single entry, walk the entries looking for holes.
5868 */
5869 prev = cur->vme_end;
5870 cur = cur->vme_next;
5871 while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5872 if (end <= cur->vme_end) {
5873 return TRUE;
5874 }
5875 prev = cur->vme_end;
5876 cur = cur->vme_next;
5877 }
5878 return FALSE;
5879 }
5880
5881 /*
5882 * vm_map_protect:
5883 *
5884 * Sets the protection of the specified address
5885 * region in the target map. If "set_max" is
5886 * specified, the maximum protection is to be set;
5887 * otherwise, only the current protection is affected.
5888 */
kern_return_t
vm_map_protect(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_prot_t       new_prot,
	boolean_t       set_max)
{
	vm_map_entry_t  current;
	vm_map_offset_t prev;
	vm_map_entry_t  entry;
	vm_prot_t       new_max;
	int             pmap_options = 0;
	kern_return_t   kr;

	/* reject ranges that wrap around the address space */
	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ARGUMENT;
	}

	if (new_prot & VM_PROT_COPY) {
		/*
		 * VM_PROT_COPY: the caller wants a private, writable copy of
		 * the range.  This is done by remapping the range onto itself
		 * copy-on-write (vm_map_remap with vmf_overwrite); the
		 * remaining protection bits are then applied below as usual.
		 */
		vm_map_offset_t         new_start;
		vm_prot_t               cur_prot, max_prot;
		vm_map_kernel_flags_t   kflags;

		/* LP64todo - see below */
		if (start >= map->max_offset) {
			return KERN_INVALID_ADDRESS;
		}

		if ((new_prot & VM_PROT_ALLEXEC) &&
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if XNU_TARGET_OS_OSX && __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* XNU_TARGET_OS_OSX && __arm64__ */
		    ) &&
		    VM_MAP_POLICY_WX_FAIL(map)) {
			/* W^X violation under code-signing enforcement: refuse */
			DTRACE_VM3(cs_wx,
			    uint64_t, (uint64_t) start,
			    uint64_t, (uint64_t) end,
			    vm_prot_t, new_prot);
			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__, __LINE__,
#if DEVELOPMENT || DEBUG
			    (uint64_t)start,
			    (uint64_t)end,
#else /* DEVELOPMENT || DEBUG */
			    (uint64_t)0,
			    (uint64_t)0,
#endif /* DEVELOPMENT || DEBUG */
			    new_prot);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * Let vm_map_remap_extract() know that it will need to:
		 * + make a copy of the mapping
		 * + add VM_PROT_WRITE to the max protections
		 * + remove any protections that are no longer allowed from the
		 *   max protections (to avoid any WRITE/EXECUTE conflict, for
		 *   example).
		 * Note that "max_prot" is an IN/OUT parameter only for this
		 * specific (VM_PROT_COPY) case. It's usually an OUT parameter
		 * only.
		 */
		max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
		cur_prot = VM_PROT_NONE;
		kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
		kflags.vmkf_remap_prot_copy = true;
		kflags.vmkf_tpro_enforcement_override = !vm_map_tpro_enforcement(map);
		new_start = start;
		kr = vm_map_remap(map,
		    &new_start,
		    end - start,
		    0, /* mask */
		    kflags,
		    map,
		    start,
		    TRUE, /* copy-on-write remapping! */
		    &cur_prot, /* IN/OUT */
		    &max_prot, /* IN/OUT */
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		/* the copy is done; apply the remaining bits normally below */
		new_prot &= ~VM_PROT_COPY;
	}

	vm_map_lock(map);

	/* LP64todo - remove this check when vm_map_commpage64()
	 * no longer has to stuff in a map_entry for the commpage
	 * above the map's max_offset.
	 */
	if (start >= map->max_offset) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	while (1) {
		/*
		 * Lookup the entry.  If it doesn't start in a valid
		 * entry, return an error.
		 */
		if (!vm_map_lookup_entry(map, start, &entry)) {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
			start = SUPERPAGE_ROUND_DOWN(start);
			continue;
		}
		break;
	}
	/* superpages can only change protection as a whole */
	if (entry->superpage_size) {
		end = SUPERPAGE_ROUND_UP(end);
	}

	/*
	 * Make a first pass to check for protection and address
	 * violations.
	 */

	current = entry;
	prev = current->vme_start;
	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start < end)) {
		/*
		 * If there is a hole, return an error.
		 */
		if (current->vme_start != prev) {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		new_max = current->max_protection;

#if defined(__x86_64__)
		/* Allow max mask to include execute prot bits if this map doesn't enforce CS */
		if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
			new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
		}
#elif CODE_SIGNING_MONITOR
		if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
			new_max |= VM_PROT_EXECUTE;
		}
#endif
		/* requested protection must fit within the max protection */
		if ((new_prot & new_max) != new_prot) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}

		if (current->used_for_jit &&
		    pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}

#if __arm64e__
		/* Disallow remapping hw assisted TPRO mappings */
		if (current->used_for_tpro) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}
#endif /* __arm64e__ */


		if ((new_prot & VM_PROT_WRITE) &&
		    (new_prot & VM_PROT_ALLEXEC) &&
#if XNU_TARGET_OS_OSX
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
		    ) &&
#endif /* XNU_TARGET_OS_OSX */
#if CODE_SIGNING_MONITOR
		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
#endif
		    !(current->used_for_jit)) {
			/* W+X requested: log, then strip exec (or fail outright) */
			DTRACE_VM3(cs_wx,
			    uint64_t, (uint64_t) current->vme_start,
			    uint64_t, (uint64_t) current->vme_end,
			    vm_prot_t, new_prot);
			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__, __LINE__,
#if DEVELOPMENT || DEBUG
			    (uint64_t)current->vme_start,
			    (uint64_t)current->vme_end,
#else /* DEVELOPMENT || DEBUG */
			    (uint64_t)0,
			    (uint64_t)0,
#endif /* DEVELOPMENT || DEBUG */
			    new_prot);
			new_prot &= ~VM_PROT_ALLEXEC;
			if (VM_MAP_POLICY_WX_FAIL(map)) {
				vm_map_unlock(map);
				return KERN_PROTECTION_FAILURE;
			}
		}

		/*
		 * If the task has requested executable lockdown,
		 * deny both:
		 * - adding executable protections OR
		 * - adding write protections to an existing executable mapping.
		 */
		if (map->map_disallow_new_exec == TRUE) {
			if ((new_prot & VM_PROT_ALLEXEC) ||
			    ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
				vm_map_unlock(map);
				return KERN_PROTECTION_FAILURE;
			}
		}

		prev = current->vme_end;
		current = current->vme_next;
	}

#if __arm64__
	if (end > prev &&
	    end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
		vm_map_entry_t prev_entry;

		prev_entry = current->vme_prev;
		if (prev_entry != vm_map_to_entry(map) &&
		    !prev_entry->map_aligned &&
		    (vm_map_round_page(prev_entry->vme_end,
		    VM_MAP_PAGE_MASK(map))
		    == end)) {
			/*
			 * The last entry in our range is not "map-aligned"
			 * but it would have reached all the way to "end"
			 * if it had been map-aligned, so this is not really
			 * a hole in the range and we can proceed.
			 */
			prev = end;
		}
	}
#endif /* __arm64__ */

	/* the walked entries must cover the whole range, no trailing hole */
	if (end > prev) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Go back and fix up protections.
	 * Clip to start here if the range starts within
	 * the entry.
	 */

	current = entry;
	if (current != vm_map_to_entry(map)) {
		/* clip and unnest if necessary */
		vm_map_clip_start(map, current, start);
	}

	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start < end)) {
		vm_prot_t       old_prot;

		vm_map_clip_end(map, current, end);

#if DEVELOPMENT || DEBUG
		if (current->csm_associated && vm_log_xnu_user_debug) {
			printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__,
			    (uint64_t)start,
			    (uint64_t)end,
			    new_prot,
			    map, current,
			    current->vme_start,
			    current->vme_end,
			    current->protection,
			    current->max_protection);
		}
#endif /* DEVELOPMENT || DEBUG */

		if (current->is_sub_map) {
			/* clipping did unnest if needed */
			assert(!current->use_pmap);
		}

		old_prot = current->protection;

		if (set_max) {
			current->max_protection = new_prot;
			/* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
			current->protection = (new_prot & old_prot);
		} else {
			current->protection = new_prot;
		}

#if CODE_SIGNING_MONITOR
		if (!current->vme_xnu_user_debug &&
		    /* a !csm_associated mapping becoming executable */
		    ((!current->csm_associated &&
		    !(old_prot & VM_PROT_EXECUTE) &&
		    (current->protection & VM_PROT_EXECUTE))
		    ||
		    /* a csm_associated mapping becoming writable */
		    (current->csm_associated &&
		    !(old_prot & VM_PROT_WRITE) &&
		    (current->protection & VM_PROT_WRITE)))) {
			/*
			 * This mapping has not already been marked as
			 * "user_debug" and it is either:
			 * 1. not code-signing-monitored and becoming executable
			 * 2. code-signing-monitored and becoming writable,
			 * so inform the CodeSigningMonitor and mark the
			 * mapping as "user_debug" if appropriate.
			 */
			vm_map_kernel_flags_t vmk_flags;
			vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
			/* pretend it's a vm_protect(VM_PROT_COPY)... */
			vmk_flags.vmkf_remap_prot_copy = true;
			kr = vm_map_entry_cs_associate(map, current, vmk_flags);
#if DEVELOPMENT || DEBUG
			if (vm_log_xnu_user_debug) {
				printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
				    proc_selfpid(),
				    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
				    __FUNCTION__, __LINE__,
				    map, current,
				    current->vme_start, current->vme_end,
				    old_prot, current->protection,
				    kr, current->vme_xnu_user_debug);
			}
#endif /* DEVELOPMENT || DEBUG */
		}
#endif /* CODE_SIGNING_MONITOR */

		/*
		 * Update physical map if necessary.
		 * If the request is to turn off write protection,
		 * we won't do it for real (in pmap). This is because
		 * it would cause copy-on-write to fail. We've already
		 * set the new protection in the map, so if a
		 * write-protect fault occurred, it will be fixed up
		 * properly, COW or not.
		 */
		if (current->protection != old_prot) {
			/* Look one level in: we support nested pmaps */
			/* from mapped submaps which are direct entries */
			/* in our map */

			vm_prot_t prot;

			prot = current->protection;
			if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
				/* see the COW note above: don't really write-enable */
				prot &= ~VM_PROT_WRITE;
			} else {
				assert(!VME_OBJECT(current)->code_signed);
				assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
				if (prot & VM_PROT_WRITE) {
					/*
					 * For write requests on the
					 * compressor, we will ask the
					 * pmap layer to prevent us from
					 * taking a write fault when we
					 * attempt to access the mapping
					 * next.
					 */
					pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
				}
			}

			if (override_nx(map, VME_ALIAS(current)) && prot) {
				prot |= VM_PROT_EXECUTE;
			}

#if DEVELOPMENT || DEBUG
			if (!(old_prot & VM_PROT_EXECUTE) &&
			    (prot & VM_PROT_EXECUTE) &&
			    panic_on_unsigned_execute &&
			    (proc_selfcsflags() & CS_KILL)) {
				panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
			}
#endif /* DEVELOPMENT || DEBUG */

			if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
				if (current->wired_count) {
					panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
					    map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
				}

				/* If the pmap layer cares about this
				 * protection type, force a fault for
				 * each page so that vm_fault will
				 * repopulate the page with the full
				 * set of protections.
				 */
				/*
				 * TODO: We don't seem to need this,
				 * but this is due to an internal
				 * implementation detail of
				 * pmap_protect.  Do we want to rely
				 * on this?
				 */
				prot = VM_PROT_NONE;
			}

			if (current->is_sub_map && current->use_pmap) {
				/* nested submap: change protections in its own pmap */
				pmap_protect(VME_SUBMAP(current)->pmap,
				    current->vme_start,
				    current->vme_end,
				    prot);
			} else {
				pmap_protect_options(map->pmap,
				    current->vme_start,
				    current->vme_end,
				    prot,
				    pmap_options,
				    NULL);
			}
		}
		current = current->vme_next;
	}

	/* coalesce entries the clipping above may have fragmented */
	current = entry;
	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start <= end)) {
		vm_map_simplify_entry(map, current);
		current = current->vme_next;
	}

	vm_map_unlock(map);
	return KERN_SUCCESS;
}
6333
6334 /*
6335 * vm_map_inherit:
6336 *
6337 * Sets the inheritance of the specified address
6338 * range in the target map. Inheritance
6339 * affects how the map will be shared with
6340 * child maps at the time of vm_map_fork.
6341 */
6342 kern_return_t
vm_map_inherit(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_inherit_t new_inheritance)6343 vm_map_inherit(
6344 vm_map_t map,
6345 vm_map_offset_t start,
6346 vm_map_offset_t end,
6347 vm_inherit_t new_inheritance)
6348 {
6349 vm_map_entry_t entry;
6350 vm_map_entry_t temp_entry;
6351
6352 vm_map_lock(map);
6353
6354 VM_MAP_RANGE_CHECK(map, start, end);
6355
6356 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
6357 vm_map_unlock(map);
6358 return KERN_INVALID_ADDRESS;
6359 }
6360
6361 if (vm_map_lookup_entry(map, start, &temp_entry)) {
6362 entry = temp_entry;
6363 } else {
6364 temp_entry = temp_entry->vme_next;
6365 entry = temp_entry;
6366 }
6367
6368 /* first check entire range for submaps which can't support the */
6369 /* given inheritance. */
6370 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6371 if (entry->is_sub_map) {
6372 if (new_inheritance == VM_INHERIT_COPY) {
6373 vm_map_unlock(map);
6374 return KERN_INVALID_ARGUMENT;
6375 }
6376 }
6377
6378 entry = entry->vme_next;
6379 }
6380
6381 entry = temp_entry;
6382 if (entry != vm_map_to_entry(map)) {
6383 /* clip and unnest if necessary */
6384 vm_map_clip_start(map, entry, start);
6385 }
6386
6387 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6388 vm_map_clip_end(map, entry, end);
6389 if (entry->is_sub_map) {
6390 /* clip did unnest if needed */
6391 assert(!entry->use_pmap);
6392 }
6393
6394 entry->inheritance = new_inheritance;
6395
6396 entry = entry->vme_next;
6397 }
6398
6399 vm_map_unlock(map);
6400 return KERN_SUCCESS;
6401 }
6402
6403 /*
6404 * Update the accounting for the amount of wired memory in this map. If the user has
6405 * exceeded the defined limits, then we fail. Wiring on behalf of the kernel never fails.
6406 */
6407
/*
 * add_wire_counts:
 *
 * Update the wiring accounting for "entry" as it is being wired.
 * For user wiring, enforce both the per-map limit (the smaller of the
 * map's user_wire_limit and vm_per_task_user_wire_limit) and the
 * system-wide limit derived from vm_global_user_wire_limit.  Kernel
 * wiring never fails for accounting reasons; it panics if wired_count
 * would overflow.
 *
 * Returns KERN_RESOURCE_SHORTAGE when a user wire limit is exceeded,
 * KERN_FAILURE when a count would exceed MAX_WIRE_COUNT, and
 * KERN_SUCCESS otherwise.  The map must be locked by the caller.
 */
static kern_return_t
add_wire_counts(
	vm_map_t        map,
	vm_map_entry_t  entry,
	boolean_t       user_wire)
{
	vm_map_size_t   size;

	/*
	 * Snapshot whether the entry was completely unwired before we
	 * bump any count; used below to decide whether to record a
	 * wiring backtrace reference.
	 */
	bool first_wire = entry->wired_count == 0 && entry->user_wired_count == 0;

	if (user_wire) {
		unsigned int total_wire_count = vm_page_wire_count + vm_lopage_free_count;

		/*
		 * We're wiring memory at the request of the user. Check if this is the first time the user is wiring
		 * this map entry.
		 */

		if (entry->user_wired_count == 0) {
			size = entry->vme_end - entry->vme_start;

			/*
			 * Since this is the first time the user is wiring this map entry, check to see if we're
			 * exceeding the user wire limits. There is a per map limit which is the smaller of either
			 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value. There is also
			 * a system-wide limit on the amount of memory all users can wire. If the user is over either
			 * limit, then we fail.
			 */

			if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
			    size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
				/* distinguish which limit was hit, for the failure counters */
				if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
#if DEVELOPMENT || DEBUG
					if (panic_on_mlock_failure) {
						panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
					}
#endif /* DEVELOPMENT || DEBUG */
					os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
				} else {
					os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
#if DEVELOPMENT || DEBUG
					if (panic_on_mlock_failure) {
						panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
					}
#endif /* DEVELOPMENT || DEBUG */
				}
				return KERN_RESOURCE_SHORTAGE;
			}

			/*
			 * The first time the user wires an entry, we also increment the wired_count and add this to
			 * the total that has been wired in the map.
			 */

			if (entry->wired_count >= MAX_WIRE_COUNT) {
				return KERN_FAILURE;
			}

			entry->wired_count++;
			map->user_wire_size += size;
		}

		if (entry->user_wired_count >= MAX_WIRE_COUNT) {
			return KERN_FAILURE;
		}

		entry->user_wired_count++;
	} else {
		/*
		 * The kernel's wiring the memory. Just bump the count and continue.
		 */

		if (entry->wired_count >= MAX_WIRE_COUNT) {
			panic("vm_map_wire: too many wirings");
		}

		entry->wired_count++;
	}

	if (first_wire) {
		/* entry just transitioned to wired: consider recording a backtrace ref */
		vme_btref_consider_and_set(entry, __builtin_frame_address(0));
	}

	return KERN_SUCCESS;
}
6493
6494 /*
6495 * Update the memory wiring accounting now that the given map entry is being unwired.
6496 */
6497
6498 static void
subtract_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6499 subtract_wire_counts(
6500 vm_map_t map,
6501 vm_map_entry_t entry,
6502 boolean_t user_wire)
6503 {
6504 if (user_wire) {
6505 /*
6506 * We're unwiring memory at the request of the user. See if we're removing the last user wire reference.
6507 */
6508
6509 if (entry->user_wired_count == 1) {
6510 /*
6511 * We're removing the last user wire reference. Decrement the wired_count and the total
6512 * user wired memory for this map.
6513 */
6514
6515 assert(entry->wired_count >= 1);
6516 entry->wired_count--;
6517 map->user_wire_size -= entry->vme_end - entry->vme_start;
6518 }
6519
6520 assert(entry->user_wired_count >= 1);
6521 entry->user_wired_count--;
6522 } else {
6523 /*
6524 * The kernel is unwiring the memory. Just update the count.
6525 */
6526
6527 assert(entry->wired_count >= 1);
6528 entry->wired_count--;
6529 }
6530
6531 vme_btref_consider_and_put(entry);
6532 }
6533
/*
 * Count of wire requests rejected because they covered executable
 * mappings whose copy would break code signing (see the rejection
 * path in vm_map_wire_nested).
 */
int cs_executable_wire = 0;
6535
6536 /*
6537 * vm_map_wire:
6538 *
6539 * Sets the pageability of the specified address range in the
6540 * target map as wired. Regions specified as not pageable require
6541 * locked-down physical memory and physical page maps. The
6542 * access_type variable indicates types of accesses that must not
6543 * generate page faults. This is checked against protection of
6544 * memory being locked-down.
6545 *
6546 * The map must not be locked, but a reference must remain to the
6547 * map throughout the call.
6548 */
/*
 * vm_map_wire_nested:
 *
 * Wire the pageable range [start, end) of "map".
 *
 *	map          target map; caller holds a reference, map is unlocked
 *	start/end    range to wire, aligned to the map's page size
 *	caller_prot  access types that must not fault once wired
 *	tag          VM tag used for wiring accounting
 *	user_wire    TRUE for user-requested wiring: interruptible and
 *	             charged against the user wire limits
 *	map_pmap     non-NULL when recursing into a submap: the physical
 *	             map in which the pages must actually be wired
 *	pmap_addr    address in "map_pmap" corresponding to "start"
 *	physpage_p   if non-NULL, the range must be exactly one page and
 *	             the wired page's physical page number is returned here
 *
 * On failure, everything wired by this call is unwired before
 * returning.  The map lock is dropped around vm_fault_wire() and
 * around submap recursion; the in_transition/needs_wakeup protocol on
 * each entry keeps concurrent wire/unwire operations out of the range
 * while the lock is dropped.
 */
static kern_return_t
vm_map_wire_nested(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_prot_t       caller_prot,
	vm_tag_t        tag,
	boolean_t       user_wire,
	pmap_t          map_pmap,
	vm_map_offset_t pmap_addr,
	ppnum_t         *physpage_p)
{
	vm_map_entry_t          entry;
	vm_prot_t               access_type;
	struct vm_map_entry     *first_entry, tmp_entry;
	vm_map_t                real_map;
	vm_map_offset_t         s, e;
	kern_return_t           rc;
	boolean_t               need_wakeup;
	boolean_t               main_map = FALSE;
	wait_interrupt_t        interruptible_state;
	thread_t                cur_thread;
	unsigned int            last_timestamp;
	vm_map_size_t           size;
	boolean_t               wire_and_extract;
	vm_prot_t               extra_prots;

	/*
	 * Ask the lookup below for VM_PROT_COPY so copy-on-write is
	 * resolved up front; where code-signing is enforced, also refuse
	 * to copy executable mappings (VM_PROT_COPY_FAIL_IF_EXECUTABLE).
	 */
	extra_prots = VM_PROT_COPY;
	extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
#if XNU_TARGET_OS_OSX
	if (map->pmap == kernel_pmap ||
	    !vm_map_cs_enforcement(map)) {
		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
	}
#endif /* XNU_TARGET_OS_OSX */
#if CODE_SIGNING_MONITOR
	if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
	}
#endif /* CODE_SIGNING_MONITOR */

	access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));

	wire_and_extract = FALSE;
	if (physpage_p != NULL) {
		/*
		 * The caller wants the physical page number of the
		 * wired page.  We return only one physical page number
		 * so this works for only one page at a time.
		 */
		if ((end - start) != PAGE_SIZE) {
			return KERN_INVALID_ARGUMENT;
		}
		wire_and_extract = TRUE;
		*physpage_p = 0;
	}

	vm_map_lock(map);
	if (map_pmap == NULL) {
		main_map = TRUE;
	}
	last_timestamp = map->timestamp;

	VM_MAP_RANGE_CHECK(map, start, end);
	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));

	if (start == end) {
		/* We wired what the caller asked for, zero pages */
		vm_map_unlock(map);
		return KERN_SUCCESS;
	}

	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	need_wakeup = FALSE;
	cur_thread = current_thread();

	s = start;
	rc = KERN_SUCCESS;

	if (vm_map_lookup_entry(map, s, &first_entry)) {
		entry = first_entry;
		/*
		 * vm_map_clip_start will be done later.
		 * We don't want to unnest any nested submaps here !
		 */
	} else {
		/* Start address is not in map */
		rc = KERN_INVALID_ADDRESS;
		goto done;
	}

	while ((entry != vm_map_to_entry(map)) && (s < end)) {
		/*
		 * At this point, we have wired from "start" to "s".
		 * We still need to wire from "s" to "end".
		 *
		 * "entry" hasn't been clipped, so it could start before "s"
		 * and/or end after "end".
		 */

		/* "e" is how far we want to wire in this entry */
		e = entry->vme_end;
		if (e > end) {
			e = end;
		}

		/*
		 * If another thread is wiring/unwiring this entry then
		 * block after informing other thread to wake us up.
		 */
		if (entry->in_transition) {
			wait_result_t wait_result;

			/*
			 * We have not clipped the entry.  Make sure that
			 * the start address is in range so that the lookup
			 * below will succeed.
			 * "s" is the current starting point: we've already
			 * wired from "start" to "s" and we still have
			 * to wire from "s" to "end".
			 */

			entry->needs_wakeup = TRUE;

			/*
			 * wake up anybody waiting on entries that we have
			 * already wired.
			 */
			if (need_wakeup) {
				vm_map_entry_wakeup(map);
				need_wakeup = FALSE;
			}
			/*
			 * User wiring is interruptible
			 */
			wait_result = vm_map_entry_wait(map,
			    (user_wire) ? THREAD_ABORTSAFE :
			    THREAD_UNINT);
			if (user_wire && wait_result == THREAD_INTERRUPTED) {
				/*
				 * undo the wirings we have done so far
				 * We do not clear the needs_wakeup flag,
				 * because we cannot tell if we were the
				 * only one waiting.
				 */
				rc = KERN_FAILURE;
				goto done;
			}

			/*
			 * Cannot avoid a lookup here.  reset timestamp.
			 */
			last_timestamp = map->timestamp;

			/*
			 * The entry could have been clipped, look it up again.
			 * Worst that can happen is, it may not exist anymore.
			 */
			if (!vm_map_lookup_entry(map, s, &first_entry)) {
				/*
				 * User: undo everything up to the previous
				 * entry.  let vm_map_unwire worry about
				 * checking the validity of the range.
				 */
				rc = KERN_FAILURE;
				goto done;
			}
			entry = first_entry;
			continue;
		}

		if (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_start;
			vm_map_offset_t local_end;
			pmap_t          pmap;

			if (wire_and_extract) {
				/*
				 * Wiring would result in copy-on-write
				 * which would not be compatible with
				 * the sharing we have with the original
				 * provider of this memory.
				 */
				rc = KERN_INVALID_ARGUMENT;
				goto done;
			}

			vm_map_clip_start(map, entry, s);
			vm_map_clip_end(map, entry, end);

			/* translate the wired range into submap coordinates */
			sub_start = VME_OFFSET(entry);
			sub_end = entry->vme_end;
			sub_end += VME_OFFSET(entry) - entry->vme_start;

			local_end = entry->vme_end;
			if (map_pmap == NULL) {
				vm_object_t             object;
				vm_object_offset_t      offset;
				vm_prot_t               prot;
				boolean_t               wired;
				vm_map_entry_t          local_entry;
				vm_map_version_t        version;
				vm_map_t                lookup_map;

				if (entry->use_pmap) {
					pmap = VME_SUBMAP(entry)->pmap;
					/* ppc implementation requires that */
					/* submaps pmap address ranges line */
					/* up with parent map */
#ifdef notdef
					pmap_addr = sub_start;
#endif
					pmap_addr = s;
				} else {
					pmap = map->pmap;
					pmap_addr = s;
				}

				if (entry->wired_count) {
					/* already wired: just take another reference */
					if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
						goto done;
					}

					/*
					 * The map was not unlocked:
					 * no need to goto re-lookup.
					 * Just go directly to next entry.
					 */
					entry = entry->vme_next;
					s = entry->vme_start;
					continue;
				}

				/* call vm_map_lookup_and_lock_object to */
				/* cause any needs copy to be */
				/* evaluated */
				local_start = entry->vme_start;
				lookup_map = map;
				vm_map_lock_write_to_read(map);
				rc = vm_map_lookup_and_lock_object(
					&lookup_map, local_start,
					(access_type | extra_prots),
					OBJECT_LOCK_EXCLUSIVE,
					&version, &object,
					&offset, &prot, &wired,
					NULL,
					&real_map, NULL);
				if (rc != KERN_SUCCESS) {
					vm_map_unlock_read(lookup_map);
					assert(map_pmap == NULL);
					vm_map_unwire(map, start,
					    s, user_wire);
					return rc;
				}
				vm_object_unlock(object);
				if (real_map != lookup_map) {
					vm_map_unlock(real_map);
				}
				vm_map_unlock_read(lookup_map);
				vm_map_lock(map);

				/* we unlocked, so must re-lookup */
				if (!vm_map_lookup_entry(map,
				    local_start,
				    &local_entry)) {
					rc = KERN_FAILURE;
					goto done;
				}

				/*
				 * entry could have been "simplified",
				 * so re-clip
				 */
				entry = local_entry;
				assert(s == local_start);
				vm_map_clip_start(map, entry, s);
				vm_map_clip_end(map, entry, end);
				/* re-compute "e" */
				e = entry->vme_end;
				if (e > end) {
					e = end;
				}

				/* did we have a change of type? */
				if (!entry->is_sub_map) {
					last_timestamp = map->timestamp;
					continue;
				}
			} else {
				local_start = entry->vme_start;
				pmap = map_pmap;
			}

			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
				goto done;
			}

			/* keep concurrent wire/unwire out while we recurse unlocked */
			entry->in_transition = TRUE;

			vm_map_unlock(map);
			rc = vm_map_wire_nested(VME_SUBMAP(entry),
			    sub_start, sub_end,
			    caller_prot, tag,
			    user_wire, pmap, pmap_addr,
			    NULL);
			vm_map_lock(map);

			/*
			 * Find the entry again.  It could have been clipped
			 * after we unlocked the map.
			 */
			if (!vm_map_lookup_entry(map, local_start,
			    &first_entry)) {
				panic("vm_map_wire: re-lookup failed");
			}
			entry = first_entry;

			assert(local_start == s);
			/* re-compute "e" */
			e = entry->vme_end;
			if (e > end) {
				e = end;
			}

			last_timestamp = map->timestamp;
			while ((entry != vm_map_to_entry(map)) &&
			    (entry->vme_start < e)) {
				assert(entry->in_transition);
				entry->in_transition = FALSE;
				if (entry->needs_wakeup) {
					entry->needs_wakeup = FALSE;
					need_wakeup = TRUE;
				}
				if (rc != KERN_SUCCESS) {/* from vm_*_wire */
					subtract_wire_counts(map, entry, user_wire);
				}
				entry = entry->vme_next;
			}
			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
				goto done;
			}

			/* no need to relookup again */
			s = entry->vme_start;
			continue;
		}

		/*
		 * If this entry is already wired then increment
		 * the appropriate wire reference count.
		 */
		if (entry->wired_count) {
			if ((entry->protection & access_type) != access_type) {
				/* found a protection problem */

				/*
				 * XXX FBDP
				 * We should always return an error
				 * in this case but since we didn't
				 * enforce it before, let's do
				 * it only for the new "wire_and_extract"
				 * code path for now...
				 */
				if (wire_and_extract) {
					rc = KERN_PROTECTION_FAILURE;
					goto done;
				}
			}

			/*
			 * entry is already wired down, get our reference
			 * after clipping to our range.
			 */
			vm_map_clip_start(map, entry, s);
			vm_map_clip_end(map, entry, end);

			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
				goto done;
			}

			if (wire_and_extract) {
				vm_object_t             object;
				vm_object_offset_t      offset;
				vm_page_t               m;

				/*
				 * We don't have to "wire" the page again
				 * but we still have to "extract" its
				 * physical page number, after some sanity
				 * checks.
				 */
				assert((entry->vme_end - entry->vme_start)
				    == PAGE_SIZE);
				assert(!entry->needs_copy);
				assert(!entry->is_sub_map);
				assert(VME_OBJECT(entry));
				if (((entry->vme_end - entry->vme_start)
				    != PAGE_SIZE) ||
				    entry->needs_copy ||
				    entry->is_sub_map ||
				    VME_OBJECT(entry) == VM_OBJECT_NULL) {
					rc = KERN_INVALID_ARGUMENT;
					goto done;
				}

				object = VME_OBJECT(entry);
				offset = VME_OFFSET(entry);
				/* need exclusive lock to update m->dirty */
				if (entry->protection & VM_PROT_WRITE) {
					vm_object_lock(object);
				} else {
					vm_object_lock_shared(object);
				}
				m = vm_page_lookup(object, offset);
				assert(m != VM_PAGE_NULL);
				assert(VM_PAGE_WIRED(m));
				if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
					*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
					if (entry->protection & VM_PROT_WRITE) {
						vm_object_lock_assert_exclusive(
							object);
						m->vmp_dirty = TRUE;
					}
				} else {
					/* not already wired !? */
					*physpage_p = 0;
				}
				vm_object_unlock(object);
			}

			/* map was not unlocked: no need to relookup */
			entry = entry->vme_next;
			s = entry->vme_start;
			continue;
		}

		/*
		 * Unwired entry or wire request transmitted via submap
		 */

		/*
		 * Wiring would copy the pages to the shadow object.
		 * The shadow object would not be code-signed so
		 * attempting to execute code from these copied pages
		 * would trigger a code-signing violation.
		 */

		if ((entry->protection & VM_PROT_EXECUTE)
#if XNU_TARGET_OS_OSX
		    &&
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
		    )
#endif /* XNU_TARGET_OS_OSX */
#if CODE_SIGNING_MONITOR
		    &&
		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
#endif
		    ) {
#if MACH_ASSERT
			printf("pid %d[%s] wiring executable range from "
			    "0x%llx to 0x%llx: rejected to preserve "
			    "code-signing\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    (uint64_t) entry->vme_start,
			    (uint64_t) entry->vme_end);
#endif /* MACH_ASSERT */
			DTRACE_VM2(cs_executable_wire,
			    uint64_t, (uint64_t)entry->vme_start,
			    uint64_t, (uint64_t)entry->vme_end);
			cs_executable_wire++;
			rc = KERN_PROTECTION_FAILURE;
			goto done;
		}

		/*
		 * Perform actions of vm_map_lookup that need the write
		 * lock on the map: create a shadow object for a
		 * copy-on-write region, or an object for a zero-fill
		 * region.
		 */
		size = entry->vme_end - entry->vme_start;
		/*
		 * If wiring a copy-on-write page, we need to copy it now
		 * even if we're only (currently) requesting read access.
		 * This is aggressive, but once it's wired we can't move it.
		 */
		if (entry->needs_copy) {
			if (wire_and_extract) {
				/*
				 * We're supposed to share with the original
				 * provider so should not be "needs_copy"
				 */
				rc = KERN_INVALID_ARGUMENT;
				goto done;
			}

			VME_OBJECT_SHADOW(entry, size,
			    vm_map_always_shadow(map));
			entry->needs_copy = FALSE;
		} else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
			if (wire_and_extract) {
				/*
				 * We're supposed to share with the original
				 * provider so should already have an object.
				 */
				rc = KERN_INVALID_ARGUMENT;
				goto done;
			}
			/* zero-fill region: give it an object now */
			VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
			assert(entry->use_pmap);
		} else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			if (wire_and_extract) {
				/*
				 * We're supposed to share with the original
				 * provider so should not be COPY_SYMMETRIC.
				 */
				rc = KERN_INVALID_ARGUMENT;
				goto done;
			}
			/*
			 * Force an unrequested "copy-on-write" but only for
			 * the range we're wiring.
			 */
//			printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
			vm_map_clip_start(map, entry, s);
			vm_map_clip_end(map, entry, end);
			/* recompute "size" */
			size = entry->vme_end - entry->vme_start;
			/* make a shadow object */
			vm_object_t orig_object;
			vm_object_offset_t orig_offset;
			orig_object = VME_OBJECT(entry);
			orig_offset = VME_OFFSET(entry);
			VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
			if (VME_OBJECT(entry) != orig_object) {
				/*
				 * This mapping has not been shared (or it would be
				 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
				 * not been copied-on-write (or it would be marked
				 * as "needs_copy" and would have been handled above
				 * and also already write-protected).
				 * We still need to write-protect here to prevent
				 * other threads from modifying these pages while
				 * we're in the process of copying and wiring
				 * the copied pages.
				 * Since the mapping is neither shared nor COWed,
				 * we only need to write-protect the PTEs for this
				 * mapping.
				 */
				vm_object_pmap_protect(orig_object,
				    orig_offset,
				    size,
				    map->pmap,
				    VM_MAP_PAGE_SIZE(map),
				    entry->vme_start,
				    entry->protection & ~VM_PROT_WRITE);
			}
		}
		if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			/*
			 * Make the object COPY_DELAY to get a stable object
			 * to wire.
			 * That should avoid creating long shadow chains while
			 * wiring/unwiring the same range repeatedly.
			 * That also prevents part of the object from being
			 * wired while another part is "needs_copy", which
			 * could result in conflicting rules wrt copy-on-write.
			 */
			vm_object_t object;

			object = VME_OBJECT(entry);
			vm_object_lock(object);
			if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
				assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
				    "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
				    object, (uint64_t)object->vo_size,
				    entry,
				    (uint64_t)entry->vme_start,
				    (uint64_t)entry->vme_end,
				    (uint64_t)VME_OFFSET(entry),
				    (uint64_t)size);
				assertf(object->ref_count == 1,
				    "object %p ref_count %d\n",
				    object, object->ref_count);
				assertf(!entry->needs_copy,
				    "entry %p\n", entry);
				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
				object->true_share = TRUE;
			}
			vm_object_unlock(object);
		}

		vm_map_clip_start(map, entry, s);
		vm_map_clip_end(map, entry, end);

		/* re-compute "e" */
		e = entry->vme_end;
		if (e > end) {
			e = end;
		}

		/*
		 * Check for holes and protection mismatch.
		 * Holes: Next entry should be contiguous unless this
		 *	is the end of the region.
		 * Protection: Access requested must be allowed, unless
		 *	wiring is by protection class
		 */
		if ((entry->vme_end < end) &&
		    ((entry->vme_next == vm_map_to_entry(map)) ||
		    (entry->vme_next->vme_start > entry->vme_end))) {
			/* found a hole */
			rc = KERN_INVALID_ADDRESS;
			goto done;
		}
		if ((entry->protection & access_type) != access_type) {
			/* found a protection problem */
			rc = KERN_PROTECTION_FAILURE;
			goto done;
		}

		assert(entry->wired_count == 0 && entry->user_wired_count == 0);

		if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
			goto done;
		}

		entry->in_transition = TRUE;

		/*
		 * This entry might get split once we unlock the map.
		 * In vm_fault_wire(), we need the current range as
		 * defined by this entry.  In order for this to work
		 * along with a simultaneous clip operation, we make a
		 * temporary copy of this entry and use that for the
		 * wiring.  Note that the underlying objects do not
		 * change during a clip.
		 */
		tmp_entry = *entry;

		/*
		 * The in_transition state guarantees that the entry
		 * (or entries for this range, if split occurred) will be
		 * there when the map lock is acquired for the second time.
		 */
		vm_map_unlock(map);

		if (!user_wire && cur_thread != THREAD_NULL) {
			/* kernel wiring must not be interrupted mid-fault */
			interruptible_state = thread_interrupt_level(THREAD_UNINT);
		} else {
			interruptible_state = THREAD_UNINT;
		}

		if (map_pmap) {
			rc = vm_fault_wire(map,
			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
			    physpage_p);
		} else {
			rc = vm_fault_wire(map,
			    &tmp_entry, caller_prot, tag, map->pmap,
			    tmp_entry.vme_start,
			    physpage_p);
		}

		if (!user_wire && cur_thread != THREAD_NULL) {
			thread_interrupt_level(interruptible_state);
		}

		vm_map_lock(map);

		if (last_timestamp + 1 != map->timestamp) {
			/*
			 * Find the entry again.  It could have been clipped
			 * after we unlocked the map.
			 */
			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
			    &first_entry)) {
				panic("vm_map_wire: re-lookup failed");
			}

			entry = first_entry;
		}

		last_timestamp = map->timestamp;

		while ((entry != vm_map_to_entry(map)) &&
		    (entry->vme_start < tmp_entry.vme_end)) {
			assert(entry->in_transition);
			entry->in_transition = FALSE;
			if (entry->needs_wakeup) {
				entry->needs_wakeup = FALSE;
				need_wakeup = TRUE;
			}
			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
				subtract_wire_counts(map, entry, user_wire);
			}
			entry = entry->vme_next;
		}

		if (rc != KERN_SUCCESS) {               /* from vm_*_wire */
			goto done;
		}

		if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
		    (tmp_entry.vme_end != end) &&       /* AND, we are not at the end of the requested range */
		    (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
			/* found a "new" hole */
			s = tmp_entry.vme_end;
			rc = KERN_INVALID_ADDRESS;
			goto done;
		}

		s = entry->vme_start;
	} /* end while loop through map entries */

done:
	if (rc == KERN_SUCCESS) {
		/* repair any damage we may have made to the VM map */
		vm_map_simplify_range(map, start, end);
	}

	vm_map_unlock(map);

	/*
	 * wake up anybody waiting on entries we wired.
	 */
	if (need_wakeup) {
		vm_map_entry_wakeup(map);
	}

	if (rc != KERN_SUCCESS) {
		/* undo what has been wired so far */
		vm_map_unwire_nested(map, start, s, user_wire,
		    map_pmap, pmap_addr);
		if (physpage_p) {
			*physpage_p = 0;
		}
	}

	return rc;
}
7305
7306 kern_return_t
vm_map_wire_external(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,boolean_t user_wire)7307 vm_map_wire_external(
7308 vm_map_t map,
7309 vm_map_offset_t start,
7310 vm_map_offset_t end,
7311 vm_prot_t caller_prot,
7312 boolean_t user_wire)
7313 {
7314 kern_return_t kret;
7315
7316 kret = vm_map_wire_nested(map, start, end, caller_prot, vm_tag_bt(),
7317 user_wire, (pmap_t)NULL, 0, NULL);
7318 return kret;
7319 }
7320
7321 kern_return_t
vm_map_wire_kernel(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire)7322 vm_map_wire_kernel(
7323 vm_map_t map,
7324 vm_map_offset_t start,
7325 vm_map_offset_t end,
7326 vm_prot_t caller_prot,
7327 vm_tag_t tag,
7328 boolean_t user_wire)
7329 {
7330 kern_return_t kret;
7331
7332 kret = vm_map_wire_nested(map, start, end, caller_prot, tag,
7333 user_wire, (pmap_t)NULL, 0, NULL);
7334 return kret;
7335 }
7336
7337 kern_return_t
vm_map_wire_and_extract_external(vm_map_t map,vm_map_offset_t start,vm_prot_t caller_prot,boolean_t user_wire,ppnum_t * physpage_p)7338 vm_map_wire_and_extract_external(
7339 vm_map_t map,
7340 vm_map_offset_t start,
7341 vm_prot_t caller_prot,
7342 boolean_t user_wire,
7343 ppnum_t *physpage_p)
7344 {
7345 kern_return_t kret;
7346
7347 kret = vm_map_wire_nested(map,
7348 start,
7349 start + VM_MAP_PAGE_SIZE(map),
7350 caller_prot,
7351 vm_tag_bt(),
7352 user_wire,
7353 (pmap_t)NULL,
7354 0,
7355 physpage_p);
7356 if (kret != KERN_SUCCESS &&
7357 physpage_p != NULL) {
7358 *physpage_p = 0;
7359 }
7360 return kret;
7361 }
7362
7363 /*
7364 * vm_map_unwire:
7365 *
7366 * Sets the pageability of the specified address range in the target
7367 * as pageable. Regions specified must have been wired previously.
7368 *
7369 * The map must not be locked, but a reference must remain to the map
7370 * throughout the call.
7371 *
7372 * Kernel will panic on failures. User unwire ignores holes and
7373 * unwired and intransition entries to avoid losing memory by leaving
7374 * it unwired.
7375 */
static kern_return_t
vm_map_unwire_nested(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	boolean_t       user_wire,   /* TRUE: user wiring; errors tolerated, no panics */
	pmap_t          map_pmap,    /* non-NULL when recursing into a submap */
	vm_map_offset_t pmap_addr)   /* address in map_pmap corresponding to "start" */
{
	vm_map_entry_t entry;
	struct vm_map_entry *first_entry, tmp_entry;
	boolean_t need_wakeup;
	boolean_t main_map = FALSE;
	unsigned int last_timestamp;

	vm_map_lock(map);
	/* map_pmap == NULL means we're acting on the top-level map itself */
	/* NOTE(review): main_map is set but not consulted in this function body */
	if (map_pmap == NULL) {
		main_map = TRUE;
	}
	/*
	 * Snapshot the map timestamp so that, after any point where the map
	 * lock is dropped, we can detect whether other threads mutated the
	 * map (and re-lookup entries if so).
	 */
	last_timestamp = map->timestamp;

	VM_MAP_RANGE_CHECK(map, start, end);
	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));

	if (start == end) {
		/* We unwired what the caller asked for: zero pages */
		vm_map_unlock(map);
		return KERN_SUCCESS;
	}

	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	if (vm_map_lookup_entry(map, start, &first_entry)) {
		entry = first_entry;
		/*
		 * vm_map_clip_start will be done later.
		 * We don't want to unnest any nested sub maps here !
		 */
	} else {
		if (!user_wire) {
			/* kernel unwire of an unmapped range is a hard bug */
			panic("vm_map_unwire: start not found");
		}
		/* Start address is not in map. */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	if (entry->superpage_size) {
		/* superpages are always wired */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	need_wakeup = FALSE;
	/* Walk every entry overlapping [start, end) */
	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
		if (entry->in_transition) {
			/*
			 * 1)
			 * Another thread is wiring down this entry. Note
			 * that if it is not for the other thread we would
			 * be unwiring an unwired entry.  This is not
			 * permitted.  If we wait, we will be unwiring memory
			 * we did not wire.
			 *
			 * 2)
			 * Another thread is unwiring this entry.  We did not
			 * have a reference to it, because if we did, this
			 * entry will not be getting unwired now.
			 */
			if (!user_wire) {
				/*
				 * XXX FBDP
				 * This could happen:  there could be some
				 * overlapping vslock/vsunlock operations
				 * going on.
				 * We should probably just wait and retry,
				 * but then we have to be careful that this
				 * entry could get "simplified" after
				 * "in_transition" gets unset and before
				 * we re-lookup the entry, so we would
				 * have to re-clip the entry to avoid
				 * re-unwiring what we have already unwired...
				 * See vm_map_wire_nested().
				 *
				 * Or we could just ignore "in_transition"
				 * here and proceed to decrement the wired
				 * count(s) on this entry.  That should be fine
				 * as long as "wired_count" doesn't drop all
				 * the way to 0 (and we should panic if THAT
				 * happens).
				 */
				panic("vm_map_unwire: in_transition entry");
			}

			/* user unwire: skip in-transition entries, don't block */
			entry = entry->vme_next;
			continue;
		}

		if (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;
			pmap_t          pmap;

			vm_map_clip_start(map, entry, start);
			vm_map_clip_end(map, entry, end);

			/* translate the clipped range into submap coordinates */
			sub_start = VME_OFFSET(entry);
			sub_end = entry->vme_end - entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			if (map_pmap == NULL) {
				/* top-level call: pick the pmap to unwire in */
				if (entry->use_pmap) {
					pmap = VME_SUBMAP(entry)->pmap;
					pmap_addr = sub_start;
				} else {
					pmap = map->pmap;
					pmap_addr = start;
				}
				if (entry->wired_count == 0 ||
				    (user_wire && entry->user_wired_count == 0)) {
					if (!user_wire) {
						panic("vm_map_unwire: entry is unwired");
					}
					entry = entry->vme_next;
					continue;
				}

				/*
				 * Check for holes
				 * Holes: Next entry should be contiguous unless
				 * this is the end of the region.
				 */
				if (((entry->vme_end < end) &&
				    ((entry->vme_next == vm_map_to_entry(map)) ||
				    (entry->vme_next->vme_start
				    > entry->vme_end)))) {
					if (!user_wire) {
						panic("vm_map_unwire: non-contiguous region");
					}
					/*
					 * entry = entry->vme_next;
					 * continue;
					 */
				}

				subtract_wire_counts(map, entry, user_wire);

				if (entry->wired_count != 0) {
					/* still wired by other callers: don't touch the pmap */
					entry = entry->vme_next;
					continue;
				}

				entry->in_transition = TRUE;
				tmp_entry = *entry;/* see comment in vm_map_wire() */

				/*
				 * We can unlock the map now. The in_transition state
				 * guarantees existence of the entry.
				 */
				vm_map_unlock(map);
				vm_map_unwire_nested(VME_SUBMAP(entry),
				    sub_start, sub_end, user_wire, pmap, pmap_addr);
				vm_map_lock(map);

				if (last_timestamp + 1 != map->timestamp) {
					/*
					 * Find the entry again.  It could have been
					 * clipped or deleted after we unlocked the map.
					 */
					if (!vm_map_lookup_entry(map,
					    tmp_entry.vme_start,
					    &first_entry)) {
						if (!user_wire) {
							panic("vm_map_unwire: re-lookup failed");
						}
						entry = first_entry->vme_next;
					} else {
						entry = first_entry;
					}
				}
				last_timestamp = map->timestamp;

				/*
				 * clear transition bit for all constituent entries
				 * that were in the original entry (saved in
				 * tmp_entry).  Also check for waiters.
				 */
				while ((entry != vm_map_to_entry(map)) &&
				    (entry->vme_start < tmp_entry.vme_end)) {
					assert(entry->in_transition);
					entry->in_transition = FALSE;
					if (entry->needs_wakeup) {
						entry->needs_wakeup = FALSE;
						need_wakeup = TRUE;
					}
					entry = entry->vme_next;
				}
				continue;
			} else {
				/* recursive call: keep unwiring in the caller's pmap */
				tmp_entry = *entry;
				vm_map_unlock(map);
				vm_map_unwire_nested(VME_SUBMAP(entry),
				    sub_start, sub_end, user_wire, map_pmap,
				    pmap_addr);
				vm_map_lock(map);

				if (last_timestamp + 1 != map->timestamp) {
					/*
					 * Find the entry again.  It could have been
					 * clipped or deleted after we unlocked the map.
					 */
					if (!vm_map_lookup_entry(map,
					    tmp_entry.vme_start,
					    &first_entry)) {
						if (!user_wire) {
							panic("vm_map_unwire: re-lookup failed");
						}
						entry = first_entry->vme_next;
					} else {
						entry = first_entry;
					}
				}
				last_timestamp = map->timestamp;
			}
		}


		if ((entry->wired_count == 0) ||
		    (user_wire && entry->user_wired_count == 0)) {
			if (!user_wire) {
				panic("vm_map_unwire: entry is unwired");
			}

			/* user unwire tolerates already-unwired entries */
			entry = entry->vme_next;
			continue;
		}

		assert(entry->wired_count > 0 &&
		    (!user_wire || entry->user_wired_count > 0));

		vm_map_clip_start(map, entry, start);
		vm_map_clip_end(map, entry, end);

		/*
		 * Check for holes
		 * Holes: Next entry should be contiguous unless
		 * this is the end of the region.
		 */
		if (((entry->vme_end < end) &&
		    ((entry->vme_next == vm_map_to_entry(map)) ||
		    (entry->vme_next->vme_start > entry->vme_end)))) {
			if (!user_wire) {
				panic("vm_map_unwire: non-contiguous region");
			}
			entry = entry->vme_next;
			continue;
		}

		subtract_wire_counts(map, entry, user_wire);

		if (entry->wired_count != 0) {
			/* still wired by another caller: leave the pages alone */
			entry = entry->vme_next;
			continue;
		}

		if (entry->zero_wired_pages) {
			entry->zero_wired_pages = FALSE;
		}

		entry->in_transition = TRUE;
		tmp_entry = *entry;     /* see comment in vm_map_wire() */

		/*
		 * We can unlock the map now. The in_transition state
		 * guarantees existence of the entry.
		 */
		vm_map_unlock(map);
		if (map_pmap) {
			vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
			    pmap_addr, tmp_entry.vme_end);
		} else {
			vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
			    tmp_entry.vme_start, tmp_entry.vme_end);
		}
		vm_map_lock(map);

		if (last_timestamp + 1 != map->timestamp) {
			/*
			 * Find the entry again.  It could have been clipped
			 * or deleted after we unlocked the map.
			 */
			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
			    &first_entry)) {
				if (!user_wire) {
					panic("vm_map_unwire: re-lookup failed");
				}
				entry = first_entry->vme_next;
			} else {
				entry = first_entry;
			}
		}
		last_timestamp = map->timestamp;

		/*
		 * clear transition bit for all constituent entries that
		 * were in the original entry (saved in tmp_entry).  Also
		 * check for waiters.
		 */
		while ((entry != vm_map_to_entry(map)) &&
		    (entry->vme_start < tmp_entry.vme_end)) {
			assert(entry->in_transition);
			entry->in_transition = FALSE;
			if (entry->needs_wakeup) {
				entry->needs_wakeup = FALSE;
				need_wakeup = TRUE;
			}
			entry = entry->vme_next;
		}
	}

	/*
	 * We might have fragmented the address space when we wired this
	 * range of addresses.  Attempt to re-coalesce these VM map entries
	 * with their neighbors now that they're no longer wired.
	 * Under some circumstances, address space fragmentation can
	 * prevent VM object shadow chain collapsing, which can cause
	 * swap space leaks.
	 */
	vm_map_simplify_range(map, start, end);

	vm_map_unlock(map);
	/*
	 * wake up anybody waiting on entries that we have unwired.
	 */
	if (need_wakeup) {
		vm_map_entry_wakeup(map);
	}
	return KERN_SUCCESS;
}
7720
7721 kern_return_t
vm_map_unwire(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,boolean_t user_wire)7722 vm_map_unwire(
7723 vm_map_t map,
7724 vm_map_offset_t start,
7725 vm_map_offset_t end,
7726 boolean_t user_wire)
7727 {
7728 return vm_map_unwire_nested(map, start, end,
7729 user_wire, (pmap_t)NULL, 0);
7730 }
7731
7732
7733 /*
7734 * vm_map_entry_zap: [ internal use only ]
7735 *
7736 * Remove the entry from the target map
7737 * and put it on a zap list.
7738 */
static void
vm_map_entry_zap(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_zap_t    zap)
{
	vm_map_offset_t s, e;

	s = entry->vme_start;
	e = entry->vme_end;
	/* entry bounds must be at least 4K aligned in any map */
	assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
	assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
		assert(page_aligned(s));
		assert(page_aligned(e));
	}
	if (entry->map_aligned == TRUE) {
		/* map-aligned entries must honor the map's own page size */
		assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
		assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
	}
	/* wired or "permanent" entries may never be zapped */
	assert(entry->wired_count == 0);
	assert(entry->user_wired_count == 0);
	assert(!entry->vme_permanent);

	/* unlink from the map's store and adjust the map's virtual size */
	vm_map_store_entry_unlink(map, entry, false);
	map->size -= e - s;

	/* defer actual teardown: park the entry on the caller's zap list */
	vm_map_zap_append(zap, entry);
}
7768
/*
 * vm_map_submap_pmap_clean:
 *
 * Remove physical mappings for the portion of "sub_map" that backs
 * [start, end) in "map", where "offset" is the submap offset that
 * corresponds to "start".  Walks the submap entries overlapping the
 * range, recursing through nested submaps, and either flushes the
 * backing VM object's mappings (when the map is shared into other
 * pmaps) or removes the range from "map"'s own pmap.
 */
static void
vm_map_submap_pmap_clean(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_t        sub_map,
	vm_map_offset_t offset)
{
	vm_map_offset_t submap_start;
	vm_map_offset_t submap_end;
	vm_map_size_t   remove_size;
	vm_map_entry_t  entry;

	/* translate [start, end) into submap coordinates */
	submap_end = offset + (end - start);
	submap_start = offset;

	vm_map_lock_read(sub_map);
	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
		/* first entry may only partially overlap the range: trim both ends */
		remove_size = (entry->vme_end - entry->vme_start);
		if (offset > entry->vme_start) {
			remove_size -= offset - entry->vme_start;
		}


		if (submap_end < entry->vme_end) {
			remove_size -=
			    entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			/* nested submap: recurse with the translated range */
			vm_map_submap_pmap_clean(
				sub_map,
				start,
				start + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				/*
				 * The map is shared into other pmaps: flush
				 * via the object so every pmap is covered.
				 * NOTE(review): this first-entry path computes the
				 * object offset as VME_OFFSET + offset - vme_start,
				 * unlike the loop below — verify intent.
				 */
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					(VME_OFFSET(entry) +
					offset -
					entry->vme_start),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				/* private map: remove directly from its pmap */
				pmap_remove(map->pmap,
				    (addr64_t)start,
				    (addr64_t)(start + remove_size));
			}
		}

		entry = entry->vme_next;

		/* remaining entries start inside the range; only trim the tail */
		while ((entry != vm_map_to_entry(sub_map))
		    && (entry->vme_start < submap_end)) {
			remove_size = (entry->vme_end - entry->vme_start);
			if (submap_end < entry->vme_end) {
				remove_size -= entry->vme_end - submap_end;
			}
			if (entry->is_sub_map) {
				vm_map_submap_pmap_clean(
					sub_map,
					(start + entry->vme_start) - offset,
					((start + entry->vme_start) - offset) + remove_size,
					VME_SUBMAP(entry),
					VME_OFFSET(entry));
			} else {
				if (map->mapped_in_other_pmaps &&
				    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
				    VME_OBJECT(entry) != NULL) {
					vm_object_pmap_protect_options(
						VME_OBJECT(entry),
						VME_OFFSET(entry),
						remove_size,
						PMAP_NULL,
						PAGE_SIZE,
						entry->vme_start,
						VM_PROT_NONE,
						PMAP_OPTIONS_REMOVE);
				} else {
					pmap_remove(map->pmap,
					    (addr64_t)((start + entry->vme_start)
					    - offset),
					    (addr64_t)(((start + entry->vme_start)
					    - offset) + remove_size));
				}
			}
			entry = entry->vme_next;
		}
		vm_map_unlock_read(sub_map);
		return;
	}
	/* NOTE(review): if the initial lookup fails, nothing is cleaned —
	 * the lock is released via the path above only; confirm the brace
	 * placement against the full file (this chunk shows the unlock and
	 * return inside the successful-lookup branch). */
}
7868
7869 /*
7870 * virt_memory_guard_ast:
7871 *
7872 * Handle the AST callout for a virtual memory guard.
7873 * raise an EXC_GUARD exception and terminate the task
7874 * if configured to do so.
7875 */
void
virt_memory_guard_ast(
	thread_t thread,
	mach_exception_data_type_t code,
	mach_exception_data_type_t subcode)
{
	task_t task = get_threadtask(thread);
	assert(task != kernel_task);
	assert(task == current_task());
	kern_return_t sync_exception_result;
	uint32_t behavior;

	behavior = task->task_exc_guard;

	/* Is delivery enabled */
	if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
		return;
	}

	/* If only once, make sure we're that once */
	while (behavior & TASK_EXC_GUARD_VM_ONCE) {
		uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;

		/* CAS race: the thread that clears DELIVER gets to deliver */
		if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
			break;
		}
		behavior = task->task_exc_guard;
		if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
			/* another thread won the race and delivered already */
			return;
		}
	}

	const bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
	/* Raise exception synchronously and see if handler claimed it */
	sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode, fatal);

	if (fatal) {
		/*
		 * If Synchronous EXC_GUARD delivery was successful then
		 * kill the process and return, else kill the process
		 * and deliver the exception via EXC_CORPSE_NOTIFY.
		 */
		if (sync_exception_result == KERN_SUCCESS) {
			task_bsdtask_kill(current_task());
		} else {
			exit_with_guard_exception(current_proc(), code, subcode);
		}
	} else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
		/*
		 * If the synchronous EXC_GUARD delivery was not successful,
		 * raise a simulated crash.
		 */
		if (sync_exception_result != KERN_SUCCESS) {
			task_violated_guard(code, subcode, NULL, FALSE);
		}
	}
}
7933
7934 /*
7935 * vm_map_guard_exception:
7936 *
7937 * Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
7938 *
7939 * Right now, we do this when we find nothing mapped, or a
7940 * gap in the mapping when a user address space deallocate
7941 * was requested. We report the address of the first gap found.
7942 */
7943 static void
vm_map_guard_exception(vm_map_offset_t gap_start,unsigned reason)7944 vm_map_guard_exception(
7945 vm_map_offset_t gap_start,
7946 unsigned reason)
7947 {
7948 mach_exception_code_t code = 0;
7949 unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7950 unsigned int target = 0; /* should we pass in pid associated with map? */
7951 mach_exception_data_type_t subcode = (uint64_t)gap_start;
7952 boolean_t fatal = FALSE;
7953
7954 task_t task = current_task_early();
7955
7956 /* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7957 if (task == NULL || task == kernel_task) {
7958 return;
7959 }
7960
7961 EXC_GUARD_ENCODE_TYPE(code, guard_type);
7962 EXC_GUARD_ENCODE_FLAVOR(code, reason);
7963 EXC_GUARD_ENCODE_TARGET(code, target);
7964
7965 if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7966 fatal = TRUE;
7967 }
7968 thread_guard_violation(current_thread(), code, subcode, fatal);
7969 }
7970
7971 static kern_return_t
vm_map_delete_submap_recurse(vm_map_t submap,vm_map_offset_t submap_start,vm_map_offset_t submap_end)7972 vm_map_delete_submap_recurse(
7973 vm_map_t submap,
7974 vm_map_offset_t submap_start,
7975 vm_map_offset_t submap_end)
7976 {
7977 vm_map_entry_t submap_entry;
7978
7979 /*
7980 * Verify that the submap does not contain any "permanent" entries
7981 * within the specified range.
7982 * We do not care about gaps.
7983 */
7984
7985 vm_map_lock(submap);
7986
7987 if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
7988 submap_entry = submap_entry->vme_next;
7989 }
7990
7991 for (;
7992 submap_entry != vm_map_to_entry(submap) &&
7993 submap_entry->vme_start < submap_end;
7994 submap_entry = submap_entry->vme_next) {
7995 if (submap_entry->vme_permanent) {
7996 /* "permanent" entry -> fail */
7997 vm_map_unlock(submap);
7998 return KERN_PROTECTION_FAILURE;
7999 }
8000 }
8001 /* no "permanent" entries in the range -> success */
8002 vm_map_unlock(submap);
8003 return KERN_SUCCESS;
8004 }
8005
/* Abort: vm_map_delete() was given a start address that is not aligned
 * to the map's page size (and VM_MAP_REMOVE_NO_MAP_ALIGN was not set). */
__abortlike
static void
__vm_map_delete_misaligned_panic(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
	    map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
}
8016
/* Abort: a vm_map_delete() that was required to succeed (kernel map)
 * failed with the given kern_return_t. */
__abortlike
static void
__vm_map_delete_failed_panic(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	kern_return_t   kr)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
	    map, (uint64_t)start, (uint64_t)end, kr);
}
8028
/* Abort: vm_map_delete() on the kernel pmap found a gap (no map entry)
 * at "where" inside the requested [start, end) range. */
__abortlike
static void
__vm_map_delete_gap_panic(
	vm_map_t        map,
	vm_map_offset_t where,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
}
8040
/* Abort: vm_map_delete() on the kernel map attempted to remove an entry
 * marked "permanent" — such entries never die in kernel maps. */
__abortlike
static void
__vm_map_delete_permanent_panic(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_entry_t  entry)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): "
	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
	    map, (uint64_t)start, (uint64_t)end, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end);
}
8055
/* Internal state flags for vm_map_delete()'s main loop. */
__options_decl(vm_map_delete_state_t, uint32_t, {
	VMDS_NONE           = 0x0000,

	VMDS_FOUND_GAP      = 0x0001, /* a hole was found in the requested range */
	VMDS_GAPS_OK        = 0x0002, /* map is terminated/unreferenced: gaps tolerated */

	VMDS_KERNEL_PMAP    = 0x0004, /* map uses the kernel pmap: errors panic */
	VMDS_NEEDS_LOOKUP   = 0x0008, /* lock was dropped: re-lookup entry at "s" */
	VMDS_NEEDS_WAKEUP   = 0x0010, /* waiters observed: wake them before sleeping/returning */
	VMDS_KERNEL_KMEMPTR = 0x0020  /* range belongs to a kmem pointer range (slot validation) */
});
8067
8068 /*
8069 * vm_map_delete: [ internal use only ]
8070 *
8071 * Deallocates the given address range from the target map.
8072 * Removes all user wirings. Unwires one kernel wiring if
8073 * VM_MAP_REMOVE_KUNWIRE is set. Waits for kernel wirings to go
8074 * away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set. Sleeps
8075 * interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
8076 *
8077 *
8078 * When the map is a kernel map, then any error in removing mappings
8079 * will lead to a panic so that clients do not have to repeat the panic
8080 * code at each call site. If VM_MAP_REMOVE_INTERRUPTIBLE
8081 * is also passed, then KERN_ABORTED will not lead to a panic.
8082 *
8083 * This routine is called with map locked and leaves map locked.
8084 */
8085 static kmem_return_t
vm_map_delete(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard,vm_map_zap_t zap_list)8086 vm_map_delete(
8087 vm_map_t map,
8088 vm_map_offset_t start,
8089 vm_map_offset_t end,
8090 vmr_flags_t flags,
8091 kmem_guard_t guard,
8092 vm_map_zap_t zap_list)
8093 {
8094 vm_map_entry_t entry, next;
8095 int interruptible;
8096 vm_map_offset_t gap_start = 0;
8097 vm_map_offset_t clear_in_transition_end = 0;
8098 __unused vm_map_offset_t save_start = start;
8099 __unused vm_map_offset_t save_end = end;
8100 vm_map_delete_state_t state = VMDS_NONE;
8101 kmem_return_t ret = { };
8102 vm_map_range_id_t range_id = 0;
8103 struct kmem_page_meta *meta = NULL;
8104 uint32_t size_idx, slot_idx;
8105 struct mach_vm_range slot;
8106
8107 if (vm_map_pmap(map) == kernel_pmap) {
8108 state |= VMDS_KERNEL_PMAP;
8109 range_id = kmem_addr_get_range(start, end - start);
8110 if (kmem_is_ptr_range(range_id)) {
8111 state |= VMDS_KERNEL_KMEMPTR;
8112 slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
8113 &size_idx, &slot);
8114 }
8115 }
8116
8117 if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
8118 state |= VMDS_GAPS_OK;
8119 }
8120
8121 if (map->corpse_source &&
8122 !(flags & VM_MAP_REMOVE_TO_OVERWRITE) &&
8123 !map->terminated) {
8124 /*
8125 * The map is being used for corpses related diagnostics.
8126 * So skip any entry removal to avoid perturbing the map state.
8127 * The cleanup will happen in task_terminate_internal after the
8128 * call to task_port_no_senders.
8129 */
8130 goto out;
8131 }
8132
8133 interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
8134 THREAD_ABORTSAFE : THREAD_UNINT;
8135
8136 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
8137 (start & VM_MAP_PAGE_MASK(map))) {
8138 __vm_map_delete_misaligned_panic(map, start, end);
8139 }
8140
8141 if ((state & VMDS_GAPS_OK) == 0) {
8142 /*
8143 * If the map isn't terminated then all deletions must have
8144 * no gaps, and be within the [min, max) of the map.
8145 *
8146 * We got here without VM_MAP_RANGE_CHECK() being called,
8147 * and hence must validate bounds manually.
8148 *
8149 * It is worth noting that because vm_deallocate() will
8150 * round_page() the deallocation size, it's possible for "end"
8151 * to be 0 here due to overflow. We hence must treat it as being
8152 * beyond vm_map_max(map).
8153 *
		 * Similarly, end < start means some wrap around happened,
8155 * which should cause an error or panic.
8156 */
8157 if (end == 0 || end > vm_map_max(map)) {
8158 state |= VMDS_FOUND_GAP;
8159 gap_start = vm_map_max(map);
8160 if (state & VMDS_KERNEL_PMAP) {
8161 __vm_map_delete_gap_panic(map,
8162 gap_start, start, end);
8163 }
8164 goto out;
8165 }
8166
8167 if (end < start) {
8168 if (state & VMDS_KERNEL_PMAP) {
8169 __vm_map_delete_gap_panic(map,
8170 vm_map_max(map), start, end);
8171 }
8172 ret.kmr_return = KERN_INVALID_ARGUMENT;
8173 goto out;
8174 }
8175
8176 if (start < vm_map_min(map)) {
8177 state |= VMDS_FOUND_GAP;
8178 gap_start = start;
8179 if (state & VMDS_KERNEL_PMAP) {
8180 __vm_map_delete_gap_panic(map,
8181 gap_start, start, end);
8182 }
8183 goto out;
8184 }
8185 } else {
8186 /*
8187 * If the map is terminated, we must accept start/end
8188 * being beyond the boundaries of the map as this is
8189 * how some of the mappings like commpage mappings
8190 * can be destroyed (they're outside of those bounds).
8191 *
8192 * end < start is still something we can't cope with,
8193 * so just bail.
8194 */
8195 if (end < start) {
8196 goto out;
8197 }
8198 }
8199
8200
8201 /*
8202 * Find the start of the region.
8203 *
8204 * If in a superpage, extend the range
8205 * to include the start of the mapping.
8206 */
8207 while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8208 if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8209 start = SUPERPAGE_ROUND_DOWN(start);
8210 } else {
8211 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8212 break;
8213 }
8214 }
8215
8216 if (entry->superpage_size) {
8217 end = SUPERPAGE_ROUND_UP(end);
8218 }
8219
8220 /*
8221 * Step through all entries in this region
8222 */
8223 for (vm_map_offset_t s = start; s < end;) {
8224 /*
8225 * At this point, we have deleted all the memory entries
8226 * in [start, s) and are proceeding with the [s, end) range.
8227 *
8228 * This loop might drop the map lock, and it is possible that
8229 * some memory was already reallocated within [start, s)
8230 * and we don't want to mess with those entries.
8231 *
8232 * Some of those entries could even have been re-assembled
8233 * with an entry after "s" (in vm_map_simplify_entry()), so
8234 * we may have to vm_map_clip_start() again.
8235 *
8236 * When clear_in_transition_end is set, the we had marked
8237 * [start, clear_in_transition_end) as "in_transition"
8238 * during a previous iteration and we need to clear it.
8239 */
8240
8241 /*
8242 * Step 1: If needed (because we dropped locks),
8243 * lookup the entry again.
8244 *
8245 * If we're coming back from unwiring (Step 5),
8246 * we also need to mark the entries as no longer
8247 * in transition after that.
8248 */
8249
8250 if (state & VMDS_NEEDS_LOOKUP) {
8251 state &= ~VMDS_NEEDS_LOOKUP;
8252
8253 if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8254 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8255 }
8256
8257 if (state & VMDS_KERNEL_KMEMPTR) {
8258 kmem_validate_slot(s, meta, size_idx, slot_idx);
8259 }
8260 }
8261
8262 if (clear_in_transition_end) {
8263 for (vm_map_entry_t it = entry;
8264 it != vm_map_to_entry(map) &&
8265 it->vme_start < clear_in_transition_end;
8266 it = it->vme_next) {
8267 assert(it->in_transition);
8268 it->in_transition = FALSE;
8269 if (it->needs_wakeup) {
8270 it->needs_wakeup = FALSE;
8271 state |= VMDS_NEEDS_WAKEUP;
8272 }
8273 }
8274
8275 clear_in_transition_end = 0;
8276 }
8277
8278
8279 /*
8280 * Step 2: Perform various policy checks
8281 * before we do _anything_ to this entry.
8282 */
8283
8284 if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8285 if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8286 /*
8287 * Either we found a gap already,
8288 * or we are tearing down a map,
8289 * keep going.
8290 */
8291 } else if (state & VMDS_KERNEL_PMAP) {
8292 __vm_map_delete_gap_panic(map, s, start, end);
8293 } else if (s < end) {
8294 state |= VMDS_FOUND_GAP;
8295 gap_start = s;
8296 }
8297
8298 if (entry == vm_map_to_entry(map) ||
8299 end <= entry->vme_start) {
8300 break;
8301 }
8302
8303 s = entry->vme_start;
8304 }
8305
8306 if (state & VMDS_KERNEL_PMAP) {
8307 /*
8308 * In the kernel map and its submaps,
8309 * permanent entries never die, even
8310 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8311 */
8312 if (entry->vme_permanent) {
8313 __vm_map_delete_permanent_panic(map, start, end, entry);
8314 }
8315
8316 if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8317 end = entry->vme_end;
8318 flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8319 }
8320
8321 /*
8322 * In the kernel map and its submaps,
8323 * the removal of an atomic/guarded entry is strict.
8324 *
8325 * An atomic entry is processed only if it was
8326 * specifically targeted.
8327 *
8328 * We might have deleted non-atomic entries before
		 * we reach this point however...
8330 */
8331 kmem_entry_validate_guard(map, entry,
8332 start, end - start, guard);
8333 }
8334
8335 /*
8336 * Step 2.1: handle "permanent" and "submap" entries
8337 * *before* clipping to avoid triggering some unnecessary
8338 * un-nesting of the shared region.
8339 */
8340 if (entry->vme_permanent && entry->is_sub_map) {
8341 // printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8342 /*
8343 * Un-mapping a "permanent" mapping of a user-space
8344 * submap is not allowed unless...
8345 */
8346 if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8347 /*
8348 * a. explicitly requested by the kernel caller.
8349 */
8350 // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8351 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8352 developer_mode_state()) {
8353 /*
8354 * b. we're in "developer" mode (for
8355 * breakpoints, dtrace probes, ...).
8356 */
8357 // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8358 } else if (map->terminated) {
8359 /*
8360 * c. this is the final address space cleanup.
8361 */
8362 // printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8363 } else {
8364 vm_map_offset_t submap_start, submap_end;
8365 kern_return_t submap_kr;
8366
8367 /*
8368 * Check if there are any "permanent" mappings
8369 * in this range in the submap.
8370 */
8371 if (entry->in_transition) {
8372 /* can that even happen ? */
8373 goto in_transition;
8374 }
8375 /* compute the clipped range in the submap */
8376 submap_start = s - entry->vme_start;
8377 submap_start += VME_OFFSET(entry);
8378 submap_end = end - entry->vme_start;
8379 submap_end += VME_OFFSET(entry);
8380 submap_kr = vm_map_delete_submap_recurse(
8381 VME_SUBMAP(entry),
8382 submap_start,
8383 submap_end);
8384 if (submap_kr != KERN_SUCCESS) {
8385 /*
8386 * There are some "permanent" mappings
8387 * in the submap: we are not allowed
8388 * to remove this range.
8389 */
8390 printf("%d[%s] removing permanent submap entry "
8391 "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8392 proc_selfpid(),
8393 (get_bsdtask_info(current_task())
8394 ? proc_name_address(get_bsdtask_info(current_task()))
8395 : "?"), entry,
8396 (uint64_t)entry->vme_start,
8397 (uint64_t)entry->vme_end,
8398 entry->protection,
8399 entry->max_protection);
8400 DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8401 vm_map_entry_t, entry,
8402 vm_map_offset_t, entry->vme_start,
8403 vm_map_offset_t, entry->vme_end,
8404 vm_prot_t, entry->protection,
8405 vm_prot_t, entry->max_protection,
8406 int, VME_ALIAS(entry));
8407 ret.kmr_return = KERN_PROTECTION_FAILURE;
8408 goto out;
8409 }
8410 /* no permanent mappings: proceed */
8411 }
8412 }
8413
8414 /*
8415 * Step 3: Perform any clipping needed.
8416 *
8417 * After this, "entry" starts at "s", ends before "end"
8418 */
8419
8420 if (entry->vme_start < s) {
8421 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8422 entry->map_aligned &&
8423 !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8424 /*
8425 * The entry will no longer be map-aligned
8426 * after clipping and the caller said it's OK.
8427 */
8428 entry->map_aligned = FALSE;
8429 }
8430 vm_map_clip_start(map, entry, s);
8431 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8432 }
8433
8434 if (end < entry->vme_end) {
8435 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8436 entry->map_aligned &&
8437 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8438 /*
8439 * The entry will no longer be map-aligned
8440 * after clipping and the caller said it's OK.
8441 */
8442 entry->map_aligned = FALSE;
8443 }
8444 vm_map_clip_end(map, entry, end);
8445 }
8446
8447 if (entry->vme_permanent && entry->is_sub_map) {
8448 /*
8449 * We already went through step 2.1 which did not deny
8450 * the removal of this "permanent" and "is_sub_map"
8451 * entry.
8452 * Now that we've clipped what we actually want to
8453 * delete, undo the "permanent" part to allow the
8454 * removal to proceed.
8455 */
8456 DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8457 vm_map_entry_t, entry,
8458 vm_map_offset_t, entry->vme_start,
8459 vm_map_offset_t, entry->vme_end,
8460 vm_prot_t, entry->protection,
8461 vm_prot_t, entry->max_protection,
8462 int, VME_ALIAS(entry));
8463 entry->vme_permanent = false;
8464 }
8465
8466 assert(s == entry->vme_start);
8467 assert(entry->vme_end <= end);
8468
8469
8470 /*
8471 * Step 4: If the entry is in flux, wait for this to resolve.
8472 */
8473
8474 if (entry->in_transition) {
8475 wait_result_t wait_result;
8476
8477 in_transition:
8478 /*
8479 * Another thread is wiring/unwiring this entry.
8480 * Let the other thread know we are waiting.
8481 */
8482
8483 entry->needs_wakeup = TRUE;
8484
8485 /*
8486 * wake up anybody waiting on entries that we have
8487 * already unwired/deleted.
8488 */
8489 if (state & VMDS_NEEDS_WAKEUP) {
8490 vm_map_entry_wakeup(map);
8491 state &= ~VMDS_NEEDS_WAKEUP;
8492 }
8493
8494 wait_result = vm_map_entry_wait(map, interruptible);
8495
8496 if (interruptible &&
8497 wait_result == THREAD_INTERRUPTED) {
8498 /*
8499 * We do not clear the needs_wakeup flag,
8500 * since we cannot tell if we were the only one.
8501 */
8502 ret.kmr_return = KERN_ABORTED;
8503 return ret;
8504 }
8505
8506 /*
8507 * The entry could have been clipped or it
8508 * may not exist anymore. Look it up again.
8509 */
8510 state |= VMDS_NEEDS_LOOKUP;
8511 continue;
8512 }
8513
8514
8515 /*
8516 * Step 5: Handle wiring
8517 */
8518
8519 if (entry->wired_count) {
8520 struct vm_map_entry tmp_entry;
8521 boolean_t user_wire;
8522 unsigned int last_timestamp;
8523
8524 user_wire = entry->user_wired_count > 0;
8525
8526 /*
8527 * Remove a kernel wiring if requested
8528 */
8529 if (flags & VM_MAP_REMOVE_KUNWIRE) {
8530 entry->wired_count--;
8531 vme_btref_consider_and_put(entry);
8532 }
8533
8534 /*
8535 * Remove all user wirings for proper accounting
8536 */
8537 while (entry->user_wired_count) {
8538 subtract_wire_counts(map, entry, user_wire);
8539 }
8540
8541 /*
8542 * All our DMA I/O operations in IOKit are currently
8543 * done by wiring through the map entries of the task
8544 * requesting the I/O.
8545 *
8546 * Because of this, we must always wait for kernel wirings
8547 * to go away on the entries before deleting them.
8548 *
8549 * Any caller who wants to actually remove a kernel wiring
8550 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8551 * properly remove one wiring instead of blasting through
8552 * them all.
8553 */
8554 if (entry->wired_count != 0) {
8555 assert(map != kernel_map);
8556 /*
8557 * Cannot continue. Typical case is when
8558 * a user thread has physical io pending on
8559 * on this page. Either wait for the
8560 * kernel wiring to go away or return an
8561 * error.
8562 */
8563 wait_result_t wait_result;
8564
8565 entry->needs_wakeup = TRUE;
8566 wait_result = vm_map_entry_wait(map,
8567 interruptible);
8568
8569 if (interruptible &&
8570 wait_result == THREAD_INTERRUPTED) {
8571 /*
8572 * We do not clear the
8573 * needs_wakeup flag, since we
8574 * cannot tell if we were the
8575 * only one.
8576 */
8577 ret.kmr_return = KERN_ABORTED;
8578 return ret;
8579 }
8580
8581
8582 /*
8583 * The entry could have been clipped or
8584 * it may not exist anymore. Look it
8585 * up again.
8586 */
8587 state |= VMDS_NEEDS_LOOKUP;
8588 continue;
8589 }
8590
8591 /*
8592 * We can unlock the map now.
8593 *
8594 * The entry might be split once we unlock the map,
8595 * but we need the range as defined by this entry
8596 * to be stable. So we must make a local copy.
8597 *
8598 * The underlying objects do not change during clips,
			 * and the in_transition state guarantees existence
8600 * of the entry.
8601 */
8602 last_timestamp = map->timestamp;
8603 entry->in_transition = TRUE;
8604 tmp_entry = *entry;
8605 vm_map_unlock(map);
8606
8607 if (tmp_entry.is_sub_map) {
8608 vm_map_t sub_map;
8609 vm_map_offset_t sub_start, sub_end;
8610 pmap_t pmap;
8611 vm_map_offset_t pmap_addr;
8612
8613
8614 sub_map = VME_SUBMAP(&tmp_entry);
8615 sub_start = VME_OFFSET(&tmp_entry);
8616 sub_end = sub_start + (tmp_entry.vme_end -
8617 tmp_entry.vme_start);
8618 if (tmp_entry.use_pmap) {
8619 pmap = sub_map->pmap;
8620 pmap_addr = tmp_entry.vme_start;
8621 } else {
8622 pmap = map->pmap;
8623 pmap_addr = tmp_entry.vme_start;
8624 }
8625 (void) vm_map_unwire_nested(sub_map,
8626 sub_start, sub_end,
8627 user_wire,
8628 pmap, pmap_addr);
8629 } else {
8630 vm_map_offset_t entry_end = tmp_entry.vme_end;
8631 vm_map_offset_t max_end;
8632
8633 if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8634 max_end = end - VM_MAP_PAGE_SIZE(map);
8635 if (entry_end > max_end) {
8636 entry_end = max_end;
8637 }
8638 }
8639
8640 if (tmp_entry.vme_kernel_object) {
8641 pmap_protect_options(
8642 map->pmap,
8643 tmp_entry.vme_start,
8644 entry_end,
8645 VM_PROT_NONE,
8646 PMAP_OPTIONS_REMOVE,
8647 NULL);
8648 }
8649 vm_fault_unwire(map, &tmp_entry,
8650 tmp_entry.vme_kernel_object, map->pmap,
8651 tmp_entry.vme_start, entry_end);
8652 }
8653
8654 vm_map_lock(map);
8655
8656 /*
8657 * Unwiring happened, we can now go back to deleting
8658 * them (after we clear the in_transition bit for the range).
8659 */
8660 if (last_timestamp + 1 != map->timestamp) {
8661 state |= VMDS_NEEDS_LOOKUP;
8662 }
8663 clear_in_transition_end = tmp_entry.vme_end;
8664 continue;
8665 }
8666
8667 assert(entry->wired_count == 0);
8668 assert(entry->user_wired_count == 0);
8669
8670
8671 /*
8672 * Step 6: Entry is unwired and ready for us to delete !
8673 */
8674
8675 if (!entry->vme_permanent) {
8676 /*
8677 * Typical case: the entry really shouldn't be permanent
8678 */
8679 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8680 (entry->protection & VM_PROT_EXECUTE) &&
8681 developer_mode_state()) {
8682 /*
8683 * Allow debuggers to undo executable mappings
8684 * when developer mode is on.
8685 */
8686 #if 0
8687 printf("FBDP %d[%s] removing permanent executable entry "
8688 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8689 proc_selfpid(),
8690 (current_task()->bsd_info
8691 ? proc_name_address(current_task()->bsd_info)
8692 : "?"), entry,
8693 (uint64_t)entry->vme_start,
8694 (uint64_t)entry->vme_end,
8695 entry->protection,
8696 entry->max_protection);
8697 #endif
8698 entry->vme_permanent = FALSE;
8699 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8700 #if 0
8701 printf("FBDP %d[%s] removing permanent entry "
8702 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8703 proc_selfpid(),
8704 (current_task()->bsd_info
8705 ? proc_name_address(current_task()->bsd_info)
8706 : "?"), entry,
8707 (uint64_t)entry->vme_start,
8708 (uint64_t)entry->vme_end,
8709 entry->protection,
8710 entry->max_protection);
8711 #endif
8712 entry->vme_permanent = FALSE;
8713 #if CODE_SIGNING_MONITOR
8714 } else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8715 entry->vme_permanent = FALSE;
8716
8717 printf("%d[%s] %s(0x%llx,0x%llx): "
8718 "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8719 "prot 0x%x/0x%x\n",
8720 proc_selfpid(),
8721 (get_bsdtask_info(current_task())
8722 ? proc_name_address(get_bsdtask_info(current_task()))
8723 : "?"),
8724 __FUNCTION__,
8725 (uint64_t)start,
8726 (uint64_t)end,
8727 (uint64_t)entry->vme_start,
8728 (uint64_t)entry->vme_end,
8729 entry->protection,
8730 entry->max_protection);
8731 #endif
8732 } else {
8733 DTRACE_VM6(vm_map_delete_permanent,
8734 vm_map_entry_t, entry,
8735 vm_map_offset_t, entry->vme_start,
8736 vm_map_offset_t, entry->vme_end,
8737 vm_prot_t, entry->protection,
8738 vm_prot_t, entry->max_protection,
8739 int, VME_ALIAS(entry));
8740 }
8741
8742 if (entry->is_sub_map) {
8743 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8744 "map %p (%d) entry %p submap %p (%d)\n",
8745 map, VM_MAP_PAGE_SHIFT(map), entry,
8746 VME_SUBMAP(entry),
8747 VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8748 if (entry->use_pmap) {
8749 #ifndef NO_NESTED_PMAP
8750 int pmap_flags;
8751
8752 if (map->terminated) {
8753 /*
8754 * This is the final cleanup of the
8755 * address space being terminated.
8756 * No new mappings are expected and
8757 * we don't really need to unnest the
8758 * shared region (and lose the "global"
8759 * pmap mappings, if applicable).
8760 *
8761 * Tell the pmap layer that we're
8762 * "clean" wrt nesting.
8763 */
8764 pmap_flags = PMAP_UNNEST_CLEAN;
8765 } else {
8766 /*
8767 * We're unmapping part of the nested
8768 * shared region, so we can't keep the
8769 * nested pmap.
8770 */
8771 pmap_flags = 0;
8772 }
8773 pmap_unnest_options(
8774 map->pmap,
8775 (addr64_t)entry->vme_start,
8776 entry->vme_end - entry->vme_start,
8777 pmap_flags);
8778 #endif /* NO_NESTED_PMAP */
8779 if (map->mapped_in_other_pmaps &&
8780 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8781 /* clean up parent map/maps */
8782 vm_map_submap_pmap_clean(
8783 map, entry->vme_start,
8784 entry->vme_end,
8785 VME_SUBMAP(entry),
8786 VME_OFFSET(entry));
8787 }
8788 } else {
8789 vm_map_submap_pmap_clean(
8790 map, entry->vme_start, entry->vme_end,
8791 VME_SUBMAP(entry),
8792 VME_OFFSET(entry));
8793 }
8794 } else if (entry->vme_kernel_object ||
8795 VME_OBJECT(entry) == compressor_object) {
8796 /*
8797 * nothing to do
8798 */
8799 } else if (map->mapped_in_other_pmaps &&
8800 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8801 vm_object_pmap_protect_options(
8802 VME_OBJECT(entry), VME_OFFSET(entry),
8803 entry->vme_end - entry->vme_start,
8804 PMAP_NULL,
8805 PAGE_SIZE,
8806 entry->vme_start,
8807 VM_PROT_NONE,
8808 PMAP_OPTIONS_REMOVE);
8809 } else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8810 (state & VMDS_KERNEL_PMAP)) {
8811 /* Remove translations associated
8812 * with this range unless the entry
8813 * does not have an object, or
8814 * it's the kernel map or a descendant
8815 * since the platform could potentially
8816 * create "backdoor" mappings invisible
8817 * to the VM. It is expected that
8818 * objectless, non-kernel ranges
8819 * do not have such VM invisible
8820 * translations.
8821 */
8822 pmap_remove_options(map->pmap,
8823 (addr64_t)entry->vme_start,
8824 (addr64_t)entry->vme_end,
8825 PMAP_OPTIONS_REMOVE);
8826 }
8827
8828 #if DEBUG
8829 /*
8830 * All pmap mappings for this map entry must have been
8831 * cleared by now.
8832 */
8833 assert(pmap_is_empty(map->pmap,
8834 entry->vme_start,
8835 entry->vme_end));
8836 #endif /* DEBUG */
8837
8838 if (entry->iokit_acct) {
8839 /* alternate accounting */
8840 DTRACE_VM4(vm_map_iokit_unmapped_region,
8841 vm_map_t, map,
8842 vm_map_offset_t, entry->vme_start,
8843 vm_map_offset_t, entry->vme_end,
8844 int, VME_ALIAS(entry));
8845 vm_map_iokit_unmapped_region(map,
8846 (entry->vme_end -
8847 entry->vme_start));
8848 entry->iokit_acct = FALSE;
8849 entry->use_pmap = FALSE;
8850 }
8851
8852 /* move "s" forward */
8853 s = entry->vme_end;
8854 next = entry->vme_next;
8855 if (!entry->map_aligned) {
8856 vm_map_offset_t rounded_s;
8857
8858 /*
8859 * Skip artificial gap due to mis-aligned entry
8860 * on devices with a page size smaller than the
8861 * map's page size (i.e. 16k task on a 4k device).
8862 */
8863 rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8864 if (next == vm_map_to_entry(map)) {
8865 s = rounded_s;
8866 } else if (s < rounded_s) {
8867 s = MIN(rounded_s, next->vme_start);
8868 }
8869 }
8870 ret.kmr_size += s - entry->vme_start;
8871
8872 if (entry->vme_permanent) {
8873 /*
8874 * A permanent entry can not be removed, so leave it
8875 * in place but remove all access permissions.
8876 */
8877 if (!entry->csm_associated) {
8878 printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8879 __FUNCTION__, __LINE__,
8880 proc_selfpid(),
8881 (get_bsdtask_info(current_task())
8882 ? proc_name_address(get_bsdtask_info(current_task()))
8883 : "?"),
8884 map,
8885 entry,
8886 (uint64_t)entry->vme_start,
8887 (uint64_t)entry->vme_end,
8888 entry->is_sub_map,
8889 entry->protection,
8890 entry->max_protection);
8891 }
8892 DTRACE_VM6(vm_map_delete_permanent_prot_none,
8893 vm_map_entry_t, entry,
8894 vm_map_offset_t, entry->vme_start,
8895 vm_map_offset_t, entry->vme_end,
8896 vm_prot_t, entry->protection,
8897 vm_prot_t, entry->max_protection,
8898 int, VME_ALIAS(entry));
8899 entry->protection = VM_PROT_NONE;
8900 entry->max_protection = VM_PROT_NONE;
8901 } else {
8902 vm_map_entry_zap(map, entry, zap_list);
8903 }
8904
8905 entry = next;
8906 next = VM_MAP_ENTRY_NULL;
8907
8908 if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8909 unsigned int last_timestamp = map->timestamp++;
8910
8911 if (lck_rw_lock_yield_exclusive(&map->lock,
8912 LCK_RW_YIELD_ANY_WAITER)) {
8913 if (last_timestamp != map->timestamp + 1) {
8914 state |= VMDS_NEEDS_LOOKUP;
8915 }
8916 } else {
8917 /* we didn't yield, undo our change */
8918 map->timestamp--;
8919 }
8920 }
8921 }
8922
8923 if (map->wait_for_space) {
8924 thread_wakeup((event_t) map);
8925 }
8926
8927 if (state & VMDS_NEEDS_WAKEUP) {
8928 vm_map_entry_wakeup(map);
8929 }
8930
8931 out:
8932 if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
8933 __vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8934 }
8935
8936 if (state & VMDS_KERNEL_KMEMPTR) {
8937 kmem_free_space(start, end, range_id, &slot);
8938 }
8939
8940 if (state & VMDS_FOUND_GAP) {
8941 DTRACE_VM3(kern_vm_deallocate_gap,
8942 vm_map_offset_t, gap_start,
8943 vm_map_offset_t, save_start,
8944 vm_map_offset_t, save_end);
8945 if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
8946 ret.kmr_return = KERN_INVALID_VALUE;
8947 } else {
8948 vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
8949 }
8950 }
8951
8952 return ret;
8953 }
8954
8955 kmem_return_t
vm_map_remove_and_unlock(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8956 vm_map_remove_and_unlock(
8957 vm_map_t map,
8958 vm_map_offset_t start,
8959 vm_map_offset_t end,
8960 vmr_flags_t flags,
8961 kmem_guard_t guard)
8962 {
8963 kmem_return_t ret;
8964 VM_MAP_ZAP_DECLARE(zap);
8965
8966 ret = vm_map_delete(map, start, end, flags, guard, &zap);
8967 vm_map_unlock(map);
8968
8969 vm_map_zap_dispose(&zap);
8970
8971 return ret;
8972 }
8973
8974 /*
8975 * vm_map_remove_guard:
8976 *
8977 * Remove the given address range from the target map.
8978 * This is the exported form of vm_map_delete.
8979 */
/*
 * Lock the map, then remove [start, end) via vm_map_remove_and_unlock(),
 * which drops the lock and disposes of the removed entries.
 * "guard" describes the kmem guard state the entries are expected to match.
 */
kmem_return_t
vm_map_remove_guard(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vmr_flags_t     flags,
	kmem_guard_t    guard)
{
	vm_map_lock(map);
	return vm_map_remove_and_unlock(map, start, end, flags, guard);
}
8991
8992 /*
8993 * vm_map_terminate:
8994 *
8995 * Clean out a task's map.
8996 */
kern_return_t
vm_map_terminate(
	vm_map_t        map)
{
	vm_map_lock(map);
	/*
	 * Mark the map as terminated before deleting anything: the
	 * removal path checks map->terminated to allow "permanent"
	 * entries to be removed and to keep nested pmaps "clean"
	 * during this final teardown.
	 */
	map->terminated = TRUE;
	/* No further allocations expected: hole tracking is no longer useful. */
	vm_map_disable_hole_optimization(map);
	/* Remove the entire address range; unlocks the map. */
	(void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
	return KERN_SUCCESS;
}
9008
9009 /*
9010 * Routine: vm_map_copy_allocate
9011 *
9012 * Description:
9013 * Allocates and initializes a map copy object.
9014 */
9015 static vm_map_copy_t
vm_map_copy_allocate(uint16_t type)9016 vm_map_copy_allocate(uint16_t type)
9017 {
9018 vm_map_copy_t new_copy;
9019
9020 new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
9021 new_copy->type = type;
9022 if (type == VM_MAP_COPY_ENTRY_LIST) {
9023 new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
9024 vm_map_store_init(&new_copy->cpy_hdr);
9025 }
9026 return new_copy;
9027 }
9028
9029 /*
9030 * Routine: vm_map_copy_discard
9031 *
9032 * Description:
9033 * Dispose of a map copy object (returned by
9034 * vm_map_copyin).
9035 */
9036 void
vm_map_copy_discard(vm_map_copy_t copy)9037 vm_map_copy_discard(
9038 vm_map_copy_t copy)
9039 {
9040 if (copy == VM_MAP_COPY_NULL) {
9041 return;
9042 }
9043
9044 /*
9045 * Assert that the vm_map_copy is coming from the right
9046 * zone and hasn't been forged
9047 */
9048 vm_map_copy_require(copy);
9049
9050 switch (copy->type) {
9051 case VM_MAP_COPY_ENTRY_LIST:
9052 while (vm_map_copy_first_entry(copy) !=
9053 vm_map_copy_to_entry(copy)) {
9054 vm_map_entry_t entry = vm_map_copy_first_entry(copy);
9055
9056 vm_map_copy_entry_unlink(copy, entry);
9057 if (entry->is_sub_map) {
9058 vm_map_deallocate(VME_SUBMAP(entry));
9059 } else {
9060 vm_object_deallocate(VME_OBJECT(entry));
9061 }
9062 vm_map_copy_entry_dispose(entry);
9063 }
9064 break;
9065 case VM_MAP_COPY_KERNEL_BUFFER:
9066
9067 /*
9068 * The vm_map_copy_t and possibly the data buffer were
9069 * allocated by a single call to kalloc_data(), i.e. the
9070 * vm_map_copy_t was not allocated out of the zone.
9071 */
9072 if (copy->size > msg_ool_size_small || copy->offset) {
9073 panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
9074 (long long)copy->size, (long long)copy->offset);
9075 }
9076 kfree_data(copy->cpy_kdata, copy->size);
9077 }
9078 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
9079 }
9080
9081 #if XNU_PLATFORM_MacOSX
9082
9083 /*
9084 * Routine: vm_map_copy_copy
9085 *
9086 * Description:
9087 * Move the information in a map copy object to
9088 * a new map copy object, leaving the old one
9089 * empty.
9090 *
9091 * This is used by kernel routines that need
9092 * to look at out-of-line data (in copyin form)
9093 * before deciding whether to return SUCCESS.
9094 * If the routine returns FAILURE, the original
9095 * copy object will be deallocated; therefore,
9096 * these routines must make a copy of the copy
9097 * object and leave the original empty so that
9098 * deallocation will not fail.
9099 */
vm_map_copy_t
vm_map_copy_copy(
	vm_map_copy_t   copy)
{
	vm_map_copy_t   new_copy;

	if (copy == VM_MAP_COPY_NULL) {
		return VM_MAP_COPY_NULL;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/*
	 * Allocate a new copy object, and copy the information
	 * from the old one into it.
	 */

	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
#if __has_feature(ptrauth_calls)
	/*
	 * NOTE(review): the explicit reassignment below appears to exist so
	 * that the kernel-buffer data pointer is re-signed for its new
	 * location under pointer authentication (a raw memcpy would carry
	 * the signature diversified for the old address) — confirm against
	 * the cpy_kdata field's __ptrauth qualifier.
	 */
	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
		new_copy->cpy_kdata = copy->cpy_kdata;
	}
#endif

	if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
		/*
		 * The links in the entry chain must be
		 * changed to point to the new copy object.
		 */
		vm_map_copy_first_entry(copy)->vme_prev
		        = vm_map_copy_to_entry(new_copy);
		vm_map_copy_last_entry(copy)->vme_next
		        = vm_map_copy_to_entry(new_copy);
	}

	/*
	 * Change the old copy object into one that contains
	 * nothing to be deallocated: zero it and give it the
	 * KERNEL_BUFFER type (size 0, kdata NULL), so a later
	 * vm_map_copy_discard() on it frees nothing but the shell.
	 */
	bzero(copy, sizeof(struct vm_map_copy));
	copy->type = VM_MAP_COPY_KERNEL_BUFFER;

	/*
	 * Return the new object.
	 */
	return new_copy;
}
9152
9153 #endif /* XNU_PLATFORM_MacOSX */
9154
/*
 * Decide whether "entry" in "dst_map" may be overwritten by a
 * copy-overwrite operation.  Returns FALSE for non-writable entries
 * and, on non-x86 targets, for JIT regions under code-signing
 * enforcement, TPRO-protected entries (arm64e), and non-submap
 * "permanent" entries.  Permanent submap entries are allowed here;
 * callers must recurse (e.g. vm_map_overwrite_submap_recurse()) to
 * check inside the submap.
 */
static boolean_t
vm_map_entry_is_overwritable(
	vm_map_t        dst_map __unused,
	vm_map_entry_t  entry)
{
	if (!(entry->protection & VM_PROT_WRITE)) {
		/* can't overwrite if not writable */
		return FALSE;
	}
#if !__x86_64__
	if (entry->used_for_jit &&
	    vm_map_cs_enforcement(dst_map) &&
	    !dst_map->cs_debugged) {
		/*
		 * Can't overwrite a JIT region while cs_enforced
		 * and not cs_debugged.
		 */
		return FALSE;
	}

#if __arm64e__
	/* Do not allow overwrite HW assisted TPRO entries */
	if (entry->used_for_tpro) {
		return FALSE;
	}
#endif /* __arm64e__ */

	if (entry->vme_permanent) {
		if (entry->is_sub_map) {
			/*
			 * We can't tell if the submap contains "permanent"
			 * entries within the range targeted by the caller.
			 * The caller will have to check for that with
			 * vm_map_overwrite_submap_recurse() for example.
			 */
		} else {
			/*
			 * Do not allow overwriting of a "permanent"
			 * entry.
			 */
			DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
			    vm_map_entry_t, entry,
			    vm_map_offset_t, entry->vme_start,
			    vm_map_offset_t, entry->vme_end,
			    vm_prot_t, entry->protection,
			    vm_prot_t, entry->max_protection,
			    int, VME_ALIAS(entry));
			return FALSE;
		}
	}
#endif /* !__x86_64__ */
	return TRUE;
}
9208
/*
 * Verify that [dst_addr, dst_addr + dst_size) in "dst_map" is fully
 * mapped by a contiguous run of writable, overwritable entries,
 * recursing into any submaps encountered.
 *
 * Locking: enters and leaves with "dst_map" unlocked; the map lock is
 * taken internally and dropped around recursion and on every return
 * path.  Waits (THREAD_UNINT) on in-transition entries and restarts
 * the scan from the top, since anything can change while unlocked.
 *
 * Returns:
 *   KERN_SUCCESS            range is fully overwritable
 *   KERN_INVALID_ADDRESS    hole or non-contiguous range
 *   KERN_PROTECTION_FAILURE entry not writable / not overwritable
 *   KERN_FAILURE            permanent object found after a submap
 */
static kern_return_t
vm_map_overwrite_submap_recurse(
	vm_map_t        dst_map,
	vm_map_offset_t dst_addr,
	vm_map_size_t   dst_size)
{
	vm_map_offset_t dst_end;
	vm_map_entry_t  tmp_entry;
	vm_map_entry_t  entry;
	kern_return_t   result;
	boolean_t       encountered_sub_map = FALSE;



	/*
	 * Verify that the destination is all writeable
	 * initially. We have to trunc the destination
	 * address and round the copy size or we'll end up
	 * splitting entries in strange ways.
	 */

	dst_end = vm_map_round_page(dst_addr + dst_size,
	    VM_MAP_PAGE_MASK(dst_map));
	vm_map_lock(dst_map);

start_pass_1:
	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
		vm_map_unlock(dst_map);
		return KERN_INVALID_ADDRESS;
	}

	vm_map_clip_start(dst_map,
	    tmp_entry,
	    vm_map_trunc_page(dst_addr,
	    VM_MAP_PAGE_MASK(dst_map)));
	if (tmp_entry->is_sub_map) {
		/* clipping did unnest if needed */
		assert(!tmp_entry->use_pmap);
	}

	/* Walk entries from tmp_entry until the range is covered. */
	for (entry = tmp_entry;;) {
		vm_map_entry_t  next;

		next = entry->vme_next;
		while (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;

			if (entry->in_transition) {
				/*
				 * Say that we are waiting, and wait for entry.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);

				goto start_pass_1;
			}

			encountered_sub_map = TRUE;
			sub_start = VME_OFFSET(entry);

			/* Translate the overlap into submap offsets. */
			if (entry->vme_end < dst_end) {
				sub_end = entry->vme_end;
			} else {
				sub_end = dst_end;
			}
			sub_end -= entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			/* Drop the lock across the recursion. */
			vm_map_unlock(dst_map);

			result = vm_map_overwrite_submap_recurse(
				VME_SUBMAP(entry),
				sub_start,
				sub_end - sub_start);

			/* Map is unlocked here: return directly. */
			if (result != KERN_SUCCESS) {
				return result;
			}
			if (dst_end <= entry->vme_end) {
				return KERN_SUCCESS;
			}
			/* More range remains: relock and re-lookup past
			 * the submap entry, which may have changed. */
			vm_map_lock(dst_map);
			if (!vm_map_lookup_entry(dst_map, local_end,
			    &tmp_entry)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}
			entry = tmp_entry;
			next = entry->vme_next;
		}

		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * If the entry is in transition, we must wait
		 * for it to exit that state. Anything could happen
		 * when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto start_pass_1;
		}

		/*
		 * our range is contained completely within this map entry
		 */
		if (dst_end <= entry->vme_end) {
			vm_map_unlock(dst_map);
			return KERN_SUCCESS;
		}
		/*
		 * check that range specified is contiguous region
		 */
		if ((next == vm_map_to_entry(dst_map)) ||
		    (next->vme_start != entry->vme_end)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/*
		 * Check for permanent objects in the destination.
		 * (Only rejected once a submap has been traversed.)
		 */
		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
		    ((!VME_OBJECT(entry)->internal) ||
		    (VME_OBJECT(entry)->true_share))) {
			if (encountered_sub_map) {
				vm_map_unlock(dst_map);
				return KERN_FAILURE;
			}
		}


		entry = next;
	}/* for */
	/* NOTE(review): the loop above only exits via return; this tail
	 * appears unreachable and is kept for safety. */
	vm_map_unlock(dst_map);
	return KERN_SUCCESS;
}
9361
9362 /*
9363 * Routine: vm_map_copy_overwrite
9364 *
9365 * Description:
9366 * Copy the memory described by the map copy
9367 * object (copy; returned by vm_map_copyin) onto
9368 * the specified destination region (dst_map, dst_addr).
9369 * The destination must be writeable.
9370 *
9371 * Unlike vm_map_copyout, this routine actually
9372 * writes over previously-mapped memory. If the
9373 * previous mapping was to a permanent (user-supplied)
9374 * memory object, it is preserved.
9375 *
9376 * The attributes (protection and inheritance) of the
9377 * destination region are preserved.
9378 *
9379 * If successful, consumes the copy object.
9380 * Otherwise, the caller is responsible for it.
9381 *
9382 * Implementation notes:
9383 * To overwrite aligned temporary virtual memory, it is
9384 * sufficient to remove the previous mapping and insert
9385 * the new copy. This replacement is done either on
9386 * the whole region (if no permanent virtual memory
9387 * objects are embedded in the destination region) or
9388 * in individual map entries.
9389 *
 * To overwrite permanent virtual memory, it is necessary
9391 * to copy each page, as the external memory management
9392 * interface currently does not provide any optimizations.
9393 *
9394 * Unaligned memory also has to be copied. It is possible
9395 * to use 'vm_trickery' to copy the aligned data. This is
9396 * not done but not hard to implement.
9397 *
9398 * Once a page of permanent memory has been overwritten,
9399 * it is impossible to interrupt this function; otherwise,
9400 * the call would be neither atomic nor location-independent.
9401 * The kernel-state portion of a user thread must be
9402 * interruptible.
9403 *
9404 * It may be expensive to forward all requests that might
9405 * overwrite permanent memory (vm_write, vm_copy) to
9406 * uninterruptible kernel threads. This routine may be
9407 * called by interruptible threads; however, success is
9408 * not guaranteed -- if the request cannot be performed
9409 * atomically and interruptibly, an error indication is
9410 * returned.
9411 *
9412 * Callers of this function must call vm_map_copy_require on
9413 * previously created vm_map_copy_t or pass a newly created
9414 * one to ensure that it hasn't been forged.
9415 */
9416 static kern_return_t
vm_map_copy_overwrite_nested(vm_map_t dst_map,vm_map_address_t dst_addr,vm_map_copy_t copy,boolean_t interruptible,pmap_t pmap,boolean_t discard_on_success)9417 vm_map_copy_overwrite_nested(
9418 vm_map_t dst_map,
9419 vm_map_address_t dst_addr,
9420 vm_map_copy_t copy,
9421 boolean_t interruptible,
9422 pmap_t pmap,
9423 boolean_t discard_on_success)
9424 {
9425 vm_map_offset_t dst_end;
9426 vm_map_entry_t tmp_entry;
9427 vm_map_entry_t entry;
9428 kern_return_t kr;
9429 boolean_t aligned = TRUE;
9430 boolean_t contains_permanent_objects = FALSE;
9431 boolean_t encountered_sub_map = FALSE;
9432 vm_map_offset_t base_addr;
9433 vm_map_size_t copy_size;
9434 vm_map_size_t total_size;
9435 uint16_t copy_page_shift;
9436
9437 /*
9438 * Check for special kernel buffer allocated
9439 * by new_ipc_kmsg_copyin.
9440 */
9441
9442 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9443 kr = vm_map_copyout_kernel_buffer(
9444 dst_map, &dst_addr,
9445 copy, copy->size, TRUE, discard_on_success);
9446 return kr;
9447 }
9448
9449 /*
9450 * Only works for entry lists at the moment. Will
9451 * support page lists later.
9452 */
9453
9454 assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9455
9456 if (copy->size == 0) {
9457 if (discard_on_success) {
9458 vm_map_copy_discard(copy);
9459 }
9460 return KERN_SUCCESS;
9461 }
9462
9463 copy_page_shift = copy->cpy_hdr.page_shift;
9464
9465 /*
9466 * Verify that the destination is all writeable
9467 * initially. We have to trunc the destination
9468 * address and round the copy size or we'll end up
9469 * splitting entries in strange ways.
9470 */
9471
9472 if (!VM_MAP_PAGE_ALIGNED(copy->size,
9473 VM_MAP_PAGE_MASK(dst_map)) ||
9474 !VM_MAP_PAGE_ALIGNED(copy->offset,
9475 VM_MAP_PAGE_MASK(dst_map)) ||
9476 !VM_MAP_PAGE_ALIGNED(dst_addr,
9477 VM_MAP_PAGE_MASK(dst_map)) ||
9478 copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9479 aligned = FALSE;
9480 dst_end = vm_map_round_page(dst_addr + copy->size,
9481 VM_MAP_PAGE_MASK(dst_map));
9482 } else {
9483 dst_end = dst_addr + copy->size;
9484 }
9485
9486 vm_map_lock(dst_map);
9487
9488 /* LP64todo - remove this check when vm_map_commpage64()
9489 * no longer has to stuff in a map_entry for the commpage
9490 * above the map's max_offset.
9491 */
9492 if (dst_addr >= dst_map->max_offset) {
9493 vm_map_unlock(dst_map);
9494 return KERN_INVALID_ADDRESS;
9495 }
9496
9497 start_pass_1:
9498 if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9499 vm_map_unlock(dst_map);
9500 return KERN_INVALID_ADDRESS;
9501 }
9502 vm_map_clip_start(dst_map,
9503 tmp_entry,
9504 vm_map_trunc_page(dst_addr,
9505 VM_MAP_PAGE_MASK(dst_map)));
9506 for (entry = tmp_entry;;) {
9507 vm_map_entry_t next = entry->vme_next;
9508
9509 while (entry->is_sub_map) {
9510 vm_map_offset_t sub_start;
9511 vm_map_offset_t sub_end;
9512 vm_map_offset_t local_end;
9513
9514 if (entry->in_transition) {
9515 /*
9516 * Say that we are waiting, and wait for entry.
9517 */
9518 entry->needs_wakeup = TRUE;
9519 vm_map_entry_wait(dst_map, THREAD_UNINT);
9520
9521 goto start_pass_1;
9522 }
9523
9524 local_end = entry->vme_end;
9525 if (!(entry->needs_copy)) {
9526 /* if needs_copy we are a COW submap */
9527 /* in such a case we just replace so */
9528 /* there is no need for the follow- */
9529 /* ing check. */
9530 encountered_sub_map = TRUE;
9531 sub_start = VME_OFFSET(entry);
9532
9533 if (entry->vme_end < dst_end) {
9534 sub_end = entry->vme_end;
9535 } else {
9536 sub_end = dst_end;
9537 }
9538 sub_end -= entry->vme_start;
9539 sub_end += VME_OFFSET(entry);
9540 vm_map_unlock(dst_map);
9541
9542 kr = vm_map_overwrite_submap_recurse(
9543 VME_SUBMAP(entry),
9544 sub_start,
9545 sub_end - sub_start);
9546 if (kr != KERN_SUCCESS) {
9547 return kr;
9548 }
9549 vm_map_lock(dst_map);
9550 }
9551
9552 if (dst_end <= entry->vme_end) {
9553 goto start_overwrite;
9554 }
9555 if (!vm_map_lookup_entry(dst_map, local_end,
9556 &entry)) {
9557 vm_map_unlock(dst_map);
9558 return KERN_INVALID_ADDRESS;
9559 }
9560 next = entry->vme_next;
9561 }
9562
9563 if (!(entry->protection & VM_PROT_WRITE)) {
9564 vm_map_unlock(dst_map);
9565 return KERN_PROTECTION_FAILURE;
9566 }
9567
9568 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9569 vm_map_unlock(dst_map);
9570 return KERN_PROTECTION_FAILURE;
9571 }
9572
9573 /*
9574 * If the entry is in transition, we must wait
9575 * for it to exit that state. Anything could happen
9576 * when we unlock the map, so start over.
9577 */
9578 if (entry->in_transition) {
9579 /*
9580 * Say that we are waiting, and wait for entry.
9581 */
9582 entry->needs_wakeup = TRUE;
9583 vm_map_entry_wait(dst_map, THREAD_UNINT);
9584
9585 goto start_pass_1;
9586 }
9587
9588 /*
9589 * our range is contained completely within this map entry
9590 */
9591 if (dst_end <= entry->vme_end) {
9592 break;
9593 }
9594 /*
9595 * check that range specified is contiguous region
9596 */
9597 if ((next == vm_map_to_entry(dst_map)) ||
9598 (next->vme_start != entry->vme_end)) {
9599 vm_map_unlock(dst_map);
9600 return KERN_INVALID_ADDRESS;
9601 }
9602
9603
9604 /*
9605 * Check for permanent objects in the destination.
9606 */
9607 if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9608 ((!VME_OBJECT(entry)->internal) ||
9609 (VME_OBJECT(entry)->true_share))) {
9610 contains_permanent_objects = TRUE;
9611 }
9612
9613 entry = next;
9614 }/* for */
9615
9616 start_overwrite:
9617 /*
9618 * If there are permanent objects in the destination, then
9619 * the copy cannot be interrupted.
9620 */
9621
9622 if (interruptible && contains_permanent_objects) {
9623 vm_map_unlock(dst_map);
9624 return KERN_FAILURE; /* XXX */
9625 }
9626
9627 /*
9628 *
9629 * Make a second pass, overwriting the data
9630 * At the beginning of each loop iteration,
9631 * the next entry to be overwritten is "tmp_entry"
9632 * (initially, the value returned from the lookup above),
9633 * and the starting address expected in that entry
9634 * is "start".
9635 */
9636
9637 total_size = copy->size;
9638 if (encountered_sub_map) {
9639 copy_size = 0;
9640 /* re-calculate tmp_entry since we've had the map */
9641 /* unlocked */
9642 if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9643 vm_map_unlock(dst_map);
9644 return KERN_INVALID_ADDRESS;
9645 }
9646 } else {
9647 copy_size = copy->size;
9648 }
9649
9650 base_addr = dst_addr;
9651 while (TRUE) {
9652 /* deconstruct the copy object and do in parts */
9653 /* only in sub_map, interruptable case */
9654 vm_map_entry_t copy_entry;
9655 vm_map_entry_t previous_prev = VM_MAP_ENTRY_NULL;
9656 vm_map_entry_t next_copy = VM_MAP_ENTRY_NULL;
9657 int nentries;
9658 int remaining_entries = 0;
9659 vm_map_offset_t new_offset = 0;
9660
9661 for (entry = tmp_entry; copy_size == 0;) {
9662 vm_map_entry_t next;
9663
9664 next = entry->vme_next;
9665
9666 /* tmp_entry and base address are moved along */
9667 /* each time we encounter a sub-map. Otherwise */
9668 /* entry can outpase tmp_entry, and the copy_size */
9669 /* may reflect the distance between them */
9670 /* if the current entry is found to be in transition */
9671 /* we will start over at the beginning or the last */
9672 /* encounter of a submap as dictated by base_addr */
9673 /* we will zero copy_size accordingly. */
9674 if (entry->in_transition) {
9675 /*
9676 * Say that we are waiting, and wait for entry.
9677 */
9678 entry->needs_wakeup = TRUE;
9679 vm_map_entry_wait(dst_map, THREAD_UNINT);
9680
9681 if (!vm_map_lookup_entry(dst_map, base_addr,
9682 &tmp_entry)) {
9683 vm_map_unlock(dst_map);
9684 return KERN_INVALID_ADDRESS;
9685 }
9686 copy_size = 0;
9687 entry = tmp_entry;
9688 continue;
9689 }
9690 if (entry->is_sub_map) {
9691 vm_map_offset_t sub_start;
9692 vm_map_offset_t sub_end;
9693 vm_map_offset_t local_end;
9694
9695 if (entry->needs_copy) {
9696 /* if this is a COW submap */
9697 /* just back the range with a */
9698 /* anonymous entry */
9699 assert(!entry->vme_permanent);
9700 if (entry->vme_end < dst_end) {
9701 sub_end = entry->vme_end;
9702 } else {
9703 sub_end = dst_end;
9704 }
9705 if (entry->vme_start < base_addr) {
9706 sub_start = base_addr;
9707 } else {
9708 sub_start = entry->vme_start;
9709 }
9710 vm_map_clip_end(
9711 dst_map, entry, sub_end);
9712 vm_map_clip_start(
9713 dst_map, entry, sub_start);
9714 assert(!entry->use_pmap);
9715 assert(!entry->iokit_acct);
9716 entry->use_pmap = TRUE;
9717 vm_map_deallocate(VME_SUBMAP(entry));
9718 assert(!entry->vme_permanent);
9719 VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9720 VME_OFFSET_SET(entry, 0);
9721 entry->is_shared = FALSE;
9722 entry->needs_copy = FALSE;
9723 entry->protection = VM_PROT_DEFAULT;
9724 entry->max_protection = VM_PROT_ALL;
9725 entry->wired_count = 0;
9726 entry->user_wired_count = 0;
9727 if (entry->inheritance
9728 == VM_INHERIT_SHARE) {
9729 entry->inheritance = VM_INHERIT_COPY;
9730 }
9731 continue;
9732 }
9733 /* first take care of any non-sub_map */
9734 /* entries to send */
9735 if (base_addr < entry->vme_start) {
9736 /* stuff to send */
9737 copy_size =
9738 entry->vme_start - base_addr;
9739 break;
9740 }
9741 sub_start = VME_OFFSET(entry);
9742
9743 if (entry->vme_end < dst_end) {
9744 sub_end = entry->vme_end;
9745 } else {
9746 sub_end = dst_end;
9747 }
9748 sub_end -= entry->vme_start;
9749 sub_end += VME_OFFSET(entry);
9750 local_end = entry->vme_end;
9751 vm_map_unlock(dst_map);
9752 copy_size = sub_end - sub_start;
9753
9754 /* adjust the copy object */
9755 if (total_size > copy_size) {
9756 vm_map_size_t local_size = 0;
9757 vm_map_size_t entry_size;
9758
9759 nentries = 1;
9760 new_offset = copy->offset;
9761 copy_entry = vm_map_copy_first_entry(copy);
9762 while (copy_entry !=
9763 vm_map_copy_to_entry(copy)) {
9764 entry_size = copy_entry->vme_end -
9765 copy_entry->vme_start;
9766 if ((local_size < copy_size) &&
9767 ((local_size + entry_size)
9768 >= copy_size)) {
9769 vm_map_copy_clip_end(copy,
9770 copy_entry,
9771 copy_entry->vme_start +
9772 (copy_size - local_size));
9773 entry_size = copy_entry->vme_end -
9774 copy_entry->vme_start;
9775 local_size += entry_size;
9776 new_offset += entry_size;
9777 }
9778 if (local_size >= copy_size) {
9779 next_copy = copy_entry->vme_next;
9780 copy_entry->vme_next =
9781 vm_map_copy_to_entry(copy);
9782 previous_prev =
9783 copy->cpy_hdr.links.prev;
9784 copy->cpy_hdr.links.prev = copy_entry;
9785 copy->size = copy_size;
9786 remaining_entries =
9787 copy->cpy_hdr.nentries;
9788 remaining_entries -= nentries;
9789 copy->cpy_hdr.nentries = nentries;
9790 break;
9791 } else {
9792 local_size += entry_size;
9793 new_offset += entry_size;
9794 nentries++;
9795 }
9796 copy_entry = copy_entry->vme_next;
9797 }
9798 }
9799
9800 if ((entry->use_pmap) && (pmap == NULL)) {
9801 kr = vm_map_copy_overwrite_nested(
9802 VME_SUBMAP(entry),
9803 sub_start,
9804 copy,
9805 interruptible,
9806 VME_SUBMAP(entry)->pmap,
9807 TRUE);
9808 } else if (pmap != NULL) {
9809 kr = vm_map_copy_overwrite_nested(
9810 VME_SUBMAP(entry),
9811 sub_start,
9812 copy,
9813 interruptible, pmap,
9814 TRUE);
9815 } else {
9816 kr = vm_map_copy_overwrite_nested(
9817 VME_SUBMAP(entry),
9818 sub_start,
9819 copy,
9820 interruptible,
9821 dst_map->pmap,
9822 TRUE);
9823 }
9824 if (kr != KERN_SUCCESS) {
9825 if (next_copy != NULL) {
9826 copy->cpy_hdr.nentries +=
9827 remaining_entries;
9828 copy->cpy_hdr.links.prev->vme_next =
9829 next_copy;
9830 copy->cpy_hdr.links.prev
9831 = previous_prev;
9832 copy->size = total_size;
9833 }
9834 return kr;
9835 }
9836 if (dst_end <= local_end) {
9837 return KERN_SUCCESS;
9838 }
9839 /* otherwise copy no longer exists, it was */
9840 /* destroyed after successful copy_overwrite */
9841 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9842 copy->offset = new_offset;
9843 copy->cpy_hdr.page_shift = copy_page_shift;
9844
9845 total_size -= copy_size;
9846 copy_size = 0;
9847 /* put back remainder of copy in container */
9848 if (next_copy != NULL) {
9849 copy->cpy_hdr.nentries = remaining_entries;
9850 copy->cpy_hdr.links.next = next_copy;
9851 copy->cpy_hdr.links.prev = previous_prev;
9852 copy->size = total_size;
9853 next_copy->vme_prev =
9854 vm_map_copy_to_entry(copy);
9855 next_copy = NULL;
9856 }
9857 base_addr = local_end;
9858 vm_map_lock(dst_map);
9859 if (!vm_map_lookup_entry(dst_map,
9860 local_end, &tmp_entry)) {
9861 vm_map_unlock(dst_map);
9862 return KERN_INVALID_ADDRESS;
9863 }
9864 entry = tmp_entry;
9865 continue;
9866 }
9867 if (dst_end <= entry->vme_end) {
9868 copy_size = dst_end - base_addr;
9869 break;
9870 }
9871
9872 if ((next == vm_map_to_entry(dst_map)) ||
9873 (next->vme_start != entry->vme_end)) {
9874 vm_map_unlock(dst_map);
9875 return KERN_INVALID_ADDRESS;
9876 }
9877
9878 entry = next;
9879 }/* for */
9880
9881 next_copy = NULL;
9882 nentries = 1;
9883
9884 /* adjust the copy object */
9885 if (total_size > copy_size) {
9886 vm_map_size_t local_size = 0;
9887 vm_map_size_t entry_size;
9888
9889 new_offset = copy->offset;
9890 copy_entry = vm_map_copy_first_entry(copy);
9891 while (copy_entry != vm_map_copy_to_entry(copy)) {
9892 entry_size = copy_entry->vme_end -
9893 copy_entry->vme_start;
9894 if ((local_size < copy_size) &&
9895 ((local_size + entry_size)
9896 >= copy_size)) {
9897 vm_map_copy_clip_end(copy, copy_entry,
9898 copy_entry->vme_start +
9899 (copy_size - local_size));
9900 entry_size = copy_entry->vme_end -
9901 copy_entry->vme_start;
9902 local_size += entry_size;
9903 new_offset += entry_size;
9904 }
9905 if (local_size >= copy_size) {
9906 next_copy = copy_entry->vme_next;
9907 copy_entry->vme_next =
9908 vm_map_copy_to_entry(copy);
9909 previous_prev =
9910 copy->cpy_hdr.links.prev;
9911 copy->cpy_hdr.links.prev = copy_entry;
9912 copy->size = copy_size;
9913 remaining_entries =
9914 copy->cpy_hdr.nentries;
9915 remaining_entries -= nentries;
9916 copy->cpy_hdr.nentries = nentries;
9917 break;
9918 } else {
9919 local_size += entry_size;
9920 new_offset += entry_size;
9921 nentries++;
9922 }
9923 copy_entry = copy_entry->vme_next;
9924 }
9925 }
9926
9927 if (aligned) {
9928 pmap_t local_pmap;
9929
9930 if (pmap) {
9931 local_pmap = pmap;
9932 } else {
9933 local_pmap = dst_map->pmap;
9934 }
9935
9936 if ((kr = vm_map_copy_overwrite_aligned(
9937 dst_map, tmp_entry, copy,
9938 base_addr, local_pmap)) != KERN_SUCCESS) {
9939 if (next_copy != NULL) {
9940 copy->cpy_hdr.nentries +=
9941 remaining_entries;
9942 copy->cpy_hdr.links.prev->vme_next =
9943 next_copy;
9944 copy->cpy_hdr.links.prev =
9945 previous_prev;
9946 copy->size += copy_size;
9947 }
9948 return kr;
9949 }
9950 vm_map_unlock(dst_map);
9951 } else {
9952 /*
9953 * Performance gain:
9954 *
9955 * if the copy and dst address are misaligned but the same
9956 * offset within the page we can copy_not_aligned the
9957 * misaligned parts and copy aligned the rest. If they are
9958 * aligned but len is unaligned we simply need to copy
9959 * the end bit unaligned. We'll need to split the misaligned
9960 * bits of the region in this case !
9961 */
9962 /* ALWAYS UNLOCKS THE dst_map MAP */
9963 kr = vm_map_copy_overwrite_unaligned(
9964 dst_map,
9965 tmp_entry,
9966 copy,
9967 base_addr,
9968 discard_on_success);
9969 if (kr != KERN_SUCCESS) {
9970 if (next_copy != NULL) {
9971 copy->cpy_hdr.nentries +=
9972 remaining_entries;
9973 copy->cpy_hdr.links.prev->vme_next =
9974 next_copy;
9975 copy->cpy_hdr.links.prev =
9976 previous_prev;
9977 copy->size += copy_size;
9978 }
9979 return kr;
9980 }
9981 }
9982 total_size -= copy_size;
9983 if (total_size == 0) {
9984 break;
9985 }
9986 base_addr += copy_size;
9987 copy_size = 0;
9988 copy->offset = new_offset;
9989 if (next_copy != NULL) {
9990 copy->cpy_hdr.nentries = remaining_entries;
9991 copy->cpy_hdr.links.next = next_copy;
9992 copy->cpy_hdr.links.prev = previous_prev;
9993 next_copy->vme_prev = vm_map_copy_to_entry(copy);
9994 copy->size = total_size;
9995 }
9996 vm_map_lock(dst_map);
9997 while (TRUE) {
9998 if (!vm_map_lookup_entry(dst_map,
9999 base_addr, &tmp_entry)) {
10000 vm_map_unlock(dst_map);
10001 return KERN_INVALID_ADDRESS;
10002 }
10003 if (tmp_entry->in_transition) {
10004 entry->needs_wakeup = TRUE;
10005 vm_map_entry_wait(dst_map, THREAD_UNINT);
10006 } else {
10007 break;
10008 }
10009 }
10010 vm_map_clip_start(dst_map,
10011 tmp_entry,
10012 vm_map_trunc_page(base_addr,
10013 VM_MAP_PAGE_MASK(dst_map)));
10014
10015 entry = tmp_entry;
10016 } /* while */
10017
10018 /*
10019 * Throw away the vm_map_copy object
10020 */
10021 if (discard_on_success) {
10022 vm_map_copy_discard(copy);
10023 }
10024
10025 return KERN_SUCCESS;
10026 }/* vm_map_copy_overwrite */
10027
/*
 *	Routine:	vm_map_copy_overwrite
 *
 *	Description:
 *		Overwrite the destination range [dst_addr, dst_addr + copy_size)
 *		of "dst_map" with the data described by "copy".  When the copy
 *		is an entry list, is large enough, and source/destination share
 *		the same mis-alignment, the copy is split into up to three
 *		pieces -- a mis-aligned "head", a page-aligned middle, and a
 *		mis-aligned "tail" -- so the middle can take the (cheaper)
 *		aligned path inside vm_map_copy_overwrite_nested().  Otherwise
 *		everything goes through one "blunt" nested call.
 *
 *	Ownership:
 *		On success, "copy" (and any head/tail pieces split off it) is
 *		discarded.  On failure, the head/tail pieces are re-linked back
 *		into "copy" so the caller still owns one coherent copy object.
 *
 *	The explicit "copy_size" parameter (rather than trusting copy->size)
 *	guards against TOCTOU modification of the copy object; see the
 *	assertf() before the main nested call below.
 */
kern_return_t
vm_map_copy_overwrite(
	vm_map_t        dst_map,
	vm_map_offset_t dst_addr,
	vm_map_copy_t   copy,
	vm_map_size_t   copy_size,
	boolean_t       interruptible)
{
	vm_map_size_t   head_size, tail_size;
	vm_map_copy_t   head_copy, tail_copy;
	vm_map_offset_t head_addr, tail_addr;
	vm_map_entry_t  entry;
	kern_return_t   kr;
	vm_map_offset_t effective_page_mask, effective_page_size;
	uint16_t        copy_page_shift;

	head_size = 0;
	tail_size = 0;
	head_copy = NULL;
	tail_copy = NULL;
	head_addr = 0;
	tail_addr = 0;

	/*
	 *	Check for null copy object.
	 */
	if (copy == VM_MAP_COPY_NULL) {
		return KERN_SUCCESS;
	}

	/* reject destination ranges that wrap the address space */
	if (__improbable(vm_map_range_overflows(dst_map, dst_addr, copy_size))) {
		return KERN_INVALID_ADDRESS;
	}

	/*
	 *	Assert that the vm_map_copy is coming from the right
	 *	zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	if (interruptible ||
	    copy->type != VM_MAP_COPY_ENTRY_LIST) {
		/*
		 * We can't split the "copy" map if we're interruptible
		 * or if we don't have a "copy" map...
		 */
blunt_copy:
		kr = vm_map_copy_overwrite_nested(dst_map,
		    dst_addr,
		    copy,
		    interruptible,
		    (pmap_t) NULL,
		    TRUE);
		if (kr) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), kr /* arg */);
		}
		return kr;
	}

	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
	if (copy_page_shift < PAGE_SHIFT ||
	    VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
		goto blunt_copy;
	}

	/*
	 * NOTE(review): the "then" branch below is unreachable -- the check
	 * just above already jumped to blunt_copy whenever
	 * VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT.  Only the "else" branch
	 * (max of map, copy, and kernel page masks) can execute.
	 */
	if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
		effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
	} else {
		effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
		effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
		    effective_page_mask);
	}
	effective_page_size = effective_page_mask + 1;

	if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
		/*
		 * Too small to bother with optimizing...
		 */
		goto blunt_copy;
	}

	if ((dst_addr & effective_page_mask) !=
	    (copy->offset & effective_page_mask)) {
		/*
		 * Incompatible mis-alignment of source and destination...
		 */
		goto blunt_copy;
	}

	/*
	 * Proper alignment or identical mis-alignment at the beginning.
	 * Let's try and do a small unaligned copy first (if needed)
	 * and then an aligned copy for the rest.
	 */
	if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
		head_addr = dst_addr;
		head_size = (effective_page_size -
		    (copy->offset & effective_page_mask));
		head_size = MIN(head_size, copy_size);
	}
	if (!vm_map_page_aligned(copy->offset + copy_size,
	    effective_page_mask)) {
		/*
		 * Mis-alignment at the end.
		 * Do an aligned copy up to the last page and
		 * then an unaligned copy for the remaining bytes.
		 */
		tail_size = ((copy->offset + copy_size) &
		    effective_page_mask);
		tail_size = MIN(tail_size, copy_size);
		tail_addr = dst_addr + copy_size - tail_size;
		assert(tail_addr >= head_addr + head_size);
	}
	assert(head_size + tail_size <= copy_size);

	if (head_size + tail_size == copy_size) {
		/*
		 * It's all unaligned, no optimization possible...
		 */
		goto blunt_copy;
	}

	/*
	 * Can't optimize if there are any submaps in the
	 * destination due to the way we free the "copy" map
	 * progressively in vm_map_copy_overwrite_nested()
	 * in that case.
	 */
	vm_map_lock_read(dst_map);
	if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
		vm_map_unlock_read(dst_map);
		goto blunt_copy;
	}
	for (;
	    (entry != vm_map_to_entry(dst_map) &&
	    entry->vme_start < dst_addr + copy_size);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			vm_map_unlock_read(dst_map);
			goto blunt_copy;
		}
	}
	vm_map_unlock_read(dst_map);

	if (head_size) {
		/*
		 * Unaligned copy of the first "head_size" bytes, to reach
		 * a page boundary.
		 */

		/*
		 * Extract "head_copy" out of "copy".
		 */
		head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
		head_copy->cpy_hdr.entries_pageable =
		    copy->cpy_hdr.entries_pageable;
		head_copy->cpy_hdr.page_shift = copy_page_shift;

		/* the head can't extend past the first source entry */
		entry = vm_map_copy_first_entry(copy);
		if (entry->vme_end < copy->offset + head_size) {
			head_size = entry->vme_end - copy->offset;
		}

		head_copy->offset = copy->offset;
		head_copy->size = head_size;
		copy->offset += head_size;
		copy->size -= head_size;
		copy_size -= head_size;
		assert(copy_size > 0);

		/* move the clipped first entry over to head_copy */
		vm_map_copy_clip_end(copy, entry, copy->offset);
		vm_map_copy_entry_unlink(copy, entry);
		vm_map_copy_entry_link(head_copy,
		    vm_map_copy_to_entry(head_copy),
		    entry);

		/*
		 * Do the unaligned copy.
		 */
		kr = vm_map_copy_overwrite_nested(dst_map,
		    head_addr,
		    head_copy,
		    interruptible,
		    (pmap_t) NULL,
		    FALSE);
		if (kr != KERN_SUCCESS) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_HEAD_NESTED_ERROR), kr /* arg */);
			goto done;
		}
	}

	if (tail_size) {
		/*
		 * Extract "tail_copy" out of "copy".
		 */
		tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
		tail_copy->cpy_hdr.entries_pageable =
		    copy->cpy_hdr.entries_pageable;
		tail_copy->cpy_hdr.page_shift = copy_page_shift;

		tail_copy->offset = copy->offset + copy_size - tail_size;
		tail_copy->size = tail_size;

		copy->size -= tail_size;
		copy_size -= tail_size;
		assert(copy_size > 0);

		/* move the clipped last entry over to tail_copy */
		entry = vm_map_copy_last_entry(copy);
		vm_map_copy_clip_start(copy, entry, tail_copy->offset);
		entry = vm_map_copy_last_entry(copy);
		vm_map_copy_entry_unlink(copy, entry);
		vm_map_copy_entry_link(tail_copy,
		    vm_map_copy_last_entry(tail_copy),
		    entry);
	}

	/*
	 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
	 * we want to avoid TOCTOU issues w.r.t copy->size but
	 * we don't need to change vm_map_copy_overwrite_nested()
	 * and all other vm_map_copy_overwrite variants.
	 *
	 * So we assign the original copy_size that was passed into
	 * this routine back to copy.
	 *
	 * This use of local 'copy_size' passed into this routine is
	 * to try and protect against TOCTOU attacks where the kernel
	 * has been exploited. We don't expect this to be an issue
	 * during normal system operation.
	 */
	assertf(copy->size == copy_size,
	    "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
	copy->size = copy_size;

	/*
	 * Copy most (or possibly all) of the data.
	 */
	kr = vm_map_copy_overwrite_nested(dst_map,
	    dst_addr + head_size,
	    copy,
	    interruptible,
	    (pmap_t) NULL,
	    FALSE);
	if (kr != KERN_SUCCESS) {
		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_NESTED_ERROR), kr /* arg */);
		goto done;
	}

	if (tail_size) {
		kr = vm_map_copy_overwrite_nested(dst_map,
		    tail_addr,
		    tail_copy,
		    interruptible,
		    (pmap_t) NULL,
		    FALSE);
		if (kr) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_TAIL_NESTED_ERROR), kr /* arg */);
		}
	}

done:
	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
	if (kr == KERN_SUCCESS) {
		/*
		 * Discard all the copy maps.
		 */
		if (head_copy) {
			vm_map_copy_discard(head_copy);
			head_copy = NULL;
		}
		vm_map_copy_discard(copy);
		if (tail_copy) {
			vm_map_copy_discard(tail_copy);
			tail_copy = NULL;
		}
	} else {
		/*
		 * Re-assemble the original copy map.
		 */
		if (head_copy) {
			entry = vm_map_copy_first_entry(head_copy);
			vm_map_copy_entry_unlink(head_copy, entry);
			vm_map_copy_entry_link(copy,
			    vm_map_copy_to_entry(copy),
			    entry);
			copy->offset -= head_size;
			copy->size += head_size;
			vm_map_copy_discard(head_copy);
			head_copy = NULL;
		}
		if (tail_copy) {
			entry = vm_map_copy_last_entry(tail_copy);
			vm_map_copy_entry_unlink(tail_copy, entry);
			vm_map_copy_entry_link(copy,
			    vm_map_copy_last_entry(copy),
			    entry);
			copy->size += tail_size;
			vm_map_copy_discard(tail_copy);
			tail_copy = NULL;
		}
	}
	return kr;
}
10331
10332
10333 /*
10334 * Routine: vm_map_copy_overwrite_unaligned [internal use only]
10335 *
 * Description:
10337 * Physically copy unaligned data
10338 *
10339 * Implementation:
10340 * Unaligned parts of pages have to be physically copied. We use
 * a modified form of vm_fault_copy (which understands non-aligned
 * page offsets and sizes) to do the copy. We attempt to copy as
 * much memory in one go as possible, however vm_fault_copy copies
10344 * within 1 memory object so we have to find the smaller of "amount left"
10345 * "source object data size" and "target object data size". With
10346 * unaligned data we don't need to split regions, therefore the source
10347 * (copy) object should be one map entry, the target range may be split
10348 * over multiple map entries however. In any event we are pessimistic
10349 * about these assumptions.
10350 *
10351 * Callers of this function must call vm_map_copy_require on
10352 * previously created vm_map_copy_t or pass a newly created
10353 * one to ensure that it hasn't been forged.
10354 *
10355 * Assumptions:
10356 * dst_map is locked on entry and is return locked on success,
10357 * unlocked on error.
10358 */
10359
/*
 * Physically copies the (page-)unaligned data described by "copy" over the
 * destination range of dst_map starting at "start", one vm_fault_copy()
 * chunk at a time.  Each chunk is bounded by whichever ends first: the
 * current destination entry, the current source copy entry, or the amount
 * of data left.
 *
 * Locking: dst_map is write-locked on entry; the lock is immediately
 * downgraded to read.  On success the map is returned read-locked; on any
 * error path the map is returned unlocked.
 *
 * Ownership: when discard_on_success is TRUE, fully-consumed source copy
 * entries are unlinked and disposed of as the copy proceeds.
 */
static kern_return_t
vm_map_copy_overwrite_unaligned(
	vm_map_t        dst_map,
	vm_map_entry_t  entry,
	vm_map_copy_t   copy,
	vm_map_offset_t start,
	boolean_t       discard_on_success)
{
	vm_map_entry_t  copy_entry;
	vm_map_entry_t  copy_entry_next;
	vm_map_version_t version;
	vm_object_t     dst_object;
	vm_object_offset_t dst_offset;
	vm_object_offset_t src_offset;
	vm_object_offset_t entry_offset;
	vm_map_offset_t entry_end;
	vm_map_size_t   src_size,
	    dst_size,
	    copy_size,
	    amount_left;
	kern_return_t   kr = KERN_SUCCESS;


	copy_entry = vm_map_copy_first_entry(copy);

	vm_map_lock_write_to_read(dst_map);

	/* offset of copy->offset within its page in the copy map */
	src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
	amount_left = copy->size;
	/*
	 *	unaligned so we never clipped this entry, we need the offset into
	 *	the vm_object not just the data.
	 */
	while (amount_left > 0) {
		if (entry == vm_map_to_entry(dst_map)) {
			vm_map_unlock_read(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/* "start" must be within the current map entry */
		assert((start >= entry->vme_start) && (start < entry->vme_end));

		/*
		 *	Check protection again
		 */
		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock_read(dst_map);
			return KERN_PROTECTION_FAILURE;
		}
		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock_read(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 *	If the entry is in transition, we must wait
		 *	for it to exit that state.  Anything could happen
		 *	when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto RetryLookup;
		}

		/* how far into the destination entry this pass starts */
		dst_offset = start - entry->vme_start;

		/* room left in the destination entry */
		dst_size = entry->vme_end - start;

		/* data left in the current source copy entry */
		src_size = copy_entry->vme_end -
		    (copy_entry->vme_start + src_offset);

		if (dst_size < src_size) {
			/*
			 *	we can only copy dst_size bytes before
			 *	we have to get the next destination entry
			 */
			copy_size = dst_size;
		} else {
			/*
			 *	we can only copy src_size bytes before
			 *	we have to get the next source copy entry
			 */
			copy_size = src_size;
		}

		if (copy_size > amount_left) {
			copy_size = amount_left;
		}
		/*
		 *	Entry needs copy, create a shadow shadow object for
		 *	Copy on write region.
		 */
		if (entry->needs_copy) {
			if (vm_map_lock_read_to_write(dst_map)) {
				/* lock upgrade failed: we dropped the lock, redo lookup */
				vm_map_lock_read(dst_map);
				goto RetryLookup;
			}
			VME_OBJECT_SHADOW(entry,
			    (vm_map_size_t)(entry->vme_end
			    - entry->vme_start),
			    vm_map_always_shadow(dst_map));
			entry->needs_copy = FALSE;
			vm_map_lock_write_to_read(dst_map);
		}
		dst_object = VME_OBJECT(entry);
		/*
		 *	unlike with the virtual (aligned) copy we're going
		 *	to fault on it therefore we need a target object.
		 */
		if (dst_object == VM_OBJECT_NULL) {
			if (vm_map_lock_read_to_write(dst_map)) {
				vm_map_lock_read(dst_map);
				goto RetryLookup;
			}
			dst_object = vm_object_allocate((vm_map_size_t)
			    entry->vme_end - entry->vme_start);
			VME_OBJECT_SET(entry, dst_object, false, 0);
			VME_OFFSET_SET(entry, 0);
			assert(entry->use_pmap);
			vm_map_lock_write_to_read(dst_map);
		}
		/*
		 *	Take an object reference and unlock map. The "entry" may
		 *	disappear or change when the map is unlocked.
		 */
		vm_object_reference(dst_object);
		/* snapshot what we need from "entry" before unlocking */
		version.main_timestamp = dst_map->timestamp;
		entry_offset = VME_OFFSET(entry);
		entry_end = entry->vme_end;
		vm_map_unlock_read(dst_map);
		/*
		 *	Copy as much as possible in one pass
		 */
		kr = vm_fault_copy(
			VME_OBJECT(copy_entry),
			VME_OFFSET(copy_entry) + src_offset,
			&copy_size,
			dst_object,
			entry_offset + dst_offset,
			dst_map,
			&version,
			THREAD_UNINT );

		/* vm_fault_copy may have updated copy_size to what it actually copied */
		start += copy_size;
		src_offset += copy_size;
		amount_left -= copy_size;
		/*
		 *	Release the object reference
		 */
		vm_object_deallocate(dst_object);
		/*
		 *	If a hard error occurred, return it now
		 */
		if (kr != KERN_SUCCESS) {
			return kr;
		}

		if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
		    || amount_left == 0) {
			/*
			 *	all done with this copy entry, dispose.
			 */
			copy_entry_next = copy_entry->vme_next;

			if (discard_on_success) {
				vm_map_copy_entry_unlink(copy, copy_entry);
				assert(!copy_entry->is_sub_map);
				vm_object_deallocate(VME_OBJECT(copy_entry));
				vm_map_copy_entry_dispose(copy_entry);
			}

			if (copy_entry_next == vm_map_copy_to_entry(copy) &&
			    amount_left) {
				/*
				 *	not finished copying but run out of source
				 */
				return KERN_INVALID_ADDRESS;
			}

			copy_entry = copy_entry_next;

			src_offset = 0;
		}

		if (amount_left == 0) {
			return KERN_SUCCESS;
		}

		vm_map_lock_read(dst_map);
		if (version.main_timestamp == dst_map->timestamp) {
			if (start == entry_end) {
				/*
				 *	destination region is split.  Use the version
				 *	information to avoid a lookup in the normal
				 *	case.
				 */
				entry = entry->vme_next;
				/*
				 *	should be contiguous. Fail if we encounter
				 *	a hole in the destination.
				 */
				if (start != entry->vme_start) {
					vm_map_unlock_read(dst_map);
					return KERN_INVALID_ADDRESS;
				}
			}
		} else {
			/*
			 * Map version check failed.
			 * we must lookup the entry because somebody
			 * might have changed the map behind our backs.
			 */
RetryLookup:
			if (!vm_map_lookup_entry(dst_map, start, &entry)) {
				vm_map_unlock_read(dst_map);
				return KERN_INVALID_ADDRESS;
			}
		}
	}/* while */

	return KERN_SUCCESS;
}/* vm_map_copy_overwrite_unaligned */
10587
10588 /*
10589 * Routine: vm_map_copy_overwrite_aligned [internal use only]
10590 *
10591 * Description:
10592 * Does all the vm_trickery possible for whole pages.
10593 *
10594 * Implementation:
10595 *
10596 * If there are no permanent objects in the destination,
10597 * and the source and destination map entry zones match,
10598 * and the destination map entry is not shared,
10599 * then the map entries can be deleted and replaced
10600 * with those from the copy. The following code is the
10601 * basic idea of what to do, but there are lots of annoying
10602 * little details about getting protection and inheritance
10603 * right. Should add protection, inheritance, and sharing checks
10604 * to the above pass and make sure that no wiring is involved.
10605 *
10606 * Callers of this function must call vm_map_copy_require on
10607 * previously created vm_map_copy_t or pass a newly created
10608 * one to ensure that it hasn't been forged.
10609 */
10610
/*
 * Diagnostic counters for vm_map_copy_overwrite_aligned().
 * NOTE(review): the code that increments them is not visible in this
 * chunk -- presumably they count cases where the aligned "steal the
 * source object" optimization is skipped (source not internal, not
 * symmetric, or too large); confirm against the rest of this file.
 */
int vm_map_copy_overwrite_aligned_src_not_internal = 0;
int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
int vm_map_copy_overwrite_aligned_src_large = 0;
10614
/*
 * Routine:	vm_map_copy_overwrite_aligned	[internal use only]
 *
 * Description:
 *	Overwrite the page-aligned destination range of "dst_map"
 *	(starting at "start", first entry "tmp_entry") with the entries
 *	of "copy".  Whenever the destination is unshared, temporary
 *	anonymous memory, the source VM object is transplanted into the
 *	destination entry (a "virtual" copy); otherwise the data is
 *	physically copied page by page via vm_fault_copy() (the
 *	"slow_copy" path below).
 *
 * In/out conditions:
 *	"dst_map" must be locked on entry; it remains locked on
 *	KERN_SUCCESS and is unlocked on every error return.
 *	Consumes the entries of "copy" as they are processed.
 */
static kern_return_t
vm_map_copy_overwrite_aligned(
	vm_map_t        dst_map,
	vm_map_entry_t  tmp_entry,
	vm_map_copy_t   copy,
	vm_map_offset_t start,
	__unused pmap_t pmap)
{
	vm_object_t     object;
	vm_map_entry_t  copy_entry;
	vm_map_size_t   copy_size;
	vm_map_size_t   size;
	vm_map_entry_t  entry;

	while ((copy_entry = vm_map_copy_first_entry(copy))
	    != vm_map_copy_to_entry(copy)) {
		copy_size = (copy_entry->vme_end - copy_entry->vme_start);

		entry = tmp_entry;
		if (entry->is_sub_map) {
			/* unnested when clipped earlier */
			assert(!entry->use_pmap);
		}
		if (entry == vm_map_to_entry(dst_map)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}
		size = (entry->vme_end - entry->vme_start);
		/*
		 * Make sure that no holes popped up in the
		 * address map, and that the protection is
		 * still valid, in case the map was unlocked
		 * earlier.
		 */

		if ((entry->vme_start != start) || ((entry->is_sub_map)
		    && !entry->needs_copy)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}
		assert(entry != vm_map_to_entry(dst_map));

		/*
		 * Check protection again
		 */

		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * If the entry is in transition, we must wait
		 * for it to exit that state.  Anything could happen
		 * when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto RetryLookup;
		}

		/*
		 * Adjust to source size first
		 */

		if (copy_size < size) {
			if (entry->map_aligned &&
			    !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
			    VM_MAP_PAGE_MASK(dst_map))) {
				/* no longer map-aligned */
				entry->map_aligned = FALSE;
			}
			vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
			size = copy_size;
		}

		/*
		 * Adjust to destination size
		 */

		if (size < copy_size) {
			vm_map_copy_clip_end(copy, copy_entry,
			    copy_entry->vme_start + size);
			copy_size = size;
		}

		/* after the clipping above, all three ranges match in size */
		assert((entry->vme_end - entry->vme_start) == size);
		assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
		assert((copy_entry->vme_end - copy_entry->vme_start) == size);

		/*
		 * If the destination contains temporary unshared memory,
		 * we can perform the copy by throwing it away and
		 * installing the source data.
		 *
		 * Exceptions for mappings with special semantics:
		 * + "permanent" entries,
		 * + JIT regions,
		 * + TPRO regions,
		 * + pmap-specific protection policies,
		 * + VM objects with COPY_NONE copy strategy.
		 */

		object = VME_OBJECT(entry);
		if ((!entry->is_shared &&
		    !entry->vme_permanent &&
		    !entry->used_for_jit &&
#if __arm64e__
		    !entry->used_for_tpro &&
#endif /* __arm64e__ */
		    !(entry->protection & VM_PROT_EXECUTE) &&
		    !pmap_has_prot_policy(dst_map->pmap, entry->translated_allow_execute, entry->protection) &&
		    ((object == VM_OBJECT_NULL) ||
		    (object->internal &&
		    !object->true_share &&
		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE))) ||
		    entry->needs_copy) {
			vm_object_t     old_object = VME_OBJECT(entry);
			vm_object_offset_t old_offset = VME_OFFSET(entry);
			vm_object_offset_t offset;

			/*
			 * Ensure that the source and destination aren't
			 * identical
			 */
			if (old_object == VME_OBJECT(copy_entry) &&
			    old_offset == VME_OFFSET(copy_entry)) {
				vm_map_copy_entry_unlink(copy, copy_entry);
				vm_map_copy_entry_dispose(copy_entry);

				if (old_object != VM_OBJECT_NULL) {
					vm_object_deallocate(old_object);
				}

				start = tmp_entry->vme_end;
				tmp_entry = tmp_entry->vme_next;
				continue;
			}

#if XNU_TARGET_OS_OSX
#define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024)  /* 64 MB */
#define __TRADEOFF1_COPY_SIZE (128 * 1024)       /* 128 KB */
			if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
			    VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
			    copy_size <= __TRADEOFF1_COPY_SIZE) {
				/*
				 * Virtual vs. Physical copy tradeoff #1.
				 *
				 * Copying only a few pages out of a large
				 * object:  do a physical copy instead of
				 * a virtual copy, to avoid possibly keeping
				 * the entire large object alive because of
				 * those few copy-on-write pages.
				 */
				vm_map_copy_overwrite_aligned_src_large++;
				goto slow_copy;
			}
#endif /* XNU_TARGET_OS_OSX */

			if ((dst_map->pmap != kernel_pmap) &&
			    (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
			    (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
				vm_object_t new_object, new_shadow;

				/*
				 * We're about to map something over a mapping
				 * established by malloc()...
				 */
				new_object = VME_OBJECT(copy_entry);
				if (new_object != VM_OBJECT_NULL) {
					vm_object_lock_shared(new_object);
				}
				/*
				 * Walk down the shadow chain of the source
				 * object, hand-over-hand with shared locks.
				 */
				while (new_object != VM_OBJECT_NULL &&
#if XNU_TARGET_OS_OSX
				    !new_object->true_share &&
				    new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
#endif /* XNU_TARGET_OS_OSX */
				    new_object->internal) {
					new_shadow = new_object->shadow;
					if (new_shadow == VM_OBJECT_NULL) {
						break;
					}
					vm_object_lock_shared(new_shadow);
					vm_object_unlock(new_object);
					new_object = new_shadow;
				}
				if (new_object != VM_OBJECT_NULL) {
					if (!new_object->internal) {
						/*
						 * The new mapping is backed
						 * by an external object.  We
						 * don't want malloc'ed memory
						 * to be replaced with such a
						 * non-anonymous mapping, so
						 * let's go off the optimized
						 * path...
						 */
						vm_map_copy_overwrite_aligned_src_not_internal++;
						vm_object_unlock(new_object);
						goto slow_copy;
					}
#if XNU_TARGET_OS_OSX
					if (new_object->true_share ||
					    new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
						/*
						 * Same if there's a "true_share"
						 * object in the shadow chain, or
						 * an object with a non-default
						 * (SYMMETRIC) copy strategy.
						 */
						vm_map_copy_overwrite_aligned_src_not_symmetric++;
						vm_object_unlock(new_object);
						goto slow_copy;
					}
#endif /* XNU_TARGET_OS_OSX */
					vm_object_unlock(new_object);
				}
				/*
				 * The new mapping is still backed by
				 * anonymous (internal) memory, so it's
				 * OK to substitute it for the original
				 * malloc() mapping.
				 */
			}

			if (old_object != VM_OBJECT_NULL) {
				assert(!entry->vme_permanent);
				if (entry->is_sub_map) {
					if (entry->use_pmap) {
#ifndef NO_NESTED_PMAP
						pmap_unnest(dst_map->pmap,
						    (addr64_t)entry->vme_start,
						    entry->vme_end - entry->vme_start);
#endif /* NO_NESTED_PMAP */
						if (dst_map->mapped_in_other_pmaps) {
							/* clean up parent */
							/* map/maps */
							vm_map_submap_pmap_clean(
								dst_map, entry->vme_start,
								entry->vme_end,
								VME_SUBMAP(entry),
								VME_OFFSET(entry));
						}
					} else {
						vm_map_submap_pmap_clean(
							dst_map, entry->vme_start,
							entry->vme_end,
							VME_SUBMAP(entry),
							VME_OFFSET(entry));
					}
					vm_map_deallocate(VME_SUBMAP(entry));
				} else {
					if (dst_map->mapped_in_other_pmaps) {
						vm_object_pmap_protect_options(
							VME_OBJECT(entry),
							VME_OFFSET(entry),
							entry->vme_end
							- entry->vme_start,
							PMAP_NULL,
							PAGE_SIZE,
							entry->vme_start,
							VM_PROT_NONE,
							PMAP_OPTIONS_REMOVE);
					} else {
						pmap_remove_options(
							dst_map->pmap,
							(addr64_t)(entry->vme_start),
							(addr64_t)(entry->vme_end),
							PMAP_OPTIONS_REMOVE);
					}
					vm_object_deallocate(old_object);
				}
			}

			if (entry->iokit_acct) {
				/* keep using iokit accounting */
				entry->use_pmap = FALSE;
			} else {
				/* use pmap accounting */
				entry->use_pmap = TRUE;
			}
			assert(!entry->vme_permanent);
			/* transplant the source object into the dest entry */
			VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
			object = VME_OBJECT(entry);
			entry->needs_copy = copy_entry->needs_copy;
			entry->wired_count = 0;
			entry->user_wired_count = 0;
			offset = VME_OFFSET(copy_entry);
			VME_OFFSET_SET(entry, offset);

			vm_map_copy_entry_unlink(copy, copy_entry);
			vm_map_copy_entry_dispose(copy_entry);

			/*
			 * we could try to push pages into the pmap at this point, BUT
			 * this optimization only saved on average 2 us per page if ALL
			 * the pages in the source were currently mapped
			 * and ALL the pages in the dest were touched, if there were fewer
			 * than 2/3 of the pages touched, this optimization actually cost more cycles
			 * it also puts a lot of pressure on the pmap layer w/r to mapping structures
			 */

			/*
			 * Set up for the next iteration.  The map
			 * has not been unlocked, so the next
			 * address should be at the end of this
			 * entry, and the next map entry should be
			 * the one following it.
			 */

			start = tmp_entry->vme_end;
			tmp_entry = tmp_entry->vme_next;
		} else {
			vm_map_version_t        version;
			vm_object_t             dst_object;
			vm_object_offset_t      dst_offset;
			kern_return_t           r;

			/*
			 * Physical-copy path: fault the data from the
			 * source object into the destination object.
			 */
slow_copy:
			if (entry->needs_copy) {
				VME_OBJECT_SHADOW(entry,
				    (entry->vme_end -
				    entry->vme_start),
				    vm_map_always_shadow(dst_map));
				entry->needs_copy = FALSE;
			}

			dst_object = VME_OBJECT(entry);
			dst_offset = VME_OFFSET(entry);

			/*
			 * Take an object reference, and record
			 * the map version information so that the
			 * map can be safely unlocked.
			 */

			if (dst_object == VM_OBJECT_NULL) {
				/*
				 * We would usually have just taken the
				 * optimized path above if the destination
				 * object has not been allocated yet.  But we
				 * now disable that optimization if the copy
				 * entry's object is not backed by anonymous
				 * memory to avoid replacing malloc'ed
				 * (i.e. re-usable) anonymous memory with a
				 * not-so-anonymous mapping.
				 * So we have to handle this case here and
				 * allocate a new VM object for this map entry.
				 */
				dst_object = vm_object_allocate(
					entry->vme_end - entry->vme_start);
				dst_offset = 0;
				VME_OBJECT_SET(entry, dst_object, false, 0);
				VME_OFFSET_SET(entry, dst_offset);
				assert(entry->use_pmap);
			}

			vm_object_reference(dst_object);

			/* account for unlock bumping up timestamp */
			version.main_timestamp = dst_map->timestamp + 1;

			vm_map_unlock(dst_map);

			/*
			 * Copy as much as possible in one pass
			 */

			copy_size = size;
			r = vm_fault_copy(
				VME_OBJECT(copy_entry),
				VME_OFFSET(copy_entry),
				&copy_size,
				dst_object,
				dst_offset,
				dst_map,
				&version,
				THREAD_UNINT );

			/*
			 * Release the object reference
			 */

			vm_object_deallocate(dst_object);

			/*
			 * If a hard error occurred, return it now
			 */

			if (r != KERN_SUCCESS) {
				/* NB: dst_map is unlocked on this path */
				return r;
			}

			if (copy_size != 0) {
				/*
				 * Dispose of the copied region
				 */

				vm_map_copy_clip_end(copy, copy_entry,
				    copy_entry->vme_start + copy_size);
				vm_map_copy_entry_unlink(copy, copy_entry);
				vm_object_deallocate(VME_OBJECT(copy_entry));
				vm_map_copy_entry_dispose(copy_entry);
			}

			/*
			 * Pick up in the destination map where we left off.
			 *
			 * Use the version information to avoid a lookup
			 * in the normal case.
			 */

			start += copy_size;
			vm_map_lock(dst_map);
			if (version.main_timestamp == dst_map->timestamp &&
			    copy_size != 0) {
				/* We can safely use saved tmp_entry value */

				if (tmp_entry->map_aligned &&
				    !VM_MAP_PAGE_ALIGNED(
					    start,
					    VM_MAP_PAGE_MASK(dst_map))) {
					/* no longer map-aligned */
					tmp_entry->map_aligned = FALSE;
				}
				vm_map_clip_end(dst_map, tmp_entry, start);
				tmp_entry = tmp_entry->vme_next;
			} else {
				/* Must do lookup of tmp_entry */

RetryLookup:
				if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
					vm_map_unlock(dst_map);
					return KERN_INVALID_ADDRESS;
				}
				if (tmp_entry->map_aligned &&
				    !VM_MAP_PAGE_ALIGNED(
					    start,
					    VM_MAP_PAGE_MASK(dst_map))) {
					/* no longer map-aligned */
					tmp_entry->map_aligned = FALSE;
				}
				vm_map_clip_start(dst_map, tmp_entry, start);
			}
		}
	}/* while */

	return KERN_SUCCESS;
}/* vm_map_copy_overwrite_aligned */
11074
11075 /*
11076 * Routine: vm_map_copyin_kernel_buffer [internal use only]
11077 *
11078 * Description:
11079 * Copy in data to a kernel buffer from space in the
11080 * source map. The original space may be optionally
11081 * deallocated.
11082 *
11083 * If successful, returns a new copy object.
11084 */
11085 static kern_return_t
vm_map_copyin_kernel_buffer(vm_map_t src_map,vm_map_offset_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)11086 vm_map_copyin_kernel_buffer(
11087 vm_map_t src_map,
11088 vm_map_offset_t src_addr,
11089 vm_map_size_t len,
11090 boolean_t src_destroy,
11091 vm_map_copy_t *copy_result)
11092 {
11093 kern_return_t kr;
11094 vm_map_copy_t copy;
11095 void *kdata;
11096
11097 if (len > msg_ool_size_small) {
11098 return KERN_INVALID_ARGUMENT;
11099 }
11100
11101 kdata = kalloc_data(len, Z_WAITOK);
11102 if (kdata == NULL) {
11103 return KERN_RESOURCE_SHORTAGE;
11104 }
11105 kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
11106 if (kr != KERN_SUCCESS) {
11107 kfree_data(kdata, len);
11108 return kr;
11109 }
11110
11111 copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
11112 copy->cpy_kdata = kdata;
11113 copy->size = len;
11114 copy->offset = 0;
11115
11116 if (src_destroy) {
11117 vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
11118
11119 if (src_map == kernel_map) {
11120 flags |= VM_MAP_REMOVE_KUNWIRE;
11121 }
11122
11123 (void)vm_map_remove_guard(src_map,
11124 vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11125 vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
11126 flags, KMEM_GUARD_NONE);
11127 }
11128
11129 *copy_result = copy;
11130 return KERN_SUCCESS;
11131 }
11132
11133 /*
11134 * Routine: vm_map_copyout_kernel_buffer [internal use only]
11135 *
11136 * Description:
11137 * Copy out data from a kernel buffer into space in the
 * destination map. The space may be optionally dynamically
11139 * allocated.
11140 *
11141 * If successful, consumes the copy object.
11142 * Otherwise, the caller is responsible for it.
11143 *
11144 * Callers of this function must call vm_map_copy_require on
11145 * previously created vm_map_copy_t or pass a newly created
11146 * one to ensure that it hasn't been forged.
11147 */
/* Diagnostic counter: copyout() failures while switched to the target map. */
static int vm_map_copyout_kernel_buffer_failures = 0;
/*
 * Copy the contents of a VM_MAP_COPY_KERNEL_BUFFER copy object out to
 * "map".  When "overwrite" is FALSE, space is first allocated in "map"
 * and its address returned through "addr"; otherwise *addr is the
 * (caller-provided) destination.  On success with "consume_on_success",
 * the kernel buffer and the copy object are freed.
 */
static kern_return_t
vm_map_copyout_kernel_buffer(
	vm_map_t                map,
	vm_map_address_t        *addr,  /* IN/OUT */
	vm_map_copy_t           copy,
	vm_map_size_t           copy_size,
	boolean_t               overwrite,
	boolean_t               consume_on_success)
{
	kern_return_t kr = KERN_SUCCESS;
	thread_t thread = current_thread();

	assert(copy->size == copy_size);

	/*
	 * check for corrupted vm_map_copy structure
	 */
	if (copy_size > msg_ool_size_small || copy->offset) {
		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
		    (long long)copy->size, (long long)copy->offset);
	}

	if (!overwrite) {
		/*
		 * Allocate space in the target map for the data
		 */
		vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();

		if (map == kernel_map) {
			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
		}

		*addr = 0;
		kr = vm_map_enter(map,
		    addr,
		    vm_map_round_page(copy_size,
		    VM_MAP_PAGE_MASK(map)),
		    (vm_map_offset_t) 0,
		    vmk_flags,
		    VM_OBJECT_NULL,
		    (vm_object_offset_t) 0,
		    FALSE,
		    VM_PROT_DEFAULT,
		    VM_PROT_ALL,
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
#if KASAN
		if (map->pmap == kernel_pmap) {
			kasan_notify_address(*addr, copy->size);
		}
#endif
	}

	/*
	 * Copyout the data from the kernel buffer to the target map.
	 */
	if (thread->map == map) {
		/*
		 * If the target map is the current map, just do
		 * the copy.
		 */
		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			kr = KERN_INVALID_ADDRESS;
		}
	} else {
		vm_map_t oldmap;

		/*
		 * If the target map is another map, assume the
		 * target's address space identity for the duration
		 * of the copy.
		 */
		vm_map_reference(map);
		oldmap = vm_map_switch(map);

		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			vm_map_copyout_kernel_buffer_failures++;
			kr = KERN_INVALID_ADDRESS;
		}

		/* restore the caller's address space before returning */
		(void) vm_map_switch(oldmap);
		vm_map_deallocate(map);
	}

	if (kr != KERN_SUCCESS) {
		/* the copy failed, clean up */
		if (!overwrite) {
			/*
			 * Deallocate the space we allocated in the target map.
			 */
			(void) vm_map_remove(map,
			    vm_map_trunc_page(*addr,
			    VM_MAP_PAGE_MASK(map)),
			    vm_map_round_page((*addr +
			    vm_map_round_page(copy_size,
			    VM_MAP_PAGE_MASK(map))),
			    VM_MAP_PAGE_MASK(map)));
			*addr = 0;
		}
	} else {
		/* copy was successful, discard the copy structure */
		if (consume_on_success) {
			kfree_data(copy->cpy_kdata, copy_size);
			zfree_id(ZONE_ID_VM_MAP_COPY, copy);
		}
	}

	return kr;
}
11262
11263 /*
11264 * Routine: vm_map_copy_insert [internal use only]
11265 *
11266 * Description:
11267 * Link a copy chain ("copy") into a map at the
11268 * specified location (after "where").
11269 *
11270 * Callers of this function must call vm_map_copy_require on
11271 * previously created vm_map_copy_t or pass a newly created
11272 * one to ensure that it hasn't been forged.
11273 * Side effects:
11274 * The copy chain is destroyed.
11275 */
11276 static void
vm_map_copy_insert(vm_map_t map,vm_map_entry_t after_where,vm_map_copy_t copy)11277 vm_map_copy_insert(
11278 vm_map_t map,
11279 vm_map_entry_t after_where,
11280 vm_map_copy_t copy)
11281 {
11282 vm_map_entry_t entry;
11283
11284 while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11285 entry = vm_map_copy_first_entry(copy);
11286 vm_map_copy_entry_unlink(copy, entry);
11287 vm_map_store_entry_link(map, after_where, entry,
11288 VM_MAP_KERNEL_FLAGS_NONE);
11289 after_where = entry;
11290 }
11291 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11292 }
11293
11294 /*
11295 * Callers of this function must call vm_map_copy_require on
11296 * previously created vm_map_copy_t or pass a newly created
11297 * one to ensure that it hasn't been forged.
11298 */
11299 void
vm_map_copy_remap(vm_map_t map,vm_map_entry_t where,vm_map_copy_t copy,vm_map_offset_t adjustment,vm_prot_t cur_prot,vm_prot_t max_prot,vm_inherit_t inheritance)11300 vm_map_copy_remap(
11301 vm_map_t map,
11302 vm_map_entry_t where,
11303 vm_map_copy_t copy,
11304 vm_map_offset_t adjustment,
11305 vm_prot_t cur_prot,
11306 vm_prot_t max_prot,
11307 vm_inherit_t inheritance)
11308 {
11309 vm_map_entry_t copy_entry, new_entry;
11310
11311 for (copy_entry = vm_map_copy_first_entry(copy);
11312 copy_entry != vm_map_copy_to_entry(copy);
11313 copy_entry = copy_entry->vme_next) {
11314 /* get a new VM map entry for the map */
11315 new_entry = vm_map_entry_create(map);
11316 /* copy the "copy entry" to the new entry */
11317 vm_map_entry_copy(map, new_entry, copy_entry);
11318 /* adjust "start" and "end" */
11319 new_entry->vme_start += adjustment;
11320 new_entry->vme_end += adjustment;
11321 /* clear some attributes */
11322 new_entry->inheritance = inheritance;
11323 new_entry->protection = cur_prot;
11324 new_entry->max_protection = max_prot;
11325 new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11326 /* take an extra reference on the entry's "object" */
11327 if (new_entry->is_sub_map) {
11328 assert(!new_entry->use_pmap); /* not nested */
11329 vm_map_reference(VME_SUBMAP(new_entry));
11330 } else {
11331 vm_object_reference(VME_OBJECT(new_entry));
11332 }
11333 /* insert the new entry in the map */
11334 vm_map_store_entry_link(map, where, new_entry,
11335 VM_MAP_KERNEL_FLAGS_NONE);
11336 /* continue inserting the "copy entries" after the new entry */
11337 where = new_entry;
11338 }
11339 }
11340
11341
11342 /*
11343 * Returns true if *size matches (or is in the range of) copy->size.
11344 * Upon returning true, the *size field is updated with the actual size of the
11345 * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11346 */
11347 boolean_t
vm_map_copy_validate_size(vm_map_t dst_map,vm_map_copy_t copy,vm_map_size_t * size)11348 vm_map_copy_validate_size(
11349 vm_map_t dst_map,
11350 vm_map_copy_t copy,
11351 vm_map_size_t *size)
11352 {
11353 if (copy == VM_MAP_COPY_NULL) {
11354 return FALSE;
11355 }
11356
11357 /*
11358 * Assert that the vm_map_copy is coming from the right
11359 * zone and hasn't been forged
11360 */
11361 vm_map_copy_require(copy);
11362
11363 vm_map_size_t copy_sz = copy->size;
11364 vm_map_size_t sz = *size;
11365 switch (copy->type) {
11366 case VM_MAP_COPY_KERNEL_BUFFER:
11367 if (sz == copy_sz) {
11368 return TRUE;
11369 }
11370 break;
11371 case VM_MAP_COPY_ENTRY_LIST:
11372 /*
11373 * potential page-size rounding prevents us from exactly
11374 * validating this flavor of vm_map_copy, but we can at least
11375 * assert that it's within a range.
11376 */
11377 if (copy_sz >= sz &&
11378 copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11379 *size = copy_sz;
11380 return TRUE;
11381 }
11382 break;
11383 default:
11384 break;
11385 }
11386 return FALSE;
11387 }
11388
11389 /*
11390 * Routine: vm_map_copyout_size
11391 *
11392 * Description:
11393 * Copy out a copy chain ("copy") into newly-allocated
11394 * space in the destination map. Uses a prevalidated
11395 * size for the copy object (vm_map_copy_validate_size).
11396 *
11397 * If successful, consumes the copy object.
11398 * Otherwise, the caller is responsible for it.
11399 */
11400 kern_return_t
vm_map_copyout_size(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_t copy_size)11401 vm_map_copyout_size(
11402 vm_map_t dst_map,
11403 vm_map_address_t *dst_addr, /* OUT */
11404 vm_map_copy_t copy,
11405 vm_map_size_t copy_size)
11406 {
11407 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11408 TRUE, /* consume_on_success */
11409 VM_PROT_DEFAULT,
11410 VM_PROT_ALL,
11411 VM_INHERIT_DEFAULT);
11412 }
11413
11414 /*
11415 * Routine: vm_map_copyout
11416 *
11417 * Description:
11418 * Copy out a copy chain ("copy") into newly-allocated
11419 * space in the destination map.
11420 *
11421 * If successful, consumes the copy object.
11422 * Otherwise, the caller is responsible for it.
11423 */
11424 kern_return_t
vm_map_copyout(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy)11425 vm_map_copyout(
11426 vm_map_t dst_map,
11427 vm_map_address_t *dst_addr, /* OUT */
11428 vm_map_copy_t copy)
11429 {
11430 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11431 TRUE, /* consume_on_success */
11432 VM_PROT_DEFAULT,
11433 VM_PROT_ALL,
11434 VM_INHERIT_DEFAULT);
11435 }
11436
/*
 * Routine:	vm_map_copyout_internal
 *
 * Description:
 *	Common implementation for vm_map_copyout() and
 *	vm_map_copyout_size(): place the contents of "copy" into
 *	newly-allocated space in "dst_map" and return the chosen
 *	address through "dst_addr".
 *
 *	When "consume_on_success" is TRUE the copy's entries are
 *	transplanted into "dst_map" and the copy object is destroyed;
 *	otherwise the entries are duplicated (vm_map_copy_remap) and
 *	the caller keeps ownership of "copy".
 *
 *	"copy_size" must match copy->size exactly, or KERN_FAILURE is
 *	returned.  If the copy's page shift differs from the
 *	destination map's, the copy is first adjusted to the target's
 *	page size.
 */
kern_return_t
vm_map_copyout_internal(
	vm_map_t                dst_map,
	vm_map_address_t        *dst_addr,      /* OUT */
	vm_map_copy_t           copy,
	vm_map_size_t           copy_size,
	boolean_t               consume_on_success,
	vm_prot_t               cur_protection,
	vm_prot_t               max_protection,
	vm_inherit_t            inheritance)
{
	vm_map_size_t           size;
	vm_map_size_t           adjustment;
	vm_map_offset_t         start;
	vm_object_offset_t      vm_copy_start;
	vm_map_entry_t          last;
	vm_map_entry_t          entry;
	vm_map_copy_t           original_copy;
	kern_return_t           kr;
	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();

	/*
	 * Check for null copy object.
	 */

	if (copy == VM_MAP_COPY_NULL) {
		*dst_addr = 0;
		return KERN_SUCCESS;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	if (copy->size != copy_size) {
		*dst_addr = 0;
		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR), KERN_FAILURE /* arg */);
		return KERN_FAILURE;
	}

	/*
	 * Check for special kernel buffer allocated
	 * by new_ipc_kmsg_copyin.
	 */

	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
		kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr,
		    copy, copy_size, FALSE,
		    consume_on_success);
		if (kr) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */);
		}
		return kr;
	}

	/* remember the caller's copy in case we substitute an adjusted one */
	original_copy = copy;
	if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
		vm_map_copy_t target_copy;
		vm_map_offset_t overmap_start, overmap_end, trimmed_start;

		target_copy = VM_MAP_COPY_NULL;
		DEBUG4K_ADJUST("adjusting...\n");
		kr = vm_map_copy_adjust_to_target(
			copy,
			0, /* offset */
			copy->size, /* size */
			dst_map,
			TRUE, /* copy */
			&target_copy,
			&overmap_start,
			&overmap_end,
			&trimmed_start);
		if (kr != KERN_SUCCESS) {
			DEBUG4K_COPY("adjust failed 0x%x\n", kr);
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */);
			return kr;
		}
		DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
		if (target_copy != copy) {
			copy = target_copy;
		}
		copy_size = copy->size;
	}

	/*
	 * Find space for the data
	 */

	vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
	    VM_MAP_COPY_PAGE_MASK(copy));
	size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
	    VM_MAP_COPY_PAGE_MASK(copy))
	    - vm_copy_start;

	vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map);

	vm_map_lock(dst_map);
	kr = vm_map_locate_space(dst_map, size, 0, vmk_flags,
	    &start, &last);
	if (kr != KERN_SUCCESS) {
		vm_map_unlock(dst_map);
		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */);
		return kr;
	}

	/* how far each copy entry must be shifted to land at "start" */
	adjustment = start - vm_copy_start;
	if (!consume_on_success) {
		/*
		 * We're not allowed to consume "copy", so we'll have to
		 * copy its map entries into the destination map below.
		 * No need to re-allocate map entries from the correct
		 * (pageable or not) zone, since we'll get new map entries
		 * during the transfer.
		 * We'll also adjust the map entries's "start" and "end"
		 * during the transfer, to keep "copy"'s entries consistent
		 * with its "offset".
		 */
		goto after_adjustments;
	}

	/*
	 * Since we're going to just drop the map
	 * entries from the copy into the destination
	 * map, they must come from the same pool.
	 */

	if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
		/*
		 * Mismatches occur when dealing with the default
		 * pager.
		 */
		vm_map_entry_t  next, new;

		/*
		 * Find the zone that the copies were allocated from
		 */

		entry = vm_map_copy_first_entry(copy);

		/*
		 * Reinitialize the copy so that vm_map_copy_entry_link
		 * will work.
		 */
		vm_map_store_copy_reset(copy, entry);
		copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;

		/*
		 * Copy each entry.
		 */
		while (entry != vm_map_copy_to_entry(copy)) {
			new = vm_map_copy_entry_create(copy);
			vm_map_entry_copy_full(new, entry);
			new->vme_no_copy_on_read = FALSE;
			assert(!new->iokit_acct);
			if (new->is_sub_map) {
				/* clr address space specifics */
				new->use_pmap = FALSE;
			}
			vm_map_copy_entry_link(copy,
			    vm_map_copy_last_entry(copy),
			    new);
			next = entry->vme_next;
			vm_map_entry_dispose(entry);
			entry = next;
		}
	}

	/*
	 * Adjust the addresses in the copy chain, and
	 * reset the region attributes.
	 */

	for (entry = vm_map_copy_first_entry(copy);
	    entry != vm_map_copy_to_entry(copy);
	    entry = entry->vme_next) {
		if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
			/*
			 * We're injecting this copy entry into a map that
			 * has the standard page alignment, so clear
			 * "map_aligned" (which might have been inherited
			 * from the original map entry).
			 */
			entry->map_aligned = FALSE;
		}

		entry->vme_start += adjustment;
		entry->vme_end += adjustment;

		if (entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
			    VM_MAP_PAGE_MASK(dst_map)));
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
			    VM_MAP_PAGE_MASK(dst_map)));
		}

		entry->inheritance = VM_INHERIT_DEFAULT;
		entry->protection = VM_PROT_DEFAULT;
		entry->max_protection = VM_PROT_ALL;
		entry->behavior = VM_BEHAVIOR_DEFAULT;

		/*
		 * If the entry is now wired,
		 * map the pages into the destination map.
		 */
		if (entry->wired_count != 0) {
			vm_map_offset_t va;
			vm_object_offset_t offset;
			vm_object_t object;
			vm_prot_t prot;
			int     type_of_fault;
			uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;

			/* TODO4K would need to use actual page size */
			assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);

			object = VME_OBJECT(entry);
			offset = VME_OFFSET(entry);
			va = entry->vme_start;

			pmap_pageable(dst_map->pmap,
			    entry->vme_start,
			    entry->vme_end,
			    TRUE);

			/* enter each wired page into the destination pmap */
			while (va < entry->vme_end) {
				vm_page_t       m;
				struct vm_object_fault_info fault_info = {};

				/*
				 * Look up the page in the object.
				 * Assert that the page will be found in the
				 * top object:
				 * either
				 *	the object was newly created by
				 *	vm_object_copy_slowly, and has
				 *	copies of all of the pages from
				 *	the source object
				 * or
				 *	the object was moved from the old
				 *	map entry; because the old map
				 *	entry was wired, all of the pages
				 *	were in the top-level object.
				 *	(XXX not true if we wire pages for
				 *	 reading)
				 */
				vm_object_lock(object);

				m = vm_page_lookup(object, offset);
				if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
				    m->vmp_absent) {
					panic("vm_map_copyout: wiring %p", m);
				}

				prot = entry->protection;

				if (override_nx(dst_map, VME_ALIAS(entry)) &&
				    prot) {
					prot |= VM_PROT_EXECUTE;
				}

				type_of_fault = DBG_CACHE_HIT_FAULT;

				fault_info.user_tag = VME_ALIAS(entry);
				fault_info.pmap_options = 0;
				if (entry->iokit_acct ||
				    (!entry->is_sub_map && !entry->use_pmap)) {
					fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
				}
				if (entry->vme_xnu_user_debug &&
				    !VM_PAGE_OBJECT(m)->code_signed) {
					/*
					 * Modified code-signed executable
					 * region: this page does not belong
					 * to a code-signed VM object, so it
					 * must have been copied and should
					 * therefore be typed XNU_USER_DEBUG
					 * rather than XNU_USER_EXEC.
					 */
					fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
				}

				vm_fault_enter(m,
				    dst_map->pmap,
				    va,
				    PAGE_SIZE, 0,
				    prot,
				    prot,
				    VM_PAGE_WIRED(m),
				    FALSE, /* change_wiring */
				    VM_KERN_MEMORY_NONE, /* tag - not wiring */
				    &fault_info,
				    NULL, /* need_retry */
				    &type_of_fault,
				    &object_lock_type); /*Exclusive mode lock. Will remain unchanged.*/

				vm_object_unlock(object);

				offset += PAGE_SIZE_64;
				va += PAGE_SIZE;
			}
		}
	}

after_adjustments:

	/*
	 * Correct the page alignment for the result
	 */

	*dst_addr = start + (copy->offset - vm_copy_start);

#if KASAN
	kasan_notify_address(*dst_addr, size);
#endif

	/*
	 * Update the hints and the map size
	 */

	if (consume_on_success) {
		SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
	} else {
		SAVE_HINT_MAP_WRITE(dst_map, last);
	}

	dst_map->size += size;

	/*
	 * Link in the copy
	 */

	if (consume_on_success) {
		vm_map_copy_insert(dst_map, last, copy);
		if (copy != original_copy) {
			/* the adjusted copy was consumed; drop the original */
			vm_map_copy_discard(original_copy);
			original_copy = VM_MAP_COPY_NULL;
		}
	} else {
		vm_map_copy_remap(dst_map, last, copy, adjustment,
		    cur_protection, max_protection,
		    inheritance);
		if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
			/* discard the temporary adjusted copy only */
			vm_map_copy_discard(copy);
			copy = original_copy;
		}
	}


	vm_map_unlock(dst_map);

	/*
	 * XXX	If wiring_required, call vm_map_pageable
	 */

	return KERN_SUCCESS;
}
11795
11796 /*
11797 * Routine: vm_map_copyin
11798 *
11799 * Description:
11800 * see vm_map_copyin_common. Exported via Unsupported.exports.
11801 *
11802 */
11803
11804 #undef vm_map_copyin
11805
11806 kern_return_t
vm_map_copyin(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)11807 vm_map_copyin(
11808 vm_map_t src_map,
11809 vm_map_address_t src_addr,
11810 vm_map_size_t len,
11811 boolean_t src_destroy,
11812 vm_map_copy_t *copy_result) /* OUT */
11813 {
11814 return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11815 FALSE, copy_result, FALSE);
11816 }
11817
11818 /*
11819 * Routine: vm_map_copyin_common
11820 *
11821 * Description:
11822 * Copy the specified region (src_addr, len) from the
11823 * source address space (src_map), possibly removing
11824 * the region from the source address space (src_destroy).
11825 *
11826 * Returns:
11827 * A vm_map_copy_t object (copy_result), suitable for
11828 * insertion into another address space (using vm_map_copyout),
11829 * copying over another address space region (using
11830 * vm_map_copy_overwrite). If the copy is unused, it
11831 * should be destroyed (using vm_map_copy_discard).
11832 *
11833 * In/out conditions:
11834 * The source map should not be locked on entry.
11835 */
11836
/*
 * Bookkeeping node used by vm_map_copyin_internal() while descending
 * through nested submaps: one node is pushed per parent map on the
 * traversal path (a singly-linked stack via "next"), so the copy loop
 * can pop back up once the current submap range has been fully copied.
 */
typedef struct submap_map {
	vm_map_t parent_map;            /* map we descended from */
	vm_map_offset_t base_start;     /* start of the range in parent_map */
	vm_map_offset_t base_end;       /* end of the range in parent_map */
	vm_map_size_t base_len;         /* length covered by the submap entry */
	struct submap_map *next;        /* next (shallower) level of the stack */
} submap_map_t;
11844
11845 kern_return_t
vm_map_copyin_common(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t src_destroy,__unused boolean_t src_volatile,vm_map_copy_t * copy_result,boolean_t use_maxprot)11846 vm_map_copyin_common(
11847 vm_map_t src_map,
11848 vm_map_address_t src_addr,
11849 vm_map_size_t len,
11850 boolean_t src_destroy,
11851 __unused boolean_t src_volatile,
11852 vm_map_copy_t *copy_result, /* OUT */
11853 boolean_t use_maxprot)
11854 {
11855 int flags;
11856
11857 flags = 0;
11858 if (src_destroy) {
11859 flags |= VM_MAP_COPYIN_SRC_DESTROY;
11860 }
11861 if (use_maxprot) {
11862 flags |= VM_MAP_COPYIN_USE_MAXPROT;
11863 }
11864 return vm_map_copyin_internal(src_map,
11865 src_addr,
11866 len,
11867 flags,
11868 copy_result);
11869 }
/*
 * Routine:	vm_map_copyin_internal
 *
 * Description:
 *	Workhorse behind vm_map_copyin_common(): builds a vm_map_copy_t
 *	describing [src_addr, src_addr + len) in "src_map", descending
 *	into submaps as needed and using copy-on-write optimizations
 *	where the source entries allow it.  "flags" is a mask of
 *	VM_MAP_COPYIN_* bits (destroy source, use max protection,
 *	preserve purgeability, fork semantics, ...).
 *
 * In/out conditions:
 *	The source map must not be locked on entry.
 */
kern_return_t
vm_map_copyin_internal(
	vm_map_t src_map,
	vm_map_address_t src_addr,
	vm_map_size_t len,
	int flags,
	vm_map_copy_t *copy_result) /* OUT */
{
	vm_map_entry_t tmp_entry;       /* Result of last map lookup --
	                                 * in multi-level lookup, this
	                                 * entry contains the actual
	                                 * vm_object/offset.
	                                 */
	vm_map_entry_t new_entry = VM_MAP_ENTRY_NULL; /* Map entry for copy */

	vm_map_offset_t src_start;      /* Start of current entry --
	                                 * where copy is taking place now
	                                 */
	vm_map_offset_t src_end;        /* End of entire region to be
	                                 * copied */
	vm_map_offset_t src_base;
	vm_map_t base_map = src_map;
	boolean_t map_share = FALSE;
	submap_map_t *parent_maps = NULL;       /* stack of submap levels */

	vm_map_copy_t copy;             /* Resulting copy */
	vm_map_address_t copy_addr;
	vm_map_size_t copy_size;
	boolean_t src_destroy;
	boolean_t use_maxprot;
	boolean_t preserve_purgeable;
	boolean_t entry_was_shared;
	vm_map_entry_t saved_src_entry;

	if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
		return KERN_INVALID_ARGUMENT;
	}

#if CONFIG_KERNEL_TAGGING
	/*
	 * Kernel-map addresses may carry memory-tag bits; canonicalize
	 * the address so the page-rounding arithmetic below operates on
	 * the plain (untagged) address.
	 */
	if (src_map->pmap == kernel_pmap) {
		src_addr = vm_memtag_canonicalize_address(src_addr);
	}
#endif /* CONFIG_KERNEL_TAGGING */

	src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
	use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
	preserve_purgeable =
	    (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;

	/*
	 * Check for copies of zero bytes.
	 */

	if (len == 0) {
		*copy_result = VM_MAP_COPY_NULL;
		return KERN_SUCCESS;
	}

	/*
	 * Check that the end address doesn't overflow
	 */
	if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
		return KERN_INVALID_ADDRESS;
	}
	src_end = src_addr + len;
	if (src_end < src_addr) {
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Compute (page aligned) start and end of region
	 */
	src_start = vm_map_trunc_page(src_addr,
	    VM_MAP_PAGE_MASK(src_map));
	src_end = vm_map_round_page(src_end,
	    VM_MAP_PAGE_MASK(src_map));
	if (src_end < src_addr) {
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * If the copy is sufficiently small, use a kernel buffer instead
	 * of making a virtual copy.  The theory being that the cost of
	 * setting up VM (and taking C-O-W faults) dominates the copy costs
	 * for small regions.
	 */
	if ((len <= msg_ool_size_small) &&
	    !use_maxprot &&
	    !preserve_purgeable &&
	    !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
	    /*
	     * Since the "msg_ool_size_small" threshold was increased and
	     * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
	     * address space limits, we revert to doing a virtual copy if the
	     * copied range goes beyond those limits.  Otherwise, mach_vm_read()
	     * of the commpage would now fail when it used to work.
	     */
	    (src_start >= vm_map_min(src_map) &&
	    src_start < vm_map_max(src_map) &&
	    src_end >= vm_map_min(src_map) &&
	    src_end < vm_map_max(src_map))) {
		return vm_map_copyin_kernel_buffer(src_map, src_addr, len,
		           src_destroy, copy_result);
	}

	/*
	 * Allocate a header element for the list.
	 *
	 * Use the start and end in the header to
	 * remember the endpoints prior to rounding.
	 */

	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
	copy->cpy_hdr.entries_pageable = TRUE;
	copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
	copy->offset = src_addr;
	copy->size = len;

	new_entry = vm_map_copy_entry_create(copy);

	/*
	 * RETURN(x): common error-exit path.  Unlocks (and, when inside a
	 * submap, dereferences) the current map, disposes of any not-yet-
	 * linked "new_entry", discards the partial "copy", and unwinds the
	 * submap parent stack before returning "x".
	 */
#define RETURN(x)						\
	MACRO_BEGIN						\
	vm_map_unlock(src_map);					\
	if(src_map != base_map)					\
	        vm_map_deallocate(src_map);			\
	if (new_entry != VM_MAP_ENTRY_NULL)			\
	        vm_map_copy_entry_dispose(new_entry);		\
	vm_map_copy_discard(copy);				\
	{							\
	        submap_map_t *_ptr;				\
								\
	        for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
	                parent_maps=parent_maps->next;		\
	                if (_ptr->parent_map != base_map)	\
	                        vm_map_deallocate(_ptr->parent_map); \
	                kfree_type(submap_map_t, _ptr);		\
	        }						\
	}							\
	MACRO_RETURN(x);					\
	MACRO_END

	/*
	 * Find the beginning of the region.
	 */

	vm_map_lock(src_map);

	/*
	 * Lookup the original "src_addr" rather than the truncated
	 * "src_start", in case "src_start" falls in a non-map-aligned
	 * map entry *before* the map entry that contains "src_addr"...
	 */
	if (!vm_map_lookup_entry(src_map, src_addr, &tmp_entry)) {
		RETURN(KERN_INVALID_ADDRESS);
	}
	if (!tmp_entry->is_sub_map) {
		/*
		 * ... but clip to the map-rounded "src_start" rather than
		 * "src_addr" to preserve map-alignment.  We'll adjust the
		 * first copy entry at the end, if needed.
		 */
		vm_map_clip_start(src_map, tmp_entry, src_start);
	}
	if (src_start < tmp_entry->vme_start) {
		/*
		 * Move "src_start" up to the start of the
		 * first map entry to copy.
		 */
		src_start = tmp_entry->vme_start;
	}
	/* set for later submap fix-up */
	copy_addr = src_start;

	/*
	 * Go through entries until we get to the end.
	 */

	while (TRUE) {
		vm_map_entry_t src_entry = tmp_entry; /* Top-level entry */
		vm_map_size_t src_size;         /* Size of source
		                                 * map entry (in both
		                                 * maps)
		                                 */

		vm_object_t src_object;         /* Object to copy */
		vm_object_offset_t src_offset;

		vm_object_t new_copy_object;    /* vm_object_copy_* result */

		boolean_t src_needs_copy;       /* Should source map
		                                 * be made read-only
		                                 * for copy-on-write?
		                                 */

		boolean_t new_entry_needs_copy; /* Will new entry be COW? */

		boolean_t was_wired;            /* Was source wired? */
		boolean_t saved_used_for_jit;   /* Saved used_for_jit. */
		vm_map_version_t version;       /* Version before locks
		                                 * dropped to make copy
		                                 */
		kern_return_t result;           /* Return value from
		                                 * copy_strategically.
		                                 */

		/*
		 * Descend through any chain of submaps to the entry that
		 * actually maps the memory, pushing each parent level on
		 * the "parent_maps" stack so we can come back up later.
		 */
		while (tmp_entry->is_sub_map) {
			vm_map_size_t submap_len;
			submap_map_t *ptr;

			ptr = kalloc_type(submap_map_t, Z_WAITOK);
			ptr->next = parent_maps;
			parent_maps = ptr;
			ptr->parent_map = src_map;
			ptr->base_start = src_start;
			ptr->base_end = src_end;
			submap_len = tmp_entry->vme_end - src_start;
			if (submap_len > (src_end - src_start)) {
				submap_len = src_end - src_start;
			}
			ptr->base_len = submap_len;

			/* translate the range into the submap's address space */
			src_start -= tmp_entry->vme_start;
			src_start += VME_OFFSET(tmp_entry);
			src_end = src_start + submap_len;
			src_map = VME_SUBMAP(tmp_entry);
			vm_map_lock(src_map);
			/* keep an outstanding reference for all maps in */
			/* the parents tree except the base map */
			vm_map_reference(src_map);
			vm_map_unlock(ptr->parent_map);
			if (!vm_map_lookup_entry(
				    src_map, src_start, &tmp_entry)) {
				RETURN(KERN_INVALID_ADDRESS);
			}
			map_share = TRUE;
			if (!tmp_entry->is_sub_map) {
				vm_map_clip_start(src_map, tmp_entry, src_start);
			}
			src_entry = tmp_entry;
		}
		/* we are now in the lowest level submap... */

		if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
		    (VME_OBJECT(tmp_entry)->phys_contiguous)) {
			/* This is not supported for now. In future */
			/* we will need to detect the phys_contig */
			/* condition and then upgrade copy_slowly */
			/* to do physical copy from the device mem */
			/* based object. We can piggy-back off of */
			/* the was wired boolean to set-up the */
			/* proper handling */
			RETURN(KERN_PROTECTION_FAILURE);
		}
		/*
		 * Create a new address map entry to hold the result.
		 * Fill in the fields from the appropriate source entries.
		 * We must unlock the source map to do this if we need
		 * to allocate a map entry.
		 */
		if (new_entry == VM_MAP_ENTRY_NULL) {
			version.main_timestamp = src_map->timestamp;
			vm_map_unlock(src_map);

			new_entry = vm_map_copy_entry_create(copy);

			vm_map_lock(src_map);
			/* map changed while unlocked: re-lookup and restart */
			if ((version.main_timestamp + 1) != src_map->timestamp) {
				if (!vm_map_lookup_entry(src_map, src_start,
				    &tmp_entry)) {
					RETURN(KERN_INVALID_ADDRESS);
				}
				if (!tmp_entry->is_sub_map) {
					vm_map_clip_start(src_map, tmp_entry, src_start);
				}
				continue; /* restart w/ new tmp_entry */
			}
		}

		/*
		 * Verify that the region can be read.
		 */
		if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
		    !use_maxprot) ||
		    (src_entry->max_protection & VM_PROT_READ) == 0) {
			RETURN(KERN_PROTECTION_FAILURE);
		}

		/*
		 * Clip against the endpoints of the entire region.
		 */

		vm_map_clip_end(src_map, src_entry, src_end);

		src_size = src_entry->vme_end - src_start;
		src_object = VME_OBJECT(src_entry);
		src_offset = VME_OFFSET(src_entry);
		was_wired = (src_entry->wired_count != 0);

		vm_map_entry_copy(src_map, new_entry, src_entry);
		if (new_entry->is_sub_map) {
			/* clr address space specifics */
			new_entry->use_pmap = FALSE;
		} else {
			/*
			 * We're dealing with a copy-on-write operation,
			 * so the resulting mapping should not inherit the
			 * original mapping's accounting settings.
			 * "iokit_acct" should have been cleared in
			 * vm_map_entry_copy().
			 * "use_pmap" should be reset to its default (TRUE)
			 * so that the new mapping gets accounted for in
			 * the task's memory footprint.
			 */
			assert(!new_entry->iokit_acct);
			new_entry->use_pmap = TRUE;
		}

		/*
		 * Attempt non-blocking copy-on-write optimizations.
		 */

		/*
		 * If we are destroying the source, and the object
		 * is internal, we could move the object reference
		 * from the source to the copy.  The copy is
		 * copy-on-write only if the source is.
		 * We make another reference to the object, because
		 * destroying the source entry will deallocate it.
		 *
		 * This memory transfer has to be atomic, (to prevent
		 * the VM object from being shared or copied while
		 * it's being moved here), so we could only do this
		 * if we won't have to unlock the VM map until the
		 * original mapping has been fully removed.
		 */

RestartCopy:
		if ((src_object == VM_OBJECT_NULL ||
		    (!was_wired && !map_share && !tmp_entry->is_shared
		    && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
		    vm_object_copy_quickly(
			    VME_OBJECT(new_entry),
			    src_offset,
			    src_size,
			    &src_needs_copy,
			    &new_entry_needs_copy)) {
			new_entry->needs_copy = new_entry_needs_copy;

			/*
			 * Handle copy-on-write obligations
			 */

			if (src_needs_copy && !tmp_entry->needs_copy) {
				vm_prot_t prot;

				prot = src_entry->protection & ~VM_PROT_WRITE;

				if (override_nx(src_map, VME_ALIAS(src_entry))
				    && prot) {
					prot |= VM_PROT_EXECUTE;
				}

				/* write-protect the source so it faults COW */
				vm_object_pmap_protect(
					src_object,
					src_offset,
					src_size,
					(src_entry->is_shared ?
					PMAP_NULL
					: src_map->pmap),
					VM_MAP_PAGE_SIZE(src_map),
					src_entry->vme_start,
					prot);

				assert(tmp_entry->wired_count == 0);
				tmp_entry->needs_copy = TRUE;
			}

			/*
			 * The map has never been unlocked, so it's safe
			 * to move to the next entry rather than doing
			 * another lookup.
			 */

			goto CopySuccessful;
		}

		entry_was_shared = tmp_entry->is_shared;

		/*
		 * Take an object reference, so that we may
		 * release the map lock(s).
		 */

		assert(src_object != VM_OBJECT_NULL);
		vm_object_reference(src_object);

		/*
		 * Record the timestamp for later verification.
		 * Unlock the map.
		 */

		version.main_timestamp = src_map->timestamp;
		vm_map_unlock(src_map); /* Increments timestamp once! */
		/*
		 * Map is unlocked: entry pointers may go stale.  Remember
		 * "src_entry" only for the timestamp-verified fast path.
		 */
		saved_src_entry = src_entry;
		tmp_entry = VM_MAP_ENTRY_NULL;
		src_entry = VM_MAP_ENTRY_NULL;

		/*
		 * Perform the copy
		 */

		if (was_wired ||
		    (src_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK &&
		    !(flags & VM_MAP_COPYIN_FORK)) ||
		    (debug4k_no_cow_copyin &&
		    VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
			/*
			 * Cases that can't be handled lazily (wired source,
			 * DELAY_FORK object outside fork(), 4K debug mode):
			 * physically copy the pages now.
			 */
CopySlowly:
			vm_object_lock(src_object);
			result = vm_object_copy_slowly(
				src_object,
				src_offset,
				src_size,
				THREAD_UNINT,
				&new_copy_object);
			/* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
			saved_used_for_jit = new_entry->used_for_jit;
			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
			new_entry->used_for_jit = saved_used_for_jit;
			VME_OFFSET_SET(new_entry,
			    src_offset - vm_object_trunc_page(src_offset));
			new_entry->needs_copy = FALSE;
		} else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
		    (entry_was_shared || map_share)) {
			/*
			 * Shared symmetric-copy object: try an asymmetric
			 * delayed copy; fall back to the slow path if that
			 * is not possible.
			 */
			vm_object_t new_object;

			vm_object_lock_shared(src_object);
			new_object = vm_object_copy_delayed(
				src_object,
				src_offset,
				src_size,
				TRUE);
			if (new_object == VM_OBJECT_NULL) {
				goto CopySlowly;
			}

			VME_OBJECT_SET(new_entry, new_object, false, 0);
			assert(new_entry->wired_count == 0);
			new_entry->needs_copy = TRUE;
			assert(!new_entry->iokit_acct);
			assert(new_object->purgable == VM_PURGABLE_DENY);
			assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
			result = KERN_SUCCESS;
		} else {
			/* let the object's own copy strategy decide */
			vm_object_offset_t new_offset;
			new_offset = VME_OFFSET(new_entry);
			result = vm_object_copy_strategically(src_object,
			    src_offset,
			    src_size,
			    (flags & VM_MAP_COPYIN_FORK),
			    &new_copy_object,
			    &new_offset,
			    &new_entry_needs_copy);
			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
			saved_used_for_jit = new_entry->used_for_jit;
			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
			new_entry->used_for_jit = saved_used_for_jit;
			if (new_offset != VME_OFFSET(new_entry)) {
				VME_OFFSET_SET(new_entry, new_offset);
			}

			new_entry->needs_copy = new_entry_needs_copy;
		}

		if (result == KERN_SUCCESS &&
		    ((preserve_purgeable &&
		    src_object->purgable != VM_PURGABLE_DENY) ||
		    new_entry->used_for_jit)) {
			/*
			 * Purgeable objects should be COPY_NONE, true share;
			 * this should be propagated to the copy.
			 *
			 * Also force mappings the pmap specially protects to
			 * be COPY_NONE; trying to COW these mappings would
			 * change the effective protections, which could have
			 * side effects if the pmap layer relies on the
			 * specified protections.
			 */

			vm_object_t new_object;

			new_object = VME_OBJECT(new_entry);
			assert(new_object != src_object);
			vm_object_lock(new_object);
			assert(new_object->ref_count == 1);
			assert(new_object->shadow == VM_OBJECT_NULL);
			assert(new_object->vo_copy == VM_OBJECT_NULL);
			assert(new_object->vo_owner == NULL);

			new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

			if (preserve_purgeable &&
			    src_object->purgable != VM_PURGABLE_DENY) {
				new_object->true_share = TRUE;

				/* start as non-volatile with no owner... */
				new_object->purgable = VM_PURGABLE_NONVOLATILE;
				vm_purgeable_nonvolatile_enqueue(new_object, NULL);
				/* ... and move to src_object's purgeable state */
				if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
					int state;
					state = src_object->purgable;
					vm_object_purgable_control(
						new_object,
						VM_PURGABLE_SET_STATE_FROM_KERNEL,
						&state);
				}
				/* no pmap accounting for purgeable objects */
				new_entry->use_pmap = FALSE;
			}

			vm_object_unlock(new_object);
			new_object = VM_OBJECT_NULL;
		}

		if (result != KERN_SUCCESS &&
		    result != KERN_MEMORY_RESTART_COPY) {
			vm_map_lock(src_map);
			RETURN(result);
		}

		/*
		 * Throw away the extra reference
		 */

		vm_object_deallocate(src_object);

		/*
		 * Verify that the map has not substantially
		 * changed while the copy was being made.
		 */

		vm_map_lock(src_map);

		if ((version.main_timestamp + 1) == src_map->timestamp) {
			/* src_map hasn't changed: src_entry is still valid */
			src_entry = saved_src_entry;
			goto VerificationSuccessful;
		}

		/*
		 * Simple version comparison failed.
		 *
		 * Retry the lookup and verify that the
		 * same object/offset are still present.
		 *
		 * [Note: a memory manager that colludes with
		 * the calling task can detect that we have
		 * cheated.  While the map was unlocked, the
		 * mapping could have been changed and restored.]
		 */

		if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
			if (result != KERN_MEMORY_RESTART_COPY) {
				vm_object_deallocate(VME_OBJECT(new_entry));
				VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
				/* reset accounting state */
				new_entry->iokit_acct = FALSE;
				new_entry->use_pmap = TRUE;
			}
			RETURN(KERN_INVALID_ADDRESS);
		}

		src_entry = tmp_entry;
		vm_map_clip_start(src_map, src_entry, src_start);

		if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
		    !use_maxprot) ||
		    ((src_entry->max_protection & VM_PROT_READ) == 0)) {
			goto VerificationFailed;
		}

		if (src_entry->vme_end < new_entry->vme_end) {
			/*
			 * This entry might have been shortened
			 * (vm_map_clip_end) or been replaced with
			 * an entry that ends closer to "src_start"
			 * than before.
			 * Adjust "new_entry" accordingly; copying
			 * less memory would be correct but we also
			 * redo the copy (see below) if the new entry
			 * no longer points at the same object/offset.
			 */
			assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
			    VM_MAP_COPY_PAGE_MASK(copy)));
			new_entry->vme_end = src_entry->vme_end;
			src_size = new_entry->vme_end - src_start;
		} else if (src_entry->vme_end > new_entry->vme_end) {
			/*
			 * This entry might have been extended
			 * (vm_map_entry_simplify() or coalesce)
			 * or been replaced with an entry that ends farther
			 * from "src_start" than before.
			 *
			 * We've called vm_object_copy_*() only on
			 * the previous <start:end> range, so we can't
			 * just extend new_entry.  We have to re-do
			 * the copy based on the new entry as if it was
			 * pointing at a different object/offset (see
			 * "Verification failed" below).
			 */
		}

		if ((VME_OBJECT(src_entry) != src_object) ||
		    (VME_OFFSET(src_entry) != src_offset) ||
		    (src_entry->vme_end > new_entry->vme_end)) {
			/*
			 * Verification failed.
			 *
			 * Start over with this top-level entry.
			 */

VerificationFailed: ;

			vm_object_deallocate(VME_OBJECT(new_entry));
			tmp_entry = src_entry;
			continue;
		}

		/*
		 * Verification succeeded.
		 */

VerificationSuccessful:;

		if (result == KERN_MEMORY_RESTART_COPY) {
			goto RestartCopy;
		}

		/*
		 * Copy succeeded.
		 */

CopySuccessful: ;

		/*
		 * Link in the new copy entry.
		 */

		vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
		    new_entry);

		/*
		 * Determine whether the entire region
		 * has been copied.
		 */
		src_base = src_start;
		src_start = new_entry->vme_end;
		new_entry = VM_MAP_ENTRY_NULL;
		/*
		 * Finished the current (sub)map range: pop back up through
		 * the parent maps until more remains to copy (or we reach
		 * the base map).
		 */
		while ((src_start >= src_end) && (src_end != 0)) {
			submap_map_t *ptr;

			if (src_map == base_map) {
				/* back to the top */
				break;
			}

			ptr = parent_maps;
			assert(ptr != NULL);
			parent_maps = parent_maps->next;

			/* fix up the damage we did in that submap */
			vm_map_simplify_range(src_map,
			    src_base,
			    src_end);

			vm_map_unlock(src_map);
			vm_map_deallocate(src_map);
			vm_map_lock(ptr->parent_map);
			src_map = ptr->parent_map;
			src_base = ptr->base_start;
			src_start = ptr->base_start + ptr->base_len;
			src_end = ptr->base_end;
			if (!vm_map_lookup_entry(src_map,
			    src_start,
			    &tmp_entry) &&
			    (src_end > src_start)) {
				RETURN(KERN_INVALID_ADDRESS);
			}
			kfree_type(submap_map_t, ptr);
			if (parent_maps == NULL) {
				map_share = FALSE;
			}
			src_entry = tmp_entry->vme_prev;
		}

		if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
		    (src_start >= src_addr + len) &&
		    (src_addr + len != 0)) {
			/*
			 * Stop copying now, even though we haven't reached
			 * "src_end".  We'll adjust the end of the last copy
			 * entry at the end, if needed.
			 *
			 * If src_map's alignment is different from the
			 * system's page-alignment, there could be
			 * extra non-map-aligned map entries between
			 * the original (non-rounded) "src_addr + len"
			 * and the rounded "src_end".
			 * We do not want to copy those map entries since
			 * they're not part of the copied range.
			 */
			break;
		}

		if ((src_start >= src_end) && (src_end != 0)) {
			break;
		}

		/*
		 * Verify that there are no gaps in the region
		 */

		tmp_entry = src_entry->vme_next;
		if ((tmp_entry->vme_start != src_start) ||
		    (tmp_entry == vm_map_to_entry(src_map))) {
			RETURN(KERN_INVALID_ADDRESS);
		}
	}

	/*
	 * If the source should be destroyed, do it now, since the
	 * copy was successful.
	 */
	if (src_destroy) {
		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;

		if (src_map == kernel_map) {
			remove_flags |= VM_MAP_REMOVE_KUNWIRE;
		}
		(void)vm_map_remove_and_unlock(src_map,
		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
		    src_end,
		    remove_flags,
		    KMEM_GUARD_NONE);
	} else {
		/* fix up the damage we did in the base map */
		vm_map_simplify_range(
			src_map,
			vm_map_trunc_page(src_addr,
			VM_MAP_PAGE_MASK(src_map)),
			vm_map_round_page(src_end,
			VM_MAP_PAGE_MASK(src_map)));
		vm_map_unlock(src_map);
	}

	tmp_entry = VM_MAP_ENTRY_NULL;

	if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
	    VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
		vm_map_offset_t original_start, original_offset, original_end;

		assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);

		/* adjust alignment of first copy_entry's "vme_start" */
		tmp_entry = vm_map_copy_first_entry(copy);
		if (tmp_entry != vm_map_copy_to_entry(copy)) {
			vm_map_offset_t adjustment;

			original_start = tmp_entry->vme_start;
			original_offset = VME_OFFSET(tmp_entry);

			/* map-align the start of the first copy entry... */
			adjustment = (tmp_entry->vme_start -
			    vm_map_trunc_page(
				    tmp_entry->vme_start,
				    VM_MAP_PAGE_MASK(src_map)));
			tmp_entry->vme_start -= adjustment;
			VME_OFFSET_SET(tmp_entry,
			    VME_OFFSET(tmp_entry) - adjustment);
			copy_addr -= adjustment;
			assert(tmp_entry->vme_start < tmp_entry->vme_end);
			/* ... adjust for mis-aligned start of copy range */
			adjustment =
			    (vm_map_trunc_page(copy->offset,
			    PAGE_MASK) -
			    vm_map_trunc_page(copy->offset,
			    VM_MAP_PAGE_MASK(src_map)));
			if (adjustment) {
				assert(page_aligned(adjustment));
				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
				tmp_entry->vme_start += adjustment;
				VME_OFFSET_SET(tmp_entry,
				    (VME_OFFSET(tmp_entry) +
				    adjustment));
				copy_addr += adjustment;
				assert(tmp_entry->vme_start < tmp_entry->vme_end);
			}

			/*
			 * Assert that the adjustments haven't exposed
			 * more than was originally copied...
			 */
			assert(tmp_entry->vme_start >= original_start);
			assert(VME_OFFSET(tmp_entry) >= original_offset);
			/*
			 * ... and that it did not adjust outside of
			 * a single 16K page.
			 */
			assert(vm_map_trunc_page(tmp_entry->vme_start,
			    VM_MAP_PAGE_MASK(src_map)) ==
			    vm_map_trunc_page(original_start,
			    VM_MAP_PAGE_MASK(src_map)));
		}

		/* adjust alignment of last copy_entry's "vme_end" */
		tmp_entry = vm_map_copy_last_entry(copy);
		if (tmp_entry != vm_map_copy_to_entry(copy)) {
			vm_map_offset_t adjustment;

			original_end = tmp_entry->vme_end;

			/* map-align the end of the last copy entry... */
			tmp_entry->vme_end =
			    vm_map_round_page(tmp_entry->vme_end,
			    VM_MAP_PAGE_MASK(src_map));
			/* ... adjust for mis-aligned end of copy range */
			adjustment =
			    (vm_map_round_page((copy->offset +
			    copy->size),
			    VM_MAP_PAGE_MASK(src_map)) -
			    vm_map_round_page((copy->offset +
			    copy->size),
			    PAGE_MASK));
			if (adjustment) {
				assert(page_aligned(adjustment));
				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
				tmp_entry->vme_end -= adjustment;
				assert(tmp_entry->vme_start < tmp_entry->vme_end);
			}

			/*
			 * Assert that the adjustments haven't exposed
			 * more than was originally copied...
			 */
			assert(tmp_entry->vme_end <= original_end);
			/*
			 * ... and that it did not adjust outside of
			 * a single 16K page.
			 */
			assert(vm_map_round_page(tmp_entry->vme_end,
			    VM_MAP_PAGE_MASK(src_map)) ==
			    vm_map_round_page(original_end,
			    VM_MAP_PAGE_MASK(src_map)));
		}
	}

	/* Fix-up start and end points in copy.  This is necessary */
	/* when the various entries in the copy object were picked */
	/* up from different sub-maps */

	tmp_entry = vm_map_copy_first_entry(copy);
	copy_size = 0; /* compute actual size */
	while (tmp_entry != vm_map_copy_to_entry(copy)) {
		assert(VM_MAP_PAGE_ALIGNED(
			    copy_addr + (tmp_entry->vme_end -
			    tmp_entry->vme_start),
			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
		assert(VM_MAP_PAGE_ALIGNED(
			    copy_addr,
			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));

		/*
		 * The copy_entries will be injected directly into the
		 * destination map and might not be "map aligned" there...
		 */
		tmp_entry->map_aligned = FALSE;

		tmp_entry->vme_end = copy_addr +
		    (tmp_entry->vme_end - tmp_entry->vme_start);
		tmp_entry->vme_start = copy_addr;
		assert(tmp_entry->vme_start < tmp_entry->vme_end);
		copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
		copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
		tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
	}

	if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
	    copy_size < copy->size) {
		/*
		 * The actual size of the VM map copy is smaller than what
		 * was requested by the caller.  This must be because some
		 * PAGE_SIZE-sized pages are missing at the end of the last
		 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
		 * The caller might not have been aware of those missing
		 * pages and might not want to be aware of it, which is
		 * fine as long as they don't try to access (and crash on)
		 * those missing pages.
		 * Let's adjust the size of the "copy", to avoid failing
		 * in vm_map_copyout() or vm_map_copy_overwrite().
		 */
		assert(vm_map_round_page(copy_size,
		    VM_MAP_PAGE_MASK(src_map)) ==
		    vm_map_round_page(copy->size,
		    VM_MAP_PAGE_MASK(src_map)));
		copy->size = copy_size;
	}

	*copy_result = copy;
	return KERN_SUCCESS;

#undef RETURN
}
12781
12782 kern_return_t
vm_map_copy_extract(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t do_copy,vm_map_copy_t * copy_result,vm_prot_t * cur_prot,vm_prot_t * max_prot,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)12783 vm_map_copy_extract(
12784 vm_map_t src_map,
12785 vm_map_address_t src_addr,
12786 vm_map_size_t len,
12787 boolean_t do_copy,
12788 vm_map_copy_t *copy_result, /* OUT */
12789 vm_prot_t *cur_prot, /* IN/OUT */
12790 vm_prot_t *max_prot, /* IN/OUT */
12791 vm_inherit_t inheritance,
12792 vm_map_kernel_flags_t vmk_flags)
12793 {
12794 vm_map_copy_t copy;
12795 kern_return_t kr;
12796 vm_prot_t required_cur_prot, required_max_prot;
12797
12798 /*
12799 * Check for copies of zero bytes.
12800 */
12801
12802 if (len == 0) {
12803 *copy_result = VM_MAP_COPY_NULL;
12804 return KERN_SUCCESS;
12805 }
12806
12807 /*
12808 * Check that the end address doesn't overflow
12809 */
12810 if (src_addr + len < src_addr) {
12811 return KERN_INVALID_ADDRESS;
12812 }
12813 if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
12814 return KERN_INVALID_ADDRESS;
12815 }
12816
12817 if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12818 DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12819 }
12820
12821 required_cur_prot = *cur_prot;
12822 required_max_prot = *max_prot;
12823
12824 /*
12825 * Allocate a header element for the list.
12826 *
12827 * Use the start and end in the header to
12828 * remember the endpoints prior to rounding.
12829 */
12830
12831 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12832 copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
12833 copy->offset = 0;
12834 copy->size = len;
12835
12836 kr = vm_map_remap_extract(src_map,
12837 src_addr,
12838 len,
12839 do_copy, /* copy */
12840 copy,
12841 cur_prot, /* IN/OUT */
12842 max_prot, /* IN/OUT */
12843 inheritance,
12844 vmk_flags);
12845 if (kr != KERN_SUCCESS) {
12846 vm_map_copy_discard(copy);
12847 return kr;
12848 }
12849 if (required_cur_prot != VM_PROT_NONE) {
12850 assert((*cur_prot & required_cur_prot) == required_cur_prot);
12851 assert((*max_prot & required_max_prot) == required_max_prot);
12852 }
12853
12854 *copy_result = copy;
12855 return KERN_SUCCESS;
12856 }
12857
/*
 * vm_map_fork_share:
 *
 * Share the region covered by "old_entry" of "old_map" with the child
 * map "new_map" being built by vm_map_fork(): clone the entry, take a
 * reference on the submap or VM object it maps, and mark both entries
 * as shared.  When required to preserve pending or potential symmetric
 * copies, a shadow object is created first (see cases 1-3 below).
 * Called from vm_map_fork() with both maps locked.
 */
static void
vm_map_fork_share(
	vm_map_t        old_map,
	vm_map_entry_t  old_entry,
	vm_map_t        new_map)
{
	vm_object_t     object;
	vm_map_entry_t  new_entry;

	/*
	 * New sharing code.  New map entry
	 * references original object.  Internal
	 * objects use asynchronous copy algorithm for
	 * future copies.  First make sure we have
	 * the right object.  If we need a shadow,
	 * or someone else already has one, then
	 * make a new shadow and share it.
	 */

	/* "object" is only meaningful for non-submap entries */
	if (!old_entry->is_sub_map) {
		object = VME_OBJECT(old_entry);
	}

	if (old_entry->is_sub_map) {
		assert(old_entry->wired_count == 0);
#ifndef NO_NESTED_PMAP
#if !PMAP_FORK_NEST
		if (old_entry->use_pmap) {
			kern_return_t   result;

			/* share the submap's page tables with the child */
			result = pmap_nest(new_map->pmap,
			    (VME_SUBMAP(old_entry))->pmap,
			    (addr64_t)old_entry->vme_start,
			    (uint64_t)(old_entry->vme_end - old_entry->vme_start));
			if (result) {
				panic("vm_map_fork_share: pmap_nest failed!");
			}
		}
#endif /* !PMAP_FORK_NEST */
#endif /* NO_NESTED_PMAP */
	} else if (object == VM_OBJECT_NULL) {
		/* no backing object yet: create one covering the entry */
		object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
		    old_entry->vme_start));
		VME_OFFSET_SET(old_entry, 0);
		VME_OBJECT_SET(old_entry, object, false, 0);
		old_entry->use_pmap = TRUE;
//		assert(!old_entry->needs_copy);
	} else if (object->copy_strategy !=
	    MEMORY_OBJECT_COPY_SYMMETRIC) {
		/*
		 * We are already using an asymmetric
		 * copy, and therefore we already have
		 * the right object.
		 */

		assert(!old_entry->needs_copy);
	} else if (old_entry->needs_copy ||     /* case 1 */
	    object->shadowed ||                 /* case 2 */
	    (!object->true_share &&             /* case 3 */
	    !old_entry->is_shared &&
	    (object->vo_size >
	    (vm_map_size_t)(old_entry->vme_end -
	    old_entry->vme_start)))) {
		bool is_writable;

		/*
		 * We need to create a shadow.
		 * There are three cases here.
		 * In the first case, we need to
		 * complete a deferred symmetrical
		 * copy that we participated in.
		 * In the second and third cases,
		 * we need to create the shadow so
		 * that changes that we make to the
		 * object do not interfere with
		 * any symmetrical copies which
		 * have occurred (case 2) or which
		 * might occur (case 3).
		 *
		 * The first case is when we had
		 * deferred shadow object creation
		 * via the entry->needs_copy mechanism.
		 * This mechanism only works when
		 * only one entry points to the source
		 * object, and we are about to create
		 * a second entry pointing to the
		 * same object. The problem is that
		 * there is no way of mapping from
		 * an object to the entries pointing
		 * to it. (Deferred shadow creation
		 * works with one entry because occurs
		 * at fault time, and we walk from the
		 * entry to the object when handling
		 * the fault.)
		 *
		 * The second case is when the object
		 * to be shared has already been copied
		 * with a symmetric copy, but we point
		 * directly to the object without
		 * needs_copy set in our entry. (This
		 * can happen because different ranges
		 * of an object can be pointed to by
		 * different entries. In particular,
		 * a single entry pointing to an object
		 * can be split by a call to vm_inherit,
		 * which, combined with task_create, can
		 * result in the different entries
		 * having different needs_copy values.)
		 * The shadowed flag in the object allows
		 * us to detect this case. The problem
		 * with this case is that if this object
		 * has or will have shadows, then we
		 * must not perform an asymmetric copy
		 * of this object, since such a copy
		 * allows the object to be changed, which
		 * will break the previous symmetrical
		 * copies (which rely upon the object
		 * not changing). In a sense, the shadowed
		 * flag says "don't change this object".
		 * We fix this by creating a shadow
		 * object for this object, and sharing
		 * that. This works because we are free
		 * to change the shadow object (and thus
		 * to use an asymmetric copy strategy);
		 * this is also semantically correct,
		 * since this object is temporary, and
		 * therefore a copy of the object is
		 * as good as the object itself. (This
		 * is not true for permanent objects,
		 * since the pager needs to see changes,
		 * which won't happen if the changes
		 * are made to a copy.)
		 *
		 * The third case is when the object
		 * to be shared has parts sticking
		 * outside of the entry we're working
		 * with, and thus may in the future
		 * be subject to a symmetrical copy.
		 * (This is a preemptive version of
		 * case 2.)
		 */
		VME_OBJECT_SHADOW(old_entry,
		    (vm_map_size_t) (old_entry->vme_end -
		    old_entry->vme_start),
		    vm_map_always_shadow(old_map));

		/*
		 * If we're making a shadow for other than
		 * copy on write reasons, then we have
		 * to remove write permission.
		 */

		is_writable = false;
		if (old_entry->protection & VM_PROT_WRITE) {
			is_writable = true;
#if __arm64e__
		} else if (old_entry->used_for_tpro) {
			/* TPRO mappings are writable through a separate path */
			is_writable = true;
#endif /* __arm64e__ */
		}
		if (!old_entry->needs_copy && is_writable) {
			vm_prot_t prot;

			assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));

			prot = old_entry->protection & ~VM_PROT_WRITE;

			assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));

			if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
				prot |= VM_PROT_EXECUTE;
			}


			if (old_map->mapped_in_other_pmaps) {
				/*
				 * The mapping may exist in other pmaps too,
				 * so protect at the object level.
				 */
				vm_object_pmap_protect(
					VME_OBJECT(old_entry),
					VME_OFFSET(old_entry),
					(old_entry->vme_end -
					old_entry->vme_start),
					PMAP_NULL,
					PAGE_SIZE,
					old_entry->vme_start,
					prot);
			} else {
				pmap_protect(old_map->pmap,
				    old_entry->vme_start,
				    old_entry->vme_end,
				    prot);
			}
		}

		old_entry->needs_copy = FALSE;
		object = VME_OBJECT(old_entry);
	}


	/*
	 * If object was using a symmetric copy strategy,
	 * change its copy strategy to the default
	 * asymmetric copy strategy, which is copy_delay
	 * in the non-norma case and copy_call in the
	 * norma case. Bump the reference count for the
	 * new entry.
	 */

	if (old_entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(old_entry));
	} else {
		vm_object_lock(object);
		vm_object_reference_locked(object);
		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		}
		vm_object_unlock(object);
	}

	/*
	 * Clone the entry, using object ref from above.
	 * Mark both entries as shared.
	 */

	new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
	vm_map_entry_copy(old_map, new_entry, old_entry);
	old_entry->is_shared = TRUE;
	new_entry->is_shared = TRUE;

	/*
	 * We're dealing with a shared mapping, so the resulting mapping
	 * should inherit some of the original mapping's accounting settings.
	 * "iokit_acct" should have been cleared in vm_map_entry_copy().
	 * "use_pmap" should stay the same as before (if it hasn't been reset
	 * to TRUE when we cleared "iokit_acct").
	 */
	assert(!new_entry->iokit_acct);

	/*
	 * If old entry's inheritance is VM_INHERIT_NONE,
	 * the new entry is for corpse fork, remove the
	 * write permission from the new entry.
	 */
	if (old_entry->inheritance == VM_INHERIT_NONE) {
		new_entry->protection &= ~VM_PROT_WRITE;
		new_entry->max_protection &= ~VM_PROT_WRITE;
	}

	/*
	 * Insert the entry into the new map -- we
	 * know we're inserting at the end of the new
	 * map.
	 */

	vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
	    VM_MAP_KERNEL_FLAGS_NONE);

	/*
	 * Update the physical map
	 */

	if (old_entry->is_sub_map) {
		/* Bill Angell pmap support goes here */
	} else {
		/* pre-populate the child's pmap with the parent's mappings */
		pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
		    old_entry->vme_end - old_entry->vme_start,
		    old_entry->vme_start);
	}
}
13125
/*
 * vm_map_fork_copy:
 *
 * Copy the region of "old_map" described by *old_entry_p into "new_map"
 * via the slow vm_map_copyin_internal() path (used by vm_map_fork() when
 * the quick object copy is not possible).  The "old_map" lock is dropped
 * for the duration of the copyin and re-taken before returning.
 * On return, *old_entry_p points at the entry where the caller should
 * resume its traversal of "old_map".
 * Returns TRUE if the region was copied and inserted into "new_map",
 * FALSE if it could not be copied (the caller just moves on).
 */
static boolean_t
vm_map_fork_copy(
	vm_map_t        old_map,
	vm_map_entry_t  *old_entry_p,
	vm_map_t        new_map,
	int             vm_map_copyin_flags)
{
	vm_map_entry_t old_entry = *old_entry_p;
	vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
	vm_map_offset_t start = old_entry->vme_start;
	vm_map_copy_t copy;
	vm_map_entry_t last = vm_map_last_entry(new_map);

	vm_map_unlock(old_map);
	/*
	 * Use maxprot version of copyin because we
	 * care about whether this memory can ever
	 * be accessed, not just whether it's accessible
	 * right now.
	 */
	vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
	if (vm_map_copyin_internal(old_map, start, entry_size,
	    vm_map_copyin_flags, &copy)
	    != KERN_SUCCESS) {
		/*
		 * The map might have changed while it
		 * was unlocked, check it again.  Skip
		 * any blank space or permanently
		 * unreadable region.
		 */
		vm_map_lock(old_map);
		if (!vm_map_lookup_entry(old_map, start, &last) ||
		    (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
			last = last->vme_next;
		}
		*old_entry_p = last;

		/*
		 * XXX	For some error returns, want to
		 * XXX	skip to the next element.  Note
		 *	that INVALID_ADDRESS and
		 *	PROTECTION_FAILURE are handled above.
		 */

		return FALSE;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/*
	 * Insert the copy into the new map
	 */
	vm_map_copy_insert(new_map, last, copy);

	/*
	 * Pick up the traversal at the end of
	 * the copied region.
	 */

	vm_map_lock(old_map);
	start += entry_size;
	if (!vm_map_lookup_entry(old_map, start, &last)) {
		/* "start" is in a hole: resume at the next entry */
		last = last->vme_next;
	} else {
		if (last->vme_start == start) {
			/*
			 * No need to clip here and we don't
			 * want to cause any unnecessary
			 * unnesting...
			 */
		} else {
			vm_map_clip_start(old_map, last, start);
		}
	}
	*old_entry_p = last;

	return TRUE;
}
13208
#if PMAP_FORK_NEST
#define PMAP_FORK_NEST_DEBUG 0
/*
 * vm_map_fork_unnest:
 *
 * Undo pre-nesting in "new_pmap" for the part of [start, end) that
 * falls inside the pre-nested range [pre_nested_start, pre_nested_end).
 * The actual unnest is widened to whole nesting granules
 * (pmap_shared_region_size_min()).  No-op when nothing was pre-nested
 * or the range is disjoint from the pre-nested one.
 */
static inline void
vm_map_fork_unnest(
	pmap_t new_pmap,
	vm_map_offset_t pre_nested_start,
	vm_map_offset_t pre_nested_end,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_offset_t nest_mask;
	vm_map_offset_t unnest_start;
	vm_map_offset_t unnest_end;
	kern_return_t kr;

	assertf(pre_nested_start <= pre_nested_end,
	    "pre_nested start 0x%llx end 0x%llx",
	    (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
	assertf(start <= end,
	    "start 0x%llx end 0x%llx",
	    (uint64_t) start, (uint64_t)end);

	if (pre_nested_start == pre_nested_end ||
	    end <= pre_nested_start ||
	    start >= pre_nested_end) {
		/* nothing pre-nested, or range disjoint from it: done */
		return;
	}

	/* clip the range to the pre-nested portion */
	if (start < pre_nested_start) {
		start = pre_nested_start;
	}
	if (pre_nested_end < end) {
		end = pre_nested_end;
	}

	/* widen to whole nesting granules and unnest */
	nest_mask = pmap_shared_region_size_min(new_pmap) - 1;
	unnest_start = start & ~nest_mask;
	unnest_end = (end + nest_mask) & ~nest_mask;
	kr = pmap_unnest(new_pmap,
	    (addr64_t)unnest_start,
	    (uint64_t)(unnest_end - unnest_start));
#if PMAP_FORK_NEST_DEBUG
	printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)unnest_start, (uint64_t)unnest_end, kr);
#endif /* PMAP_FORK_NEST_DEBUG */
	assertf(kr == KERN_SUCCESS,
	    "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
	    (uint64_t)start, (uint64_t)end, new_pmap,
	    (uint64_t)unnest_start, (uint64_t)(unnest_end - unnest_start),
	    kr);
}
#endif /* PMAP_FORK_NEST */
13264
13265 void
vm_map_inherit_limits(vm_map_t new_map,const struct _vm_map * old_map)13266 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13267 {
13268 new_map->size_limit = old_map->size_limit;
13269 new_map->data_limit = old_map->data_limit;
13270 new_map->user_wire_limit = old_map->user_wire_limit;
13271 new_map->reserved_regions = old_map->reserved_regions;
13272 }
13273
13274 /*
13275 * vm_map_fork:
13276 *
13277 * Create and return a new map based on the old
13278 * map, according to the inheritance values on the
13279 * regions in that map and the options.
13280 *
13281 * The source map must not be locked.
13282 */
vm_map_t
vm_map_fork(
	ledger_t        ledger,
	vm_map_t        old_map,
	int             options)
{
	pmap_t          new_pmap;
	vm_map_t        new_map;
	vm_map_entry_t  old_entry;
	vm_map_size_t   new_size = 0, entry_size;
	vm_map_entry_t  new_entry;
	boolean_t       src_needs_copy;
	boolean_t       new_entry_needs_copy;
	boolean_t       pmap_is64bit;
	int             vm_map_copyin_flags;
	vm_inherit_t    old_entry_inheritance;
	int             map_create_options;
	kern_return_t   footprint_collect_kr;

	/* reject any option bits we don't know about */
	if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
	    VM_MAP_FORK_PRESERVE_PURGEABLE |
	    VM_MAP_FORK_CORPSE_FOOTPRINT)) {
		/* unsupported option */
		return VM_MAP_NULL;
	}

	pmap_is64bit =
#if defined(__i386__) || defined(__x86_64__)
	    old_map->pmap->pm_task_map != TASK_MAP_32BIT;
#elif defined(__arm64__)
	    old_map->pmap->is_64bit;
#else
#error Unknown architecture.
#endif

	/* the child's pmap inherits the parent's address-width/JOP/Rosetta traits */
	unsigned int pmap_flags = 0;
	pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
#if defined(HAS_APPLE_PAC)
	pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
#endif
#if CONFIG_ROSETTA
	pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
#endif
#if PMAP_CREATE_FORCE_4K_PAGES
	if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
	    PAGE_SIZE != FOURK_PAGE_SIZE) {
		pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
	}
#endif /* PMAP_CREATE_FORCE_4K_PAGES */
	new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
	if (new_pmap == NULL) {
		return VM_MAP_NULL;
	}

	vm_map_reference(old_map);
	vm_map_lock(old_map);

	map_create_options = 0;
	if (old_map->hdr.entries_pageable) {
		map_create_options |= VM_MAP_CREATE_PAGEABLE;
	}
	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
		map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
		/* only read below when the CORPSE_FOOTPRINT option is set */
		footprint_collect_kr = KERN_SUCCESS;
	}
	new_map = vm_map_create_options(new_pmap,
	    old_map->min_offset,
	    old_map->max_offset,
	    map_create_options);

	/* inherit cs_enforcement */
	vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);

	vm_map_lock(new_map);
	vm_commit_pagezero_status(new_map);
	/* inherit the parent map's page size */
	vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));

	/* inherit the parent rlimits */
	vm_map_inherit_limits(new_map, old_map);

#if CONFIG_MAP_RANGES
	/* inherit the parent map's VM ranges */
	vm_map_range_fork(new_map, old_map);
#endif

#if CODE_SIGNING_MONITOR
	/* Prepare the monitor for the fork */
	csm_fork_prepare(old_map->pmap, new_pmap);
#endif

#if PMAP_FORK_NEST
	/*
	 * Pre-nest the shared region's pmap.
	 * Unnecessary pre-nesting is undone per-entry in the loop below,
	 * and for any trailing hole after the loop.
	 */
	vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
	pmap_fork_nest(old_map->pmap, new_pmap,
	    &pre_nested_start, &pre_nested_end);
#if PMAP_FORK_NEST_DEBUG
	printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
#endif /* PMAP_FORK_NEST_DEBUG */
#endif /* PMAP_FORK_NEST */

	/* walk every entry of the parent map, dispatching on its inheritance */
	for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
		/*
		 * Abort any corpse collection if the system is shutting down.
		 */
		if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
		    get_system_inshutdown()) {
#if PMAP_FORK_NEST
			new_entry = vm_map_last_entry(new_map);
			if (new_entry == vm_map_to_entry(new_map)) {
				/* unnest all that was pre-nested */
				vm_map_fork_unnest(new_pmap,
				    pre_nested_start, pre_nested_end,
				    vm_map_min(new_map), vm_map_max(new_map));
			} else if (new_entry->vme_end < vm_map_max(new_map)) {
				/* unnest hole at the end, if pre-nested */
				vm_map_fork_unnest(new_pmap,
				    pre_nested_start, pre_nested_end,
				    new_entry->vme_end, vm_map_max(new_map));
			}
#endif /* PMAP_FORK_NEST */
			vm_map_corpse_footprint_collect_done(new_map);
			vm_map_unlock(new_map);
			vm_map_unlock(old_map);
			vm_map_deallocate(new_map);
			vm_map_deallocate(old_map);
			printf("Aborting corpse map due to system shutdown\n");
			return VM_MAP_NULL;
		}

		entry_size = old_entry->vme_end - old_entry->vme_start;

#if PMAP_FORK_NEST
		/*
		 * Undo any unnecessary pre-nesting.
		 */
		vm_map_offset_t prev_end;
		if (old_entry == vm_map_first_entry(old_map)) {
			prev_end = vm_map_min(old_map);
		} else {
			prev_end = old_entry->vme_prev->vme_end;
		}
		if (prev_end < old_entry->vme_start) {
			/* unnest hole before this entry, if pre-nested */
			vm_map_fork_unnest(new_pmap,
			    pre_nested_start, pre_nested_end,
			    prev_end, old_entry->vme_start);
		}
		if (old_entry->is_sub_map && old_entry->use_pmap) {
			/* keep this entry nested in the child */
#if PMAP_FORK_NEST_DEBUG
			printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
#endif /* PMAP_FORK_NEST_DEBUG */
		} else {
			/* undo nesting for this entry, if pre-nested */
			vm_map_fork_unnest(new_pmap,
			    pre_nested_start, pre_nested_end,
			    old_entry->vme_start, old_entry->vme_end);
		}
#endif /* PMAP_FORK_NEST */

		old_entry_inheritance = old_entry->inheritance;
		/*
		 * If caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option
		 * share VM_INHERIT_NONE entries that are not backed by a
		 * device pager.
		 */
		if (old_entry_inheritance == VM_INHERIT_NONE &&
		    (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
		    (old_entry->protection & VM_PROT_READ) &&
		    !(!old_entry->is_sub_map &&
		    VME_OBJECT(old_entry) != NULL &&
		    VME_OBJECT(old_entry)->pager != NULL &&
		    is_device_pager_ops(
			    VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
			old_entry_inheritance = VM_INHERIT_SHARE;
		}

		if (old_entry_inheritance != VM_INHERIT_NONE &&
		    (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
		    footprint_collect_kr == KERN_SUCCESS) {
			/*
			 * The corpse won't have old_map->pmap to query
			 * footprint information, so collect that data now
			 * and store it in new_map->vmmap_corpse_footprint
			 * for later autopsy.
			 */
			footprint_collect_kr =
			    vm_map_corpse_footprint_collect(old_map,
			    old_entry,
			    new_map);
		}

		switch (old_entry_inheritance) {
		case VM_INHERIT_NONE:
			break;

		case VM_INHERIT_SHARE:
			vm_map_fork_share(old_map, old_entry, new_map);
			new_size += entry_size;
			break;

		case VM_INHERIT_COPY:

			/*
			 * Inline the copy_quickly case;
			 * upon failure, fall back on call
			 * to vm_map_fork_copy.
			 */

			if (old_entry->is_sub_map) {
				break;
			}
			if ((old_entry->wired_count != 0) ||
			    ((VME_OBJECT(old_entry) != NULL) &&
			    (VME_OBJECT(old_entry)->true_share))) {
				goto slow_vm_map_fork_copy;
			}

			new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
			vm_map_entry_copy(old_map, new_entry, old_entry);
			if (old_entry->vme_permanent) {
				/* inherit "permanent" on fork() */
				new_entry->vme_permanent = TRUE;
			}

			if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
				new_map->jit_entry_exists = TRUE;
			}

			if (new_entry->is_sub_map) {
				/* clear address space specifics */
				new_entry->use_pmap = FALSE;
			} else {
				/*
				 * We're dealing with a copy-on-write operation,
				 * so the resulting mapping should not inherit
				 * the original mapping's accounting settings.
				 * "iokit_acct" should have been cleared in
				 * vm_map_entry_copy().
				 * "use_pmap" should be reset to its default
				 * (TRUE) so that the new mapping gets
				 * accounted for in the task's memory footprint.
				 */
				assert(!new_entry->iokit_acct);
				new_entry->use_pmap = TRUE;
			}

			if (!vm_object_copy_quickly(
				    VME_OBJECT(new_entry),
				    VME_OFFSET(old_entry),
				    (old_entry->vme_end -
				    old_entry->vme_start),
				    &src_needs_copy,
				    &new_entry_needs_copy)) {
				vm_map_entry_dispose(new_entry);
				goto slow_vm_map_fork_copy;
			}

			/*
			 * Handle copy-on-write obligations
			 */

			if (src_needs_copy && !old_entry->needs_copy) {
				vm_prot_t prot;

				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));

				/* write-protect the parent's mapping to force COW faults */
				prot = old_entry->protection & ~VM_PROT_WRITE;

				if (override_nx(old_map, VME_ALIAS(old_entry))
				    && prot) {
					prot |= VM_PROT_EXECUTE;
				}

				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));

				vm_object_pmap_protect(
					VME_OBJECT(old_entry),
					VME_OFFSET(old_entry),
					(old_entry->vme_end -
					old_entry->vme_start),
					((old_entry->is_shared
					|| old_map->mapped_in_other_pmaps)
					? PMAP_NULL :
					old_map->pmap),
					VM_MAP_PAGE_SIZE(old_map),
					old_entry->vme_start,
					prot);

				assert(old_entry->wired_count == 0);
				old_entry->needs_copy = TRUE;
			}
			new_entry->needs_copy = new_entry_needs_copy;

			/*
			 * Insert the entry at the end
			 * of the map.
			 */

			vm_map_store_entry_link(new_map,
			    vm_map_last_entry(new_map),
			    new_entry,
			    VM_MAP_KERNEL_FLAGS_NONE);
			new_size += entry_size;
			break;

slow_vm_map_fork_copy:
			vm_map_copyin_flags = VM_MAP_COPYIN_FORK;
			if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
				vm_map_copyin_flags |=
				    VM_MAP_COPYIN_PRESERVE_PURGEABLE;
			}
			if (vm_map_fork_copy(old_map,
			    &old_entry,
			    new_map,
			    vm_map_copyin_flags)) {
				new_size += entry_size;
			}
			/* vm_map_fork_copy() already advanced old_entry */
			continue;
		}
		old_entry = old_entry->vme_next;
	}

#if PMAP_FORK_NEST
	new_entry = vm_map_last_entry(new_map);
	if (new_entry == vm_map_to_entry(new_map)) {
		/* unnest all that was pre-nested */
		vm_map_fork_unnest(new_pmap,
		    pre_nested_start, pre_nested_end,
		    vm_map_min(new_map), vm_map_max(new_map));
	} else if (new_entry->vme_end < vm_map_max(new_map)) {
		/* unnest hole at the end, if pre-nested */
		vm_map_fork_unnest(new_pmap,
		    pre_nested_start, pre_nested_end,
		    new_entry->vme_end, vm_map_max(new_map));
	}
#endif /* PMAP_FORK_NEST */

#if defined(__arm64__)
	pmap_insert_commpage(new_map->pmap);
#endif /* __arm64__ */

	new_map->size = new_size;

	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
		vm_map_corpse_footprint_collect_done(new_map);
	}

	/* Propagate JIT entitlement for the pmap layer. */
	if (pmap_get_jit_entitled(old_map->pmap)) {
		/* Tell the pmap that it supports JIT. */
		pmap_set_jit_entitled(new_map->pmap);
	}

	/* Propagate TPRO settings for the pmap layer */
	if (pmap_get_tpro(old_map->pmap)) {
		/* Tell the pmap that it supports TPRO */
		pmap_set_tpro(new_map->pmap);
	}

	vm_map_unlock(new_map);
	vm_map_unlock(old_map);
	vm_map_deallocate(old_map);

	return new_map;
}
13652
13653 /*
13654 * vm_map_exec:
13655 *
13656 * Setup the "new_map" with the proper execution environment according
13657 * to the type of executable (platform, 64bit, chroot environment).
13658 * Map the comm page and shared region, etc...
13659 */
13660 kern_return_t
vm_map_exec(vm_map_t new_map,task_t task,boolean_t is64bit,void * fsroot,cpu_type_t cpu,cpu_subtype_t cpu_subtype,boolean_t reslide,boolean_t is_driverkit,uint32_t rsr_version)13661 vm_map_exec(
13662 vm_map_t new_map,
13663 task_t task,
13664 boolean_t is64bit,
13665 void *fsroot,
13666 cpu_type_t cpu,
13667 cpu_subtype_t cpu_subtype,
13668 boolean_t reslide,
13669 boolean_t is_driverkit,
13670 uint32_t rsr_version)
13671 {
13672 SHARED_REGION_TRACE_DEBUG(
13673 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13674 (void *)VM_KERNEL_ADDRPERM(current_task()),
13675 (void *)VM_KERNEL_ADDRPERM(new_map),
13676 (void *)VM_KERNEL_ADDRPERM(task),
13677 (void *)VM_KERNEL_ADDRPERM(fsroot),
13678 cpu,
13679 cpu_subtype));
13680 (void) vm_commpage_enter(new_map, task, is64bit);
13681
13682 (void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13683
13684 SHARED_REGION_TRACE_DEBUG(
13685 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13686 (void *)VM_KERNEL_ADDRPERM(current_task()),
13687 (void *)VM_KERNEL_ADDRPERM(new_map),
13688 (void *)VM_KERNEL_ADDRPERM(task),
13689 (void *)VM_KERNEL_ADDRPERM(fsroot),
13690 cpu,
13691 cpu_subtype));
13692
13693 /*
13694 * Some devices have region(s) of memory that shouldn't get allocated by
13695 * user processes. The following code creates dummy vm_map_entry_t's for each
13696 * of the regions that needs to be reserved to prevent any allocations in
13697 * those regions.
13698 */
13699 kern_return_t kr = KERN_FAILURE;
13700 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
13701 vmk_flags.vmkf_beyond_max = true;
13702
13703 const struct vm_reserved_region *regions = NULL;
13704 size_t num_regions = ml_get_vm_reserved_regions(is64bit, ®ions);
13705 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13706
13707 for (size_t i = 0; i < num_regions; ++i) {
13708 vm_map_offset_t address = regions[i].vmrr_addr;
13709
13710 kr = vm_map_enter(
13711 new_map,
13712 &address,
13713 regions[i].vmrr_size,
13714 (vm_map_offset_t)0,
13715 vmk_flags,
13716 VM_OBJECT_NULL,
13717 (vm_object_offset_t)0,
13718 FALSE,
13719 VM_PROT_NONE,
13720 VM_PROT_NONE,
13721 VM_INHERIT_COPY);
13722
13723 if (kr != KERN_SUCCESS) {
13724 panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
13725 }
13726 }
13727
13728 new_map->reserved_regions = (num_regions ? TRUE : FALSE);
13729
13730 return KERN_SUCCESS;
13731 }
13732
/*
 * Telemetry counters for the object-copy paths ("slowly",
 * "strategically" and "shadow") of vm_map_lookup_and_lock_object():
 * call counts, cumulative sizes, per-call maxima, restarts and errors.
 * They are only defined (zero-initialized) here; presumably updated
 * inside vm_map_lookup_and_lock_object() — confirm in its body.
 */
uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13746 /*
13747 * vm_map_lookup_and_lock_object:
13748 *
13749 * Finds the VM object, offset, and
13750 * protection for a given virtual address in the
13751 * specified map, assuming a page fault of the
13752 * type specified.
13753 *
13754 * Returns the (object, offset, protection) for
13755 * this address, whether it is wired down, and whether
13756 * this map has the only reference to the data in question.
13757 * In order to later verify this lookup, a "version"
13758 * is returned.
13759 * If contended != NULL, *contended will be set to
13760 * true iff the thread had to spin or block to acquire
13761 * an exclusive lock.
13762 *
13763 * The map MUST be locked by the caller and WILL be
13764 * locked on exit. In order to guarantee the
13765 * existence of the returned object, it is returned
13766 * locked.
13767 *
13768 * If a lookup is requested with "write protection"
13769 * specified, the map may be changed to perform virtual
13770 * copying operations, although the data referenced will
13771 * remain the same.
13772 */
kern_return_t
vm_map_lookup_and_lock_object(
	vm_map_t                *var_map,       /* IN/OUT */
	vm_map_offset_t         vaddr,
	vm_prot_t               fault_type,
	int                     object_lock_type,
	vm_map_version_t        *out_version,   /* OUT */
	vm_object_t             *object,        /* OUT */
	vm_object_offset_t      *offset,        /* OUT */
	vm_prot_t               *out_prot,      /* OUT */
	boolean_t               *wired,         /* OUT */
	vm_object_fault_info_t  fault_info,     /* OUT */
	vm_map_t                *real_map,      /* OUT */
	bool                    *contended)     /* OUT */
{
	vm_map_entry_t          entry;
	vm_map_t                map = *var_map;
	vm_map_t                old_map = *var_map;       /* top-level map we started from */
	vm_map_t                cow_sub_map_parent = VM_MAP_NULL; /* parent map, when COW-faulting through a submap */
	vm_map_offset_t         cow_parent_vaddr = 0;     /* fault address in that parent map */
	vm_map_offset_t         old_start = 0;            /* entry bounds in the top-level map */
	vm_map_offset_t         old_end = 0;
	vm_prot_t               prot;
	boolean_t               mask_protections;
	boolean_t               force_copy;
	boolean_t               no_force_copy_if_executable;
	boolean_t               submap_needed_copy;
	vm_prot_t               original_fault_type;
	vm_map_size_t           fault_page_mask;

	/*
	 * VM_PROT_MASK means that the caller wants us to use "fault_type"
	 * as a mask against the mapping's actual protections, not as an
	 * absolute value.
	 */
	mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
	force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
	no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
	/* strip the modifier bits; only the access bits drive the lookup */
	fault_type &= VM_PROT_ALL;
	original_fault_type = fault_type;
	if (contended) {
		*contended = false;
	}

	*real_map = map;

	/* truncate the fault address to the smaller of the map's and kernel's page size */
	fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
	vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);

RetryLookup:
	/* restart with the caller's original fault type (mask_protections may have narrowed it) */
	fault_type = original_fault_type;

	/*
	 * If the map has an interesting hint, try it before calling
	 * full blown lookup routine.
	 */
	entry = map->hint;

	if ((entry == vm_map_to_entry(map)) ||
	    (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
		vm_map_entry_t tmp_entry;

		/*
		 * Entry was either not a valid hint, or the vaddr
		 * was not contained in the entry, so do a full lookup.
		 */
		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
			/* address not mapped: drop any extra locks we picked up */
			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
				vm_map_unlock(cow_sub_map_parent);
			}
			if ((*real_map != map)
			    && (*real_map != cow_sub_map_parent)) {
				vm_map_unlock(*real_map);
			}
			return KERN_INVALID_ADDRESS;
		}

		entry = tmp_entry;
	}
	if (map == old_map) {
		/* remember the top-level entry's bounds for later clipping */
		old_start = entry->vme_start;
		old_end = entry->vme_end;
	}

	/*
	 * Handle submaps.  Drop lock on upper map, submap is
	 * returned locked.
	 */

	submap_needed_copy = FALSE;
submap_recurse:
	if (entry->is_sub_map) {
		vm_map_offset_t         local_vaddr;
		vm_map_offset_t         end_delta;
		vm_map_offset_t         start_delta;
		vm_map_offset_t         top_entry_saved_start;
		vm_object_offset_t      top_entry_saved_offset;
		vm_map_entry_t          submap_entry, saved_submap_entry;
		vm_object_offset_t      submap_entry_offset;
		vm_object_size_t        submap_entry_size;
		vm_prot_t               subentry_protection;
		vm_prot_t               subentry_max_protection;
		boolean_t               subentry_no_copy_on_read;
		boolean_t               subentry_permanent;
		boolean_t               subentry_csm_associated;
#if __arm64e__
		boolean_t               subentry_used_for_tpro;
#endif /* __arm64e__ */
		boolean_t               mapped_needs_copy = FALSE;
		vm_map_version_t        version;

		/* a submap's page size must be at least as large as its parent's */
		assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
		    "map %p (%d) entry %p submap %p (%d)\n",
		    map, VM_MAP_PAGE_SHIFT(map), entry,
		    VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));

		local_vaddr = vaddr;
		/* snapshot before the entry can go away with the map lock */
		top_entry_saved_start = entry->vme_start;
		top_entry_saved_offset = VME_OFFSET(entry);

		if ((entry->use_pmap &&
		    !((fault_type & VM_PROT_WRITE) ||
		    force_copy))) {
			/*
			 * Read fault on a pmap-nested submap: fault directly
			 * against the submap.
			 */
			/* if real_map equals map we unlock below */
			if ((*real_map != map) &&
			    (*real_map != cow_sub_map_parent)) {
				vm_map_unlock(*real_map);
			}
			*real_map = VME_SUBMAP(entry);
		}

		if (entry->needs_copy &&
		    ((fault_type & VM_PROT_WRITE) ||
		    force_copy)) {
			/*
			 * Write fault on a COW submap entry: we will have to
			 * break the sharing, so keep the parent map locked
			 * (as cow_sub_map_parent) while descending.
			 */
			if (!mapped_needs_copy) {
				if (vm_map_lock_read_to_write(map)) {
					/* lost the lock during upgrade: start over */
					vm_map_lock_read(map);
					*real_map = map;
					goto RetryLookup;
				}
				vm_map_lock_read(VME_SUBMAP(entry));
				*var_map = VME_SUBMAP(entry);
				cow_sub_map_parent = map;
				/* reset base to map before cow object */
				/* this is the map which will accept */
				/* the new cow object */
				old_start = entry->vme_start;
				old_end = entry->vme_end;
				cow_parent_vaddr = vaddr;
				mapped_needs_copy = TRUE;
			} else {
				vm_map_lock_read(VME_SUBMAP(entry));
				*var_map = VME_SUBMAP(entry);
				if ((cow_sub_map_parent != map) &&
				    (*real_map != map)) {
					vm_map_unlock(map);
				}
			}
		} else {
			if (entry->needs_copy) {
				/* read fault through a COW submap: remember to
				 * demote write permission at the end */
				submap_needed_copy = TRUE;
			}
			vm_map_lock_read(VME_SUBMAP(entry));
			*var_map = VME_SUBMAP(entry);
			/* leave map locked if it is a target */
			/* cow sub_map above otherwise, just  */
			/* follow the maps down to the object */
			/* here we unlock knowing we are not  */
			/* revisiting the map.                */
			if ((*real_map != map) && (map != cow_sub_map_parent)) {
				vm_map_unlock_read(map);
			}
		}

		entry = NULL;
		map = *var_map;

		/* calculate the offset in the submap for vaddr */
		local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
		assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
		    "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
		    (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);

RetrySubMap:
		if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
			/* hole in the submap: unwind all held locks */
			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
				vm_map_unlock(cow_sub_map_parent);
			}
			if ((*real_map != map)
			    && (*real_map != cow_sub_map_parent)) {
				vm_map_unlock(*real_map);
			}
			*real_map = map;
			return KERN_INVALID_ADDRESS;
		}

		/* find the attenuated shadow of the underlying object */
		/* on our target map */

		/* in english the submap object may extend beyond the     */
		/* region mapped by the entry or, may only fill a portion  */
		/* of it.  For our purposes, we only care if the object    */
		/* doesn't fill.  In this case the area which will         */
		/* ultimately be clipped in the top map will only need     */
		/* to be as big as the portion of the underlying entry     */
		/* which is mapped */
		start_delta = submap_entry->vme_start > top_entry_saved_offset ?
		    submap_entry->vme_start - top_entry_saved_offset : 0;

		end_delta =
		    (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
		    submap_entry->vme_end ?
		    0 : (top_entry_saved_offset +
		    (old_end - old_start))
		    - submap_entry->vme_end;

		/* shrink the top-level range to what the submap entry actually covers */
		old_start += start_delta;
		old_end -= end_delta;

		if (submap_entry->is_sub_map) {
			/* submaps can nest: recurse one level deeper */
			entry = submap_entry;
			vaddr = local_vaddr;
			goto submap_recurse;
		}

		if (((fault_type & VM_PROT_WRITE) ||
		    force_copy)
		    && cow_sub_map_parent) {
			/*
			 * COW break: copy (or shadow) the submap's object and
			 * substitute the copy into the parent map's entry,
			 * bypassing the submap from now on.
			 */
			vm_object_t     sub_object, copy_object;
			vm_object_offset_t copy_offset;
			vm_map_offset_t local_start;
			vm_map_offset_t local_end;
			boolean_t       object_copied = FALSE;
			vm_object_offset_t object_copied_offset = 0;
			boolean_t       object_copied_needs_copy = FALSE;
			kern_return_t   kr = KERN_SUCCESS;

			if (vm_map_lock_read_to_write(map)) {
				/* lost the submap lock: restore the range and retry */
				vm_map_lock_read(map);
				old_start -= start_delta;
				old_end += end_delta;
				goto RetrySubMap;
			}


			sub_object = VME_OBJECT(submap_entry);
			if (sub_object == VM_OBJECT_NULL) {
				/* no backing object yet: create an empty one */
				sub_object =
				    vm_object_allocate(
					(vm_map_size_t)
					(submap_entry->vme_end -
					submap_entry->vme_start));
				VME_OBJECT_SET(submap_entry, sub_object, false, 0);
				VME_OFFSET_SET(submap_entry, 0);
				assert(!submap_entry->is_sub_map);
				assert(submap_entry->use_pmap);
			}
			/* clip the submap entry to the faulting range */
			local_start = local_vaddr -
			    (cow_parent_vaddr - old_start);
			local_end = local_vaddr +
			    (old_end - cow_parent_vaddr);
			vm_map_clip_start(map, submap_entry, local_start);
			vm_map_clip_end(map, submap_entry, local_end);
			if (submap_entry->is_sub_map) {
				/* unnesting was done when clipping */
				assert(!submap_entry->use_pmap);
			}

			/* This is the COW case, lets connect */
			/* an entry in our space to the underlying */
			/* object in the submap, bypassing the  */
			/* submap. */
			submap_entry_offset = VME_OFFSET(submap_entry);
			submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;

			if ((submap_entry->wired_count != 0 ||
			    sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
			    (submap_entry->protection & VM_PROT_EXECUTE) &&
			    no_force_copy_if_executable) {
				/*
				 * Caller asked us to fail rather than force a
				 * copy of an executable mapping.
				 */
				// printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
					vm_map_unlock(cow_sub_map_parent);
				}
				if ((*real_map != map)
				    && (*real_map != cow_sub_map_parent)) {
					vm_map_unlock(*real_map);
				}
				*real_map = map;
				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
				/* caller expects the map locked (read) on error return */
				vm_map_lock_write_to_read(map);
				kr = KERN_PROTECTION_FAILURE;
				DTRACE_VM4(submap_no_copy_executable,
				    vm_map_t, map,
				    vm_object_offset_t, submap_entry_offset,
				    vm_object_size_t, submap_entry_size,
				    int, kr);
				return kr;
			}

			if (submap_entry->wired_count != 0) {
				/*
				 * Wired mapping: must copy the pages now, with
				 * the map unlocked (vm_object_copy_slowly can
				 * fault/block).  The timestamp check below
				 * detects whether the map changed meanwhile.
				 */
				vm_object_reference(sub_object);

				assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
				    "submap_entry %p offset 0x%llx\n",
				    submap_entry, VME_OFFSET(submap_entry));

				DTRACE_VM6(submap_copy_slowly,
				    vm_map_t, cow_sub_map_parent,
				    vm_map_offset_t, vaddr,
				    vm_map_t, map,
				    vm_object_size_t, submap_entry_size,
				    int, submap_entry->wired_count,
				    int, sub_object->copy_strategy);

				saved_submap_entry = submap_entry;
				version.main_timestamp = map->timestamp;
				vm_map_unlock(map); /* Increments timestamp by 1 */
				submap_entry = VM_MAP_ENTRY_NULL;

				vm_object_lock(sub_object);
				kr = vm_object_copy_slowly(sub_object,
				    submap_entry_offset,
				    submap_entry_size,
				    FALSE,
				    &copy_object);
				object_copied = TRUE;
				object_copied_offset = 0;
				/* 4k: account for extra offset in physical page */
				object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
				object_copied_needs_copy = FALSE;
				vm_object_deallocate(sub_object);

				vm_map_lock(map);

				if (kr != KERN_SUCCESS &&
				    kr != KERN_MEMORY_RESTART_COPY) {
					/* copy failed: unwind locks and bail out */
					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
						vm_map_unlock(cow_sub_map_parent);
					}
					if ((*real_map != map)
					    && (*real_map != cow_sub_map_parent)) {
						vm_map_unlock(*real_map);
					}
					*real_map = map;
					vm_object_deallocate(copy_object);
					copy_object = VM_OBJECT_NULL;
					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
					vm_map_lock_write_to_read(map);
					DTRACE_VM4(submap_copy_error_slowly,
					    vm_object_t, sub_object,
					    vm_object_offset_t, submap_entry_offset,
					    vm_object_size_t, submap_entry_size,
					    int, kr);
					vm_map_lookup_and_lock_object_copy_slowly_error++;
					return kr;
				}

				if ((kr == KERN_SUCCESS) &&
				    (version.main_timestamp + 1) == map->timestamp) {
					/* map unchanged while unlocked: entry still valid */
					submap_entry = saved_submap_entry;
				} else {
					/* map changed: discard the copy and retry */
					saved_submap_entry = NULL;
					old_start -= start_delta;
					old_end += end_delta;
					vm_object_deallocate(copy_object);
					copy_object = VM_OBJECT_NULL;
					vm_map_lock_write_to_read(map);
					vm_map_lookup_and_lock_object_copy_slowly_restart++;
					goto RetrySubMap;
				}
				vm_map_lookup_and_lock_object_copy_slowly_count++;
				vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
					vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
				}
			} else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
				/*
				 * Non-symmetric copy strategy: let the object
				 * layer pick the appropriate copy technique.
				 */
				submap_entry_offset = VME_OFFSET(submap_entry);
				copy_object = VM_OBJECT_NULL;
				object_copied_offset = submap_entry_offset;
				object_copied_needs_copy = FALSE;
				DTRACE_VM6(submap_copy_strategically,
				    vm_map_t, cow_sub_map_parent,
				    vm_map_offset_t, vaddr,
				    vm_map_t, map,
				    vm_object_size_t, submap_entry_size,
				    int, submap_entry->wired_count,
				    int, sub_object->copy_strategy);
				kr = vm_object_copy_strategically(
					sub_object,
					submap_entry_offset,
					submap_entry->vme_end - submap_entry->vme_start,
					false, /* forking */
					&copy_object,
					&object_copied_offset,
					&object_copied_needs_copy);
				if (kr == KERN_MEMORY_RESTART_COPY) {
					/* object asked us to retry: restore range and redo the lookup */
					old_start -= start_delta;
					old_end += end_delta;
					vm_object_deallocate(copy_object);
					copy_object = VM_OBJECT_NULL;
					vm_map_lock_write_to_read(map);
					vm_map_lookup_and_lock_object_copy_strategically_restart++;
					goto RetrySubMap;
				}
				if (kr != KERN_SUCCESS) {
					/* copy failed: unwind locks and bail out */
					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
						vm_map_unlock(cow_sub_map_parent);
					}
					if ((*real_map != map)
					    && (*real_map != cow_sub_map_parent)) {
						vm_map_unlock(*real_map);
					}
					*real_map = map;
					vm_object_deallocate(copy_object);
					copy_object = VM_OBJECT_NULL;
					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
					vm_map_lock_write_to_read(map);
					DTRACE_VM4(submap_copy_error_strategically,
					    vm_object_t, sub_object,
					    vm_object_offset_t, submap_entry_offset,
					    vm_object_size_t, submap_entry_size,
					    int, kr);
					vm_map_lookup_and_lock_object_copy_strategically_error++;
					return kr;
				}
				assert(copy_object != VM_OBJECT_NULL);
				assert(copy_object != sub_object);
				object_copied = TRUE;
				vm_map_lookup_and_lock_object_copy_strategically_count++;
				vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
					vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
				}
			} else {
				/* set up shadow object */
				object_copied = FALSE;
				copy_object = sub_object;
				vm_object_lock(sub_object);
				vm_object_reference_locked(sub_object);
				sub_object->shadowed = TRUE;
				vm_object_unlock(sub_object);

				assert(submap_entry->wired_count == 0);
				submap_entry->needs_copy = TRUE;

				prot = submap_entry->protection;
				assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
				/* write-protect so future writes fault and copy */
				prot = prot & ~VM_PROT_WRITE;
				assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));

				if (override_nx(old_map,
				    VME_ALIAS(submap_entry))
				    && prot) {
					prot |= VM_PROT_EXECUTE;
				}

				vm_object_pmap_protect(
					sub_object,
					VME_OFFSET(submap_entry),
					submap_entry->vme_end -
					submap_entry->vme_start,
					(submap_entry->is_shared
					|| map->mapped_in_other_pmaps) ?
					PMAP_NULL : map->pmap,
					VM_MAP_PAGE_SIZE(map),
					submap_entry->vme_start,
					prot);
				vm_map_lookup_and_lock_object_copy_shadow_count++;
				vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
					vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
				}
			}

			/*
			 * Adjust the fault offset to the submap entry.
			 */
			copy_offset = (local_vaddr -
			    submap_entry->vme_start +
			    VME_OFFSET(submap_entry));

			/* This works diffently than the   */
			/* normal submap case. We go back  */
			/* to the parent of the cow map and*/
			/* clip out the target portion of  */
			/* the sub_map, substituting the   */
			/* new copy object,                */

			/* snapshot the subentry's attributes before unlocking */
			subentry_protection = submap_entry->protection;
			subentry_max_protection = submap_entry->max_protection;
			subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
			subentry_permanent = submap_entry->vme_permanent;
			subentry_csm_associated = submap_entry->csm_associated;
#if __arm64e__
			subentry_used_for_tpro = submap_entry->used_for_tpro;
#endif // __arm64e__
			vm_map_unlock(map);
			submap_entry = NULL; /* not valid after map unlock */

			/* switch back to the parent (COW target) map */
			local_start = old_start;
			local_end = old_end;
			map = cow_sub_map_parent;
			*var_map = cow_sub_map_parent;
			vaddr = cow_parent_vaddr;
			cow_sub_map_parent = NULL;

			if (!vm_map_lookup_entry(map,
			    vaddr, &entry)) {
				/* parent mapping disappeared while we copied */
				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
					vm_map_unlock(cow_sub_map_parent);
				}
				if ((*real_map != map)
				    && (*real_map != cow_sub_map_parent)) {
					vm_map_unlock(*real_map);
				}
				*real_map = map;
				vm_object_deallocate(
					copy_object);
				copy_object = VM_OBJECT_NULL;
				vm_map_lock_write_to_read(map);
				DTRACE_VM4(submap_lookup_post_unlock,
				    uint64_t, (uint64_t)entry->vme_start,
				    uint64_t, (uint64_t)entry->vme_end,
				    vm_map_offset_t, vaddr,
				    int, object_copied);
				return KERN_INVALID_ADDRESS;
			}

			/* clip out the portion of space */
			/* mapped by the sub map which   */
			/* corresponds to the underlying */
			/* object */

			/*
			 * Clip (and unnest) the smallest nested chunk
			 * possible around the faulting address...
			 */
			local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
			local_end = local_start + pmap_shared_region_size_min(map->pmap);
			/*
			 * ... but don't go beyond the "old_start" to "old_end"
			 * range, to avoid spanning over another VM region
			 * with a possibly different VM object and/or offset.
			 */
			if (local_start < old_start) {
				local_start = old_start;
			}
			if (local_end > old_end) {
				local_end = old_end;
			}
			/*
			 * Adjust copy_offset to the start of the range.
			 */
			copy_offset -= (vaddr - local_start);

			vm_map_clip_start(map, entry, local_start);
			vm_map_clip_end(map, entry, local_end);
			if (entry->is_sub_map) {
				/* unnesting was done when clipping */
				assert(!entry->use_pmap);
			}

			/* substitute copy object for */
			/* shared map entry           */
			vm_map_deallocate(VME_SUBMAP(entry));
			assert(!entry->iokit_acct);
			entry->use_pmap = TRUE;
			VME_OBJECT_SET(entry, copy_object, false, 0);

			/* propagate the submap entry's protections */
			if (entry->protection != VM_PROT_READ) {
				/*
				 * Someone has already altered the top entry's
				 * protections via vm_protect(VM_PROT_COPY).
				 * Respect these new values and ignore the
				 * submap entry's protections.
				 */
			} else {
				/*
				 * Regular copy-on-write: propagate the submap
				 * entry's protections to the top map entry.
				 */
				entry->protection |= subentry_protection;
			}
			entry->max_protection |= subentry_max_protection;
			/* propagate some attributes from subentry */
			entry->vme_no_copy_on_read = subentry_no_copy_on_read;
			entry->vme_permanent = subentry_permanent;
			entry->csm_associated = subentry_csm_associated;
#if __arm64e__
			/* propagate TPRO iff the destination map has TPRO enabled */
			if (subentry_used_for_tpro && vm_map_tpro(map)) {
				entry->used_for_tpro = subentry_used_for_tpro;
			}
#endif /* __arm64e */
			/* enforce W^X policy: strip execute from writable+executable entries */
			if ((entry->protection & VM_PROT_WRITE) &&
			    (entry->protection & VM_PROT_EXECUTE) &&
#if XNU_TARGET_OS_OSX
			    map->pmap != kernel_pmap &&
			    (vm_map_cs_enforcement(map)
#if __arm64__
			    || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
			    ) &&
#endif /* XNU_TARGET_OS_OSX */
#if CODE_SIGNING_MONITOR
			    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
#endif
			    !(entry->used_for_jit) &&
			    VM_MAP_POLICY_WX_STRIP_X(map)) {
				DTRACE_VM3(cs_wx,
				    uint64_t, (uint64_t)entry->vme_start,
				    uint64_t, (uint64_t)entry->vme_end,
				    vm_prot_t, entry->protection);
				printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
				    proc_selfpid(),
				    (get_bsdtask_info(current_task())
				    ? proc_name_address(get_bsdtask_info(current_task()))
				    : "?"),
				    __FUNCTION__, __LINE__,
#if DEVELOPMENT || DEBUG
				    (uint64_t)entry->vme_start,
				    (uint64_t)entry->vme_end,
#else /* DEVELOPMENT || DEBUG */
				    (uint64_t)0,
				    (uint64_t)0,
#endif /* DEVELOPMENT || DEBUG */
				    entry->protection);
				entry->protection &= ~VM_PROT_EXECUTE;
			}

			if (object_copied) {
				/* a real copy was made: entry owns it outright */
				VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
				entry->needs_copy = object_copied_needs_copy;
				entry->is_shared = FALSE;
			} else {
				/* shadow path: entry still shares and must COW */
				assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
				assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
				assert(entry->wired_count == 0);
				VME_OFFSET_SET(entry, copy_offset);
				entry->needs_copy = TRUE;
				if (map != old_map) {
					entry->is_shared = TRUE;
				}
			}
			if (entry->inheritance == VM_INHERIT_SHARE) {
				entry->inheritance = VM_INHERIT_COPY;
			}

			vm_map_lock_write_to_read(map);
		} else {
			/* plain (non-COW) submap descent: continue in the submap */
			if ((cow_sub_map_parent)
			    && (cow_sub_map_parent != *real_map)
			    && (cow_sub_map_parent != map)) {
				vm_map_unlock(cow_sub_map_parent);
			}
			entry = submap_entry;
			vaddr = local_vaddr;
		}
	}

	/*
	 * Check whether this task is allowed to have
	 * this page.
	 */

	prot = entry->protection;

	if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
		/*
		 * HACK -- if not a stack, then allow execution
		 */
		prot |= VM_PROT_EXECUTE;
	}

#if __arm64e__
	/*
	 * If the entry we're dealing with is TPRO and we have a write
	 * fault, inject VM_PROT_WRITE into protections. This allows us
	 * to maintain RO permissions when not marked as TPRO.
	 */
	if (entry->used_for_tpro && (fault_type & VM_PROT_WRITE)) {
		prot |= VM_PROT_WRITE;
	}
#endif /* __arm64e__ */
	if (mask_protections) {
		/* narrow the requested access to what the mapping allows */
		fault_type &= prot;
		if (fault_type == VM_PROT_NONE) {
			goto protection_failure;
		}
	}
	if (((fault_type & prot) != fault_type)
#if __arm64__
	    /* prefetch abort in execute-only page */
	    && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
#elif defined(__x86_64__)
	    /* Consider the UEXEC bit when handling an EXECUTE fault */
	    && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
#endif
	    ) {
protection_failure:
		if (*real_map != map) {
			vm_map_unlock(*real_map);
		}
		*real_map = map;

		if ((fault_type & VM_PROT_EXECUTE) && prot) {
			log_stack_execution_failure((addr64_t)vaddr, prot);
		}

		DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
		DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
		/*
		 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
		 *
		 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
		 */
		return KERN_PROTECTION_FAILURE;
	}

	/*
	 * If this page is not pageable, we have to get
	 * it for all possible accesses.
	 */

	*wired = (entry->wired_count != 0);
	if (*wired) {
		fault_type = prot;
	}

	/*
	 * If the entry was copy-on-write, we either ...
	 */

	if (entry->needs_copy) {
		/*
		 * If we want to write the page, we may as well
		 * handle that now since we've got the map locked.
		 *
		 * If we don't need to write the page, we just
		 * demote the permissions allowed.
		 */

		if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
			/*
			 * Make a new object, and place it in the
			 * object chain.  Note that no new references
			 * have appeared -- one just moved from the
			 * map to the new object.
			 */

			if (vm_map_lock_read_to_write(map)) {
				vm_map_lock_read(map);
				goto RetryLookup;
			}

			if (VME_OBJECT(entry)->shadowed == FALSE) {
				vm_object_lock(VME_OBJECT(entry));
				VME_OBJECT(entry)->shadowed = TRUE;
				vm_object_unlock(VME_OBJECT(entry));
			}
			VME_OBJECT_SHADOW(entry,
			    (vm_map_size_t) (entry->vme_end -
			    entry->vme_start),
			    vm_map_always_shadow(map));
			entry->needs_copy = FALSE;

			vm_map_lock_write_to_read(map);
		}
		if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
			/*
			 * We're attempting to read a copy-on-write
			 * page -- don't allow writes.
			 */

			prot &= (~VM_PROT_WRITE);
		}
	}

	if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
		/*
		 * We went through a "needs_copy" submap without triggering
		 * a copy, so granting write access to the page would bypass
		 * that submap's "needs_copy".
		 */
		assert(!(fault_type & VM_PROT_WRITE));
		assert(!*wired);
		assert(!force_copy);
		// printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
		prot &= ~VM_PROT_WRITE;
	}

	/*
	 * Create an object if necessary.
	 */
	if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
		if (vm_map_lock_read_to_write(map)) {
			vm_map_lock_read(map);
			goto RetryLookup;
		}

		VME_OBJECT_SET(entry,
		    vm_object_allocate(
			    (vm_map_size_t)(entry->vme_end -
			    entry->vme_start)), false, 0);
		VME_OFFSET_SET(entry, 0);
		assert(entry->use_pmap);
		vm_map_lock_write_to_read(map);
	}

	/*
	 * Return the object/offset from this entry.  If the entry
	 * was copy-on-write or empty, it has been fixed up.  Also
	 * return the protection.
	 */

	*offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
	*object = VME_OBJECT(entry);
	*out_prot = prot;
	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);

	if (fault_info) {
		/* fill in the fault descriptor consumed by vm_fault() */
		fault_info->interruptible = THREAD_UNINT; /* for now... */
		/* ... the caller will change "interruptible" if needed */
		fault_info->cluster_size = 0;
		fault_info->user_tag = VME_ALIAS(entry);
		fault_info->pmap_options = 0;
		if (entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap)) {
			fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
		}
		fault_info->behavior = entry->behavior;
		fault_info->lo_offset = VME_OFFSET(entry);
		fault_info->hi_offset =
		    (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
		fault_info->no_cache = entry->no_cache;
		fault_info->stealth = FALSE;
		fault_info->io_sync = FALSE;
		if (entry->used_for_jit ||
#if CODE_SIGNING_MONITOR
		    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
#endif
		    entry->vme_resilient_codesign) {
			fault_info->cs_bypass = TRUE;
		} else {
			fault_info->cs_bypass = FALSE;
		}
		fault_info->csm_associated = FALSE;
#if CODE_SIGNING_MONITOR
		if (entry->csm_associated) {
			/*
			 * The pmap layer will validate this page
			 * before allowing it to be executed from.
			 */
			fault_info->csm_associated = TRUE;
		}
#endif
		fault_info->mark_zf_absent = FALSE;
		fault_info->batch_pmap_op = FALSE;
		fault_info->resilient_media = entry->vme_resilient_media;
		fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
		fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
#if __arm64e__
		fault_info->fi_used_for_tpro = entry->used_for_tpro;
#else /* __arm64e__ */
		fault_info->fi_used_for_tpro = FALSE;
#endif
		if (entry->translated_allow_execute) {
			fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
		}
	}

	/*
	 * Lock the object to prevent it from disappearing
	 */
	if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
		if (contended == NULL) {
			vm_object_lock(*object);
		} else {
			/* also report whether we had to spin/block for the lock */
			*contended = vm_object_lock_check_contended(*object);
		}
	} else {
		vm_object_lock_shared(*object);
	}

	/*
	 * Save the version number
	 */

	out_version->main_timestamp = map->timestamp;

	return KERN_SUCCESS;
}
14666
14667
14668 /*
14669 * vm_map_verify:
14670 *
14671 * Verifies that the map in question has not changed
14672 * since the given version. The map has to be locked
14673 * ("shared" mode is fine) before calling this function
14674 * and it will be returned locked too.
14675 */
14676 boolean_t
vm_map_verify(vm_map_t map,vm_map_version_t * version)14677 vm_map_verify(
14678 vm_map_t map,
14679 vm_map_version_t *version) /* REF */
14680 {
14681 boolean_t result;
14682
14683 vm_map_lock_assert_held(map);
14684 result = (map->timestamp == version->main_timestamp);
14685
14686 return result;
14687 }
14688
14689 /*
14690 * TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
14691 * Goes away after regular vm_region_recurse function migrates to
14692 * 64 bits
14693 * vm_region_recurse: A form of vm_region which follows the
14694 * submaps in a target map
14695 *
14696 */
14697
/*
 *	vm_map_region_recurse_64:
 *
 *	A form of vm_region which follows the submaps in a target map.
 *	On input, *address is the lookup address, *nesting_depth the maximum
 *	submap depth to descend, and *count the size (in natural_t) of the
 *	caller's info buffer; *count selects between the "short" and the
 *	full (V0/V1/V2) submap info layouts.  On success, *address/*size
 *	describe the region found, *nesting_depth the depth it was found at,
 *	and the info structure is filled in.
 */
kern_return_t
vm_map_region_recurse_64(
	vm_map_t                 map,
	vm_map_offset_t         *address,               /* IN/OUT */
	vm_map_size_t           *size,                  /* OUT */
	natural_t               *nesting_depth,         /* IN/OUT */
	vm_region_submap_info_64_t submap_info,         /* IN/OUT */
	mach_msg_type_number_t  *count)                 /* IN/OUT */
{
	mach_msg_type_number_t  original_count;
	vm_region_extended_info_data_t extended;
	vm_map_entry_t          tmp_entry;
	vm_map_offset_t         user_address;
	unsigned int            user_max_depth;

	/*
	 * "curr_entry" is the VM map entry preceding or including the
	 * address we're looking for.
	 * "curr_map" is the map or sub-map containing "curr_entry".
	 * "curr_address" is the equivalent of the top map's "user_address"
	 * in the current map.
	 * "curr_offset" is the cumulated offset of "curr_map" in the
	 * target task's address space.
	 * "curr_depth" is the depth of "curr_map" in the chain of
	 * sub-maps.
	 *
	 * "curr_max_below" and "curr_max_above" limit the range (around
	 * "curr_address") we should take into account in the current (sub)map.
	 * They limit the range to what's visible through the map entries
	 * we've traversed from the top map to the current map.
	 *
	 */
	vm_map_entry_t          curr_entry;
	vm_map_address_t        curr_address;
	vm_map_offset_t         curr_offset;
	vm_map_t                curr_map;
	unsigned int            curr_depth;
	vm_map_offset_t         curr_max_below, curr_max_above;
	vm_map_offset_t         curr_skip;

	/*
	 * "next_" is the same as "curr_" but for the VM region immediately
	 * after the address we're looking for.  We need to keep track of this
	 * too because we want to return info about that region if the
	 * address we're looking for is not mapped.
	 */
	vm_map_entry_t          next_entry;
	vm_map_offset_t         next_offset;
	vm_map_offset_t         next_address;
	vm_map_t                next_map;
	unsigned int            next_depth;
	vm_map_offset_t         next_max_below, next_max_above;
	vm_map_offset_t         next_skip;

	boolean_t               look_for_pages;
	vm_region_submap_short_info_64_t short_info;
	boolean_t               do_region_footprint;
	int                     effective_page_size, effective_page_shift;
	boolean_t               submap_needed_copy;

	if (map == VM_MAP_NULL) {
		/* no address space to work on */
		return KERN_INVALID_ARGUMENT;
	}

	effective_page_shift = vm_self_region_page_shift(map);
	effective_page_size = (1 << effective_page_shift);

	if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
		/*
		 * "info" structure is not big enough and
		 * would overflow
		 */
		return KERN_INVALID_ARGUMENT;
	}

	do_region_footprint = task_self_region_footprint();
	original_count = *count;

	if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
		/* caller's buffer only holds the short layout */
		*count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
		look_for_pages = FALSE;
		short_info = (vm_region_submap_short_info_64_t) submap_info;
		submap_info = NULL;
	} else {
		look_for_pages = TRUE;
		*count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
		short_info = NULL;

		/* report the newest layout the caller's buffer can hold */
		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
			*count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
		}
		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
			*count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
		}
	}

	user_address = *address;
	user_max_depth = *nesting_depth;
	submap_needed_copy = FALSE;

	/* no locking when called from the kernel debugger (kdp) context */
	if (not_in_kdp) {
		vm_map_lock_read(map);
	}

recurse_again:
	curr_entry = NULL;
	curr_map = map;
	curr_address = user_address;
	curr_offset = 0;
	curr_skip = 0;
	curr_depth = 0;
	curr_max_above = ((vm_map_offset_t) -1) - curr_address;
	curr_max_below = curr_address;

	next_entry = NULL;
	next_map = NULL;
	next_address = 0;
	next_offset = 0;
	next_skip = 0;
	next_depth = 0;
	next_max_above = (vm_map_offset_t) -1;
	next_max_below = (vm_map_offset_t) -1;

	for (;;) {
		if (vm_map_lookup_entry(curr_map,
		    curr_address,
		    &tmp_entry)) {
			/* tmp_entry contains the address we're looking for */
			curr_entry = tmp_entry;
		} else {
			vm_map_offset_t skip;
			/*
			 * The address is not mapped.  "tmp_entry" is the
			 * map entry preceding the address.  We want the next
			 * one, if it exists.
			 */
			curr_entry = tmp_entry->vme_next;

			if (curr_entry == vm_map_to_entry(curr_map) ||
			    (curr_entry->vme_start >=
			    curr_address + curr_max_above)) {
				/* no next entry at this level: stop looking */
				if (not_in_kdp) {
					vm_map_unlock_read(curr_map);
				}
				curr_entry = NULL;
				curr_map = NULL;
				curr_skip = 0;
				curr_offset = 0;
				curr_depth = 0;
				curr_max_above = 0;
				curr_max_below = 0;
				break;
			}

			/* adjust current address and offset */
			skip = curr_entry->vme_start - curr_address;
			curr_address = curr_entry->vme_start;
			curr_skip += skip;
			curr_offset += skip;
			curr_max_above -= skip;
			curr_max_below = 0;
		}

		/*
		 * Is the next entry at this level closer to the address (or
		 * deeper in the submap chain) than the one we had
		 * so far ?
		 */
		tmp_entry = curr_entry->vme_next;
		if (tmp_entry == vm_map_to_entry(curr_map)) {
			/* no next entry at this level */
		} else if (tmp_entry->vme_start >=
		    curr_address + curr_max_above) {
			/*
			 * tmp_entry is beyond the scope of what we mapped of
			 * this submap in the upper level: ignore it.
			 */
		} else if ((next_entry == NULL) ||
		    (tmp_entry->vme_start + curr_offset <=
		    next_entry->vme_start + next_offset)) {
			/*
			 * We didn't have a "next_entry" or this one is
			 * closer to the address we're looking for:
			 * use this "tmp_entry" as the new "next_entry".
			 */
			if (next_entry != NULL) {
				/* unlock the last "next_map" */
				if (next_map != curr_map && not_in_kdp) {
					vm_map_unlock_read(next_map);
				}
			}
			next_entry = tmp_entry;
			next_map = curr_map;
			next_depth = curr_depth;
			next_address = next_entry->vme_start;
			next_skip = curr_skip;
			next_skip += (next_address - curr_address);
			next_offset = curr_offset;
			next_offset += (next_address - curr_address);
			next_max_above = MIN(next_max_above, curr_max_above);
			next_max_above = MIN(next_max_above,
			    next_entry->vme_end - next_address);
			next_max_below = MIN(next_max_below, curr_max_below);
			next_max_below = MIN(next_max_below,
			    next_address - next_entry->vme_start);
		}

		/*
		 * "curr_max_{above,below}" allow us to keep track of the
		 * portion of the submap that is actually mapped at this level:
		 * the rest of that submap is irrelevant to us, since it's not
		 * mapped here.
		 * The relevant portion of the map starts at
		 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
		 */
		curr_max_above = MIN(curr_max_above,
		    curr_entry->vme_end - curr_address);
		curr_max_below = MIN(curr_max_below,
		    curr_address - curr_entry->vme_start);

		if (!curr_entry->is_sub_map ||
		    curr_depth >= user_max_depth) {
			/*
			 * We hit a leaf map or we reached the maximum depth
			 * we could, so stop looking.  Keep the current map
			 * locked.
			 */
			break;
		}

		/*
		 * Get down to the next submap level.
		 */

		if (curr_entry->needs_copy) {
			/* everything below this is effectively copy-on-write */
			submap_needed_copy = TRUE;
		}

		/*
		 * Lock the next level and unlock the current level,
		 * unless we need to keep it locked to access the "next_entry"
		 * later.
		 */
		if (not_in_kdp) {
			vm_map_lock_read(VME_SUBMAP(curr_entry));
		}
		if (curr_map == next_map) {
			/* keep "next_map" locked in case we need it */
		} else {
			/* release this map */
			if (not_in_kdp) {
				vm_map_unlock_read(curr_map);
			}
		}

		/*
		 * Adjust the offset.  "curr_entry" maps the submap
		 * at relative address "curr_entry->vme_start" in the
		 * curr_map but skips the first "VME_OFFSET(curr_entry)"
		 * bytes of the submap.
		 * "curr_offset" always represents the offset of a virtual
		 * address in the curr_map relative to the absolute address
		 * space (i.e. the top-level VM map).
		 */
		curr_offset +=
		    (VME_OFFSET(curr_entry) - curr_entry->vme_start);
		curr_address = user_address + curr_offset;
		/* switch to the submap */
		curr_map = VME_SUBMAP(curr_entry);
		curr_depth++;
		curr_entry = NULL;
	}

// LP64todo: all the current tools are 32bit, obviously never worked for 64b
// so probably should be a real 32b ID vs. ptr.
// Current users just check for equality

	if (curr_entry == NULL) {
		/* no VM region contains the address... */

		if (do_region_footprint &&      /* we want footprint numbers */
		    next_entry == NULL &&       /* & there are no more regions */
		    /* & we haven't already provided our fake region: */
		    user_address <= vm_map_last_entry(map)->vme_end) {
			ledger_amount_t ledger_resident, ledger_compressed;

			/*
			 * Add a fake memory region to account for
			 * purgeable and/or ledger-tagged memory that
			 * counts towards this task's memory footprint,
			 * i.e. the resident/compressed pages of non-volatile
			 * objects owned by that task.
			 */
			task_ledgers_footprint(map->pmap->ledger,
			    &ledger_resident,
			    &ledger_compressed);
			if (ledger_resident + ledger_compressed == 0) {
				/* no purgeable memory usage to report */
				/* map lock was already dropped when the walk ended */
				return KERN_INVALID_ADDRESS;
			}
			/* fake region to show nonvolatile footprint */
			if (look_for_pages) {
				submap_info->protection = VM_PROT_DEFAULT;
				submap_info->max_protection = VM_PROT_DEFAULT;
				submap_info->inheritance = VM_INHERIT_DEFAULT;
				submap_info->offset = 0;
				submap_info->user_tag = -1;
				submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
				submap_info->pages_shared_now_private = 0;
				submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
				submap_info->pages_dirtied = submap_info->pages_resident;
				submap_info->ref_count = 1;
				submap_info->shadow_depth = 0;
				submap_info->external_pager = 0;
				submap_info->share_mode = SM_PRIVATE;
				if (submap_needed_copy) {
					submap_info->share_mode = SM_COW;
				}
				submap_info->is_submap = 0;
				submap_info->behavior = VM_BEHAVIOR_DEFAULT;
				submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				submap_info->user_wired_count = 0;
				submap_info->pages_reusable = 0;
			} else {
				short_info->user_tag = -1;
				short_info->offset = 0;
				short_info->protection = VM_PROT_DEFAULT;
				short_info->inheritance = VM_INHERIT_DEFAULT;
				short_info->max_protection = VM_PROT_DEFAULT;
				short_info->behavior = VM_BEHAVIOR_DEFAULT;
				short_info->user_wired_count = 0;
				short_info->is_submap = 0;
				short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				short_info->external_pager = 0;
				short_info->shadow_depth = 0;
				short_info->share_mode = SM_PRIVATE;
				if (submap_needed_copy) {
					short_info->share_mode = SM_COW;
				}
				short_info->ref_count = 1;
			}
			*nesting_depth = 0;
			*size = (vm_map_size_t) (ledger_resident + ledger_compressed);
//			*address = user_address;
			/* past the last real entry: next call returns KERN_INVALID_ADDRESS */
			*address = vm_map_last_entry(map)->vme_end;
			return KERN_SUCCESS;
		}

		if (next_entry == NULL) {
			/* ... and no VM region follows it either */
			return KERN_INVALID_ADDRESS;
		}
		/* ... gather info about the next VM region */
		curr_entry = next_entry;
		curr_map = next_map;    /* still locked ... */
		curr_address = next_address;
		curr_skip = next_skip;
		curr_offset = next_offset;
		curr_depth = next_depth;
		curr_max_above = next_max_above;
		curr_max_below = next_max_below;
	} else {
		/* we won't need "next_entry" after all */
		if (next_entry != NULL) {
			/* release "next_map" */
			if (next_map != curr_map && not_in_kdp) {
				vm_map_unlock_read(next_map);
			}
		}
	}
	/* reset "next_*"; -1 wraps to the maximum unsigned offset ("no limit") */
	next_entry = NULL;
	next_map = NULL;
	next_offset = 0;
	next_skip = 0;
	next_depth = 0;
	next_max_below = -1;
	next_max_above = -1;

	if (curr_entry->is_sub_map &&
	    curr_depth < user_max_depth) {
		/*
		 * We're not as deep as we could be:  we must have
		 * gone back up after not finding anything mapped
		 * below the original top-level map entry's.
		 * Let's move "curr_address" forward and recurse again.
		 */
		user_address = curr_address;
		goto recurse_again;
	}

	*nesting_depth = curr_depth;
	*size = curr_max_above + curr_max_below;
	*address = user_address + curr_skip - curr_max_below;

	if (look_for_pages) {
		submap_info->user_tag = VME_ALIAS(curr_entry);
		submap_info->offset = VME_OFFSET(curr_entry);
		submap_info->protection = curr_entry->protection;
		submap_info->inheritance = curr_entry->inheritance;
		submap_info->max_protection = curr_entry->max_protection;
		submap_info->behavior = curr_entry->behavior;
		submap_info->user_wired_count = curr_entry->user_wired_count;
		submap_info->is_submap = curr_entry->is_sub_map;
		if (curr_entry->is_sub_map) {
			submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
		} else {
			submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
		}
	} else {
		short_info->user_tag = VME_ALIAS(curr_entry);
		short_info->offset = VME_OFFSET(curr_entry);
		short_info->protection = curr_entry->protection;
		short_info->inheritance = curr_entry->inheritance;
		short_info->max_protection = curr_entry->max_protection;
		short_info->behavior = curr_entry->behavior;
		short_info->user_wired_count = curr_entry->user_wired_count;
		short_info->is_submap = curr_entry->is_sub_map;
		if (curr_entry->is_sub_map) {
			short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
		} else {
			short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
		}
	}

	extended.pages_resident = 0;
	extended.pages_swapped_out = 0;
	extended.pages_shared_now_private = 0;
	extended.pages_dirtied = 0;
	extended.pages_reusable = 0;
	extended.external_pager = 0;
	extended.shadow_depth = 0;
	extended.share_mode = SM_EMPTY;
	extended.ref_count = 0;

	if (not_in_kdp) {
		if (!curr_entry->is_sub_map) {
			vm_map_offset_t range_start, range_end;
			/* clamp the walk to the visible portion of the entry */
			range_start = MAX((curr_address - curr_max_below),
			    curr_entry->vme_start);
			range_end = MIN((curr_address + curr_max_above),
			    curr_entry->vme_end);
			vm_map_region_walk(curr_map,
			    range_start,
			    curr_entry,
			    (VME_OFFSET(curr_entry) +
			    (range_start -
			    curr_entry->vme_start)),
			    range_end - range_start,
			    &extended,
			    look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
			if (extended.external_pager &&
			    extended.ref_count == 2 &&
			    extended.share_mode == SM_SHARED) {
				extended.share_mode = SM_PRIVATE;
			}
			if (submap_needed_copy) {
				extended.share_mode = SM_COW;
			}
		} else {
			if (curr_entry->use_pmap) {
				extended.share_mode = SM_TRUESHARED;
			} else {
				extended.share_mode = SM_PRIVATE;
			}
			extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
		}
	}

	if (look_for_pages) {
		submap_info->pages_resident = extended.pages_resident;
		submap_info->pages_swapped_out = extended.pages_swapped_out;
		submap_info->pages_shared_now_private =
		    extended.pages_shared_now_private;
		submap_info->pages_dirtied = extended.pages_dirtied;
		submap_info->external_pager = extended.external_pager;
		submap_info->shadow_depth = extended.shadow_depth;
		submap_info->share_mode = extended.share_mode;
		submap_info->ref_count = extended.ref_count;

		/* V1/V2 fields only exist if the caller's buffer is big enough */
		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
			submap_info->pages_reusable = extended.pages_reusable;
		}
		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
			if (curr_entry->is_sub_map) {
				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRPERM(VME_SUBMAP(curr_entry));
			} else if (VME_OBJECT(curr_entry)) {
				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRPERM(VME_OBJECT(curr_entry));
			} else {
				submap_info->object_id_full = 0ull;
			}
		}
	} else {
		short_info->external_pager = extended.external_pager;
		short_info->shadow_depth = extended.shadow_depth;
		short_info->share_mode = extended.share_mode;
		short_info->ref_count = extended.ref_count;
	}

	if (not_in_kdp) {
		vm_map_unlock_read(curr_map);
	}

	return KERN_SUCCESS;
}
15205
15206 /*
15207 * vm_region:
15208 *
15209 * User call to obtain information about a region in
15210 * a task's address map. Currently, only one flavor is
15211 * supported.
15212 *
15213 * XXX The reserved and behavior fields cannot be filled
15214 * in until the vm merge from the IK is completed, and
15215 * vm_reserve is implemented.
15216 */
15217
/*
 *	vm_map_region:
 *
 *	Obtain information about a region of a task's address map starting
 *	at (or after) *address.  The "flavor" selects which info structure
 *	is filled in; *count is the size of the caller's buffer on input
 *	and the number of fields actually filled on output.
 *	On success, *address/*size are updated to the region's bounds.
 */
kern_return_t
vm_map_region(
	vm_map_t                map,
	vm_map_offset_t         *address,               /* IN/OUT */
	vm_map_size_t           *size,                  /* OUT */
	vm_region_flavor_t      flavor,                 /* IN */
	vm_region_info_t        info,                   /* OUT */
	mach_msg_type_number_t  *count,                 /* IN/OUT */
	mach_port_t             *object_name)           /* OUT */
{
	vm_map_entry_t          tmp_entry;
	vm_map_entry_t          entry;
	vm_map_offset_t         start;

	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	switch (flavor) {
	case VM_REGION_BASIC_INFO:
		/* legacy for old 32-bit objects info */
	{
		vm_region_basic_info_t  basic;

		if (*count < VM_REGION_BASIC_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}

		basic = (vm_region_basic_info_t) info;
		*count = VM_REGION_BASIC_INFO_COUNT;

		vm_map_lock_read(map);

		start = *address;
		/* if the address isn't mapped, report the next region instead */
		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}
		} else {
			entry = tmp_entry;
		}

		start = entry->vme_start;

		/* 32-bit flavor: offset is truncated */
		basic->offset = (uint32_t)VME_OFFSET(entry);
		basic->protection = entry->protection;
		basic->inheritance = entry->inheritance;
		basic->max_protection = entry->max_protection;
		basic->behavior = entry->behavior;
		basic->user_wired_count = entry->user_wired_count;
		basic->reserved = entry->is_sub_map;
		*address = start;
		*size = (entry->vme_end - start);

		if (object_name) {
			*object_name = IP_NULL;
		}
		if (entry->is_sub_map) {
			basic->shared = FALSE;
		} else {
			basic->shared = entry->is_shared;
		}

		vm_map_unlock_read(map);
		return KERN_SUCCESS;
	}

	case VM_REGION_BASIC_INFO_64:
	{
		vm_region_basic_info_64_t       basic;

		if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
			return KERN_INVALID_ARGUMENT;
		}

		basic = (vm_region_basic_info_64_t) info;
		*count = VM_REGION_BASIC_INFO_COUNT_64;

		vm_map_lock_read(map);

		start = *address;
		/* if the address isn't mapped, report the next region instead */
		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}
		} else {
			entry = tmp_entry;
		}

		start = entry->vme_start;

		basic->offset = VME_OFFSET(entry);
		basic->protection = entry->protection;
		basic->inheritance = entry->inheritance;
		basic->max_protection = entry->max_protection;
		basic->behavior = entry->behavior;
		basic->user_wired_count = entry->user_wired_count;
		basic->reserved = entry->is_sub_map;
		*address = start;
		*size = (entry->vme_end - start);

		if (object_name) {
			*object_name = IP_NULL;
		}
		if (entry->is_sub_map) {
			basic->shared = FALSE;
		} else {
			basic->shared = entry->is_shared;
		}

		vm_map_unlock_read(map);
		return KERN_SUCCESS;
	}
	case VM_REGION_EXTENDED_INFO:
		if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}
		/* falls into the legacy check, which the full count also passes */
		OS_FALLTHROUGH;
	case VM_REGION_EXTENDED_INFO__legacy:
		if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
			return KERN_INVALID_ARGUMENT;
		}

		{
			vm_region_extended_info_t       extended;
			mach_msg_type_number_t original_count;
			int effective_page_size, effective_page_shift;

			extended = (vm_region_extended_info_t) info;

			effective_page_shift = vm_self_region_page_shift(map);
			effective_page_size = (1 << effective_page_shift);

			vm_map_lock_read(map);

			start = *address;
			/* if the address isn't mapped, report the next region instead */
			if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
				if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
					vm_map_unlock_read(map);
					return KERN_INVALID_ADDRESS;
				}
			} else {
				entry = tmp_entry;
			}
			start = entry->vme_start;

			extended->protection = entry->protection;
			extended->user_tag = VME_ALIAS(entry);
			extended->pages_resident = 0;
			extended->pages_swapped_out = 0;
			extended->pages_shared_now_private = 0;
			extended->pages_dirtied = 0;
			extended->external_pager = 0;
			extended->shadow_depth = 0;

			original_count = *count;
			if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
				*count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
			} else {
				/* pages_reusable only exists in the non-legacy layout */
				extended->pages_reusable = 0;
				*count = VM_REGION_EXTENDED_INFO_COUNT;
			}

			vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);

			if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
				extended->share_mode = SM_PRIVATE;
			}

			if (object_name) {
				*object_name = IP_NULL;
			}
			*address = start;
			*size = (entry->vme_end - start);

			vm_map_unlock_read(map);
			return KERN_SUCCESS;
		}
	case VM_REGION_TOP_INFO:
	{
		vm_region_top_info_t    top;

		if (*count < VM_REGION_TOP_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}

		top = (vm_region_top_info_t) info;
		*count = VM_REGION_TOP_INFO_COUNT;

		vm_map_lock_read(map);

		start = *address;
		/* if the address isn't mapped, report the next region instead */
		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}
		} else {
			entry = tmp_entry;
		}
		start = entry->vme_start;

		top->private_pages_resident = 0;
		top->shared_pages_resident = 0;

		vm_map_region_top_walk(entry, top);

		if (object_name) {
			*object_name = IP_NULL;
		}
		*address = start;
		*size = (entry->vme_end - start);

		vm_map_unlock_read(map);
		return KERN_SUCCESS;
	}
	default:
		return KERN_INVALID_ARGUMENT;
	}
}
15440
/*
 * OBJ_RESIDENT_COUNT(obj, entry_size):
 * Number of resident pages of "obj" to attribute to a mapping that spans
 * "entry_size" pages.  Reusable pages are not counted, except when the
 * whole object is marked "all_reusable", in which case only the wired
 * pages count.  The result is clamped to the size of the mapping.
 * NOTE: macro evaluates both arguments more than once.
 */
#define OBJ_RESIDENT_COUNT(obj, entry_size)                             \
	MIN((entry_size),                                               \
	    ((obj)->all_reusable ?                                      \
	    (obj)->wired_page_count :                                   \
	    (obj)->resident_page_count - (obj)->reusable_page_count))
15446
/*
 *	vm_map_region_top_walk:
 *
 *	Fill in a vm_region_top_info structure for "entry": classify the
 *	mapping's share mode and count private vs. shared resident pages,
 *	walking the shadow chain of the entry's VM object when present.
 *	Submaps and entries with no object are reported as SM_EMPTY.
 */
void
vm_map_region_top_walk(
	vm_map_entry_t          entry,
	vm_region_top_info_t    top)
{
	if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
		top->share_mode = SM_EMPTY;
		top->ref_count = 0;
		top->obj_id = 0;
		return;
	}

	{
		struct  vm_object *obj, *tmp_obj;
		int             ref_count;
		uint32_t        entry_size;

		entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);

		obj = VME_OBJECT(entry);

		vm_object_lock(obj);

		/* don't count the pager's reference while paging is in progress */
		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
			ref_count--;
		}

		assert(obj->reusable_page_count <= obj->resident_page_count);
		if (obj->shadow) {
			/* shadowed object: this mapping is copy-on-write */
			if (ref_count == 1) {
				top->private_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			}
			top->ref_count  = ref_count;
			top->share_mode = SM_COW;

			/* hand-over-hand: lock the shadow before dropping obj */
			while ((tmp_obj = obj->shadow)) {
				vm_object_lock(tmp_obj);
				vm_object_unlock(obj);
				obj = tmp_obj;

				if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
					ref_count--;
				}

				assert(obj->reusable_page_count <= obj->resident_page_count);
				top->shared_pages_resident +=
				    OBJ_RESIDENT_COUNT(obj, entry_size);
				top->ref_count += ref_count - 1;
			}
		} else {
			if (entry->superpage_size) {
				top->share_mode = SM_LARGE_PAGE;
				top->shared_pages_resident = 0;
				top->private_pages_resident = entry_size;
			} else if (entry->needs_copy) {
				top->share_mode = SM_COW;
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				/* a named object holds one extra reference */
				if (ref_count == 1 ||
				    (ref_count == 2 && obj->named)) {
					top->share_mode = SM_PRIVATE;
					top->private_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				} else {
					top->share_mode = SM_SHARED;
					top->shared_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				}
			}
			top->ref_count = ref_count;
		}
		/* XXX K64: obj_id will be truncated */
		top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRPERM(obj);

		vm_object_unlock(obj);
	}
}
15531
/*
 *	vm_map_region_walk:
 *
 *	Fill in a vm_region_extended_info structure for the range
 *	[va, va + range) of "entry".  When "look_for_pages" is TRUE,
 *	examine each page (or, for footprint queries, the pmap/corpse
 *	footprint data) to count resident/dirty/reusable/swapped pages;
 *	otherwise only collect object-level info (shadow depth, pager).
 *	Also classifies the mapping's share mode from the object's
 *	shadow chain and reference counts.
 */
void
vm_map_region_walk(
	vm_map_t                        map,
	vm_map_offset_t                 va,
	vm_map_entry_t                  entry,
	vm_object_offset_t              offset,
	vm_object_size_t                range,
	vm_region_extended_info_t       extended,
	boolean_t                       look_for_pages,
	mach_msg_type_number_t          count)
{
	struct vm_object *obj, *tmp_obj;
	vm_map_offset_t       last_offset;
	int               i;
	int               ref_count;
	struct vm_object        *shadow_object;
	unsigned short          shadow_depth;
	boolean_t         do_region_footprint;
	int                     effective_page_size, effective_page_shift;
	vm_map_offset_t         effective_page_mask;

	do_region_footprint = task_self_region_footprint();

	/* nothing to report for submaps, empty entries or contiguous physical memory */
	if ((entry->is_sub_map) ||
	    (VME_OBJECT(entry) == 0) ||
	    (VME_OBJECT(entry)->phys_contiguous &&
	    !entry->superpage_size)) {
		extended->share_mode = SM_EMPTY;
		extended->ref_count = 0;
		return;
	}

	if (entry->superpage_size) {
		extended->shadow_depth = 0;
		extended->share_mode = SM_LARGE_PAGE;
		extended->ref_count = 1;
		extended->external_pager = 0;

		/* TODO4K: Superpage in 4k mode? */
		extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
		extended->shadow_depth = 0;
		return;
	}

	effective_page_shift = vm_self_region_page_shift(map);
	effective_page_size = (1 << effective_page_shift);
	effective_page_mask = effective_page_size - 1;

	offset = vm_map_trunc_page(offset, effective_page_mask);

	obj = VME_OBJECT(entry);

	vm_object_lock(obj);

	/* don't count the pager's reference while paging is in progress */
	if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
		ref_count--;
	}

	if (look_for_pages) {
		for (last_offset = offset + range;
		    offset < last_offset;
		    offset += effective_page_size, va += effective_page_size) {
			if (do_region_footprint) {
				int disp;

				disp = 0;
				if (map->has_corpse_footprint) {
					/*
					 * Query the page info data we saved
					 * while forking the corpse.
					 */
					vm_map_corpse_footprint_query_page_info(
						map,
						va,
						&disp);
				} else {
					/*
					 * Query the pmap.
					 */
					vm_map_footprint_query_page_info(
						map,
						entry,
						va,
						&disp);
				}
				if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
					extended->pages_resident++;
				}
				if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
					extended->pages_reusable++;
				}
				if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
					extended->pages_dirtied++;
				}
				if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
					extended->pages_swapped_out++;
				}
				continue;
			}

			vm_map_region_look_for_page(map, va, obj,
			    vm_object_trunc_page(offset), ref_count,
			    0, extended, count);
		}

		if (do_region_footprint) {
			/* footprint queries still need the object-level info below */
			goto collect_object_info;
		}
	} else {
collect_object_info:
		/* walk the shadow chain to find its depth and any external pager */
		shadow_object = obj->shadow;
		shadow_depth = 0;

		if (!(obj->internal)) {
			extended->external_pager = 1;
		}

		if (shadow_object != VM_OBJECT_NULL) {
			vm_object_lock(shadow_object);
			for (;
			    shadow_object != VM_OBJECT_NULL;
			    shadow_depth++) {
				vm_object_t     next_shadow;

				if (!(shadow_object->internal)) {
					extended->external_pager = 1;
				}

				/* hand-over-hand locking down the chain */
				next_shadow = shadow_object->shadow;
				if (next_shadow) {
					vm_object_lock(next_shadow);
				}
				vm_object_unlock(shadow_object);
				shadow_object = next_shadow;
			}
		}
		extended->shadow_depth = shadow_depth;
	}

	if (extended->shadow_depth || entry->needs_copy) {
		extended->share_mode = SM_COW;
	} else {
		if (ref_count == 1) {
			extended->share_mode = SM_PRIVATE;
		} else {
			if (obj->true_share) {
				extended->share_mode = SM_TRUESHARED;
			} else {
				extended->share_mode = SM_SHARED;
			}
		}
	}
	extended->ref_count = ref_count - extended->shadow_depth;

	/* accumulate the ref counts of the shadow objects */
	for (i = 0; i < extended->shadow_depth; i++) {
		if ((tmp_obj = obj->shadow) == 0) {
			break;
		}
		vm_object_lock(tmp_obj);
		vm_object_unlock(obj);

		if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) {
			ref_count--;
		}

		extended->ref_count += ref_count;
		obj = tmp_obj;
	}
	vm_object_unlock(obj);

	if (extended->share_mode == SM_SHARED) {
		vm_map_entry_t       cur;
		vm_map_entry_t       last;
		int      my_refs;

		/*
		 * Check whether all the object's references come from this
		 * map: if so it's only "shared" with itself (aliased).
		 */
		obj = VME_OBJECT(entry);
		last = vm_map_to_entry(map);
		my_refs = 0;

		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
			ref_count--;
		}
		for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
			my_refs += vm_map_region_count_obj_refs(cur, obj);
		}

		if (my_refs == ref_count) {
			extended->share_mode = SM_PRIVATE_ALIASED;
		} else if (my_refs > 1) {
			extended->share_mode = SM_SHARED_ALIASED;
		}
	}
}
15725
15726
15727 /* object is locked on entry and locked on return */
15728
15729
/*
 * Search "object" and its shadow chain for the page at "offset" and
 * update the extended region statistics: resident, dirtied, reusable,
 * swapped-out, shared-now-private counts and the maximum shadow depth.
 *
 * "object" is locked on entry and locked on return (see comment above);
 * interior objects of the shadow chain are taken/released hand-over-hand
 * so that only one extra object lock is ever held at a time.
 */
static void
vm_map_region_look_for_page(
	__unused vm_map_t               map,
	__unused vm_map_offset_t        va,
	vm_object_t                     object,
	vm_object_offset_t              offset,
	int                             max_refcnt,
	unsigned short                  depth,
	vm_region_extended_info_t       extended,
	mach_msg_type_number_t          count)
{
	vm_page_t       p;
	vm_object_t     shadow;
	int             ref_count;
	vm_object_t     caller_object;

	shadow = object->shadow;
	caller_object = object;


	while (TRUE) {
		if (!(object->internal)) {
			/* some level of the chain is file-backed */
			extended->external_pager = 1;
		}

		if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
			if (shadow && (max_refcnt == 1)) {
				/*
				 * Page found in a shadowed object whose
				 * chain has only single references: a write
				 * fault would make this page private.
				 */
				extended->pages_shared_now_private++;
			}

			if (!p->vmp_fictitious &&
			    (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
				extended->pages_dirtied++;
			} else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
				/* caller's buffer is large enough to report "pages_reusable" */
				if (p->vmp_reusable || object->all_reusable) {
					extended->pages_reusable++;
				}
			}

			extended->pages_resident++;

			/* never unlock the caller's object, only interior ones */
			if (object != caller_object) {
				vm_object_unlock(object);
			}

			return;
		}
		if (object->internal &&
		    object->alive &&
		    !object->terminating &&
		    object->pager_ready) {
			if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset)
			    == VM_EXTERNAL_STATE_EXISTS) {
				/* the pager has that page */
				extended->pages_swapped_out++;
				if (object != caller_object) {
					vm_object_unlock(object);
				}
				return;
			}
		}

		if (shadow) {
			/* page not at this level: descend into the shadow */
			vm_object_lock(shadow);

			if ((ref_count = shadow->ref_count) > 1 && shadow->paging_in_progress) {
				/* do not count the paging reference */
				ref_count--;
			}

			if (++depth > extended->shadow_depth) {
				extended->shadow_depth = depth;
			}

			if (ref_count > max_refcnt) {
				max_refcnt = ref_count;
			}

			/* hand-over-hand: shadow is locked, drop this level */
			if (object != caller_object) {
				vm_object_unlock(object);
			}

			offset = offset + object->vo_shadow_offset;
			object = shadow;
			shadow = object->shadow;
			continue;
		}
		if (object != caller_object) {
			vm_object_unlock(object);
		}
		break;
	}
}
15822
15823 static int
vm_map_region_count_obj_refs(vm_map_entry_t entry,vm_object_t object)15824 vm_map_region_count_obj_refs(
15825 vm_map_entry_t entry,
15826 vm_object_t object)
15827 {
15828 int ref_count;
15829 vm_object_t chk_obj;
15830 vm_object_t tmp_obj;
15831
15832 if (entry->is_sub_map || VME_OBJECT(entry) == VM_OBJECT_NULL) {
15833 return 0;
15834 }
15835
15836 ref_count = 0;
15837 chk_obj = VME_OBJECT(entry);
15838 vm_object_lock(chk_obj);
15839
15840 while (chk_obj) {
15841 if (chk_obj == object) {
15842 ref_count++;
15843 }
15844 tmp_obj = chk_obj->shadow;
15845 if (tmp_obj) {
15846 vm_object_lock(tmp_obj);
15847 }
15848 vm_object_unlock(chk_obj);
15849
15850 chk_obj = tmp_obj;
15851 }
15852
15853 return ref_count;
15854 }
15855
15856
15857 /*
15858 * Routine: vm_map_simplify
15859 *
15860 * Description:
15861 * Attempt to simplify the map representation in
15862 * the vicinity of the given starting address.
15863 * Note:
15864 * This routine is intended primarily to keep the
15865 * kernel maps more compact -- they generally don't
15866 * benefit from the "expand a map entry" technology
15867 * at allocation time because the adjacent entry
15868 * is often wired down.
15869 */
/*
 * Try to coalesce "this_entry" with the entry immediately preceding it.
 * The merge is allowed only when the two entries are virtually
 * contiguous, map adjacent offsets of the same object (or submap), and
 * agree on every attribute that affects protection, fault handling,
 * wiring or accounting.  On success, the previous entry is absorbed
 * into "this_entry" and disposed of.
 * The map must be locked for write by the caller.
 */
void
vm_map_simplify_entry(
	vm_map_t        map,
	vm_map_entry_t  this_entry)
{
	vm_map_entry_t  prev_entry;

	prev_entry = this_entry->vme_prev;

	if ((this_entry != vm_map_to_entry(map)) &&
	    (prev_entry != vm_map_to_entry(map)) &&

	    /* virtually contiguous */
	    (prev_entry->vme_end == this_entry->vme_start) &&

	    /* same backing object/submap, at contiguous offsets */
	    (prev_entry->is_sub_map == this_entry->is_sub_map) &&
	    (prev_entry->vme_object_value == this_entry->vme_object_value) &&
	    (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
	    ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
	    prev_entry->vme_start))
	    == VME_OFFSET(this_entry)) &&

	    /* identical attributes */
	    (prev_entry->behavior == this_entry->behavior) &&
	    (prev_entry->needs_copy == this_entry->needs_copy) &&
	    (prev_entry->protection == this_entry->protection) &&
	    (prev_entry->max_protection == this_entry->max_protection) &&
	    (prev_entry->inheritance == this_entry->inheritance) &&
	    (prev_entry->use_pmap == this_entry->use_pmap) &&
	    (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
	    (prev_entry->no_cache == this_entry->no_cache) &&
	    (prev_entry->vme_permanent == this_entry->vme_permanent) &&
	    (prev_entry->map_aligned == this_entry->map_aligned) &&
	    (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
	    (prev_entry->used_for_jit == this_entry->used_for_jit) &&
#if __arm64e__
	    (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
#endif
	    (prev_entry->csm_associated == this_entry->csm_associated) &&
	    (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
	    (prev_entry->iokit_acct == this_entry->iokit_acct) &&
	    (prev_entry->vme_resilient_codesign ==
	    this_entry->vme_resilient_codesign) &&
	    (prev_entry->vme_resilient_media ==
	    this_entry->vme_resilient_media) &&
	    (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
	    (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&

	    /* identical wiring */
	    (prev_entry->wired_count == this_entry->wired_count) &&
	    (prev_entry->user_wired_count == this_entry->user_wired_count) &&

	    /* neither entry may be atomic, in transition, or awaited on */
	    ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
	    (prev_entry->in_transition == FALSE) &&
	    (this_entry->in_transition == FALSE) &&
	    (prev_entry->needs_wakeup == FALSE) &&
	    (this_entry->needs_wakeup == FALSE) &&
	    (prev_entry->is_shared == this_entry->is_shared) &&
	    (prev_entry->superpage_size == FALSE) &&
	    (this_entry->superpage_size == FALSE)
	    ) {
		if (prev_entry->vme_permanent) {
			/*
			 * "permanent" responsibility stays with the surviving
			 * entry (asserted equal above), so clear it on the
			 * entry being discarded.
			 */
			assert(this_entry->vme_permanent);
			prev_entry->vme_permanent = false;
		}
		vm_map_store_entry_unlink(map, prev_entry, true);
		assert(prev_entry->vme_start < this_entry->vme_end);
		if (prev_entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
			    VM_MAP_PAGE_MASK(map)));
		}
		/* extend "this_entry" backwards over the absorbed range */
		this_entry->vme_start = prev_entry->vme_start;
		VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));

		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, this_entry, TRUE);
		}

		/* drop the absorbed entry's reference on its submap/object */
		if (prev_entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(prev_entry));
		} else {
			vm_object_deallocate(VME_OBJECT(prev_entry));
		}
		vm_map_entry_dispose(prev_entry);
		SAVE_HINT_MAP_WRITE(map, this_entry);
	}
}
15954
15955 void
vm_map_simplify(vm_map_t map,vm_map_offset_t start)15956 vm_map_simplify(
15957 vm_map_t map,
15958 vm_map_offset_t start)
15959 {
15960 vm_map_entry_t this_entry;
15961
15962 vm_map_lock(map);
15963 if (vm_map_lookup_entry(map, start, &this_entry)) {
15964 vm_map_simplify_entry(map, this_entry);
15965 vm_map_simplify_entry(map, this_entry->vme_next);
15966 }
15967 vm_map_unlock(map);
15968 }
15969
15970 static void
vm_map_simplify_range(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)15971 vm_map_simplify_range(
15972 vm_map_t map,
15973 vm_map_offset_t start,
15974 vm_map_offset_t end)
15975 {
15976 vm_map_entry_t entry;
15977
15978 /*
15979 * The map should be locked (for "write") by the caller.
15980 */
15981
15982 if (start >= end) {
15983 /* invalid address range */
15984 return;
15985 }
15986
15987 start = vm_map_trunc_page(start,
15988 VM_MAP_PAGE_MASK(map));
15989 end = vm_map_round_page(end,
15990 VM_MAP_PAGE_MASK(map));
15991
15992 if (!vm_map_lookup_entry(map, start, &entry)) {
15993 /* "start" is not mapped and "entry" ends before "start" */
15994 if (entry == vm_map_to_entry(map)) {
15995 /* start with first entry in the map */
15996 entry = vm_map_first_entry(map);
15997 } else {
15998 /* start with next entry */
15999 entry = entry->vme_next;
16000 }
16001 }
16002
16003 while (entry != vm_map_to_entry(map) &&
16004 entry->vme_start <= end) {
16005 /* try and coalesce "entry" with its previous entry */
16006 vm_map_simplify_entry(map, entry);
16007 entry = entry->vme_next;
16008 }
16009 }
16010
16011
16012 /*
16013 * Routine: vm_map_machine_attribute
16014 * Purpose:
16015 * Provide machine-specific attributes to mappings,
16016 * such as cachability etc. for machines that provide
16017 * them. NUMA architectures and machines with big/strange
16018 * caches will use this.
16019 * Note:
16020 * Responsibilities for locking and checking are handled here,
16021 * everything else in the pmap module. If any non-volatile
16022 * information must be kept, the pmap module should handle
16023 * it itself. [This assumes that attributes do not
16024 * need to be inherited, which seems ok to me]
16025 */
kern_return_t
vm_map_machine_attribute(
	vm_map_t                        map,
	vm_map_offset_t                 start,
	vm_map_offset_t                 end,
	vm_machine_attribute_t          attribute,
	vm_machine_attribute_val_t*     value)          /* IN/OUT */
{
	kern_return_t   ret;
	vm_map_size_t   sync_size;
	vm_map_entry_t  entry;

	if (start < vm_map_min(map) || end > vm_map_max(map)) {
		return KERN_INVALID_ADDRESS;
	}
	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}

	/* Figure how much memory we need to flush (in page increments) */
	sync_size = end - start;

	vm_map_lock(map);

	if (attribute != MATTR_CACHE) {
		/* If we don't have to find physical addresses, we */
		/* don't have to do an explicit traversal here. */
		ret = pmap_attribute(map->pmap, start, end - start,
		    attribute, value);
		vm_map_unlock(map);
		return ret;
	}

	ret = KERN_SUCCESS;                     /* Assume it all worked */

	/*
	 * MATTR_CACHE: walk the range entry by entry and sync each
	 * resident page individually, following shadow chains to find
	 * the page backing each offset.
	 */
	while (sync_size) {
		if (vm_map_lookup_entry(map, start, &entry)) {
			vm_map_size_t   sub_size;
			/* clamp this step at the end of the entry or of the range */
			if ((entry->vme_end - start) > sync_size) {
				sub_size = sync_size;
				sync_size = 0;
			} else {
				sub_size = entry->vme_end - start;
				sync_size -= sub_size;
			}
			if (entry->is_sub_map) {
				vm_map_offset_t sub_start;
				vm_map_offset_t sub_end;

				sub_start = (start - entry->vme_start)
				    + VME_OFFSET(entry);
				sub_end = sub_start + sub_size;
				/*
				 * NOTE(review): the recursive call's return
				 * value is discarded, so a failure inside a
				 * submap is not reported to the caller --
				 * confirm this is intentional.
				 */
				vm_map_machine_attribute(
					VME_SUBMAP(entry),
					sub_start,
					sub_end,
					attribute, value);
			} else if (VME_OBJECT(entry)) {
				vm_page_t               m;
				vm_object_t             object;
				vm_object_t             base_object;
				vm_object_t             last_object;
				vm_object_offset_t      offset;
				vm_object_offset_t      base_offset;
				vm_map_size_t           range;
				range = sub_size;
				offset = (start - entry->vme_start)
				    + VME_OFFSET(entry);
				offset = vm_object_trunc_page(offset);
				base_offset = offset;
				object = VME_OBJECT(entry);
				base_object = object;
				last_object = NULL;

				vm_object_lock(object);

				while (range) {
					m = vm_page_lookup(
						object, offset);

					if (m && !m->vmp_fictitious) {
						ret =
						    pmap_attribute_cache_sync(
							VM_PAGE_GET_PHYS_PAGE(m),
							PAGE_SIZE,
							attribute, value);
					} else if (object->shadow) {
						/*
						 * Page not at this level:
						 * descend one shadow level
						 * (hand-over-hand locking)
						 * and retry the adjusted
						 * offset.
						 */
						offset = offset + object->vo_shadow_offset;
						last_object = object;
						object = object->shadow;
						vm_object_lock(last_object->shadow);
						vm_object_unlock(last_object);
						continue;
					}
					if (range < PAGE_SIZE) {
						range = 0;
					} else {
						range -= PAGE_SIZE;
					}

					/* climb back to the top of the shadow chain */
					if (base_object != object) {
						vm_object_unlock(object);
						vm_object_lock(base_object);
						object = base_object;
					}
					/* Bump to the next page */
					base_offset += PAGE_SIZE;
					offset = base_offset;
				}
				vm_object_unlock(object);
			}
			start += sub_size;
		} else {
			/* hole in the address range */
			vm_map_unlock(map);
			return KERN_FAILURE;
		}
	}

	vm_map_unlock(map);

	return ret;
}
16148
16149 /*
16150 * vm_map_behavior_set:
16151 *
16152 * Sets the paging reference behavior of the specified address
16153 * range in the target map. Paging reference behavior affects
16154 * how pagein operations resulting from faults on the map will be
16155 * clustered.
16156 */
16157 kern_return_t
vm_map_behavior_set(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_behavior_t new_behavior)16158 vm_map_behavior_set(
16159 vm_map_t map,
16160 vm_map_offset_t start,
16161 vm_map_offset_t end,
16162 vm_behavior_t new_behavior)
16163 {
16164 vm_map_entry_t entry;
16165 vm_map_entry_t temp_entry;
16166
16167 if (start > end ||
16168 start < vm_map_min(map) ||
16169 end > vm_map_max(map)) {
16170 return KERN_NO_SPACE;
16171 }
16172 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16173 return KERN_INVALID_ADDRESS;
16174 }
16175
16176 switch (new_behavior) {
16177 /*
16178 * This first block of behaviors all set a persistent state on the specified
16179 * memory range. All we have to do here is to record the desired behavior
16180 * in the vm_map_entry_t's.
16181 */
16182
16183 case VM_BEHAVIOR_DEFAULT:
16184 case VM_BEHAVIOR_RANDOM:
16185 case VM_BEHAVIOR_SEQUENTIAL:
16186 case VM_BEHAVIOR_RSEQNTL:
16187 case VM_BEHAVIOR_ZERO_WIRED_PAGES:
16188 vm_map_lock(map);
16189
16190 /*
16191 * The entire address range must be valid for the map.
16192 * Note that vm_map_range_check() does a
16193 * vm_map_lookup_entry() internally and returns the
16194 * entry containing the start of the address range if
16195 * the entire range is valid.
16196 */
16197 if (vm_map_range_check(map, start, end, &temp_entry)) {
16198 entry = temp_entry;
16199 vm_map_clip_start(map, entry, start);
16200 } else {
16201 vm_map_unlock(map);
16202 return KERN_INVALID_ADDRESS;
16203 }
16204
16205 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
16206 vm_map_clip_end(map, entry, end);
16207 if (entry->is_sub_map) {
16208 assert(!entry->use_pmap);
16209 }
16210
16211 if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16212 entry->zero_wired_pages = TRUE;
16213 } else {
16214 entry->behavior = new_behavior;
16215 }
16216 entry = entry->vme_next;
16217 }
16218
16219 vm_map_unlock(map);
16220 break;
16221
16222 /*
16223 * The rest of these are different from the above in that they cause
16224 * an immediate action to take place as opposed to setting a behavior that
16225 * affects future actions.
16226 */
16227
16228 case VM_BEHAVIOR_WILLNEED:
16229 return vm_map_willneed(map, start, end);
16230
16231 case VM_BEHAVIOR_DONTNEED:
16232 return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
16233
16234 case VM_BEHAVIOR_FREE:
16235 return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
16236
16237 case VM_BEHAVIOR_REUSABLE:
16238 return vm_map_reusable_pages(map, start, end);
16239
16240 case VM_BEHAVIOR_REUSE:
16241 return vm_map_reuse_pages(map, start, end);
16242
16243 case VM_BEHAVIOR_CAN_REUSE:
16244 return vm_map_can_reuse(map, start, end);
16245
16246 #if MACH_ASSERT
16247 case VM_BEHAVIOR_PAGEOUT:
16248 return vm_map_pageout(map, start, end);
16249 #endif /* MACH_ASSERT */
16250
16251 default:
16252 return KERN_INVALID_ARGUMENT;
16253 }
16254
16255 return KERN_SUCCESS;
16256 }
16257
16258
16259 /*
16260 * Internals for madvise(MADV_WILLNEED) system call.
16261 *
 * The implementation does one of the following:
 * a) read-ahead, if the mapping corresponds to a mapped regular file
 * b) fault in the pages (zero-fill, decompress, etc.), if it is an anonymous mapping
16265 */
16266
16267
/*
 * Handle madvise(MADV_WILLNEED) for [start, end):
 *  - anonymous memory is pre-faulted page by page via vm_pre_fault();
 *  - file-backed memory gets an asynchronous read-ahead request sent
 *    to its pager via memory_object_data_request().
 * Returns KERN_INVALID_ADDRESS if the range contains a hole; pager I/O
 * failures are deliberately swallowed (madvise is only advice).
 * The map read lock is dropped around the expensive operations and
 * re-taken to look up the next entry.
 */
static kern_return_t
vm_map_willneed(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end
	)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	memory_object_t                 pager;
	struct vm_object_fault_info     fault_info = {};
	kern_return_t                   kr;
	vm_object_size_t                len;
	vm_object_offset_t              offset;

	fault_info.interruptible = THREAD_UNINT;                /* ignored value */
	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
	fault_info.stealth = TRUE;

	/*
	 * The MADV_WILLNEED operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && start < end;) {
		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.  After that, the offset will always be zero to
		 * correspond to the beginning of the current vm_map_entry.
		 */
		offset = (start - entry->vme_start) + VME_OFFSET(entry);

		/*
		 * Set the length so we don't go beyond the end of the
		 * map_entry or beyond the end of the range we were given.
		 * This range could span also multiple map entries all of which
		 * map different files, so make sure we only do the right amount
		 * of I/O for each object.  Note that it's possible for there
		 * to be multiple map entries all referring to the same object
		 * but with different page permissions, but it's not worth
		 * trying to optimize that case.
		 */
		len = MIN(entry->vme_end - start, end - start);

		if ((vm_size_t) len != len) {
			/* 32-bit overflow */
			len = (vm_size_t) (0 - PAGE_SIZE);
		}
		fault_info.cluster_size = (vm_size_t) len;
		fault_info.lo_offset = offset;
		fault_info.hi_offset = offset + len;
		fault_info.user_tag = VME_ALIAS(entry);
		fault_info.pmap_options = 0;
		if (entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap)) {
			/* memory not billed through the task's pmap */
			fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
		}
		fault_info.fi_xnu_user_debug = entry->vme_xnu_user_debug;

		/*
		 * If the entry is a submap OR there's no read permission
		 * to this mapping, then just skip it.
		 */
		if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
			/*
			 * If this was the last entry, "entry" becomes the map
			 * header here; the loop condition checks for that
			 * before the new "start" is used.
			 */
			entry = entry->vme_next;
			start = entry->vme_start;
			continue;
		}

		object = VME_OBJECT(entry);

		if (object == NULL ||
		    (object && object->internal)) {
			/*
			 * Memory range backed by anonymous memory.
			 */
			vm_size_t       region_size = 0, effective_page_size = 0;
			vm_map_offset_t addr = 0, effective_page_mask = 0;

			region_size = len;
			addr = start;

			/* use the smaller of the map's and the kernel's page size */
			effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
			effective_page_size = effective_page_mask + 1;

			vm_map_unlock_read(map);

			/* fault in every page of the region */
			while (region_size) {
				vm_pre_fault(
					vm_map_trunc_page(addr, effective_page_mask),
					VM_PROT_READ | VM_PROT_WRITE);

				region_size -= effective_page_size;
				addr += effective_page_size;
			}
		} else {
			/*
			 * Find the file object backing this map entry.  If there is
			 * none, then we simply ignore the "will need" advice for this
			 * entry and go on to the next one.
			 */
			if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
				entry = entry->vme_next;
				start = entry->vme_start;
				continue;
			}

			/* paired with vm_object_paging_end() below */
			vm_object_paging_begin(object);
			pager = object->pager;
			vm_object_unlock(object);

			/*
			 * The data_request() could take a long time, so let's
			 * release the map lock to avoid blocking other threads.
			 */
			vm_map_unlock_read(map);

			/*
			 * Get the data from the object asynchronously.
			 *
			 * Note that memory_object_data_request() places limits on the
			 * amount of I/O it will do.  Regardless of the len we
			 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
			 * silently truncates the len to that size.  This isn't
			 * necessarily bad since madvise shouldn't really be used to
			 * page in unlimited amounts of data.  Other Unix variants
			 * limit the willneed case as well.  If this turns out to be an
			 * issue for developers, then we can always adjust the policy
			 * here and still be backwards compatible since this is all
			 * just "advice".
			 */
			kr = memory_object_data_request(
				pager,
				vm_object_trunc_page(offset) + object->paging_offset,
				0,      /* ignored */
				VM_PROT_READ,
				(memory_object_fault_info_t)&fault_info);

			vm_object_lock(object);
			vm_object_paging_end(object);
			vm_object_unlock(object);

			/*
			 * If we couldn't do the I/O for some reason, just give up on
			 * the madvise.  We still return success to the user since
			 * madvise isn't supposed to fail when the advice can't be
			 * taken.
			 */

			if (kr != KERN_SUCCESS) {
				/* map lock was already dropped above */
				return KERN_SUCCESS;
			}
		}

		start += len;
		if (start >= end) {
			/* done */
			return KERN_SUCCESS;
		}

		/* look up next entry */
		vm_map_lock_read(map);
		if (!vm_map_lookup_entry(map, start, &entry)) {
			/*
			 * There's a new hole in the address range.
			 */
			vm_map_unlock_read(map);
			return KERN_INVALID_ADDRESS;
		}
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
16459
16460 static boolean_t
vm_map_entry_is_reusable(vm_map_entry_t entry)16461 vm_map_entry_is_reusable(
16462 vm_map_entry_t entry)
16463 {
16464 /* Only user map entries */
16465
16466 vm_object_t object;
16467
16468 if (entry->is_sub_map) {
16469 return FALSE;
16470 }
16471
16472 switch (VME_ALIAS(entry)) {
16473 case VM_MEMORY_MALLOC:
16474 case VM_MEMORY_MALLOC_SMALL:
16475 case VM_MEMORY_MALLOC_LARGE:
16476 case VM_MEMORY_REALLOC:
16477 case VM_MEMORY_MALLOC_TINY:
16478 case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16479 case VM_MEMORY_MALLOC_LARGE_REUSED:
16480 /*
16481 * This is a malloc() memory region: check if it's still
16482 * in its original state and can be re-used for more
16483 * malloc() allocations.
16484 */
16485 break;
16486 default:
16487 /*
16488 * Not a malloc() memory region: let the caller decide if
16489 * it's re-usable.
16490 */
16491 return TRUE;
16492 }
16493
16494 if (/*entry->is_shared ||*/
16495 entry->is_sub_map ||
16496 entry->in_transition ||
16497 entry->protection != VM_PROT_DEFAULT ||
16498 entry->max_protection != VM_PROT_ALL ||
16499 entry->inheritance != VM_INHERIT_DEFAULT ||
16500 entry->no_cache ||
16501 entry->vme_permanent ||
16502 entry->superpage_size != FALSE ||
16503 entry->zero_wired_pages ||
16504 entry->wired_count != 0 ||
16505 entry->user_wired_count != 0) {
16506 return FALSE;
16507 }
16508
16509 object = VME_OBJECT(entry);
16510 if (object == VM_OBJECT_NULL) {
16511 return TRUE;
16512 }
16513 if (
16514 #if 0
16515 /*
16516 * Let's proceed even if the VM object is potentially
16517 * shared.
16518 * We check for this later when processing the actual
16519 * VM pages, so the contents will be safe if shared.
16520 *
16521 * But we can still mark this memory region as "reusable" to
16522 * acknowledge that the caller did let us know that the memory
16523 * could be re-used and should not be penalized for holding
16524 * on to it. This allows its "resident size" to not include
16525 * the reusable range.
16526 */
16527 object->ref_count == 1 &&
16528 #endif
16529 object->vo_copy == VM_OBJECT_NULL &&
16530 object->shadow == VM_OBJECT_NULL &&
16531 object->internal &&
16532 object->purgable == VM_PURGABLE_DENY &&
16533 object->wimg_bits == VM_WIMG_USE_DEFAULT &&
16534 !object->code_signed) {
16535 return TRUE;
16536 }
16537 return FALSE;
16538 }
16539
/*
 * Handle madvise(MADV_FREE_REUSE) for [start, end): tell the VM that
 * previously "reusable" pages in the range are in use again.  The
 * whole range must be allocated and made of reusable-eligible entries
 * (see vm_map_entry_is_reusable()), otherwise KERN_INVALID_ADDRESS.
 */
static kern_return_t
vm_map_reuse_pages(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	vm_object_offset_t              start_offset, end_offset;

	/*
	 * The MADV_REUSE operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		/*
		 * XXX TODO4K
		 * need to figure out what reusable means for a
		 * portion of a native page.
		 */
		return KERN_SUCCESS;
	}

	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap);       /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		vm_page_stats_reusable.reuse_pages_failure++;
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		/*
		 * Sanity check on the VM map entry.
		 */
		if (!vm_map_entry_is_reusable(entry)) {
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reuse_pages_failure++;
			return KERN_INVALID_ADDRESS;
		}

		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.
		 */
		if (entry->vme_start < start) {
			start_offset = start - entry->vme_start;
		} else {
			start_offset = 0;
		}
		/* translate map offsets into object offsets */
		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
		start_offset += VME_OFFSET(entry);
		end_offset += VME_OFFSET(entry);

		object = VME_OBJECT(entry);
		if (object != VM_OBJECT_NULL) {
			vm_object_lock(object);
			vm_object_reuse_pages(object, start_offset, end_offset,
			    TRUE);
			vm_object_unlock(object);
		}

		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
			/*
			 * XXX
			 * We do not hold the VM map exclusively here.
			 * The "alias" field is not that critical, so it's
			 * safe to update it here, as long as it is the only
			 * one that can be modified while holding the VM map
			 * "shared".
			 */
			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
		}
	}

	vm_map_unlock_read(map);
	vm_page_stats_reusable.reuse_pages_success++;
	return KERN_SUCCESS;
}
16632
16633
16634 static kern_return_t
vm_map_reusable_pages(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16635 vm_map_reusable_pages(
16636 vm_map_t map,
16637 vm_map_offset_t start,
16638 vm_map_offset_t end)
16639 {
16640 vm_map_entry_t entry;
16641 vm_object_t object;
16642 vm_object_offset_t start_offset, end_offset;
16643 vm_map_offset_t pmap_offset;
16644
16645 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16646 /*
16647 * XXX TODO4K
16648 * need to figure out what reusable means for a portion
16649 * of a native page.
16650 */
16651 return KERN_SUCCESS;
16652 }
16653
16654 /*
16655 * The MADV_REUSABLE operation doesn't require any changes to the
16656 * vm_map_entry_t's, so the read lock is sufficient.
16657 */
16658
16659 vm_map_lock_read(map);
16660 assert(map->pmap != kernel_pmap); /* protect alias access */
16661
16662 /*
16663 * The madvise semantics require that the address range be fully
16664 * allocated with no holes. Otherwise, we're required to return
16665 * an error.
16666 */
16667
16668 if (!vm_map_range_check(map, start, end, &entry)) {
16669 vm_map_unlock_read(map);
16670 vm_page_stats_reusable.reusable_pages_failure++;
16671 return KERN_INVALID_ADDRESS;
16672 }
16673
16674 /*
16675 * Examine each vm_map_entry_t in the range.
16676 */
16677 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16678 entry = entry->vme_next) {
16679 int kill_pages = 0;
16680 boolean_t reusable_no_write = FALSE;
16681
16682 /*
16683 * Sanity check on the VM map entry.
16684 */
16685 if (!vm_map_entry_is_reusable(entry)) {
16686 vm_map_unlock_read(map);
16687 vm_page_stats_reusable.reusable_pages_failure++;
16688 return KERN_INVALID_ADDRESS;
16689 }
16690
16691 if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
16692 #if __arm64e__
16693 && !entry->used_for_tpro
16694 #endif
16695 ) {
16696 /* not writable: can't discard contents */
16697 vm_map_unlock_read(map);
16698 vm_page_stats_reusable.reusable_nonwritable++;
16699 vm_page_stats_reusable.reusable_pages_failure++;
16700 return KERN_PROTECTION_FAILURE;
16701 }
16702
16703 /*
16704 * The first time through, the start address could be anywhere
16705 * within the vm_map_entry we found. So adjust the offset to
16706 * correspond.
16707 */
16708 if (entry->vme_start < start) {
16709 start_offset = start - entry->vme_start;
16710 pmap_offset = start;
16711 } else {
16712 start_offset = 0;
16713 pmap_offset = entry->vme_start;
16714 }
16715 end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16716 start_offset += VME_OFFSET(entry);
16717 end_offset += VME_OFFSET(entry);
16718
16719 object = VME_OBJECT(entry);
16720 if (object == VM_OBJECT_NULL) {
16721 continue;
16722 }
16723
16724 if (entry->protection & VM_PROT_EXECUTE) {
16725 /*
16726 * Executable mappings might be write-protected by
16727 * hardware, so do not attempt to write to these pages.
16728 */
16729 reusable_no_write = TRUE;
16730 }
16731
16732 vm_object_lock(object);
16733 if (((object->ref_count == 1) ||
16734 (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
16735 object->vo_copy == VM_OBJECT_NULL)) &&
16736 object->shadow == VM_OBJECT_NULL &&
16737 /*
16738 * "iokit_acct" entries are billed for their virtual size
16739 * (rather than for their resident pages only), so they
16740 * wouldn't benefit from making pages reusable, and it
16741 * would be hard to keep track of pages that are both
16742 * "iokit_acct" and "reusable" in the pmap stats and
16743 * ledgers.
16744 */
16745 !(entry->iokit_acct ||
16746 (!entry->is_sub_map && !entry->use_pmap))) {
16747 if (object->ref_count != 1) {
16748 vm_page_stats_reusable.reusable_shared++;
16749 }
16750 kill_pages = 1;
16751 } else {
16752 kill_pages = -1;
16753 }
16754 if (kill_pages != -1) {
16755 vm_object_deactivate_pages(object,
16756 start_offset,
16757 end_offset - start_offset,
16758 kill_pages,
16759 TRUE /*reusable_pages*/,
16760 reusable_no_write,
16761 map->pmap,
16762 pmap_offset);
16763 } else {
16764 vm_page_stats_reusable.reusable_pages_shared++;
16765 DTRACE_VM4(vm_map_reusable_pages_shared,
16766 unsigned int, VME_ALIAS(entry),
16767 vm_map_t, map,
16768 vm_map_entry_t, entry,
16769 vm_object_t, object);
16770 }
16771 vm_object_unlock(object);
16772
16773 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
16774 VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
16775 /*
16776 * XXX
16777 * We do not hold the VM map exclusively here.
16778 * The "alias" field is not that critical, so it's
16779 * safe to update it here, as long as it is the only
16780 * one that can be modified while holding the VM map
16781 * "shared".
16782 */
16783 VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
16784 }
16785 }
16786
16787 vm_map_unlock_read(map);
16788 vm_page_stats_reusable.reusable_pages_success++;
16789 return KERN_SUCCESS;
16790 }
16791
16792
16793 static kern_return_t
vm_map_can_reuse(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16794 vm_map_can_reuse(
16795 vm_map_t map,
16796 vm_map_offset_t start,
16797 vm_map_offset_t end)
16798 {
16799 vm_map_entry_t entry;
16800
16801 /*
16802 * The MADV_REUSABLE operation doesn't require any changes to the
16803 * vm_map_entry_t's, so the read lock is sufficient.
16804 */
16805
16806 vm_map_lock_read(map);
16807 assert(map->pmap != kernel_pmap); /* protect alias access */
16808
16809 /*
16810 * The madvise semantics require that the address range be fully
16811 * allocated with no holes. Otherwise, we're required to return
16812 * an error.
16813 */
16814
16815 if (!vm_map_range_check(map, start, end, &entry)) {
16816 vm_map_unlock_read(map);
16817 vm_page_stats_reusable.can_reuse_failure++;
16818 return KERN_INVALID_ADDRESS;
16819 }
16820
16821 /*
16822 * Examine each vm_map_entry_t in the range.
16823 */
16824 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16825 entry = entry->vme_next) {
16826 /*
16827 * Sanity check on the VM map entry.
16828 */
16829 if (!vm_map_entry_is_reusable(entry)) {
16830 vm_map_unlock_read(map);
16831 vm_page_stats_reusable.can_reuse_failure++;
16832 return KERN_INVALID_ADDRESS;
16833 }
16834 }
16835
16836 vm_map_unlock_read(map);
16837 vm_page_stats_reusable.can_reuse_success++;
16838 return KERN_SUCCESS;
16839 }
16840
16841
16842 #if MACH_ASSERT
/*
 * vm_map_pageout:
 *	Debug-only (MACH_ASSERT) helper: ask the pager to page out the
 *	anonymous ("internal") VM objects backing every map entry in
 *	[start, end).  Note that vm_object_pageout() is applied to the
 *	whole backing object, not just the portion covered by the range.
 *	Returns KERN_INVALID_ADDRESS if the range has holes (in the top
 *	map or in a submap), KERN_SUCCESS otherwise.
 */
static kern_return_t
vm_map_pageout(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t  entry;

	/*
	 * The MADV_PAGEOUT operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes. Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		vm_object_t     object;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (entry->is_sub_map) {
			/*
			 * Recurse one level into the submap: take its read
			 * lock (while still holding the parent map's read
			 * lock), check the equivalent range for holes, and
			 * page out the first-level objects found there.
			 * Deeper submap nesting is simply skipped.
			 */
			vm_map_t        submap;
			vm_map_offset_t submap_start;
			vm_map_offset_t submap_end;
			vm_map_entry_t  submap_entry;

			/* VME_OFFSET is the start address within the submap */
			submap = VME_SUBMAP(entry);
			submap_start = VME_OFFSET(entry);
			submap_end = submap_start + (entry->vme_end -
			    entry->vme_start);

			vm_map_lock_read(submap);

			if (!vm_map_range_check(submap,
			    submap_start,
			    submap_end,
			    &submap_entry)) {
				/* hole in the submap: drop both locks and fail */
				vm_map_unlock_read(submap);
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}

			if (submap_entry->is_sub_map) {
				/* nested submap: not handled, skip it */
				vm_map_unlock_read(submap);
				continue;
			}

			object = VME_OBJECT(submap_entry);
			if (object == VM_OBJECT_NULL || !object->internal) {
				/* nothing anonymous to page out here */
				vm_map_unlock_read(submap);
				continue;
			}

			vm_object_pageout(object);

			vm_map_unlock_read(submap);
			/* defensive: these are dead past this point */
			submap = VM_MAP_NULL;
			submap_entry = VM_MAP_ENTRY_NULL;
			continue;
		}

		/* regular entry: only anonymous memory is paged out */
		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL || !object->internal) {
			continue;
		}

		vm_object_pageout(object);
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
16931 #endif /* MACH_ASSERT */
16932
16933
16934 /*
16935 * Routine: vm_map_entry_insert
16936 *
16937 * Description: This routine inserts a new vm_entry in a locked map.
16938 */
16939 static vm_map_entry_t
vm_map_entry_insert(vm_map_t map,vm_map_entry_t insp_entry,vm_map_offset_t start,vm_map_offset_t end,vm_object_t object,vm_object_offset_t offset,vm_map_kernel_flags_t vmk_flags,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance,boolean_t clear_map_aligned)16940 vm_map_entry_insert(
16941 vm_map_t map,
16942 vm_map_entry_t insp_entry,
16943 vm_map_offset_t start,
16944 vm_map_offset_t end,
16945 vm_object_t object,
16946 vm_object_offset_t offset,
16947 vm_map_kernel_flags_t vmk_flags,
16948 boolean_t needs_copy,
16949 vm_prot_t cur_protection,
16950 vm_prot_t max_protection,
16951 vm_inherit_t inheritance,
16952 boolean_t clear_map_aligned)
16953 {
16954 vm_map_entry_t new_entry;
16955 boolean_t map_aligned = FALSE;
16956
16957 assert(insp_entry != (vm_map_entry_t)0);
16958 vm_map_lock_assert_exclusive(map);
16959
16960 #if DEVELOPMENT || DEBUG
16961 vm_object_offset_t end_offset = 0;
16962 assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
16963 #endif /* DEVELOPMENT || DEBUG */
16964
16965 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
16966 map_aligned = TRUE;
16967 }
16968 if (clear_map_aligned &&
16969 (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
16970 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
16971 map_aligned = FALSE;
16972 }
16973 if (map_aligned) {
16974 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
16975 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
16976 } else {
16977 assert(page_aligned(start));
16978 assert(page_aligned(end));
16979 }
16980 assert(start < end);
16981
16982 new_entry = vm_map_entry_create(map);
16983
16984 new_entry->vme_start = start;
16985 new_entry->vme_end = end;
16986
16987 if (vmk_flags.vmkf_submap) {
16988 new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
16989 VME_SUBMAP_SET(new_entry, (vm_map_t)object);
16990 } else {
16991 VME_OBJECT_SET(new_entry, object, false, 0);
16992 }
16993 VME_OFFSET_SET(new_entry, offset);
16994 VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);
16995
16996 new_entry->map_aligned = map_aligned;
16997 new_entry->needs_copy = needs_copy;
16998 new_entry->inheritance = inheritance;
16999 new_entry->protection = cur_protection;
17000 new_entry->max_protection = max_protection;
17001 /*
17002 * submap: "use_pmap" means "nested".
17003 * default: false.
17004 *
17005 * object: "use_pmap" means "use pmap accounting" for footprint.
17006 * default: true.
17007 */
17008 new_entry->use_pmap = !vmk_flags.vmkf_submap;
17009 new_entry->no_cache = vmk_flags.vmf_no_cache;
17010 new_entry->vme_permanent = vmk_flags.vmf_permanent;
17011 new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
17012 new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
17013 new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);
17014
17015 if (vmk_flags.vmkf_map_jit) {
17016 if (!(map->jit_entry_exists) ||
17017 VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
17018 new_entry->used_for_jit = TRUE;
17019 map->jit_entry_exists = TRUE;
17020 }
17021 }
17022
17023 /*
17024 * Insert the new entry into the list.
17025 */
17026
17027 vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
17028 map->size += end - start;
17029
17030 /*
17031 * Update the free space hint and the lookup hint.
17032 */
17033
17034 SAVE_HINT_MAP_WRITE(map, new_entry);
17035 return new_entry;
17036 }
17037
17038 /*
17039 * Routine: vm_map_remap_extract
17040 *
17041 * Description: This routine returns a vm_entry list from a map.
17042 */
17043 static kern_return_t
vm_map_remap_extract(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size,boolean_t copy,vm_map_copy_t map_copy,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)17044 vm_map_remap_extract(
17045 vm_map_t map,
17046 vm_map_offset_t addr,
17047 vm_map_size_t size,
17048 boolean_t copy,
17049 vm_map_copy_t map_copy,
17050 vm_prot_t *cur_protection, /* IN/OUT */
17051 vm_prot_t *max_protection, /* IN/OUT */
17052 /* What, no behavior? */
17053 vm_inherit_t inheritance,
17054 vm_map_kernel_flags_t vmk_flags)
17055 {
17056 struct vm_map_header *map_header = &map_copy->cpy_hdr;
17057 kern_return_t result;
17058 vm_map_size_t mapped_size;
17059 vm_map_size_t tmp_size;
17060 vm_map_entry_t src_entry; /* result of last map lookup */
17061 vm_map_entry_t new_entry;
17062 vm_object_offset_t offset;
17063 vm_map_offset_t map_address;
17064 vm_map_offset_t src_start; /* start of entry to map */
17065 vm_map_offset_t src_end; /* end of region to be mapped */
17066 vm_object_t object;
17067 vm_map_version_t version;
17068 boolean_t src_needs_copy;
17069 boolean_t new_entry_needs_copy;
17070 vm_map_entry_t saved_src_entry;
17071 boolean_t src_entry_was_wired;
17072 vm_prot_t max_prot_for_prot_copy;
17073 vm_map_offset_t effective_page_mask;
17074 bool pageable, same_map;
17075 boolean_t vm_remap_legacy;
17076 vm_prot_t required_cur_prot, required_max_prot;
17077 vm_object_t new_copy_object; /* vm_object_copy_* result */
17078 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
17079
17080 pageable = vmk_flags.vmkf_copy_pageable;
17081 same_map = vmk_flags.vmkf_copy_same_map;
17082
17083 effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
17084
17085 assert(map != VM_MAP_NULL);
17086 assert(size != 0);
17087 assert(size == vm_map_round_page(size, effective_page_mask));
17088 assert(inheritance == VM_INHERIT_NONE ||
17089 inheritance == VM_INHERIT_COPY ||
17090 inheritance == VM_INHERIT_SHARE);
17091 assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17092 assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17093 assert((*cur_protection & *max_protection) == *cur_protection);
17094
17095 /*
17096 * Compute start and end of region.
17097 */
17098 src_start = vm_map_trunc_page(addr, effective_page_mask);
17099 src_end = vm_map_round_page(src_start + size, effective_page_mask);
17100
17101 /*
17102 * Initialize map_header.
17103 */
17104 map_header->nentries = 0;
17105 map_header->entries_pageable = pageable;
17106 // map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
17107 map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
17108 map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
17109 vm_map_store_init(map_header);
17110
17111 if (copy && vmk_flags.vmkf_remap_prot_copy) {
17112 /*
17113 * Special case for vm_map_protect(VM_PROT_COPY):
17114 * we want to set the new mappings' max protection to the
17115 * specified *max_protection...
17116 */
17117 max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
17118 /* ... but we want to use the vm_remap() legacy mode */
17119 *max_protection = VM_PROT_NONE;
17120 *cur_protection = VM_PROT_NONE;
17121 } else {
17122 max_prot_for_prot_copy = VM_PROT_NONE;
17123 }
17124
17125 if (*cur_protection == VM_PROT_NONE &&
17126 *max_protection == VM_PROT_NONE) {
17127 /*
17128 * vm_remap() legacy mode:
17129 * Extract all memory regions in the specified range and
17130 * collect the strictest set of protections allowed on the
17131 * entire range, so the caller knows what they can do with
17132 * the remapped range.
17133 * We start with VM_PROT_ALL and we'll remove the protections
17134 * missing from each memory region.
17135 */
17136 vm_remap_legacy = TRUE;
17137 *cur_protection = VM_PROT_ALL;
17138 *max_protection = VM_PROT_ALL;
17139 required_cur_prot = VM_PROT_NONE;
17140 required_max_prot = VM_PROT_NONE;
17141 } else {
17142 /*
17143 * vm_remap_new() mode:
17144 * Extract all memory regions in the specified range and
17145 * ensure that they have at least the protections specified
17146 * by the caller via *cur_protection and *max_protection.
17147 * The resulting mapping should have these protections.
17148 */
17149 vm_remap_legacy = FALSE;
17150 if (copy) {
17151 required_cur_prot = VM_PROT_NONE;
17152 required_max_prot = VM_PROT_READ;
17153 } else {
17154 required_cur_prot = *cur_protection;
17155 required_max_prot = *max_protection;
17156 }
17157 }
17158
17159 map_address = 0;
17160 mapped_size = 0;
17161 result = KERN_SUCCESS;
17162
17163 /*
17164 * The specified source virtual space might correspond to
17165 * multiple map entries, need to loop on them.
17166 */
17167 vm_map_lock(map);
17168
17169 if (map->pmap == kernel_pmap) {
17170 map_copy->is_kernel_range = true;
17171 map_copy->orig_range = kmem_addr_get_range(addr, size);
17172 #if CONFIG_MAP_RANGES
17173 } else if (map->uses_user_ranges) {
17174 map_copy->is_user_range = true;
17175 map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
17176 #endif /* CONFIG_MAP_RANGES */
17177 }
17178
17179 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17180 /*
17181 * This address space uses sub-pages so the range might
17182 * not be re-mappable in an address space with larger
17183 * pages. Re-assemble any broken-up VM map entries to
17184 * improve our chances of making it work.
17185 */
17186 vm_map_simplify_range(map, src_start, src_end);
17187 }
17188 while (mapped_size != size) {
17189 vm_map_size_t entry_size;
17190
17191 /*
17192 * Find the beginning of the region.
17193 */
17194 if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
17195 result = KERN_INVALID_ADDRESS;
17196 break;
17197 }
17198
17199 if (src_start < src_entry->vme_start ||
17200 (mapped_size && src_start != src_entry->vme_start)) {
17201 result = KERN_INVALID_ADDRESS;
17202 break;
17203 }
17204
17205 tmp_size = size - mapped_size;
17206 if (src_end > src_entry->vme_end) {
17207 tmp_size -= (src_end - src_entry->vme_end);
17208 }
17209
17210 entry_size = (vm_map_size_t)(src_entry->vme_end -
17211 src_entry->vme_start);
17212
17213 if (src_entry->is_sub_map &&
17214 vmk_flags.vmkf_copy_single_object) {
17215 vm_map_t submap;
17216 vm_map_offset_t submap_start;
17217 vm_map_size_t submap_size;
17218 boolean_t submap_needs_copy;
17219
17220 /*
17221 * No check for "required protection" on "src_entry"
17222 * because the protections that matter are the ones
17223 * on the submap's VM map entry, which will be checked
17224 * during the call to vm_map_remap_extract() below.
17225 */
17226 submap_size = src_entry->vme_end - src_start;
17227 if (submap_size > size) {
17228 submap_size = size;
17229 }
17230 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17231 submap = VME_SUBMAP(src_entry);
17232 if (copy) {
17233 /*
17234 * The caller wants a copy-on-write re-mapping,
17235 * so let's extract from the submap accordingly.
17236 */
17237 submap_needs_copy = TRUE;
17238 } else if (src_entry->needs_copy) {
17239 /*
17240 * The caller wants a shared re-mapping but the
17241 * submap is mapped with "needs_copy", so its
17242 * contents can't be shared as is. Extract the
17243 * contents of the submap as "copy-on-write".
17244 * The re-mapping won't be shared with the
17245 * original mapping but this is equivalent to
17246 * what happened with the original "remap from
17247 * submap" code.
17248 * The shared region is mapped "needs_copy", for
17249 * example.
17250 */
17251 submap_needs_copy = TRUE;
17252 } else {
17253 /*
17254 * The caller wants a shared re-mapping and
17255 * this mapping can be shared (no "needs_copy"),
17256 * so let's extract from the submap accordingly.
17257 * Kernel submaps are mapped without
17258 * "needs_copy", for example.
17259 */
17260 submap_needs_copy = FALSE;
17261 }
17262 vm_map_reference(submap);
17263 vm_map_unlock(map);
17264 src_entry = NULL;
17265 if (vm_remap_legacy) {
17266 *cur_protection = VM_PROT_NONE;
17267 *max_protection = VM_PROT_NONE;
17268 }
17269
17270 DTRACE_VM7(remap_submap_recurse,
17271 vm_map_t, map,
17272 vm_map_offset_t, addr,
17273 vm_map_size_t, size,
17274 boolean_t, copy,
17275 vm_map_offset_t, submap_start,
17276 vm_map_size_t, submap_size,
17277 boolean_t, submap_needs_copy);
17278
17279 result = vm_map_remap_extract(submap,
17280 submap_start,
17281 submap_size,
17282 submap_needs_copy,
17283 map_copy,
17284 cur_protection,
17285 max_protection,
17286 inheritance,
17287 vmk_flags);
17288 vm_map_deallocate(submap);
17289
17290 if (result == KERN_SUCCESS &&
17291 submap_needs_copy &&
17292 !copy) {
17293 /*
17294 * We were asked for a "shared"
17295 * re-mapping but had to ask for a
17296 * "copy-on-write" remapping of the
17297 * submap's mapping to honor the
17298 * submap's "needs_copy".
17299 * We now need to resolve that
17300 * pending "copy-on-write" to
17301 * get something we can share.
17302 */
17303 vm_map_entry_t copy_entry;
17304 vm_object_offset_t copy_offset;
17305 vm_map_size_t copy_size;
17306 vm_object_t copy_object;
17307 copy_entry = vm_map_copy_first_entry(map_copy);
17308 copy_size = copy_entry->vme_end - copy_entry->vme_start;
17309 copy_object = VME_OBJECT(copy_entry);
17310 copy_offset = VME_OFFSET(copy_entry);
17311 if (copy_object == VM_OBJECT_NULL) {
17312 assert(copy_offset == 0);
17313 assert(!copy_entry->needs_copy);
17314 if (copy_entry->max_protection == VM_PROT_NONE) {
17315 assert(copy_entry->protection == VM_PROT_NONE);
17316 /* nothing to share */
17317 } else {
17318 assert(copy_offset == 0);
17319 copy_object = vm_object_allocate(copy_size);
17320 VME_OFFSET_SET(copy_entry, 0);
17321 VME_OBJECT_SET(copy_entry, copy_object, false, 0);
17322 assert(copy_entry->use_pmap);
17323 }
17324 } else if (copy_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17325 /* already shareable */
17326 assert(!copy_entry->needs_copy);
17327 } else if (copy_entry->needs_copy ||
17328 copy_object->shadowed ||
17329 (object->internal &&
17330 !object->true_share &&
17331 !copy_entry->is_shared &&
17332 copy_object->vo_size > copy_size)) {
17333 VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
17334 assert(copy_entry->use_pmap);
17335 if (copy_entry->needs_copy) {
17336 /* already write-protected */
17337 } else {
17338 vm_prot_t prot;
17339 prot = copy_entry->protection & ~VM_PROT_WRITE;
17340 vm_object_pmap_protect(copy_object,
17341 copy_offset,
17342 copy_size,
17343 PMAP_NULL,
17344 PAGE_SIZE,
17345 0,
17346 prot);
17347 }
17348 copy_entry->needs_copy = FALSE;
17349 }
17350 copy_object = VME_OBJECT(copy_entry);
17351 copy_offset = VME_OFFSET(copy_entry);
17352 if (copy_object &&
17353 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
17354 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
17355 copy_object->true_share = TRUE;
17356 }
17357 }
17358
17359 return result;
17360 }
17361
17362 if (src_entry->is_sub_map) {
17363 /* protections for submap mapping are irrelevant here */
17364 } else if (((src_entry->protection & required_cur_prot) !=
17365 required_cur_prot) ||
17366 ((src_entry->max_protection & required_max_prot) !=
17367 required_max_prot)) {
17368 if (vmk_flags.vmkf_copy_single_object &&
17369 mapped_size != 0) {
17370 /*
17371 * Single object extraction.
17372 * We can't extract more with the required
17373 * protection but we've extracted some, so
17374 * stop there and declare success.
17375 * The caller should check the size of
17376 * the copy entry we've extracted.
17377 */
17378 result = KERN_SUCCESS;
17379 } else {
17380 /*
17381 * VM range extraction.
17382 * Required proctection is not available
17383 * for this part of the range: fail.
17384 */
17385 result = KERN_PROTECTION_FAILURE;
17386 }
17387 break;
17388 }
17389
17390 if (src_entry->is_sub_map) {
17391 vm_map_t submap;
17392 vm_map_offset_t submap_start;
17393 vm_map_size_t submap_size;
17394 vm_map_copy_t submap_copy;
17395 vm_prot_t submap_curprot, submap_maxprot;
17396 boolean_t submap_needs_copy;
17397
17398 /*
17399 * No check for "required protection" on "src_entry"
17400 * because the protections that matter are the ones
17401 * on the submap's VM map entry, which will be checked
17402 * during the call to vm_map_copy_extract() below.
17403 */
17404 object = VM_OBJECT_NULL;
17405 submap_copy = VM_MAP_COPY_NULL;
17406
17407 /* find equivalent range in the submap */
17408 submap = VME_SUBMAP(src_entry);
17409 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17410 submap_size = tmp_size;
17411 if (copy) {
17412 /*
17413 * The caller wants a copy-on-write re-mapping,
17414 * so let's extract from the submap accordingly.
17415 */
17416 submap_needs_copy = TRUE;
17417 } else if (src_entry->needs_copy) {
17418 /*
17419 * The caller wants a shared re-mapping but the
17420 * submap is mapped with "needs_copy", so its
17421 * contents can't be shared as is. Extract the
17422 * contents of the submap as "copy-on-write".
17423 * The re-mapping won't be shared with the
17424 * original mapping but this is equivalent to
17425 * what happened with the original "remap from
17426 * submap" code.
17427 * The shared region is mapped "needs_copy", for
17428 * example.
17429 */
17430 submap_needs_copy = TRUE;
17431 } else {
17432 /*
17433 * The caller wants a shared re-mapping and
17434 * this mapping can be shared (no "needs_copy"),
17435 * so let's extract from the submap accordingly.
17436 * Kernel submaps are mapped without
17437 * "needs_copy", for example.
17438 */
17439 submap_needs_copy = FALSE;
17440 }
17441 /* extra ref to keep submap alive */
17442 vm_map_reference(submap);
17443
17444 DTRACE_VM7(remap_submap_recurse,
17445 vm_map_t, map,
17446 vm_map_offset_t, addr,
17447 vm_map_size_t, size,
17448 boolean_t, copy,
17449 vm_map_offset_t, submap_start,
17450 vm_map_size_t, submap_size,
17451 boolean_t, submap_needs_copy);
17452
17453 /*
17454 * The map can be safely unlocked since we
17455 * already hold a reference on the submap.
17456 *
17457 * No timestamp since we don't care if the map
17458 * gets modified while we're down in the submap.
17459 * We'll resume the extraction at src_start + tmp_size
17460 * anyway.
17461 */
17462 vm_map_unlock(map);
17463 src_entry = NULL; /* not valid once map is unlocked */
17464
17465 if (vm_remap_legacy) {
17466 submap_curprot = VM_PROT_NONE;
17467 submap_maxprot = VM_PROT_NONE;
17468 if (max_prot_for_prot_copy) {
17469 submap_maxprot = max_prot_for_prot_copy;
17470 }
17471 } else {
17472 assert(!max_prot_for_prot_copy);
17473 submap_curprot = *cur_protection;
17474 submap_maxprot = *max_protection;
17475 }
17476 result = vm_map_copy_extract(submap,
17477 submap_start,
17478 submap_size,
17479 submap_needs_copy,
17480 &submap_copy,
17481 &submap_curprot,
17482 &submap_maxprot,
17483 inheritance,
17484 vmk_flags);
17485
17486 /* release extra ref on submap */
17487 vm_map_deallocate(submap);
17488 submap = VM_MAP_NULL;
17489
17490 if (result != KERN_SUCCESS) {
17491 vm_map_lock(map);
17492 break;
17493 }
17494
17495 /* transfer submap_copy entries to map_header */
17496 while (vm_map_copy_first_entry(submap_copy) !=
17497 vm_map_copy_to_entry(submap_copy)) {
17498 vm_map_entry_t copy_entry;
17499 vm_map_size_t copy_entry_size;
17500
17501 copy_entry = vm_map_copy_first_entry(submap_copy);
17502
17503 /*
17504 * Prevent kernel_object from being exposed to
17505 * user space.
17506 */
17507 if (__improbable(copy_entry->vme_kernel_object)) {
17508 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17509 proc_selfpid(),
17510 (get_bsdtask_info(current_task())
17511 ? proc_name_address(get_bsdtask_info(current_task()))
17512 : "?"));
17513 DTRACE_VM(extract_kernel_only);
17514 result = KERN_INVALID_RIGHT;
17515 vm_map_copy_discard(submap_copy);
17516 submap_copy = VM_MAP_COPY_NULL;
17517 vm_map_lock(map);
17518 break;
17519 }
17520
17521 #ifdef __arm64e__
17522 if (vmk_flags.vmkf_tpro_enforcement_override) {
17523 copy_entry->used_for_tpro = FALSE;
17524 }
17525 #endif /* __arm64e__ */
17526
17527 vm_map_copy_entry_unlink(submap_copy, copy_entry);
17528 copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
17529 copy_entry->vme_start = map_address;
17530 copy_entry->vme_end = map_address + copy_entry_size;
17531 map_address += copy_entry_size;
17532 mapped_size += copy_entry_size;
17533 src_start += copy_entry_size;
17534 assert(src_start <= src_end);
17535 _vm_map_store_entry_link(map_header,
17536 map_header->links.prev,
17537 copy_entry);
17538 }
17539 /* done with submap_copy */
17540 vm_map_copy_discard(submap_copy);
17541
17542 if (vm_remap_legacy) {
17543 *cur_protection &= submap_curprot;
17544 *max_protection &= submap_maxprot;
17545 }
17546
17547 /* re-acquire the map lock and continue to next entry */
17548 vm_map_lock(map);
17549 continue;
17550 } else {
17551 object = VME_OBJECT(src_entry);
17552
17553 /*
17554 * Prevent kernel_object from being exposed to
17555 * user space.
17556 */
17557 if (__improbable(is_kernel_object(object))) {
17558 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17559 proc_selfpid(),
17560 (get_bsdtask_info(current_task())
17561 ? proc_name_address(get_bsdtask_info(current_task()))
17562 : "?"));
17563 DTRACE_VM(extract_kernel_only);
17564 result = KERN_INVALID_RIGHT;
17565 break;
17566 }
17567
17568 if (src_entry->iokit_acct) {
17569 /*
17570 * This entry uses "IOKit accounting".
17571 */
17572 } else if (object != VM_OBJECT_NULL &&
17573 (object->purgable != VM_PURGABLE_DENY ||
17574 object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
17575 /*
17576 * Purgeable objects have their own accounting:
17577 * no pmap accounting for them.
17578 */
17579 assertf(!src_entry->use_pmap,
17580 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17581 map,
17582 src_entry,
17583 (uint64_t)src_entry->vme_start,
17584 (uint64_t)src_entry->vme_end,
17585 src_entry->protection,
17586 src_entry->max_protection,
17587 VME_ALIAS(src_entry));
17588 } else {
17589 /*
17590 * Not IOKit or purgeable:
17591 * must be accounted by pmap stats.
17592 */
17593 assertf(src_entry->use_pmap,
17594 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17595 map,
17596 src_entry,
17597 (uint64_t)src_entry->vme_start,
17598 (uint64_t)src_entry->vme_end,
17599 src_entry->protection,
17600 src_entry->max_protection,
17601 VME_ALIAS(src_entry));
17602 }
17603
17604 if (object == VM_OBJECT_NULL) {
17605 assert(!src_entry->needs_copy);
17606 if (src_entry->max_protection == VM_PROT_NONE) {
17607 assert(src_entry->protection == VM_PROT_NONE);
17608 /*
17609 * No VM object and no permissions:
17610 * this must be a reserved range with
17611 * nothing to share or copy.
17612 * There could also be all sorts of
17613 * pmap shenanigans within that reserved
17614 * range, so let's just copy the map
17615 * entry as is to remap a similar
17616 * reserved range.
17617 */
17618 offset = 0; /* no object => no offset */
17619 goto copy_src_entry;
17620 }
17621 object = vm_object_allocate(entry_size);
17622 VME_OFFSET_SET(src_entry, 0);
17623 VME_OBJECT_SET(src_entry, object, false, 0);
17624 assert(src_entry->use_pmap);
17625 assert(!map->mapped_in_other_pmaps);
17626 } else if (src_entry->wired_count ||
17627 object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17628 /*
17629 * A wired memory region should not have
17630 * any pending copy-on-write and needs to
17631 * keep pointing at the VM object that
17632 * contains the wired pages.
17633 * If we're sharing this memory (copy=false),
17634 * we'll share this VM object.
17635 * If we're copying this memory (copy=true),
17636 * we'll call vm_object_copy_slowly() below
17637 * and use the new VM object for the remapping.
17638 *
17639 * Or, we are already using an asymmetric
17640 * copy, and therefore we already have
17641 * the right object.
17642 */
17643 assert(!src_entry->needs_copy);
17644 } else if (src_entry->needs_copy || object->shadowed ||
17645 (object->internal && !object->true_share &&
17646 !src_entry->is_shared &&
17647 object->vo_size > entry_size)) {
17648 bool is_writable;
17649
17650 VME_OBJECT_SHADOW(src_entry, entry_size,
17651 vm_map_always_shadow(map));
17652 assert(src_entry->use_pmap);
17653
17654 is_writable = false;
17655 if (src_entry->protection & VM_PROT_WRITE) {
17656 is_writable = true;
17657 #if __arm64e__
17658 } else if (src_entry->used_for_tpro) {
17659 is_writable = true;
17660 #endif /* __arm64e__ */
17661 }
17662 if (!src_entry->needs_copy && is_writable) {
17663 vm_prot_t prot;
17664
17665 assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
17666
17667 prot = src_entry->protection & ~VM_PROT_WRITE;
17668
17669 if (override_nx(map,
17670 VME_ALIAS(src_entry))
17671 && prot) {
17672 prot |= VM_PROT_EXECUTE;
17673 }
17674
17675 assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
17676
17677 if (map->mapped_in_other_pmaps) {
17678 vm_object_pmap_protect(
17679 VME_OBJECT(src_entry),
17680 VME_OFFSET(src_entry),
17681 entry_size,
17682 PMAP_NULL,
17683 PAGE_SIZE,
17684 src_entry->vme_start,
17685 prot);
17686 #if MACH_ASSERT
17687 } else if (__improbable(map->pmap == PMAP_NULL)) {
17688 extern boolean_t vm_tests_in_progress;
17689 assert(vm_tests_in_progress);
17690 /*
17691 * Some VM tests (in vm_tests.c)
17692 * sometimes want to use a VM
17693 * map without a pmap.
17694 * Otherwise, this should never
17695 * happen.
17696 */
17697 #endif /* MACH_ASSERT */
17698 } else {
17699 pmap_protect(vm_map_pmap(map),
17700 src_entry->vme_start,
17701 src_entry->vme_end,
17702 prot);
17703 }
17704 }
17705
17706 object = VME_OBJECT(src_entry);
17707 src_entry->needs_copy = FALSE;
17708 }
17709
17710
17711 vm_object_lock(object);
17712 vm_object_reference_locked(object); /* object ref. for new entry */
17713 assert(!src_entry->needs_copy);
17714 if (object->copy_strategy ==
17715 MEMORY_OBJECT_COPY_SYMMETRIC) {
17716 /*
17717 * If we want to share this object (copy==0),
17718 * it needs to be COPY_DELAY.
17719 * If we want to copy this object (copy==1),
17720 * we can't just set "needs_copy" on our side
17721 * and expect the other side to do the same
17722 * (symmetrically), so we can't let the object
17723 * stay COPY_SYMMETRIC.
17724 * So we always switch from COPY_SYMMETRIC to
17725 * COPY_DELAY.
17726 */
17727 object->copy_strategy =
17728 MEMORY_OBJECT_COPY_DELAY;
17729 object->true_share = TRUE;
17730 }
17731 vm_object_unlock(object);
17732 }
17733
17734 offset = (VME_OFFSET(src_entry) +
17735 (src_start - src_entry->vme_start));
17736
17737 copy_src_entry:
17738 new_entry = _vm_map_entry_create(map_header);
17739 vm_map_entry_copy(map, new_entry, src_entry);
17740 if (new_entry->is_sub_map) {
17741 /* clr address space specifics */
17742 new_entry->use_pmap = FALSE;
17743 } else if (copy) {
17744 /*
17745 * We're dealing with a copy-on-write operation,
17746 * so the resulting mapping should not inherit the
17747 * original mapping's accounting settings.
17748 * "use_pmap" should be reset to its default (TRUE)
17749 * so that the new mapping gets accounted for in
17750 * the task's memory footprint.
17751 */
17752 new_entry->use_pmap = TRUE;
17753 }
17754 /* "iokit_acct" was cleared in vm_map_entry_copy() */
17755 assert(!new_entry->iokit_acct);
17756
17757 new_entry->map_aligned = FALSE;
17758
17759 new_entry->vme_start = map_address;
17760 new_entry->vme_end = map_address + tmp_size;
17761 assert(new_entry->vme_start < new_entry->vme_end);
17762 if (copy && vmk_flags.vmkf_remap_prot_copy) {
17763 /* security: keep "permanent" and "csm_associated" */
17764 new_entry->vme_permanent = src_entry->vme_permanent;
17765 new_entry->csm_associated = src_entry->csm_associated;
17766 /*
17767 * Remapping for vm_map_protect(VM_PROT_COPY)
17768 * to convert a read-only mapping into a
17769 * copy-on-write version of itself but
17770 * with write access:
17771 * keep the original inheritance but let's not
17772 * add VM_PROT_WRITE to the max protection yet
17773 * since we want to do more security checks against
17774 * the target map.
17775 */
17776 new_entry->inheritance = src_entry->inheritance;
17777 new_entry->protection &= max_prot_for_prot_copy;
17778 } else {
17779 new_entry->inheritance = inheritance;
17780 if (!vm_remap_legacy) {
17781 new_entry->protection = *cur_protection;
17782 new_entry->max_protection = *max_protection;
17783 }
17784 }
17785 #ifdef __arm64e__
17786 if (copy && vmk_flags.vmkf_tpro_enforcement_override) {
17787 new_entry->used_for_tpro = FALSE;
17788 }
17789 #endif /* __arm64e__ */
17790 VME_OFFSET_SET(new_entry, offset);
17791
17792 /*
17793 * The new region has to be copied now if required.
17794 */
17795 RestartCopy:
17796 if (!copy) {
17797 if (src_entry->used_for_jit == TRUE) {
17798 if (same_map) {
17799 } else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
17800 /*
17801 * Cannot allow an entry describing a JIT
17802 * region to be shared across address spaces.
17803 */
17804 result = KERN_INVALID_ARGUMENT;
17805 vm_object_deallocate(object);
17806 vm_map_entry_dispose(new_entry);
17807 new_entry = VM_MAP_ENTRY_NULL;
17808 break;
17809 }
17810 }
17811
17812 src_entry->is_shared = TRUE;
17813 new_entry->is_shared = TRUE;
17814 if (!(new_entry->is_sub_map)) {
17815 new_entry->needs_copy = FALSE;
17816 }
17817 } else if (src_entry->is_sub_map) {
17818 /* make this a COW sub_map if not already */
17819 assert(new_entry->wired_count == 0);
17820 new_entry->needs_copy = TRUE;
17821 object = VM_OBJECT_NULL;
17822 } else if (src_entry->wired_count == 0 &&
17823 !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
17824 vm_object_copy_quickly(VME_OBJECT(new_entry),
17825 VME_OFFSET(new_entry),
17826 (new_entry->vme_end -
17827 new_entry->vme_start),
17828 &src_needs_copy,
17829 &new_entry_needs_copy)) {
17830 new_entry->needs_copy = new_entry_needs_copy;
17831 new_entry->is_shared = FALSE;
17832 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
17833
17834 /*
17835 * Handle copy_on_write semantics.
17836 */
17837 if (src_needs_copy && !src_entry->needs_copy) {
17838 vm_prot_t prot;
17839
17840 assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
17841
17842 prot = src_entry->protection & ~VM_PROT_WRITE;
17843
17844 if (override_nx(map,
17845 VME_ALIAS(src_entry))
17846 && prot) {
17847 prot |= VM_PROT_EXECUTE;
17848 }
17849
17850 assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
17851
17852 vm_object_pmap_protect(object,
17853 offset,
17854 entry_size,
17855 ((src_entry->is_shared
17856 || map->mapped_in_other_pmaps) ?
17857 PMAP_NULL : map->pmap),
17858 VM_MAP_PAGE_SIZE(map),
17859 src_entry->vme_start,
17860 prot);
17861
17862 assert(src_entry->wired_count == 0);
17863 src_entry->needs_copy = TRUE;
17864 }
17865 /*
17866 * Throw away the old object reference of the new entry.
17867 */
17868 vm_object_deallocate(object);
17869 } else {
17870 new_entry->is_shared = FALSE;
17871 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
17872
17873 src_entry_was_wired = (src_entry->wired_count > 0);
17874 saved_src_entry = src_entry;
17875 src_entry = VM_MAP_ENTRY_NULL;
17876
17877 /*
17878 * The map can be safely unlocked since we
17879 * already hold a reference on the object.
17880 *
17881 * Record the timestamp of the map for later
17882 * verification, and unlock the map.
17883 */
17884 version.main_timestamp = map->timestamp;
17885 vm_map_unlock(map); /* Increments timestamp once! */
17886
17887 /*
17888 * Perform the copy.
17889 */
17890 if (src_entry_was_wired > 0 ||
17891 (debug4k_no_cow_copyin &&
17892 VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
17893 vm_object_lock(object);
17894 result = vm_object_copy_slowly(
17895 object,
17896 offset,
17897 (new_entry->vme_end -
17898 new_entry->vme_start),
17899 THREAD_UNINT,
17900 &new_copy_object);
17901 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
17902 saved_used_for_jit = new_entry->used_for_jit;
17903 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
17904 new_entry->used_for_jit = saved_used_for_jit;
17905 VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
17906 new_entry->needs_copy = FALSE;
17907 } else {
17908 vm_object_offset_t new_offset;
17909
17910 new_offset = VME_OFFSET(new_entry);
17911 result = vm_object_copy_strategically(
17912 object,
17913 offset,
17914 (new_entry->vme_end -
17915 new_entry->vme_start),
17916 false, /* forking */
17917 &new_copy_object,
17918 &new_offset,
17919 &new_entry_needs_copy);
17920 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
17921 saved_used_for_jit = new_entry->used_for_jit;
17922 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
17923 new_entry->used_for_jit = saved_used_for_jit;
17924 if (new_offset != VME_OFFSET(new_entry)) {
17925 VME_OFFSET_SET(new_entry, new_offset);
17926 }
17927
17928 new_entry->needs_copy = new_entry_needs_copy;
17929 }
17930
17931 /*
17932 * Throw away the old object reference of the new entry.
17933 */
17934 vm_object_deallocate(object);
17935
17936 if (result != KERN_SUCCESS &&
17937 result != KERN_MEMORY_RESTART_COPY) {
17938 vm_map_entry_dispose(new_entry);
17939 vm_map_lock(map);
17940 break;
17941 }
17942
17943 /*
17944 * Verify that the map has not substantially
17945 * changed while the copy was being made.
17946 */
17947
17948 vm_map_lock(map);
17949 if (version.main_timestamp + 1 != map->timestamp) {
17950 /*
17951 * Simple version comparison failed.
17952 *
17953 * Retry the lookup and verify that the
17954 * same object/offset are still present.
17955 */
17956 saved_src_entry = VM_MAP_ENTRY_NULL;
17957 vm_object_deallocate(VME_OBJECT(new_entry));
17958 vm_map_entry_dispose(new_entry);
17959 if (result == KERN_MEMORY_RESTART_COPY) {
17960 result = KERN_SUCCESS;
17961 }
17962 continue;
17963 }
17964 /* map hasn't changed: src_entry is still valid */
17965 src_entry = saved_src_entry;
17966 saved_src_entry = VM_MAP_ENTRY_NULL;
17967
17968 if (result == KERN_MEMORY_RESTART_COPY) {
17969 vm_object_reference(object);
17970 goto RestartCopy;
17971 }
17972 }
17973
17974 _vm_map_store_entry_link(map_header,
17975 map_header->links.prev, new_entry);
17976
17977 /* protections for submap mapping are irrelevant here */
17978 if (vm_remap_legacy && !src_entry->is_sub_map) {
17979 *cur_protection &= src_entry->protection;
17980 *max_protection &= src_entry->max_protection;
17981 }
17982
17983 map_address += tmp_size;
17984 mapped_size += tmp_size;
17985 src_start += tmp_size;
17986
17987 if (vmk_flags.vmkf_copy_single_object) {
17988 if (mapped_size != size) {
17989 DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
17990 map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
17991 if (src_entry->vme_next != vm_map_to_entry(map) &&
17992 src_entry->vme_next->vme_object_value ==
17993 src_entry->vme_object_value) {
17994 /* XXX TODO4K */
17995 DEBUG4K_ERROR("could have extended copy to next entry...\n");
17996 }
17997 }
17998 break;
17999 }
18000 } /* end while */
18001
18002 vm_map_unlock(map);
18003 if (result != KERN_SUCCESS) {
18004 /*
18005 * Free all allocated elements.
18006 */
18007 for (src_entry = map_header->links.next;
18008 src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
18009 src_entry = new_entry) {
18010 new_entry = src_entry->vme_next;
18011 _vm_map_store_entry_unlink(map_header, src_entry, false);
18012 if (src_entry->is_sub_map) {
18013 vm_map_deallocate(VME_SUBMAP(src_entry));
18014 } else {
18015 vm_object_deallocate(VME_OBJECT(src_entry));
18016 }
18017 vm_map_entry_dispose(src_entry);
18018 }
18019 }
18020 return result;
18021 }
18022
18023 bool
vm_map_is_exotic(vm_map_t map)18024 vm_map_is_exotic(
18025 vm_map_t map)
18026 {
18027 return VM_MAP_IS_EXOTIC(map);
18028 }
18029
18030 bool
vm_map_is_alien(vm_map_t map)18031 vm_map_is_alien(
18032 vm_map_t map)
18033 {
18034 return VM_MAP_IS_ALIEN(map);
18035 }
18036
18037 #if XNU_TARGET_OS_OSX
18038 void
vm_map_mark_alien(vm_map_t map)18039 vm_map_mark_alien(
18040 vm_map_t map)
18041 {
18042 vm_map_lock(map);
18043 map->is_alien = true;
18044 vm_map_unlock(map);
18045 }
18046
18047 void
vm_map_single_jit(vm_map_t map)18048 vm_map_single_jit(
18049 vm_map_t map)
18050 {
18051 vm_map_lock(map);
18052 map->single_jit = true;
18053 vm_map_unlock(map);
18054 }
18055 #endif /* XNU_TARGET_OS_OSX */
18056
18057 /*
18058 * Callers of this function must call vm_map_copy_require on
18059 * previously created vm_map_copy_t or pass a newly created
18060 * one to ensure that it hasn't been forged.
18061 */
/*
 * vm_map_copy_to_physcopy:
 *	Replace the entries of "copy_map" (whose page size differs from
 *	"target_map"'s) with a single entry backed by a fresh anonymous
 *	VM object holding a physical (byte-by-byte) copy of the original
 *	mappings' contents.  On return, "copy_map" has "target_map"'s
 *	page shift and exactly one entry, so it can be mapped into
 *	"target_map" without alignment concerns.
 *	Returns KERN_SUCCESS, or KERN_RESOURCE_SHORTAGE if a temporary
 *	pmap could not be created.
 */
static kern_return_t
vm_map_copy_to_physcopy(
	vm_map_copy_t copy_map,
	vm_map_t target_map)
{
	vm_map_size_t size;
	vm_map_entry_t entry;
	vm_map_entry_t new_entry;
	vm_object_t new_object;
	unsigned int pmap_flags;
	pmap_t new_pmap;
	vm_map_t new_map;
	vm_map_address_t src_start, src_end, src_cur;
	vm_map_address_t dst_start, dst_end, dst_cur;
	kern_return_t kr;
	void *kbuf;

	/*
	 * Perform the equivalent of vm_allocate() and memcpy().
	 * Replace the mappings in "copy_map" with the newly allocated mapping.
	 */
	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);

	/*
	 * NOTE(review): this compares a page *shift* against a page *mask*,
	 * which is vacuously true for any sane values; it looks like
	 * VM_MAP_PAGE_SHIFT(target_map) was intended — confirm upstream.
	 */
	assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));

	/* create a new pmap to map "copy_map" */
	pmap_flags = 0;
	/* only reached for 4K copy maps being adjusted to a larger page size */
	assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
#if PMAP_CREATE_FORCE_4K_PAGES
	pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
#endif /* PMAP_CREATE_FORCE_4K_PAGES */
	pmap_flags |= PMAP_CREATE_64BIT;
	new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
	if (new_pmap == NULL) {
		return KERN_RESOURCE_SHORTAGE;
	}

	/* allocate new VM object, rounded up to the kernel page size */
	size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
	new_object = vm_object_allocate(size);
	assert(new_object);

	/* allocate new VM map entry */
	new_entry = vm_map_copy_entry_create(copy_map);
	assert(new_entry);

	/* finish initializing new VM map entry */
	new_entry->protection = VM_PROT_DEFAULT;
	new_entry->max_protection = VM_PROT_DEFAULT;
	new_entry->use_pmap = TRUE;

	/*
	 * Make new VM map entry point to new VM object; the object's
	 * initial reference (from vm_object_allocate()) is donated here.
	 */
	new_entry->vme_start = 0;
	new_entry->vme_end = size;
	VME_OBJECT_SET(new_entry, new_object, false, 0);
	VME_OFFSET_SET(new_entry, 0);

	/* create a new pageable VM map to map "copy_map" */
	new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
	    VM_MAP_CREATE_PAGEABLE);
	assert(new_map);
	vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);

	/*
	 * Map "copy_map" in the new VM map, without consuming it, so the
	 * original entries remain for disposal below.
	 */
	src_start = 0;
	kr = vm_map_copyout_internal(
		new_map,
		&src_start,
		copy_map,
		copy_map->size,
		FALSE, /* consume_on_success */
		VM_PROT_DEFAULT,
		VM_PROT_DEFAULT,
		VM_INHERIT_DEFAULT);
	assert(kr == KERN_SUCCESS);
	src_end = src_start + copy_map->size;

	/*
	 * Map "new_object" in the new VM map; the extra reference taken
	 * here is donated to the mapping and dropped by vm_map_destroy().
	 */
	vm_object_reference(new_object);
	dst_start = 0;
	kr = vm_map_enter(new_map,
	    &dst_start,
	    size,
	    0, /* mask */
	    VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
	    new_object,
	    0, /* offset */
	    FALSE, /* needs copy */
	    VM_PROT_DEFAULT,
	    VM_PROT_DEFAULT,
	    VM_INHERIT_DEFAULT);
	assert(kr == KERN_SUCCESS);
	dst_end = dst_start + size;

	/* get a kernel buffer (Z_NOFAIL: never returns NULL) */
	kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);

	/* physically copy "copy_map" mappings to new VM object, page by page */
	for (src_cur = src_start, dst_cur = dst_start;
	    src_cur < src_end;
	    src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
		vm_size_t bytes;

		bytes = PAGE_SIZE;
		if (src_cur + PAGE_SIZE > src_end) {
			/* partial copy for last page */
			bytes = src_end - src_cur;
			assert(bytes > 0 && bytes < PAGE_SIZE);
			/* rest of dst page should be zero-filled */
		}
		/*
		 * Get bytes from src mapping.
		 * NOTE(review): copyinmap()/copyoutmap() failures are only
		 * logged, not propagated; the copy proceeds best-effort —
		 * confirm this is intentional.
		 */
		kr = copyinmap(new_map, src_cur, kbuf, bytes);
		if (kr != KERN_SUCCESS) {
			DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
		}
		/* put bytes in dst mapping */
		assert(dst_cur < dst_end);
		assert(dst_cur + bytes <= dst_end);
		kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
		if (kr != KERN_SUCCESS) {
			DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
		}
	}

	/* free kernel buffer */
	kfree_data(kbuf, PAGE_SIZE);

	/* destroy new map; drops both temporary mappings' references */
	vm_map_destroy(new_map);
	new_map = VM_MAP_NULL;

	/*
	 * Dispose of the old map entries in "copy_map", releasing the
	 * submap/object reference held by each.
	 */
	while (vm_map_copy_first_entry(copy_map) !=
	    vm_map_copy_to_entry(copy_map)) {
		entry = vm_map_copy_first_entry(copy_map);
		vm_map_copy_entry_unlink(copy_map, entry);
		if (entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(entry));
		} else {
			vm_object_deallocate(VME_OBJECT(entry));
		}
		vm_map_copy_entry_dispose(entry);
	}

	/* change "copy_map"'s page_size to match "target_map" */
	copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
	copy_map->offset = 0;
	copy_map->size = size;

	/* insert new map entry in "copy_map" */
	assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
	vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);

	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
	return KERN_SUCCESS;
}
18218
18219 void
18220 vm_map_copy_adjust_get_target_copy_map(
18221 vm_map_copy_t copy_map,
18222 vm_map_copy_t *target_copy_map_p);
18223 void
vm_map_copy_adjust_get_target_copy_map(vm_map_copy_t copy_map,vm_map_copy_t * target_copy_map_p)18224 vm_map_copy_adjust_get_target_copy_map(
18225 vm_map_copy_t copy_map,
18226 vm_map_copy_t *target_copy_map_p)
18227 {
18228 vm_map_copy_t target_copy_map;
18229 vm_map_entry_t entry, target_entry;
18230
18231 if (*target_copy_map_p != VM_MAP_COPY_NULL) {
18232 /* the caller already has a "target_copy_map": use it */
18233 return;
18234 }
18235
18236 /* the caller wants us to create a new copy of "copy_map" */
18237 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18238 target_copy_map = vm_map_copy_allocate(copy_map->type);
18239 target_copy_map->offset = copy_map->offset;
18240 target_copy_map->size = copy_map->size;
18241 target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
18242 for (entry = vm_map_copy_first_entry(copy_map);
18243 entry != vm_map_copy_to_entry(copy_map);
18244 entry = entry->vme_next) {
18245 target_entry = vm_map_copy_entry_create(target_copy_map);
18246 vm_map_entry_copy_full(target_entry, entry);
18247 if (target_entry->is_sub_map) {
18248 vm_map_reference(VME_SUBMAP(target_entry));
18249 } else {
18250 vm_object_reference(VME_OBJECT(target_entry));
18251 }
18252 vm_map_copy_entry_link(
18253 target_copy_map,
18254 vm_map_copy_last_entry(target_copy_map),
18255 target_entry);
18256 }
18257 entry = VM_MAP_ENTRY_NULL;
18258 *target_copy_map_p = target_copy_map;
18259 }
18260
18261 /*
18262 * Callers of this function must call vm_map_copy_require on
18263 * previously created vm_map_copy_t or pass a newly created
18264 * one to ensure that it hasn't been forged.
18265 */
/*
 * vm_map_copy_trim:
 *	Remove the range [trim_start, trim_end) from "copy_map".
 *	"trim_start"/"trim_end" are offsets relative to the start of the
 *	copy map's first entry (they are rebased to absolute addresses
 *	below).  Entries overlapping the range are clipped as needed;
 *	entries fully inside it are unlinked, their submap/object
 *	reference released, and disposed of.  "copy_map"'s size is
 *	reduced accordingly.
 *	"new_page_shift" is installed temporarily so that the clipping
 *	obeys the target's page size; the original page shift is
 *	restored before returning.
 */
static void
vm_map_copy_trim(
	vm_map_copy_t copy_map,
	uint16_t new_page_shift,
	vm_map_offset_t trim_start,
	vm_map_offset_t trim_end)
{
	uint16_t copy_page_shift;
	vm_map_entry_t entry, next_entry;

	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
	assert(copy_map->cpy_hdr.nentries > 0);

	/* rebase the trim range from relative offsets to entry addresses */
	trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
	trim_end += vm_map_copy_first_entry(copy_map)->vme_start;

	/* use the new page_shift to do the clipping */
	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
	copy_map->cpy_hdr.page_shift = new_page_shift;

	for (entry = vm_map_copy_first_entry(copy_map);
	    entry != vm_map_copy_to_entry(copy_map);
	    entry = next_entry) {
		/*
		 * Capture the successor before clipping: a clip at
		 * "trim_end" may insert a new tail entry after "entry",
		 * which lies outside the trim range and must survive.
		 */
		next_entry = entry->vme_next;
		if (entry->vme_end <= trim_start) {
			/* entry fully before trim range: skip */
			continue;
		}
		if (entry->vme_start >= trim_end) {
			/* entry fully after trim range: done */
			break;
		}
		/* clip entry if needed, so it lies fully inside the range */
		vm_map_copy_clip_start(copy_map, entry, trim_start);
		vm_map_copy_clip_end(copy_map, entry, trim_end);
		/* dispose of entry and drop its submap/object reference */
		copy_map->size -= entry->vme_end - entry->vme_start;
		vm_map_copy_entry_unlink(copy_map, entry);
		if (entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(entry));
		} else {
			vm_object_deallocate(VME_OBJECT(entry));
		}
		vm_map_copy_entry_dispose(entry);
		entry = VM_MAP_ENTRY_NULL;
	}

	/* restore copy_map's original page_shift */
	copy_map->cpy_hdr.page_shift = copy_page_shift;
}
18316
18317 /*
18318 * Make any necessary adjustments to "copy_map" to allow it to be
18319 * mapped into "target_map".
18320 * If no changes were necessary, "target_copy_map" points to the
18321 * untouched "copy_map".
18322 * If changes are necessary, changes will be made to "target_copy_map".
18323 * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
18324 * copy the original "copy_map" to it before applying the changes.
18325 * The caller should discard "target_copy_map" if it's not the same as
18326 * the original "copy_map".
18327 */
18328 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
18329 kern_return_t
vm_map_copy_adjust_to_target(vm_map_copy_t src_copy_map,vm_map_offset_t offset,vm_map_size_t size,vm_map_t target_map,boolean_t copy,vm_map_copy_t * target_copy_map_p,vm_map_offset_t * overmap_start_p,vm_map_offset_t * overmap_end_p,vm_map_offset_t * trimmed_start_p)18330 vm_map_copy_adjust_to_target(
18331 vm_map_copy_t src_copy_map,
18332 vm_map_offset_t offset,
18333 vm_map_size_t size,
18334 vm_map_t target_map,
18335 boolean_t copy,
18336 vm_map_copy_t *target_copy_map_p,
18337 vm_map_offset_t *overmap_start_p,
18338 vm_map_offset_t *overmap_end_p,
18339 vm_map_offset_t *trimmed_start_p)
18340 {
18341 vm_map_copy_t copy_map, target_copy_map;
18342 vm_map_size_t target_size;
18343 vm_map_size_t src_copy_map_size;
18344 vm_map_size_t overmap_start, overmap_end;
18345 int misalignments;
18346 vm_map_entry_t entry, target_entry;
18347 vm_map_offset_t addr_adjustment;
18348 vm_map_offset_t new_start, new_end;
18349 int copy_page_mask, target_page_mask;
18350 uint16_t copy_page_shift, target_page_shift;
18351 vm_map_offset_t trimmed_end;
18352
18353 /*
18354 * Assert that the vm_map_copy is coming from the right
18355 * zone and hasn't been forged
18356 */
18357 vm_map_copy_require(src_copy_map);
18358 assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18359
18360 /*
18361 * Start working with "src_copy_map" but we'll switch
18362 * to "target_copy_map" as soon as we start making adjustments.
18363 */
18364 copy_map = src_copy_map;
18365 src_copy_map_size = src_copy_map->size;
18366
18367 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18368 copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
18369 target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18370 target_page_mask = VM_MAP_PAGE_MASK(target_map);
18371
18372 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, *target_copy_map_p);
18373
18374 target_copy_map = *target_copy_map_p;
18375 if (target_copy_map != VM_MAP_COPY_NULL) {
18376 vm_map_copy_require(target_copy_map);
18377 }
18378
18379 if (offset + size > copy_map->size) {
18380 DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)offset, (uint64_t)size);
18381 return KERN_INVALID_ARGUMENT;
18382 }
18383
18384 /* trim the end */
18385 trimmed_end = 0;
18386 new_end = VM_MAP_ROUND_PAGE(offset + size, target_page_mask);
18387 if (new_end < copy_map->size) {
18388 trimmed_end = src_copy_map_size - new_end;
18389 DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
18390 /* get "target_copy_map" if needed and adjust it */
18391 vm_map_copy_adjust_get_target_copy_map(copy_map,
18392 &target_copy_map);
18393 copy_map = target_copy_map;
18394 vm_map_copy_trim(target_copy_map, target_page_shift,
18395 new_end, copy_map->size);
18396 }
18397
18398 /* trim the start */
18399 new_start = VM_MAP_TRUNC_PAGE(offset, target_page_mask);
18400 if (new_start != 0) {
18401 DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)0, (uint64_t)new_start);
18402 /* get "target_copy_map" if needed and adjust it */
18403 vm_map_copy_adjust_get_target_copy_map(copy_map,
18404 &target_copy_map);
18405 copy_map = target_copy_map;
18406 vm_map_copy_trim(target_copy_map, target_page_shift,
18407 0, new_start);
18408 }
18409 *trimmed_start_p = new_start;
18410
18411 /* target_size starts with what's left after trimming */
18412 target_size = copy_map->size;
18413 assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
18414 "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
18415 (uint64_t)target_size, (uint64_t)src_copy_map_size,
18416 (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);
18417
18418 /* check for misalignments but don't adjust yet */
18419 misalignments = 0;
18420 overmap_start = 0;
18421 overmap_end = 0;
18422 if (copy_page_shift < target_page_shift) {
18423 /*
18424 * Remapping from 4K to 16K: check the VM object alignments
18425 * throughout the range.
18426 * If the start and end of the range are mis-aligned, we can
18427 * over-map to re-align, and adjust the "overmap" start/end
18428 * and "target_size" of the range accordingly.
18429 * If there is any mis-alignment within the range:
18430 * if "copy":
18431 * we can do immediate-copy instead of copy-on-write,
18432 * else:
18433 * no way to remap and share; fail.
18434 */
18435 for (entry = vm_map_copy_first_entry(copy_map);
18436 entry != vm_map_copy_to_entry(copy_map);
18437 entry = entry->vme_next) {
18438 vm_object_offset_t object_offset_start, object_offset_end;
18439
18440 object_offset_start = VME_OFFSET(entry);
18441 object_offset_end = object_offset_start;
18442 object_offset_end += entry->vme_end - entry->vme_start;
18443 if (object_offset_start & target_page_mask) {
18444 if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
18445 overmap_start++;
18446 } else {
18447 misalignments++;
18448 }
18449 }
18450 if (object_offset_end & target_page_mask) {
18451 if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
18452 overmap_end++;
18453 } else {
18454 misalignments++;
18455 }
18456 }
18457 }
18458 }
18459 entry = VM_MAP_ENTRY_NULL;
18460
18461 /* decide how to deal with misalignments */
18462 assert(overmap_start <= 1);
18463 assert(overmap_end <= 1);
18464 if (!overmap_start && !overmap_end && !misalignments) {
18465 /* copy_map is properly aligned for target_map ... */
18466 if (*trimmed_start_p) {
18467 /* ... but we trimmed it, so still need to adjust */
18468 } else {
18469 /* ... and we didn't trim anything: we're done */
18470 if (target_copy_map == VM_MAP_COPY_NULL) {
18471 target_copy_map = copy_map;
18472 }
18473 *target_copy_map_p = target_copy_map;
18474 *overmap_start_p = 0;
18475 *overmap_end_p = 0;
18476 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18477 return KERN_SUCCESS;
18478 }
18479 } else if (misalignments && !copy) {
18480 /* can't "share" if misaligned */
18481 DEBUG4K_ADJUST("unsupported sharing\n");
18482 #if MACH_ASSERT
18483 if (debug4k_panic_on_misaligned_sharing) {
18484 panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
18485 }
18486 #endif /* MACH_ASSERT */
18487 DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
18488 return KERN_NOT_SUPPORTED;
18489 } else {
18490 /* can't virtual-copy if misaligned (but can physical-copy) */
18491 DEBUG4K_ADJUST("mis-aligned copying\n");
18492 }
18493
18494 /* get a "target_copy_map" if needed and switch to it */
18495 vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
18496 copy_map = target_copy_map;
18497
18498 if (misalignments && copy) {
18499 vm_map_size_t target_copy_map_size;
18500
18501 /*
18502 * Can't do copy-on-write with misaligned mappings.
18503 * Replace the mappings with a physical copy of the original
18504 * mappings' contents.
18505 */
18506 target_copy_map_size = target_copy_map->size;
18507 kern_return_t kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
18508 if (kr != KERN_SUCCESS) {
18509 return kr;
18510 }
18511 *target_copy_map_p = target_copy_map;
18512 *overmap_start_p = 0;
18513 *overmap_end_p = target_copy_map->size - target_copy_map_size;
18514 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18515 return KERN_SUCCESS;
18516 }
18517
18518 /* apply the adjustments */
18519 misalignments = 0;
18520 overmap_start = 0;
18521 overmap_end = 0;
18522 /* remove copy_map->offset, so that everything starts at offset 0 */
18523 addr_adjustment = copy_map->offset;
18524 /* also remove whatever we trimmed from the start */
18525 addr_adjustment += *trimmed_start_p;
18526 for (target_entry = vm_map_copy_first_entry(target_copy_map);
18527 target_entry != vm_map_copy_to_entry(target_copy_map);
18528 target_entry = target_entry->vme_next) {
18529 vm_object_offset_t object_offset_start, object_offset_end;
18530
18531 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18532 object_offset_start = VME_OFFSET(target_entry);
18533 if (object_offset_start & target_page_mask) {
18534 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18535 if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
18536 /*
18537 * start of 1st entry is mis-aligned:
18538 * re-adjust by over-mapping.
18539 */
18540 overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
18541 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
18542 VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
18543 } else {
18544 misalignments++;
18545 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
18546 assert(copy);
18547 }
18548 }
18549
18550 if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
18551 target_size += overmap_start;
18552 } else {
18553 target_entry->vme_start += overmap_start;
18554 }
18555 target_entry->vme_end += overmap_start;
18556
18557 object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
18558 if (object_offset_end & target_page_mask) {
18559 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18560 if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
18561 /*
18562 * end of last entry is mis-aligned: re-adjust by over-mapping.
18563 */
18564 overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
18565 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
18566 target_entry->vme_end += overmap_end;
18567 target_size += overmap_end;
18568 } else {
18569 misalignments++;
18570 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
18571 assert(copy);
18572 }
18573 }
18574 target_entry->vme_start -= addr_adjustment;
18575 target_entry->vme_end -= addr_adjustment;
18576 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18577 }
18578
18579 target_copy_map->size = target_size;
18580 target_copy_map->offset += overmap_start;
18581 target_copy_map->offset -= addr_adjustment;
18582 target_copy_map->cpy_hdr.page_shift = target_page_shift;
18583
18584 // assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
18585 // assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
18586 assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
18587 assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));
18588
18589 *target_copy_map_p = target_copy_map;
18590 *overmap_start_p = overmap_start;
18591 *overmap_end_p = overmap_end;
18592
18593 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18594 return KERN_SUCCESS;
18595 }
18596
18597 kern_return_t
vm_map_range_physical_size(vm_map_t map,vm_map_address_t start,mach_vm_size_t size,mach_vm_size_t * phys_size)18598 vm_map_range_physical_size(
18599 vm_map_t map,
18600 vm_map_address_t start,
18601 mach_vm_size_t size,
18602 mach_vm_size_t * phys_size)
18603 {
18604 kern_return_t kr;
18605 vm_map_copy_t copy_map, target_copy_map;
18606 vm_map_offset_t adjusted_start, adjusted_end;
18607 vm_map_size_t adjusted_size;
18608 vm_prot_t cur_prot, max_prot;
18609 vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
18610 vm_map_kernel_flags_t vmk_flags;
18611
18612 if (size == 0) {
18613 DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
18614 *phys_size = 0;
18615 return KERN_SUCCESS;
18616 }
18617
18618 adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
18619 adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
18620 if (__improbable(os_add_overflow(start, size, &end) ||
18621 adjusted_end <= adjusted_start)) {
18622 /* wraparound */
18623 printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
18624 *phys_size = 0;
18625 return KERN_INVALID_ARGUMENT;
18626 }
18627 if (__improbable(vm_map_range_overflows(map, start, size))) {
18628 *phys_size = 0;
18629 return KERN_INVALID_ADDRESS;
18630 }
18631 assert(adjusted_end > adjusted_start);
18632 adjusted_size = adjusted_end - adjusted_start;
18633 *phys_size = adjusted_size;
18634 if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
18635 return KERN_SUCCESS;
18636 }
18637 if (start == 0) {
18638 adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
18639 adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
18640 if (__improbable(adjusted_end <= adjusted_start)) {
18641 /* wraparound */
18642 printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
18643 *phys_size = 0;
18644 return KERN_INVALID_ARGUMENT;
18645 }
18646 assert(adjusted_end > adjusted_start);
18647 adjusted_size = adjusted_end - adjusted_start;
18648 *phys_size = adjusted_size;
18649 return KERN_SUCCESS;
18650 }
18651
18652 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
18653 vmk_flags.vmkf_copy_pageable = TRUE;
18654 vmk_flags.vmkf_copy_same_map = TRUE;
18655 assert(adjusted_size != 0);
18656 cur_prot = VM_PROT_NONE; /* legacy mode */
18657 max_prot = VM_PROT_NONE; /* legacy mode */
18658 kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
18659 FALSE /* copy */,
18660 ©_map,
18661 &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
18662 vmk_flags);
18663 if (kr != KERN_SUCCESS) {
18664 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
18665 //assert(0);
18666 *phys_size = 0;
18667 return kr;
18668 }
18669 assert(copy_map != VM_MAP_COPY_NULL);
18670 target_copy_map = copy_map;
18671 DEBUG4K_ADJUST("adjusting...\n");
18672 kr = vm_map_copy_adjust_to_target(
18673 copy_map,
18674 start - adjusted_start, /* offset */
18675 size, /* size */
18676 kernel_map,
18677 FALSE, /* copy */
18678 &target_copy_map,
18679 &overmap_start,
18680 &overmap_end,
18681 &trimmed_start);
18682 if (kr == KERN_SUCCESS) {
18683 if (target_copy_map->size != *phys_size) {
18684 DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
18685 }
18686 *phys_size = target_copy_map->size;
18687 } else {
18688 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
18689 //assert(0);
18690 *phys_size = 0;
18691 }
18692 vm_map_copy_discard(copy_map);
18693 copy_map = VM_MAP_COPY_NULL;
18694
18695 return kr;
18696 }
18697
18698
/*
 * Routine: memory_entry_check_for_adjustment
 *
 * For a named-entry port, check whether mapping its backing copy map
 * into "src_map" would require page-size adjustment (when "src_map"
 * uses pages smaller than the kernel's), and report how much
 * over-mapping would occur at the start and end of the range via
 * *overmap_start / *overmap_end.
 * "port" must be a valid IKOT_NAMED_ENTRY port (asserted below).
 */
kern_return_t
memory_entry_check_for_adjustment(
	vm_map_t src_map,
	ipc_port_t port,
	vm_map_offset_t *overmap_start,
	vm_map_offset_t *overmap_end)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_copy_t copy_map = VM_MAP_COPY_NULL, target_copy_map = VM_MAP_COPY_NULL;

	assert(port);
	assertf(ip_kotype(port) == IKOT_NAMED_ENTRY, "Port Type expected: %d...received:%d\n", IKOT_NAMED_ENTRY, ip_kotype(port));

	vm_named_entry_t named_entry;

	named_entry = mach_memory_entry_from_port(port);
	/* hold the named entry lock while inspecting its backing copy */
	named_entry_lock(named_entry);
	copy_map = named_entry->backing.copy;
	target_copy_map = copy_map;

	/* adjustment is only needed for sub-native page size maps */
	if (src_map && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT) {
		vm_map_offset_t trimmed_start;

		trimmed_start = 0;
		DEBUG4K_ADJUST("adjusting...\n");
		kr = vm_map_copy_adjust_to_target(
			copy_map,
			0, /* offset */
			copy_map->size, /* size */
			src_map,
			FALSE, /* copy */
			&target_copy_map,
			overmap_start,
			overmap_end,
			&trimmed_start);
		/* whole-range adjustment should never trim the start */
		assert(trimmed_start == 0);
	}
	named_entry_unlock(named_entry);

	return kr;
}
18740
18741
18742 /*
18743 * Routine: vm_remap
18744 *
18745 * Map portion of a task's address space.
18746 * Mapped region must not overlap more than
18747 * one vm memory object. Protections and
18748 * inheritance attributes remain the same
18749 * as in the original task and are out parameters.
18750 * Source and Target task can be identical
18751 * Other attributes are identical as for vm_map()
18752 */
18753 kern_return_t
vm_map_remap(vm_map_t target_map,vm_map_address_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_t src_map,vm_map_offset_t memory_address,boolean_t copy,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance)18754 vm_map_remap(
18755 vm_map_t target_map,
18756 vm_map_address_t *address,
18757 vm_map_size_t size,
18758 vm_map_offset_t mask,
18759 vm_map_kernel_flags_t vmk_flags,
18760 vm_map_t src_map,
18761 vm_map_offset_t memory_address,
18762 boolean_t copy,
18763 vm_prot_t *cur_protection, /* IN/OUT */
18764 vm_prot_t *max_protection, /* IN/OUT */
18765 vm_inherit_t inheritance)
18766 {
18767 kern_return_t result;
18768 vm_map_entry_t entry;
18769 vm_map_entry_t insp_entry = VM_MAP_ENTRY_NULL;
18770 vm_map_entry_t new_entry;
18771 vm_map_copy_t copy_map;
18772 vm_map_offset_t offset_in_mapping;
18773 vm_map_size_t target_size = 0;
18774 vm_map_size_t src_page_mask, target_page_mask;
18775 vm_map_offset_t overmap_start, overmap_end, trimmed_start;
18776 vm_map_offset_t initial_memory_address;
18777 vm_map_size_t initial_size;
18778 VM_MAP_ZAP_DECLARE(zap_list);
18779
18780 if (target_map == VM_MAP_NULL) {
18781 return KERN_INVALID_ARGUMENT;
18782 }
18783
18784 if (__improbable(vm_map_range_overflows(src_map, memory_address, size))) {
18785 return KERN_INVALID_ARGUMENT;
18786 }
18787
18788 initial_memory_address = memory_address;
18789 initial_size = size;
18790 src_page_mask = VM_MAP_PAGE_MASK(src_map);
18791 target_page_mask = VM_MAP_PAGE_MASK(target_map);
18792
18793 switch (inheritance) {
18794 case VM_INHERIT_NONE:
18795 case VM_INHERIT_COPY:
18796 case VM_INHERIT_SHARE:
18797 if (size != 0 && src_map != VM_MAP_NULL) {
18798 break;
18799 }
18800 OS_FALLTHROUGH;
18801 default:
18802 return KERN_INVALID_ARGUMENT;
18803 }
18804
18805 if (src_page_mask != target_page_mask) {
18806 if (copy) {
18807 DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
18808 } else {
18809 DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
18810 }
18811 }
18812
18813 /*
18814 * If the user is requesting that we return the address of the
18815 * first byte of the data (rather than the base of the page),
18816 * then we use different rounding semantics: specifically,
18817 * we assume that (memory_address, size) describes a region
18818 * all of whose pages we must cover, rather than a base to be truncated
18819 * down and a size to be added to that base. So we figure out
18820 * the highest page that the requested region includes and make
18821 * sure that the size will cover it.
18822 *
18823 * The key example we're worried about it is of the form:
18824 *
18825 * memory_address = 0x1ff0, size = 0x20
18826 *
18827 * With the old semantics, we round down the memory_address to 0x1000
18828 * and round up the size to 0x1000, resulting in our covering *only*
18829 * page 0x1000. With the new semantics, we'd realize that the region covers
18830 * 0x1ff0-0x2010, and compute a size of 0x2000. Thus, we cover both page
18831 * 0x1000 and page 0x2000 in the region we remap.
18832 */
18833 if (vmk_flags.vmf_return_data_addr) {
18834 vm_map_offset_t range_start, range_end;
18835
18836 range_start = vm_map_trunc_page(memory_address, src_page_mask);
18837 range_end = vm_map_round_page(memory_address + size, src_page_mask);
18838 memory_address = range_start;
18839 size = range_end - range_start;
18840 offset_in_mapping = initial_memory_address - memory_address;
18841 } else {
18842 /*
18843 * IMPORTANT:
18844 * This legacy code path is broken: for the range mentioned
18845 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
18846 * two 4k pages, it yields [ memory_address = 0x1000,
18847 * size = 0x1000 ], which covers only the first 4k page.
18848 * BUT some code unfortunately depends on this bug, so we
18849 * can't fix it without breaking something.
18850 * New code should get automatically opted in the new
18851 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flags.
18852 */
18853 offset_in_mapping = 0;
18854 memory_address = vm_map_trunc_page(memory_address, src_page_mask);
18855 size = vm_map_round_page(size, src_page_mask);
18856 initial_memory_address = memory_address;
18857 initial_size = size;
18858 }
18859
18860
18861 if (size == 0) {
18862 return KERN_INVALID_ARGUMENT;
18863 }
18864
18865 if (vmk_flags.vmf_resilient_media) {
18866 /* must be copy-on-write to be "media resilient" */
18867 if (!copy) {
18868 return KERN_INVALID_ARGUMENT;
18869 }
18870 }
18871
18872 vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
18873 vmk_flags.vmkf_copy_same_map = (src_map == target_map);
18874
18875 assert(size != 0);
18876 result = vm_map_copy_extract(src_map,
18877 memory_address,
18878 size,
18879 copy, ©_map,
18880 cur_protection, /* IN/OUT */
18881 max_protection, /* IN/OUT */
18882 inheritance,
18883 vmk_flags);
18884 if (result != KERN_SUCCESS) {
18885 return result;
18886 }
18887 assert(copy_map != VM_MAP_COPY_NULL);
18888
18889 /*
18890 * Handle the policy for vm map ranges
18891 *
18892 * If the maps differ, the target_map policy applies like for vm_map()
18893 * For same mapping remaps, we preserve the range.
18894 */
18895 if (vmk_flags.vmkf_copy_same_map) {
18896 vmk_flags.vmkf_range_id = copy_map->orig_range;
18897 } else {
18898 vm_map_kernel_flags_update_range_id(&vmk_flags, target_map);
18899 }
18900
18901 overmap_start = 0;
18902 overmap_end = 0;
18903 trimmed_start = 0;
18904 target_size = size;
18905 if (src_page_mask != target_page_mask) {
18906 vm_map_copy_t target_copy_map;
18907
18908 target_copy_map = copy_map; /* can modify "copy_map" itself */
18909 DEBUG4K_ADJUST("adjusting...\n");
18910 result = vm_map_copy_adjust_to_target(
18911 copy_map,
18912 offset_in_mapping, /* offset */
18913 initial_size,
18914 target_map,
18915 copy,
18916 &target_copy_map,
18917 &overmap_start,
18918 &overmap_end,
18919 &trimmed_start);
18920 if (result != KERN_SUCCESS) {
18921 DEBUG4K_COPY("failed to adjust 0x%x\n", result);
18922 vm_map_copy_discard(copy_map);
18923 return result;
18924 }
18925 if (trimmed_start == 0) {
18926 /* nothing trimmed: no adjustment needed */
18927 } else if (trimmed_start >= offset_in_mapping) {
18928 /* trimmed more than offset_in_mapping: nothing left */
18929 assert(overmap_start == 0);
18930 assert(overmap_end == 0);
18931 offset_in_mapping = 0;
18932 } else {
18933 /* trimmed some of offset_in_mapping: adjust */
18934 assert(overmap_start == 0);
18935 assert(overmap_end == 0);
18936 offset_in_mapping -= trimmed_start;
18937 }
18938 offset_in_mapping += overmap_start;
18939 target_size = target_copy_map->size;
18940 }
18941
18942 /*
18943 * Allocate/check a range of free virtual address
18944 * space for the target
18945 */
18946 *address = vm_map_trunc_page(*address, target_page_mask);
18947 vm_map_lock(target_map);
18948 target_size = vm_map_round_page(target_size, target_page_mask);
18949 result = vm_map_remap_range_allocate(target_map, address,
18950 target_size, mask, vmk_flags,
18951 &insp_entry, &zap_list);
18952
18953 for (entry = vm_map_copy_first_entry(copy_map);
18954 entry != vm_map_copy_to_entry(copy_map);
18955 entry = new_entry) {
18956 new_entry = entry->vme_next;
18957 vm_map_copy_entry_unlink(copy_map, entry);
18958 if (result == KERN_SUCCESS) {
18959 if (vmk_flags.vmkf_remap_prot_copy) {
18960 /*
18961 * This vm_map_remap() is for a
18962 * vm_protect(VM_PROT_COPY), so the caller
18963 * expects to be allowed to add write access
18964 * to this new mapping. This is done by
18965 * adding VM_PROT_WRITE to each entry's
18966 * max_protection... unless some security
18967 * settings disallow it.
18968 */
18969 bool allow_write = false;
18970 if (entry->vme_permanent) {
18971 /* immutable mapping... */
18972 if ((entry->max_protection & VM_PROT_EXECUTE) &&
18973 developer_mode_state()) {
18974 /*
18975 * ... but executable and
18976 * possibly being debugged,
18977 * so let's allow it to become
18978 * writable, for breakpoints
18979 * and dtrace probes, for
18980 * example.
18981 */
18982 allow_write = true;
18983 } else {
18984 printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
18985 proc_selfpid(),
18986 (get_bsdtask_info(current_task())
18987 ? proc_name_address(get_bsdtask_info(current_task()))
18988 : "?"),
18989 (uint64_t)memory_address,
18990 (uint64_t)size,
18991 entry->protection,
18992 entry->max_protection,
18993 developer_mode_state());
18994 DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
18995 vm_map_entry_t, entry,
18996 vm_map_offset_t, entry->vme_start,
18997 vm_map_offset_t, entry->vme_end,
18998 vm_prot_t, entry->protection,
18999 vm_prot_t, entry->max_protection,
19000 int, VME_ALIAS(entry));
19001 }
19002 } else {
19003 allow_write = true;
19004 }
19005
19006 /*
19007 * VM_PROT_COPY: allow this mapping to become
19008 * writable, unless it was "permanent".
19009 */
19010 if (allow_write) {
19011 entry->max_protection |= VM_PROT_WRITE;
19012 }
19013 }
19014 if (vmk_flags.vmf_resilient_codesign) {
19015 /* no codesigning -> read-only access */
19016 entry->max_protection = VM_PROT_READ;
19017 entry->protection = VM_PROT_READ;
19018 entry->vme_resilient_codesign = TRUE;
19019 }
19020 entry->vme_start += *address;
19021 entry->vme_end += *address;
19022 assert(!entry->map_aligned);
19023 if (vmk_flags.vmf_resilient_media &&
19024 !entry->is_sub_map &&
19025 (VME_OBJECT(entry) == VM_OBJECT_NULL ||
19026 VME_OBJECT(entry)->internal)) {
19027 entry->vme_resilient_media = TRUE;
19028 }
19029 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
19030 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
19031 assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
19032 vm_map_store_entry_link(target_map, insp_entry, entry,
19033 vmk_flags);
19034 insp_entry = entry;
19035 } else {
19036 if (!entry->is_sub_map) {
19037 vm_object_deallocate(VME_OBJECT(entry));
19038 } else {
19039 vm_map_deallocate(VME_SUBMAP(entry));
19040 }
19041 vm_map_copy_entry_dispose(entry);
19042 }
19043 }
19044
19045 if (vmk_flags.vmf_resilient_codesign) {
19046 *cur_protection = VM_PROT_READ;
19047 *max_protection = VM_PROT_READ;
19048 }
19049
19050 if (result == KERN_SUCCESS) {
19051 target_map->size += target_size;
19052 SAVE_HINT_MAP_WRITE(target_map, insp_entry);
19053 }
19054 vm_map_unlock(target_map);
19055
19056 vm_map_zap_dispose(&zap_list);
19057
19058 if (result == KERN_SUCCESS && target_map->wiring_required) {
19059 result = vm_map_wire_kernel(target_map, *address,
19060 *address + size, *cur_protection, VM_KERN_MEMORY_MLOCK,
19061 TRUE);
19062 }
19063
19064 /*
19065 * If requested, return the address of the data pointed to by the
19066 * request, rather than the base of the resulting page.
19067 */
19068 if (vmk_flags.vmf_return_data_addr) {
19069 *address += offset_in_mapping;
19070 }
19071
19072 if (src_page_mask != target_page_mask) {
19073 DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)size, copy, target_map, (uint64_t)*address, (uint64_t)offset_in_mapping, result);
19074 }
19075 vm_map_copy_discard(copy_map);
19076 copy_map = VM_MAP_COPY_NULL;
19077
19078 return result;
19079 }
19080
19081 /*
19082 * Routine: vm_map_remap_range_allocate
19083 *
19084 * Description:
19085 * Allocate a range in the specified virtual address map.
19086 * returns the address and the map entry just before the allocated
19087 * range
19088 *
19089 * Map must be locked.
19090 */
19091
19092 static kern_return_t
vm_map_remap_range_allocate(vm_map_t map,vm_map_address_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_entry_t * map_entry,vm_map_zap_t zap_list)19093 vm_map_remap_range_allocate(
19094 vm_map_t map,
19095 vm_map_address_t *address, /* IN/OUT */
19096 vm_map_size_t size,
19097 vm_map_offset_t mask,
19098 vm_map_kernel_flags_t vmk_flags,
19099 vm_map_entry_t *map_entry, /* OUT */
19100 vm_map_zap_t zap_list)
19101 {
19102 vm_map_entry_t entry;
19103 vm_map_offset_t start;
19104 kern_return_t kr;
19105
19106 start = *address;
19107
19108 if (!vmk_flags.vmf_fixed) {
19109 kr = vm_map_locate_space(map, size, mask, vmk_flags,
19110 &start, &entry);
19111 if (kr != KERN_SUCCESS) {
19112 return kr;
19113 }
19114 *address = start;
19115 } else {
19116 vm_map_offset_t effective_min_offset, effective_max_offset;
19117 vm_map_entry_t temp_entry;
19118 vm_map_offset_t end;
19119
19120 effective_min_offset = map->min_offset;
19121 effective_max_offset = map->max_offset;
19122
19123 /*
19124 * Verify that:
19125 * the address doesn't itself violate
19126 * the mask requirement.
19127 */
19128
19129 if ((start & mask) != 0) {
19130 return KERN_NO_SPACE;
19131 }
19132
19133 #if CONFIG_MAP_RANGES
19134 if (map->uses_user_ranges) {
19135 struct mach_vm_range r;
19136
19137 vm_map_user_range_resolve(map, start, 1, &r);
19138 if (r.max_address == 0) {
19139 return KERN_INVALID_ADDRESS;
19140 }
19141
19142 effective_min_offset = r.min_address;
19143 effective_max_offset = r.max_address;
19144 }
19145 #endif /* CONFIG_MAP_RANGES */
19146 if (map == kernel_map) {
19147 mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
19148 effective_min_offset = r->min_address;
19149 effective_min_offset = r->max_address;
19150 }
19151
19152 /*
19153 * ... the address is within bounds
19154 */
19155
19156 end = start + size;
19157
19158 if ((start < effective_min_offset) ||
19159 (end > effective_max_offset) ||
19160 (start >= end)) {
19161 return KERN_INVALID_ADDRESS;
19162 }
19163
19164 /*
19165 * If we're asked to overwrite whatever was mapped in that
19166 * range, first deallocate that range.
19167 */
19168 if (vmk_flags.vmf_overwrite) {
19169 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
19170
19171 /*
19172 * We use a "zap_list" to avoid having to unlock
19173 * the "map" in vm_map_delete(), which would compromise
19174 * the atomicity of the "deallocate" and then "remap"
19175 * combination.
19176 */
19177 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
19178
19179 if (vmk_flags.vmkf_overwrite_immutable) {
19180 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
19181 }
19182 if (vmk_flags.vmkf_remap_prot_copy) {
19183 remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
19184 }
19185 kr = vm_map_delete(map, start, end, remove_flags,
19186 KMEM_GUARD_NONE, zap_list).kmr_return;
19187 if (kr != KERN_SUCCESS) {
19188 /* XXX FBDP restore zap_list? */
19189 return kr;
19190 }
19191 }
19192
19193 /*
19194 * ... the starting address isn't allocated
19195 */
19196
19197 if (vm_map_lookup_entry(map, start, &temp_entry)) {
19198 return KERN_NO_SPACE;
19199 }
19200
19201 entry = temp_entry;
19202
19203 /*
19204 * ... the next region doesn't overlap the
19205 * end point.
19206 */
19207
19208 if ((entry->vme_next != vm_map_to_entry(map)) &&
19209 (entry->vme_next->vme_start < end)) {
19210 return KERN_NO_SPACE;
19211 }
19212 }
19213 *map_entry = entry;
19214 return KERN_SUCCESS;
19215 }
19216
19217 /*
19218 * vm_map_switch:
19219 *
19220 * Set the address map for the current thread to the specified map
19221 */
19222
19223 vm_map_t
vm_map_switch(vm_map_t map)19224 vm_map_switch(
19225 vm_map_t map)
19226 {
19227 thread_t thread = current_thread();
19228 vm_map_t oldmap = thread->map;
19229
19230
19231 /*
19232 * Deactivate the current map and activate the requested map
19233 */
19234 mp_disable_preemption();
19235 PMAP_SWITCH_USER(thread, map, cpu_number());
19236 mp_enable_preemption();
19237 return oldmap;
19238 }
19239
19240
19241 /*
19242 * Routine: vm_map_write_user
19243 *
19244 * Description:
19245 * Copy out data from a kernel space into space in the
19246 * destination map. The space must already exist in the
19247 * destination map.
19248 * NOTE: This routine should only be called by threads
19249 * which can block on a page fault. i.e. kernel mode user
19250 * threads.
19251 *
19252 */
19253 kern_return_t
vm_map_write_user(vm_map_t map,void * src_p,vm_map_address_t dst_addr,vm_size_t size)19254 vm_map_write_user(
19255 vm_map_t map,
19256 void *src_p,
19257 vm_map_address_t dst_addr,
19258 vm_size_t size)
19259 {
19260 kern_return_t kr = KERN_SUCCESS;
19261
19262 if (__improbable(vm_map_range_overflows(map, dst_addr, size))) {
19263 return KERN_INVALID_ADDRESS;
19264 }
19265
19266 if (current_map() == map) {
19267 if (copyout(src_p, dst_addr, size)) {
19268 kr = KERN_INVALID_ADDRESS;
19269 }
19270 } else {
19271 vm_map_t oldmap;
19272
19273 /* take on the identity of the target map while doing */
19274 /* the transfer */
19275
19276 vm_map_reference(map);
19277 oldmap = vm_map_switch(map);
19278 if (copyout(src_p, dst_addr, size)) {
19279 kr = KERN_INVALID_ADDRESS;
19280 }
19281 vm_map_switch(oldmap);
19282 vm_map_deallocate(map);
19283 }
19284 return kr;
19285 }
19286
19287 /*
19288 * Routine: vm_map_read_user
19289 *
19290 * Description:
19291 * Copy in data from a user space source map into the
19292 * kernel map. The space must already exist in the
19293 * kernel map.
19294 * NOTE: This routine should only be called by threads
19295 * which can block on a page fault. i.e. kernel mode user
19296 * threads.
19297 *
19298 */
19299 kern_return_t
vm_map_read_user(vm_map_t map,vm_map_address_t src_addr,void * dst_p,vm_size_t size)19300 vm_map_read_user(
19301 vm_map_t map,
19302 vm_map_address_t src_addr,
19303 void *dst_p,
19304 vm_size_t size)
19305 {
19306 kern_return_t kr = KERN_SUCCESS;
19307
19308 if (__improbable(vm_map_range_overflows(map, src_addr, size))) {
19309 return KERN_INVALID_ADDRESS;
19310 }
19311
19312 if (current_map() == map) {
19313 if (copyin(src_addr, dst_p, size)) {
19314 kr = KERN_INVALID_ADDRESS;
19315 }
19316 } else {
19317 vm_map_t oldmap;
19318
19319 /* take on the identity of the target map while doing */
19320 /* the transfer */
19321
19322 vm_map_reference(map);
19323 oldmap = vm_map_switch(map);
19324 if (copyin(src_addr, dst_p, size)) {
19325 kr = KERN_INVALID_ADDRESS;
19326 }
19327 vm_map_switch(oldmap);
19328 vm_map_deallocate(map);
19329 }
19330 return kr;
19331 }
19332
19333
19334 /*
19335 * vm_map_check_protection:
19336 *
19337 * Assert that the target map allows the specified
19338 * privilege on the entire address region given.
19339 * The entire region must be allocated.
19340 */
19341 boolean_t
vm_map_check_protection(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t protection)19342 vm_map_check_protection(vm_map_t map, vm_map_offset_t start,
19343 vm_map_offset_t end, vm_prot_t protection)
19344 {
19345 vm_map_entry_t entry;
19346 vm_map_entry_t tmp_entry;
19347
19348 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
19349 return FALSE;
19350 }
19351
19352 vm_map_lock(map);
19353
19354 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
19355 vm_map_unlock(map);
19356 return FALSE;
19357 }
19358
19359 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19360 vm_map_unlock(map);
19361 return FALSE;
19362 }
19363
19364 entry = tmp_entry;
19365
19366 while (start < end) {
19367 if (entry == vm_map_to_entry(map)) {
19368 vm_map_unlock(map);
19369 return FALSE;
19370 }
19371
19372 /*
19373 * No holes allowed!
19374 */
19375
19376 if (start < entry->vme_start) {
19377 vm_map_unlock(map);
19378 return FALSE;
19379 }
19380
19381 /*
19382 * Check protection associated with entry.
19383 */
19384
19385 if ((entry->protection & protection) != protection) {
19386 vm_map_unlock(map);
19387 return FALSE;
19388 }
19389
19390 /* go to next entry */
19391
19392 start = entry->vme_end;
19393 entry = entry->vme_next;
19394 }
19395 vm_map_unlock(map);
19396 return TRUE;
19397 }
19398
19399 kern_return_t
/*
 * Routine: vm_map_purgable_control
 *
 * Get or set the purgeability state of the object mapped at "address"
 * in "map", or purge all volatile objects (VM_PURGABLE_PURGE_ALL).
 * "state" is IN for SET operations and OUT for GET.
 * Locking: takes the map read lock to find the entry, then the object
 * lock; the map lock is dropped before calling into the object layer.
 */
kern_return_t
vm_map_purgable_control(
	vm_map_t map,
	vm_map_offset_t address,
	vm_purgable_t control,
	int *state)
{
	vm_map_entry_t entry;
	vm_object_t object;
	kern_return_t kr;
	boolean_t was_nonvolatile;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlaying object. Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (control != VM_PURGABLE_SET_STATE &&
	    control != VM_PURGABLE_GET_STATE &&
	    control != VM_PURGABLE_PURGE_ALL &&
	    control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (control == VM_PURGABLE_PURGE_ALL) {
		/* global purge: no address/entry involved */
		vm_purgeable_object_purge_all();
		return KERN_SUCCESS;
	}

	/* for SET operations, the requested state bits must be valid */
	if ((control == VM_PURGABLE_SET_STATE ||
	    control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
	    (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
	    ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
		return KERN_INVALID_ARGUMENT;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->protection & VM_PROT_WRITE) == 0 &&
	    control != VM_PURGABLE_GET_STATE) {
		/*
		 * Can't apply purgable controls to something you can't write.
		 */
		vm_map_unlock_read(map);
		return KERN_PROTECTION_FAILURE;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL ||
	    object->purgable == VM_PURGABLE_DENY) {
		/*
		 * Object must already be present and be purgeable.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	vm_object_lock(object);

#if 00
	if (VME_OFFSET(entry) != 0 ||
	    entry->vme_end - entry->vme_start != object->vo_size) {
		/*
		 * Can only apply purgable controls to the whole (existing)
		 * object at once.
		 */
		vm_map_unlock_read(map);
		vm_object_unlock(object);
		return KERN_INVALID_ARGUMENT;
	}
#endif

	assert(!entry->is_sub_map);
	assert(!entry->use_pmap); /* purgeable has its own accounting */

	/* object is locked: safe to drop the map lock now */
	vm_map_unlock_read(map);

	was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);

	kr = vm_object_purgable_control(object, control, state);

	/* record the kernel as volatilizer for debug accounting */
	if (was_nonvolatile &&
	    object->purgable != VM_PURGABLE_NONVOLATILE &&
	    map->pmap == kernel_pmap) {
#if DEBUG
		object->vo_purgeable_volatilizer = kernel_task;
#endif /* DEBUG */
	}

	vm_object_unlock(object);

	return kr;
}
19503
/*
 * Compute the "footprint" disposition (VM_PAGE_QUERY_PAGE_* flags) for the
 * page at "curr_s_offset" inside "map_entry", combining the live pmap's view
 * of the page with the VM-level "alternate accounting" rules for purgeable
 * and IOKit-accounted objects.
 *
 * Caller must hold the map lock; "map" must not be a corpse carrying a saved
 * footprint (asserted below — corpses go through
 * vm_map_corpse_footprint_query_page_info() instead).
 *
 * The result is returned through "*disposition_p"; 0 means the page does not
 * contribute to this task's footprint.
 */
void
vm_map_footprint_query_page_info(
	vm_map_t        map,
	vm_map_entry_t  map_entry,
	vm_map_offset_t curr_s_offset,
	int             *disposition_p)
{
	int             pmap_disp;
	vm_object_t     object = VM_OBJECT_NULL;
	int             disposition;
	int             effective_page_size;

	vm_map_lock_assert_held(map);
	assert(!map->has_corpse_footprint);
	/* the queried offset must fall inside the supplied entry */
	assert(curr_s_offset >= map_entry->vme_start);
	assert(curr_s_offset < map_entry->vme_end);

	if (map_entry->is_sub_map) {
		if (!map_entry->use_pmap) {
			/* nested pmap: no footprint */
			*disposition_p = 0;
			return;
		}
	} else {
		object = VME_OBJECT(map_entry);
		if (object == VM_OBJECT_NULL) {
			/* nothing mapped here: no need to ask */
			*disposition_p = 0;
			return;
		}
	}

	/*
	 * Query granularity: the smaller of the kernel page size and this
	 * map's page size (relevant on mixed-page-size configurations).
	 */
	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

	pmap_disp = 0;

	/*
	 * Query the pmap.
	 */
	pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);

	/*
	 * Compute this page's disposition.
	 */
	disposition = 0;

	/* deal with "alternate accounting" first */
	if (!map_entry->is_sub_map &&
	    object->vo_no_footprint) {
		/* does not count in footprint */
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
	} else if (!map_entry->is_sub_map &&
	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
	    (object->purgable == VM_PURGABLE_DENY &&
	    object->vo_ledger_tag)) &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    (object->resident_page_count +
		    vm_compressor_pager_get_count(object->pager)))) {
			/*
			 * Non-volatile purgeable object owned
			 * by this task: report the first
			 * "#resident + #compressed" pages as
			 * "resident" (to show that they
			 * contribute to the footprint) but not
			 * "dirty" (to avoid double-counting
			 * with the fake "non-volatile" region
			 * we'll report at the end of the
			 * address space to account for all
			 * (mapped or not) non-volatile memory
			 * owned by this task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    (object->purgable == VM_PURGABLE_VOLATILE ||
	    object->purgable == VM_PURGABLE_EMPTY) &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    object->wired_page_count)) {
			/*
			 * Volatile|empty purgeable object owned
			 * by this task: report the first
			 * "#wired" pages as "resident" (to
			 * show that they contribute to the
			 * footprint) but not "dirty" (to avoid
			 * double-counting with the fake
			 * "non-volatile" region we'll report
			 * at the end of the address space to
			 * account for all (mapped or not)
			 * non-volatile memory owned by this
			 * task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    map_entry->iokit_acct &&
	    object->internal &&
	    object->purgable == VM_PURGABLE_DENY) {
		/*
		 * Non-purgeable IOKit memory: phys_footprint
		 * includes the entire virtual mapping.
		 */
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
	} else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
	    PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
		/* alternate accounting */
#if __arm64__ && (DEVELOPMENT || DEBUG)
		if (map->pmap->footprint_was_suspended) {
			/*
			 * The assertion below can fail if dyld
			 * suspended footprint accounting
			 * while doing some adjustments to
			 * this page; the mapping would say
			 * "use pmap accounting" but the page
			 * would be marked "alternate
			 * accounting".
			 */
		} else
#endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
		{
			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		}
		/* alternate accounting: page is counted elsewhere, report none */
		disposition = 0;
	} else {
		/* regular pmap accounting: translate the pmap's flags */
		if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
			disposition |= VM_PAGE_QUERY_PAGE_REF;
			if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
			} else {
				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
			}
			if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
				disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
			}
		} else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
			disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
		}
	}

	*disposition_p = disposition;
}
19661
19662 kern_return_t
vm_map_page_query_internal(vm_map_t target_map,vm_map_offset_t offset,int * disposition,int * ref_count)19663 vm_map_page_query_internal(
19664 vm_map_t target_map,
19665 vm_map_offset_t offset,
19666 int *disposition,
19667 int *ref_count)
19668 {
19669 kern_return_t kr;
19670 vm_page_info_basic_data_t info;
19671 mach_msg_type_number_t count;
19672
19673 count = VM_PAGE_INFO_BASIC_COUNT;
19674 kr = vm_map_page_info(target_map,
19675 offset,
19676 VM_PAGE_INFO_BASIC,
19677 (vm_page_info_t) &info,
19678 &count);
19679 if (kr == KERN_SUCCESS) {
19680 *disposition = info.disposition;
19681 *ref_count = info.ref_count;
19682 } else {
19683 *disposition = 0;
19684 *ref_count = 0;
19685 }
19686
19687 return kr;
19688 }
19689
19690 kern_return_t
vm_map_page_info(vm_map_t map,vm_map_offset_t offset,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)19691 vm_map_page_info(
19692 vm_map_t map,
19693 vm_map_offset_t offset,
19694 vm_page_info_flavor_t flavor,
19695 vm_page_info_t info,
19696 mach_msg_type_number_t *count)
19697 {
19698 return vm_map_page_range_info_internal(map,
19699 offset, /* start of range */
19700 (offset + 1), /* this will get rounded in the call to the page boundary */
19701 (int)-1, /* effective_page_shift: unspecified */
19702 flavor,
19703 info,
19704 count);
19705 }
19706
/*
 * Fill "info" with one vm_page_info record per (effective) page in the
 * range [start_offset, end_offset) of "map".
 *
 * flavor:               only VM_PAGE_INFO_BASIC is supported.
 * effective_page_shift: page granularity for the walk; -1 means "derive a
 *                       safe shift from the map / caller".
 * count:                in: capacity check (per-record size);
 *                       unchanged on output.
 *
 * Recurses into submaps, walks VM object shadow chains to locate each
 * page, and — when the calling task asked for "region footprint" info —
 * substitutes footprint dispositions, including the fake region beyond
 * the end of the address space that accounts for owned purgeable memory.
 *
 * Returns KERN_SUCCESS, or KERN_INVALID_ARGUMENT / KERN_INVALID_ADDRESS
 * for bad flavor/count/range.
 */
kern_return_t
vm_map_page_range_info_internal(
	vm_map_t                map,
	vm_map_offset_t         start_offset,
	vm_map_offset_t         end_offset,
	int                     effective_page_shift,
	vm_page_info_flavor_t   flavor,
	vm_page_info_t          info,
	mach_msg_type_number_t  *count)
{
	vm_map_entry_t          map_entry = VM_MAP_ENTRY_NULL;
	vm_object_t             object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
	vm_page_t               m = VM_PAGE_NULL;
	kern_return_t           retval = KERN_SUCCESS;
	int                     disposition = 0;
	int                     ref_count = 0;
	int                     depth = 0, info_idx = 0;
	vm_page_info_basic_t    basic_info = 0;
	vm_map_offset_t         offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
	vm_map_offset_t         start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
	boolean_t               do_region_footprint;
	ledger_amount_t         ledger_resident, ledger_compressed;
	int                     effective_page_size;
	vm_map_offset_t         effective_page_mask;

	switch (flavor) {
	case VM_PAGE_INFO_BASIC:
		if (*count != VM_PAGE_INFO_BASIC_COUNT) {
			/*
			 * The "vm_page_info_basic_data" structure was not
			 * properly padded, so allow the size to be off by
			 * one to maintain backwards binary compatibility...
			 */
			if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
				return KERN_INVALID_ARGUMENT;
			}
		}
		break;
	default:
		return KERN_INVALID_ARGUMENT;
	}

	if (effective_page_shift == -1) {
		/* caller did not specify a granularity: derive a safe one */
		effective_page_shift = vm_self_region_page_shift_safely(map);
		if (effective_page_shift == -1) {
			return KERN_INVALID_ARGUMENT;
		}
	}
	effective_page_size = (1 << effective_page_shift);
	effective_page_mask = effective_page_size - 1;

	do_region_footprint = task_self_region_footprint();
	disposition = 0;
	ref_count = 0;
	depth = 0;
	info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
	retval = KERN_SUCCESS;

	if (__improbable(vm_map_range_overflows(map, start_offset, end_offset - start_offset))) {
		return KERN_INVALID_ADDRESS;
	}

	/* remember the sub-page offset so the first record can report it */
	offset_in_page = start_offset & effective_page_mask;
	start = vm_map_trunc_page(start_offset, effective_page_mask);
	end = vm_map_round_page(end_offset, effective_page_mask);

	if (end < start) {
		return KERN_INVALID_ARGUMENT;
	}

	assert((end - start) <= MAX_PAGE_RANGE_QUERY);

	vm_map_lock_read(map);

	task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);

	for (curr_s_offset = start; curr_s_offset < end;) {
		/*
		 * New lookup needs reset of these variables.
		 */
		curr_object = object = VM_OBJECT_NULL;
		offset_in_object = 0;
		ref_count = 0;
		depth = 0;

		if (do_region_footprint &&
		    curr_s_offset >= vm_map_last_entry(map)->vme_end) {
			/*
			 * Request for "footprint" info about a page beyond
			 * the end of address space: this must be for
			 * the fake region vm_map_region_recurse_64()
			 * reported to account for non-volatile purgeable
			 * memory owned by this task.
			 */
			disposition = 0;

			if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
			    (unsigned) ledger_compressed) {
				/*
				 * We haven't reported all the "non-volatile
				 * compressed" pages yet, so report this fake
				 * page as "compressed".
				 */
				disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
			} else {
				/*
				 * We've reported all the non-volatile
				 * compressed page but not all the non-volatile
				 * pages , so report this fake page as
				 * "resident dirty".
				 */
				disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
				disposition |= VM_PAGE_QUERY_PAGE_REF;
			}
			switch (flavor) {
			case VM_PAGE_INFO_BASIC:
				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
				basic_info->disposition = disposition;
				basic_info->ref_count = 1;
				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				basic_info->offset = 0;
				basic_info->depth = 0;

				info_idx++;
				break;
			}
			curr_s_offset += effective_page_size;
			continue;
		}

		/*
		 * First, find the map entry covering "curr_s_offset", going down
		 * submaps if necessary.
		 */
		if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
			/* no entry -> no object -> no page */

			if (curr_s_offset < vm_map_min(map)) {
				/*
				 * Illegal address that falls below map min.
				 */
				curr_e_offset = MIN(end, vm_map_min(map));
			} else if (curr_s_offset >= vm_map_max(map)) {
				/*
				 * Illegal address that falls on/after map max.
				 */
				curr_e_offset = end;
			} else if (map_entry == vm_map_to_entry(map)) {
				/*
				 * Hit a hole.
				 */
				if (map_entry->vme_next == vm_map_to_entry(map)) {
					/*
					 * Empty map.
					 */
					curr_e_offset = MIN(map->max_offset, end);
				} else {
					/*
					 * Hole at start of the map.
					 */
					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
				}
			} else {
				if (map_entry->vme_next == vm_map_to_entry(map)) {
					/*
					 * Hole at the end of the map.
					 */
					curr_e_offset = MIN(map->max_offset, end);
				} else {
					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
				}
			}

			assert(curr_e_offset >= curr_s_offset);

			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;

			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));

			/* zero-fill all records covering the hole in one go */
			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));

			curr_s_offset = curr_e_offset;

			info_idx += num_pages;

			continue;
		}

		/* compute offset from this map entry's start */
		offset_in_object = curr_s_offset - map_entry->vme_start;

		/* compute offset into this map entry's object (or submap) */
		offset_in_object += VME_OFFSET(map_entry);

		if (map_entry->is_sub_map) {
			/* recurse into the submap for the overlapping sub-range */
			vm_map_t sub_map = VM_MAP_NULL;
			vm_page_info_t submap_info = 0;
			vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;

			range_len = MIN(map_entry->vme_end, end) - curr_s_offset;

			submap_s_offset = offset_in_object;
			submap_e_offset = submap_s_offset + range_len;

			sub_map = VME_SUBMAP(map_entry);

			/* keep the submap alive while we drop the parent's lock */
			vm_map_reference(sub_map);
			vm_map_unlock_read(map);

			submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));

			assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
			    "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));

			retval = vm_map_page_range_info_internal(sub_map,
			    submap_s_offset,
			    submap_e_offset,
			    effective_page_shift,
			    VM_PAGE_INFO_BASIC,
			    (vm_page_info_t) submap_info,
			    count);

			assert(retval == KERN_SUCCESS);

			vm_map_lock_read(map);
			vm_map_deallocate(sub_map);

			/* Move the "info" index by the number of pages we inspected.*/
			info_idx += range_len >> effective_page_shift;

			/* Move our current offset by the size of the range we inspected.*/
			curr_s_offset += range_len;

			continue;
		}

		object = VME_OBJECT(map_entry);

		if (object == VM_OBJECT_NULL) {
			/*
			 * We don't have an object here and, hence,
			 * no pages to inspect. We'll fill up the
			 * info structure appropriately.
			 */

			curr_e_offset = MIN(map_entry->vme_end, end);

			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;

			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));

			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));

			curr_s_offset = curr_e_offset;

			info_idx += num_pages;

			continue;
		}

		if (do_region_footprint) {
			/* footprint query: disposition comes from accounting, not the object */
			disposition = 0;
			if (map->has_corpse_footprint) {
				/*
				 * Query the page info data we saved
				 * while forking the corpse.
				 */
				vm_map_corpse_footprint_query_page_info(
					map,
					curr_s_offset,
					&disposition);
			} else {
				/*
				 * Query the live pmap for footprint info
				 * about this page.
				 */
				vm_map_footprint_query_page_info(
					map,
					map_entry,
					curr_s_offset,
					&disposition);
			}
			switch (flavor) {
			case VM_PAGE_INFO_BASIC:
				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
				basic_info->disposition = disposition;
				basic_info->ref_count = 1;
				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				basic_info->offset = 0;
				basic_info->depth = 0;

				info_idx++;
				break;
			}
			curr_s_offset += effective_page_size;
			continue;
		}

		vm_object_reference(object);
		/*
		 * Shared mode -- so we can allow other readers
		 * to grab the lock too.
		 */
		vm_object_lock_shared(object);

		curr_e_offset = MIN(map_entry->vme_end, end);

		vm_map_unlock_read(map);

		map_entry = NULL; /* map is unlocked, the entry is no longer valid. */

		curr_object = object;

		/* walk every page of this entry's sub-range */
		for (; curr_s_offset < curr_e_offset;) {
			if (object == curr_object) {
				ref_count = curr_object->ref_count - 1; /* account for our object reference above. */
			} else {
				ref_count = curr_object->ref_count;
			}

			curr_offset_in_object = offset_in_object;

			/* descend the shadow chain until the page (or backing data) is found */
			for (;;) {
				m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));

				if (m != VM_PAGE_NULL) {
					disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
					break;
				} else {
					if (curr_object->internal &&
					    curr_object->alive &&
					    !curr_object->terminating &&
					    curr_object->pager_ready) {
						if (VM_COMPRESSOR_PAGER_STATE_GET(curr_object, vm_object_trunc_page(curr_offset_in_object))
						    == VM_EXTERNAL_STATE_EXISTS) {
							/* the pager has that page */
							disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
							break;
						}
					}

					/*
					 * Go down the VM object shadow chain until we find the page
					 * we're looking for.
					 */

					if (curr_object->shadow != VM_OBJECT_NULL) {
						vm_object_t shadow = VM_OBJECT_NULL;

						curr_offset_in_object += curr_object->vo_shadow_offset;
						shadow = curr_object->shadow;

						/* lock the shadow before dropping the current object's lock */
						vm_object_lock_shared(shadow);
						vm_object_unlock(curr_object);

						curr_object = shadow;
						depth++;
						continue;
					} else {
						break;
					}
				}
			}

			/* The ref_count is not strictly accurate, it measures the number */
			/* of entities holding a ref on the object, they may not be mapping */
			/* the object or may not be mapping the section holding the */
			/* target page but its still a ball park number and though an over- */
			/* count, it picks up the copy-on-write cases */

			/* We could also get a picture of page sharing from pmap_attributes */
			/* but this would under count as only faulted-in mappings would */
			/* show up. */

			if ((curr_object == object) && curr_object->shadow) {
				disposition |= VM_PAGE_QUERY_PAGE_COPIED;
			}

			if (!curr_object->internal) {
				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
			}

			if (m != VM_PAGE_NULL) {
				if (m->vmp_fictitious) {
					disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
				} else {
					if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
						disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
					}

					if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
						disposition |= VM_PAGE_QUERY_PAGE_REF;
					}

					if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
						disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
					}

					/*
					 * XXX TODO4K:
					 * when this routine deals with 4k
					 * pages, check the appropriate CS bit
					 * here.
					 */
					if (m->vmp_cs_validated) {
						disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
					}
					if (m->vmp_cs_tainted) {
						disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
					}
					if (m->vmp_cs_nx) {
						disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
					}
					if (m->vmp_reusable || curr_object->all_reusable) {
						disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
					}
				}
			}

			switch (flavor) {
			case VM_PAGE_INFO_BASIC:
				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
				basic_info->disposition = disposition;
				basic_info->ref_count = ref_count;
				basic_info->object_id = (vm_object_id_t) (uintptr_t)
				    VM_KERNEL_ADDRPERM(curr_object);
				basic_info->offset =
				    (memory_object_offset_t) curr_offset_in_object + offset_in_page;
				basic_info->depth = depth;

				info_idx++;
				break;
			}

			disposition = 0;
			offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.

			/*
			 * Move to next offset in the range and in our object.
			 */
			curr_s_offset += effective_page_size;
			offset_in_object += effective_page_size;
			curr_offset_in_object = offset_in_object;

			if (curr_object != object) {
				/* we ended the walk on a shadow: pop back to the top object */
				vm_object_unlock(curr_object);

				curr_object = object;

				vm_object_lock_shared(curr_object);
			} else {
				/* yield the shared lock to avoid starving writers */
				vm_object_lock_yield_shared(curr_object);
			}
		}

		vm_object_unlock(curr_object);
		vm_object_deallocate(curr_object);

		/* re-acquire the map lock for the next entry lookup */
		vm_map_lock_read(map);
	}

	vm_map_unlock_read(map);
	return retval;
}
20172
20173 /*
20174 * vm_map_msync
20175 *
20176 * Synchronises the memory range specified with its backing store
20177 * image by either flushing or cleaning the contents to the appropriate
20178 * memory manager engaging in a memory object synchronize dialog with
20179 * the manager. The client doesn't return until the manager issues
20180 * m_o_s_completed message. MIG Magically converts user task parameter
20181 * to the task's address map.
20182 *
20183 * interpretation of sync_flags
20184 * VM_SYNC_INVALIDATE - discard pages, only return precious
20185 * pages to manager.
20186 *
20187 * VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
20188 * - discard pages, write dirty or precious
20189 * pages back to memory manager.
20190 *
20191 * VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
20192 * - write dirty or precious pages back to
20193 * the memory manager.
20194 *
20195 * VM_SYNC_CONTIGUOUS - does everything normally, but if there
20196 * is a hole in the region, and we would
20197 * have returned KERN_SUCCESS, return
20198 * KERN_INVALID_ADDRESS instead.
20199 *
20200 * NOTE
20201 * The memory object attributes have not yet been implemented, this
20202 * function will have to deal with the invalidate attribute
20203 *
20204 * RETURNS
20205 * KERN_INVALID_TASK Bad task parameter
20206 * KERN_INVALID_ARGUMENT both sync and async were specified.
20207 * KERN_SUCCESS The usual.
20208 * KERN_INVALID_ADDRESS There was a hole in the region.
20209 */
20210
20211 kern_return_t
vm_map_msync(vm_map_t map,vm_map_address_t address,vm_map_size_t size,vm_sync_t sync_flags)20212 vm_map_msync(
20213 vm_map_t map,
20214 vm_map_address_t address,
20215 vm_map_size_t size,
20216 vm_sync_t sync_flags)
20217 {
20218 vm_map_entry_t entry;
20219 vm_map_size_t amount_left;
20220 vm_object_offset_t offset;
20221 vm_object_offset_t start_offset, end_offset;
20222 boolean_t do_sync_req;
20223 boolean_t had_hole = FALSE;
20224 vm_map_offset_t pmap_offset;
20225
20226 if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
20227 (sync_flags & VM_SYNC_SYNCHRONOUS)) {
20228 return KERN_INVALID_ARGUMENT;
20229 }
20230
20231 if (__improbable(vm_map_range_overflows(map, address, size))) {
20232 return KERN_INVALID_ADDRESS;
20233 }
20234
20235 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20236 DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
20237 }
20238
20239 /*
20240 * align address and size on page boundaries
20241 */
20242 size = (vm_map_round_page(address + size,
20243 VM_MAP_PAGE_MASK(map)) -
20244 vm_map_trunc_page(address,
20245 VM_MAP_PAGE_MASK(map)));
20246 address = vm_map_trunc_page(address,
20247 VM_MAP_PAGE_MASK(map));
20248
20249 if (map == VM_MAP_NULL) {
20250 return KERN_INVALID_TASK;
20251 }
20252
20253 if (size == 0) {
20254 return KERN_SUCCESS;
20255 }
20256
20257 amount_left = size;
20258
20259 while (amount_left > 0) {
20260 vm_object_size_t flush_size;
20261 vm_object_t object;
20262
20263 vm_map_lock(map);
20264 if (!vm_map_lookup_entry(map,
20265 address,
20266 &entry)) {
20267 vm_map_size_t skip;
20268
20269 /*
20270 * hole in the address map.
20271 */
20272 had_hole = TRUE;
20273
20274 if (sync_flags & VM_SYNC_KILLPAGES) {
20275 /*
20276 * For VM_SYNC_KILLPAGES, there should be
20277 * no holes in the range, since we couldn't
20278 * prevent someone else from allocating in
20279 * that hole and we wouldn't want to "kill"
20280 * their pages.
20281 */
20282 vm_map_unlock(map);
20283 break;
20284 }
20285
20286 /*
20287 * Check for empty map.
20288 */
20289 if (entry == vm_map_to_entry(map) &&
20290 entry->vme_next == entry) {
20291 vm_map_unlock(map);
20292 break;
20293 }
20294 /*
20295 * Check that we don't wrap and that
20296 * we have at least one real map entry.
20297 */
20298 if ((map->hdr.nentries == 0) ||
20299 (entry->vme_next->vme_start < address)) {
20300 vm_map_unlock(map);
20301 break;
20302 }
20303 /*
20304 * Move up to the next entry if needed
20305 */
20306 skip = (entry->vme_next->vme_start - address);
20307 if (skip >= amount_left) {
20308 amount_left = 0;
20309 } else {
20310 amount_left -= skip;
20311 }
20312 address = entry->vme_next->vme_start;
20313 vm_map_unlock(map);
20314 continue;
20315 }
20316
20317 offset = address - entry->vme_start;
20318 pmap_offset = address;
20319
20320 /*
20321 * do we have more to flush than is contained in this
20322 * entry ?
20323 */
20324 if (amount_left + entry->vme_start + offset > entry->vme_end) {
20325 flush_size = entry->vme_end -
20326 (entry->vme_start + offset);
20327 } else {
20328 flush_size = amount_left;
20329 }
20330 amount_left -= flush_size;
20331 address += flush_size;
20332
20333 if (entry->is_sub_map == TRUE) {
20334 vm_map_t local_map;
20335 vm_map_offset_t local_offset;
20336
20337 local_map = VME_SUBMAP(entry);
20338 local_offset = VME_OFFSET(entry);
20339 vm_map_reference(local_map);
20340 vm_map_unlock(map);
20341 if (vm_map_msync(
20342 local_map,
20343 local_offset,
20344 flush_size,
20345 sync_flags) == KERN_INVALID_ADDRESS) {
20346 had_hole = TRUE;
20347 }
20348 vm_map_deallocate(local_map);
20349 continue;
20350 }
20351 object = VME_OBJECT(entry);
20352
20353 /*
20354 * We can't sync this object if the object has not been
20355 * created yet
20356 */
20357 if (object == VM_OBJECT_NULL) {
20358 vm_map_unlock(map);
20359 continue;
20360 }
20361 offset += VME_OFFSET(entry);
20362
20363 vm_object_lock(object);
20364
20365 if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
20366 int kill_pages = 0;
20367
20368 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20369 /*
20370 * This is a destructive operation and so we
20371 * err on the side of limiting the range of
20372 * the operation.
20373 */
20374 start_offset = vm_object_round_page(offset);
20375 end_offset = vm_object_trunc_page(offset + flush_size);
20376
20377 if (end_offset <= start_offset) {
20378 vm_object_unlock(object);
20379 vm_map_unlock(map);
20380 continue;
20381 }
20382
20383 pmap_offset += start_offset - offset;
20384 } else {
20385 start_offset = offset;
20386 end_offset = offset + flush_size;
20387 }
20388
20389 if (sync_flags & VM_SYNC_KILLPAGES) {
20390 if (((object->ref_count == 1) ||
20391 ((object->copy_strategy !=
20392 MEMORY_OBJECT_COPY_SYMMETRIC) &&
20393 (object->vo_copy == VM_OBJECT_NULL))) &&
20394 (object->shadow == VM_OBJECT_NULL)) {
20395 if (object->ref_count != 1) {
20396 vm_page_stats_reusable.free_shared++;
20397 }
20398 kill_pages = 1;
20399 } else {
20400 kill_pages = -1;
20401 }
20402 }
20403 if (kill_pages != -1) {
20404 vm_object_deactivate_pages(
20405 object,
20406 start_offset,
20407 (vm_object_size_t) (end_offset - start_offset),
20408 kill_pages,
20409 FALSE, /* reusable_pages */
20410 FALSE, /* reusable_no_write */
20411 map->pmap,
20412 pmap_offset);
20413 }
20414 vm_object_unlock(object);
20415 vm_map_unlock(map);
20416 continue;
20417 }
20418 /*
20419 * We can't sync this object if there isn't a pager.
20420 * Don't bother to sync internal objects, since there can't
20421 * be any "permanent" storage for these objects anyway.
20422 */
20423 if ((object->pager == MEMORY_OBJECT_NULL) ||
20424 (object->internal) || (object->private)) {
20425 vm_object_unlock(object);
20426 vm_map_unlock(map);
20427 continue;
20428 }
20429 /*
20430 * keep reference on the object until syncing is done
20431 */
20432 vm_object_reference_locked(object);
20433 vm_object_unlock(object);
20434
20435 vm_map_unlock(map);
20436
20437 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20438 start_offset = vm_object_trunc_page(offset);
20439 end_offset = vm_object_round_page(offset + flush_size);
20440 } else {
20441 start_offset = offset;
20442 end_offset = offset + flush_size;
20443 }
20444
20445 do_sync_req = vm_object_sync(object,
20446 start_offset,
20447 (end_offset - start_offset),
20448 sync_flags & VM_SYNC_INVALIDATE,
20449 ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
20450 (sync_flags & VM_SYNC_ASYNCHRONOUS)),
20451 sync_flags & VM_SYNC_SYNCHRONOUS);
20452
20453 if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
20454 /*
20455 * clear out the clustering and read-ahead hints
20456 */
20457 vm_object_lock(object);
20458
20459 object->pages_created = 0;
20460 object->pages_used = 0;
20461 object->sequential = 0;
20462 object->last_alloc = 0;
20463
20464 vm_object_unlock(object);
20465 }
20466 vm_object_deallocate(object);
20467 } /* while */
20468
20469 /* for proper msync() behaviour */
20470 if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
20471 return KERN_INVALID_ADDRESS;
20472 }
20473
20474 return KERN_SUCCESS;
20475 }/* vm_msync */
20476
20477 void
vm_named_entry_associate_vm_object(vm_named_entry_t named_entry,vm_object_t object,vm_object_offset_t offset,vm_object_size_t size,vm_prot_t prot)20478 vm_named_entry_associate_vm_object(
20479 vm_named_entry_t named_entry,
20480 vm_object_t object,
20481 vm_object_offset_t offset,
20482 vm_object_size_t size,
20483 vm_prot_t prot)
20484 {
20485 vm_map_copy_t copy;
20486 vm_map_entry_t copy_entry;
20487
20488 assert(!named_entry->is_sub_map);
20489 assert(!named_entry->is_copy);
20490 assert(!named_entry->is_object);
20491 assert(!named_entry->internal);
20492 assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
20493
20494 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
20495 copy->offset = offset;
20496 copy->size = size;
20497 copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
20498
20499 copy_entry = vm_map_copy_entry_create(copy);
20500 copy_entry->protection = prot;
20501 copy_entry->max_protection = prot;
20502 copy_entry->use_pmap = TRUE;
20503 copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
20504 copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
20505 VME_OBJECT_SET(copy_entry, object, false, 0);
20506 VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
20507 vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
20508
20509 named_entry->backing.copy = copy;
20510 named_entry->is_object = TRUE;
20511 if (object->internal) {
20512 named_entry->internal = TRUE;
20513 }
20514
20515 DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
20516 named_entry, copy, object, offset, size, prot);
20517 }
20518
20519 vm_object_t
vm_named_entry_to_vm_object(vm_named_entry_t named_entry)20520 vm_named_entry_to_vm_object(
20521 vm_named_entry_t named_entry)
20522 {
20523 vm_map_copy_t copy;
20524 vm_map_entry_t copy_entry;
20525 vm_object_t object;
20526
20527 assert(!named_entry->is_sub_map);
20528 assert(!named_entry->is_copy);
20529 assert(named_entry->is_object);
20530 copy = named_entry->backing.copy;
20531 assert(copy != VM_MAP_COPY_NULL);
20532 /*
20533 * Assert that the vm_map_copy is coming from the right
20534 * zone and hasn't been forged
20535 */
20536 vm_map_copy_require(copy);
20537 assert(copy->cpy_hdr.nentries == 1);
20538 copy_entry = vm_map_copy_first_entry(copy);
20539 object = VME_OBJECT(copy_entry);
20540
20541 DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
20542
20543 return object;
20544 }
20545
20546 /*
20547 * Routine: convert_port_entry_to_map
20548 * Purpose:
20549 * Convert from a port specifying an entry or a task
20550 * to a map. Doesn't consume the port ref; produces a map ref,
20551 * which may be null. Unlike convert_port_to_map, the
20552 * port may be task or a named entry backed.
20553 * Conditions:
20554 * Nothing locked.
20555 */
20556
20557 vm_map_t
convert_port_entry_to_map(ipc_port_t port)20558 convert_port_entry_to_map(
20559 ipc_port_t port)
20560 {
20561 vm_map_t map = VM_MAP_NULL;
20562 vm_named_entry_t named_entry;
20563
20564 if (!IP_VALID(port)) {
20565 return VM_MAP_NULL;
20566 }
20567
20568 if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
20569 return convert_port_to_map(port);
20570 }
20571
20572 named_entry = mach_memory_entry_from_port(port);
20573
20574 if ((named_entry->is_sub_map) &&
20575 (named_entry->protection & VM_PROT_WRITE)) {
20576 map = named_entry->backing.map;
20577 if (map->pmap != PMAP_NULL) {
20578 if (map->pmap == kernel_pmap) {
20579 panic("userspace has access "
20580 "to a kernel map %p", map);
20581 }
20582 pmap_require(map->pmap);
20583 }
20584 vm_map_reference(map);
20585 }
20586
20587 return map;
20588 }
20589
20590 /*
20591 * Export routines to other components for the things we access locally through
20592 * macros.
20593 */
#undef current_map
/* Out-of-line version of the current_map_fast() macro, exported for
 * components that cannot use the macro directly. */
vm_map_t
current_map(void)
{
	return current_map_fast();
}
20600
20601 /*
20602 * vm_map_reference:
20603 *
20604 * Takes a reference on the specified map.
20605 */
20606 void
vm_map_reference(vm_map_t map)20607 vm_map_reference(
20608 vm_map_t map)
20609 {
20610 if (__probable(map != VM_MAP_NULL)) {
20611 vm_map_require(map);
20612 os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
20613 }
20614 }
20615
20616 /*
20617 * vm_map_deallocate:
20618 *
20619 * Removes a reference from the specified map,
20620 * destroying it if no references remain.
20621 * The map should not be locked.
20622 */
20623 void
vm_map_deallocate(vm_map_t map)20624 vm_map_deallocate(
20625 vm_map_t map)
20626 {
20627 if (__probable(map != VM_MAP_NULL)) {
20628 vm_map_require(map);
20629 if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
20630 vm_map_destroy(map);
20631 }
20632 }
20633 }
20634
/* Release a reference held through an inspect-only map handle. */
void
vm_map_inspect_deallocate(
	vm_map_inspect_t map)
{
	vm_map_deallocate((vm_map_t)map);
}
20641
/* Release a reference held through a read-only map handle. */
void
vm_map_read_deallocate(
	vm_map_read_t map)
{
	vm_map_deallocate((vm_map_t)map);
}
20648
20649
20650 void
vm_map_disable_NX(vm_map_t map)20651 vm_map_disable_NX(vm_map_t map)
20652 {
20653 if (map == NULL) {
20654 return;
20655 }
20656 if (map->pmap == NULL) {
20657 return;
20658 }
20659
20660 pmap_disable_NX(map->pmap);
20661 }
20662
20663 void
vm_map_disallow_data_exec(vm_map_t map)20664 vm_map_disallow_data_exec(vm_map_t map)
20665 {
20666 if (map == NULL) {
20667 return;
20668 }
20669
20670 map->map_disallow_data_exec = TRUE;
20671 }
20672
20673 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
20674 * more descriptive.
20675 */
/* Cap the map's addressable range to the 32-bit user VA maximum. */
void
vm_map_set_32bit(vm_map_t map)
{
#if defined(__arm64__)
	map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
20685
20686
/* Cap the map's addressable range to the 64-bit user VA maximum. */
void
vm_map_set_64bit(vm_map_t map)
{
#if defined(__arm64__)
	map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
#endif
}
20696
20697 /*
20698 * Expand the maximum size of an existing map to the maximum supported.
20699 */
/*
 * Expand the maximum size of an existing map to the maximum supported.
 * Only meaningful on embedded arm64; a no-op elsewhere.
 */
void
vm_map_set_jumbo(vm_map_t map)
{
#if defined (__arm64__) && !XNU_TARGET_OS_OSX
	/* ~0 is clamped to the pmap's jumbo maximum by vm_map_set_max_addr() */
	vm_map_set_max_addr(map, ~0);
#else /* arm64 */
	(void) map;
#endif
}
20709
20710 /*
20711 * This map has a JIT entitlement
20712 */
/*
 * This map has a JIT entitlement
 */
void
vm_map_set_jit_entitled(vm_map_t map)
{
#if defined (__arm64__)
	pmap_set_jit_entitled(map->pmap);
#else /* arm64 */
	(void) map;
#endif
}
20722
20723 /*
20724 * Get status of this maps TPRO flag
20725 */
/*
 * Get status of this map's TPRO flag.
 * Always FALSE on targets without TPRO hardware support.
 */
boolean_t
vm_map_tpro(vm_map_t map)
{
#if defined (__arm64e__)
	return pmap_get_tpro(map->pmap);
#else /* arm64e */
	(void) map;
	return FALSE;
#endif
}
20736
20737 /*
20738 * This map has TPRO enabled
20739 */
/*
 * Enable TPRO on this map's pmap.
 * No-op on targets without TPRO hardware support.
 */
void
vm_map_set_tpro(vm_map_t map)
{
#if defined (__arm64e__)
	pmap_set_tpro(map->pmap);
#else /* arm64e */
	(void) map;
#endif
}
20749
20750 /*
20751 * Does this map have TPRO enforcement enabled
20752 */
/*
 * Does this map have TPRO enforcement enabled
 */
boolean_t
vm_map_tpro_enforcement(vm_map_t map)
{
	return map->tpro_enforcement;
}
20758
20759 /*
20760 * Set TPRO enforcement for this map
20761 */
/*
 * Set TPRO enforcement for this map.
 * Only takes effect if TPRO is actually enabled on the map's pmap.
 */
void
vm_map_set_tpro_enforcement(vm_map_t map)
{
	if (vm_map_tpro(map)) {
		vm_map_lock(map);
		map->tpro_enforcement = TRUE;
		vm_map_unlock(map);
	}
}
20771
20772 /*
20773 * Enable TPRO on the requested region
20774 *
20775 * Note:
20776 * This routine is primarily intended to be called during/soon after map
20777 * creation before the associated task has been released to run. It is only
20778 * currently safe when we have no resident pages.
20779 */
/*
 * Enable TPRO on the requested region
 *
 * Note:
 * This routine is primarily intended to be called during/soon after map
 * creation before the associated task has been released to run. It is only
 * currently safe when we have no resident pages.
 *
 * Currently a stub that always reports success; the range arguments
 * are unused.
 */
boolean_t
vm_map_set_tpro_range(
	__unused vm_map_t map,
	__unused vm_map_address_t start,
	__unused vm_map_address_t end)
{
	return TRUE;
}
20788
20789 /*
20790 * Expand the maximum size of an existing map.
20791 */
/*
 * Expand the maximum size of an existing map.
 *
 * Grows map->max_offset up to "new_max_offset" (truncated to a page
 * boundary and clamped to the pmap's jumbo maximum).  Shrinking is not
 * supported and is silently ignored.  If the map uses a hole list, the
 * newly exposed address range is added to it.  arm64 only; a no-op on
 * other architectures.
 */
void
vm_map_set_max_addr(vm_map_t map, vm_map_offset_t new_max_offset)
{
#if defined(__arm64__)
	vm_map_offset_t max_supported_offset;
	vm_map_offset_t old_max_offset;

	vm_map_lock(map);

	old_max_offset = map->max_offset;
	max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), ARM_PMAP_MAX_OFFSET_JUMBO);

	new_max_offset = trunc_page(new_max_offset);

	/* The address space cannot be shrunk using this routine. */
	if (old_max_offset >= new_max_offset) {
		vm_map_unlock(map);
		return;
	}

	/* clamp to what the pmap can actually support */
	if (max_supported_offset < new_max_offset) {
		new_max_offset = max_supported_offset;
	}

	map->max_offset = new_max_offset;

	if (map->holelistenabled) {
		if (map->holes_list->prev->vme_end == old_max_offset) {
			/*
			 * There is already a hole at the end of the map; simply make it bigger.
			 */
			map->holes_list->prev->vme_end = map->max_offset;
		} else {
			/*
			 * There is no hole at the end, so we need to create a new hole
			 * for the new empty space we're creating.
			 */
			struct vm_map_links *new_hole;

			new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
			new_hole->start = old_max_offset;
			new_hole->end = map->max_offset;
			/* link the new hole at the tail of the circular hole list */
			new_hole->prev = map->holes_list->prev;
			new_hole->next = (struct vm_map_entry *)map->holes_list;
			map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
			map->holes_list->prev = (struct vm_map_entry *)new_hole;
		}
	}

	vm_map_unlock(map);
#else
	(void)map;
	(void)new_max_offset;
#endif
}
20847
/* Return the default maximum user VA for a 32- or 64-bit address space. */
vm_map_offset_t
vm_compute_max_offset(boolean_t is64)
{
#if defined(__arm64__)
	return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
20857
/*
 * Report how ASLR slide entropy is partitioned for this platform:
 * the number of independent sections and the size of each section.
 */
void
vm_map_get_max_aslr_slide_section(
	vm_map_t map __unused,
	int64_t *max_sections,
	int64_t *section_size)
{
#if defined(__arm64__)
	*max_sections = 3;
	*section_size = ARM_TT_TWIG_SIZE;
#else
	*max_sections = 1;
	*section_size = 0;
#endif
}
20872
/* Maximum number of pages the main ASLR slide may span for this map. */
uint64_t
vm_map_get_max_aslr_slide_pages(vm_map_t map)
{
#if defined(__arm64__)
	/* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
	 * limited embedded address space; this is also meant to minimize pmap
	 * memory usage on 16KB page systems.
	 */
	return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
#else
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
20886
/* Maximum number of pages the dynamic-loader ASLR slide may span. */
uint64_t
vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
{
#if defined(__arm64__)
	/* We limit the loader slide to 4MB, in order to ensure at least 8 bits
	 * of independent entropy on 16KB page systems.
	 */
	return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
#else
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
20899
/* A map is considered 64-bit when its maximum offset exceeds the
 * 32-bit user VA limit. */
boolean_t
vm_map_is_64bit(
	vm_map_t map)
{
	return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
}
20906
/*
 * Return TRUE if the map reserves at least "pagezero_size" bytes of
 * inaccessible address space at the bottom (a "hard" page zero).
 */
boolean_t
vm_map_has_hard_pagezero(
	vm_map_t map,
	vm_map_offset_t pagezero_size)
{
	/*
	 * XXX FBDP
	 * We should lock the VM map (for read) here but we can get away
	 * with it for now because there can't really be any race condition:
	 * the VM map's min_offset is changed only when the VM map is created
	 * and when the zero page is established (when the binary gets loaded),
	 * and this routine gets called only when the task terminates and the
	 * VM map is being torn down, and when a new map is created via
	 * load_machfile()/execve().
	 */
	return map->min_offset >= pagezero_size;
}
20924
20925 /*
20926 * Raise a VM map's maximun offset.
20927 */
20928 kern_return_t
vm_map_raise_max_offset(vm_map_t map,vm_map_offset_t new_max_offset)20929 vm_map_raise_max_offset(
20930 vm_map_t map,
20931 vm_map_offset_t new_max_offset)
20932 {
20933 kern_return_t ret;
20934
20935 vm_map_lock(map);
20936 ret = KERN_INVALID_ADDRESS;
20937
20938 if (new_max_offset >= map->max_offset) {
20939 if (!vm_map_is_64bit(map)) {
20940 if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
20941 map->max_offset = new_max_offset;
20942 ret = KERN_SUCCESS;
20943 }
20944 } else {
20945 if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
20946 map->max_offset = new_max_offset;
20947 ret = KERN_SUCCESS;
20948 }
20949 }
20950 }
20951
20952 vm_map_unlock(map);
20953 return ret;
20954 }
20955
20956
20957 /*
20958 * Raise a VM map's minimum offset.
20959 * To strictly enforce "page zero" reservation.
20960 */
kern_return_t
vm_map_raise_min_offset(
	vm_map_t map,
	vm_map_offset_t new_min_offset)
{
	vm_map_entry_t first_entry;

	/* round up so the new floor sits on a map-page boundary */
	new_min_offset = vm_map_round_page(new_min_offset,
	    VM_MAP_PAGE_MASK(map));

	vm_map_lock(map);

	if (new_min_offset < map->min_offset) {
		/*
		 * Can't move min_offset backwards, as that would expose
		 * a part of the address space that was previously, and for
		 * possibly good reasons, inaccessible.
		 */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}
	if (new_min_offset >= map->max_offset) {
		/* can't go beyond the end of the address space */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	first_entry = vm_map_first_entry(map);
	if (first_entry != vm_map_to_entry(map) &&
	    first_entry->vme_start < new_min_offset) {
		/*
		 * Some memory was already allocated below the new
		 * minimum offset. It's too late to change it now...
		 */
		vm_map_unlock(map);
		return KERN_NO_SPACE;
	}

	map->min_offset = new_min_offset;

	/* keep the leading hole consistent with the new floor */
	if (map->holelistenabled) {
		assert(map->holes_list);
		map->holes_list->start = new_min_offset;
		assert(new_min_offset < map->holes_list->end);
	}

	vm_map_unlock(map);

	return KERN_SUCCESS;
}
21011
21012 /*
21013 * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
21014 * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
21015 * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
21016 * have to reach over to the BSD data structures.
21017 */
21018
21019 uint64_t vm_map_set_size_limit_count = 0;
21020 kern_return_t
vm_map_set_size_limit(vm_map_t map,uint64_t new_size_limit)21021 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
21022 {
21023 kern_return_t kr;
21024
21025 vm_map_lock(map);
21026 if (new_size_limit < map->size) {
21027 /* new limit should not be lower than its current size */
21028 DTRACE_VM2(vm_map_set_size_limit_fail,
21029 vm_map_size_t, map->size,
21030 uint64_t, new_size_limit);
21031 kr = KERN_FAILURE;
21032 } else if (new_size_limit == map->size_limit) {
21033 /* no change */
21034 kr = KERN_SUCCESS;
21035 } else {
21036 /* set new limit */
21037 DTRACE_VM2(vm_map_set_size_limit,
21038 vm_map_size_t, map->size,
21039 uint64_t, new_size_limit);
21040 if (new_size_limit != RLIM_INFINITY) {
21041 vm_map_set_size_limit_count++;
21042 }
21043 map->size_limit = new_size_limit;
21044 kr = KERN_SUCCESS;
21045 }
21046 vm_map_unlock(map);
21047 return kr;
21048 }
21049
21050 uint64_t vm_map_set_data_limit_count = 0;
21051 kern_return_t
vm_map_set_data_limit(vm_map_t map,uint64_t new_data_limit)21052 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
21053 {
21054 kern_return_t kr;
21055
21056 vm_map_lock(map);
21057 if (new_data_limit < map->size) {
21058 /* new limit should not be lower than its current size */
21059 DTRACE_VM2(vm_map_set_data_limit_fail,
21060 vm_map_size_t, map->size,
21061 uint64_t, new_data_limit);
21062 kr = KERN_FAILURE;
21063 } else if (new_data_limit == map->data_limit) {
21064 /* no change */
21065 kr = KERN_SUCCESS;
21066 } else {
21067 /* set new limit */
21068 DTRACE_VM2(vm_map_set_data_limit,
21069 vm_map_size_t, map->size,
21070 uint64_t, new_data_limit);
21071 if (new_data_limit != RLIM_INFINITY) {
21072 vm_map_set_data_limit_count++;
21073 }
21074 map->data_limit = new_data_limit;
21075 kr = KERN_SUCCESS;
21076 }
21077 vm_map_unlock(map);
21078 return kr;
21079 }
21080
/* Set the per-map cap on user-wired memory (RLIMIT_MEMLOCK copy). */
void
vm_map_set_user_wire_limit(vm_map_t map,
    vm_size_t limit)
{
	vm_map_lock(map);
	map->user_wire_limit = limit;
	vm_map_unlock(map);
}
21089
21090
/* Set or clear the map's switch-protect flag. */
void
vm_map_switch_protect(vm_map_t map,
    boolean_t val)
{
	vm_map_lock(map);
	map->switch_protect = val;
	vm_map_unlock(map);
}
21099
21100 extern int cs_process_enforcement_enable;
21101 boolean_t
vm_map_cs_enforcement(vm_map_t map)21102 vm_map_cs_enforcement(
21103 vm_map_t map)
21104 {
21105 if (cs_process_enforcement_enable) {
21106 return TRUE;
21107 }
21108 return map->cs_enforcement;
21109 }
21110
/*
 * Allow writable+executable (WX) memory in this map.
 * With a code signing monitor, ask it to permit invalid code;
 * KERN_NOT_SUPPORTED from the monitor is treated as success.
 */
kern_return_t
vm_map_cs_wx_enable(
	__unused vm_map_t map)
{
#if CODE_SIGNING_MONITOR
	kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
	if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
		return KERN_SUCCESS;
	}
	return ret;
#else
	/* The VM manages WX memory entirely on its own */
	return KERN_SUCCESS;
#endif
}
21126
/*
 * Ask the code signing monitor (if any) to permit a JIT region
 * in this map.
 */
kern_return_t
vm_map_csm_allow_jit(
	__unused vm_map_t map)
{
#if CODE_SIGNING_MONITOR
	return csm_allow_jit_region(vm_map_pmap(map));
#else
	/* No code signing monitor to enforce JIT policy */
	return KERN_SUCCESS;
#endif
}
21138
/* Set or clear the map's "code-signing debugged" flag. */
void
vm_map_cs_debugged_set(
	vm_map_t map,
	boolean_t val)
{
	vm_map_lock(map);
	map->cs_debugged = val;
	vm_map_unlock(map);
}
21148
/* Set the per-map code-signing enforcement flag and mirror it into
 * the pmap so faults can consult it without the map lock. */
void
vm_map_cs_enforcement_set(
	vm_map_t map,
	boolean_t val)
{
	vm_map_lock(map);
	map->cs_enforcement = val;
	pmap_set_vm_map_cs_enforced(map->pmap, val);
	vm_map_unlock(map);
}
21159
21160 /*
21161 * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
21162 * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
21163 * bump both counters.
21164 */
/*
 * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
 * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
 * bump both counters.
 */
void
vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
{
	pmap_t pmap = vm_map_pmap(map);

	ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
	ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
}
21173
/* Inverse of vm_map_iokit_mapped_region(): debit both ledgers when
 * IOKit unmaps a region from this map. */
void
vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
{
	pmap_t pmap = vm_map_pmap(map);

	ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
	ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
}
21182
21183 /* Add (generate) code signature for memory range */
21184 #if CONFIG_DYNAMIC_CODE_SIGNING
/*
 * vm_map_sign:
 *
 * Mark every resident page in [start, end) of a single map entry as
 * code-signing validated, then disconnect the pages from the pmap so
 * any later modification attempt is noticed.  The range must be fully
 * covered by one non-submap entry with a present object, and every
 * page must already be resident and in a normal state.
 */
kern_return_t
vm_map_sign(vm_map_t map,
    vm_map_offset_t start,
    vm_map_offset_t end)
{
	vm_map_entry_t entry;
	vm_page_t m;
	vm_object_t object;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlying object. Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->vme_start > start) || (entry->vme_end < end)) {
		/*
		 * Map entry doesn't cover the requested range. Not handling
		 * this situation currently.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL) {
		/*
		 * Object must already be present or we can't sign.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	/* hold the object lock across the page walk; the map lock can go */
	vm_object_lock(object);
	vm_map_unlock_read(map);

	while (start < end) {
		uint32_t refmod;

		m = vm_page_lookup(object,
		    start - entry->vme_start + VME_OFFSET(entry));
		if (m == VM_PAGE_NULL) {
			/* should we try to fault a page here? we can probably
			 * demand it exists and is locked for this request */
			vm_object_unlock(object);
			return KERN_FAILURE;
		}
		/* deal with special page status */
		if (m->vmp_busy ||
		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
			vm_object_unlock(object);
			return KERN_FAILURE;
		}

		/* Page is OK... now "validate" it */
		/* This is the place where we'll call out to create a code
		 * directory, later */
		/* XXX TODO4K: deal with 4k subpages individually? */
		m->vmp_cs_validated = VMP_CS_ALL_TRUE;

		/* The page is now "clean" for codesigning purposes. That means
		 * we don't consider it as modified (wpmapped) anymore. But
		 * we'll disconnect the page so we note any future modification
		 * attempts. */
		m->vmp_wpmapped = FALSE;
		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

		/* Pull the dirty status from the pmap, since we cleared the
		 * wpmapped bit */
		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
			SET_PAGE_DIRTY(m, FALSE);
		}

		/* On to the next page */
		start += PAGE_SIZE;
	}
	vm_object_unlock(object);

	return KERN_SUCCESS;
}
21281 #endif
21282
/*
 * vm_map_partial_reap:
 *
 * Reclaim map entries whose backing object is internal and mapped
 * only here (ref_count == 1).  The resident and compressed page
 * counts of reclaimed objects are accumulated into the out
 * parameters (callers must pre-zero them).  Always returns
 * KERN_SUCCESS as currently written.
 */
kern_return_t
vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
{
	vm_map_entry_t entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t next_entry;
	kern_return_t kr = KERN_SUCCESS;
	VM_MAP_ZAP_DECLARE(zap_list);

	vm_map_lock(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = next_entry) {
		/* capture the successor before possibly deleting "entry" */
		next_entry = entry->vme_next;

		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    (VME_OBJECT(entry)->internal == TRUE) &&
		    (VME_OBJECT(entry)->ref_count == 1)) {
			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);

			(void)vm_map_delete(map, entry->vme_start,
			    entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
			    KMEM_GUARD_NONE, &zap_list);
		}
	}

	vm_map_unlock(map);

	/* free the zapped entries outside the map lock */
	vm_map_zap_dispose(&zap_list);

	return kr;
}
21317
21318
21319 #if DEVELOPMENT || DEBUG
21320
/*
 * vm_map_disconnect_page_mappings:
 *
 * (DEVELOPMENT || DEBUG only)  Remove all pmap mappings for the map's
 * entries, optionally unnesting shared submap regions first so only
 * this task's pmap is affected.  Returns the task's physical memory
 * balance, in map pages, sampled before the disconnect.
 */
int
vm_map_disconnect_page_mappings(
	vm_map_t map,
	boolean_t do_unnest)
{
	vm_map_entry_t entry;
	ledger_amount_t byte_count = 0;

	if (do_unnest == TRUE) {
#ifndef NO_NESTED_PMAP
		vm_map_lock(map);

		for (entry = vm_map_first_entry(map);
		    entry != vm_map_to_entry(map);
		    entry = entry->vme_next) {
			if (entry->is_sub_map && entry->use_pmap) {
				/*
				 * Make sure the range between the start of this entry and
				 * the end of this entry is no longer nested, so that
				 * we will only remove mappings from the pmap in use by
				 * this task
				 */
				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
			}
		}
		vm_map_unlock(map);
#endif
	}
	vm_map_lock_read(map);

	/* sample the physical memory balance before tearing mappings down */
	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		/* skip empty entries and physically contiguous objects */
		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
		    (VME_OBJECT(entry)->phys_contiguous))) {
			continue;
		}
		if (entry->is_sub_map) {
			assert(!entry->use_pmap);
		}

		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
	}
	vm_map_unlock_read(map);

	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
}
21370
/*
 * vm_map_inject_error:
 *
 * (DEVELOPMENT || DEBUG only)  Inject a decompression error for the
 * compressed page backing "vaddr", for fault-path testing.  Fails
 * with KERN_MEMORY_ERROR if no object backs the address and
 * KERN_MEMORY_PRESENT if the page is resident (has no pager).
 */
kern_return_t
vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
{
	vm_object_t object = NULL;
	vm_object_offset_t offset;
	vm_prot_t prot;
	boolean_t wired;
	vm_map_version_t version;
	vm_map_t real_map;
	int result = KERN_FAILURE;

	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
	vm_map_lock(map);

	/* returns with the object locked (exclusive) on success */
	result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
	    NULL, &real_map, NULL);
	if (object == NULL) {
		result = KERN_MEMORY_ERROR;
	} else if (object->pager) {
		result = vm_compressor_pager_inject_error(object->pager,
		    offset);
	} else {
		result = KERN_MEMORY_PRESENT;
	}

	if (object != NULL) {
		vm_object_unlock(object);
	}

	/* the lookup may have descended into a submap's real map */
	if (real_map != map) {
		vm_map_unlock(real_map);
	}
	vm_map_unlock(map);

	return result;
}
21408
21409 #endif
21410
21411
21412 #if CONFIG_FREEZE
21413
21414
21415 extern struct freezer_context freezer_context_global;
21416 AbsoluteTime c_freezer_last_yield_ts = 0;
21417
21418 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
21419 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
21420
21421 kern_return_t
vm_map_freeze(task_t task,unsigned int * purgeable_count,unsigned int * wired_count,unsigned int * clean_count,unsigned int * dirty_count,unsigned int dirty_budget,unsigned int * shared_count,int * freezer_error_code,boolean_t eval_only)21422 vm_map_freeze(
21423 task_t task,
21424 unsigned int *purgeable_count,
21425 unsigned int *wired_count,
21426 unsigned int *clean_count,
21427 unsigned int *dirty_count,
21428 unsigned int dirty_budget,
21429 unsigned int *shared_count,
21430 int *freezer_error_code,
21431 boolean_t eval_only)
21432 {
21433 vm_map_entry_t entry2 = VM_MAP_ENTRY_NULL;
21434 kern_return_t kr = KERN_SUCCESS;
21435 boolean_t evaluation_phase = TRUE;
21436 vm_object_t cur_shared_object = NULL;
21437 int cur_shared_obj_ref_cnt = 0;
21438 unsigned int dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;
21439
21440 *purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;
21441
21442 /*
21443 * We need the exclusive lock here so that we can
21444 * block any page faults or lookups while we are
21445 * in the middle of freezing this vm map.
21446 */
21447 vm_map_t map = task->map;
21448
21449 vm_map_lock(map);
21450
21451 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
21452
21453 if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
21454 if (vm_compressor_low_on_space()) {
21455 *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
21456 }
21457
21458 if (vm_swap_low_on_space()) {
21459 *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
21460 }
21461
21462 kr = KERN_NO_SPACE;
21463 goto done;
21464 }
21465
21466 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
21467 /*
21468 * In-memory compressor backing the freezer. No disk.
21469 * So no need to do the evaluation phase.
21470 */
21471 evaluation_phase = FALSE;
21472
21473 if (eval_only == TRUE) {
21474 /*
21475 * We don't support 'eval_only' mode
21476 * in this non-swap config.
21477 */
21478 *freezer_error_code = FREEZER_ERROR_GENERIC;
21479 kr = KERN_INVALID_ARGUMENT;
21480 goto done;
21481 }
21482
21483 freezer_context_global.freezer_ctx_uncompressed_pages = 0;
21484 clock_get_uptime(&c_freezer_last_yield_ts);
21485 }
21486 again:
21487
21488 for (entry2 = vm_map_first_entry(map);
21489 entry2 != vm_map_to_entry(map);
21490 entry2 = entry2->vme_next) {
21491 vm_object_t src_object;
21492
21493 if (entry2->is_sub_map) {
21494 continue;
21495 }
21496
21497 src_object = VME_OBJECT(entry2);
21498 if (!src_object ||
21499 src_object->phys_contiguous ||
21500 !src_object->internal) {
21501 continue;
21502 }
21503
21504 /* If eligible, scan the entry, moving eligible pages over to our parent object */
21505
21506 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
21507 /*
21508 * We skip purgeable objects during evaluation phase only.
21509 * If we decide to freeze this process, we'll explicitly
21510 * purge these objects before we go around again with
21511 * 'evaluation_phase' set to FALSE.
21512 */
21513
21514 if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
21515 /*
21516 * We want to purge objects that may not belong to this task but are mapped
21517 * in this task alone. Since we already purged this task's purgeable memory
21518 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
21519 * on this task's purgeable objects. Hence the check for only volatile objects.
21520 */
21521 if (evaluation_phase == FALSE &&
21522 (src_object->purgable == VM_PURGABLE_VOLATILE) &&
21523 (src_object->ref_count == 1)) {
21524 vm_object_lock(src_object);
21525 vm_object_purge(src_object, 0);
21526 vm_object_unlock(src_object);
21527 }
21528 continue;
21529 }
21530
21531 /*
21532 * Pages belonging to this object could be swapped to disk.
21533 * Make sure it's not a shared object because we could end
21534 * up just bringing it back in again.
21535 *
21536 * We try to optimize somewhat by checking for objects that are mapped
21537 * more than once within our own map. But we don't do full searches,
21538 * we just look at the entries following our current entry.
21539 */
21540
21541 if (src_object->ref_count > 1) {
21542 if (src_object != cur_shared_object) {
21543 obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
21544 dirty_shared_count += obj_pages_snapshot;
21545
21546 cur_shared_object = src_object;
21547 cur_shared_obj_ref_cnt = 1;
21548 continue;
21549 } else {
21550 cur_shared_obj_ref_cnt++;
21551 if (src_object->ref_count == cur_shared_obj_ref_cnt) {
21552 /*
21553 * Fall through to below and treat this object as private.
21554 * So deduct its pages from our shared total and add it to the
21555 * private total.
21556 */
21557
21558 dirty_shared_count -= obj_pages_snapshot;
21559 dirty_private_count += obj_pages_snapshot;
21560 } else {
21561 continue;
21562 }
21563 }
21564 }
21565
21566
21567 if (src_object->ref_count == 1) {
21568 dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
21569 }
21570
21571 if (evaluation_phase == TRUE) {
21572 continue;
21573 }
21574 }
21575
21576 uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
21577 *wired_count += src_object->wired_page_count;
21578
21579 if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
21580 if (vm_compressor_low_on_space()) {
21581 *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
21582 }
21583
21584 if (vm_swap_low_on_space()) {
21585 *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
21586 }
21587
21588 kr = KERN_NO_SPACE;
21589 break;
21590 }
21591 if (paged_out_count >= dirty_budget) {
21592 break;
21593 }
21594 dirty_budget -= paged_out_count;
21595 }
21596
21597 *shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
21598 if (evaluation_phase) {
21599 unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;
21600
21601 if (dirty_shared_count > shared_pages_threshold) {
21602 *freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
21603 kr = KERN_FAILURE;
21604 goto done;
21605 }
21606
21607 if (dirty_shared_count &&
21608 ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
21609 *freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
21610 kr = KERN_FAILURE;
21611 goto done;
21612 }
21613
21614 evaluation_phase = FALSE;
21615 dirty_shared_count = dirty_private_count = 0;
21616
21617 freezer_context_global.freezer_ctx_uncompressed_pages = 0;
21618 clock_get_uptime(&c_freezer_last_yield_ts);
21619
21620 if (eval_only) {
21621 kr = KERN_SUCCESS;
21622 goto done;
21623 }
21624
21625 vm_purgeable_purge_task_owned(task);
21626
21627 goto again;
21628 } else {
21629 kr = KERN_SUCCESS;
21630 }
21631
21632 done:
21633 vm_map_unlock(map);
21634
21635 if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
21636 vm_object_compressed_freezer_done();
21637 }
21638 return kr;
21639 }
21640
21641 #endif
21642
21643 /*
21644 * vm_map_entry_should_cow_for_true_share:
21645 *
21646 * Determines if the map entry should be clipped and setup for copy-on-write
21647 * to avoid applying "true_share" to a large VM object when only a subset is
21648 * targeted.
21649 *
21650 * For now, we target only the map entries created for the Objective C
21651 * Garbage Collector, which initially have the following properties:
21652 * - alias == VM_MEMORY_MALLOC
21653 * - wired_count == 0
21654 * - !needs_copy
21655 * and a VM object with:
21656 * - internal
21657 * - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
21658 * - !true_share
21659 * - vo_size == ANON_CHUNK_SIZE
21660 *
21661 * Only non-kernel map entries.
21662 */
21663 boolean_t
vm_map_entry_should_cow_for_true_share(vm_map_entry_t entry)21664 vm_map_entry_should_cow_for_true_share(
21665 vm_map_entry_t entry)
21666 {
21667 vm_object_t object;
21668
21669 if (entry->is_sub_map) {
21670 /* entry does not point at a VM object */
21671 return FALSE;
21672 }
21673
21674 if (entry->needs_copy) {
21675 /* already set for copy_on_write: done! */
21676 return FALSE;
21677 }
21678
21679 if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
21680 VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
21681 /* not a malloc heap or Obj-C Garbage Collector heap */
21682 return FALSE;
21683 }
21684
21685 if (entry->wired_count) {
21686 /* wired: can't change the map entry... */
21687 vm_counters.should_cow_but_wired++;
21688 return FALSE;
21689 }
21690
21691 object = VME_OBJECT(entry);
21692
21693 if (object == VM_OBJECT_NULL) {
21694 /* no object yet... */
21695 return FALSE;
21696 }
21697
21698 if (!object->internal) {
21699 /* not an internal object */
21700 return FALSE;
21701 }
21702
21703 if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
21704 /* not the default copy strategy */
21705 return FALSE;
21706 }
21707
21708 if (object->true_share) {
21709 /* already true_share: too late to avoid it */
21710 return FALSE;
21711 }
21712
21713 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
21714 object->vo_size != ANON_CHUNK_SIZE) {
21715 /* ... not an object created for the ObjC Garbage Collector */
21716 return FALSE;
21717 }
21718
21719 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
21720 object->vo_size != 2048 * 4096) {
21721 /* ... not a "MALLOC_SMALL" heap */
21722 return FALSE;
21723 }
21724
21725 /*
21726 * All the criteria match: we have a large object being targeted for "true_share".
21727 * To limit the adverse side-effects linked with "true_share", tell the caller to
21728 * try and avoid setting up the entire object for "true_share" by clipping the
21729 * targeted range and setting it up for copy-on-write.
21730 */
21731 return TRUE;
21732 }
21733
21734 uint64_t vm_map_range_overflows_count = 0;
21735 TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE);
21736 bool
vm_map_range_overflows(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size)21737 vm_map_range_overflows(
21738 vm_map_t map,
21739 vm_map_offset_t addr,
21740 vm_map_size_t size)
21741 {
21742 vm_map_offset_t start, end, sum;
21743 vm_map_offset_t pgmask;
21744
21745 if (size == 0) {
21746 /* empty range -> no overflow */
21747 return false;
21748 }
21749 pgmask = vm_map_page_mask(map);
21750 start = vm_map_trunc_page_mask(addr, pgmask);
21751 end = vm_map_round_page_mask(addr + size, pgmask);
21752 if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) {
21753 vm_map_range_overflows_count++;
21754 if (vm_map_range_overflows_log) {
21755 printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n",
21756 proc_selfpid(),
21757 proc_best_name(current_proc()),
21758 (uint64_t)addr,
21759 (uint64_t)size,
21760 (uint64_t)pgmask);
21761 }
21762 DTRACE_VM4(vm_map_range_overflows,
21763 vm_map_t, map,
21764 uint32_t, pgmask,
21765 uint64_t, (uint64_t)addr,
21766 uint64_t, (uint64_t)size);
21767 return true;
21768 }
21769 return false;
21770 }
21771
vm_map_offset_t
vm_map_round_page_mask(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	/* Round "offset" up to the page boundary described by "mask". */
	return VM_MAP_ROUND_PAGE(offset, mask);
}
21779
vm_map_offset_t
vm_map_trunc_page_mask(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	/* Truncate "offset" down to the page boundary described by "mask". */
	return VM_MAP_TRUNC_PAGE(offset, mask);
}
21787
boolean_t
vm_map_page_aligned(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	/* TRUE if "offset" has none of the "mask" bits set, i.e. page-aligned. */
	return ((offset) & mask) == 0;
}
21795
int
vm_map_page_shift(
	vm_map_t map)
{
	/* Accessor: the map's page shift (log2 of its page size). */
	return VM_MAP_PAGE_SHIFT(map);
}
21802
int
vm_map_page_size(
	vm_map_t map)
{
	/* Accessor: the map's page size in bytes. */
	return VM_MAP_PAGE_SIZE(map);
}
21809
vm_map_offset_t
vm_map_page_mask(
	vm_map_t map)
{
	/* Accessor: the map's page mask (page size - 1). */
	return VM_MAP_PAGE_MASK(map);
}
21816
21817 kern_return_t
vm_map_set_page_shift(vm_map_t map,int pageshift)21818 vm_map_set_page_shift(
21819 vm_map_t map,
21820 int pageshift)
21821 {
21822 if (map->hdr.nentries != 0) {
21823 /* too late to change page size */
21824 return KERN_FAILURE;
21825 }
21826
21827 map->hdr.page_shift = (uint16_t)pageshift;
21828
21829 return KERN_SUCCESS;
21830 }
21831
/*
 * vm_map_query_volatile:
 *	Walk "map" (which the caller must hold locked) and total up, for its
 *	writable mappings of volatile/empty purgeable objects: virtual size,
 *	resident pages, compressor pages, and pmap-level resident/compressed
 *	pages.  Results are returned in bytes through the out parameters.
 *	Always returns KERN_SUCCESS; the map is still locked on return.
 */
kern_return_t
vm_map_query_volatile(
	vm_map_t        map,
	mach_vm_size_t  *volatile_virtual_size_p,
	mach_vm_size_t  *volatile_resident_size_p,
	mach_vm_size_t  *volatile_compressed_size_p,
	mach_vm_size_t  *volatile_pmap_size_p,
	mach_vm_size_t  *volatile_compressed_pmap_size_p)
{
	mach_vm_size_t  volatile_virtual_size;
	mach_vm_size_t  volatile_resident_count;
	mach_vm_size_t  volatile_compressed_count;
	mach_vm_size_t  volatile_pmap_count;
	mach_vm_size_t  volatile_compressed_pmap_count;
	mach_vm_size_t  resident_count;
	vm_map_entry_t  entry;
	vm_object_t     object;

	/* map should be locked by caller */

	volatile_virtual_size = 0;
	volatile_resident_count = 0;
	volatile_compressed_count = 0;
	volatile_pmap_count = 0;
	volatile_compressed_pmap_count = 0;

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		mach_vm_size_t  pmap_resident_bytes, pmap_compressed_bytes;

		if (entry->is_sub_map) {
			continue;
		}
		if (!(entry->protection & VM_PROT_WRITE)) {
			/* read-only mappings are not counted */
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			continue;
		}
		if (object->purgable != VM_PURGABLE_VOLATILE &&
		    object->purgable != VM_PURGABLE_EMPTY) {
			continue;
		}
		if (VME_OFFSET(entry)) {
			/*
			 * If the map entry has been split and the object now
			 * appears several times in the VM map, we don't want
			 * to count the object's resident_page_count more than
			 * once. We count it only for the first one, starting
			 * at offset 0 and ignore the other VM map entries.
			 */
			continue;
		}
		/*
		 * NOTE(review): VME_OFFSET(entry) is always 0 past this point
		 * (non-zero offsets were skipped just above), so the
		 * adjustment below is effectively a no-op kept for safety.
		 */
		resident_count = object->resident_page_count;
		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
			resident_count = 0;
		} else {
			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
		}

		volatile_virtual_size += entry->vme_end - entry->vme_start;
		volatile_resident_count += resident_count;
		if (object->pager) {
			/* pages handed off to the compressor pager */
			volatile_compressed_count +=
			    vm_compressor_pager_get_count(object->pager);
		}
		pmap_compressed_bytes = 0;
		pmap_resident_bytes =
		    pmap_query_resident(map->pmap,
		    entry->vme_start,
		    entry->vme_end,
		    &pmap_compressed_bytes);
		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
		volatile_compressed_pmap_count += (pmap_compressed_bytes
		    / PAGE_SIZE);
	}

	/* map is still locked on return */

	/* convert page counts into byte sizes for the caller */
	*volatile_virtual_size_p = volatile_virtual_size;
	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;

	return KERN_SUCCESS;
}
21921
21922 void
vm_map_sizes(vm_map_t map,vm_map_size_t * psize,vm_map_size_t * pfree,vm_map_size_t * plargest_free)21923 vm_map_sizes(vm_map_t map,
21924 vm_map_size_t * psize,
21925 vm_map_size_t * pfree,
21926 vm_map_size_t * plargest_free)
21927 {
21928 vm_map_entry_t entry;
21929 vm_map_offset_t prev;
21930 vm_map_size_t free, total_free, largest_free;
21931 boolean_t end;
21932
21933 if (!map) {
21934 *psize = *pfree = *plargest_free = 0;
21935 return;
21936 }
21937 total_free = largest_free = 0;
21938
21939 vm_map_lock_read(map);
21940 if (psize) {
21941 *psize = map->max_offset - map->min_offset;
21942 }
21943
21944 prev = map->min_offset;
21945 for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
21946 end = (entry == vm_map_to_entry(map));
21947
21948 if (end) {
21949 free = entry->vme_end - prev;
21950 } else {
21951 free = entry->vme_start - prev;
21952 }
21953
21954 total_free += free;
21955 if (free > largest_free) {
21956 largest_free = free;
21957 }
21958
21959 if (end) {
21960 break;
21961 }
21962 prev = entry->vme_end;
21963 }
21964 vm_map_unlock_read(map);
21965 if (pfree) {
21966 *pfree = total_free;
21967 }
21968 if (plargest_free) {
21969 *plargest_free = largest_free;
21970 }
21971 }
21972
21973 #if VM_SCAN_FOR_SHADOW_CHAIN
/*
 * vm_map_shadow_max:
 *	Return the length of the longest VM object shadow chain found behind
 *	any entry of "map" (0 for a NULL map).  Diagnostic-only scan.
 */
int vm_map_shadow_max(vm_map_t map);
int
vm_map_shadow_max(
	vm_map_t map)
{
	int             shadows, shadows_max;
	vm_map_entry_t  entry;
	vm_object_t     object, next_object;

	if (map == NULL) {
		return 0;
	}

	shadows_max = 0;

	vm_map_lock_read(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			/* submaps have no object / shadow chain here */
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == NULL) {
			continue;
		}
		vm_object_lock_shared(object);
		/*
		 * Walk down the shadow chain counting links, taking the
		 * next object's lock before releasing the current one
		 * (hand-over-hand locking).
		 */
		for (shadows = 0;
		    object->shadow != NULL;
		    shadows++, object = next_object) {
			next_object = object->shadow;
			vm_object_lock_shared(next_object);
			vm_object_unlock(object);
		}
		vm_object_unlock(object);
		if (shadows > shadows_max) {
			shadows_max = shadows;
		}
	}

	vm_map_unlock_read(map);

	return shadows_max;
}
22019 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
22020
void
vm_commit_pagezero_status(vm_map_t lmap)
{
	/* Advise the pmap layer of the map's lowest valid address (page-zero range). */
	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
}
22026
22027 #if __x86_64__
void
vm_map_set_high_start(
	vm_map_t        map,
	vm_map_offset_t high_start)
{
	/* Record the map's "high start" address hint (x86_64 only). */
	map->vmmap_high_start = high_start;
}
22035 #endif /* __x86_64__ */
22036
22037 #if CODE_SIGNING_MONITOR
22038
/*
 * vm_map_entry_cs_associate:
 *	Tell the code-signing monitor about an executable (or about-to-be-
 *	debugged) mapping: JIT regions and debug-remapped regions are
 *	registered as such; otherwise the backing vnode's code-signature
 *	blobs are associated with the mapping.  On success the entry is
 *	marked csm_associated (and possibly permanent); on a real failure,
 *	execute permission is stripped from the entry.
 */
kern_return_t
vm_map_entry_cs_associate(
	vm_map_t                map,
	vm_map_entry_t          entry,
	vm_map_kernel_flags_t   vmk_flags)
{
	vm_object_t cs_object, cs_shadow, backing_object;
	vm_object_offset_t cs_offset, backing_offset;
	void *cs_blobs;
	struct vnode *cs_vnode;
	kern_return_t cs_ret;

	/* no pmap, submap, exempt address space or no object: nothing to do */
	if (map->pmap == NULL ||
	    entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
	    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
	    VME_OBJECT(entry) == VM_OBJECT_NULL) {
		return KERN_SUCCESS;
	}

	if (!(entry->protection & VM_PROT_EXECUTE)) {
		/*
		 * This memory region is not executable, so the code-signing
		 * monitor would usually not care about it...
		 */
		if (vmk_flags.vmkf_remap_prot_copy &&
		    (entry->max_protection & VM_PROT_EXECUTE)) {
			/*
			 * ... except if the memory region is being remapped
			 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
			 * which is what a debugger or dtrace would be doing
			 * to prepare to modify an executable page to insert
			 * a breakpoint or activate a probe.
			 * In that case, fall through so that we can mark
			 * this region as being "debugged" and no longer
			 * strictly code-signed.
			 */
		} else {
			/*
			 * Really not executable, so no need to tell the
			 * code-signing monitor.
			 */
			return KERN_SUCCESS;
		}
	}

	vm_map_lock_assert_exclusive(map);

	if (entry->used_for_jit) {
		/* JIT region: register the range itself, no signatures involved */
		cs_ret = csm_associate_jit_region(
			map->pmap,
			entry->vme_start,
			entry->vme_end - entry->vme_start);
		goto done;
	}

	if (vmk_flags.vmkf_remap_prot_copy) {
		/* debugger/dtrace remap: mark the range as "debugged" */
		cs_ret = csm_associate_debug_region(
			map->pmap,
			entry->vme_start,
			entry->vme_end - entry->vme_start);
		if (cs_ret == KERN_SUCCESS) {
			entry->vme_xnu_user_debug = TRUE;
		}
#if DEVELOPMENT || DEBUG
		if (vm_log_xnu_user_debug) {
			printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug=%d cs_ret %d\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
			    __FUNCTION__, __LINE__,
			    map, entry,
			    (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
			    entry->vme_xnu_user_debug,
			    cs_ret);
		}
#endif /* DEVELOPMENT || DEBUG */
		goto done;
	}

	cs_object = VME_OBJECT(entry);
	vm_object_lock_shared(cs_object);
	cs_offset = VME_OFFSET(entry);

	/* find the VM object backed by the code-signed vnode */
	for (;;) {
		/*
		 * go to the bottom of cs_object's shadow chain
		 * (hand-over-hand locking: lock the shadow before unlocking
		 * the current object, accumulating the shadow offsets)
		 */
		for (;
		    cs_object->shadow != VM_OBJECT_NULL;
		    cs_object = cs_shadow) {
			cs_shadow = cs_object->shadow;
			cs_offset += cs_object->vo_shadow_offset;
			vm_object_lock_shared(cs_shadow);
			vm_object_unlock(cs_object);
		}
		if (cs_object->internal ||
		    cs_object->pager == MEMORY_OBJECT_NULL) {
			/* anonymous or pager-less object: nothing to associate */
			vm_object_unlock(cs_object);
			return KERN_SUCCESS;
		}

		cs_offset += cs_object->paging_offset;

		/*
		 * cs_object could be backed by a:
		 * vnode_pager
		 * apple_protect_pager
		 * shared_region_pager
		 * fourk_pager (multiple backing objects -> fail?)
		 * ask the pager if it has a backing VM object
		 */
		if (!memory_object_backing_object(cs_object->pager,
		    cs_offset,
		    &backing_object,
		    &backing_offset)) {
			/* no backing object: cs_object is it */
			break;
		}

		/* look down the backing object's shadow chain */
		vm_object_lock_shared(backing_object);
		vm_object_unlock(cs_object);
		cs_object = backing_object;
		cs_offset = backing_offset;
	}

	cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
	if (cs_vnode == NULL) {
		/* no vnode, no code signatures to associate */
		cs_ret = KERN_SUCCESS;
	} else {
		/* hand the vnode's code-signature blobs to the pmap layer */
		cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
		    &cs_blobs);
		assert(cs_ret == KERN_SUCCESS);
		cs_ret = cs_associate_blob_with_mapping(map->pmap,
		    entry->vme_start,
		    (entry->vme_end - entry->vme_start),
		    cs_offset,
		    cs_blobs);
	}
	vm_object_unlock(cs_object);
	cs_object = VM_OBJECT_NULL;

done:
	if (cs_ret == KERN_SUCCESS) {
		DTRACE_VM2(vm_map_entry_cs_associate_success,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end);
		if (vm_map_executable_immutable) {
			/*
			 * Prevent this executable
			 * mapping from being unmapped
			 * or modified.
			 */
			entry->vme_permanent = TRUE;
		}
		/*
		 * pmap says it will validate the
		 * code-signing validity of pages
		 * faulted in via this mapping, so
		 * this map entry should be marked so
		 * that vm_fault() bypasses code-signing
		 * validation for faults coming through
		 * this mapping.
		 */
		entry->csm_associated = TRUE;
	} else if (cs_ret == KERN_NOT_SUPPORTED) {
		/*
		 * pmap won't check the code-signing
		 * validity of pages faulted in via
		 * this mapping, so VM should keep
		 * doing it.
		 */
		DTRACE_VM3(vm_map_entry_cs_associate_off,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end,
		    int, cs_ret);
	} else {
		/*
		 * A real error: do not allow
		 * execution in this mapping.
		 */
		DTRACE_VM3(vm_map_entry_cs_associate_failure,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end,
		    int, cs_ret);
		if (vmk_flags.vmkf_overwrite_immutable) {
			/*
			 * We can get here when we remap an apple_protect pager
			 * on top of an already cs_associated executable mapping
			 * with the same code signatures, so we don't want to
			 * lose VM_PROT_EXECUTE in that case...
			 */
		} else {
			entry->protection &= ~VM_PROT_ALLEXEC;
			entry->max_protection &= ~VM_PROT_ALLEXEC;
		}
	}

	return cs_ret;
}
22238
22239 #endif /* CODE_SIGNING_MONITOR */
22240
22241 inline bool
vm_map_is_corpse_source(vm_map_t map)22242 vm_map_is_corpse_source(vm_map_t map)
22243 {
22244 bool status = false;
22245 if (map) {
22246 vm_map_lock_read(map);
22247 status = map->corpse_source;
22248 vm_map_unlock_read(map);
22249 }
22250 return status;
22251 }
22252
22253 inline void
vm_map_set_corpse_source(vm_map_t map)22254 vm_map_set_corpse_source(vm_map_t map)
22255 {
22256 if (map) {
22257 vm_map_lock(map);
22258 map->corpse_source = true;
22259 vm_map_unlock(map);
22260 }
22261 }
22262
22263 inline void
vm_map_unset_corpse_source(vm_map_t map)22264 vm_map_unset_corpse_source(vm_map_t map)
22265 {
22266 if (map) {
22267 vm_map_lock(map);
22268 map->corpse_source = false;
22269 vm_map_unlock(map);
22270 }
22271 }
22272 /*
22273 * FORKED CORPSE FOOTPRINT
22274 *
22275 * A forked corpse gets a copy of the original VM map but its pmap is mostly
22276 * empty since it never ran and never got to fault in any pages.
22277 * Collecting footprint info (via "sysctl vm.self_region_footprint") for
22278 * a forked corpse would therefore return very little information.
22279 *
22280 * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
22281 * to vm_map_fork() to collect footprint information from the original VM map
22282 * and its pmap, and store it in the forked corpse's VM map. That information
22283 * is stored in place of the VM map's "hole list" since we'll never need to
22284 * lookup for holes in the corpse's map.
22285 *
22286 * The corpse's footprint info looks like this:
22287 *
22288 * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
22289 * as follows:
22290 * +---------------------------------------+
22291 * header-> | cf_size |
22292 * +-------------------+-------------------+
22293 * | cf_last_region | cf_last_zeroes |
22294 * +-------------------+-------------------+
22295 * region1-> | cfr_vaddr |
22296 * +-------------------+-------------------+
22297 * | cfr_num_pages | d0 | d1 | d2 | d3 |
22298 * +---------------------------------------+
22299 * | d4 | d5 | ... |
22300 * +---------------------------------------+
22301 * | ... |
22302 * +-------------------+-------------------+
22303 * | dy | dz | na | na | cfr_vaddr... | <-region2
22304 * +-------------------+-------------------+
22305 * | cfr_vaddr (ctd) | cfr_num_pages |
22306 * +---------------------------------------+
22307 * | d0 | d1 ... |
22308 * +---------------------------------------+
22309 * ...
22310 * +---------------------------------------+
22311 * last region-> | cfr_vaddr |
22312 * +---------------------------------------+
22313 * + cfr_num_pages | d0 | d1 | d2 | d3 |
22314 * +---------------------------------------+
22315 * ...
22316 * +---------------------------------------+
22317 * | dx | dy | dz | na | na | na | na | na |
22318 * +---------------------------------------+
22319 *
22320 * where:
22321 * cf_size: total size of the buffer (rounded to page size)
22322 * cf_last_region: offset in the buffer of the last "region" sub-header
22323 * cf_last_zeroes: number of trailing "zero" dispositions at the end
22324 * of last region
22325 * cfr_vaddr: virtual address of the start of the covered "region"
22326 * cfr_num_pages: number of pages in the covered "region"
22327 * d*: disposition of the page at that virtual address
22328 * Regions in the buffer are word-aligned.
22329 *
22330 * We estimate the size of the buffer based on the number of memory regions
22331 * and the virtual size of the address space. While copying each memory region
22332 * during vm_map_fork(), we also collect the footprint info for that region
22333 * and store it in the buffer, packing it as much as possible (coalescing
22334 * contiguous memory regions to avoid having too many region headers and
22335 * avoiding long streaks of "zero" page dispositions by splitting footprint
 * "regions"), so the number of regions in the footprint buffer might not match
 * the number of memory regions in the address space.
22338 *
22339 * We also have to copy the original task's "nonvolatile" ledgers since that's
22340 * part of the footprint and will need to be reported to any tool asking for
22341 * the footprint information of the forked corpse.
22342 */
22343
/*
 * Diagnostic counters for corpse-footprint buffer creation.
 * NOTE(review): meanings inferred from names except "no_buf", which is
 * visibly bumped when the footprint buffer allocation fails below.
 */
uint64_t vm_map_corpse_footprint_count = 0;
uint64_t vm_map_corpse_footprint_size_avg = 0;
uint64_t vm_map_corpse_footprint_size_max = 0;
uint64_t vm_map_corpse_footprint_full = 0;
uint64_t vm_map_corpse_footprint_no_buf = 0;    /* buffer allocation failures */
22349
/* Buffer header: see the "FORKED CORPSE FOOTPRINT" layout diagram above. */
struct vm_map_corpse_footprint_header {
	vm_size_t       cf_size;        /* allocated buffer size */
	uint32_t        cf_last_region; /* offset of last region in buffer */
	union {
		uint32_t cfu_last_zeroes; /* during creation:
		                           * number of "zero" dispositions at
		                           * end of last region */
		uint32_t cfu_hint_region; /* during lookup:
		                           * offset of last looked up region */
#define cf_last_zeroes cfu.cfu_last_zeroes
#define cf_hint_region cfu.cfu_hint_region
	} cfu;
};
/* One byte of page-disposition data per page (see the conversion helpers below). */
typedef uint8_t cf_disp_t;
/* Variable-length record describing one contiguous run of pages. */
struct vm_map_corpse_footprint_region {
	vm_map_offset_t cfr_vaddr;      /* region start virtual address */
	uint32_t        cfr_num_pages;  /* number of pages in this "region" */
	cf_disp_t       cfr_disposition[0]; /* disposition of each page */
} __attribute__((packed));
22369
22370 static cf_disp_t
vm_page_disposition_to_cf_disp(int disposition)22371 vm_page_disposition_to_cf_disp(
22372 int disposition)
22373 {
22374 assert(sizeof(cf_disp_t) == 1);
22375 /* relocate bits that don't fit in a "uint8_t" */
22376 if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
22377 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
22378 }
22379 /* cast gets rid of extra bits */
22380 return (cf_disp_t) disposition;
22381 }
22382
22383 static int
vm_page_cf_disp_to_disposition(cf_disp_t cf_disp)22384 vm_page_cf_disp_to_disposition(
22385 cf_disp_t cf_disp)
22386 {
22387 int disposition;
22388
22389 assert(sizeof(cf_disp_t) == 1);
22390 disposition = (int) cf_disp;
22391 /* move relocated bits back in place */
22392 if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
22393 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
22394 disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
22395 }
22396 return disposition;
22397 }
22398
22399 /*
22400 * vm_map_corpse_footprint_new_region:
22401 * closes the current footprint "region" and creates a new one
22402 *
22403 * Returns NULL if there's not enough space in the buffer for a new region.
22404 */
static struct vm_map_corpse_footprint_region *
vm_map_corpse_footprint_new_region(
	struct vm_map_corpse_footprint_header *footprint_header)
{
	uintptr_t       footprint_edge;
	uint32_t        new_region_offset;
	struct vm_map_corpse_footprint_region *footprint_region;
	struct vm_map_corpse_footprint_region *new_footprint_region;

	/* one past the end of the buffer: nothing may be written at or past this */
	footprint_edge = ((uintptr_t)footprint_header +
	    footprint_header->cf_size);
	/* locate the current (last) region via its offset in the header */
	footprint_region = ((struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region));
	assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
	    footprint_edge);

	/* get rid of trailing zeroes in the last region */
	assert(footprint_region->cfr_num_pages >=
	    footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -=
	    footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	/* reuse this region if it's now empty */
	if (footprint_region->cfr_num_pages == 0) {
		return footprint_region;
	}

	/* compute offset of new region: current region header + its dispositions, word-aligned */
	new_region_offset = footprint_header->cf_last_region;
	new_region_offset += sizeof(*footprint_region);
	new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
	new_region_offset = roundup(new_region_offset, sizeof(int));

	/* check if we're going over the edge */
	if (((uintptr_t)footprint_header +
	    new_region_offset +
	    sizeof(*footprint_region)) >=
	    footprint_edge) {
		/* over the edge: no new region */
		return NULL;
	}

	/* adjust offset of last region in header */
	footprint_header->cf_last_region = new_region_offset;

	new_footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);
	new_footprint_region->cfr_vaddr = 0;
	new_footprint_region->cfr_num_pages = 0;
	/* caller needs to initialize new region */

	return new_footprint_region;
}
22461
22462 /*
22463 * vm_map_corpse_footprint_collect:
22464 * collect footprint information for "old_entry" in "old_map" and
22465 * stores it in "new_map"'s vmmap_footprint_info.
22466 */
22467 kern_return_t
vm_map_corpse_footprint_collect(vm_map_t old_map,vm_map_entry_t old_entry,vm_map_t new_map)22468 vm_map_corpse_footprint_collect(
22469 vm_map_t old_map,
22470 vm_map_entry_t old_entry,
22471 vm_map_t new_map)
22472 {
22473 vm_map_offset_t va;
22474 kern_return_t kr;
22475 struct vm_map_corpse_footprint_header *footprint_header;
22476 struct vm_map_corpse_footprint_region *footprint_region;
22477 struct vm_map_corpse_footprint_region *new_footprint_region;
22478 cf_disp_t *next_disp_p;
22479 uintptr_t footprint_edge;
22480 uint32_t num_pages_tmp;
22481 int effective_page_size;
22482
22483 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
22484
22485 va = old_entry->vme_start;
22486
22487 vm_map_lock_assert_exclusive(old_map);
22488 vm_map_lock_assert_exclusive(new_map);
22489
22490 assert(new_map->has_corpse_footprint);
22491 assert(!old_map->has_corpse_footprint);
22492 if (!new_map->has_corpse_footprint ||
22493 old_map->has_corpse_footprint) {
22494 /*
22495 * This can only transfer footprint info from a
22496 * map with a live pmap to a map with a corpse footprint.
22497 */
22498 return KERN_NOT_SUPPORTED;
22499 }
22500
22501 if (new_map->vmmap_corpse_footprint == NULL) {
22502 vm_offset_t buf;
22503 vm_size_t buf_size;
22504
22505 buf = 0;
22506 buf_size = (sizeof(*footprint_header) +
22507 (old_map->hdr.nentries
22508 *
22509 (sizeof(*footprint_region) +
22510 +3)) /* potential alignment for each region */
22511 +
22512 ((old_map->size / effective_page_size)
22513 *
22514 sizeof(cf_disp_t))); /* disposition for each page */
22515 // printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
22516 buf_size = round_page(buf_size);
22517
22518 /* limit buffer to 1 page to validate overflow detection */
22519 // buf_size = PAGE_SIZE;
22520
22521 /* limit size to a somewhat sane amount */
22522 #if XNU_TARGET_OS_OSX
22523 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (8*1024*1024) /* 8MB */
22524 #else /* XNU_TARGET_OS_OSX */
22525 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (256*1024) /* 256KB */
22526 #endif /* XNU_TARGET_OS_OSX */
22527 if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
22528 buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
22529 }
22530
22531 /*
22532 * Allocate the pageable buffer (with a trailing guard page).
22533 * It will be zero-filled on demand.
22534 */
22535 kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
22536 KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
22537 VM_KERN_MEMORY_DIAG);
22538 if (kr != KERN_SUCCESS) {
22539 vm_map_corpse_footprint_no_buf++;
22540 return kr;
22541 }
22542
22543 /* initialize header and 1st region */
22544 footprint_header = (struct vm_map_corpse_footprint_header *)buf;
22545 new_map->vmmap_corpse_footprint = footprint_header;
22546
22547 footprint_header->cf_size = buf_size;
22548 footprint_header->cf_last_region =
22549 sizeof(*footprint_header);
22550 footprint_header->cf_last_zeroes = 0;
22551
22552 footprint_region = (struct vm_map_corpse_footprint_region *)
22553 ((char *)footprint_header +
22554 footprint_header->cf_last_region);
22555 footprint_region->cfr_vaddr = 0;
22556 footprint_region->cfr_num_pages = 0;
22557 } else {
22558 /* retrieve header and last region */
22559 footprint_header = (struct vm_map_corpse_footprint_header *)
22560 new_map->vmmap_corpse_footprint;
22561 footprint_region = (struct vm_map_corpse_footprint_region *)
22562 ((char *)footprint_header +
22563 footprint_header->cf_last_region);
22564 }
22565 footprint_edge = ((uintptr_t)footprint_header +
22566 footprint_header->cf_size);
22567
22568 if ((footprint_region->cfr_vaddr +
22569 (((vm_map_offset_t)footprint_region->cfr_num_pages) *
22570 effective_page_size))
22571 != old_entry->vme_start) {
22572 uint64_t num_pages_delta, num_pages_delta_size;
22573 uint32_t region_offset_delta_size;
22574
22575 /*
22576 * Not the next contiguous virtual address:
22577 * start a new region or store "zero" dispositions for
22578 * the missing pages?
22579 */
22580 /* size of gap in actual page dispositions */
22581 num_pages_delta = ((old_entry->vme_start -
22582 footprint_region->cfr_vaddr) / effective_page_size)
22583 - footprint_region->cfr_num_pages;
22584 num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
22585 /* size of gap as a new footprint region header */
22586 region_offset_delta_size =
22587 (sizeof(*footprint_region) +
22588 roundup(((footprint_region->cfr_num_pages -
22589 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
22590 sizeof(int)) -
22591 ((footprint_region->cfr_num_pages -
22592 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
22593 // printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
22594 if (region_offset_delta_size < num_pages_delta_size ||
22595 os_add3_overflow(footprint_region->cfr_num_pages,
22596 (uint32_t) num_pages_delta,
22597 1,
22598 &num_pages_tmp)) {
22599 /*
22600 * Storing data for this gap would take more space
22601 * than inserting a new footprint region header:
22602 * let's start a new region and save space. If it's a
22603 * tie, let's avoid using a new region, since that
22604 * would require more region hops to find the right
22605 * range during lookups.
22606 *
22607 * If the current region's cfr_num_pages would overflow
22608 * if we added "zero" page dispositions for the gap,
22609 * no choice but to start a new region.
22610 */
22611 // printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
22612 new_footprint_region =
22613 vm_map_corpse_footprint_new_region(footprint_header);
22614 /* check that we're not going over the edge */
22615 if (new_footprint_region == NULL) {
22616 goto over_the_edge;
22617 }
22618 footprint_region = new_footprint_region;
22619 /* initialize new region as empty */
22620 footprint_region->cfr_vaddr = old_entry->vme_start;
22621 footprint_region->cfr_num_pages = 0;
22622 } else {
22623 /*
22624 * Store "zero" page dispositions for the missing
22625 * pages.
22626 */
22627 // printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
22628 for (; num_pages_delta > 0; num_pages_delta--) {
22629 next_disp_p = (cf_disp_t *)
22630 ((uintptr_t) footprint_region +
22631 sizeof(*footprint_region));
22632 next_disp_p += footprint_region->cfr_num_pages;
22633 /* check that we're not going over the edge */
22634 if ((uintptr_t)next_disp_p >= footprint_edge) {
22635 goto over_the_edge;
22636 }
22637 /* store "zero" disposition for this gap page */
22638 footprint_region->cfr_num_pages++;
22639 *next_disp_p = (cf_disp_t) 0;
22640 footprint_header->cf_last_zeroes++;
22641 }
22642 }
22643 }
22644
22645 for (va = old_entry->vme_start;
22646 va < old_entry->vme_end;
22647 va += effective_page_size) {
22648 int disposition;
22649 cf_disp_t cf_disp;
22650
22651 vm_map_footprint_query_page_info(old_map,
22652 old_entry,
22653 va,
22654 &disposition);
22655 cf_disp = vm_page_disposition_to_cf_disp(disposition);
22656
22657 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
22658
22659 if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
22660 /*
22661 * Ignore "zero" dispositions at start of
22662 * region: just move start of region.
22663 */
22664 footprint_region->cfr_vaddr += effective_page_size;
22665 continue;
22666 }
22667
22668 /* would region's cfr_num_pages overflow? */
22669 if (os_add_overflow(footprint_region->cfr_num_pages, 1,
22670 &num_pages_tmp)) {
22671 /* overflow: create a new region */
22672 new_footprint_region =
22673 vm_map_corpse_footprint_new_region(
22674 footprint_header);
22675 if (new_footprint_region == NULL) {
22676 goto over_the_edge;
22677 }
22678 footprint_region = new_footprint_region;
22679 footprint_region->cfr_vaddr = va;
22680 footprint_region->cfr_num_pages = 0;
22681 }
22682
22683 next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
22684 sizeof(*footprint_region));
22685 next_disp_p += footprint_region->cfr_num_pages;
22686 /* check that we're not going over the edge */
22687 if ((uintptr_t)next_disp_p >= footprint_edge) {
22688 goto over_the_edge;
22689 }
22690 /* store this dispostion */
22691 *next_disp_p = cf_disp;
22692 footprint_region->cfr_num_pages++;
22693
22694 if (cf_disp != 0) {
22695 /* non-zero disp: break the current zero streak */
22696 footprint_header->cf_last_zeroes = 0;
22697 /* done */
22698 continue;
22699 }
22700
22701 /* zero disp: add to the current streak of zeroes */
22702 footprint_header->cf_last_zeroes++;
22703 if ((footprint_header->cf_last_zeroes +
22704 roundup(((footprint_region->cfr_num_pages -
22705 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
22706 (sizeof(int) - 1),
22707 sizeof(int))) <
22708 (sizeof(*footprint_header))) {
22709 /*
22710 * There are not enough trailing "zero" dispositions
22711 * (+ the extra padding we would need for the previous
22712 * region); creating a new region would not save space
22713 * at this point, so let's keep this "zero" disposition
22714 * in this region and reconsider later.
22715 */
22716 continue;
22717 }
22718 /*
22719 * Create a new region to avoid having too many consecutive
22720 * "zero" dispositions.
22721 */
22722 new_footprint_region =
22723 vm_map_corpse_footprint_new_region(footprint_header);
22724 if (new_footprint_region == NULL) {
22725 goto over_the_edge;
22726 }
22727 footprint_region = new_footprint_region;
22728 /* initialize the new region as empty ... */
22729 footprint_region->cfr_num_pages = 0;
22730 /* ... and skip this "zero" disp */
22731 footprint_region->cfr_vaddr = va + effective_page_size;
22732 }
22733
22734 return KERN_SUCCESS;
22735
22736 over_the_edge:
22737 // printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
22738 vm_map_corpse_footprint_full++;
22739 return KERN_RESOURCE_SHORTAGE;
22740 }
22741
22742 /*
22743 * vm_map_corpse_footprint_collect_done:
22744 * completes the footprint collection by getting rid of any remaining
22745 * trailing "zero" dispositions and trimming the unused part of the
22746 * kernel buffer
22747 */
void
vm_map_corpse_footprint_collect_done(
	vm_map_t new_map)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	vm_size_t buf_size, actual_size;
	kern_return_t kr;

	assert(new_map->has_corpse_footprint);
	if (!new_map->has_corpse_footprint ||
	    new_map->vmmap_corpse_footprint == NULL) {
		/* no footprint buffer was collected for this map: nothing to trim */
		return;
	}

	footprint_header = (struct vm_map_corpse_footprint_header *)
	    new_map->vmmap_corpse_footprint;
	buf_size = footprint_header->cf_size;

	/* locate the last (i.e. current) region in the buffer */
	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);

	/* get rid of trailing zeroes in last region */
	assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	/*
	 * Number of bytes actually used in the buffer:
	 * header + all regions up to the last one + the last region's
	 * header and page dispositions.
	 */
	actual_size = (vm_size_t)(footprint_header->cf_last_region +
	    sizeof(*footprint_region) +
	    (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));

	// printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
	/* update footprint-size statistics (global counters, updated racily) */
	vm_map_corpse_footprint_size_avg =
	    (((vm_map_corpse_footprint_size_avg *
	    vm_map_corpse_footprint_count) +
	    actual_size) /
	    (vm_map_corpse_footprint_count + 1));
	vm_map_corpse_footprint_count++;
	if (actual_size > vm_map_corpse_footprint_size_max) {
		vm_map_corpse_footprint_size_max = actual_size;
	}

	actual_size = round_page(actual_size);
	if (buf_size > actual_size) {
		/*
		 * Trim the unused tail of the buffer, keeping the page right
		 * after "actual_size" to serve as the new trailing guard page
		 * (it is made inaccessible below instead of being freed).
		 */
		kr = vm_deallocate(kernel_map,
		    ((vm_address_t)footprint_header +
		    actual_size +
		    PAGE_SIZE), /* trailing guard page */
		    (buf_size - actual_size));
		assertf(kr == KERN_SUCCESS,
		    "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
		/* make the new trailing page a guard page (no access) */
		kr = vm_protect(kernel_map,
		    ((vm_address_t)footprint_header +
		    actual_size),
		    PAGE_SIZE,
		    FALSE, /* set_maximum */
		    VM_PROT_NONE);
		assertf(kr == KERN_SUCCESS,
		    "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
	}

	/* record the trimmed size (excludes the trailing guard page) */
	footprint_header->cf_size = actual_size;
}
22820
22821 /*
22822 * vm_map_corpse_footprint_query_page_info:
22823 * retrieves the disposition of the page at virtual address "vaddr"
22824 * in the forked corpse's VM map
22825 *
22826 * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
22827 */
kern_return_t
vm_map_corpse_footprint_query_page_info(
	vm_map_t map,
	vm_map_offset_t va,
	int *disposition_p)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	uint32_t footprint_region_offset;
	vm_map_offset_t region_start, region_end;
	int disp_idx;
	kern_return_t kr;
	int effective_page_size;
	cf_disp_t cf_disp;

	if (!map->has_corpse_footprint) {
		/* not a corpse map with a collected footprint */
		*disposition_p = 0;
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	footprint_header = map->vmmap_corpse_footprint;
	if (footprint_header == NULL) {
		/* footprint buffer was never allocated (or already destroyed) */
		*disposition_p = 0;
		// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	/* start looking at the hint ("cf_hint_region") */
	footprint_region_offset = footprint_header->cf_hint_region;

	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

lookup_again:
	if (footprint_region_offset < sizeof(*footprint_header)) {
		/* hint too low: start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	if (footprint_region_offset >= footprint_header->cf_last_region) {
		/* hint too high: re-start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header + footprint_region_offset);
	region_start = footprint_region->cfr_vaddr;
	region_end = (region_start +
	    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
	    effective_page_size));
	if (va < region_start &&
	    footprint_region_offset != sizeof(*footprint_header)) {
		/* our range starts before the hint region */

		/* reset the hint (in a racy way...) */
		footprint_header->cf_hint_region = sizeof(*footprint_header);
		/* lookup "va" again from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
		goto lookup_again;
	}

	/* walk forward through the regions until one covers "va" (or none left) */
	while (va >= region_end) {
		if (footprint_region_offset >= footprint_header->cf_last_region) {
			/* reached the last region without covering "va" */
			break;
		}
		/* skip the region's header */
		footprint_region_offset += sizeof(*footprint_region);
		/* skip the region's page dispositions */
		footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
		/* align to next word boundary */
		footprint_region_offset =
		    roundup(footprint_region_offset,
		    sizeof(int));
		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header + footprint_region_offset);
		region_start = footprint_region->cfr_vaddr;
		region_end = (region_start +
		    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
		    effective_page_size));
	}
	if (va < region_start || va >= region_end) {
		/* page not found: report a "zero" (empty) disposition */
		*disposition_p = 0;
		// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_SUCCESS;
		goto done;
	}

	/* "va" found: set the lookup hint for next lookup (in a racy way...) */
	footprint_header->cf_hint_region = footprint_region_offset;

	/* get page disposition for "va" in this region */
	disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
	cf_disp = footprint_region->cfr_disposition[disp_idx];
	/* expand the compact disposition back into the VM_PAGE_QUERY_* format */
	*disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
	kr = KERN_SUCCESS;
done:
	// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
	/* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
	DTRACE_VM4(footprint_query_page_info,
	    vm_map_t, map,
	    vm_map_offset_t, va,
	    int, *disposition_p,
	    kern_return_t, kr);

	return kr;
}
22934
22935 void
vm_map_corpse_footprint_destroy(vm_map_t map)22936 vm_map_corpse_footprint_destroy(
22937 vm_map_t map)
22938 {
22939 if (map->has_corpse_footprint &&
22940 map->vmmap_corpse_footprint != 0) {
22941 struct vm_map_corpse_footprint_header *footprint_header;
22942 vm_size_t buf_size;
22943 kern_return_t kr;
22944
22945 footprint_header = map->vmmap_corpse_footprint;
22946 buf_size = footprint_header->cf_size;
22947 kr = vm_deallocate(kernel_map,
22948 (vm_offset_t) map->vmmap_corpse_footprint,
22949 ((vm_size_t) buf_size
22950 + PAGE_SIZE)); /* trailing guard page */
22951 assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
22952 map->vmmap_corpse_footprint = 0;
22953 map->has_corpse_footprint = FALSE;
22954 }
22955 }
22956
22957 /*
22958 * vm_map_copy_footprint_ledgers:
22959 * copies any ledger that's relevant to the memory footprint of "old_task"
22960 * into the forked corpse's task ("new_task")
22961 */
22962 void
vm_map_copy_footprint_ledgers(task_t old_task,task_t new_task)22963 vm_map_copy_footprint_ledgers(
22964 task_t old_task,
22965 task_t new_task)
22966 {
22967 vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
22968 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
22969 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
22970 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
22971 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
22972 vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
22973 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
22974 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
22975 vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
22976 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
22977 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
22978 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
22979 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
22980 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
22981 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
22982 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
22983 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
22984 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
22985 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
22986 vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
22987 }
22988
22989 /*
22990 * vm_map_copy_ledger:
22991 * copy a single ledger from "old_task" to "new_task"
22992 */
22993 void
vm_map_copy_ledger(task_t old_task,task_t new_task,int ledger_entry)22994 vm_map_copy_ledger(
22995 task_t old_task,
22996 task_t new_task,
22997 int ledger_entry)
22998 {
22999 ledger_amount_t old_balance, new_balance, delta;
23000
23001 assert(new_task->map->has_corpse_footprint);
23002 if (!new_task->map->has_corpse_footprint) {
23003 return;
23004 }
23005
23006 /* turn off sanity checks for the ledger we're about to mess with */
23007 ledger_disable_panic_on_negative(new_task->ledger,
23008 ledger_entry);
23009
23010 /* adjust "new_task" to match "old_task" */
23011 ledger_get_balance(old_task->ledger,
23012 ledger_entry,
23013 &old_balance);
23014 ledger_get_balance(new_task->ledger,
23015 ledger_entry,
23016 &new_balance);
23017 if (new_balance == old_balance) {
23018 /* new == old: done */
23019 } else if (new_balance > old_balance) {
23020 /* new > old ==> new -= new - old */
23021 delta = new_balance - old_balance;
23022 ledger_debit(new_task->ledger,
23023 ledger_entry,
23024 delta);
23025 } else {
23026 /* new < old ==> new += old - new */
23027 delta = old_balance - new_balance;
23028 ledger_credit(new_task->ledger,
23029 ledger_entry,
23030 delta);
23031 }
23032 }
23033
23034 /*
23035 * vm_map_get_pmap:
23036 * returns the pmap associated with the vm_map
23037 */
pmap_t
vm_map_get_pmap(vm_map_t map)
{
	/* thin accessor around the vm_map_pmap() macro */
	return vm_map_pmap(map);
}
23043
23044 #if CONFIG_MAP_RANGES
/* bitmap of VM memory tags whose allocations belong in the "heap" range */
static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];

/* the user-visible range IDs must line up with the mach_vm_range IDs */
static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
23049
23050 /*
23051 * vm_map_range_map_init:
23052 * initializes the VM range ID map to enable index lookup
23053 * of user VM ranges based on VM tag from userspace.
23054 */
23055 static void
vm_map_range_map_init(void)23056 vm_map_range_map_init(void)
23057 {
23058 /*
23059 * VM_MEMORY_MALLOC{,_NANO} are skipped on purpose:
23060 * - the former is malloc metadata which should be kept separate
23061 * - the latter has its own ranges
23062 */
23063 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
23064 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
23065 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
23066 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
23067 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
23068 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
23069 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
23070 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_TCMALLOC);
23071 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LIBNETWORK);
23072 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOACCELERATOR);
23073 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOSURFACE);
23074 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IMAGEIO);
23075 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREGRAPHICS);
23076 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_CORESERVICES);
23077 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREDATA);
23078 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LAYERKIT);
23079 }
23080
23081 static struct mach_vm_range
vm_map_range_random_uniform(vm_map_size_t req_size,vm_map_offset_t min_addr,vm_map_offset_t max_addr,vm_map_offset_t offmask)23082 vm_map_range_random_uniform(
23083 vm_map_size_t req_size,
23084 vm_map_offset_t min_addr,
23085 vm_map_offset_t max_addr,
23086 vm_map_offset_t offmask)
23087 {
23088 vm_map_offset_t random_addr;
23089 struct mach_vm_range alloc;
23090
23091 req_size = (req_size + offmask) & ~offmask;
23092 min_addr = (min_addr + offmask) & ~offmask;
23093 max_addr = max_addr & ~offmask;
23094
23095 read_random(&random_addr, sizeof(random_addr));
23096 random_addr %= (max_addr - req_size - min_addr);
23097 random_addr &= ~offmask;
23098
23099 alloc.min_address = min_addr + random_addr;
23100 alloc.max_address = min_addr + random_addr + req_size;
23101 return alloc;
23102 }
23103
23104 static vm_map_offset_t
vm_map_range_offmask(void)23105 vm_map_range_offmask(void)
23106 {
23107 uint32_t pte_depth;
23108
23109 /*
23110 * PTE optimizations
23111 *
23112 *
23113 * 16k pages systems
23114 * ~~~~~~~~~~~~~~~~~
23115 *
23116 * A single L1 (sub-)page covers the address space.
23117 * - L2 pages cover 64G,
23118 * - L3 pages cover 32M.
23119 *
23120 * On embedded, the dynamic VA range is 64G and uses a single L2 page.
23121 * As a result, we really only need to align the ranges to 32M to avoid
23122 * partial L3 pages.
23123 *
23124 * On macOS, the usage of L2 pages will increase, so as a result we will
23125 * want to align ranges to 64G in order to utilize them fully.
23126 *
23127 *
23128 * 4k pages systems
23129 * ~~~~~~~~~~~~~~~~
23130 *
23131 * A single L0 (sub-)page covers the address space.
23132 * - L1 pages cover 512G,
23133 * - L2 pages cover 1G,
23134 * - L3 pages cover 2M.
23135 *
23136 * The long tail of processes on a system will tend to have a VA usage
23137 * (ignoring the shared regions) in the 100s of MB order of magnitnude.
23138 * This is achievable with a single L1 and a few L2s without
23139 * randomization.
23140 *
23141 * However once randomization is introduced, the system will immediately
23142 * need several L1s and many more L2s. As a result:
23143 *
23144 * - on embedded devices, the cost of these extra pages isn't
23145 * sustainable, and we just disable the feature entirely,
23146 *
23147 * - on macOS we align ranges to a 512G boundary so that the extra L1
23148 * pages can be used to their full potential.
23149 */
23150
23151 /*
23152 * note, this function assumes _non exotic mappings_
23153 * which is why it uses the native kernel's PAGE_SHIFT.
23154 */
23155 #if XNU_PLATFORM_MacOSX
23156 pte_depth = PAGE_SHIFT > 12 ? 2 : 3;
23157 #else /* !XNU_PLATFORM_MacOSX */
23158 pte_depth = PAGE_SHIFT > 12 ? 1 : 0;
23159 #endif /* !XNU_PLATFORM_MacOSX */
23160
23161 if (pte_depth == 0) {
23162 return 0;
23163 }
23164
23165 return (1ull << ((PAGE_SHIFT - 3) * pte_depth + PAGE_SHIFT)) - 1;
23166 }
23167
23168 /*
23169 * vm_map_range_configure:
23170 * configures the user vm_map ranges by increasing the maximum VA range of
23171 * the map and carving out a range at the end of VA space (searching backwards
23172 * in the newly expanded map).
23173 */
23174 kern_return_t
vm_map_range_configure(vm_map_t map)23175 vm_map_range_configure(vm_map_t map)
23176 {
23177 const vm_map_offset_t offmask = vm_map_range_offmask();
23178 struct mach_vm_range data_range;
23179 vm_map_offset_t default_end;
23180 kern_return_t kr;
23181
23182 if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) {
23183 /*
23184 * No point doing vm ranges in a 32bit address space.
23185 */
23186 return KERN_NOT_SUPPORTED;
23187 }
23188
23189 /* Should not be applying ranges to kernel map or kernel map submaps */
23190 assert(vm_map_pmap(map) != kernel_pmap);
23191
23192 #if XNU_PLATFORM_MacOSX
23193
23194 /*
23195 * on macOS, the address space is a massive 47 bits (128T),
23196 * with several carve outs that processes can't use:
23197 * - the shared region
23198 * - the commpage region
23199 * - the GPU carve out (if applicable)
23200 *
23201 * and when nano-malloc is in use it desires memory at the 96T mark.
23202 *
23203 * However, their location is architecture dependent:
23204 * - On intel, the shared region and commpage are
23205 * at the very end of the usable address space (above +127T),
23206 * and there is no GPU carve out, and pthread wants to place
23207 * threads at the 112T mark (0x70T).
23208 *
23209 * - On arm64, these are in the same spot as on embedded devices:
23210 * o shared region: [ 6G, 10G) [ will likely grow over time ]
23211 * o commpage region: [63G, 64G)
23212 * o GPU carve out: [64G, 448G)
23213 *
23214 * This is conveninent because the mappings at the end of the address
23215 * space (when they exist) are made by the kernel.
23216 *
23217 * The policy is to allocate a random 1T for the data heap
23218 * in the end of the address-space in the:
23219 * - [0x71, 0x7f) range on Intel (to leave space for pthread stacks)
23220 * - [0x61, 0x7f) range on ASM (to leave space for Nano malloc).
23221 */
23222
23223 /* see NANOZONE_SIGNATURE in libmalloc */
23224 #if __x86_64__
23225 default_end = 0x71ull << 40;
23226 #else
23227 default_end = 0x61ull << 40;
23228 #endif
23229 data_range = vm_map_range_random_uniform(1ull << 40,
23230 default_end, 0x7full << 40, offmask);
23231
23232 #else /* !XNU_PLATFORM_MacOSX */
23233
23234 /*
23235 * Embedded devices:
23236 *
23237 * The default VA Size scales with the device physical memory.
23238 *
23239 * Out of that:
23240 * - the "zero" page typically uses 4G + some slide
23241 * - the shared region uses SHARED_REGION_SIZE bytes (4G)
23242 *
23243 * Without the use of jumbo or any adjustment to the address space,
23244 * a default VM map typically looks like this:
23245 *
23246 * 0G -->╒════════════╕
23247 * │ pagezero │
23248 * │ + slide │
23249 * ~4G -->╞════════════╡<-- vm_map_min(map)
23250 * │ │
23251 * 6G -->├────────────┤
23252 * │ shared │
23253 * │ region │
23254 * 10G -->├────────────┤
23255 * │ │
23256 * max_va -->├────────────┤<-- vm_map_max(map)
23257 * │ │
23258 * ╎ jumbo ╎
23259 * ╎ ╎
23260 * │ │
23261 * 63G -->╞════════════╡<-- MACH_VM_MAX_ADDRESS
23262 * │ commpage │
23263 * 64G -->├────────────┤<-- MACH_VM_MIN_GPU_CARVEOUT_ADDRESS
23264 * │ │
23265 * ╎ GPU ╎
23266 * ╎ carveout ╎
23267 * │ │
23268 * 448G -->├────────────┤<-- MACH_VM_MAX_GPU_CARVEOUT_ADDRESS
23269 * │ │
23270 * ╎ ╎
23271 * ╎ ╎
23272 * │ │
23273 * 512G -->╘════════════╛<-- (1ull << ARM_16K_TT_L1_SHIFT)
23274 *
23275 * When this drawing was made, "max_va" was smaller than
23276 * ARM64_MAX_OFFSET_DEVICE_LARGE (~15.5G), leaving shy of
23277 * 12G of address space for the zero-page, slide, files,
23278 * binaries, heap ...
23279 *
23280 * We will want to make a "heap/data" carve out inside
23281 * the jumbo range of half of that usable space, assuming
23282 * that this is less than a forth of the jumbo range.
23283 *
23284 * The assert below intends to catch when max_va grows
23285 * too large for this heuristic.
23286 */
23287
23288 vm_map_lock_read(map);
23289 default_end = vm_map_max(map);
23290 vm_map_unlock_read(map);
23291
23292 /*
23293 * Check that we're not already jumbo'd,
23294 * or our address space was somehow modified.
23295 *
23296 * If so we cannot guarantee that we can set up the ranges
23297 * safely without interfering with the existing map.
23298 */
23299 if (default_end > vm_compute_max_offset(true)) {
23300 return KERN_NO_SPACE;
23301 }
23302
23303 if (pmap_max_offset(true, ARM_PMAP_MAX_OFFSET_DEFAULT)) {
23304 /*
23305 * an override boot-arg was set, disable user-ranges
23306 *
23307 * XXX: this is problematic because it means these boot-args
23308 * no longer test the behavior changing the value
23309 * of ARM64_MAX_OFFSET_DEVICE_* would have.
23310 */
23311 return KERN_NOT_SUPPORTED;
23312 }
23313
23314 /* expand the default VM space to the largest possible address */
23315 vm_map_set_jumbo(map);
23316
23317 assert3u(4 * GiB(10), <=, vm_map_max(map) - default_end);
23318 data_range = vm_map_range_random_uniform(GiB(10),
23319 default_end + PAGE_SIZE, vm_map_max(map), offmask);
23320
23321 #endif /* !XNU_PLATFORM_MacOSX */
23322
23323 /*
23324 * Poke holes so that ASAN or people listing regions
23325 * do not think this space is free.
23326 */
23327
23328 if (default_end != data_range.min_address) {
23329 kr = vm_map_enter(map, &default_end,
23330 data_range.min_address - default_end,
23331 0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
23332 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
23333 assert(kr == KERN_SUCCESS);
23334 }
23335
23336 if (data_range.max_address != vm_map_max(map)) {
23337 vm_map_entry_t entry;
23338 vm_size_t size;
23339
23340 vm_map_lock_read(map);
23341 vm_map_lookup_entry_or_next(map, data_range.max_address, &entry);
23342 if (entry != vm_map_to_entry(map)) {
23343 size = vm_map_max(map) - data_range.max_address;
23344 } else {
23345 size = entry->vme_start - data_range.max_address;
23346 }
23347 vm_map_unlock_read(map);
23348
23349 kr = vm_map_enter(map, &data_range.max_address, size,
23350 0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
23351 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
23352 assert(kr == KERN_SUCCESS);
23353 }
23354
23355 vm_map_lock(map);
23356 map->default_range.min_address = vm_map_min(map);
23357 map->default_range.max_address = default_end;
23358 map->data_range = data_range;
23359 map->uses_user_ranges = true;
23360 vm_map_unlock(map);
23361
23362 return KERN_SUCCESS;
23363 }
23364
23365 /*
23366 * vm_map_range_fork:
23367 * clones the array of ranges from old_map to new_map in support
23368 * of a VM map fork.
23369 */
23370 void
vm_map_range_fork(vm_map_t new_map,vm_map_t old_map)23371 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
23372 {
23373 if (!old_map->uses_user_ranges) {
23374 /* nothing to do */
23375 return;
23376 }
23377
23378 new_map->default_range = old_map->default_range;
23379 new_map->data_range = old_map->data_range;
23380
23381 if (old_map->extra_ranges_count) {
23382 vm_map_user_range_t otable, ntable;
23383 uint16_t count;
23384
23385 otable = old_map->extra_ranges;
23386 count = old_map->extra_ranges_count;
23387 ntable = kalloc_data(count * sizeof(struct vm_map_user_range),
23388 Z_WAITOK | Z_ZERO | Z_NOFAIL);
23389 memcpy(ntable, otable,
23390 count * sizeof(struct vm_map_user_range));
23391
23392 new_map->extra_ranges_count = count;
23393 new_map->extra_ranges = ntable;
23394 }
23395
23396 new_map->uses_user_ranges = true;
23397 }
23398
23399 /*
23400 * vm_map_get_user_range:
23401 * copy the VM user range for the given VM map and range ID.
23402 */
23403 kern_return_t
vm_map_get_user_range(vm_map_t map,vm_map_range_id_t range_id,mach_vm_range_t range)23404 vm_map_get_user_range(
23405 vm_map_t map,
23406 vm_map_range_id_t range_id,
23407 mach_vm_range_t range)
23408 {
23409 if (map == NULL || !map->uses_user_ranges || range == NULL) {
23410 return KERN_INVALID_ARGUMENT;
23411 }
23412
23413 switch (range_id) {
23414 case UMEM_RANGE_ID_DEFAULT:
23415 *range = map->default_range;
23416 return KERN_SUCCESS;
23417
23418 case UMEM_RANGE_ID_HEAP:
23419 *range = map->data_range;
23420 return KERN_SUCCESS;
23421
23422 default:
23423 return KERN_INVALID_ARGUMENT;
23424 }
23425 }
23426
/*
 * vm_map_user_range_resolve:
 *	returns the range ID of the user range fully containing
 *	[addr, addr + size), optionally copying that range out through
 *	"range".  Checks the default range first, then the data range,
 *	then the extra ranges; falls back to UMEM_RANGE_ID_DEFAULT
 *	(with a zeroed-out "range") when nothing contains the span.
 *	Caller must hold the map lock.
 */
static vm_map_range_id_t
vm_map_user_range_resolve(
	vm_map_t map,
	mach_vm_address_t addr,
	mach_vm_size_t size,
	mach_vm_range_t range)
{
	struct mach_vm_range tmp;

	vm_map_lock_assert_held(map);

	static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
	static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);

	if (mach_vm_range_contains(&map->default_range, addr, size)) {
		if (range) {
			*range = map->default_range;
		}
		return UMEM_RANGE_ID_DEFAULT;
	}

	if (mach_vm_range_contains(&map->data_range, addr, size)) {
		if (range) {
			*range = map->data_range;
		}
		return UMEM_RANGE_ID_HEAP;
	}

	/* linear scan of the (small) extra-ranges table */
	for (size_t i = 0; i < map->extra_ranges_count; i++) {
		vm_map_user_range_t r = &map->extra_ranges[i];

		tmp.min_address = r->vmur_min_address;
		tmp.max_address = r->vmur_max_address;

		if (mach_vm_range_contains(&tmp, addr, size)) {
			if (range) {
				*range = tmp;
			}
			return r->vmur_range_id;
		}
	}

	/* not found: report an empty range and the default range ID */
	if (range) {
		range->min_address = range->max_address = 0;
	}
	return UMEM_RANGE_ID_DEFAULT;
}
23474
23475 static int
vm_map_user_range_cmp(const void * e1,const void * e2)23476 vm_map_user_range_cmp(const void *e1, const void *e2)
23477 {
23478 const struct vm_map_user_range *r1 = e1;
23479 const struct vm_map_user_range *r2 = e2;
23480
23481 if (r1->vmur_min_address != r2->vmur_min_address) {
23482 return r1->vmur_min_address < r2->vmur_min_address ? -1 : 1;
23483 }
23484
23485 return 0;
23486 }
23487
23488 static int
mach_vm_range_recipe_v1_cmp(const void * e1,const void * e2)23489 mach_vm_range_recipe_v1_cmp(const void *e1, const void *e2)
23490 {
23491 const mach_vm_range_recipe_v1_t *r1 = e1;
23492 const mach_vm_range_recipe_v1_t *r2 = e2;
23493
23494 if (r1->range.min_address != r2->range.min_address) {
23495 return r1->range.min_address < r2->range.min_address ? -1 : 1;
23496 }
23497
23498 return 0;
23499 }
23500
23501 /*!
23502 * @function mach_vm_range_create_v1()
23503 *
23504 * @brief
23505 * Handle the backend for mach_vm_range_create() for the
23506 * MACH_VM_RANGE_FLAVOR_V1 flavor.
23507 *
23508 * @description
23509 * This call allows to create "ranges" in the map of a task
23510 * that have special semantics/policies around placement of
23511 * new allocations (in the vm_map_locate_space() sense).
23512 *
23513 * @returns
23514 * - KERN_SUCCESS on success
23515 * - KERN_INVALID_ARGUMENT for incorrect arguments
23516 * - KERN_NO_SPACE if the maximum amount of ranges would be exceeded
23517 * - KERN_MEMORY_PRESENT if any of the requested ranges
23518 * overlaps with existing ranges or allocations in the map.
23519 */
static kern_return_t
mach_vm_range_create_v1(
	vm_map_t map,
	mach_vm_range_recipe_v1_t *recipe,
	uint32_t new_count)
{
	const vm_offset_t mask = VM_MAP_PAGE_MASK(map);
	vm_map_user_range_t table;
	kern_return_t kr = KERN_SUCCESS;
	uint16_t count;

	/*
	 * The only two "voids" where extra user ranges may live:
	 * between the default and data ranges, and between the data
	 * range and the end of the map.
	 */
	struct mach_vm_range void1 = {
		.min_address = map->default_range.max_address,
		.max_address = map->data_range.min_address,
	};
	struct mach_vm_range void2 = {
		.min_address = map->data_range.max_address,
		.max_address = vm_map_max(map),
	};

	/* sort by min_address so overlap checks only need to look at neighbors */
	qsort(recipe, new_count, sizeof(mach_vm_range_recipe_v1_t),
	    mach_vm_range_recipe_v1_cmp);

	/*
	 * Step 1: Validate the recipes: no flags, a supported tag,
	 * page-aligned bounds, contained in one of the voids,
	 * and no intersections among themselves.
	 */

	for (size_t i = 0; i < new_count; i++) {
		mach_vm_range_t r = &recipe[i].range;
		mach_vm_size_t s = mach_vm_range_size(r);

		if (recipe[i].flags) {
			return KERN_INVALID_ARGUMENT;
		}

		static_assert(UMEM_RANGE_ID_FIXED == MACH_VM_RANGE_FIXED);
		switch (recipe[i].range_tag) {
		case MACH_VM_RANGE_FIXED:
			break;
		default:
			return KERN_INVALID_ARGUMENT;
		}

		if (!VM_MAP_PAGE_ALIGNED(r->min_address, mask) ||
		    !VM_MAP_PAGE_ALIGNED(r->max_address, mask)) {
			return KERN_INVALID_ARGUMENT;
		}

		if (!mach_vm_range_contains(&void1, r->min_address, s) &&
		    !mach_vm_range_contains(&void2, r->min_address, s)) {
			return KERN_INVALID_ARGUMENT;
		}

		/* recipes are sorted: any overlap shows up between neighbors */
		if (i > 0 && recipe[i - 1].range.max_address >
		    recipe[i].range.min_address) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	vm_map_lock(map);

	table = map->extra_ranges;
	count = map->extra_ranges_count;

	if (count + new_count > VM_MAP_EXTRA_RANGES_MAX) {
		kr = KERN_NO_SPACE;
		goto out_unlock;
	}

	/*
	 * Step 2: Check that there is no intersection with existing ranges.
	 * Both arrays are sorted by min_address, so a linear merge suffices.
	 */

	for (size_t i = 0, j = 0; i < new_count && j < count;) {
		mach_vm_range_t r1 = &recipe[i].range;
		vm_map_user_range_t r2 = &table[j];

		if (r1->max_address <= r2->vmur_min_address) {
			i++;
		} else if (r2->vmur_max_address <= r1->min_address) {
			j++;
		} else {
			kr = KERN_MEMORY_PRESENT;
			goto out_unlock;
		}
	}

	/*
	 * Step 3: commit the new ranges
	 * (grow the table, append the recipes, and re-sort it).
	 */

	static_assert(VM_MAP_EXTRA_RANGES_MAX * sizeof(struct vm_map_user_range) <=
	    KALLOC_SAFE_ALLOC_SIZE);

	table = krealloc_data(table,
	    count * sizeof(struct vm_map_user_range),
	    (count + new_count) * sizeof(struct vm_map_user_range),
	    Z_ZERO | Z_WAITOK | Z_NOFAIL);

	for (size_t i = 0; i < new_count; i++) {
		/*
		 * Addresses must fit in 56 bits — presumably the vmur_*
		 * fields are packed narrower than 64 bits; confirm against
		 * the struct vm_map_user_range definition.
		 */
		static_assert(MACH_VM_MAX_ADDRESS < (1ull << 56));

		table[count + i] = (struct vm_map_user_range){
			.vmur_min_address = recipe[i].range.min_address,
			.vmur_max_address = recipe[i].range.max_address,
			.vmur_range_id = (vm_map_range_id_t)recipe[i].range_tag,
		};
	}

	qsort(table, count + new_count,
	    sizeof(struct vm_map_user_range), vm_map_user_range_cmp);

	map->extra_ranges_count += new_count;
	map->extra_ranges = table;

out_unlock:
	vm_map_unlock(map);

	if (kr == KERN_SUCCESS) {
		/*
		 * On success, populate each committed range with a fixed,
		 * immutable-overwrite VM_PROT_NONE reservation (max
		 * protection VM_PROT_ALL), tagged with the caller's vm_tag.
		 */
		for (size_t i = 0; i < new_count; i++) {
			vm_map_kernel_flags_t vmk_flags = {
				.vmf_fixed = true,
				.vmf_overwrite = true,
				.vmkf_overwrite_immutable = true,
				.vm_tag = recipe[i].vm_tag,
			};
			__assert_only kern_return_t kr2;

			kr2 = vm_map_enter(map, &recipe[i].range.min_address,
			    mach_vm_range_size(&recipe[i].range),
			    0, vmk_flags, VM_OBJECT_NULL, 0, FALSE,
			    VM_PROT_NONE, VM_PROT_ALL,
			    VM_INHERIT_DEFAULT);
			assert(kr2 == KERN_SUCCESS);
		}
	}
	return kr;
}
23658
23659 kern_return_t
mach_vm_range_create(vm_map_t map,mach_vm_range_flavor_t flavor,mach_vm_range_recipes_raw_t recipe,natural_t size)23660 mach_vm_range_create(
23661 vm_map_t map,
23662 mach_vm_range_flavor_t flavor,
23663 mach_vm_range_recipes_raw_t recipe,
23664 natural_t size)
23665 {
23666 if (map != current_map()) {
23667 return KERN_INVALID_ARGUMENT;
23668 }
23669
23670 if (!map->uses_user_ranges) {
23671 return KERN_NOT_SUPPORTED;
23672 }
23673
23674 if (size == 0) {
23675 return KERN_SUCCESS;
23676 }
23677
23678 if (flavor == MACH_VM_RANGE_FLAVOR_V1) {
23679 mach_vm_range_recipe_v1_t *array;
23680
23681 if (size % sizeof(mach_vm_range_recipe_v1_t)) {
23682 return KERN_INVALID_ARGUMENT;
23683 }
23684
23685 size /= sizeof(mach_vm_range_recipe_v1_t);
23686 if (size > VM_MAP_EXTRA_RANGES_MAX) {
23687 return KERN_NO_SPACE;
23688 }
23689
23690 array = (mach_vm_range_recipe_v1_t *)recipe;
23691 return mach_vm_range_create_v1(map, array, size);
23692 }
23693
23694 return KERN_INVALID_ARGUMENT;
23695 }
23696
23697 #else /* !CONFIG_MAP_RANGES */
23698
/*
 * Stub for kernels built without CONFIG_MAP_RANGES:
 * user VM ranges are compiled out, so the call always fails.
 */
kern_return_t
mach_vm_range_create(
	vm_map_t map,
	mach_vm_range_flavor_t flavor,
	mach_vm_range_recipes_raw_t recipe,
	natural_t size)
{
#pragma unused(map, flavor, recipe, size)
	return KERN_NOT_SUPPORTED;
}
23709
23710 #endif /* !CONFIG_MAP_RANGES */
23711
/*
 * vm_map_kernel_flags_update_range_id:
 *	pick a default range ID for an allocation when the caller
 *	did not specify one explicitly.
 */
void
vm_map_kernel_flags_update_range_id(vm_map_kernel_flags_t *vmkf, vm_map_t map)
{
	if (map == kernel_map) {
		/* kernel_map allocations with no range fall into the data range */
		if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
			vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
		}
#if CONFIG_MAP_RANGES
	} else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
	    vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT &&
	    bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
		/*
		 * user maps: tags present in vm_map_user_range_heap_map
		 * are redirected from the default range to the heap range
		 */
		vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
#endif /* CONFIG_MAP_RANGES */
	}
}
23727
23728 /*
23729 * vm_map_entry_has_device_pager:
23730 * Check if the vm map entry specified by the virtual address has a device pager.
23731 * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
23732 */
boolean_t
vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
{
	vm_map_entry_t entry;
	vm_object_t object;
	boolean_t result;

	if (map == NULL) {
		return FALSE;
	}

	vm_map_lock(map);
	while (TRUE) {
		if (!vm_map_lookup_entry(map, vaddr, &entry)) {
			/* no entry covers vaddr in this (sub)map */
			result = FALSE;
			break;
		}
		if (entry->is_sub_map) {
			// Check the submap
			vm_map_t submap = VME_SUBMAP(entry);
			assert(submap != NULL);
			/*
			 * lock the submap BEFORE dropping the parent's
			 * lock so the entry cannot go away underneath us
			 */
			vm_map_lock(submap);
			vm_map_unlock(map);
			map = submap;
			continue;
		}
		object = VME_OBJECT(entry);
		/* a device pager is identified by its memory-object pager ops */
		if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
			result = TRUE;
			break;
		}
		result = FALSE;
		break;
	}

	/* `map` is whichever (sub)map we ended the walk holding */
	vm_map_unlock(map);
	return result;
}
23771
23772
23773 #if MACH_ASSERT
23774
extern int pmap_ledgers_panic;
extern int pmap_ledgers_panic_leeway;

/*
 * LEDGER_DRIFT: declares, for one ledger entry, counters tracking how
 * often and by how much its balance was observed above ("over") or
 * below ("under") zero at pmap-check time, plus the worst case seen.
 */
#define LEDGER_DRIFT(__LEDGER) \
	int __LEDGER##_over; \
	ledger_amount_t __LEDGER##_over_total; \
	ledger_amount_t __LEDGER##_over_max; \
	int __LEDGER##_under; \
	ledger_amount_t __LEDGER##_under_total; \
	ledger_amount_t __LEDGER##_under_max

/*
 * Global accumulator of ledger drift across every pmap checked by
 * vm_map_pmap_check_ledgers() (MACH_ASSERT builds only).
 */
struct {
	uint64_t num_pmaps_checked;

	LEDGER_DRIFT(phys_footprint);
	LEDGER_DRIFT(internal);
	LEDGER_DRIFT(internal_compressed);
	LEDGER_DRIFT(external);
	LEDGER_DRIFT(reusable);
	LEDGER_DRIFT(iokit_mapped);
	LEDGER_DRIFT(alternate_accounting);
	LEDGER_DRIFT(alternate_accounting_compressed);
	LEDGER_DRIFT(page_table);
	LEDGER_DRIFT(purgeable_volatile);
	LEDGER_DRIFT(purgeable_nonvolatile);
	LEDGER_DRIFT(purgeable_volatile_compressed);
	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
	LEDGER_DRIFT(tagged_nofootprint);
	LEDGER_DRIFT(tagged_footprint);
	LEDGER_DRIFT(tagged_nofootprint_compressed);
	LEDGER_DRIFT(tagged_footprint_compressed);
	LEDGER_DRIFT(network_volatile);
	LEDGER_DRIFT(network_nonvolatile);
	LEDGER_DRIFT(network_volatile_compressed);
	LEDGER_DRIFT(network_nonvolatile_compressed);
	LEDGER_DRIFT(media_nofootprint);
	LEDGER_DRIFT(media_footprint);
	LEDGER_DRIFT(media_nofootprint_compressed);
	LEDGER_DRIFT(media_footprint_compressed);
	LEDGER_DRIFT(graphics_nofootprint);
	LEDGER_DRIFT(graphics_footprint);
	LEDGER_DRIFT(graphics_nofootprint_compressed);
	LEDGER_DRIFT(graphics_footprint_compressed);
	LEDGER_DRIFT(neural_nofootprint);
	LEDGER_DRIFT(neural_footprint);
	LEDGER_DRIFT(neural_nofootprint_compressed);
	LEDGER_DRIFT(neural_footprint_compressed);
} pmap_ledgers_drift;
23823
/*
 * vm_map_pmap_check_ledgers:
 *	audit every task ledger for a non-zero balance, record the drift
 *	in pmap_ledgers_drift, log each imbalance, and (optionally)
 *	panic depending on the pmap_ledgers_panic* tunables.
 */
void
vm_map_pmap_check_ledgers(
	pmap_t pmap,
	ledger_t ledger,
	int pid,
	char *procname)
{
	ledger_amount_t bal;
	boolean_t do_panic;

	do_panic = FALSE;

	pmap_ledgers_drift.num_pmaps_checked++;

/*
 * LEDGER_CHECK_BALANCE: for one ledger entry, fetch the balance and
 * its panic-on-negative flag; on any non-zero balance, log it, decide
 * whether it warrants a panic (honoring pmap_ledgers_panic_leeway),
 * and accumulate over/under drift statistics.
 */
#define LEDGER_CHECK_BALANCE(__LEDGER) \
	MACRO_BEGIN \
	int panic_on_negative = TRUE; \
	ledger_get_balance(ledger, \
	    task_ledgers.__LEDGER, \
	    &bal); \
	ledger_get_panic_on_negative(ledger, \
	    task_ledgers.__LEDGER, \
	    &panic_on_negative); \
	if (bal != 0) { \
	        if (panic_on_negative || \
	            (pmap_ledgers_panic && \
	            pmap_ledgers_panic_leeway > 0 && \
	            (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) || \
	            bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
	                do_panic = TRUE; \
	        } \
	        printf("LEDGER BALANCE proc %d (%s) " \
	            "\"%s\" = %lld\n", \
	            pid, procname, #__LEDGER, bal); \
	        if (bal > 0) { \
	                pmap_ledgers_drift.__LEDGER##_over++; \
	                pmap_ledgers_drift.__LEDGER##_over_total += bal; \
	                if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
	                        pmap_ledgers_drift.__LEDGER##_over_max = bal; \
	                } \
	        } else if (bal < 0) { \
	                pmap_ledgers_drift.__LEDGER##_under++; \
	                pmap_ledgers_drift.__LEDGER##_under_total += bal; \
	                if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
	                        pmap_ledgers_drift.__LEDGER##_under_max = bal; \
	                } \
	        } \
	} \
	MACRO_END

	LEDGER_CHECK_BALANCE(phys_footprint);
	LEDGER_CHECK_BALANCE(internal);
	LEDGER_CHECK_BALANCE(internal_compressed);
	LEDGER_CHECK_BALANCE(external);
	LEDGER_CHECK_BALANCE(reusable);
	LEDGER_CHECK_BALANCE(iokit_mapped);
	LEDGER_CHECK_BALANCE(alternate_accounting);
	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
	LEDGER_CHECK_BALANCE(page_table);
	LEDGER_CHECK_BALANCE(purgeable_volatile);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(tagged_nofootprint);
	LEDGER_CHECK_BALANCE(tagged_footprint);
	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
	LEDGER_CHECK_BALANCE(network_volatile);
	LEDGER_CHECK_BALANCE(network_nonvolatile);
	LEDGER_CHECK_BALANCE(network_volatile_compressed);
	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(media_nofootprint);
	LEDGER_CHECK_BALANCE(media_footprint);
	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(media_footprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_nofootprint);
	LEDGER_CHECK_BALANCE(graphics_footprint);
	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
	LEDGER_CHECK_BALANCE(neural_nofootprint);
	LEDGER_CHECK_BALANCE(neural_footprint);
	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(neural_footprint_compressed);

	if (do_panic) {
		/* pmap_ledgers_panic selects panic vs. log-only behavior */
		if (pmap_ledgers_panic) {
			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
			    pmap, pid, procname);
		} else {
			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
			    pmap, pid, procname);
		}
	}
}
23918
/*
 * vm_map_pmap_set_process:
 *	thin wrapper forwarding pid/procname to pmap_set_process()
 *	on the map's pmap (MACH_ASSERT builds only).
 */
void
vm_map_pmap_set_process(
	vm_map_t map,
	int pid,
	char *procname)
{
	pmap_set_process(vm_map_pmap(map), pid, procname);
}
23927
23928 #endif /* MACH_ASSERT */
23929