1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_map.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * Virtual memory mapping module.
64 */
65
66 #include <mach/vm_types.h>
67 #include <mach_assert.h>
68
69 #include <vm/vm_options.h>
70
71 #include <libkern/OSAtomic.h>
72
73 #include <mach/kern_return.h>
74 #include <mach/port.h>
75 #include <mach/vm_attributes.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_behavior.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/memory_object.h>
80 #include <mach/mach_vm.h>
81 #include <machine/cpu_capabilities.h>
82 #include <mach/sdt.h>
83
84 #include <kern/assert.h>
85 #include <kern/backtrace.h>
86 #include <kern/counter.h>
87 #include <kern/exc_guard.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90
91 #include <vm/cpm.h>
92 #include <vm/vm_compressor.h>
93 #include <vm/vm_compressor_pager.h>
94 #include <vm/vm_init.h>
95 #include <vm/vm_fault.h>
96 #include <vm/vm_map_internal.h>
97 #include <vm/vm_object.h>
98 #include <vm/vm_page.h>
99 #include <vm/vm_pageout.h>
100 #include <vm/pmap.h>
101 #include <vm/vm_kern.h>
102 #include <ipc/ipc_port.h>
103 #include <kern/sched_prim.h>
104 #include <kern/misc_protos.h>
105
106 #include <mach/vm_map_server.h>
107 #include <mach/mach_host_server.h>
108 #include <vm/vm_memtag.h>
109 #include <vm/vm_protos.h>
110 #include <vm/vm_purgeable_internal.h>
111 #include <vm/vm_reclaim_internal.h>
112
113 #include <vm/vm_protos.h>
114 #include <vm/vm_shared_region.h>
115 #include <vm/vm_map_store.h>
116
117 #include <san/kasan.h>
118
119 #include <sys/resource.h>
120 #include <sys/random.h>
121 #include <sys/codesign.h>
122 #include <sys/code_signing.h>
123 #include <sys/mman.h>
124 #include <sys/reboot.h>
125 #include <sys/kdebug_triage.h>
126
127 #include <libkern/section_keywords.h>
128
129 #if DEVELOPMENT || DEBUG
130 extern int proc_selfcsflags(void);
131 int vm_log_xnu_user_debug = 0;
132 int panic_on_unsigned_execute = 0;
133 int panic_on_mlock_failure = 0;
134 #endif /* DEVELOPMENT || DEBUG */
135
136 #if MACH_ASSERT
137 int debug4k_filter = 0;
138 char debug4k_proc_name[1024] = "";
139 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
140 int debug4k_panic_on_misaligned_sharing = 0;
141 const char *debug4k_category_name[] = {
142 "error", /* 0 */
143 "life", /* 1 */
144 "load", /* 2 */
145 "fault", /* 3 */
146 "copy", /* 4 */
147 "share", /* 5 */
148 "adjust", /* 6 */
149 "pmap", /* 7 */
150 "mementry", /* 8 */
151 "iokit", /* 9 */
152 "upl", /* 10 */
153 "exc", /* 11 */
154 "vfs" /* 12 */
155 };
156 #endif /* MACH_ASSERT */
157 int debug4k_no_cow_copyin = 0;
158
159
160 #if __arm64__
161 extern const int fourk_binary_compatibility_unsafe;
162 extern const int fourk_binary_compatibility_allow_wx;
163 #endif /* __arm64__ */
164 extern void qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *));
165 extern int proc_selfpid(void);
166 extern char *proc_name_address(void *p);
167 extern char *proc_best_name(struct proc *p);
168
169 #if VM_MAP_DEBUG_APPLE_PROTECT
170 int vm_map_debug_apple_protect = 0;
171 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
172 #if VM_MAP_DEBUG_FOURK
173 int vm_map_debug_fourk = 0;
174 #endif /* VM_MAP_DEBUG_FOURK */
175
176 #if DEBUG || DEVELOPMENT
177 static TUNABLE(bool, vm_map_executable_immutable,
178 "vm_map_executable_immutable", true);
179 #else
180 #define vm_map_executable_immutable true
181 #endif
182
183 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
184
185 extern u_int32_t random(void); /* from <libkern/libkern.h> */
186 /* Internal prototypes
187 */
188
/*
 * A "zap" list accumulates map entries that have been unlinked from a
 * map (e.g. by vm_map_delete()) so they can be disposed of later,
 * presumably outside the map lock -- TODO confirm against the
 * vm_map_zap consumers.  An empty list has vmz_tail pointing at
 * vmz_head (classic singly-linked tail-pointer representation).
 */
typedef struct vm_map_zap {
	vm_map_entry_t vmz_head;
	vm_map_entry_t *vmz_tail;
} *vm_map_zap_t;

/* Declare and initialize an empty zap list on the stack. */
#define VM_MAP_ZAP_DECLARE(zap) \
	struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
196
197 static vm_map_entry_t vm_map_entry_insert(
198 vm_map_t map,
199 vm_map_entry_t insp_entry,
200 vm_map_offset_t start,
201 vm_map_offset_t end,
202 vm_object_t object,
203 vm_object_offset_t offset,
204 vm_map_kernel_flags_t vmk_flags,
205 boolean_t needs_copy,
206 vm_prot_t cur_protection,
207 vm_prot_t max_protection,
208 vm_inherit_t inheritance,
209 boolean_t clear_map_aligned);
210
211 static void vm_map_simplify_range(
212 vm_map_t map,
213 vm_map_offset_t start,
214 vm_map_offset_t end); /* forward */
215
216 static boolean_t vm_map_range_check(
217 vm_map_t map,
218 vm_map_offset_t start,
219 vm_map_offset_t end,
220 vm_map_entry_t *entry);
221
222 static void vm_map_submap_pmap_clean(
223 vm_map_t map,
224 vm_map_offset_t start,
225 vm_map_offset_t end,
226 vm_map_t sub_map,
227 vm_map_offset_t offset);
228
229 static void vm_map_pmap_enter(
230 vm_map_t map,
231 vm_map_offset_t addr,
232 vm_map_offset_t end_addr,
233 vm_object_t object,
234 vm_object_offset_t offset,
235 vm_prot_t protection);
236
237 static void _vm_map_clip_end(
238 struct vm_map_header *map_header,
239 vm_map_entry_t entry,
240 vm_map_offset_t end);
241
242 static void _vm_map_clip_start(
243 struct vm_map_header *map_header,
244 vm_map_entry_t entry,
245 vm_map_offset_t start);
246
247 static kmem_return_t vm_map_delete(
248 vm_map_t map,
249 vm_map_offset_t start,
250 vm_map_offset_t end,
251 vmr_flags_t flags,
252 kmem_guard_t guard,
253 vm_map_zap_t zap);
254
255 static void vm_map_copy_insert(
256 vm_map_t map,
257 vm_map_entry_t after_where,
258 vm_map_copy_t copy);
259
260 static kern_return_t vm_map_copy_overwrite_unaligned(
261 vm_map_t dst_map,
262 vm_map_entry_t entry,
263 vm_map_copy_t copy,
264 vm_map_address_t start,
265 boolean_t discard_on_success);
266
267 static kern_return_t vm_map_copy_overwrite_aligned(
268 vm_map_t dst_map,
269 vm_map_entry_t tmp_entry,
270 vm_map_copy_t copy,
271 vm_map_offset_t start,
272 pmap_t pmap);
273
274 static kern_return_t vm_map_copyin_kernel_buffer(
275 vm_map_t src_map,
276 vm_map_address_t src_addr,
277 vm_map_size_t len,
278 boolean_t src_destroy,
279 vm_map_copy_t *copy_result); /* OUT */
280
281 static kern_return_t vm_map_copyout_kernel_buffer(
282 vm_map_t map,
283 vm_map_address_t *addr, /* IN/OUT */
284 vm_map_copy_t copy,
285 vm_map_size_t copy_size,
286 boolean_t overwrite,
287 boolean_t consume_on_success);
288
289 static void vm_map_fork_share(
290 vm_map_t old_map,
291 vm_map_entry_t old_entry,
292 vm_map_t new_map);
293
294 static boolean_t vm_map_fork_copy(
295 vm_map_t old_map,
296 vm_map_entry_t *old_entry_p,
297 vm_map_t new_map,
298 int vm_map_copyin_flags);
299
300 static kern_return_t vm_map_wire_nested(
301 vm_map_t map,
302 vm_map_offset_t start,
303 vm_map_offset_t end,
304 vm_prot_t caller_prot,
305 vm_tag_t tag,
306 boolean_t user_wire,
307 pmap_t map_pmap,
308 vm_map_offset_t pmap_addr,
309 ppnum_t *physpage_p);
310
311 static kern_return_t vm_map_unwire_nested(
312 vm_map_t map,
313 vm_map_offset_t start,
314 vm_map_offset_t end,
315 boolean_t user_wire,
316 pmap_t map_pmap,
317 vm_map_offset_t pmap_addr);
318
319 static kern_return_t vm_map_overwrite_submap_recurse(
320 vm_map_t dst_map,
321 vm_map_offset_t dst_addr,
322 vm_map_size_t dst_size);
323
324 static kern_return_t vm_map_copy_overwrite_nested(
325 vm_map_t dst_map,
326 vm_map_offset_t dst_addr,
327 vm_map_copy_t copy,
328 boolean_t interruptible,
329 pmap_t pmap,
330 boolean_t discard_on_success);
331
332 static kern_return_t vm_map_remap_extract(
333 vm_map_t map,
334 vm_map_offset_t addr,
335 vm_map_size_t size,
336 boolean_t copy,
337 vm_map_copy_t map_copy,
338 vm_prot_t *cur_protection,
339 vm_prot_t *max_protection,
340 vm_inherit_t inheritance,
341 vm_map_kernel_flags_t vmk_flags);
342
343 static kern_return_t vm_map_remap_range_allocate(
344 vm_map_t map,
345 vm_map_address_t *address,
346 vm_map_size_t size,
347 vm_map_offset_t mask,
348 vm_map_kernel_flags_t vmk_flags,
349 vm_map_entry_t *map_entry,
350 vm_map_zap_t zap_list);
351
352 static void vm_map_region_look_for_page(
353 vm_map_t map,
354 vm_map_offset_t va,
355 vm_object_t object,
356 vm_object_offset_t offset,
357 int max_refcnt,
358 unsigned short depth,
359 vm_region_extended_info_t extended,
360 mach_msg_type_number_t count);
361
362 static int vm_map_region_count_obj_refs(
363 vm_map_entry_t entry,
364 vm_object_t object);
365
366
367 static kern_return_t vm_map_willneed(
368 vm_map_t map,
369 vm_map_offset_t start,
370 vm_map_offset_t end);
371
372 static kern_return_t vm_map_reuse_pages(
373 vm_map_t map,
374 vm_map_offset_t start,
375 vm_map_offset_t end);
376
377 static kern_return_t vm_map_reusable_pages(
378 vm_map_t map,
379 vm_map_offset_t start,
380 vm_map_offset_t end);
381
382 static kern_return_t vm_map_can_reuse(
383 vm_map_t map,
384 vm_map_offset_t start,
385 vm_map_offset_t end);
386
387 static kern_return_t vm_map_random_address_for_size(
388 vm_map_t map,
389 vm_map_offset_t *address,
390 vm_map_size_t size,
391 vm_map_kernel_flags_t vmk_flags);
392
393
394 #if CONFIG_MAP_RANGES
395
396 static vm_map_range_id_t vm_map_user_range_resolve(
397 vm_map_t map,
398 mach_vm_address_t addr,
399 mach_vm_address_t size,
400 mach_vm_range_t range);
401
402 #endif /* CONFIG_MAP_RANGES */
403 #if MACH_ASSERT
404 static kern_return_t vm_map_pageout(
405 vm_map_t map,
406 vm_map_offset_t start,
407 vm_map_offset_t end);
408 #endif /* MACH_ASSERT */
409
410 kern_return_t vm_map_corpse_footprint_collect(
411 vm_map_t old_map,
412 vm_map_entry_t old_entry,
413 vm_map_t new_map);
414 void vm_map_corpse_footprint_collect_done(
415 vm_map_t new_map);
416 void vm_map_corpse_footprint_destroy(
417 vm_map_t map);
418 kern_return_t vm_map_corpse_footprint_query_page_info(
419 vm_map_t map,
420 vm_map_offset_t va,
421 int *disposition_p);
422 void vm_map_footprint_query_page_info(
423 vm_map_t map,
424 vm_map_entry_t map_entry,
425 vm_map_offset_t curr_s_offset,
426 int *disposition_p);
427
428 #if CONFIG_MAP_RANGES
429 static void vm_map_range_map_init(void);
430 #endif /* CONFIG_MAP_RANGES */
431
432 pid_t find_largest_process_vm_map_entries(void);
433
434 extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code,
435 mach_exception_data_type_t subcode);
436
437 /*
438 * Macros to copy a vm_map_entry. We must be careful to correctly
439 * manage the wired page count. vm_map_entry_copy() creates a new
440 * map entry to the same memory - the wired count in the new entry
441 * must be set to zero. vm_map_entry_copy_full() creates a new
442 * entry that is identical to the old entry. This preserves the
443 * wire count; it's used for map splitting and zone changing in
444 * vm_map_copyout.
445 */
446
447 static inline void
vm_map_entry_copy_csm_assoc(vm_map_t map __unused,vm_map_entry_t new __unused,vm_map_entry_t old __unused)448 vm_map_entry_copy_csm_assoc(
449 vm_map_t map __unused,
450 vm_map_entry_t new __unused,
451 vm_map_entry_t old __unused)
452 {
453 #if CODE_SIGNING_MONITOR
454 /* when code signing monitor is enabled, we want to reset on copy */
455 new->csm_associated = FALSE;
456 #else
457 /* when code signing monitor is not enabled, assert as a sanity check */
458 assert(new->csm_associated == FALSE);
459 #endif
460 #if DEVELOPMENT || DEBUG
461 if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
462 printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] resetting vme_xnu_user_debug\n",
463 proc_selfpid(),
464 (get_bsdtask_info(current_task())
465 ? proc_name_address(get_bsdtask_info(current_task()))
466 : "?"),
467 __FUNCTION__, __LINE__,
468 map, new, new->vme_start, new->vme_end);
469 }
470 #endif /* DEVELOPMENT || DEBUG */
471 new->vme_xnu_user_debug = FALSE;
472 }
473
474 /*
475 * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
476 * But for security reasons on some platforms, we don't want the
477 * new mapping to be "used for jit", so we reset the flag here.
478 */
479 static inline void
vm_map_entry_copy_code_signing(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old __unused)480 vm_map_entry_copy_code_signing(
481 vm_map_t map,
482 vm_map_entry_t new,
483 vm_map_entry_t old __unused)
484 {
485 if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
486 assert(new->used_for_jit == old->used_for_jit);
487 } else {
488 new->used_for_jit = FALSE;
489 }
490 }
491
/*
 * vm_map_entry_copy_full:
 *	Make NEW an identical copy of OLD, *preserving* the wired count
 *	(contrast with vm_map_entry_copy(), which resets wiring).
 *	Before the structure assignment clobbers NEW's fields, the
 *	debug/backtrace references are rebalanced: NEW's existing refs
 *	are dropped and OLD's are retained, so the bitwise copy below
 *	leaves every btref with a correct reference count.
 */
static inline void
vm_map_entry_copy_full(
	vm_map_entry_t new,
	vm_map_entry_t old)
{
#if MAP_ENTRY_CREATION_DEBUG
	/* drop NEW's creation backtrace; account for the copy of OLD's */
	btref_put(new->vme_creation_bt);
	btref_retain(old->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	/* same dance for the insertion backtrace */
	btref_put(new->vme_insertion_bt);
	btref_retain(old->vme_insertion_bt);
#endif
#if VM_BTLOG_TAGS
	/* Discard the btref that might be in the new entry */
	if (new->vme_kernel_object) {
		btref_put(new->vme_tag_btref);
	}
	/* Retain the btref in the old entry to account for its copy */
	if (old->vme_kernel_object) {
		btref_retain(old->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */
	/* bitwise copy of every field, including wired counts */
	*new = *old;
}
517
518 static inline void
vm_map_entry_copy(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old)519 vm_map_entry_copy(
520 vm_map_t map,
521 vm_map_entry_t new,
522 vm_map_entry_t old)
523 {
524 vm_map_entry_copy_full(new, old);
525
526 new->is_shared = FALSE;
527 new->needs_wakeup = FALSE;
528 new->in_transition = FALSE;
529 new->wired_count = 0;
530 new->user_wired_count = 0;
531 new->vme_permanent = FALSE;
532 vm_map_entry_copy_code_signing(map, new, old);
533 vm_map_entry_copy_csm_assoc(map, new, old);
534 if (new->iokit_acct) {
535 assertf(!new->use_pmap, "old %p new %p\n", old, new);
536 new->iokit_acct = FALSE;
537 new->use_pmap = TRUE;
538 }
539 new->vme_resilient_codesign = FALSE;
540 new->vme_resilient_media = FALSE;
541 new->vme_atomic = FALSE;
542 new->vme_no_copy_on_read = FALSE;
543 }
544
545 /*
546 * Normal lock_read_to_write() returns FALSE/0 on failure.
547 * These functions evaluate to zero on success and non-zero value on failure.
548 */
549 __attribute__((always_inline))
550 int
vm_map_lock_read_to_write(vm_map_t map)551 vm_map_lock_read_to_write(vm_map_t map)
552 {
553 if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
554 DTRACE_VM(vm_map_lock_upgrade);
555 return 0;
556 }
557 return 1;
558 }
559
560 __attribute__((always_inline))
561 boolean_t
vm_map_try_lock(vm_map_t map)562 vm_map_try_lock(vm_map_t map)
563 {
564 if (lck_rw_try_lock_exclusive(&(map)->lock)) {
565 DTRACE_VM(vm_map_lock_w);
566 return TRUE;
567 }
568 return FALSE;
569 }
570
571 __attribute__((always_inline))
572 boolean_t
vm_map_try_lock_read(vm_map_t map)573 vm_map_try_lock_read(vm_map_t map)
574 {
575 if (lck_rw_try_lock_shared(&(map)->lock)) {
576 DTRACE_VM(vm_map_lock_r);
577 return TRUE;
578 }
579 return FALSE;
580 }
581
582 /*!
583 * @function kdp_vm_map_is_acquired_exclusive
584 *
585 * @abstract
586 * Checks if vm map is acquired exclusive.
587 *
588 * @discussion
589 * NOT SAFE: To be used only by kernel debugger.
590 *
591 * @param map map to check
592 *
593 * @returns TRUE if the map is acquired exclusively.
594 */
boolean_t
kdp_vm_map_is_acquired_exclusive(vm_map_t map)
{
	/* debugger context only: peeks at lock state without synchronization */
	return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
}
600
601 /*
602 * Routines to get the page size the caller should
603 * use while inspecting the target address space.
604 * Use the "_safely" variant if the caller is dealing with a user-provided
605 * array whose size depends on the page size, to avoid any overflow or
606 * underflow of a user-allocated buffer.
607 */
608 int
vm_self_region_page_shift_safely(vm_map_t target_map)609 vm_self_region_page_shift_safely(
610 vm_map_t target_map)
611 {
612 int effective_page_shift = 0;
613
614 if (PAGE_SIZE == (4096)) {
615 /* x86_64 and 4k watches: always use 4k */
616 return PAGE_SHIFT;
617 }
618 /* did caller provide an explicit page size for this thread to use? */
619 effective_page_shift = thread_self_region_page_shift();
620 if (effective_page_shift) {
621 /* use the explicitly-provided page size */
622 return effective_page_shift;
623 }
624 /* no explicit page size: use the caller's page size... */
625 effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
626 if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
627 /* page size match: safe to use */
628 return effective_page_shift;
629 }
630 /* page size mismatch */
631 return -1;
632 }
633 int
vm_self_region_page_shift(vm_map_t target_map)634 vm_self_region_page_shift(
635 vm_map_t target_map)
636 {
637 int effective_page_shift;
638
639 effective_page_shift = vm_self_region_page_shift_safely(target_map);
640 if (effective_page_shift == -1) {
641 /* no safe value but OK to guess for caller */
642 effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
643 VM_MAP_PAGE_SHIFT(target_map));
644 }
645 return effective_page_shift;
646 }
647
648
649 /*
650 * Decide if we want to allow processes to execute from their data or stack areas.
651 * override_nx() returns true if we do. Data/stack execution can be enabled independently
652 * for 32 and 64 bit processes. Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
653 * or allow_stack_exec to enable data execution for that type of data area for that particular
654 * ABI (or both by or'ing the flags together). These are initialized in the architecture
655 * specific pmap files since the default behavior varies according to architecture. The
656 * main reason it varies is because of the need to provide binary compatibility with old
657 * applications that were written before these restrictions came into being. In the old
658 * days, an app could execute anything it could read, but this has slowly been tightened
659 * up over time. The default behavior is:
660 *
661 * 32-bit PPC apps may execute from both stack and data areas
 * 32-bit Intel apps may execute from data areas but not stack
663 * 64-bit PPC/Intel apps may not execute from either data or stack
664 *
665 * An application on any architecture may override these defaults by explicitly
666 * adding PROT_EXEC permission to the page in question with the mprotect(2)
667 * system call. This code here just determines what happens when an app tries to
668 * execute from a page that lacks execute permission.
669 *
670 * Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
671 * default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
672 * a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
673 * execution from data areas for a particular binary even if the arch normally permits it. As
674 * a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
675 * to support some complicated use cases, notably browsers with out-of-process plugins that
676 * are not all NX-safe.
677 */
678
679 extern int allow_data_exec, allow_stack_exec;
680
681 int
override_nx(vm_map_t map,uint32_t user_tag)682 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
683 {
684 int current_abi;
685
686 if (map->pmap == kernel_pmap) {
687 return FALSE;
688 }
689
690 /*
691 * Determine if the app is running in 32 or 64 bit mode.
692 */
693
694 if (vm_map_is_64bit(map)) {
695 current_abi = VM_ABI_64;
696 } else {
697 current_abi = VM_ABI_32;
698 }
699
700 /*
701 * Determine if we should allow the execution based on whether it's a
702 * stack or data area and the current architecture.
703 */
704
705 if (user_tag == VM_MEMORY_STACK) {
706 return allow_stack_exec & current_abi;
707 }
708
709 return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
710 }
711
712
713 /*
714 * Virtual memory maps provide for the mapping, protection,
715 * and sharing of virtual memory objects. In addition,
716 * this module provides for an efficient virtual copy of
717 * memory from one map to another.
718 *
719 * Synchronization is required prior to most operations.
720 *
721 * Maps consist of an ordered doubly-linked list of simple
722 * entries; a single hint is used to speed up lookups.
723 *
724 * Sharing maps have been deleted from this version of Mach.
725 * All shared objects are now mapped directly into the respective
726 * maps. This requires a change in the copy on write strategy;
727 * the asymmetric (delayed) strategy is used for shared temporary
728 * objects instead of the symmetric (shadow) strategy. All maps
729 * are now "top level" maps (either task map, kernel map or submap
730 * of the kernel map).
731 *
 * Since portions of maps are specified by start/end addresses,
733 * which may not align with existing map entries, all
734 * routines merely "clip" entries to these start/end values.
735 * [That is, an entry is split into two, bordering at a
736 * start or end value.] Note that these clippings may not
737 * always be necessary (as the two resulting entries are then
738 * not changed); however, the clipping is done for convenience.
739 * No attempt is currently made to "glue back together" two
740 * abutting entries.
741 *
742 * The symmetric (shadow) copy strategy implements virtual copy
743 * by copying VM object references from one map to
744 * another, and then marking both regions as copy-on-write.
745 * It is important to note that only one writeable reference
746 * to a VM object region exists in any map when this strategy
747 * is used -- this means that shadow object creation can be
748 * delayed until a write operation occurs. The symmetric (delayed)
749 * strategy allows multiple maps to have writeable references to
750 * the same region of a vm object, and hence cannot delay creating
751 * its copy objects. See vm_object_copy_quickly() in vm_object.c.
752 * Copying of permanent objects is completely different; see
753 * vm_object_copy_strategically() in vm_object.c.
754 */
755
/* zone id used to validate vm_map_copy provenance (see vm_map_copy_require) */
ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);

/* names and creation flags for the map, map-entry and map-hole zones */
#define VM_MAP_ZONE_NAME "maps"
#define VM_MAP_ZFLAGS (ZC_NOENCRYPT | ZC_VM)

#define VM_MAP_ENTRY_ZONE_NAME "VM map entries"
#define VM_MAP_ENTRY_ZFLAGS (ZC_NOENCRYPT | ZC_VM)

#define VM_MAP_HOLES_ZONE_NAME "VM map holes"
#define VM_MAP_HOLES_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
766
767 /*
768 * Asserts that a vm_map_copy object is coming from the
769 * vm_map_copy_zone to ensure that it isn't a fake constructed
770 * anywhere else.
771 */
772 void
vm_map_copy_require(struct vm_map_copy * copy)773 vm_map_copy_require(struct vm_map_copy *copy)
774 {
775 zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
776 }
777
778 /*
779 * vm_map_require:
780 *
781 * Ensures that the argument is memory allocated from the genuine
782 * vm map zone. (See zone_id_require_allow_foreign).
783 */
784 void
vm_map_require(vm_map_t map)785 vm_map_require(vm_map_t map)
786 {
787 zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
788 }
789
/*
 * Startup-time (__startup_data) backing storage for the earliest map,
 * map-entry and map-hole allocations -- presumably consumed before the
 * VM zones are fully initialized; TODO confirm against the startup code
 * that fills these in.
 */
#define VM_MAP_EARLY_COUNT_MAX 16
static __startup_data vm_offset_t map_data;
static __startup_data vm_size_t map_data_size;
static __startup_data vm_offset_t kentry_data;
static __startup_data vm_size_t kentry_data_size;
static __startup_data vm_offset_t map_holes_data;
static __startup_data vm_size_t map_holes_data_size;
/* owners of maps created during early boot (at most VM_MAP_EARLY_COUNT_MAX) */
static __startup_data vm_map_t *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
static __startup_data uint32_t early_map_count;

/* below this size, mappings are not coalesced (macOS only) */
#if XNU_TARGET_OS_OSX
#define NO_COALESCE_LIMIT ((1024 * 128) - 1)
#else /* XNU_TARGET_OS_OSX */
#define NO_COALESCE_LIMIT 0
#endif /* XNU_TARGET_OS_OSX */

/* Skip acquiring locks if we're in the midst of a kernel core dump */
unsigned int not_in_kdp = 1;

/* statistics: number of successful vm_map_set_cache_attr() calls */
unsigned int vm_map_set_cache_attr_count = 0;
810
811 kern_return_t
vm_map_set_cache_attr(vm_map_t map,vm_map_offset_t va)812 vm_map_set_cache_attr(
813 vm_map_t map,
814 vm_map_offset_t va)
815 {
816 vm_map_entry_t map_entry;
817 vm_object_t object;
818 kern_return_t kr = KERN_SUCCESS;
819
820 vm_map_lock_read(map);
821
822 if (!vm_map_lookup_entry(map, va, &map_entry) ||
823 map_entry->is_sub_map) {
824 /*
825 * that memory is not properly mapped
826 */
827 kr = KERN_INVALID_ARGUMENT;
828 goto done;
829 }
830 object = VME_OBJECT(map_entry);
831
832 if (object == VM_OBJECT_NULL) {
833 /*
834 * there should be a VM object here at this point
835 */
836 kr = KERN_INVALID_ARGUMENT;
837 goto done;
838 }
839 vm_object_lock(object);
840 object->set_cache_attr = TRUE;
841 vm_object_unlock(object);
842
843 vm_map_set_cache_attr_count++;
844 done:
845 vm_map_unlock_read(map);
846
847 return kr;
848 }
849
850
851 #if CONFIG_CODE_DECRYPTION
852 /*
853 * vm_map_apple_protected:
854 * This remaps the requested part of the object with an object backed by
855 * the decrypting pager.
856 * crypt_info contains entry points and session data for the crypt module.
857 * The crypt_info block will be copied by vm_map_apple_protected. The data structures
858 * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
859 */
860 kern_return_t
vm_map_apple_protected(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_object_offset_t crypto_backing_offset,struct pager_crypt_info * crypt_info,uint32_t cryptid)861 vm_map_apple_protected(
862 vm_map_t map,
863 vm_map_offset_t start,
864 vm_map_offset_t end,
865 vm_object_offset_t crypto_backing_offset,
866 struct pager_crypt_info *crypt_info,
867 uint32_t cryptid)
868 {
869 boolean_t map_locked;
870 kern_return_t kr;
871 vm_map_entry_t map_entry;
872 struct vm_map_entry tmp_entry;
873 memory_object_t unprotected_mem_obj;
874 vm_object_t protected_object;
875 vm_map_offset_t map_addr;
876 vm_map_offset_t start_aligned, end_aligned;
877 vm_object_offset_t crypto_start, crypto_end;
878 boolean_t cache_pager;
879
880 map_locked = FALSE;
881 unprotected_mem_obj = MEMORY_OBJECT_NULL;
882
883 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
884 return KERN_INVALID_ADDRESS;
885 }
886 start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
887 end_aligned = vm_map_round_page(end, PAGE_MASK_64);
888 start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
889 end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));
890
891 #if __arm64__
892 /*
893 * "start" and "end" might be 4K-aligned but not 16K-aligned,
894 * so we might have to loop and establish up to 3 mappings:
895 *
896 * + the first 16K-page, which might overlap with the previous
897 * 4K-aligned mapping,
898 * + the center,
899 * + the last 16K-page, which might overlap with the next
900 * 4K-aligned mapping.
901 * Each of these mapping might be backed by a vnode pager (if
902 * properly page-aligned) or a "fourk_pager", itself backed by a
903 * vnode pager (if 4K-aligned but not page-aligned).
904 */
905 #endif /* __arm64__ */
906
907 map_addr = start_aligned;
908 for (map_addr = start_aligned;
909 map_addr < end;
910 map_addr = tmp_entry.vme_end) {
911 vm_map_lock(map);
912 map_locked = TRUE;
913
914 /* lookup the protected VM object */
915 if (!vm_map_lookup_entry(map,
916 map_addr,
917 &map_entry) ||
918 map_entry->is_sub_map ||
919 VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
920 /* that memory is not properly mapped */
921 kr = KERN_INVALID_ARGUMENT;
922 goto done;
923 }
924
925 /* ensure mapped memory is mapped as executable except
926 * except for model decryption flow */
927 if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
928 !(map_entry->protection & VM_PROT_EXECUTE)) {
929 kr = KERN_INVALID_ARGUMENT;
930 goto done;
931 }
932
933 /* get the protected object to be decrypted */
934 protected_object = VME_OBJECT(map_entry);
935 if (protected_object == VM_OBJECT_NULL) {
936 /* there should be a VM object here at this point */
937 kr = KERN_INVALID_ARGUMENT;
938 goto done;
939 }
940 /* ensure protected object stays alive while map is unlocked */
941 vm_object_reference(protected_object);
942
943 /* limit the map entry to the area we want to cover */
944 vm_map_clip_start(map, map_entry, start_aligned);
945 vm_map_clip_end(map, map_entry, end_aligned);
946
947 tmp_entry = *map_entry;
948 map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
949 vm_map_unlock(map);
950 map_locked = FALSE;
951
952 /*
953 * This map entry might be only partially encrypted
954 * (if not fully "page-aligned").
955 */
956 crypto_start = 0;
957 crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
958 if (tmp_entry.vme_start < start) {
959 if (tmp_entry.vme_start != start_aligned) {
960 kr = KERN_INVALID_ADDRESS;
961 }
962 crypto_start += (start - tmp_entry.vme_start);
963 }
964 if (tmp_entry.vme_end > end) {
965 if (tmp_entry.vme_end != end_aligned) {
966 kr = KERN_INVALID_ADDRESS;
967 }
968 crypto_end -= (tmp_entry.vme_end - end);
969 }
970
971 /*
972 * This "extra backing offset" is needed to get the decryption
973 * routine to use the right key. It adjusts for the possibly
974 * relative offset of an interposed "4K" pager...
975 */
976 if (crypto_backing_offset == (vm_object_offset_t) -1) {
977 crypto_backing_offset = VME_OFFSET(&tmp_entry);
978 }
979
980 cache_pager = TRUE;
981 #if XNU_TARGET_OS_OSX
982 if (vm_map_is_alien(map)) {
983 cache_pager = FALSE;
984 }
985 #endif /* XNU_TARGET_OS_OSX */
986
987 /*
988 * Lookup (and create if necessary) the protected memory object
989 * matching that VM object.
990 * If successful, this also grabs a reference on the memory object,
991 * to guarantee that it doesn't go away before we get a chance to map
992 * it.
993 */
994 unprotected_mem_obj = apple_protect_pager_setup(
995 protected_object,
996 VME_OFFSET(&tmp_entry),
997 crypto_backing_offset,
998 crypt_info,
999 crypto_start,
1000 crypto_end,
1001 cache_pager);
1002
1003 /* release extra ref on protected object */
1004 vm_object_deallocate(protected_object);
1005
1006 if (unprotected_mem_obj == NULL) {
1007 kr = KERN_FAILURE;
1008 goto done;
1009 }
1010
1011 /* can overwrite an immutable mapping */
1012 vm_map_kernel_flags_t vmk_flags = {
1013 .vmf_fixed = true,
1014 .vmf_overwrite = true,
1015 .vmkf_overwrite_immutable = true,
1016 };
1017 #if __arm64__
1018 if (tmp_entry.used_for_jit &&
1019 (VM_MAP_PAGE_SHIFT(map) != FOURK_PAGE_SHIFT ||
1020 PAGE_SHIFT != FOURK_PAGE_SHIFT) &&
1021 fourk_binary_compatibility_unsafe &&
1022 fourk_binary_compatibility_allow_wx) {
1023 printf("** FOURK_COMPAT [%d]: "
1024 "allowing write+execute at 0x%llx\n",
1025 proc_selfpid(), tmp_entry.vme_start);
1026 vmk_flags.vmkf_map_jit = TRUE;
1027 }
1028 #endif /* __arm64__ */
1029
1030 /* map this memory object in place of the current one */
1031 map_addr = tmp_entry.vme_start;
1032 kr = vm_map_enter_mem_object(map,
1033 &map_addr,
1034 (tmp_entry.vme_end -
1035 tmp_entry.vme_start),
1036 (mach_vm_offset_t) 0,
1037 vmk_flags,
1038 (ipc_port_t)(uintptr_t) unprotected_mem_obj,
1039 0,
1040 TRUE,
1041 tmp_entry.protection,
1042 tmp_entry.max_protection,
1043 tmp_entry.inheritance);
1044 assertf(kr == KERN_SUCCESS,
1045 "kr = 0x%x\n", kr);
1046 assertf(map_addr == tmp_entry.vme_start,
1047 "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
1048 (uint64_t)map_addr,
1049 (uint64_t) tmp_entry.vme_start,
1050 &tmp_entry);
1051
1052 #if VM_MAP_DEBUG_APPLE_PROTECT
1053 if (vm_map_debug_apple_protect) {
1054 printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
1055 " backing:[object:%p,offset:0x%llx,"
1056 "crypto_backing_offset:0x%llx,"
1057 "crypto_start:0x%llx,crypto_end:0x%llx]\n",
1058 map,
1059 (uint64_t) map_addr,
1060 (uint64_t) (map_addr + (tmp_entry.vme_end -
1061 tmp_entry.vme_start)),
1062 unprotected_mem_obj,
1063 protected_object,
1064 VME_OFFSET(&tmp_entry),
1065 crypto_backing_offset,
1066 crypto_start,
1067 crypto_end);
1068 }
1069 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1070
1071 /*
1072 * Release the reference obtained by
1073 * apple_protect_pager_setup().
1074 * The mapping (if it succeeded) is now holding a reference on
1075 * the memory object.
1076 */
1077 memory_object_deallocate(unprotected_mem_obj);
1078 unprotected_mem_obj = MEMORY_OBJECT_NULL;
1079
1080 /* continue with next map entry */
1081 crypto_backing_offset += (tmp_entry.vme_end -
1082 tmp_entry.vme_start);
1083 crypto_backing_offset -= crypto_start;
1084 }
1085 kr = KERN_SUCCESS;
1086
1087 done:
1088 if (map_locked) {
1089 vm_map_unlock(map);
1090 }
1091 return kr;
1092 }
1093 #endif /* CONFIG_CODE_DECRYPTION */
1094
1095
/* Lock group and attributes shared by all VM map locks. */
LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
/* rw-lock attribute carries LCK_ATTR_DEBUG for extra lock checking */
LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);

/*
 * Defaults for the "malloc_no_cow" policy (disable copy-on-write for
 * malloc-tagged memory).  On macOS, fork-time COW is still allowed by
 * default; on embedded platforms it is not.
 */
#if XNU_TARGET_OS_OSX
#define MALLOC_NO_COW_DEFAULT 1
#define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 1
#else /* XNU_TARGET_OS_OSX */
#define MALLOC_NO_COW_DEFAULT 1
#define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 0
#endif /* XNU_TARGET_OS_OSX */
/* boot-args overridable tunables controlling the policy above */
TUNABLE(int, malloc_no_cow, "malloc_no_cow", MALLOC_NO_COW_DEFAULT);
TUNABLE(int, malloc_no_cow_except_fork, "malloc_no_cow_except_fork", MALLOC_NO_COW_EXCEPT_FORK_DEFAULT);
/* bitmask of VM_MEMORY_* tags to which the no-COW policy applies; built in vm_map_init() */
uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
#if DEBUG
/* set by "vm_check_map_sanity" boot-arg; enables map sanity checking */
int vm_check_map_sanity = 0;
#endif
1113
1114 /*
1115 * vm_map_init:
1116 *
1117 * Initialize the vm_map module. Must be called before
1118 * any other vm_map routines.
1119 *
1120 * Map and entry structures are allocated from zones -- we must
1121 * initialize those zones.
1122 *
1123 * There are three zones of interest:
1124 *
1125 * vm_map_zone: used to allocate maps.
1126 * vm_map_entry_zone: used to allocate map entries.
1127 *
1128 * LP32:
1129 * vm_map_entry_reserved_zone: fallback zone for kernel map entries
1130 *
1131 * The kernel allocates map entries from a special zone that is initially
1132 * "crammed" with memory. It would be difficult (perhaps impossible) for
1133 * the kernel to allocate more memory to a entry zone when it became
1134 * empty since the very act of allocating memory implies the creation
1135 * of a new entry.
1136 */
1137 __startup_func
1138 void
vm_map_init(void)1139 vm_map_init(void)
1140 {
1141
1142 #if MACH_ASSERT
1143 PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1144 sizeof(debug4k_filter));
1145 #endif /* MACH_ASSERT */
1146
1147 zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1148 VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1149
1150 /*
1151 * Don't quarantine because we always need elements available
1152 * Disallow GC on this zone... to aid the GC.
1153 */
1154 zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1155 sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1156 ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1157 z->z_elems_rsv = (uint16_t)(32 *
1158 (ml_early_cpu_max_number() + 1));
1159 });
1160
1161 zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1162 sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1163 ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1164 z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
1165 });
1166
1167 zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1168 ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);
1169
1170 /*
1171 * Add the stolen memory to zones, adjust zone size and stolen counts.
1172 */
1173 zone_cram_early(vm_map_zone, map_data, map_data_size);
1174 zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1175 zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1176 printf("VM boostrap: %d maps, %d entries and %d holes available\n",
1177 zone_count_free(vm_map_zone),
1178 zone_count_free(vm_map_entry_zone),
1179 zone_count_free(vm_map_holes_zone));
1180
1181 /*
1182 * Since these are covered by zones, remove them from stolen page accounting.
1183 */
1184 VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1185
1186 #if VM_MAP_DEBUG_APPLE_PROTECT
1187 PE_parse_boot_argn("vm_map_debug_apple_protect",
1188 &vm_map_debug_apple_protect,
1189 sizeof(vm_map_debug_apple_protect));
1190 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1191 #if VM_MAP_DEBUG_APPLE_FOURK
1192 PE_parse_boot_argn("vm_map_debug_fourk",
1193 &vm_map_debug_fourk,
1194 sizeof(vm_map_debug_fourk));
1195 #endif /* VM_MAP_DEBUG_FOURK */
1196
1197 if (malloc_no_cow) {
1198 vm_memory_malloc_no_cow_mask = 0ULL;
1199 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1200 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1201 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1202 #if XNU_TARGET_OS_OSX
1203 /*
1204 * On macOS, keep copy-on-write for MALLOC_LARGE because
1205 * realloc() may use vm_copy() to transfer the old contents
1206 * to the new location.
1207 */
1208 #else /* XNU_TARGET_OS_OSX */
1209 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1210 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1211 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1212 #endif /* XNU_TARGET_OS_OSX */
1213 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1214 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1215 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1216 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1217 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1218 PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1219 &vm_memory_malloc_no_cow_mask,
1220 sizeof(vm_memory_malloc_no_cow_mask));
1221 }
1222
1223 #if CONFIG_MAP_RANGES
1224 vm_map_range_map_init();
1225 #endif /* CONFIG_MAP_RANGES */
1226
1227 #if DEBUG
1228 PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1229 if (vm_check_map_sanity) {
1230 kprintf("VM sanity checking enabled\n");
1231 } else {
1232 kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1233 }
1234 #endif /* DEBUG */
1235
1236 #if DEVELOPMENT || DEBUG
1237 PE_parse_boot_argn("panic_on_unsigned_execute",
1238 &panic_on_unsigned_execute,
1239 sizeof(panic_on_unsigned_execute));
1240 PE_parse_boot_argn("panic_on_mlock_failure",
1241 &panic_on_mlock_failure,
1242 sizeof(panic_on_mlock_failure));
1243 #endif /* DEVELOPMENT || DEBUG */
1244 }
1245
/*
 * vm_map_steal_memory:
 *
 * Computes and steals (from early boot memory) a single contiguous
 * region that vm_map_init() later crams into the map, map-entry and
 * map-holes zones.  Runs at STARTUP(PMAP_STEAL) time, before the
 * zone allocator can expand on its own.
 */
__startup_func
static void
vm_map_steal_memory(void)
{
	/*
	 * We need to reserve enough memory to support bootstrapping VM maps
	 * and the zone subsystem.
	 *
	 * The VM Maps that need to function before zones can support them
	 * are the ones registered with vm_map_will_allocate_early_map(),
	 * which are:
	 * - the kernel map
	 * - the various submaps used by zones (pgz, meta, ...)
	 *
	 * We also need enough entries and holes to support them
	 * until zone_metadata_init() is called, which is when
	 * the zone allocator becomes capable of expanding dynamically.
	 *
	 * We need:
	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
	 * - To allow for 3-4 entries per map, but the kernel map
	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
	 *   to describe the submaps, so double it (and make it 8x too)
	 * - To allow for holes between entries,
	 *   hence needs the same budget as entries
	 */
	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
	    VM_MAP_EARLY_COUNT_MAX);

	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	/*
	 * Steal a contiguous range of memory so that a simple range check
	 * can validate early addresses being freed/crammed to these
	 * zones
	 */
	map_data = zone_early_mem_init(map_data_size + kentry_data_size +
	    map_holes_data_size);
	/* the three sub-regions are carved back-to-back out of map_data */
	kentry_data = map_data + map_data_size;
	map_holes_data = kentry_data + kentry_data_size;
}
STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1295
1296 __startup_func
1297 static void
vm_kernel_boostraped(void)1298 vm_kernel_boostraped(void)
1299 {
1300 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
1301 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
1302 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);
1303
1304 printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1305 zone_count_free(vm_map_zone),
1306 zone_count_free(vm_map_entry_zone),
1307 zone_count_free(vm_map_holes_zone));
1308 }
1309 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1310
1311 void
vm_map_disable_hole_optimization(vm_map_t map)1312 vm_map_disable_hole_optimization(vm_map_t map)
1313 {
1314 vm_map_entry_t head_entry, hole_entry, next_hole_entry;
1315
1316 if (map->holelistenabled) {
1317 head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1318
1319 while (hole_entry != NULL) {
1320 next_hole_entry = hole_entry->vme_next;
1321
1322 hole_entry->vme_next = NULL;
1323 hole_entry->vme_prev = NULL;
1324 zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1325
1326 if (next_hole_entry == head_entry) {
1327 hole_entry = NULL;
1328 } else {
1329 hole_entry = next_hole_entry;
1330 }
1331 }
1332
1333 map->holes_list = NULL;
1334 map->holelistenabled = FALSE;
1335
1336 map->first_free = vm_map_first_entry(map);
1337 SAVE_HINT_HOLE_WRITE(map, NULL);
1338 }
1339 }
1340
1341 boolean_t
vm_kernel_map_is_kernel(vm_map_t map)1342 vm_kernel_map_is_kernel(vm_map_t map)
1343 {
1344 return map->pmap == kernel_pmap;
1345 }
1346
1347 /*
1348 * vm_map_create:
1349 *
1350 * Creates and returns a new empty VM map with
1351 * the given physical map structure, and having
1352 * the given lower and upper address bounds.
1353 */
1354
1355 extern vm_map_t vm_map_create_external(
1356 pmap_t pmap,
1357 vm_map_offset_t min_off,
1358 vm_map_offset_t max_off,
1359 boolean_t pageable);
1360
1361 vm_map_t
vm_map_create_external(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,boolean_t pageable)1362 vm_map_create_external(
1363 pmap_t pmap,
1364 vm_map_offset_t min,
1365 vm_map_offset_t max,
1366 boolean_t pageable)
1367 {
1368 vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1369
1370 if (pageable) {
1371 options |= VM_MAP_CREATE_PAGEABLE;
1372 }
1373 return vm_map_create_options(pmap, min, max, options);
1374 }
1375
1376 __startup_func
1377 void
vm_map_will_allocate_early_map(vm_map_t * owner)1378 vm_map_will_allocate_early_map(vm_map_t *owner)
1379 {
1380 if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1381 panic("VM_MAP_EARLY_COUNT_MAX is too low");
1382 }
1383
1384 early_map_owners[early_map_count++] = owner;
1385 }
1386
1387 __startup_func
1388 void
vm_map_relocate_early_maps(vm_offset_t delta)1389 vm_map_relocate_early_maps(vm_offset_t delta)
1390 {
1391 for (uint32_t i = 0; i < early_map_count; i++) {
1392 vm_address_t addr = (vm_address_t)*early_map_owners[i];
1393
1394 *early_map_owners[i] = (vm_map_t)(addr + delta);
1395 }
1396
1397 early_map_count = ~0u;
1398 }
1399
1400 /*
1401 * Routine: vm_map_relocate_early_elem
1402 *
1403 * Purpose:
1404 * Early zone elements are allocated in a temporary part
1405 * of the address space.
1406 *
1407 * Once the zones live in their final place, the early
1408 * VM maps, map entries and map holes need to be relocated.
1409 *
1410 * It involves rewriting any vm_map_t, vm_map_entry_t or
1411 * pointers to vm_map_links. Other pointers to other types
1412 * are fine.
1413 *
1414 * Fortunately, pointers to those types are self-contained
1415 * in those zones, _except_ for pointers to VM maps,
1416 * which are tracked during early boot and fixed with
1417 * vm_map_relocate_early_maps().
1418 */
__startup_func
void
vm_map_relocate_early_elem(
	uint32_t                zone_id,
	vm_offset_t             new_addr,
	vm_offset_t             delta)
{
	/*
	 * relocate(type_t, field): if ((type_t)new_addr)->field holds a
	 * non-NULL pointer, slide it by "delta"; NULL is preserved as-is.
	 */
#define relocate(type_t, field) ({ \
	typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field; \
	if (*__field) { \
	        *__field = (typeof(*__field))((vm_offset_t)*__field + delta); \
	} \
})

	/* only the three early VM zones may contain relocatable elements */
	switch (zone_id) {
	case ZONE_ID_VM_MAP:
	case ZONE_ID_VM_MAP_ENTRY:
	case ZONE_ID_VM_MAP_HOLES:
		break;

	default:
		panic("Unexpected zone ID %d", zone_id);
	}

	if (zone_id == ZONE_ID_VM_MAP) {
		/* fix up the map header links, hints and store */
		relocate(vm_map_t, hdr.links.prev);
		relocate(vm_map_t, hdr.links.next);
		/* all early maps use the kernel pmap */
		((vm_map_t)new_addr)->pmap = kernel_pmap;
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_t, hdr.rb_head_store.rbh_root);
#endif /* VM_MAP_STORE_USE_RB */
		relocate(vm_map_t, hint);
		relocate(vm_map_t, hole_hint);
		relocate(vm_map_t, first_free);
		return;
	}

	/* entries and holes both start with a vm_map_links {prev, next} */
	relocate(struct vm_map_links *, prev);
	relocate(struct vm_map_links *, next);

	if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_entry_t, store.entry.rbe_left);
		relocate(vm_map_entry_t, store.entry.rbe_right);
		relocate(vm_map_entry_t, store.entry.rbe_parent);
#endif /* VM_MAP_STORE_USE_RB */
		if (((vm_map_entry_t)new_addr)->is_sub_map) {
			/* no object to relocate because we haven't made any */
			/* vme_submap is stored shifted, so shift delta too */
			((vm_map_entry_t)new_addr)->vme_submap +=
			    delta >> VME_SUBMAP_SHIFT;
		}
#if MAP_ENTRY_CREATION_DEBUG
		relocate(vm_map_entry_t, vme_creation_maphdr);
#endif /* MAP_ENTRY_CREATION_DEBUG */
	}

#undef relocate
}
1477
/*
 * vm_map_create_options:
 *
 * Allocates and initializes a new VM map for "pmap" covering
 * [min, max), honoring the VM_MAP_CREATE_* options.
 * Returns the new map with a single reference.
 */
vm_map_t
vm_map_create_options(
	pmap_t                  pmap,
	vm_map_offset_t         min,
	vm_map_offset_t         max,
	vm_map_create_options_t options)
{
	vm_map_t result;

#if DEBUG || DEVELOPMENT
	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
		/*
		 * Before zalloc is up, every map allocation must have been
		 * pre-registered via vm_map_will_allocate_early_map() and
		 * must use the kernel pmap.
		 */
		if (early_map_count != ~0u && early_map_count !=
		    zone_count_allocated(vm_map_zone) + 1) {
			panic("allocating %dth early map, owner not known",
			    zone_count_allocated(vm_map_zone) + 1);
		}
		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
			panic("allocating %dth early map for non kernel pmap",
			    early_map_count);
		}
	}
#endif /* DEBUG || DEVELOPMENT */

	result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);

	vm_map_store_init(&result->hdr);
	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
	vm_map_set_page_shift(result, PAGE_SHIFT);

	result->size_limit = RLIM_INFINITY;             /* default unlimited */
	result->data_limit = RLIM_INFINITY;             /* default unlimited */
	result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
	result->pmap = pmap;
	result->min_offset = min;
	result->max_offset = max;
	result->first_free = vm_map_to_entry(result);
	result->hint = vm_map_to_entry(result);

	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
		/* only meaningful for kernel maps */
		assert(pmap == kernel_pmap);
		result->never_faults = true;
	}

	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
		result->has_corpse_footprint = true;
	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
		/* seed the hole list with one hole spanning the whole map */
		struct vm_map_links *hole_entry;

		hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
		hole_entry->start = min;
#if defined(__arm64__)
		hole_entry->end = result->max_offset;
#else
		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
#endif
		result->holes_list = result->hole_hint = hole_entry;
		/* circular singly-element list: points back to itself */
		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
		result->holelistenabled = true;
	}

	vm_map_lock_init(result);

	return result;
}
1544
1545 /*
1546 * Adjusts a submap that was made by kmem_suballoc()
1547 * before it knew where it would be mapped,
1548 * so that it has the right min/max offsets.
1549 *
1550 * We do not need to hold any locks:
1551 * only the caller knows about this map,
1552 * and it is not published on any entry yet.
1553 */
1554 static void
vm_map_adjust_offsets(vm_map_t map,vm_map_offset_t min_off,vm_map_offset_t max_off)1555 vm_map_adjust_offsets(
1556 vm_map_t map,
1557 vm_map_offset_t min_off,
1558 vm_map_offset_t max_off)
1559 {
1560 assert(map->min_offset == 0);
1561 assert(map->max_offset == max_off - min_off);
1562 assert(map->hdr.nentries == 0);
1563 assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1564
1565 map->min_offset = min_off;
1566 map->max_offset = max_off;
1567
1568 if (map->holelistenabled) {
1569 struct vm_map_links *hole = map->holes_list;
1570
1571 hole->start = min_off;
1572 #if defined(__arm64__)
1573 hole->end = max_off;
1574 #else
1575 hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1576 #endif
1577 }
1578 }
1579
1580
1581 vm_map_size_t
vm_map_adjusted_size(vm_map_t map)1582 vm_map_adjusted_size(vm_map_t map)
1583 {
1584 const struct vm_reserved_region *regions = NULL;
1585 size_t num_regions = 0;
1586 mach_vm_size_t reserved_size = 0, map_size = 0;
1587
1588 if (map == NULL || (map->size == 0)) {
1589 return 0;
1590 }
1591
1592 map_size = map->size;
1593
1594 if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1595 /*
1596 * No special reserved regions or not an exotic map or the task
1597 * is terminating and these special regions might have already
1598 * been deallocated.
1599 */
1600 return map_size;
1601 }
1602
1603 num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), ®ions);
1604 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1605
1606 while (num_regions) {
1607 reserved_size += regions[--num_regions].vmrr_size;
1608 }
1609
1610 /*
1611 * There are a few places where the map is being switched out due to
1612 * 'termination' without that bit being set (e.g. exec and corpse purging).
1613 * In those cases, we could have the map's regions being deallocated on
1614 * a core while some accounting process is trying to get the map's size.
1615 * So this assert can't be enabled till all those places are uniform in
1616 * their use of the 'map->terminated' bit.
1617 *
1618 * assert(map_size >= reserved_size);
1619 */
1620
1621 return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1622 }
1623
1624 /*
1625 * vm_map_entry_create: [ internal use only ]
1626 *
1627 * Allocates a VM map entry for insertion in the
1628 * given map (or map copy). No fields are filled.
1629 *
1630 * The VM entry will be zero initialized, except for:
1631 * - behavior set to VM_BEHAVIOR_DEFAULT
1632 * - inheritance set to VM_INHERIT_DEFAULT
1633 */
/* allocate an entry for a map's header (see _vm_map_entry_create) */
#define vm_map_entry_create(map) _vm_map_entry_create(&(map)->hdr)

/* same, for a vm_map_copy's header */
#define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)

/*
 * Allocates a zeroed VM map entry (behavior = VM_BEHAVIOR_DEFAULT by
 * virtue of zeroing, inheritance = VM_INHERIT_DEFAULT).  The header
 * argument is only used when MAP_ENTRY_CREATION_DEBUG is on.
 */
static vm_map_entry_t
_vm_map_entry_create(
	struct vm_map_header    *map_header __unused)
{
	vm_map_entry_t entry = NULL;

	entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);

	/*
	 * Help the compiler with what we know to be true,
	 * so that the further bitfields inits have good codegen.
	 *
	 * See rdar://87041299
	 */
	__builtin_assume(entry->vme_object_value == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);

	static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
	    "VME_ALIAS_MASK covers tags");

	static_assert(VM_BEHAVIOR_DEFAULT == 0,
	    "can skip zeroing of the behavior field");
	entry->inheritance = VM_INHERIT_DEFAULT;

#if MAP_ENTRY_CREATION_DEBUG
	/* record who created the entry for leak/debug analysis */
	entry->vme_creation_maphdr = map_header;
	entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
	    BTREF_GET_NOWAIT);
#endif
	return entry;
}
1670
1671 /*
1672 * vm_map_entry_dispose: [ internal use only ]
1673 *
1674 * Inverse of vm_map_entry_create.
1675 *
1676 * write map lock held so no need to
1677 * do anything special to insure correctness
1678 * of the stores
1679 */
/*
 * Frees a VM map entry allocated by _vm_map_entry_create(), releasing
 * any debug backtrace references it holds.  Caller holds the map lock
 * for write, so no extra synchronization is needed.
 */
static void
vm_map_entry_dispose(
	vm_map_entry_t  entry)
{
#if VM_BTLOG_TAGS
	if (entry->vme_kernel_object) {
		btref_put(entry->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */
#if MAP_ENTRY_CREATION_DEBUG
	btref_put(entry->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	btref_put(entry->vme_insertion_bt);
#endif
	zfree(vm_map_entry_zone, entry);
}

/* copy entries are plain entries; dispose the same way */
#define vm_map_copy_entry_dispose(copy_entry) \
	vm_map_entry_dispose(copy_entry)
1700
/* Returns the head of the zap list (VM_MAP_ENTRY_NULL when empty). */
static vm_map_entry_t
vm_map_zap_first_entry(
	vm_map_zap_t            list)
{
	return list->vmz_head;
}
1707
/*
 * Returns the last entry of the zap list.
 * The list must be non-empty: vmz_tail points at the last entry's
 * vme_next field, which __container_of turns back into the entry.
 */
static vm_map_entry_t
vm_map_zap_last_entry(
	vm_map_zap_t            list)
{
	assert(vm_map_zap_first_entry(list));
	return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
}
1715
/*
 * Appends "entry" at the tail of the zap list:
 * terminate the entry, hook it where the tail pointer points,
 * then advance the tail pointer to the new entry's vme_next.
 */
static void
vm_map_zap_append(
	vm_map_zap_t            list,
	vm_map_entry_t          entry)
{
	entry->vme_next = VM_MAP_ENTRY_NULL;
	*list->vmz_tail = entry;
	list->vmz_tail = &entry->vme_next;
}
1725
1726 static vm_map_entry_t
vm_map_zap_pop(vm_map_zap_t list)1727 vm_map_zap_pop(
1728 vm_map_zap_t list)
1729 {
1730 vm_map_entry_t head = list->vmz_head;
1731
1732 if (head != VM_MAP_ENTRY_NULL &&
1733 (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1734 list->vmz_tail = &list->vmz_head;
1735 }
1736
1737 return head;
1738 }
1739
1740 static void
vm_map_zap_dispose(vm_map_zap_t list)1741 vm_map_zap_dispose(
1742 vm_map_zap_t list)
1743 {
1744 vm_map_entry_t entry;
1745
1746 while ((entry = vm_map_zap_pop(list))) {
1747 if (entry->is_sub_map) {
1748 vm_map_deallocate(VME_SUBMAP(entry));
1749 } else {
1750 vm_object_deallocate(VME_OBJECT(entry));
1751 }
1752
1753 vm_map_entry_dispose(entry);
1754 }
1755 }
1756
#if MACH_ASSERT
/* gate for the (expensive) first_free consistency check below */
static boolean_t first_free_check = FALSE;
/*
 * Debug-only: validates the map's "first_free" hint against the store.
 * Returns TRUE unconditionally unless first_free_check has been enabled.
 */
boolean_t
first_free_is_valid(
	vm_map_t        map)
{
	if (!first_free_check) {
		return TRUE;
	}

	return first_free_is_valid_store( map );
}
#endif /* MACH_ASSERT */
1770
1771
/* link "entry" into a vm_map_copy's header after "after_where" */
#define vm_map_copy_entry_link(copy, after_where, entry) \
	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))

/* unlink "entry" from a vm_map_copy's header */
#define vm_map_copy_entry_unlink(copy, entry) \
	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1777
1778 /*
1779 * vm_map_destroy:
1780 *
1781 * Actually destroy a map.
1782 */
/*
 * Actually destroy a map: unmap everything it contains, tear down the
 * hole list and corpse footprint, destroy its pmap and free the map.
 * The caller must hold the final reference; this path may not fail.
 */
void
vm_map_destroy(
	vm_map_t        map)
{
	/* final cleanup: this is not allowed to fail */
	vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;

	VM_MAP_ZAP_DECLARE(zap);

	vm_map_lock(map);

	/* mark terminated so concurrent observers know regions may vanish */
	map->terminated = true;
	/* clean up regular map entries */
	(void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
	    KMEM_GUARD_NONE, &zap);
	/* clean up leftover special mappings (commpage, GPU carveout, etc...) */
	/* sweep (nearly) the whole address space, beyond [min, max) */
	(void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
	    KMEM_GUARD_NONE, &zap);

	vm_map_disable_hole_optimization(map);
	vm_map_corpse_footprint_destroy(map);

	vm_map_unlock(map);

	/* drop object/submap refs outside the map lock */
	vm_map_zap_dispose(&zap);

	assert(map->hdr.nentries == 0);

	if (map->pmap) {
		pmap_destroy(map->pmap);
	}

	lck_rw_destroy(&map->lock, &vm_map_lck_grp);

#if CONFIG_MAP_RANGES
	kfree_data(map->extra_ranges,
	    map->extra_ranges_count * sizeof(struct vm_map_user_range));
#endif

	zfree_id(ZONE_ID_VM_MAP, map);
}
1824
1825 /*
1826 * Returns pid of the task with the largest number of VM map entries.
1827 * Used in the zone-map-exhaustion jetsam path.
1828 */
1829 pid_t
find_largest_process_vm_map_entries(void)1830 find_largest_process_vm_map_entries(void)
1831 {
1832 pid_t victim_pid = -1;
1833 int max_vm_map_entries = 0;
1834 task_t task = TASK_NULL;
1835 queue_head_t *task_list = &tasks;
1836
1837 lck_mtx_lock(&tasks_threads_lock);
1838 queue_iterate(task_list, task, task_t, tasks) {
1839 if (task == kernel_task || !task->active) {
1840 continue;
1841 }
1842
1843 vm_map_t task_map = task->map;
1844 if (task_map != VM_MAP_NULL) {
1845 int task_vm_map_entries = task_map->hdr.nentries;
1846 if (task_vm_map_entries > max_vm_map_entries) {
1847 max_vm_map_entries = task_vm_map_entries;
1848 victim_pid = pid_from_task(task);
1849 }
1850 }
1851 }
1852 lck_mtx_unlock(&tasks_threads_lock);
1853
1854 printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
1855 return victim_pid;
1856 }
1857
1858
1859 /*
1860 * vm_map_lookup_entry: [ internal use only ]
1861 *
1862 * Calls into the vm map store layer to find the map
1863 * entry containing (or immediately preceding) the
1864 * specified address in the given map; the entry is returned
1865 * in the "entry" parameter. The boolean
1866 * result indicates whether the address is
1867 * actually contained in the map.
1868 */
/*
 * vm_map_lookup_entry: [ internal use only ]
 *
 * Finds the map entry containing (or immediately preceding) "address"
 * and returns it in *entry; the boolean result says whether the address
 * is actually contained in the map.  Kernel pointer tags are stripped
 * first; PGZ-guarded addresses must be unguarded by the caller.
 */
boolean_t
vm_map_lookup_entry(
	vm_map_t        map,
	vm_map_offset_t address,
	vm_map_entry_t  *entry)         /* OUT */
{
	if (VM_KERNEL_ADDRESS(address)) {
		/* strip any pointer-authentication/tag bits */
		address = VM_KERNEL_STRIP_UPTR(address);
	}
#if CONFIG_PROB_GZALLOC
	if (map->pmap == kernel_pmap) {
		assertf(!pgz_owned(address),
		    "it is the responsibility of callers to unguard PGZ addresses");
	}
#endif /* CONFIG_PROB_GZALLOC */
	return vm_map_store_lookup_entry( map, address, entry );
}
1886
1887 boolean_t
vm_map_lookup_entry_or_next(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)1888 vm_map_lookup_entry_or_next(
1889 vm_map_t map,
1890 vm_map_offset_t address,
1891 vm_map_entry_t *entry) /* OUT */
1892 {
1893 if (vm_map_lookup_entry(map, address, entry)) {
1894 return true;
1895 }
1896
1897 *entry = (*entry)->vme_next;
1898 return false;
1899 }
1900
#if CONFIG_PROB_GZALLOC
/*
 * Variant of vm_map_lookup_entry() that skips the PGZ-ownership
 * assertion, for callers that may legitimately pass guarded addresses.
 */
boolean_t
vm_map_lookup_entry_allow_pgz(
	vm_map_t        map,
	vm_map_offset_t address,
	vm_map_entry_t  *entry)         /* OUT */
{
	if (VM_KERNEL_ADDRESS(address)) {
		address = VM_KERNEL_STRIP_UPTR(address);
	}
	return vm_map_store_lookup_entry( map, address, entry );
}
#endif /* CONFIG_PROB_GZALLOC */
1914
1915 /*
1916 * Routine: vm_map_range_invalid_panic
1917 * Purpose:
1918 * Panic on detection of an invalid range id.
1919 */
/*
 * Routine:	vm_map_range_invalid_panic
 * Purpose:
 *	Panic on detection of an invalid range id.
 *	__abortlike: never returns; keeps the cold panic path
 *	out of callers' hot code.
 */
__abortlike
static void
vm_map_range_invalid_panic(
	vm_map_t                map,
	vm_map_range_id_t       range_id)
{
	panic("invalid range ID (%u) for map %p", range_id, map);
}
1928
1929 /*
1930 * Routine: vm_map_get_range
1931 * Purpose:
1932 * Adjust bounds based on security policy.
1933 */
/*
 * Routine:	vm_map_get_range
 * Purpose:
 *	Adjust bounds based on security policy.
 *
 *	Returns the effective address range for an "anywhere" allocation
 *	of "size" bytes in "map", based on the range id in *vmk_flags.
 *	May zero *address (kernel_map hint reset) and sets *is_ptr for
 *	kernel pointer ranges that must use kmem_locate_space() instead.
 */
static struct mach_vm_range
vm_map_get_range(
	vm_map_t                map,
	vm_map_address_t        *address,
	vm_map_kernel_flags_t   *vmk_flags,
	vm_map_size_t           size,
	bool                    *is_ptr)
{
	struct mach_vm_range effective_range = {};
	vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;

	if (map == kernel_map) {
		effective_range = kmem_ranges[range_id];

		if (startup_phase >= STARTUP_SUB_KMEM) {
			/*
			 * Hint provided by caller is zeroed as the range is restricted to a
			 * subset of the entire kernel_map VA, which could put the hint outside
			 * the range, causing vm_map_store_find_space to fail.
			 */
			*address = 0ull;
			/*
			 * Ensure that range_id passed in by the caller is within meaningful
			 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
			 * to fail as the corresponding range is invalid. Range id larger than
			 * KMEM_RANGE_ID_MAX will lead to an OOB access.
			 */
			if ((range_id == KMEM_RANGE_ID_NONE) ||
			    (range_id > KMEM_RANGE_ID_MAX)) {
				vm_map_range_invalid_panic(map, range_id);
			}

			/*
			 * Pointer ranges use kmem_locate_space to do allocations.
			 *
			 * Non pointer fronts look like [ Small | Large | Permanent ]
			 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
			 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
			 * use the entire range.
			 */
			if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
				*is_ptr = true;
			} else if (size >= KMEM_SMALLMAP_THRESHOLD) {
				effective_range = kmem_large_ranges[range_id];
			}
		}
#if CONFIG_MAP_RANGES
	} else if (map->uses_user_ranges) {
		/* user maps with ranges: pick the configured sub-range */
		switch (range_id) {
		case UMEM_RANGE_ID_DEFAULT:
			effective_range = map->default_range;
			break;
		case UMEM_RANGE_ID_HEAP:
			effective_range = map->data_range;
			break;
		case UMEM_RANGE_ID_FIXED:
			/*
			 * anywhere allocations with an address in "FIXED"
			 * makes no sense, leave the range empty
			 */
			break;

		default:
			vm_map_range_invalid_panic(map, range_id);
		}
#endif /* CONFIG_MAP_RANGES */
	} else {
		/*
		 * If minimum is 0, bump it up by PAGE_SIZE.  We want to limit
		 * allocations of PAGEZERO to explicit requests since its
		 * normal use is to catch dereferences of NULL and many
		 * applications also treat pointers with a value of 0 as
		 * special and suddenly having address 0 contain useable
		 * memory would tend to confuse those applications.
		 */
		effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
		effective_range.max_address = map->max_offset;
	}

	return effective_range;
}
2015
/*
 * Routine:	vm_map_locate_space
 * Purpose:
 *	Finds a range in the specified virtual address map,
 *	returning the start of that range,
 *	as well as the entry right before it.
 *
 *	The map is expected to be locked by the caller: on the
 *	"wait_for_space" path the lock is dropped while blocking and
 *	re-taken before retrying, so the lock state is preserved on return.
 *
 * In/out:
 *	start_inout:	on input, the caller's placement hint;
 *			on success, the chosen start address.
 *	entry_out:	on success, the entry preceding the found range.
 * Returns:
 *	KERN_SUCCESS, KERN_NO_SPACE, KERN_INVALID_ARGUMENT, or an error
 *	from vm_map_random_address_for_size()/kmem_locate_space().
 */
kern_return_t
vm_map_locate_space(
	vm_map_t                map,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_offset_t        *start_inout,
	vm_map_entry_t         *entry_out)
{
	struct mach_vm_range effective_range = {};
	vm_map_size_t   guard_offset;
	vm_map_offset_t hint, limit;
	vm_map_entry_t  entry;
	bool            is_kmem_ptr_range = false;

	/*
	 * Only supported by vm_map_enter() with a fixed address.
	 */
	assert(!vmk_flags.vmkf_beyond_max);

	if (__improbable(map->wait_for_space)) {
		/*
		 * support for "wait_for_space" is minimal,
		 * its only consumer is the ipc_kernel_copy_map.
		 */
		assert(!map->holelistenabled &&
		    !vmk_flags.vmkf_last_free &&
		    !vmk_flags.vmkf_keep_map_locked &&
		    !vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmf_random_addr &&
		    *start_inout <= map->min_offset);
	} else if (vmk_flags.vmkf_last_free) {
		assert(!vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmf_random_addr);
	}

	if (vmk_flags.vmkf_guard_before) {
		/* one leading guard page is carved out of the requested size */
		guard_offset = VM_MAP_PAGE_SIZE(map);
		assert(size > guard_offset);
		size -= guard_offset;
	} else {
		assert(size != 0);
		guard_offset = 0;
	}

	/*
	 * Validate range_id from flags and get associated range
	 */
	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
	    &is_kmem_ptr_range);

	if (is_kmem_ptr_range) {
		/* kernel pointer ranges are allocated by the kmem subsystem */
		return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
		           vmk_flags.vmkf_last_free, start_inout, entry_out);
	}

#if XNU_TARGET_OS_OSX
	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
		/* clamp the search to the low 4GB of the address space */
		assert(map != kernel_map);
		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
	}
#endif /* XNU_TARGET_OS_OSX */

again:
	if (vmk_flags.vmkf_last_free) {
		/* top-down search: scan from the high end toward min_address */
		hint = *start_inout;

		if (hint == 0 || hint > effective_range.max_address) {
			hint = effective_range.max_address;
		}
		if (hint <= effective_range.min_address) {
			return KERN_NO_SPACE;
		}
		limit = effective_range.min_address;
	} else {
		hint = *start_inout;

		if (vmk_flags.vmkf_map_jit) {
			if (map->jit_entry_exists &&
			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
				return KERN_INVALID_ARGUMENT;
			}
			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
				vmk_flags.vmf_random_addr = true;
			}
		}

		if (vmk_flags.vmf_random_addr) {
			kern_return_t kr;

			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
			if (kr != KERN_SUCCESS) {
				return kr;
			}
		}
#if __x86_64__
		else if ((hint == 0 || hint == vm_map_min(map)) &&
		    !map->disable_vmentry_reuse &&
		    map->vmmap_high_start != 0) {
			hint = map->vmmap_high_start;
		}
#endif /* __x86_64__ */

		/* clamp the hint into the effective range */
		if (hint < effective_range.min_address) {
			hint = effective_range.min_address;
		}
		if (effective_range.max_address <= hint) {
			return KERN_NO_SPACE;
		}

		limit = effective_range.max_address;
	}
	entry = vm_map_store_find_space(map,
	    hint, limit, vmk_flags.vmkf_last_free,
	    guard_offset, size, mask,
	    start_inout);

	if (__improbable(entry == NULL)) {
		/*
		 * No space found: for "wait_for_space" maps, block until
		 * someone frees space (dropping the map lock meanwhile)
		 * and retry -- but only if the request could ever fit.
		 */
		if (map->wait_for_space &&
		    guard_offset + size <=
		    effective_range.max_address - effective_range.min_address) {
			assert_wait((event_t)map, THREAD_ABORTSAFE);
			vm_map_unlock(map);
			thread_block(THREAD_CONTINUE_NULL);
			vm_map_lock(map);
			goto again;
		}
		return KERN_NO_SPACE;
	}

	if (entry_out) {
		*entry_out = entry;
	}
	return KERN_SUCCESS;
}
2158
2159
/*
 * Routine:	vm_map_find_space
 * Purpose:
 *	Allocate a range in the specified virtual address map,
 *	returning the entry allocated for that range.
 *	Used by kmem_alloc, etc.
 *
 *	The map must NOT be locked. It will be returned locked
 *	on KERN_SUCCESS, unlocked on failure.
 *
 *	If an entry is allocated, the object/offset fields
 *	are initialized to zero.
 */
kern_return_t
vm_map_find_space(
	vm_map_t                map,
	vm_map_offset_t         hint_address,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_entry_t         *o_entry) /* OUT */
{
	vm_map_entry_t  new_entry, entry;
	kern_return_t   kr;

	if (size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	/* pre-allocate the entry before taking the map lock */
	new_entry = vm_map_entry_create(map);
	new_entry->use_pmap = true;
	new_entry->protection = VM_PROT_DEFAULT;
	new_entry->max_protection = VM_PROT_ALL;

	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
		new_entry->map_aligned = true;
	}
	if (vmk_flags.vmf_permanent) {
		new_entry->vme_permanent = true;
	}

	vm_map_lock(map);

	kr = vm_map_locate_space(map, size, mask, vmk_flags,
	    &hint_address, &entry);
	if (kr != KERN_SUCCESS) {
		/* failure path: map unlocked, pre-allocated entry released */
		vm_map_unlock(map);
		vm_map_entry_dispose(new_entry);
		return kr;
	}
	/* hint_address now holds the start address chosen above */
	new_entry->vme_start = hint_address;
	new_entry->vme_end = hint_address + size;

	/*
	 * At this point,
	 *
	 * - new_entry's "vme_start" and "vme_end" should define
	 *   the endpoints of the available new range,
	 *
	 * - and "entry" should refer to the region before
	 *   the new range,
	 *
	 * - and the map should still be locked.
	 */

	assert(page_aligned(new_entry->vme_start));
	assert(page_aligned(new_entry->vme_end));
	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));

	/*
	 * Insert the new entry into the list
	 */

	vm_map_store_entry_link(map, entry, new_entry,
	    VM_MAP_KERNEL_FLAGS_NONE);
	map->size += size;

	/*
	 * Update the lookup hint
	 */
	SAVE_HINT_MAP_WRITE(map, new_entry);

	/* success: map is returned LOCKED, per the contract above */
	*o_entry = new_entry;
	return KERN_SUCCESS;
}
2246
/* Debug knobs for vm_map_pmap_enter(): log each page entered / enable the path. */
int vm_map_pmap_enter_print = FALSE;
int vm_map_pmap_enter_enable = FALSE;
2249
/*
 * Routine:	vm_map_pmap_enter [internal only]
 *
 * Description:
 *	Force pages from the specified object to be entered into
 *	the pmap at the specified address if they are present.
 *	As soon as a page not found in the object the scan ends.
 *
 * Returns:
 *	Nothing.
 *
 * In/out conditions:
 *	The source map should not be locked on entry.
 */
__unused static void
vm_map_pmap_enter(
	vm_map_t                map,
	vm_map_offset_t         addr,
	vm_map_offset_t         end_addr,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_prot_t               protection)
{
	int                     type_of_fault;
	kern_return_t           kr;
	uint8_t                 object_lock_type = 0;
	/* zero-initialized fault info: no clustering/sequential hints */
	struct vm_object_fault_info fault_info = {};

	if (map->pmap == 0) {
		return;
	}

	/* this path only supports maps using the native page size */
	assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);

	while (addr < end_addr) {
		vm_page_t       m;


		/*
		 * TODO:
		 * From vm_map_enter(), we come into this function without the map
		 * lock held or the object lock held.
		 * We haven't taken a reference on the object either.
		 * We should do a proper lookup on the map to make sure
		 * that things are sane before we go locking objects that
		 * could have been deallocated from under us.
		 */

		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
		vm_object_lock(object);

		m = vm_page_lookup(object, offset);

		/*
		 * Stop at the first page that is absent or unusable
		 * (busy, fictitious, error, restart...): the scan is
		 * strictly best-effort pre-mapping.
		 */
		if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
			vm_object_unlock(object);
			return;
		}

		if (vm_map_pmap_enter_print) {
			printf("vm_map_pmap_enter:");
			printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
			    map, (unsigned long long)addr, object, (unsigned long long)offset);
		}
		type_of_fault = DBG_CACHE_HIT_FAULT;
		kr = vm_fault_enter(m, map->pmap,
		    addr,
		    PAGE_SIZE, 0,
		    protection, protection,
		    VM_PAGE_WIRED(m),
		    FALSE,                             /* change_wiring */
		    VM_KERN_MEMORY_NONE,               /* tag - not wiring */
		    &fault_info,
		    NULL,                              /* need_retry */
		    &type_of_fault,
		    &object_lock_type);                /* Exclusive lock mode. Will remain unchanged.*/

		vm_object_unlock(object);

		/* advance one page at a time through the object and the VA range */
		offset += PAGE_SIZE_64;
		addr += PAGE_SIZE;
	}
}
2333
#define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
/*
 * Pick a random, map-page-aligned address within the map's effective
 * range where "size" bytes would fit into an existing hole.
 *
 * Makes up to MAX_TRIES_TO_GET_RANDOM_ADDRESS attempts; returns
 * KERN_NO_SPACE if the range is too small or if no attempt landed in a
 * sufficiently large hole.  On success "*address" holds the candidate;
 * NOTE(review): vm_map_get_range() may also zero "*address" as a side
 * effect before the search.
 */
static kern_return_t
vm_map_random_address_for_size(
	vm_map_t        map,
	vm_map_offset_t *address,
	vm_map_size_t   size,
	vm_map_kernel_flags_t vmk_flags)
{
	kern_return_t   kr = KERN_SUCCESS;
	int             tries = 0;
	vm_map_offset_t random_addr = 0;
	vm_map_offset_t hole_end;

	vm_map_entry_t  next_entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t  prev_entry = VM_MAP_ENTRY_NULL;
	vm_map_size_t   vm_hole_size = 0;
	vm_map_size_t   addr_space_size;
	bool            is_kmem_ptr;
	struct mach_vm_range effective_range;

	effective_range = vm_map_get_range(map, address, &vmk_flags, size,
	    &is_kmem_ptr);

	/*
	 * Shrink the usable span by "size" so that any chosen start
	 * address leaves room for the whole allocation within the range.
	 */
	addr_space_size = effective_range.max_address - effective_range.min_address;
	if (size >= addr_space_size) {
		return KERN_NO_SPACE;
	}
	addr_space_size -= size;

	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));

	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		/* early_random() is the only entropy source before zalloc is up */
		if (startup_phase < STARTUP_SUB_ZALLOC) {
			random_addr = (vm_map_offset_t)early_random();
		} else {
			random_addr = (vm_map_offset_t)random();
		}
		/* scale to page granularity, then fold into the usable span */
		random_addr <<= VM_MAP_PAGE_SHIFT(map);
		random_addr = vm_map_trunc_page(
			effective_range.min_address + (random_addr % addr_space_size),
			VM_MAP_PAGE_MASK(map));

#if CONFIG_PROB_GZALLOC
		/* don't hand out addresses owned by probabilistic guard zalloc */
		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
			continue;
		}
#endif /* CONFIG_PROB_GZALLOC */

		/* candidate must fall in a hole large enough for "size" */
		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
			if (prev_entry == vm_map_to_entry(map)) {
				next_entry = vm_map_first_entry(map);
			} else {
				next_entry = prev_entry->vme_next;
			}
			if (next_entry == vm_map_to_entry(map)) {
				hole_end = vm_map_max(map);
			} else {
				hole_end = next_entry->vme_start;
			}
			vm_hole_size = hole_end - random_addr;
			if (vm_hole_size >= size) {
				*address = random_addr;
				break;
			}
		}
		tries++;
	}

	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		kr = KERN_NO_SPACE;
	}
	return kr;
}
2407
2408 static boolean_t
vm_memory_malloc_no_cow(int alias)2409 vm_memory_malloc_no_cow(
2410 int alias)
2411 {
2412 uint64_t alias_mask;
2413
2414 if (!malloc_no_cow) {
2415 return FALSE;
2416 }
2417 if (alias > 63) {
2418 return FALSE;
2419 }
2420 alias_mask = 1ULL << alias;
2421 if (alias_mask & vm_memory_malloc_no_cow_mask) {
2422 return TRUE;
2423 }
2424 return FALSE;
2425 }
2426
/* Counters: times vm_map_enter hit the RLIMIT_AS / RLIMIT_DATA resource limits. */
uint64_t vm_map_enter_RLIMIT_AS_count = 0;
uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
/*
 * Routine:	vm_map_enter
 *
 * Description:
 *	Allocate a range in the specified virtual address map.
 *	The resulting range will refer to memory defined by
 *	the given memory object and offset into that object.
 *
 *	Arguments are as defined in the vm_map call.
 */
/* Counters: mappings restored (or not) after a failed overwrite-style enter. */
static unsigned int vm_map_enter_restore_successes = 0;
static unsigned int vm_map_enter_restore_failures = 0;
2441 kern_return_t
vm_map_enter(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)2442 vm_map_enter(
2443 vm_map_t map,
2444 vm_map_offset_t *address, /* IN/OUT */
2445 vm_map_size_t size,
2446 vm_map_offset_t mask,
2447 vm_map_kernel_flags_t vmk_flags,
2448 vm_object_t object,
2449 vm_object_offset_t offset,
2450 boolean_t needs_copy,
2451 vm_prot_t cur_protection,
2452 vm_prot_t max_protection,
2453 vm_inherit_t inheritance)
2454 {
2455 vm_map_entry_t entry, new_entry;
2456 vm_map_offset_t start, tmp_start, tmp_offset;
2457 vm_map_offset_t end, tmp_end;
2458 vm_map_offset_t tmp2_start, tmp2_end;
2459 vm_map_offset_t step;
2460 kern_return_t result = KERN_SUCCESS;
2461 bool map_locked = FALSE;
2462 bool pmap_empty = TRUE;
2463 bool new_mapping_established = FALSE;
2464 const bool keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2465 const bool anywhere = !vmk_flags.vmf_fixed;
2466 const bool purgable = vmk_flags.vmf_purgeable;
2467 const bool overwrite = vmk_flags.vmf_overwrite;
2468 const bool no_cache = vmk_flags.vmf_no_cache;
2469 const bool is_submap = vmk_flags.vmkf_submap;
2470 const bool permanent = vmk_flags.vmf_permanent;
2471 const bool no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2472 const bool entry_for_jit = vmk_flags.vmkf_map_jit;
2473 const bool iokit_acct = vmk_flags.vmkf_iokit_acct;
2474 const bool resilient_codesign = vmk_flags.vmf_resilient_codesign;
2475 const bool resilient_media = vmk_flags.vmf_resilient_media;
2476 const bool entry_for_tpro = vmk_flags.vmf_tpro;
2477 const unsigned int superpage_size = vmk_flags.vmf_superpage_size;
2478 const vm_tag_t alias = vmk_flags.vm_tag;
2479 vm_tag_t user_alias;
2480 kern_return_t kr;
2481 bool clear_map_aligned = FALSE;
2482 vm_map_size_t chunk_size = 0;
2483 vm_object_t caller_object;
2484 VM_MAP_ZAP_DECLARE(zap_old_list);
2485 VM_MAP_ZAP_DECLARE(zap_new_list);
2486
2487 caller_object = object;
2488
2489 assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2490
2491 if (vmk_flags.vmf_4gb_chunk) {
2492 #if defined(__LP64__)
2493 chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2494 #else /* __LP64__ */
2495 chunk_size = ANON_CHUNK_SIZE;
2496 #endif /* __LP64__ */
2497 } else {
2498 chunk_size = ANON_CHUNK_SIZE;
2499 }
2500
2501
2502
2503 if (superpage_size) {
2504 switch (superpage_size) {
2505 /*
2506 * Note that the current implementation only supports
2507 * a single size for superpages, SUPERPAGE_SIZE, per
2508 * architecture. As soon as more sizes are supposed
2509 * to be supported, SUPERPAGE_SIZE has to be replaced
2510 * with a lookup of the size depending on superpage_size.
2511 */
2512 #ifdef __x86_64__
2513 case SUPERPAGE_SIZE_ANY:
2514 /* handle it like 2 MB and round up to page size */
2515 size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2516 OS_FALLTHROUGH;
2517 case SUPERPAGE_SIZE_2MB:
2518 break;
2519 #endif
2520 default:
2521 return KERN_INVALID_ARGUMENT;
2522 }
2523 mask = SUPERPAGE_SIZE - 1;
2524 if (size & (SUPERPAGE_SIZE - 1)) {
2525 return KERN_INVALID_ARGUMENT;
2526 }
2527 inheritance = VM_INHERIT_NONE; /* fork() children won't inherit superpages */
2528 }
2529
2530
2531 if ((cur_protection & VM_PROT_WRITE) &&
2532 (cur_protection & VM_PROT_EXECUTE) &&
2533 #if XNU_TARGET_OS_OSX
2534 map->pmap != kernel_pmap &&
2535 (cs_process_global_enforcement() ||
2536 (vmk_flags.vmkf_cs_enforcement_override
2537 ? vmk_flags.vmkf_cs_enforcement
2538 : (vm_map_cs_enforcement(map)
2539 #if __arm64__
2540 || !VM_MAP_IS_EXOTIC(map)
2541 #endif /* __arm64__ */
2542 ))) &&
2543 #endif /* XNU_TARGET_OS_OSX */
2544 #if CODE_SIGNING_MONITOR
2545 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2546 #endif
2547 (VM_MAP_POLICY_WX_FAIL(map) ||
2548 VM_MAP_POLICY_WX_STRIP_X(map)) &&
2549 !entry_for_jit) {
2550 boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2551
2552 DTRACE_VM3(cs_wx,
2553 uint64_t, 0,
2554 uint64_t, 0,
2555 vm_prot_t, cur_protection);
2556 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2557 proc_selfpid(),
2558 (get_bsdtask_info(current_task())
2559 ? proc_name_address(get_bsdtask_info(current_task()))
2560 : "?"),
2561 __FUNCTION__,
2562 (vm_protect_wx_fail ? "failing" : "turning off execute"));
2563 cur_protection &= ~VM_PROT_EXECUTE;
2564 if (vm_protect_wx_fail) {
2565 return KERN_PROTECTION_FAILURE;
2566 }
2567 }
2568
2569 /*
2570 * If the task has requested executable lockdown,
2571 * deny any new executable mapping.
2572 */
2573 if (map->map_disallow_new_exec == TRUE) {
2574 if (cur_protection & VM_PROT_EXECUTE) {
2575 return KERN_PROTECTION_FAILURE;
2576 }
2577 }
2578
2579 if (resilient_codesign) {
2580 assert(!is_submap);
2581 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2582 if ((cur_protection | max_protection) & reject_prot) {
2583 return KERN_PROTECTION_FAILURE;
2584 }
2585 }
2586
2587 if (resilient_media) {
2588 assert(!is_submap);
2589 // assert(!needs_copy);
2590 if (object != VM_OBJECT_NULL &&
2591 !object->internal) {
2592 /*
2593 * This mapping is directly backed by an external
2594 * memory manager (e.g. a vnode pager for a file):
2595 * we would not have any safe place to inject
2596 * a zero-filled page if an actual page is not
2597 * available, without possibly impacting the actual
2598 * contents of the mapped object (e.g. the file),
2599 * so we can't provide any media resiliency here.
2600 */
2601 return KERN_INVALID_ARGUMENT;
2602 }
2603 }
2604
2605 if (entry_for_tpro) {
2606 /*
2607 * TPRO overrides the effective permissions of the region
2608 * and explicitly maps as RW. Ensure we have been passed
2609 * the expected permissions. We accept `cur_protections`
2610 * RO as that will be handled on fault.
2611 */
2612 if (!(max_protection & VM_PROT_READ) ||
2613 !(max_protection & VM_PROT_WRITE) ||
2614 !(cur_protection & VM_PROT_READ)) {
2615 return KERN_PROTECTION_FAILURE;
2616 }
2617
2618 /*
2619 * We can now downgrade the cur_protection to RO. This is a mild lie
2620 * to the VM layer. But TPRO will be responsible for toggling the
2621 * protections between RO/RW
2622 */
2623 cur_protection = VM_PROT_READ;
2624 }
2625
2626 if (is_submap) {
2627 vm_map_t submap;
2628 if (purgable) {
2629 /* submaps can not be purgeable */
2630 return KERN_INVALID_ARGUMENT;
2631 }
2632 if (object == VM_OBJECT_NULL) {
2633 /* submaps can not be created lazily */
2634 return KERN_INVALID_ARGUMENT;
2635 }
2636 submap = (vm_map_t) object;
2637 if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
2638 /* page size mismatch */
2639 return KERN_INVALID_ARGUMENT;
2640 }
2641 }
2642 if (vmk_flags.vmkf_already) {
2643 /*
2644 * VM_FLAGS_ALREADY says that it's OK if the same mapping
2645 * is already present. For it to be meaningul, the requested
2646 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
2647 * we shouldn't try and remove what was mapped there first
2648 * (!VM_FLAGS_OVERWRITE).
2649 */
2650 if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
2651 return KERN_INVALID_ARGUMENT;
2652 }
2653 }
2654
2655 if (size == 0 ||
2656 (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
2657 *address = 0;
2658 return KERN_INVALID_ARGUMENT;
2659 }
2660
2661 if (map->pmap == kernel_pmap) {
2662 user_alias = VM_KERN_MEMORY_NONE;
2663 } else {
2664 user_alias = alias;
2665 }
2666
2667 if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
2668 chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
2669 }
2670
2671 #define RETURN(value) { result = value; goto BailOut; }
2672
2673 assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
2674 assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
2675 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
2676 assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
2677 assertf(page_aligned(size), "0x%llx", (uint64_t)size);
2678 }
2679
2680 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2681 !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
2682 /*
2683 * In most cases, the caller rounds the size up to the
2684 * map's page size.
2685 * If we get a size that is explicitly not map-aligned here,
2686 * we'll have to respect the caller's wish and mark the
2687 * mapping as "not map-aligned" to avoid tripping the
2688 * map alignment checks later.
2689 */
2690 clear_map_aligned = TRUE;
2691 }
2692 if (!anywhere &&
2693 VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2694 !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
2695 /*
2696 * We've been asked to map at a fixed address and that
2697 * address is not aligned to the map's specific alignment.
2698 * The caller should know what it's doing (i.e. most likely
2699 * mapping some fragmented copy map, transferring memory from
2700 * a VM map with a different alignment), so clear map_aligned
2701 * for this new VM map entry and proceed.
2702 */
2703 clear_map_aligned = TRUE;
2704 }
2705
2706 /*
2707 * Only zero-fill objects are allowed to be purgable.
2708 * LP64todo - limit purgable objects to 32-bits for now
2709 */
2710 if (purgable &&
2711 (offset != 0 ||
2712 (object != VM_OBJECT_NULL &&
2713 (object->vo_size != size ||
2714 object->purgable == VM_PURGABLE_DENY))
2715 #if __LP64__
2716 || size > ANON_MAX_SIZE
2717 #endif
2718 )) {
2719 return KERN_INVALID_ARGUMENT;
2720 }
2721
2722 start = *address;
2723
2724 if (anywhere) {
2725 vm_map_lock(map);
2726 map_locked = TRUE;
2727
2728 result = vm_map_locate_space(map, size, mask, vmk_flags,
2729 &start, &entry);
2730 if (result != KERN_SUCCESS) {
2731 goto BailOut;
2732 }
2733
2734 *address = start;
2735 end = start + size;
2736 assert(VM_MAP_PAGE_ALIGNED(*address,
2737 VM_MAP_PAGE_MASK(map)));
2738 } else {
2739 vm_map_offset_t effective_min_offset, effective_max_offset;
2740
2741 effective_min_offset = map->min_offset;
2742 effective_max_offset = map->max_offset;
2743
2744 if (vmk_flags.vmkf_beyond_max) {
2745 /*
2746 * Allow an insertion beyond the map's max offset.
2747 */
2748 effective_max_offset = 0x00000000FFFFF000ULL;
2749 if (vm_map_is_64bit(map)) {
2750 effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2751 }
2752 #if XNU_TARGET_OS_OSX
2753 } else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2754 effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2755 #endif /* XNU_TARGET_OS_OSX */
2756 }
2757
2758 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2759 !overwrite &&
2760 user_alias == VM_MEMORY_REALLOC) {
2761 /*
2762 * Force realloc() to switch to a new allocation,
2763 * to prevent 4k-fragmented virtual ranges.
2764 */
2765 // DEBUG4K_ERROR("no realloc in place");
2766 return KERN_NO_SPACE;
2767 }
2768
2769 /*
2770 * Verify that:
2771 * the address doesn't itself violate
2772 * the mask requirement.
2773 */
2774
2775 vm_map_lock(map);
2776 map_locked = TRUE;
2777 if ((start & mask) != 0) {
2778 RETURN(KERN_NO_SPACE);
2779 }
2780
2781 #if CONFIG_MAP_RANGES
2782 if (map->uses_user_ranges) {
2783 struct mach_vm_range r;
2784
2785 vm_map_user_range_resolve(map, start, 1, &r);
2786 if (r.max_address == 0) {
2787 RETURN(KERN_INVALID_ADDRESS);
2788 }
2789 effective_min_offset = r.min_address;
2790 effective_max_offset = r.max_address;
2791 }
2792 #endif /* CONFIG_MAP_RANGES */
2793
2794 if ((startup_phase >= STARTUP_SUB_KMEM) && !is_submap &&
2795 (map == kernel_map)) {
2796 mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
2797 effective_min_offset = r->min_address;
2798 effective_max_offset = r->max_address;
2799 }
2800
2801 /*
2802 * ... the address is within bounds
2803 */
2804
2805 end = start + size;
2806
2807 if ((start < effective_min_offset) ||
2808 (end > effective_max_offset) ||
2809 (start >= end)) {
2810 RETURN(KERN_INVALID_ADDRESS);
2811 }
2812
2813 if (overwrite) {
2814 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE;
2815 kern_return_t remove_kr;
2816
2817 /*
2818 * Fixed mapping and "overwrite" flag: attempt to
2819 * remove all existing mappings in the specified
2820 * address range, saving them in our "zap_old_list".
2821 *
2822 * This avoids releasing the VM map lock in
2823 * vm_map_entry_delete() and allows atomicity
2824 * when we want to replace some mappings with a new one.
2825 * It also allows us to restore the old VM mappings if the
2826 * new mapping fails.
2827 */
2828 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2829
2830 if (vmk_flags.vmkf_overwrite_immutable) {
2831 /* we can overwrite immutable mappings */
2832 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2833 }
2834 if (vmk_flags.vmkf_remap_prot_copy) {
2835 remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2836 }
2837 remove_kr = vm_map_delete(map, start, end, remove_flags,
2838 KMEM_GUARD_NONE, &zap_old_list).kmr_return;
2839 if (remove_kr) {
2840 /* XXX FBDP restore zap_old_list? */
2841 RETURN(remove_kr);
2842 }
2843 }
2844
2845 /*
2846 * ... the starting address isn't allocated
2847 */
2848
2849 if (vm_map_lookup_entry(map, start, &entry)) {
2850 if (!(vmk_flags.vmkf_already)) {
2851 RETURN(KERN_NO_SPACE);
2852 }
2853 /*
2854 * Check if what's already there is what we want.
2855 */
2856 tmp_start = start;
2857 tmp_offset = offset;
2858 if (entry->vme_start < start) {
2859 tmp_start -= start - entry->vme_start;
2860 tmp_offset -= start - entry->vme_start;
2861 }
2862 for (; entry->vme_start < end;
2863 entry = entry->vme_next) {
2864 /*
2865 * Check if the mapping's attributes
2866 * match the existing map entry.
2867 */
2868 if (entry == vm_map_to_entry(map) ||
2869 entry->vme_start != tmp_start ||
2870 entry->is_sub_map != is_submap ||
2871 VME_OFFSET(entry) != tmp_offset ||
2872 entry->needs_copy != needs_copy ||
2873 entry->protection != cur_protection ||
2874 entry->max_protection != max_protection ||
2875 entry->inheritance != inheritance ||
2876 entry->iokit_acct != iokit_acct ||
2877 VME_ALIAS(entry) != alias) {
2878 /* not the same mapping ! */
2879 RETURN(KERN_NO_SPACE);
2880 }
2881 /*
2882 * Check if the same object is being mapped.
2883 */
2884 if (is_submap) {
2885 if (VME_SUBMAP(entry) !=
2886 (vm_map_t) object) {
2887 /* not the same submap */
2888 RETURN(KERN_NO_SPACE);
2889 }
2890 } else {
2891 if (VME_OBJECT(entry) != object) {
2892 /* not the same VM object... */
2893 vm_object_t obj2;
2894
2895 obj2 = VME_OBJECT(entry);
2896 if ((obj2 == VM_OBJECT_NULL ||
2897 obj2->internal) &&
2898 (object == VM_OBJECT_NULL ||
2899 object->internal)) {
2900 /*
2901 * ... but both are
2902 * anonymous memory,
2903 * so equivalent.
2904 */
2905 } else {
2906 RETURN(KERN_NO_SPACE);
2907 }
2908 }
2909 }
2910
2911 tmp_offset += entry->vme_end - entry->vme_start;
2912 tmp_start += entry->vme_end - entry->vme_start;
2913 if (entry->vme_end >= end) {
2914 /* reached the end of our mapping */
2915 break;
2916 }
2917 }
2918 /* it all matches: let's use what's already there ! */
2919 RETURN(KERN_MEMORY_PRESENT);
2920 }
2921
2922 /*
2923 * ... the next region doesn't overlap the
2924 * end point.
2925 */
2926
2927 if ((entry->vme_next != vm_map_to_entry(map)) &&
2928 (entry->vme_next->vme_start < end)) {
2929 RETURN(KERN_NO_SPACE);
2930 }
2931 }
2932
2933 /*
2934 * At this point,
2935 * "start" and "end" should define the endpoints of the
2936 * available new range, and
2937 * "entry" should refer to the region before the new
2938 * range, and
2939 *
2940 * the map should be locked.
2941 */
2942
2943 /*
2944 * See whether we can avoid creating a new entry (and object) by
2945 * extending one of our neighbors. [So far, we only attempt to
2946 * extend from below.] Note that we can never extend/join
2947 * purgable objects because they need to remain distinct
2948 * entities in order to implement their "volatile object"
2949 * semantics.
2950 */
2951
2952 if (purgable ||
2953 entry_for_jit ||
2954 entry_for_tpro ||
2955 vm_memory_malloc_no_cow(user_alias)) {
2956 if (object == VM_OBJECT_NULL) {
2957 object = vm_object_allocate(size);
2958 object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2959 object->true_share = FALSE;
2960 if (malloc_no_cow_except_fork &&
2961 !purgable &&
2962 !entry_for_jit &&
2963 !entry_for_tpro &&
2964 vm_memory_malloc_no_cow(user_alias)) {
2965 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY_FORK;
2966 object->true_share = TRUE;
2967 }
2968 if (purgable) {
2969 task_t owner;
2970 object->purgable = VM_PURGABLE_NONVOLATILE;
2971 if (map->pmap == kernel_pmap) {
2972 /*
2973 * Purgeable mappings made in a kernel
2974 * map are "owned" by the kernel itself
2975 * rather than the current user task
2976 * because they're likely to be used by
2977 * more than this user task (see
2978 * execargs_purgeable_allocate(), for
2979 * example).
2980 */
2981 owner = kernel_task;
2982 } else {
2983 owner = current_task();
2984 }
2985 assert(object->vo_owner == NULL);
2986 assert(object->resident_page_count == 0);
2987 assert(object->wired_page_count == 0);
2988 vm_object_lock(object);
2989 vm_purgeable_nonvolatile_enqueue(object, owner);
2990 vm_object_unlock(object);
2991 }
2992 offset = (vm_object_offset_t)0;
2993 }
2994 } else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
2995 /* no coalescing if address space uses sub-pages */
2996 } else if ((is_submap == FALSE) &&
2997 (object == VM_OBJECT_NULL) &&
2998 (entry != vm_map_to_entry(map)) &&
2999 (entry->vme_end == start) &&
3000 (!entry->is_shared) &&
3001 (!entry->is_sub_map) &&
3002 (!entry->in_transition) &&
3003 (!entry->needs_wakeup) &&
3004 (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
3005 (entry->protection == cur_protection) &&
3006 (entry->max_protection == max_protection) &&
3007 (entry->inheritance == inheritance) &&
3008 ((user_alias == VM_MEMORY_REALLOC) ||
3009 (VME_ALIAS(entry) == alias)) &&
3010 (entry->no_cache == no_cache) &&
3011 (entry->vme_permanent == permanent) &&
3012 /* no coalescing for immutable executable mappings */
3013 !((entry->protection & VM_PROT_EXECUTE) &&
3014 entry->vme_permanent) &&
3015 (!entry->superpage_size && !superpage_size) &&
3016 /*
3017 * No coalescing if not map-aligned, to avoid propagating
3018 * that condition any further than needed:
3019 */
3020 (!entry->map_aligned || !clear_map_aligned) &&
3021 (!entry->zero_wired_pages) &&
3022 (!entry->used_for_jit && !entry_for_jit) &&
3023 #if __arm64e__
3024 (!entry->used_for_tpro && !entry_for_tpro) &&
3025 #endif
3026 (!entry->csm_associated) &&
3027 (entry->iokit_acct == iokit_acct) &&
3028 (!entry->vme_resilient_codesign) &&
3029 (!entry->vme_resilient_media) &&
3030 (!entry->vme_atomic) &&
3031 (entry->vme_no_copy_on_read == no_copy_on_read) &&
3032
3033 ((entry->vme_end - entry->vme_start) + size <=
3034 (user_alias == VM_MEMORY_REALLOC ?
3035 ANON_CHUNK_SIZE :
3036 NO_COALESCE_LIMIT)) &&
3037
3038 (entry->wired_count == 0)) { /* implies user_wired_count == 0 */
3039 if (vm_object_coalesce(VME_OBJECT(entry),
3040 VM_OBJECT_NULL,
3041 VME_OFFSET(entry),
3042 (vm_object_offset_t) 0,
3043 (vm_map_size_t)(entry->vme_end - entry->vme_start),
3044 (vm_map_size_t)(end - entry->vme_end))) {
3045 /*
3046 * Coalesced the two objects - can extend
3047 * the previous map entry to include the
3048 * new range.
3049 */
3050 map->size += (end - entry->vme_end);
3051 assert(entry->vme_start < end);
3052 assert(VM_MAP_PAGE_ALIGNED(end,
3053 VM_MAP_PAGE_MASK(map)));
3054 if (__improbable(vm_debug_events)) {
3055 DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
3056 }
3057 entry->vme_end = end;
3058 if (map->holelistenabled) {
3059 vm_map_store_update_first_free(map, entry, TRUE);
3060 } else {
3061 vm_map_store_update_first_free(map, map->first_free, TRUE);
3062 }
3063 new_mapping_established = TRUE;
3064 RETURN(KERN_SUCCESS);
3065 }
3066 }
3067
3068 step = superpage_size ? SUPERPAGE_SIZE : (end - start);
3069 new_entry = NULL;
3070
3071 if (vmk_flags.vmkf_submap_adjust) {
3072 vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
3073 offset = start;
3074 }
3075
3076 for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3077 tmp2_end = tmp2_start + step;
3078 /*
3079 * Create a new entry
3080 *
3081 * XXX FBDP
3082 * The reserved "page zero" in each process's address space can
3083 * be arbitrarily large. Splitting it into separate objects and
3084 * therefore different VM map entries serves no purpose and just
3085 * slows down operations on the VM map, so let's not split the
3086 * allocation into chunks if the max protection is NONE. That
3087 * memory should never be accessible, so it will never get to the
3088 * default pager.
3089 */
3090 tmp_start = tmp2_start;
3091 if (!is_submap &&
3092 object == VM_OBJECT_NULL &&
3093 size > chunk_size &&
3094 max_protection != VM_PROT_NONE &&
3095 superpage_size == 0) {
3096 tmp_end = tmp_start + chunk_size;
3097 } else {
3098 tmp_end = tmp2_end;
3099 }
3100 do {
3101 if (!is_submap &&
3102 object != VM_OBJECT_NULL &&
3103 object->internal &&
3104 offset + (tmp_end - tmp_start) > object->vo_size) {
3105 // printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3106 DTRACE_VM5(vm_map_enter_overmap,
3107 vm_map_t, map,
3108 vm_map_address_t, tmp_start,
3109 vm_map_address_t, tmp_end,
3110 vm_object_offset_t, offset,
3111 vm_object_size_t, object->vo_size);
3112 }
3113 new_entry = vm_map_entry_insert(map,
3114 entry, tmp_start, tmp_end,
3115 object, offset, vmk_flags,
3116 needs_copy,
3117 cur_protection, max_protection,
3118 (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3119 VM_INHERIT_NONE : inheritance),
3120 clear_map_aligned);
3121
3122 assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias));
3123
3124 if (resilient_codesign) {
3125 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3126 if (!((cur_protection | max_protection) & reject_prot)) {
3127 new_entry->vme_resilient_codesign = TRUE;
3128 }
3129 }
3130
3131 if (resilient_media &&
3132 (object == VM_OBJECT_NULL ||
3133 object->internal)) {
3134 new_entry->vme_resilient_media = TRUE;
3135 }
3136
3137 assert(!new_entry->iokit_acct);
3138 if (!is_submap &&
3139 object != VM_OBJECT_NULL &&
3140 (object->purgable != VM_PURGABLE_DENY ||
3141 object->vo_ledger_tag)) {
3142 assert(new_entry->use_pmap);
3143 assert(!new_entry->iokit_acct);
3144 /*
3145 * Turn off pmap accounting since
3146 * purgeable (or tagged) objects have their
3147 * own ledgers.
3148 */
3149 new_entry->use_pmap = FALSE;
3150 } else if (!is_submap &&
3151 iokit_acct &&
3152 object != VM_OBJECT_NULL &&
3153 object->internal) {
3154 /* alternate accounting */
3155 assert(!new_entry->iokit_acct);
3156 assert(new_entry->use_pmap);
3157 new_entry->iokit_acct = TRUE;
3158 new_entry->use_pmap = FALSE;
3159 DTRACE_VM4(
3160 vm_map_iokit_mapped_region,
3161 vm_map_t, map,
3162 vm_map_offset_t, new_entry->vme_start,
3163 vm_map_offset_t, new_entry->vme_end,
3164 int, VME_ALIAS(new_entry));
3165 vm_map_iokit_mapped_region(
3166 map,
3167 (new_entry->vme_end -
3168 new_entry->vme_start));
3169 } else if (!is_submap) {
3170 assert(!new_entry->iokit_acct);
3171 assert(new_entry->use_pmap);
3172 }
3173
3174 if (is_submap) {
3175 vm_map_t submap;
3176 boolean_t submap_is_64bit;
3177 boolean_t use_pmap;
3178
3179 assert(new_entry->is_sub_map);
3180 assert(!new_entry->use_pmap);
3181 assert(!new_entry->iokit_acct);
3182 submap = (vm_map_t) object;
3183 submap_is_64bit = vm_map_is_64bit(submap);
3184 use_pmap = vmk_flags.vmkf_nested_pmap;
3185 #ifndef NO_NESTED_PMAP
3186 if (use_pmap && submap->pmap == NULL) {
3187 ledger_t ledger = map->pmap->ledger;
3188 /* we need a sub pmap to nest... */
3189 submap->pmap = pmap_create_options(ledger, 0,
3190 submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3191 if (submap->pmap == NULL) {
3192 /* let's proceed without nesting... */
3193 }
3194 #if defined(__arm64__)
3195 else {
3196 pmap_set_nested(submap->pmap);
3197 }
3198 #endif
3199 }
3200 if (use_pmap && submap->pmap != NULL) {
3201 if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3202 DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3203 kr = KERN_FAILURE;
3204 } else {
3205 kr = pmap_nest(map->pmap,
3206 submap->pmap,
3207 tmp_start,
3208 tmp_end - tmp_start);
3209 }
3210 if (kr != KERN_SUCCESS) {
3211 printf("vm_map_enter: "
3212 "pmap_nest(0x%llx,0x%llx) "
3213 "error 0x%x\n",
3214 (long long)tmp_start,
3215 (long long)tmp_end,
3216 kr);
3217 } else {
3218 /* we're now nested ! */
3219 new_entry->use_pmap = TRUE;
3220 pmap_empty = FALSE;
3221 }
3222 }
3223 #endif /* NO_NESTED_PMAP */
3224 }
3225 entry = new_entry;
3226
3227 if (superpage_size) {
3228 vm_page_t pages, m;
3229 vm_object_t sp_object;
3230 vm_object_offset_t sp_offset;
3231
3232 VME_OFFSET_SET(entry, 0);
3233
3234 /* allocate one superpage */
3235 kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3236 if (kr != KERN_SUCCESS) {
3237 /* deallocate whole range... */
3238 new_mapping_established = TRUE;
3239 /* ... but only up to "tmp_end" */
3240 size -= end - tmp_end;
3241 RETURN(kr);
3242 }
3243
3244 /* create one vm_object per superpage */
3245 sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3246 sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3247 sp_object->phys_contiguous = TRUE;
3248 sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3249 VME_OBJECT_SET(entry, sp_object, false, 0);
3250 assert(entry->use_pmap);
3251
3252 /* enter the base pages into the object */
3253 vm_object_lock(sp_object);
3254 for (sp_offset = 0;
3255 sp_offset < SUPERPAGE_SIZE;
3256 sp_offset += PAGE_SIZE) {
3257 m = pages;
3258 pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3259 pages = NEXT_PAGE(m);
3260 *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3261 vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3262 }
3263 vm_object_unlock(sp_object);
3264 }
3265 } while (tmp_end != tmp2_end &&
3266 (tmp_start = tmp_end) &&
3267 (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3268 tmp_end + chunk_size : tmp2_end));
3269 }
3270
3271 new_mapping_established = TRUE;
3272
3273 BailOut:
3274 assert(map_locked == TRUE);
3275
3276 /*
3277 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3278 * If we have identified and possibly established the new mapping(s),
3279 * make sure we did not go beyond the address space limit.
3280 */
3281 if (result == KERN_SUCCESS) {
3282 if (map->size_limit != RLIM_INFINITY &&
3283 map->size > map->size_limit) {
3284 /*
3285 * Establishing the requested mappings would exceed
3286 * the process's RLIMIT_AS limit: fail with
3287 * KERN_NO_SPACE.
3288 */
3289 result = KERN_NO_SPACE;
3290 printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3291 proc_selfpid(),
3292 (get_bsdtask_info(current_task())
3293 ? proc_name_address(get_bsdtask_info(current_task()))
3294 : "?"),
3295 __FUNCTION__,
3296 (uint64_t) map->size,
3297 (uint64_t) map->size_limit);
3298 DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3299 vm_map_size_t, map->size,
3300 uint64_t, map->size_limit);
3301 vm_map_enter_RLIMIT_AS_count++;
3302 } else if (map->data_limit != RLIM_INFINITY &&
3303 map->size > map->data_limit) {
3304 /*
3305 * Establishing the requested mappings would exceed
3306 * the process's RLIMIT_DATA limit: fail with
3307 * KERN_NO_SPACE.
3308 */
3309 result = KERN_NO_SPACE;
3310 printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3311 proc_selfpid(),
3312 (get_bsdtask_info(current_task())
3313 ? proc_name_address(get_bsdtask_info(current_task()))
3314 : "?"),
3315 __FUNCTION__,
3316 (uint64_t) map->size,
3317 (uint64_t) map->data_limit);
3318 DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3319 vm_map_size_t, map->size,
3320 uint64_t, map->data_limit);
3321 vm_map_enter_RLIMIT_DATA_count++;
3322 }
3323 }
3324
3325 if (result == KERN_SUCCESS) {
3326 vm_prot_t pager_prot;
3327 memory_object_t pager;
3328
3329 #if DEBUG
3330 if (pmap_empty &&
3331 !(vmk_flags.vmkf_no_pmap_check)) {
3332 assert(pmap_is_empty(map->pmap,
3333 *address,
3334 *address + size));
3335 }
3336 #endif /* DEBUG */
3337
3338 /*
3339 * For "named" VM objects, let the pager know that the
3340 * memory object is being mapped. Some pagers need to keep
3341 * track of this, to know when they can reclaim the memory
3342 * object, for example.
3343 * VM calls memory_object_map() for each mapping (specifying
3344 * the protection of each mapping) and calls
3345 * memory_object_last_unmap() when all the mappings are gone.
3346 */
3347 pager_prot = max_protection;
3348 if (needs_copy) {
3349 /*
3350 * Copy-On-Write mapping: won't modify
3351 * the memory object.
3352 */
3353 pager_prot &= ~VM_PROT_WRITE;
3354 }
3355 if (!is_submap &&
3356 object != VM_OBJECT_NULL &&
3357 object->named &&
3358 object->pager != MEMORY_OBJECT_NULL) {
3359 vm_object_lock(object);
3360 pager = object->pager;
3361 if (object->named &&
3362 pager != MEMORY_OBJECT_NULL) {
3363 assert(object->pager_ready);
3364 vm_object_mapping_wait(object, THREAD_UNINT);
3365 vm_object_mapping_begin(object);
3366 vm_object_unlock(object);
3367
3368 kr = memory_object_map(pager, pager_prot);
3369 assert(kr == KERN_SUCCESS);
3370
3371 vm_object_lock(object);
3372 vm_object_mapping_end(object);
3373 }
3374 vm_object_unlock(object);
3375 }
3376 }
3377
3378 assert(map_locked == TRUE);
3379
3380 if (new_mapping_established) {
3381 /*
3382 * If we release the map lock for any reason below,
3383 * another thread could deallocate our new mapping,
3384 * releasing the caller's reference on "caller_object",
3385 * which was transferred to the mapping.
3386 * If this was the only reference, the object could be
3387 * destroyed.
3388 *
3389 * We need to take an extra reference on "caller_object"
3390 * to keep it alive if we need to return the caller's
3391 * reference to the caller in case of failure.
3392 */
3393 if (is_submap) {
3394 vm_map_reference((vm_map_t)caller_object);
3395 } else {
3396 vm_object_reference(caller_object);
3397 }
3398 }
3399
3400 if (!keep_map_locked) {
3401 vm_map_unlock(map);
3402 map_locked = FALSE;
3403 entry = VM_MAP_ENTRY_NULL;
3404 new_entry = VM_MAP_ENTRY_NULL;
3405 }
3406
3407 /*
3408 * We can't hold the map lock if we enter this block.
3409 */
3410
3411 if (result == KERN_SUCCESS) {
3412 /* Wire down the new entry if the user
3413 * requested all new map entries be wired.
3414 */
3415 if ((map->wiring_required) || (superpage_size)) {
3416 assert(!keep_map_locked);
3417 pmap_empty = FALSE; /* pmap won't be empty */
3418 kr = vm_map_wire_kernel(map, start, end,
3419 cur_protection, VM_KERN_MEMORY_MLOCK,
3420 TRUE);
3421 result = kr;
3422 }
3423
3424 }
3425
3426 if (result != KERN_SUCCESS) {
3427 if (new_mapping_established) {
3428 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3429
3430 /*
3431 * We have to get rid of the new mappings since we
3432 * won't make them available to the user.
3433 * Try and do that atomically, to minimize the risk
3434 * that someone else create new mappings that range.
3435 */
3436 if (!map_locked) {
3437 vm_map_lock(map);
3438 map_locked = TRUE;
3439 }
3440 remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3441 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3442 if (permanent) {
3443 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3444 }
3445 (void) vm_map_delete(map,
3446 *address, *address + size,
3447 remove_flags,
3448 KMEM_GUARD_NONE, &zap_new_list);
3449 }
3450
3451 if (vm_map_zap_first_entry(&zap_old_list)) {
3452 vm_map_entry_t entry1, entry2;
3453
3454 /*
3455 * The new mapping failed. Attempt to restore
3456 * the old mappings, saved in the "zap_old_map".
3457 */
3458 if (!map_locked) {
3459 vm_map_lock(map);
3460 map_locked = TRUE;
3461 }
3462
3463 /* first check if the coast is still clear */
3464 start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3465 end = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3466
3467 if (vm_map_lookup_entry(map, start, &entry1) ||
3468 vm_map_lookup_entry(map, end, &entry2) ||
3469 entry1 != entry2) {
3470 /*
3471 * Part of that range has already been
3472 * re-mapped: we can't restore the old
3473 * mappings...
3474 */
3475 vm_map_enter_restore_failures++;
3476 } else {
3477 /*
3478 * Transfer the saved map entries from
3479 * "zap_old_map" to the original "map",
3480 * inserting them all after "entry1".
3481 */
3482 while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3483 vm_map_size_t entry_size;
3484
3485 entry_size = (entry2->vme_end -
3486 entry2->vme_start);
3487 vm_map_store_entry_link(map, entry1, entry2,
3488 VM_MAP_KERNEL_FLAGS_NONE);
3489 map->size += entry_size;
3490 entry1 = entry2;
3491 }
3492 if (map->wiring_required) {
3493 /*
3494 * XXX TODO: we should rewire the
3495 * old pages here...
3496 */
3497 }
3498 vm_map_enter_restore_successes++;
3499 }
3500 }
3501 }
3502
3503 /*
3504 * The caller is responsible for releasing the lock if it requested to
3505 * keep the map locked.
3506 */
3507 if (map_locked && !keep_map_locked) {
3508 vm_map_unlock(map);
3509 }
3510
3511 vm_map_zap_dispose(&zap_old_list);
3512 vm_map_zap_dispose(&zap_new_list);
3513
3514 if (new_mapping_established) {
3515 /*
3516 * The caller had a reference on "caller_object" and we
3517 * transferred that reference to the mapping.
3518 * We also took an extra reference on "caller_object" to keep
3519 * it alive while the map was unlocked.
3520 */
3521 if (result == KERN_SUCCESS) {
3522 /*
3523 * On success, the caller's reference on the object gets
3524 * tranferred to the mapping.
3525 * Release our extra reference.
3526 */
3527 if (is_submap) {
3528 vm_map_deallocate((vm_map_t)caller_object);
3529 } else {
3530 vm_object_deallocate(caller_object);
3531 }
3532 } else {
3533 /*
3534 * On error, the caller expects to still have a
3535 * reference on the object it gave us.
3536 * Let's use our extra reference for that.
3537 */
3538 }
3539 }
3540
3541 return result;
3542
3543 #undef RETURN
3544 }
3545
3546 #if __arm64__
3547 extern const struct memory_object_pager_ops fourk_pager_ops;
3548 kern_return_t
vm_map_enter_fourk(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)3549 vm_map_enter_fourk(
3550 vm_map_t map,
3551 vm_map_offset_t *address, /* IN/OUT */
3552 vm_map_size_t size,
3553 vm_map_offset_t mask,
3554 vm_map_kernel_flags_t vmk_flags,
3555 vm_object_t object,
3556 vm_object_offset_t offset,
3557 boolean_t needs_copy,
3558 vm_prot_t cur_protection,
3559 vm_prot_t max_protection,
3560 vm_inherit_t inheritance)
3561 {
3562 vm_map_entry_t entry, new_entry;
3563 vm_map_offset_t start, fourk_start;
3564 vm_map_offset_t end, fourk_end;
3565 vm_map_size_t fourk_size;
3566 kern_return_t result = KERN_SUCCESS;
3567 boolean_t map_locked = FALSE;
3568 boolean_t pmap_empty = TRUE;
3569 boolean_t new_mapping_established = FALSE;
3570 const bool keep_map_locked = vmk_flags.vmkf_keep_map_locked;
3571 const bool anywhere = !vmk_flags.vmf_fixed;
3572 const bool purgable = vmk_flags.vmf_purgeable;
3573 const bool overwrite = vmk_flags.vmf_overwrite;
3574 const bool is_submap = vmk_flags.vmkf_submap;
3575 const bool entry_for_jit = vmk_flags.vmkf_map_jit;
3576 const unsigned int superpage_size = vmk_flags.vmf_superpage_size;
3577 vm_map_offset_t effective_min_offset, effective_max_offset;
3578 kern_return_t kr;
3579 boolean_t clear_map_aligned = FALSE;
3580 memory_object_t fourk_mem_obj;
3581 vm_object_t fourk_object;
3582 vm_map_offset_t fourk_pager_offset;
3583 int fourk_pager_index_start, fourk_pager_index_num;
3584 int cur_idx;
3585 boolean_t fourk_copy;
3586 vm_object_t copy_object;
3587 vm_object_offset_t copy_offset;
3588 VM_MAP_ZAP_DECLARE(zap_list);
3589
3590 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
3591 panic("%s:%d", __FUNCTION__, __LINE__);
3592 }
3593 fourk_mem_obj = MEMORY_OBJECT_NULL;
3594 fourk_object = VM_OBJECT_NULL;
3595
3596 if (superpage_size) {
3597 return KERN_NOT_SUPPORTED;
3598 }
3599
3600 if ((cur_protection & VM_PROT_WRITE) &&
3601 (cur_protection & VM_PROT_EXECUTE) &&
3602 #if XNU_TARGET_OS_OSX
3603 map->pmap != kernel_pmap &&
3604 (vm_map_cs_enforcement(map)
3605 #if __arm64__
3606 || !VM_MAP_IS_EXOTIC(map)
3607 #endif /* __arm64__ */
3608 ) &&
3609 #endif /* XNU_TARGET_OS_OSX */
3610 #if CODE_SIGNING_MONITOR
3611 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
3612 #endif
3613 !entry_for_jit) {
3614 DTRACE_VM3(cs_wx,
3615 uint64_t, 0,
3616 uint64_t, 0,
3617 vm_prot_t, cur_protection);
3618 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. "
3619 "turning off execute\n",
3620 proc_selfpid(),
3621 (get_bsdtask_info(current_task())
3622 ? proc_name_address(get_bsdtask_info(current_task()))
3623 : "?"),
3624 __FUNCTION__);
3625 cur_protection &= ~VM_PROT_EXECUTE;
3626 }
3627
3628 /*
3629 * If the task has requested executable lockdown,
3630 * deny any new executable mapping.
3631 */
3632 if (map->map_disallow_new_exec == TRUE) {
3633 if (cur_protection & VM_PROT_EXECUTE) {
3634 return KERN_PROTECTION_FAILURE;
3635 }
3636 }
3637
3638 if (is_submap) {
3639 return KERN_NOT_SUPPORTED;
3640 }
3641 if (vmk_flags.vmkf_already) {
3642 return KERN_NOT_SUPPORTED;
3643 }
3644 if (purgable || entry_for_jit) {
3645 return KERN_NOT_SUPPORTED;
3646 }
3647
3648 effective_min_offset = map->min_offset;
3649
3650 if (vmk_flags.vmkf_beyond_max) {
3651 return KERN_NOT_SUPPORTED;
3652 } else {
3653 effective_max_offset = map->max_offset;
3654 }
3655
3656 if (size == 0 ||
3657 (offset & FOURK_PAGE_MASK) != 0) {
3658 *address = 0;
3659 return KERN_INVALID_ARGUMENT;
3660 }
3661
3662 #define RETURN(value) { result = value; goto BailOut; }
3663
3664 assert(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK));
3665 assert(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK));
3666
3667 if (!anywhere && overwrite) {
3668 return KERN_NOT_SUPPORTED;
3669 }
3670
3671 fourk_start = *address;
3672 fourk_size = size;
3673 fourk_end = fourk_start + fourk_size;
3674
3675 start = vm_map_trunc_page(*address, VM_MAP_PAGE_MASK(map));
3676 end = vm_map_round_page(fourk_end, VM_MAP_PAGE_MASK(map));
3677 size = end - start;
3678
3679 if (anywhere) {
3680 return KERN_NOT_SUPPORTED;
3681 } else {
3682 /*
3683 * Verify that:
3684 * the address doesn't itself violate
3685 * the mask requirement.
3686 */
3687
3688 vm_map_lock(map);
3689 map_locked = TRUE;
3690 if ((start & mask) != 0) {
3691 RETURN(KERN_NO_SPACE);
3692 }
3693
3694 /*
3695 * ... the address is within bounds
3696 */
3697
3698 end = start + size;
3699
3700 if ((start < effective_min_offset) ||
3701 (end > effective_max_offset) ||
3702 (start >= end)) {
3703 RETURN(KERN_INVALID_ADDRESS);
3704 }
3705
3706 /*
3707 * ... the starting address isn't allocated
3708 */
3709 if (vm_map_lookup_entry(map, start, &entry)) {
3710 vm_object_t cur_object, shadow_object;
3711
3712 /*
3713 * We might already some 4K mappings
3714 * in a 16K page here.
3715 */
3716
3717 if (entry->vme_end - entry->vme_start
3718 != SIXTEENK_PAGE_SIZE) {
3719 RETURN(KERN_NO_SPACE);
3720 }
3721 if (entry->is_sub_map) {
3722 RETURN(KERN_NO_SPACE);
3723 }
3724 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
3725 RETURN(KERN_NO_SPACE);
3726 }
3727
3728 /* go all the way down the shadow chain */
3729 cur_object = VME_OBJECT(entry);
3730 vm_object_lock(cur_object);
3731 while (cur_object->shadow != VM_OBJECT_NULL) {
3732 shadow_object = cur_object->shadow;
3733 vm_object_lock(shadow_object);
3734 vm_object_unlock(cur_object);
3735 cur_object = shadow_object;
3736 shadow_object = VM_OBJECT_NULL;
3737 }
3738 if (cur_object->internal ||
3739 cur_object->pager == NULL) {
3740 vm_object_unlock(cur_object);
3741 RETURN(KERN_NO_SPACE);
3742 }
3743 if (cur_object->pager->mo_pager_ops
3744 != &fourk_pager_ops) {
3745 vm_object_unlock(cur_object);
3746 RETURN(KERN_NO_SPACE);
3747 }
3748 fourk_object = cur_object;
3749 fourk_mem_obj = fourk_object->pager;
3750
3751 /* keep the "4K" object alive */
3752 vm_object_reference_locked(fourk_object);
3753 memory_object_reference(fourk_mem_obj);
3754 vm_object_unlock(fourk_object);
3755
3756 /* merge permissions */
3757 entry->protection |= cur_protection;
3758 entry->max_protection |= max_protection;
3759
3760 if ((entry->protection & VM_PROT_WRITE) &&
3761 (entry->protection & VM_PROT_ALLEXEC) &&
3762 fourk_binary_compatibility_unsafe &&
3763 fourk_binary_compatibility_allow_wx) {
3764 /* write+execute: need to be "jit" */
3765 entry->used_for_jit = TRUE;
3766 }
3767 goto map_in_fourk_pager;
3768 }
3769
3770 /*
3771 * ... the next region doesn't overlap the
3772 * end point.
3773 */
3774
3775 if ((entry->vme_next != vm_map_to_entry(map)) &&
3776 (entry->vme_next->vme_start < end)) {
3777 RETURN(KERN_NO_SPACE);
3778 }
3779 }
3780
3781 /*
3782 * At this point,
3783 * "start" and "end" should define the endpoints of the
3784 * available new range, and
3785 * "entry" should refer to the region before the new
3786 * range, and
3787 *
3788 * the map should be locked.
3789 */
3790
3791 /* create a new "4K" pager */
3792 fourk_mem_obj = fourk_pager_create();
3793 fourk_object = fourk_pager_to_vm_object(fourk_mem_obj);
3794 assert(fourk_object);
3795
3796 /* keep the "4" object alive */
3797 vm_object_reference(fourk_object);
3798
3799 /* create a "copy" object, to map the "4K" object copy-on-write */
3800 fourk_copy = TRUE;
3801 result = vm_object_copy_strategically(fourk_object,
3802 0,
3803 end - start,
3804 false, /* forking */
3805 ©_object,
3806 ©_offset,
3807 &fourk_copy);
3808 assert(result == KERN_SUCCESS);
3809 assert(copy_object != VM_OBJECT_NULL);
3810 assert(copy_offset == 0);
3811
3812 /* map the "4K" pager's copy object */
3813 new_entry = vm_map_entry_insert(map,
3814 entry,
3815 vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)),
3816 vm_map_round_page(end, VM_MAP_PAGE_MASK(map)),
3817 copy_object,
3818 0, /* offset */
3819 vmk_flags,
3820 FALSE, /* needs_copy */
3821 cur_protection, max_protection,
3822 (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3823 VM_INHERIT_NONE : inheritance),
3824 clear_map_aligned);
3825 entry = new_entry;
3826
3827 #if VM_MAP_DEBUG_FOURK
3828 if (vm_map_debug_fourk) {
3829 printf("FOURK_PAGER: map %p [0x%llx:0x%llx] new pager %p\n",
3830 map,
3831 (uint64_t) entry->vme_start,
3832 (uint64_t) entry->vme_end,
3833 fourk_mem_obj);
3834 }
3835 #endif /* VM_MAP_DEBUG_FOURK */
3836
3837 new_mapping_established = TRUE;
3838
3839 map_in_fourk_pager:
3840 /* "map" the original "object" where it belongs in the "4K" pager */
3841 fourk_pager_offset = (fourk_start & SIXTEENK_PAGE_MASK);
3842 fourk_pager_index_start = (int) (fourk_pager_offset / FOURK_PAGE_SIZE);
3843 if (fourk_size > SIXTEENK_PAGE_SIZE) {
3844 fourk_pager_index_num = 4;
3845 } else {
3846 fourk_pager_index_num = (int) (fourk_size / FOURK_PAGE_SIZE);
3847 }
3848 if (fourk_pager_index_start + fourk_pager_index_num > 4) {
3849 fourk_pager_index_num = 4 - fourk_pager_index_start;
3850 }
3851 for (cur_idx = 0;
3852 cur_idx < fourk_pager_index_num;
3853 cur_idx++) {
3854 vm_object_t old_object;
3855 vm_object_offset_t old_offset;
3856
3857 kr = fourk_pager_populate(fourk_mem_obj,
3858 TRUE, /* overwrite */
3859 fourk_pager_index_start + cur_idx,
3860 object,
3861 (object
3862 ? (offset +
3863 (cur_idx * FOURK_PAGE_SIZE))
3864 : 0),
3865 &old_object,
3866 &old_offset);
3867 #if VM_MAP_DEBUG_FOURK
3868 if (vm_map_debug_fourk) {
3869 if (old_object == (vm_object_t) -1 &&
3870 old_offset == (vm_object_offset_t) -1) {
3871 printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3872 "pager [%p:0x%llx] "
3873 "populate[%d] "
3874 "[object:%p,offset:0x%llx]\n",
3875 map,
3876 (uint64_t) entry->vme_start,
3877 (uint64_t) entry->vme_end,
3878 fourk_mem_obj,
3879 VME_OFFSET(entry),
3880 fourk_pager_index_start + cur_idx,
3881 object,
3882 (object
3883 ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3884 : 0));
3885 } else {
3886 printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3887 "pager [%p:0x%llx] "
3888 "populate[%d] [object:%p,offset:0x%llx] "
3889 "old [%p:0x%llx]\n",
3890 map,
3891 (uint64_t) entry->vme_start,
3892 (uint64_t) entry->vme_end,
3893 fourk_mem_obj,
3894 VME_OFFSET(entry),
3895 fourk_pager_index_start + cur_idx,
3896 object,
3897 (object
3898 ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3899 : 0),
3900 old_object,
3901 old_offset);
3902 }
3903 }
3904 #endif /* VM_MAP_DEBUG_FOURK */
3905
3906 assert(kr == KERN_SUCCESS);
3907 if (object != old_object &&
3908 object != VM_OBJECT_NULL &&
3909 object != (vm_object_t) -1) {
3910 vm_object_reference(object);
3911 }
3912 if (object != old_object &&
3913 old_object != VM_OBJECT_NULL &&
3914 old_object != (vm_object_t) -1) {
3915 vm_object_deallocate(old_object);
3916 }
3917 }
3918
3919 BailOut:
3920 assert(map_locked == TRUE);
3921
3922 if (result == KERN_SUCCESS) {
3923 vm_prot_t pager_prot;
3924 memory_object_t pager;
3925
3926 #if DEBUG
3927 if (pmap_empty &&
3928 !(vmk_flags.vmkf_no_pmap_check)) {
3929 assert(pmap_is_empty(map->pmap,
3930 *address,
3931 *address + size));
3932 }
3933 #endif /* DEBUG */
3934
3935 /*
3936 * For "named" VM objects, let the pager know that the
3937 * memory object is being mapped. Some pagers need to keep
3938 * track of this, to know when they can reclaim the memory
3939 * object, for example.
3940 * VM calls memory_object_map() for each mapping (specifying
3941 * the protection of each mapping) and calls
3942 * memory_object_last_unmap() when all the mappings are gone.
3943 */
3944 pager_prot = max_protection;
3945 if (needs_copy) {
3946 /*
3947 * Copy-On-Write mapping: won't modify
3948 * the memory object.
3949 */
3950 pager_prot &= ~VM_PROT_WRITE;
3951 }
3952 if (!is_submap &&
3953 object != VM_OBJECT_NULL &&
3954 object->named &&
3955 object->pager != MEMORY_OBJECT_NULL) {
3956 vm_object_lock(object);
3957 pager = object->pager;
3958 if (object->named &&
3959 pager != MEMORY_OBJECT_NULL) {
3960 assert(object->pager_ready);
3961 vm_object_mapping_wait(object, THREAD_UNINT);
3962 vm_object_mapping_begin(object);
3963 vm_object_unlock(object);
3964
3965 kr = memory_object_map(pager, pager_prot);
3966 assert(kr == KERN_SUCCESS);
3967
3968 vm_object_lock(object);
3969 vm_object_mapping_end(object);
3970 }
3971 vm_object_unlock(object);
3972 }
3973 if (!is_submap &&
3974 fourk_object != VM_OBJECT_NULL &&
3975 fourk_object->named &&
3976 fourk_object->pager != MEMORY_OBJECT_NULL) {
3977 vm_object_lock(fourk_object);
3978 pager = fourk_object->pager;
3979 if (fourk_object->named &&
3980 pager != MEMORY_OBJECT_NULL) {
3981 assert(fourk_object->pager_ready);
3982 vm_object_mapping_wait(fourk_object,
3983 THREAD_UNINT);
3984 vm_object_mapping_begin(fourk_object);
3985 vm_object_unlock(fourk_object);
3986
3987 kr = memory_object_map(pager, VM_PROT_READ);
3988 assert(kr == KERN_SUCCESS);
3989
3990 vm_object_lock(fourk_object);
3991 vm_object_mapping_end(fourk_object);
3992 }
3993 vm_object_unlock(fourk_object);
3994 }
3995 }
3996
3997 if (fourk_object != VM_OBJECT_NULL) {
3998 vm_object_deallocate(fourk_object);
3999 fourk_object = VM_OBJECT_NULL;
4000 memory_object_deallocate(fourk_mem_obj);
4001 fourk_mem_obj = MEMORY_OBJECT_NULL;
4002 }
4003
4004 assert(map_locked == TRUE);
4005
4006 if (!keep_map_locked) {
4007 vm_map_unlock(map);
4008 map_locked = FALSE;
4009 }
4010
4011 /*
4012 * We can't hold the map lock if we enter this block.
4013 */
4014
4015 if (result == KERN_SUCCESS) {
4016 /* Wire down the new entry if the user
4017 * requested all new map entries be wired.
4018 */
4019 if ((map->wiring_required) || (superpage_size)) {
4020 assert(!keep_map_locked);
4021 pmap_empty = FALSE; /* pmap won't be empty */
4022 kr = vm_map_wire_kernel(map, start, end,
4023 new_entry->protection, VM_KERN_MEMORY_MLOCK,
4024 TRUE);
4025 result = kr;
4026 }
4027
4028 }
4029
4030 if (result != KERN_SUCCESS) {
4031 if (new_mapping_established) {
4032 /*
4033 * We have to get rid of the new mappings since we
4034 * won't make them available to the user.
4035 * Try and do that atomically, to minimize the risk
4036 * that someone else create new mappings that range.
4037 */
4038
4039 if (!map_locked) {
4040 vm_map_lock(map);
4041 map_locked = TRUE;
4042 }
4043 (void)vm_map_delete(map, *address, *address + size,
4044 VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_NO_YIELD,
4045 KMEM_GUARD_NONE, &zap_list);
4046 }
4047 }
4048
4049 /*
4050 * The caller is responsible for releasing the lock if it requested to
4051 * keep the map locked.
4052 */
4053 if (map_locked && !keep_map_locked) {
4054 vm_map_unlock(map);
4055 }
4056
4057 vm_map_zap_dispose(&zap_list);
4058
4059 return result;
4060
4061 #undef RETURN
4062 }
4063 #endif /* __arm64__ */
4064
4065 /*
4066 * Counters for the prefault optimization.
4067 */
4068 int64_t vm_prefault_nb_pages = 0;
4069 int64_t vm_prefault_nb_bailout = 0;
4070
4071 static kern_return_t
vm_map_enter_mem_object_helper(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_t offset,boolean_t copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance,upl_page_list_ptr_t page_list,unsigned int page_list_count)4072 vm_map_enter_mem_object_helper(
4073 vm_map_t target_map,
4074 vm_map_offset_t *address,
4075 vm_map_size_t initial_size,
4076 vm_map_offset_t mask,
4077 vm_map_kernel_flags_t vmk_flags,
4078 ipc_port_t port,
4079 vm_object_offset_t offset,
4080 boolean_t copy,
4081 vm_prot_t cur_protection,
4082 vm_prot_t max_protection,
4083 vm_inherit_t inheritance,
4084 upl_page_list_ptr_t page_list,
4085 unsigned int page_list_count)
4086 {
4087 vm_map_address_t map_addr;
4088 vm_map_size_t map_size;
4089 vm_object_t object;
4090 vm_object_size_t size;
4091 kern_return_t result;
4092 boolean_t mask_cur_protection, mask_max_protection;
4093 boolean_t kernel_prefault, try_prefault = (page_list_count != 0);
4094 vm_map_offset_t offset_in_mapping = 0;
4095 #if __arm64__
4096 boolean_t fourk = vmk_flags.vmkf_fourk;
4097 #endif /* __arm64__ */
4098
4099 if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4100 /* XXX TODO4K prefaulting depends on page size... */
4101 try_prefault = FALSE;
4102 }
4103
4104 assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
4105 vm_map_kernel_flags_update_range_id(&vmk_flags, target_map);
4106
4107 mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4108 mask_max_protection = max_protection & VM_PROT_IS_MASK;
4109 cur_protection &= ~VM_PROT_IS_MASK;
4110 max_protection &= ~VM_PROT_IS_MASK;
4111
4112 /*
4113 * Check arguments for validity
4114 */
4115 if ((target_map == VM_MAP_NULL) ||
4116 (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4117 (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4118 (inheritance > VM_INHERIT_LAST_VALID) ||
4119 (try_prefault && (copy || !page_list)) ||
4120 initial_size == 0) {
4121 return KERN_INVALID_ARGUMENT;
4122 }
4123
4124 #if __arm64__
4125 if (cur_protection & VM_PROT_EXECUTE) {
4126 cur_protection |= VM_PROT_READ;
4127 }
4128
4129 if (fourk && VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4130 /* no "fourk" if map is using a sub-page page size */
4131 fourk = FALSE;
4132 }
4133 if (fourk) {
4134 map_addr = vm_map_trunc_page(*address, FOURK_PAGE_MASK);
4135 map_size = vm_map_round_page(initial_size, FOURK_PAGE_MASK);
4136 } else
4137 #endif /* __arm64__ */
4138 {
4139 map_addr = vm_map_trunc_page(*address,
4140 VM_MAP_PAGE_MASK(target_map));
4141 map_size = vm_map_round_page(initial_size,
4142 VM_MAP_PAGE_MASK(target_map));
4143 }
4144 if (map_size == 0) {
4145 return KERN_INVALID_ARGUMENT;
4146 }
4147 size = vm_object_round_page(initial_size);
4148
4149 /*
4150 * Find the vm object (if any) corresponding to this port.
4151 */
4152 if (!IP_VALID(port)) {
4153 object = VM_OBJECT_NULL;
4154 offset = 0;
4155 copy = FALSE;
4156 } else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4157 vm_named_entry_t named_entry;
4158 vm_object_offset_t data_offset;
4159
4160 named_entry = mach_memory_entry_from_port(port);
4161
4162 if (vmk_flags.vmf_return_data_addr ||
4163 vmk_flags.vmf_return_4k_data_addr) {
4164 data_offset = named_entry->data_offset;
4165 offset += named_entry->data_offset;
4166 } else {
4167 data_offset = 0;
4168 }
4169
4170 /* a few checks to make sure user is obeying rules */
4171 if (mask_max_protection) {
4172 max_protection &= named_entry->protection;
4173 }
4174 if (mask_cur_protection) {
4175 cur_protection &= named_entry->protection;
4176 }
4177 if ((named_entry->protection & max_protection) !=
4178 max_protection) {
4179 return KERN_INVALID_RIGHT;
4180 }
4181 if ((named_entry->protection & cur_protection) !=
4182 cur_protection) {
4183 return KERN_INVALID_RIGHT;
4184 }
4185 if (offset + size <= offset) {
4186 /* overflow */
4187 return KERN_INVALID_ARGUMENT;
4188 }
4189 if (named_entry->size < (offset + initial_size)) {
4190 return KERN_INVALID_ARGUMENT;
4191 }
4192
4193 if (named_entry->is_copy) {
4194 /* for a vm_map_copy, we can only map it whole */
4195 if ((size != named_entry->size) &&
4196 (vm_map_round_page(size,
4197 VM_MAP_PAGE_MASK(target_map)) ==
4198 named_entry->size)) {
4199 /* XXX FBDP use the rounded size... */
4200 size = vm_map_round_page(
4201 size,
4202 VM_MAP_PAGE_MASK(target_map));
4203 }
4204 }
4205
4206 /* the callers parameter offset is defined to be the */
4207 /* offset from beginning of named entry offset in object */
4208 offset = offset + named_entry->offset;
4209
4210 if (!VM_MAP_PAGE_ALIGNED(size,
4211 VM_MAP_PAGE_MASK(target_map))) {
4212 /*
4213 * Let's not map more than requested;
4214 * vm_map_enter() will handle this "not map-aligned"
4215 * case.
4216 */
4217 map_size = size;
4218 }
4219
4220 named_entry_lock(named_entry);
4221 if (named_entry->is_sub_map) {
4222 vm_map_t submap;
4223
4224 if (vmk_flags.vmf_return_data_addr ||
4225 vmk_flags.vmf_return_4k_data_addr) {
4226 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4227 }
4228
4229 submap = named_entry->backing.map;
4230 vm_map_reference(submap);
4231 named_entry_unlock(named_entry);
4232
4233 vmk_flags.vmkf_submap = TRUE;
4234
4235 result = vm_map_enter(target_map,
4236 &map_addr,
4237 map_size,
4238 mask,
4239 vmk_flags,
4240 (vm_object_t)(uintptr_t) submap,
4241 offset,
4242 copy,
4243 cur_protection,
4244 max_protection,
4245 inheritance);
4246 if (result != KERN_SUCCESS) {
4247 vm_map_deallocate(submap);
4248 } else {
4249 /*
4250 * No need to lock "submap" just to check its
4251 * "mapped" flag: that flag is never reset
4252 * once it's been set and if we race, we'll
4253 * just end up setting it twice, which is OK.
4254 */
4255 if (submap->mapped_in_other_pmaps == FALSE &&
4256 vm_map_pmap(submap) != PMAP_NULL &&
4257 vm_map_pmap(submap) !=
4258 vm_map_pmap(target_map)) {
4259 /*
4260 * This submap is being mapped in a map
4261 * that uses a different pmap.
4262 * Set its "mapped_in_other_pmaps" flag
4263 * to indicate that we now need to
4264 * remove mappings from all pmaps rather
4265 * than just the submap's pmap.
4266 */
4267 vm_map_lock(submap);
4268 submap->mapped_in_other_pmaps = TRUE;
4269 vm_map_unlock(submap);
4270 }
4271 *address = map_addr;
4272 }
4273 return result;
4274 } else if (named_entry->is_copy) {
4275 kern_return_t kr;
4276 vm_map_copy_t copy_map;
4277 vm_map_entry_t copy_entry;
4278 vm_map_offset_t copy_addr;
4279 vm_map_copy_t target_copy_map;
4280 vm_map_offset_t overmap_start, overmap_end;
4281 vm_map_offset_t trimmed_start;
4282 vm_map_size_t target_size;
4283
4284 if (!vm_map_kernel_flags_check_vmflags(vmk_flags,
4285 (VM_FLAGS_FIXED |
4286 VM_FLAGS_ANYWHERE |
4287 VM_FLAGS_OVERWRITE |
4288 VM_FLAGS_RETURN_4K_DATA_ADDR |
4289 VM_FLAGS_RETURN_DATA_ADDR))) {
4290 named_entry_unlock(named_entry);
4291 return KERN_INVALID_ARGUMENT;
4292 }
4293
4294 copy_map = named_entry->backing.copy;
4295 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4296 if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4297 /* unsupported type; should not happen */
4298 printf("vm_map_enter_mem_object: "
4299 "memory_entry->backing.copy "
4300 "unsupported type 0x%x\n",
4301 copy_map->type);
4302 named_entry_unlock(named_entry);
4303 return KERN_INVALID_ARGUMENT;
4304 }
4305
4306 if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4307 DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, offset, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4308 }
4309
4310 if (vmk_flags.vmf_return_data_addr ||
4311 vmk_flags.vmf_return_4k_data_addr) {
4312 offset_in_mapping = offset & VM_MAP_PAGE_MASK(target_map);
4313 if (vmk_flags.vmf_return_4k_data_addr) {
4314 offset_in_mapping &= ~((signed)(0xFFF));
4315 }
4316 }
4317
4318 target_copy_map = VM_MAP_COPY_NULL;
4319 target_size = copy_map->size;
4320 overmap_start = 0;
4321 overmap_end = 0;
4322 trimmed_start = 0;
4323 if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4324 DEBUG4K_ADJUST("adjusting...\n");
4325 kr = vm_map_copy_adjust_to_target(
4326 copy_map,
4327 offset /* includes data_offset */,
4328 initial_size,
4329 target_map,
4330 copy,
4331 &target_copy_map,
4332 &overmap_start,
4333 &overmap_end,
4334 &trimmed_start);
4335 if (kr != KERN_SUCCESS) {
4336 named_entry_unlock(named_entry);
4337 return kr;
4338 }
4339 target_size = target_copy_map->size;
4340 if (trimmed_start >= data_offset) {
4341 data_offset = offset & VM_MAP_PAGE_MASK(target_map);
4342 } else {
4343 data_offset -= trimmed_start;
4344 }
4345 } else {
4346 /*
4347 * Assert that the vm_map_copy is coming from the right
4348 * zone and hasn't been forged
4349 */
4350 vm_map_copy_require(copy_map);
4351 target_copy_map = copy_map;
4352 }
4353
4354 vm_map_kernel_flags_t rsv_flags = vmk_flags;
4355
4356 vm_map_kernel_flags_and_vmflags(&rsv_flags,
4357 (VM_FLAGS_FIXED |
4358 VM_FLAGS_ANYWHERE |
4359 VM_FLAGS_OVERWRITE |
4360 VM_FLAGS_RETURN_4K_DATA_ADDR |
4361 VM_FLAGS_RETURN_DATA_ADDR));
4362
4363 /* reserve a contiguous range */
4364 kr = vm_map_enter(target_map,
4365 &map_addr,
4366 vm_map_round_page(target_size, VM_MAP_PAGE_MASK(target_map)),
4367 mask,
4368 rsv_flags,
4369 VM_OBJECT_NULL,
4370 0,
4371 FALSE, /* copy */
4372 cur_protection,
4373 max_protection,
4374 inheritance);
4375 if (kr != KERN_SUCCESS) {
4376 DEBUG4K_ERROR("kr 0x%x\n", kr);
4377 if (target_copy_map != copy_map) {
4378 vm_map_copy_discard(target_copy_map);
4379 target_copy_map = VM_MAP_COPY_NULL;
4380 }
4381 named_entry_unlock(named_entry);
4382 return kr;
4383 }
4384
4385 copy_addr = map_addr;
4386
4387 for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4388 copy_entry != vm_map_copy_to_entry(target_copy_map);
4389 copy_entry = copy_entry->vme_next) {
4390 vm_map_t copy_submap = VM_MAP_NULL;
4391 vm_object_t copy_object = VM_OBJECT_NULL;
4392 vm_map_size_t copy_size;
4393 vm_object_offset_t copy_offset;
4394 boolean_t do_copy = false;
4395
4396 if (copy_entry->is_sub_map) {
4397 copy_submap = VME_SUBMAP(copy_entry);
4398 copy_object = (vm_object_t)copy_submap;
4399 } else {
4400 copy_object = VME_OBJECT(copy_entry);
4401 }
4402 copy_offset = VME_OFFSET(copy_entry);
4403 copy_size = (copy_entry->vme_end -
4404 copy_entry->vme_start);
4405
4406 /* sanity check */
4407 if ((copy_addr + copy_size) >
4408 (map_addr +
4409 overmap_start + overmap_end +
4410 named_entry->size /* XXX full size */)) {
4411 /* over-mapping too much !? */
4412 kr = KERN_INVALID_ARGUMENT;
4413 DEBUG4K_ERROR("kr 0x%x\n", kr);
4414 /* abort */
4415 break;
4416 }
4417
4418 /* take a reference on the object */
4419 if (copy_entry->is_sub_map) {
4420 vm_map_reference(copy_submap);
4421 } else {
4422 if (!copy &&
4423 copy_object != VM_OBJECT_NULL &&
4424 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4425 /*
4426 * We need to resolve our side of this
4427 * "symmetric" copy-on-write now; we
4428 * need a new object to map and share,
4429 * instead of the current one which
4430 * might still be shared with the
4431 * original mapping.
4432 *
4433 * Note: A "vm_map_copy_t" does not
4434 * have a lock but we're protected by
4435 * the named entry's lock here.
4436 */
4437 // assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4438 VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4439 assert(copy_object != VME_OBJECT(copy_entry));
4440 if (!copy_entry->needs_copy &&
4441 copy_entry->protection & VM_PROT_WRITE) {
4442 vm_prot_t prot;
4443
4444 prot = copy_entry->protection & ~VM_PROT_WRITE;
4445 vm_object_pmap_protect(copy_object,
4446 copy_offset,
4447 copy_size,
4448 PMAP_NULL,
4449 PAGE_SIZE,
4450 0,
4451 prot);
4452 }
4453 copy_entry->needs_copy = FALSE;
4454 copy_entry->is_shared = TRUE;
4455 copy_object = VME_OBJECT(copy_entry);
4456 copy_offset = VME_OFFSET(copy_entry);
4457 vm_object_lock(copy_object);
4458 /* we're about to make a shared mapping of this object */
4459 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4460 copy_object->true_share = TRUE;
4461 vm_object_unlock(copy_object);
4462 }
4463
4464 if (copy_object != VM_OBJECT_NULL &&
4465 copy_object->named &&
4466 copy_object->pager != MEMORY_OBJECT_NULL &&
4467 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4468 memory_object_t pager;
4469 vm_prot_t pager_prot;
4470
4471 /*
4472 * For "named" VM objects, let the pager know that the
4473 * memory object is being mapped. Some pagers need to keep
4474 * track of this, to know when they can reclaim the memory
4475 * object, for example.
4476 * VM calls memory_object_map() for each mapping (specifying
4477 * the protection of each mapping) and calls
4478 * memory_object_last_unmap() when all the mappings are gone.
4479 */
4480 pager_prot = max_protection;
4481 if (copy) {
4482 /*
4483 * Copy-On-Write mapping: won't modify the
4484 * memory object.
4485 */
4486 pager_prot &= ~VM_PROT_WRITE;
4487 }
4488 vm_object_lock(copy_object);
4489 pager = copy_object->pager;
4490 if (copy_object->named &&
4491 pager != MEMORY_OBJECT_NULL &&
4492 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4493 assert(copy_object->pager_ready);
4494 vm_object_mapping_wait(copy_object, THREAD_UNINT);
4495 vm_object_mapping_begin(copy_object);
4496 vm_object_unlock(copy_object);
4497
4498 kr = memory_object_map(pager, pager_prot);
4499 assert(kr == KERN_SUCCESS);
4500
4501 vm_object_lock(copy_object);
4502 vm_object_mapping_end(copy_object);
4503 }
4504 vm_object_unlock(copy_object);
4505 }
4506
4507 /*
4508 * Perform the copy if requested
4509 */
4510
4511 if (copy && copy_object != VM_OBJECT_NULL) {
4512 vm_object_t new_object;
4513 vm_object_offset_t new_offset;
4514
4515 result = vm_object_copy_strategically(copy_object, copy_offset,
4516 copy_size,
4517 false, /* forking */
4518 &new_object, &new_offset,
4519 &do_copy);
4520
4521
4522 if (result == KERN_MEMORY_RESTART_COPY) {
4523 boolean_t success;
4524 boolean_t src_needs_copy;
4525
4526 /*
4527 * XXX
4528 * We currently ignore src_needs_copy.
4529 * This really is the issue of how to make
4530 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4531 * non-kernel users to use. Solution forthcoming.
4532 * In the meantime, since we don't allow non-kernel
4533 * memory managers to specify symmetric copy,
4534 * we won't run into problems here.
4535 */
4536 new_object = copy_object;
4537 new_offset = copy_offset;
4538 success = vm_object_copy_quickly(new_object,
4539 new_offset,
4540 copy_size,
4541 &src_needs_copy,
4542 &do_copy);
4543 assert(success);
4544 result = KERN_SUCCESS;
4545 }
4546 if (result != KERN_SUCCESS) {
4547 kr = result;
4548 break;
4549 }
4550
4551 copy_object = new_object;
4552 copy_offset = new_offset;
4553 /*
4554 * No extra object reference for the mapping:
4555 * the mapping should be the only thing keeping
4556 * this new object alive.
4557 */
4558 } else {
4559 /*
4560 * We already have the right object
4561 * to map.
4562 */
4563 copy_object = VME_OBJECT(copy_entry);
4564 /* take an extra ref for the mapping below */
4565 vm_object_reference(copy_object);
4566 }
4567 }
4568
4569 /*
4570 * If the caller does not want a specific
4571 * tag for this new mapping: use
4572 * the tag of the original mapping.
4573 */
4574 vm_map_kernel_flags_t vmk_remap_flags = {
4575 .vmkf_submap = copy_entry->is_sub_map,
4576 };
4577
4578 vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
4579 vm_map_kernel_flags_vmflags(vmk_flags),
4580 vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4581
4582 /* over-map the object into destination */
4583 vmk_remap_flags.vmf_fixed = true;
4584 vmk_remap_flags.vmf_overwrite = true;
4585
4586 if (!copy && !copy_entry->is_sub_map) {
4587 /*
4588 * copy-on-write should have been
4589 * resolved at this point, or we would
4590 * end up sharing instead of copying.
4591 */
4592 assert(!copy_entry->needs_copy);
4593 }
4594 #if XNU_TARGET_OS_OSX
4595 if (copy_entry->used_for_jit) {
4596 vmk_remap_flags.vmkf_map_jit = TRUE;
4597 }
4598 #endif /* XNU_TARGET_OS_OSX */
4599
4600 kr = vm_map_enter(target_map,
4601 ©_addr,
4602 copy_size,
4603 (vm_map_offset_t) 0,
4604 vmk_remap_flags,
4605 copy_object,
4606 copy_offset,
4607 ((copy_object == NULL)
4608 ? FALSE
4609 : (copy || copy_entry->needs_copy)),
4610 cur_protection,
4611 max_protection,
4612 inheritance);
4613 if (kr != KERN_SUCCESS) {
4614 DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4615 if (copy_entry->is_sub_map) {
4616 vm_map_deallocate(copy_submap);
4617 } else {
4618 vm_object_deallocate(copy_object);
4619 }
4620 /* abort */
4621 break;
4622 }
4623
4624 /* next mapping */
4625 copy_addr += copy_size;
4626 }
4627
4628 if (kr == KERN_SUCCESS) {
4629 if (vmk_flags.vmf_return_data_addr ||
4630 vmk_flags.vmf_return_4k_data_addr) {
4631 *address = map_addr + offset_in_mapping;
4632 } else {
4633 *address = map_addr;
4634 }
4635 if (overmap_start) {
4636 *address += overmap_start;
4637 DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t) offset_in_mapping, (uint64_t)overmap_start, (uint64_t)*address);
4638 }
4639 }
4640 named_entry_unlock(named_entry);
4641 if (target_copy_map != copy_map) {
4642 vm_map_copy_discard(target_copy_map);
4643 target_copy_map = VM_MAP_COPY_NULL;
4644 }
4645
4646 if (kr != KERN_SUCCESS && !vmk_flags.vmf_overwrite) {
4647 /* deallocate the contiguous range */
4648 (void) vm_deallocate(target_map,
4649 map_addr,
4650 map_size);
4651 }
4652
4653 return kr;
4654 }
4655
4656 if (named_entry->is_object) {
4657 unsigned int access;
4658 unsigned int wimg_mode;
4659
4660 /* we are mapping a VM object */
4661
4662 access = named_entry->access;
4663
4664 if (vmk_flags.vmf_return_data_addr ||
4665 vmk_flags.vmf_return_4k_data_addr) {
4666 offset_in_mapping = offset - VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4667 if (vmk_flags.vmf_return_4k_data_addr) {
4668 offset_in_mapping &= ~((signed)(0xFFF));
4669 }
4670 offset = VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4671 map_size = VM_MAP_ROUND_PAGE((offset + offset_in_mapping + initial_size) - offset, VM_MAP_PAGE_MASK(target_map));
4672 }
4673
4674 object = vm_named_entry_to_vm_object(named_entry);
4675 assert(object != VM_OBJECT_NULL);
4676 vm_object_lock(object);
4677 named_entry_unlock(named_entry);
4678
4679 vm_object_reference_locked(object);
4680
4681 wimg_mode = object->wimg_bits;
4682 vm_prot_to_wimg(access, &wimg_mode);
4683 if (object->wimg_bits != wimg_mode) {
4684 vm_object_change_wimg_mode(object, wimg_mode);
4685 }
4686
4687 vm_object_unlock(object);
4688 } else {
4689 panic("invalid VM named entry %p", named_entry);
4690 }
4691 } else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4692 /*
4693 * JMM - This is temporary until we unify named entries
4694 * and raw memory objects.
4695 *
4696 * Detected fake ip_kotype for a memory object. In
4697 * this case, the port isn't really a port at all, but
4698 * instead is just a raw memory object.
4699 */
4700 if (vmk_flags.vmf_return_data_addr ||
4701 vmk_flags.vmf_return_4k_data_addr) {
4702 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4703 }
4704
4705 object = memory_object_to_vm_object((memory_object_t)port);
4706 if (object == VM_OBJECT_NULL) {
4707 return KERN_INVALID_OBJECT;
4708 }
4709 vm_object_reference(object);
4710
4711 /* wait for object (if any) to be ready */
4712 if (object != VM_OBJECT_NULL) {
4713 if (is_kernel_object(object)) {
4714 printf("Warning: Attempt to map kernel object"
4715 " by a non-private kernel entity\n");
4716 return KERN_INVALID_OBJECT;
4717 }
4718 if (!object->pager_ready) {
4719 vm_object_lock(object);
4720
4721 while (!object->pager_ready) {
4722 vm_object_wait(object,
4723 VM_OBJECT_EVENT_PAGER_READY,
4724 THREAD_UNINT);
4725 vm_object_lock(object);
4726 }
4727 vm_object_unlock(object);
4728 }
4729 }
4730 } else {
4731 return KERN_INVALID_OBJECT;
4732 }
4733
4734 if (object != VM_OBJECT_NULL &&
4735 object->named &&
4736 object->pager != MEMORY_OBJECT_NULL &&
4737 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4738 memory_object_t pager;
4739 vm_prot_t pager_prot;
4740 kern_return_t kr;
4741
4742 /*
4743 * For "named" VM objects, let the pager know that the
4744 * memory object is being mapped. Some pagers need to keep
4745 * track of this, to know when they can reclaim the memory
4746 * object, for example.
4747 * VM calls memory_object_map() for each mapping (specifying
4748 * the protection of each mapping) and calls
4749 * memory_object_last_unmap() when all the mappings are gone.
4750 */
4751 pager_prot = max_protection;
4752 if (copy) {
4753 /*
4754 * Copy-On-Write mapping: won't modify the
4755 * memory object.
4756 */
4757 pager_prot &= ~VM_PROT_WRITE;
4758 }
4759 vm_object_lock(object);
4760 pager = object->pager;
4761 if (object->named &&
4762 pager != MEMORY_OBJECT_NULL &&
4763 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4764 assert(object->pager_ready);
4765 vm_object_mapping_wait(object, THREAD_UNINT);
4766 vm_object_mapping_begin(object);
4767 vm_object_unlock(object);
4768
4769 kr = memory_object_map(pager, pager_prot);
4770 assert(kr == KERN_SUCCESS);
4771
4772 vm_object_lock(object);
4773 vm_object_mapping_end(object);
4774 }
4775 vm_object_unlock(object);
4776 }
4777
4778 /*
4779 * Perform the copy if requested
4780 */
4781
4782 if (copy) {
4783 vm_object_t new_object;
4784 vm_object_offset_t new_offset;
4785
4786 result = vm_object_copy_strategically(object, offset,
4787 map_size,
4788 false, /* forking */
4789 &new_object, &new_offset,
4790 ©);
4791
4792
4793 if (result == KERN_MEMORY_RESTART_COPY) {
4794 boolean_t success;
4795 boolean_t src_needs_copy;
4796
4797 /*
4798 * XXX
4799 * We currently ignore src_needs_copy.
4800 * This really is the issue of how to make
4801 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4802 * non-kernel users to use. Solution forthcoming.
4803 * In the meantime, since we don't allow non-kernel
4804 * memory managers to specify symmetric copy,
4805 * we won't run into problems here.
4806 */
4807 new_object = object;
4808 new_offset = offset;
4809 success = vm_object_copy_quickly(new_object,
4810 new_offset,
4811 map_size,
4812 &src_needs_copy,
4813 ©);
4814 assert(success);
4815 result = KERN_SUCCESS;
4816 }
4817 /*
4818 * Throw away the reference to the
4819 * original object, as it won't be mapped.
4820 */
4821
4822 vm_object_deallocate(object);
4823
4824 if (result != KERN_SUCCESS) {
4825 return result;
4826 }
4827
4828 object = new_object;
4829 offset = new_offset;
4830 }
4831
4832 /*
4833 * If non-kernel users want to try to prefault pages, the mapping and prefault
4834 * needs to be atomic.
4835 */
4836 kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4837 vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4838
4839 #if __arm64__
4840 if (fourk) {
4841 /* map this object in a "4K" pager */
4842 result = vm_map_enter_fourk(target_map,
4843 &map_addr,
4844 map_size,
4845 (vm_map_offset_t) mask,
4846 vmk_flags,
4847 object,
4848 offset,
4849 copy,
4850 cur_protection,
4851 max_protection,
4852 inheritance);
4853 } else
4854 #endif /* __arm64__ */
4855 {
4856 result = vm_map_enter(target_map,
4857 &map_addr, map_size,
4858 (vm_map_offset_t)mask,
4859 vmk_flags,
4860 object, offset,
4861 copy,
4862 cur_protection, max_protection,
4863 inheritance);
4864 }
4865 if (result != KERN_SUCCESS) {
4866 vm_object_deallocate(object);
4867 }
4868
4869 /*
4870 * Try to prefault, and do not forget to release the vm map lock.
4871 */
4872 if (result == KERN_SUCCESS && try_prefault) {
4873 mach_vm_address_t va = map_addr;
4874 kern_return_t kr = KERN_SUCCESS;
4875 unsigned int i = 0;
4876 int pmap_options;
4877
4878 pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4879 if (object->internal) {
4880 pmap_options |= PMAP_OPTIONS_INTERNAL;
4881 }
4882
4883 for (i = 0; i < page_list_count; ++i) {
4884 if (!UPL_VALID_PAGE(page_list, i)) {
4885 if (kernel_prefault) {
4886 assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4887 result = KERN_MEMORY_ERROR;
4888 break;
4889 }
4890 } else {
4891 /*
4892 * If this function call failed, we should stop
4893 * trying to optimize, other calls are likely
4894 * going to fail too.
4895 *
4896 * We are not gonna report an error for such
4897 * failure though. That's an optimization, not
4898 * something critical.
4899 */
4900 kr = pmap_enter_options(target_map->pmap,
4901 va, UPL_PHYS_PAGE(page_list, i),
4902 cur_protection, VM_PROT_NONE,
4903 0, TRUE, pmap_options, NULL, PMAP_MAPPING_TYPE_INFER);
4904 if (kr != KERN_SUCCESS) {
4905 OSIncrementAtomic64(&vm_prefault_nb_bailout);
4906 if (kernel_prefault) {
4907 result = kr;
4908 }
4909 break;
4910 }
4911 OSIncrementAtomic64(&vm_prefault_nb_pages);
4912 }
4913
4914 /* Next virtual address */
4915 va += PAGE_SIZE;
4916 }
4917 if (vmk_flags.vmkf_keep_map_locked) {
4918 vm_map_unlock(target_map);
4919 }
4920 }
4921
4922 if (vmk_flags.vmf_return_data_addr ||
4923 vmk_flags.vmf_return_4k_data_addr) {
4924 *address = map_addr + offset_in_mapping;
4925 } else {
4926 *address = map_addr;
4927 }
4928 return result;
4929 }
4930
4931 kern_return_t
vm_map_enter_mem_object(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_t offset,boolean_t copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)4932 vm_map_enter_mem_object(
4933 vm_map_t target_map,
4934 vm_map_offset_t *address,
4935 vm_map_size_t initial_size,
4936 vm_map_offset_t mask,
4937 vm_map_kernel_flags_t vmk_flags,
4938 ipc_port_t port,
4939 vm_object_offset_t offset,
4940 boolean_t copy,
4941 vm_prot_t cur_protection,
4942 vm_prot_t max_protection,
4943 vm_inherit_t inheritance)
4944 {
4945 kern_return_t ret;
4946
4947 /* range_id is set by vm_map_enter_mem_object_helper */
4948 ret = vm_map_enter_mem_object_helper(target_map,
4949 address,
4950 initial_size,
4951 mask,
4952 vmk_flags,
4953 port,
4954 offset,
4955 copy,
4956 cur_protection,
4957 max_protection,
4958 inheritance,
4959 NULL,
4960 0);
4961
4962 #if KASAN
4963 if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4964 kasan_notify_address(*address, initial_size);
4965 }
4966 #endif
4967
4968 return ret;
4969 }
4970
4971 kern_return_t
vm_map_enter_mem_object_prefault(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_t offset,vm_prot_t cur_protection,vm_prot_t max_protection,upl_page_list_ptr_t page_list,unsigned int page_list_count)4972 vm_map_enter_mem_object_prefault(
4973 vm_map_t target_map,
4974 vm_map_offset_t *address,
4975 vm_map_size_t initial_size,
4976 vm_map_offset_t mask,
4977 vm_map_kernel_flags_t vmk_flags,
4978 ipc_port_t port,
4979 vm_object_offset_t offset,
4980 vm_prot_t cur_protection,
4981 vm_prot_t max_protection,
4982 upl_page_list_ptr_t page_list,
4983 unsigned int page_list_count)
4984 {
4985 kern_return_t ret;
4986
4987 /* range_id is set by vm_map_enter_mem_object_helper */
4988 ret = vm_map_enter_mem_object_helper(target_map,
4989 address,
4990 initial_size,
4991 mask,
4992 vmk_flags,
4993 port,
4994 offset,
4995 FALSE,
4996 cur_protection,
4997 max_protection,
4998 VM_INHERIT_DEFAULT,
4999 page_list,
5000 page_list_count);
5001
5002 #if KASAN
5003 if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
5004 kasan_notify_address(*address, initial_size);
5005 }
5006 #endif
5007
5008 return ret;
5009 }
5010
5011
5012 kern_return_t
vm_map_enter_mem_object_control(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,memory_object_control_t control,vm_object_offset_t offset,boolean_t copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)5013 vm_map_enter_mem_object_control(
5014 vm_map_t target_map,
5015 vm_map_offset_t *address,
5016 vm_map_size_t initial_size,
5017 vm_map_offset_t mask,
5018 vm_map_kernel_flags_t vmk_flags,
5019 memory_object_control_t control,
5020 vm_object_offset_t offset,
5021 boolean_t copy,
5022 vm_prot_t cur_protection,
5023 vm_prot_t max_protection,
5024 vm_inherit_t inheritance)
5025 {
5026 vm_map_address_t map_addr;
5027 vm_map_size_t map_size;
5028 vm_object_t object;
5029 vm_object_size_t size;
5030 kern_return_t result;
5031 memory_object_t pager;
5032 vm_prot_t pager_prot;
5033 kern_return_t kr;
5034 #if __arm64__
5035 boolean_t fourk = vmk_flags.vmkf_fourk;
5036 #endif /* __arm64__ */
5037
5038 /*
5039 * Check arguments for validity
5040 */
5041 if ((target_map == VM_MAP_NULL) ||
5042 (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
5043 (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
5044 (inheritance > VM_INHERIT_LAST_VALID) ||
5045 initial_size == 0) {
5046 return KERN_INVALID_ARGUMENT;
5047 }
5048
5049 #if __arm64__
5050 if (fourk && VM_MAP_PAGE_MASK(target_map) < PAGE_MASK) {
5051 fourk = FALSE;
5052 }
5053
5054 if (fourk) {
5055 map_addr = vm_map_trunc_page(*address,
5056 FOURK_PAGE_MASK);
5057 map_size = vm_map_round_page(initial_size,
5058 FOURK_PAGE_MASK);
5059 } else
5060 #endif /* __arm64__ */
5061 {
5062 map_addr = vm_map_trunc_page(*address,
5063 VM_MAP_PAGE_MASK(target_map));
5064 map_size = vm_map_round_page(initial_size,
5065 VM_MAP_PAGE_MASK(target_map));
5066 }
5067 size = vm_object_round_page(initial_size);
5068
5069 object = memory_object_control_to_vm_object(control);
5070
5071 if (object == VM_OBJECT_NULL) {
5072 return KERN_INVALID_OBJECT;
5073 }
5074
5075 if (is_kernel_object(object)) {
5076 printf("Warning: Attempt to map kernel object"
5077 " by a non-private kernel entity\n");
5078 return KERN_INVALID_OBJECT;
5079 }
5080
5081 vm_object_lock(object);
5082 object->ref_count++;
5083
5084 /*
5085 * For "named" VM objects, let the pager know that the
5086 * memory object is being mapped. Some pagers need to keep
5087 * track of this, to know when they can reclaim the memory
5088 * object, for example.
5089 * VM calls memory_object_map() for each mapping (specifying
5090 * the protection of each mapping) and calls
5091 * memory_object_last_unmap() when all the mappings are gone.
5092 */
5093 pager_prot = max_protection;
5094 if (copy) {
5095 pager_prot &= ~VM_PROT_WRITE;
5096 }
5097 pager = object->pager;
5098 if (object->named &&
5099 pager != MEMORY_OBJECT_NULL &&
5100 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
5101 assert(object->pager_ready);
5102 vm_object_mapping_wait(object, THREAD_UNINT);
5103 vm_object_mapping_begin(object);
5104 vm_object_unlock(object);
5105
5106 kr = memory_object_map(pager, pager_prot);
5107 assert(kr == KERN_SUCCESS);
5108
5109 vm_object_lock(object);
5110 vm_object_mapping_end(object);
5111 }
5112 vm_object_unlock(object);
5113
5114 /*
5115 * Perform the copy if requested
5116 */
5117
5118 if (copy) {
5119 vm_object_t new_object;
5120 vm_object_offset_t new_offset;
5121
5122 result = vm_object_copy_strategically(object, offset, size,
5123 false, /* forking */
5124 &new_object, &new_offset,
5125 ©);
5126
5127
5128 if (result == KERN_MEMORY_RESTART_COPY) {
5129 boolean_t success;
5130 boolean_t src_needs_copy;
5131
5132 /*
5133 * XXX
5134 * We currently ignore src_needs_copy.
5135 * This really is the issue of how to make
5136 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
5137 * non-kernel users to use. Solution forthcoming.
5138 * In the meantime, since we don't allow non-kernel
5139 * memory managers to specify symmetric copy,
5140 * we won't run into problems here.
5141 */
5142 new_object = object;
5143 new_offset = offset;
5144 success = vm_object_copy_quickly(new_object,
5145 new_offset, size,
5146 &src_needs_copy,
5147 ©);
5148 assert(success);
5149 result = KERN_SUCCESS;
5150 }
5151 /*
5152 * Throw away the reference to the
5153 * original object, as it won't be mapped.
5154 */
5155
5156 vm_object_deallocate(object);
5157
5158 if (result != KERN_SUCCESS) {
5159 return result;
5160 }
5161
5162 object = new_object;
5163 offset = new_offset;
5164 }
5165
5166 #if __arm64__
5167 if (fourk) {
5168 result = vm_map_enter_fourk(target_map,
5169 &map_addr,
5170 map_size,
5171 (vm_map_offset_t)mask,
5172 vmk_flags,
5173 object, offset,
5174 copy,
5175 cur_protection, max_protection,
5176 inheritance);
5177 } else
5178 #endif /* __arm64__ */
5179 {
5180 result = vm_map_enter(target_map,
5181 &map_addr, map_size,
5182 (vm_map_offset_t)mask,
5183 vmk_flags,
5184 object, offset,
5185 copy,
5186 cur_protection, max_protection,
5187 inheritance);
5188 }
5189 if (result != KERN_SUCCESS) {
5190 vm_object_deallocate(object);
5191 }
5192 *address = map_addr;
5193
5194 return result;
5195 }
5196
5197
5198 #if VM_CPM
5199
5200 #ifdef MACH_ASSERT
5201 extern pmap_paddr_t avail_start, avail_end;
5202 #endif
5203
5204 /*
5205 * Allocate memory in the specified map, with the caveat that
5206 * the memory is physically contiguous. This call may fail
5207 * if the system can't find sufficient contiguous memory.
5208 * This call may cause or lead to heart-stopping amounts of
5209 * paging activity.
5210 *
5211 * Memory obtained from this call should be freed in the
5212 * normal way, viz., via vm_deallocate.
5213 */
/*
 * vm_map_enter_cpm:
 *
 * Allocate "size" bytes of physically contiguous memory, back it with a
 * fresh internal VM object, map that object into "map" (fixed at *addr or
 * anywhere, per vmk_flags), pre-fault and wire every page, and return the
 * chosen address in *addr.
 *
 * Returns KERN_SUCCESS, or an error from cpm_allocate()/vm_map_enter().
 * Memory obtained here is freed the normal way, via vm_deallocate().
 * Only built when VM_CPM is enabled.
 */
kern_return_t
vm_map_enter_cpm(
	vm_map_t map,
	vm_map_offset_t *addr,
	vm_map_size_t size,
	vm_map_kernel_flags_t vmk_flags)
{
	vm_object_t cpm_obj;
	pmap_t pmap;
	vm_page_t m, pages;
	kern_return_t kr;
	vm_map_offset_t va, start, end, offset;
#if MACH_ASSERT
	vm_map_offset_t prev_addr = 0;
#endif /* MACH_ASSERT */
	uint8_t object_lock_type = 0;

	/* Only maps using the native kernel page size are supported. */
	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
		/* XXX TODO4K do we need to support this? */
		*addr = 0;
		return KERN_NOT_SUPPORTED;
	}

	/* Empty request: trivially succeed with a zero address. */
	if (size == 0) {
		*addr = 0;
		return KERN_SUCCESS;
	}
	/* Page-align the fixed address, or start searching at the map floor. */
	if (vmk_flags.vmf_fixed) {
		*addr = vm_map_trunc_page(*addr,
		    VM_MAP_PAGE_MASK(map));
	} else {
		*addr = vm_map_min(map);
	}
	size = vm_map_round_page(size,
	    VM_MAP_PAGE_MASK(map));

	/*
	 * LP64todo - cpm_allocate should probably allow
	 * allocations of >4GB, but not with the current
	 * algorithm, so just cast down the size for now.
	 */
	if (size > VM_MAX_ADDRESS) {
		return KERN_RESOURCE_SHORTAGE;
	}
	/*
	 * NOTE(review): "flags" is not declared anywhere in this function
	 * (the parameter is "vmk_flags"), so this VM_CPM-only path looks
	 * bit-rotted -- confirm it still compiles with VM_CPM enabled.
	 */
	if ((kr = cpm_allocate(CAST_DOWN(vm_size_t, size),
	    &pages, 0, 0, TRUE, flags)) != KERN_SUCCESS) {
		return kr;
	}

	/* Create the internal object that will own the contiguous pages. */
	cpm_obj = vm_object_allocate((vm_object_size_t)size);
	assert(cpm_obj != VM_OBJECT_NULL);
	assert(cpm_obj->internal);
	assert(cpm_obj->vo_size == (vm_object_size_t)size);
	assert(cpm_obj->can_persist == FALSE);
	assert(cpm_obj->pager_created == FALSE);
	assert(cpm_obj->pageout == FALSE);
	assert(cpm_obj->shadow == VM_OBJECT_NULL);

	/*
	 * Insert pages into object.
	 */
	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
	vm_object_lock(cpm_obj);
	for (offset = 0; offset < size; offset += PAGE_SIZE) {
		/* Pop the next page off the singly-linked list from cpm_allocate(). */
		m = pages;
		pages = NEXT_PAGE(m);
		*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;

		assert(!m->vmp_gobbled);
		assert(!m->vmp_wanted);
		assert(!m->vmp_pageout);
		assert(!m->vmp_tabled);
		assert(VM_PAGE_WIRED(m));
		assert(m->vmp_busy);
		assert(VM_PAGE_GET_PHYS_PAGE(m) >= (avail_start >> PAGE_SHIFT) && VM_PAGE_GET_PHYS_PAGE(m) <= (avail_end >> PAGE_SHIFT));

		m->vmp_busy = FALSE;
		vm_page_insert(m, cpm_obj, offset);
	}
	assert(cpm_obj->resident_page_count == size / PAGE_SIZE);
	vm_object_unlock(cpm_obj);

	/*
	 * Hang onto a reference on the object in case a
	 * multi-threaded application for some reason decides
	 * to deallocate the portion of the address space into
	 * which we will insert this object.
	 *
	 * Unfortunately, we must insert the object now before
	 * we can talk to the pmap module about which addresses
	 * must be wired down. Hence, the race with a multi-
	 * threaded app.
	 */
	vm_object_reference(cpm_obj);

	/*
	 * Insert object into map.
	 */

	kr = vm_map_enter(
		map,
		addr,
		size,
		(vm_map_offset_t)0,
		vmk_flags,
		cpm_obj,
		(vm_object_offset_t)0,
		FALSE,
		VM_PROT_ALL,
		VM_PROT_ALL,
		VM_INHERIT_DEFAULT);

	if (kr != KERN_SUCCESS) {
		/*
		 * A CPM object doesn't have can_persist set,
		 * so all we have to do is deallocate it to
		 * free up these pages.
		 */
		assert(cpm_obj->pager_created == FALSE);
		assert(cpm_obj->can_persist == FALSE);
		assert(cpm_obj->pageout == FALSE);
		assert(cpm_obj->shadow == VM_OBJECT_NULL);
		vm_object_deallocate(cpm_obj); /* kill acquired ref */
		vm_object_deallocate(cpm_obj); /* kill creation ref */
		/*
		 * NOTE(review): there is no return here, so on failure we fall
		 * through and keep using cpm_obj after dropping both of its
		 * references -- presumably an early "return kr;" is missing.
		 * Confirm against a VM_CPM-enabled build before relying on
		 * this error path.
		 */
	}

	/*
	 * Inform the physical mapping system that the
	 * range of addresses may not fault, so that
	 * page tables and such can be locked down as well.
	 */
	start = *addr;
	end = start + size;
	pmap = vm_map_pmap(map);
	pmap_pageable(pmap, start, end, FALSE);

	/*
	 * Enter each page into the pmap, to avoid faults.
	 * Note that this loop could be coded more efficiently,
	 * if the need arose, rather than looking up each page
	 * again.
	 */
	for (offset = 0, va = start; offset < size;
	    va += PAGE_SIZE, offset += PAGE_SIZE) {
		int type_of_fault;

		vm_object_lock(cpm_obj);
		m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
		assert(m != VM_PAGE_NULL);

		vm_page_zero_fill(m);

		type_of_fault = DBG_ZERO_FILL_FAULT;

		vm_fault_enter(m, pmap, va,
		    PAGE_SIZE, 0,
		    VM_PROT_ALL, VM_PROT_WRITE,
		    VM_PAGE_WIRED(m),
		    FALSE, /* change_wiring */
		    VM_KERN_MEMORY_NONE, /* tag - not wiring */
		    FALSE, /* cs_bypass */
		    0, /* user_tag */
		    0, /* pmap_options */
		    NULL, /* need_retry */
		    &type_of_fault,
		    &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/

		vm_object_unlock(cpm_obj);
	}

#if MACH_ASSERT
	/*
	 * Verify ordering in address space.
	 */
	for (offset = 0; offset < size; offset += PAGE_SIZE) {
		vm_object_lock(cpm_obj);
		m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
		vm_object_unlock(cpm_obj);
		if (m == VM_PAGE_NULL) {
			panic("vm_allocate_cpm: obj %p off 0x%llx no page",
			    cpm_obj, (uint64_t)offset);
		}
		assert(m->vmp_tabled);
		assert(!m->vmp_busy);
		assert(!m->vmp_wanted);
		assert(!m->vmp_fictitious);
		assert(!m->vmp_private);
		assert(!m->vmp_absent);
		assert(!m->vmp_cleaning);
		assert(!m->vmp_laundry);
		assert(!m->vmp_precious);
		assert(!m->vmp_clustered);
		if (offset != 0) {
			/* Physical page numbers must be strictly consecutive. */
			if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
				printf("start 0x%llx end 0x%llx va 0x%llx\n",
				    (uint64_t)start, (uint64_t)end, (uint64_t)va);
				printf("obj %p off 0x%llx\n", cpm_obj, (uint64_t)offset);
				printf("m %p prev_address 0x%llx\n", m, (uint64_t)prev_addr);
				panic("vm_allocate_cpm: pages not contig!");
			}
		}
		prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
	}
#endif /* MACH_ASSERT */

	vm_object_deallocate(cpm_obj); /* kill extra ref */

	return kr;
}
5423
5424
5425 #else /* VM_CPM */
5426
5427 /*
5428 * Interface is defined in all cases, but unless the kernel
5429 * is built explicitly for this option, the interface does
5430 * nothing.
5431 */
5432
5433 kern_return_t
vm_map_enter_cpm(__unused vm_map_t map,__unused vm_map_offset_t * addr,__unused vm_map_size_t size,__unused vm_map_kernel_flags_t vmk_flags)5434 vm_map_enter_cpm(
5435 __unused vm_map_t map,
5436 __unused vm_map_offset_t *addr,
5437 __unused vm_map_size_t size,
5438 __unused vm_map_kernel_flags_t vmk_flags)
5439 {
5440 return KERN_FAILURE;
5441 }
5442 #endif /* VM_CPM */
5443
5444 /* Not used without nested pmaps */
5445 #ifndef NO_NESTED_PMAP
5446 /*
5447 * Clip and unnest a portion of a nested submap mapping.
5448 */
5449
5450
static void
vm_map_clip_unnest(
	vm_map_t map,
	vm_map_offset_t entry,
	vm_map_offset_t start_unnest,
	vm_map_offset_t end_unnest)
5531 #endif /* NO_NESTED_PMAP */
5532
5533 __abortlike
5534 static void
__vm_map_clip_atomic_entry_panic(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t where)5535 __vm_map_clip_atomic_entry_panic(
5536 vm_map_t map,
5537 vm_map_entry_t entry,
5538 vm_map_offset_t where)
5539 {
5540 panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
5541 "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
5542 (uint64_t)entry->vme_start,
5543 (uint64_t)entry->vme_end,
5544 (uint64_t)where);
5545 }
5546
5547 /*
5548 * vm_map_clip_start: [ internal use only ]
5549 *
5550 * Asserts that the given entry begins at or after
5551 * the specified address; if necessary,
5552 * it splits the entry into two.
5553 */
5554 void
vm_map_clip_start(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t startaddr)5555 vm_map_clip_start(
5556 vm_map_t map,
5557 vm_map_entry_t entry,
5558 vm_map_offset_t startaddr)
5559 {
5560 #ifndef NO_NESTED_PMAP
5561 if (entry->is_sub_map &&
5562 entry->use_pmap &&
5563 startaddr >= entry->vme_start) {
5564 vm_map_offset_t start_unnest, end_unnest;
5565
5566 /*
5567 * Make sure "startaddr" is no longer in a nested range
5568 * before we clip. Unnest only the minimum range the platform
5569 * can handle.
5570 * vm_map_clip_unnest may perform additional adjustments to
5571 * the unnest range.
5572 */
5573 start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
5574 end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
5575 vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5576 }
5577 #endif /* NO_NESTED_PMAP */
5578 if (startaddr > entry->vme_start) {
5579 if (!entry->is_sub_map &&
5580 VME_OBJECT(entry) &&
5581 VME_OBJECT(entry)->phys_contiguous) {
5582 pmap_remove(map->pmap,
5583 (addr64_t)(entry->vme_start),
5584 (addr64_t)(entry->vme_end));
5585 }
5586 if (entry->vme_atomic) {
5587 __vm_map_clip_atomic_entry_panic(map, entry, startaddr);
5588 }
5589
5590 DTRACE_VM5(
5591 vm_map_clip_start,
5592 vm_map_t, map,
5593 vm_map_offset_t, entry->vme_start,
5594 vm_map_offset_t, entry->vme_end,
5595 vm_map_offset_t, startaddr,
5596 int, VME_ALIAS(entry));
5597
5598 _vm_map_clip_start(&map->hdr, entry, startaddr);
5599 if (map->holelistenabled) {
5600 vm_map_store_update_first_free(map, NULL, FALSE);
5601 } else {
5602 vm_map_store_update_first_free(map, map->first_free, FALSE);
5603 }
5604 }
5605 }
5606
5607
5608 #define vm_map_copy_clip_start(copy, entry, startaddr) \
5609 MACRO_BEGIN \
5610 if ((startaddr) > (entry)->vme_start) \
5611 _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
5612 MACRO_END
5613
5614 /*
5615 * This routine is called only when it is known that
5616 * the entry must be split.
5617 */
5618 static void
_vm_map_clip_start(struct vm_map_header * map_header,vm_map_entry_t entry,vm_map_offset_t start)5619 _vm_map_clip_start(
5620 struct vm_map_header *map_header,
5621 vm_map_entry_t entry,
5622 vm_map_offset_t start)
5623 {
5624 vm_map_entry_t new_entry;
5625
5626 /*
5627 * Split off the front portion --
5628 * note that we must insert the new
5629 * entry BEFORE this one, so that
5630 * this entry has the specified starting
5631 * address.
5632 */
5633
5634 if (entry->map_aligned) {
5635 assert(VM_MAP_PAGE_ALIGNED(start,
5636 VM_MAP_HDR_PAGE_MASK(map_header)));
5637 }
5638
5639 new_entry = _vm_map_entry_create(map_header);
5640 vm_map_entry_copy_full(new_entry, entry);
5641
5642 new_entry->vme_end = start;
5643 assert(new_entry->vme_start < new_entry->vme_end);
5644 VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
5645 if (__improbable(start >= entry->vme_end)) {
5646 panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start);
5647 }
5648 assert(start < entry->vme_end);
5649 entry->vme_start = start;
5650
5651 #if VM_BTLOG_TAGS
5652 if (new_entry->vme_kernel_object) {
5653 btref_retain(new_entry->vme_tag_btref);
5654 }
5655 #endif /* VM_BTLOG_TAGS */
5656
5657 _vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);
5658
5659 if (entry->is_sub_map) {
5660 vm_map_reference(VME_SUBMAP(new_entry));
5661 } else {
5662 vm_object_reference(VME_OBJECT(new_entry));
5663 }
5664 }
5665
5666
5667 /*
5668 * vm_map_clip_end: [ internal use only ]
5669 *
5670 * Asserts that the given entry ends at or before
5671 * the specified address; if necessary,
5672 * it splits the entry into two.
5673 */
5674 void
vm_map_clip_end(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t endaddr)5675 vm_map_clip_end(
5676 vm_map_t map,
5677 vm_map_entry_t entry,
5678 vm_map_offset_t endaddr)
5679 {
5680 if (endaddr > entry->vme_end) {
5681 /*
5682 * Within the scope of this clipping, limit "endaddr" to
5683 * the end of this map entry...
5684 */
5685 endaddr = entry->vme_end;
5686 }
5687 #ifndef NO_NESTED_PMAP
5688 if (entry->is_sub_map && entry->use_pmap) {
5689 vm_map_offset_t start_unnest, end_unnest;
5690
5691 /*
5692 * Make sure the range between the start of this entry and
5693 * the new "endaddr" is no longer nested before we clip.
5694 * Unnest only the minimum range the platform can handle.
5695 * vm_map_clip_unnest may perform additional adjustments to
5696 * the unnest range.
5697 */
5698 start_unnest = entry->vme_start;
5699 end_unnest =
5700 (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
5701 ~(pmap_shared_region_size_min(map->pmap) - 1);
5702 vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5703 }
5704 #endif /* NO_NESTED_PMAP */
5705 if (endaddr < entry->vme_end) {
5706 if (!entry->is_sub_map &&
5707 VME_OBJECT(entry) &&
5708 VME_OBJECT(entry)->phys_contiguous) {
5709 pmap_remove(map->pmap,
5710 (addr64_t)(entry->vme_start),
5711 (addr64_t)(entry->vme_end));
5712 }
5713 if (entry->vme_atomic) {
5714 __vm_map_clip_atomic_entry_panic(map, entry, endaddr);
5715 }
5716 DTRACE_VM5(
5717 vm_map_clip_end,
5718 vm_map_t, map,
5719 vm_map_offset_t, entry->vme_start,
5720 vm_map_offset_t, entry->vme_end,
5721 vm_map_offset_t, endaddr,
5722 int, VME_ALIAS(entry));
5723
5724 _vm_map_clip_end(&map->hdr, entry, endaddr);
5725 if (map->holelistenabled) {
5726 vm_map_store_update_first_free(map, NULL, FALSE);
5727 } else {
5728 vm_map_store_update_first_free(map, map->first_free, FALSE);
5729 }
5730 }
5731 }
5732
5733
5734 #define vm_map_copy_clip_end(copy, entry, endaddr) \
5735 MACRO_BEGIN \
5736 if ((endaddr) < (entry)->vme_end) \
5737 _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
5738 MACRO_END
5739
5740 /*
5741 * This routine is called only when it is known that
5742 * the entry must be split.
5743 */
5744 static void
_vm_map_clip_end(struct vm_map_header * map_header,vm_map_entry_t entry,vm_map_offset_t end)5745 _vm_map_clip_end(
5746 struct vm_map_header *map_header,
5747 vm_map_entry_t entry,
5748 vm_map_offset_t end)
5749 {
5750 vm_map_entry_t new_entry;
5751
5752 /*
5753 * Create a new entry and insert it
5754 * AFTER the specified entry
5755 */
5756
5757 if (entry->map_aligned) {
5758 assert(VM_MAP_PAGE_ALIGNED(end,
5759 VM_MAP_HDR_PAGE_MASK(map_header)));
5760 }
5761
5762 new_entry = _vm_map_entry_create(map_header);
5763 vm_map_entry_copy_full(new_entry, entry);
5764
5765 if (__improbable(end <= entry->vme_start)) {
5766 panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end);
5767 }
5768 assert(entry->vme_start < end);
5769 new_entry->vme_start = entry->vme_end = end;
5770 VME_OFFSET_SET(new_entry,
5771 VME_OFFSET(new_entry) + (end - entry->vme_start));
5772 assert(new_entry->vme_start < new_entry->vme_end);
5773
5774 #if VM_BTLOG_TAGS
5775 if (new_entry->vme_kernel_object) {
5776 btref_retain(new_entry->vme_tag_btref);
5777 }
5778 #endif /* VM_BTLOG_TAGS */
5779
5780 _vm_map_store_entry_link(map_header, entry, new_entry);
5781
5782 if (entry->is_sub_map) {
5783 vm_map_reference(VME_SUBMAP(new_entry));
5784 } else {
5785 vm_object_reference(VME_OBJECT(new_entry));
5786 }
5787 }
5788
5789
5790 /*
5791 * VM_MAP_RANGE_CHECK: [ internal use only ]
5792 *
5793 * Asserts that the starting and ending region
5794 * addresses fall within the valid range of the map.
5795 */
5796 #define VM_MAP_RANGE_CHECK(map, start, end) \
5797 MACRO_BEGIN \
5798 if (start < vm_map_min(map)) \
5799 start = vm_map_min(map); \
5800 if (end > vm_map_max(map)) \
5801 end = vm_map_max(map); \
5802 if (start > end) \
5803 start = end; \
5804 MACRO_END
5805
5806 /*
5807 * vm_map_range_check: [ internal use only ]
5808 *
5809 * Check that the region defined by the specified start and
5810 * end addresses are wholly contained within a single map
5811 * entry or set of adjacent map entries of the spacified map,
5812 * i.e. the specified region contains no unmapped space.
5813 * If any or all of the region is unmapped, FALSE is returned.
5814 * Otherwise, TRUE is returned and if the output argument 'entry'
5815 * is not NULL it points to the map entry containing the start
5816 * of the region.
5817 *
5818 * The map is locked for reading on entry and is left locked.
5819 */
5820 static boolean_t
vm_map_range_check(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_entry_t * entry)5821 vm_map_range_check(
5822 vm_map_t map,
5823 vm_map_offset_t start,
5824 vm_map_offset_t end,
5825 vm_map_entry_t *entry)
5826 {
5827 vm_map_entry_t cur;
5828 vm_map_offset_t prev;
5829
5830 /*
5831 * Basic sanity checks first
5832 */
5833 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5834 return FALSE;
5835 }
5836
5837 /*
5838 * Check first if the region starts within a valid
5839 * mapping for the map.
5840 */
5841 if (!vm_map_lookup_entry(map, start, &cur)) {
5842 return FALSE;
5843 }
5844
5845 /*
5846 * Optimize for the case that the region is contained
5847 * in a single map entry.
5848 */
5849 if (entry != (vm_map_entry_t *) NULL) {
5850 *entry = cur;
5851 }
5852 if (end <= cur->vme_end) {
5853 return TRUE;
5854 }
5855
5856 /*
5857 * If the region is not wholly contained within a
5858 * single entry, walk the entries looking for holes.
5859 */
5860 prev = cur->vme_end;
5861 cur = cur->vme_next;
5862 while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5863 if (end <= cur->vme_end) {
5864 return TRUE;
5865 }
5866 prev = cur->vme_end;
5867 cur = cur->vme_next;
5868 }
5869 return FALSE;
5870 }
5871
5872 /*
5873 * vm_map_protect:
5874 *
5875 * Sets the protection of the specified address
5876 * region in the target map. If "set_max" is
5877 * specified, the maximum protection is to be set;
5878 * otherwise, only the current protection is affected.
5879 */
5880 kern_return_t
vm_map_protect(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t new_prot,boolean_t set_max)5881 vm_map_protect(
5882 vm_map_t map,
5883 vm_map_offset_t start,
5884 vm_map_offset_t end,
5885 vm_prot_t new_prot,
5886 boolean_t set_max)
5887 {
5888 vm_map_entry_t current;
5889 vm_map_offset_t prev;
5890 vm_map_entry_t entry;
5891 vm_prot_t new_max;
5892 int pmap_options = 0;
5893 kern_return_t kr;
5894
5895 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
5896 return KERN_INVALID_ARGUMENT;
5897 }
5898
5899 if (new_prot & VM_PROT_COPY) {
5900 vm_map_offset_t new_start;
5901 vm_prot_t cur_prot, max_prot;
5902 vm_map_kernel_flags_t kflags;
5903
5904 /* LP64todo - see below */
5905 if (start >= map->max_offset) {
5906 return KERN_INVALID_ADDRESS;
5907 }
5908
5909 if ((new_prot & VM_PROT_ALLEXEC) &&
5910 map->pmap != kernel_pmap &&
5911 (vm_map_cs_enforcement(map)
5912 #if XNU_TARGET_OS_OSX && __arm64__
5913 || !VM_MAP_IS_EXOTIC(map)
5914 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
5915 ) &&
5916 VM_MAP_POLICY_WX_FAIL(map)) {
5917 DTRACE_VM3(cs_wx,
5918 uint64_t, (uint64_t) start,
5919 uint64_t, (uint64_t) end,
5920 vm_prot_t, new_prot);
5921 printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5922 proc_selfpid(),
5923 (get_bsdtask_info(current_task())
5924 ? proc_name_address(get_bsdtask_info(current_task()))
5925 : "?"),
5926 __FUNCTION__, __LINE__,
5927 #if DEVELOPMENT || DEBUG
5928 (uint64_t)start,
5929 (uint64_t)end,
5930 #else /* DEVELOPMENT || DEBUG */
5931 (uint64_t)0,
5932 (uint64_t)0,
5933 #endif /* DEVELOPMENT || DEBUG */
5934 new_prot);
5935 return KERN_PROTECTION_FAILURE;
5936 }
5937
5938 /*
5939 * Let vm_map_remap_extract() know that it will need to:
5940 * + make a copy of the mapping
5941 * + add VM_PROT_WRITE to the max protections
5942 * + remove any protections that are no longer allowed from the
5943 * max protections (to avoid any WRITE/EXECUTE conflict, for
5944 * example).
5945 * Note that "max_prot" is an IN/OUT parameter only for this
5946 * specific (VM_PROT_COPY) case. It's usually an OUT parameter
5947 * only.
5948 */
5949 max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
5950 cur_prot = VM_PROT_NONE;
5951 kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
5952 kflags.vmkf_remap_prot_copy = true;
5953 kflags.vmkf_tpro_enforcement_override = !vm_map_tpro_enforcement(map);
5954 new_start = start;
5955 kr = vm_map_remap(map,
5956 &new_start,
5957 end - start,
5958 0, /* mask */
5959 kflags,
5960 map,
5961 start,
5962 TRUE, /* copy-on-write remapping! */
5963 &cur_prot, /* IN/OUT */
5964 &max_prot, /* IN/OUT */
5965 VM_INHERIT_DEFAULT);
5966 if (kr != KERN_SUCCESS) {
5967 return kr;
5968 }
5969 new_prot &= ~VM_PROT_COPY;
5970 }
5971
5972 vm_map_lock(map);
5973
5974 /* LP64todo - remove this check when vm_map_commpage64()
5975 * no longer has to stuff in a map_entry for the commpage
5976 * above the map's max_offset.
5977 */
5978 if (start >= map->max_offset) {
5979 vm_map_unlock(map);
5980 return KERN_INVALID_ADDRESS;
5981 }
5982
5983 while (1) {
5984 /*
5985 * Lookup the entry. If it doesn't start in a valid
5986 * entry, return an error.
5987 */
5988 if (!vm_map_lookup_entry(map, start, &entry)) {
5989 vm_map_unlock(map);
5990 return KERN_INVALID_ADDRESS;
5991 }
5992
5993 if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
5994 start = SUPERPAGE_ROUND_DOWN(start);
5995 continue;
5996 }
5997 break;
5998 }
5999 if (entry->superpage_size) {
6000 end = SUPERPAGE_ROUND_UP(end);
6001 }
6002
6003 /*
6004 * Make a first pass to check for protection and address
6005 * violations.
6006 */
6007
6008 current = entry;
6009 prev = current->vme_start;
6010 while ((current != vm_map_to_entry(map)) &&
6011 (current->vme_start < end)) {
6012 /*
6013 * If there is a hole, return an error.
6014 */
6015 if (current->vme_start != prev) {
6016 vm_map_unlock(map);
6017 return KERN_INVALID_ADDRESS;
6018 }
6019
6020 new_max = current->max_protection;
6021
6022 #if defined(__x86_64__)
6023 /* Allow max mask to include execute prot bits if this map doesn't enforce CS */
6024 if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
6025 new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
6026 }
6027 #elif CODE_SIGNING_MONITOR
6028 if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
6029 new_max |= VM_PROT_EXECUTE;
6030 }
6031 #endif
6032 if ((new_prot & new_max) != new_prot) {
6033 vm_map_unlock(map);
6034 return KERN_PROTECTION_FAILURE;
6035 }
6036
6037 if (current->used_for_jit &&
6038 pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
6039 vm_map_unlock(map);
6040 return KERN_PROTECTION_FAILURE;
6041 }
6042
6043 #if __arm64e__
6044 /* Disallow remapping hw assisted TPRO mappings */
6045 if (current->used_for_tpro) {
6046 vm_map_unlock(map);
6047 return KERN_PROTECTION_FAILURE;
6048 }
6049 #endif /* __arm64e__ */
6050
6051
6052 if ((new_prot & VM_PROT_WRITE) &&
6053 (new_prot & VM_PROT_ALLEXEC) &&
6054 #if XNU_TARGET_OS_OSX
6055 map->pmap != kernel_pmap &&
6056 (vm_map_cs_enforcement(map)
6057 #if __arm64__
6058 || !VM_MAP_IS_EXOTIC(map)
6059 #endif /* __arm64__ */
6060 ) &&
6061 #endif /* XNU_TARGET_OS_OSX */
6062 #if CODE_SIGNING_MONITOR
6063 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
6064 #endif
6065 !(current->used_for_jit)) {
6066 DTRACE_VM3(cs_wx,
6067 uint64_t, (uint64_t) current->vme_start,
6068 uint64_t, (uint64_t) current->vme_end,
6069 vm_prot_t, new_prot);
6070 printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
6071 proc_selfpid(),
6072 (get_bsdtask_info(current_task())
6073 ? proc_name_address(get_bsdtask_info(current_task()))
6074 : "?"),
6075 __FUNCTION__, __LINE__,
6076 #if DEVELOPMENT || DEBUG
6077 (uint64_t)current->vme_start,
6078 (uint64_t)current->vme_end,
6079 #else /* DEVELOPMENT || DEBUG */
6080 (uint64_t)0,
6081 (uint64_t)0,
6082 #endif /* DEVELOPMENT || DEBUG */
6083 new_prot);
6084 new_prot &= ~VM_PROT_ALLEXEC;
6085 if (VM_MAP_POLICY_WX_FAIL(map)) {
6086 vm_map_unlock(map);
6087 return KERN_PROTECTION_FAILURE;
6088 }
6089 }
6090
6091 /*
6092 * If the task has requested executable lockdown,
6093 * deny both:
6094 * - adding executable protections OR
6095 * - adding write protections to an existing executable mapping.
6096 */
6097 if (map->map_disallow_new_exec == TRUE) {
6098 if ((new_prot & VM_PROT_ALLEXEC) ||
6099 ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
6100 vm_map_unlock(map);
6101 return KERN_PROTECTION_FAILURE;
6102 }
6103 }
6104
6105 prev = current->vme_end;
6106 current = current->vme_next;
6107 }
6108
6109 #if __arm64__
6110 if (end > prev &&
6111 end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
6112 vm_map_entry_t prev_entry;
6113
6114 prev_entry = current->vme_prev;
6115 if (prev_entry != vm_map_to_entry(map) &&
6116 !prev_entry->map_aligned &&
6117 (vm_map_round_page(prev_entry->vme_end,
6118 VM_MAP_PAGE_MASK(map))
6119 == end)) {
6120 /*
6121 * The last entry in our range is not "map-aligned"
6122 * but it would have reached all the way to "end"
6123 * if it had been map-aligned, so this is not really
6124 * a hole in the range and we can proceed.
6125 */
6126 prev = end;
6127 }
6128 }
6129 #endif /* __arm64__ */
6130
6131 if (end > prev) {
6132 vm_map_unlock(map);
6133 return KERN_INVALID_ADDRESS;
6134 }
6135
6136 /*
6137 * Go back and fix up protections.
6138 * Clip to start here if the range starts within
6139 * the entry.
6140 */
6141
6142 current = entry;
6143 if (current != vm_map_to_entry(map)) {
6144 /* clip and unnest if necessary */
6145 vm_map_clip_start(map, current, start);
6146 }
6147
6148 while ((current != vm_map_to_entry(map)) &&
6149 (current->vme_start < end)) {
6150 vm_prot_t old_prot;
6151
6152 vm_map_clip_end(map, current, end);
6153
6154 #if DEVELOPMENT || DEBUG
6155 if (current->csm_associated && vm_log_xnu_user_debug) {
6156 printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
6157 proc_selfpid(),
6158 (get_bsdtask_info(current_task())
6159 ? proc_name_address(get_bsdtask_info(current_task()))
6160 : "?"),
6161 __FUNCTION__,
6162 (uint64_t)start,
6163 (uint64_t)end,
6164 new_prot,
6165 map, current,
6166 current->vme_start,
6167 current->vme_end,
6168 current->protection,
6169 current->max_protection);
6170 }
6171 #endif /* DEVELOPMENT || DEBUG */
6172
6173 if (current->is_sub_map) {
6174 /* clipping did unnest if needed */
6175 assert(!current->use_pmap);
6176 }
6177
6178 old_prot = current->protection;
6179
6180 if (set_max) {
6181 current->max_protection = new_prot;
6182 /* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
6183 current->protection = (new_prot & old_prot);
6184 } else {
6185 current->protection = new_prot;
6186 }
6187
6188 #if CODE_SIGNING_MONITOR
6189 if (!current->vme_xnu_user_debug &&
6190 /* a !csm_associated mapping becoming executable */
6191 ((!current->csm_associated &&
6192 !(old_prot & VM_PROT_EXECUTE) &&
6193 (current->protection & VM_PROT_EXECUTE))
6194 ||
6195 /* a csm_associated mapping becoming writable */
6196 (current->csm_associated &&
6197 !(old_prot & VM_PROT_WRITE) &&
6198 (current->protection & VM_PROT_WRITE)))) {
6199 /*
6200 * This mapping has not already been marked as
6201 * "user_debug" and it is either:
6202 * 1. not code-signing-monitored and becoming executable
6203 * 2. code-signing-monitored and becoming writable,
6204 * so inform the CodeSigningMonitor and mark the
6205 * mapping as "user_debug" if appropriate.
6206 */
6207 vm_map_kernel_flags_t vmk_flags;
6208 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
6209 /* pretend it's a vm_protect(VM_PROT_COPY)... */
6210 vmk_flags.vmkf_remap_prot_copy = true;
6211 kr = vm_map_entry_cs_associate(map, current, vmk_flags);
6212 #if DEVELOPMENT || DEBUG
6213 if (vm_log_xnu_user_debug) {
6214 printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
6215 proc_selfpid(),
6216 (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
6217 __FUNCTION__, __LINE__,
6218 map, current,
6219 current->vme_start, current->vme_end,
6220 old_prot, current->protection,
6221 kr, current->vme_xnu_user_debug);
6222 }
6223 #endif /* DEVELOPMENT || DEBUG */
6224 }
6225 #endif /* CODE_SIGNING_MONITOR */
6226
6227 /*
6228 * Update physical map if necessary.
6229 * If the request is to turn off write protection,
6230 * we won't do it for real (in pmap). This is because
6231 * it would cause copy-on-write to fail. We've already
6232 * set, the new protection in the map, so if a
6233 * write-protect fault occurred, it will be fixed up
6234 * properly, COW or not.
6235 */
6236 if (current->protection != old_prot) {
6237 /* Look one level in we support nested pmaps */
6238 /* from mapped submaps which are direct entries */
6239 /* in our map */
6240
6241 vm_prot_t prot;
6242
6243 prot = current->protection;
6244 if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
6245 prot &= ~VM_PROT_WRITE;
6246 } else {
6247 assert(!VME_OBJECT(current)->code_signed);
6248 assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6249 if (prot & VM_PROT_WRITE) {
6250 /*
6251 * For write requests on the
6252 * compressor, we wil ask the
6253 * pmap layer to prevent us from
6254 * taking a write fault when we
6255 * attempt to access the mapping
6256 * next.
6257 */
6258 pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
6259 }
6260 }
6261
6262 if (override_nx(map, VME_ALIAS(current)) && prot) {
6263 prot |= VM_PROT_EXECUTE;
6264 }
6265
6266 #if DEVELOPMENT || DEBUG
6267 if (!(old_prot & VM_PROT_EXECUTE) &&
6268 (prot & VM_PROT_EXECUTE) &&
6269 panic_on_unsigned_execute &&
6270 (proc_selfcsflags() & CS_KILL)) {
6271 panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
6272 }
6273 #endif /* DEVELOPMENT || DEBUG */
6274
6275 if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
6276 if (current->wired_count) {
6277 panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
6278 map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
6279 }
6280
6281 /* If the pmap layer cares about this
6282 * protection type, force a fault for
6283 * each page so that vm_fault will
6284 * repopulate the page with the full
6285 * set of protections.
6286 */
6287 /*
6288 * TODO: We don't seem to need this,
6289 * but this is due to an internal
6290 * implementation detail of
6291 * pmap_protect. Do we want to rely
6292 * on this?
6293 */
6294 prot = VM_PROT_NONE;
6295 }
6296
6297 if (current->is_sub_map && current->use_pmap) {
6298 pmap_protect(VME_SUBMAP(current)->pmap,
6299 current->vme_start,
6300 current->vme_end,
6301 prot);
6302 } else {
6303 pmap_protect_options(map->pmap,
6304 current->vme_start,
6305 current->vme_end,
6306 prot,
6307 pmap_options,
6308 NULL);
6309 }
6310 }
6311 current = current->vme_next;
6312 }
6313
6314 current = entry;
6315 while ((current != vm_map_to_entry(map)) &&
6316 (current->vme_start <= end)) {
6317 vm_map_simplify_entry(map, current);
6318 current = current->vme_next;
6319 }
6320
6321 vm_map_unlock(map);
6322 return KERN_SUCCESS;
6323 }
6324
6325 /*
6326 * vm_map_inherit:
6327 *
6328 * Sets the inheritance of the specified address
6329 * range in the target map. Inheritance
6330 * affects how the map will be shared with
6331 * child maps at the time of vm_map_fork.
6332 */
6333 kern_return_t
vm_map_inherit(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_inherit_t new_inheritance)6334 vm_map_inherit(
6335 vm_map_t map,
6336 vm_map_offset_t start,
6337 vm_map_offset_t end,
6338 vm_inherit_t new_inheritance)
6339 {
6340 vm_map_entry_t entry;
6341 vm_map_entry_t temp_entry;
6342
6343 vm_map_lock(map);
6344
6345 VM_MAP_RANGE_CHECK(map, start, end);
6346
6347 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
6348 vm_map_unlock(map);
6349 return KERN_INVALID_ADDRESS;
6350 }
6351
6352 if (vm_map_lookup_entry(map, start, &temp_entry)) {
6353 entry = temp_entry;
6354 } else {
6355 temp_entry = temp_entry->vme_next;
6356 entry = temp_entry;
6357 }
6358
6359 /* first check entire range for submaps which can't support the */
6360 /* given inheritance. */
6361 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6362 if (entry->is_sub_map) {
6363 if (new_inheritance == VM_INHERIT_COPY) {
6364 vm_map_unlock(map);
6365 return KERN_INVALID_ARGUMENT;
6366 }
6367 }
6368
6369 entry = entry->vme_next;
6370 }
6371
6372 entry = temp_entry;
6373 if (entry != vm_map_to_entry(map)) {
6374 /* clip and unnest if necessary */
6375 vm_map_clip_start(map, entry, start);
6376 }
6377
6378 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6379 vm_map_clip_end(map, entry, end);
6380 if (entry->is_sub_map) {
6381 /* clip did unnest if needed */
6382 assert(!entry->use_pmap);
6383 }
6384
6385 entry->inheritance = new_inheritance;
6386
6387 entry = entry->vme_next;
6388 }
6389
6390 vm_map_unlock(map);
6391 return KERN_SUCCESS;
6392 }
6393
6394 /*
6395 * Update the accounting for the amount of wired memory in this map. If the user has
6396 * exceeded the defined limits, then we fail. Wiring on behalf of the kernel never fails.
6397 */
6398
6399 static kern_return_t
add_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6400 add_wire_counts(
6401 vm_map_t map,
6402 vm_map_entry_t entry,
6403 boolean_t user_wire)
6404 {
6405 vm_map_size_t size;
6406
6407 bool first_wire = entry->wired_count == 0 && entry->user_wired_count == 0;
6408
6409 if (user_wire) {
6410 unsigned int total_wire_count = vm_page_wire_count + vm_lopage_free_count;
6411
6412 /*
6413 * We're wiring memory at the request of the user. Check if this is the first time the user is wiring
6414 * this map entry.
6415 */
6416
6417 if (entry->user_wired_count == 0) {
6418 size = entry->vme_end - entry->vme_start;
6419
6420 /*
6421 * Since this is the first time the user is wiring this map entry, check to see if we're
6422 * exceeding the user wire limits. There is a per map limit which is the smaller of either
6423 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value. There is also
6424 * a system-wide limit on the amount of memory all users can wire. If the user is over either
6425 * limit, then we fail.
6426 */
6427
6428 if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6429 size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6430 if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6431 #if DEVELOPMENT || DEBUG
6432 if (panic_on_mlock_failure) {
6433 panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6434 }
6435 #endif /* DEVELOPMENT || DEBUG */
6436 os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6437 } else {
6438 os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6439 #if DEVELOPMENT || DEBUG
6440 if (panic_on_mlock_failure) {
6441 panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6442 }
6443 #endif /* DEVELOPMENT || DEBUG */
6444 }
6445 return KERN_RESOURCE_SHORTAGE;
6446 }
6447
6448 /*
6449 * The first time the user wires an entry, we also increment the wired_count and add this to
6450 * the total that has been wired in the map.
6451 */
6452
6453 if (entry->wired_count >= MAX_WIRE_COUNT) {
6454 return KERN_FAILURE;
6455 }
6456
6457 entry->wired_count++;
6458 map->user_wire_size += size;
6459 }
6460
6461 if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6462 return KERN_FAILURE;
6463 }
6464
6465 entry->user_wired_count++;
6466 } else {
6467 /*
6468 * The kernel's wiring the memory. Just bump the count and continue.
6469 */
6470
6471 if (entry->wired_count >= MAX_WIRE_COUNT) {
6472 panic("vm_map_wire: too many wirings");
6473 }
6474
6475 entry->wired_count++;
6476 }
6477
6478 if (first_wire) {
6479 vme_btref_consider_and_set(entry, __builtin_frame_address(0));
6480 }
6481
6482 return KERN_SUCCESS;
6483 }
6484
6485 /*
6486 * Update the memory wiring accounting now that the given map entry is being unwired.
6487 */
6488
6489 static void
subtract_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6490 subtract_wire_counts(
6491 vm_map_t map,
6492 vm_map_entry_t entry,
6493 boolean_t user_wire)
6494 {
6495 if (user_wire) {
6496 /*
6497 * We're unwiring memory at the request of the user. See if we're removing the last user wire reference.
6498 */
6499
6500 if (entry->user_wired_count == 1) {
6501 /*
6502 * We're removing the last user wire reference. Decrement the wired_count and the total
6503 * user wired memory for this map.
6504 */
6505
6506 assert(entry->wired_count >= 1);
6507 entry->wired_count--;
6508 map->user_wire_size -= entry->vme_end - entry->vme_start;
6509 }
6510
6511 assert(entry->user_wired_count >= 1);
6512 entry->user_wired_count--;
6513 } else {
6514 /*
6515 * The kernel is unwiring the memory. Just update the count.
6516 */
6517
6518 assert(entry->wired_count >= 1);
6519 entry->wired_count--;
6520 }
6521
6522 vme_btref_consider_and_put(entry);
6523 }
6524
/*
 * Count of wire requests rejected because they covered executable
 * mappings: wiring would copy the pages into an unsigned shadow
 * object, defeating code signing.  Bumped in vm_map_wire_nested()
 * alongside the cs_executable_wire DTrace probe.
 */
int cs_executable_wire = 0;
6526
6527 /*
6528 * vm_map_wire:
6529 *
6530 * Sets the pageability of the specified address range in the
6531 * target map as wired. Regions specified as not pageable require
6532 * locked-down physical memory and physical page maps. The
6533 * access_type variable indicates types of accesses that must not
6534 * generate page faults. This is checked against protection of
6535 * memory being locked-down.
6536 *
6537 * The map must not be locked, but a reference must remain to the
6538 * map throughout the call.
6539 */
/*
 * vm_map_wire_nested:
 *
 * Backend shared by the vm_map_wire* entry points; also called
 * recursively to handle submap entries.  Wires down the page-aligned
 * range [start, end) in "map", faulting pages in via vm_fault_wire().
 *
 *	map:		target map.  Must be unlocked on entry; the caller
 *			holds a reference for the duration of the call.
 *	caller_prot:	access types that must not fault once wired
 *			(masked to VM_PROT_ALL | VM_PROT_ALLEXEC below).
 *	tag:		VM tag for wired-memory accounting.
 *	user_wire:	TRUE for user-requested wiring: the wait on
 *			in-transition entries is interruptible and the
 *			wiring counts against user wire limits
 *			(see add_wire_counts()).
 *	map_pmap/pmap_addr: non-NULL/valid on the recursive submap call:
 *			the physical map (and base address in it) where
 *			the pages should actually be entered.
 *	physpage_p:	if non-NULL, the caller wants the physical page
 *			number of the wired page, which restricts the
 *			request to exactly one page ("wire_and_extract").
 */
static kern_return_t
vm_map_wire_nested(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_prot_t       caller_prot,
	vm_tag_t        tag,
	boolean_t       user_wire,
	pmap_t          map_pmap,
	vm_map_offset_t pmap_addr,
	ppnum_t         *physpage_p)
{
	vm_map_entry_t          entry;
	vm_prot_t               access_type;
	struct vm_map_entry     *first_entry, tmp_entry;
	vm_map_t                real_map;
	vm_map_offset_t         s, e;
	kern_return_t           rc;
	boolean_t               need_wakeup;
	boolean_t               main_map = FALSE;
	wait_interrupt_t        interruptible_state;
	thread_t                cur_thread;
	unsigned int            last_timestamp;
	vm_map_size_t           size;
	boolean_t               wire_and_extract;
	vm_prot_t               extra_prots;

	/*
	 * Ask the object lookup below to resolve copy-on-write now
	 * (VM_PROT_COPY), but to fail rather than copy executable
	 * mappings (which would break code signing) -- except where
	 * code-signing enforcement doesn't apply to this map.
	 */
	extra_prots = VM_PROT_COPY;
	extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
#if XNU_TARGET_OS_OSX
	if (map->pmap == kernel_pmap ||
	    !vm_map_cs_enforcement(map)) {
		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
	}
#endif /* XNU_TARGET_OS_OSX */
#if CODE_SIGNING_MONITOR
	if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
	}
#endif /* CODE_SIGNING_MONITOR */

	access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));

	wire_and_extract = FALSE;
	if (physpage_p != NULL) {
		/*
		 * The caller wants the physical page number of the
		 * wired page.  We return only one physical page number
		 * so this works for only one page at a time.
		 */
		if ((end - start) != PAGE_SIZE) {
			return KERN_INVALID_ARGUMENT;
		}
		wire_and_extract = TRUE;
		*physpage_p = 0;
	}

	vm_map_lock(map);
	if (map_pmap == NULL) {
		/* NOTE(review): main_map is set but never read in this function. */
		main_map = TRUE;
	}
	last_timestamp = map->timestamp;

	VM_MAP_RANGE_CHECK(map, start, end);
	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));

	if (start == end) {
		/* We wired what the caller asked for, zero pages */
		vm_map_unlock(map);
		return KERN_SUCCESS;
	}

	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	need_wakeup = FALSE;
	cur_thread = current_thread();

	s = start;
	rc = KERN_SUCCESS;

	if (vm_map_lookup_entry(map, s, &first_entry)) {
		entry = first_entry;
		/*
		 * vm_map_clip_start will be done later.
		 * We don't want to unnest any nested submaps here !
		 */
	} else {
		/* Start address is not in map */
		rc = KERN_INVALID_ADDRESS;
		goto done;
	}

	while ((entry != vm_map_to_entry(map)) && (s < end)) {
		/*
		 * At this point, we have wired from "start" to "s".
		 * We still need to wire from "s" to "end".
		 *
		 * "entry" hasn't been clipped, so it could start before "s"
		 * and/or end after "end".
		 */

		/* "e" is how far we want to wire in this entry */
		e = entry->vme_end;
		if (e > end) {
			e = end;
		}

		/*
		 * If another thread is wiring/unwiring this entry then
		 * block after informing other thread to wake us up.
		 */
		if (entry->in_transition) {
			wait_result_t wait_result;

			/*
			 * We have not clipped the entry.  Make sure that
			 * the start address is in range so that the lookup
			 * below will succeed.
			 * "s" is the current starting point: we've already
			 * wired from "start" to "s" and we still have
			 * to wire from "s" to "end".
			 */

			entry->needs_wakeup = TRUE;

			/*
			 * wake up anybody waiting on entries that we have
			 * already wired.
			 */
			if (need_wakeup) {
				vm_map_entry_wakeup(map);
				need_wakeup = FALSE;
			}
			/*
			 * User wiring is interruptible
			 */
			wait_result = vm_map_entry_wait(map,
			    (user_wire) ? THREAD_ABORTSAFE :
			    THREAD_UNINT);
			if (user_wire && wait_result == THREAD_INTERRUPTED) {
				/*
				 * undo the wirings we have done so far
				 * We do not clear the needs_wakeup flag,
				 * because we cannot tell if we were the
				 * only one waiting.
				 */
				rc = KERN_FAILURE;
				goto done;
			}

			/*
			 * Cannot avoid a lookup here.  reset timestamp.
			 */
			last_timestamp = map->timestamp;

			/*
			 * The entry could have been clipped, look it up again.
			 * Worst that can happen is, it may not exist anymore.
			 */
			if (!vm_map_lookup_entry(map, s, &first_entry)) {
				/*
				 * User: undo everything up to the previous
				 * entry.  let vm_map_unwire worry about
				 * checking the validity of the range.
				 */
				rc = KERN_FAILURE;
				goto done;
			}
			entry = first_entry;
			continue;
		}

		if (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_start;
			vm_map_offset_t local_end;
			pmap_t          pmap;

			if (wire_and_extract) {
				/*
				 * Wiring would result in copy-on-write
				 * which would not be compatible with
				 * the sharing we have with the original
				 * provider of this memory.
				 */
				rc = KERN_INVALID_ARGUMENT;
				goto done;
			}

			vm_map_clip_start(map, entry, s);
			vm_map_clip_end(map, entry, end);

			/* translate [s, min(end, vme_end)) into the submap's space */
			sub_start = VME_OFFSET(entry);
			sub_end = entry->vme_end;
			sub_end += VME_OFFSET(entry) - entry->vme_start;

			local_end = entry->vme_end;
			if (map_pmap == NULL) {
				vm_object_t             object;
				vm_object_offset_t      offset;
				vm_prot_t               prot;
				boolean_t               wired;
				vm_map_entry_t          local_entry;
				vm_map_version_t        version;
				vm_map_t                lookup_map;

				if (entry->use_pmap) {
					pmap = VME_SUBMAP(entry)->pmap;
					/* ppc implementation requires that */
					/* submaps pmap address ranges line */
					/* up with parent map */
#ifdef notdef
					pmap_addr = sub_start;
#endif
					pmap_addr = s;
				} else {
					pmap = map->pmap;
					pmap_addr = s;
				}

				if (entry->wired_count) {
					/* already wired: just take another reference */
					if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
						goto done;
					}

					/*
					 * The map was not unlocked:
					 * no need to goto re-lookup.
					 * Just go directly to next entry.
					 */
					entry = entry->vme_next;
					s = entry->vme_start;
					continue;
				}

				/* call vm_map_lookup_and_lock_object to */
				/* cause any needs copy to be */
				/* evaluated */
				local_start = entry->vme_start;
				lookup_map = map;
				vm_map_lock_write_to_read(map);
				rc = vm_map_lookup_and_lock_object(
					&lookup_map, local_start,
					(access_type | extra_prots),
					OBJECT_LOCK_EXCLUSIVE,
					&version, &object,
					&offset, &prot, &wired,
					NULL,
					&real_map, NULL);
				if (rc != KERN_SUCCESS) {
					vm_map_unlock_read(lookup_map);
					assert(map_pmap == NULL);
					/* undo what was wired before the failure */
					vm_map_unwire(map, start,
					    s, user_wire);
					return rc;
				}
				vm_object_unlock(object);
				if (real_map != lookup_map) {
					vm_map_unlock(real_map);
				}
				vm_map_unlock_read(lookup_map);
				vm_map_lock(map);

				/* we unlocked, so must re-lookup */
				if (!vm_map_lookup_entry(map,
				    local_start,
				    &local_entry)) {
					rc = KERN_FAILURE;
					goto done;
				}

				/*
				 * entry could have been "simplified",
				 * so re-clip
				 */
				entry = local_entry;
				assert(s == local_start);
				vm_map_clip_start(map, entry, s);
				vm_map_clip_end(map, entry, end);
				/* re-compute "e" */
				e = entry->vme_end;
				if (e > end) {
					e = end;
				}

				/* did we have a change of type? */
				if (!entry->is_sub_map) {
					last_timestamp = map->timestamp;
					continue;
				}
			} else {
				local_start = entry->vme_start;
				pmap = map_pmap;
			}

			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
				goto done;
			}

			/* mark in-transition so the entry survives the unlock below */
			entry->in_transition = TRUE;

			vm_map_unlock(map);
			/* recurse into the submap with the parent's pmap info */
			rc = vm_map_wire_nested(VME_SUBMAP(entry),
			    sub_start, sub_end,
			    caller_prot, tag,
			    user_wire, pmap, pmap_addr,
			    NULL);
			vm_map_lock(map);

			/*
			 * Find the entry again.  It could have been clipped
			 * after we unlocked the map.
			 */
			if (!vm_map_lookup_entry(map, local_start,
			    &first_entry)) {
				panic("vm_map_wire: re-lookup failed");
			}
			entry = first_entry;

			assert(local_start == s);
			/* re-compute "e" */
			e = entry->vme_end;
			if (e > end) {
				e = end;
			}

			last_timestamp = map->timestamp;
			/* clear in_transition on all pieces the entry may have split into */
			while ((entry != vm_map_to_entry(map)) &&
			    (entry->vme_start < e)) {
				assert(entry->in_transition);
				entry->in_transition = FALSE;
				if (entry->needs_wakeup) {
					entry->needs_wakeup = FALSE;
					need_wakeup = TRUE;
				}
				if (rc != KERN_SUCCESS) {/* from vm_*_wire */
					subtract_wire_counts(map, entry, user_wire);
				}
				entry = entry->vme_next;
			}
			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
				goto done;
			}

			/* no need to relookup again */
			s = entry->vme_start;
			continue;
		}

		/*
		 * If this entry is already wired then increment
		 * the appropriate wire reference count.
		 */
		if (entry->wired_count) {
			if ((entry->protection & access_type) != access_type) {
				/* found a protection problem */

				/*
				 * XXX FBDP
				 * We should always return an error
				 * in this case but since we didn't
				 * enforce it before, let's do
				 * it only for the new "wire_and_extract"
				 * code path for now...
				 */
				if (wire_and_extract) {
					rc = KERN_PROTECTION_FAILURE;
					goto done;
				}
			}

			/*
			 * entry is already wired down, get our reference
			 * after clipping to our range.
			 */
			vm_map_clip_start(map, entry, s);
			vm_map_clip_end(map, entry, end);

			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
				goto done;
			}

			if (wire_and_extract) {
				vm_object_t             object;
				vm_object_offset_t      offset;
				vm_page_t               m;

				/*
				 * We don't have to "wire" the page again
				 * but we still have to "extract" its
				 * physical page number, after some sanity
				 * checks.
				 */
				assert((entry->vme_end - entry->vme_start)
				    == PAGE_SIZE);
				assert(!entry->needs_copy);
				assert(!entry->is_sub_map);
				assert(VME_OBJECT(entry));
				if (((entry->vme_end - entry->vme_start)
				    != PAGE_SIZE) ||
				    entry->needs_copy ||
				    entry->is_sub_map ||
				    VME_OBJECT(entry) == VM_OBJECT_NULL) {
					rc = KERN_INVALID_ARGUMENT;
					goto done;
				}

				object = VME_OBJECT(entry);
				offset = VME_OFFSET(entry);
				/* need exclusive lock to update m->dirty */
				if (entry->protection & VM_PROT_WRITE) {
					vm_object_lock(object);
				} else {
					vm_object_lock_shared(object);
				}
				m = vm_page_lookup(object, offset);
				assert(m != VM_PAGE_NULL);
				assert(VM_PAGE_WIRED(m));
				if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
					*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
					if (entry->protection & VM_PROT_WRITE) {
						vm_object_lock_assert_exclusive(
							object);
						m->vmp_dirty = TRUE;
					}
				} else {
					/* not already wired !? */
					*physpage_p = 0;
				}
				vm_object_unlock(object);
			}

			/* map was not unlocked: no need to relookup */
			entry = entry->vme_next;
			s = entry->vme_start;
			continue;
		}

		/*
		 * Unwired entry or wire request transmitted via submap
		 */

		/*
		 * Wiring would copy the pages to the shadow object.
		 * The shadow object would not be code-signed so
		 * attempting to execute code from these copied pages
		 * would trigger a code-signing violation.
		 */

		if ((entry->protection & VM_PROT_EXECUTE)
#if XNU_TARGET_OS_OSX
		    &&
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
		    )
#endif /* XNU_TARGET_OS_OSX */
#if CODE_SIGNING_MONITOR
		    &&
		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
#endif
		    ) {
#if MACH_ASSERT
			printf("pid %d[%s] wiring executable range from "
			    "0x%llx to 0x%llx: rejected to preserve "
			    "code-signing\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    (uint64_t) entry->vme_start,
			    (uint64_t) entry->vme_end);
#endif /* MACH_ASSERT */
			DTRACE_VM2(cs_executable_wire,
			    uint64_t, (uint64_t)entry->vme_start,
			    uint64_t, (uint64_t)entry->vme_end);
			cs_executable_wire++;
			rc = KERN_PROTECTION_FAILURE;
			goto done;
		}

		/*
		 * Perform actions of vm_map_lookup that need the write
		 * lock on the map: create a shadow object for a
		 * copy-on-write region, or an object for a zero-fill
		 * region.
		 */
		size = entry->vme_end - entry->vme_start;
		/*
		 * If wiring a copy-on-write page, we need to copy it now
		 * even if we're only (currently) requesting read access.
		 * This is aggressive, but once it's wired we can't move it.
		 */
		if (entry->needs_copy) {
			if (wire_and_extract) {
				/*
				 * We're supposed to share with the original
				 * provider so should not be "needs_copy"
				 */
				rc = KERN_INVALID_ARGUMENT;
				goto done;
			}

			VME_OBJECT_SHADOW(entry, size,
			    vm_map_always_shadow(map));
			entry->needs_copy = FALSE;
		} else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
			if (wire_and_extract) {
				/*
				 * We're supposed to share with the original
				 * provider so should already have an object.
				 */
				rc = KERN_INVALID_ARGUMENT;
				goto done;
			}
			/* zero-fill region: give it a fresh anonymous object */
			VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
			assert(entry->use_pmap);
		} else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			if (wire_and_extract) {
				/*
				 * We're supposed to share with the original
				 * provider so should not be COPY_SYMMETRIC.
				 */
				rc = KERN_INVALID_ARGUMENT;
				goto done;
			}
			/*
			 * Force an unrequested "copy-on-write" but only for
			 * the range we're wiring.
			 */
//			printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
			vm_map_clip_start(map, entry, s);
			vm_map_clip_end(map, entry, end);
			/* recompute "size" */
			size = entry->vme_end - entry->vme_start;
			/* make a shadow object */
			vm_object_t orig_object;
			vm_object_offset_t orig_offset;
			orig_object = VME_OBJECT(entry);
			orig_offset = VME_OFFSET(entry);
			VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
			if (VME_OBJECT(entry) != orig_object) {
				/*
				 * This mapping has not been shared (or it would be
				 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
				 * not been copied-on-write (or it would be marked
				 * as "needs_copy" and would have been handled above
				 * and also already write-protected).
				 * We still need to write-protect here to prevent
				 * other threads from modifying these pages while
				 * we're in the process of copying and wiring
				 * the copied pages.
				 * Since the mapping is neither shared nor COWed,
				 * we only need to write-protect the PTEs for this
				 * mapping.
				 */
				vm_object_pmap_protect(orig_object,
				    orig_offset,
				    size,
				    map->pmap,
				    VM_MAP_PAGE_SIZE(map),
				    entry->vme_start,
				    entry->protection & ~VM_PROT_WRITE);
			}
		}
		if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			/*
			 * Make the object COPY_DELAY to get a stable object
			 * to wire.
			 * That should avoid creating long shadow chains while
			 * wiring/unwiring the same range repeatedly.
			 * That also prevents part of the object from being
			 * wired while another part is "needs_copy", which
			 * could result in conflicting rules wrt copy-on-write.
			 */
			vm_object_t object;

			object = VME_OBJECT(entry);
			vm_object_lock(object);
			/* re-check the strategy now that the object lock is held */
			if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
				assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
				    "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
				    object, (uint64_t)object->vo_size,
				    entry,
				    (uint64_t)entry->vme_start,
				    (uint64_t)entry->vme_end,
				    (uint64_t)VME_OFFSET(entry),
				    (uint64_t)size);
				assertf(object->ref_count == 1,
				    "object %p ref_count %d\n",
				    object, object->ref_count);
				assertf(!entry->needs_copy,
				    "entry %p\n", entry);
				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
				object->true_share = TRUE;
			}
			vm_object_unlock(object);
		}

		vm_map_clip_start(map, entry, s);
		vm_map_clip_end(map, entry, end);

		/* re-compute "e" */
		e = entry->vme_end;
		if (e > end) {
			e = end;
		}

		/*
		 * Check for holes and protection mismatch.
		 * Holes: Next entry should be contiguous unless this
		 * is the end of the region.
		 * Protection: Access requested must be allowed, unless
		 * wiring is by protection class
		 */
		if ((entry->vme_end < end) &&
		    ((entry->vme_next == vm_map_to_entry(map)) ||
		    (entry->vme_next->vme_start > entry->vme_end))) {
			/* found a hole */
			rc = KERN_INVALID_ADDRESS;
			goto done;
		}
		if ((entry->protection & access_type) != access_type) {
			/* found a protection problem */
			rc = KERN_PROTECTION_FAILURE;
			goto done;
		}

		assert(entry->wired_count == 0 && entry->user_wired_count == 0);

		if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
			goto done;
		}

		entry->in_transition = TRUE;

		/*
		 * This entry might get split once we unlock the map.
		 * In vm_fault_wire(), we need the current range as
		 * defined by this entry.  In order for this to work
		 * along with a simultaneous clip operation, we make a
		 * temporary copy of this entry and use that for the
		 * wiring.  Note that the underlying objects do not
		 * change during a clip.
		 */
		tmp_entry = *entry;

		/*
		 * The in_transition state guarantees that the entry
		 * (or entries for this range, if split occurred) will be
		 * there when the map lock is acquired for the second time.
		 */
		vm_map_unlock(map);

		/*
		 * Kernel wirings must not be interrupted while faulting.
		 * The else branch only initializes interruptible_state so
		 * it is never read uninitialized.
		 */
		if (!user_wire && cur_thread != THREAD_NULL) {
			interruptible_state = thread_interrupt_level(THREAD_UNINT);
		} else {
			interruptible_state = THREAD_UNINT;
		}

		if (map_pmap) {
			rc = vm_fault_wire(map,
			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
			    physpage_p);
		} else {
			rc = vm_fault_wire(map,
			    &tmp_entry, caller_prot, tag, map->pmap,
			    tmp_entry.vme_start,
			    physpage_p);
		}

		if (!user_wire && cur_thread != THREAD_NULL) {
			thread_interrupt_level(interruptible_state);
		}

		vm_map_lock(map);

		if (last_timestamp + 1 != map->timestamp) {
			/*
			 * Find the entry again.  It could have been clipped
			 * after we unlocked the map.
			 */
			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
			    &first_entry)) {
				panic("vm_map_wire: re-lookup failed");
			}

			entry = first_entry;
		}

		last_timestamp = map->timestamp;

		/* clear in_transition on every piece of the wired range */
		while ((entry != vm_map_to_entry(map)) &&
		    (entry->vme_start < tmp_entry.vme_end)) {
			assert(entry->in_transition);
			entry->in_transition = FALSE;
			if (entry->needs_wakeup) {
				entry->needs_wakeup = FALSE;
				need_wakeup = TRUE;
			}
			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
				subtract_wire_counts(map, entry, user_wire);
			}
			entry = entry->vme_next;
		}

		if (rc != KERN_SUCCESS) {               /* from vm_*_wire */
			goto done;
		}

		if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
		    (tmp_entry.vme_end != end) &&      /* AND, we are not at the end of the requested range */
		    (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
			/* found a "new" hole */
			s = tmp_entry.vme_end;
			rc = KERN_INVALID_ADDRESS;
			goto done;
		}

		s = entry->vme_start;
	} /* end while loop through map entries */

done:
	if (rc == KERN_SUCCESS) {
		/* repair any damage we may have made to the VM map */
		vm_map_simplify_range(map, start, end);
	}

	vm_map_unlock(map);

	/*
	 * wake up anybody waiting on entries we wired.
	 */
	if (need_wakeup) {
		vm_map_entry_wakeup(map);
	}

	if (rc != KERN_SUCCESS) {
		/* undo what has been wired so far */
		vm_map_unwire_nested(map, start, s, user_wire,
		    map_pmap, pmap_addr);
		if (physpage_p) {
			*physpage_p = 0;
		}
	}

	return rc;
}
7296
7297 kern_return_t
vm_map_wire_external(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,boolean_t user_wire)7298 vm_map_wire_external(
7299 vm_map_t map,
7300 vm_map_offset_t start,
7301 vm_map_offset_t end,
7302 vm_prot_t caller_prot,
7303 boolean_t user_wire)
7304 {
7305 kern_return_t kret;
7306
7307 kret = vm_map_wire_nested(map, start, end, caller_prot, vm_tag_bt(),
7308 user_wire, (pmap_t)NULL, 0, NULL);
7309 return kret;
7310 }
7311
7312 kern_return_t
vm_map_wire_kernel(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire)7313 vm_map_wire_kernel(
7314 vm_map_t map,
7315 vm_map_offset_t start,
7316 vm_map_offset_t end,
7317 vm_prot_t caller_prot,
7318 vm_tag_t tag,
7319 boolean_t user_wire)
7320 {
7321 kern_return_t kret;
7322
7323 kret = vm_map_wire_nested(map, start, end, caller_prot, tag,
7324 user_wire, (pmap_t)NULL, 0, NULL);
7325 return kret;
7326 }
7327
7328 kern_return_t
vm_map_wire_and_extract_external(vm_map_t map,vm_map_offset_t start,vm_prot_t caller_prot,boolean_t user_wire,ppnum_t * physpage_p)7329 vm_map_wire_and_extract_external(
7330 vm_map_t map,
7331 vm_map_offset_t start,
7332 vm_prot_t caller_prot,
7333 boolean_t user_wire,
7334 ppnum_t *physpage_p)
7335 {
7336 kern_return_t kret;
7337
7338 kret = vm_map_wire_nested(map,
7339 start,
7340 start + VM_MAP_PAGE_SIZE(map),
7341 caller_prot,
7342 vm_tag_bt(),
7343 user_wire,
7344 (pmap_t)NULL,
7345 0,
7346 physpage_p);
7347 if (kret != KERN_SUCCESS &&
7348 physpage_p != NULL) {
7349 *physpage_p = 0;
7350 }
7351 return kret;
7352 }
7353
7354 /*
7355 * vm_map_unwire:
7356 *
7357 * Sets the pageability of the specified address range in the target
7358 * as pageable. Regions specified must have been wired previously.
7359 *
7360 * The map must not be locked, but a reference must remain to the map
7361 * throughout the call.
7362 *
7363 * Kernel will panic on failures. User unwire ignores holes and
7364 * unwired and intransition entries to avoid losing memory by leaving
7365 * it unwired.
7366 */
/*
 * Internal worker shared by vm_map_unwire() and recursive submap
 * unwiring.
 *
 * When called for a nested submap, "map_pmap"/"pmap_addr" name the
 * pmap and base address the physical unwiring must be applied to;
 * when NULL/0, each entry is unwired against this map's own pmap.
 *
 * "user_wire" selects which wire accounting to undo and also selects
 * leniency: user unwire skips holes and unwired/in-transition entries
 * (returning/continuing), while kernel unwire panics on them.
 */
static kern_return_t
vm_map_unwire_nested(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	boolean_t       user_wire,
	pmap_t          map_pmap,
	vm_map_offset_t pmap_addr)
{
	vm_map_entry_t entry;
	struct vm_map_entry *first_entry, tmp_entry;
	boolean_t need_wakeup;
	/* NOTE(review): "main_map" is assigned below but never read in this function. */
	boolean_t main_map = FALSE;
	unsigned int last_timestamp;

	vm_map_lock(map);
	if (map_pmap == NULL) {
		main_map = TRUE;
	}
	/* snapshot the timestamp to detect map changes while unlocked */
	last_timestamp = map->timestamp;

	VM_MAP_RANGE_CHECK(map, start, end);
	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));

	if (start == end) {
		/* We unwired what the caller asked for: zero pages */
		vm_map_unlock(map);
		return KERN_SUCCESS;
	}

	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	if (vm_map_lookup_entry(map, start, &first_entry)) {
		entry = first_entry;
		/*
		 * vm_map_clip_start will be done later.
		 * We don't want to unnest any nested sub maps here !
		 */
	} else {
		if (!user_wire) {
			panic("vm_map_unwire: start not found");
		}
		/* Start address is not in map. */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	if (entry->superpage_size) {
		/* superpages are always wired */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	need_wakeup = FALSE;
	/* walk every entry overlapping [start, end) */
	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
		if (entry->in_transition) {
			/*
			 * 1)
			 * Another thread is wiring down this entry. Note
			 * that if it is not for the other thread we would
			 * be unwiring an unwired entry. This is not
			 * permitted. If we wait, we will be unwiring memory
			 * we did not wire.
			 *
			 * 2)
			 * Another thread is unwiring this entry. We did not
			 * have a reference to it, because if we did, this
			 * entry will not be getting unwired now.
			 */
			if (!user_wire) {
				/*
				 * XXX FBDP
				 * This could happen: there could be some
				 * overlapping vslock/vsunlock operations
				 * going on.
				 * We should probably just wait and retry,
				 * but then we have to be careful that this
				 * entry could get "simplified" after
				 * "in_transition" gets unset and before
				 * we re-lookup the entry, so we would
				 * have to re-clip the entry to avoid
				 * re-unwiring what we have already unwired...
				 * See vm_map_wire_nested().
				 *
				 * Or we could just ignore "in_transition"
				 * here and proceed to decement the wired
				 * count(s) on this entry. That should be fine
				 * as long as "wired_count" doesn't drop all
				 * the way to 0 (and we should panic if THAT
				 * happens).
				 */
				panic("vm_map_unwire: in_transition entry");
			}

			/* user unwire: skip the busy entry rather than lose memory */
			entry = entry->vme_next;
			continue;
		}

		if (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;
			pmap_t pmap;

			/* clip so the recursion covers exactly our range */
			vm_map_clip_start(map, entry, start);
			vm_map_clip_end(map, entry, end);

			/* translate the clipped range into submap offsets */
			sub_start = VME_OFFSET(entry);
			sub_end = entry->vme_end - entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			if (map_pmap == NULL) {
				/*
				 * Top-level (non-nested) call: pick the pmap
				 * the submap's pages actually live in.
				 */
				if (entry->use_pmap) {
					pmap = VME_SUBMAP(entry)->pmap;
					pmap_addr = sub_start;
				} else {
					pmap = map->pmap;
					pmap_addr = start;
				}
				if (entry->wired_count == 0 ||
				    (user_wire && entry->user_wired_count == 0)) {
					if (!user_wire) {
						panic("vm_map_unwire: entry is unwired");
					}
					/* user unwire tolerates already-unwired entries */
					entry = entry->vme_next;
					continue;
				}

				/*
				 * Check for holes
				 * Holes: Next entry should be contiguous unless
				 * this is the end of the region.
				 */
				if (((entry->vme_end < end) &&
				    ((entry->vme_next == vm_map_to_entry(map)) ||
				    (entry->vme_next->vme_start
				    > entry->vme_end)))) {
					if (!user_wire) {
						panic("vm_map_unwire: non-contiguous region");
					}
					/*
					 * entry = entry->vme_next;
					 * continue;
					 */
				}

				subtract_wire_counts(map, entry, user_wire);

				if (entry->wired_count != 0) {
					/* still wired by someone else: leave the pmap alone */
					entry = entry->vme_next;
					continue;
				}

				entry->in_transition = TRUE;
				tmp_entry = *entry;/* see comment in vm_map_wire() */

				/*
				 * We can unlock the map now. The in_transition state
				 * guarantees existance of the entry.
				 */
				vm_map_unlock(map);
				vm_map_unwire_nested(VME_SUBMAP(entry),
				    sub_start, sub_end, user_wire, pmap, pmap_addr);
				vm_map_lock(map);

				if (last_timestamp + 1 != map->timestamp) {
					/*
					 * Find the entry again. It could have been
					 * clipped or deleted after we unlocked the map.
					 */
					if (!vm_map_lookup_entry(map,
					    tmp_entry.vme_start,
					    &first_entry)) {
						if (!user_wire) {
							panic("vm_map_unwire: re-lookup failed");
						}
						entry = first_entry->vme_next;
					} else {
						entry = first_entry;
					}
				}
				last_timestamp = map->timestamp;

				/*
				 * clear transition bit for all constituent entries
				 * that were in the original entry (saved in
				 * tmp_entry). Also check for waiters.
				 */
				while ((entry != vm_map_to_entry(map)) &&
				    (entry->vme_start < tmp_entry.vme_end)) {
					assert(entry->in_transition);
					entry->in_transition = FALSE;
					if (entry->needs_wakeup) {
						entry->needs_wakeup = FALSE;
						need_wakeup = TRUE;
					}
					entry = entry->vme_next;
				}
				continue;
			} else {
				/*
				 * Nested call: recurse with the caller-supplied
				 * pmap/address; no wire-count bookkeeping here.
				 */
				tmp_entry = *entry;
				vm_map_unlock(map);
				vm_map_unwire_nested(VME_SUBMAP(entry),
				    sub_start, sub_end, user_wire, map_pmap,
				    pmap_addr);
				vm_map_lock(map);

				if (last_timestamp + 1 != map->timestamp) {
					/*
					 * Find the entry again. It could have been
					 * clipped or deleted after we unlocked the map.
					 */
					if (!vm_map_lookup_entry(map,
					    tmp_entry.vme_start,
					    &first_entry)) {
						if (!user_wire) {
							panic("vm_map_unwire: re-lookup failed");
						}
						entry = first_entry->vme_next;
					} else {
						entry = first_entry;
					}
				}
				last_timestamp = map->timestamp;
			}
		}


		/* regular (non-submap) entry */
		if ((entry->wired_count == 0) ||
		    (user_wire && entry->user_wired_count == 0)) {
			if (!user_wire) {
				panic("vm_map_unwire: entry is unwired");
			}

			entry = entry->vme_next;
			continue;
		}

		assert(entry->wired_count > 0 &&
		    (!user_wire || entry->user_wired_count > 0));

		vm_map_clip_start(map, entry, start);
		vm_map_clip_end(map, entry, end);

		/*
		 * Check for holes
		 * Holes: Next entry should be contiguous unless
		 * this is the end of the region.
		 */
		if (((entry->vme_end < end) &&
		    ((entry->vme_next == vm_map_to_entry(map)) ||
		    (entry->vme_next->vme_start > entry->vme_end)))) {
			if (!user_wire) {
				panic("vm_map_unwire: non-contiguous region");
			}
			entry = entry->vme_next;
			continue;
		}

		subtract_wire_counts(map, entry, user_wire);

		if (entry->wired_count != 0) {
			/* other wirings remain: don't touch the pmap yet */
			entry = entry->vme_next;
			continue;
		}

		if (entry->zero_wired_pages) {
			entry->zero_wired_pages = FALSE;
		}

		entry->in_transition = TRUE;
		tmp_entry = *entry; /* see comment in vm_map_wire() */

		/*
		 * We can unlock the map now. The in_transition state
		 * guarantees existance of the entry.
		 */
		vm_map_unlock(map);
		if (map_pmap) {
			vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
			    pmap_addr, tmp_entry.vme_end);
		} else {
			vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
			    tmp_entry.vme_start, tmp_entry.vme_end);
		}
		vm_map_lock(map);

		if (last_timestamp + 1 != map->timestamp) {
			/*
			 * Find the entry again. It could have been clipped
			 * or deleted after we unlocked the map.
			 */
			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
			    &first_entry)) {
				if (!user_wire) {
					panic("vm_map_unwire: re-lookup failed");
				}
				entry = first_entry->vme_next;
			} else {
				entry = first_entry;
			}
		}
		last_timestamp = map->timestamp;

		/*
		 * clear transition bit for all constituent entries that
		 * were in the original entry (saved in tmp_entry). Also
		 * check for waiters.
		 */
		while ((entry != vm_map_to_entry(map)) &&
		    (entry->vme_start < tmp_entry.vme_end)) {
			assert(entry->in_transition);
			entry->in_transition = FALSE;
			if (entry->needs_wakeup) {
				entry->needs_wakeup = FALSE;
				need_wakeup = TRUE;
			}
			entry = entry->vme_next;
		}
	}

	/*
	 * We might have fragmented the address space when we wired this
	 * range of addresses. Attempt to re-coalesce these VM map entries
	 * with their neighbors now that they're no longer wired.
	 * Under some circumstances, address space fragmentation can
	 * prevent VM object shadow chain collapsing, which can cause
	 * swap space leaks.
	 */
	vm_map_simplify_range(map, start, end);

	vm_map_unlock(map);
	/*
	 * wake up anybody waiting on entries that we have unwired.
	 */
	if (need_wakeup) {
		vm_map_entry_wakeup(map);
	}
	return KERN_SUCCESS;
}
7711
7712 kern_return_t
vm_map_unwire(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,boolean_t user_wire)7713 vm_map_unwire(
7714 vm_map_t map,
7715 vm_map_offset_t start,
7716 vm_map_offset_t end,
7717 boolean_t user_wire)
7718 {
7719 return vm_map_unwire_nested(map, start, end,
7720 user_wire, (pmap_t)NULL, 0);
7721 }
7722
7723
7724 /*
7725 * vm_map_entry_zap: [ internal use only ]
7726 *
7727 * Remove the entry from the target map
7728 * and put it on a zap list.
7729 */
7730 static void
vm_map_entry_zap(vm_map_t map,vm_map_entry_t entry,vm_map_zap_t zap)7731 vm_map_entry_zap(
7732 vm_map_t map,
7733 vm_map_entry_t entry,
7734 vm_map_zap_t zap)
7735 {
7736 vm_map_offset_t s, e;
7737
7738 s = entry->vme_start;
7739 e = entry->vme_end;
7740 assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7741 assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7742 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7743 assert(page_aligned(s));
7744 assert(page_aligned(e));
7745 }
7746 if (entry->map_aligned == TRUE) {
7747 assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7748 assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7749 }
7750 assert(entry->wired_count == 0);
7751 assert(entry->user_wired_count == 0);
7752 assert(!entry->vme_permanent);
7753
7754 vm_map_store_entry_unlink(map, entry, false);
7755 map->size -= e - s;
7756
7757 vm_map_zap_append(zap, entry);
7758 }
7759
/*
 * vm_map_submap_pmap_clean:
 *
 * Remove physical mappings for the portion of "sub_map" that backs
 * [start, end) in "map", where "offset" is the submap address
 * corresponding to "start".  Walks the submap entries overlapping
 * [offset, offset + (end - start)) under a read lock, recursing for
 * nested submaps.
 */
static void
vm_map_submap_pmap_clean(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_t        sub_map,
	vm_map_offset_t offset)
{
	vm_map_offset_t submap_start;
	vm_map_offset_t submap_end;
	vm_map_size_t remove_size;
	vm_map_entry_t entry;

	/* the range expressed in submap addresses */
	submap_end = offset + (end - start);
	submap_start = offset;

	vm_map_lock_read(sub_map);
	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
		/* first entry: trim the size to the overlap with our range */
		remove_size = (entry->vme_end - entry->vme_start);
		if (offset > entry->vme_start) {
			remove_size -= offset - entry->vme_start;
		}


		if (submap_end < entry->vme_end) {
			remove_size -=
			    entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			/* nested submap: recurse one level deeper */
			vm_map_submap_pmap_clean(
				sub_map,
				start,
				start + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				/*
				 * Map is shared with other pmaps: strip the
				 * mappings at the object level so every pmap
				 * mapping the object is covered.
				 */
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					(VME_OFFSET(entry) +
					offset -
					entry->vme_start),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				/*
				 * NOTE(review): this first-entry path removes
				 * [start, start + remove_size) even when
				 * "offset" fell inside the entry; the loop
				 * below uses (start + vme_start) - offset
				 * instead — confirm the asymmetry is intended.
				 */
				pmap_remove(map->pmap,
				    (addr64_t)start,
				    (addr64_t)(start + remove_size));
			}
		}
	}

	/* continue with the remaining entries overlapping the range */
	entry = entry->vme_next;

	while ((entry != vm_map_to_entry(sub_map))
	    && (entry->vme_start < submap_end)) {
		remove_size = (entry->vme_end - entry->vme_start);
		if (submap_end < entry->vme_end) {
			remove_size -= entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			vm_map_submap_pmap_clean(
				sub_map,
				(start + entry->vme_start) - offset,
				((start + entry->vme_start) - offset) + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					VME_OFFSET(entry),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				/* translate submap address back to parent-map address */
				pmap_remove(map->pmap,
				    (addr64_t)((start + entry->vme_start)
				    - offset),
				    (addr64_t)(((start + entry->vme_start)
				    - offset) + remove_size));
			}
		}
		entry = entry->vme_next;
	}
	vm_map_unlock_read(sub_map);
	return;
}
7859
7860 /*
7861 * virt_memory_guard_ast:
7862 *
7863 * Handle the AST callout for a virtual memory guard.
7864 * raise an EXC_GUARD exception and terminate the task
7865 * if configured to do so.
7866 */
/*
 * virt_memory_guard_ast:
 *
 * AST callout that delivers a virtual-memory EXC_GUARD exception for
 * the current task.  Honors the task's task_exc_guard policy bits:
 * delivery enable, deliver-only-once, fatal, and corpse generation.
 */
void
virt_memory_guard_ast(
	thread_t thread,
	mach_exception_data_type_t code,
	mach_exception_data_type_t subcode)
{
	task_t task = get_threadtask(thread);
	assert(task != kernel_task);
	assert(task == current_task());
	kern_return_t sync_exception_result;
	uint32_t behavior;

	behavior = task->task_exc_guard;

	/* Is delivery enabled */
	if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
		return;
	}

	/* If only once, make sure we're that once */
	while (behavior & TASK_EXC_GUARD_VM_ONCE) {
		/* CAS loop: atomically clear the DELIVER bit so exactly one
		 * thread wins the right to deliver this exception. */
		uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;

		if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
			break;
		}
		/* lost the race: reload and re-check whether delivery is still on */
		behavior = task->task_exc_guard;
		if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
			return;
		}
	}

	const bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
	/* Raise exception synchronously and see if handler claimed it */
	sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode, fatal);

	if (fatal) {
		/*
		 * If Synchronous EXC_GUARD delivery was successful then
		 * kill the process and return, else kill the process
		 * and deliver the exception via EXC_CORPSE_NOTIFY.
		 */
		if (sync_exception_result == KERN_SUCCESS) {
			task_bsdtask_kill(current_task());
		} else {
			exit_with_guard_exception(current_proc(), code, subcode);
		}
	} else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
		/*
		 * If the synchronous EXC_GUARD delivery was not successful,
		 * raise a simulated crash.
		 */
		if (sync_exception_result != KERN_SUCCESS) {
			task_violated_guard(code, subcode, NULL, FALSE);
		}
	}
}
7924
7925 /*
7926 * vm_map_guard_exception:
7927 *
7928 * Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
7929 *
7930 * Right now, we do this when we find nothing mapped, or a
7931 * gap in the mapping when a user address space deallocate
7932 * was requested. We report the address of the first gap found.
7933 */
7934 static void
vm_map_guard_exception(vm_map_offset_t gap_start,unsigned reason)7935 vm_map_guard_exception(
7936 vm_map_offset_t gap_start,
7937 unsigned reason)
7938 {
7939 mach_exception_code_t code = 0;
7940 unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7941 unsigned int target = 0; /* should we pass in pid associated with map? */
7942 mach_exception_data_type_t subcode = (uint64_t)gap_start;
7943 boolean_t fatal = FALSE;
7944
7945 task_t task = current_task_early();
7946
7947 /* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7948 if (task == NULL || task == kernel_task) {
7949 return;
7950 }
7951
7952 EXC_GUARD_ENCODE_TYPE(code, guard_type);
7953 EXC_GUARD_ENCODE_FLAVOR(code, reason);
7954 EXC_GUARD_ENCODE_TARGET(code, target);
7955
7956 if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7957 fatal = TRUE;
7958 }
7959 thread_guard_violation(current_thread(), code, subcode, fatal);
7960 }
7961
7962 static kern_return_t
vm_map_delete_submap_recurse(vm_map_t submap,vm_map_offset_t submap_start,vm_map_offset_t submap_end)7963 vm_map_delete_submap_recurse(
7964 vm_map_t submap,
7965 vm_map_offset_t submap_start,
7966 vm_map_offset_t submap_end)
7967 {
7968 vm_map_entry_t submap_entry;
7969
7970 /*
7971 * Verify that the submap does not contain any "permanent" entries
7972 * within the specified range.
7973 * We do not care about gaps.
7974 */
7975
7976 vm_map_lock(submap);
7977
7978 if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
7979 submap_entry = submap_entry->vme_next;
7980 }
7981
7982 for (;
7983 submap_entry != vm_map_to_entry(submap) &&
7984 submap_entry->vme_start < submap_end;
7985 submap_entry = submap_entry->vme_next) {
7986 if (submap_entry->vme_permanent) {
7987 /* "permanent" entry -> fail */
7988 vm_map_unlock(submap);
7989 return KERN_PROTECTION_FAILURE;
7990 }
7991 }
7992 /* no "permanent" entries in the range -> success */
7993 vm_map_unlock(submap);
7994 return KERN_SUCCESS;
7995 }
7996
/* Fatal: vm_map_delete() was given a "start" not aligned to the map's page size. */
__abortlike
static void
__vm_map_delete_misaligned_panic(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
	    map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
}
8007
/* Fatal: a vm_map_delete() on a kernel map failed with an unexpected return code. */
__abortlike
static void
__vm_map_delete_failed_panic(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	kern_return_t   kr)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
	    map, (uint64_t)start, (uint64_t)end, kr);
}
8019
/* Fatal: vm_map_delete() on a kernel map found a gap at "where" inside [start, end). */
__abortlike
static void
__vm_map_delete_gap_panic(
	vm_map_t        map,
	vm_map_offset_t where,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
}
8031
/* Fatal: an attempt was made to remove a "permanent" entry from a kernel map. */
__abortlike
static void
__vm_map_delete_permanent_panic(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_entry_t  entry)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): "
	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
	    map, (uint64_t)start, (uint64_t)end, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end);
}
8046
/*
 * Internal state flags threaded through a single vm_map_delete() call.
 */
__options_decl(vm_map_delete_state_t, uint32_t, {
	VMDS_NONE = 0x0000,

	VMDS_FOUND_GAP = 0x0001,        /* an unmapped hole was found in the range */
	VMDS_GAPS_OK = 0x0002,          /* map is terminated/unreferenced: gaps tolerated */

	VMDS_KERNEL_PMAP = 0x0004,      /* target map uses the kernel pmap (strict mode) */
	VMDS_NEEDS_LOOKUP = 0x0008,     /* lock was dropped: re-lookup the entry */
	VMDS_NEEDS_WAKEUP = 0x0010,     /* some entry had waiters: wake them before return */
	VMDS_KERNEL_KMEMPTR = 0x0020    /* range belongs to a kmem pointer range (slot validation) */
});
8058
8059 /*
8060 * vm_map_delete: [ internal use only ]
8061 *
8062 * Deallocates the given address range from the target map.
8063 * Removes all user wirings. Unwires one kernel wiring if
8064 * VM_MAP_REMOVE_KUNWIRE is set. Waits for kernel wirings to go
8065 * away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set. Sleeps
8066 * interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
8067 *
8068 *
8069 * When the map is a kernel map, then any error in removing mappings
8070 * will lead to a panic so that clients do not have to repeat the panic
8071 * code at each call site. If VM_MAP_REMOVE_INTERRUPTIBLE
8072 * is also passed, then KERN_ABORTED will not lead to a panic.
8073 *
8074 * This routine is called with map locked and leaves map locked.
8075 */
8076 static kmem_return_t
vm_map_delete(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard,vm_map_zap_t zap_list)8077 vm_map_delete(
8078 vm_map_t map,
8079 vm_map_offset_t start,
8080 vm_map_offset_t end,
8081 vmr_flags_t flags,
8082 kmem_guard_t guard,
8083 vm_map_zap_t zap_list)
8084 {
8085 vm_map_entry_t entry, next;
8086 int interruptible;
8087 vm_map_offset_t gap_start = 0;
8088 vm_map_offset_t clear_in_transition_end = 0;
8089 __unused vm_map_offset_t save_start = start;
8090 __unused vm_map_offset_t save_end = end;
8091 vm_map_delete_state_t state = VMDS_NONE;
8092 kmem_return_t ret = { };
8093 vm_map_range_id_t range_id = 0;
8094 struct kmem_page_meta *meta = NULL;
8095 uint32_t size_idx, slot_idx;
8096 struct mach_vm_range slot;
8097
8098 if (vm_map_pmap(map) == kernel_pmap) {
8099 state |= VMDS_KERNEL_PMAP;
8100 range_id = kmem_addr_get_range(start, end - start);
8101 if (kmem_is_ptr_range(range_id)) {
8102 state |= VMDS_KERNEL_KMEMPTR;
8103 slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
8104 &size_idx, &slot);
8105 }
8106 }
8107
8108 if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
8109 state |= VMDS_GAPS_OK;
8110 }
8111
8112 if (map->corpse_source &&
8113 !(flags & VM_MAP_REMOVE_TO_OVERWRITE) &&
8114 !map->terminated) {
8115 /*
8116 * The map is being used for corpses related diagnostics.
8117 * So skip any entry removal to avoid perturbing the map state.
8118 * The cleanup will happen in task_terminate_internal after the
8119 * call to task_port_no_senders.
8120 */
8121 goto out;
8122 }
8123
8124 interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
8125 THREAD_ABORTSAFE : THREAD_UNINT;
8126
8127 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
8128 (start & VM_MAP_PAGE_MASK(map))) {
8129 __vm_map_delete_misaligned_panic(map, start, end);
8130 }
8131
8132 if ((state & VMDS_GAPS_OK) == 0) {
8133 /*
8134 * If the map isn't terminated then all deletions must have
8135 * no gaps, and be within the [min, max) of the map.
8136 *
8137 * We got here without VM_MAP_RANGE_CHECK() being called,
8138 * and hence must validate bounds manually.
8139 *
8140 * It is worth noting that because vm_deallocate() will
8141 * round_page() the deallocation size, it's possible for "end"
8142 * to be 0 here due to overflow. We hence must treat it as being
8143 * beyond vm_map_max(map).
8144 *
8145 * Similarly, end < start means some wrap around happend,
8146 * which should cause an error or panic.
8147 */
8148 if (end == 0 || end > vm_map_max(map)) {
8149 state |= VMDS_FOUND_GAP;
8150 gap_start = vm_map_max(map);
8151 if (state & VMDS_KERNEL_PMAP) {
8152 __vm_map_delete_gap_panic(map,
8153 gap_start, start, end);
8154 }
8155 goto out;
8156 }
8157
8158 if (end < start) {
8159 if (state & VMDS_KERNEL_PMAP) {
8160 __vm_map_delete_gap_panic(map,
8161 vm_map_max(map), start, end);
8162 }
8163 ret.kmr_return = KERN_INVALID_ARGUMENT;
8164 goto out;
8165 }
8166
8167 if (start < vm_map_min(map)) {
8168 state |= VMDS_FOUND_GAP;
8169 gap_start = start;
8170 if (state & VMDS_KERNEL_PMAP) {
8171 __vm_map_delete_gap_panic(map,
8172 gap_start, start, end);
8173 }
8174 goto out;
8175 }
8176 } else {
8177 /*
8178 * If the map is terminated, we must accept start/end
8179 * being beyond the boundaries of the map as this is
8180 * how some of the mappings like commpage mappings
8181 * can be destroyed (they're outside of those bounds).
8182 *
8183 * end < start is still something we can't cope with,
8184 * so just bail.
8185 */
8186 if (end < start) {
8187 goto out;
8188 }
8189 }
8190
8191
8192 /*
8193 * Find the start of the region.
8194 *
8195 * If in a superpage, extend the range
8196 * to include the start of the mapping.
8197 */
8198 while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8199 if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8200 start = SUPERPAGE_ROUND_DOWN(start);
8201 } else {
8202 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8203 break;
8204 }
8205 }
8206
8207 if (entry->superpage_size) {
8208 end = SUPERPAGE_ROUND_UP(end);
8209 }
8210
8211 /*
8212 * Step through all entries in this region
8213 */
8214 for (vm_map_offset_t s = start; s < end;) {
8215 /*
8216 * At this point, we have deleted all the memory entries
8217 * in [start, s) and are proceeding with the [s, end) range.
8218 *
8219 * This loop might drop the map lock, and it is possible that
8220 * some memory was already reallocated within [start, s)
8221 * and we don't want to mess with those entries.
8222 *
8223 * Some of those entries could even have been re-assembled
8224 * with an entry after "s" (in vm_map_simplify_entry()), so
8225 * we may have to vm_map_clip_start() again.
8226 *
8227 * When clear_in_transition_end is set, the we had marked
8228 * [start, clear_in_transition_end) as "in_transition"
8229 * during a previous iteration and we need to clear it.
8230 */
8231
8232 /*
8233 * Step 1: If needed (because we dropped locks),
8234 * lookup the entry again.
8235 *
8236 * If we're coming back from unwiring (Step 5),
8237 * we also need to mark the entries as no longer
8238 * in transition after that.
8239 */
8240
8241 if (state & VMDS_NEEDS_LOOKUP) {
8242 state &= ~VMDS_NEEDS_LOOKUP;
8243
8244 if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8245 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8246 }
8247
8248 if (state & VMDS_KERNEL_KMEMPTR) {
8249 kmem_validate_slot(s, meta, size_idx, slot_idx);
8250 }
8251 }
8252
8253 if (clear_in_transition_end) {
8254 for (vm_map_entry_t it = entry;
8255 it != vm_map_to_entry(map) &&
8256 it->vme_start < clear_in_transition_end;
8257 it = it->vme_next) {
8258 assert(it->in_transition);
8259 it->in_transition = FALSE;
8260 if (it->needs_wakeup) {
8261 it->needs_wakeup = FALSE;
8262 state |= VMDS_NEEDS_WAKEUP;
8263 }
8264 }
8265
8266 clear_in_transition_end = 0;
8267 }
8268
8269
8270 /*
8271 * Step 2: Perform various policy checks
8272 * before we do _anything_ to this entry.
8273 */
8274
8275 if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8276 if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8277 /*
8278 * Either we found a gap already,
8279 * or we are tearing down a map,
8280 * keep going.
8281 */
8282 } else if (state & VMDS_KERNEL_PMAP) {
8283 __vm_map_delete_gap_panic(map, s, start, end);
8284 } else if (s < end) {
8285 state |= VMDS_FOUND_GAP;
8286 gap_start = s;
8287 }
8288
8289 if (entry == vm_map_to_entry(map) ||
8290 end <= entry->vme_start) {
8291 break;
8292 }
8293
8294 s = entry->vme_start;
8295 }
8296
8297 if (state & VMDS_KERNEL_PMAP) {
8298 /*
8299 * In the kernel map and its submaps,
8300 * permanent entries never die, even
8301 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8302 */
8303 if (entry->vme_permanent) {
8304 __vm_map_delete_permanent_panic(map, start, end, entry);
8305 }
8306
8307 if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8308 end = entry->vme_end;
8309 flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8310 }
8311
8312 /*
8313 * In the kernel map and its submaps,
8314 * the removal of an atomic/guarded entry is strict.
8315 *
8316 * An atomic entry is processed only if it was
8317 * specifically targeted.
8318 *
8319 * We might have deleted non-atomic entries before
8320 * we reach this this point however...
8321 */
8322 kmem_entry_validate_guard(map, entry,
8323 start, end - start, guard);
8324 }
8325
8326 /*
8327 * Step 2.1: handle "permanent" and "submap" entries
8328 * *before* clipping to avoid triggering some unnecessary
8329 * un-nesting of the shared region.
8330 */
8331 if (entry->vme_permanent && entry->is_sub_map) {
8332 // printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8333 /*
8334 * Un-mapping a "permanent" mapping of a user-space
8335 * submap is not allowed unless...
8336 */
8337 if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8338 /*
8339 * a. explicitly requested by the kernel caller.
8340 */
8341 // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8342 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8343 developer_mode_state()) {
8344 /*
8345 * b. we're in "developer" mode (for
8346 * breakpoints, dtrace probes, ...).
8347 */
8348 // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8349 } else if (map->terminated) {
8350 /*
8351 * c. this is the final address space cleanup.
8352 */
8353 // printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8354 } else {
8355 vm_map_offset_t submap_start, submap_end;
8356 kern_return_t submap_kr;
8357
8358 /*
8359 * Check if there are any "permanent" mappings
8360 * in this range in the submap.
8361 */
8362 if (entry->in_transition) {
8363 /* can that even happen ? */
8364 goto in_transition;
8365 }
8366 /* compute the clipped range in the submap */
8367 submap_start = s - entry->vme_start;
8368 submap_start += VME_OFFSET(entry);
8369 submap_end = end - entry->vme_start;
8370 submap_end += VME_OFFSET(entry);
8371 submap_kr = vm_map_delete_submap_recurse(
8372 VME_SUBMAP(entry),
8373 submap_start,
8374 submap_end);
8375 if (submap_kr != KERN_SUCCESS) {
8376 /*
8377 * There are some "permanent" mappings
8378 * in the submap: we are not allowed
8379 * to remove this range.
8380 */
8381 printf("%d[%s] removing permanent submap entry "
8382 "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8383 proc_selfpid(),
8384 (get_bsdtask_info(current_task())
8385 ? proc_name_address(get_bsdtask_info(current_task()))
8386 : "?"), entry,
8387 (uint64_t)entry->vme_start,
8388 (uint64_t)entry->vme_end,
8389 entry->protection,
8390 entry->max_protection);
8391 DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8392 vm_map_entry_t, entry,
8393 vm_map_offset_t, entry->vme_start,
8394 vm_map_offset_t, entry->vme_end,
8395 vm_prot_t, entry->protection,
8396 vm_prot_t, entry->max_protection,
8397 int, VME_ALIAS(entry));
8398 ret.kmr_return = KERN_PROTECTION_FAILURE;
8399 goto out;
8400 }
8401 /* no permanent mappings: proceed */
8402 }
8403 }
8404
8405 /*
8406 * Step 3: Perform any clipping needed.
8407 *
8408 * After this, "entry" starts at "s", ends before "end"
8409 */
8410
8411 if (entry->vme_start < s) {
8412 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8413 entry->map_aligned &&
8414 !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8415 /*
8416 * The entry will no longer be map-aligned
8417 * after clipping and the caller said it's OK.
8418 */
8419 entry->map_aligned = FALSE;
8420 }
8421 vm_map_clip_start(map, entry, s);
8422 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8423 }
8424
8425 if (end < entry->vme_end) {
8426 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8427 entry->map_aligned &&
8428 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8429 /*
8430 * The entry will no longer be map-aligned
8431 * after clipping and the caller said it's OK.
8432 */
8433 entry->map_aligned = FALSE;
8434 }
8435 vm_map_clip_end(map, entry, end);
8436 }
8437
8438 if (entry->vme_permanent && entry->is_sub_map) {
8439 /*
8440 * We already went through step 2.1 which did not deny
8441 * the removal of this "permanent" and "is_sub_map"
8442 * entry.
8443 * Now that we've clipped what we actually want to
8444 * delete, undo the "permanent" part to allow the
8445 * removal to proceed.
8446 */
8447 DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8448 vm_map_entry_t, entry,
8449 vm_map_offset_t, entry->vme_start,
8450 vm_map_offset_t, entry->vme_end,
8451 vm_prot_t, entry->protection,
8452 vm_prot_t, entry->max_protection,
8453 int, VME_ALIAS(entry));
8454 entry->vme_permanent = false;
8455 }
8456
8457 assert(s == entry->vme_start);
8458 assert(entry->vme_end <= end);
8459
8460
8461 /*
8462 * Step 4: If the entry is in flux, wait for this to resolve.
8463 */
8464
8465 if (entry->in_transition) {
8466 wait_result_t wait_result;
8467
8468 in_transition:
8469 /*
8470 * Another thread is wiring/unwiring this entry.
8471 * Let the other thread know we are waiting.
8472 */
8473
8474 entry->needs_wakeup = TRUE;
8475
8476 /*
8477 * wake up anybody waiting on entries that we have
8478 * already unwired/deleted.
8479 */
8480 if (state & VMDS_NEEDS_WAKEUP) {
8481 vm_map_entry_wakeup(map);
8482 state &= ~VMDS_NEEDS_WAKEUP;
8483 }
8484
8485 wait_result = vm_map_entry_wait(map, interruptible);
8486
8487 if (interruptible &&
8488 wait_result == THREAD_INTERRUPTED) {
8489 /*
8490 * We do not clear the needs_wakeup flag,
8491 * since we cannot tell if we were the only one.
8492 */
8493 ret.kmr_return = KERN_ABORTED;
8494 return ret;
8495 }
8496
8497 /*
8498 * The entry could have been clipped or it
8499 * may not exist anymore. Look it up again.
8500 */
8501 state |= VMDS_NEEDS_LOOKUP;
8502 continue;
8503 }
8504
8505
8506 /*
8507 * Step 5: Handle wiring
8508 */
8509
8510 if (entry->wired_count) {
8511 struct vm_map_entry tmp_entry;
8512 boolean_t user_wire;
8513 unsigned int last_timestamp;
8514
8515 user_wire = entry->user_wired_count > 0;
8516
8517 /*
8518 * Remove a kernel wiring if requested
8519 */
8520 if (flags & VM_MAP_REMOVE_KUNWIRE) {
8521 entry->wired_count--;
8522 vme_btref_consider_and_put(entry);
8523 }
8524
8525 /*
8526 * Remove all user wirings for proper accounting
8527 */
8528 while (entry->user_wired_count) {
8529 subtract_wire_counts(map, entry, user_wire);
8530 }
8531
8532 /*
8533 * All our DMA I/O operations in IOKit are currently
8534 * done by wiring through the map entries of the task
8535 * requesting the I/O.
8536 *
8537 * Because of this, we must always wait for kernel wirings
8538 * to go away on the entries before deleting them.
8539 *
8540 * Any caller who wants to actually remove a kernel wiring
8541 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8542 * properly remove one wiring instead of blasting through
8543 * them all.
8544 */
8545 if (entry->wired_count != 0) {
8546 assert(map != kernel_map);
8547 /*
8548 * Cannot continue. Typical case is when
8549 * a user thread has physical io pending on
8550 * on this page. Either wait for the
8551 * kernel wiring to go away or return an
8552 * error.
8553 */
8554 wait_result_t wait_result;
8555
8556 entry->needs_wakeup = TRUE;
8557 wait_result = vm_map_entry_wait(map,
8558 interruptible);
8559
8560 if (interruptible &&
8561 wait_result == THREAD_INTERRUPTED) {
8562 /*
8563 * We do not clear the
8564 * needs_wakeup flag, since we
8565 * cannot tell if we were the
8566 * only one.
8567 */
8568 ret.kmr_return = KERN_ABORTED;
8569 return ret;
8570 }
8571
8572
8573 /*
8574 * The entry could have been clipped or
8575 * it may not exist anymore. Look it
8576 * up again.
8577 */
8578 state |= VMDS_NEEDS_LOOKUP;
8579 continue;
8580 }
8581
8582 /*
8583 * We can unlock the map now.
8584 *
8585 * The entry might be split once we unlock the map,
8586 * but we need the range as defined by this entry
8587 * to be stable. So we must make a local copy.
8588 *
8589 * The underlying objects do not change during clips,
8590 * and the in_transition state guarentees existence
8591 * of the entry.
8592 */
8593 last_timestamp = map->timestamp;
8594 entry->in_transition = TRUE;
8595 tmp_entry = *entry;
8596 vm_map_unlock(map);
8597
8598 if (tmp_entry.is_sub_map) {
8599 vm_map_t sub_map;
8600 vm_map_offset_t sub_start, sub_end;
8601 pmap_t pmap;
8602 vm_map_offset_t pmap_addr;
8603
8604
8605 sub_map = VME_SUBMAP(&tmp_entry);
8606 sub_start = VME_OFFSET(&tmp_entry);
8607 sub_end = sub_start + (tmp_entry.vme_end -
8608 tmp_entry.vme_start);
8609 if (tmp_entry.use_pmap) {
8610 pmap = sub_map->pmap;
8611 pmap_addr = tmp_entry.vme_start;
8612 } else {
8613 pmap = map->pmap;
8614 pmap_addr = tmp_entry.vme_start;
8615 }
8616 (void) vm_map_unwire_nested(sub_map,
8617 sub_start, sub_end,
8618 user_wire,
8619 pmap, pmap_addr);
8620 } else {
8621 vm_map_offset_t entry_end = tmp_entry.vme_end;
8622 vm_map_offset_t max_end;
8623
8624 if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8625 max_end = end - VM_MAP_PAGE_SIZE(map);
8626 if (entry_end > max_end) {
8627 entry_end = max_end;
8628 }
8629 }
8630
8631 if (tmp_entry.vme_kernel_object) {
8632 pmap_protect_options(
8633 map->pmap,
8634 tmp_entry.vme_start,
8635 entry_end,
8636 VM_PROT_NONE,
8637 PMAP_OPTIONS_REMOVE,
8638 NULL);
8639 }
8640 vm_fault_unwire(map, &tmp_entry,
8641 tmp_entry.vme_kernel_object, map->pmap,
8642 tmp_entry.vme_start, entry_end);
8643 }
8644
8645 vm_map_lock(map);
8646
8647 /*
8648 * Unwiring happened, we can now go back to deleting
8649 * them (after we clear the in_transition bit for the range).
8650 */
8651 if (last_timestamp + 1 != map->timestamp) {
8652 state |= VMDS_NEEDS_LOOKUP;
8653 }
8654 clear_in_transition_end = tmp_entry.vme_end;
8655 continue;
8656 }
8657
8658 assert(entry->wired_count == 0);
8659 assert(entry->user_wired_count == 0);
8660
8661
8662 /*
8663 * Step 6: Entry is unwired and ready for us to delete !
8664 */
8665
8666 if (!entry->vme_permanent) {
8667 /*
8668 * Typical case: the entry really shouldn't be permanent
8669 */
8670 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8671 (entry->protection & VM_PROT_EXECUTE) &&
8672 developer_mode_state()) {
8673 /*
8674 * Allow debuggers to undo executable mappings
8675 * when developer mode is on.
8676 */
8677 #if 0
8678 printf("FBDP %d[%s] removing permanent executable entry "
8679 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8680 proc_selfpid(),
8681 (current_task()->bsd_info
8682 ? proc_name_address(current_task()->bsd_info)
8683 : "?"), entry,
8684 (uint64_t)entry->vme_start,
8685 (uint64_t)entry->vme_end,
8686 entry->protection,
8687 entry->max_protection);
8688 #endif
8689 entry->vme_permanent = FALSE;
8690 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8691 #if 0
8692 printf("FBDP %d[%s] removing permanent entry "
8693 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8694 proc_selfpid(),
8695 (current_task()->bsd_info
8696 ? proc_name_address(current_task()->bsd_info)
8697 : "?"), entry,
8698 (uint64_t)entry->vme_start,
8699 (uint64_t)entry->vme_end,
8700 entry->protection,
8701 entry->max_protection);
8702 #endif
8703 entry->vme_permanent = FALSE;
8704 #if CODE_SIGNING_MONITOR
8705 } else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8706 entry->vme_permanent = FALSE;
8707
8708 printf("%d[%s] %s(0x%llx,0x%llx): "
8709 "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8710 "prot 0x%x/0x%x\n",
8711 proc_selfpid(),
8712 (get_bsdtask_info(current_task())
8713 ? proc_name_address(get_bsdtask_info(current_task()))
8714 : "?"),
8715 __FUNCTION__,
8716 (uint64_t)start,
8717 (uint64_t)end,
8718 (uint64_t)entry->vme_start,
8719 (uint64_t)entry->vme_end,
8720 entry->protection,
8721 entry->max_protection);
8722 #endif
8723 } else {
8724 DTRACE_VM6(vm_map_delete_permanent,
8725 vm_map_entry_t, entry,
8726 vm_map_offset_t, entry->vme_start,
8727 vm_map_offset_t, entry->vme_end,
8728 vm_prot_t, entry->protection,
8729 vm_prot_t, entry->max_protection,
8730 int, VME_ALIAS(entry));
8731 }
8732
8733 if (entry->is_sub_map) {
8734 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8735 "map %p (%d) entry %p submap %p (%d)\n",
8736 map, VM_MAP_PAGE_SHIFT(map), entry,
8737 VME_SUBMAP(entry),
8738 VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8739 if (entry->use_pmap) {
8740 #ifndef NO_NESTED_PMAP
8741 int pmap_flags;
8742
8743 if (map->terminated) {
8744 /*
8745 * This is the final cleanup of the
8746 * address space being terminated.
8747 * No new mappings are expected and
8748 * we don't really need to unnest the
8749 * shared region (and lose the "global"
8750 * pmap mappings, if applicable).
8751 *
8752 * Tell the pmap layer that we're
8753 * "clean" wrt nesting.
8754 */
8755 pmap_flags = PMAP_UNNEST_CLEAN;
8756 } else {
8757 /*
8758 * We're unmapping part of the nested
8759 * shared region, so we can't keep the
8760 * nested pmap.
8761 */
8762 pmap_flags = 0;
8763 }
8764 pmap_unnest_options(
8765 map->pmap,
8766 (addr64_t)entry->vme_start,
8767 entry->vme_end - entry->vme_start,
8768 pmap_flags);
8769 #endif /* NO_NESTED_PMAP */
8770 if (map->mapped_in_other_pmaps &&
8771 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8772 /* clean up parent map/maps */
8773 vm_map_submap_pmap_clean(
8774 map, entry->vme_start,
8775 entry->vme_end,
8776 VME_SUBMAP(entry),
8777 VME_OFFSET(entry));
8778 }
8779 } else {
8780 vm_map_submap_pmap_clean(
8781 map, entry->vme_start, entry->vme_end,
8782 VME_SUBMAP(entry),
8783 VME_OFFSET(entry));
8784 }
8785 } else if (entry->vme_kernel_object ||
8786 VME_OBJECT(entry) == compressor_object) {
8787 /*
8788 * nothing to do
8789 */
8790 } else if (map->mapped_in_other_pmaps &&
8791 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8792 vm_object_pmap_protect_options(
8793 VME_OBJECT(entry), VME_OFFSET(entry),
8794 entry->vme_end - entry->vme_start,
8795 PMAP_NULL,
8796 PAGE_SIZE,
8797 entry->vme_start,
8798 VM_PROT_NONE,
8799 PMAP_OPTIONS_REMOVE);
8800 } else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8801 (state & VMDS_KERNEL_PMAP)) {
8802 /* Remove translations associated
8803 * with this range unless the entry
8804 * does not have an object, or
8805 * it's the kernel map or a descendant
8806 * since the platform could potentially
8807 * create "backdoor" mappings invisible
8808 * to the VM. It is expected that
8809 * objectless, non-kernel ranges
8810 * do not have such VM invisible
8811 * translations.
8812 */
8813 pmap_remove_options(map->pmap,
8814 (addr64_t)entry->vme_start,
8815 (addr64_t)entry->vme_end,
8816 PMAP_OPTIONS_REMOVE);
8817 }
8818
8819 #if DEBUG
8820 /*
8821 * All pmap mappings for this map entry must have been
8822 * cleared by now.
8823 */
8824 assert(pmap_is_empty(map->pmap,
8825 entry->vme_start,
8826 entry->vme_end));
8827 #endif /* DEBUG */
8828
8829 if (entry->iokit_acct) {
8830 /* alternate accounting */
8831 DTRACE_VM4(vm_map_iokit_unmapped_region,
8832 vm_map_t, map,
8833 vm_map_offset_t, entry->vme_start,
8834 vm_map_offset_t, entry->vme_end,
8835 int, VME_ALIAS(entry));
8836 vm_map_iokit_unmapped_region(map,
8837 (entry->vme_end -
8838 entry->vme_start));
8839 entry->iokit_acct = FALSE;
8840 entry->use_pmap = FALSE;
8841 }
8842
8843 /* move "s" forward */
8844 s = entry->vme_end;
8845 next = entry->vme_next;
8846 if (!entry->map_aligned) {
8847 vm_map_offset_t rounded_s;
8848
8849 /*
8850 * Skip artificial gap due to mis-aligned entry
8851 * on devices with a page size smaller than the
8852 * map's page size (i.e. 16k task on a 4k device).
8853 */
8854 rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8855 if (next == vm_map_to_entry(map)) {
8856 s = rounded_s;
8857 } else if (s < rounded_s) {
8858 s = MIN(rounded_s, next->vme_start);
8859 }
8860 }
8861 ret.kmr_size += s - entry->vme_start;
8862
8863 if (entry->vme_permanent) {
8864 /*
8865 * A permanent entry can not be removed, so leave it
8866 * in place but remove all access permissions.
8867 */
8868 if (!entry->csm_associated) {
8869 printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8870 __FUNCTION__, __LINE__,
8871 proc_selfpid(),
8872 (get_bsdtask_info(current_task())
8873 ? proc_name_address(get_bsdtask_info(current_task()))
8874 : "?"),
8875 map,
8876 entry,
8877 (uint64_t)entry->vme_start,
8878 (uint64_t)entry->vme_end,
8879 entry->is_sub_map,
8880 entry->protection,
8881 entry->max_protection);
8882 }
8883 DTRACE_VM6(vm_map_delete_permanent_prot_none,
8884 vm_map_entry_t, entry,
8885 vm_map_offset_t, entry->vme_start,
8886 vm_map_offset_t, entry->vme_end,
8887 vm_prot_t, entry->protection,
8888 vm_prot_t, entry->max_protection,
8889 int, VME_ALIAS(entry));
8890 entry->protection = VM_PROT_NONE;
8891 entry->max_protection = VM_PROT_NONE;
8892 } else {
8893 vm_map_entry_zap(map, entry, zap_list);
8894 }
8895
8896 entry = next;
8897 next = VM_MAP_ENTRY_NULL;
8898
8899 if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8900 unsigned int last_timestamp = map->timestamp++;
8901
8902 if (lck_rw_lock_yield_exclusive(&map->lock,
8903 LCK_RW_YIELD_ANY_WAITER)) {
8904 if (last_timestamp != map->timestamp + 1) {
8905 state |= VMDS_NEEDS_LOOKUP;
8906 }
8907 } else {
8908 /* we didn't yield, undo our change */
8909 map->timestamp--;
8910 }
8911 }
8912 }
8913
8914 if (map->wait_for_space) {
8915 thread_wakeup((event_t) map);
8916 }
8917
8918 if (state & VMDS_NEEDS_WAKEUP) {
8919 vm_map_entry_wakeup(map);
8920 }
8921
8922 out:
8923 if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
8924 __vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8925 }
8926
8927 if (state & VMDS_KERNEL_KMEMPTR) {
8928 kmem_free_space(start, end, range_id, &slot);
8929 }
8930
8931 if (state & VMDS_FOUND_GAP) {
8932 DTRACE_VM3(kern_vm_deallocate_gap,
8933 vm_map_offset_t, gap_start,
8934 vm_map_offset_t, save_start,
8935 vm_map_offset_t, save_end);
8936 if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
8937 ret.kmr_return = KERN_INVALID_VALUE;
8938 } else {
8939 vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
8940 }
8941 }
8942
8943 return ret;
8944 }
8945
8946 kmem_return_t
vm_map_remove_and_unlock(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8947 vm_map_remove_and_unlock(
8948 vm_map_t map,
8949 vm_map_offset_t start,
8950 vm_map_offset_t end,
8951 vmr_flags_t flags,
8952 kmem_guard_t guard)
8953 {
8954 kmem_return_t ret;
8955 VM_MAP_ZAP_DECLARE(zap);
8956
8957 ret = vm_map_delete(map, start, end, flags, guard, &zap);
8958 vm_map_unlock(map);
8959
8960 vm_map_zap_dispose(&zap);
8961
8962 return ret;
8963 }
8964
8965 /*
8966 * vm_map_remove_guard:
8967 *
8968 * Remove the given address range from the target map.
8969 * This is the exported form of vm_map_delete.
8970 */
8971 kmem_return_t
vm_map_remove_guard(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8972 vm_map_remove_guard(
8973 vm_map_t map,
8974 vm_map_offset_t start,
8975 vm_map_offset_t end,
8976 vmr_flags_t flags,
8977 kmem_guard_t guard)
8978 {
8979 vm_map_lock(map);
8980 return vm_map_remove_and_unlock(map, start, end, flags, guard);
8981 }
8982
8983 /*
8984 * vm_map_terminate:
8985 *
8986 * Clean out a task's map.
8987 */
8988 kern_return_t
vm_map_terminate(vm_map_t map)8989 vm_map_terminate(
8990 vm_map_t map)
8991 {
8992 vm_map_lock(map);
8993 map->terminated = TRUE;
8994 vm_map_disable_hole_optimization(map);
8995 (void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
8996 VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
8997 return KERN_SUCCESS;
8998 }
8999
9000 /*
9001 * Routine: vm_map_copy_allocate
9002 *
9003 * Description:
9004 * Allocates and initializes a map copy object.
9005 */
9006 static vm_map_copy_t
vm_map_copy_allocate(uint16_t type)9007 vm_map_copy_allocate(uint16_t type)
9008 {
9009 vm_map_copy_t new_copy;
9010
9011 new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
9012 new_copy->type = type;
9013 if (type == VM_MAP_COPY_ENTRY_LIST) {
9014 new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
9015 vm_map_store_init(&new_copy->cpy_hdr);
9016 }
9017 return new_copy;
9018 }
9019
9020 /*
9021 * Routine: vm_map_copy_discard
9022 *
9023 * Description:
9024 * Dispose of a map copy object (returned by
9025 * vm_map_copyin).
9026 */
9027 void
vm_map_copy_discard(vm_map_copy_t copy)9028 vm_map_copy_discard(
9029 vm_map_copy_t copy)
9030 {
9031 if (copy == VM_MAP_COPY_NULL) {
9032 return;
9033 }
9034
9035 /*
9036 * Assert that the vm_map_copy is coming from the right
9037 * zone and hasn't been forged
9038 */
9039 vm_map_copy_require(copy);
9040
9041 switch (copy->type) {
9042 case VM_MAP_COPY_ENTRY_LIST:
9043 while (vm_map_copy_first_entry(copy) !=
9044 vm_map_copy_to_entry(copy)) {
9045 vm_map_entry_t entry = vm_map_copy_first_entry(copy);
9046
9047 vm_map_copy_entry_unlink(copy, entry);
9048 if (entry->is_sub_map) {
9049 vm_map_deallocate(VME_SUBMAP(entry));
9050 } else {
9051 vm_object_deallocate(VME_OBJECT(entry));
9052 }
9053 vm_map_copy_entry_dispose(entry);
9054 }
9055 break;
9056 case VM_MAP_COPY_KERNEL_BUFFER:
9057
9058 /*
9059 * The vm_map_copy_t and possibly the data buffer were
9060 * allocated by a single call to kalloc_data(), i.e. the
9061 * vm_map_copy_t was not allocated out of the zone.
9062 */
9063 if (copy->size > msg_ool_size_small || copy->offset) {
9064 panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
9065 (long long)copy->size, (long long)copy->offset);
9066 }
9067 kfree_data(copy->cpy_kdata, copy->size);
9068 }
9069 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
9070 }
9071
9072 #if XNU_PLATFORM_MacOSX
9073
9074 /*
9075 * Routine: vm_map_copy_copy
9076 *
9077 * Description:
9078 * Move the information in a map copy object to
9079 * a new map copy object, leaving the old one
9080 * empty.
9081 *
9082 * This is used by kernel routines that need
9083 * to look at out-of-line data (in copyin form)
9084 * before deciding whether to return SUCCESS.
9085 * If the routine returns FAILURE, the original
9086 * copy object will be deallocated; therefore,
9087 * these routines must make a copy of the copy
9088 * object and leave the original empty so that
9089 * deallocation will not fail.
9090 */
9091 vm_map_copy_t
vm_map_copy_copy(vm_map_copy_t copy)9092 vm_map_copy_copy(
9093 vm_map_copy_t copy)
9094 {
9095 vm_map_copy_t new_copy;
9096
9097 if (copy == VM_MAP_COPY_NULL) {
9098 return VM_MAP_COPY_NULL;
9099 }
9100
9101 /*
9102 * Assert that the vm_map_copy is coming from the right
9103 * zone and hasn't been forged
9104 */
9105 vm_map_copy_require(copy);
9106
9107 /*
9108 * Allocate a new copy object, and copy the information
9109 * from the old one into it.
9110 */
9111
9112 new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9113 memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
9114 #if __has_feature(ptrauth_calls)
9115 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9116 new_copy->cpy_kdata = copy->cpy_kdata;
9117 }
9118 #endif
9119
9120 if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
9121 /*
9122 * The links in the entry chain must be
9123 * changed to point to the new copy object.
9124 */
9125 vm_map_copy_first_entry(copy)->vme_prev
9126 = vm_map_copy_to_entry(new_copy);
9127 vm_map_copy_last_entry(copy)->vme_next
9128 = vm_map_copy_to_entry(new_copy);
9129 }
9130
9131 /*
9132 * Change the old copy object into one that contains
9133 * nothing to be deallocated.
9134 */
9135 bzero(copy, sizeof(struct vm_map_copy));
9136 copy->type = VM_MAP_COPY_KERNEL_BUFFER;
9137
9138 /*
9139 * Return the new object.
9140 */
9141 return new_copy;
9142 }
9143
9144 #endif /* XNU_PLATFORM_MacOSX */
9145
9146 static boolean_t
vm_map_entry_is_overwritable(vm_map_t dst_map __unused,vm_map_entry_t entry)9147 vm_map_entry_is_overwritable(
9148 vm_map_t dst_map __unused,
9149 vm_map_entry_t entry)
9150 {
9151 if (!(entry->protection & VM_PROT_WRITE)) {
9152 /* can't overwrite if not writable */
9153 return FALSE;
9154 }
9155 #if !__x86_64__
9156 if (entry->used_for_jit &&
9157 vm_map_cs_enforcement(dst_map) &&
9158 !dst_map->cs_debugged) {
9159 /*
9160 * Can't overwrite a JIT region while cs_enforced
9161 * and not cs_debugged.
9162 */
9163 return FALSE;
9164 }
9165
9166 #if __arm64e__
9167 /* Do not allow overwrite HW assisted TPRO entries */
9168 if (entry->used_for_tpro) {
9169 return FALSE;
9170 }
9171 #endif /* __arm64e__ */
9172
9173 if (entry->vme_permanent) {
9174 if (entry->is_sub_map) {
9175 /*
9176 * We can't tell if the submap contains "permanent"
9177 * entries within the range targeted by the caller.
9178 * The caller will have to check for that with
9179 * vm_map_overwrite_submap_recurse() for example.
9180 */
9181 } else {
9182 /*
9183 * Do not allow overwriting of a "permanent"
9184 * entry.
9185 */
9186 DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
9187 vm_map_entry_t, entry,
9188 vm_map_offset_t, entry->vme_start,
9189 vm_map_offset_t, entry->vme_end,
9190 vm_prot_t, entry->protection,
9191 vm_prot_t, entry->max_protection,
9192 int, VME_ALIAS(entry));
9193 return FALSE;
9194 }
9195 }
9196 #endif /* !__x86_64__ */
9197 return TRUE;
9198 }
9199
/*
 * vm_map_overwrite_submap_recurse:
 *
 * Verify that the destination range [dst_addr, dst_addr + dst_size)
 * of "dst_map" is fully overwritable: every entry in the range must
 * be writable and overwritable (vm_map_entry_is_overwritable()), the
 * range must be contiguous, and any submap encountered is checked
 * recursively.  Takes the dst_map lock internally; returns with it
 * dropped on every path.
 */
static kern_return_t
vm_map_overwrite_submap_recurse(
	vm_map_t dst_map,
	vm_map_offset_t dst_addr,
	vm_map_size_t dst_size)
{
	vm_map_offset_t dst_end;
	vm_map_entry_t tmp_entry;
	vm_map_entry_t entry;
	kern_return_t result;
	boolean_t encountered_sub_map = FALSE;



	/*
	 * Verify that the destination is all writeable
	 * initially. We have to trunc the destination
	 * address and round the copy size or we'll end up
	 * splitting entries in strange ways.
	 */

	dst_end = vm_map_round_page(dst_addr + dst_size,
	    VM_MAP_PAGE_MASK(dst_map));
	vm_map_lock(dst_map);

start_pass_1:
	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
		vm_map_unlock(dst_map);
		return KERN_INVALID_ADDRESS;
	}

	vm_map_clip_start(dst_map,
	    tmp_entry,
	    vm_map_trunc_page(dst_addr,
	    VM_MAP_PAGE_MASK(dst_map)));
	if (tmp_entry->is_sub_map) {
		/* clipping did unnest if needed */
		assert(!tmp_entry->use_pmap);
	}

	for (entry = tmp_entry;;) {
		vm_map_entry_t next;

		next = entry->vme_next;
		while (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;

			if (entry->in_transition) {
				/*
				 * Say that we are waiting, and wait for entry.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);

				goto start_pass_1;
			}

			encountered_sub_map = TRUE;
			sub_start = VME_OFFSET(entry);

			/* clamp the checked portion to dst_end ... */
			if (entry->vme_end < dst_end) {
				sub_end = entry->vme_end;
			} else {
				sub_end = dst_end;
			}
			/* ... and translate it into submap offsets */
			sub_end -= entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			/* recurse with dst_map unlocked */
			vm_map_unlock(dst_map);

			result = vm_map_overwrite_submap_recurse(
				VME_SUBMAP(entry),
				sub_start,
				sub_end - sub_start);

			if (result != KERN_SUCCESS) {
				return result;
			}
			if (dst_end <= entry->vme_end) {
				return KERN_SUCCESS;
			}
			/*
			 * Re-take the lock and look the entry up again:
			 * the map could have changed while unlocked.
			 */
			vm_map_lock(dst_map);
			if (!vm_map_lookup_entry(dst_map, local_end,
			    &tmp_entry)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}
			entry = tmp_entry;
			next = entry->vme_next;
		}

		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * If the entry is in transition, we must wait
		 * for it to exit that state. Anything could happen
		 * when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto start_pass_1;
		}

		/*
		 * our range is contained completely within this map entry
		 */
		if (dst_end <= entry->vme_end) {
			vm_map_unlock(dst_map);
			return KERN_SUCCESS;
		}
		/*
		 * check that range specified is contiguous region
		 */
		if ((next == vm_map_to_entry(dst_map)) ||
		    (next->vme_start != entry->vme_end)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/*
		 * Check for permanent objects in the destination.
		 */
		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
		    ((!VME_OBJECT(entry)->internal) ||
		    (VME_OBJECT(entry)->true_share))) {
			if (encountered_sub_map) {
				vm_map_unlock(dst_map);
				return KERN_FAILURE;
			}
		}


		entry = next;
	}/* for */
	/* NOTREACHED: the loop above only exits via return */
	vm_map_unlock(dst_map);
	return KERN_SUCCESS;
}
9352
9353 /*
9354 * Routine: vm_map_copy_overwrite
9355 *
9356 * Description:
9357 * Copy the memory described by the map copy
9358 * object (copy; returned by vm_map_copyin) onto
9359 * the specified destination region (dst_map, dst_addr).
9360 * The destination must be writeable.
9361 *
9362 * Unlike vm_map_copyout, this routine actually
9363 * writes over previously-mapped memory. If the
9364 * previous mapping was to a permanent (user-supplied)
9365 * memory object, it is preserved.
9366 *
9367 * The attributes (protection and inheritance) of the
9368 * destination region are preserved.
9369 *
9370 * If successful, consumes the copy object.
9371 * Otherwise, the caller is responsible for it.
9372 *
9373 * Implementation notes:
9374 * To overwrite aligned temporary virtual memory, it is
9375 * sufficient to remove the previous mapping and insert
9376 * the new copy. This replacement is done either on
9377 * the whole region (if no permanent virtual memory
9378 * objects are embedded in the destination region) or
9379 * in individual map entries.
9380 *
9381 * To overwrite permanent virtual memory , it is necessary
9382 * to copy each page, as the external memory management
9383 * interface currently does not provide any optimizations.
9384 *
9385 * Unaligned memory also has to be copied. It is possible
9386 * to use 'vm_trickery' to copy the aligned data. This is
9387 * not done but not hard to implement.
9388 *
9389 * Once a page of permanent memory has been overwritten,
9390 * it is impossible to interrupt this function; otherwise,
9391 * the call would be neither atomic nor location-independent.
9392 * The kernel-state portion of a user thread must be
9393 * interruptible.
9394 *
9395 * It may be expensive to forward all requests that might
9396 * overwrite permanent memory (vm_write, vm_copy) to
9397 * uninterruptible kernel threads. This routine may be
9398 * called by interruptible threads; however, success is
9399 * not guaranteed -- if the request cannot be performed
9400 * atomically and interruptibly, an error indication is
9401 * returned.
9402 *
9403 * Callers of this function must call vm_map_copy_require on
9404 * previously created vm_map_copy_t or pass a newly created
9405 * one to ensure that it hasn't been forged.
9406 */
9407 static kern_return_t
vm_map_copy_overwrite_nested(vm_map_t dst_map,vm_map_address_t dst_addr,vm_map_copy_t copy,boolean_t interruptible,pmap_t pmap,boolean_t discard_on_success)9408 vm_map_copy_overwrite_nested(
9409 vm_map_t dst_map,
9410 vm_map_address_t dst_addr,
9411 vm_map_copy_t copy,
9412 boolean_t interruptible,
9413 pmap_t pmap,
9414 boolean_t discard_on_success)
9415 {
9416 vm_map_offset_t dst_end;
9417 vm_map_entry_t tmp_entry;
9418 vm_map_entry_t entry;
9419 kern_return_t kr;
9420 boolean_t aligned = TRUE;
9421 boolean_t contains_permanent_objects = FALSE;
9422 boolean_t encountered_sub_map = FALSE;
9423 vm_map_offset_t base_addr;
9424 vm_map_size_t copy_size;
9425 vm_map_size_t total_size;
9426 uint16_t copy_page_shift;
9427
9428 /*
9429 * Check for special kernel buffer allocated
9430 * by new_ipc_kmsg_copyin.
9431 */
9432
9433 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9434 kr = vm_map_copyout_kernel_buffer(
9435 dst_map, &dst_addr,
9436 copy, copy->size, TRUE, discard_on_success);
9437 return kr;
9438 }
9439
9440 /*
9441 * Only works for entry lists at the moment. Will
9442 * support page lists later.
9443 */
9444
9445 assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9446
9447 if (copy->size == 0) {
9448 if (discard_on_success) {
9449 vm_map_copy_discard(copy);
9450 }
9451 return KERN_SUCCESS;
9452 }
9453
9454 copy_page_shift = copy->cpy_hdr.page_shift;
9455
9456 /*
9457 * Verify that the destination is all writeable
9458 * initially. We have to trunc the destination
9459 * address and round the copy size or we'll end up
9460 * splitting entries in strange ways.
9461 */
9462
9463 if (!VM_MAP_PAGE_ALIGNED(copy->size,
9464 VM_MAP_PAGE_MASK(dst_map)) ||
9465 !VM_MAP_PAGE_ALIGNED(copy->offset,
9466 VM_MAP_PAGE_MASK(dst_map)) ||
9467 !VM_MAP_PAGE_ALIGNED(dst_addr,
9468 VM_MAP_PAGE_MASK(dst_map)) ||
9469 copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9470 aligned = FALSE;
9471 dst_end = vm_map_round_page(dst_addr + copy->size,
9472 VM_MAP_PAGE_MASK(dst_map));
9473 } else {
9474 dst_end = dst_addr + copy->size;
9475 }
9476
9477 vm_map_lock(dst_map);
9478
9479 /* LP64todo - remove this check when vm_map_commpage64()
9480 * no longer has to stuff in a map_entry for the commpage
9481 * above the map's max_offset.
9482 */
9483 if (dst_addr >= dst_map->max_offset) {
9484 vm_map_unlock(dst_map);
9485 return KERN_INVALID_ADDRESS;
9486 }
9487
9488 start_pass_1:
9489 if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9490 vm_map_unlock(dst_map);
9491 return KERN_INVALID_ADDRESS;
9492 }
9493 vm_map_clip_start(dst_map,
9494 tmp_entry,
9495 vm_map_trunc_page(dst_addr,
9496 VM_MAP_PAGE_MASK(dst_map)));
9497 for (entry = tmp_entry;;) {
9498 vm_map_entry_t next = entry->vme_next;
9499
9500 while (entry->is_sub_map) {
9501 vm_map_offset_t sub_start;
9502 vm_map_offset_t sub_end;
9503 vm_map_offset_t local_end;
9504
9505 if (entry->in_transition) {
9506 /*
9507 * Say that we are waiting, and wait for entry.
9508 */
9509 entry->needs_wakeup = TRUE;
9510 vm_map_entry_wait(dst_map, THREAD_UNINT);
9511
9512 goto start_pass_1;
9513 }
9514
9515 local_end = entry->vme_end;
9516 if (!(entry->needs_copy)) {
9517 /* if needs_copy we are a COW submap */
9518 /* in such a case we just replace so */
9519 /* there is no need for the follow- */
9520 /* ing check. */
9521 encountered_sub_map = TRUE;
9522 sub_start = VME_OFFSET(entry);
9523
9524 if (entry->vme_end < dst_end) {
9525 sub_end = entry->vme_end;
9526 } else {
9527 sub_end = dst_end;
9528 }
9529 sub_end -= entry->vme_start;
9530 sub_end += VME_OFFSET(entry);
9531 vm_map_unlock(dst_map);
9532
9533 kr = vm_map_overwrite_submap_recurse(
9534 VME_SUBMAP(entry),
9535 sub_start,
9536 sub_end - sub_start);
9537 if (kr != KERN_SUCCESS) {
9538 return kr;
9539 }
9540 vm_map_lock(dst_map);
9541 }
9542
9543 if (dst_end <= entry->vme_end) {
9544 goto start_overwrite;
9545 }
9546 if (!vm_map_lookup_entry(dst_map, local_end,
9547 &entry)) {
9548 vm_map_unlock(dst_map);
9549 return KERN_INVALID_ADDRESS;
9550 }
9551 next = entry->vme_next;
9552 }
9553
9554 if (!(entry->protection & VM_PROT_WRITE)) {
9555 vm_map_unlock(dst_map);
9556 return KERN_PROTECTION_FAILURE;
9557 }
9558
9559 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9560 vm_map_unlock(dst_map);
9561 return KERN_PROTECTION_FAILURE;
9562 }
9563
9564 /*
9565 * If the entry is in transition, we must wait
9566 * for it to exit that state. Anything could happen
9567 * when we unlock the map, so start over.
9568 */
9569 if (entry->in_transition) {
9570 /*
9571 * Say that we are waiting, and wait for entry.
9572 */
9573 entry->needs_wakeup = TRUE;
9574 vm_map_entry_wait(dst_map, THREAD_UNINT);
9575
9576 goto start_pass_1;
9577 }
9578
9579 /*
9580 * our range is contained completely within this map entry
9581 */
9582 if (dst_end <= entry->vme_end) {
9583 break;
9584 }
9585 /*
9586 * check that range specified is contiguous region
9587 */
9588 if ((next == vm_map_to_entry(dst_map)) ||
9589 (next->vme_start != entry->vme_end)) {
9590 vm_map_unlock(dst_map);
9591 return KERN_INVALID_ADDRESS;
9592 }
9593
9594
9595 /*
9596 * Check for permanent objects in the destination.
9597 */
9598 if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9599 ((!VME_OBJECT(entry)->internal) ||
9600 (VME_OBJECT(entry)->true_share))) {
9601 contains_permanent_objects = TRUE;
9602 }
9603
9604 entry = next;
9605 }/* for */
9606
9607 start_overwrite:
9608 /*
9609 * If there are permanent objects in the destination, then
9610 * the copy cannot be interrupted.
9611 */
9612
9613 if (interruptible && contains_permanent_objects) {
9614 vm_map_unlock(dst_map);
9615 return KERN_FAILURE; /* XXX */
9616 }
9617
9618 /*
9619 *
9620 * Make a second pass, overwriting the data
9621 * At the beginning of each loop iteration,
9622 * the next entry to be overwritten is "tmp_entry"
9623 * (initially, the value returned from the lookup above),
9624 * and the starting address expected in that entry
9625 * is "start".
9626 */
9627
9628 total_size = copy->size;
9629 if (encountered_sub_map) {
9630 copy_size = 0;
9631 /* re-calculate tmp_entry since we've had the map */
9632 /* unlocked */
9633 if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9634 vm_map_unlock(dst_map);
9635 return KERN_INVALID_ADDRESS;
9636 }
9637 } else {
9638 copy_size = copy->size;
9639 }
9640
9641 base_addr = dst_addr;
9642 while (TRUE) {
9643 /* deconstruct the copy object and do in parts */
9644 /* only in sub_map, interruptable case */
9645 vm_map_entry_t copy_entry;
9646 vm_map_entry_t previous_prev = VM_MAP_ENTRY_NULL;
9647 vm_map_entry_t next_copy = VM_MAP_ENTRY_NULL;
9648 int nentries;
9649 int remaining_entries = 0;
9650 vm_map_offset_t new_offset = 0;
9651
9652 for (entry = tmp_entry; copy_size == 0;) {
9653 vm_map_entry_t next;
9654
9655 next = entry->vme_next;
9656
9657 /* tmp_entry and base address are moved along */
9658 /* each time we encounter a sub-map. Otherwise */
9659 /* entry can outpase tmp_entry, and the copy_size */
9660 /* may reflect the distance between them */
9661 /* if the current entry is found to be in transition */
9662 /* we will start over at the beginning or the last */
9663 /* encounter of a submap as dictated by base_addr */
9664 /* we will zero copy_size accordingly. */
9665 if (entry->in_transition) {
9666 /*
9667 * Say that we are waiting, and wait for entry.
9668 */
9669 entry->needs_wakeup = TRUE;
9670 vm_map_entry_wait(dst_map, THREAD_UNINT);
9671
9672 if (!vm_map_lookup_entry(dst_map, base_addr,
9673 &tmp_entry)) {
9674 vm_map_unlock(dst_map);
9675 return KERN_INVALID_ADDRESS;
9676 }
9677 copy_size = 0;
9678 entry = tmp_entry;
9679 continue;
9680 }
9681 if (entry->is_sub_map) {
9682 vm_map_offset_t sub_start;
9683 vm_map_offset_t sub_end;
9684 vm_map_offset_t local_end;
9685
9686 if (entry->needs_copy) {
9687 /* if this is a COW submap */
9688 /* just back the range with a */
9689 /* anonymous entry */
9690 assert(!entry->vme_permanent);
9691 if (entry->vme_end < dst_end) {
9692 sub_end = entry->vme_end;
9693 } else {
9694 sub_end = dst_end;
9695 }
9696 if (entry->vme_start < base_addr) {
9697 sub_start = base_addr;
9698 } else {
9699 sub_start = entry->vme_start;
9700 }
9701 vm_map_clip_end(
9702 dst_map, entry, sub_end);
9703 vm_map_clip_start(
9704 dst_map, entry, sub_start);
9705 assert(!entry->use_pmap);
9706 assert(!entry->iokit_acct);
9707 entry->use_pmap = TRUE;
9708 vm_map_deallocate(VME_SUBMAP(entry));
9709 assert(!entry->vme_permanent);
9710 VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9711 VME_OFFSET_SET(entry, 0);
9712 entry->is_shared = FALSE;
9713 entry->needs_copy = FALSE;
9714 entry->protection = VM_PROT_DEFAULT;
9715 entry->max_protection = VM_PROT_ALL;
9716 entry->wired_count = 0;
9717 entry->user_wired_count = 0;
9718 if (entry->inheritance
9719 == VM_INHERIT_SHARE) {
9720 entry->inheritance = VM_INHERIT_COPY;
9721 }
9722 continue;
9723 }
9724 /* first take care of any non-sub_map */
9725 /* entries to send */
9726 if (base_addr < entry->vme_start) {
9727 /* stuff to send */
9728 copy_size =
9729 entry->vme_start - base_addr;
9730 break;
9731 }
9732 sub_start = VME_OFFSET(entry);
9733
9734 if (entry->vme_end < dst_end) {
9735 sub_end = entry->vme_end;
9736 } else {
9737 sub_end = dst_end;
9738 }
9739 sub_end -= entry->vme_start;
9740 sub_end += VME_OFFSET(entry);
9741 local_end = entry->vme_end;
9742 vm_map_unlock(dst_map);
9743 copy_size = sub_end - sub_start;
9744
9745 /* adjust the copy object */
9746 if (total_size > copy_size) {
9747 vm_map_size_t local_size = 0;
9748 vm_map_size_t entry_size;
9749
9750 nentries = 1;
9751 new_offset = copy->offset;
9752 copy_entry = vm_map_copy_first_entry(copy);
9753 while (copy_entry !=
9754 vm_map_copy_to_entry(copy)) {
9755 entry_size = copy_entry->vme_end -
9756 copy_entry->vme_start;
9757 if ((local_size < copy_size) &&
9758 ((local_size + entry_size)
9759 >= copy_size)) {
9760 vm_map_copy_clip_end(copy,
9761 copy_entry,
9762 copy_entry->vme_start +
9763 (copy_size - local_size));
9764 entry_size = copy_entry->vme_end -
9765 copy_entry->vme_start;
9766 local_size += entry_size;
9767 new_offset += entry_size;
9768 }
9769 if (local_size >= copy_size) {
9770 next_copy = copy_entry->vme_next;
9771 copy_entry->vme_next =
9772 vm_map_copy_to_entry(copy);
9773 previous_prev =
9774 copy->cpy_hdr.links.prev;
9775 copy->cpy_hdr.links.prev = copy_entry;
9776 copy->size = copy_size;
9777 remaining_entries =
9778 copy->cpy_hdr.nentries;
9779 remaining_entries -= nentries;
9780 copy->cpy_hdr.nentries = nentries;
9781 break;
9782 } else {
9783 local_size += entry_size;
9784 new_offset += entry_size;
9785 nentries++;
9786 }
9787 copy_entry = copy_entry->vme_next;
9788 }
9789 }
9790
9791 if ((entry->use_pmap) && (pmap == NULL)) {
9792 kr = vm_map_copy_overwrite_nested(
9793 VME_SUBMAP(entry),
9794 sub_start,
9795 copy,
9796 interruptible,
9797 VME_SUBMAP(entry)->pmap,
9798 TRUE);
9799 } else if (pmap != NULL) {
9800 kr = vm_map_copy_overwrite_nested(
9801 VME_SUBMAP(entry),
9802 sub_start,
9803 copy,
9804 interruptible, pmap,
9805 TRUE);
9806 } else {
9807 kr = vm_map_copy_overwrite_nested(
9808 VME_SUBMAP(entry),
9809 sub_start,
9810 copy,
9811 interruptible,
9812 dst_map->pmap,
9813 TRUE);
9814 }
9815 if (kr != KERN_SUCCESS) {
9816 if (next_copy != NULL) {
9817 copy->cpy_hdr.nentries +=
9818 remaining_entries;
9819 copy->cpy_hdr.links.prev->vme_next =
9820 next_copy;
9821 copy->cpy_hdr.links.prev
9822 = previous_prev;
9823 copy->size = total_size;
9824 }
9825 return kr;
9826 }
9827 if (dst_end <= local_end) {
9828 return KERN_SUCCESS;
9829 }
9830 /* otherwise copy no longer exists, it was */
9831 /* destroyed after successful copy_overwrite */
9832 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9833 copy->offset = new_offset;
9834 copy->cpy_hdr.page_shift = copy_page_shift;
9835
9836 total_size -= copy_size;
9837 copy_size = 0;
9838 /* put back remainder of copy in container */
9839 if (next_copy != NULL) {
9840 copy->cpy_hdr.nentries = remaining_entries;
9841 copy->cpy_hdr.links.next = next_copy;
9842 copy->cpy_hdr.links.prev = previous_prev;
9843 copy->size = total_size;
9844 next_copy->vme_prev =
9845 vm_map_copy_to_entry(copy);
9846 next_copy = NULL;
9847 }
9848 base_addr = local_end;
9849 vm_map_lock(dst_map);
9850 if (!vm_map_lookup_entry(dst_map,
9851 local_end, &tmp_entry)) {
9852 vm_map_unlock(dst_map);
9853 return KERN_INVALID_ADDRESS;
9854 }
9855 entry = tmp_entry;
9856 continue;
9857 }
9858 if (dst_end <= entry->vme_end) {
9859 copy_size = dst_end - base_addr;
9860 break;
9861 }
9862
9863 if ((next == vm_map_to_entry(dst_map)) ||
9864 (next->vme_start != entry->vme_end)) {
9865 vm_map_unlock(dst_map);
9866 return KERN_INVALID_ADDRESS;
9867 }
9868
9869 entry = next;
9870 }/* for */
9871
9872 next_copy = NULL;
9873 nentries = 1;
9874
9875 /* adjust the copy object */
9876 if (total_size > copy_size) {
9877 vm_map_size_t local_size = 0;
9878 vm_map_size_t entry_size;
9879
9880 new_offset = copy->offset;
9881 copy_entry = vm_map_copy_first_entry(copy);
9882 while (copy_entry != vm_map_copy_to_entry(copy)) {
9883 entry_size = copy_entry->vme_end -
9884 copy_entry->vme_start;
9885 if ((local_size < copy_size) &&
9886 ((local_size + entry_size)
9887 >= copy_size)) {
9888 vm_map_copy_clip_end(copy, copy_entry,
9889 copy_entry->vme_start +
9890 (copy_size - local_size));
9891 entry_size = copy_entry->vme_end -
9892 copy_entry->vme_start;
9893 local_size += entry_size;
9894 new_offset += entry_size;
9895 }
9896 if (local_size >= copy_size) {
9897 next_copy = copy_entry->vme_next;
9898 copy_entry->vme_next =
9899 vm_map_copy_to_entry(copy);
9900 previous_prev =
9901 copy->cpy_hdr.links.prev;
9902 copy->cpy_hdr.links.prev = copy_entry;
9903 copy->size = copy_size;
9904 remaining_entries =
9905 copy->cpy_hdr.nentries;
9906 remaining_entries -= nentries;
9907 copy->cpy_hdr.nentries = nentries;
9908 break;
9909 } else {
9910 local_size += entry_size;
9911 new_offset += entry_size;
9912 nentries++;
9913 }
9914 copy_entry = copy_entry->vme_next;
9915 }
9916 }
9917
9918 if (aligned) {
9919 pmap_t local_pmap;
9920
9921 if (pmap) {
9922 local_pmap = pmap;
9923 } else {
9924 local_pmap = dst_map->pmap;
9925 }
9926
9927 if ((kr = vm_map_copy_overwrite_aligned(
9928 dst_map, tmp_entry, copy,
9929 base_addr, local_pmap)) != KERN_SUCCESS) {
9930 if (next_copy != NULL) {
9931 copy->cpy_hdr.nentries +=
9932 remaining_entries;
9933 copy->cpy_hdr.links.prev->vme_next =
9934 next_copy;
9935 copy->cpy_hdr.links.prev =
9936 previous_prev;
9937 copy->size += copy_size;
9938 }
9939 return kr;
9940 }
9941 vm_map_unlock(dst_map);
9942 } else {
9943 /*
9944 * Performance gain:
9945 *
9946 * if the copy and dst address are misaligned but the same
9947 * offset within the page we can copy_not_aligned the
9948 * misaligned parts and copy aligned the rest. If they are
9949 * aligned but len is unaligned we simply need to copy
9950 * the end bit unaligned. We'll need to split the misaligned
9951 * bits of the region in this case !
9952 */
9953 /* ALWAYS UNLOCKS THE dst_map MAP */
9954 kr = vm_map_copy_overwrite_unaligned(
9955 dst_map,
9956 tmp_entry,
9957 copy,
9958 base_addr,
9959 discard_on_success);
9960 if (kr != KERN_SUCCESS) {
9961 if (next_copy != NULL) {
9962 copy->cpy_hdr.nentries +=
9963 remaining_entries;
9964 copy->cpy_hdr.links.prev->vme_next =
9965 next_copy;
9966 copy->cpy_hdr.links.prev =
9967 previous_prev;
9968 copy->size += copy_size;
9969 }
9970 return kr;
9971 }
9972 }
9973 total_size -= copy_size;
9974 if (total_size == 0) {
9975 break;
9976 }
9977 base_addr += copy_size;
9978 copy_size = 0;
9979 copy->offset = new_offset;
9980 if (next_copy != NULL) {
9981 copy->cpy_hdr.nentries = remaining_entries;
9982 copy->cpy_hdr.links.next = next_copy;
9983 copy->cpy_hdr.links.prev = previous_prev;
9984 next_copy->vme_prev = vm_map_copy_to_entry(copy);
9985 copy->size = total_size;
9986 }
9987 vm_map_lock(dst_map);
9988 while (TRUE) {
9989 if (!vm_map_lookup_entry(dst_map,
9990 base_addr, &tmp_entry)) {
9991 vm_map_unlock(dst_map);
9992 return KERN_INVALID_ADDRESS;
9993 }
9994 if (tmp_entry->in_transition) {
9995 entry->needs_wakeup = TRUE;
9996 vm_map_entry_wait(dst_map, THREAD_UNINT);
9997 } else {
9998 break;
9999 }
10000 }
10001 vm_map_clip_start(dst_map,
10002 tmp_entry,
10003 vm_map_trunc_page(base_addr,
10004 VM_MAP_PAGE_MASK(dst_map)));
10005
10006 entry = tmp_entry;
10007 } /* while */
10008
10009 /*
10010 * Throw away the vm_map_copy object
10011 */
10012 if (discard_on_success) {
10013 vm_map_copy_discard(copy);
10014 }
10015
10016 return KERN_SUCCESS;
10017 }/* vm_map_copy_overwrite */
10018
/*
 * Routine:	vm_map_copy_overwrite
 *
 * Description:
 *	Overwrite the destination range [dst_addr, dst_addr + copy_size)
 *	of "dst_map" with the data carried by "copy".
 *
 *	When the copy is an entry list, is not interruptible, is large
 *	enough, and source/destination share the same mis-alignment
 *	within a page, the copy is split into an unaligned "head", an
 *	aligned middle and an unaligned "tail" so that the bulk of the
 *	data can go through the aligned path of
 *	vm_map_copy_overwrite_nested().  Otherwise everything is handed
 *	to vm_map_copy_overwrite_nested() in one "blunt" call.
 *
 *	On success the "copy" object (and any head/tail fragments) is
 *	discarded; on failure the original "copy" is re-assembled from
 *	the fragments so the caller gets it back intact.
 *
 *	"copy_size" is the size the caller believes "copy" has; it is
 *	asserted against copy->size and then forced, to defend against
 *	TOCTOU games with copy->size (see comment before the main copy).
 */
kern_return_t
vm_map_copy_overwrite(
	vm_map_t        dst_map,
	vm_map_offset_t dst_addr,
	vm_map_copy_t   copy,
	vm_map_size_t   copy_size,
	boolean_t       interruptible)
{
	vm_map_size_t   head_size, tail_size;
	vm_map_copy_t   head_copy, tail_copy;
	vm_map_offset_t head_addr, tail_addr;
	vm_map_entry_t  entry;
	kern_return_t   kr;
	vm_map_offset_t effective_page_mask, effective_page_size;
	uint16_t        copy_page_shift;

	head_size = 0;
	tail_size = 0;
	head_copy = NULL;
	tail_copy = NULL;
	head_addr = 0;
	tail_addr = 0;

	/*
	 * Check for null copy object.
	 */
	if (copy == VM_MAP_COPY_NULL) {
		return KERN_SUCCESS;
	}

	/* reject ranges that wrap around the address space */
	if (__improbable(vm_map_range_overflows(dst_map, dst_addr, copy_size))) {
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	if (interruptible ||
	    copy->type != VM_MAP_COPY_ENTRY_LIST) {
		/*
		 * We can't split the "copy" map if we're interruptible
		 * or if we don't have a "copy" map...
		 */
blunt_copy:
		kr = vm_map_copy_overwrite_nested(dst_map,
		    dst_addr,
		    copy,
		    interruptible,
		    (pmap_t) NULL,
		    TRUE);
		if (kr) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), kr /* arg */);
		}
		return kr;
	}

	/* sub-PAGE_SHIFT copies/maps don't take the optimized path */
	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
	if (copy_page_shift < PAGE_SHIFT ||
	    VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
		goto blunt_copy;
	}

	/*
	 * NOTE(review): the "then" branch below is unreachable — the
	 * check just above already jumped to blunt_copy whenever
	 * VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT.  Harmless, but a
	 * candidate for removal; confirm against upstream xnu.
	 */
	if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
		effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
	} else {
		/* use the coarsest page mask of dst map and copy */
		effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
		effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
		    effective_page_mask);
	}
	effective_page_size = effective_page_mask + 1;

	if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
		/*
		 * Too small to bother with optimizing...
		 */
		goto blunt_copy;
	}

	if ((dst_addr & effective_page_mask) !=
	    (copy->offset & effective_page_mask)) {
		/*
		 * Incompatible mis-alignment of source and destination...
		 */
		goto blunt_copy;
	}

	/*
	 * Proper alignment or identical mis-alignment at the beginning.
	 * Let's try and do a small unaligned copy first (if needed)
	 * and then an aligned copy for the rest.
	 */
	if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
		/* head = bytes up to the first page boundary */
		head_addr = dst_addr;
		head_size = (effective_page_size -
		    (copy->offset & effective_page_mask));
		head_size = MIN(head_size, copy_size);
	}
	if (!vm_map_page_aligned(copy->offset + copy_size,
	    effective_page_mask)) {
		/*
		 * Mis-alignment at the end.
		 * Do an aligned copy up to the last page and
		 * then an unaligned copy for the remaining bytes.
		 */
		tail_size = ((copy->offset + copy_size) &
		    effective_page_mask);
		tail_size = MIN(tail_size, copy_size);
		tail_addr = dst_addr + copy_size - tail_size;
		assert(tail_addr >= head_addr + head_size);
	}
	assert(head_size + tail_size <= copy_size);

	if (head_size + tail_size == copy_size) {
		/*
		 * It's all unaligned, no optimization possible...
		 */
		goto blunt_copy;
	}

	/*
	 * Can't optimize if there are any submaps in the
	 * destination due to the way we free the "copy" map
	 * progressively in vm_map_copy_overwrite_nested()
	 * in that case.
	 */
	vm_map_lock_read(dst_map);
	if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
		vm_map_unlock_read(dst_map);
		goto blunt_copy;
	}
	for (;
	    (entry != vm_map_to_entry(dst_map) &&
	    entry->vme_start < dst_addr + copy_size);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			vm_map_unlock_read(dst_map);
			goto blunt_copy;
		}
	}
	vm_map_unlock_read(dst_map);

	if (head_size) {
		/*
		 * Unaligned copy of the first "head_size" bytes, to reach
		 * a page boundary.
		 */

		/*
		 * Extract "head_copy" out of "copy".
		 */
		head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
		head_copy->cpy_hdr.entries_pageable =
		    copy->cpy_hdr.entries_pageable;
		head_copy->cpy_hdr.page_shift = copy_page_shift;

		/* the head cannot extend past the first copy entry */
		entry = vm_map_copy_first_entry(copy);
		if (entry->vme_end < copy->offset + head_size) {
			head_size = entry->vme_end - copy->offset;
		}

		head_copy->offset = copy->offset;
		head_copy->size = head_size;
		copy->offset += head_size;
		copy->size -= head_size;
		copy_size -= head_size;
		assert(copy_size > 0);

		/* move the head's entry from "copy" into "head_copy" */
		vm_map_copy_clip_end(copy, entry, copy->offset);
		vm_map_copy_entry_unlink(copy, entry);
		vm_map_copy_entry_link(head_copy,
		    vm_map_copy_to_entry(head_copy),
		    entry);

		/*
		 * Do the unaligned copy.
		 */
		kr = vm_map_copy_overwrite_nested(dst_map,
		    head_addr,
		    head_copy,
		    interruptible,
		    (pmap_t) NULL,
		    FALSE);
		if (kr != KERN_SUCCESS) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_HEAD_NESTED_ERROR), kr /* arg */);
			goto done;
		}
	}

	if (tail_size) {
		/*
		 * Extract "tail_copy" out of "copy".
		 */
		tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
		tail_copy->cpy_hdr.entries_pageable =
		    copy->cpy_hdr.entries_pageable;
		tail_copy->cpy_hdr.page_shift = copy_page_shift;

		tail_copy->offset = copy->offset + copy_size - tail_size;
		tail_copy->size = tail_size;

		copy->size -= tail_size;
		copy_size -= tail_size;
		assert(copy_size > 0);

		/* move the tail's entry from "copy" into "tail_copy" */
		entry = vm_map_copy_last_entry(copy);
		vm_map_copy_clip_start(copy, entry, tail_copy->offset);
		entry = vm_map_copy_last_entry(copy);
		vm_map_copy_entry_unlink(copy, entry);
		vm_map_copy_entry_link(tail_copy,
		    vm_map_copy_last_entry(tail_copy),
		    entry);
	}

	/*
	 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
	 * we want to avoid TOCTOU issues w.r.t copy->size but
	 * we don't need to change vm_map_copy_overwrite_nested()
	 * and all other vm_map_copy_overwrite variants.
	 *
	 * So we assign the original copy_size that was passed into
	 * this routine back to copy.
	 *
	 * This use of local 'copy_size' passed into this routine is
	 * to try and protect against TOCTOU attacks where the kernel
	 * has been exploited. We don't expect this to be an issue
	 * during normal system operation.
	 */
	assertf(copy->size == copy_size,
	    "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
	copy->size = copy_size;

	/*
	 * Copy most (or possibly all) of the data.
	 */
	kr = vm_map_copy_overwrite_nested(dst_map,
	    dst_addr + head_size,
	    copy,
	    interruptible,
	    (pmap_t) NULL,
	    FALSE);
	if (kr != KERN_SUCCESS) {
		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_NESTED_ERROR), kr /* arg */);
		goto done;
	}

	if (tail_size) {
		kr = vm_map_copy_overwrite_nested(dst_map,
		    tail_addr,
		    tail_copy,
		    interruptible,
		    (pmap_t) NULL,
		    FALSE);
		if (kr) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_TAIL_NESTED_ERROR), kr /* arg */);
		}
	}

done:
	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
	if (kr == KERN_SUCCESS) {
		/*
		 * Discard all the copy maps.
		 */
		if (head_copy) {
			vm_map_copy_discard(head_copy);
			head_copy = NULL;
		}
		vm_map_copy_discard(copy);
		if (tail_copy) {
			vm_map_copy_discard(tail_copy);
			tail_copy = NULL;
		}
	} else {
		/*
		 * Re-assemble the original copy map.
		 */
		if (head_copy) {
			entry = vm_map_copy_first_entry(head_copy);
			vm_map_copy_entry_unlink(head_copy, entry);
			vm_map_copy_entry_link(copy,
			    vm_map_copy_to_entry(copy),
			    entry);
			copy->offset -= head_size;
			copy->size += head_size;
			vm_map_copy_discard(head_copy);
			head_copy = NULL;
		}
		if (tail_copy) {
			entry = vm_map_copy_last_entry(tail_copy);
			vm_map_copy_entry_unlink(tail_copy, entry);
			vm_map_copy_entry_link(copy,
			    vm_map_copy_last_entry(copy),
			    entry);
			copy->size += tail_size;
			vm_map_copy_discard(tail_copy);
			tail_copy = NULL;
		}
	}
	return kr;
}
10322
10323
10324 /*
10325 * Routine: vm_map_copy_overwrite_unaligned [internal use only]
10326 *
 * Description:
10328 * Physically copy unaligned data
10329 *
10330 * Implementation:
 *	Unaligned parts of pages have to be physically copied. We use
 *	a modified form of vm_fault_copy (which understands non-aligned
 *	page offsets and sizes) to do the copy. We attempt to copy as
 *	much memory in one go as possible, however vm_fault_copy copies
10335 * within 1 memory object so we have to find the smaller of "amount left"
10336 * "source object data size" and "target object data size". With
10337 * unaligned data we don't need to split regions, therefore the source
10338 * (copy) object should be one map entry, the target range may be split
10339 * over multiple map entries however. In any event we are pessimistic
10340 * about these assumptions.
10341 *
10342 * Callers of this function must call vm_map_copy_require on
10343 * previously created vm_map_copy_t or pass a newly created
10344 * one to ensure that it hasn't been forged.
10345 *
10346 * Assumptions:
10347 * dst_map is locked on entry and is return locked on success,
10348 * unlocked on error.
10349 */
10350
10351 static kern_return_t
vm_map_copy_overwrite_unaligned(vm_map_t dst_map,vm_map_entry_t entry,vm_map_copy_t copy,vm_map_offset_t start,boolean_t discard_on_success)10352 vm_map_copy_overwrite_unaligned(
10353 vm_map_t dst_map,
10354 vm_map_entry_t entry,
10355 vm_map_copy_t copy,
10356 vm_map_offset_t start,
10357 boolean_t discard_on_success)
10358 {
10359 vm_map_entry_t copy_entry;
10360 vm_map_entry_t copy_entry_next;
10361 vm_map_version_t version;
10362 vm_object_t dst_object;
10363 vm_object_offset_t dst_offset;
10364 vm_object_offset_t src_offset;
10365 vm_object_offset_t entry_offset;
10366 vm_map_offset_t entry_end;
10367 vm_map_size_t src_size,
10368 dst_size,
10369 copy_size,
10370 amount_left;
10371 kern_return_t kr = KERN_SUCCESS;
10372
10373
10374 copy_entry = vm_map_copy_first_entry(copy);
10375
10376 vm_map_lock_write_to_read(dst_map);
10377
10378 src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
10379 amount_left = copy->size;
10380 /*
10381 * unaligned so we never clipped this entry, we need the offset into
10382 * the vm_object not just the data.
10383 */
10384 while (amount_left > 0) {
10385 if (entry == vm_map_to_entry(dst_map)) {
10386 vm_map_unlock_read(dst_map);
10387 return KERN_INVALID_ADDRESS;
10388 }
10389
10390 /* "start" must be within the current map entry */
10391 assert((start >= entry->vme_start) && (start < entry->vme_end));
10392
10393 /*
10394 * Check protection again
10395 */
10396 if (!(entry->protection & VM_PROT_WRITE)) {
10397 vm_map_unlock_read(dst_map);
10398 return KERN_PROTECTION_FAILURE;
10399 }
10400 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10401 vm_map_unlock_read(dst_map);
10402 return KERN_PROTECTION_FAILURE;
10403 }
10404
10405 /*
10406 * If the entry is in transition, we must wait
10407 * for it to exit that state. Anything could happen
10408 * when we unlock the map, so start over.
10409 */
10410 if (entry->in_transition) {
10411 /*
10412 * Say that we are waiting, and wait for entry.
10413 */
10414 entry->needs_wakeup = TRUE;
10415 vm_map_entry_wait(dst_map, THREAD_UNINT);
10416
10417 goto RetryLookup;
10418 }
10419
10420 dst_offset = start - entry->vme_start;
10421
10422 dst_size = entry->vme_end - start;
10423
10424 src_size = copy_entry->vme_end -
10425 (copy_entry->vme_start + src_offset);
10426
10427 if (dst_size < src_size) {
10428 /*
10429 * we can only copy dst_size bytes before
10430 * we have to get the next destination entry
10431 */
10432 copy_size = dst_size;
10433 } else {
10434 /*
10435 * we can only copy src_size bytes before
10436 * we have to get the next source copy entry
10437 */
10438 copy_size = src_size;
10439 }
10440
10441 if (copy_size > amount_left) {
10442 copy_size = amount_left;
10443 }
10444 /*
10445 * Entry needs copy, create a shadow shadow object for
10446 * Copy on write region.
10447 */
10448 if (entry->needs_copy) {
10449 if (vm_map_lock_read_to_write(dst_map)) {
10450 vm_map_lock_read(dst_map);
10451 goto RetryLookup;
10452 }
10453 VME_OBJECT_SHADOW(entry,
10454 (vm_map_size_t)(entry->vme_end
10455 - entry->vme_start),
10456 vm_map_always_shadow(dst_map));
10457 entry->needs_copy = FALSE;
10458 vm_map_lock_write_to_read(dst_map);
10459 }
10460 dst_object = VME_OBJECT(entry);
10461 /*
10462 * unlike with the virtual (aligned) copy we're going
10463 * to fault on it therefore we need a target object.
10464 */
10465 if (dst_object == VM_OBJECT_NULL) {
10466 if (vm_map_lock_read_to_write(dst_map)) {
10467 vm_map_lock_read(dst_map);
10468 goto RetryLookup;
10469 }
10470 dst_object = vm_object_allocate((vm_map_size_t)
10471 entry->vme_end - entry->vme_start);
10472 VME_OBJECT_SET(entry, dst_object, false, 0);
10473 VME_OFFSET_SET(entry, 0);
10474 assert(entry->use_pmap);
10475 vm_map_lock_write_to_read(dst_map);
10476 }
10477 /*
10478 * Take an object reference and unlock map. The "entry" may
10479 * disappear or change when the map is unlocked.
10480 */
10481 vm_object_reference(dst_object);
10482 version.main_timestamp = dst_map->timestamp;
10483 entry_offset = VME_OFFSET(entry);
10484 entry_end = entry->vme_end;
10485 vm_map_unlock_read(dst_map);
10486 /*
10487 * Copy as much as possible in one pass
10488 */
10489 kr = vm_fault_copy(
10490 VME_OBJECT(copy_entry),
10491 VME_OFFSET(copy_entry) + src_offset,
10492 ©_size,
10493 dst_object,
10494 entry_offset + dst_offset,
10495 dst_map,
10496 &version,
10497 THREAD_UNINT );
10498
10499 start += copy_size;
10500 src_offset += copy_size;
10501 amount_left -= copy_size;
10502 /*
10503 * Release the object reference
10504 */
10505 vm_object_deallocate(dst_object);
10506 /*
10507 * If a hard error occurred, return it now
10508 */
10509 if (kr != KERN_SUCCESS) {
10510 return kr;
10511 }
10512
10513 if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
10514 || amount_left == 0) {
10515 /*
10516 * all done with this copy entry, dispose.
10517 */
10518 copy_entry_next = copy_entry->vme_next;
10519
10520 if (discard_on_success) {
10521 vm_map_copy_entry_unlink(copy, copy_entry);
10522 assert(!copy_entry->is_sub_map);
10523 vm_object_deallocate(VME_OBJECT(copy_entry));
10524 vm_map_copy_entry_dispose(copy_entry);
10525 }
10526
10527 if (copy_entry_next == vm_map_copy_to_entry(copy) &&
10528 amount_left) {
10529 /*
10530 * not finished copying but run out of source
10531 */
10532 return KERN_INVALID_ADDRESS;
10533 }
10534
10535 copy_entry = copy_entry_next;
10536
10537 src_offset = 0;
10538 }
10539
10540 if (amount_left == 0) {
10541 return KERN_SUCCESS;
10542 }
10543
10544 vm_map_lock_read(dst_map);
10545 if (version.main_timestamp == dst_map->timestamp) {
10546 if (start == entry_end) {
10547 /*
10548 * destination region is split. Use the version
10549 * information to avoid a lookup in the normal
10550 * case.
10551 */
10552 entry = entry->vme_next;
10553 /*
10554 * should be contiguous. Fail if we encounter
10555 * a hole in the destination.
10556 */
10557 if (start != entry->vme_start) {
10558 vm_map_unlock_read(dst_map);
10559 return KERN_INVALID_ADDRESS;
10560 }
10561 }
10562 } else {
10563 /*
10564 * Map version check failed.
10565 * we must lookup the entry because somebody
10566 * might have changed the map behind our backs.
10567 */
10568 RetryLookup:
10569 if (!vm_map_lookup_entry(dst_map, start, &entry)) {
10570 vm_map_unlock_read(dst_map);
10571 return KERN_INVALID_ADDRESS;
10572 }
10573 }
10574 }/* while */
10575
10576 return KERN_SUCCESS;
10577 }/* vm_map_copy_overwrite_unaligned */
10578
10579 /*
10580 * Routine: vm_map_copy_overwrite_aligned [internal use only]
10581 *
10582 * Description:
10583 * Does all the vm_trickery possible for whole pages.
10584 *
10585 * Implementation:
10586 *
10587 * If there are no permanent objects in the destination,
10588 * and the source and destination map entry zones match,
10589 * and the destination map entry is not shared,
10590 * then the map entries can be deleted and replaced
10591 * with those from the copy. The following code is the
10592 * basic idea of what to do, but there are lots of annoying
10593 * little details about getting protection and inheritance
10594 * right. Should add protection, inheritance, and sharing checks
10595 * to the above pass and make sure that no wiring is involved.
10596 *
10597 * Callers of this function must call vm_map_copy_require on
10598 * previously created vm_map_copy_t or pass a newly created
10599 * one to ensure that it hasn't been forged.
10600 */
10601
/*
 * Global diagnostic counters for vm_map_copy_overwrite_aligned().
 * NOTE(review): the code that increments these is not visible in this
 * chunk; they appear to count source objects that disqualify the
 * aligned virtual-copy optimization (not internal / not symmetric /
 * too large) — confirm against the body of
 * vm_map_copy_overwrite_aligned().
 */
int vm_map_copy_overwrite_aligned_src_not_internal = 0;
int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
int vm_map_copy_overwrite_aligned_src_large = 0;
10605
10606 static kern_return_t
vm_map_copy_overwrite_aligned(vm_map_t dst_map,vm_map_entry_t tmp_entry,vm_map_copy_t copy,vm_map_offset_t start,__unused pmap_t pmap)10607 vm_map_copy_overwrite_aligned(
10608 vm_map_t dst_map,
10609 vm_map_entry_t tmp_entry,
10610 vm_map_copy_t copy,
10611 vm_map_offset_t start,
10612 __unused pmap_t pmap)
10613 {
10614 vm_object_t object;
10615 vm_map_entry_t copy_entry;
10616 vm_map_size_t copy_size;
10617 vm_map_size_t size;
10618 vm_map_entry_t entry;
10619
10620 while ((copy_entry = vm_map_copy_first_entry(copy))
10621 != vm_map_copy_to_entry(copy)) {
10622 copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10623
10624 entry = tmp_entry;
10625 if (entry->is_sub_map) {
10626 /* unnested when clipped earlier */
10627 assert(!entry->use_pmap);
10628 }
10629 if (entry == vm_map_to_entry(dst_map)) {
10630 vm_map_unlock(dst_map);
10631 return KERN_INVALID_ADDRESS;
10632 }
10633 size = (entry->vme_end - entry->vme_start);
10634 /*
10635 * Make sure that no holes popped up in the
10636 * address map, and that the protection is
10637 * still valid, in case the map was unlocked
10638 * earlier.
10639 */
10640
10641 if ((entry->vme_start != start) || ((entry->is_sub_map)
10642 && !entry->needs_copy)) {
10643 vm_map_unlock(dst_map);
10644 return KERN_INVALID_ADDRESS;
10645 }
10646 assert(entry != vm_map_to_entry(dst_map));
10647
10648 /*
10649 * Check protection again
10650 */
10651
10652 if (!(entry->protection & VM_PROT_WRITE)) {
10653 vm_map_unlock(dst_map);
10654 return KERN_PROTECTION_FAILURE;
10655 }
10656
10657 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10658 vm_map_unlock(dst_map);
10659 return KERN_PROTECTION_FAILURE;
10660 }
10661
10662 /*
10663 * If the entry is in transition, we must wait
10664 * for it to exit that state. Anything could happen
10665 * when we unlock the map, so start over.
10666 */
10667 if (entry->in_transition) {
10668 /*
10669 * Say that we are waiting, and wait for entry.
10670 */
10671 entry->needs_wakeup = TRUE;
10672 vm_map_entry_wait(dst_map, THREAD_UNINT);
10673
10674 goto RetryLookup;
10675 }
10676
10677 /*
10678 * Adjust to source size first
10679 */
10680
10681 if (copy_size < size) {
10682 if (entry->map_aligned &&
10683 !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10684 VM_MAP_PAGE_MASK(dst_map))) {
10685 /* no longer map-aligned */
10686 entry->map_aligned = FALSE;
10687 }
10688 vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10689 size = copy_size;
10690 }
10691
10692 /*
10693 * Adjust to destination size
10694 */
10695
10696 if (size < copy_size) {
10697 vm_map_copy_clip_end(copy, copy_entry,
10698 copy_entry->vme_start + size);
10699 copy_size = size;
10700 }
10701
10702 assert((entry->vme_end - entry->vme_start) == size);
10703 assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10704 assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10705
10706 /*
10707 * If the destination contains temporary unshared memory,
10708 * we can perform the copy by throwing it away and
10709 * installing the source data.
10710 */
10711
10712 object = VME_OBJECT(entry);
10713 if ((!entry->is_shared &&
10714 ((object == VM_OBJECT_NULL) ||
10715 (object->internal && !object->true_share))) ||
10716 entry->needs_copy) {
10717 vm_object_t old_object = VME_OBJECT(entry);
10718 vm_object_offset_t old_offset = VME_OFFSET(entry);
10719 vm_object_offset_t offset;
10720
10721 /*
10722 * Ensure that the source and destination aren't
10723 * identical
10724 */
10725 if (old_object == VME_OBJECT(copy_entry) &&
10726 old_offset == VME_OFFSET(copy_entry)) {
10727 vm_map_copy_entry_unlink(copy, copy_entry);
10728 vm_map_copy_entry_dispose(copy_entry);
10729
10730 if (old_object != VM_OBJECT_NULL) {
10731 vm_object_deallocate(old_object);
10732 }
10733
10734 start = tmp_entry->vme_end;
10735 tmp_entry = tmp_entry->vme_next;
10736 continue;
10737 }
10738
10739 #if XNU_TARGET_OS_OSX
10740 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10741 #define __TRADEOFF1_COPY_SIZE (128 * 1024) /* 128 KB */
10742 if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10743 VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10744 copy_size <= __TRADEOFF1_COPY_SIZE) {
10745 /*
10746 * Virtual vs. Physical copy tradeoff #1.
10747 *
10748 * Copying only a few pages out of a large
10749 * object: do a physical copy instead of
10750 * a virtual copy, to avoid possibly keeping
10751 * the entire large object alive because of
10752 * those few copy-on-write pages.
10753 */
10754 vm_map_copy_overwrite_aligned_src_large++;
10755 goto slow_copy;
10756 }
10757 #endif /* XNU_TARGET_OS_OSX */
10758
10759 if ((dst_map->pmap != kernel_pmap) &&
10760 (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10761 (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10762 vm_object_t new_object, new_shadow;
10763
10764 /*
10765 * We're about to map something over a mapping
10766 * established by malloc()...
10767 */
10768 new_object = VME_OBJECT(copy_entry);
10769 if (new_object != VM_OBJECT_NULL) {
10770 vm_object_lock_shared(new_object);
10771 }
10772 while (new_object != VM_OBJECT_NULL &&
10773 #if XNU_TARGET_OS_OSX
10774 !new_object->true_share &&
10775 new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10776 #endif /* XNU_TARGET_OS_OSX */
10777 new_object->internal) {
10778 new_shadow = new_object->shadow;
10779 if (new_shadow == VM_OBJECT_NULL) {
10780 break;
10781 }
10782 vm_object_lock_shared(new_shadow);
10783 vm_object_unlock(new_object);
10784 new_object = new_shadow;
10785 }
10786 if (new_object != VM_OBJECT_NULL) {
10787 if (!new_object->internal) {
10788 /*
10789 * The new mapping is backed
10790 * by an external object. We
10791 * don't want malloc'ed memory
10792 * to be replaced with such a
10793 * non-anonymous mapping, so
10794 * let's go off the optimized
10795 * path...
10796 */
10797 vm_map_copy_overwrite_aligned_src_not_internal++;
10798 vm_object_unlock(new_object);
10799 goto slow_copy;
10800 }
10801 #if XNU_TARGET_OS_OSX
10802 if (new_object->true_share ||
10803 new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10804 /*
10805 * Same if there's a "true_share"
10806 * object in the shadow chain, or
10807 * an object with a non-default
10808 * (SYMMETRIC) copy strategy.
10809 */
10810 vm_map_copy_overwrite_aligned_src_not_symmetric++;
10811 vm_object_unlock(new_object);
10812 goto slow_copy;
10813 }
10814 #endif /* XNU_TARGET_OS_OSX */
10815 vm_object_unlock(new_object);
10816 }
10817 /*
10818 * The new mapping is still backed by
10819 * anonymous (internal) memory, so it's
10820 * OK to substitute it for the original
10821 * malloc() mapping.
10822 */
10823 }
10824
10825 if (old_object != VM_OBJECT_NULL) {
10826 assert(!entry->vme_permanent);
10827 if (entry->is_sub_map) {
10828 if (entry->use_pmap) {
10829 #ifndef NO_NESTED_PMAP
10830 pmap_unnest(dst_map->pmap,
10831 (addr64_t)entry->vme_start,
10832 entry->vme_end - entry->vme_start);
10833 #endif /* NO_NESTED_PMAP */
10834 if (dst_map->mapped_in_other_pmaps) {
10835 /* clean up parent */
10836 /* map/maps */
10837 vm_map_submap_pmap_clean(
10838 dst_map, entry->vme_start,
10839 entry->vme_end,
10840 VME_SUBMAP(entry),
10841 VME_OFFSET(entry));
10842 }
10843 } else {
10844 vm_map_submap_pmap_clean(
10845 dst_map, entry->vme_start,
10846 entry->vme_end,
10847 VME_SUBMAP(entry),
10848 VME_OFFSET(entry));
10849 }
10850 vm_map_deallocate(VME_SUBMAP(entry));
10851 } else {
10852 if (dst_map->mapped_in_other_pmaps) {
10853 vm_object_pmap_protect_options(
10854 VME_OBJECT(entry),
10855 VME_OFFSET(entry),
10856 entry->vme_end
10857 - entry->vme_start,
10858 PMAP_NULL,
10859 PAGE_SIZE,
10860 entry->vme_start,
10861 VM_PROT_NONE,
10862 PMAP_OPTIONS_REMOVE);
10863 } else {
10864 pmap_remove_options(
10865 dst_map->pmap,
10866 (addr64_t)(entry->vme_start),
10867 (addr64_t)(entry->vme_end),
10868 PMAP_OPTIONS_REMOVE);
10869 }
10870 vm_object_deallocate(old_object);
10871 }
10872 }
10873
10874 if (entry->iokit_acct) {
10875 /* keep using iokit accounting */
10876 entry->use_pmap = FALSE;
10877 } else {
10878 /* use pmap accounting */
10879 entry->use_pmap = TRUE;
10880 }
10881 assert(!entry->vme_permanent);
10882 VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
10883 object = VME_OBJECT(entry);
10884 entry->needs_copy = copy_entry->needs_copy;
10885 entry->wired_count = 0;
10886 entry->user_wired_count = 0;
10887 offset = VME_OFFSET(copy_entry);
10888 VME_OFFSET_SET(entry, offset);
10889
10890 vm_map_copy_entry_unlink(copy, copy_entry);
10891 vm_map_copy_entry_dispose(copy_entry);
10892
10893 /*
10894 * we could try to push pages into the pmap at this point, BUT
10895 * this optimization only saved on average 2 us per page if ALL
10896 * the pages in the source were currently mapped
10897 * and ALL the pages in the dest were touched, if there were fewer
10898 * than 2/3 of the pages touched, this optimization actually cost more cycles
10899 * it also puts a lot of pressure on the pmap layer w/r to mapping structures
10900 */
10901
10902 /*
10903 * Set up for the next iteration. The map
10904 * has not been unlocked, so the next
10905 * address should be at the end of this
10906 * entry, and the next map entry should be
10907 * the one following it.
10908 */
10909
10910 start = tmp_entry->vme_end;
10911 tmp_entry = tmp_entry->vme_next;
10912 } else {
10913 vm_map_version_t version;
10914 vm_object_t dst_object;
10915 vm_object_offset_t dst_offset;
10916 kern_return_t r;
10917
10918 slow_copy:
10919 if (entry->needs_copy) {
10920 VME_OBJECT_SHADOW(entry,
10921 (entry->vme_end -
10922 entry->vme_start),
10923 vm_map_always_shadow(dst_map));
10924 entry->needs_copy = FALSE;
10925 }
10926
10927 dst_object = VME_OBJECT(entry);
10928 dst_offset = VME_OFFSET(entry);
10929
10930 /*
10931 * Take an object reference, and record
10932 * the map version information so that the
10933 * map can be safely unlocked.
10934 */
10935
10936 if (dst_object == VM_OBJECT_NULL) {
10937 /*
10938 * We would usually have just taken the
10939 * optimized path above if the destination
10940 * object has not been allocated yet. But we
10941 * now disable that optimization if the copy
10942 * entry's object is not backed by anonymous
10943 * memory to avoid replacing malloc'ed
10944 * (i.e. re-usable) anonymous memory with a
10945 * not-so-anonymous mapping.
10946 * So we have to handle this case here and
10947 * allocate a new VM object for this map entry.
10948 */
10949 dst_object = vm_object_allocate(
10950 entry->vme_end - entry->vme_start);
10951 dst_offset = 0;
10952 VME_OBJECT_SET(entry, dst_object, false, 0);
10953 VME_OFFSET_SET(entry, dst_offset);
10954 assert(entry->use_pmap);
10955 }
10956
10957 vm_object_reference(dst_object);
10958
10959 /* account for unlock bumping up timestamp */
10960 version.main_timestamp = dst_map->timestamp + 1;
10961
10962 vm_map_unlock(dst_map);
10963
10964 /*
10965 * Copy as much as possible in one pass
10966 */
10967
10968 copy_size = size;
10969 r = vm_fault_copy(
10970 VME_OBJECT(copy_entry),
10971 VME_OFFSET(copy_entry),
10972 ©_size,
10973 dst_object,
10974 dst_offset,
10975 dst_map,
10976 &version,
10977 THREAD_UNINT );
10978
10979 /*
10980 * Release the object reference
10981 */
10982
10983 vm_object_deallocate(dst_object);
10984
10985 /*
10986 * If a hard error occurred, return it now
10987 */
10988
10989 if (r != KERN_SUCCESS) {
10990 return r;
10991 }
10992
10993 if (copy_size != 0) {
10994 /*
10995 * Dispose of the copied region
10996 */
10997
10998 vm_map_copy_clip_end(copy, copy_entry,
10999 copy_entry->vme_start + copy_size);
11000 vm_map_copy_entry_unlink(copy, copy_entry);
11001 vm_object_deallocate(VME_OBJECT(copy_entry));
11002 vm_map_copy_entry_dispose(copy_entry);
11003 }
11004
11005 /*
11006 * Pick up in the destination map where we left off.
11007 *
11008 * Use the version information to avoid a lookup
11009 * in the normal case.
11010 */
11011
11012 start += copy_size;
11013 vm_map_lock(dst_map);
11014 if (version.main_timestamp == dst_map->timestamp &&
11015 copy_size != 0) {
11016 /* We can safely use saved tmp_entry value */
11017
11018 if (tmp_entry->map_aligned &&
11019 !VM_MAP_PAGE_ALIGNED(
11020 start,
11021 VM_MAP_PAGE_MASK(dst_map))) {
11022 /* no longer map-aligned */
11023 tmp_entry->map_aligned = FALSE;
11024 }
11025 vm_map_clip_end(dst_map, tmp_entry, start);
11026 tmp_entry = tmp_entry->vme_next;
11027 } else {
11028 /* Must do lookup of tmp_entry */
11029
11030 RetryLookup:
11031 if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
11032 vm_map_unlock(dst_map);
11033 return KERN_INVALID_ADDRESS;
11034 }
11035 if (tmp_entry->map_aligned &&
11036 !VM_MAP_PAGE_ALIGNED(
11037 start,
11038 VM_MAP_PAGE_MASK(dst_map))) {
11039 /* no longer map-aligned */
11040 tmp_entry->map_aligned = FALSE;
11041 }
11042 vm_map_clip_start(dst_map, tmp_entry, start);
11043 }
11044 }
11045 }/* while */
11046
11047 return KERN_SUCCESS;
11048 }/* vm_map_copy_overwrite_aligned */
11049
11050 /*
11051 * Routine: vm_map_copyin_kernel_buffer [internal use only]
11052 *
11053 * Description:
11054 * Copy in data to a kernel buffer from space in the
11055 * source map. The original space may be optionally
11056 * deallocated.
11057 *
11058 * If successful, returns a new copy object.
11059 */
11060 static kern_return_t
vm_map_copyin_kernel_buffer(vm_map_t src_map,vm_map_offset_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)11061 vm_map_copyin_kernel_buffer(
11062 vm_map_t src_map,
11063 vm_map_offset_t src_addr,
11064 vm_map_size_t len,
11065 boolean_t src_destroy,
11066 vm_map_copy_t *copy_result)
11067 {
11068 kern_return_t kr;
11069 vm_map_copy_t copy;
11070 void *kdata;
11071
11072 if (len > msg_ool_size_small) {
11073 return KERN_INVALID_ARGUMENT;
11074 }
11075
11076 kdata = kalloc_data(len, Z_WAITOK);
11077 if (kdata == NULL) {
11078 return KERN_RESOURCE_SHORTAGE;
11079 }
11080 kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
11081 if (kr != KERN_SUCCESS) {
11082 kfree_data(kdata, len);
11083 return kr;
11084 }
11085
11086 copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
11087 copy->cpy_kdata = kdata;
11088 copy->size = len;
11089 copy->offset = 0;
11090
11091 if (src_destroy) {
11092 vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
11093
11094 if (src_map == kernel_map) {
11095 flags |= VM_MAP_REMOVE_KUNWIRE;
11096 }
11097
11098 (void)vm_map_remove_guard(src_map,
11099 vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11100 vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
11101 flags, KMEM_GUARD_NONE);
11102 }
11103
11104 *copy_result = copy;
11105 return KERN_SUCCESS;
11106 }
11107
11108 /*
11109 * Routine: vm_map_copyout_kernel_buffer [internal use only]
11110 *
11111 * Description:
11112 * Copy out data from a kernel buffer into space in the
 * destination map.  The space may be optionally dynamically
11114 * allocated.
11115 *
11116 * If successful, consumes the copy object.
11117 * Otherwise, the caller is responsible for it.
11118 *
11119 * Callers of this function must call vm_map_copy_require on
11120 * previously created vm_map_copy_t or pass a newly created
11121 * one to ensure that it hasn't been forged.
11122 */
/* Debug counter: copyout() failures into a foreign (non-current) map. */
static int vm_map_copyout_kernel_buffer_failures = 0;
static kern_return_t
vm_map_copyout_kernel_buffer(
	vm_map_t		map,
	vm_map_address_t	*addr,	/* IN/OUT */
	vm_map_copy_t		copy,
	vm_map_size_t		copy_size,
	boolean_t		overwrite,
	boolean_t		consume_on_success)
{
	kern_return_t kr = KERN_SUCCESS;
	thread_t thread = current_thread();

	/* Caller has prevalidated the size against the copy object. */
	assert(copy->size == copy_size);

	/*
	 * check for corrupted vm_map_copy structure
	 */
	if (copy_size > msg_ool_size_small || copy->offset) {
		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
		    (long long)copy->size, (long long)copy->offset);
	}

	if (!overwrite) {
		/*
		 * Allocate space in the target map for the data
		 */
		vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();

		if (map == kernel_map) {
			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
		}

		*addr = 0;
		kr = vm_map_enter(map,
		    addr,
		    vm_map_round_page(copy_size,
		    VM_MAP_PAGE_MASK(map)),
		    (vm_map_offset_t) 0,
		    vmk_flags,
		    VM_OBJECT_NULL,
		    (vm_object_offset_t) 0,
		    FALSE,
		    VM_PROT_DEFAULT,
		    VM_PROT_ALL,
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
#if KASAN
		if (map->pmap == kernel_pmap) {
			kasan_notify_address(*addr, copy->size);
		}
#endif
	}

	/*
	 * Copyout the data from the kernel buffer to the target map.
	 */
	if (thread->map == map) {
		/*
		 * If the target map is the current map, just do
		 * the copy.
		 */
		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			kr = KERN_INVALID_ADDRESS;
		}
	} else {
		vm_map_t oldmap;

		/*
		 * If the target map is another map, assume the
		 * target's address space identity for the duration
		 * of the copy.
		 */
		vm_map_reference(map);
		oldmap = vm_map_switch(map);

		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			vm_map_copyout_kernel_buffer_failures++;
			kr = KERN_INVALID_ADDRESS;
		}

		/* Restore our own address space identity and drop the ref. */
		(void) vm_map_switch(oldmap);
		vm_map_deallocate(map);
	}

	if (kr != KERN_SUCCESS) {
		/* the copy failed, clean up */
		if (!overwrite) {
			/*
			 * Deallocate the space we allocated in the target map.
			 */
			(void) vm_map_remove(map,
			    vm_map_trunc_page(*addr,
			    VM_MAP_PAGE_MASK(map)),
			    vm_map_round_page((*addr +
			    vm_map_round_page(copy_size,
			    VM_MAP_PAGE_MASK(map))),
			    VM_MAP_PAGE_MASK(map)));
			*addr = 0;
		}
	} else {
		/* copy was successful, discard the copy structure */
		if (consume_on_success) {
			kfree_data(copy->cpy_kdata, copy_size);
			zfree_id(ZONE_ID_VM_MAP_COPY, copy);
		}
	}

	return kr;
}
11237
11238 /*
11239 * Routine: vm_map_copy_insert [internal use only]
11240 *
11241 * Description:
11242 * Link a copy chain ("copy") into a map at the
11243 * specified location (after "where").
11244 *
11245 * Callers of this function must call vm_map_copy_require on
11246 * previously created vm_map_copy_t or pass a newly created
11247 * one to ensure that it hasn't been forged.
11248 * Side effects:
11249 * The copy chain is destroyed.
11250 */
11251 static void
vm_map_copy_insert(vm_map_t map,vm_map_entry_t after_where,vm_map_copy_t copy)11252 vm_map_copy_insert(
11253 vm_map_t map,
11254 vm_map_entry_t after_where,
11255 vm_map_copy_t copy)
11256 {
11257 vm_map_entry_t entry;
11258
11259 while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11260 entry = vm_map_copy_first_entry(copy);
11261 vm_map_copy_entry_unlink(copy, entry);
11262 vm_map_store_entry_link(map, after_where, entry,
11263 VM_MAP_KERNEL_FLAGS_NONE);
11264 after_where = entry;
11265 }
11266 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11267 }
11268
11269 /*
11270 * Callers of this function must call vm_map_copy_require on
11271 * previously created vm_map_copy_t or pass a newly created
11272 * one to ensure that it hasn't been forged.
11273 */
11274 void
vm_map_copy_remap(vm_map_t map,vm_map_entry_t where,vm_map_copy_t copy,vm_map_offset_t adjustment,vm_prot_t cur_prot,vm_prot_t max_prot,vm_inherit_t inheritance)11275 vm_map_copy_remap(
11276 vm_map_t map,
11277 vm_map_entry_t where,
11278 vm_map_copy_t copy,
11279 vm_map_offset_t adjustment,
11280 vm_prot_t cur_prot,
11281 vm_prot_t max_prot,
11282 vm_inherit_t inheritance)
11283 {
11284 vm_map_entry_t copy_entry, new_entry;
11285
11286 for (copy_entry = vm_map_copy_first_entry(copy);
11287 copy_entry != vm_map_copy_to_entry(copy);
11288 copy_entry = copy_entry->vme_next) {
11289 /* get a new VM map entry for the map */
11290 new_entry = vm_map_entry_create(map);
11291 /* copy the "copy entry" to the new entry */
11292 vm_map_entry_copy(map, new_entry, copy_entry);
11293 /* adjust "start" and "end" */
11294 new_entry->vme_start += adjustment;
11295 new_entry->vme_end += adjustment;
11296 /* clear some attributes */
11297 new_entry->inheritance = inheritance;
11298 new_entry->protection = cur_prot;
11299 new_entry->max_protection = max_prot;
11300 new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11301 /* take an extra reference on the entry's "object" */
11302 if (new_entry->is_sub_map) {
11303 assert(!new_entry->use_pmap); /* not nested */
11304 vm_map_reference(VME_SUBMAP(new_entry));
11305 } else {
11306 vm_object_reference(VME_OBJECT(new_entry));
11307 }
11308 /* insert the new entry in the map */
11309 vm_map_store_entry_link(map, where, new_entry,
11310 VM_MAP_KERNEL_FLAGS_NONE);
11311 /* continue inserting the "copy entries" after the new entry */
11312 where = new_entry;
11313 }
11314 }
11315
11316
11317 /*
11318 * Returns true if *size matches (or is in the range of) copy->size.
11319 * Upon returning true, the *size field is updated with the actual size of the
11320 * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11321 */
11322 boolean_t
vm_map_copy_validate_size(vm_map_t dst_map,vm_map_copy_t copy,vm_map_size_t * size)11323 vm_map_copy_validate_size(
11324 vm_map_t dst_map,
11325 vm_map_copy_t copy,
11326 vm_map_size_t *size)
11327 {
11328 if (copy == VM_MAP_COPY_NULL) {
11329 return FALSE;
11330 }
11331
11332 /*
11333 * Assert that the vm_map_copy is coming from the right
11334 * zone and hasn't been forged
11335 */
11336 vm_map_copy_require(copy);
11337
11338 vm_map_size_t copy_sz = copy->size;
11339 vm_map_size_t sz = *size;
11340 switch (copy->type) {
11341 case VM_MAP_COPY_KERNEL_BUFFER:
11342 if (sz == copy_sz) {
11343 return TRUE;
11344 }
11345 break;
11346 case VM_MAP_COPY_ENTRY_LIST:
11347 /*
11348 * potential page-size rounding prevents us from exactly
11349 * validating this flavor of vm_map_copy, but we can at least
11350 * assert that it's within a range.
11351 */
11352 if (copy_sz >= sz &&
11353 copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11354 *size = copy_sz;
11355 return TRUE;
11356 }
11357 break;
11358 default:
11359 break;
11360 }
11361 return FALSE;
11362 }
11363
11364 /*
11365 * Routine: vm_map_copyout_size
11366 *
11367 * Description:
11368 * Copy out a copy chain ("copy") into newly-allocated
11369 * space in the destination map. Uses a prevalidated
11370 * size for the copy object (vm_map_copy_validate_size).
11371 *
11372 * If successful, consumes the copy object.
11373 * Otherwise, the caller is responsible for it.
11374 */
11375 kern_return_t
vm_map_copyout_size(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_t copy_size)11376 vm_map_copyout_size(
11377 vm_map_t dst_map,
11378 vm_map_address_t *dst_addr, /* OUT */
11379 vm_map_copy_t copy,
11380 vm_map_size_t copy_size)
11381 {
11382 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11383 TRUE, /* consume_on_success */
11384 VM_PROT_DEFAULT,
11385 VM_PROT_ALL,
11386 VM_INHERIT_DEFAULT);
11387 }
11388
11389 /*
11390 * Routine: vm_map_copyout
11391 *
11392 * Description:
11393 * Copy out a copy chain ("copy") into newly-allocated
11394 * space in the destination map.
11395 *
11396 * If successful, consumes the copy object.
11397 * Otherwise, the caller is responsible for it.
11398 */
11399 kern_return_t
vm_map_copyout(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy)11400 vm_map_copyout(
11401 vm_map_t dst_map,
11402 vm_map_address_t *dst_addr, /* OUT */
11403 vm_map_copy_t copy)
11404 {
11405 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11406 TRUE, /* consume_on_success */
11407 VM_PROT_DEFAULT,
11408 VM_PROT_ALL,
11409 VM_INHERIT_DEFAULT);
11410 }
11411
/*
 * Routine:	vm_map_copyout_internal
 *
 * Description:
 *	Copy out a copy chain ("copy") into newly-allocated space in the
 *	destination map, applying the given protections and inheritance.
 *	If "consume_on_success" is TRUE the copy object is consumed on
 *	success; otherwise its entries are duplicated into the map and
 *	the copy object remains the caller's responsibility.
 */
kern_return_t
vm_map_copyout_internal(
	vm_map_t		dst_map,
	vm_map_address_t	*dst_addr,	/* OUT */
	vm_map_copy_t		copy,
	vm_map_size_t		copy_size,
	boolean_t		consume_on_success,
	vm_prot_t		cur_protection,
	vm_prot_t		max_protection,
	vm_inherit_t		inheritance)
{
	vm_map_size_t		size;
	vm_map_size_t		adjustment;
	vm_map_offset_t		start;
	vm_object_offset_t	vm_copy_start;
	vm_map_entry_t		last;
	vm_map_entry_t		entry;
	vm_map_copy_t		original_copy;
	kern_return_t		kr;
	vm_map_kernel_flags_t	vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();

	/*
	 * Check for null copy object.
	 */

	if (copy == VM_MAP_COPY_NULL) {
		*dst_addr = 0;
		return KERN_SUCCESS;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/* The caller-supplied (prevalidated) size must match the copy. */
	if (copy->size != copy_size) {
		*dst_addr = 0;
		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR), KERN_FAILURE /* arg */);
		return KERN_FAILURE;
	}

	/*
	 * Check for special kernel buffer allocated
	 * by new_ipc_kmsg_copyin.
	 */

	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
		kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr,
		    copy, copy_size, FALSE,
		    consume_on_success);
		if (kr) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */);
		}
		return kr;
	}

	/*
	 * If the copy was built with a different page size than the
	 * destination map uses, adjust it to the target first.  Keep a
	 * handle on the original so it can be discarded/restored below.
	 */
	original_copy = copy;
	if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
		vm_map_copy_t target_copy;
		vm_map_offset_t overmap_start, overmap_end, trimmed_start;

		target_copy = VM_MAP_COPY_NULL;
		DEBUG4K_ADJUST("adjusting...\n");
		kr = vm_map_copy_adjust_to_target(
			copy,
			0, /* offset */
			copy->size, /* size */
			dst_map,
			TRUE, /* copy */
			&target_copy,
			&overmap_start,
			&overmap_end,
			&trimmed_start);
		if (kr != KERN_SUCCESS) {
			DEBUG4K_COPY("adjust failed 0x%x\n", kr);
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */);
			return kr;
		}
		DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
		if (target_copy != copy) {
			/* Work on the adjusted copy from here on. */
			copy = target_copy;
		}
		copy_size = copy->size;
	}

	/*
	 * Find space for the data
	 */

	/* Page-align the copy's extent for the allocation in dst_map. */
	vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
	    VM_MAP_COPY_PAGE_MASK(copy));
	size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
	    VM_MAP_COPY_PAGE_MASK(copy))
	    - vm_copy_start;

	vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map);

	vm_map_lock(dst_map);
	kr = vm_map_locate_space(dst_map, size, 0, vmk_flags,
	    &start, &last);
	if (kr != KERN_SUCCESS) {
		vm_map_unlock(dst_map);
		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */);
		return kr;
	}

	/* Delta between the copy's own addresses and the allocated range. */
	adjustment = start - vm_copy_start;
	if (!consume_on_success) {
		/*
		 * We're not allowed to consume "copy", so we'll have to
		 * copy its map entries into the destination map below.
		 * No need to re-allocate map entries from the correct
		 * (pageable or not) zone, since we'll get new map entries
		 * during the transfer.
		 * We'll also adjust the map entries's "start" and "end"
		 * during the transfer, to keep "copy"'s entries consistent
		 * with its "offset".
		 */
		goto after_adjustments;
	}

	/*
	 * Since we're going to just drop the map
	 * entries from the copy into the destination
	 * map, they must come from the same pool.
	 */

	if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
		/*
		 * Mismatches occur when dealing with the default
		 * pager.
		 */
		vm_map_entry_t	next, new;

		/*
		 * Find the zone that the copies were allocated from
		 */

		entry = vm_map_copy_first_entry(copy);

		/*
		 * Reinitialize the copy so that vm_map_copy_entry_link
		 * will work.
		 */
		vm_map_store_copy_reset(copy, entry);
		copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;

		/*
		 * Copy each entry.
		 */
		while (entry != vm_map_copy_to_entry(copy)) {
			new = vm_map_copy_entry_create(copy);
			vm_map_entry_copy_full(new, entry);
			new->vme_no_copy_on_read = FALSE;
			assert(!new->iokit_acct);
			if (new->is_sub_map) {
				/* clr address space specifics */
				new->use_pmap = FALSE;
			}
			vm_map_copy_entry_link(copy,
			    vm_map_copy_last_entry(copy),
			    new);
			next = entry->vme_next;
			/* Old entry came from the wrong zone; dispose of it. */
			vm_map_entry_dispose(entry);
			entry = next;
		}
	}

	/*
	 * Adjust the addresses in the copy chain, and
	 * reset the region attributes.
	 */

	for (entry = vm_map_copy_first_entry(copy);
	    entry != vm_map_copy_to_entry(copy);
	    entry = entry->vme_next) {
		if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
			/*
			 * We're injecting this copy entry into a map that
			 * has the standard page alignment, so clear
			 * "map_aligned" (which might have been inherited
			 * from the original map entry).
			 */
			entry->map_aligned = FALSE;
		}

		entry->vme_start += adjustment;
		entry->vme_end += adjustment;

		if (entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
			    VM_MAP_PAGE_MASK(dst_map)));
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
			    VM_MAP_PAGE_MASK(dst_map)));
		}

		entry->inheritance = VM_INHERIT_DEFAULT;
		entry->protection = VM_PROT_DEFAULT;
		entry->max_protection = VM_PROT_ALL;
		entry->behavior = VM_BEHAVIOR_DEFAULT;

		/*
		 * If the entry is now wired,
		 * map the pages into the destination map.
		 */
		if (entry->wired_count != 0) {
			vm_map_offset_t va;
			vm_object_offset_t	 offset;
			vm_object_t object;
			vm_prot_t prot;
			int	type_of_fault;
			uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;

			/* TODO4K would need to use actual page size */
			assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);

			object = VME_OBJECT(entry);
			offset = VME_OFFSET(entry);
			va = entry->vme_start;

			pmap_pageable(dst_map->pmap,
			    entry->vme_start,
			    entry->vme_end,
			    TRUE);

			/* Enter every wired page into the destination pmap. */
			while (va < entry->vme_end) {
				vm_page_t	m;
				struct vm_object_fault_info fault_info = {};

				/*
				 * Look up the page in the object.
				 * Assert that the page will be found in the
				 * top object:
				 * either
				 *	the object was newly created by
				 *	vm_object_copy_slowly, and has
				 *	copies of all of the pages from
				 *	the source object
				 * or
				 *	the object was moved from the old
				 *	map entry; because the old map
				 *	entry was wired, all of the pages
				 *	were in the top-level object.
				 *	(XXX not true if we wire pages for
				 *	 reading)
				 */
				vm_object_lock(object);

				m = vm_page_lookup(object, offset);
				if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
				    m->vmp_absent) {
					panic("vm_map_copyout: wiring %p", m);
				}

				prot = entry->protection;

				if (override_nx(dst_map, VME_ALIAS(entry)) &&
				    prot) {
					prot |= VM_PROT_EXECUTE;
				}

				type_of_fault = DBG_CACHE_HIT_FAULT;

				fault_info.user_tag = VME_ALIAS(entry);
				fault_info.pmap_options = 0;
				if (entry->iokit_acct ||
				    (!entry->is_sub_map && !entry->use_pmap)) {
					fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
				}
				if (entry->vme_xnu_user_debug &&
				    !VM_PAGE_OBJECT(m)->code_signed) {
					/*
					 * Modified code-signed executable
					 * region: this page does not belong
					 * to a code-signed VM object, so it
					 * must have been copied and should
					 * therefore be typed XNU_USER_DEBUG
					 * rather than XNU_USER_EXEC.
					 */
					fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
				}

				vm_fault_enter(m,
				    dst_map->pmap,
				    va,
				    PAGE_SIZE, 0,
				    prot,
				    prot,
				    VM_PAGE_WIRED(m),
				    FALSE,            /* change_wiring */
				    VM_KERN_MEMORY_NONE,            /* tag - not wiring */
				    &fault_info,
				    NULL,             /* need_retry */
				    &type_of_fault,
				    &object_lock_type); /*Exclusive mode lock. Will remain unchanged.*/

				vm_object_unlock(object);

				offset += PAGE_SIZE_64;
				va += PAGE_SIZE;
			}
		}
	}

after_adjustments:

	/*
	 * Correct the page alignment for the result
	 */

	*dst_addr = start + (copy->offset - vm_copy_start);

#if KASAN
	kasan_notify_address(*dst_addr, size);
#endif

	/*
	 * Update the hints and the map size
	 */

	if (consume_on_success) {
		SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
	} else {
		SAVE_HINT_MAP_WRITE(dst_map, last);
	}

	dst_map->size += size;

	/*
	 * Link in the copy
	 */

	if (consume_on_success) {
		vm_map_copy_insert(dst_map, last, copy);
		if (copy != original_copy) {
			/* Drop the page-size-adjusted original. */
			vm_map_copy_discard(original_copy);
			original_copy = VM_MAP_COPY_NULL;
		}
	} else {
		vm_map_copy_remap(dst_map, last, copy, adjustment,
		    cur_protection, max_protection,
		    inheritance);
		if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
			/* Discard the adjusted copy; the caller keeps the original. */
			vm_map_copy_discard(copy);
			copy = original_copy;
		}
	}


	vm_map_unlock(dst_map);

	/*
	 * XXX	If wiring_required, call vm_map_pageable
	 */

	return KERN_SUCCESS;
}
11770
11771 /*
11772 * Routine: vm_map_copyin
11773 *
11774 * Description:
11775 * see vm_map_copyin_common. Exported via Unsupported.exports.
11776 *
11777 */
11778
11779 #undef vm_map_copyin
11780
11781 kern_return_t
vm_map_copyin(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)11782 vm_map_copyin(
11783 vm_map_t src_map,
11784 vm_map_address_t src_addr,
11785 vm_map_size_t len,
11786 boolean_t src_destroy,
11787 vm_map_copy_t *copy_result) /* OUT */
11788 {
11789 return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11790 FALSE, copy_result, FALSE);
11791 }
11792
11793 /*
11794 * Routine: vm_map_copyin_common
11795 *
11796 * Description:
11797 * Copy the specified region (src_addr, len) from the
11798 * source address space (src_map), possibly removing
11799 * the region from the source address space (src_destroy).
11800 *
11801 * Returns:
11802 * A vm_map_copy_t object (copy_result), suitable for
11803 * insertion into another address space (using vm_map_copyout),
11804 * copying over another address space region (using
11805 * vm_map_copy_overwrite). If the copy is unused, it
11806 * should be destroyed (using vm_map_copy_discard).
11807 *
11808 * In/out conditions:
11809 * The source map should not be locked on entry.
11810 */
11811
/*
 * Stack node used by vm_map_copyin_internal() while descending through
 * submaps: each level records where in the parent map the traversal was
 * entered, so that copying can resume in the parent once the submap
 * range has been consumed.
 */
typedef struct submap_map {
	vm_map_t parent_map;            /* map we descended from */
	vm_map_offset_t base_start;     /* start of the range in parent_map */
	vm_map_offset_t base_end;       /* end of the range in parent_map */
	vm_map_size_t base_len;         /* length covered by this submap level */
	struct submap_map *next;        /* next (outer) level of the stack */
} submap_map_t;
11819
11820 kern_return_t
vm_map_copyin_common(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t src_destroy,__unused boolean_t src_volatile,vm_map_copy_t * copy_result,boolean_t use_maxprot)11821 vm_map_copyin_common(
11822 vm_map_t src_map,
11823 vm_map_address_t src_addr,
11824 vm_map_size_t len,
11825 boolean_t src_destroy,
11826 __unused boolean_t src_volatile,
11827 vm_map_copy_t *copy_result, /* OUT */
11828 boolean_t use_maxprot)
11829 {
11830 int flags;
11831
11832 flags = 0;
11833 if (src_destroy) {
11834 flags |= VM_MAP_COPYIN_SRC_DESTROY;
11835 }
11836 if (use_maxprot) {
11837 flags |= VM_MAP_COPYIN_USE_MAXPROT;
11838 }
11839 return vm_map_copyin_internal(src_map,
11840 src_addr,
11841 len,
11842 flags,
11843 copy_result);
11844 }
/*
 * Routine:	vm_map_copyin_internal
 *
 * Description:
 *	Flags-based worker behind vm_map_copyin_common().  Builds an
 *	ENTRY_LIST vm_map_copy_t describing (src_addr, len) in "src_map",
 *	descending into submaps as needed and applying copy-on-write
 *	optimizations (vm_object_copy_quickly / _delayed / _strategically,
 *	falling back to vm_object_copy_slowly) where the source entries
 *	permit.  If VM_MAP_COPYIN_SRC_DESTROY is set, the source range is
 *	removed once the copy has been made.
 *
 * Returns:
 *	KERN_SUCCESS with the copy in *copy_result, or an error
 *	(KERN_INVALID_ARGUMENT, KERN_INVALID_ADDRESS,
 *	KERN_PROTECTION_FAILURE, ...) with all intermediate state
 *	released via the RETURN() cleanup macro below.
 *
 * In/out conditions:
 *	"src_map" must not be locked on entry.
 */
kern_return_t
vm_map_copyin_internal(
	vm_map_t src_map,
	vm_map_address_t src_addr,
	vm_map_size_t len,
	int flags,
	vm_map_copy_t *copy_result) /* OUT */
{
	vm_map_entry_t tmp_entry;       /* Result of last map lookup --
	                                 * in multi-level lookup, this
	                                 * entry contains the actual
	                                 * vm_object/offset.
	                                 */
	vm_map_entry_t new_entry = VM_MAP_ENTRY_NULL; /* Map entry for copy */

	vm_map_offset_t src_start;      /* Start of current entry --
	                                 * where copy is taking place now
	                                 */
	vm_map_offset_t src_end;        /* End of entire region to be
	                                 * copied */
	vm_map_offset_t src_base;
	vm_map_t base_map = src_map;
	boolean_t map_share = FALSE;
	submap_map_t *parent_maps = NULL;

	vm_map_copy_t copy;             /* Resulting copy */
	vm_map_address_t copy_addr;
	vm_map_size_t copy_size;
	boolean_t src_destroy;
	boolean_t use_maxprot;
	boolean_t preserve_purgeable;
	boolean_t entry_was_shared;
	vm_map_entry_t saved_src_entry;

	if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
		return KERN_INVALID_ARGUMENT;
	}

#if CONFIG_KERNEL_TAGGING
	/* canonicalize (strip memory tag bits from) kernel-pmap addresses */
	if (src_map->pmap == kernel_pmap) {
		src_addr = vm_memtag_canonicalize_address(src_addr);
	}
#endif /* CONFIG_KERNEL_TAGGING */

	src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
	use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
	preserve_purgeable =
	    (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;

	/*
	 * Check for copies of zero bytes.
	 */

	if (len == 0) {
		*copy_result = VM_MAP_COPY_NULL;
		return KERN_SUCCESS;
	}

	/*
	 * Check that the end address doesn't overflow
	 */
	if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
		return KERN_INVALID_ADDRESS;
	}
	src_end = src_addr + len;
	if (src_end < src_addr) {
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Compute (page aligned) start and end of region
	 */
	src_start = vm_map_trunc_page(src_addr,
	    VM_MAP_PAGE_MASK(src_map));
	src_end = vm_map_round_page(src_end,
	    VM_MAP_PAGE_MASK(src_map));
	if (src_end < src_addr) {
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * If the copy is sufficiently small, use a kernel buffer instead
	 * of making a virtual copy.  The theory being that the cost of
	 * setting up VM (and taking C-O-W faults) dominates the copy costs
	 * for small regions.
	 */
	if ((len <= msg_ool_size_small) &&
	    !use_maxprot &&
	    !preserve_purgeable &&
	    !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
	    /*
	     * Since the "msg_ool_size_small" threshold was increased and
	     * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
	     * address space limits, we revert to doing a virtual copy if the
	     * copied range goes beyond those limits.  Otherwise, mach_vm_read()
	     * of the commpage would now fail when it used to work.
	     */
	    (src_start >= vm_map_min(src_map) &&
	    src_start < vm_map_max(src_map) &&
	    src_end >= vm_map_min(src_map) &&
	    src_end < vm_map_max(src_map))) {
		return vm_map_copyin_kernel_buffer(src_map, src_addr, len,
		           src_destroy, copy_result);
	}

	/*
	 * Allocate a header element for the list.
	 *
	 * Use the start and end in the header to
	 * remember the endpoints prior to rounding.
	 */

	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
	copy->cpy_hdr.entries_pageable = TRUE;
	copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
	copy->offset = src_addr;
	copy->size = len;

	new_entry = vm_map_copy_entry_create(copy);

	/*
	 * Error-exit macro: unlocks/releases the current (sub)map, the
	 * spare "new_entry", the partial copy, and the whole parent_maps
	 * stack, then returns "x".
	 */
#define RETURN(x)                                               \
	MACRO_BEGIN                                             \
	vm_map_unlock(src_map);                                 \
	if(src_map != base_map)                                 \
	        vm_map_deallocate(src_map);                     \
	if (new_entry != VM_MAP_ENTRY_NULL)                     \
	        vm_map_copy_entry_dispose(new_entry);           \
	vm_map_copy_discard(copy);                              \
	{                                                       \
	        submap_map_t *_ptr;                             \
	                                                        \
	        for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
	                parent_maps=parent_maps->next;          \
	                if (_ptr->parent_map != base_map)       \
	                        vm_map_deallocate(_ptr->parent_map); \
	                kfree_type(submap_map_t, _ptr);         \
	        }                                               \
	}                                                       \
	MACRO_RETURN(x);                                        \
	MACRO_END

	/*
	 * Find the beginning of the region.
	 */

	vm_map_lock(src_map);

	/*
	 * Lookup the original "src_addr" rather than the truncated
	 * "src_start", in case "src_start" falls in a non-map-aligned
	 * map entry *before* the map entry that contains "src_addr"...
	 */
	if (!vm_map_lookup_entry(src_map, src_addr, &tmp_entry)) {
		RETURN(KERN_INVALID_ADDRESS);
	}
	if (!tmp_entry->is_sub_map) {
		/*
		 * ... but clip to the map-rounded "src_start" rather than
		 * "src_addr" to preserve map-alignment.  We'll adjust the
		 * first copy entry at the end, if needed.
		 */
		vm_map_clip_start(src_map, tmp_entry, src_start);
	}
	if (src_start < tmp_entry->vme_start) {
		/*
		 * Move "src_start" up to the start of the
		 * first map entry to copy.
		 */
		src_start = tmp_entry->vme_start;
	}
	/* set for later submap fix-up */
	copy_addr = src_start;

	/*
	 * Go through entries until we get to the end.
	 */

	while (TRUE) {
		vm_map_entry_t src_entry = tmp_entry;   /* Top-level entry */
		vm_map_size_t src_size;                 /* Size of source
		                                         * map entry (in both
		                                         * maps)
		                                         */

		vm_object_t src_object;                 /* Object to copy */
		vm_object_offset_t src_offset;

		vm_object_t new_copy_object;            /* vm_object_copy_* result */

		boolean_t src_needs_copy;               /* Should source map
		                                         * be made read-only
		                                         * for copy-on-write?
		                                         */

		boolean_t new_entry_needs_copy;         /* Will new entry be COW? */

		boolean_t was_wired;                    /* Was source wired? */
		boolean_t saved_used_for_jit;           /* Saved used_for_jit. */
		vm_map_version_t version;               /* Version before locks
		                                         * dropped to make copy
		                                         */
		kern_return_t result;                   /* Return value from
		                                         * copy_strategically.
		                                         */
		/*
		 * Descend through any submap levels, pushing a submap_map_t
		 * per level so copying can resume in each parent later.
		 */
		while (tmp_entry->is_sub_map) {
			vm_map_size_t submap_len;
			submap_map_t *ptr;

			ptr = kalloc_type(submap_map_t, Z_WAITOK);
			ptr->next = parent_maps;
			parent_maps = ptr;
			ptr->parent_map = src_map;
			ptr->base_start = src_start;
			ptr->base_end = src_end;
			submap_len = tmp_entry->vme_end - src_start;
			if (submap_len > (src_end - src_start)) {
				submap_len = src_end - src_start;
			}
			ptr->base_len = submap_len;

			/* translate the range into the submap's address space */
			src_start -= tmp_entry->vme_start;
			src_start += VME_OFFSET(tmp_entry);
			src_end = src_start + submap_len;
			src_map = VME_SUBMAP(tmp_entry);
			vm_map_lock(src_map);
			/* keep an outstanding reference for all maps in */
			/* the parents tree except the base map */
			vm_map_reference(src_map);
			vm_map_unlock(ptr->parent_map);
			if (!vm_map_lookup_entry(
				    src_map, src_start, &tmp_entry)) {
				RETURN(KERN_INVALID_ADDRESS);
			}
			map_share = TRUE;
			if (!tmp_entry->is_sub_map) {
				vm_map_clip_start(src_map, tmp_entry, src_start);
			}
			src_entry = tmp_entry;
		}
		/* we are now in the lowest level submap... */

		if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
		    (VME_OBJECT(tmp_entry)->phys_contiguous)) {
			/*
			 * Copying physically-contiguous objects is not
			 * supported for now.  In the future we would need
			 * to detect the phys_contiguous condition and then
			 * upgrade copy_slowly to do a physical copy from
			 * the device-memory based object.  We could
			 * piggy-back off of the "was_wired" boolean to
			 * set up the proper handling.
			 */
			RETURN(KERN_PROTECTION_FAILURE);
		}
		/*
		 * Create a new address map entry to hold the result.
		 * Fill in the fields from the appropriate source entries.
		 * We must unlock the source map to do this if we need
		 * to allocate a map entry.
		 */
		if (new_entry == VM_MAP_ENTRY_NULL) {
			version.main_timestamp = src_map->timestamp;
			vm_map_unlock(src_map);

			new_entry = vm_map_copy_entry_create(copy);

			vm_map_lock(src_map);
			if ((version.main_timestamp + 1) != src_map->timestamp) {
				/* map changed while unlocked: redo the lookup */
				if (!vm_map_lookup_entry(src_map, src_start,
				    &tmp_entry)) {
					RETURN(KERN_INVALID_ADDRESS);
				}
				if (!tmp_entry->is_sub_map) {
					vm_map_clip_start(src_map, tmp_entry, src_start);
				}
				continue; /* restart w/ new tmp_entry */
			}
		}

		/*
		 * Verify that the region can be read.
		 */
		if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
		    !use_maxprot) ||
		    (src_entry->max_protection & VM_PROT_READ) == 0) {
			RETURN(KERN_PROTECTION_FAILURE);
		}

		/*
		 * Clip against the endpoints of the entire region.
		 */

		vm_map_clip_end(src_map, src_entry, src_end);

		src_size = src_entry->vme_end - src_start;
		src_object = VME_OBJECT(src_entry);
		src_offset = VME_OFFSET(src_entry);
		was_wired = (src_entry->wired_count != 0);

		vm_map_entry_copy(src_map, new_entry, src_entry);
		if (new_entry->is_sub_map) {
			/* clr address space specifics */
			new_entry->use_pmap = FALSE;
		} else {
			/*
			 * We're dealing with a copy-on-write operation,
			 * so the resulting mapping should not inherit the
			 * original mapping's accounting settings.
			 * "iokit_acct" should have been cleared in
			 * vm_map_entry_copy().
			 * "use_pmap" should be reset to its default (TRUE)
			 * so that the new mapping gets accounted for in
			 * the task's memory footprint.
			 */
			assert(!new_entry->iokit_acct);
			new_entry->use_pmap = TRUE;
		}

		/*
		 * Attempt non-blocking copy-on-write optimizations.
		 */

		/*
		 * If we are destroying the source, and the object
		 * is internal, we could move the object reference
		 * from the source to the copy.  The copy is
		 * copy-on-write only if the source is.
		 * We make another reference to the object, because
		 * destroying the source entry will deallocate it.
		 *
		 * This memory transfer has to be atomic, (to prevent
		 * the VM object from being shared or copied while
		 * it's being moved here), so we could only do this
		 * if we won't have to unlock the VM map until the
		 * original mapping has been fully removed.
		 */

RestartCopy:
		if ((src_object == VM_OBJECT_NULL ||
		    (!was_wired && !map_share && !tmp_entry->is_shared
		    && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
		    vm_object_copy_quickly(
			    VME_OBJECT(new_entry),
			    src_offset,
			    src_size,
			    &src_needs_copy,
			    &new_entry_needs_copy)) {
			new_entry->needs_copy = new_entry_needs_copy;

			/*
			 * Handle copy-on-write obligations
			 */

			if (src_needs_copy && !tmp_entry->needs_copy) {
				vm_prot_t prot;

				prot = src_entry->protection & ~VM_PROT_WRITE;

				if (override_nx(src_map, VME_ALIAS(src_entry))
				    && prot) {
					prot |= VM_PROT_EXECUTE;
				}

				vm_object_pmap_protect(
					src_object,
					src_offset,
					src_size,
					(src_entry->is_shared ?
					PMAP_NULL
					: src_map->pmap),
					VM_MAP_PAGE_SIZE(src_map),
					src_entry->vme_start,
					prot);

				assert(tmp_entry->wired_count == 0);
				tmp_entry->needs_copy = TRUE;
			}

			/*
			 * The map has never been unlocked, so it's safe
			 * to move to the next entry rather than doing
			 * another lookup.
			 */

			goto CopySuccessful;
		}

		entry_was_shared = tmp_entry->is_shared;

		/*
		 * Take an object reference, so that we may
		 * release the map lock(s).
		 */

		assert(src_object != VM_OBJECT_NULL);
		vm_object_reference(src_object);

		/*
		 * Record the timestamp for later verification.
		 * Unlock the map.
		 */

		version.main_timestamp = src_map->timestamp;
		vm_map_unlock(src_map); /* Increments timestamp once! */
		saved_src_entry = src_entry;
		tmp_entry = VM_MAP_ENTRY_NULL;
		src_entry = VM_MAP_ENTRY_NULL;

		/*
		 * Perform the copy
		 */

		if (was_wired ||
		    (src_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK &&
		    !(flags & VM_MAP_COPYIN_FORK)) ||
		    (debug4k_no_cow_copyin &&
		    VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
CopySlowly:
			vm_object_lock(src_object);
			result = vm_object_copy_slowly(
				src_object,
				src_offset,
				src_size,
				THREAD_UNINT,
				&new_copy_object);
			/* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
			saved_used_for_jit = new_entry->used_for_jit;
			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
			new_entry->used_for_jit = saved_used_for_jit;
			VME_OFFSET_SET(new_entry,
			    src_offset - vm_object_trunc_page(src_offset));
			new_entry->needs_copy = FALSE;
		} else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
		    (entry_was_shared || map_share)) {
			vm_object_t new_object;

			vm_object_lock_shared(src_object);
			new_object = vm_object_copy_delayed(
				src_object,
				src_offset,
				src_size,
				TRUE);
			if (new_object == VM_OBJECT_NULL) {
				goto CopySlowly;
			}

			VME_OBJECT_SET(new_entry, new_object, false, 0);
			assert(new_entry->wired_count == 0);
			new_entry->needs_copy = TRUE;
			assert(!new_entry->iokit_acct);
			assert(new_object->purgable == VM_PURGABLE_DENY);
			assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
			result = KERN_SUCCESS;
		} else {
			vm_object_offset_t new_offset;
			new_offset = VME_OFFSET(new_entry);
			result = vm_object_copy_strategically(src_object,
			    src_offset,
			    src_size,
			    (flags & VM_MAP_COPYIN_FORK),
			    &new_copy_object,
			    &new_offset,
			    &new_entry_needs_copy);
			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
			saved_used_for_jit = new_entry->used_for_jit;
			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
			new_entry->used_for_jit = saved_used_for_jit;
			if (new_offset != VME_OFFSET(new_entry)) {
				VME_OFFSET_SET(new_entry, new_offset);
			}

			new_entry->needs_copy = new_entry_needs_copy;
		}

		if (result == KERN_SUCCESS &&
		    ((preserve_purgeable &&
		    src_object->purgable != VM_PURGABLE_DENY) ||
		    new_entry->used_for_jit)) {
			/*
			 * Purgeable objects should be COPY_NONE, true share;
			 * this should be propagated to the copy.
			 *
			 * Also force mappings the pmap specially protects to
			 * be COPY_NONE; trying to COW these mappings would
			 * change the effective protections, which could have
			 * side effects if the pmap layer relies on the
			 * specified protections.
			 */

			vm_object_t new_object;

			new_object = VME_OBJECT(new_entry);
			assert(new_object != src_object);
			vm_object_lock(new_object);
			assert(new_object->ref_count == 1);
			assert(new_object->shadow == VM_OBJECT_NULL);
			assert(new_object->vo_copy == VM_OBJECT_NULL);
			assert(new_object->vo_owner == NULL);

			new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

			if (preserve_purgeable &&
			    src_object->purgable != VM_PURGABLE_DENY) {
				new_object->true_share = TRUE;

				/* start as non-volatile with no owner... */
				new_object->purgable = VM_PURGABLE_NONVOLATILE;
				vm_purgeable_nonvolatile_enqueue(new_object, NULL);
				/* ... and move to src_object's purgeable state */
				if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
					int state;
					state = src_object->purgable;
					vm_object_purgable_control(
						new_object,
						VM_PURGABLE_SET_STATE_FROM_KERNEL,
						&state);
				}
				/* no pmap accounting for purgeable objects */
				new_entry->use_pmap = FALSE;
			}

			vm_object_unlock(new_object);
			new_object = VM_OBJECT_NULL;
		}

		if (result != KERN_SUCCESS &&
		    result != KERN_MEMORY_RESTART_COPY) {
			vm_map_lock(src_map);
			RETURN(result);
		}

		/*
		 * Throw away the extra reference
		 */

		vm_object_deallocate(src_object);

		/*
		 * Verify that the map has not substantially
		 * changed while the copy was being made.
		 */

		vm_map_lock(src_map);

		if ((version.main_timestamp + 1) == src_map->timestamp) {
			/* src_map hasn't changed: src_entry is still valid */
			src_entry = saved_src_entry;
			goto VerificationSuccessful;
		}

		/*
		 * Simple version comparison failed.
		 *
		 * Retry the lookup and verify that the
		 * same object/offset are still present.
		 *
		 * [Note: a memory manager that colludes with
		 * the calling task can detect that we have
		 * cheated.  While the map was unlocked, the
		 * mapping could have been changed and restored.]
		 */

		if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
			if (result != KERN_MEMORY_RESTART_COPY) {
				vm_object_deallocate(VME_OBJECT(new_entry));
				VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
				/* reset accounting state */
				new_entry->iokit_acct = FALSE;
				new_entry->use_pmap = TRUE;
			}
			RETURN(KERN_INVALID_ADDRESS);
		}

		src_entry = tmp_entry;
		vm_map_clip_start(src_map, src_entry, src_start);

		if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
		    !use_maxprot) ||
		    ((src_entry->max_protection & VM_PROT_READ) == 0)) {
			goto VerificationFailed;
		}

		if (src_entry->vme_end < new_entry->vme_end) {
			/*
			 * This entry might have been shortened
			 * (vm_map_clip_end) or been replaced with
			 * an entry that ends closer to "src_start"
			 * than before.
			 * Adjust "new_entry" accordingly; copying
			 * less memory would be correct but we also
			 * redo the copy (see below) if the new entry
			 * no longer points at the same object/offset.
			 */
			assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
			    VM_MAP_COPY_PAGE_MASK(copy)));
			new_entry->vme_end = src_entry->vme_end;
			src_size = new_entry->vme_end - src_start;
		} else if (src_entry->vme_end > new_entry->vme_end) {
			/*
			 * This entry might have been extended
			 * (vm_map_entry_simplify() or coalesce)
			 * or been replaced with an entry that ends farther
			 * from "src_start" than before.
			 *
			 * We've called vm_object_copy_*() only on
			 * the previous <start:end> range, so we can't
			 * just extend new_entry.  We have to re-do
			 * the copy based on the new entry as if it was
			 * pointing at a different object/offset (see
			 * "Verification failed" below).
			 */
		}

		if ((VME_OBJECT(src_entry) != src_object) ||
		    (VME_OFFSET(src_entry) != src_offset) ||
		    (src_entry->vme_end > new_entry->vme_end)) {
			/*
			 * Verification failed.
			 *
			 * Start over with this top-level entry.
			 */

VerificationFailed: ;

			vm_object_deallocate(VME_OBJECT(new_entry));
			tmp_entry = src_entry;
			continue;
		}

		/*
		 * Verification succeeded.
		 */

VerificationSuccessful:;

		if (result == KERN_MEMORY_RESTART_COPY) {
			goto RestartCopy;
		}

		/*
		 * Copy succeeded.
		 */

CopySuccessful: ;

		/*
		 * Link in the new copy entry.
		 */

		vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
		    new_entry);

		/*
		 * Determine whether the entire region
		 * has been copied.
		 */
		src_base = src_start;
		src_start = new_entry->vme_end;
		new_entry = VM_MAP_ENTRY_NULL;
		/*
		 * Pop back out through the parent_maps stack for every
		 * submap level whose range we have fully consumed.
		 */
		while ((src_start >= src_end) && (src_end != 0)) {
			submap_map_t *ptr;

			if (src_map == base_map) {
				/* back to the top */
				break;
			}

			ptr = parent_maps;
			assert(ptr != NULL);
			parent_maps = parent_maps->next;

			/* fix up the damage we did in that submap */
			vm_map_simplify_range(src_map,
			    src_base,
			    src_end);

			vm_map_unlock(src_map);
			vm_map_deallocate(src_map);
			vm_map_lock(ptr->parent_map);
			src_map = ptr->parent_map;
			src_base = ptr->base_start;
			src_start = ptr->base_start + ptr->base_len;
			src_end = ptr->base_end;
			if (!vm_map_lookup_entry(src_map,
			    src_start,
			    &tmp_entry) &&
			    (src_end > src_start)) {
				RETURN(KERN_INVALID_ADDRESS);
			}
			kfree_type(submap_map_t, ptr);
			if (parent_maps == NULL) {
				map_share = FALSE;
			}
			src_entry = tmp_entry->vme_prev;
		}

		if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
		    (src_start >= src_addr + len) &&
		    (src_addr + len != 0)) {
			/*
			 * Stop copying now, even though we haven't reached
			 * "src_end".  We'll adjust the end of the last copy
			 * entry at the end, if needed.
			 *
			 * If src_map's alignment is different from the
			 * system's page-alignment, there could be
			 * extra non-map-aligned map entries between
			 * the original (non-rounded) "src_addr + len"
			 * and the rounded "src_end".
			 * We do not want to copy those map entries since
			 * they're not part of the copied range.
			 */
			break;
		}

		if ((src_start >= src_end) && (src_end != 0)) {
			break;
		}

		/*
		 * Verify that there are no gaps in the region
		 */

		tmp_entry = src_entry->vme_next;
		if ((tmp_entry->vme_start != src_start) ||
		    (tmp_entry == vm_map_to_entry(src_map))) {
			RETURN(KERN_INVALID_ADDRESS);
		}
	}

	/*
	 * If the source should be destroyed, do it now, since the
	 * copy was successful.
	 */
	if (src_destroy) {
		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;

		if (src_map == kernel_map) {
			remove_flags |= VM_MAP_REMOVE_KUNWIRE;
		}
		(void)vm_map_remove_and_unlock(src_map,
		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
		    src_end,
		    remove_flags,
		    KMEM_GUARD_NONE);
	} else {
		/* fix up the damage we did in the base map */
		vm_map_simplify_range(
			src_map,
			vm_map_trunc_page(src_addr,
			VM_MAP_PAGE_MASK(src_map)),
			vm_map_round_page(src_end,
			VM_MAP_PAGE_MASK(src_map)));
		vm_map_unlock(src_map);
	}

	tmp_entry = VM_MAP_ENTRY_NULL;

	if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
	    VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
		vm_map_offset_t original_start, original_offset, original_end;

		assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);

		/* adjust alignment of first copy_entry's "vme_start" */
		tmp_entry = vm_map_copy_first_entry(copy);
		if (tmp_entry != vm_map_copy_to_entry(copy)) {
			vm_map_offset_t adjustment;

			original_start = tmp_entry->vme_start;
			original_offset = VME_OFFSET(tmp_entry);

			/* map-align the start of the first copy entry... */
			adjustment = (tmp_entry->vme_start -
			    vm_map_trunc_page(
				    tmp_entry->vme_start,
				    VM_MAP_PAGE_MASK(src_map)));
			tmp_entry->vme_start -= adjustment;
			VME_OFFSET_SET(tmp_entry,
			    VME_OFFSET(tmp_entry) - adjustment);
			copy_addr -= adjustment;
			assert(tmp_entry->vme_start < tmp_entry->vme_end);
			/* ... adjust for mis-aligned start of copy range */
			adjustment =
			    (vm_map_trunc_page(copy->offset,
			    PAGE_MASK) -
			    vm_map_trunc_page(copy->offset,
			    VM_MAP_PAGE_MASK(src_map)));
			if (adjustment) {
				assert(page_aligned(adjustment));
				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
				tmp_entry->vme_start += adjustment;
				VME_OFFSET_SET(tmp_entry,
				    (VME_OFFSET(tmp_entry) +
				    adjustment));
				copy_addr += adjustment;
				assert(tmp_entry->vme_start < tmp_entry->vme_end);
			}

			/*
			 * Assert that the adjustments haven't exposed
			 * more than was originally copied...
			 */
			assert(tmp_entry->vme_start >= original_start);
			assert(VME_OFFSET(tmp_entry) >= original_offset);
			/*
			 * ... and that it did not adjust outside of
			 * a single 16K page.
			 */
			assert(vm_map_trunc_page(tmp_entry->vme_start,
			    VM_MAP_PAGE_MASK(src_map)) ==
			    vm_map_trunc_page(original_start,
			    VM_MAP_PAGE_MASK(src_map)));
		}

		/* adjust alignment of last copy_entry's "vme_end" */
		tmp_entry = vm_map_copy_last_entry(copy);
		if (tmp_entry != vm_map_copy_to_entry(copy)) {
			vm_map_offset_t adjustment;

			original_end = tmp_entry->vme_end;

			/* map-align the end of the last copy entry... */
			tmp_entry->vme_end =
			    vm_map_round_page(tmp_entry->vme_end,
			    VM_MAP_PAGE_MASK(src_map));
			/* ... adjust for mis-aligned end of copy range */
			adjustment =
			    (vm_map_round_page((copy->offset +
			    copy->size),
			    VM_MAP_PAGE_MASK(src_map)) -
			    vm_map_round_page((copy->offset +
			    copy->size),
			    PAGE_MASK));
			if (adjustment) {
				assert(page_aligned(adjustment));
				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
				tmp_entry->vme_end -= adjustment;
				assert(tmp_entry->vme_start < tmp_entry->vme_end);
			}

			/*
			 * Assert that the adjustments haven't exposed
			 * more than was originally copied...
			 */
			assert(tmp_entry->vme_end <= original_end);
			/*
			 * ... and that it did not adjust outside of
			 * a single 16K page.
			 */
			assert(vm_map_round_page(tmp_entry->vme_end,
			    VM_MAP_PAGE_MASK(src_map)) ==
			    vm_map_round_page(original_end,
			    VM_MAP_PAGE_MASK(src_map)));
		}
	}

	/* Fix-up start and end points in copy.  This is necessary */
	/* when the various entries in the copy object were picked */
	/* up from different sub-maps */

	tmp_entry = vm_map_copy_first_entry(copy);
	copy_size = 0; /* compute actual size */
	while (tmp_entry != vm_map_copy_to_entry(copy)) {
		assert(VM_MAP_PAGE_ALIGNED(
			    copy_addr + (tmp_entry->vme_end -
			    tmp_entry->vme_start),
			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
		assert(VM_MAP_PAGE_ALIGNED(
			    copy_addr,
			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));

		/*
		 * The copy_entries will be injected directly into the
		 * destination map and might not be "map aligned" there...
		 */
		tmp_entry->map_aligned = FALSE;

		tmp_entry->vme_end = copy_addr +
		    (tmp_entry->vme_end - tmp_entry->vme_start);
		tmp_entry->vme_start = copy_addr;
		assert(tmp_entry->vme_start < tmp_entry->vme_end);
		copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
		copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
		tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
	}

	if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
	    copy_size < copy->size) {
		/*
		 * The actual size of the VM map copy is smaller than what
		 * was requested by the caller.  This must be because some
		 * PAGE_SIZE-sized pages are missing at the end of the last
		 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
		 * The caller might not have been aware of those missing
		 * pages and might not want to be aware of it, which is
		 * fine as long as they don't try to access (and crash on)
		 * those missing pages.
		 * Let's adjust the size of the "copy", to avoid failing
		 * in vm_map_copyout() or vm_map_copy_overwrite().
		 */
		assert(vm_map_round_page(copy_size,
		    VM_MAP_PAGE_MASK(src_map)) ==
		    vm_map_round_page(copy->size,
		    VM_MAP_PAGE_MASK(src_map)));
		copy->size = copy_size;
	}

	*copy_result = copy;
	return KERN_SUCCESS;

#undef RETURN
}
12756
12757 kern_return_t
vm_map_copy_extract(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t do_copy,vm_map_copy_t * copy_result,vm_prot_t * cur_prot,vm_prot_t * max_prot,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)12758 vm_map_copy_extract(
12759 vm_map_t src_map,
12760 vm_map_address_t src_addr,
12761 vm_map_size_t len,
12762 boolean_t do_copy,
12763 vm_map_copy_t *copy_result, /* OUT */
12764 vm_prot_t *cur_prot, /* IN/OUT */
12765 vm_prot_t *max_prot, /* IN/OUT */
12766 vm_inherit_t inheritance,
12767 vm_map_kernel_flags_t vmk_flags)
12768 {
12769 vm_map_copy_t copy;
12770 kern_return_t kr;
12771 vm_prot_t required_cur_prot, required_max_prot;
12772
12773 /*
12774 * Check for copies of zero bytes.
12775 */
12776
12777 if (len == 0) {
12778 *copy_result = VM_MAP_COPY_NULL;
12779 return KERN_SUCCESS;
12780 }
12781
12782 /*
12783 * Check that the end address doesn't overflow
12784 */
12785 if (src_addr + len < src_addr) {
12786 return KERN_INVALID_ADDRESS;
12787 }
12788 if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
12789 return KERN_INVALID_ADDRESS;
12790 }
12791
12792 if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12793 DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12794 }
12795
12796 required_cur_prot = *cur_prot;
12797 required_max_prot = *max_prot;
12798
12799 /*
12800 * Allocate a header element for the list.
12801 *
12802 * Use the start and end in the header to
12803 * remember the endpoints prior to rounding.
12804 */
12805
12806 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12807 copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
12808 copy->offset = 0;
12809 copy->size = len;
12810
12811 kr = vm_map_remap_extract(src_map,
12812 src_addr,
12813 len,
12814 do_copy, /* copy */
12815 copy,
12816 cur_prot, /* IN/OUT */
12817 max_prot, /* IN/OUT */
12818 inheritance,
12819 vmk_flags);
12820 if (kr != KERN_SUCCESS) {
12821 vm_map_copy_discard(copy);
12822 return kr;
12823 }
12824 if (required_cur_prot != VM_PROT_NONE) {
12825 assert((*cur_prot & required_cur_prot) == required_cur_prot);
12826 assert((*max_prot & required_max_prot) == required_max_prot);
12827 }
12828
12829 *copy_result = copy;
12830 return KERN_SUCCESS;
12831 }
12832
/*
 * vm_map_fork_share:
 *
 * Clone "old_entry" (a VM_INHERIT_SHARE entry of "old_map") into
 * "new_map" so that parent and child share the same backing VM
 * object or submap.  A shadow object is interposed first when
 * sharing the current object directly could break existing or
 * future symmetric copies (see the large comment below).
 *
 * Called from vm_map_fork() with both maps locked.
 */
static void
vm_map_fork_share(
	vm_map_t        old_map,
	vm_map_entry_t  old_entry,
	vm_map_t        new_map)
{
	vm_object_t     object;
	vm_map_entry_t  new_entry;

	/*
	 * New sharing code.  New map entry
	 * references original object.  Internal
	 * objects use asynchronous copy algorithm for
	 * future copies.  First make sure we have
	 * the right object.  If we need a shadow,
	 * or someone else already has one, then
	 * make a new shadow and share it.
	 */

	/* "object" is only initialized (and only used) for non-submap entries */
	if (!old_entry->is_sub_map) {
		object = VME_OBJECT(old_entry);
	}

	if (old_entry->is_sub_map) {
		assert(old_entry->wired_count == 0);
#ifndef NO_NESTED_PMAP
#if !PMAP_FORK_NEST
		if (old_entry->use_pmap) {
			kern_return_t   result;

			/* share the submap's page tables with the child's pmap */
			result = pmap_nest(new_map->pmap,
			    (VME_SUBMAP(old_entry))->pmap,
			    (addr64_t)old_entry->vme_start,
			    (uint64_t)(old_entry->vme_end - old_entry->vme_start));
			if (result) {
				panic("vm_map_fork_share: pmap_nest failed!");
			}
		}
#endif /* !PMAP_FORK_NEST */
#endif /* NO_NESTED_PMAP */
	} else if (object == VM_OBJECT_NULL) {
		/* no backing object yet: create one so there is something to share */
		object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
		    old_entry->vme_start));
		VME_OFFSET_SET(old_entry, 0);
		VME_OBJECT_SET(old_entry, object, false, 0);
		old_entry->use_pmap = TRUE;
//		assert(!old_entry->needs_copy);
	} else if (object->copy_strategy !=
	    MEMORY_OBJECT_COPY_SYMMETRIC) {
		/*
		 * We are already using an asymmetric
		 * copy, and therefore we already have
		 * the right object.
		 */

		assert(!old_entry->needs_copy);
	} else if (old_entry->needs_copy ||     /* case 1 */
	    object->shadowed ||                 /* case 2 */
	    (!object->true_share &&             /* case 3 */
	    !old_entry->is_shared &&
	    (object->vo_size >
	    (vm_map_size_t)(old_entry->vme_end -
	    old_entry->vme_start)))) {
		/*
		 * We need to create a shadow.
		 * There are three cases here.
		 * In the first case, we need to
		 * complete a deferred symmetrical
		 * copy that we participated in.
		 * In the second and third cases,
		 * we need to create the shadow so
		 * that changes that we make to the
		 * object do not interfere with
		 * any symmetrical copies which
		 * have occurred (case 2) or which
		 * might occur (case 3).
		 *
		 * The first case is when we had
		 * deferred shadow object creation
		 * via the entry->needs_copy mechanism.
		 * This mechanism only works when
		 * only one entry points to the source
		 * object, and we are about to create
		 * a second entry pointing to the
		 * same object. The problem is that
		 * there is no way of mapping from
		 * an object to the entries pointing
		 * to it. (Deferred shadow creation
		 * works with one entry because occurs
		 * at fault time, and we walk from the
		 * entry to the object when handling
		 * the fault.)
		 *
		 * The second case is when the object
		 * to be shared has already been copied
		 * with a symmetric copy, but we point
		 * directly to the object without
		 * needs_copy set in our entry. (This
		 * can happen because different ranges
		 * of an object can be pointed to by
		 * different entries. In particular,
		 * a single entry pointing to an object
		 * can be split by a call to vm_inherit,
		 * which, combined with task_create, can
		 * result in the different entries
		 * having different needs_copy values.)
		 * The shadowed flag in the object allows
		 * us to detect this case. The problem
		 * with this case is that if this object
		 * has or will have shadows, then we
		 * must not perform an asymmetric copy
		 * of this object, since such a copy
		 * allows the object to be changed, which
		 * will break the previous symmetrical
		 * copies (which rely upon the object
		 * not changing). In a sense, the shadowed
		 * flag says "don't change this object".
		 * We fix this by creating a shadow
		 * object for this object, and sharing
		 * that. This works because we are free
		 * to change the shadow object (and thus
		 * to use an asymmetric copy strategy);
		 * this is also semantically correct,
		 * since this object is temporary, and
		 * therefore a copy of the object is
		 * as good as the object itself. (This
		 * is not true for permanent objects,
		 * since the pager needs to see changes,
		 * which won't happen if the changes
		 * are made to a copy.)
		 *
		 * The third case is when the object
		 * to be shared has parts sticking
		 * outside of the entry we're working
		 * with, and thus may in the future
		 * be subject to a symmetrical copy.
		 * (This is a preemptive version of
		 * case 2.)
		 */
		VME_OBJECT_SHADOW(old_entry,
		    (vm_map_size_t) (old_entry->vme_end -
		    old_entry->vme_start),
		    vm_map_always_shadow(old_map));

		/*
		 * If we're making a shadow for other than
		 * copy on write reasons, then we have
		 * to remove write permission.
		 */

		if (!old_entry->needs_copy &&
		    (old_entry->protection & VM_PROT_WRITE)) {
			vm_prot_t prot;

			assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));

			prot = old_entry->protection & ~VM_PROT_WRITE;

			assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));

			if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
				prot |= VM_PROT_EXECUTE;
			}


			if (old_map->mapped_in_other_pmaps) {
				/* mappings may exist in other pmaps: protect
				 * through the object rather than our pmap */
				vm_object_pmap_protect(
					VME_OBJECT(old_entry),
					VME_OFFSET(old_entry),
					(old_entry->vme_end -
					old_entry->vme_start),
					PMAP_NULL,
					PAGE_SIZE,
					old_entry->vme_start,
					prot);
			} else {
				pmap_protect(old_map->pmap,
				    old_entry->vme_start,
				    old_entry->vme_end,
				    prot);
			}
		}

		old_entry->needs_copy = FALSE;
		object = VME_OBJECT(old_entry);
	}


	/*
	 * If object was using a symmetric copy strategy,
	 * change its copy strategy to the default
	 * asymmetric copy strategy, which is copy_delay
	 * in the non-norma case and copy_call in the
	 * norma case.  Bump the reference count for the
	 * new entry.
	 */

	if (old_entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(old_entry));
	} else {
		vm_object_lock(object);
		vm_object_reference_locked(object);
		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		}
		vm_object_unlock(object);
	}

	/*
	 * Clone the entry, using object ref from above.
	 * Mark both entries as shared.
	 */

	new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
	vm_map_entry_copy(old_map, new_entry, old_entry);
	old_entry->is_shared = TRUE;
	new_entry->is_shared = TRUE;

	/*
	 * We're dealing with a shared mapping, so the resulting mapping
	 * should inherit some of the original mapping's accounting settings.
	 * "iokit_acct" should have been cleared in vm_map_entry_copy().
	 * "use_pmap" should stay the same as before (if it hasn't been reset
	 * to TRUE when we cleared "iokit_acct").
	 */
	assert(!new_entry->iokit_acct);

	/*
	 * If old entry's inheritance is VM_INHERIT_NONE,
	 * the new entry is for corpse fork, remove the
	 * write permission from the new entry.
	 */
	if (old_entry->inheritance == VM_INHERIT_NONE) {
		new_entry->protection &= ~VM_PROT_WRITE;
		new_entry->max_protection &= ~VM_PROT_WRITE;
	}

	/*
	 * Insert the entry into the new map -- we
	 * know we're inserting at the end of the new
	 * map.
	 */

	vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
	    VM_MAP_KERNEL_FLAGS_NONE);

	/*
	 * Update the physical map
	 */

	if (old_entry->is_sub_map) {
		/* Bill Angell pmap support goes here */
	} else {
		/* pre-populate the child's pmap with the parent's mappings */
		pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
		    old_entry->vme_end - old_entry->vme_start,
		    old_entry->vme_start);
	}
}
13091
/*
 * vm_map_fork_copy:
 *
 * Slow path for VM_INHERIT_COPY entries in vm_map_fork(): copy the
 * entry's range out of "old_map" via vm_map_copyin_internal() and
 * splice the resulting entries onto the tail of "new_map".
 *
 * Called and returns with "old_map" locked, but the lock is dropped
 * around the copyin, so the map may have changed in between.  On
 * return, *old_entry_p points to the entry where the caller should
 * resume its traversal of "old_map".
 *
 * Returns TRUE if the range was copied (the caller should account
 * for its size), FALSE if the copyin failed and the region was
 * skipped.
 */
static boolean_t
vm_map_fork_copy(
	vm_map_t        old_map,
	vm_map_entry_t  *old_entry_p,
	vm_map_t        new_map,
	int             vm_map_copyin_flags)
{
	vm_map_entry_t old_entry = *old_entry_p;
	vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
	vm_map_offset_t start = old_entry->vme_start;
	vm_map_copy_t copy;
	vm_map_entry_t last = vm_map_last_entry(new_map);

	vm_map_unlock(old_map);
	/*
	 * Use maxprot version of copyin because we
	 * care about whether this memory can ever
	 * be accessed, not just whether it's accessible
	 * right now.
	 */
	vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
	if (vm_map_copyin_internal(old_map, start, entry_size,
	    vm_map_copyin_flags, &copy)
	    != KERN_SUCCESS) {
		/*
		 * The map might have changed while it
		 * was unlocked, check it again.  Skip
		 * any blank space or permanently
		 * unreadable region.
		 */
		vm_map_lock(old_map);
		if (!vm_map_lookup_entry(old_map, start, &last) ||
		    (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
			last = last->vme_next;
		}
		*old_entry_p = last;

		/*
		 * XXX For some error returns, want to
		 * XXX skip to the next element.  Note
		 * that INVALID_ADDRESS and
		 * PROTECTION_FAILURE are handled above.
		 */

		return FALSE;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/*
	 * Insert the copy into the new map
	 */
	vm_map_copy_insert(new_map, last, copy);

	/*
	 * Pick up the traversal at the end of
	 * the copied region.
	 */

	vm_map_lock(old_map);
	start += entry_size;
	if (!vm_map_lookup_entry(old_map, start, &last)) {
		/* "start" is in a hole: resume at the next entry */
		last = last->vme_next;
	} else {
		if (last->vme_start == start) {
			/*
			 * No need to clip here and we don't
			 * want to cause any unnecessary
			 * unnesting...
			 */
		} else {
			vm_map_clip_start(old_map, last, start);
		}
	}
	*old_entry_p = last;

	return TRUE;
}
13174
13175 #if PMAP_FORK_NEST
13176 #define PMAP_FORK_NEST_DEBUG 0
13177 static inline void
vm_map_fork_unnest(pmap_t new_pmap,vm_map_offset_t pre_nested_start,vm_map_offset_t pre_nested_end,vm_map_offset_t start,vm_map_offset_t end)13178 vm_map_fork_unnest(
13179 pmap_t new_pmap,
13180 vm_map_offset_t pre_nested_start,
13181 vm_map_offset_t pre_nested_end,
13182 vm_map_offset_t start,
13183 vm_map_offset_t end)
13184 {
13185 kern_return_t kr;
13186 vm_map_offset_t nesting_mask, start_unnest, end_unnest;
13187
13188 assertf(pre_nested_start <= pre_nested_end,
13189 "pre_nested start 0x%llx end 0x%llx",
13190 (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13191 assertf(start <= end,
13192 "start 0x%llx end 0x%llx",
13193 (uint64_t) start, (uint64_t)end);
13194
13195 if (pre_nested_start == pre_nested_end) {
13196 /* nothing was pre-nested: done */
13197 return;
13198 }
13199 if (end <= pre_nested_start) {
13200 /* fully before pre-nested range: done */
13201 return;
13202 }
13203 if (start >= pre_nested_end) {
13204 /* fully after pre-nested range: done */
13205 return;
13206 }
13207 /* ignore parts of range outside of pre_nested range */
13208 if (start < pre_nested_start) {
13209 start = pre_nested_start;
13210 }
13211 if (end > pre_nested_end) {
13212 end = pre_nested_end;
13213 }
13214 nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
13215 start_unnest = start & ~nesting_mask;
13216 end_unnest = (end + nesting_mask) & ~nesting_mask;
13217 kr = pmap_unnest(new_pmap,
13218 (addr64_t)start_unnest,
13219 (uint64_t)(end_unnest - start_unnest));
13220 #if PMAP_FORK_NEST_DEBUG
13221 printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
13222 #endif /* PMAP_FORK_NEST_DEBUG */
13223 assertf(kr == KERN_SUCCESS,
13224 "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
13225 (uint64_t)start, (uint64_t)end, new_pmap,
13226 (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
13227 kr);
13228 }
13229 #endif /* PMAP_FORK_NEST */
13230
13231 void
vm_map_inherit_limits(vm_map_t new_map,const struct _vm_map * old_map)13232 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13233 {
13234 new_map->size_limit = old_map->size_limit;
13235 new_map->data_limit = old_map->data_limit;
13236 new_map->user_wire_limit = old_map->user_wire_limit;
13237 new_map->reserved_regions = old_map->reserved_regions;
13238 }
13239
13240 /*
13241 * vm_map_fork:
13242 *
13243 * Create and return a new map based on the old
13244 * map, according to the inheritance values on the
13245 * regions in that map and the options.
13246 *
13247 * The source map must not be locked.
13248 */
vm_map_t
vm_map_fork(
	ledger_t        ledger,
	vm_map_t        old_map,
	int             options)
{
	pmap_t          new_pmap;
	vm_map_t        new_map;
	vm_map_entry_t  old_entry;
	vm_map_size_t   new_size = 0, entry_size;
	vm_map_entry_t  new_entry;
	boolean_t       src_needs_copy;
	boolean_t       new_entry_needs_copy;
	boolean_t       pmap_is64bit;
	int             vm_map_copyin_flags;
	vm_inherit_t    old_entry_inheritance;
	int             map_create_options;
	kern_return_t   footprint_collect_kr;

	/* reject any option bits we don't understand */
	if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
	    VM_MAP_FORK_PRESERVE_PURGEABLE |
	    VM_MAP_FORK_CORPSE_FOOTPRINT)) {
		/* unsupported option */
		return VM_MAP_NULL;
	}

	/* the child's address-space width follows the parent's */
	pmap_is64bit =
#if defined(__i386__) || defined(__x86_64__)
	    old_map->pmap->pm_task_map != TASK_MAP_32BIT;
#elif defined(__arm64__)
	    old_map->pmap->is_64bit;
#else
#error Unknown architecture.
#endif

	unsigned int pmap_flags = 0;
	pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
#if defined(HAS_APPLE_PAC)
	/* child inherits the parent's JOP (pointer-auth) setting */
	pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
#endif
#if CONFIG_ROSETTA
	pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
#endif
#if PMAP_CREATE_FORCE_4K_PAGES
	if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
	    PAGE_SIZE != FOURK_PAGE_SIZE) {
		pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
	}
#endif /* PMAP_CREATE_FORCE_4K_PAGES */
	new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
	if (new_pmap == NULL) {
		return VM_MAP_NULL;
	}

	vm_map_reference(old_map);
	vm_map_lock(old_map);

	map_create_options = 0;
	if (old_map->hdr.entries_pageable) {
		map_create_options |= VM_MAP_CREATE_PAGEABLE;
	}
	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
		map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
		/* NOTE: footprint_collect_kr is only initialized here; all
		 * later reads are guarded by the same option bit. */
		footprint_collect_kr = KERN_SUCCESS;
	}
	new_map = vm_map_create_options(new_pmap,
	    old_map->min_offset,
	    old_map->max_offset,
	    map_create_options);

	/* inherit cs_enforcement */
	vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);

	vm_map_lock(new_map);
	vm_commit_pagezero_status(new_map);
	/* inherit the parent map's page size */
	vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));

	/* inherit the parent rlimits */
	vm_map_inherit_limits(new_map, old_map);

#if CONFIG_MAP_RANGES
	/* inherit the parent map's VM ranges */
	vm_map_range_fork(new_map, old_map);
#endif

#if CODE_SIGNING_MONITOR
	/* Prepare the monitor for the fork */
	csm_fork_prepare(old_map->pmap, new_pmap);
#endif

#if PMAP_FORK_NEST
	/*
	 * Pre-nest the shared region's pmap.
	 */
	vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
	pmap_fork_nest(old_map->pmap, new_pmap,
	    &pre_nested_start, &pre_nested_end);
#if PMAP_FORK_NEST_DEBUG
	printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
#endif /* PMAP_FORK_NEST_DEBUG */
#endif /* PMAP_FORK_NEST */

	/* walk every entry of the parent map, handling it per its inheritance */
	for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
		/*
		 * Abort any corpse collection if the system is shutting down.
		 */
		if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
		    get_system_inshutdown()) {
#if PMAP_FORK_NEST
			/* undo any remaining pre-nesting before tearing down */
			new_entry = vm_map_last_entry(new_map);
			if (new_entry == vm_map_to_entry(new_map)) {
				/* unnest all that was pre-nested */
				vm_map_fork_unnest(new_pmap,
				    pre_nested_start, pre_nested_end,
				    vm_map_min(new_map), vm_map_max(new_map));
			} else if (new_entry->vme_end < vm_map_max(new_map)) {
				/* unnest hole at the end, if pre-nested */
				vm_map_fork_unnest(new_pmap,
				    pre_nested_start, pre_nested_end,
				    new_entry->vme_end, vm_map_max(new_map));
			}
#endif /* PMAP_FORK_NEST */
			vm_map_corpse_footprint_collect_done(new_map);
			vm_map_unlock(new_map);
			vm_map_unlock(old_map);
			vm_map_deallocate(new_map);
			vm_map_deallocate(old_map);
			printf("Aborting corpse map due to system shutdown\n");
			return VM_MAP_NULL;
		}

		entry_size = old_entry->vme_end - old_entry->vme_start;

#if PMAP_FORK_NEST
		/*
		 * Undo any unnecessary pre-nesting.
		 */
		vm_map_offset_t prev_end;
		if (old_entry == vm_map_first_entry(old_map)) {
			prev_end = vm_map_min(old_map);
		} else {
			prev_end = old_entry->vme_prev->vme_end;
		}
		if (prev_end < old_entry->vme_start) {
			/* unnest hole before this entry, if pre-nested */
			vm_map_fork_unnest(new_pmap,
			    pre_nested_start, pre_nested_end,
			    prev_end, old_entry->vme_start);
		}
		if (old_entry->is_sub_map && old_entry->use_pmap) {
			/* keep this entry nested in the child */
#if PMAP_FORK_NEST_DEBUG
			printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
#endif /* PMAP_FORK_NEST_DEBUG */
		} else {
			/* undo nesting for this entry, if pre-nested */
			vm_map_fork_unnest(new_pmap,
			    pre_nested_start, pre_nested_end,
			    old_entry->vme_start, old_entry->vme_end);
		}
#endif /* PMAP_FORK_NEST */

		old_entry_inheritance = old_entry->inheritance;
		/*
		 * If caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option
		 * share VM_INHERIT_NONE entries that are not backed by a
		 * device pager.
		 */
		if (old_entry_inheritance == VM_INHERIT_NONE &&
		    (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
		    (old_entry->protection & VM_PROT_READ) &&
		    !(!old_entry->is_sub_map &&
		    VME_OBJECT(old_entry) != NULL &&
		    VME_OBJECT(old_entry)->pager != NULL &&
		    is_device_pager_ops(
			    VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
			old_entry_inheritance = VM_INHERIT_SHARE;
		}

		if (old_entry_inheritance != VM_INHERIT_NONE &&
		    (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
		    footprint_collect_kr == KERN_SUCCESS) {
			/*
			 * The corpse won't have old_map->pmap to query
			 * footprint information, so collect that data now
			 * and store it in new_map->vmmap_corpse_footprint
			 * for later autopsy.
			 */
			footprint_collect_kr =
			    vm_map_corpse_footprint_collect(old_map,
			    old_entry,
			    new_map);
		}

		switch (old_entry_inheritance) {
		case VM_INHERIT_NONE:
			/* entry is not passed on to the child */
			break;

		case VM_INHERIT_SHARE:
			vm_map_fork_share(old_map, old_entry, new_map);
			new_size += entry_size;
			break;

		case VM_INHERIT_COPY:

			/*
			 * Inline the copy_quickly case;
			 * upon failure, fall back on call
			 * to vm_map_fork_copy.
			 */

			if (old_entry->is_sub_map) {
				break;
			}
			if ((old_entry->wired_count != 0) ||
			    ((VME_OBJECT(old_entry) != NULL) &&
			    (VME_OBJECT(old_entry)->true_share))) {
				goto slow_vm_map_fork_copy;
			}

			new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
			vm_map_entry_copy(old_map, new_entry, old_entry);
			if (old_entry->vme_permanent) {
				/* inherit "permanent" on fork() */
				new_entry->vme_permanent = TRUE;
			}

			if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
				new_map->jit_entry_exists = TRUE;
			}

			if (new_entry->is_sub_map) {
				/* clear address space specifics */
				new_entry->use_pmap = FALSE;
			} else {
				/*
				 * We're dealing with a copy-on-write operation,
				 * so the resulting mapping should not inherit
				 * the original mapping's accounting settings.
				 * "iokit_acct" should have been cleared in
				 * vm_map_entry_copy().
				 * "use_pmap" should be reset to its default
				 * (TRUE) so that the new mapping gets
				 * accounted for in the task's memory footprint.
				 */
				assert(!new_entry->iokit_acct);
				new_entry->use_pmap = TRUE;
			}

			if (!vm_object_copy_quickly(
				    VME_OBJECT(new_entry),
				    VME_OFFSET(old_entry),
				    (old_entry->vme_end -
				    old_entry->vme_start),
				    &src_needs_copy,
				    &new_entry_needs_copy)) {
				vm_map_entry_dispose(new_entry);
				goto slow_vm_map_fork_copy;
			}

			/*
			 * Handle copy-on-write obligations
			 */

			if (src_needs_copy && !old_entry->needs_copy) {
				vm_prot_t prot;

				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));

				/* write-protect the parent's mappings */
				prot = old_entry->protection & ~VM_PROT_WRITE;

				if (override_nx(old_map, VME_ALIAS(old_entry))
				    && prot) {
					prot |= VM_PROT_EXECUTE;
				}

				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));

				vm_object_pmap_protect(
					VME_OBJECT(old_entry),
					VME_OFFSET(old_entry),
					(old_entry->vme_end -
					old_entry->vme_start),
					((old_entry->is_shared
					|| old_map->mapped_in_other_pmaps)
					? PMAP_NULL :
					old_map->pmap),
					VM_MAP_PAGE_SIZE(old_map),
					old_entry->vme_start,
					prot);

				assert(old_entry->wired_count == 0);
				old_entry->needs_copy = TRUE;
			}
			new_entry->needs_copy = new_entry_needs_copy;

			/*
			 * Insert the entry at the end
			 * of the map.
			 */

			vm_map_store_entry_link(new_map,
			    vm_map_last_entry(new_map),
			    new_entry,
			    VM_MAP_KERNEL_FLAGS_NONE);
			new_size += entry_size;
			break;

slow_vm_map_fork_copy:
			vm_map_copyin_flags = VM_MAP_COPYIN_FORK;
			if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
				vm_map_copyin_flags |=
				    VM_MAP_COPYIN_PRESERVE_PURGEABLE;
			}
			if (vm_map_fork_copy(old_map,
			    &old_entry,
			    new_map,
			    vm_map_copyin_flags)) {
				new_size += entry_size;
			}
			/* vm_map_fork_copy() already advanced old_entry */
			continue;
		}
		old_entry = old_entry->vme_next;
	}

#if PMAP_FORK_NEST
	/* undo any pre-nesting beyond the last entry of the child map */
	new_entry = vm_map_last_entry(new_map);
	if (new_entry == vm_map_to_entry(new_map)) {
		/* unnest all that was pre-nested */
		vm_map_fork_unnest(new_pmap,
		    pre_nested_start, pre_nested_end,
		    vm_map_min(new_map), vm_map_max(new_map));
	} else if (new_entry->vme_end < vm_map_max(new_map)) {
		/* unnest hole at the end, if pre-nested */
		vm_map_fork_unnest(new_pmap,
		    pre_nested_start, pre_nested_end,
		    new_entry->vme_end, vm_map_max(new_map));
	}
#endif /* PMAP_FORK_NEST */

#if defined(__arm64__)
	pmap_insert_commpage(new_map->pmap);
#endif /* __arm64__ */

	new_map->size = new_size;

	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
		vm_map_corpse_footprint_collect_done(new_map);
	}

	/* Propagate JIT entitlement for the pmap layer. */
	if (pmap_get_jit_entitled(old_map->pmap)) {
		/* Tell the pmap that it supports JIT. */
		pmap_set_jit_entitled(new_map->pmap);
	}

	/* Propagate TPRO settings for the pmap layer */
	if (pmap_get_tpro(old_map->pmap)) {
		/* Tell the pmap that it supports TPRO */
		pmap_set_tpro(new_map->pmap);
	}

	vm_map_unlock(new_map);
	vm_map_unlock(old_map);
	vm_map_deallocate(old_map);

	return new_map;
}
13618
13619 /*
13620 * vm_map_exec:
13621 *
13622 * Setup the "new_map" with the proper execution environment according
13623 * to the type of executable (platform, 64bit, chroot environment).
13624 * Map the comm page and shared region, etc...
13625 */
kern_return_t
vm_map_exec(
	vm_map_t        new_map,
	task_t          task,
	boolean_t       is64bit,
	void            *fsroot,
	cpu_type_t      cpu,
	cpu_subtype_t   cpu_subtype,
	boolean_t       reslide,
	boolean_t       is_driverkit,
	uint32_t        rsr_version)
{
	SHARED_REGION_TRACE_DEBUG(
		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
		(void *)VM_KERNEL_ADDRPERM(current_task()),
		(void *)VM_KERNEL_ADDRPERM(new_map),
		(void *)VM_KERNEL_ADDRPERM(task),
		(void *)VM_KERNEL_ADDRPERM(fsroot),
		cpu,
		cpu_subtype));
	/* map the commpage; failure is ignored here */
	(void) vm_commpage_enter(new_map, task, is64bit);

	/* map the shared region for this cpu/fsroot; failure is ignored here */
	(void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
		(void *)VM_KERNEL_ADDRPERM(current_task()),
		(void *)VM_KERNEL_ADDRPERM(new_map),
		(void *)VM_KERNEL_ADDRPERM(task),
		(void *)VM_KERNEL_ADDRPERM(fsroot),
		cpu,
		cpu_subtype));

	/*
	 * Some devices have region(s) of memory that shouldn't get allocated by
	 * user processes. The following code creates dummy vm_map_entry_t's for each
	 * of the regions that needs to be reserved to prevent any allocations in
	 * those regions.
	 */
	kern_return_t kr = KERN_FAILURE;
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
	vmk_flags.vmkf_beyond_max = true;

	const struct vm_reserved_region *regions = NULL;
	size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));

	for (size_t i = 0; i < num_regions; ++i) {
		vm_map_offset_t address = regions[i].vmrr_addr;

		/* fixed, permanent, VM_PROT_NONE mapping with no backing
		 * object: nothing can ever be mapped in this range */
		kr = vm_map_enter(
			new_map,
			&address,
			regions[i].vmrr_size,
			(vm_map_offset_t)0,
			vmk_flags,
			VM_OBJECT_NULL,
			(vm_object_offset_t)0,
			FALSE,
			VM_PROT_NONE,
			VM_PROT_NONE,
			VM_INHERIT_COPY);

		if (kr != KERN_SUCCESS) {
			panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
		}
	}

	new_map->reserved_regions = (num_regions ? TRUE : FALSE);

	return KERN_SUCCESS;
}
13698
/*
 * Statistics for the copy paths taken by
 * vm_map_lookup_and_lock_object() (below): how often each path
 * ("slowly", "strategically", or via a shadow object) was used,
 * the total and maximum sizes involved, and the number of
 * restarts and errors on each path.
 */
uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13712 /*
13713 * vm_map_lookup_and_lock_object:
13714 *
13715 * Finds the VM object, offset, and
13716 * protection for a given virtual address in the
13717 * specified map, assuming a page fault of the
13718 * type specified.
13719 *
13720 * Returns the (object, offset, protection) for
13721 * this address, whether it is wired down, and whether
13722 * this map has the only reference to the data in question.
13723 * In order to later verify this lookup, a "version"
13724 * is returned.
13725 * If contended != NULL, *contended will be set to
13726 * true iff the thread had to spin or block to acquire
13727 * an exclusive lock.
13728 *
13729 * The map MUST be locked by the caller and WILL be
13730 * locked on exit. In order to guarantee the
13731 * existence of the returned object, it is returned
13732 * locked.
13733 *
13734 * If a lookup is requested with "write protection"
13735 * specified, the map may be changed to perform virtual
13736 * copying operations, although the data referenced will
13737 * remain the same.
13738 */
13739 kern_return_t
vm_map_lookup_and_lock_object(vm_map_t * var_map,vm_map_offset_t vaddr,vm_prot_t fault_type,int object_lock_type,vm_map_version_t * out_version,vm_object_t * object,vm_object_offset_t * offset,vm_prot_t * out_prot,boolean_t * wired,vm_object_fault_info_t fault_info,vm_map_t * real_map,bool * contended)13740 vm_map_lookup_and_lock_object(
13741 vm_map_t *var_map, /* IN/OUT */
13742 vm_map_offset_t vaddr,
13743 vm_prot_t fault_type,
13744 int object_lock_type,
13745 vm_map_version_t *out_version, /* OUT */
13746 vm_object_t *object, /* OUT */
13747 vm_object_offset_t *offset, /* OUT */
13748 vm_prot_t *out_prot, /* OUT */
13749 boolean_t *wired, /* OUT */
13750 vm_object_fault_info_t fault_info, /* OUT */
13751 vm_map_t *real_map, /* OUT */
13752 bool *contended) /* OUT */
13753 {
13754 vm_map_entry_t entry;
13755 vm_map_t map = *var_map;
13756 vm_map_t old_map = *var_map;
13757 vm_map_t cow_sub_map_parent = VM_MAP_NULL;
13758 vm_map_offset_t cow_parent_vaddr = 0;
13759 vm_map_offset_t old_start = 0;
13760 vm_map_offset_t old_end = 0;
13761 vm_prot_t prot;
13762 boolean_t mask_protections;
13763 boolean_t force_copy;
13764 boolean_t no_force_copy_if_executable;
13765 boolean_t submap_needed_copy;
13766 vm_prot_t original_fault_type;
13767 vm_map_size_t fault_page_mask;
13768
13769 /*
13770 * VM_PROT_MASK means that the caller wants us to use "fault_type"
13771 * as a mask against the mapping's actual protections, not as an
13772 * absolute value.
13773 */
13774 mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
13775 force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
13776 no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
13777 fault_type &= VM_PROT_ALL;
13778 original_fault_type = fault_type;
13779 if (contended) {
13780 *contended = false;
13781 }
13782
13783 *real_map = map;
13784
13785 fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
13786 vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
13787
13788 RetryLookup:
13789 fault_type = original_fault_type;
13790
13791 /*
13792 * If the map has an interesting hint, try it before calling
13793 * full blown lookup routine.
13794 */
13795 entry = map->hint;
13796
13797 if ((entry == vm_map_to_entry(map)) ||
13798 (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
13799 vm_map_entry_t tmp_entry;
13800
13801 /*
13802 * Entry was either not a valid hint, or the vaddr
13803 * was not contained in the entry, so do a full lookup.
13804 */
13805 if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
13806 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13807 vm_map_unlock(cow_sub_map_parent);
13808 }
13809 if ((*real_map != map)
13810 && (*real_map != cow_sub_map_parent)) {
13811 vm_map_unlock(*real_map);
13812 }
13813 return KERN_INVALID_ADDRESS;
13814 }
13815
13816 entry = tmp_entry;
13817 }
13818 if (map == old_map) {
13819 old_start = entry->vme_start;
13820 old_end = entry->vme_end;
13821 }
13822
13823 /*
13824 * Handle submaps. Drop lock on upper map, submap is
13825 * returned locked.
13826 */
13827
13828 submap_needed_copy = FALSE;
13829 submap_recurse:
13830 if (entry->is_sub_map) {
13831 vm_map_offset_t local_vaddr;
13832 vm_map_offset_t end_delta;
13833 vm_map_offset_t start_delta;
13834 vm_map_offset_t top_entry_saved_start;
13835 vm_object_offset_t top_entry_saved_offset;
13836 vm_map_entry_t submap_entry, saved_submap_entry;
13837 vm_object_offset_t submap_entry_offset;
13838 vm_object_size_t submap_entry_size;
13839 vm_prot_t subentry_protection;
13840 vm_prot_t subentry_max_protection;
13841 boolean_t subentry_no_copy_on_read;
13842 boolean_t subentry_permanent;
13843 boolean_t subentry_csm_associated;
13844 #if __arm64e__
13845 boolean_t subentry_used_for_tpro;
13846 #endif /* __arm64e__ */
13847 boolean_t mapped_needs_copy = FALSE;
13848 vm_map_version_t version;
13849
13850 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
13851 "map %p (%d) entry %p submap %p (%d)\n",
13852 map, VM_MAP_PAGE_SHIFT(map), entry,
13853 VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
13854
13855 local_vaddr = vaddr;
13856 top_entry_saved_start = entry->vme_start;
13857 top_entry_saved_offset = VME_OFFSET(entry);
13858
13859 if ((entry->use_pmap &&
13860 !((fault_type & VM_PROT_WRITE) ||
13861 force_copy))) {
13862 /* if real_map equals map we unlock below */
13863 if ((*real_map != map) &&
13864 (*real_map != cow_sub_map_parent)) {
13865 vm_map_unlock(*real_map);
13866 }
13867 *real_map = VME_SUBMAP(entry);
13868 }
13869
13870 if (entry->needs_copy &&
13871 ((fault_type & VM_PROT_WRITE) ||
13872 force_copy)) {
13873 if (!mapped_needs_copy) {
13874 if (vm_map_lock_read_to_write(map)) {
13875 vm_map_lock_read(map);
13876 *real_map = map;
13877 goto RetryLookup;
13878 }
13879 vm_map_lock_read(VME_SUBMAP(entry));
13880 *var_map = VME_SUBMAP(entry);
13881 cow_sub_map_parent = map;
13882 /* reset base to map before cow object */
13883 /* this is the map which will accept */
13884 /* the new cow object */
13885 old_start = entry->vme_start;
13886 old_end = entry->vme_end;
13887 cow_parent_vaddr = vaddr;
13888 mapped_needs_copy = TRUE;
13889 } else {
13890 vm_map_lock_read(VME_SUBMAP(entry));
13891 *var_map = VME_SUBMAP(entry);
13892 if ((cow_sub_map_parent != map) &&
13893 (*real_map != map)) {
13894 vm_map_unlock(map);
13895 }
13896 }
13897 } else {
13898 if (entry->needs_copy) {
13899 submap_needed_copy = TRUE;
13900 }
13901 vm_map_lock_read(VME_SUBMAP(entry));
13902 *var_map = VME_SUBMAP(entry);
13903 /* leave map locked if it is a target */
13904 /* cow sub_map above otherwise, just */
13905 /* follow the maps down to the object */
13906 /* here we unlock knowing we are not */
13907 /* revisiting the map. */
13908 if ((*real_map != map) && (map != cow_sub_map_parent)) {
13909 vm_map_unlock_read(map);
13910 }
13911 }
13912
13913 entry = NULL;
13914 map = *var_map;
13915
13916 /* calculate the offset in the submap for vaddr */
13917 local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
13918 assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
13919 "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
13920 (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);
13921
13922 RetrySubMap:
13923 if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
13924 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13925 vm_map_unlock(cow_sub_map_parent);
13926 }
13927 if ((*real_map != map)
13928 && (*real_map != cow_sub_map_parent)) {
13929 vm_map_unlock(*real_map);
13930 }
13931 *real_map = map;
13932 return KERN_INVALID_ADDRESS;
13933 }
13934
13935 /* find the attenuated shadow of the underlying object */
13936 /* on our target map */
13937
13938 /* in english the submap object may extend beyond the */
13939 /* region mapped by the entry or, may only fill a portion */
13940 /* of it. For our purposes, we only care if the object */
13941 /* doesn't fill. In this case the area which will */
13942 /* ultimately be clipped in the top map will only need */
13943 /* to be as big as the portion of the underlying entry */
13944 /* which is mapped */
13945 start_delta = submap_entry->vme_start > top_entry_saved_offset ?
13946 submap_entry->vme_start - top_entry_saved_offset : 0;
13947
13948 end_delta =
13949 (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
13950 submap_entry->vme_end ?
13951 0 : (top_entry_saved_offset +
13952 (old_end - old_start))
13953 - submap_entry->vme_end;
13954
13955 old_start += start_delta;
13956 old_end -= end_delta;
13957
13958 if (submap_entry->is_sub_map) {
13959 entry = submap_entry;
13960 vaddr = local_vaddr;
13961 goto submap_recurse;
13962 }
13963
13964 if (((fault_type & VM_PROT_WRITE) ||
13965 force_copy)
13966 && cow_sub_map_parent) {
13967 vm_object_t sub_object, copy_object;
13968 vm_object_offset_t copy_offset;
13969 vm_map_offset_t local_start;
13970 vm_map_offset_t local_end;
13971 boolean_t object_copied = FALSE;
13972 vm_object_offset_t object_copied_offset = 0;
13973 boolean_t object_copied_needs_copy = FALSE;
13974 kern_return_t kr = KERN_SUCCESS;
13975
13976 if (vm_map_lock_read_to_write(map)) {
13977 vm_map_lock_read(map);
13978 old_start -= start_delta;
13979 old_end += end_delta;
13980 goto RetrySubMap;
13981 }
13982
13983
13984 sub_object = VME_OBJECT(submap_entry);
13985 if (sub_object == VM_OBJECT_NULL) {
13986 sub_object =
13987 vm_object_allocate(
13988 (vm_map_size_t)
13989 (submap_entry->vme_end -
13990 submap_entry->vme_start));
13991 VME_OBJECT_SET(submap_entry, sub_object, false, 0);
13992 VME_OFFSET_SET(submap_entry, 0);
13993 assert(!submap_entry->is_sub_map);
13994 assert(submap_entry->use_pmap);
13995 }
13996 local_start = local_vaddr -
13997 (cow_parent_vaddr - old_start);
13998 local_end = local_vaddr +
13999 (old_end - cow_parent_vaddr);
14000 vm_map_clip_start(map, submap_entry, local_start);
14001 vm_map_clip_end(map, submap_entry, local_end);
14002 if (submap_entry->is_sub_map) {
14003 /* unnesting was done when clipping */
14004 assert(!submap_entry->use_pmap);
14005 }
14006
14007 /* This is the COW case, lets connect */
14008 /* an entry in our space to the underlying */
14009 /* object in the submap, bypassing the */
14010 /* submap. */
14011 submap_entry_offset = VME_OFFSET(submap_entry);
14012 submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
14013
14014 if ((submap_entry->wired_count != 0 ||
14015 sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
14016 (submap_entry->protection & VM_PROT_EXECUTE) &&
14017 no_force_copy_if_executable) {
14018 // printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
14019 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14020 vm_map_unlock(cow_sub_map_parent);
14021 }
14022 if ((*real_map != map)
14023 && (*real_map != cow_sub_map_parent)) {
14024 vm_map_unlock(*real_map);
14025 }
14026 *real_map = map;
14027 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
14028 vm_map_lock_write_to_read(map);
14029 kr = KERN_PROTECTION_FAILURE;
14030 DTRACE_VM4(submap_no_copy_executable,
14031 vm_map_t, map,
14032 vm_object_offset_t, submap_entry_offset,
14033 vm_object_size_t, submap_entry_size,
14034 int, kr);
14035 return kr;
14036 }
14037
14038 if (submap_entry->wired_count != 0) {
14039 vm_object_reference(sub_object);
14040
14041 assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
14042 "submap_entry %p offset 0x%llx\n",
14043 submap_entry, VME_OFFSET(submap_entry));
14044
14045 DTRACE_VM6(submap_copy_slowly,
14046 vm_map_t, cow_sub_map_parent,
14047 vm_map_offset_t, vaddr,
14048 vm_map_t, map,
14049 vm_object_size_t, submap_entry_size,
14050 int, submap_entry->wired_count,
14051 int, sub_object->copy_strategy);
14052
14053 saved_submap_entry = submap_entry;
14054 version.main_timestamp = map->timestamp;
14055 vm_map_unlock(map); /* Increments timestamp by 1 */
14056 submap_entry = VM_MAP_ENTRY_NULL;
14057
14058 vm_object_lock(sub_object);
14059 kr = vm_object_copy_slowly(sub_object,
14060 submap_entry_offset,
14061 submap_entry_size,
14062 FALSE,
14063 ©_object);
14064 object_copied = TRUE;
14065 object_copied_offset = 0;
14066 /* 4k: account for extra offset in physical page */
14067 object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
14068 object_copied_needs_copy = FALSE;
14069 vm_object_deallocate(sub_object);
14070
14071 vm_map_lock(map);
14072
14073 if (kr != KERN_SUCCESS &&
14074 kr != KERN_MEMORY_RESTART_COPY) {
14075 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14076 vm_map_unlock(cow_sub_map_parent);
14077 }
14078 if ((*real_map != map)
14079 && (*real_map != cow_sub_map_parent)) {
14080 vm_map_unlock(*real_map);
14081 }
14082 *real_map = map;
14083 vm_object_deallocate(copy_object);
14084 copy_object = VM_OBJECT_NULL;
14085 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
14086 vm_map_lock_write_to_read(map);
14087 DTRACE_VM4(submap_copy_error_slowly,
14088 vm_object_t, sub_object,
14089 vm_object_offset_t, submap_entry_offset,
14090 vm_object_size_t, submap_entry_size,
14091 int, kr);
14092 vm_map_lookup_and_lock_object_copy_slowly_error++;
14093 return kr;
14094 }
14095
14096 if ((kr == KERN_SUCCESS) &&
14097 (version.main_timestamp + 1) == map->timestamp) {
14098 submap_entry = saved_submap_entry;
14099 } else {
14100 saved_submap_entry = NULL;
14101 old_start -= start_delta;
14102 old_end += end_delta;
14103 vm_object_deallocate(copy_object);
14104 copy_object = VM_OBJECT_NULL;
14105 vm_map_lock_write_to_read(map);
14106 vm_map_lookup_and_lock_object_copy_slowly_restart++;
14107 goto RetrySubMap;
14108 }
14109 vm_map_lookup_and_lock_object_copy_slowly_count++;
14110 vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
14111 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
14112 vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
14113 }
14114 } else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
14115 submap_entry_offset = VME_OFFSET(submap_entry);
14116 copy_object = VM_OBJECT_NULL;
14117 object_copied_offset = submap_entry_offset;
14118 object_copied_needs_copy = FALSE;
14119 DTRACE_VM6(submap_copy_strategically,
14120 vm_map_t, cow_sub_map_parent,
14121 vm_map_offset_t, vaddr,
14122 vm_map_t, map,
14123 vm_object_size_t, submap_entry_size,
14124 int, submap_entry->wired_count,
14125 int, sub_object->copy_strategy);
14126 kr = vm_object_copy_strategically(
14127 sub_object,
14128 submap_entry_offset,
14129 submap_entry->vme_end - submap_entry->vme_start,
14130 false, /* forking */
14131 ©_object,
14132 &object_copied_offset,
14133 &object_copied_needs_copy);
14134 if (kr == KERN_MEMORY_RESTART_COPY) {
14135 old_start -= start_delta;
14136 old_end += end_delta;
14137 vm_object_deallocate(copy_object);
14138 copy_object = VM_OBJECT_NULL;
14139 vm_map_lock_write_to_read(map);
14140 vm_map_lookup_and_lock_object_copy_strategically_restart++;
14141 goto RetrySubMap;
14142 }
14143 if (kr != KERN_SUCCESS) {
14144 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14145 vm_map_unlock(cow_sub_map_parent);
14146 }
14147 if ((*real_map != map)
14148 && (*real_map != cow_sub_map_parent)) {
14149 vm_map_unlock(*real_map);
14150 }
14151 *real_map = map;
14152 vm_object_deallocate(copy_object);
14153 copy_object = VM_OBJECT_NULL;
14154 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
14155 vm_map_lock_write_to_read(map);
14156 DTRACE_VM4(submap_copy_error_strategically,
14157 vm_object_t, sub_object,
14158 vm_object_offset_t, submap_entry_offset,
14159 vm_object_size_t, submap_entry_size,
14160 int, kr);
14161 vm_map_lookup_and_lock_object_copy_strategically_error++;
14162 return kr;
14163 }
14164 assert(copy_object != VM_OBJECT_NULL);
14165 assert(copy_object != sub_object);
14166 object_copied = TRUE;
14167 vm_map_lookup_and_lock_object_copy_strategically_count++;
14168 vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
14169 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
14170 vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
14171 }
14172 } else {
14173 /* set up shadow object */
14174 object_copied = FALSE;
14175 copy_object = sub_object;
14176 vm_object_lock(sub_object);
14177 vm_object_reference_locked(sub_object);
14178 sub_object->shadowed = TRUE;
14179 vm_object_unlock(sub_object);
14180
14181 assert(submap_entry->wired_count == 0);
14182 submap_entry->needs_copy = TRUE;
14183
14184 prot = submap_entry->protection;
14185 assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
14186 prot = prot & ~VM_PROT_WRITE;
14187 assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
14188
14189 if (override_nx(old_map,
14190 VME_ALIAS(submap_entry))
14191 && prot) {
14192 prot |= VM_PROT_EXECUTE;
14193 }
14194
14195 vm_object_pmap_protect(
14196 sub_object,
14197 VME_OFFSET(submap_entry),
14198 submap_entry->vme_end -
14199 submap_entry->vme_start,
14200 (submap_entry->is_shared
14201 || map->mapped_in_other_pmaps) ?
14202 PMAP_NULL : map->pmap,
14203 VM_MAP_PAGE_SIZE(map),
14204 submap_entry->vme_start,
14205 prot);
14206 vm_map_lookup_and_lock_object_copy_shadow_count++;
14207 vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
14208 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
14209 vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
14210 }
14211 }
14212
14213 /*
14214 * Adjust the fault offset to the submap entry.
14215 */
14216 copy_offset = (local_vaddr -
14217 submap_entry->vme_start +
14218 VME_OFFSET(submap_entry));
14219
14220 /* This works diffently than the */
14221 /* normal submap case. We go back */
14222 /* to the parent of the cow map and*/
14223 /* clip out the target portion of */
14224 /* the sub_map, substituting the */
14225 /* new copy object, */
14226
14227 subentry_protection = submap_entry->protection;
14228 subentry_max_protection = submap_entry->max_protection;
14229 subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
14230 subentry_permanent = submap_entry->vme_permanent;
14231 subentry_csm_associated = submap_entry->csm_associated;
14232 #if __arm64e__
14233 subentry_used_for_tpro = submap_entry->used_for_tpro;
14234 #endif // __arm64e__
14235 vm_map_unlock(map);
14236 submap_entry = NULL; /* not valid after map unlock */
14237
14238 local_start = old_start;
14239 local_end = old_end;
14240 map = cow_sub_map_parent;
14241 *var_map = cow_sub_map_parent;
14242 vaddr = cow_parent_vaddr;
14243 cow_sub_map_parent = NULL;
14244
14245 if (!vm_map_lookup_entry(map,
14246 vaddr, &entry)) {
14247 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14248 vm_map_unlock(cow_sub_map_parent);
14249 }
14250 if ((*real_map != map)
14251 && (*real_map != cow_sub_map_parent)) {
14252 vm_map_unlock(*real_map);
14253 }
14254 *real_map = map;
14255 vm_object_deallocate(
14256 copy_object);
14257 copy_object = VM_OBJECT_NULL;
14258 vm_map_lock_write_to_read(map);
14259 DTRACE_VM4(submap_lookup_post_unlock,
14260 uint64_t, (uint64_t)entry->vme_start,
14261 uint64_t, (uint64_t)entry->vme_end,
14262 vm_map_offset_t, vaddr,
14263 int, object_copied);
14264 return KERN_INVALID_ADDRESS;
14265 }
14266
14267 /* clip out the portion of space */
14268 /* mapped by the sub map which */
14269 /* corresponds to the underlying */
14270 /* object */
14271
14272 /*
14273 * Clip (and unnest) the smallest nested chunk
14274 * possible around the faulting address...
14275 */
14276 local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14277 local_end = local_start + pmap_shared_region_size_min(map->pmap);
14278 /*
14279 * ... but don't go beyond the "old_start" to "old_end"
14280 * range, to avoid spanning over another VM region
14281 * with a possibly different VM object and/or offset.
14282 */
14283 if (local_start < old_start) {
14284 local_start = old_start;
14285 }
14286 if (local_end > old_end) {
14287 local_end = old_end;
14288 }
14289 /*
14290 * Adjust copy_offset to the start of the range.
14291 */
14292 copy_offset -= (vaddr - local_start);
14293
14294 vm_map_clip_start(map, entry, local_start);
14295 vm_map_clip_end(map, entry, local_end);
14296 if (entry->is_sub_map) {
14297 /* unnesting was done when clipping */
14298 assert(!entry->use_pmap);
14299 }
14300
14301 /* substitute copy object for */
14302 /* shared map entry */
14303 vm_map_deallocate(VME_SUBMAP(entry));
14304 assert(!entry->iokit_acct);
14305 entry->use_pmap = TRUE;
14306 VME_OBJECT_SET(entry, copy_object, false, 0);
14307
14308 /* propagate the submap entry's protections */
14309 if (entry->protection != VM_PROT_READ) {
14310 /*
14311 * Someone has already altered the top entry's
14312 * protections via vm_protect(VM_PROT_COPY).
14313 * Respect these new values and ignore the
14314 * submap entry's protections.
14315 */
14316 } else {
14317 /*
14318 * Regular copy-on-write: propagate the submap
14319 * entry's protections to the top map entry.
14320 */
14321 entry->protection |= subentry_protection;
14322 }
14323 entry->max_protection |= subentry_max_protection;
14324 /* propagate some attributes from subentry */
14325 entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14326 entry->vme_permanent = subentry_permanent;
14327 entry->csm_associated = subentry_csm_associated;
14328 #if __arm64e__
14329 /* propagate TPRO iff the destination map has TPRO enabled */
14330 if (subentry_used_for_tpro && vm_map_tpro(map)) {
14331 entry->used_for_tpro = subentry_used_for_tpro;
14332 }
14333 #endif /* __arm64e */
14334 if ((entry->protection & VM_PROT_WRITE) &&
14335 (entry->protection & VM_PROT_EXECUTE) &&
14336 #if XNU_TARGET_OS_OSX
14337 map->pmap != kernel_pmap &&
14338 (vm_map_cs_enforcement(map)
14339 #if __arm64__
14340 || !VM_MAP_IS_EXOTIC(map)
14341 #endif /* __arm64__ */
14342 ) &&
14343 #endif /* XNU_TARGET_OS_OSX */
14344 #if CODE_SIGNING_MONITOR
14345 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
14346 #endif
14347 !(entry->used_for_jit) &&
14348 VM_MAP_POLICY_WX_STRIP_X(map)) {
14349 DTRACE_VM3(cs_wx,
14350 uint64_t, (uint64_t)entry->vme_start,
14351 uint64_t, (uint64_t)entry->vme_end,
14352 vm_prot_t, entry->protection);
14353 printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
14354 proc_selfpid(),
14355 (get_bsdtask_info(current_task())
14356 ? proc_name_address(get_bsdtask_info(current_task()))
14357 : "?"),
14358 __FUNCTION__, __LINE__,
14359 #if DEVELOPMENT || DEBUG
14360 (uint64_t)entry->vme_start,
14361 (uint64_t)entry->vme_end,
14362 #else /* DEVELOPMENT || DEBUG */
14363 (uint64_t)0,
14364 (uint64_t)0,
14365 #endif /* DEVELOPMENT || DEBUG */
14366 entry->protection);
14367 entry->protection &= ~VM_PROT_EXECUTE;
14368 }
14369
14370 if (object_copied) {
14371 VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14372 entry->needs_copy = object_copied_needs_copy;
14373 entry->is_shared = FALSE;
14374 } else {
14375 assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14376 assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14377 assert(entry->wired_count == 0);
14378 VME_OFFSET_SET(entry, copy_offset);
14379 entry->needs_copy = TRUE;
14380 if (map != old_map) {
14381 entry->is_shared = TRUE;
14382 }
14383 }
14384 if (entry->inheritance == VM_INHERIT_SHARE) {
14385 entry->inheritance = VM_INHERIT_COPY;
14386 }
14387
14388 vm_map_lock_write_to_read(map);
14389 } else {
14390 if ((cow_sub_map_parent)
14391 && (cow_sub_map_parent != *real_map)
14392 && (cow_sub_map_parent != map)) {
14393 vm_map_unlock(cow_sub_map_parent);
14394 }
14395 entry = submap_entry;
14396 vaddr = local_vaddr;
14397 }
14398 }
14399
14400 /*
14401 * Check whether this task is allowed to have
14402 * this page.
14403 */
14404
14405 prot = entry->protection;
14406
14407 if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14408 /*
14409 * HACK -- if not a stack, then allow execution
14410 */
14411 prot |= VM_PROT_EXECUTE;
14412 }
14413
14414 #if __arm64e__
14415 /*
14416 * If the entry we're dealing with is TPRO and we have a write
14417 * fault, inject VM_PROT_WRITE into protections. This allows us
14418 * to maintain RO permissions when not marked as TPRO.
14419 */
14420 if (entry->used_for_tpro && (fault_type & VM_PROT_WRITE)) {
14421 prot |= VM_PROT_WRITE;
14422 }
14423 #endif /* __arm64e__ */
14424 if (mask_protections) {
14425 fault_type &= prot;
14426 if (fault_type == VM_PROT_NONE) {
14427 goto protection_failure;
14428 }
14429 }
14430 if (((fault_type & prot) != fault_type)
14431 #if __arm64__
14432 /* prefetch abort in execute-only page */
14433 && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14434 #elif defined(__x86_64__)
14435 /* Consider the UEXEC bit when handling an EXECUTE fault */
14436 && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14437 #endif
14438 ) {
14439 protection_failure:
14440 if (*real_map != map) {
14441 vm_map_unlock(*real_map);
14442 }
14443 *real_map = map;
14444
14445 if ((fault_type & VM_PROT_EXECUTE) && prot) {
14446 log_stack_execution_failure((addr64_t)vaddr, prot);
14447 }
14448
14449 DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14450 DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14451 /*
14452 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14453 *
14454 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14455 */
14456 return KERN_PROTECTION_FAILURE;
14457 }
14458
14459 /*
14460 * If this page is not pageable, we have to get
14461 * it for all possible accesses.
14462 */
14463
14464 *wired = (entry->wired_count != 0);
14465 if (*wired) {
14466 fault_type = prot;
14467 }
14468
14469 /*
14470 * If the entry was copy-on-write, we either ...
14471 */
14472
14473 if (entry->needs_copy) {
14474 /*
14475 * If we want to write the page, we may as well
14476 * handle that now since we've got the map locked.
14477 *
14478 * If we don't need to write the page, we just
14479 * demote the permissions allowed.
14480 */
14481
14482 if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14483 /*
14484 * Make a new object, and place it in the
14485 * object chain. Note that no new references
14486 * have appeared -- one just moved from the
14487 * map to the new object.
14488 */
14489
14490 if (vm_map_lock_read_to_write(map)) {
14491 vm_map_lock_read(map);
14492 goto RetryLookup;
14493 }
14494
14495 if (VME_OBJECT(entry)->shadowed == FALSE) {
14496 vm_object_lock(VME_OBJECT(entry));
14497 VME_OBJECT(entry)->shadowed = TRUE;
14498 vm_object_unlock(VME_OBJECT(entry));
14499 }
14500 VME_OBJECT_SHADOW(entry,
14501 (vm_map_size_t) (entry->vme_end -
14502 entry->vme_start),
14503 vm_map_always_shadow(map));
14504 entry->needs_copy = FALSE;
14505
14506 vm_map_lock_write_to_read(map);
14507 }
14508 if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14509 /*
14510 * We're attempting to read a copy-on-write
14511 * page -- don't allow writes.
14512 */
14513
14514 prot &= (~VM_PROT_WRITE);
14515 }
14516 }
14517
14518 if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14519 /*
14520 * We went through a "needs_copy" submap without triggering
14521 * a copy, so granting write access to the page would bypass
14522 * that submap's "needs_copy".
14523 */
14524 assert(!(fault_type & VM_PROT_WRITE));
14525 assert(!*wired);
14526 assert(!force_copy);
14527 // printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14528 prot &= ~VM_PROT_WRITE;
14529 }
14530
14531 /*
14532 * Create an object if necessary.
14533 */
14534 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14535 if (vm_map_lock_read_to_write(map)) {
14536 vm_map_lock_read(map);
14537 goto RetryLookup;
14538 }
14539
14540 VME_OBJECT_SET(entry,
14541 vm_object_allocate(
14542 (vm_map_size_t)(entry->vme_end -
14543 entry->vme_start)), false, 0);
14544 VME_OFFSET_SET(entry, 0);
14545 assert(entry->use_pmap);
14546 vm_map_lock_write_to_read(map);
14547 }
14548
14549 /*
14550 * Return the object/offset from this entry. If the entry
14551 * was copy-on-write or empty, it has been fixed up. Also
14552 * return the protection.
14553 */
14554
14555 *offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14556 *object = VME_OBJECT(entry);
14557 *out_prot = prot;
14558 KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14559
14560 if (fault_info) {
14561 fault_info->interruptible = THREAD_UNINT; /* for now... */
14562 /* ... the caller will change "interruptible" if needed */
14563 fault_info->cluster_size = 0;
14564 fault_info->user_tag = VME_ALIAS(entry);
14565 fault_info->pmap_options = 0;
14566 if (entry->iokit_acct ||
14567 (!entry->is_sub_map && !entry->use_pmap)) {
14568 fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14569 }
14570 fault_info->behavior = entry->behavior;
14571 fault_info->lo_offset = VME_OFFSET(entry);
14572 fault_info->hi_offset =
14573 (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14574 fault_info->no_cache = entry->no_cache;
14575 fault_info->stealth = FALSE;
14576 fault_info->io_sync = FALSE;
14577 if (entry->used_for_jit ||
14578 #if CODE_SIGNING_MONITOR
14579 (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
14580 #endif
14581 entry->vme_resilient_codesign) {
14582 fault_info->cs_bypass = TRUE;
14583 } else {
14584 fault_info->cs_bypass = FALSE;
14585 }
14586 fault_info->csm_associated = FALSE;
14587 #if CODE_SIGNING_MONITOR
14588 if (entry->csm_associated) {
14589 /*
14590 * The pmap layer will validate this page
14591 * before allowing it to be executed from.
14592 */
14593 fault_info->csm_associated = TRUE;
14594 }
14595 #endif
14596 fault_info->mark_zf_absent = FALSE;
14597 fault_info->batch_pmap_op = FALSE;
14598 fault_info->resilient_media = entry->vme_resilient_media;
14599 fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
14600 fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14601 #if __arm64e__
14602 fault_info->fi_used_for_tpro = entry->used_for_tpro;
14603 #else /* __arm64e__ */
14604 fault_info->fi_used_for_tpro = FALSE;
14605 #endif
14606 if (entry->translated_allow_execute) {
14607 fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14608 }
14609 }
14610
14611 /*
14612 * Lock the object to prevent it from disappearing
14613 */
14614 if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14615 if (contended == NULL) {
14616 vm_object_lock(*object);
14617 } else {
14618 *contended = vm_object_lock_check_contended(*object);
14619 }
14620 } else {
14621 vm_object_lock_shared(*object);
14622 }
14623
14624 /*
14625 * Save the version number
14626 */
14627
14628 out_version->main_timestamp = map->timestamp;
14629
14630 return KERN_SUCCESS;
14631 }
14632
14633
14634 /*
14635 * vm_map_verify:
14636 *
14637 * Verifies that the map in question has not changed
14638 * since the given version. The map has to be locked
14639 * ("shared" mode is fine) before calling this function
14640 * and it will be returned locked too.
14641 */
14642 boolean_t
vm_map_verify(vm_map_t map,vm_map_version_t * version)14643 vm_map_verify(
14644 vm_map_t map,
14645 vm_map_version_t *version) /* REF */
14646 {
14647 boolean_t result;
14648
14649 vm_map_lock_assert_held(map);
14650 result = (map->timestamp == version->main_timestamp);
14651
14652 return result;
14653 }
14654
14655 /*
14656 * TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
14657 * Goes away after regular vm_region_recurse function migrates to
14658 * 64 bits
14659 * vm_region_recurse: A form of vm_region which follows the
14660 * submaps in a target map
14661 *
14662 */
14663
/*
 * vm_map_region_recurse_64:
 *
 * Looks up the VM region at or after "*address" in "map", descending
 * through nested submaps up to "*nesting_depth" levels, and fills in
 * either the full vm_region_submap_info_64 or the short variant,
 * depending on the buffer size the caller supplied in "*count".
 *
 * On success, "*address"/"*size" describe the region found,
 * "*nesting_depth" is the depth at which it was found, and "*count"
 * is the number of info fields actually filled in.
 * Returns KERN_INVALID_ARGUMENT for a null map or undersized buffer,
 * KERN_INVALID_ADDRESS when no region exists at or after "*address".
 *
 * NOTE(review): locking is skipped entirely when "not_in_kdp" is 0,
 * which appears to be the kernel-debugger context — confirm before
 * relying on this from other callers.
 */
kern_return_t
vm_map_region_recurse_64(
	vm_map_t map,
	vm_map_offset_t *address,               /* IN/OUT */
	vm_map_size_t *size,                    /* OUT */
	natural_t *nesting_depth,               /* IN/OUT */
	vm_region_submap_info_64_t submap_info, /* IN/OUT */
	mach_msg_type_number_t *count)          /* IN/OUT */
{
	mach_msg_type_number_t original_count;
	vm_region_extended_info_data_t extended;
	vm_map_entry_t tmp_entry;
	vm_map_offset_t user_address;
	unsigned int user_max_depth;

	/*
	 * "curr_entry" is the VM map entry preceding or including the
	 * address we're looking for.
	 * "curr_map" is the map or sub-map containing "curr_entry".
	 * "curr_address" is the equivalent of the top map's "user_address"
	 * in the current map.
	 * "curr_offset" is the cumulated offset of "curr_map" in the
	 * target task's address space.
	 * "curr_depth" is the depth of "curr_map" in the chain of
	 * sub-maps.
	 *
	 * "curr_max_below" and "curr_max_above" limit the range (around
	 * "curr_address") we should take into account in the current (sub)map.
	 * They limit the range to what's visible through the map entries
	 * we've traversed from the top map to the current map.
	 *
	 */
	vm_map_entry_t curr_entry;
	vm_map_address_t curr_address;
	vm_map_offset_t curr_offset;
	vm_map_t curr_map;
	unsigned int curr_depth;
	vm_map_offset_t curr_max_below, curr_max_above;
	vm_map_offset_t curr_skip;

	/*
	 * "next_" is the same as "curr_" but for the VM region immediately
	 * after the address we're looking for. We need to keep track of this
	 * too because we want to return info about that region if the
	 * address we're looking for is not mapped.
	 */
	vm_map_entry_t next_entry;
	vm_map_offset_t next_offset;
	vm_map_offset_t next_address;
	vm_map_t next_map;
	unsigned int next_depth;
	vm_map_offset_t next_max_below, next_max_above;
	vm_map_offset_t next_skip;

	boolean_t look_for_pages;
	vm_region_submap_short_info_64_t short_info;
	boolean_t do_region_footprint;
	int effective_page_size, effective_page_shift;
	boolean_t submap_needed_copy;

	if (map == VM_MAP_NULL) {
		/* no address space to work on */
		return KERN_INVALID_ARGUMENT;
	}

	effective_page_shift = vm_self_region_page_shift(map);
	effective_page_size = (1 << effective_page_shift);

	if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
		/*
		 * "info" structure is not big enough and
		 * would overflow
		 */
		return KERN_INVALID_ARGUMENT;
	}

	do_region_footprint = task_self_region_footprint();
	original_count = *count;

	if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
		/*
		 * Caller's buffer only fits the short info: alias it and
		 * skip the expensive per-page scan ("look_for_pages").
		 */
		*count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
		look_for_pages = FALSE;
		short_info = (vm_region_submap_short_info_64_t) submap_info;
		submap_info = NULL;
	} else {
		look_for_pages = TRUE;
		*count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
		short_info = NULL;

		/* upgrade to the largest info version the buffer can hold */
		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
			*count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
		}
		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
			*count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
		}
	}

	user_address = *address;
	user_max_depth = *nesting_depth;
	submap_needed_copy = FALSE;

	if (not_in_kdp) {
		vm_map_lock_read(map);
	}

recurse_again:
	/* (re)start the submap descent from the top map at "user_address" */
	curr_entry = NULL;
	curr_map = map;
	curr_address = user_address;
	curr_offset = 0;
	curr_skip = 0;
	curr_depth = 0;
	curr_max_above = ((vm_map_offset_t) -1) - curr_address;
	curr_max_below = curr_address;

	next_entry = NULL;
	next_map = NULL;
	next_address = 0;
	next_offset = 0;
	next_skip = 0;
	next_depth = 0;
	next_max_above = (vm_map_offset_t) -1;
	next_max_below = (vm_map_offset_t) -1;

	for (;;) {
		if (vm_map_lookup_entry(curr_map,
		    curr_address,
		    &tmp_entry)) {
			/* tmp_entry contains the address we're looking for */
			curr_entry = tmp_entry;
		} else {
			vm_map_offset_t skip;
			/*
			 * The address is not mapped.  "tmp_entry" is the
			 * map entry preceding the address.  We want the next
			 * one, if it exists.
			 */
			curr_entry = tmp_entry->vme_next;

			if (curr_entry == vm_map_to_entry(curr_map) ||
			    (curr_entry->vme_start >=
			    curr_address + curr_max_above)) {
				/* no next entry at this level: stop looking */
				if (not_in_kdp) {
					vm_map_unlock_read(curr_map);
				}
				curr_entry = NULL;
				curr_map = NULL;
				curr_skip = 0;
				curr_offset = 0;
				curr_depth = 0;
				curr_max_above = 0;
				curr_max_below = 0;
				break;
			}

			/* adjust current address and offset */
			skip = curr_entry->vme_start - curr_address;
			curr_address = curr_entry->vme_start;
			curr_skip += skip;
			curr_offset += skip;
			curr_max_above -= skip;
			curr_max_below = 0;
		}

		/*
		 * Is the next entry at this level closer to the address (or
		 * deeper in the submap chain) than the one we had
		 * so far ?
		 */
		tmp_entry = curr_entry->vme_next;
		if (tmp_entry == vm_map_to_entry(curr_map)) {
			/* no next entry at this level */
		} else if (tmp_entry->vme_start >=
		    curr_address + curr_max_above) {
			/*
			 * tmp_entry is beyond the scope of what we mapped of
			 * this submap in the upper level: ignore it.
			 */
		} else if ((next_entry == NULL) ||
		    (tmp_entry->vme_start + curr_offset <=
		    next_entry->vme_start + next_offset)) {
			/*
			 * We didn't have a "next_entry" or this one is
			 * closer to the address we're looking for:
			 * use this "tmp_entry" as the new "next_entry".
			 */
			if (next_entry != NULL) {
				/* unlock the last "next_map" */
				if (next_map != curr_map && not_in_kdp) {
					vm_map_unlock_read(next_map);
				}
			}
			next_entry = tmp_entry;
			next_map = curr_map;
			next_depth = curr_depth;
			next_address = next_entry->vme_start;
			next_skip = curr_skip;
			next_skip += (next_address - curr_address);
			next_offset = curr_offset;
			next_offset += (next_address - curr_address);
			next_max_above = MIN(next_max_above, curr_max_above);
			next_max_above = MIN(next_max_above,
			    next_entry->vme_end - next_address);
			next_max_below = MIN(next_max_below, curr_max_below);
			next_max_below = MIN(next_max_below,
			    next_address - next_entry->vme_start);
		}

		/*
		 * "curr_max_{above,below}" allow us to keep track of the
		 * portion of the submap that is actually mapped at this level:
		 * the rest of that submap is irrelevant to us, since it's not
		 * mapped here.
		 * The relevant portion of the map starts at
		 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
		 */
		curr_max_above = MIN(curr_max_above,
		    curr_entry->vme_end - curr_address);
		curr_max_below = MIN(curr_max_below,
		    curr_address - curr_entry->vme_start);

		if (!curr_entry->is_sub_map ||
		    curr_depth >= user_max_depth) {
			/*
			 * We hit a leaf map or we reached the maximum depth
			 * we could, so stop looking.  Keep the current map
			 * locked.
			 */
			break;
		}

		/*
		 * Get down to the next submap level.
		 */

		if (curr_entry->needs_copy) {
			/* everything below this is effectively copy-on-write */
			submap_needed_copy = TRUE;
		}

		/*
		 * Lock the next level and unlock the current level,
		 * unless we need to keep it locked to access the "next_entry"
		 * later.
		 */
		if (not_in_kdp) {
			vm_map_lock_read(VME_SUBMAP(curr_entry));
		}
		if (curr_map == next_map) {
			/* keep "next_map" locked in case we need it */
		} else {
			/* release this map */
			if (not_in_kdp) {
				vm_map_unlock_read(curr_map);
			}
		}

		/*
		 * Adjust the offset.  "curr_entry" maps the submap
		 * at relative address "curr_entry->vme_start" in the
		 * curr_map but skips the first "VME_OFFSET(curr_entry)"
		 * bytes of the submap.
		 * "curr_offset" always represents the offset of a virtual
		 * address in the curr_map relative to the absolute address
		 * space (i.e. the top-level VM map).
		 */
		curr_offset +=
		    (VME_OFFSET(curr_entry) - curr_entry->vme_start);
		curr_address = user_address + curr_offset;
		/* switch to the submap */
		curr_map = VME_SUBMAP(curr_entry);
		curr_depth++;
		curr_entry = NULL;
	}

	// LP64todo: all the current tools are 32bit, obviously never worked for 64b
	// so probably should be a real 32b ID vs. ptr.
	// Current users just check for equality

	if (curr_entry == NULL) {
		/*
		 * no VM region contains the address...
		 * (the loop above already released every read lock on this
		 * path, and next_map is only locked when next_entry != NULL)
		 */

		if (do_region_footprint &&      /* we want footprint numbers */
		    next_entry == NULL &&       /* & there are no more regions */
		    /* & we haven't already provided our fake region: */
		    user_address <= vm_map_last_entry(map)->vme_end) {
			ledger_amount_t ledger_resident, ledger_compressed;

			/*
			 * Add a fake memory region to account for
			 * purgeable and/or ledger-tagged memory that
			 * counts towards this task's memory footprint,
			 * i.e. the resident/compressed pages of non-volatile
			 * objects owned by that task.
			 */
			task_ledgers_footprint(map->pmap->ledger,
			    &ledger_resident,
			    &ledger_compressed);
			if (ledger_resident + ledger_compressed == 0) {
				/* no purgeable memory usage to report */
				return KERN_INVALID_ADDRESS;
			}
			/* fake region to show nonvolatile footprint */
			if (look_for_pages) {
				submap_info->protection = VM_PROT_DEFAULT;
				submap_info->max_protection = VM_PROT_DEFAULT;
				submap_info->inheritance = VM_INHERIT_DEFAULT;
				submap_info->offset = 0;
				submap_info->user_tag = -1;
				submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
				submap_info->pages_shared_now_private = 0;
				submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
				submap_info->pages_dirtied = submap_info->pages_resident;
				submap_info->ref_count = 1;
				submap_info->shadow_depth = 0;
				submap_info->external_pager = 0;
				submap_info->share_mode = SM_PRIVATE;
				if (submap_needed_copy) {
					submap_info->share_mode = SM_COW;
				}
				submap_info->is_submap = 0;
				submap_info->behavior = VM_BEHAVIOR_DEFAULT;
				submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				submap_info->user_wired_count = 0;
				submap_info->pages_reusable = 0;
			} else {
				short_info->user_tag = -1;
				short_info->offset = 0;
				short_info->protection = VM_PROT_DEFAULT;
				short_info->inheritance = VM_INHERIT_DEFAULT;
				short_info->max_protection = VM_PROT_DEFAULT;
				short_info->behavior = VM_BEHAVIOR_DEFAULT;
				short_info->user_wired_count = 0;
				short_info->is_submap = 0;
				short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				short_info->external_pager = 0;
				short_info->shadow_depth = 0;
				short_info->share_mode = SM_PRIVATE;
				if (submap_needed_copy) {
					short_info->share_mode = SM_COW;
				}
				short_info->ref_count = 1;
			}
			*nesting_depth = 0;
			*size = (vm_map_size_t) (ledger_resident + ledger_compressed);
			// *address = user_address;
			*address = vm_map_last_entry(map)->vme_end;
			return KERN_SUCCESS;
		}

		if (next_entry == NULL) {
			/* ... and no VM region follows it either */
			return KERN_INVALID_ADDRESS;
		}
		/* ... gather info about the next VM region */
		curr_entry = next_entry;
		curr_map = next_map;    /* still locked ... */
		curr_address = next_address;
		curr_skip = next_skip;
		curr_offset = next_offset;
		curr_depth = next_depth;
		curr_max_above = next_max_above;
		curr_max_below = next_max_below;
	} else {
		/* we won't need "next_entry" after all */
		if (next_entry != NULL) {
			/* release "next_map" */
			if (next_map != curr_map && not_in_kdp) {
				vm_map_unlock_read(next_map);
			}
		}
	}
	next_entry = NULL;
	next_map = NULL;
	next_offset = 0;
	next_skip = 0;
	next_depth = 0;
	next_max_below = -1;
	next_max_above = -1;

	if (curr_entry->is_sub_map &&
	    curr_depth < user_max_depth) {
		/*
		 * We're not as deep as we could be: we must have
		 * gone back up after not finding anything mapped
		 * below the original top-level map entry's.
		 * Let's move "curr_address" forward and recurse again.
		 */
		user_address = curr_address;
		goto recurse_again;
	}

	*nesting_depth = curr_depth;
	*size = curr_max_above + curr_max_below;
	*address = user_address + curr_skip - curr_max_below;

	if (look_for_pages) {
		submap_info->user_tag = VME_ALIAS(curr_entry);
		submap_info->offset = VME_OFFSET(curr_entry);
		submap_info->protection = curr_entry->protection;
		submap_info->inheritance = curr_entry->inheritance;
		submap_info->max_protection = curr_entry->max_protection;
		submap_info->behavior = curr_entry->behavior;
		submap_info->user_wired_count = curr_entry->user_wired_count;
		submap_info->is_submap = curr_entry->is_sub_map;
		if (curr_entry->is_sub_map) {
			submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
		} else {
			submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
		}
	} else {
		short_info->user_tag = VME_ALIAS(curr_entry);
		short_info->offset = VME_OFFSET(curr_entry);
		short_info->protection = curr_entry->protection;
		short_info->inheritance = curr_entry->inheritance;
		short_info->max_protection = curr_entry->max_protection;
		short_info->behavior = curr_entry->behavior;
		short_info->user_wired_count = curr_entry->user_wired_count;
		short_info->is_submap = curr_entry->is_sub_map;
		if (curr_entry->is_sub_map) {
			short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
		} else {
			short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
		}
	}

	extended.pages_resident = 0;
	extended.pages_swapped_out = 0;
	extended.pages_shared_now_private = 0;
	extended.pages_dirtied = 0;
	extended.pages_reusable = 0;
	extended.external_pager = 0;
	extended.shadow_depth = 0;
	extended.share_mode = SM_EMPTY;
	extended.ref_count = 0;

	if (not_in_kdp) {
		if (!curr_entry->is_sub_map) {
			vm_map_offset_t range_start, range_end;
			/* clip the walk to the visible portion of the entry */
			range_start = MAX((curr_address - curr_max_below),
			    curr_entry->vme_start);
			range_end = MIN((curr_address + curr_max_above),
			    curr_entry->vme_end);
			vm_map_region_walk(curr_map,
			    range_start,
			    curr_entry,
			    (VME_OFFSET(curr_entry) +
			    (range_start -
			    curr_entry->vme_start)),
			    range_end - range_start,
			    &extended,
			    look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
			if (extended.external_pager &&
			    extended.ref_count == 2 &&
			    extended.share_mode == SM_SHARED) {
				extended.share_mode = SM_PRIVATE;
			}
			if (submap_needed_copy) {
				extended.share_mode = SM_COW;
			}
		} else {
			if (curr_entry->use_pmap) {
				extended.share_mode = SM_TRUESHARED;
			} else {
				extended.share_mode = SM_PRIVATE;
			}
			extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
		}
	}

	if (look_for_pages) {
		submap_info->pages_resident = extended.pages_resident;
		submap_info->pages_swapped_out = extended.pages_swapped_out;
		submap_info->pages_shared_now_private =
		    extended.pages_shared_now_private;
		submap_info->pages_dirtied = extended.pages_dirtied;
		submap_info->external_pager = extended.external_pager;
		submap_info->shadow_depth = extended.shadow_depth;
		submap_info->share_mode = extended.share_mode;
		submap_info->ref_count = extended.ref_count;

		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
			submap_info->pages_reusable = extended.pages_reusable;
		}
		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
			if (curr_entry->is_sub_map) {
				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRPERM(VME_SUBMAP(curr_entry));
			} else if (VME_OBJECT(curr_entry)) {
				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRPERM(VME_OBJECT(curr_entry));
			} else {
				submap_info->object_id_full = 0ull;
			}
		}
	} else {
		short_info->external_pager = extended.external_pager;
		short_info->shadow_depth = extended.shadow_depth;
		short_info->share_mode = extended.share_mode;
		short_info->ref_count = extended.ref_count;
	}

	if (not_in_kdp) {
		vm_map_unlock_read(curr_map);
	}

	return KERN_SUCCESS;
}
15171
/*
 * vm_region:
 *
 * User call to obtain information about a region in
 * a task's address map.  Supported flavors are
 * VM_REGION_BASIC_INFO (and its 64-bit variant),
 * VM_REGION_EXTENDED_INFO (and its legacy variant),
 * and VM_REGION_TOP_INFO.
 *
 * XXX The reserved and behavior fields cannot be filled
 * in until the vm merge from the IK is completed, and
 * vm_reserve is implemented.
 */
15183
15184 kern_return_t
vm_map_region(vm_map_t map,vm_map_offset_t * address,vm_map_size_t * size,vm_region_flavor_t flavor,vm_region_info_t info,mach_msg_type_number_t * count,mach_port_t * object_name)15185 vm_map_region(
15186 vm_map_t map,
15187 vm_map_offset_t *address, /* IN/OUT */
15188 vm_map_size_t *size, /* OUT */
15189 vm_region_flavor_t flavor, /* IN */
15190 vm_region_info_t info, /* OUT */
15191 mach_msg_type_number_t *count, /* IN/OUT */
15192 mach_port_t *object_name) /* OUT */
15193 {
15194 vm_map_entry_t tmp_entry;
15195 vm_map_entry_t entry;
15196 vm_map_offset_t start;
15197
15198 if (map == VM_MAP_NULL) {
15199 return KERN_INVALID_ARGUMENT;
15200 }
15201
15202 switch (flavor) {
15203 case VM_REGION_BASIC_INFO:
15204 /* legacy for old 32-bit objects info */
15205 {
15206 vm_region_basic_info_t basic;
15207
15208 if (*count < VM_REGION_BASIC_INFO_COUNT) {
15209 return KERN_INVALID_ARGUMENT;
15210 }
15211
15212 basic = (vm_region_basic_info_t) info;
15213 *count = VM_REGION_BASIC_INFO_COUNT;
15214
15215 vm_map_lock_read(map);
15216
15217 start = *address;
15218 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15219 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15220 vm_map_unlock_read(map);
15221 return KERN_INVALID_ADDRESS;
15222 }
15223 } else {
15224 entry = tmp_entry;
15225 }
15226
15227 start = entry->vme_start;
15228
15229 basic->offset = (uint32_t)VME_OFFSET(entry);
15230 basic->protection = entry->protection;
15231 basic->inheritance = entry->inheritance;
15232 basic->max_protection = entry->max_protection;
15233 basic->behavior = entry->behavior;
15234 basic->user_wired_count = entry->user_wired_count;
15235 basic->reserved = entry->is_sub_map;
15236 *address = start;
15237 *size = (entry->vme_end - start);
15238
15239 if (object_name) {
15240 *object_name = IP_NULL;
15241 }
15242 if (entry->is_sub_map) {
15243 basic->shared = FALSE;
15244 } else {
15245 basic->shared = entry->is_shared;
15246 }
15247
15248 vm_map_unlock_read(map);
15249 return KERN_SUCCESS;
15250 }
15251
15252 case VM_REGION_BASIC_INFO_64:
15253 {
15254 vm_region_basic_info_64_t basic;
15255
15256 if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
15257 return KERN_INVALID_ARGUMENT;
15258 }
15259
15260 basic = (vm_region_basic_info_64_t) info;
15261 *count = VM_REGION_BASIC_INFO_COUNT_64;
15262
15263 vm_map_lock_read(map);
15264
15265 start = *address;
15266 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15267 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15268 vm_map_unlock_read(map);
15269 return KERN_INVALID_ADDRESS;
15270 }
15271 } else {
15272 entry = tmp_entry;
15273 }
15274
15275 start = entry->vme_start;
15276
15277 basic->offset = VME_OFFSET(entry);
15278 basic->protection = entry->protection;
15279 basic->inheritance = entry->inheritance;
15280 basic->max_protection = entry->max_protection;
15281 basic->behavior = entry->behavior;
15282 basic->user_wired_count = entry->user_wired_count;
15283 basic->reserved = entry->is_sub_map;
15284 *address = start;
15285 *size = (entry->vme_end - start);
15286
15287 if (object_name) {
15288 *object_name = IP_NULL;
15289 }
15290 if (entry->is_sub_map) {
15291 basic->shared = FALSE;
15292 } else {
15293 basic->shared = entry->is_shared;
15294 }
15295
15296 vm_map_unlock_read(map);
15297 return KERN_SUCCESS;
15298 }
15299 case VM_REGION_EXTENDED_INFO:
15300 if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
15301 return KERN_INVALID_ARGUMENT;
15302 }
15303 OS_FALLTHROUGH;
15304 case VM_REGION_EXTENDED_INFO__legacy:
15305 if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
15306 return KERN_INVALID_ARGUMENT;
15307 }
15308
15309 {
15310 vm_region_extended_info_t extended;
15311 mach_msg_type_number_t original_count;
15312 int effective_page_size, effective_page_shift;
15313
15314 extended = (vm_region_extended_info_t) info;
15315
15316 effective_page_shift = vm_self_region_page_shift(map);
15317 effective_page_size = (1 << effective_page_shift);
15318
15319 vm_map_lock_read(map);
15320
15321 start = *address;
15322 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15323 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15324 vm_map_unlock_read(map);
15325 return KERN_INVALID_ADDRESS;
15326 }
15327 } else {
15328 entry = tmp_entry;
15329 }
15330 start = entry->vme_start;
15331
15332 extended->protection = entry->protection;
15333 extended->user_tag = VME_ALIAS(entry);
15334 extended->pages_resident = 0;
15335 extended->pages_swapped_out = 0;
15336 extended->pages_shared_now_private = 0;
15337 extended->pages_dirtied = 0;
15338 extended->external_pager = 0;
15339 extended->shadow_depth = 0;
15340
15341 original_count = *count;
15342 if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
15343 *count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
15344 } else {
15345 extended->pages_reusable = 0;
15346 *count = VM_REGION_EXTENDED_INFO_COUNT;
15347 }
15348
15349 vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
15350
15351 if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
15352 extended->share_mode = SM_PRIVATE;
15353 }
15354
15355 if (object_name) {
15356 *object_name = IP_NULL;
15357 }
15358 *address = start;
15359 *size = (entry->vme_end - start);
15360
15361 vm_map_unlock_read(map);
15362 return KERN_SUCCESS;
15363 }
15364 case VM_REGION_TOP_INFO:
15365 {
15366 vm_region_top_info_t top;
15367
15368 if (*count < VM_REGION_TOP_INFO_COUNT) {
15369 return KERN_INVALID_ARGUMENT;
15370 }
15371
15372 top = (vm_region_top_info_t) info;
15373 *count = VM_REGION_TOP_INFO_COUNT;
15374
15375 vm_map_lock_read(map);
15376
15377 start = *address;
15378 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15379 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15380 vm_map_unlock_read(map);
15381 return KERN_INVALID_ADDRESS;
15382 }
15383 } else {
15384 entry = tmp_entry;
15385 }
15386 start = entry->vme_start;
15387
15388 top->private_pages_resident = 0;
15389 top->shared_pages_resident = 0;
15390
15391 vm_map_region_top_walk(entry, top);
15392
15393 if (object_name) {
15394 *object_name = IP_NULL;
15395 }
15396 *address = start;
15397 *size = (entry->vme_end - start);
15398
15399 vm_map_unlock_read(map);
15400 return KERN_SUCCESS;
15401 }
15402 default:
15403 return KERN_INVALID_ARGUMENT;
15404 }
15405 }
15406
/*
 * OBJ_RESIDENT_COUNT: number of resident pages of "obj" attributable to a
 * mapping of "entry_size" pages.  For an "all_reusable" object only the
 * wired pages count; otherwise it is resident minus reusable pages.
 * Either way, the result is capped at the size of the mapping.
 */
#define OBJ_RESIDENT_COUNT(obj, entry_size)                             \
	MIN((entry_size),                                               \
	((obj)->all_reusable ?                                          \
	(obj)->wired_page_count :                                       \
	(obj)->resident_page_count - (obj)->reusable_page_count))
15412
/*
 * vm_map_region_top_walk:
 *	Fills in the VM_REGION_TOP_INFO data for "entry": share mode,
 *	reference count, private/shared resident page counts and an
 *	(address-permuted, truncated) object id.  Walks the entry's
 *	object shadow chain with hand-over-hand object locking.
 */
void
vm_map_region_top_walk(
	vm_map_entry_t entry,
	vm_region_top_info_t top)
{
	/* submaps and unbacked entries have nothing to report */
	if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
		top->share_mode = SM_EMPTY;
		top->ref_count = 0;
		top->obj_id = 0;
		return;
	}

	{
		struct vm_object *obj, *tmp_obj;
		int ref_count;
		uint32_t entry_size;

		/* size of the mapping, in pages */
		entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);

		obj = VME_OBJECT(entry);

		vm_object_lock(obj);

		/*
		 * Discount one reference while paging is in progress —
		 * presumably the pager's transient reference; same idiom
		 * is used throughout this file.
		 */
		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
			ref_count--;
		}

		assert(obj->reusable_page_count <= obj->resident_page_count);
		if (obj->shadow) {
			/* shadowed object: the mapping is copy-on-write */
			if (ref_count == 1) {
				top->private_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			}
			top->ref_count = ref_count;
			top->share_mode = SM_COW;

			/* walk the shadow chain, hand-over-hand locking */
			while ((tmp_obj = obj->shadow)) {
				vm_object_lock(tmp_obj);
				vm_object_unlock(obj);
				obj = tmp_obj;

				if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
					ref_count--;
				}

				assert(obj->reusable_page_count <= obj->resident_page_count);
				top->shared_pages_resident +=
				    OBJ_RESIDENT_COUNT(obj, entry_size);
				top->ref_count += ref_count - 1;
			}
		} else {
			if (entry->superpage_size) {
				top->share_mode = SM_LARGE_PAGE;
				top->shared_pages_resident = 0;
				top->private_pages_resident = entry_size;
			} else if (entry->needs_copy) {
				top->share_mode = SM_COW;
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				/* a named object with 2 refs still counts as private */
				if (ref_count == 1 ||
				    (ref_count == 2 && obj->named)) {
					top->share_mode = SM_PRIVATE;
					top->private_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				} else {
					top->share_mode = SM_SHARED;
					top->shared_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				}
			}
			top->ref_count = ref_count;
		}
		/* XXX K64: obj_id will be truncated */
		top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRPERM(obj);

		vm_object_unlock(obj);
	}
}
15497
/*
 * vm_map_region_walk:
 *	Fills in the extended region info for the range
 *	["va", "va" + "range") backed by "entry": per-page residency /
 *	dirtiness / reuse counts (when "look_for_pages"), pager type,
 *	shadow chain depth, share mode and reference count.
 *	"offset" is the offset of "va" within the entry's VM object.
 */
void
vm_map_region_walk(
	vm_map_t map,
	vm_map_offset_t va,
	vm_map_entry_t entry,
	vm_object_offset_t offset,
	vm_object_size_t range,
	vm_region_extended_info_t extended,
	boolean_t look_for_pages,
	mach_msg_type_number_t count)
{
	struct vm_object *obj, *tmp_obj;
	vm_map_offset_t last_offset;
	int i;
	int ref_count;
	struct vm_object *shadow_object;
	unsigned short shadow_depth;
	boolean_t do_region_footprint;
	int effective_page_size, effective_page_shift;
	vm_map_offset_t effective_page_mask;

	do_region_footprint = task_self_region_footprint();

	/* submaps, unbacked entries and (non-superpage) physically
	 * contiguous objects have nothing to report */
	if ((entry->is_sub_map) ||
	    (VME_OBJECT(entry) == 0) ||
	    (VME_OBJECT(entry)->phys_contiguous &&
	    !entry->superpage_size)) {
		extended->share_mode = SM_EMPTY;
		extended->ref_count = 0;
		return;
	}

	if (entry->superpage_size) {
		extended->shadow_depth = 0;
		extended->share_mode = SM_LARGE_PAGE;
		extended->ref_count = 1;
		extended->external_pager = 0;

		/* TODO4K: Superpage in 4k mode? */
		extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
		extended->shadow_depth = 0;
		return;
	}

	effective_page_shift = vm_self_region_page_shift(map);
	effective_page_size = (1 << effective_page_shift);
	effective_page_mask = effective_page_size - 1;

	offset = vm_map_trunc_page(offset, effective_page_mask);

	obj = VME_OBJECT(entry);

	vm_object_lock(obj);

	/* discount the transient reference held while paging is in progress */
	if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
		ref_count--;
	}

	if (look_for_pages) {
		/* scan the range one effective page at a time */
		for (last_offset = offset + range;
		    offset < last_offset;
		    offset += effective_page_size, va += effective_page_size) {
			if (do_region_footprint) {
				int disp;

				disp = 0;
				if (map->has_corpse_footprint) {
					/*
					 * Query the page info data we saved
					 * while forking the corpse.
					 */
					vm_map_corpse_footprint_query_page_info(
						map,
						va,
						&disp);
				} else {
					/*
					 * Query the pmap.
					 */
					vm_map_footprint_query_page_info(
						map,
						entry,
						va,
						&disp);
				}
				if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
					extended->pages_resident++;
				}
				if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
					extended->pages_reusable++;
				}
				if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
					extended->pages_dirtied++;
				}
				if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
					extended->pages_swapped_out++;
				}
				continue;
			}

			vm_map_region_look_for_page(map, va, obj,
			    vm_object_trunc_page(offset), ref_count,
			    0, extended, count);
		}

		if (do_region_footprint) {
			/* footprint mode still needs the shadow-chain info:
			 * jump into the "else" branch below */
			goto collect_object_info;
		}
	} else {
collect_object_info:
		/* measure the shadow chain depth and detect external pagers,
		 * hand-over-hand locking down the chain */
		shadow_object = obj->shadow;
		shadow_depth = 0;

		if (!(obj->internal)) {
			extended->external_pager = 1;
		}

		if (shadow_object != VM_OBJECT_NULL) {
			vm_object_lock(shadow_object);
			for (;
			    shadow_object != VM_OBJECT_NULL;
			    shadow_depth++) {
				vm_object_t next_shadow;

				if (!(shadow_object->internal)) {
					extended->external_pager = 1;
				}

				next_shadow = shadow_object->shadow;
				if (next_shadow) {
					vm_object_lock(next_shadow);
				}
				vm_object_unlock(shadow_object);
				shadow_object = next_shadow;
			}
		}
		extended->shadow_depth = shadow_depth;
	}

	/* derive the share mode from shadowing / COW state and refcount */
	if (extended->shadow_depth || entry->needs_copy) {
		extended->share_mode = SM_COW;
	} else {
		if (ref_count == 1) {
			extended->share_mode = SM_PRIVATE;
		} else {
			if (obj->true_share) {
				extended->share_mode = SM_TRUESHARED;
			} else {
				extended->share_mode = SM_SHARED;
			}
		}
	}
	extended->ref_count = ref_count - extended->shadow_depth;

	/* accumulate the refcounts of the whole shadow chain */
	for (i = 0; i < extended->shadow_depth; i++) {
		if ((tmp_obj = obj->shadow) == 0) {
			break;
		}
		vm_object_lock(tmp_obj);
		vm_object_unlock(obj);

		if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) {
			ref_count--;
		}

		extended->ref_count += ref_count;
		obj = tmp_obj;
	}
	vm_object_unlock(obj);

	if (extended->share_mode == SM_SHARED) {
		/*
		 * Check whether all the references to the object come from
		 * entries of this same map, i.e. the sharing is an alias
		 * within one address space.
		 */
		vm_map_entry_t cur;
		vm_map_entry_t last;
		int my_refs;

		obj = VME_OBJECT(entry);
		last = vm_map_to_entry(map);
		my_refs = 0;

		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
			ref_count--;
		}
		for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
			my_refs += vm_map_region_count_obj_refs(cur, obj);
		}

		if (my_refs == ref_count) {
			extended->share_mode = SM_PRIVATE_ALIASED;
		} else if (my_refs > 1) {
			extended->share_mode = SM_SHARED_ALIASED;
		}
	}
}
15691
15692
/*
 * vm_map_region_look_for_page:
 *
 * Look for the page at "offset" in "object" and down its shadow chain,
 * accumulating statistics (resident, dirtied, reusable, swapped out,
 * shadow depth) into "extended".
 *
 * "object" is locked on entry and locked on return; intermediate shadow
 * objects are locked hand-over-hand during the descent and unlocked
 * before returning.  "max_refcnt" is the largest ref_count seen so far
 * along the chain and decides whether a resident page found below the
 * top object counts as "shared now private".
 */
static void
vm_map_region_look_for_page(
	__unused vm_map_t		map,
	__unused vm_map_offset_t	va,
	vm_object_t			object,
	vm_object_offset_t		offset,
	int				max_refcnt,
	unsigned short			depth,
	vm_region_extended_info_t	extended,
	mach_msg_type_number_t		count)
{
	vm_page_t	p;
	vm_object_t	shadow;
	int		ref_count;
	vm_object_t	caller_object;

	shadow = object->shadow;
	caller_object = object;


	while (TRUE) {
		if (!(object->internal)) {
			/* some level of this chain is backed by an external pager */
			extended->external_pager = 1;
		}

		if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
			/*
			 * Found a resident page.  If there are more shadow
			 * levels below but the whole chain is referenced only
			 * once, a copy-on-write fault would privatize it.
			 */
			if (shadow && (max_refcnt == 1)) {
				extended->pages_shared_now_private++;
			}

			/*
			 * Dirty if either the VM layer or the pmap has seen
			 * a modification of this (non-fictitious) page.
			 */
			if (!p->vmp_fictitious &&
			    (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
				extended->pages_dirtied++;
			} else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
				/* caller's info struct is new enough to carry "reusable" */
				if (p->vmp_reusable || object->all_reusable) {
					extended->pages_reusable++;
				}
			}

			extended->pages_resident++;

			if (object != caller_object) {
				vm_object_unlock(object);
			}

			return;
		}
		/*
		 * Not resident here: see if the compressor pager holds a
		 * copy of this page for this object.
		 */
		if (object->internal &&
		    object->alive &&
		    !object->terminating &&
		    object->pager_ready) {
			if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset)
			    == VM_EXTERNAL_STATE_EXISTS) {
				/* the pager has that page */
				extended->pages_swapped_out++;
				if (object != caller_object) {
					vm_object_unlock(object);
				}
				return;
			}
		}

		if (shadow) {
			/* descend: take the shadow's lock before dropping ours */
			vm_object_lock(shadow);

			/* don't count a reference held by an in-flight paging op */
			if ((ref_count = shadow->ref_count) > 1 && shadow->paging_in_progress) {
				ref_count--;
			}

			if (++depth > extended->shadow_depth) {
				extended->shadow_depth = depth;
			}

			if (ref_count > max_refcnt) {
				max_refcnt = ref_count;
			}

			if (object != caller_object) {
				vm_object_unlock(object);
			}

			/* translate the lookup offset into the shadow's space */
			offset = offset + object->vo_shadow_offset;
			object = shadow;
			shadow = object->shadow;
			continue;
		}
		if (object != caller_object) {
			vm_object_unlock(object);
		}
		break;
	}
}
15788
15789 static int
vm_map_region_count_obj_refs(vm_map_entry_t entry,vm_object_t object)15790 vm_map_region_count_obj_refs(
15791 vm_map_entry_t entry,
15792 vm_object_t object)
15793 {
15794 int ref_count;
15795 vm_object_t chk_obj;
15796 vm_object_t tmp_obj;
15797
15798 if (entry->is_sub_map || VME_OBJECT(entry) == VM_OBJECT_NULL) {
15799 return 0;
15800 }
15801
15802 ref_count = 0;
15803 chk_obj = VME_OBJECT(entry);
15804 vm_object_lock(chk_obj);
15805
15806 while (chk_obj) {
15807 if (chk_obj == object) {
15808 ref_count++;
15809 }
15810 tmp_obj = chk_obj->shadow;
15811 if (tmp_obj) {
15812 vm_object_lock(tmp_obj);
15813 }
15814 vm_object_unlock(chk_obj);
15815
15816 chk_obj = tmp_obj;
15817 }
15818
15819 return ref_count;
15820 }
15821
15822
15823 /*
15824 * Routine: vm_map_simplify
15825 *
15826 * Description:
15827 * Attempt to simplify the map representation in
15828 * the vicinity of the given starting address.
15829 * Note:
15830 * This routine is intended primarily to keep the
15831 * kernel maps more compact -- they generally don't
15832 * benefit from the "expand a map entry" technology
15833 * at allocation time because the adjacent entry
15834 * is often wired down.
15835 */
/*
 * vm_map_simplify_entry:
 *
 * Attempt to coalesce "this_entry" with the entry immediately before
 * it in "map".  The two entries can merge only if they are virtually
 * contiguous, map adjacent offsets of the same object (or submap),
 * and agree on every attribute that affects faulting, protection,
 * wiring or accounting.  On success the previous entry is unlinked
 * and disposed of, and "this_entry" is extended downward to cover
 * its range.
 *
 * The map must be locked for writing by the caller.
 */
void
vm_map_simplify_entry(
	vm_map_t	map,
	vm_map_entry_t	this_entry)
{
	vm_map_entry_t	prev_entry;

	prev_entry = this_entry->vme_prev;

	if ((this_entry != vm_map_to_entry(map)) &&
	    (prev_entry != vm_map_to_entry(map)) &&

	    /* virtually contiguous... */
	    (prev_entry->vme_end == this_entry->vme_start) &&

	    /* ...and backed by adjacent ranges of the same object/submap */
	    (prev_entry->is_sub_map == this_entry->is_sub_map) &&
	    (prev_entry->vme_object_value == this_entry->vme_object_value) &&
	    (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
	    ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
	    prev_entry->vme_start))
	    == VME_OFFSET(this_entry)) &&

	    /* every behavior/protection/accounting attribute must match */
	    (prev_entry->behavior == this_entry->behavior) &&
	    (prev_entry->needs_copy == this_entry->needs_copy) &&
	    (prev_entry->protection == this_entry->protection) &&
	    (prev_entry->max_protection == this_entry->max_protection) &&
	    (prev_entry->inheritance == this_entry->inheritance) &&
	    (prev_entry->use_pmap == this_entry->use_pmap) &&
	    (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
	    (prev_entry->no_cache == this_entry->no_cache) &&
	    (prev_entry->vme_permanent == this_entry->vme_permanent) &&
	    (prev_entry->map_aligned == this_entry->map_aligned) &&
	    (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
	    (prev_entry->used_for_jit == this_entry->used_for_jit) &&
#if __arm64e__
	    (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
#endif
	    (prev_entry->csm_associated == this_entry->csm_associated) &&
	    (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
	    (prev_entry->iokit_acct == this_entry->iokit_acct) &&
	    (prev_entry->vme_resilient_codesign ==
	    this_entry->vme_resilient_codesign) &&
	    (prev_entry->vme_resilient_media ==
	    this_entry->vme_resilient_media) &&
	    (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
	    (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&

	    (prev_entry->wired_count == this_entry->wired_count) &&
	    (prev_entry->user_wired_count == this_entry->user_wired_count) &&

	    /* neither entry may be in a transient state */
	    ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
	    (prev_entry->in_transition == FALSE) &&
	    (this_entry->in_transition == FALSE) &&
	    (prev_entry->needs_wakeup == FALSE) &&
	    (this_entry->needs_wakeup == FALSE) &&
	    (prev_entry->is_shared == this_entry->is_shared) &&
	    (prev_entry->superpage_size == FALSE) &&
	    (this_entry->superpage_size == FALSE)
	    ) {
		if (prev_entry->vme_permanent) {
			/* "permanent" ownership moves to the surviving entry */
			assert(this_entry->vme_permanent);
			prev_entry->vme_permanent = false;
		}
		vm_map_store_entry_unlink(map, prev_entry, true);
		assert(prev_entry->vme_start < this_entry->vme_end);
		if (prev_entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
			    VM_MAP_PAGE_MASK(map)));
		}
		/* grow "this_entry" downward over the defunct entry's range */
		this_entry->vme_start = prev_entry->vme_start;
		VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));

		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, this_entry, TRUE);
		}

		/* drop the object/submap reference the defunct entry held */
		if (prev_entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(prev_entry));
		} else {
			vm_object_deallocate(VME_OBJECT(prev_entry));
		}
		vm_map_entry_dispose(prev_entry);
		SAVE_HINT_MAP_WRITE(map, this_entry);
	}
}
15920
15921 void
vm_map_simplify(vm_map_t map,vm_map_offset_t start)15922 vm_map_simplify(
15923 vm_map_t map,
15924 vm_map_offset_t start)
15925 {
15926 vm_map_entry_t this_entry;
15927
15928 vm_map_lock(map);
15929 if (vm_map_lookup_entry(map, start, &this_entry)) {
15930 vm_map_simplify_entry(map, this_entry);
15931 vm_map_simplify_entry(map, this_entry->vme_next);
15932 }
15933 vm_map_unlock(map);
15934 }
15935
15936 static void
vm_map_simplify_range(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)15937 vm_map_simplify_range(
15938 vm_map_t map,
15939 vm_map_offset_t start,
15940 vm_map_offset_t end)
15941 {
15942 vm_map_entry_t entry;
15943
15944 /*
15945 * The map should be locked (for "write") by the caller.
15946 */
15947
15948 if (start >= end) {
15949 /* invalid address range */
15950 return;
15951 }
15952
15953 start = vm_map_trunc_page(start,
15954 VM_MAP_PAGE_MASK(map));
15955 end = vm_map_round_page(end,
15956 VM_MAP_PAGE_MASK(map));
15957
15958 if (!vm_map_lookup_entry(map, start, &entry)) {
15959 /* "start" is not mapped and "entry" ends before "start" */
15960 if (entry == vm_map_to_entry(map)) {
15961 /* start with first entry in the map */
15962 entry = vm_map_first_entry(map);
15963 } else {
15964 /* start with next entry */
15965 entry = entry->vme_next;
15966 }
15967 }
15968
15969 while (entry != vm_map_to_entry(map) &&
15970 entry->vme_start <= end) {
15971 /* try and coalesce "entry" with its previous entry */
15972 vm_map_simplify_entry(map, entry);
15973 entry = entry->vme_next;
15974 }
15975 }
15976
15977
15978 /*
15979 * Routine: vm_map_machine_attribute
15980 * Purpose:
15981 * Provide machine-specific attributes to mappings,
15982 * such as cachability etc. for machines that provide
15983 * them. NUMA architectures and machines with big/strange
15984 * caches will use this.
15985 * Note:
15986 * Responsibilities for locking and checking are handled here,
15987 * everything else in the pmap module. If any non-volatile
15988 * information must be kept, the pmap module should handle
15989 * it itself. [This assumes that attributes do not
15990 * need to be inherited, which seems ok to me]
15991 */
/*
 * vm_map_machine_attribute:
 *
 * Apply a machine-specific attribute (cachability etc.) to the
 * mappings in [start, end).  For anything other than MATTR_CACHE the
 * request is handed straight to the pmap layer; cache attributes
 * require walking the entries to find the physical pages, chasing
 * shadow chains as needed.
 *
 * Returns KERN_INVALID_ADDRESS for out-of-range or overflowing
 * ranges, KERN_FAILURE if a hole is encountered, otherwise the
 * pmap layer's result.
 */
kern_return_t
vm_map_machine_attribute(
	vm_map_t			map,
	vm_map_offset_t			start,
	vm_map_offset_t			end,
	vm_machine_attribute_t		attribute,
	vm_machine_attribute_val_t*	value)          /* IN/OUT */
{
	kern_return_t	ret;
	vm_map_size_t	sync_size;
	vm_map_entry_t	entry;

	if (start < vm_map_min(map) || end > vm_map_max(map)) {
		return KERN_INVALID_ADDRESS;
	}
	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}

	/* Figure how much memory we need to flush (in page increments) */
	sync_size = end - start;

	vm_map_lock(map);

	if (attribute != MATTR_CACHE) {
		/* If we don't have to find physical addresses, we */
		/* don't have to do an explicit traversal here.    */
		ret = pmap_attribute(map->pmap, start, end - start,
		    attribute, value);
		vm_map_unlock(map);
		return ret;
	}

	ret = KERN_SUCCESS;                                             /* Assume it all worked */

	while (sync_size) {
		if (vm_map_lookup_entry(map, start, &entry)) {
			vm_map_size_t   sub_size;
			/* clamp this pass to the end of the current entry */
			if ((entry->vme_end - start) > sync_size) {
				sub_size = sync_size;
				sync_size = 0;
			} else {
				sub_size = entry->vme_end - start;
				sync_size -= sub_size;
			}
			if (entry->is_sub_map) {
				vm_map_offset_t sub_start;
				vm_map_offset_t sub_end;

				/* recurse into the submap's address space */
				/* NOTE(review): the recursive call's return value
				 * is discarded, so submap failures are not
				 * reported to the caller — confirm intended. */
				sub_start = (start - entry->vme_start)
				    + VME_OFFSET(entry);
				sub_end = sub_start + sub_size;
				vm_map_machine_attribute(
					VME_SUBMAP(entry),
					sub_start,
					sub_end,
					attribute, value);
			} else if (VME_OBJECT(entry)) {
				vm_page_t               m;
				vm_object_t             object;
				vm_object_t             base_object;
				vm_object_t             last_object;
				vm_object_offset_t      offset;
				vm_object_offset_t      base_offset;
				vm_map_size_t           range;
				range = sub_size;
				offset = (start - entry->vme_start)
				    + VME_OFFSET(entry);
				offset = vm_object_trunc_page(offset);
				base_offset = offset;
				object = VME_OBJECT(entry);
				base_object = object;
				last_object = NULL;

				vm_object_lock(object);

				while (range) {
					m = vm_page_lookup(
						object, offset);

					if (m && !m->vmp_fictitious) {
						/* found the physical page: sync its cache state */
						ret =
						    pmap_attribute_cache_sync(
							VM_PAGE_GET_PHYS_PAGE(m),
							PAGE_SIZE,
							attribute, value);
					} else if (object->shadow) {
						/* not here: descend the shadow chain,
						 * locking child before unlocking parent */
						offset = offset + object->vo_shadow_offset;
						last_object = object;
						object = object->shadow;
						vm_object_lock(last_object->shadow);
						vm_object_unlock(last_object);
						continue;
					}
					if (range < PAGE_SIZE) {
						range = 0;
					} else {
						range -= PAGE_SIZE;
					}

					/* rewind to the top of the chain for the next page */
					if (base_object != object) {
						vm_object_unlock(object);
						vm_object_lock(base_object);
						object = base_object;
					}
					/* Bump to the next page */
					base_offset += PAGE_SIZE;
					offset = base_offset;
				}
				vm_object_unlock(object);
			}
			start += sub_size;
		} else {
			/* hole in the address range */
			vm_map_unlock(map);
			return KERN_FAILURE;
		}
	}

	vm_map_unlock(map);

	return ret;
}
16114
16115 /*
16116 * vm_map_behavior_set:
16117 *
16118 * Sets the paging reference behavior of the specified address
16119 * range in the target map. Paging reference behavior affects
16120 * how pagein operations resulting from faults on the map will be
16121 * clustered.
16122 */
/*
 * vm_map_behavior_set:
 *
 * Sets the paging reference behavior of the specified address
 * range in the target map.  Paging reference behavior affects
 * how pagein operations resulting from faults on the map will be
 * clustered.  Persistent behaviors are recorded in the map entries;
 * the "immediate action" behaviors are dispatched to their handlers.
 */
kern_return_t
vm_map_behavior_set(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_behavior_t   new_behavior)
{
	vm_map_entry_t  entry;
	vm_map_entry_t  temp_entry;

	if (start > end ||
	    start < vm_map_min(map) ||
	    end > vm_map_max(map)) {
		return KERN_NO_SPACE;
	}
	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}

	switch (new_behavior) {
	/*
	 * This first block of behaviors all set a persistent state on the specified
	 * memory range.  All we have to do here is to record the desired behavior
	 * in the vm_map_entry_t's.
	 */

	case VM_BEHAVIOR_DEFAULT:
	case VM_BEHAVIOR_RANDOM:
	case VM_BEHAVIOR_SEQUENTIAL:
	case VM_BEHAVIOR_RSEQNTL:
	case VM_BEHAVIOR_ZERO_WIRED_PAGES:
		vm_map_lock(map);

		/*
		 * The entire address range must be valid for the map.
		 * Note that vm_map_range_check() does a
		 * vm_map_lookup_entry() internally and returns the
		 * entry containing the start of the address range if
		 * the entire range is valid.
		 */
		if (vm_map_range_check(map, start, end, &temp_entry)) {
			entry = temp_entry;
			/* clip so the new behavior begins exactly at "start" */
			vm_map_clip_start(map, entry, start);
		} else {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
			/* clip so the new behavior stops exactly at "end" */
			vm_map_clip_end(map, entry, end);
			if (entry->is_sub_map) {
				assert(!entry->use_pmap);
			}

			if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
				/* tracked by a dedicated bit, not the behavior field */
				entry->zero_wired_pages = TRUE;
			} else {
				entry->behavior = new_behavior;
			}
			entry = entry->vme_next;
		}

		vm_map_unlock(map);
		break;

	/*
	 * The rest of these are different from the above in that they cause
	 * an immediate action to take place as opposed to setting a behavior that
	 * affects future actions.
	 */

	case VM_BEHAVIOR_WILLNEED:
		return vm_map_willneed(map, start, end);

	case VM_BEHAVIOR_DONTNEED:
		return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);

	case VM_BEHAVIOR_FREE:
		return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);

	case VM_BEHAVIOR_REUSABLE:
		return vm_map_reusable_pages(map, start, end);

	case VM_BEHAVIOR_REUSE:
		return vm_map_reuse_pages(map, start, end);

	case VM_BEHAVIOR_CAN_REUSE:
		return vm_map_can_reuse(map, start, end);

#if MACH_ASSERT
	case VM_BEHAVIOR_PAGEOUT:
		return vm_map_pageout(map, start, end);
#endif /* MACH_ASSERT */

	default:
		return KERN_INVALID_ARGUMENT;
	}

	return KERN_SUCCESS;
}
16223
16224
16225 /*
16226 * Internals for madvise(MADV_WILLNEED) system call.
16227 *
16228 * The implementation is to do:-
16229 * a) read-ahead if the mapping corresponds to a mapped regular file
16230 * b) or, fault in the pages (zero-fill, decompress etc) if it's an anonymous mapping
16231 */
16232
16233
/*
 * vm_map_willneed:
 *
 * Internals for madvise(MADV_WILLNEED).  For each entry in
 * [start, end):
 *   a) anonymous/internal memory is pre-faulted page by page;
 *   b) file-backed memory is read ahead asynchronously via the pager.
 * Submaps and unreadable entries are skipped.  The map's read lock is
 * dropped around the pre-fault / pager I/O and re-taken afterwards,
 * so a hole appearing in the meantime returns KERN_INVALID_ADDRESS.
 */
static kern_return_t
vm_map_willneed(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end
	)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	memory_object_t                 pager;
	struct vm_object_fault_info     fault_info = {};
	kern_return_t                   kr;
	vm_object_size_t                len;
	vm_object_offset_t              offset;

	fault_info.interruptible = THREAD_UNINT;                /* ignored value */
	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
	fault_info.stealth = TRUE;

	/*
	 * The MADV_WILLNEED operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && start < end;) {
		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.  After that, the offset will always be zero to
		 * correspond to the beginning of the current vm_map_entry.
		 */
		offset = (start - entry->vme_start) + VME_OFFSET(entry);

		/*
		 * Set the length so we don't go beyond the end of the
		 * map_entry or beyond the end of the range we were given.
		 * This range could span also multiple map entries all of which
		 * map different files, so make sure we only do the right amount
		 * of I/O for each object.  Note that it's possible for there
		 * to be multiple map entries all referring to the same object
		 * but with different page permissions, but it's not worth
		 * trying to optimize that case.
		 */
		len = MIN(entry->vme_end - start, end - start);

		if ((vm_size_t) len != len) {
			/* 32-bit overflow */
			len = (vm_size_t) (0 - PAGE_SIZE);
		}
		fault_info.cluster_size = (vm_size_t) len;
		fault_info.lo_offset = offset;
		fault_info.hi_offset = offset + len;
		fault_info.user_tag = VME_ALIAS(entry);
		fault_info.pmap_options = 0;
		if (entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap)) {
			/* entry uses alternate accounting */
			fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
		}
		fault_info.fi_xnu_user_debug = entry->vme_xnu_user_debug;

		/*
		 * If the entry is a submap OR there's no read permission
		 * to this mapping, then just skip it.
		 */
		if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
			entry = entry->vme_next;
			start = entry->vme_start;
			continue;
		}

		object = VME_OBJECT(entry);

		if (object == NULL ||
		    (object && object->internal)) {
			/*
			 * Memory range backed by anonymous memory.
			 */
			vm_size_t region_size = 0, effective_page_size = 0;
			vm_map_offset_t addr = 0, effective_page_mask = 0;

			region_size = len;
			addr = start;

			effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
			effective_page_size = effective_page_mask + 1;

			vm_map_unlock_read(map);

			/* fault in the range one page at a time */
			while (region_size) {
				vm_pre_fault(
					vm_map_trunc_page(addr, effective_page_mask),
					VM_PROT_READ | VM_PROT_WRITE);

				region_size -= effective_page_size;
				addr += effective_page_size;
			}
		} else {
			/*
			 * Find the file object backing this map entry.  If there is
			 * none, then we simply ignore the "will need" advice for this
			 * entry and go on to the next one.
			 */
			if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
				entry = entry->vme_next;
				start = entry->vme_start;
				continue;
			}

			/* keep the object from being reaped while we do I/O */
			vm_object_paging_begin(object);
			pager = object->pager;
			vm_object_unlock(object);

			/*
			 * The data_request() could take a long time, so let's
			 * release the map lock to avoid blocking other threads.
			 */
			vm_map_unlock_read(map);

			/*
			 * Get the data from the object asynchronously.
			 *
			 * Note that memory_object_data_request() places limits on the
			 * amount of I/O it will do.  Regardless of the len we
			 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
			 * silently truncates the len to that size.  This isn't
			 * necessarily bad since madvise shouldn't really be used to
			 * page in unlimited amounts of data.  Other Unix variants
			 * limit the willneed case as well.  If this turns out to be an
			 * issue for developers, then we can always adjust the policy
			 * here and still be backwards compatible since this is all
			 * just "advice".
			 */
			kr = memory_object_data_request(
				pager,
				vm_object_trunc_page(offset) + object->paging_offset,
				0,      /* ignored */
				VM_PROT_READ,
				(memory_object_fault_info_t)&fault_info);

			vm_object_lock(object);
			vm_object_paging_end(object);
			vm_object_unlock(object);

			/*
			 * If we couldn't do the I/O for some reason, just give up on
			 * the madvise.  We still return success to the user since
			 * madvise isn't supposed to fail when the advice can't be
			 * taken.
			 */

			if (kr != KERN_SUCCESS) {
				return KERN_SUCCESS;
			}
		}

		start += len;
		if (start >= end) {
			/* done */
			return KERN_SUCCESS;
		}

		/* look up next entry */
		vm_map_lock_read(map);
		if (!vm_map_lookup_entry(map, start, &entry)) {
			/*
			 * There's a new hole in the address range.
			 */
			vm_map_unlock_read(map);
			return KERN_INVALID_ADDRESS;
		}
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
16425
16426 static boolean_t
vm_map_entry_is_reusable(vm_map_entry_t entry)16427 vm_map_entry_is_reusable(
16428 vm_map_entry_t entry)
16429 {
16430 /* Only user map entries */
16431
16432 vm_object_t object;
16433
16434 if (entry->is_sub_map) {
16435 return FALSE;
16436 }
16437
16438 switch (VME_ALIAS(entry)) {
16439 case VM_MEMORY_MALLOC:
16440 case VM_MEMORY_MALLOC_SMALL:
16441 case VM_MEMORY_MALLOC_LARGE:
16442 case VM_MEMORY_REALLOC:
16443 case VM_MEMORY_MALLOC_TINY:
16444 case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16445 case VM_MEMORY_MALLOC_LARGE_REUSED:
16446 /*
16447 * This is a malloc() memory region: check if it's still
16448 * in its original state and can be re-used for more
16449 * malloc() allocations.
16450 */
16451 break;
16452 default:
16453 /*
16454 * Not a malloc() memory region: let the caller decide if
16455 * it's re-usable.
16456 */
16457 return TRUE;
16458 }
16459
16460 if (/*entry->is_shared ||*/
16461 entry->is_sub_map ||
16462 entry->in_transition ||
16463 entry->protection != VM_PROT_DEFAULT ||
16464 entry->max_protection != VM_PROT_ALL ||
16465 entry->inheritance != VM_INHERIT_DEFAULT ||
16466 entry->no_cache ||
16467 entry->vme_permanent ||
16468 entry->superpage_size != FALSE ||
16469 entry->zero_wired_pages ||
16470 entry->wired_count != 0 ||
16471 entry->user_wired_count != 0) {
16472 return FALSE;
16473 }
16474
16475 object = VME_OBJECT(entry);
16476 if (object == VM_OBJECT_NULL) {
16477 return TRUE;
16478 }
16479 if (
16480 #if 0
16481 /*
16482 * Let's proceed even if the VM object is potentially
16483 * shared.
16484 * We check for this later when processing the actual
16485 * VM pages, so the contents will be safe if shared.
16486 *
16487 * But we can still mark this memory region as "reusable" to
16488 * acknowledge that the caller did let us know that the memory
16489 * could be re-used and should not be penalized for holding
16490 * on to it. This allows its "resident size" to not include
16491 * the reusable range.
16492 */
16493 object->ref_count == 1 &&
16494 #endif
16495 object->vo_copy == VM_OBJECT_NULL &&
16496 object->shadow == VM_OBJECT_NULL &&
16497 object->internal &&
16498 object->purgable == VM_PURGABLE_DENY &&
16499 object->wimg_bits == VM_WIMG_USE_DEFAULT &&
16500 !object->code_signed) {
16501 return TRUE;
16502 }
16503 return FALSE;
16504 }
16505
/*
 * vm_map_reuse_pages:
 *
 * Internals for madvise(MADV_FREE_REUSE): mark the pages in
 * [start, end) as being back in use, undoing a prior "reusable"
 * marking, and flip large-malloc aliases from REUSABLE back to
 * REUSED.  The whole range must be mapped and every entry must pass
 * vm_map_entry_is_reusable(), otherwise KERN_INVALID_ADDRESS.
 */
static kern_return_t
vm_map_reuse_pages(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	vm_object_offset_t              start_offset, end_offset;

	/*
	 * The MADV_REUSE operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		/*
		 * XXX TODO4K
		 * need to figure out what reusable means for a
		 * portion of a native page.
		 */
		return KERN_SUCCESS;
	}

	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap);       /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		vm_page_stats_reusable.reuse_pages_failure++;
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		/*
		 * Sanity check on the VM map entry.
		 */
		if (!vm_map_entry_is_reusable(entry)) {
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reuse_pages_failure++;
			return KERN_INVALID_ADDRESS;
		}

		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.
		 */
		if (entry->vme_start < start) {
			start_offset = start - entry->vme_start;
		} else {
			start_offset = 0;
		}
		/* clamp to the entry and convert to object offsets */
		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
		start_offset += VME_OFFSET(entry);
		end_offset += VME_OFFSET(entry);

		object = VME_OBJECT(entry);
		if (object != VM_OBJECT_NULL) {
			/* clear the "reusable" state on the pages themselves */
			vm_object_lock(object);
			vm_object_reuse_pages(object, start_offset, end_offset,
			    TRUE);
			vm_object_unlock(object);
		}

		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
			/*
			 * XXX
			 * We do not hold the VM map exclusively here.
			 * The "alias" field is not that critical, so it's
			 * safe to update it here, as long as it is the only
			 * one that can be modified while holding the VM map
			 * "shared".
			 */
			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
		}
	}

	vm_map_unlock_read(map);
	vm_page_stats_reusable.reuse_pages_success++;
	return KERN_SUCCESS;
}
16598
16599
/*
 * vm_map_reusable_pages:
 *
 * Internals for madvise(MADV_FREE_REUSABLE): mark the pages in
 * [start, end) as "reusable" so the system may reclaim them, and flip
 * large-malloc aliases to REUSABLE.  Every entry in the range must be
 * fully mapped, pass vm_map_entry_is_reusable(), and be writable
 * (otherwise KERN_PROTECTION_FAILURE, since discarding contents of a
 * read-only mapping would be unrecoverable).
 */
static kern_return_t
vm_map_reusable_pages(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	vm_object_offset_t              start_offset, end_offset;
	vm_map_offset_t                 pmap_offset;

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		/*
		 * XXX TODO4K
		 * need to figure out what reusable means for a portion
		 * of a native page.
		 */
		return KERN_SUCCESS;
	}

	/*
	 * The MADV_REUSABLE operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap);       /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		vm_page_stats_reusable.reusable_pages_failure++;
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		/* 1 = can discard contents, -1 = shared object, skip discard */
		int kill_pages = 0;
		boolean_t reusable_no_write = FALSE;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (!vm_map_entry_is_reusable(entry)) {
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_INVALID_ADDRESS;
		}

		if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
#if __arm64e__
		    && !entry->used_for_tpro
#endif
		    ) {
			/* not writable: can't discard contents */
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_nonwritable++;
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.
		 */
		if (entry->vme_start < start) {
			start_offset = start - entry->vme_start;
			pmap_offset = start;
		} else {
			start_offset = 0;
			pmap_offset = entry->vme_start;
		}
		/* clamp to the entry and convert to object offsets */
		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
		start_offset += VME_OFFSET(entry);
		end_offset += VME_OFFSET(entry);

		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			continue;
		}

		if (entry->protection & VM_PROT_EXECUTE) {
			/*
			 * Executable mappings might be write-protected by
			 * hardware, so do not attempt to write to these pages.
			 */
			reusable_no_write = TRUE;
		}

		vm_object_lock(object);
		if (((object->ref_count == 1) ||
		    (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
		    object->vo_copy == VM_OBJECT_NULL)) &&
		    object->shadow == VM_OBJECT_NULL &&
		    /*
		     * "iokit_acct" entries are billed for their virtual size
		     * (rather than for their resident pages only), so they
		     * wouldn't benefit from making pages reusable, and it
		     * would be hard to keep track of pages that are both
		     * "iokit_acct" and "reusable" in the pmap stats and
		     * ledgers.
		     */
		    !(entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap))) {
			if (object->ref_count != 1) {
				vm_page_stats_reusable.reusable_shared++;
			}
			kill_pages = 1;
		} else {
			kill_pages = -1;
		}
		if (kill_pages != -1) {
			/* safe to deactivate (and possibly discard) the pages */
			vm_object_deactivate_pages(object,
			    start_offset,
			    end_offset - start_offset,
			    kill_pages,
			    TRUE /*reusable_pages*/,
			    reusable_no_write,
			    map->pmap,
			    pmap_offset);
		} else {
			/* object may be shared: just record the attempt */
			vm_page_stats_reusable.reusable_pages_shared++;
			DTRACE_VM4(vm_map_reusable_pages_shared,
			    unsigned int, VME_ALIAS(entry),
			    vm_map_t, map,
			    vm_map_entry_t, entry,
			    vm_object_t, object);
		}
		vm_object_unlock(object);

		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
		    VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
			/*
			 * XXX
			 * We do not hold the VM map exclusively here.
			 * The "alias" field is not that critical, so it's
			 * safe to update it here, as long as it is the only
			 * one that can be modified while holding the VM map
			 * "shared".
			 */
			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
		}
	}

	vm_map_unlock_read(map);
	vm_page_stats_reusable.reusable_pages_success++;
	return KERN_SUCCESS;
}
16757
16758
16759 static kern_return_t
vm_map_can_reuse(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16760 vm_map_can_reuse(
16761 vm_map_t map,
16762 vm_map_offset_t start,
16763 vm_map_offset_t end)
16764 {
16765 vm_map_entry_t entry;
16766
16767 /*
16768 * The MADV_REUSABLE operation doesn't require any changes to the
16769 * vm_map_entry_t's, so the read lock is sufficient.
16770 */
16771
16772 vm_map_lock_read(map);
16773 assert(map->pmap != kernel_pmap); /* protect alias access */
16774
16775 /*
16776 * The madvise semantics require that the address range be fully
16777 * allocated with no holes. Otherwise, we're required to return
16778 * an error.
16779 */
16780
16781 if (!vm_map_range_check(map, start, end, &entry)) {
16782 vm_map_unlock_read(map);
16783 vm_page_stats_reusable.can_reuse_failure++;
16784 return KERN_INVALID_ADDRESS;
16785 }
16786
16787 /*
16788 * Examine each vm_map_entry_t in the range.
16789 */
16790 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16791 entry = entry->vme_next) {
16792 /*
16793 * Sanity check on the VM map entry.
16794 */
16795 if (!vm_map_entry_is_reusable(entry)) {
16796 vm_map_unlock_read(map);
16797 vm_page_stats_reusable.can_reuse_failure++;
16798 return KERN_INVALID_ADDRESS;
16799 }
16800 }
16801
16802 vm_map_unlock_read(map);
16803 vm_page_stats_reusable.can_reuse_success++;
16804 return KERN_SUCCESS;
16805 }
16806
16807
16808 #if MACH_ASSERT
/*
 * Routine:	vm_map_pageout
 *
 * Description:	Walk the map entries overlapping [start, end) and hand
 *	each entry's internal (anonymous) VM object to vm_object_pageout()
 *	— presumably to push its resident pages out; confirm against the
 *	vm_object_pageout() implementation.  Entries backed by no object
 *	or by a non-internal (e.g. file-backed) object are skipped.
 *	One level of submap nesting is handled; deeper nesting is skipped.
 *	Only compiled under MACH_ASSERT (see enclosing #if), i.e. this is
 *	a debug/testing facility.
 */
static kern_return_t
vm_map_pageout(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t  entry;

	/*
	 * The MADV_PAGEOUT operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		vm_object_t object;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (entry->is_sub_map) {
			vm_map_t submap;
			vm_map_offset_t submap_start;
			vm_map_offset_t submap_end;
			vm_map_entry_t submap_entry;

			/*
			 * Descend one level: translate this entry's range
			 * into the submap's address space (VME_OFFSET is
			 * the start address within the submap).
			 */
			submap = VME_SUBMAP(entry);
			submap_start = VME_OFFSET(entry);
			submap_end = submap_start + (entry->vme_end -
			    entry->vme_start);

			/*
			 * Take the submap's lock while still holding the
			 * parent map's read lock (parent-before-child
			 * lock order).
			 */
			vm_map_lock_read(submap);

			if (!vm_map_range_check(submap,
			    submap_start,
			    submap_end,
			    &submap_entry)) {
				/* hole in the submap: both locks dropped */
				vm_map_unlock_read(submap);
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}

			if (submap_entry->is_sub_map) {
				/* nested submaps are not handled: skip */
				vm_map_unlock_read(submap);
				continue;
			}

			object = VME_OBJECT(submap_entry);
			if (object == VM_OBJECT_NULL || !object->internal) {
				/* nothing anonymous to page out here */
				vm_map_unlock_read(submap);
				continue;
			}

			vm_object_pageout(object);

			vm_map_unlock_read(submap);
			/* defensive: these are stale once the lock is dropped */
			submap = VM_MAP_NULL;
			submap_entry = VM_MAP_ENTRY_NULL;
			continue;
		}

		/* regular (non-submap) entry: pageout internal objects only */
		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL || !object->internal) {
			continue;
		}

		vm_object_pageout(object);
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
16897 #endif /* MACH_ASSERT */
16898
16899
16900 /*
16901 * Routine: vm_map_entry_insert
16902 *
16903 * Description: This routine inserts a new vm_entry in a locked map.
16904 */
16905 static vm_map_entry_t
vm_map_entry_insert(vm_map_t map,vm_map_entry_t insp_entry,vm_map_offset_t start,vm_map_offset_t end,vm_object_t object,vm_object_offset_t offset,vm_map_kernel_flags_t vmk_flags,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance,boolean_t clear_map_aligned)16906 vm_map_entry_insert(
16907 vm_map_t map,
16908 vm_map_entry_t insp_entry,
16909 vm_map_offset_t start,
16910 vm_map_offset_t end,
16911 vm_object_t object,
16912 vm_object_offset_t offset,
16913 vm_map_kernel_flags_t vmk_flags,
16914 boolean_t needs_copy,
16915 vm_prot_t cur_protection,
16916 vm_prot_t max_protection,
16917 vm_inherit_t inheritance,
16918 boolean_t clear_map_aligned)
16919 {
16920 vm_map_entry_t new_entry;
16921 boolean_t map_aligned = FALSE;
16922
16923 assert(insp_entry != (vm_map_entry_t)0);
16924 vm_map_lock_assert_exclusive(map);
16925
16926 #if DEVELOPMENT || DEBUG
16927 vm_object_offset_t end_offset = 0;
16928 assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
16929 #endif /* DEVELOPMENT || DEBUG */
16930
16931 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
16932 map_aligned = TRUE;
16933 }
16934 if (clear_map_aligned &&
16935 (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
16936 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
16937 map_aligned = FALSE;
16938 }
16939 if (map_aligned) {
16940 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
16941 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
16942 } else {
16943 assert(page_aligned(start));
16944 assert(page_aligned(end));
16945 }
16946 assert(start < end);
16947
16948 new_entry = vm_map_entry_create(map);
16949
16950 new_entry->vme_start = start;
16951 new_entry->vme_end = end;
16952
16953 if (vmk_flags.vmkf_submap) {
16954 new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
16955 VME_SUBMAP_SET(new_entry, (vm_map_t)object);
16956 } else {
16957 VME_OBJECT_SET(new_entry, object, false, 0);
16958 }
16959 VME_OFFSET_SET(new_entry, offset);
16960 VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);
16961
16962 new_entry->map_aligned = map_aligned;
16963 new_entry->needs_copy = needs_copy;
16964 new_entry->inheritance = inheritance;
16965 new_entry->protection = cur_protection;
16966 new_entry->max_protection = max_protection;
16967 /*
16968 * submap: "use_pmap" means "nested".
16969 * default: false.
16970 *
16971 * object: "use_pmap" means "use pmap accounting" for footprint.
16972 * default: true.
16973 */
16974 new_entry->use_pmap = !vmk_flags.vmkf_submap;
16975 new_entry->no_cache = vmk_flags.vmf_no_cache;
16976 new_entry->vme_permanent = vmk_flags.vmf_permanent;
16977 new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
16978 new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
16979 new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);
16980
16981 if (vmk_flags.vmkf_map_jit) {
16982 if (!(map->jit_entry_exists) ||
16983 VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
16984 new_entry->used_for_jit = TRUE;
16985 map->jit_entry_exists = TRUE;
16986 }
16987 }
16988
16989 /*
16990 * Insert the new entry into the list.
16991 */
16992
16993 vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
16994 map->size += end - start;
16995
16996 /*
16997 * Update the free space hint and the lookup hint.
16998 */
16999
17000 SAVE_HINT_MAP_WRITE(map, new_entry);
17001 return new_entry;
17002 }
17003
17004 /*
17005 * Routine: vm_map_remap_extract
17006 *
17007 * Description: This routine returns a vm_entry list from a map.
17008 */
17009 static kern_return_t
vm_map_remap_extract(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size,boolean_t copy,vm_map_copy_t map_copy,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)17010 vm_map_remap_extract(
17011 vm_map_t map,
17012 vm_map_offset_t addr,
17013 vm_map_size_t size,
17014 boolean_t copy,
17015 vm_map_copy_t map_copy,
17016 vm_prot_t *cur_protection, /* IN/OUT */
17017 vm_prot_t *max_protection, /* IN/OUT */
17018 /* What, no behavior? */
17019 vm_inherit_t inheritance,
17020 vm_map_kernel_flags_t vmk_flags)
17021 {
17022 struct vm_map_header *map_header = &map_copy->cpy_hdr;
17023 kern_return_t result;
17024 vm_map_size_t mapped_size;
17025 vm_map_size_t tmp_size;
17026 vm_map_entry_t src_entry; /* result of last map lookup */
17027 vm_map_entry_t new_entry;
17028 vm_object_offset_t offset;
17029 vm_map_offset_t map_address;
17030 vm_map_offset_t src_start; /* start of entry to map */
17031 vm_map_offset_t src_end; /* end of region to be mapped */
17032 vm_object_t object;
17033 vm_map_version_t version;
17034 boolean_t src_needs_copy;
17035 boolean_t new_entry_needs_copy;
17036 vm_map_entry_t saved_src_entry;
17037 boolean_t src_entry_was_wired;
17038 vm_prot_t max_prot_for_prot_copy;
17039 vm_map_offset_t effective_page_mask;
17040 bool pageable, same_map;
17041 boolean_t vm_remap_legacy;
17042 vm_prot_t required_cur_prot, required_max_prot;
17043 vm_object_t new_copy_object; /* vm_object_copy_* result */
17044 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
17045
17046 pageable = vmk_flags.vmkf_copy_pageable;
17047 same_map = vmk_flags.vmkf_copy_same_map;
17048
17049 effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
17050
17051 assert(map != VM_MAP_NULL);
17052 assert(size != 0);
17053 assert(size == vm_map_round_page(size, effective_page_mask));
17054 assert(inheritance == VM_INHERIT_NONE ||
17055 inheritance == VM_INHERIT_COPY ||
17056 inheritance == VM_INHERIT_SHARE);
17057 assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17058 assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17059 assert((*cur_protection & *max_protection) == *cur_protection);
17060
17061 /*
17062 * Compute start and end of region.
17063 */
17064 src_start = vm_map_trunc_page(addr, effective_page_mask);
17065 src_end = vm_map_round_page(src_start + size, effective_page_mask);
17066
17067 /*
17068 * Initialize map_header.
17069 */
17070 map_header->nentries = 0;
17071 map_header->entries_pageable = pageable;
17072 // map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
17073 map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
17074 map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
17075 vm_map_store_init(map_header);
17076
17077 if (copy && vmk_flags.vmkf_remap_prot_copy) {
17078 /*
17079 * Special case for vm_map_protect(VM_PROT_COPY):
17080 * we want to set the new mappings' max protection to the
17081 * specified *max_protection...
17082 */
17083 max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
17084 /* ... but we want to use the vm_remap() legacy mode */
17085 *max_protection = VM_PROT_NONE;
17086 *cur_protection = VM_PROT_NONE;
17087 } else {
17088 max_prot_for_prot_copy = VM_PROT_NONE;
17089 }
17090
17091 if (*cur_protection == VM_PROT_NONE &&
17092 *max_protection == VM_PROT_NONE) {
17093 /*
17094 * vm_remap() legacy mode:
17095 * Extract all memory regions in the specified range and
17096 * collect the strictest set of protections allowed on the
17097 * entire range, so the caller knows what they can do with
17098 * the remapped range.
17099 * We start with VM_PROT_ALL and we'll remove the protections
17100 * missing from each memory region.
17101 */
17102 vm_remap_legacy = TRUE;
17103 *cur_protection = VM_PROT_ALL;
17104 *max_protection = VM_PROT_ALL;
17105 required_cur_prot = VM_PROT_NONE;
17106 required_max_prot = VM_PROT_NONE;
17107 } else {
17108 /*
17109 * vm_remap_new() mode:
17110 * Extract all memory regions in the specified range and
17111 * ensure that they have at least the protections specified
17112 * by the caller via *cur_protection and *max_protection.
17113 * The resulting mapping should have these protections.
17114 */
17115 vm_remap_legacy = FALSE;
17116 if (copy) {
17117 required_cur_prot = VM_PROT_NONE;
17118 required_max_prot = VM_PROT_READ;
17119 } else {
17120 required_cur_prot = *cur_protection;
17121 required_max_prot = *max_protection;
17122 }
17123 }
17124
17125 map_address = 0;
17126 mapped_size = 0;
17127 result = KERN_SUCCESS;
17128
17129 /*
17130 * The specified source virtual space might correspond to
17131 * multiple map entries, need to loop on them.
17132 */
17133 vm_map_lock(map);
17134
17135 if (map->pmap == kernel_pmap) {
17136 map_copy->is_kernel_range = true;
17137 map_copy->orig_range = kmem_addr_get_range(addr, size);
17138 #if CONFIG_MAP_RANGES
17139 } else if (map->uses_user_ranges) {
17140 map_copy->is_user_range = true;
17141 map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
17142 #endif /* CONFIG_MAP_RANGES */
17143 }
17144
17145 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17146 /*
17147 * This address space uses sub-pages so the range might
17148 * not be re-mappable in an address space with larger
17149 * pages. Re-assemble any broken-up VM map entries to
17150 * improve our chances of making it work.
17151 */
17152 vm_map_simplify_range(map, src_start, src_end);
17153 }
17154 while (mapped_size != size) {
17155 vm_map_size_t entry_size;
17156
17157 /*
17158 * Find the beginning of the region.
17159 */
17160 if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
17161 result = KERN_INVALID_ADDRESS;
17162 break;
17163 }
17164
17165 if (src_start < src_entry->vme_start ||
17166 (mapped_size && src_start != src_entry->vme_start)) {
17167 result = KERN_INVALID_ADDRESS;
17168 break;
17169 }
17170
17171 tmp_size = size - mapped_size;
17172 if (src_end > src_entry->vme_end) {
17173 tmp_size -= (src_end - src_entry->vme_end);
17174 }
17175
17176 entry_size = (vm_map_size_t)(src_entry->vme_end -
17177 src_entry->vme_start);
17178
17179 if (src_entry->is_sub_map &&
17180 vmk_flags.vmkf_copy_single_object) {
17181 vm_map_t submap;
17182 vm_map_offset_t submap_start;
17183 vm_map_size_t submap_size;
17184 boolean_t submap_needs_copy;
17185
17186 /*
17187 * No check for "required protection" on "src_entry"
17188 * because the protections that matter are the ones
17189 * on the submap's VM map entry, which will be checked
17190 * during the call to vm_map_remap_extract() below.
17191 */
17192 submap_size = src_entry->vme_end - src_start;
17193 if (submap_size > size) {
17194 submap_size = size;
17195 }
17196 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17197 submap = VME_SUBMAP(src_entry);
17198 if (copy) {
17199 /*
17200 * The caller wants a copy-on-write re-mapping,
17201 * so let's extract from the submap accordingly.
17202 */
17203 submap_needs_copy = TRUE;
17204 } else if (src_entry->needs_copy) {
17205 /*
17206 * The caller wants a shared re-mapping but the
17207 * submap is mapped with "needs_copy", so its
17208 * contents can't be shared as is. Extract the
17209 * contents of the submap as "copy-on-write".
17210 * The re-mapping won't be shared with the
17211 * original mapping but this is equivalent to
17212 * what happened with the original "remap from
17213 * submap" code.
17214 * The shared region is mapped "needs_copy", for
17215 * example.
17216 */
17217 submap_needs_copy = TRUE;
17218 } else {
17219 /*
17220 * The caller wants a shared re-mapping and
17221 * this mapping can be shared (no "needs_copy"),
17222 * so let's extract from the submap accordingly.
17223 * Kernel submaps are mapped without
17224 * "needs_copy", for example.
17225 */
17226 submap_needs_copy = FALSE;
17227 }
17228 vm_map_reference(submap);
17229 vm_map_unlock(map);
17230 src_entry = NULL;
17231 if (vm_remap_legacy) {
17232 *cur_protection = VM_PROT_NONE;
17233 *max_protection = VM_PROT_NONE;
17234 }
17235
17236 DTRACE_VM7(remap_submap_recurse,
17237 vm_map_t, map,
17238 vm_map_offset_t, addr,
17239 vm_map_size_t, size,
17240 boolean_t, copy,
17241 vm_map_offset_t, submap_start,
17242 vm_map_size_t, submap_size,
17243 boolean_t, submap_needs_copy);
17244
17245 result = vm_map_remap_extract(submap,
17246 submap_start,
17247 submap_size,
17248 submap_needs_copy,
17249 map_copy,
17250 cur_protection,
17251 max_protection,
17252 inheritance,
17253 vmk_flags);
17254 vm_map_deallocate(submap);
17255 return result;
17256 }
17257
17258 if (src_entry->is_sub_map) {
17259 /* protections for submap mapping are irrelevant here */
17260 } else if (((src_entry->protection & required_cur_prot) !=
17261 required_cur_prot) ||
17262 ((src_entry->max_protection & required_max_prot) !=
17263 required_max_prot)) {
17264 if (vmk_flags.vmkf_copy_single_object &&
17265 mapped_size != 0) {
17266 /*
17267 * Single object extraction.
17268 * We can't extract more with the required
17269 * protection but we've extracted some, so
17270 * stop there and declare success.
17271 * The caller should check the size of
17272 * the copy entry we've extracted.
17273 */
17274 result = KERN_SUCCESS;
17275 } else {
17276 /*
17277 * VM range extraction.
17278 * Required proctection is not available
17279 * for this part of the range: fail.
17280 */
17281 result = KERN_PROTECTION_FAILURE;
17282 }
17283 break;
17284 }
17285
17286 if (src_entry->is_sub_map) {
17287 vm_map_t submap;
17288 vm_map_offset_t submap_start;
17289 vm_map_size_t submap_size;
17290 vm_map_copy_t submap_copy;
17291 vm_prot_t submap_curprot, submap_maxprot;
17292 boolean_t submap_needs_copy;
17293
17294 /*
17295 * No check for "required protection" on "src_entry"
17296 * because the protections that matter are the ones
17297 * on the submap's VM map entry, which will be checked
17298 * during the call to vm_map_copy_extract() below.
17299 */
17300 object = VM_OBJECT_NULL;
17301 submap_copy = VM_MAP_COPY_NULL;
17302
17303 /* find equivalent range in the submap */
17304 submap = VME_SUBMAP(src_entry);
17305 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17306 submap_size = tmp_size;
17307 if (copy) {
17308 /*
17309 * The caller wants a copy-on-write re-mapping,
17310 * so let's extract from the submap accordingly.
17311 */
17312 submap_needs_copy = TRUE;
17313 } else if (src_entry->needs_copy) {
17314 /*
17315 * The caller wants a shared re-mapping but the
17316 * submap is mapped with "needs_copy", so its
17317 * contents can't be shared as is. Extract the
17318 * contents of the submap as "copy-on-write".
17319 * The re-mapping won't be shared with the
17320 * original mapping but this is equivalent to
17321 * what happened with the original "remap from
17322 * submap" code.
17323 * The shared region is mapped "needs_copy", for
17324 * example.
17325 */
17326 submap_needs_copy = TRUE;
17327 } else {
17328 /*
17329 * The caller wants a shared re-mapping and
17330 * this mapping can be shared (no "needs_copy"),
17331 * so let's extract from the submap accordingly.
17332 * Kernel submaps are mapped without
17333 * "needs_copy", for example.
17334 */
17335 submap_needs_copy = FALSE;
17336 }
17337 /* extra ref to keep submap alive */
17338 vm_map_reference(submap);
17339
17340 DTRACE_VM7(remap_submap_recurse,
17341 vm_map_t, map,
17342 vm_map_offset_t, addr,
17343 vm_map_size_t, size,
17344 boolean_t, copy,
17345 vm_map_offset_t, submap_start,
17346 vm_map_size_t, submap_size,
17347 boolean_t, submap_needs_copy);
17348
17349 /*
17350 * The map can be safely unlocked since we
17351 * already hold a reference on the submap.
17352 *
17353 * No timestamp since we don't care if the map
17354 * gets modified while we're down in the submap.
17355 * We'll resume the extraction at src_start + tmp_size
17356 * anyway.
17357 */
17358 vm_map_unlock(map);
17359 src_entry = NULL; /* not valid once map is unlocked */
17360
17361 if (vm_remap_legacy) {
17362 submap_curprot = VM_PROT_NONE;
17363 submap_maxprot = VM_PROT_NONE;
17364 if (max_prot_for_prot_copy) {
17365 submap_maxprot = max_prot_for_prot_copy;
17366 }
17367 } else {
17368 assert(!max_prot_for_prot_copy);
17369 submap_curprot = *cur_protection;
17370 submap_maxprot = *max_protection;
17371 }
17372 result = vm_map_copy_extract(submap,
17373 submap_start,
17374 submap_size,
17375 submap_needs_copy,
17376 &submap_copy,
17377 &submap_curprot,
17378 &submap_maxprot,
17379 inheritance,
17380 vmk_flags);
17381
17382 /* release extra ref on submap */
17383 vm_map_deallocate(submap);
17384 submap = VM_MAP_NULL;
17385
17386 if (result != KERN_SUCCESS) {
17387 vm_map_lock(map);
17388 break;
17389 }
17390
17391 /* transfer submap_copy entries to map_header */
17392 while (vm_map_copy_first_entry(submap_copy) !=
17393 vm_map_copy_to_entry(submap_copy)) {
17394 vm_map_entry_t copy_entry;
17395 vm_map_size_t copy_entry_size;
17396
17397 copy_entry = vm_map_copy_first_entry(submap_copy);
17398
17399 /*
17400 * Prevent kernel_object from being exposed to
17401 * user space.
17402 */
17403 if (__improbable(copy_entry->vme_kernel_object)) {
17404 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17405 proc_selfpid(),
17406 (get_bsdtask_info(current_task())
17407 ? proc_name_address(get_bsdtask_info(current_task()))
17408 : "?"));
17409 DTRACE_VM(extract_kernel_only);
17410 result = KERN_INVALID_RIGHT;
17411 vm_map_copy_discard(submap_copy);
17412 submap_copy = VM_MAP_COPY_NULL;
17413 vm_map_lock(map);
17414 break;
17415 }
17416
17417 #ifdef __arm64e__
17418 if (vmk_flags.vmkf_tpro_enforcement_override) {
17419 copy_entry->used_for_tpro = FALSE;
17420 }
17421 #endif /* __arm64e__ */
17422
17423 vm_map_copy_entry_unlink(submap_copy, copy_entry);
17424 copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
17425 copy_entry->vme_start = map_address;
17426 copy_entry->vme_end = map_address + copy_entry_size;
17427 map_address += copy_entry_size;
17428 mapped_size += copy_entry_size;
17429 src_start += copy_entry_size;
17430 assert(src_start <= src_end);
17431 _vm_map_store_entry_link(map_header,
17432 map_header->links.prev,
17433 copy_entry);
17434 }
17435 /* done with submap_copy */
17436 vm_map_copy_discard(submap_copy);
17437
17438 if (vm_remap_legacy) {
17439 *cur_protection &= submap_curprot;
17440 *max_protection &= submap_maxprot;
17441 }
17442
17443 /* re-acquire the map lock and continue to next entry */
17444 vm_map_lock(map);
17445 continue;
17446 } else {
17447 object = VME_OBJECT(src_entry);
17448
17449 /*
17450 * Prevent kernel_object from being exposed to
17451 * user space.
17452 */
17453 if (__improbable(is_kernel_object(object))) {
17454 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17455 proc_selfpid(),
17456 (get_bsdtask_info(current_task())
17457 ? proc_name_address(get_bsdtask_info(current_task()))
17458 : "?"));
17459 DTRACE_VM(extract_kernel_only);
17460 result = KERN_INVALID_RIGHT;
17461 break;
17462 }
17463
17464 if (src_entry->iokit_acct) {
17465 /*
17466 * This entry uses "IOKit accounting".
17467 */
17468 } else if (object != VM_OBJECT_NULL &&
17469 (object->purgable != VM_PURGABLE_DENY ||
17470 object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
17471 /*
17472 * Purgeable objects have their own accounting:
17473 * no pmap accounting for them.
17474 */
17475 assertf(!src_entry->use_pmap,
17476 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17477 map,
17478 src_entry,
17479 (uint64_t)src_entry->vme_start,
17480 (uint64_t)src_entry->vme_end,
17481 src_entry->protection,
17482 src_entry->max_protection,
17483 VME_ALIAS(src_entry));
17484 } else {
17485 /*
17486 * Not IOKit or purgeable:
17487 * must be accounted by pmap stats.
17488 */
17489 assertf(src_entry->use_pmap,
17490 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17491 map,
17492 src_entry,
17493 (uint64_t)src_entry->vme_start,
17494 (uint64_t)src_entry->vme_end,
17495 src_entry->protection,
17496 src_entry->max_protection,
17497 VME_ALIAS(src_entry));
17498 }
17499
17500 if (object == VM_OBJECT_NULL) {
17501 assert(!src_entry->needs_copy);
17502 if (src_entry->max_protection == VM_PROT_NONE) {
17503 assert(src_entry->protection == VM_PROT_NONE);
17504 /*
17505 * No VM object and no permissions:
17506 * this must be a reserved range with
17507 * nothing to share or copy.
17508 * There could also be all sorts of
17509 * pmap shenanigans within that reserved
17510 * range, so let's just copy the map
17511 * entry as is to remap a similar
17512 * reserved range.
17513 */
17514 offset = 0; /* no object => no offset */
17515 goto copy_src_entry;
17516 }
17517 object = vm_object_allocate(entry_size);
17518 VME_OFFSET_SET(src_entry, 0);
17519 VME_OBJECT_SET(src_entry, object, false, 0);
17520 assert(src_entry->use_pmap);
17521 assert(!map->mapped_in_other_pmaps);
17522 } else if (src_entry->wired_count ||
17523 object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17524 /*
17525 * A wired memory region should not have
17526 * any pending copy-on-write and needs to
17527 * keep pointing at the VM object that
17528 * contains the wired pages.
17529 * If we're sharing this memory (copy=false),
17530 * we'll share this VM object.
17531 * If we're copying this memory (copy=true),
17532 * we'll call vm_object_copy_slowly() below
17533 * and use the new VM object for the remapping.
17534 *
17535 * Or, we are already using an asymmetric
17536 * copy, and therefore we already have
17537 * the right object.
17538 */
17539 assert(!src_entry->needs_copy);
17540 } else if (src_entry->needs_copy || object->shadowed ||
17541 (object->internal && !object->true_share &&
17542 !src_entry->is_shared &&
17543 object->vo_size > entry_size)) {
17544 VME_OBJECT_SHADOW(src_entry, entry_size,
17545 vm_map_always_shadow(map));
17546 assert(src_entry->use_pmap);
17547
17548 if (!src_entry->needs_copy &&
17549 (src_entry->protection & VM_PROT_WRITE)) {
17550 vm_prot_t prot;
17551
17552 assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
17553
17554 prot = src_entry->protection & ~VM_PROT_WRITE;
17555
17556 if (override_nx(map,
17557 VME_ALIAS(src_entry))
17558 && prot) {
17559 prot |= VM_PROT_EXECUTE;
17560 }
17561
17562 assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
17563
17564 if (map->mapped_in_other_pmaps) {
17565 vm_object_pmap_protect(
17566 VME_OBJECT(src_entry),
17567 VME_OFFSET(src_entry),
17568 entry_size,
17569 PMAP_NULL,
17570 PAGE_SIZE,
17571 src_entry->vme_start,
17572 prot);
17573 #if MACH_ASSERT
17574 } else if (__improbable(map->pmap == PMAP_NULL)) {
17575 extern boolean_t vm_tests_in_progress;
17576 assert(vm_tests_in_progress);
17577 /*
17578 * Some VM tests (in vm_tests.c)
17579 * sometimes want to use a VM
17580 * map without a pmap.
17581 * Otherwise, this should never
17582 * happen.
17583 */
17584 #endif /* MACH_ASSERT */
17585 } else {
17586 pmap_protect(vm_map_pmap(map),
17587 src_entry->vme_start,
17588 src_entry->vme_end,
17589 prot);
17590 }
17591 }
17592
17593 object = VME_OBJECT(src_entry);
17594 src_entry->needs_copy = FALSE;
17595 }
17596
17597
17598 vm_object_lock(object);
17599 vm_object_reference_locked(object); /* object ref. for new entry */
17600 assert(!src_entry->needs_copy);
17601 if (object->copy_strategy ==
17602 MEMORY_OBJECT_COPY_SYMMETRIC) {
17603 /*
17604 * If we want to share this object (copy==0),
17605 * it needs to be COPY_DELAY.
17606 * If we want to copy this object (copy==1),
17607 * we can't just set "needs_copy" on our side
17608 * and expect the other side to do the same
17609 * (symmetrically), so we can't let the object
17610 * stay COPY_SYMMETRIC.
17611 * So we always switch from COPY_SYMMETRIC to
17612 * COPY_DELAY.
17613 */
17614 object->copy_strategy =
17615 MEMORY_OBJECT_COPY_DELAY;
17616 object->true_share = TRUE;
17617 }
17618 vm_object_unlock(object);
17619 }
17620
17621 offset = (VME_OFFSET(src_entry) +
17622 (src_start - src_entry->vme_start));
17623
17624 copy_src_entry:
17625 new_entry = _vm_map_entry_create(map_header);
17626 vm_map_entry_copy(map, new_entry, src_entry);
17627 if (new_entry->is_sub_map) {
17628 /* clr address space specifics */
17629 new_entry->use_pmap = FALSE;
17630 } else if (copy) {
17631 /*
17632 * We're dealing with a copy-on-write operation,
17633 * so the resulting mapping should not inherit the
17634 * original mapping's accounting settings.
17635 * "use_pmap" should be reset to its default (TRUE)
17636 * so that the new mapping gets accounted for in
17637 * the task's memory footprint.
17638 */
17639 new_entry->use_pmap = TRUE;
17640 }
17641 /* "iokit_acct" was cleared in vm_map_entry_copy() */
17642 assert(!new_entry->iokit_acct);
17643
17644 new_entry->map_aligned = FALSE;
17645
17646 new_entry->vme_start = map_address;
17647 new_entry->vme_end = map_address + tmp_size;
17648 assert(new_entry->vme_start < new_entry->vme_end);
17649 if (copy && vmk_flags.vmkf_remap_prot_copy) {
17650 /* security: keep "permanent" and "csm_associated" */
17651 new_entry->vme_permanent = src_entry->vme_permanent;
17652 new_entry->csm_associated = src_entry->csm_associated;
17653 /*
17654 * Remapping for vm_map_protect(VM_PROT_COPY)
17655 * to convert a read-only mapping into a
17656 * copy-on-write version of itself but
17657 * with write access:
17658 * keep the original inheritance but let's not
17659 * add VM_PROT_WRITE to the max protection yet
17660 * since we want to do more security checks against
17661 * the target map.
17662 */
17663 new_entry->inheritance = src_entry->inheritance;
17664 new_entry->protection &= max_prot_for_prot_copy;
17665 } else {
17666 new_entry->inheritance = inheritance;
17667 if (!vm_remap_legacy) {
17668 new_entry->protection = *cur_protection;
17669 new_entry->max_protection = *max_protection;
17670 }
17671 }
17672 #ifdef __arm64e__
17673 if (copy && vmk_flags.vmkf_tpro_enforcement_override) {
17674 new_entry->used_for_tpro = FALSE;
17675 }
17676 #endif /* __arm64e__ */
17677 VME_OFFSET_SET(new_entry, offset);
17678
17679 /*
17680 * The new region has to be copied now if required.
17681 */
17682 RestartCopy:
17683 if (!copy) {
17684 if (src_entry->used_for_jit == TRUE) {
17685 if (same_map) {
17686 } else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
17687 /*
17688 * Cannot allow an entry describing a JIT
17689 * region to be shared across address spaces.
17690 */
17691 result = KERN_INVALID_ARGUMENT;
17692 vm_object_deallocate(object);
17693 vm_map_entry_dispose(new_entry);
17694 new_entry = VM_MAP_ENTRY_NULL;
17695 break;
17696 }
17697 }
17698
17699 src_entry->is_shared = TRUE;
17700 new_entry->is_shared = TRUE;
17701 if (!(new_entry->is_sub_map)) {
17702 new_entry->needs_copy = FALSE;
17703 }
17704 } else if (src_entry->is_sub_map) {
17705 /* make this a COW sub_map if not already */
17706 assert(new_entry->wired_count == 0);
17707 new_entry->needs_copy = TRUE;
17708 object = VM_OBJECT_NULL;
17709 } else if (src_entry->wired_count == 0 &&
17710 !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
17711 vm_object_copy_quickly(VME_OBJECT(new_entry),
17712 VME_OFFSET(new_entry),
17713 (new_entry->vme_end -
17714 new_entry->vme_start),
17715 &src_needs_copy,
17716 &new_entry_needs_copy)) {
17717 new_entry->needs_copy = new_entry_needs_copy;
17718 new_entry->is_shared = FALSE;
17719 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
17720
17721 /*
17722 * Handle copy_on_write semantics.
17723 */
17724 if (src_needs_copy && !src_entry->needs_copy) {
17725 vm_prot_t prot;
17726
17727 assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
17728
17729 prot = src_entry->protection & ~VM_PROT_WRITE;
17730
17731 if (override_nx(map,
17732 VME_ALIAS(src_entry))
17733 && prot) {
17734 prot |= VM_PROT_EXECUTE;
17735 }
17736
17737 assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
17738
17739 vm_object_pmap_protect(object,
17740 offset,
17741 entry_size,
17742 ((src_entry->is_shared
17743 || map->mapped_in_other_pmaps) ?
17744 PMAP_NULL : map->pmap),
17745 VM_MAP_PAGE_SIZE(map),
17746 src_entry->vme_start,
17747 prot);
17748
17749 assert(src_entry->wired_count == 0);
17750 src_entry->needs_copy = TRUE;
17751 }
17752 /*
17753 * Throw away the old object reference of the new entry.
17754 */
17755 vm_object_deallocate(object);
17756 } else {
17757 new_entry->is_shared = FALSE;
17758 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
17759
17760 src_entry_was_wired = (src_entry->wired_count > 0);
17761 saved_src_entry = src_entry;
17762 src_entry = VM_MAP_ENTRY_NULL;
17763
17764 /*
17765 * The map can be safely unlocked since we
17766 * already hold a reference on the object.
17767 *
17768 * Record the timestamp of the map for later
17769 * verification, and unlock the map.
17770 */
17771 version.main_timestamp = map->timestamp;
17772 vm_map_unlock(map); /* Increments timestamp once! */
17773
17774 /*
17775 * Perform the copy.
17776 */
17777 if (src_entry_was_wired > 0 ||
17778 (debug4k_no_cow_copyin &&
17779 VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
17780 vm_object_lock(object);
17781 result = vm_object_copy_slowly(
17782 object,
17783 offset,
17784 (new_entry->vme_end -
17785 new_entry->vme_start),
17786 THREAD_UNINT,
17787 &new_copy_object);
17788 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
17789 saved_used_for_jit = new_entry->used_for_jit;
17790 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
17791 new_entry->used_for_jit = saved_used_for_jit;
17792 VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
17793 new_entry->needs_copy = FALSE;
17794 } else {
17795 vm_object_offset_t new_offset;
17796
17797 new_offset = VME_OFFSET(new_entry);
17798 result = vm_object_copy_strategically(
17799 object,
17800 offset,
17801 (new_entry->vme_end -
17802 new_entry->vme_start),
17803 false, /* forking */
17804 &new_copy_object,
17805 &new_offset,
17806 &new_entry_needs_copy);
17807 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
17808 saved_used_for_jit = new_entry->used_for_jit;
17809 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
17810 new_entry->used_for_jit = saved_used_for_jit;
17811 if (new_offset != VME_OFFSET(new_entry)) {
17812 VME_OFFSET_SET(new_entry, new_offset);
17813 }
17814
17815 new_entry->needs_copy = new_entry_needs_copy;
17816 }
17817
17818 /*
17819 * Throw away the old object reference of the new entry.
17820 */
17821 vm_object_deallocate(object);
17822
17823 if (result != KERN_SUCCESS &&
17824 result != KERN_MEMORY_RESTART_COPY) {
17825 vm_map_entry_dispose(new_entry);
17826 vm_map_lock(map);
17827 break;
17828 }
17829
17830 /*
17831 * Verify that the map has not substantially
17832 * changed while the copy was being made.
17833 */
17834
17835 vm_map_lock(map);
17836 if (version.main_timestamp + 1 != map->timestamp) {
17837 /*
17838 * Simple version comparison failed.
17839 *
17840 * Retry the lookup and verify that the
17841 * same object/offset are still present.
17842 */
17843 saved_src_entry = VM_MAP_ENTRY_NULL;
17844 vm_object_deallocate(VME_OBJECT(new_entry));
17845 vm_map_entry_dispose(new_entry);
17846 if (result == KERN_MEMORY_RESTART_COPY) {
17847 result = KERN_SUCCESS;
17848 }
17849 continue;
17850 }
17851 /* map hasn't changed: src_entry is still valid */
17852 src_entry = saved_src_entry;
17853 saved_src_entry = VM_MAP_ENTRY_NULL;
17854
17855 if (result == KERN_MEMORY_RESTART_COPY) {
17856 vm_object_reference(object);
17857 goto RestartCopy;
17858 }
17859 }
17860
17861 _vm_map_store_entry_link(map_header,
17862 map_header->links.prev, new_entry);
17863
17864 /* protections for submap mapping are irrelevant here */
17865 if (vm_remap_legacy && !src_entry->is_sub_map) {
17866 *cur_protection &= src_entry->protection;
17867 *max_protection &= src_entry->max_protection;
17868 }
17869
17870 map_address += tmp_size;
17871 mapped_size += tmp_size;
17872 src_start += tmp_size;
17873
17874 if (vmk_flags.vmkf_copy_single_object) {
17875 if (mapped_size != size) {
17876 DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
17877 map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
17878 if (src_entry->vme_next != vm_map_to_entry(map) &&
17879 src_entry->vme_next->vme_object_value ==
17880 src_entry->vme_object_value) {
17881 /* XXX TODO4K */
17882 DEBUG4K_ERROR("could have extended copy to next entry...\n");
17883 }
17884 }
17885 break;
17886 }
17887 } /* end while */
17888
17889 vm_map_unlock(map);
17890 if (result != KERN_SUCCESS) {
17891 /*
17892 * Free all allocated elements.
17893 */
17894 for (src_entry = map_header->links.next;
17895 src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
17896 src_entry = new_entry) {
17897 new_entry = src_entry->vme_next;
17898 _vm_map_store_entry_unlink(map_header, src_entry, false);
17899 if (src_entry->is_sub_map) {
17900 vm_map_deallocate(VME_SUBMAP(src_entry));
17901 } else {
17902 vm_object_deallocate(VME_OBJECT(src_entry));
17903 }
17904 vm_map_entry_dispose(src_entry);
17905 }
17906 }
17907 return result;
17908 }
17909
17910 bool
vm_map_is_exotic(vm_map_t map)17911 vm_map_is_exotic(
17912 vm_map_t map)
17913 {
17914 return VM_MAP_IS_EXOTIC(map);
17915 }
17916
17917 bool
vm_map_is_alien(vm_map_t map)17918 vm_map_is_alien(
17919 vm_map_t map)
17920 {
17921 return VM_MAP_IS_ALIEN(map);
17922 }
17923
17924 #if XNU_TARGET_OS_OSX
void
vm_map_mark_alien(
	vm_map_t map)
{
	/*
	 * Mark "map" as "alien".
	 * Take the exclusive map lock so the flag flip is serialized
	 * with other map mutations.
	 */
	vm_map_lock(map);
	map->is_alien = true;
	vm_map_unlock(map);
}
17933
void
vm_map_single_jit(
	vm_map_t map)
{
	/*
	 * Set the "single_jit" flag on "map".
	 * Take the exclusive map lock so the flag flip is serialized
	 * with other map mutations.
	 */
	vm_map_lock(map);
	map->single_jit = true;
	vm_map_unlock(map);
}
17942 #endif /* XNU_TARGET_OS_OSX */
17943
17944 /*
17945 * Callers of this function must call vm_map_copy_require on
17946 * previously created vm_map_copy_t or pass a newly created
17947 * one to ensure that it hasn't been forged.
17948 */
17949 static kern_return_t
vm_map_copy_to_physcopy(vm_map_copy_t copy_map,vm_map_t target_map)17950 vm_map_copy_to_physcopy(
17951 vm_map_copy_t copy_map,
17952 vm_map_t target_map)
17953 {
17954 vm_map_size_t size;
17955 vm_map_entry_t entry;
17956 vm_map_entry_t new_entry;
17957 vm_object_t new_object;
17958 unsigned int pmap_flags;
17959 pmap_t new_pmap;
17960 vm_map_t new_map;
17961 vm_map_address_t src_start, src_end, src_cur;
17962 vm_map_address_t dst_start, dst_end, dst_cur;
17963 kern_return_t kr;
17964 void *kbuf;
17965
17966 /*
17967 * Perform the equivalent of vm_allocate() and memcpy().
17968 * Replace the mappings in "copy_map" with the newly allocated mapping.
17969 */
17970 DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
17971
17972 assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
17973
17974 /* create a new pmap to map "copy_map" */
17975 pmap_flags = 0;
17976 assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
17977 #if PMAP_CREATE_FORCE_4K_PAGES
17978 pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
17979 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
17980 pmap_flags |= PMAP_CREATE_64BIT;
17981 new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
17982 if (new_pmap == NULL) {
17983 return KERN_RESOURCE_SHORTAGE;
17984 }
17985
17986 /* allocate new VM object */
17987 size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
17988 new_object = vm_object_allocate(size);
17989 assert(new_object);
17990
17991 /* allocate new VM map entry */
17992 new_entry = vm_map_copy_entry_create(copy_map);
17993 assert(new_entry);
17994
17995 /* finish initializing new VM map entry */
17996 new_entry->protection = VM_PROT_DEFAULT;
17997 new_entry->max_protection = VM_PROT_DEFAULT;
17998 new_entry->use_pmap = TRUE;
17999
18000 /* make new VM map entry point to new VM object */
18001 new_entry->vme_start = 0;
18002 new_entry->vme_end = size;
18003 VME_OBJECT_SET(new_entry, new_object, false, 0);
18004 VME_OFFSET_SET(new_entry, 0);
18005
18006 /* create a new pageable VM map to map "copy_map" */
18007 new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
18008 VM_MAP_CREATE_PAGEABLE);
18009 assert(new_map);
18010 vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
18011
18012 /* map "copy_map" in the new VM map */
18013 src_start = 0;
18014 kr = vm_map_copyout_internal(
18015 new_map,
18016 &src_start,
18017 copy_map,
18018 copy_map->size,
18019 FALSE, /* consume_on_success */
18020 VM_PROT_DEFAULT,
18021 VM_PROT_DEFAULT,
18022 VM_INHERIT_DEFAULT);
18023 assert(kr == KERN_SUCCESS);
18024 src_end = src_start + copy_map->size;
18025
18026 /* map "new_object" in the new VM map */
18027 vm_object_reference(new_object);
18028 dst_start = 0;
18029 kr = vm_map_enter(new_map,
18030 &dst_start,
18031 size,
18032 0, /* mask */
18033 VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
18034 new_object,
18035 0, /* offset */
18036 FALSE, /* needs copy */
18037 VM_PROT_DEFAULT,
18038 VM_PROT_DEFAULT,
18039 VM_INHERIT_DEFAULT);
18040 assert(kr == KERN_SUCCESS);
18041 dst_end = dst_start + size;
18042
18043 /* get a kernel buffer */
18044 kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
18045
18046 /* physically copy "copy_map" mappings to new VM object */
18047 for (src_cur = src_start, dst_cur = dst_start;
18048 src_cur < src_end;
18049 src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
18050 vm_size_t bytes;
18051
18052 bytes = PAGE_SIZE;
18053 if (src_cur + PAGE_SIZE > src_end) {
18054 /* partial copy for last page */
18055 bytes = src_end - src_cur;
18056 assert(bytes > 0 && bytes < PAGE_SIZE);
18057 /* rest of dst page should be zero-filled */
18058 }
18059 /* get bytes from src mapping */
18060 kr = copyinmap(new_map, src_cur, kbuf, bytes);
18061 if (kr != KERN_SUCCESS) {
18062 DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
18063 }
18064 /* put bytes in dst mapping */
18065 assert(dst_cur < dst_end);
18066 assert(dst_cur + bytes <= dst_end);
18067 kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
18068 if (kr != KERN_SUCCESS) {
18069 DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
18070 }
18071 }
18072
18073 /* free kernel buffer */
18074 kfree_data(kbuf, PAGE_SIZE);
18075
18076 /* destroy new map */
18077 vm_map_destroy(new_map);
18078 new_map = VM_MAP_NULL;
18079
18080 /* dispose of the old map entries in "copy_map" */
18081 while (vm_map_copy_first_entry(copy_map) !=
18082 vm_map_copy_to_entry(copy_map)) {
18083 entry = vm_map_copy_first_entry(copy_map);
18084 vm_map_copy_entry_unlink(copy_map, entry);
18085 if (entry->is_sub_map) {
18086 vm_map_deallocate(VME_SUBMAP(entry));
18087 } else {
18088 vm_object_deallocate(VME_OBJECT(entry));
18089 }
18090 vm_map_copy_entry_dispose(entry);
18091 }
18092
18093 /* change "copy_map"'s page_size to match "target_map" */
18094 copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18095 copy_map->offset = 0;
18096 copy_map->size = size;
18097
18098 /* insert new map entry in "copy_map" */
18099 assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
18100 vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
18101
18102 DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18103 return KERN_SUCCESS;
18104 }
18105
18106 void
18107 vm_map_copy_adjust_get_target_copy_map(
18108 vm_map_copy_t copy_map,
18109 vm_map_copy_t *target_copy_map_p);
18110 void
vm_map_copy_adjust_get_target_copy_map(vm_map_copy_t copy_map,vm_map_copy_t * target_copy_map_p)18111 vm_map_copy_adjust_get_target_copy_map(
18112 vm_map_copy_t copy_map,
18113 vm_map_copy_t *target_copy_map_p)
18114 {
18115 vm_map_copy_t target_copy_map;
18116 vm_map_entry_t entry, target_entry;
18117
18118 if (*target_copy_map_p != VM_MAP_COPY_NULL) {
18119 /* the caller already has a "target_copy_map": use it */
18120 return;
18121 }
18122
18123 /* the caller wants us to create a new copy of "copy_map" */
18124 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18125 target_copy_map = vm_map_copy_allocate(copy_map->type);
18126 target_copy_map->offset = copy_map->offset;
18127 target_copy_map->size = copy_map->size;
18128 target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
18129 for (entry = vm_map_copy_first_entry(copy_map);
18130 entry != vm_map_copy_to_entry(copy_map);
18131 entry = entry->vme_next) {
18132 target_entry = vm_map_copy_entry_create(target_copy_map);
18133 vm_map_entry_copy_full(target_entry, entry);
18134 if (target_entry->is_sub_map) {
18135 vm_map_reference(VME_SUBMAP(target_entry));
18136 } else {
18137 vm_object_reference(VME_OBJECT(target_entry));
18138 }
18139 vm_map_copy_entry_link(
18140 target_copy_map,
18141 vm_map_copy_last_entry(target_copy_map),
18142 target_entry);
18143 }
18144 entry = VM_MAP_ENTRY_NULL;
18145 *target_copy_map_p = target_copy_map;
18146 }
18147
18148 /*
18149 * Callers of this function must call vm_map_copy_require on
18150 * previously created vm_map_copy_t or pass a newly created
18151 * one to ensure that it hasn't been forged.
18152 */
18153 static void
vm_map_copy_trim(vm_map_copy_t copy_map,uint16_t new_page_shift,vm_map_offset_t trim_start,vm_map_offset_t trim_end)18154 vm_map_copy_trim(
18155 vm_map_copy_t copy_map,
18156 uint16_t new_page_shift,
18157 vm_map_offset_t trim_start,
18158 vm_map_offset_t trim_end)
18159 {
18160 uint16_t copy_page_shift;
18161 vm_map_entry_t entry, next_entry;
18162
18163 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18164 assert(copy_map->cpy_hdr.nentries > 0);
18165
18166 trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
18167 trim_end += vm_map_copy_first_entry(copy_map)->vme_start;
18168
18169 /* use the new page_shift to do the clipping */
18170 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18171 copy_map->cpy_hdr.page_shift = new_page_shift;
18172
18173 for (entry = vm_map_copy_first_entry(copy_map);
18174 entry != vm_map_copy_to_entry(copy_map);
18175 entry = next_entry) {
18176 next_entry = entry->vme_next;
18177 if (entry->vme_end <= trim_start) {
18178 /* entry fully before trim range: skip */
18179 continue;
18180 }
18181 if (entry->vme_start >= trim_end) {
18182 /* entry fully after trim range: done */
18183 break;
18184 }
18185 /* clip entry if needed */
18186 vm_map_copy_clip_start(copy_map, entry, trim_start);
18187 vm_map_copy_clip_end(copy_map, entry, trim_end);
18188 /* dispose of entry */
18189 copy_map->size -= entry->vme_end - entry->vme_start;
18190 vm_map_copy_entry_unlink(copy_map, entry);
18191 if (entry->is_sub_map) {
18192 vm_map_deallocate(VME_SUBMAP(entry));
18193 } else {
18194 vm_object_deallocate(VME_OBJECT(entry));
18195 }
18196 vm_map_copy_entry_dispose(entry);
18197 entry = VM_MAP_ENTRY_NULL;
18198 }
18199
18200 /* restore copy_map's original page_shift */
18201 copy_map->cpy_hdr.page_shift = copy_page_shift;
18202 }
18203
18204 /*
18205 * Make any necessary adjustments to "copy_map" to allow it to be
18206 * mapped into "target_map".
18207 * If no changes were necessary, "target_copy_map" points to the
18208 * untouched "copy_map".
18209 * If changes are necessary, changes will be made to "target_copy_map".
18210 * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
18211 * copy the original "copy_map" to it before applying the changes.
18212 * The caller should discard "target_copy_map" if it's not the same as
18213 * the original "copy_map".
18214 */
18215 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
kern_return_t
vm_map_copy_adjust_to_target(
	vm_map_copy_t src_copy_map,
	vm_map_offset_t offset,
	vm_map_size_t size,
	vm_map_t target_map,
	boolean_t copy,
	vm_map_copy_t *target_copy_map_p,
	vm_map_offset_t *overmap_start_p,
	vm_map_offset_t *overmap_end_p,
	vm_map_offset_t *trimmed_start_p)
{
	vm_map_copy_t copy_map, target_copy_map;
	vm_map_size_t target_size;
	vm_map_size_t src_copy_map_size;
	vm_map_size_t overmap_start, overmap_end;
	int misalignments;
	vm_map_entry_t entry, target_entry;
	vm_map_offset_t addr_adjustment;
	vm_map_offset_t new_start, new_end;
	int copy_page_mask, target_page_mask;
	uint16_t copy_page_shift, target_page_shift;
	vm_map_offset_t trimmed_end;

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(src_copy_map);
	assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);

	/*
	 * Start working with "src_copy_map" but we'll switch
	 * to "target_copy_map" as soon as we start making adjustments.
	 */
	copy_map = src_copy_map;
	src_copy_map_size = src_copy_map->size;

	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
	copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
	target_page_mask = VM_MAP_PAGE_MASK(target_map);

	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, *target_copy_map_p);

	target_copy_map = *target_copy_map_p;
	if (target_copy_map != VM_MAP_COPY_NULL) {
		vm_map_copy_require(target_copy_map);
	}

	/* the requested [offset, offset+size) range must fit in "copy_map" */
	if (offset + size > copy_map->size) {
		DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)offset, (uint64_t)size);
		return KERN_INVALID_ARGUMENT;
	}

	/* trim the end */
	trimmed_end = 0;
	new_end = VM_MAP_ROUND_PAGE(offset + size, target_page_mask);
	if (new_end < copy_map->size) {
		trimmed_end = src_copy_map_size - new_end;
		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
		/* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    new_end, copy_map->size);
	}

	/* trim the start */
	new_start = VM_MAP_TRUNC_PAGE(offset, target_page_mask);
	if (new_start != 0) {
		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)0, (uint64_t)new_start);
		/* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    0, new_start);
	}
	*trimmed_start_p = new_start;

	/* target_size starts with what's left after trimming */
	target_size = copy_map->size;
	assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
	    "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
	    (uint64_t)target_size, (uint64_t)src_copy_map_size,
	    (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);

	/* check for misalignments but don't adjust yet */
	misalignments = 0;
	overmap_start = 0;
	overmap_end = 0;
	if (copy_page_shift < target_page_shift) {
		/*
		 * Remapping from 4K to 16K: check the VM object alignments
		 * throughout the range.
		 * If the start and end of the range are mis-aligned, we can
		 * over-map to re-align, and adjust the "overmap" start/end
		 * and "target_size" of the range accordingly.
		 * If there is any mis-alignment within the range:
		 *	if "copy":
		 *		we can do immediate-copy instead of copy-on-write,
		 *	else:
		 *		no way to remap and share; fail.
		 */
		for (entry = vm_map_copy_first_entry(copy_map);
		    entry != vm_map_copy_to_entry(copy_map);
		    entry = entry->vme_next) {
			vm_object_offset_t object_offset_start, object_offset_end;

			object_offset_start = VME_OFFSET(entry);
			object_offset_end = object_offset_start;
			object_offset_end += entry->vme_end - entry->vme_start;
			if (object_offset_start & target_page_mask) {
				/* mis-aligned start: only fixable by over-mapping, and only on the 1st entry when sharing */
				if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
					overmap_start++;
				} else {
					misalignments++;
				}
			}
			if (object_offset_end & target_page_mask) {
				/* mis-aligned end: only fixable by over-mapping, and only on the last entry when sharing */
				if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
					overmap_end++;
				} else {
					misalignments++;
				}
			}
		}
	}
	entry = VM_MAP_ENTRY_NULL;

	/* decide how to deal with misalignments */
	assert(overmap_start <= 1);
	assert(overmap_end <= 1);
	if (!overmap_start && !overmap_end && !misalignments) {
		/* copy_map is properly aligned for target_map ... */
		if (*trimmed_start_p) {
			/* ... but we trimmed it, so still need to adjust */
		} else {
			/* ... and we didn't trim anything: we're done */
			if (target_copy_map == VM_MAP_COPY_NULL) {
				target_copy_map = copy_map;
			}
			*target_copy_map_p = target_copy_map;
			*overmap_start_p = 0;
			*overmap_end_p = 0;
			DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
			return KERN_SUCCESS;
		}
	} else if (misalignments && !copy) {
		/* can't "share" if misaligned */
		DEBUG4K_ADJUST("unsupported sharing\n");
#if MACH_ASSERT
		if (debug4k_panic_on_misaligned_sharing) {
			panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
		}
#endif /* MACH_ASSERT */
		DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
		return KERN_NOT_SUPPORTED;
	} else {
		/* can't virtual-copy if misaligned (but can physical-copy) */
		DEBUG4K_ADJUST("mis-aligned copying\n");
	}

	/* get a "target_copy_map" if needed and switch to it */
	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
	copy_map = target_copy_map;

	if (misalignments && copy) {
		vm_map_size_t target_copy_map_size;

		/*
		 * Can't do copy-on-write with misaligned mappings.
		 * Replace the mappings with a physical copy of the original
		 * mappings' contents.
		 */
		target_copy_map_size = target_copy_map->size;
		kern_return_t kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		*target_copy_map_p = target_copy_map;
		*overmap_start_p = 0;
		/* report how much the physcopy rounded the size up by */
		*overmap_end_p = target_copy_map->size - target_copy_map_size;
		DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
		return KERN_SUCCESS;
	}

	/* apply the adjustments */
	misalignments = 0;
	overmap_start = 0;
	overmap_end = 0;
	/* remove copy_map->offset, so that everything starts at offset 0 */
	addr_adjustment = copy_map->offset;
	/* also remove whatever we trimmed from the start */
	addr_adjustment += *trimmed_start_p;
	for (target_entry = vm_map_copy_first_entry(target_copy_map);
	    target_entry != vm_map_copy_to_entry(target_copy_map);
	    target_entry = target_entry->vme_next) {
		vm_object_offset_t object_offset_start, object_offset_end;

		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
		object_offset_start = VME_OFFSET(target_entry);
		if (object_offset_start & target_page_mask) {
			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
			if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
				/*
				 * start of 1st entry is mis-aligned:
				 * re-adjust by over-mapping.
				 */
				overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
			} else {
				misalignments++;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
				assert(copy);
			}
		}

		/* grow the mapped range by any start over-mapping */
		if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
			target_size += overmap_start;
		} else {
			target_entry->vme_start += overmap_start;
		}
		target_entry->vme_end += overmap_start;

		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
		if (object_offset_end & target_page_mask) {
			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
			if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
				/*
				 * end of last entry is mis-aligned: re-adjust by over-mapping.
				 */
				overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
				target_entry->vme_end += overmap_end;
				target_size += overmap_end;
			} else {
				misalignments++;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
				assert(copy);
			}
		}
		/* rebase the entry so the copy map starts at address 0 */
		target_entry->vme_start -= addr_adjustment;
		target_entry->vme_end -= addr_adjustment;
		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
	}

	/* commit the adjusted size/offset and the target page size */
	target_copy_map->size = target_size;
	target_copy_map->offset += overmap_start;
	target_copy_map->offset -= addr_adjustment;
	target_copy_map->cpy_hdr.page_shift = target_page_shift;

	// assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
	// assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
	assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
	assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));

	*target_copy_map_p = target_copy_map;
	*overmap_start_p = overmap_start;
	*overmap_end_p = overmap_end;

	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
	return KERN_SUCCESS;
}
18483
18484 kern_return_t
vm_map_range_physical_size(vm_map_t map,vm_map_address_t start,mach_vm_size_t size,mach_vm_size_t * phys_size)18485 vm_map_range_physical_size(
18486 vm_map_t map,
18487 vm_map_address_t start,
18488 mach_vm_size_t size,
18489 mach_vm_size_t * phys_size)
18490 {
18491 kern_return_t kr;
18492 vm_map_copy_t copy_map, target_copy_map;
18493 vm_map_offset_t adjusted_start, adjusted_end;
18494 vm_map_size_t adjusted_size;
18495 vm_prot_t cur_prot, max_prot;
18496 vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
18497 vm_map_kernel_flags_t vmk_flags;
18498
18499 if (size == 0) {
18500 DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
18501 *phys_size = 0;
18502 return KERN_SUCCESS;
18503 }
18504
18505 adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
18506 adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
18507 if (__improbable(os_add_overflow(start, size, &end) ||
18508 adjusted_end <= adjusted_start)) {
18509 /* wraparound */
18510 printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
18511 *phys_size = 0;
18512 return KERN_INVALID_ARGUMENT;
18513 }
18514 if (__improbable(vm_map_range_overflows(map, start, size))) {
18515 *phys_size = 0;
18516 return KERN_INVALID_ADDRESS;
18517 }
18518 assert(adjusted_end > adjusted_start);
18519 adjusted_size = adjusted_end - adjusted_start;
18520 *phys_size = adjusted_size;
18521 if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
18522 return KERN_SUCCESS;
18523 }
18524 if (start == 0) {
18525 adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
18526 adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
18527 if (__improbable(adjusted_end <= adjusted_start)) {
18528 /* wraparound */
18529 printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
18530 *phys_size = 0;
18531 return KERN_INVALID_ARGUMENT;
18532 }
18533 assert(adjusted_end > adjusted_start);
18534 adjusted_size = adjusted_end - adjusted_start;
18535 *phys_size = adjusted_size;
18536 return KERN_SUCCESS;
18537 }
18538
18539 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
18540 vmk_flags.vmkf_copy_pageable = TRUE;
18541 vmk_flags.vmkf_copy_same_map = TRUE;
18542 assert(adjusted_size != 0);
18543 cur_prot = VM_PROT_NONE; /* legacy mode */
18544 max_prot = VM_PROT_NONE; /* legacy mode */
18545 kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
18546 FALSE /* copy */,
18547 ©_map,
18548 &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
18549 vmk_flags);
18550 if (kr != KERN_SUCCESS) {
18551 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
18552 //assert(0);
18553 *phys_size = 0;
18554 return kr;
18555 }
18556 assert(copy_map != VM_MAP_COPY_NULL);
18557 target_copy_map = copy_map;
18558 DEBUG4K_ADJUST("adjusting...\n");
18559 kr = vm_map_copy_adjust_to_target(
18560 copy_map,
18561 start - adjusted_start, /* offset */
18562 size, /* size */
18563 kernel_map,
18564 FALSE, /* copy */
18565 &target_copy_map,
18566 &overmap_start,
18567 &overmap_end,
18568 &trimmed_start);
18569 if (kr == KERN_SUCCESS) {
18570 if (target_copy_map->size != *phys_size) {
18571 DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
18572 }
18573 *phys_size = target_copy_map->size;
18574 } else {
18575 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
18576 //assert(0);
18577 *phys_size = 0;
18578 }
18579 vm_map_copy_discard(copy_map);
18580 copy_map = VM_MAP_COPY_NULL;
18581
18582 return kr;
18583 }
18584
18585
18586 kern_return_t
memory_entry_check_for_adjustment(vm_map_t src_map,ipc_port_t port,vm_map_offset_t * overmap_start,vm_map_offset_t * overmap_end)18587 memory_entry_check_for_adjustment(
18588 vm_map_t src_map,
18589 ipc_port_t port,
18590 vm_map_offset_t *overmap_start,
18591 vm_map_offset_t *overmap_end)
18592 {
18593 kern_return_t kr = KERN_SUCCESS;
18594 vm_map_copy_t copy_map = VM_MAP_COPY_NULL, target_copy_map = VM_MAP_COPY_NULL;
18595
18596 assert(port);
18597 assertf(ip_kotype(port) == IKOT_NAMED_ENTRY, "Port Type expected: %d...received:%d\n", IKOT_NAMED_ENTRY, ip_kotype(port));
18598
18599 vm_named_entry_t named_entry;
18600
18601 named_entry = mach_memory_entry_from_port(port);
18602 named_entry_lock(named_entry);
18603 copy_map = named_entry->backing.copy;
18604 target_copy_map = copy_map;
18605
18606 if (src_map && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT) {
18607 vm_map_offset_t trimmed_start;
18608
18609 trimmed_start = 0;
18610 DEBUG4K_ADJUST("adjusting...\n");
18611 kr = vm_map_copy_adjust_to_target(
18612 copy_map,
18613 0, /* offset */
18614 copy_map->size, /* size */
18615 src_map,
18616 FALSE, /* copy */
18617 &target_copy_map,
18618 overmap_start,
18619 overmap_end,
18620 &trimmed_start);
18621 assert(trimmed_start == 0);
18622 }
18623 named_entry_unlock(named_entry);
18624
18625 return kr;
18626 }
18627
18628
18629 /*
18630 * Routine: vm_remap
18631 *
18632 * Map portion of a task's address space.
18633 * Mapped region must not overlap more than
18634 * one vm memory object. Protections and
18635 * inheritance attributes remain the same
18636 * as in the original task and are out parameters.
18637 * Source and Target task can be identical
18638 * Other attributes are identical as for vm_map()
18639 */
18640 kern_return_t
vm_map_remap(vm_map_t target_map,vm_map_address_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_t src_map,vm_map_offset_t memory_address,boolean_t copy,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance)18641 vm_map_remap(
18642 vm_map_t target_map,
18643 vm_map_address_t *address,
18644 vm_map_size_t size,
18645 vm_map_offset_t mask,
18646 vm_map_kernel_flags_t vmk_flags,
18647 vm_map_t src_map,
18648 vm_map_offset_t memory_address,
18649 boolean_t copy,
18650 vm_prot_t *cur_protection, /* IN/OUT */
18651 vm_prot_t *max_protection, /* IN/OUT */
18652 vm_inherit_t inheritance)
18653 {
18654 kern_return_t result;
18655 vm_map_entry_t entry;
18656 vm_map_entry_t insp_entry = VM_MAP_ENTRY_NULL;
18657 vm_map_entry_t new_entry;
18658 vm_map_copy_t copy_map;
18659 vm_map_offset_t offset_in_mapping;
18660 vm_map_size_t target_size = 0;
18661 vm_map_size_t src_page_mask, target_page_mask;
18662 vm_map_offset_t overmap_start, overmap_end, trimmed_start;
18663 vm_map_offset_t initial_memory_address;
18664 vm_map_size_t initial_size;
18665 VM_MAP_ZAP_DECLARE(zap_list);
18666
18667 if (target_map == VM_MAP_NULL) {
18668 return KERN_INVALID_ARGUMENT;
18669 }
18670
18671 if (__improbable(vm_map_range_overflows(src_map, memory_address, size))) {
18672 return KERN_INVALID_ARGUMENT;
18673 }
18674
18675 initial_memory_address = memory_address;
18676 initial_size = size;
18677 src_page_mask = VM_MAP_PAGE_MASK(src_map);
18678 target_page_mask = VM_MAP_PAGE_MASK(target_map);
18679
18680 switch (inheritance) {
18681 case VM_INHERIT_NONE:
18682 case VM_INHERIT_COPY:
18683 case VM_INHERIT_SHARE:
18684 if (size != 0 && src_map != VM_MAP_NULL) {
18685 break;
18686 }
18687 OS_FALLTHROUGH;
18688 default:
18689 return KERN_INVALID_ARGUMENT;
18690 }
18691
18692 if (src_page_mask != target_page_mask) {
18693 if (copy) {
18694 DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
18695 } else {
18696 DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
18697 }
18698 }
18699
18700 /*
18701 * If the user is requesting that we return the address of the
18702 * first byte of the data (rather than the base of the page),
18703 * then we use different rounding semantics: specifically,
18704 * we assume that (memory_address, size) describes a region
18705 * all of whose pages we must cover, rather than a base to be truncated
18706 * down and a size to be added to that base. So we figure out
18707 * the highest page that the requested region includes and make
18708 * sure that the size will cover it.
18709 *
18710 * The key example we're worried about it is of the form:
18711 *
18712 * memory_address = 0x1ff0, size = 0x20
18713 *
18714 * With the old semantics, we round down the memory_address to 0x1000
18715 * and round up the size to 0x1000, resulting in our covering *only*
18716 * page 0x1000. With the new semantics, we'd realize that the region covers
18717 * 0x1ff0-0x2010, and compute a size of 0x2000. Thus, we cover both page
18718 * 0x1000 and page 0x2000 in the region we remap.
18719 */
18720 if (vmk_flags.vmf_return_data_addr) {
18721 vm_map_offset_t range_start, range_end;
18722
18723 range_start = vm_map_trunc_page(memory_address, src_page_mask);
18724 range_end = vm_map_round_page(memory_address + size, src_page_mask);
18725 memory_address = range_start;
18726 size = range_end - range_start;
18727 offset_in_mapping = initial_memory_address - memory_address;
18728 } else {
18729 /*
18730 * IMPORTANT:
18731 * This legacy code path is broken: for the range mentioned
18732 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
18733 * two 4k pages, it yields [ memory_address = 0x1000,
18734 * size = 0x1000 ], which covers only the first 4k page.
18735 * BUT some code unfortunately depends on this bug, so we
18736 * can't fix it without breaking something.
18737 * New code should get automatically opted in the new
18738 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flags.
18739 */
18740 offset_in_mapping = 0;
18741 memory_address = vm_map_trunc_page(memory_address, src_page_mask);
18742 size = vm_map_round_page(size, src_page_mask);
18743 initial_memory_address = memory_address;
18744 initial_size = size;
18745 }
18746
18747
18748 if (size == 0) {
18749 return KERN_INVALID_ARGUMENT;
18750 }
18751
18752 if (vmk_flags.vmf_resilient_media) {
18753 /* must be copy-on-write to be "media resilient" */
18754 if (!copy) {
18755 return KERN_INVALID_ARGUMENT;
18756 }
18757 }
18758
18759 vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
18760 vmk_flags.vmkf_copy_same_map = (src_map == target_map);
18761
18762 assert(size != 0);
18763 result = vm_map_copy_extract(src_map,
18764 memory_address,
18765 size,
18766 copy, ©_map,
18767 cur_protection, /* IN/OUT */
18768 max_protection, /* IN/OUT */
18769 inheritance,
18770 vmk_flags);
18771 if (result != KERN_SUCCESS) {
18772 return result;
18773 }
18774 assert(copy_map != VM_MAP_COPY_NULL);
18775
18776 /*
18777 * Handle the policy for vm map ranges
18778 *
18779 * If the maps differ, the target_map policy applies like for vm_map()
18780 * For same mapping remaps, we preserve the range.
18781 */
18782 if (vmk_flags.vmkf_copy_same_map) {
18783 vmk_flags.vmkf_range_id = copy_map->orig_range;
18784 } else {
18785 vm_map_kernel_flags_update_range_id(&vmk_flags, target_map);
18786 }
18787
18788 overmap_start = 0;
18789 overmap_end = 0;
18790 trimmed_start = 0;
18791 target_size = size;
18792 if (src_page_mask != target_page_mask) {
18793 vm_map_copy_t target_copy_map;
18794
18795 target_copy_map = copy_map; /* can modify "copy_map" itself */
18796 DEBUG4K_ADJUST("adjusting...\n");
18797 result = vm_map_copy_adjust_to_target(
18798 copy_map,
18799 offset_in_mapping, /* offset */
18800 initial_size,
18801 target_map,
18802 copy,
18803 &target_copy_map,
18804 &overmap_start,
18805 &overmap_end,
18806 &trimmed_start);
18807 if (result != KERN_SUCCESS) {
18808 DEBUG4K_COPY("failed to adjust 0x%x\n", result);
18809 vm_map_copy_discard(copy_map);
18810 return result;
18811 }
18812 if (trimmed_start == 0) {
18813 /* nothing trimmed: no adjustment needed */
18814 } else if (trimmed_start >= offset_in_mapping) {
18815 /* trimmed more than offset_in_mapping: nothing left */
18816 assert(overmap_start == 0);
18817 assert(overmap_end == 0);
18818 offset_in_mapping = 0;
18819 } else {
18820 /* trimmed some of offset_in_mapping: adjust */
18821 assert(overmap_start == 0);
18822 assert(overmap_end == 0);
18823 offset_in_mapping -= trimmed_start;
18824 }
18825 offset_in_mapping += overmap_start;
18826 target_size = target_copy_map->size;
18827 }
18828
18829 /*
18830 * Allocate/check a range of free virtual address
18831 * space for the target
18832 */
18833 *address = vm_map_trunc_page(*address, target_page_mask);
18834 vm_map_lock(target_map);
18835 target_size = vm_map_round_page(target_size, target_page_mask);
18836 result = vm_map_remap_range_allocate(target_map, address,
18837 target_size, mask, vmk_flags,
18838 &insp_entry, &zap_list);
18839
18840 for (entry = vm_map_copy_first_entry(copy_map);
18841 entry != vm_map_copy_to_entry(copy_map);
18842 entry = new_entry) {
18843 new_entry = entry->vme_next;
18844 vm_map_copy_entry_unlink(copy_map, entry);
18845 if (result == KERN_SUCCESS) {
18846 if (vmk_flags.vmkf_remap_prot_copy) {
18847 /*
18848 * This vm_map_remap() is for a
18849 * vm_protect(VM_PROT_COPY), so the caller
18850 * expects to be allowed to add write access
18851 * to this new mapping. This is done by
18852 * adding VM_PROT_WRITE to each entry's
18853 * max_protection... unless some security
18854 * settings disallow it.
18855 */
18856 bool allow_write = false;
18857 if (entry->vme_permanent) {
18858 /* immutable mapping... */
18859 if ((entry->max_protection & VM_PROT_EXECUTE) &&
18860 developer_mode_state()) {
18861 /*
18862 * ... but executable and
18863 * possibly being debugged,
18864 * so let's allow it to become
18865 * writable, for breakpoints
18866 * and dtrace probes, for
18867 * example.
18868 */
18869 allow_write = true;
18870 } else {
18871 printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
18872 proc_selfpid(),
18873 (get_bsdtask_info(current_task())
18874 ? proc_name_address(get_bsdtask_info(current_task()))
18875 : "?"),
18876 (uint64_t)memory_address,
18877 (uint64_t)size,
18878 entry->protection,
18879 entry->max_protection,
18880 developer_mode_state());
18881 DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
18882 vm_map_entry_t, entry,
18883 vm_map_offset_t, entry->vme_start,
18884 vm_map_offset_t, entry->vme_end,
18885 vm_prot_t, entry->protection,
18886 vm_prot_t, entry->max_protection,
18887 int, VME_ALIAS(entry));
18888 }
18889 } else {
18890 allow_write = true;
18891 }
18892
18893 /*
18894 * VM_PROT_COPY: allow this mapping to become
18895 * writable, unless it was "permanent".
18896 */
18897 if (allow_write) {
18898 entry->max_protection |= VM_PROT_WRITE;
18899 }
18900 }
18901 if (vmk_flags.vmf_resilient_codesign) {
18902 /* no codesigning -> read-only access */
18903 entry->max_protection = VM_PROT_READ;
18904 entry->protection = VM_PROT_READ;
18905 entry->vme_resilient_codesign = TRUE;
18906 }
18907 entry->vme_start += *address;
18908 entry->vme_end += *address;
18909 assert(!entry->map_aligned);
18910 if (vmk_flags.vmf_resilient_media &&
18911 !entry->is_sub_map &&
18912 (VME_OBJECT(entry) == VM_OBJECT_NULL ||
18913 VME_OBJECT(entry)->internal)) {
18914 entry->vme_resilient_media = TRUE;
18915 }
18916 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
18917 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
18918 assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
18919 vm_map_store_entry_link(target_map, insp_entry, entry,
18920 vmk_flags);
18921 insp_entry = entry;
18922 } else {
18923 if (!entry->is_sub_map) {
18924 vm_object_deallocate(VME_OBJECT(entry));
18925 } else {
18926 vm_map_deallocate(VME_SUBMAP(entry));
18927 }
18928 vm_map_copy_entry_dispose(entry);
18929 }
18930 }
18931
18932 if (vmk_flags.vmf_resilient_codesign) {
18933 *cur_protection = VM_PROT_READ;
18934 *max_protection = VM_PROT_READ;
18935 }
18936
18937 if (result == KERN_SUCCESS) {
18938 target_map->size += target_size;
18939 SAVE_HINT_MAP_WRITE(target_map, insp_entry);
18940 }
18941 vm_map_unlock(target_map);
18942
18943 vm_map_zap_dispose(&zap_list);
18944
18945 if (result == KERN_SUCCESS && target_map->wiring_required) {
18946 result = vm_map_wire_kernel(target_map, *address,
18947 *address + size, *cur_protection, VM_KERN_MEMORY_MLOCK,
18948 TRUE);
18949 }
18950
18951 /*
18952 * If requested, return the address of the data pointed to by the
18953 * request, rather than the base of the resulting page.
18954 */
18955 if (vmk_flags.vmf_return_data_addr) {
18956 *address += offset_in_mapping;
18957 }
18958
18959 if (src_page_mask != target_page_mask) {
18960 DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)size, copy, target_map, (uint64_t)*address, (uint64_t)offset_in_mapping, result);
18961 }
18962 vm_map_copy_discard(copy_map);
18963 copy_map = VM_MAP_COPY_NULL;
18964
18965 return result;
18966 }
18967
18968 /*
18969 * Routine: vm_map_remap_range_allocate
18970 *
18971 * Description:
18972 * Allocate a range in the specified virtual address map.
18973 * returns the address and the map entry just before the allocated
18974 * range
18975 *
18976 * Map must be locked.
18977 */
18978
18979 static kern_return_t
vm_map_remap_range_allocate(vm_map_t map,vm_map_address_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_entry_t * map_entry,vm_map_zap_t zap_list)18980 vm_map_remap_range_allocate(
18981 vm_map_t map,
18982 vm_map_address_t *address, /* IN/OUT */
18983 vm_map_size_t size,
18984 vm_map_offset_t mask,
18985 vm_map_kernel_flags_t vmk_flags,
18986 vm_map_entry_t *map_entry, /* OUT */
18987 vm_map_zap_t zap_list)
18988 {
18989 vm_map_entry_t entry;
18990 vm_map_offset_t start;
18991 kern_return_t kr;
18992
18993 start = *address;
18994
18995 if (!vmk_flags.vmf_fixed) {
18996 kr = vm_map_locate_space(map, size, mask, vmk_flags,
18997 &start, &entry);
18998 if (kr != KERN_SUCCESS) {
18999 return kr;
19000 }
19001 *address = start;
19002 } else {
19003 vm_map_offset_t effective_min_offset, effective_max_offset;
19004 vm_map_entry_t temp_entry;
19005 vm_map_offset_t end;
19006
19007 effective_min_offset = map->min_offset;
19008 effective_max_offset = map->max_offset;
19009
19010 /*
19011 * Verify that:
19012 * the address doesn't itself violate
19013 * the mask requirement.
19014 */
19015
19016 if ((start & mask) != 0) {
19017 return KERN_NO_SPACE;
19018 }
19019
19020 #if CONFIG_MAP_RANGES
19021 if (map->uses_user_ranges) {
19022 struct mach_vm_range r;
19023
19024 vm_map_user_range_resolve(map, start, 1, &r);
19025 if (r.max_address == 0) {
19026 return KERN_INVALID_ADDRESS;
19027 }
19028
19029 effective_min_offset = r.min_address;
19030 effective_max_offset = r.max_address;
19031 }
19032 #endif /* CONFIG_MAP_RANGES */
19033 if (map == kernel_map) {
19034 mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
19035 effective_min_offset = r->min_address;
19036 effective_min_offset = r->max_address;
19037 }
19038
19039 /*
19040 * ... the address is within bounds
19041 */
19042
19043 end = start + size;
19044
19045 if ((start < effective_min_offset) ||
19046 (end > effective_max_offset) ||
19047 (start >= end)) {
19048 return KERN_INVALID_ADDRESS;
19049 }
19050
19051 /*
19052 * If we're asked to overwrite whatever was mapped in that
19053 * range, first deallocate that range.
19054 */
19055 if (vmk_flags.vmf_overwrite) {
19056 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
19057
19058 /*
19059 * We use a "zap_list" to avoid having to unlock
19060 * the "map" in vm_map_delete(), which would compromise
19061 * the atomicity of the "deallocate" and then "remap"
19062 * combination.
19063 */
19064 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
19065
19066 if (vmk_flags.vmkf_overwrite_immutable) {
19067 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
19068 }
19069 if (vmk_flags.vmkf_remap_prot_copy) {
19070 remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
19071 }
19072 kr = vm_map_delete(map, start, end, remove_flags,
19073 KMEM_GUARD_NONE, zap_list).kmr_return;
19074 if (kr != KERN_SUCCESS) {
19075 /* XXX FBDP restore zap_list? */
19076 return kr;
19077 }
19078 }
19079
19080 /*
19081 * ... the starting address isn't allocated
19082 */
19083
19084 if (vm_map_lookup_entry(map, start, &temp_entry)) {
19085 return KERN_NO_SPACE;
19086 }
19087
19088 entry = temp_entry;
19089
19090 /*
19091 * ... the next region doesn't overlap the
19092 * end point.
19093 */
19094
19095 if ((entry->vme_next != vm_map_to_entry(map)) &&
19096 (entry->vme_next->vme_start < end)) {
19097 return KERN_NO_SPACE;
19098 }
19099 }
19100 *map_entry = entry;
19101 return KERN_SUCCESS;
19102 }
19103
19104 /*
19105 * vm_map_switch:
19106 *
19107 * Set the address map for the current thread to the specified map
19108 */
19109
19110 vm_map_t
vm_map_switch(vm_map_t map)19111 vm_map_switch(
19112 vm_map_t map)
19113 {
19114 thread_t thread = current_thread();
19115 vm_map_t oldmap = thread->map;
19116
19117
19118 /*
19119 * Deactivate the current map and activate the requested map
19120 */
19121 mp_disable_preemption();
19122 PMAP_SWITCH_USER(thread, map, cpu_number());
19123 mp_enable_preemption();
19124 return oldmap;
19125 }
19126
19127
19128 /*
19129 * Routine: vm_map_write_user
19130 *
19131 * Description:
19132 * Copy out data from a kernel space into space in the
19133 * destination map. The space must already exist in the
19134 * destination map.
19135 * NOTE: This routine should only be called by threads
19136 * which can block on a page fault. i.e. kernel mode user
19137 * threads.
19138 *
19139 */
19140 kern_return_t
vm_map_write_user(vm_map_t map,void * src_p,vm_map_address_t dst_addr,vm_size_t size)19141 vm_map_write_user(
19142 vm_map_t map,
19143 void *src_p,
19144 vm_map_address_t dst_addr,
19145 vm_size_t size)
19146 {
19147 kern_return_t kr = KERN_SUCCESS;
19148
19149 if (__improbable(vm_map_range_overflows(map, dst_addr, size))) {
19150 return KERN_INVALID_ADDRESS;
19151 }
19152
19153 if (current_map() == map) {
19154 if (copyout(src_p, dst_addr, size)) {
19155 kr = KERN_INVALID_ADDRESS;
19156 }
19157 } else {
19158 vm_map_t oldmap;
19159
19160 /* take on the identity of the target map while doing */
19161 /* the transfer */
19162
19163 vm_map_reference(map);
19164 oldmap = vm_map_switch(map);
19165 if (copyout(src_p, dst_addr, size)) {
19166 kr = KERN_INVALID_ADDRESS;
19167 }
19168 vm_map_switch(oldmap);
19169 vm_map_deallocate(map);
19170 }
19171 return kr;
19172 }
19173
19174 /*
19175 * Routine: vm_map_read_user
19176 *
19177 * Description:
19178 * Copy in data from a user space source map into the
19179 * kernel map. The space must already exist in the
19180 * kernel map.
19181 * NOTE: This routine should only be called by threads
19182 * which can block on a page fault. i.e. kernel mode user
19183 * threads.
19184 *
19185 */
19186 kern_return_t
vm_map_read_user(vm_map_t map,vm_map_address_t src_addr,void * dst_p,vm_size_t size)19187 vm_map_read_user(
19188 vm_map_t map,
19189 vm_map_address_t src_addr,
19190 void *dst_p,
19191 vm_size_t size)
19192 {
19193 kern_return_t kr = KERN_SUCCESS;
19194
19195 if (__improbable(vm_map_range_overflows(map, src_addr, size))) {
19196 return KERN_INVALID_ADDRESS;
19197 }
19198
19199 if (current_map() == map) {
19200 if (copyin(src_addr, dst_p, size)) {
19201 kr = KERN_INVALID_ADDRESS;
19202 }
19203 } else {
19204 vm_map_t oldmap;
19205
19206 /* take on the identity of the target map while doing */
19207 /* the transfer */
19208
19209 vm_map_reference(map);
19210 oldmap = vm_map_switch(map);
19211 if (copyin(src_addr, dst_p, size)) {
19212 kr = KERN_INVALID_ADDRESS;
19213 }
19214 vm_map_switch(oldmap);
19215 vm_map_deallocate(map);
19216 }
19217 return kr;
19218 }
19219
19220
19221 /*
19222 * vm_map_check_protection:
19223 *
19224 * Assert that the target map allows the specified
19225 * privilege on the entire address region given.
19226 * The entire region must be allocated.
19227 */
19228 boolean_t
vm_map_check_protection(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t protection)19229 vm_map_check_protection(vm_map_t map, vm_map_offset_t start,
19230 vm_map_offset_t end, vm_prot_t protection)
19231 {
19232 vm_map_entry_t entry;
19233 vm_map_entry_t tmp_entry;
19234
19235 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
19236 return FALSE;
19237 }
19238
19239 vm_map_lock(map);
19240
19241 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
19242 vm_map_unlock(map);
19243 return FALSE;
19244 }
19245
19246 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19247 vm_map_unlock(map);
19248 return FALSE;
19249 }
19250
19251 entry = tmp_entry;
19252
19253 while (start < end) {
19254 if (entry == vm_map_to_entry(map)) {
19255 vm_map_unlock(map);
19256 return FALSE;
19257 }
19258
19259 /*
19260 * No holes allowed!
19261 */
19262
19263 if (start < entry->vme_start) {
19264 vm_map_unlock(map);
19265 return FALSE;
19266 }
19267
19268 /*
19269 * Check protection associated with entry.
19270 */
19271
19272 if ((entry->protection & protection) != protection) {
19273 vm_map_unlock(map);
19274 return FALSE;
19275 }
19276
19277 /* go to next entry */
19278
19279 start = entry->vme_end;
19280 entry = entry->vme_next;
19281 }
19282 vm_map_unlock(map);
19283 return TRUE;
19284 }
19285
/*
 * Get, set, or purge the purgeability state of the object mapped at
 * "address" in "map".
 *
 *	map:		target map (must not be VM_MAP_NULL)
 *	address:	address within a mapping of a purgeable object
 *	control:	VM_PURGABLE_{SET_STATE,GET_STATE,PURGE_ALL,
 *			SET_STATE_FROM_KERNEL}
 *	state:		IN for SET_STATE*, OUT for GET_STATE
 *
 * Returns KERN_INVALID_ARGUMENT for bad parameters or a non-purgeable
 * object, KERN_INVALID_ADDRESS for an unmapped/submap address, and
 * KERN_PROTECTION_FAILURE when attempting to change state through a
 * non-writable mapping.
 */
kern_return_t
vm_map_purgable_control(
	vm_map_t map,
	vm_map_offset_t address,
	vm_purgable_t control,
	int *state)
{
	vm_map_entry_t entry;
	vm_object_t object;
	kern_return_t kr;
	boolean_t was_nonvolatile;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlaying object. Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (control != VM_PURGABLE_SET_STATE &&
	    control != VM_PURGABLE_GET_STATE &&
	    control != VM_PURGABLE_PURGE_ALL &&
	    control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (control == VM_PURGABLE_PURGE_ALL) {
		/* system-wide purge: no specific mapping involved */
		vm_purgeable_object_purge_all();
		return KERN_SUCCESS;
	}

	/* for SET_STATE*, reject state values outside the valid masks */
	if ((control == VM_PURGABLE_SET_STATE ||
	    control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
	    (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
	    ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
		return KERN_INVALID_ARGUMENT;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->protection & VM_PROT_WRITE) == 0 &&
	    control != VM_PURGABLE_GET_STATE) {
		/*
		 * Can't apply purgable controls to something you can't write.
		 */
		vm_map_unlock_read(map);
		return KERN_PROTECTION_FAILURE;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL ||
	    object->purgable == VM_PURGABLE_DENY) {
		/*
		 * Object must already be present and be purgeable.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	/* take the object lock before dropping the map lock */
	vm_object_lock(object);

#if 00
	if (VME_OFFSET(entry) != 0 ||
	    entry->vme_end - entry->vme_start != object->vo_size) {
		/*
		 * Can only apply purgable controls to the whole (existing)
		 * object at once.
		 */
		vm_map_unlock_read(map);
		vm_object_unlock(object);
		return KERN_INVALID_ARGUMENT;
	}
#endif

	assert(!entry->is_sub_map);
	assert(!entry->use_pmap); /* purgeable has its own accounting */

	/* object is locked: safe to drop the map lock now */
	vm_map_unlock_read(map);

	was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);

	kr = vm_object_purgable_control(object, control, state);

	if (was_nonvolatile &&
	    object->purgable != VM_PURGABLE_NONVOLATILE &&
	    map->pmap == kernel_pmap) {
#if DEBUG
		/* record who made this kernel object volatile, for debugging */
		object->vo_purgeable_volatilizer = kernel_task;
#endif /* DEBUG */
	}

	vm_object_unlock(object);

	return kr;
}
19390
19391 void
vm_map_footprint_query_page_info(vm_map_t map,vm_map_entry_t map_entry,vm_map_offset_t curr_s_offset,int * disposition_p)19392 vm_map_footprint_query_page_info(
19393 vm_map_t map,
19394 vm_map_entry_t map_entry,
19395 vm_map_offset_t curr_s_offset,
19396 int *disposition_p)
19397 {
19398 int pmap_disp;
19399 vm_object_t object = VM_OBJECT_NULL;
19400 int disposition;
19401 int effective_page_size;
19402
19403 vm_map_lock_assert_held(map);
19404 assert(!map->has_corpse_footprint);
19405 assert(curr_s_offset >= map_entry->vme_start);
19406 assert(curr_s_offset < map_entry->vme_end);
19407
19408 if (map_entry->is_sub_map) {
19409 if (!map_entry->use_pmap) {
19410 /* nested pmap: no footprint */
19411 *disposition_p = 0;
19412 return;
19413 }
19414 } else {
19415 object = VME_OBJECT(map_entry);
19416 if (object == VM_OBJECT_NULL) {
19417 /* nothing mapped here: no need to ask */
19418 *disposition_p = 0;
19419 return;
19420 }
19421 }
19422
19423 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
19424
19425 pmap_disp = 0;
19426
19427 /*
19428 * Query the pmap.
19429 */
19430 pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);
19431
19432 /*
19433 * Compute this page's disposition.
19434 */
19435 disposition = 0;
19436
19437 /* deal with "alternate accounting" first */
19438 if (!map_entry->is_sub_map &&
19439 object->vo_no_footprint) {
19440 /* does not count in footprint */
19441 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19442 } else if (!map_entry->is_sub_map &&
19443 (object->purgable == VM_PURGABLE_NONVOLATILE ||
19444 (object->purgable == VM_PURGABLE_DENY &&
19445 object->vo_ledger_tag)) &&
19446 VM_OBJECT_OWNER(object) != NULL &&
19447 VM_OBJECT_OWNER(object)->map == map) {
19448 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19449 if ((((curr_s_offset
19450 - map_entry->vme_start
19451 + VME_OFFSET(map_entry))
19452 / effective_page_size) <
19453 (object->resident_page_count +
19454 vm_compressor_pager_get_count(object->pager)))) {
19455 /*
19456 * Non-volatile purgeable object owned
19457 * by this task: report the first
19458 * "#resident + #compressed" pages as
19459 * "resident" (to show that they
19460 * contribute to the footprint) but not
19461 * "dirty" (to avoid double-counting
19462 * with the fake "non-volatile" region
19463 * we'll report at the end of the
19464 * address space to account for all
19465 * (mapped or not) non-volatile memory
19466 * owned by this task.
19467 */
19468 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19469 }
19470 } else if (!map_entry->is_sub_map &&
19471 (object->purgable == VM_PURGABLE_VOLATILE ||
19472 object->purgable == VM_PURGABLE_EMPTY) &&
19473 VM_OBJECT_OWNER(object) != NULL &&
19474 VM_OBJECT_OWNER(object)->map == map) {
19475 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19476 if ((((curr_s_offset
19477 - map_entry->vme_start
19478 + VME_OFFSET(map_entry))
19479 / effective_page_size) <
19480 object->wired_page_count)) {
19481 /*
19482 * Volatile|empty purgeable object owned
19483 * by this task: report the first
19484 * "#wired" pages as "resident" (to
19485 * show that they contribute to the
19486 * footprint) but not "dirty" (to avoid
19487 * double-counting with the fake
19488 * "non-volatile" region we'll report
19489 * at the end of the address space to
19490 * account for all (mapped or not)
19491 * non-volatile memory owned by this
19492 * task.
19493 */
19494 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19495 }
19496 } else if (!map_entry->is_sub_map &&
19497 map_entry->iokit_acct &&
19498 object->internal &&
19499 object->purgable == VM_PURGABLE_DENY) {
19500 /*
19501 * Non-purgeable IOKit memory: phys_footprint
19502 * includes the entire virtual mapping.
19503 */
19504 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19505 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19506 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19507 } else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
19508 PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
19509 /* alternate accounting */
19510 #if __arm64__ && (DEVELOPMENT || DEBUG)
19511 if (map->pmap->footprint_was_suspended) {
19512 /*
19513 * The assertion below can fail if dyld
19514 * suspended footprint accounting
19515 * while doing some adjustments to
19516 * this page; the mapping would say
19517 * "use pmap accounting" but the page
19518 * would be marked "alternate
19519 * accounting".
19520 */
19521 } else
19522 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
19523 {
19524 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19525 }
19526 disposition = 0;
19527 } else {
19528 if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
19529 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19530 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19531 disposition |= VM_PAGE_QUERY_PAGE_REF;
19532 if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
19533 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19534 } else {
19535 disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
19536 }
19537 if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
19538 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
19539 }
19540 } else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
19541 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19542 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
19543 }
19544 }
19545
19546 *disposition_p = disposition;
19547 }
19548
19549 kern_return_t
vm_map_page_query_internal(vm_map_t target_map,vm_map_offset_t offset,int * disposition,int * ref_count)19550 vm_map_page_query_internal(
19551 vm_map_t target_map,
19552 vm_map_offset_t offset,
19553 int *disposition,
19554 int *ref_count)
19555 {
19556 kern_return_t kr;
19557 vm_page_info_basic_data_t info;
19558 mach_msg_type_number_t count;
19559
19560 count = VM_PAGE_INFO_BASIC_COUNT;
19561 kr = vm_map_page_info(target_map,
19562 offset,
19563 VM_PAGE_INFO_BASIC,
19564 (vm_page_info_t) &info,
19565 &count);
19566 if (kr == KERN_SUCCESS) {
19567 *disposition = info.disposition;
19568 *ref_count = info.ref_count;
19569 } else {
19570 *disposition = 0;
19571 *ref_count = 0;
19572 }
19573
19574 return kr;
19575 }
19576
19577 kern_return_t
vm_map_page_info(vm_map_t map,vm_map_offset_t offset,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)19578 vm_map_page_info(
19579 vm_map_t map,
19580 vm_map_offset_t offset,
19581 vm_page_info_flavor_t flavor,
19582 vm_page_info_t info,
19583 mach_msg_type_number_t *count)
19584 {
19585 return vm_map_page_range_info_internal(map,
19586 offset, /* start of range */
19587 (offset + 1), /* this will get rounded in the call to the page boundary */
19588 (int)-1, /* effective_page_shift: unspecified */
19589 flavor,
19590 info,
19591 count);
19592 }
19593
/*
 * vm_map_page_range_info_internal:
 *	Fill "info" with one record per (effective) page describing the
 *	disposition of every page in [start_offset, end_offset) of "map":
 *	present/paged-out, dirty, referenced, reusable, backing object id,
 *	offset within that object and shadow-chain depth.
 *
 *	Only the VM_PAGE_INFO_BASIC flavor is supported.  Submap entries
 *	are handled by recursing into the submap; holes and null-object
 *	entries produce zeroed records.  When the caller asked for
 *	"footprint" semantics (task_self_region_footprint()), dispositions
 *	are synthesized from the pmap/corpse footprint data instead of the
 *	VM objects, and a fake region past the end of the address space
 *	accounts for owned non-volatile purgeable memory.
 *
 *	The map is read-locked while entries are examined; the lock is
 *	dropped while individual objects (and their shadow chains) are
 *	inspected under shared object locks.
 */
kern_return_t
vm_map_page_range_info_internal(
	vm_map_t                map,
	vm_map_offset_t         start_offset,
	vm_map_offset_t         end_offset,
	int                     effective_page_shift,
	vm_page_info_flavor_t   flavor,
	vm_page_info_t          info,
	mach_msg_type_number_t  *count)
{
	vm_map_entry_t          map_entry = VM_MAP_ENTRY_NULL;
	vm_object_t             object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
	vm_page_t               m = VM_PAGE_NULL;
	kern_return_t           retval = KERN_SUCCESS;
	int                     disposition = 0;
	int                     ref_count = 0;
	int                     depth = 0, info_idx = 0;
	vm_page_info_basic_t    basic_info = 0;
	vm_map_offset_t         offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
	vm_map_offset_t         start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
	boolean_t               do_region_footprint;
	ledger_amount_t         ledger_resident, ledger_compressed;
	int                     effective_page_size;
	vm_map_offset_t         effective_page_mask;

	switch (flavor) {
	case VM_PAGE_INFO_BASIC:
		if (*count != VM_PAGE_INFO_BASIC_COUNT) {
			/*
			 * The "vm_page_info_basic_data" structure was not
			 * properly padded, so allow the size to be off by
			 * one to maintain backwards binary compatibility...
			 */
			if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
				return KERN_INVALID_ARGUMENT;
			}
		}
		break;
	default:
		return KERN_INVALID_ARGUMENT;
	}

	/* -1 means "unspecified": derive a safe page shift from the map */
	if (effective_page_shift == -1) {
		effective_page_shift = vm_self_region_page_shift_safely(map);
		if (effective_page_shift == -1) {
			return KERN_INVALID_ARGUMENT;
		}
	}
	effective_page_size = (1 << effective_page_shift);
	effective_page_mask = effective_page_size - 1;

	do_region_footprint = task_self_region_footprint();
	disposition = 0;
	ref_count = 0;
	depth = 0;
	info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
	retval = KERN_SUCCESS;

	if (__improbable(vm_map_range_overflows(map, start_offset, end_offset - start_offset))) {
		return KERN_INVALID_ADDRESS;
	}

	/* remember the sub-page offset so the first record can report it */
	offset_in_page = start_offset & effective_page_mask;
	start = vm_map_trunc_page(start_offset, effective_page_mask);
	end = vm_map_round_page(end_offset, effective_page_mask);

	if (end < start) {
		return KERN_INVALID_ARGUMENT;
	}

	assert((end - start) <= MAX_PAGE_RANGE_QUERY);

	vm_map_lock_read(map);

	task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);

	for (curr_s_offset = start; curr_s_offset < end;) {
		/*
		 * New lookup needs reset of these variables.
		 */
		curr_object = object = VM_OBJECT_NULL;
		offset_in_object = 0;
		ref_count = 0;
		depth = 0;

		if (do_region_footprint &&
		    curr_s_offset >= vm_map_last_entry(map)->vme_end) {
			/*
			 * Request for "footprint" info about a page beyond
			 * the end of address space: this must be for
			 * the fake region vm_map_region_recurse_64()
			 * reported to account for non-volatile purgeable
			 * memory owned by this task.
			 */
			disposition = 0;

			if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
			    (unsigned) ledger_compressed) {
				/*
				 * We haven't reported all the "non-volatile
				 * compressed" pages yet, so report this fake
				 * page as "compressed".
				 */
				disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
			} else {
				/*
				 * We've reported all the non-volatile
				 * compressed page but not all the non-volatile
				 * pages , so report this fake page as
				 * "resident dirty".
				 */
				disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
				disposition |= VM_PAGE_QUERY_PAGE_REF;
			}
			switch (flavor) {
			case VM_PAGE_INFO_BASIC:
				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
				basic_info->disposition = disposition;
				basic_info->ref_count = 1;
				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				basic_info->offset = 0;
				basic_info->depth = 0;

				info_idx++;
				break;
			}
			curr_s_offset += effective_page_size;
			continue;
		}

		/*
		 * First, find the map entry covering "curr_s_offset", going down
		 * submaps if necessary.
		 */
		if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
			/* no entry -> no object -> no page */

			if (curr_s_offset < vm_map_min(map)) {
				/*
				 * Illegal address that falls below map min.
				 */
				curr_e_offset = MIN(end, vm_map_min(map));
			} else if (curr_s_offset >= vm_map_max(map)) {
				/*
				 * Illegal address that falls on/after map max.
				 */
				curr_e_offset = end;
			} else if (map_entry == vm_map_to_entry(map)) {
				/*
				 * Hit a hole.
				 */
				if (map_entry->vme_next == vm_map_to_entry(map)) {
					/*
					 * Empty map.
					 */
					curr_e_offset = MIN(map->max_offset, end);
				} else {
					/*
					 * Hole at start of the map.
					 */
					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
				}
			} else {
				if (map_entry->vme_next == vm_map_to_entry(map)) {
					/*
					 * Hole at the end of the map.
					 */
					curr_e_offset = MIN(map->max_offset, end);
				} else {
					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
				}
			}

			assert(curr_e_offset >= curr_s_offset);

			/* report zeroed records for the entire unmapped span */
			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;

			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));

			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));

			curr_s_offset = curr_e_offset;

			info_idx += num_pages;

			continue;
		}

		/* compute offset from this map entry's start */
		offset_in_object = curr_s_offset - map_entry->vme_start;

		/* compute offset into this map entry's object (or submap) */
		offset_in_object += VME_OFFSET(map_entry);

		if (map_entry->is_sub_map) {
			vm_map_t sub_map = VM_MAP_NULL;
			vm_page_info_t submap_info = 0;
			vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;

			range_len = MIN(map_entry->vme_end, end) - curr_s_offset;

			submap_s_offset = offset_in_object;
			submap_e_offset = submap_s_offset + range_len;

			sub_map = VME_SUBMAP(map_entry);

			/* keep the submap alive while the parent map is unlocked */
			vm_map_reference(sub_map);
			vm_map_unlock_read(map);

			submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));

			assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
			    "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));

			/* recurse into the submap for this sub-range */
			retval = vm_map_page_range_info_internal(sub_map,
			    submap_s_offset,
			    submap_e_offset,
			    effective_page_shift,
			    VM_PAGE_INFO_BASIC,
			    (vm_page_info_t) submap_info,
			    count);

			assert(retval == KERN_SUCCESS);

			vm_map_lock_read(map);
			vm_map_deallocate(sub_map);

			/* Move the "info" index by the number of pages we inspected.*/
			info_idx += range_len >> effective_page_shift;

			/* Move our current offset by the size of the range we inspected.*/
			curr_s_offset += range_len;

			continue;
		}

		object = VME_OBJECT(map_entry);

		if (object == VM_OBJECT_NULL) {
			/*
			 * We don't have an object here and, hence,
			 * no pages to inspect. We'll fill up the
			 * info structure appropriately.
			 */

			curr_e_offset = MIN(map_entry->vme_end, end);

			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;

			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));

			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));

			curr_s_offset = curr_e_offset;

			info_idx += num_pages;

			continue;
		}

		if (do_region_footprint) {
			disposition = 0;
			if (map->has_corpse_footprint) {
				/*
				 * Query the page info data we saved
				 * while forking the corpse.
				 */
				vm_map_corpse_footprint_query_page_info(
					map,
					curr_s_offset,
					&disposition);
			} else {
				/*
				 * Query the live pmap for footprint info
				 * about this page.
				 */
				vm_map_footprint_query_page_info(
					map,
					map_entry,
					curr_s_offset,
					&disposition);
			}
			switch (flavor) {
			case VM_PAGE_INFO_BASIC:
				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
				basic_info->disposition = disposition;
				basic_info->ref_count = 1;
				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				basic_info->offset = 0;
				basic_info->depth = 0;

				info_idx++;
				break;
			}
			curr_s_offset += effective_page_size;
			continue;
		}

		vm_object_reference(object);
		/*
		 * Shared mode -- so we can allow other readers
		 * to grab the lock too.
		 */
		vm_object_lock_shared(object);

		curr_e_offset = MIN(map_entry->vme_end, end);

		vm_map_unlock_read(map);

		map_entry = NULL; /* map is unlocked, the entry is no longer valid. */

		curr_object = object;

		/* walk every page of this entry's range within the object */
		for (; curr_s_offset < curr_e_offset;) {
			if (object == curr_object) {
				ref_count = curr_object->ref_count - 1; /* account for our object reference above. */
			} else {
				ref_count = curr_object->ref_count;
			}

			curr_offset_in_object = offset_in_object;

			for (;;) {
				m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));

				if (m != VM_PAGE_NULL) {
					disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
					break;
				} else {
					if (curr_object->internal &&
					    curr_object->alive &&
					    !curr_object->terminating &&
					    curr_object->pager_ready) {
						if (VM_COMPRESSOR_PAGER_STATE_GET(curr_object, vm_object_trunc_page(curr_offset_in_object))
						    == VM_EXTERNAL_STATE_EXISTS) {
							/* the pager has that page */
							disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
							break;
						}
					}

					/*
					 * Go down the VM object shadow chain until we find the page
					 * we're looking for.
					 */

					if (curr_object->shadow != VM_OBJECT_NULL) {
						vm_object_t shadow = VM_OBJECT_NULL;

						curr_offset_in_object += curr_object->vo_shadow_offset;
						shadow = curr_object->shadow;

						/* lock-ordering: take the shadow's lock before dropping ours */
						vm_object_lock_shared(shadow);
						vm_object_unlock(curr_object);

						curr_object = shadow;
						depth++;
						continue;
					} else {
						break;
					}
				}
			}

			/* The ref_count is not strictly accurate, it measures the number */
			/* of entities holding a ref on the object, they may not be mapping */
			/* the object or may not be mapping the section holding the */
			/* target page but its still a ball park number and though an over- */
			/* count, it picks up the copy-on-write cases */

			/* We could also get a picture of page sharing from pmap_attributes */
			/* but this would under count as only faulted-in mappings would */
			/* show up. */

			if ((curr_object == object) && curr_object->shadow) {
				disposition |= VM_PAGE_QUERY_PAGE_COPIED;
			}

			if (!curr_object->internal) {
				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
			}

			if (m != VM_PAGE_NULL) {
				if (m->vmp_fictitious) {
					disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
				} else {
					/* consult both the page state and the pmap bits */
					if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
						disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
					}

					if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
						disposition |= VM_PAGE_QUERY_PAGE_REF;
					}

					if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
						disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
					}

					/*
					 * XXX TODO4K:
					 * when this routine deals with 4k
					 * pages, check the appropriate CS bit
					 * here.
					 */
					if (m->vmp_cs_validated) {
						disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
					}
					if (m->vmp_cs_tainted) {
						disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
					}
					if (m->vmp_cs_nx) {
						disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
					}
					if (m->vmp_reusable || curr_object->all_reusable) {
						disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
					}
				}
			}

			switch (flavor) {
			case VM_PAGE_INFO_BASIC:
				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
				basic_info->disposition = disposition;
				basic_info->ref_count = ref_count;
				basic_info->object_id = (vm_object_id_t) (uintptr_t)
				    VM_KERNEL_ADDRPERM(curr_object);
				basic_info->offset =
				    (memory_object_offset_t) curr_offset_in_object + offset_in_page;
				basic_info->depth = depth;

				info_idx++;
				break;
			}

			disposition = 0;
			offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.

			/*
			 * Move to next offset in the range and in our object.
			 */
			curr_s_offset += effective_page_size;
			offset_in_object += effective_page_size;
			curr_offset_in_object = offset_in_object;

			if (curr_object != object) {
				/* back to the top of the shadow chain for the next page */
				vm_object_unlock(curr_object);

				curr_object = object;

				vm_object_lock_shared(curr_object);
			} else {
				vm_object_lock_yield_shared(curr_object);
			}
		}

		vm_object_unlock(curr_object);
		vm_object_deallocate(curr_object);

		vm_map_lock_read(map);
	}

	vm_map_unlock_read(map);
	return retval;
}
20059
20060 /*
20061 * vm_map_msync
20062 *
20063 * Synchronises the memory range specified with its backing store
20064 * image by either flushing or cleaning the contents to the appropriate
20065 * memory manager engaging in a memory object synchronize dialog with
20066 * the manager. The client doesn't return until the manager issues
20067 * m_o_s_completed message. MIG Magically converts user task parameter
20068 * to the task's address map.
20069 *
20070 * interpretation of sync_flags
20071 * VM_SYNC_INVALIDATE - discard pages, only return precious
20072 * pages to manager.
20073 *
20074 * VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
20075 * - discard pages, write dirty or precious
20076 * pages back to memory manager.
20077 *
20078 * VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
20079 * - write dirty or precious pages back to
20080 * the memory manager.
20081 *
20082 * VM_SYNC_CONTIGUOUS - does everything normally, but if there
20083 * is a hole in the region, and we would
20084 * have returned KERN_SUCCESS, return
20085 * KERN_INVALID_ADDRESS instead.
20086 *
20087 * NOTE
20088 * The memory object attributes have not yet been implemented, this
20089 * function will have to deal with the invalidate attribute
20090 *
20091 * RETURNS
20092 * KERN_INVALID_TASK Bad task parameter
20093 * KERN_INVALID_ARGUMENT both sync and async were specified.
20094 * KERN_SUCCESS The usual.
20095 * KERN_INVALID_ADDRESS There was a hole in the region.
20096 */
20097
/*
 * See the block comment above for the full contract and the
 * interpretation of "sync_flags".
 */
kern_return_t
vm_map_msync(
	vm_map_t                map,
	vm_map_address_t        address,
	vm_map_size_t           size,
	vm_sync_t               sync_flags)
{
	vm_map_entry_t          entry;
	vm_map_size_t           amount_left;
	vm_object_offset_t      offset;
	vm_object_offset_t      start_offset, end_offset;
	boolean_t               do_sync_req;
	boolean_t               had_hole = FALSE;
	vm_map_offset_t         pmap_offset;

	/* synchronous and asynchronous are mutually exclusive */
	if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
	    (sync_flags & VM_SYNC_SYNCHRONOUS)) {
		return KERN_INVALID_ARGUMENT;
	}

	if (__improbable(vm_map_range_overflows(map, address, size))) {
		return KERN_INVALID_ADDRESS;
	}

	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
		DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
	}

	/*
	 * align address and size on page boundaries
	 */
	size = (vm_map_round_page(address + size,
	    VM_MAP_PAGE_MASK(map)) -
	    vm_map_trunc_page(address,
	    VM_MAP_PAGE_MASK(map)));
	address = vm_map_trunc_page(address,
	    VM_MAP_PAGE_MASK(map));

	if (map == VM_MAP_NULL) {
		return KERN_INVALID_TASK;
	}

	if (size == 0) {
		return KERN_SUCCESS;
	}

	amount_left = size;

	/* process the range one map entry at a time */
	while (amount_left > 0) {
		vm_object_size_t        flush_size;
		vm_object_t             object;

		vm_map_lock(map);
		if (!vm_map_lookup_entry(map,
		    address,
		    &entry)) {
			vm_map_size_t   skip;

			/*
			 * hole in the address map.
			 */
			had_hole = TRUE;

			if (sync_flags & VM_SYNC_KILLPAGES) {
				/*
				 * For VM_SYNC_KILLPAGES, there should be
				 * no holes in the range, since we couldn't
				 * prevent someone else from allocating in
				 * that hole and we wouldn't want to "kill"
				 * their pages.
				 */
				vm_map_unlock(map);
				break;
			}

			/*
			 * Check for empty map.
			 */
			if (entry == vm_map_to_entry(map) &&
			    entry->vme_next == entry) {
				vm_map_unlock(map);
				break;
			}
			/*
			 * Check that we don't wrap and that
			 * we have at least one real map entry.
			 */
			if ((map->hdr.nentries == 0) ||
			    (entry->vme_next->vme_start < address)) {
				vm_map_unlock(map);
				break;
			}
			/*
			 * Move up to the next entry if needed
			 */
			skip = (entry->vme_next->vme_start - address);
			if (skip >= amount_left) {
				amount_left = 0;
			} else {
				amount_left -= skip;
			}
			address = entry->vme_next->vme_start;
			vm_map_unlock(map);
			continue;
		}

		offset = address - entry->vme_start;
		pmap_offset = address;

		/*
		 * do we have more to flush than is contained in this
		 * entry ?
		 */
		if (amount_left + entry->vme_start + offset > entry->vme_end) {
			flush_size = entry->vme_end -
			    (entry->vme_start + offset);
		} else {
			flush_size = amount_left;
		}
		amount_left -= flush_size;
		address += flush_size;

		if (entry->is_sub_map == TRUE) {
			vm_map_t        local_map;
			vm_map_offset_t local_offset;

			local_map = VME_SUBMAP(entry);
			local_offset = VME_OFFSET(entry);
			/* keep the submap alive across the unlocked recursion */
			vm_map_reference(local_map);
			vm_map_unlock(map);
			if (vm_map_msync(
				    local_map,
				    local_offset,
				    flush_size,
				    sync_flags) == KERN_INVALID_ADDRESS) {
				had_hole = TRUE;
			}
			vm_map_deallocate(local_map);
			continue;
		}
		object = VME_OBJECT(entry);

		/*
		 * We can't sync this object if the object has not been
		 * created yet
		 */
		if (object == VM_OBJECT_NULL) {
			vm_map_unlock(map);
			continue;
		}
		offset += VME_OFFSET(entry);

		vm_object_lock(object);

		if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
			int kill_pages = 0;

			if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
				/*
				 * This is a destructive operation and so we
				 * err on the side of limiting the range of
				 * the operation.
				 */
				start_offset = vm_object_round_page(offset);
				end_offset = vm_object_trunc_page(offset + flush_size);

				if (end_offset <= start_offset) {
					vm_object_unlock(object);
					vm_map_unlock(map);
					continue;
				}

				pmap_offset += start_offset - offset;
			} else {
				start_offset = offset;
				end_offset = offset + flush_size;
			}

			if (sync_flags & VM_SYNC_KILLPAGES) {
				/*
				 * Pages can only be "killed" when nobody else
				 * could be relying on their contents: sole
				 * reference, or a non-symmetric copy strategy
				 * with no copy object, and no shadow.
				 */
				if (((object->ref_count == 1) ||
				    ((object->copy_strategy !=
				    MEMORY_OBJECT_COPY_SYMMETRIC) &&
				    (object->vo_copy == VM_OBJECT_NULL))) &&
				    (object->shadow == VM_OBJECT_NULL)) {
					if (object->ref_count != 1) {
						vm_page_stats_reusable.free_shared++;
					}
					kill_pages = 1;
				} else {
					/* can't kill: fall through without deactivating */
					kill_pages = -1;
				}
			}
			if (kill_pages != -1) {
				vm_object_deactivate_pages(
					object,
					start_offset,
					(vm_object_size_t) (end_offset - start_offset),
					kill_pages,
					FALSE, /* reusable_pages */
					FALSE, /* reusable_no_write */
					map->pmap,
					pmap_offset);
			}
			vm_object_unlock(object);
			vm_map_unlock(map);
			continue;
		}
		/*
		 * We can't sync this object if there isn't a pager.
		 * Don't bother to sync internal objects, since there can't
		 * be any "permanent" storage for these objects anyway.
		 */
		if ((object->pager == MEMORY_OBJECT_NULL) ||
		    (object->internal) || (object->private)) {
			vm_object_unlock(object);
			vm_map_unlock(map);
			continue;
		}
		/*
		 * keep reference on the object until syncing is done
		 */
		vm_object_reference_locked(object);
		vm_object_unlock(object);

		vm_map_unlock(map);

		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
			start_offset = vm_object_trunc_page(offset);
			end_offset = vm_object_round_page(offset + flush_size);
		} else {
			start_offset = offset;
			end_offset = offset + flush_size;
		}

		do_sync_req = vm_object_sync(object,
		    start_offset,
		    (end_offset - start_offset),
		    sync_flags & VM_SYNC_INVALIDATE,
		    ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
		    (sync_flags & VM_SYNC_ASYNCHRONOUS)),
		    sync_flags & VM_SYNC_SYNCHRONOUS);

		if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
			/*
			 * clear out the clustering and read-ahead hints
			 */
			vm_object_lock(object);

			object->pages_created = 0;
			object->pages_used = 0;
			object->sequential = 0;
			object->last_alloc = 0;

			vm_object_unlock(object);
		}
		vm_object_deallocate(object);
	} /* while */

	/* for proper msync() behaviour */
	if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
		return KERN_INVALID_ADDRESS;
	}

	return KERN_SUCCESS;
}/* vm_msync */
20363
/*
 * vm_named_entry_associate_vm_object:
 *	Back "named_entry" with "object" by building a single-entry
 *	vm_map_copy describing [offset, offset+size) with the given
 *	protection, and marking the named entry as object-backed.
 *	The named entry must not already be backed by anything.
 */
void
vm_named_entry_associate_vm_object(
	vm_named_entry_t        named_entry,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_object_size_t        size,
	vm_prot_t               prot)
{
	vm_map_copy_t copy;
	vm_map_entry_t copy_entry;

	/* the entry must be completely unbacked at this point */
	assert(!named_entry->is_sub_map);
	assert(!named_entry->is_copy);
	assert(!named_entry->is_object);
	assert(!named_entry->internal);
	assert(named_entry->backing.copy == VM_MAP_COPY_NULL);

	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
	copy->offset = offset;
	copy->size = size;
	copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;

	/* one copy entry spanning the page-rounded range, mapping "object" */
	copy_entry = vm_map_copy_entry_create(copy);
	copy_entry->protection = prot;
	copy_entry->max_protection = prot;
	copy_entry->use_pmap = TRUE;
	copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
	copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
	VME_OBJECT_SET(copy_entry, object, false, 0);
	VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
	vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);

	named_entry->backing.copy = copy;
	named_entry->is_object = TRUE;
	if (object->internal) {
		named_entry->internal = TRUE;
	}

	DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
	    named_entry, copy, object, offset, size, prot);
}
20405
/*
 * vm_named_entry_to_vm_object:
 *	Return the VM object backing an object-backed named entry
 *	(as set up by vm_named_entry_associate_vm_object() above).
 *	The entry must be object-backed: its vm_map_copy must contain
 *	exactly one entry, whose object is returned.  No reference is
 *	taken on the returned object.
 */
vm_object_t
vm_named_entry_to_vm_object(
	vm_named_entry_t        named_entry)
{
	vm_map_copy_t   copy;
	vm_map_entry_t  copy_entry;
	vm_object_t     object;

	assert(!named_entry->is_sub_map);
	assert(!named_entry->is_copy);
	assert(named_entry->is_object);
	copy = named_entry->backing.copy;
	assert(copy != VM_MAP_COPY_NULL);
	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);
	assert(copy->cpy_hdr.nentries == 1);
	copy_entry = vm_map_copy_first_entry(copy);
	object = VME_OBJECT(copy_entry);

	DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);

	return object;
}
20432
20433 /*
20434 * Routine: convert_port_entry_to_map
20435 * Purpose:
20436 * Convert from a port specifying an entry or a task
20437 * to a map. Doesn't consume the port ref; produces a map ref,
20438 * which may be null. Unlike convert_port_to_map, the
20439 * port may be task or a named entry backed.
20440 * Conditions:
20441 * Nothing locked.
20442 */
20443
20444 vm_map_t
convert_port_entry_to_map(ipc_port_t port)20445 convert_port_entry_to_map(
20446 ipc_port_t port)
20447 {
20448 vm_map_t map = VM_MAP_NULL;
20449 vm_named_entry_t named_entry;
20450
20451 if (!IP_VALID(port)) {
20452 return VM_MAP_NULL;
20453 }
20454
20455 if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
20456 return convert_port_to_map(port);
20457 }
20458
20459 named_entry = mach_memory_entry_from_port(port);
20460
20461 if ((named_entry->is_sub_map) &&
20462 (named_entry->protection & VM_PROT_WRITE)) {
20463 map = named_entry->backing.map;
20464 if (map->pmap != PMAP_NULL) {
20465 if (map->pmap == kernel_pmap) {
20466 panic("userspace has access "
20467 "to a kernel map %p", map);
20468 }
20469 pmap_require(map->pmap);
20470 }
20471 vm_map_reference(map);
20472 }
20473
20474 return map;
20475 }
20476
20477 /*
20478 * Export routines to other components for the things we access locally through
20479 * macros.
20480 */
20481 #undef current_map
/*
 * Out-of-line version of the current_map() macro (undefined just
 * above) for components that can't use the inline fast path.
 */
vm_map_t
current_map(void)
{
	return current_map_fast();
}
20487
20488 /*
20489 * vm_map_reference:
20490 *
20491 * Takes a reference on the specified map.
20492 */
20493 void
vm_map_reference(vm_map_t map)20494 vm_map_reference(
20495 vm_map_t map)
20496 {
20497 if (__probable(map != VM_MAP_NULL)) {
20498 vm_map_require(map);
20499 os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
20500 }
20501 }
20502
20503 /*
20504 * vm_map_deallocate:
20505 *
20506 * Removes a reference from the specified map,
20507 * destroying it if no references remain.
20508 * The map should not be locked.
20509 */
20510 void
vm_map_deallocate(vm_map_t map)20511 vm_map_deallocate(
20512 vm_map_t map)
20513 {
20514 if (__probable(map != VM_MAP_NULL)) {
20515 vm_map_require(map);
20516 if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
20517 vm_map_destroy(map);
20518 }
20519 }
20520 }
20521
/*
 * Release a reference held through an inspect-flavored map right.
 * vm_map_inspect_t is a flavored alias for vm_map_t, so this simply
 * forwards to vm_map_deallocate().
 */
void
vm_map_inspect_deallocate(
	vm_map_inspect_t        map)
{
	vm_map_deallocate((vm_map_t)map);
}
20528
/*
 * Release a reference held through a read-flavored map right.
 * vm_map_read_t is a flavored alias for vm_map_t, so this simply
 * forwards to vm_map_deallocate().
 */
void
vm_map_read_deallocate(
	vm_map_read_t      map)
{
	vm_map_deallocate((vm_map_t)map);
}
20535
20536
20537 void
vm_map_disable_NX(vm_map_t map)20538 vm_map_disable_NX(vm_map_t map)
20539 {
20540 if (map == NULL) {
20541 return;
20542 }
20543 if (map->pmap == NULL) {
20544 return;
20545 }
20546
20547 pmap_disable_NX(map->pmap);
20548 }
20549
20550 void
vm_map_disallow_data_exec(vm_map_t map)20551 vm_map_disallow_data_exec(vm_map_t map)
20552 {
20553 if (map == NULL) {
20554 return;
20555 }
20556
20557 map->map_disallow_data_exec = TRUE;
20558 }
20559
20560 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
20561 * more descriptive.
20562 */
20563 void
vm_map_set_32bit(vm_map_t map)20564 vm_map_set_32bit(vm_map_t map)
20565 {
20566 #if defined(__arm64__)
20567 map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
20568 #else
20569 map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
20570 #endif
20571 }
20572
20573
20574 void
vm_map_set_64bit(vm_map_t map)20575 vm_map_set_64bit(vm_map_t map)
20576 {
20577 #if defined(__arm64__)
20578 map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
20579 #else
20580 map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
20581 #endif
20582 }
20583
20584 /*
20585 * Expand the maximum size of an existing map to the maximum supported.
20586 */
void
vm_map_set_jumbo(vm_map_t map)
{
#if defined (__arm64__) && !XNU_TARGET_OS_OSX
	/*
	 * ~0 asks for the largest offset possible; vm_map_set_max_addr
	 * clamps it to what the pmap layer actually supports.
	 */
	vm_map_set_max_addr(map, ~0);
#else /* arm64 */
	/* Jumbo address spaces are only meaningful on embedded arm64. */
	(void) map;
#endif
}
20596
20597 /*
20598 * This map has a JIT entitlement
20599 */
void
vm_map_set_jit_entitled(vm_map_t map)
{
#if defined (__arm64__)
	/* Record the JIT entitlement at the pmap level. */
	pmap_set_jit_entitled(map->pmap);
#else /* arm64 */
	/* JIT entitlement tracking is only wired up on arm64 pmaps. */
	(void) map;
#endif
}
20609
20610 /*
20611 * Get status of this maps TPRO flag
20612 */
boolean_t
vm_map_tpro(vm_map_t map)
{
#if defined (__arm64e__)
	/* TPRO state lives in the pmap. */
	return pmap_get_tpro(map->pmap);
#else /* arm64e */
	/* TPRO hardware support only exists on arm64e. */
	(void) map;
	return FALSE;
#endif
}
20623
20624 /*
20625 * This map has TPRO enabled
20626 */
void
vm_map_set_tpro(vm_map_t map)
{
#if defined (__arm64e__)
	/* Enable TPRO at the pmap level for this map. */
	pmap_set_tpro(map->pmap);
#else /* arm64e */
	/* TPRO hardware support only exists on arm64e. */
	(void) map;
#endif
}
20636
20637 /*
20638 * Does this map have TPRO enforcement enabled
20639 */
20640 boolean_t
vm_map_tpro_enforcement(vm_map_t map)20641 vm_map_tpro_enforcement(vm_map_t map)
20642 {
20643 return map->tpro_enforcement;
20644 }
20645
20646 /*
20647 * Set TPRO enforcement for this map
20648 */
20649 void
vm_map_set_tpro_enforcement(vm_map_t map)20650 vm_map_set_tpro_enforcement(vm_map_t map)
20651 {
20652 if (vm_map_tpro(map)) {
20653 vm_map_lock(map);
20654 map->tpro_enforcement = TRUE;
20655 vm_map_unlock(map);
20656 }
20657 }
20658
20659 /*
20660 * Enable TPRO on the requested region
20661 *
20662 * Note:
20663 * This routine is primarily intended to be called during/soon after map
20664 * creation before the associated task has been released to run. It is only
20665 * currently safe when we have no resident pages.
20666 */
boolean_t
vm_map_set_tpro_range(
	__unused vm_map_t map,
	__unused vm_map_address_t start,
	__unused vm_map_address_t end)
{
	/*
	 * Currently a no-op that always reports success; the per-entry
	 * TPRO state is established elsewhere.
	 */
	return TRUE;
}
20675
20676 /*
20677 * Expand the maximum size of an existing map.
20678 */
void
vm_map_set_max_addr(vm_map_t map, vm_map_offset_t new_max_offset)
{
#if defined(__arm64__)
	vm_map_offset_t max_supported_offset;
	vm_map_offset_t old_max_offset;

	vm_map_lock(map);

	old_max_offset = map->max_offset;
	/* Upper bound on growth: the pmap's "jumbo" limit for this map's width. */
	max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), ARM_PMAP_MAX_OFFSET_JUMBO);

	new_max_offset = trunc_page(new_max_offset);

	/* The address space cannot be shrunk using this routine. */
	if (old_max_offset >= new_max_offset) {
		vm_map_unlock(map);
		return;
	}

	/* Clamp the request to what the pmap supports. */
	if (max_supported_offset < new_max_offset) {
		new_max_offset = max_supported_offset;
	}

	map->max_offset = new_max_offset;

	/* Keep the hole list consistent with the enlarged address space. */
	if (map->holelistenabled) {
		if (map->holes_list->prev->vme_end == old_max_offset) {
			/*
			 * There is already a hole at the end of the map; simply make it bigger.
			 */
			map->holes_list->prev->vme_end = map->max_offset;
		} else {
			/*
			 * There is no hole at the end, so we need to create a new hole
			 * for the new empty space we're creating.
			 */
			struct vm_map_links *new_hole;

			new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
			new_hole->start = old_max_offset;
			new_hole->end = map->max_offset;
			/* Link the new hole in as the last element of the circular list. */
			new_hole->prev = map->holes_list->prev;
			new_hole->next = (struct vm_map_entry *)map->holes_list;
			map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
			map->holes_list->prev = (struct vm_map_entry *)new_hole;
		}
	}

	vm_map_unlock(map);
#else
	/* Growing the address space is only supported on arm64. */
	(void)map;
	(void)new_max_offset;
#endif
}
20734
20735 vm_map_offset_t
vm_compute_max_offset(boolean_t is64)20736 vm_compute_max_offset(boolean_t is64)
20737 {
20738 #if defined(__arm64__)
20739 return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
20740 #else
20741 return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
20742 #endif
20743 }
20744
/*
 * Report how many independently-slid sections ASLR may use and the
 * size of each section, for the given map's platform.
 */
void
vm_map_get_max_aslr_slide_section(
	vm_map_t map __unused,
	int64_t *max_sections,
	int64_t *section_size)
{
#if defined(__arm64__)
	*max_sections = 3;
	*section_size = ARM_TT_TWIG_SIZE;
#else
	/* Non-arm64: a single section with no fixed section size. */
	*max_sections = 1;
	*section_size = 0;
#endif
}
20759
/* Maximum number of pages the main ASLR slide may span for this map. */
uint64_t
vm_map_get_max_aslr_slide_pages(vm_map_t map)
{
#if defined(__arm64__)
	/* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
	 * limited embedded address space; this is also meant to minimize pmap
	 * memory usage on 16KB page systems.
	 */
	return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
#else
	/* 64-bit maps get 2^16 pages of slide, 32-bit maps 2^8. */
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
20773
/* Maximum number of pages the dynamic loader's ASLR slide may span. */
uint64_t
vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
{
#if defined(__arm64__)
	/* We limit the loader slide to 4MB, in order to ensure at least 8 bits
	 * of independent entropy on 16KB page systems.
	 */
	return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
#else
	/* 64-bit maps get 2^16 pages of slide, 32-bit maps 2^8. */
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
20786
20787 boolean_t
vm_map_is_64bit(vm_map_t map)20788 vm_map_is_64bit(
20789 vm_map_t map)
20790 {
20791 return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
20792 }
20793
/*
 * Report whether the map reserves at least pagezero_size bytes of
 * inaccessible address space at the bottom (a "hard" page zero).
 */
boolean_t
vm_map_has_hard_pagezero(
	vm_map_t map,
	vm_map_offset_t pagezero_size)
{
	/*
	 * XXX FBDP
	 * We should lock the VM map (for read) here but we can get away
	 * with it for now because there can't really be any race condition:
	 * the VM map's min_offset is changed only when the VM map is created
	 * and when the zero page is established (when the binary gets loaded),
	 * and this routine gets called only when the task terminates and the
	 * VM map is being torn down, and when a new map is created via
	 * load_machfile()/execve().
	 */
	return map->min_offset >= pagezero_size;
}
20811
20812 /*
20813 * Raise a VM map's maximun offset.
20814 */
20815 kern_return_t
vm_map_raise_max_offset(vm_map_t map,vm_map_offset_t new_max_offset)20816 vm_map_raise_max_offset(
20817 vm_map_t map,
20818 vm_map_offset_t new_max_offset)
20819 {
20820 kern_return_t ret;
20821
20822 vm_map_lock(map);
20823 ret = KERN_INVALID_ADDRESS;
20824
20825 if (new_max_offset >= map->max_offset) {
20826 if (!vm_map_is_64bit(map)) {
20827 if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
20828 map->max_offset = new_max_offset;
20829 ret = KERN_SUCCESS;
20830 }
20831 } else {
20832 if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
20833 map->max_offset = new_max_offset;
20834 ret = KERN_SUCCESS;
20835 }
20836 }
20837 }
20838
20839 vm_map_unlock(map);
20840 return ret;
20841 }
20842
20843
20844 /*
20845 * Raise a VM map's minimum offset.
20846 * To strictly enforce "page zero" reservation.
20847 */
kern_return_t
vm_map_raise_min_offset(
	vm_map_t map,
	vm_map_offset_t new_min_offset)
{
	vm_map_entry_t first_entry;

	/* Round up so the reserved region covers whole map pages. */
	new_min_offset = vm_map_round_page(new_min_offset,
	    VM_MAP_PAGE_MASK(map));

	vm_map_lock(map);

	if (new_min_offset < map->min_offset) {
		/*
		 * Can't move min_offset backwards, as that would expose
		 * a part of the address space that was previously, and for
		 * possibly good reasons, inaccessible.
		 */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}
	if (new_min_offset >= map->max_offset) {
		/* can't go beyond the end of the address space */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	first_entry = vm_map_first_entry(map);
	if (first_entry != vm_map_to_entry(map) &&
	    first_entry->vme_start < new_min_offset) {
		/*
		 * Some memory was already allocated below the new
		 * minimum offset. It's too late to change it now...
		 */
		vm_map_unlock(map);
		return KERN_NO_SPACE;
	}

	map->min_offset = new_min_offset;

	/* Keep the first hole consistent with the raised floor. */
	if (map->holelistenabled) {
		assert(map->holes_list);
		map->holes_list->start = new_min_offset;
		assert(new_min_offset < map->holes_list->end);
	}

	vm_map_unlock(map);

	return KERN_SUCCESS;
}
20898
20899 /*
20900 * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
20901 * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
20902 * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
20903 * have to reach over to the BSD data structures.
20904 */
20905
20906 uint64_t vm_map_set_size_limit_count = 0;
/*
 * Install the RLIMIT_AS-equivalent limit on this map's total size.
 * Fails if the new limit is below the map's current size.
 */
kern_return_t
vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
{
	kern_return_t kr;

	vm_map_lock(map);
	if (new_size_limit < map->size) {
		/* new limit should not be lower than its current size */
		DTRACE_VM2(vm_map_set_size_limit_fail,
		    vm_map_size_t, map->size,
		    uint64_t, new_size_limit);
		kr = KERN_FAILURE;
	} else if (new_size_limit == map->size_limit) {
		/* no change */
		kr = KERN_SUCCESS;
	} else {
		/* set new limit */
		DTRACE_VM2(vm_map_set_size_limit,
		    vm_map_size_t, map->size,
		    uint64_t, new_size_limit);
		if (new_size_limit != RLIM_INFINITY) {
			/* count maps that carry a finite limit (debug stat) */
			vm_map_set_size_limit_count++;
		}
		map->size_limit = new_size_limit;
		kr = KERN_SUCCESS;
	}
	vm_map_unlock(map);
	return kr;
}
20936
20937 uint64_t vm_map_set_data_limit_count = 0;
/*
 * Install the RLIMIT_DATA-equivalent limit on this map.
 * Fails if the new limit is below the map's current size.
 */
kern_return_t
vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
{
	kern_return_t kr;

	vm_map_lock(map);
	if (new_data_limit < map->size) {
		/* new limit should not be lower than its current size */
		DTRACE_VM2(vm_map_set_data_limit_fail,
		    vm_map_size_t, map->size,
		    uint64_t, new_data_limit);
		kr = KERN_FAILURE;
	} else if (new_data_limit == map->data_limit) {
		/* no change */
		kr = KERN_SUCCESS;
	} else {
		/* set new limit */
		DTRACE_VM2(vm_map_set_data_limit,
		    vm_map_size_t, map->size,
		    uint64_t, new_data_limit);
		if (new_data_limit != RLIM_INFINITY) {
			/* count maps that carry a finite limit (debug stat) */
			vm_map_set_data_limit_count++;
		}
		map->data_limit = new_data_limit;
		kr = KERN_SUCCESS;
	}
	vm_map_unlock(map);
	return kr;
}
20967
/* Install the RLIMIT_MEMLOCK-equivalent limit on user-wired memory. */
void
vm_map_set_user_wire_limit(vm_map_t map,
    vm_size_t limit)
{
	vm_map_lock(map);
	map->user_wire_limit = limit;
	vm_map_unlock(map);
}
20976
20977
/* Set or clear the map's "switch protect" flag under the map lock. */
void
vm_map_switch_protect(vm_map_t map,
    boolean_t val)
{
	vm_map_lock(map);
	map->switch_protect = val;
	vm_map_unlock(map);
}
20986
20987 extern int cs_process_enforcement_enable;
20988 boolean_t
vm_map_cs_enforcement(vm_map_t map)20989 vm_map_cs_enforcement(
20990 vm_map_t map)
20991 {
20992 if (cs_process_enforcement_enable) {
20993 return TRUE;
20994 }
20995 return map->cs_enforcement;
20996 }
20997
/* Allow writable+executable (invalid-code) memory for this map. */
kern_return_t
vm_map_cs_wx_enable(
	__unused vm_map_t map)
{
#if CODE_SIGNING_MONITOR
	kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
	/* KERN_NOT_SUPPORTED means no monitor policy applies: treat as success. */
	if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
		return KERN_SUCCESS;
	}
	return ret;
#else
	/* The VM manages WX memory entirely on its own */
	return KERN_SUCCESS;
#endif
}
21013
/* Ask the code-signing monitor (if any) to permit a JIT region here. */
kern_return_t
vm_map_csm_allow_jit(
	__unused vm_map_t map)
{
#if CODE_SIGNING_MONITOR
	return csm_allow_jit_region(vm_map_pmap(map));
#else
	/* No code signing monitor to enforce JIT policy */
	return KERN_SUCCESS;
#endif
}
21025
/* Record whether this map's task is being debugged (CS policy input). */
void
vm_map_cs_debugged_set(
	vm_map_t map,
	boolean_t val)
{
	vm_map_lock(map);
	map->cs_debugged = val;
	vm_map_unlock(map);
}
21035
/*
 * Set the per-map code-signing enforcement flag and mirror it into
 * the pmap, atomically with respect to the map lock.
 */
void
vm_map_cs_enforcement_set(
	vm_map_t map,
	boolean_t val)
{
	vm_map_lock(map);
	map->cs_enforcement = val;
	/* Keep the pmap's view in sync while the lock is held. */
	pmap_set_vm_map_cs_enforced(map->pmap, val);
	vm_map_unlock(map);
}
21046
21047 /*
21048 * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
21049 * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
21050 * bump both counters.
21051 */
21052 void
vm_map_iokit_mapped_region(vm_map_t map,vm_size_t bytes)21053 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
21054 {
21055 pmap_t pmap = vm_map_pmap(map);
21056
21057 ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21058 ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21059 }
21060
21061 void
vm_map_iokit_unmapped_region(vm_map_t map,vm_size_t bytes)21062 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
21063 {
21064 pmap_t pmap = vm_map_pmap(map);
21065
21066 ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21067 ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21068 }
21069
21070 /* Add (generate) code signature for memory range */
21071 #if CONFIG_DYNAMIC_CODE_SIGNING
/*
 * Mark every resident page in [start, end) of the backing object as
 * code-signing validated, disconnecting pmap mappings so any later
 * modification attempt is noticed.  The range must be covered by a
 * single non-submap entry whose object already exists.
 */
kern_return_t
vm_map_sign(vm_map_t map,
    vm_map_offset_t start,
    vm_map_offset_t end)
{
	vm_map_entry_t entry;
	vm_page_t m;
	vm_object_t object;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlying object. Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->vme_start > start) || (entry->vme_end < end)) {
		/*
		 * Map entry doesn't cover the requested range. Not handling
		 * this situation currently.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL) {
		/*
		 * Object must already be present or we can't sign.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	/* Hold the object lock across the page walk; the map lock can go. */
	vm_object_lock(object);
	vm_map_unlock_read(map);

	while (start < end) {
		uint32_t refmod;

		m = vm_page_lookup(object,
		    start - entry->vme_start + VME_OFFSET(entry));
		if (m == VM_PAGE_NULL) {
			/* should we try to fault a page here? we can probably
			 * demand it exists and is locked for this request */
			vm_object_unlock(object);
			return KERN_FAILURE;
		}
		/* deal with special page status */
		if (m->vmp_busy ||
		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
			vm_object_unlock(object);
			return KERN_FAILURE;
		}

		/* Page is OK... now "validate" it */
		/* This is the place where we'll call out to create a code
		 * directory, later */
		/* XXX TODO4K: deal with 4k subpages individually? */
		m->vmp_cs_validated = VMP_CS_ALL_TRUE;

		/* The page is now "clean" for codesigning purposes. That means
		 * we don't consider it as modified (wpmapped) anymore. But
		 * we'll disconnect the page so we note any future modification
		 * attempts. */
		m->vmp_wpmapped = FALSE;
		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

		/* Pull the dirty status from the pmap, since we cleared the
		 * wpmapped bit */
		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
			SET_PAGE_DIRTY(m, FALSE);
		}

		/* On to the next page */
		start += PAGE_SIZE;
	}
	vm_object_unlock(object);

	return KERN_SUCCESS;
}
21168 #endif
21169
/*
 * Reclaim internal, singly-referenced objects from the map by deleting
 * their entries.  The resident/compressed page counts of what was
 * reclaimed are accumulated into the out parameters.
 * NOTE(review): the counters are incremented, not initialized here —
 * presumably callers zero them first; verify at call sites.
 */
kern_return_t
vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
{
	vm_map_entry_t entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t next_entry;
	kern_return_t kr = KERN_SUCCESS;
	VM_MAP_ZAP_DECLARE(zap_list);

	vm_map_lock(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = next_entry) {
		/* Capture the successor first: vm_map_delete unlinks entry. */
		next_entry = entry->vme_next;

		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    (VME_OBJECT(entry)->internal == TRUE) &&
		    (VME_OBJECT(entry)->ref_count == 1)) {
			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);

			(void)vm_map_delete(map, entry->vme_start,
			    entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
			    KMEM_GUARD_NONE, &zap_list);
		}
	}

	vm_map_unlock(map);

	/* Free the zapped entries outside the map lock. */
	vm_map_zap_dispose(&zap_list);

	return kr;
}
21204
21205
21206 #if DEVELOPMENT || DEBUG
21207
/*
 * Debug/development helper: remove all pmap mappings for the map's
 * entries (optionally unnesting shared regions first) and return the
 * map's resident footprint, in map pages, as sampled before the
 * disconnect.
 */
int
vm_map_disconnect_page_mappings(
	vm_map_t map,
	boolean_t do_unnest)
{
	vm_map_entry_t entry;
	ledger_amount_t byte_count = 0;

	if (do_unnest == TRUE) {
#ifndef NO_NESTED_PMAP
		vm_map_lock(map);

		for (entry = vm_map_first_entry(map);
		    entry != vm_map_to_entry(map);
		    entry = entry->vme_next) {
			if (entry->is_sub_map && entry->use_pmap) {
				/*
				 * Make sure the range between the start of this entry and
				 * the end of this entry is no longer nested, so that
				 * we will only remove mappings from the pmap in use by this
				 * this task
				 */
				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
			}
		}
		vm_map_unlock(map);
#endif
	}
	vm_map_lock_read(map);

	/* Sample the resident footprint before tearing mappings down. */
	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		/* Skip entries with no object or physically-contiguous objects. */
		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
		    (VME_OBJECT(entry)->phys_contiguous))) {
			continue;
		}
		if (entry->is_sub_map) {
			assert(!entry->use_pmap);
		}

		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
	}
	vm_map_unlock_read(map);

	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
}
21257
/*
 * Debug/development helper: inject a decompression error for the page
 * backing vaddr, so a later fault on it fails.  Returns
 * KERN_MEMORY_ERROR if no object backs the address,
 * KERN_MEMORY_PRESENT if the page is resident (no pager to poison).
 */
kern_return_t
vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
{
	vm_object_t object = NULL;
	vm_object_offset_t offset;
	vm_prot_t prot;
	boolean_t wired;
	vm_map_version_t version;
	vm_map_t real_map;
	int result = KERN_FAILURE;

	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
	vm_map_lock(map);

	/* Returns with the object locked (exclusive) on success. */
	result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
	    NULL, &real_map, NULL);
	if (object == NULL) {
		result = KERN_MEMORY_ERROR;
	} else if (object->pager) {
		result = vm_compressor_pager_inject_error(object->pager,
		    offset);
	} else {
		result = KERN_MEMORY_PRESENT;
	}

	if (object != NULL) {
		vm_object_unlock(object);
	}

	/*
	 * NOTE(review): this assumes vm_map_lookup_and_lock_object always
	 * stores through real_map, even on failure — verify, since
	 * real_map is read unconditionally below.
	 */
	if (real_map != map) {
		vm_map_unlock(real_map);
	}
	vm_map_unlock(map);

	return result;
}
21295
21296 #endif
21297
21298
21299 #if CONFIG_FREEZE
21300
21301
21302 extern struct freezer_context freezer_context_global;
21303 AbsoluteTime c_freezer_last_yield_ts = 0;
21304
21305 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
21306 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
21307
/*
 * Freeze a task's map: compress (and, with freezer swap, page out) the
 * dirty pages of its internal objects, up to dirty_budget pages.
 *
 * With freezer swap active this runs in two passes over the map:
 * an evaluation pass that only tallies private vs. shared dirty pages
 * (failing early if the process shares too much memory), then a real
 * pass that purges volatile objects and pages everything out.
 * When eval_only is TRUE, only the evaluation pass runs.
 *
 * On failure, *freezer_error_code explains why.  The *_count out
 * parameters are zeroed up front; only wired_count and shared_count
 * are updated by the visible paths below.
 */
kern_return_t
vm_map_freeze(
	task_t task,
	unsigned int *purgeable_count,
	unsigned int *wired_count,
	unsigned int *clean_count,
	unsigned int *dirty_count,
	unsigned int dirty_budget,
	unsigned int *shared_count,
	int *freezer_error_code,
	boolean_t eval_only)
{
	vm_map_entry_t entry2 = VM_MAP_ENTRY_NULL;
	kern_return_t kr = KERN_SUCCESS;
	boolean_t evaluation_phase = TRUE;
	vm_object_t cur_shared_object = NULL;
	int cur_shared_obj_ref_cnt = 0;
	unsigned int dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;

	*purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;

	/*
	 * We need the exclusive lock here so that we can
	 * block any page faults or lookups while we are
	 * in the middle of freezing this vm map.
	 */
	vm_map_t map = task->map;

	vm_map_lock(map);

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	/* Bail out immediately if there is nowhere to put compressed pages. */
	if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
		if (vm_compressor_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
		}

		if (vm_swap_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
		}

		kr = KERN_NO_SPACE;
		goto done;
	}

	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
		/*
		 * In-memory compressor backing the freezer. No disk.
		 * So no need to do the evaluation phase.
		 */
		evaluation_phase = FALSE;

		if (eval_only == TRUE) {
			/*
			 * We don't support 'eval_only' mode
			 * in this non-swap config.
			 */
			*freezer_error_code = FREEZER_ERROR_GENERIC;
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);
	}
again:

	for (entry2 = vm_map_first_entry(map);
	    entry2 != vm_map_to_entry(map);
	    entry2 = entry2->vme_next) {
		vm_object_t src_object;

		if (entry2->is_sub_map) {
			continue;
		}

		/* Only anonymous (internal), non-contiguous objects are frozen. */
		src_object = VME_OBJECT(entry2);
		if (!src_object ||
		    src_object->phys_contiguous ||
		    !src_object->internal) {
			continue;
		}

		/* If eligible, scan the entry, moving eligible pages over to our parent object */

		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
			/*
			 * We skip purgeable objects during evaluation phase only.
			 * If we decide to freeze this process, we'll explicitly
			 * purge these objects before we go around again with
			 * 'evaluation_phase' set to FALSE.
			 */

			if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
				/*
				 * We want to purge objects that may not belong to this task but are mapped
				 * in this task alone. Since we already purged this task's purgeable memory
				 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
				 * on this task's purgeable objects. Hence the check for only volatile objects.
				 */
				if (evaluation_phase == FALSE &&
				    (src_object->purgable == VM_PURGABLE_VOLATILE) &&
				    (src_object->ref_count == 1)) {
					vm_object_lock(src_object);
					vm_object_purge(src_object, 0);
					vm_object_unlock(src_object);
				}
				continue;
			}

			/*
			 * Pages belonging to this object could be swapped to disk.
			 * Make sure it's not a shared object because we could end
			 * up just bringing it back in again.
			 *
			 * We try to optimize somewhat by checking for objects that are mapped
			 * more than once within our own map. But we don't do full searches,
			 * we just look at the entries following our current entry.
			 */

			if (src_object->ref_count > 1) {
				if (src_object != cur_shared_object) {
					/* First sighting: provisionally count it as shared. */
					obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
					dirty_shared_count += obj_pages_snapshot;

					cur_shared_object = src_object;
					cur_shared_obj_ref_cnt = 1;
					continue;
				} else {
					cur_shared_obj_ref_cnt++;
					if (src_object->ref_count == cur_shared_obj_ref_cnt) {
						/*
						 * Fall through to below and treat this object as private.
						 * So deduct its pages from our shared total and add it to the
						 * private total.
						 */

						dirty_shared_count -= obj_pages_snapshot;
						dirty_private_count += obj_pages_snapshot;
					} else {
						continue;
					}
				}
			}


			if (src_object->ref_count == 1) {
				dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
			}

			/* Evaluation pass only tallies; it never pages out. */
			if (evaluation_phase == TRUE) {
				continue;
			}
		}

		uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
		*wired_count += src_object->wired_page_count;

		/* Space may have run out mid-freeze; stop and report. */
		if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
			if (vm_compressor_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
			}

			if (vm_swap_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
			}

			kr = KERN_NO_SPACE;
			break;
		}
		if (paged_out_count >= dirty_budget) {
			break;
		}
		dirty_budget -= paged_out_count;
	}

	/* Report shared memory in MB, not pages. */
	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
	if (evaluation_phase) {
		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;

		if (dirty_shared_count > shared_pages_threshold) {
			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
			kr = KERN_FAILURE;
			goto done;
		}

		/* Too little private memory relative to shared: not worth freezing. */
		if (dirty_shared_count &&
		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
			kr = KERN_FAILURE;
			goto done;
		}

		/* Evaluation passed: reset counters and do the real freeze pass. */
		evaluation_phase = FALSE;
		dirty_shared_count = dirty_private_count = 0;

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);

		if (eval_only) {
			kr = KERN_SUCCESS;
			goto done;
		}

		/* Purge our own purgeable memory before the real pass. */
		vm_purgeable_purge_task_owned(task);

		goto again;
	} else {
		kr = KERN_SUCCESS;
	}

done:
	vm_map_unlock(map);

	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
		vm_object_compressed_freezer_done();
	}
	return kr;
}
21527
21528 #endif
21529
21530 /*
21531 * vm_map_entry_should_cow_for_true_share:
21532 *
21533 * Determines if the map entry should be clipped and setup for copy-on-write
21534 * to avoid applying "true_share" to a large VM object when only a subset is
21535 * targeted.
21536 *
21537 * For now, we target only the map entries created for the Objective C
21538 * Garbage Collector, which initially have the following properties:
21539 * - alias == VM_MEMORY_MALLOC
21540 * - wired_count == 0
21541 * - !needs_copy
21542 * and a VM object with:
21543 * - internal
21544 * - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
21545 * - !true_share
21546 * - vo_size == ANON_CHUNK_SIZE
21547 *
21548 * Only non-kernel map entries.
21549 */
boolean_t
vm_map_entry_should_cow_for_true_share(
	vm_map_entry_t entry)
{
	vm_object_t object;

	if (entry->is_sub_map) {
		/* entry does not point at a VM object */
		return FALSE;
	}

	if (entry->needs_copy) {
		/* already set for copy_on_write: done! */
		return FALSE;
	}

	if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
	    VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
		/* not a malloc heap or Obj-C Garbage Collector heap */
		return FALSE;
	}

	if (entry->wired_count) {
		/* wired: can't change the map entry... */
		vm_counters.should_cow_but_wired++;
		return FALSE;
	}

	object = VME_OBJECT(entry);

	if (object == VM_OBJECT_NULL) {
		/* no object yet... */
		return FALSE;
	}

	if (!object->internal) {
		/* not an internal object */
		return FALSE;
	}

	if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
		/* not the default copy strategy */
		return FALSE;
	}

	if (object->true_share) {
		/* already true_share: too late to avoid it */
		return FALSE;
	}

	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
	    object->vo_size != ANON_CHUNK_SIZE) {
		/* ... not an object created for the ObjC Garbage Collector */
		return FALSE;
	}

	/* 2048 * 4096 = 8MB: the fixed size of a MALLOC_SMALL region */
	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
	    object->vo_size != 2048 * 4096) {
		/* ... not a "MALLOC_SMALL" heap */
		return FALSE;
	}

	/*
	 * All the criteria match: we have a large object being targeted for "true_share".
	 * To limit the adverse side-effects linked with "true_share", tell the caller to
	 * try and avoid setting up the entire object for "true_share" by clipping the
	 * targeted range and setting it up for copy-on-write.
	 */
	return TRUE;
}
21620
21621 uint64_t vm_map_range_overflows_count = 0;
21622 TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE);
21623 bool
vm_map_range_overflows(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size)21624 vm_map_range_overflows(
21625 vm_map_t map,
21626 vm_map_offset_t addr,
21627 vm_map_size_t size)
21628 {
21629 vm_map_offset_t start, end, sum;
21630 vm_map_offset_t pgmask;
21631
21632 if (size == 0) {
21633 /* empty range -> no overflow */
21634 return false;
21635 }
21636 pgmask = vm_map_page_mask(map);
21637 start = vm_map_trunc_page_mask(addr, pgmask);
21638 end = vm_map_round_page_mask(addr + size, pgmask);
21639 if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) {
21640 vm_map_range_overflows_count++;
21641 if (vm_map_range_overflows_log) {
21642 printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n",
21643 proc_selfpid(),
21644 proc_best_name(current_proc()),
21645 (uint64_t)addr,
21646 (uint64_t)size,
21647 (uint64_t)pgmask);
21648 }
21649 DTRACE_VM4(vm_map_range_overflows,
21650 vm_map_t, map,
21651 uint32_t, pgmask,
21652 uint64_t, (uint64_t)addr,
21653 uint64_t, (uint64_t)size);
21654 return true;
21655 }
21656 return false;
21657 }
21658
/*
 * vm_map_round_page_mask:
 *	Round "offset" up according to page mask "mask"
 *	(mask == page_size - 1).  Function wrapper around the
 *	VM_MAP_ROUND_PAGE() macro.
 */
vm_map_offset_t
vm_map_round_page_mask(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	return VM_MAP_ROUND_PAGE(offset, mask);
}
21666
/*
 * vm_map_trunc_page_mask:
 *	Truncate "offset" down according to page mask "mask"
 *	(mask == page_size - 1).  Function wrapper around the
 *	VM_MAP_TRUNC_PAGE() macro.
 */
vm_map_offset_t
vm_map_trunc_page_mask(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	return VM_MAP_TRUNC_PAGE(offset, mask);
}
21674
21675 boolean_t
vm_map_page_aligned(vm_map_offset_t offset,vm_map_offset_t mask)21676 vm_map_page_aligned(
21677 vm_map_offset_t offset,
21678 vm_map_offset_t mask)
21679 {
21680 return ((offset) & mask) == 0;
21681 }
21682
/*
 * vm_map_page_shift:
 *	Return the page shift (log2 of the page size) used by "map".
 *	Function wrapper around the VM_MAP_PAGE_SHIFT() macro.
 */
int
vm_map_page_shift(
	vm_map_t map)
{
	return VM_MAP_PAGE_SHIFT(map);
}
21689
/*
 * vm_map_page_size:
 *	Return the page size used by "map".
 *	Function wrapper around the VM_MAP_PAGE_SIZE() macro.
 */
int
vm_map_page_size(
	vm_map_t map)
{
	return VM_MAP_PAGE_SIZE(map);
}
21696
/*
 * vm_map_page_mask:
 *	Return the page mask (page_size - 1) used by "map".
 *	Function wrapper around the VM_MAP_PAGE_MASK() macro.
 */
vm_map_offset_t
vm_map_page_mask(
	vm_map_t map)
{
	return VM_MAP_PAGE_MASK(map);
}
21703
21704 kern_return_t
vm_map_set_page_shift(vm_map_t map,int pageshift)21705 vm_map_set_page_shift(
21706 vm_map_t map,
21707 int pageshift)
21708 {
21709 if (map->hdr.nentries != 0) {
21710 /* too late to change page size */
21711 return KERN_FAILURE;
21712 }
21713
21714 map->hdr.page_shift = (uint16_t)pageshift;
21715
21716 return KERN_SUCCESS;
21717 }
21718
21719 kern_return_t
vm_map_query_volatile(vm_map_t map,mach_vm_size_t * volatile_virtual_size_p,mach_vm_size_t * volatile_resident_size_p,mach_vm_size_t * volatile_compressed_size_p,mach_vm_size_t * volatile_pmap_size_p,mach_vm_size_t * volatile_compressed_pmap_size_p)21720 vm_map_query_volatile(
21721 vm_map_t map,
21722 mach_vm_size_t *volatile_virtual_size_p,
21723 mach_vm_size_t *volatile_resident_size_p,
21724 mach_vm_size_t *volatile_compressed_size_p,
21725 mach_vm_size_t *volatile_pmap_size_p,
21726 mach_vm_size_t *volatile_compressed_pmap_size_p)
21727 {
21728 mach_vm_size_t volatile_virtual_size;
21729 mach_vm_size_t volatile_resident_count;
21730 mach_vm_size_t volatile_compressed_count;
21731 mach_vm_size_t volatile_pmap_count;
21732 mach_vm_size_t volatile_compressed_pmap_count;
21733 mach_vm_size_t resident_count;
21734 vm_map_entry_t entry;
21735 vm_object_t object;
21736
21737 /* map should be locked by caller */
21738
21739 volatile_virtual_size = 0;
21740 volatile_resident_count = 0;
21741 volatile_compressed_count = 0;
21742 volatile_pmap_count = 0;
21743 volatile_compressed_pmap_count = 0;
21744
21745 for (entry = vm_map_first_entry(map);
21746 entry != vm_map_to_entry(map);
21747 entry = entry->vme_next) {
21748 mach_vm_size_t pmap_resident_bytes, pmap_compressed_bytes;
21749
21750 if (entry->is_sub_map) {
21751 continue;
21752 }
21753 if (!(entry->protection & VM_PROT_WRITE)) {
21754 continue;
21755 }
21756 object = VME_OBJECT(entry);
21757 if (object == VM_OBJECT_NULL) {
21758 continue;
21759 }
21760 if (object->purgable != VM_PURGABLE_VOLATILE &&
21761 object->purgable != VM_PURGABLE_EMPTY) {
21762 continue;
21763 }
21764 if (VME_OFFSET(entry)) {
21765 /*
21766 * If the map entry has been split and the object now
21767 * appears several times in the VM map, we don't want
21768 * to count the object's resident_page_count more than
21769 * once. We count it only for the first one, starting
21770 * at offset 0 and ignore the other VM map entries.
21771 */
21772 continue;
21773 }
21774 resident_count = object->resident_page_count;
21775 if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
21776 resident_count = 0;
21777 } else {
21778 resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
21779 }
21780
21781 volatile_virtual_size += entry->vme_end - entry->vme_start;
21782 volatile_resident_count += resident_count;
21783 if (object->pager) {
21784 volatile_compressed_count +=
21785 vm_compressor_pager_get_count(object->pager);
21786 }
21787 pmap_compressed_bytes = 0;
21788 pmap_resident_bytes =
21789 pmap_query_resident(map->pmap,
21790 entry->vme_start,
21791 entry->vme_end,
21792 &pmap_compressed_bytes);
21793 volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
21794 volatile_compressed_pmap_count += (pmap_compressed_bytes
21795 / PAGE_SIZE);
21796 }
21797
21798 /* map is still locked on return */
21799
21800 *volatile_virtual_size_p = volatile_virtual_size;
21801 *volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
21802 *volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
21803 *volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
21804 *volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;
21805
21806 return KERN_SUCCESS;
21807 }
21808
21809 void
vm_map_sizes(vm_map_t map,vm_map_size_t * psize,vm_map_size_t * pfree,vm_map_size_t * plargest_free)21810 vm_map_sizes(vm_map_t map,
21811 vm_map_size_t * psize,
21812 vm_map_size_t * pfree,
21813 vm_map_size_t * plargest_free)
21814 {
21815 vm_map_entry_t entry;
21816 vm_map_offset_t prev;
21817 vm_map_size_t free, total_free, largest_free;
21818 boolean_t end;
21819
21820 if (!map) {
21821 *psize = *pfree = *plargest_free = 0;
21822 return;
21823 }
21824 total_free = largest_free = 0;
21825
21826 vm_map_lock_read(map);
21827 if (psize) {
21828 *psize = map->max_offset - map->min_offset;
21829 }
21830
21831 prev = map->min_offset;
21832 for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
21833 end = (entry == vm_map_to_entry(map));
21834
21835 if (end) {
21836 free = entry->vme_end - prev;
21837 } else {
21838 free = entry->vme_start - prev;
21839 }
21840
21841 total_free += free;
21842 if (free > largest_free) {
21843 largest_free = free;
21844 }
21845
21846 if (end) {
21847 break;
21848 }
21849 prev = entry->vme_end;
21850 }
21851 vm_map_unlock_read(map);
21852 if (pfree) {
21853 *pfree = total_free;
21854 }
21855 if (plargest_free) {
21856 *plargest_free = largest_free;
21857 }
21858 }
21859
#if VM_SCAN_FOR_SHADOW_CHAIN
int vm_map_shadow_max(vm_map_t map);
/*
 * vm_map_shadow_max:
 *	Return the length of the longest VM object shadow chain found
 *	among all (non-submap) entries of "map"; 0 for a NULL map.
 *	Takes the map's read lock for the duration of the scan.
 */
int
vm_map_shadow_max(
	vm_map_t map)
{
	int shadows, shadows_max;
	vm_map_entry_t entry;
	vm_object_t object, next_object;

	if (map == NULL) {
		return 0;
	}

	shadows_max = 0;

	vm_map_lock_read(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			/* submap entries have no object/shadow chain here */
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == NULL) {
			continue;
		}
		/*
		 * Walk down the shadow chain with hand-over-hand shared
		 * locking: lock the next object before releasing the
		 * current one, counting one hop per shadow.
		 */
		vm_object_lock_shared(object);
		for (shadows = 0;
		    object->shadow != NULL;
		    shadows++, object = next_object) {
			next_object = object->shadow;
			vm_object_lock_shared(next_object);
			vm_object_unlock(object);
		}
		vm_object_unlock(object);
		if (shadows > shadows_max) {
			shadows_max = shadows;
		}
	}

	vm_map_unlock_read(map);

	return shadows_max;
}
#endif /* VM_SCAN_FOR_SHADOW_CHAIN */
21907
/*
 * vm_commit_pagezero_status:
 *	Advise the pmap layer of the map's minimum offset so it can
 *	apply its page-zero policy for this address space (see
 *	pmap_advise_pagezero_range() for the exact semantics).
 */
void
vm_commit_pagezero_status(vm_map_t lmap)
{
	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
}
21913
#if __x86_64__
/*
 * vm_map_set_high_start:
 *	Record "high_start" in the map (x86_64 only).  The field is
 *	only stored here; presumably it is the lowest address for
 *	"high" allocations — confirm against the users of
 *	vmmap_high_start.
 */
void
vm_map_set_high_start(
	vm_map_t map,
	vm_map_offset_t high_start)
{
	map->vmmap_high_start = high_start;
}
#endif /* __x86_64__ */
21923
#if CODE_SIGNING_MONITOR

/*
 * vm_map_entry_cs_associate:
 *	Associate the code-signing information backing "entry" (an
 *	executable or about-to-be-debugged mapping in "map") with the
 *	code-signing monitor, so that pages faulted in through this
 *	mapping can be validated by the pmap/monitor instead of the VM.
 *
 *	Outcomes:
 *	- KERN_SUCCESS: entry marked "csm_associated" (and possibly
 *	  "vme_permanent"); vm_fault() will bypass CS validation here.
 *	- KERN_NOT_SUPPORTED: monitor won't check this mapping; the VM
 *	  keeps doing its own code-signing validation.
 *	- other errors: execute permissions are stripped from the entry
 *	  (unless overwriting an immutable mapping).
 *
 *	The map must be locked exclusively by the caller.
 */
kern_return_t
vm_map_entry_cs_associate(
	vm_map_t map,
	vm_map_entry_t entry,
	vm_map_kernel_flags_t vmk_flags)
{
	vm_object_t cs_object, cs_shadow, backing_object;
	vm_object_offset_t cs_offset, backing_offset;
	void *cs_blobs;
	struct vnode *cs_vnode;
	kern_return_t cs_ret;

	/*
	 * Nothing to associate: no pmap, a submap entry, an exempt
	 * address space, or no VM object behind the entry.
	 */
	if (map->pmap == NULL ||
	    entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
	    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
	    VME_OBJECT(entry) == VM_OBJECT_NULL) {
		return KERN_SUCCESS;
	}

	if (!(entry->protection & VM_PROT_EXECUTE)) {
		/*
		 * This memory region is not executable, so the code-signing
		 * monitor would usually not care about it...
		 */
		if (vmk_flags.vmkf_remap_prot_copy &&
		    (entry->max_protection & VM_PROT_EXECUTE)) {
			/*
			 * ... except if the memory region is being remapped
			 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
			 * which is what a debugger or dtrace would be doing
			 * to prepare to modify an executable page to insert
			 * a breakpoint or activate a probe.
			 * In that case, fall through so that we can mark
			 * this region as being "debugged" and no longer
			 * strictly code-signed.
			 */
		} else {
			/*
			 * Really not executable, so no need to tell the
			 * code-signing monitor.
			 */
			return KERN_SUCCESS;
		}
	}

	vm_map_lock_assert_exclusive(map);

	if (entry->used_for_jit) {
		/* JIT region: register it with the monitor as such */
		cs_ret = csm_associate_jit_region(
			map->pmap,
			entry->vme_start,
			entry->vme_end - entry->vme_start);
		goto done;
	}

	if (vmk_flags.vmkf_remap_prot_copy) {
		/* debugger/dtrace remap: register as a "debug" region */
		cs_ret = csm_associate_debug_region(
			map->pmap,
			entry->vme_start,
			entry->vme_end - entry->vme_start);
		if (cs_ret == KERN_SUCCESS) {
			entry->vme_xnu_user_debug = TRUE;
		}
#if DEVELOPMENT || DEBUG
		if (vm_log_xnu_user_debug) {
			printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug=%d cs_ret %d\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
			    __FUNCTION__, __LINE__,
			    map, entry,
			    (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
			    entry->vme_xnu_user_debug,
			    cs_ret);
		}
#endif /* DEVELOPMENT || DEBUG */
		goto done;
	}

	cs_object = VME_OBJECT(entry);
	vm_object_lock_shared(cs_object);
	cs_offset = VME_OFFSET(entry);

	/* find the VM object backed by the code-signed vnode */
	for (;;) {
		/* go to the bottom of cs_object's shadow chain */
		for (;
		    cs_object->shadow != VM_OBJECT_NULL;
		    cs_object = cs_shadow) {
			cs_shadow = cs_object->shadow;
			cs_offset += cs_object->vo_shadow_offset;
			vm_object_lock_shared(cs_shadow);
			vm_object_unlock(cs_object);
		}
		if (cs_object->internal ||
		    cs_object->pager == MEMORY_OBJECT_NULL) {
			/* anonymous or pagerless: nothing to associate */
			vm_object_unlock(cs_object);
			return KERN_SUCCESS;
		}

		cs_offset += cs_object->paging_offset;

		/*
		 * cs_object could be backed by a:
		 * vnode_pager
		 * apple_protect_pager
		 * shared_region_pager
		 * fourk_pager (multiple backing objects -> fail?)
		 * ask the pager if it has a backing VM object
		 */
		if (!memory_object_backing_object(cs_object->pager,
		    cs_offset,
		    &backing_object,
		    &backing_offset)) {
			/* no backing object: cs_object is it */
			break;
		}

		/* look down the backing object's shadow chain */
		vm_object_lock_shared(backing_object);
		vm_object_unlock(cs_object);
		cs_object = backing_object;
		cs_offset = backing_offset;
	}

	cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
	if (cs_vnode == NULL) {
		/* no vnode, no code signatures to associate */
		cs_ret = KERN_SUCCESS;
	} else {
		/* hand the vnode's CS blobs to the monitor for this mapping */
		cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
		    &cs_blobs);
		assert(cs_ret == KERN_SUCCESS);
		cs_ret = cs_associate_blob_with_mapping(map->pmap,
		    entry->vme_start,
		    (entry->vme_end - entry->vme_start),
		    cs_offset,
		    cs_blobs);
	}
	vm_object_unlock(cs_object);
	cs_object = VM_OBJECT_NULL;

done:
	if (cs_ret == KERN_SUCCESS) {
		DTRACE_VM2(vm_map_entry_cs_associate_success,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end);
		if (vm_map_executable_immutable) {
			/*
			 * Prevent this executable
			 * mapping from being unmapped
			 * or modified.
			 */
			entry->vme_permanent = TRUE;
		}
		/*
		 * pmap says it will validate the
		 * code-signing validity of pages
		 * faulted in via this mapping, so
		 * this map entry should be marked so
		 * that vm_fault() bypasses code-signing
		 * validation for faults coming through
		 * this mapping.
		 */
		entry->csm_associated = TRUE;
	} else if (cs_ret == KERN_NOT_SUPPORTED) {
		/*
		 * pmap won't check the code-signing
		 * validity of pages faulted in via
		 * this mapping, so VM should keep
		 * doing it.
		 */
		DTRACE_VM3(vm_map_entry_cs_associate_off,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end,
		    int, cs_ret);
	} else {
		/*
		 * A real error: do not allow
		 * execution in this mapping.
		 */
		DTRACE_VM3(vm_map_entry_cs_associate_failure,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end,
		    int, cs_ret);
		if (vmk_flags.vmkf_overwrite_immutable) {
			/*
			 * We can get here when we remap an apple_protect pager
			 * on top of an already cs_associated executable mapping
			 * with the same code signatures, so we don't want to
			 * lose VM_PROT_EXECUTE in that case...
			 */
		} else {
			entry->protection &= ~VM_PROT_ALLEXEC;
			entry->max_protection &= ~VM_PROT_ALLEXEC;
		}
	}

	return cs_ret;
}

#endif /* CODE_SIGNING_MONITOR */
22127
22128 inline bool
vm_map_is_corpse_source(vm_map_t map)22129 vm_map_is_corpse_source(vm_map_t map)
22130 {
22131 bool status = false;
22132 if (map) {
22133 vm_map_lock_read(map);
22134 status = map->corpse_source;
22135 vm_map_unlock_read(map);
22136 }
22137 return status;
22138 }
22139
22140 inline void
vm_map_set_corpse_source(vm_map_t map)22141 vm_map_set_corpse_source(vm_map_t map)
22142 {
22143 if (map) {
22144 vm_map_lock(map);
22145 map->corpse_source = true;
22146 vm_map_unlock(map);
22147 }
22148 }
22149
22150 inline void
vm_map_unset_corpse_source(vm_map_t map)22151 vm_map_unset_corpse_source(vm_map_t map)
22152 {
22153 if (map) {
22154 vm_map_lock(map);
22155 map->corpse_source = false;
22156 vm_map_unlock(map);
22157 }
22158 }
22159 /*
22160 * FORKED CORPSE FOOTPRINT
22161 *
22162 * A forked corpse gets a copy of the original VM map but its pmap is mostly
22163 * empty since it never ran and never got to fault in any pages.
22164 * Collecting footprint info (via "sysctl vm.self_region_footprint") for
22165 * a forked corpse would therefore return very little information.
22166 *
22167 * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
22168 * to vm_map_fork() to collect footprint information from the original VM map
22169 * and its pmap, and store it in the forked corpse's VM map. That information
22170 * is stored in place of the VM map's "hole list" since we'll never need to
22171 * lookup for holes in the corpse's map.
22172 *
22173 * The corpse's footprint info looks like this:
22174 *
22175 * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
22176 * as follows:
22177 * +---------------------------------------+
22178 * header-> | cf_size |
22179 * +-------------------+-------------------+
22180 * | cf_last_region | cf_last_zeroes |
22181 * +-------------------+-------------------+
22182 * region1-> | cfr_vaddr |
22183 * +-------------------+-------------------+
22184 * | cfr_num_pages | d0 | d1 | d2 | d3 |
22185 * +---------------------------------------+
22186 * | d4 | d5 | ... |
22187 * +---------------------------------------+
22188 * | ... |
22189 * +-------------------+-------------------+
22190 * | dy | dz | na | na | cfr_vaddr... | <-region2
22191 * +-------------------+-------------------+
22192 * | cfr_vaddr (ctd) | cfr_num_pages |
22193 * +---------------------------------------+
22194 * | d0 | d1 ... |
22195 * +---------------------------------------+
22196 * ...
22197 * +---------------------------------------+
22198 * last region-> | cfr_vaddr |
22199 * +---------------------------------------+
22200 * + cfr_num_pages | d0 | d1 | d2 | d3 |
22201 * +---------------------------------------+
22202 * ...
22203 * +---------------------------------------+
22204 * | dx | dy | dz | na | na | na | na | na |
22205 * +---------------------------------------+
22206 *
22207 * where:
22208 * cf_size: total size of the buffer (rounded to page size)
22209 * cf_last_region: offset in the buffer of the last "region" sub-header
22210 * cf_last_zeroes: number of trailing "zero" dispositions at the end
22211 * of last region
22212 * cfr_vaddr: virtual address of the start of the covered "region"
22213 * cfr_num_pages: number of pages in the covered "region"
22214 * d*: disposition of the page at that virtual address
22215 * Regions in the buffer are word-aligned.
22216 *
22217 * We estimate the size of the buffer based on the number of memory regions
22218 * and the virtual size of the address space. While copying each memory region
22219 * during vm_map_fork(), we also collect the footprint info for that region
 * and store it in the buffer, packing it as much as possible (coalescing
 * contiguous memory regions to avoid having too many region headers and
 * avoiding long streaks of "zero" page dispositions by splitting footprint
 * "regions"), so the number of regions in the footprint buffer might not
 * match the number of memory regions in the address space.
22225 *
22226 * We also have to copy the original task's "nonvolatile" ledgers since that's
22227 * part of the footprint and will need to be reported to any tool asking for
22228 * the footprint information of the forked corpse.
22229 */
22230
/* corpse-footprint statistics (presumably exported for diagnostics —
 * confirm against their readers; only the declarations are visible here) */
uint64_t vm_map_corpse_footprint_count = 0;
uint64_t vm_map_corpse_footprint_size_avg = 0;
uint64_t vm_map_corpse_footprint_size_max = 0;
uint64_t vm_map_corpse_footprint_full = 0;      /* buffer filled up ("over the edge") */
uint64_t vm_map_corpse_footprint_no_buf = 0;    /* buffer allocation failed */
22236
/*
 * Header at the start of the corpse footprint buffer (pointed to by
 * vm_map->vmmap_corpse_footprint); see the layout diagram above.
 * Do not change the layout: it is parsed by the footprint code.
 */
struct vm_map_corpse_footprint_header {
	vm_size_t cf_size;      /* allocated buffer size */
	uint32_t cf_last_region;        /* offset of last region in buffer */
	union {
		uint32_t cfu_last_zeroes; /* during creation:
		                           * number of "zero" dispositions at
		                           * end of last region */
		uint32_t cfu_hint_region; /* during lookup:
		                           * offset of last looked up region */
#define cf_last_zeroes cfu.cfu_last_zeroes
#define cf_hint_region cfu.cfu_hint_region
	} cfu;
};
/* one-byte packed page disposition (see vm_page_disposition_to_cf_disp()) */
typedef uint8_t cf_disp_t;
/*
 * Variable-length "region" record in the footprint buffer: a start
 * address and one cf_disp_t per page covered.
 */
struct vm_map_corpse_footprint_region {
	vm_map_offset_t cfr_vaddr;      /* region start virtual address */
	uint32_t cfr_num_pages; /* number of pages in this "region" */
	cf_disp_t cfr_disposition[0];   /* disposition of each page */
} __attribute__((packed));
22256
22257 static cf_disp_t
vm_page_disposition_to_cf_disp(int disposition)22258 vm_page_disposition_to_cf_disp(
22259 int disposition)
22260 {
22261 assert(sizeof(cf_disp_t) == 1);
22262 /* relocate bits that don't fit in a "uint8_t" */
22263 if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
22264 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
22265 }
22266 /* cast gets rid of extra bits */
22267 return (cf_disp_t) disposition;
22268 }
22269
22270 static int
vm_page_cf_disp_to_disposition(cf_disp_t cf_disp)22271 vm_page_cf_disp_to_disposition(
22272 cf_disp_t cf_disp)
22273 {
22274 int disposition;
22275
22276 assert(sizeof(cf_disp_t) == 1);
22277 disposition = (int) cf_disp;
22278 /* move relocated bits back in place */
22279 if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
22280 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
22281 disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
22282 }
22283 return disposition;
22284 }
22285
22286 /*
22287 * vm_map_corpse_footprint_new_region:
22288 * closes the current footprint "region" and creates a new one
22289 *
22290 * Returns NULL if there's not enough space in the buffer for a new region.
22291 */
static struct vm_map_corpse_footprint_region *
vm_map_corpse_footprint_new_region(
	struct vm_map_corpse_footprint_header *footprint_header)
{
	uintptr_t footprint_edge;       /* first byte past the end of the buffer */
	uint32_t new_region_offset;
	struct vm_map_corpse_footprint_region *footprint_region;
	struct vm_map_corpse_footprint_region *new_footprint_region;

	footprint_edge = ((uintptr_t)footprint_header +
	    footprint_header->cf_size);
	/* locate the current last region via its offset in the header */
	footprint_region = ((struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region));
	assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
	    footprint_edge);

	/* get rid of trailing zeroes in the last region */
	assert(footprint_region->cfr_num_pages >=
	    footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -=
	    footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	/* reuse this region if it's now empty */
	if (footprint_region->cfr_num_pages == 0) {
		return footprint_region;
	}

	/* compute offset of new region (past the last region's dispositions) */
	new_region_offset = footprint_header->cf_last_region;
	new_region_offset += sizeof(*footprint_region);
	new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
	/* regions in the buffer are word-aligned */
	new_region_offset = roundup(new_region_offset, sizeof(int));

	/* check if we're going over the edge */
	if (((uintptr_t)footprint_header +
	    new_region_offset +
	    sizeof(*footprint_region)) >=
	    footprint_edge) {
		/* over the edge: no new region */
		return NULL;
	}

	/* adjust offset of last region in header */
	footprint_header->cf_last_region = new_region_offset;

	new_footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);
	new_footprint_region->cfr_vaddr = 0;
	new_footprint_region->cfr_num_pages = 0;
	/* caller needs to initialize new region */

	return new_footprint_region;
}
22348
22349 /*
22350 * vm_map_corpse_footprint_collect:
22351 * collect footprint information for "old_entry" in "old_map" and
22352 * stores it in "new_map"'s vmmap_footprint_info.
22353 */
22354 kern_return_t
vm_map_corpse_footprint_collect(vm_map_t old_map,vm_map_entry_t old_entry,vm_map_t new_map)22355 vm_map_corpse_footprint_collect(
22356 vm_map_t old_map,
22357 vm_map_entry_t old_entry,
22358 vm_map_t new_map)
22359 {
22360 vm_map_offset_t va;
22361 kern_return_t kr;
22362 struct vm_map_corpse_footprint_header *footprint_header;
22363 struct vm_map_corpse_footprint_region *footprint_region;
22364 struct vm_map_corpse_footprint_region *new_footprint_region;
22365 cf_disp_t *next_disp_p;
22366 uintptr_t footprint_edge;
22367 uint32_t num_pages_tmp;
22368 int effective_page_size;
22369
22370 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
22371
22372 va = old_entry->vme_start;
22373
22374 vm_map_lock_assert_exclusive(old_map);
22375 vm_map_lock_assert_exclusive(new_map);
22376
22377 assert(new_map->has_corpse_footprint);
22378 assert(!old_map->has_corpse_footprint);
22379 if (!new_map->has_corpse_footprint ||
22380 old_map->has_corpse_footprint) {
22381 /*
22382 * This can only transfer footprint info from a
22383 * map with a live pmap to a map with a corpse footprint.
22384 */
22385 return KERN_NOT_SUPPORTED;
22386 }
22387
22388 if (new_map->vmmap_corpse_footprint == NULL) {
22389 vm_offset_t buf;
22390 vm_size_t buf_size;
22391
22392 buf = 0;
22393 buf_size = (sizeof(*footprint_header) +
22394 (old_map->hdr.nentries
22395 *
22396 (sizeof(*footprint_region) +
22397 +3)) /* potential alignment for each region */
22398 +
22399 ((old_map->size / effective_page_size)
22400 *
22401 sizeof(cf_disp_t))); /* disposition for each page */
22402 // printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
22403 buf_size = round_page(buf_size);
22404
22405 /* limit buffer to 1 page to validate overflow detection */
22406 // buf_size = PAGE_SIZE;
22407
22408 /* limit size to a somewhat sane amount */
22409 #if XNU_TARGET_OS_OSX
22410 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (8*1024*1024) /* 8MB */
22411 #else /* XNU_TARGET_OS_OSX */
22412 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (256*1024) /* 256KB */
22413 #endif /* XNU_TARGET_OS_OSX */
22414 if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
22415 buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
22416 }
22417
22418 /*
22419 * Allocate the pageable buffer (with a trailing guard page).
22420 * It will be zero-filled on demand.
22421 */
22422 kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
22423 KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
22424 VM_KERN_MEMORY_DIAG);
22425 if (kr != KERN_SUCCESS) {
22426 vm_map_corpse_footprint_no_buf++;
22427 return kr;
22428 }
22429
22430 /* initialize header and 1st region */
22431 footprint_header = (struct vm_map_corpse_footprint_header *)buf;
22432 new_map->vmmap_corpse_footprint = footprint_header;
22433
22434 footprint_header->cf_size = buf_size;
22435 footprint_header->cf_last_region =
22436 sizeof(*footprint_header);
22437 footprint_header->cf_last_zeroes = 0;
22438
22439 footprint_region = (struct vm_map_corpse_footprint_region *)
22440 ((char *)footprint_header +
22441 footprint_header->cf_last_region);
22442 footprint_region->cfr_vaddr = 0;
22443 footprint_region->cfr_num_pages = 0;
22444 } else {
22445 /* retrieve header and last region */
22446 footprint_header = (struct vm_map_corpse_footprint_header *)
22447 new_map->vmmap_corpse_footprint;
22448 footprint_region = (struct vm_map_corpse_footprint_region *)
22449 ((char *)footprint_header +
22450 footprint_header->cf_last_region);
22451 }
22452 footprint_edge = ((uintptr_t)footprint_header +
22453 footprint_header->cf_size);
22454
22455 if ((footprint_region->cfr_vaddr +
22456 (((vm_map_offset_t)footprint_region->cfr_num_pages) *
22457 effective_page_size))
22458 != old_entry->vme_start) {
22459 uint64_t num_pages_delta, num_pages_delta_size;
22460 uint32_t region_offset_delta_size;
22461
22462 /*
22463 * Not the next contiguous virtual address:
22464 * start a new region or store "zero" dispositions for
22465 * the missing pages?
22466 */
22467 /* size of gap in actual page dispositions */
22468 num_pages_delta = ((old_entry->vme_start -
22469 footprint_region->cfr_vaddr) / effective_page_size)
22470 - footprint_region->cfr_num_pages;
22471 num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
22472 /* size of gap as a new footprint region header */
22473 region_offset_delta_size =
22474 (sizeof(*footprint_region) +
22475 roundup(((footprint_region->cfr_num_pages -
22476 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
22477 sizeof(int)) -
22478 ((footprint_region->cfr_num_pages -
22479 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
22480 // printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
22481 if (region_offset_delta_size < num_pages_delta_size ||
22482 os_add3_overflow(footprint_region->cfr_num_pages,
22483 (uint32_t) num_pages_delta,
22484 1,
22485 &num_pages_tmp)) {
22486 /*
22487 * Storing data for this gap would take more space
22488 * than inserting a new footprint region header:
22489 * let's start a new region and save space. If it's a
22490 * tie, let's avoid using a new region, since that
22491 * would require more region hops to find the right
22492 * range during lookups.
22493 *
22494 * If the current region's cfr_num_pages would overflow
22495 * if we added "zero" page dispositions for the gap,
22496 * no choice but to start a new region.
22497 */
22498 // printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
22499 new_footprint_region =
22500 vm_map_corpse_footprint_new_region(footprint_header);
22501 /* check that we're not going over the edge */
22502 if (new_footprint_region == NULL) {
22503 goto over_the_edge;
22504 }
22505 footprint_region = new_footprint_region;
22506 /* initialize new region as empty */
22507 footprint_region->cfr_vaddr = old_entry->vme_start;
22508 footprint_region->cfr_num_pages = 0;
22509 } else {
22510 /*
22511 * Store "zero" page dispositions for the missing
22512 * pages.
22513 */
22514 // printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
22515 for (; num_pages_delta > 0; num_pages_delta--) {
22516 next_disp_p = (cf_disp_t *)
22517 ((uintptr_t) footprint_region +
22518 sizeof(*footprint_region));
22519 next_disp_p += footprint_region->cfr_num_pages;
22520 /* check that we're not going over the edge */
22521 if ((uintptr_t)next_disp_p >= footprint_edge) {
22522 goto over_the_edge;
22523 }
22524 /* store "zero" disposition for this gap page */
22525 footprint_region->cfr_num_pages++;
22526 *next_disp_p = (cf_disp_t) 0;
22527 footprint_header->cf_last_zeroes++;
22528 }
22529 }
22530 }
22531
22532 for (va = old_entry->vme_start;
22533 va < old_entry->vme_end;
22534 va += effective_page_size) {
22535 int disposition;
22536 cf_disp_t cf_disp;
22537
22538 vm_map_footprint_query_page_info(old_map,
22539 old_entry,
22540 va,
22541 &disposition);
22542 cf_disp = vm_page_disposition_to_cf_disp(disposition);
22543
22544 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
22545
22546 if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
22547 /*
22548 * Ignore "zero" dispositions at start of
22549 * region: just move start of region.
22550 */
22551 footprint_region->cfr_vaddr += effective_page_size;
22552 continue;
22553 }
22554
22555 /* would region's cfr_num_pages overflow? */
22556 if (os_add_overflow(footprint_region->cfr_num_pages, 1,
22557 &num_pages_tmp)) {
22558 /* overflow: create a new region */
22559 new_footprint_region =
22560 vm_map_corpse_footprint_new_region(
22561 footprint_header);
22562 if (new_footprint_region == NULL) {
22563 goto over_the_edge;
22564 }
22565 footprint_region = new_footprint_region;
22566 footprint_region->cfr_vaddr = va;
22567 footprint_region->cfr_num_pages = 0;
22568 }
22569
22570 next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
22571 sizeof(*footprint_region));
22572 next_disp_p += footprint_region->cfr_num_pages;
22573 /* check that we're not going over the edge */
22574 if ((uintptr_t)next_disp_p >= footprint_edge) {
22575 goto over_the_edge;
22576 }
22577 /* store this dispostion */
22578 *next_disp_p = cf_disp;
22579 footprint_region->cfr_num_pages++;
22580
22581 if (cf_disp != 0) {
22582 /* non-zero disp: break the current zero streak */
22583 footprint_header->cf_last_zeroes = 0;
22584 /* done */
22585 continue;
22586 }
22587
22588 /* zero disp: add to the current streak of zeroes */
22589 footprint_header->cf_last_zeroes++;
22590 if ((footprint_header->cf_last_zeroes +
22591 roundup(((footprint_region->cfr_num_pages -
22592 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
22593 (sizeof(int) - 1),
22594 sizeof(int))) <
22595 (sizeof(*footprint_header))) {
22596 /*
22597 * There are not enough trailing "zero" dispositions
22598 * (+ the extra padding we would need for the previous
22599 * region); creating a new region would not save space
22600 * at this point, so let's keep this "zero" disposition
22601 * in this region and reconsider later.
22602 */
22603 continue;
22604 }
22605 /*
22606 * Create a new region to avoid having too many consecutive
22607 * "zero" dispositions.
22608 */
22609 new_footprint_region =
22610 vm_map_corpse_footprint_new_region(footprint_header);
22611 if (new_footprint_region == NULL) {
22612 goto over_the_edge;
22613 }
22614 footprint_region = new_footprint_region;
22615 /* initialize the new region as empty ... */
22616 footprint_region->cfr_num_pages = 0;
22617 /* ... and skip this "zero" disp */
22618 footprint_region->cfr_vaddr = va + effective_page_size;
22619 }
22620
22621 return KERN_SUCCESS;
22622
22623 over_the_edge:
22624 // printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
22625 vm_map_corpse_footprint_full++;
22626 return KERN_RESOURCE_SHORTAGE;
22627 }
22628
22629 /*
22630 * vm_map_corpse_footprint_collect_done:
22631 * completes the footprint collection by getting rid of any remaining
22632 * trailing "zero" dispositions and trimming the unused part of the
22633 * kernel buffer
22634 */
void
vm_map_corpse_footprint_collect_done(
	vm_map_t new_map)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	vm_size_t buf_size, actual_size;
	kern_return_t kr;

	assert(new_map->has_corpse_footprint);
	if (!new_map->has_corpse_footprint ||
	    new_map->vmmap_corpse_footprint == NULL) {
		/* no footprint buffer to finalize */
		return;
	}

	footprint_header = (struct vm_map_corpse_footprint_header *)
	    new_map->vmmap_corpse_footprint;
	buf_size = footprint_header->cf_size;

	/* locate the last region that was being filled during collection */
	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);

	/* get rid of trailing zeroes in last region */
	assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	/*
	 * Bytes actually used: everything up to the last region's header,
	 * plus that header and its (now trimmed) page dispositions.
	 */
	actual_size = (vm_size_t)(footprint_header->cf_last_region +
	    sizeof(*footprint_region) +
	    (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));

	// printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
	/* update global footprint-size statistics (updated without locking) */
	vm_map_corpse_footprint_size_avg =
	    (((vm_map_corpse_footprint_size_avg *
	    vm_map_corpse_footprint_count) +
	    actual_size) /
	    (vm_map_corpse_footprint_count + 1));
	vm_map_corpse_footprint_count++;
	if (actual_size > vm_map_corpse_footprint_size_max) {
		vm_map_corpse_footprint_size_max = actual_size;
	}

	actual_size = round_page(actual_size);
	if (buf_size > actual_size) {
		/*
		 * Free the unused tail of the buffer.  The deallocation
		 * starts one page past "actual_size" so that a single page
		 * remains to serve as the new trailing guard page; the old
		 * trailing guard page at the end of the original buffer is
		 * freed along with the tail.
		 */
		kr = vm_deallocate(kernel_map,
		    ((vm_address_t)footprint_header +
		    actual_size +
		    PAGE_SIZE), /* trailing guard page */
		    (buf_size - actual_size));
		assertf(kr == KERN_SUCCESS,
		    "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
		/* make the new trailing guard page inaccessible */
		kr = vm_protect(kernel_map,
		    ((vm_address_t)footprint_header +
		    actual_size),
		    PAGE_SIZE,
		    FALSE, /* set_maximum */
		    VM_PROT_NONE);
		assertf(kr == KERN_SUCCESS,
		    "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
	}

	/* record the trimmed size */
	footprint_header->cf_size = actual_size;
}
22707
22708 /*
22709 * vm_map_corpse_footprint_query_page_info:
22710 * retrieves the disposition of the page at virtual address "vaddr"
22711 * in the forked corpse's VM map
22712 *
22713 * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
22714 */
kern_return_t
vm_map_corpse_footprint_query_page_info(
	vm_map_t map,
	vm_map_offset_t va,
	int *disposition_p)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	uint32_t footprint_region_offset;
	vm_map_offset_t region_start, region_end;
	int disp_idx;
	kern_return_t kr;
	int effective_page_size;
	cf_disp_t cf_disp;

	if (!map->has_corpse_footprint) {
		/* this map was not forked with footprint collection */
		*disposition_p = 0;
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	footprint_header = map->vmmap_corpse_footprint;
	if (footprint_header == NULL) {
		/* footprint was never collected (or already destroyed) */
		*disposition_p = 0;
		// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	/* start looking at the hint ("cf_hint_region") */
	footprint_region_offset = footprint_header->cf_hint_region;

	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

lookup_again:
	if (footprint_region_offset < sizeof(*footprint_header)) {
		/* hint too low: start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	if (footprint_region_offset >= footprint_header->cf_last_region) {
		/* hint too high: re-start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header + footprint_region_offset);
	region_start = footprint_region->cfr_vaddr;
	region_end = (region_start +
	    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
	    effective_page_size));
	if (va < region_start &&
	    footprint_region_offset != sizeof(*footprint_header)) {
		/* our range starts before the hint region */

		/* reset the hint (in a racy way...) */
		footprint_header->cf_hint_region = sizeof(*footprint_header);
		/* lookup "va" again from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
		goto lookup_again;
	}

	/* scan forward, region by region, until "va" falls inside one */
	while (va >= region_end) {
		if (footprint_region_offset >= footprint_header->cf_last_region) {
			/* reached the last region: stop scanning */
			break;
		}
		/* skip the region's header */
		footprint_region_offset += sizeof(*footprint_region);
		/* skip the region's page dispositions */
		footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
		/* align to next word boundary */
		footprint_region_offset =
		    roundup(footprint_region_offset,
		    sizeof(int));
		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header + footprint_region_offset);
		region_start = footprint_region->cfr_vaddr;
		region_end = (region_start +
		    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
		    effective_page_size));
	}
	if (va < region_start || va >= region_end) {
		/* page not found: report a "zero" disposition, successfully */
		*disposition_p = 0;
		// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_SUCCESS;
		goto done;
	}

	/* "va" found: set the lookup hint for next lookup (in a racy way...) */
	footprint_header->cf_hint_region = footprint_region_offset;

	/* get page disposition for "va" in this region */
	disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
	cf_disp = footprint_region->cfr_disposition[disp_idx];
	*disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
	kr = KERN_SUCCESS;
done:
	// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
	/* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
	DTRACE_VM4(footprint_query_page_info,
	    vm_map_t, map,
	    vm_map_offset_t, va,
	    int, *disposition_p,
	    kern_return_t, kr);

	return kr;
}
22821
22822 void
vm_map_corpse_footprint_destroy(vm_map_t map)22823 vm_map_corpse_footprint_destroy(
22824 vm_map_t map)
22825 {
22826 if (map->has_corpse_footprint &&
22827 map->vmmap_corpse_footprint != 0) {
22828 struct vm_map_corpse_footprint_header *footprint_header;
22829 vm_size_t buf_size;
22830 kern_return_t kr;
22831
22832 footprint_header = map->vmmap_corpse_footprint;
22833 buf_size = footprint_header->cf_size;
22834 kr = vm_deallocate(kernel_map,
22835 (vm_offset_t) map->vmmap_corpse_footprint,
22836 ((vm_size_t) buf_size
22837 + PAGE_SIZE)); /* trailing guard page */
22838 assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
22839 map->vmmap_corpse_footprint = 0;
22840 map->has_corpse_footprint = FALSE;
22841 }
22842 }
22843
22844 /*
22845 * vm_map_copy_footprint_ledgers:
22846 * copies any ledger that's relevant to the memory footprint of "old_task"
22847 * into the forked corpse's task ("new_task")
22848 */
22849 void
vm_map_copy_footprint_ledgers(task_t old_task,task_t new_task)22850 vm_map_copy_footprint_ledgers(
22851 task_t old_task,
22852 task_t new_task)
22853 {
22854 vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
22855 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
22856 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
22857 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
22858 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
22859 vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
22860 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
22861 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
22862 vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
22863 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
22864 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
22865 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
22866 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
22867 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
22868 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
22869 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
22870 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
22871 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
22872 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
22873 vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
22874 }
22875
22876 /*
22877 * vm_map_copy_ledger:
22878 * copy a single ledger from "old_task" to "new_task"
22879 */
22880 void
vm_map_copy_ledger(task_t old_task,task_t new_task,int ledger_entry)22881 vm_map_copy_ledger(
22882 task_t old_task,
22883 task_t new_task,
22884 int ledger_entry)
22885 {
22886 ledger_amount_t old_balance, new_balance, delta;
22887
22888 assert(new_task->map->has_corpse_footprint);
22889 if (!new_task->map->has_corpse_footprint) {
22890 return;
22891 }
22892
22893 /* turn off sanity checks for the ledger we're about to mess with */
22894 ledger_disable_panic_on_negative(new_task->ledger,
22895 ledger_entry);
22896
22897 /* adjust "new_task" to match "old_task" */
22898 ledger_get_balance(old_task->ledger,
22899 ledger_entry,
22900 &old_balance);
22901 ledger_get_balance(new_task->ledger,
22902 ledger_entry,
22903 &new_balance);
22904 if (new_balance == old_balance) {
22905 /* new == old: done */
22906 } else if (new_balance > old_balance) {
22907 /* new > old ==> new -= new - old */
22908 delta = new_balance - old_balance;
22909 ledger_debit(new_task->ledger,
22910 ledger_entry,
22911 delta);
22912 } else {
22913 /* new < old ==> new += old - new */
22914 delta = old_balance - new_balance;
22915 ledger_credit(new_task->ledger,
22916 ledger_entry,
22917 delta);
22918 }
22919 }
22920
22921 /*
22922 * vm_map_get_pmap:
22923 * returns the pmap associated with the vm_map
22924 */
22925 pmap_t
vm_map_get_pmap(vm_map_t map)22926 vm_map_get_pmap(vm_map_t map)
22927 {
22928 return vm_map_pmap(map);
22929 }
22930
22931 #if CONFIG_MAP_RANGES
22932 static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];
22933
22934 static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
22935 static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
22936
22937 /*
22938 * vm_map_range_map_init:
22939 * initializes the VM range ID map to enable index lookup
22940 * of user VM ranges based on VM tag from userspace.
22941 */
22942 static void
vm_map_range_map_init(void)22943 vm_map_range_map_init(void)
22944 {
22945 /*
22946 * VM_MEMORY_MALLOC{,_NANO} are skipped on purpose:
22947 * - the former is malloc metadata which should be kept separate
22948 * - the latter has its own ranges
22949 */
22950 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
22951 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
22952 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
22953 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
22954 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
22955 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
22956 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
22957 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_TCMALLOC);
22958 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LIBNETWORK);
22959 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOACCELERATOR);
22960 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOSURFACE);
22961 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IMAGEIO);
22962 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREGRAPHICS);
22963 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_CORESERVICES);
22964 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREDATA);
22965 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LAYERKIT);
22966 }
22967
22968 static struct mach_vm_range
vm_map_range_random_uniform(vm_map_size_t req_size,vm_map_offset_t min_addr,vm_map_offset_t max_addr,vm_map_offset_t offmask)22969 vm_map_range_random_uniform(
22970 vm_map_size_t req_size,
22971 vm_map_offset_t min_addr,
22972 vm_map_offset_t max_addr,
22973 vm_map_offset_t offmask)
22974 {
22975 vm_map_offset_t random_addr;
22976 struct mach_vm_range alloc;
22977
22978 req_size = (req_size + offmask) & ~offmask;
22979 min_addr = (min_addr + offmask) & ~offmask;
22980 max_addr = max_addr & ~offmask;
22981
22982 read_random(&random_addr, sizeof(random_addr));
22983 random_addr %= (max_addr - req_size - min_addr);
22984 random_addr &= ~offmask;
22985
22986 alloc.min_address = min_addr + random_addr;
22987 alloc.max_address = min_addr + random_addr + req_size;
22988 return alloc;
22989 }
22990
/*
 * vm_map_range_offmask:
 * returns the alignment mask (2^n - 1) that VM ranges should be
 * aligned to in order to avoid partially-used page-table pages,
 * or 0 when range randomization should be disabled entirely.
 */
static vm_map_offset_t
vm_map_range_offmask(void)
{
	uint32_t pte_depth;

	/*
	 * PTE optimizations
	 *
	 *
	 * 16k pages systems
	 * ~~~~~~~~~~~~~~~~~
	 *
	 * A single L1 (sub-)page covers the address space.
	 * - L2 pages cover 64G,
	 * - L3 pages cover 32M.
	 *
	 * On embedded, the dynamic VA range is 64G and uses a single L2 page.
	 * As a result, we really only need to align the ranges to 32M to avoid
	 * partial L3 pages.
	 *
	 * On macOS, the usage of L2 pages will increase, so as a result we will
	 * want to align ranges to 64G in order to utilize them fully.
	 *
	 *
	 * 4k pages systems
	 * ~~~~~~~~~~~~~~~~
	 *
	 * A single L0 (sub-)page covers the address space.
	 * - L1 pages cover 512G,
	 * - L2 pages cover 1G,
	 * - L3 pages cover 2M.
	 *
	 * The long tail of processes on a system will tend to have a VA usage
	 * (ignoring the shared regions) in the 100s of MB order of magnitude.
	 * This is achievable with a single L1 and a few L2s without
	 * randomization.
	 *
	 * However once randomization is introduced, the system will immediately
	 * need several L1s and many more L2s. As a result:
	 *
	 * - on embedded devices, the cost of these extra pages isn't
	 *   sustainable, and we just disable the feature entirely,
	 *
	 * - on macOS we align ranges to a 512G boundary so that the extra L1
	 *   pages can be used to their full potential.
	 */

	/*
	 * note, this function assumes _non exotic mappings_
	 * which is why it uses the native kernel's PAGE_SHIFT.
	 */
#if XNU_PLATFORM_MacOSX
	/* macOS: 16k pages -> align to L2 (64G); 4k pages -> align to L1 (512G) */
	pte_depth = PAGE_SHIFT > 12 ? 2 : 3;
#else /* !XNU_PLATFORM_MacOSX */
	/* embedded: 16k pages -> align to L3 (32M); 4k pages -> disabled */
	pte_depth = PAGE_SHIFT > 12 ? 1 : 0;
#endif /* !XNU_PLATFORM_MacOSX */

	if (pte_depth == 0) {
		/* feature disabled on this configuration */
		return 0;
	}

	/* each level of depth multiplies coverage by 2^(PAGE_SHIFT - 3) entries */
	return (1ull << ((PAGE_SHIFT - 3) * pte_depth + PAGE_SHIFT)) - 1;
}
23054
23055 /*
23056 * vm_map_range_configure:
23057 * configures the user vm_map ranges by increasing the maximum VA range of
23058 * the map and carving out a range at the end of VA space (searching backwards
23059 * in the newly expanded map).
23060 */
23061 kern_return_t
vm_map_range_configure(vm_map_t map)23062 vm_map_range_configure(vm_map_t map)
23063 {
23064 const vm_map_offset_t offmask = vm_map_range_offmask();
23065 struct mach_vm_range data_range;
23066 vm_map_offset_t default_end;
23067 kern_return_t kr;
23068
23069 if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) {
23070 /*
23071 * No point doing vm ranges in a 32bit address space.
23072 */
23073 return KERN_NOT_SUPPORTED;
23074 }
23075
23076 /* Should not be applying ranges to kernel map or kernel map submaps */
23077 assert(vm_map_pmap(map) != kernel_pmap);
23078
23079 #if XNU_PLATFORM_MacOSX
23080
23081 /*
23082 * on macOS, the address space is a massive 47 bits (128T),
23083 * with several carve outs that processes can't use:
23084 * - the shared region
23085 * - the commpage region
23086 * - the GPU carve out (if applicable)
23087 *
23088 * and when nano-malloc is in use it desires memory at the 96T mark.
23089 *
23090 * However, their location is architecture dependent:
23091 * - On intel, the shared region and commpage are
23092 * at the very end of the usable address space (above +127T),
23093 * and there is no GPU carve out, and pthread wants to place
23094 * threads at the 112T mark (0x70T).
23095 *
23096 * - On arm64, these are in the same spot as on embedded devices:
23097 * o shared region: [ 6G, 10G) [ will likely grow over time ]
23098 * o commpage region: [63G, 64G)
23099 * o GPU carve out: [64G, 448G)
23100 *
23101 * This is conveninent because the mappings at the end of the address
23102 * space (when they exist) are made by the kernel.
23103 *
23104 * The policy is to allocate a random 1T for the data heap
23105 * in the end of the address-space in the:
23106 * - [0x71, 0x7f) range on Intel (to leave space for pthread stacks)
23107 * - [0x61, 0x7f) range on ASM (to leave space for Nano malloc).
23108 */
23109
23110 /* see NANOZONE_SIGNATURE in libmalloc */
23111 #if __x86_64__
23112 default_end = 0x71ull << 40;
23113 #else
23114 default_end = 0x61ull << 40;
23115 #endif
23116 data_range = vm_map_range_random_uniform(1ull << 40,
23117 default_end, 0x7full << 40, offmask);
23118
23119 #else /* !XNU_PLATFORM_MacOSX */
23120
23121 /*
23122 * Embedded devices:
23123 *
23124 * The default VA Size scales with the device physical memory.
23125 *
23126 * Out of that:
23127 * - the "zero" page typically uses 4G + some slide
23128 * - the shared region uses SHARED_REGION_SIZE bytes (4G)
23129 *
23130 * Without the use of jumbo or any adjustment to the address space,
23131 * a default VM map typically looks like this:
23132 *
23133 * 0G -->╒════════════╕
23134 * │ pagezero │
23135 * │ + slide │
23136 * ~4G -->╞════════════╡<-- vm_map_min(map)
23137 * │ │
23138 * 6G -->├────────────┤
23139 * │ shared │
23140 * │ region │
23141 * 10G -->├────────────┤
23142 * │ │
23143 * max_va -->├────────────┤<-- vm_map_max(map)
23144 * │ │
23145 * ╎ jumbo ╎
23146 * ╎ ╎
23147 * │ │
23148 * 63G -->╞════════════╡<-- MACH_VM_MAX_ADDRESS
23149 * │ commpage │
23150 * 64G -->├────────────┤<-- MACH_VM_MIN_GPU_CARVEOUT_ADDRESS
23151 * │ │
23152 * ╎ GPU ╎
23153 * ╎ carveout ╎
23154 * │ │
23155 * 448G -->├────────────┤<-- MACH_VM_MAX_GPU_CARVEOUT_ADDRESS
23156 * │ │
23157 * ╎ ╎
23158 * ╎ ╎
23159 * │ │
23160 * 512G -->╘════════════╛<-- (1ull << ARM_16K_TT_L1_SHIFT)
23161 *
23162 * When this drawing was made, "max_va" was smaller than
23163 * ARM64_MAX_OFFSET_DEVICE_LARGE (~15.5G), leaving shy of
23164 * 12G of address space for the zero-page, slide, files,
23165 * binaries, heap ...
23166 *
23167 * We will want to make a "heap/data" carve out inside
23168 * the jumbo range of half of that usable space, assuming
23169 * that this is less than a forth of the jumbo range.
23170 *
23171 * The assert below intends to catch when max_va grows
23172 * too large for this heuristic.
23173 */
23174
23175 vm_map_lock_read(map);
23176 default_end = vm_map_max(map);
23177 vm_map_unlock_read(map);
23178
23179 /*
23180 * Check that we're not already jumbo'd,
23181 * or our address space was somehow modified.
23182 *
23183 * If so we cannot guarantee that we can set up the ranges
23184 * safely without interfering with the existing map.
23185 */
23186 if (default_end > vm_compute_max_offset(true)) {
23187 return KERN_NO_SPACE;
23188 }
23189
23190 if (pmap_max_offset(true, ARM_PMAP_MAX_OFFSET_DEFAULT)) {
23191 /*
23192 * an override boot-arg was set, disable user-ranges
23193 *
23194 * XXX: this is problematic because it means these boot-args
23195 * no longer test the behavior changing the value
23196 * of ARM64_MAX_OFFSET_DEVICE_* would have.
23197 */
23198 return KERN_NOT_SUPPORTED;
23199 }
23200
23201 /* expand the default VM space to the largest possible address */
23202 vm_map_set_jumbo(map);
23203
23204 assert3u(4 * GiB(10), <=, vm_map_max(map) - default_end);
23205 data_range = vm_map_range_random_uniform(GiB(10),
23206 default_end + PAGE_SIZE, vm_map_max(map), offmask);
23207
23208 #endif /* !XNU_PLATFORM_MacOSX */
23209
23210 /*
23211 * Poke holes so that ASAN or people listing regions
23212 * do not think this space is free.
23213 */
23214
23215 if (default_end != data_range.min_address) {
23216 kr = vm_map_enter(map, &default_end,
23217 data_range.min_address - default_end,
23218 0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
23219 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
23220 assert(kr == KERN_SUCCESS);
23221 }
23222
23223 if (data_range.max_address != vm_map_max(map)) {
23224 vm_map_entry_t entry;
23225 vm_size_t size;
23226
23227 vm_map_lock_read(map);
23228 vm_map_lookup_entry_or_next(map, data_range.max_address, &entry);
23229 if (entry != vm_map_to_entry(map)) {
23230 size = vm_map_max(map) - data_range.max_address;
23231 } else {
23232 size = entry->vme_start - data_range.max_address;
23233 }
23234 vm_map_unlock_read(map);
23235
23236 kr = vm_map_enter(map, &data_range.max_address, size,
23237 0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
23238 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
23239 assert(kr == KERN_SUCCESS);
23240 }
23241
23242 vm_map_lock(map);
23243 map->default_range.min_address = vm_map_min(map);
23244 map->default_range.max_address = default_end;
23245 map->data_range = data_range;
23246 map->uses_user_ranges = true;
23247 vm_map_unlock(map);
23248
23249 return KERN_SUCCESS;
23250 }
23251
23252 /*
23253 * vm_map_range_fork:
23254 * clones the array of ranges from old_map to new_map in support
23255 * of a VM map fork.
23256 */
23257 void
vm_map_range_fork(vm_map_t new_map,vm_map_t old_map)23258 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
23259 {
23260 if (!old_map->uses_user_ranges) {
23261 /* nothing to do */
23262 return;
23263 }
23264
23265 new_map->default_range = old_map->default_range;
23266 new_map->data_range = old_map->data_range;
23267
23268 if (old_map->extra_ranges_count) {
23269 vm_map_user_range_t otable, ntable;
23270 uint16_t count;
23271
23272 otable = old_map->extra_ranges;
23273 count = old_map->extra_ranges_count;
23274 ntable = kalloc_data(count * sizeof(struct vm_map_user_range),
23275 Z_WAITOK | Z_ZERO | Z_NOFAIL);
23276 memcpy(ntable, otable,
23277 count * sizeof(struct vm_map_user_range));
23278
23279 new_map->extra_ranges_count = count;
23280 new_map->extra_ranges = ntable;
23281 }
23282
23283 new_map->uses_user_ranges = true;
23284 }
23285
23286 /*
23287 * vm_map_get_user_range:
23288 * copy the VM user range for the given VM map and range ID.
23289 */
23290 kern_return_t
vm_map_get_user_range(vm_map_t map,vm_map_range_id_t range_id,mach_vm_range_t range)23291 vm_map_get_user_range(
23292 vm_map_t map,
23293 vm_map_range_id_t range_id,
23294 mach_vm_range_t range)
23295 {
23296 if (map == NULL || !map->uses_user_ranges || range == NULL) {
23297 return KERN_INVALID_ARGUMENT;
23298 }
23299
23300 switch (range_id) {
23301 case UMEM_RANGE_ID_DEFAULT:
23302 *range = map->default_range;
23303 return KERN_SUCCESS;
23304
23305 case UMEM_RANGE_ID_HEAP:
23306 *range = map->data_range;
23307 return KERN_SUCCESS;
23308
23309 default:
23310 return KERN_INVALID_ARGUMENT;
23311 }
23312 }
23313
23314 static vm_map_range_id_t
vm_map_user_range_resolve(vm_map_t map,mach_vm_address_t addr,mach_vm_size_t size,mach_vm_range_t range)23315 vm_map_user_range_resolve(
23316 vm_map_t map,
23317 mach_vm_address_t addr,
23318 mach_vm_size_t size,
23319 mach_vm_range_t range)
23320 {
23321 struct mach_vm_range tmp;
23322
23323 vm_map_lock_assert_held(map);
23324
23325 static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
23326 static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
23327
23328 if (mach_vm_range_contains(&map->default_range, addr, size)) {
23329 if (range) {
23330 *range = map->default_range;
23331 }
23332 return UMEM_RANGE_ID_DEFAULT;
23333 }
23334
23335 if (mach_vm_range_contains(&map->data_range, addr, size)) {
23336 if (range) {
23337 *range = map->data_range;
23338 }
23339 return UMEM_RANGE_ID_HEAP;
23340 }
23341
23342 for (size_t i = 0; i < map->extra_ranges_count; i++) {
23343 vm_map_user_range_t r = &map->extra_ranges[i];
23344
23345 tmp.min_address = r->vmur_min_address;
23346 tmp.max_address = r->vmur_max_address;
23347
23348 if (mach_vm_range_contains(&tmp, addr, size)) {
23349 if (range) {
23350 *range = tmp;
23351 }
23352 return r->vmur_range_id;
23353 }
23354 }
23355
23356 if (range) {
23357 range->min_address = range->max_address = 0;
23358 }
23359 return UMEM_RANGE_ID_DEFAULT;
23360 }
23361
23362 static int
vm_map_user_range_cmp(const void * e1,const void * e2)23363 vm_map_user_range_cmp(const void *e1, const void *e2)
23364 {
23365 const struct vm_map_user_range *r1 = e1;
23366 const struct vm_map_user_range *r2 = e2;
23367
23368 if (r1->vmur_min_address != r2->vmur_min_address) {
23369 return r1->vmur_min_address < r2->vmur_min_address ? -1 : 1;
23370 }
23371
23372 return 0;
23373 }
23374
23375 static int
mach_vm_range_recipe_v1_cmp(const void * e1,const void * e2)23376 mach_vm_range_recipe_v1_cmp(const void *e1, const void *e2)
23377 {
23378 const mach_vm_range_recipe_v1_t *r1 = e1;
23379 const mach_vm_range_recipe_v1_t *r2 = e2;
23380
23381 if (r1->range.min_address != r2->range.min_address) {
23382 return r1->range.min_address < r2->range.min_address ? -1 : 1;
23383 }
23384
23385 return 0;
23386 }
23387
23388 /*!
23389 * @function mach_vm_range_create_v1()
23390 *
23391 * @brief
23392 * Handle the backend for mach_vm_range_create() for the
23393 * MACH_VM_RANGE_FLAVOR_V1 flavor.
23394 *
23395 * @description
23396 * This call allows to create "ranges" in the map of a task
23397 * that have special semantics/policies around placement of
23398 * new allocations (in the vm_map_locate_space() sense).
23399 *
23400 * @returns
23401 * - KERN_SUCCESS on success
23402 * - KERN_INVALID_ARGUMENT for incorrect arguments
23403 * - KERN_NO_SPACE if the maximum amount of ranges would be exceeded
23404 * - KERN_MEMORY_PRESENT if any of the requested ranges
23405 * overlaps with existing ranges or allocations in the map.
23406 */
static kern_return_t
mach_vm_range_create_v1(
	vm_map_t map,
	mach_vm_range_recipe_v1_t *recipe,
	uint32_t new_count)
{
	const vm_offset_t mask = VM_MAP_PAGE_MASK(map);
	vm_map_user_range_t table;
	kern_return_t kr = KERN_SUCCESS;
	uint16_t count;

	/*
	 * The only two "voids" that user ranges may be carved out of:
	 * the gap between the map's default range and its data range,
	 * and the gap between the data range and the end of the map.
	 */
	struct mach_vm_range void1 = {
		.min_address = map->default_range.max_address,
		.max_address = map->data_range.min_address,
	};
	struct mach_vm_range void2 = {
		.min_address = map->data_range.max_address,
		.max_address = vm_map_max(map),
	};

	/*
	 * Sort the recipes by address so that both the self-overlap check
	 * below and the merge against the existing table are linear scans.
	 */
	qsort(recipe, new_count, sizeof(mach_vm_range_recipe_v1_t),
	    mach_vm_range_recipe_v1_cmp);

	/*
	 * Step 1: Validate that the recipes have no intersections.
	 */

	for (size_t i = 0; i < new_count; i++) {
		mach_vm_range_t r = &recipe[i].range;
		mach_vm_size_t s = mach_vm_range_size(r);

		/* no recipe flags are defined for the v1 flavor */
		if (recipe[i].flags) {
			return KERN_INVALID_ARGUMENT;
		}

		static_assert(UMEM_RANGE_ID_FIXED == MACH_VM_RANGE_FIXED);
		switch (recipe[i].range_tag) {
		case MACH_VM_RANGE_FIXED:
			break;
		default:
			return KERN_INVALID_ARGUMENT;
		}

		/* both bounds must be aligned to the map's page size */
		if (!VM_MAP_PAGE_ALIGNED(r->min_address, mask) ||
		    !VM_MAP_PAGE_ALIGNED(r->max_address, mask)) {
			return KERN_INVALID_ARGUMENT;
		}

		/* the range must fit entirely inside one of the two voids */
		if (!mach_vm_range_contains(&void1, r->min_address, s) &&
		    !mach_vm_range_contains(&void2, r->min_address, s)) {
			return KERN_INVALID_ARGUMENT;
		}

		/* recipes are sorted: adjacent overlap check suffices */
		if (i > 0 && recipe[i - 1].range.max_address >
		    recipe[i].range.min_address) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	vm_map_lock(map);

	table = map->extra_ranges;
	count = map->extra_ranges_count;

	if (count + new_count > VM_MAP_EXTRA_RANGES_MAX) {
		kr = KERN_NO_SPACE;
		goto out_unlock;
	}

	/*
	 * Step 2: Check that there is no intersection with existing ranges.
	 */

	/* classic sorted-list intersection walk over recipes vs. table */
	for (size_t i = 0, j = 0; i < new_count && j < count;) {
		mach_vm_range_t r1 = &recipe[i].range;
		vm_map_user_range_t r2 = &table[j];

		if (r1->max_address <= r2->vmur_min_address) {
			i++;
		} else if (r2->vmur_max_address <= r1->min_address) {
			j++;
		} else {
			kr = KERN_MEMORY_PRESENT;
			goto out_unlock;
		}
	}

	/*
	 * Step 3: commit the new ranges.
	 */

	static_assert(VM_MAP_EXTRA_RANGES_MAX * sizeof(struct vm_map_user_range) <=
	    KALLOC_SAFE_ALLOC_SIZE);

	/*
	 * Z_NOFAIL is safe because the static_assert above bounds the
	 * allocation size.
	 * NOTE(review): Z_WAITOK while holding the map lock — presumably
	 * acceptable on this path; confirm against locking rules.
	 */
	table = krealloc_data(table,
	    count * sizeof(struct vm_map_user_range),
	    (count + new_count) * sizeof(struct vm_map_user_range),
	    Z_ZERO | Z_WAITOK | Z_NOFAIL);

	for (size_t i = 0; i < new_count; i++) {
		/* addresses must fit the packed vmur_* bitfields */
		static_assert(MACH_VM_MAX_ADDRESS < (1ull << 56));

		table[count + i] = (struct vm_map_user_range){
			.vmur_min_address = recipe[i].range.min_address,
			.vmur_max_address = recipe[i].range.max_address,
			.vmur_range_id = (vm_map_range_id_t)recipe[i].range_tag,
		};
	}

	/* keep the table sorted for future intersection walks */
	qsort(table, count + new_count,
	    sizeof(struct vm_map_user_range), vm_map_user_range_cmp);

	map->extra_ranges_count += new_count;
	map->extra_ranges = table;

out_unlock:
	vm_map_unlock(map);

	if (kr == KERN_SUCCESS) {
		/*
		 * Reserve each committed range with a VM_PROT_NONE mapping
		 * (fixed, overwrite, overwrite-immutable) so nothing else
		 * can be placed there by accident.
		 */
		for (size_t i = 0; i < new_count; i++) {
			vm_map_kernel_flags_t vmk_flags = {
				.vmf_fixed = true,
				.vmf_overwrite = true,
				.vmkf_overwrite_immutable = true,
				.vm_tag = recipe[i].vm_tag,
			};
			__assert_only kern_return_t kr2;

			kr2 = vm_map_enter(map, &recipe[i].range.min_address,
			    mach_vm_range_size(&recipe[i].range),
			    0, vmk_flags, VM_OBJECT_NULL, 0, FALSE,
			    VM_PROT_NONE, VM_PROT_ALL,
			    VM_INHERIT_DEFAULT);
			assert(kr2 == KERN_SUCCESS);
		}
	}
	return kr;
}
23545
23546 kern_return_t
mach_vm_range_create(vm_map_t map,mach_vm_range_flavor_t flavor,mach_vm_range_recipes_raw_t recipe,natural_t size)23547 mach_vm_range_create(
23548 vm_map_t map,
23549 mach_vm_range_flavor_t flavor,
23550 mach_vm_range_recipes_raw_t recipe,
23551 natural_t size)
23552 {
23553 if (map != current_map()) {
23554 return KERN_INVALID_ARGUMENT;
23555 }
23556
23557 if (!map->uses_user_ranges) {
23558 return KERN_NOT_SUPPORTED;
23559 }
23560
23561 if (size == 0) {
23562 return KERN_SUCCESS;
23563 }
23564
23565 if (flavor == MACH_VM_RANGE_FLAVOR_V1) {
23566 mach_vm_range_recipe_v1_t *array;
23567
23568 if (size % sizeof(mach_vm_range_recipe_v1_t)) {
23569 return KERN_INVALID_ARGUMENT;
23570 }
23571
23572 size /= sizeof(mach_vm_range_recipe_v1_t);
23573 if (size > VM_MAP_EXTRA_RANGES_MAX) {
23574 return KERN_NO_SPACE;
23575 }
23576
23577 array = (mach_vm_range_recipe_v1_t *)recipe;
23578 return mach_vm_range_create_v1(map, array, size);
23579 }
23580
23581 return KERN_INVALID_ARGUMENT;
23582 }
23583
23584 #else /* !CONFIG_MAP_RANGES */
23585
23586 kern_return_t
mach_vm_range_create(vm_map_t map,mach_vm_range_flavor_t flavor,mach_vm_range_recipes_raw_t recipe,natural_t size)23587 mach_vm_range_create(
23588 vm_map_t map,
23589 mach_vm_range_flavor_t flavor,
23590 mach_vm_range_recipes_raw_t recipe,
23591 natural_t size)
23592 {
23593 #pragma unused(map, flavor, recipe, size)
23594 return KERN_NOT_SUPPORTED;
23595 }
23596
23597 #endif /* !CONFIG_MAP_RANGES */
23598
/*
 * vm_map_kernel_flags_update_range_id:
 *	Fill in a default placement range ID when the caller left it
 *	unset.  For the kernel map, unset means the DATA kmem range.
 *	For user maps (CONFIG_MAP_RANGES), allocations whose VM tag is
 *	set in vm_map_user_range_heap_map are redirected from the
 *	default user range to the HEAP user range.
 */
void
vm_map_kernel_flags_update_range_id(vm_map_kernel_flags_t *vmkf, vm_map_t map)
{
	if (map == kernel_map) {
		if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
			vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
		}
#if CONFIG_MAP_RANGES
	/* note: this else-if arm only exists when user ranges are built in */
	} else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
	    vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT &&
	    bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
		vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
#endif /* CONFIG_MAP_RANGES */
	}
}
23614
23615 /*
23616 * vm_map_entry_has_device_pager:
23617 * Check if the vm map entry specified by the virtual address has a device pager.
23618 * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
23619 */
23620 boolean_t
vm_map_entry_has_device_pager(vm_map_t map,vm_map_offset_t vaddr)23621 vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
23622 {
23623 vm_map_entry_t entry;
23624 vm_object_t object;
23625 boolean_t result;
23626
23627 if (map == NULL) {
23628 return FALSE;
23629 }
23630
23631 vm_map_lock(map);
23632 while (TRUE) {
23633 if (!vm_map_lookup_entry(map, vaddr, &entry)) {
23634 result = FALSE;
23635 break;
23636 }
23637 if (entry->is_sub_map) {
23638 // Check the submap
23639 vm_map_t submap = VME_SUBMAP(entry);
23640 assert(submap != NULL);
23641 vm_map_lock(submap);
23642 vm_map_unlock(map);
23643 map = submap;
23644 continue;
23645 }
23646 object = VME_OBJECT(entry);
23647 if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
23648 result = TRUE;
23649 break;
23650 }
23651 result = FALSE;
23652 break;
23653 }
23654
23655 vm_map_unlock(map);
23656 return result;
23657 }
23658
23659
23660 #if MACH_ASSERT
23661
/* Panic policy knobs defined in the pmap layer. */
extern int pmap_ledgers_panic;
extern int pmap_ledgers_panic_leeway;

/*
 * LEDGER_DRIFT(x) declares the drift counters kept for ledger entry x:
 * the number of pmaps seen over/under zero at check time, the running
 * totals in each direction, and the worst single imbalance observed.
 */
#define LEDGER_DRIFT(__LEDGER) \
	int __LEDGER##_over; \
	ledger_amount_t __LEDGER##_over_total; \
	ledger_amount_t __LEDGER##_over_max; \
	int __LEDGER##_under; \
	ledger_amount_t __LEDGER##_under_total; \
	ledger_amount_t __LEDGER##_under_max

/*
 * Global accumulator of ledger imbalances across every pmap checked by
 * vm_map_pmap_check_ledgers() (MACH_ASSERT builds only).
 */
struct {
	uint64_t num_pmaps_checked;

	LEDGER_DRIFT(phys_footprint);
	LEDGER_DRIFT(internal);
	LEDGER_DRIFT(internal_compressed);
	LEDGER_DRIFT(external);
	LEDGER_DRIFT(reusable);
	LEDGER_DRIFT(iokit_mapped);
	LEDGER_DRIFT(alternate_accounting);
	LEDGER_DRIFT(alternate_accounting_compressed);
	LEDGER_DRIFT(page_table);
	LEDGER_DRIFT(purgeable_volatile);
	LEDGER_DRIFT(purgeable_nonvolatile);
	LEDGER_DRIFT(purgeable_volatile_compressed);
	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
	LEDGER_DRIFT(tagged_nofootprint);
	LEDGER_DRIFT(tagged_footprint);
	LEDGER_DRIFT(tagged_nofootprint_compressed);
	LEDGER_DRIFT(tagged_footprint_compressed);
	LEDGER_DRIFT(network_volatile);
	LEDGER_DRIFT(network_nonvolatile);
	LEDGER_DRIFT(network_volatile_compressed);
	LEDGER_DRIFT(network_nonvolatile_compressed);
	LEDGER_DRIFT(media_nofootprint);
	LEDGER_DRIFT(media_footprint);
	LEDGER_DRIFT(media_nofootprint_compressed);
	LEDGER_DRIFT(media_footprint_compressed);
	LEDGER_DRIFT(graphics_nofootprint);
	LEDGER_DRIFT(graphics_footprint);
	LEDGER_DRIFT(graphics_nofootprint_compressed);
	LEDGER_DRIFT(graphics_footprint_compressed);
	LEDGER_DRIFT(neural_nofootprint);
	LEDGER_DRIFT(neural_footprint);
	LEDGER_DRIFT(neural_nofootprint_compressed);
	LEDGER_DRIFT(neural_footprint_compressed);
} pmap_ledgers_drift;
23710
/*
 * vm_map_pmap_check_ledgers:
 *	Sanity-check (MACH_ASSERT builds only) that every pmap-related
 *	ledger entry has drained back to zero — invoked on pmap teardown
 *	(see the pmap_destroy() panic string below).  Each non-zero
 *	balance is logged and accumulated into pmap_ledgers_drift; a
 *	panic may be raised depending on the per-entry panic_on_negative
 *	setting and the pmap_ledgers_panic/pmap_ledgers_panic_leeway
 *	knobs.
 */
void
vm_map_pmap_check_ledgers(
	pmap_t pmap,
	ledger_t ledger,
	int pid,
	char *procname)
{
	ledger_amount_t bal;
	boolean_t do_panic;

	do_panic = FALSE;

	pmap_ledgers_drift.num_pmaps_checked++;

/*
 * For one ledger entry: read its balance, decide whether the imbalance
 * warrants a panic (either the entry demands panic-on-negative, or the
 * global leeway is configured and exceeded), log it, and record the
 * drift statistics.
 */
#define LEDGER_CHECK_BALANCE(__LEDGER) \
	MACRO_BEGIN \
	int panic_on_negative = TRUE; \
	ledger_get_balance(ledger, \
	    task_ledgers.__LEDGER, \
	    &bal); \
	ledger_get_panic_on_negative(ledger, \
	    task_ledgers.__LEDGER, \
	    &panic_on_negative); \
	if (bal != 0) { \
	        if (panic_on_negative || \
	            (pmap_ledgers_panic && \
	            pmap_ledgers_panic_leeway > 0 && \
	            (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) || \
	            bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
	                do_panic = TRUE; \
	        } \
	        printf("LEDGER BALANCE proc %d (%s) " \
	            "\"%s\" = %lld\n", \
	            pid, procname, #__LEDGER, bal); \
	        if (bal > 0) { \
	                pmap_ledgers_drift.__LEDGER##_over++; \
	                pmap_ledgers_drift.__LEDGER##_over_total += bal; \
	                if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
	                        pmap_ledgers_drift.__LEDGER##_over_max = bal; \
	                } \
	        } else if (bal < 0) { \
	                pmap_ledgers_drift.__LEDGER##_under++; \
	                pmap_ledgers_drift.__LEDGER##_under_total += bal; \
	                if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
	                        pmap_ledgers_drift.__LEDGER##_under_max = bal; \
	                } \
	        } \
	} \
	MACRO_END

	/* check every pmap-tracked ledger entry */
	LEDGER_CHECK_BALANCE(phys_footprint);
	LEDGER_CHECK_BALANCE(internal);
	LEDGER_CHECK_BALANCE(internal_compressed);
	LEDGER_CHECK_BALANCE(external);
	LEDGER_CHECK_BALANCE(reusable);
	LEDGER_CHECK_BALANCE(iokit_mapped);
	LEDGER_CHECK_BALANCE(alternate_accounting);
	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
	LEDGER_CHECK_BALANCE(page_table);
	LEDGER_CHECK_BALANCE(purgeable_volatile);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(tagged_nofootprint);
	LEDGER_CHECK_BALANCE(tagged_footprint);
	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
	LEDGER_CHECK_BALANCE(network_volatile);
	LEDGER_CHECK_BALANCE(network_nonvolatile);
	LEDGER_CHECK_BALANCE(network_volatile_compressed);
	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(media_nofootprint);
	LEDGER_CHECK_BALANCE(media_footprint);
	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(media_footprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_nofootprint);
	LEDGER_CHECK_BALANCE(graphics_footprint);
	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
	LEDGER_CHECK_BALANCE(neural_nofootprint);
	LEDGER_CHECK_BALANCE(neural_footprint);
	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(neural_footprint_compressed);

	if (do_panic) {
		/* pmap_ledgers_panic selects panic vs. log-only */
		if (pmap_ledgers_panic) {
			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
			    pmap, pid, procname);
		} else {
			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
			    pmap, pid, procname);
		}
	}
}
23805
23806 void
vm_map_pmap_set_process(vm_map_t map,int pid,char * procname)23807 vm_map_pmap_set_process(
23808 vm_map_t map,
23809 int pid,
23810 char *procname)
23811 {
23812 pmap_set_process(vm_map_pmap(map), pid, procname);
23813 }
23814
23815 #endif /* MACH_ASSERT */
23816