/*
 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	vm/vm_map.c
 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
 *	Date:	1985
 *
 *	Virtual memory mapping module.
 */

#include <mach/vm_types.h>
#include <mach_assert.h>

#include <vm/vm_options.h>

#include <libkern/OSAtomic.h>

#include <mach/kern_return.h>
#include <mach/port.h>
#include <mach/vm_attributes.h>
#include <mach/vm_param.h>
#include <mach/vm_behavior.h>
#include <mach/vm_statistics.h>
#include <mach/memory_object.h>
#include <mach/mach_vm.h>
#include <machine/cpu_capabilities.h>
#include <mach/sdt.h>

#include <kern/assert.h>
#include <kern/backtrace.h>
#include <kern/counter.h>
#include <kern/exc_guard.h>
#include <kern/kalloc.h>
#include <kern/zalloc_internal.h>

#include <vm/cpm.h>
#include <vm/vm_compressor.h>
#include <vm/vm_compressor_pager.h>
#include <vm/vm_init.h>
#include <vm/vm_fault.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <ipc/ipc_port.h>
#include <kern/sched_prim.h>
#include <kern/misc_protos.h>

#include <mach/vm_map_server.h>
#include <mach/mach_host_server.h>
#include <vm/vm_memtag.h>
#include <vm/vm_protos.h>
#include <vm/vm_purgeable_internal.h>
#include <vm/vm_reclaim_internal.h>

#include <vm/vm_shared_region.h>
#include <vm/vm_map_store.h>

#include <san/kasan.h>

#include <sys/resource.h>
#include <sys/random.h>
#include <sys/codesign.h>
#include <sys/code_signing.h>
#include <sys/mman.h>
#include <sys/reboot.h>
#include <sys/kdebug_triage.h>

#include <libkern/section_keywords.h>

#if DEVELOPMENT || DEBUG
extern int proc_selfcsflags(void);
int vm_log_xnu_user_debug = 0;
int panic_on_unsigned_execute = 0;
int panic_on_mlock_failure = 0;
#endif /* DEVELOPMENT || DEBUG */

#if MACH_ASSERT
int debug4k_filter = 0;
char debug4k_proc_name[1024] = "";
int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
int debug4k_panic_on_misaligned_sharing = 0;
const char *debug4k_category_name[] = {
	"error",        /* 0 */
	"life",         /* 1 */
	"load",         /* 2 */
	"fault",        /* 3 */
	"copy",         /* 4 */
	"share",        /* 5 */
	"adjust",       /* 6 */
	"pmap",         /* 7 */
	"mementry",     /* 8 */
	"iokit",        /* 9 */
	"upl",          /* 10 */
	"exc",          /* 11 */
	"vfs"           /* 12 */
};
#endif /* MACH_ASSERT */
int debug4k_no_cow_copyin = 0;


#if __arm64__
extern const int fourk_binary_compatibility_unsafe;
extern const int fourk_binary_compatibility_allow_wx;
#endif /* __arm64__ */
extern void qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *));
extern int proc_selfpid(void);
extern char *proc_name_address(void *p);
extern char *proc_best_name(struct proc *p);

#if VM_MAP_DEBUG_APPLE_PROTECT
int vm_map_debug_apple_protect = 0;
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */
#if VM_MAP_DEBUG_FOURK
int vm_map_debug_fourk = 0;
#endif /* VM_MAP_DEBUG_FOURK */

#if DEBUG || DEVELOPMENT
static TUNABLE(bool, vm_map_executable_immutable,
    "vm_map_executable_immutable", true);
#else
#define vm_map_executable_immutable true
#endif

os_refgrp_decl(static, map_refgrp, "vm_map", NULL);

extern u_int32_t random(void);  /* from <libkern/libkern.h> */

/* Internal prototypes */

typedef struct vm_map_zap {
	vm_map_entry_t  vmz_head;
	vm_map_entry_t *vmz_tail;
} *vm_map_zap_t;

#define VM_MAP_ZAP_DECLARE(zap) \
	struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
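
/*
 * Usage sketch (mirroring vm_map_destroy() later in this file): entries
 * removed from a map are collected on a zap list while the map lock is
 * held, and disposed of after the lock is dropped, so object and submap
 * deallocations happen outside the locked region:
 *
 *	VM_MAP_ZAP_DECLARE(zap);
 *
 *	vm_map_lock(map);
 *	(void)vm_map_delete(map, start, end, VM_MAP_REMOVE_NO_FLAGS,
 *	    KMEM_GUARD_NONE, &zap);
 *	vm_map_unlock(map);
 *	vm_map_zap_dispose(&zap);
 */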

static vm_map_entry_t   vm_map_entry_insert(
	vm_map_t                map,
	vm_map_entry_t          insp_entry,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_map_kernel_flags_t   vmk_flags,
	boolean_t               needs_copy,
	vm_prot_t               cur_protection,
	vm_prot_t               max_protection,
	vm_inherit_t            inheritance,
	boolean_t               clear_map_aligned);

static void             vm_map_simplify_range(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end);   /* forward */

static boolean_t        vm_map_range_check(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_map_entry_t          *entry);

static void             vm_map_submap_pmap_clean(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_map_t                sub_map,
	vm_map_offset_t         offset);

static void             vm_map_pmap_enter(
	vm_map_t                map,
	vm_map_offset_t         addr,
	vm_map_offset_t         end_addr,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_prot_t               protection);

static void             _vm_map_clip_end(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         end);

static void             _vm_map_clip_start(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         start);

static kmem_return_t    vm_map_delete(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vmr_flags_t             flags,
	kmem_guard_t            guard,
	vm_map_zap_t            zap);

static void             vm_map_copy_insert(
	vm_map_t                map,
	vm_map_entry_t          after_where,
	vm_map_copy_t           copy);

static kern_return_t    vm_map_copy_overwrite_unaligned(
	vm_map_t                dst_map,
	vm_map_entry_t          entry,
	vm_map_copy_t           copy,
	vm_map_address_t        start,
	boolean_t               discard_on_success);

static kern_return_t    vm_map_copy_overwrite_aligned(
	vm_map_t                dst_map,
	vm_map_entry_t          tmp_entry,
	vm_map_copy_t           copy,
	vm_map_offset_t         start,
	pmap_t                  pmap);

static kern_return_t    vm_map_copyin_kernel_buffer(
	vm_map_t                src_map,
	vm_map_address_t        src_addr,
	vm_map_size_t           len,
	boolean_t               src_destroy,
	vm_map_copy_t           *copy_result);  /* OUT */

static kern_return_t    vm_map_copyout_kernel_buffer(
	vm_map_t                map,
	vm_map_address_t        *addr,  /* IN/OUT */
	vm_map_copy_t           copy,
	vm_map_size_t           copy_size,
	boolean_t               overwrite,
	boolean_t               consume_on_success);

static void             vm_map_fork_share(
	vm_map_t                old_map,
	vm_map_entry_t          old_entry,
	vm_map_t                new_map);

static boolean_t        vm_map_fork_copy(
	vm_map_t                old_map,
	vm_map_entry_t          *old_entry_p,
	vm_map_t                new_map,
	int                     vm_map_copyin_flags);

static kern_return_t    vm_map_wire_nested(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_prot_t               caller_prot,
	vm_tag_t                tag,
	boolean_t               user_wire,
	pmap_t                  map_pmap,
	vm_map_offset_t         pmap_addr,
	ppnum_t                 *physpage_p);

static kern_return_t    vm_map_unwire_nested(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	boolean_t               user_wire,
	pmap_t                  map_pmap,
	vm_map_offset_t         pmap_addr);

static kern_return_t    vm_map_overwrite_submap_recurse(
	vm_map_t                dst_map,
	vm_map_offset_t         dst_addr,
	vm_map_size_t           dst_size);

static kern_return_t    vm_map_copy_overwrite_nested(
	vm_map_t                dst_map,
	vm_map_offset_t         dst_addr,
	vm_map_copy_t           copy,
	boolean_t               interruptible,
	pmap_t                  pmap,
	boolean_t               discard_on_success);

static kern_return_t    vm_map_remap_extract(
	vm_map_t                map,
	vm_map_offset_t         addr,
	vm_map_size_t           size,
	boolean_t               copy,
	vm_map_copy_t           map_copy,
	vm_prot_t               *cur_protection,
	vm_prot_t               *max_protection,
	vm_inherit_t            inheritance,
	vm_map_kernel_flags_t   vmk_flags);

static kern_return_t    vm_map_remap_range_allocate(
	vm_map_t                map,
	vm_map_address_t        *address,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_entry_t          *map_entry,
	vm_map_zap_t            zap_list);

static void             vm_map_region_look_for_page(
	vm_map_t                map,
	vm_map_offset_t         va,
	vm_object_t             object,
	vm_object_offset_t      offset,
	int                     max_refcnt,
	unsigned short          depth,
	vm_region_extended_info_t extended,
	mach_msg_type_number_t  count);

static int              vm_map_region_count_obj_refs(
	vm_map_entry_t          entry,
	vm_object_t             object);


static kern_return_t    vm_map_willneed(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end);

static kern_return_t    vm_map_reuse_pages(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end);

static kern_return_t    vm_map_reusable_pages(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end);

static kern_return_t    vm_map_can_reuse(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end);

static kern_return_t    vm_map_zero(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end);

static kern_return_t    vm_map_random_address_for_size(
	vm_map_t                map,
	vm_map_offset_t         *address,
	vm_map_size_t           size,
	vm_map_kernel_flags_t   vmk_flags);


#if CONFIG_MAP_RANGES

static vm_map_range_id_t vm_map_user_range_resolve(
	vm_map_t                map,
	mach_vm_address_t       addr,
	mach_vm_address_t       size,
	mach_vm_range_t         range);

#endif /* CONFIG_MAP_RANGES */
#if MACH_ASSERT
static kern_return_t    vm_map_pageout(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end);
#endif /* MACH_ASSERT */

kern_return_t vm_map_corpse_footprint_collect(
	vm_map_t        old_map,
	vm_map_entry_t  old_entry,
	vm_map_t        new_map);
void vm_map_corpse_footprint_collect_done(
	vm_map_t        new_map);
void vm_map_corpse_footprint_destroy(
	vm_map_t        map);
kern_return_t vm_map_corpse_footprint_query_page_info(
	vm_map_t        map,
	vm_map_offset_t va,
	int             *disposition_p);
void vm_map_footprint_query_page_info(
	vm_map_t        map,
	vm_map_entry_t  map_entry,
	vm_map_offset_t curr_s_offset,
	int             *disposition_p);

#if CONFIG_MAP_RANGES
static void vm_map_range_map_init(void);
#endif /* CONFIG_MAP_RANGES */

pid_t find_largest_process_vm_map_entries(void);

extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code,
    mach_exception_data_type_t subcode);

/*
 * Macros to copy a vm_map_entry. We must be careful to correctly
 * manage the wired page count. vm_map_entry_copy() creates a new
 * map entry to the same memory - the wired count in the new entry
 * must be set to zero. vm_map_entry_copy_full() creates a new
 * entry that is identical to the old entry. This preserves the
 * wire count; it's used for map splitting and zone changing in
 * vm_map_copyout.
 */
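
/*
 * For example, after vm_map_entry_copy(map, new, old) below, "new" maps
 * the same memory as "old" but starts with wired_count == 0 and
 * user_wired_count == 0, whereas vm_map_entry_copy_full(new, old)
 * leaves *new bit-for-bit identical to *old, wire counts included.
 */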

static inline void
vm_map_entry_copy_csm_assoc(
	vm_map_t        map __unused,
	vm_map_entry_t  new __unused,
	vm_map_entry_t  old __unused)
{
#if CODE_SIGNING_MONITOR
	/* when code signing monitor is enabled, we want to reset on copy */
	new->csm_associated = FALSE;
#else
	/* when code signing monitor is not enabled, assert as a sanity check */
	assert(new->csm_associated == FALSE);
#endif
#if DEVELOPMENT || DEBUG
	if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
		printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] resetting vme_xnu_user_debug\n",
		    proc_selfpid(),
		    (get_bsdtask_info(current_task())
		    ? proc_name_address(get_bsdtask_info(current_task()))
		    : "?"),
		    __FUNCTION__, __LINE__,
		    map, new, new->vme_start, new->vme_end);
	}
#endif /* DEVELOPMENT || DEBUG */
	new->vme_xnu_user_debug = FALSE;
}

/*
 * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
 * But for security reasons on some platforms, we don't want the
 * new mapping to be "used for jit", so we reset the flag here.
 */
static inline void
vm_map_entry_copy_code_signing(
	vm_map_t        map,
	vm_map_entry_t  new,
	vm_map_entry_t  old __unused)
{
	if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
		assert(new->used_for_jit == old->used_for_jit);
	} else {
		if (old->used_for_jit) {
			DTRACE_VM3(cs_wx,
			    uint64_t, new->vme_start,
			    uint64_t, new->vme_end,
			    vm_prot_t, new->protection);
			printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__,
			    "removing execute access");
			new->protection &= ~VM_PROT_EXECUTE;
			new->max_protection &= ~VM_PROT_EXECUTE;
		}
		new->used_for_jit = FALSE;
	}
}

static inline void
vm_map_entry_copy_full(
	vm_map_entry_t  new,
	vm_map_entry_t  old)
{
#if MAP_ENTRY_CREATION_DEBUG
	btref_put(new->vme_creation_bt);
	btref_retain(old->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	btref_put(new->vme_insertion_bt);
	btref_retain(old->vme_insertion_bt);
#endif
#if VM_BTLOG_TAGS
	/* Discard the btref that might be in the new entry */
	if (new->vme_kernel_object) {
		btref_put(new->vme_tag_btref);
	}
	/* Retain the btref in the old entry to account for its copy */
	if (old->vme_kernel_object) {
		btref_retain(old->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */
	*new = *old;
}

static inline void
vm_map_entry_copy(
	vm_map_t        map,
	vm_map_entry_t  new,
	vm_map_entry_t  old)
{
	vm_map_entry_copy_full(new, old);

	new->is_shared = FALSE;
	new->needs_wakeup = FALSE;
	new->in_transition = FALSE;
	new->wired_count = 0;
	new->user_wired_count = 0;
	new->vme_permanent = FALSE;
	vm_map_entry_copy_code_signing(map, new, old);
	vm_map_entry_copy_csm_assoc(map, new, old);
	if (new->iokit_acct) {
		assertf(!new->use_pmap, "old %p new %p\n", old, new);
		new->iokit_acct = FALSE;
		new->use_pmap = TRUE;
	}
	new->vme_resilient_codesign = FALSE;
	new->vme_resilient_media = FALSE;
	new->vme_atomic = FALSE;
	new->vme_no_copy_on_read = FALSE;
}

/*
 * Normal lock_read_to_write() returns FALSE/0 on failure.
 * These functions evaluate to zero on success and non-zero value on failure.
 */
__attribute__((always_inline))
int
vm_map_lock_read_to_write(vm_map_t map)
{
	if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
		DTRACE_VM(vm_map_lock_upgrade);
		return 0;
	}
	return 1;
}
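
/*
 * Usage sketch (an illustration, not lifted from a specific caller):
 * when the upgrade fails, lck_rw_lock_shared_to_exclusive() has already
 * dropped the shared lock, so the caller must re-acquire it and
 * revalidate anything it looked up under the read lock:
 *
 *	vm_map_lock_read(map);
 *	...
 *	if (vm_map_lock_read_to_write(map)) {
 *		vm_map_lock_read(map);
 *		... revalidate, or restart the lookup ...
 *	}
 */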

__attribute__((always_inline))
boolean_t
vm_map_try_lock(vm_map_t map)
{
	if (lck_rw_try_lock_exclusive(&(map)->lock)) {
		DTRACE_VM(vm_map_lock_w);
		return TRUE;
	}
	return FALSE;
}

__attribute__((always_inline))
boolean_t
vm_map_try_lock_read(vm_map_t map)
{
	if (lck_rw_try_lock_shared(&(map)->lock)) {
		DTRACE_VM(vm_map_lock_r);
		return TRUE;
	}
	return FALSE;
}

/*!
 * @function kdp_vm_map_is_acquired_exclusive
 *
 * @abstract
 * Checks if vm map is acquired exclusive.
 *
 * @discussion
 * NOT SAFE: To be used only by kernel debugger.
 *
 * @param map map to check
 *
 * @returns TRUE if the map is acquired exclusively.
 */
boolean_t
kdp_vm_map_is_acquired_exclusive(vm_map_t map)
{
	return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
}

/*
 * Routines to get the page size the caller should
 * use while inspecting the target address space.
 * Use the "_safely" variant if the caller is dealing with a user-provided
 * array whose size depends on the page size, to avoid any overflow or
 * underflow of a user-allocated buffer.
 */
int
vm_self_region_page_shift_safely(
	vm_map_t target_map)
{
	int effective_page_shift = 0;

	if (PAGE_SIZE == (4096)) {
		/* x86_64 and 4k watches: always use 4k */
		return PAGE_SHIFT;
	}
	/* did caller provide an explicit page size for this thread to use? */
	effective_page_shift = thread_self_region_page_shift();
	if (effective_page_shift) {
		/* use the explicitly-provided page size */
		return effective_page_shift;
	}
	/* no explicit page size: use the caller's page size... */
	effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
	if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
		/* page size match: safe to use */
		return effective_page_shift;
	}
	/* page size mismatch */
	return -1;
}
int
vm_self_region_page_shift(
	vm_map_t target_map)
{
	int effective_page_shift;

	effective_page_shift = vm_self_region_page_shift_safely(target_map);
	if (effective_page_shift == -1) {
		/* no safe value but OK to guess for caller */
		effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
		    VM_MAP_PAGE_SHIFT(target_map));
	}
	return effective_page_shift;
}
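
/*
 * Example (sketch, with hypothetical locals): sizing a user-visible
 * array from the target map's effective page size. A -1 result means
 * there is no safe value; treat it as an error instead of guessing,
 * since a wrong guess can over- or under-size a user buffer:
 *
 *	int shift = vm_self_region_page_shift_safely(target_map);
 *	if (shift == -1) {
 *		return KERN_INVALID_ARGUMENT;
 *	}
 *	num_pages = (mach_msg_type_number_t)(size >> shift);
 */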


/*
 * Decide if we want to allow processes to execute from their data or stack areas.
 * override_nx() returns true if we do.  Data/stack execution can be enabled independently
 * for 32 and 64 bit processes.  Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
 * or allow_stack_exec to enable data execution for that type of data area for that particular
 * ABI (or both by or'ing the flags together).  These are initialized in the architecture
 * specific pmap files since the default behavior varies according to architecture.  The
 * main reason it varies is because of the need to provide binary compatibility with old
 * applications that were written before these restrictions came into being.  In the old
 * days, an app could execute anything it could read, but this has slowly been tightened
 * up over time.  The default behavior is:
 *
 *	32-bit PPC apps may execute from both stack and data areas
 *	32-bit Intel apps may execute from data areas but not stack
 *	64-bit PPC/Intel apps may not execute from either data or stack
 *
 * An application on any architecture may override these defaults by explicitly
 * adding PROT_EXEC permission to the page in question with the mprotect(2)
 * system call.  This code here just determines what happens when an app tries to
 * execute from a page that lacks execute permission.
 *
 * Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
 * default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
 * a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
 * execution from data areas for a particular binary even if the arch normally permits it. As
 * a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
 * to support some complicated use cases, notably browsers with out-of-process plugins that
 * are not all NX-safe.
 */

extern int allow_data_exec, allow_stack_exec;

int
override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
{
	int current_abi;

	if (map->pmap == kernel_pmap) {
		return FALSE;
	}

	/*
	 * Determine if the app is running in 32 or 64 bit mode.
	 */

	if (vm_map_is_64bit(map)) {
		current_abi = VM_ABI_64;
	} else {
		current_abi = VM_ABI_32;
	}

	/*
	 * Determine if we should allow the execution based on whether it's a
	 * stack or data area and the current architecture.
	 */

	if (user_tag == VM_MEMORY_STACK) {
		return allow_stack_exec & current_abi;
	}

	return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
}
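
/*
 * Illustration (assumed values, not the default of any particular
 * configuration): a pmap layer that wanted to allow data execution for
 * 32-bit processes only would initialize
 *
 *	allow_data_exec = VM_ABI_32;
 *
 * after which override_nx() returns non-zero for a fault on a data page
 * in a 32-bit map (unless map_disallow_data_exec is set), and zero for
 * the same fault in a 64-bit map.
 */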


/*
 * Virtual memory maps provide for the mapping, protection,
 * and sharing of virtual memory objects.  In addition,
 * this module provides for an efficient virtual copy of
 * memory from one map to another.
 *
 * Synchronization is required prior to most operations.
 *
 * Maps consist of an ordered doubly-linked list of simple
 * entries; a single hint is used to speed up lookups.
 *
 * Sharing maps have been deleted from this version of Mach.
 * All shared objects are now mapped directly into the respective
 * maps.  This requires a change in the copy on write strategy;
 * the asymmetric (delayed) strategy is used for shared temporary
 * objects instead of the symmetric (shadow) strategy.  All maps
 * are now "top level" maps (either task map, kernel map or submap
 * of the kernel map).
 *
 * Since portions of maps are specified by start/end addresses,
 * which may not align with existing map entries, all
 * routines merely "clip" entries to these start/end values.
 * [That is, an entry is split into two, bordering at a
 * start or end value.]  Note that these clippings may not
 * always be necessary (as the two resulting entries are then
 * not changed); however, the clipping is done for convenience.
 * No attempt is currently made to "glue back together" two
 * abutting entries.
 *
 * The symmetric (shadow) copy strategy implements virtual copy
 * by copying VM object references from one map to
 * another, and then marking both regions as copy-on-write.
 * It is important to note that only one writeable reference
 * to a VM object region exists in any map when this strategy
 * is used -- this means that shadow object creation can be
 * delayed until a write operation occurs.  The asymmetric (delayed)
 * strategy allows multiple maps to have writeable references to
 * the same region of a vm object, and hence cannot delay creating
 * its copy objects.  See vm_object_copy_quickly() in vm_object.c.
 * Copying of permanent objects is completely different; see
 * vm_object_copy_strategically() in vm_object.c.
 */

ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);

#define VM_MAP_ZONE_NAME "maps"
#define VM_MAP_ZFLAGS (ZC_NOENCRYPT | ZC_VM)

#define VM_MAP_ENTRY_ZONE_NAME "VM map entries"
#define VM_MAP_ENTRY_ZFLAGS (ZC_NOENCRYPT | ZC_VM)

#define VM_MAP_HOLES_ZONE_NAME "VM map holes"
#define VM_MAP_HOLES_ZFLAGS (ZC_NOENCRYPT | ZC_VM)

/*
 * Asserts that a vm_map_copy object is coming from the
 * vm_map_copy_zone to ensure that it isn't a fake constructed
 * anywhere else.
 */
void
vm_map_copy_require(struct vm_map_copy *copy)
{
	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
}

/*
 * vm_map_require:
 *
 * Ensures that the argument is memory allocated from the genuine
 * vm map zone. (See zone_id_require_allow_foreign).
 */
void
vm_map_require(vm_map_t map)
{
	zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
}

#define VM_MAP_EARLY_COUNT_MAX 16
static __startup_data vm_offset_t map_data;
static __startup_data vm_size_t map_data_size;
static __startup_data vm_offset_t kentry_data;
static __startup_data vm_size_t kentry_data_size;
static __startup_data vm_offset_t map_holes_data;
static __startup_data vm_size_t map_holes_data_size;
static __startup_data vm_map_t *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
static __startup_data uint32_t early_map_count;

#if XNU_TARGET_OS_OSX
#define NO_COALESCE_LIMIT ((1024 * 128) - 1)
#else /* XNU_TARGET_OS_OSX */
#define NO_COALESCE_LIMIT 0
#endif /* XNU_TARGET_OS_OSX */

/* Skip acquiring locks if we're in the midst of a kernel core dump */
unsigned int not_in_kdp = 1;

unsigned int vm_map_set_cache_attr_count = 0;

kern_return_t
vm_map_set_cache_attr(
	vm_map_t        map,
	vm_map_offset_t va)
{
	vm_map_entry_t  map_entry;
	vm_object_t     object;
	kern_return_t   kr = KERN_SUCCESS;

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, va, &map_entry) ||
	    map_entry->is_sub_map) {
		/*
		 * that memory is not properly mapped
		 */
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}
	object = VME_OBJECT(map_entry);

	if (object == VM_OBJECT_NULL) {
		/*
		 * there should be a VM object here at this point
		 */
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}
	vm_object_lock(object);
	object->set_cache_attr = TRUE;
	vm_object_unlock(object);

	vm_map_set_cache_attr_count++;
done:
	vm_map_unlock_read(map);

	return kr;
}


#if CONFIG_CODE_DECRYPTION
/*
 * vm_map_apple_protected:
 * This remaps the requested part of the object with an object backed by
 * the decrypting pager.
 * crypt_info contains entry points and session data for the crypt module.
 * The crypt_info block will be copied by vm_map_apple_protected. The data structures
 * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
 */
kern_return_t
vm_map_apple_protected(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_object_offset_t      crypto_backing_offset,
	struct pager_crypt_info *crypt_info,
	uint32_t                cryptid)
{
	boolean_t       map_locked;
	kern_return_t   kr;
	vm_map_entry_t  map_entry;
	struct vm_map_entry tmp_entry;
	memory_object_t unprotected_mem_obj;
	vm_object_t     protected_object;
	vm_map_offset_t map_addr;
	vm_map_offset_t start_aligned, end_aligned;
	vm_object_offset_t crypto_start, crypto_end;
	boolean_t       cache_pager;

	map_locked = FALSE;
	unprotected_mem_obj = MEMORY_OBJECT_NULL;

	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}
	start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
	end_aligned = vm_map_round_page(end, PAGE_MASK_64);
	start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
	end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));

#if __arm64__
	/*
	 * "start" and "end" might be 4K-aligned but not 16K-aligned,
	 * so we might have to loop and establish up to 3 mappings:
	 *
	 * + the first 16K-page, which might overlap with the previous
	 *   4K-aligned mapping,
	 * + the center,
	 * + the last 16K-page, which might overlap with the next
	 *   4K-aligned mapping.
	 * Each of these mapping might be backed by a vnode pager (if
	 * properly page-aligned) or a "fourk_pager", itself backed by a
	 * vnode pager (if 4K-aligned but not page-aligned).
	 */
#endif /* __arm64__ */

	map_addr = start_aligned;
	for (map_addr = start_aligned;
	    map_addr < end;
	    map_addr = tmp_entry.vme_end) {
		vm_map_lock(map);
		map_locked = TRUE;

		/* lookup the protected VM object */
		if (!vm_map_lookup_entry(map,
		    map_addr,
		    &map_entry) ||
		    map_entry->is_sub_map ||
		    VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
			/* that memory is not properly mapped */
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		/*
		 * Ensure mapped memory is mapped as executable, except
		 * for the model decryption flow.
		 */
		if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
		    !(map_entry->protection & VM_PROT_EXECUTE)) {
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		/* get the protected object to be decrypted */
		protected_object = VME_OBJECT(map_entry);
		if (protected_object == VM_OBJECT_NULL) {
			/* there should be a VM object here at this point */
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}
		/* ensure protected object stays alive while map is unlocked */
		vm_object_reference(protected_object);

		/* limit the map entry to the area we want to cover */
		vm_map_clip_start(map, map_entry, start_aligned);
		vm_map_clip_end(map, map_entry, end_aligned);

		tmp_entry = *map_entry;
		map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
		vm_map_unlock(map);
		map_locked = FALSE;

		/*
		 * This map entry might be only partially encrypted
		 * (if not fully "page-aligned").
		 */
		crypto_start = 0;
		crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
		if (tmp_entry.vme_start < start) {
			if (tmp_entry.vme_start != start_aligned) {
				kr = KERN_INVALID_ADDRESS;
				vm_object_deallocate(protected_object);
				goto done;
			}
			crypto_start += (start - tmp_entry.vme_start);
		}
		if (tmp_entry.vme_end > end) {
			if (tmp_entry.vme_end != end_aligned) {
				kr = KERN_INVALID_ADDRESS;
				vm_object_deallocate(protected_object);
				goto done;
			}
			crypto_end -= (tmp_entry.vme_end - end);
		}

		/*
		 * This "extra backing offset" is needed to get the decryption
		 * routine to use the right key.  It adjusts for the possibly
		 * relative offset of an interposed "4K" pager...
		 */
		if (crypto_backing_offset == (vm_object_offset_t) -1) {
			crypto_backing_offset = VME_OFFSET(&tmp_entry);
		}

		cache_pager = TRUE;
#if XNU_TARGET_OS_OSX
		if (vm_map_is_alien(map)) {
			cache_pager = FALSE;
		}
#endif /* XNU_TARGET_OS_OSX */

		/*
		 * Lookup (and create if necessary) the protected memory object
		 * matching that VM object.
		 * If successful, this also grabs a reference on the memory object,
		 * to guarantee that it doesn't go away before we get a chance to map
		 * it.
		 */
		unprotected_mem_obj = apple_protect_pager_setup(
			protected_object,
			VME_OFFSET(&tmp_entry),
			crypto_backing_offset,
			crypt_info,
			crypto_start,
			crypto_end,
			cache_pager);

		/* release extra ref on protected object */
		vm_object_deallocate(protected_object);

		if (unprotected_mem_obj == NULL) {
			kr = KERN_FAILURE;
			goto done;
		}

		/* can overwrite an immutable mapping */
		vm_map_kernel_flags_t vmk_flags = {
			.vmf_fixed = true,
			.vmf_overwrite = true,
			.vmkf_overwrite_immutable = true,
		};
		/* make the new mapping as "permanent" as the one it replaces */
		vmk_flags.vmf_permanent = tmp_entry.vme_permanent;
#if __arm64__
		if (tmp_entry.used_for_jit &&
		    (VM_MAP_PAGE_SHIFT(map) != FOURK_PAGE_SHIFT ||
		    PAGE_SHIFT != FOURK_PAGE_SHIFT) &&
		    fourk_binary_compatibility_unsafe &&
		    fourk_binary_compatibility_allow_wx) {
			printf("** FOURK_COMPAT [%d]: "
			    "allowing write+execute at 0x%llx\n",
			    proc_selfpid(), tmp_entry.vme_start);
			vmk_flags.vmkf_map_jit = TRUE;
		}
#endif /* __arm64__ */

		/* map this memory object in place of the current one */
		map_addr = tmp_entry.vme_start;
		kr = vm_map_enter_mem_object(map,
		    &map_addr,
		    (tmp_entry.vme_end -
		    tmp_entry.vme_start),
		    (mach_vm_offset_t) 0,
		    vmk_flags,
		    (ipc_port_t)(uintptr_t) unprotected_mem_obj,
		    0,
		    TRUE,
		    tmp_entry.protection,
		    tmp_entry.max_protection,
		    tmp_entry.inheritance);
		assertf(kr == KERN_SUCCESS,
		    "kr = 0x%x\n", kr);
		assertf(map_addr == tmp_entry.vme_start,
		    "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
		    (uint64_t)map_addr,
		    (uint64_t) tmp_entry.vme_start,
		    &tmp_entry);

#if VM_MAP_DEBUG_APPLE_PROTECT
		if (vm_map_debug_apple_protect) {
			printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
			    " backing:[object:%p,offset:0x%llx,"
			    "crypto_backing_offset:0x%llx,"
			    "crypto_start:0x%llx,crypto_end:0x%llx]\n",
			    map,
			    (uint64_t) map_addr,
			    (uint64_t) (map_addr + (tmp_entry.vme_end -
			    tmp_entry.vme_start)),
			    unprotected_mem_obj,
			    protected_object,
			    VME_OFFSET(&tmp_entry),
			    crypto_backing_offset,
			    crypto_start,
			    crypto_end);
		}
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */

		/*
		 * Release the reference obtained by
		 * apple_protect_pager_setup().
		 * The mapping (if it succeeded) is now holding a reference on
		 * the memory object.
		 */
		memory_object_deallocate(unprotected_mem_obj);
		unprotected_mem_obj = MEMORY_OBJECT_NULL;

		/* continue with next map entry */
		crypto_backing_offset += (tmp_entry.vme_end -
		    tmp_entry.vme_start);
		crypto_backing_offset -= crypto_start;
	}
	kr = KERN_SUCCESS;

done:
	if (map_locked) {
		vm_map_unlock(map);
	}
	return kr;
}
#endif /* CONFIG_CODE_DECRYPTION */


LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);

#if XNU_TARGET_OS_OSX
#define MALLOC_NO_COW_DEFAULT 1
#define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 1
#else /* XNU_TARGET_OS_OSX */
#define MALLOC_NO_COW_DEFAULT 1
#define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 0
#endif /* XNU_TARGET_OS_OSX */
TUNABLE(int, malloc_no_cow, "malloc_no_cow", MALLOC_NO_COW_DEFAULT);
TUNABLE(int, malloc_no_cow_except_fork, "malloc_no_cow_except_fork", MALLOC_NO_COW_EXCEPT_FORK_DEFAULT);
uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
#if DEBUG
int vm_check_map_sanity = 0;
#endif

/*
 * vm_map_init:
 *
 * Initialize the vm_map module.  Must be called before
 * any other vm_map routines.
 *
 * Map and entry structures are allocated from zones -- we must
 * initialize those zones.
 *
 * There are three zones of interest:
 *
 *	vm_map_zone:        used to allocate maps.
 *	vm_map_entry_zone:  used to allocate map entries.
 *
 * LP32:
 *	vm_map_entry_reserved_zone: fallback zone for kernel map entries
 *
 * The kernel allocates map entries from a special zone that is initially
 * "crammed" with memory.  It would be difficult (perhaps impossible) for
 * the kernel to allocate more memory to an entry zone when it became
 * empty since the very act of allocating memory implies the creation
 * of a new entry.
 */
__startup_func
void
vm_map_init(void)
{

#if MACH_ASSERT
	PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
	    sizeof(debug4k_filter));
#endif /* MACH_ASSERT */

	zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
	    VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);

	/*
	 * Don't quarantine because we always need elements available
	 * Disallow GC on this zone... to aid the GC.
	 */
	zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
	    ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
		z->z_elems_rsv = (uint16_t)(32 *
		(ml_early_cpu_max_number() + 1));
	});

	zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
	    ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
		z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
	});

	zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
	    ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);

	/*
	 * Add the stolen memory to zones, adjust zone size and stolen counts.
	 */
	zone_cram_early(vm_map_zone, map_data, map_data_size);
	zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
	zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
	printf("VM bootstrap: %d maps, %d entries and %d holes available\n",
	    zone_count_free(vm_map_zone),
	    zone_count_free(vm_map_entry_zone),
	    zone_count_free(vm_map_holes_zone));

	/*
	 * Since these are covered by zones, remove them from stolen page accounting.
	 */
	VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));

#if VM_MAP_DEBUG_APPLE_PROTECT
	PE_parse_boot_argn("vm_map_debug_apple_protect",
	    &vm_map_debug_apple_protect,
	    sizeof(vm_map_debug_apple_protect));
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */
#if VM_MAP_DEBUG_FOURK
	PE_parse_boot_argn("vm_map_debug_fourk",
	    &vm_map_debug_fourk,
	    sizeof(vm_map_debug_fourk));
#endif /* VM_MAP_DEBUG_FOURK */

	if (malloc_no_cow) {
		vm_memory_malloc_no_cow_mask = 0ULL;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
#if XNU_TARGET_OS_OSX
		/*
		 * On macOS, keep copy-on-write for MALLOC_LARGE because
		 * realloc() may use vm_copy() to transfer the old contents
		 * to the new location.
		 */
#else /* XNU_TARGET_OS_OSX */
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
#endif /* XNU_TARGET_OS_OSX */
		// vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
		// vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
		// vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
		PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
		    &vm_memory_malloc_no_cow_mask,
		    sizeof(vm_memory_malloc_no_cow_mask));
	}

#if CONFIG_MAP_RANGES
	vm_map_range_map_init();
#endif /* CONFIG_MAP_RANGES */

#if DEBUG
	PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
	if (vm_check_map_sanity) {
		kprintf("VM sanity checking enabled\n");
	} else {
		kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
	}
#endif /* DEBUG */

#if DEVELOPMENT || DEBUG
	PE_parse_boot_argn("panic_on_unsigned_execute",
	    &panic_on_unsigned_execute,
	    sizeof(panic_on_unsigned_execute));
	PE_parse_boot_argn("panic_on_mlock_failure",
	    &panic_on_mlock_failure,
	    sizeof(panic_on_mlock_failure));
#endif /* DEVELOPMENT || DEBUG */
}

__startup_func
static void
vm_map_steal_memory(void)
{
	/*
	 * We need to reserve enough memory to support bootstrapping VM maps
	 * and the zone subsystem.
	 *
	 * The VM Maps that need to function before zones can support them
	 * are the ones registered with vm_map_will_allocate_early_map(),
	 * which are:
	 * - the kernel map
	 * - the various submaps used by zones (pgz, meta, ...)
	 *
	 * We also need enough entries and holes to support them
	 * until zone_metadata_init() is called, which is when
	 * the zone allocator becomes capable of expanding dynamically.
	 *
	 * We need:
	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
	 * - To allow for 3-4 entries per map, but the kernel map
	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
	 *   to describe the submaps, so double it (and make it 8x too)
	 * - To allow for holes between entries,
	 *   hence needs the same budget as entries
	 */
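	/*
	 * Concretely (a sketch of the arithmetic, given the
	 * VM_MAP_EARLY_COUNT_MAX of 16 defined above): a budget of 16
	 * early maps, plus 8 * 16 = 128 early entries and 128 early
	 * holes, each rounded up to whole pages by
	 * zone_get_early_alloc_size().
	 */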
1298 map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
1299 sizeof(struct _vm_map), VM_MAP_ZFLAGS,
1300 VM_MAP_EARLY_COUNT_MAX);
1301
1302 kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
1303 sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1304 8 * VM_MAP_EARLY_COUNT_MAX);
1305
1306 map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
1307 sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1308 8 * VM_MAP_EARLY_COUNT_MAX);
1309
1310 /*
1311 * Steal a contiguous range of memory so that a simple range check
1312 * can validate early addresses being freed/crammed to these
1313 * zones
1314 */
1315 map_data = zone_early_mem_init(map_data_size + kentry_data_size +
1316 map_holes_data_size);
1317 kentry_data = map_data + map_data_size;
1318 map_holes_data = kentry_data + kentry_data_size;
1319 }
1320 STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1321
1322 __startup_func
1323 static void
vm_kernel_boostraped(void)1324 vm_kernel_boostraped(void)
1325 {
1326 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
1327 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
1328 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);
1329
1330 printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1331 zone_count_free(vm_map_zone),
1332 zone_count_free(vm_map_entry_zone),
1333 zone_count_free(vm_map_holes_zone));
1334 }
1335 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1336
1337 void
vm_map_disable_hole_optimization(vm_map_t map)1338 vm_map_disable_hole_optimization(vm_map_t map)
1339 {
1340 vm_map_entry_t head_entry, hole_entry, next_hole_entry;
1341
1342 if (map->holelistenabled) {
1343 head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1344
1345 while (hole_entry != NULL) {
1346 next_hole_entry = hole_entry->vme_next;
1347
1348 hole_entry->vme_next = NULL;
1349 hole_entry->vme_prev = NULL;
1350 zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1351
1352 if (next_hole_entry == head_entry) {
1353 hole_entry = NULL;
1354 } else {
1355 hole_entry = next_hole_entry;
1356 }
1357 }
1358
1359 map->holes_list = NULL;
1360 map->holelistenabled = FALSE;
1361
1362 map->first_free = vm_map_first_entry(map);
1363 SAVE_HINT_HOLE_WRITE(map, NULL);
1364 }
1365 }
1366
1367 boolean_t
vm_kernel_map_is_kernel(vm_map_t map)1368 vm_kernel_map_is_kernel(vm_map_t map)
1369 {
1370 return map->pmap == kernel_pmap;
1371 }
1372
1373 /*
1374 * vm_map_create:
1375 *
1376 * Creates and returns a new empty VM map with
1377 * the given physical map structure, and having
1378 * the given lower and upper address bounds.
1379 */
1380
1381 extern vm_map_t vm_map_create_external(
1382 pmap_t pmap,
1383 vm_map_offset_t min_off,
1384 vm_map_offset_t max_off,
1385 boolean_t pageable);
1386
1387 vm_map_t
vm_map_create_external(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,boolean_t pageable)1388 vm_map_create_external(
1389 pmap_t pmap,
1390 vm_map_offset_t min,
1391 vm_map_offset_t max,
1392 boolean_t pageable)
1393 {
1394 vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1395
1396 if (pageable) {
1397 options |= VM_MAP_CREATE_PAGEABLE;
1398 }
1399 return vm_map_create_options(pmap, min, max, options);
1400 }
1401
1402 __startup_func
1403 void
vm_map_will_allocate_early_map(vm_map_t * owner)1404 vm_map_will_allocate_early_map(vm_map_t *owner)
1405 {
1406 if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1407 panic("VM_MAP_EARLY_COUNT_MAX is too low");
1408 }
1409
1410 early_map_owners[early_map_count++] = owner;
1411 }
1412
1413 __startup_func
1414 void
vm_map_relocate_early_maps(vm_offset_t delta)1415 vm_map_relocate_early_maps(vm_offset_t delta)
1416 {
1417 for (uint32_t i = 0; i < early_map_count; i++) {
1418 vm_address_t addr = (vm_address_t)*early_map_owners[i];
1419
1420 *early_map_owners[i] = (vm_map_t)(addr + delta);
1421 }
1422
1423 early_map_count = ~0u;
1424 }
1425
1426 /*
1427 * Routine: vm_map_relocate_early_elem
1428 *
1429 * Purpose:
1430 * Early zone elements are allocated in a temporary part
1431 * of the address space.
1432 *
1433 * Once the zones live in their final place, the early
1434 * VM maps, map entries and map holes need to be relocated.
1435 *
1436 * It involves rewriting any vm_map_t, vm_map_entry_t or
1437 * pointers to vm_map_links. Other pointers to other types
1438 * are fine.
1439 *
1440 * Fortunately, pointers to those types are self-contained
1441 * in those zones, _except_ for pointers to VM maps,
1442 * which are tracked during early boot and fixed with
1443 * vm_map_relocate_early_maps().
1444 */
1445 __startup_func
1446 void
vm_map_relocate_early_elem(uint32_t zone_id,vm_offset_t new_addr,vm_offset_t delta)1447 vm_map_relocate_early_elem(
1448 uint32_t zone_id,
1449 vm_offset_t new_addr,
1450 vm_offset_t delta)
1451 {
1452 #define relocate(type_t, field) ({ \
1453 typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field; \
1454 if (*__field) { \
1455 *__field = (typeof(*__field))((vm_offset_t)*__field + delta); \
1456 } \
1457 })
1458
1459 switch (zone_id) {
1460 case ZONE_ID_VM_MAP:
1461 case ZONE_ID_VM_MAP_ENTRY:
1462 case ZONE_ID_VM_MAP_HOLES:
1463 break;
1464
1465 default:
1466 panic("Unexpected zone ID %d", zone_id);
1467 }
1468
1469 if (zone_id == ZONE_ID_VM_MAP) {
1470 relocate(vm_map_t, hdr.links.prev);
1471 relocate(vm_map_t, hdr.links.next);
1472 ((vm_map_t)new_addr)->pmap = kernel_pmap;
1473 #ifdef VM_MAP_STORE_USE_RB
1474 relocate(vm_map_t, hdr.rb_head_store.rbh_root);
1475 #endif /* VM_MAP_STORE_USE_RB */
1476 relocate(vm_map_t, hint);
1477 relocate(vm_map_t, hole_hint);
1478 relocate(vm_map_t, first_free);
1479 return;
1480 }
1481
1482 relocate(struct vm_map_links *, prev);
1483 relocate(struct vm_map_links *, next);
1484
1485 if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
1486 #ifdef VM_MAP_STORE_USE_RB
1487 relocate(vm_map_entry_t, store.entry.rbe_left);
1488 relocate(vm_map_entry_t, store.entry.rbe_right);
1489 relocate(vm_map_entry_t, store.entry.rbe_parent);
1490 #endif /* VM_MAP_STORE_USE_RB */
1491 if (((vm_map_entry_t)new_addr)->is_sub_map) {
1492 /* no object to relocate because we haven't made any */
1493 ((vm_map_entry_t)new_addr)->vme_submap +=
1494 delta >> VME_SUBMAP_SHIFT;
1495 }
1496 #if MAP_ENTRY_CREATION_DEBUG
1497 relocate(vm_map_entry_t, vme_creation_maphdr);
1498 #endif /* MAP_ENTRY_CREATION_DEBUG */
1499 }
1500
1501 #undef relocate
1502 }
1503
1504 vm_map_t
vm_map_create_options(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,vm_map_create_options_t options)1505 vm_map_create_options(
1506 pmap_t pmap,
1507 vm_map_offset_t min,
1508 vm_map_offset_t max,
1509 vm_map_create_options_t options)
1510 {
1511 vm_map_t result;
1512
1513 #if DEBUG || DEVELOPMENT
1514 if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
1515 if (early_map_count != ~0u && early_map_count !=
1516 zone_count_allocated(vm_map_zone) + 1) {
1517 panic("allocating %dth early map, owner not known",
1518 zone_count_allocated(vm_map_zone) + 1);
1519 }
1520 if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
1521 panic("allocating %dth early map for non kernel pmap",
1522 early_map_count);
1523 }
1524 }
1525 #endif /* DEBUG || DEVELOPMENT */
1526
1527 result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);
1528
1529 vm_map_store_init(&result->hdr);
1530 result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
1531 vm_map_set_page_shift(result, PAGE_SHIFT);
1532
1533 result->size_limit = RLIM_INFINITY; /* default unlimited */
1534 result->data_limit = RLIM_INFINITY; /* default unlimited */
1535 result->user_wire_limit = MACH_VM_MAX_ADDRESS; /* default limit is unlimited */
1536 os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
1537 result->pmap = pmap;
1538 result->min_offset = min;
1539 result->max_offset = max;
1540 result->first_free = vm_map_to_entry(result);
1541 result->hint = vm_map_to_entry(result);
1542
1543 if (options & VM_MAP_CREATE_NEVER_FAULTS) {
1544 assert(pmap == kernel_pmap);
1545 result->never_faults = true;
1546 }
1547
1548 /* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
1549 if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
1550 result->has_corpse_footprint = true;
1551 } else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
1552 struct vm_map_links *hole_entry;
1553
1554 hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
1555 hole_entry->start = min;
1556 #if defined(__arm64__)
1557 hole_entry->end = result->max_offset;
1558 #else
1559 hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1560 #endif
1561 result->holes_list = result->hole_hint = hole_entry;
1562 hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
1563 result->holelistenabled = true;
1564 }
1565
1566 vm_map_lock_init(result);
1567
1568 return result;
1569 }
1570
1571 /*
1572 * Adjusts a submap that was made by kmem_suballoc()
1573 * before it knew where it would be mapped,
1574 * so that it has the right min/max offsets.
1575 *
1576 * We do not need to hold any locks:
1577 * only the caller knows about this map,
1578 * and it is not published on any entry yet.
1579 */
1580 static void
vm_map_adjust_offsets(vm_map_t map,vm_map_offset_t min_off,vm_map_offset_t max_off)1581 vm_map_adjust_offsets(
1582 vm_map_t map,
1583 vm_map_offset_t min_off,
1584 vm_map_offset_t max_off)
1585 {
1586 assert(map->min_offset == 0);
1587 assert(map->max_offset == max_off - min_off);
1588 assert(map->hdr.nentries == 0);
1589 assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1590
1591 map->min_offset = min_off;
1592 map->max_offset = max_off;
1593
1594 if (map->holelistenabled) {
1595 struct vm_map_links *hole = map->holes_list;
1596
1597 hole->start = min_off;
1598 #if defined(__arm64__)
1599 hole->end = max_off;
1600 #else
1601 hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1602 #endif
1603 }
1604 }
1605
1606
1607 vm_map_size_t
vm_map_adjusted_size(vm_map_t map)1608 vm_map_adjusted_size(vm_map_t map)
1609 {
1610 const struct vm_reserved_region *regions = NULL;
1611 size_t num_regions = 0;
1612 mach_vm_size_t reserved_size = 0, map_size = 0;
1613
1614 if (map == NULL || (map->size == 0)) {
1615 return 0;
1616 }
1617
1618 map_size = map->size;
1619
1620 if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1621 /*
1622 * No special reserved regions or not an exotic map or the task
1623 * is terminating and these special regions might have already
1624 * been deallocated.
1625 */
1626 return map_size;
1627 }
1628
1629 num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), ®ions);
1630 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1631
1632 while (num_regions) {
1633 reserved_size += regions[--num_regions].vmrr_size;
1634 }
1635
1636 /*
1637 * There are a few places where the map is being switched out due to
1638 * 'termination' without that bit being set (e.g. exec and corpse purging).
1639 * In those cases, we could have the map's regions being deallocated on
1640 * a core while some accounting process is trying to get the map's size.
1641 * So this assert can't be enabled till all those places are uniform in
1642 * their use of the 'map->terminated' bit.
1643 *
1644 * assert(map_size >= reserved_size);
1645 */
1646
1647 return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1648 }
1649
1650 /*
1651 * vm_map_entry_create: [ internal use only ]
1652 *
1653 * Allocates a VM map entry for insertion in the
1654 * given map (or map copy). No fields are filled.
1655 *
1656 * The VM entry will be zero initialized, except for:
1657 * - behavior set to VM_BEHAVIOR_DEFAULT
1658 * - inheritance set to VM_INHERIT_DEFAULT
1659 */
1660 #define vm_map_entry_create(map) _vm_map_entry_create(&(map)->hdr)
1661
1662 #define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)
1663
1664 static vm_map_entry_t
_vm_map_entry_create(struct vm_map_header * map_header __unused)1665 _vm_map_entry_create(
1666 struct vm_map_header *map_header __unused)
1667 {
1668 vm_map_entry_t entry = NULL;
1669
1670 entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);
1671
1672 /*
1673 * Help the compiler with what we know to be true,
1674 * so that the further bitfields inits have good codegen.
1675 *
1676 * See rdar://87041299
1677 */
1678 __builtin_assume(entry->vme_object_value == 0);
1679 __builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
1680 __builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);
1681
1682 static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
1683 "VME_ALIAS_MASK covers tags");
1684
1685 static_assert(VM_BEHAVIOR_DEFAULT == 0,
1686 "can skip zeroing of the behavior field");
1687 entry->inheritance = VM_INHERIT_DEFAULT;
1688
1689 #if MAP_ENTRY_CREATION_DEBUG
1690 entry->vme_creation_maphdr = map_header;
1691 entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
1692 BTREF_GET_NOWAIT);
1693 #endif
1694 return entry;
1695 }
1696
1697 /*
1698 * vm_map_entry_dispose: [ internal use only ]
1699 *
1700 * Inverse of vm_map_entry_create.
1701 *
1702 * write map lock held so no need to
1703 * do anything special to insure correctness
1704 * of the stores
1705 */
1706 static void
vm_map_entry_dispose(vm_map_entry_t entry)1707 vm_map_entry_dispose(
1708 vm_map_entry_t entry)
1709 {
1710 #if VM_BTLOG_TAGS
1711 if (entry->vme_kernel_object) {
1712 btref_put(entry->vme_tag_btref);
1713 }
1714 #endif /* VM_BTLOG_TAGS */
1715 #if MAP_ENTRY_CREATION_DEBUG
1716 btref_put(entry->vme_creation_bt);
1717 #endif
1718 #if MAP_ENTRY_INSERTION_DEBUG
1719 btref_put(entry->vme_insertion_bt);
1720 #endif
1721 zfree(vm_map_entry_zone, entry);
1722 }
1723
1724 #define vm_map_copy_entry_dispose(copy_entry) \
1725 vm_map_entry_dispose(copy_entry)
1726
1727 static vm_map_entry_t
vm_map_zap_first_entry(vm_map_zap_t list)1728 vm_map_zap_first_entry(
1729 vm_map_zap_t list)
1730 {
1731 return list->vmz_head;
1732 }
1733
1734 static vm_map_entry_t
vm_map_zap_last_entry(vm_map_zap_t list)1735 vm_map_zap_last_entry(
1736 vm_map_zap_t list)
1737 {
1738 assert(vm_map_zap_first_entry(list));
1739 return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
1740 }
1741
1742 static void
vm_map_zap_append(vm_map_zap_t list,vm_map_entry_t entry)1743 vm_map_zap_append(
1744 vm_map_zap_t list,
1745 vm_map_entry_t entry)
1746 {
1747 entry->vme_next = VM_MAP_ENTRY_NULL;
1748 *list->vmz_tail = entry;
1749 list->vmz_tail = &entry->vme_next;
1750 }
1751
1752 static vm_map_entry_t
1753 vm_map_zap_pop(
1754 vm_map_zap_t list)
1755 {
1756 vm_map_entry_t head = list->vmz_head;
1757
1758 if (head != VM_MAP_ENTRY_NULL &&
1759 (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1760 list->vmz_tail = &list->vmz_head;
1761 }
1762
1763 return head;
1764 }
1765
1766 static void
1767 vm_map_zap_dispose(
1768 vm_map_zap_t list)
1769 {
1770 vm_map_entry_t entry;
1771
1772 while ((entry = vm_map_zap_pop(list))) {
1773 if (entry->is_sub_map) {
1774 vm_map_deallocate(VME_SUBMAP(entry));
1775 } else {
1776 vm_object_deallocate(VME_OBJECT(entry));
1777 }
1778
1779 vm_map_entry_dispose(entry);
1780 }
1781 }
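/*
 * Sketch of the zap-list lifecycle (mirrors vm_map_destroy() below):
 * entries unlinked while the map lock is held are deferred onto the
 * list, and their object/submap references are dropped only after the
 * lock has been released, since vm_object_deallocate() may block:
 *
 *	VM_MAP_ZAP_DECLARE(zap);
 *	vm_map_lock(map);
 *	(void)vm_map_delete(map, start, end, flags, KMEM_GUARD_NONE, &zap);
 *	vm_map_unlock(map);
 *	vm_map_zap_dispose(&zap);
 */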
1782
1783 #if MACH_ASSERT
1784 static boolean_t first_free_check = FALSE;
1785 boolean_t
1786 first_free_is_valid(
1787 vm_map_t map)
1788 {
1789 if (!first_free_check) {
1790 return TRUE;
1791 }
1792
1793 return first_free_is_valid_store( map );
1794 }
1795 #endif /* MACH_ASSERT */
1796
1797
1798 #define vm_map_copy_entry_link(copy, after_where, entry) \
1799 _vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))
1800
1801 #define vm_map_copy_entry_unlink(copy, entry) \
1802 _vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1803
1804 /*
1805 * vm_map_destroy:
1806 *
1807 * Actually destroy a map.
1808 */
1809 void
1810 vm_map_destroy(
1811 vm_map_t map)
1812 {
1813 /* final cleanup: this is not allowed to fail */
1814 vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;
1815
1816 VM_MAP_ZAP_DECLARE(zap);
1817
1818 vm_map_lock(map);
1819
1820 map->terminated = true;
1821 /* clean up regular map entries */
1822 (void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
1823 KMEM_GUARD_NONE, &zap);
1824 /* clean up leftover special mappings (commpage, GPU carveout, etc...) */
1825 (void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
1826 KMEM_GUARD_NONE, &zap);
1827
1828 vm_map_disable_hole_optimization(map);
1829 vm_map_corpse_footprint_destroy(map);
1830
1831 vm_map_unlock(map);
1832
1833 vm_map_zap_dispose(&zap);
1834
1835 assert(map->hdr.nentries == 0);
1836
1837 if (map->pmap) {
1838 pmap_destroy(map->pmap);
1839 }
1840
1841 lck_rw_destroy(&map->lock, &vm_map_lck_grp);
1842
1843 #if CONFIG_MAP_RANGES
1844 kfree_data(map->extra_ranges,
1845 map->extra_ranges_count * sizeof(struct vm_map_user_range));
1846 #endif
1847
1848 zfree_id(ZONE_ID_VM_MAP, map);
1849 }
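/*
 * Note on the two vm_map_delete() calls above: the first covers the
 * map's nominal [min_offset, max_offset) range; the second sweeps
 * almost the entire VA space to catch special mappings deliberately
 * inserted beyond max_offset (see the "vmkf_beyond_max" handling in
 * vm_map_enter() below).
 */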
1850
1851 /*
1852 * Returns pid of the task with the largest number of VM map entries.
1853 * Used in the zone-map-exhaustion jetsam path.
1854 */
1855 pid_t
1856 find_largest_process_vm_map_entries(void)
1857 {
1858 pid_t victim_pid = -1;
1859 int max_vm_map_entries = 0;
1860 task_t task = TASK_NULL;
1861 queue_head_t *task_list = &tasks;
1862
1863 lck_mtx_lock(&tasks_threads_lock);
1864 queue_iterate(task_list, task, task_t, tasks) {
1865 if (task == kernel_task || !task->active) {
1866 continue;
1867 }
1868
1869 vm_map_t task_map = task->map;
1870 if (task_map != VM_MAP_NULL) {
1871 int task_vm_map_entries = task_map->hdr.nentries;
1872 if (task_vm_map_entries > max_vm_map_entries) {
1873 max_vm_map_entries = task_vm_map_entries;
1874 victim_pid = pid_from_task(task);
1875 }
1876 }
1877 }
1878 lck_mtx_unlock(&tasks_threads_lock);
1879
1880 printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
1881 return victim_pid;
1882 }
1883
1884
1885 /*
1886 * vm_map_lookup_entry: [ internal use only ]
1887 *
1888 * Calls into the vm map store layer to find the map
1889 * entry containing (or immediately preceding) the
1890 * specified address in the given map; the entry is returned
1891 * in the "entry" parameter. The boolean
1892 * result indicates whether the address is
1893 * actually contained in the map.
1894 */
1895 boolean_t
1896 vm_map_lookup_entry(
1897 vm_map_t map,
1898 vm_map_offset_t address,
1899 vm_map_entry_t *entry) /* OUT */
1900 {
1901 if (VM_KERNEL_ADDRESS(address)) {
1902 address = VM_KERNEL_STRIP_UPTR(address);
1903 }
1904
1905
1906 #if CONFIG_PROB_GZALLOC
1907 if (map->pmap == kernel_pmap) {
1908 assertf(!pgz_owned(address),
1909 "it is the responsibility of callers to unguard PGZ addresses");
1910 }
1911 #endif /* CONFIG_PROB_GZALLOC */
1912 return vm_map_store_lookup_entry( map, address, entry );
1913 }
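/*
 * Illustrative caller pattern (a sketch, not new API): on TRUE,
 * "*entry" maps the address; on FALSE, "*entry" is the entry that
 * precedes the address (possibly vm_map_to_entry(map)), which is
 * where a new mapping would be linked:
 *
 *	vm_map_entry_t where;
 *	if (vm_map_lookup_entry(map, addr, &where)) {
 *		// addr is mapped by "where"
 *	} else {
 *		// addr is unmapped; "where" is its predecessor
 *	}
 */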
1914
1915 boolean_t
1916 vm_map_lookup_entry_or_next(
1917 vm_map_t map,
1918 vm_map_offset_t address,
1919 vm_map_entry_t *entry) /* OUT */
1920 {
1921 if (vm_map_lookup_entry(map, address, entry)) {
1922 return true;
1923 }
1924
1925 *entry = (*entry)->vme_next;
1926 return false;
1927 }
1928
1929 #if CONFIG_PROB_GZALLOC
1930 boolean_t
1931 vm_map_lookup_entry_allow_pgz(
1932 vm_map_t map,
1933 vm_map_offset_t address,
1934 vm_map_entry_t *entry) /* OUT */
1935 {
1936 if (VM_KERNEL_ADDRESS(address)) {
1937 address = VM_KERNEL_STRIP_UPTR(address);
1938 }
1939 return vm_map_store_lookup_entry( map, address, entry );
1940 }
1941 #endif /* CONFIG_PROB_GZALLOC */
1942
1943 /*
1944 * Routine: vm_map_range_invalid_panic
1945 * Purpose:
1946 * Panic on detection of an invalid range id.
1947 */
1948 __abortlike
1949 static void
1950 vm_map_range_invalid_panic(
1951 vm_map_t map,
1952 vm_map_range_id_t range_id)
1953 {
1954 panic("invalid range ID (%u) for map %p", range_id, map);
1955 }
1956
1957 /*
1958 * Routine: vm_map_get_range
1959 * Purpose:
1960 * Adjust bounds based on security policy.
1961 */
1962 static struct mach_vm_range
1963 vm_map_get_range(
1964 vm_map_t map,
1965 vm_map_address_t *address,
1966 vm_map_kernel_flags_t *vmk_flags,
1967 vm_map_size_t size,
1968 bool *is_ptr)
1969 {
1970 struct mach_vm_range effective_range = {};
1971 vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;
1972
1973 if (map == kernel_map) {
1974 effective_range = kmem_ranges[range_id];
1975
1976 if (startup_phase >= STARTUP_SUB_KMEM) {
1977 /*
1978 * The hint provided by the caller is zeroed because the range is
1979 * restricted to a subset of the entire kernel_map VA, which could put
1980 * the hint outside the range and cause vm_map_store_find_space to fail.
1981 */
1982 *address = 0ull;
1983 /*
1984 * Ensure that range_id passed in by the caller is within meaningful
1985 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
1986 * to fail as the corresponding range is invalid. Range id larger than
1987 * KMEM_RANGE_ID_MAX will lead to an OOB access.
1988 */
1989 if ((range_id == KMEM_RANGE_ID_NONE) ||
1990 (range_id > KMEM_RANGE_ID_MAX)) {
1991 vm_map_range_invalid_panic(map, range_id);
1992 }
1993
1994 /*
1995 * Pointer ranges use kmem_locate_space to do allocations.
1996 *
1997 * Non pointer fronts look like [ Small | Large | Permanent ]
1998 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
1999 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
2000 * use the entire range.
2001 */
2002 if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
2003 *is_ptr = true;
2004 } else if (size >= KMEM_SMALLMAP_THRESHOLD) {
2005 effective_range = kmem_large_ranges[range_id];
2006 }
2007 }
2008 #if CONFIG_MAP_RANGES
2009 } else if (map->uses_user_ranges) {
2010 switch (range_id) {
2011 case UMEM_RANGE_ID_DEFAULT:
2012 effective_range = map->default_range;
2013 break;
2014 case UMEM_RANGE_ID_HEAP:
2015 effective_range = map->data_range;
2016 break;
2017 case UMEM_RANGE_ID_FIXED:
2018 /*
2019 * anywhere allocations with an address in "FIXED"
2020 * make no sense, so leave the range empty
2021 */
2022 break;
2023
2024 default:
2025 vm_map_range_invalid_panic(map, range_id);
2026 }
2027 #endif /* CONFIG_MAP_RANGES */
2028 } else {
2029 /*
2030 * If the minimum is 0, bump it up by PAGE_SIZE. We want to
2031 * limit allocations of PAGEZERO to explicit requests: its
2032 * normal use is to catch NULL dereferences, and many
2033 * applications also treat pointers with a value of 0 as
2034 * special, so suddenly having address 0 contain usable
2035 * memory would tend to confuse those applications.
2036 */
2037 effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
2038 effective_range.max_address = map->max_offset;
2039 }
2040
2041 return effective_range;
2042 }
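/*
 * Example (illustrative values): for a user map with 16K pages and a
 * min_offset of 0, the fallback branch above yields the range
 * [0x4000, map->max_offset), keeping the __PAGEZERO page out of
 * "anywhere" allocations unless a caller maps it at a fixed address.
 */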
2043
2044 /*
2045 * Routine: vm_map_locate_space
2046 * Purpose:
2047 * Finds a range in the specified virtual address map,
2048 * returning the start of that range,
2049 * as well as the entry right before it.
2050 */
2051 kern_return_t
2052 vm_map_locate_space(
2053 vm_map_t map,
2054 vm_map_size_t size,
2055 vm_map_offset_t mask,
2056 vm_map_kernel_flags_t vmk_flags,
2057 vm_map_offset_t *start_inout,
2058 vm_map_entry_t *entry_out)
2059 {
2060 struct mach_vm_range effective_range = {};
2061 vm_map_size_t guard_offset;
2062 vm_map_offset_t hint, limit;
2063 vm_map_entry_t entry;
2064 bool is_kmem_ptr_range = false;
2065
2066 /*
2067 * Only supported by vm_map_enter() with a fixed address.
2068 */
2069 assert(!vmk_flags.vmkf_beyond_max);
2070
2071 if (__improbable(map->wait_for_space)) {
2072 /*
2073 * support for "wait_for_space" is minimal,
2074 * its only consumer is the ipc_kernel_copy_map.
2075 */
2076 assert(!map->holelistenabled &&
2077 !vmk_flags.vmkf_last_free &&
2078 !vmk_flags.vmkf_keep_map_locked &&
2079 !vmk_flags.vmkf_map_jit &&
2080 !vmk_flags.vmf_random_addr &&
2081 *start_inout <= map->min_offset);
2082 } else if (vmk_flags.vmkf_last_free) {
2083 assert(!vmk_flags.vmkf_map_jit &&
2084 !vmk_flags.vmf_random_addr);
2085 }
2086
2087 if (vmk_flags.vmkf_guard_before) {
2088 guard_offset = VM_MAP_PAGE_SIZE(map);
2089 assert(size > guard_offset);
2090 size -= guard_offset;
2091 } else {
2092 assert(size != 0);
2093 guard_offset = 0;
2094 }
2095
2096 /*
2097 * Validate range_id from flags and get associated range
2098 */
2099 effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
2100 &is_kmem_ptr_range);
2101
2102 if (is_kmem_ptr_range) {
2103 return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
2104 vmk_flags.vmkf_last_free, start_inout, entry_out);
2105 }
2106
2107 #if XNU_TARGET_OS_OSX
2108 if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2109 assert(map != kernel_map);
2110 effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2111 }
2112 #endif /* XNU_TARGET_OS_OSX */
2113
2114 again:
2115 if (vmk_flags.vmkf_last_free) {
2116 hint = *start_inout;
2117
2118 if (hint == 0 || hint > effective_range.max_address) {
2119 hint = effective_range.max_address;
2120 }
2121 if (hint <= effective_range.min_address) {
2122 return KERN_NO_SPACE;
2123 }
2124 limit = effective_range.min_address;
2125 } else {
2126 hint = *start_inout;
2127
2128 if (vmk_flags.vmkf_map_jit) {
2129 if (map->jit_entry_exists &&
2130 !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
2131 return KERN_INVALID_ARGUMENT;
2132 }
2133 if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
2134 vmk_flags.vmf_random_addr = true;
2135 }
2136 }
2137
2138 if (vmk_flags.vmf_random_addr) {
2139 kern_return_t kr;
2140
2141 kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
2142 if (kr != KERN_SUCCESS) {
2143 return kr;
2144 }
2145 }
2146 #if __x86_64__
2147 else if ((hint == 0 || hint == vm_map_min(map)) &&
2148 !map->disable_vmentry_reuse &&
2149 map->vmmap_high_start != 0) {
2150 hint = map->vmmap_high_start;
2151 }
2152 #endif /* __x86_64__ */
2153
2154 if (hint < effective_range.min_address) {
2155 hint = effective_range.min_address;
2156 }
2157 if (effective_range.max_address <= hint) {
2158 return KERN_NO_SPACE;
2159 }
2160
2161 limit = effective_range.max_address;
2162 }
2163 entry = vm_map_store_find_space(map,
2164 hint, limit, vmk_flags.vmkf_last_free,
2165 guard_offset, size, mask,
2166 start_inout);
2167
2168 if (__improbable(entry == NULL)) {
2169 if (map->wait_for_space &&
2170 guard_offset + size <=
2171 effective_range.max_address - effective_range.min_address) {
2172 assert_wait((event_t)map, THREAD_ABORTSAFE);
2173 vm_map_unlock(map);
2174 thread_block(THREAD_CONTINUE_NULL);
2175 vm_map_lock(map);
2176 goto again;
2177 }
2178 return KERN_NO_SPACE;
2179 }
2180
2181 if (entry_out) {
2182 *entry_out = entry;
2183 }
2184 return KERN_SUCCESS;
2185 }
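/*
 * Contract sketch: the caller holds the map lock across this call
 * (the wait_for_space path drops and retakes it internally); on
 * KERN_SUCCESS, *start_inout is the chosen address and *entry_out,
 * when requested, is the entry to link the new mapping after, as
 * vm_map_find_space() below demonstrates.
 */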
2186
2187
2188 /*
2189 * Routine: vm_map_find_space
2190 * Purpose:
2191 * Allocate a range in the specified virtual address map,
2192 * returning the entry allocated for that range.
2193 * Used by kmem_alloc, etc.
2194 *
2195 * The map must NOT be locked. It will be returned locked
2196 * on KERN_SUCCESS, unlocked on failure.
2197 *
2198 * If an entry is allocated, the object/offset fields
2199 * are initialized to zero.
2200 */
2201 kern_return_t
2202 vm_map_find_space(
2203 vm_map_t map,
2204 vm_map_offset_t hint_address,
2205 vm_map_size_t size,
2206 vm_map_offset_t mask,
2207 vm_map_kernel_flags_t vmk_flags,
2208 vm_map_entry_t *o_entry) /* OUT */
2209 {
2210 vm_map_entry_t new_entry, entry;
2211 kern_return_t kr;
2212
2213 if (size == 0) {
2214 return KERN_INVALID_ARGUMENT;
2215 }
2216
2217 new_entry = vm_map_entry_create(map);
2218 new_entry->use_pmap = true;
2219 new_entry->protection = VM_PROT_DEFAULT;
2220 new_entry->max_protection = VM_PROT_ALL;
2221
2222 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
2223 new_entry->map_aligned = true;
2224 }
2225 if (vmk_flags.vmf_permanent) {
2226 new_entry->vme_permanent = true;
2227 }
2228
2229 vm_map_lock(map);
2230
2231 kr = vm_map_locate_space(map, size, mask, vmk_flags,
2232 &hint_address, &entry);
2233 if (kr != KERN_SUCCESS) {
2234 vm_map_unlock(map);
2235 vm_map_entry_dispose(new_entry);
2236 return kr;
2237 }
2238 new_entry->vme_start = hint_address;
2239 new_entry->vme_end = hint_address + size;
2240
2241 /*
2242 * At this point,
2243 *
2244 * - new_entry's "vme_start" and "vme_end" should define
2245 * the endpoints of the available new range,
2246 *
2247 * - and "entry" should refer to the region before
2248 * the new range,
2249 *
2250 * - and the map should still be locked.
2251 */
2252
2253 assert(page_aligned(new_entry->vme_start));
2254 assert(page_aligned(new_entry->vme_end));
2255 assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
2256 assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));
2257
2258 /*
2259 * Insert the new entry into the list
2260 */
2261
2262 vm_map_store_entry_link(map, entry, new_entry,
2263 VM_MAP_KERNEL_FLAGS_NONE);
2264 map->size += size;
2265
2266 /*
2267 * Update the lookup hint
2268 */
2269 SAVE_HINT_MAP_WRITE(map, new_entry);
2270
2271 *o_entry = new_entry;
2272 return KERN_SUCCESS;
2273 }
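/*
 * Sketch of a kmem-style caller, following the locking contract in
 * the header comment (unlocked on entry, locked on KERN_SUCCESS);
 * "size" and "vmk_flags" are placeholders:
 *
 *	vm_map_entry_t entry;
 *	if (vm_map_find_space(kernel_map, 0, size, 0,
 *	    vmk_flags, &entry) == KERN_SUCCESS) {
 *		// set up the entry's object/offset, then:
 *		vm_map_unlock(kernel_map);
 *	}
 */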
2274
2275 int vm_map_pmap_enter_print = FALSE;
2276 int vm_map_pmap_enter_enable = FALSE;
2277
2278 /*
2279 * Routine: vm_map_pmap_enter [internal only]
2280 *
2281 * Description:
2282 * Force pages from the specified object to be entered into
2283 * the pmap at the specified address if they are present.
2284 * The scan ends as soon as a page is not found in the object.
2285 *
2286 * Returns:
2287 * Nothing.
2288 *
2289 * In/out conditions:
2290 * The source map should not be locked on entry.
2291 */
2292 __unused static void
2293 vm_map_pmap_enter(
2294 vm_map_t map,
2295 vm_map_offset_t addr,
2296 vm_map_offset_t end_addr,
2297 vm_object_t object,
2298 vm_object_offset_t offset,
2299 vm_prot_t protection)
2300 {
2301 int type_of_fault;
2302 kern_return_t kr;
2303 uint8_t object_lock_type = 0;
2304 struct vm_object_fault_info fault_info = {};
2305
2306 if (map->pmap == 0) {
2307 return;
2308 }
2309
2310 assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);
2311
2312 while (addr < end_addr) {
2313 vm_page_t m;
2314
2315
2316 /*
2317 * TODO:
2318 * From vm_map_enter(), we come into this function without the map
2319 * lock held or the object lock held.
2320 * We haven't taken a reference on the object either.
2321 * We should do a proper lookup on the map to make sure
2322 * that things are sane before we go locking objects that
2323 * could have been deallocated from under us.
2324 */
2325
2326 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2327 vm_object_lock(object);
2328
2329 m = vm_page_lookup(object, offset);
2330
2331 if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
2332 (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
2333 vm_object_unlock(object);
2334 return;
2335 }
2336
2337 if (vm_map_pmap_enter_print) {
2338 printf("vm_map_pmap_enter:");
2339 printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
2340 map, (unsigned long long)addr, object, (unsigned long long)offset);
2341 }
2342 type_of_fault = DBG_CACHE_HIT_FAULT;
2343 kr = vm_fault_enter(m, map->pmap,
2344 addr,
2345 PAGE_SIZE, 0,
2346 protection, protection,
2347 VM_PAGE_WIRED(m),
2348 FALSE, /* change_wiring */
2349 VM_KERN_MEMORY_NONE, /* tag - not wiring */
2350 &fault_info,
2351 NULL, /* need_retry */
2352 &type_of_fault,
2353 &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/
2354
2355 vm_object_unlock(object);
2356
2357 offset += PAGE_SIZE_64;
2358 addr += PAGE_SIZE;
2359 }
2360 }
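/*
 * In effect this is a best-effort prefault loop: it stops at the
 * first page that is absent, busy, fictitious or in error, and it
 * never initiates pager I/O, so callers may only rely on it as an
 * optimization, never for correctness.
 */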
2361
2362 #define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
2363 static kern_return_t
2364 vm_map_random_address_for_size(
2365 vm_map_t map,
2366 vm_map_offset_t *address,
2367 vm_map_size_t size,
2368 vm_map_kernel_flags_t vmk_flags)
2369 {
2370 kern_return_t kr = KERN_SUCCESS;
2371 int tries = 0;
2372 vm_map_offset_t random_addr = 0;
2373 vm_map_offset_t hole_end;
2374
2375 vm_map_entry_t next_entry = VM_MAP_ENTRY_NULL;
2376 vm_map_entry_t prev_entry = VM_MAP_ENTRY_NULL;
2377 vm_map_size_t vm_hole_size = 0;
2378 vm_map_size_t addr_space_size;
2379 bool is_kmem_ptr;
2380 struct mach_vm_range effective_range;
2381
2382 effective_range = vm_map_get_range(map, address, &vmk_flags, size,
2383 &is_kmem_ptr);
2384
2385 addr_space_size = effective_range.max_address - effective_range.min_address;
2386 if (size >= addr_space_size) {
2387 return KERN_NO_SPACE;
2388 }
2389 addr_space_size -= size;
2390
2391 assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));
2392
2393 while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2394 if (startup_phase < STARTUP_SUB_ZALLOC) {
2395 random_addr = (vm_map_offset_t)early_random();
2396 } else {
2397 random_addr = (vm_map_offset_t)random();
2398 }
2399 random_addr <<= VM_MAP_PAGE_SHIFT(map);
2400 random_addr = vm_map_trunc_page(
2401 effective_range.min_address + (random_addr % addr_space_size),
2402 VM_MAP_PAGE_MASK(map));
2403
2404 #if CONFIG_PROB_GZALLOC
2405 if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
2406 continue;
2407 }
2408 #endif /* CONFIG_PROB_GZALLOC */
2409
2410 if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
2411 if (prev_entry == vm_map_to_entry(map)) {
2412 next_entry = vm_map_first_entry(map);
2413 } else {
2414 next_entry = prev_entry->vme_next;
2415 }
2416 if (next_entry == vm_map_to_entry(map)) {
2417 hole_end = vm_map_max(map);
2418 } else {
2419 hole_end = next_entry->vme_start;
2420 }
2421 vm_hole_size = hole_end - random_addr;
2422 if (vm_hole_size >= size) {
2423 *address = random_addr;
2424 break;
2425 }
2426 }
2427 tries++;
2428 }
2429
2430 if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2431 kr = KERN_NO_SPACE;
2432 }
2433 return kr;
2434 }
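/*
 * Worked example (illustrative numbers): with a usable window of 64GB
 * and a 1MB request, addr_space_size is reduced to 64GB - 1MB before
 * the modulo, so [random_addr, random_addr + size) always fits inside
 * the window; the loop only retries when the candidate address lands
 * on (or too close to) an existing mapping, or on a PGZ-owned page.
 */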
2435
2436 static boolean_t
2437 vm_memory_malloc_no_cow(
2438 int alias)
2439 {
2440 uint64_t alias_mask;
2441
2442 if (!malloc_no_cow) {
2443 return FALSE;
2444 }
2445 if (alias > 63) {
2446 return FALSE;
2447 }
2448 alias_mask = 1ULL << alias;
2449 if (alias_mask & vm_memory_malloc_no_cow_mask) {
2450 return TRUE;
2451 }
2452 return FALSE;
2453 }
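/*
 * Example: for an alias value below 64, the test above reduces to a
 * single bit probe of the global mask, roughly:
 *
 *	(vm_memory_malloc_no_cow_mask >> alias) & 1
 *
 * Aliases >= 64 cannot be represented in the 64-bit mask, hence the
 * unconditional FALSE.
 */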
2454
2455 uint64_t vm_map_enter_RLIMIT_AS_count = 0;
2456 uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2457 /*
2458 * Routine: vm_map_enter
2459 *
2460 * Description:
2461 * Allocate a range in the specified virtual address map.
2462 * The resulting range will refer to memory defined by
2463 * the given memory object and offset into that object.
2464 *
2465 * Arguments are as defined in the vm_map call.
2466 */
2467 static unsigned int vm_map_enter_restore_successes = 0;
2468 static unsigned int vm_map_enter_restore_failures = 0;
2469 kern_return_t
2470 vm_map_enter(
2471 vm_map_t map,
2472 vm_map_offset_t *address, /* IN/OUT */
2473 vm_map_size_t size,
2474 vm_map_offset_t mask,
2475 vm_map_kernel_flags_t vmk_flags,
2476 vm_object_t object,
2477 vm_object_offset_t offset,
2478 boolean_t needs_copy,
2479 vm_prot_t cur_protection,
2480 vm_prot_t max_protection,
2481 vm_inherit_t inheritance)
2482 {
2483 vm_map_entry_t entry, new_entry;
2484 vm_map_offset_t start, tmp_start, tmp_offset;
2485 vm_map_offset_t end, tmp_end;
2486 vm_map_offset_t tmp2_start, tmp2_end;
2487 vm_map_offset_t step;
2488 kern_return_t result = KERN_SUCCESS;
2489 bool map_locked = FALSE;
2490 bool pmap_empty = TRUE;
2491 bool new_mapping_established = FALSE;
2492 const bool keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2493 const bool anywhere = !vmk_flags.vmf_fixed;
2494 const bool purgable = vmk_flags.vmf_purgeable;
2495 const bool overwrite = vmk_flags.vmf_overwrite;
2496 const bool no_cache = vmk_flags.vmf_no_cache;
2497 const bool is_submap = vmk_flags.vmkf_submap;
2498 const bool permanent = vmk_flags.vmf_permanent;
2499 const bool no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2500 const bool entry_for_jit = vmk_flags.vmkf_map_jit;
2501 const bool iokit_acct = vmk_flags.vmkf_iokit_acct;
2502 const bool resilient_codesign = vmk_flags.vmf_resilient_codesign;
2503 const bool resilient_media = vmk_flags.vmf_resilient_media;
2504 const bool entry_for_tpro = vmk_flags.vmf_tpro;
2505 const unsigned int superpage_size = vmk_flags.vmf_superpage_size;
2506 const vm_tag_t alias = vmk_flags.vm_tag;
2507 vm_tag_t user_alias;
2508 kern_return_t kr;
2509 bool clear_map_aligned = FALSE;
2510 vm_map_size_t chunk_size = 0;
2511 vm_object_t caller_object;
2512 VM_MAP_ZAP_DECLARE(zap_old_list);
2513 VM_MAP_ZAP_DECLARE(zap_new_list);
2514
2515 caller_object = object;
2516
2517 assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2518
2519 if (vmk_flags.vmf_4gb_chunk) {
2520 #if defined(__LP64__)
2521 chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2522 #else /* __LP64__ */
2523 chunk_size = ANON_CHUNK_SIZE;
2524 #endif /* __LP64__ */
2525 } else {
2526 chunk_size = ANON_CHUNK_SIZE;
2527 }
2528
2529
2530
2531 if (superpage_size) {
2532 switch (superpage_size) {
2533 /*
2534 * Note that the current implementation only supports
2535 * a single size for superpages, SUPERPAGE_SIZE, per
2536 * architecture. Once additional sizes are to be
2537 * supported, SUPERPAGE_SIZE will have to be replaced
2538 * with a lookup of the size depending on superpage_size.
2539 */
2540 #ifdef __x86_64__
2541 case SUPERPAGE_SIZE_ANY:
2542 /* handle it like 2 MB and round up to page size */
2543 size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2544 OS_FALLTHROUGH;
2545 case SUPERPAGE_SIZE_2MB:
2546 break;
2547 #endif
2548 default:
2549 return KERN_INVALID_ARGUMENT;
2550 }
2551 mask = SUPERPAGE_SIZE - 1;
2552 if (size & (SUPERPAGE_SIZE - 1)) {
2553 return KERN_INVALID_ARGUMENT;
2554 }
2555 inheritance = VM_INHERIT_NONE; /* fork() children won't inherit superpages */
2556 }
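/*
 * Example (x86_64, where SUPERPAGE_SIZE is 2MB): a 3MB request with
 * SUPERPAGE_SIZE_ANY is first rounded up to 4MB and then passes the
 * alignment check above, while the same 3MB request with
 * SUPERPAGE_SIZE_2MB fails with KERN_INVALID_ARGUMENT.
 */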
2557
2558
2559 if ((cur_protection & VM_PROT_WRITE) &&
2560 (cur_protection & VM_PROT_EXECUTE) &&
2561 #if XNU_TARGET_OS_OSX
2562 map->pmap != kernel_pmap &&
2563 (cs_process_global_enforcement() ||
2564 (vmk_flags.vmkf_cs_enforcement_override
2565 ? vmk_flags.vmkf_cs_enforcement
2566 : (vm_map_cs_enforcement(map)
2567 #if __arm64__
2568 || !VM_MAP_IS_EXOTIC(map)
2569 #endif /* __arm64__ */
2570 ))) &&
2571 #endif /* XNU_TARGET_OS_OSX */
2572 #if CODE_SIGNING_MONITOR
2573 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2574 #endif
2575 (VM_MAP_POLICY_WX_FAIL(map) ||
2576 VM_MAP_POLICY_WX_STRIP_X(map)) &&
2577 !entry_for_jit) {
2578 boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2579
2580 DTRACE_VM3(cs_wx,
2581 uint64_t, 0,
2582 uint64_t, 0,
2583 vm_prot_t, cur_protection);
2584 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2585 proc_selfpid(),
2586 (get_bsdtask_info(current_task())
2587 ? proc_name_address(get_bsdtask_info(current_task()))
2588 : "?"),
2589 __FUNCTION__,
2590 (vm_protect_wx_fail ? "failing" : "turning off execute"));
2591 cur_protection &= ~VM_PROT_EXECUTE;
2592 if (vm_protect_wx_fail) {
2593 return KERN_PROTECTION_FAILURE;
2594 }
2595 }
2596
2597 if (entry_for_jit
2598 && cur_protection != VM_PROT_ALL) {
2599 /*
2600 * Native macOS processes and all non-macOS processes are
2601 * expected to create JIT regions via mmap(MAP_JIT, RWX) but
2602 * the RWX requirement was not enforced, and thus, we must live
2603 * with our sins. We are now dealing with a JIT mapping without
2604 * RWX.
2605 *
2606 * We deal with these by letting the MAP_JIT stick in order
2607 * to avoid CS violations when these pages are mapped executable
2608 * down the line. In order to appease the page table monitor (you
2609 * know what I'm talking about), these pages will end up being
2610 * marked as XNU_USER_DEBUG, which will be allowed because we
2611 * don't enforce the code signing monitor on macOS systems. If
2612 * the user-space application ever changes permissions to RWX,
2613 * which they are allowed to since the mapping was originally
2614 * created with MAP_JIT, then they'll switch over to using the
2615 * XNU_USER_JIT type, and won't be allowed to downgrade any
2616 * more after that.
2617 *
2618 * When not on macOS, a MAP_JIT mapping without VM_PROT_ALL is
2619 * strictly disallowed.
2620 */
2621
2622 #if XNU_TARGET_OS_OSX
2623 /*
2624 * Continue to allow non-RWX JIT
2625 */
2626 #else
2627 /* non-macOS: reject JIT regions without RWX */
2628 DTRACE_VM3(cs_wx,
2629 uint64_t, 0,
2630 uint64_t, 0,
2631 vm_prot_t, cur_protection);
2632 printf("CODE SIGNING: %d[%s] %s(%d): JIT requires RWX: failing. \n",
2633 proc_selfpid(),
2634 (get_bsdtask_info(current_task())
2635 ? proc_name_address(get_bsdtask_info(current_task()))
2636 : "?"),
2637 __FUNCTION__,
2638 cur_protection);
2639 return KERN_PROTECTION_FAILURE;
2640 #endif
2641 }
2642
2643 /*
2644 * If the task has requested executable lockdown,
2645 * deny any new executable mapping.
2646 */
2647 if (map->map_disallow_new_exec == TRUE) {
2648 if (cur_protection & VM_PROT_EXECUTE) {
2649 return KERN_PROTECTION_FAILURE;
2650 }
2651 }
2652
2653 if (resilient_codesign) {
2654 assert(!is_submap);
2655 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2656 if ((cur_protection | max_protection) & reject_prot) {
2657 return KERN_PROTECTION_FAILURE;
2658 }
2659 }
2660
2661 if (resilient_media) {
2662 assert(!is_submap);
2663 // assert(!needs_copy);
2664 if (object != VM_OBJECT_NULL &&
2665 !object->internal) {
2666 /*
2667 * This mapping is directly backed by an external
2668 * memory manager (e.g. a vnode pager for a file):
2669 * we would not have any safe place to inject
2670 * a zero-filled page if an actual page is not
2671 * available, without possibly impacting the actual
2672 * contents of the mapped object (e.g. the file),
2673 * so we can't provide any media resiliency here.
2674 */
2675 return KERN_INVALID_ARGUMENT;
2676 }
2677 }
2678
2679 if (entry_for_tpro) {
2680 /*
2681 * TPRO overrides the effective permissions of the region
2682 * and explicitly maps as RW. Ensure we have been passed
2683 * the expected permissions. We accept `cur_protections`
2684 * RO as that will be handled on fault.
2685 */
2686 if (!(max_protection & VM_PROT_READ) ||
2687 !(max_protection & VM_PROT_WRITE) ||
2688 !(cur_protection & VM_PROT_READ)) {
2689 return KERN_PROTECTION_FAILURE;
2690 }
2691
2692 /*
2693 * We can now downgrade the cur_protection to RO. This is a mild lie
2694 * to the VM layer. But TPRO will be responsible for toggling the
2695 * protections between RO/RW
2696 */
2697 cur_protection = VM_PROT_READ;
2698 }
2699
2700 if (is_submap) {
2701 vm_map_t submap;
2702 if (purgable) {
2703 /* submaps can not be purgeable */
2704 return KERN_INVALID_ARGUMENT;
2705 }
2706 if (object == VM_OBJECT_NULL) {
2707 /* submaps can not be created lazily */
2708 return KERN_INVALID_ARGUMENT;
2709 }
2710 submap = (vm_map_t) object;
2711 if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
2712 /* page size mismatch */
2713 return KERN_INVALID_ARGUMENT;
2714 }
2715 }
2716 if (vmk_flags.vmkf_already) {
2717 /*
2718 * VM_FLAGS_ALREADY says that it's OK if the same mapping
2719 * is already present. For it to be meaningful, the requested
2720 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
2721 * we shouldn't try to remove what was mapped there first
2722 * (!VM_FLAGS_OVERWRITE).
2723 */
2724 if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
2725 return KERN_INVALID_ARGUMENT;
2726 }
2727 }
2728
2729 if (size == 0 ||
2730 (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
2731 *address = 0;
2732 return KERN_INVALID_ARGUMENT;
2733 }
2734
2735 if (map->pmap == kernel_pmap) {
2736 user_alias = VM_KERN_MEMORY_NONE;
2737 } else {
2738 user_alias = alias;
2739 }
2740
2741 if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
2742 chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
2743 }
2744
2745 #define RETURN(value) { result = value; goto BailOut; }
2746
2747 assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
2748 assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
2749 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
2750 assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
2751 assertf(page_aligned(size), "0x%llx", (uint64_t)size);
2752 }
2753
2754 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2755 !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
2756 /*
2757 * In most cases, the caller rounds the size up to the
2758 * map's page size.
2759 * If we get a size that is explicitly not map-aligned here,
2760 * we'll have to respect the caller's wish and mark the
2761 * mapping as "not map-aligned" to avoid tripping the
2762 * map alignment checks later.
2763 */
2764 clear_map_aligned = TRUE;
2765 }
2766 if (!anywhere &&
2767 VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2768 !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
2769 /*
2770 * We've been asked to map at a fixed address and that
2771 * address is not aligned to the map's specific alignment.
2772 * The caller should know what it's doing (i.e. most likely
2773 * mapping some fragmented copy map, transferring memory from
2774 * a VM map with a different alignment), so clear map_aligned
2775 * for this new VM map entry and proceed.
2776 */
2777 clear_map_aligned = TRUE;
2778 }
2779
2780 /*
2781 * Only zero-fill objects are allowed to be purgable.
2782 * LP64todo - limit purgable objects to 32-bits for now
2783 */
2784 if (purgable &&
2785 (offset != 0 ||
2786 (object != VM_OBJECT_NULL &&
2787 (object->vo_size != size ||
2788 object->purgable == VM_PURGABLE_DENY))
2789 #if __LP64__
2790 || size > ANON_MAX_SIZE
2791 #endif
2792 )) {
2793 return KERN_INVALID_ARGUMENT;
2794 }
2795
2796 start = *address;
2797
2798 if (anywhere) {
2799 vm_map_lock(map);
2800 map_locked = TRUE;
2801
2802 result = vm_map_locate_space(map, size, mask, vmk_flags,
2803 &start, &entry);
2804 if (result != KERN_SUCCESS) {
2805 goto BailOut;
2806 }
2807
2808 *address = start;
2809 end = start + size;
2810 assert(VM_MAP_PAGE_ALIGNED(*address,
2811 VM_MAP_PAGE_MASK(map)));
2812 } else {
2813 vm_map_offset_t effective_min_offset, effective_max_offset;
2814
2815 effective_min_offset = map->min_offset;
2816 effective_max_offset = map->max_offset;
2817
2818 if (vmk_flags.vmkf_beyond_max) {
2819 /*
2820 * Allow an insertion beyond the map's max offset.
2821 */
2822 effective_max_offset = 0x00000000FFFFF000ULL;
2823 if (vm_map_is_64bit(map)) {
2824 effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2825 }
2826 #if XNU_TARGET_OS_OSX
2827 } else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2828 effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2829 #endif /* XNU_TARGET_OS_OSX */
2830 }
2831
2832 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2833 !overwrite &&
2834 user_alias == VM_MEMORY_REALLOC) {
2835 /*
2836 * Force realloc() to switch to a new allocation,
2837 * to prevent 4k-fragmented virtual ranges.
2838 */
2839 // DEBUG4K_ERROR("no realloc in place");
2840 return KERN_NO_SPACE;
2841 }
2842
2843 /*
2844 * Verify that:
2845 * the address doesn't itself violate
2846 * the mask requirement.
2847 */
2848
2849 vm_map_lock(map);
2850 map_locked = TRUE;
2851 if ((start & mask) != 0) {
2852 RETURN(KERN_NO_SPACE);
2853 }
2854
2855 #if CONFIG_MAP_RANGES
2856 if (map->uses_user_ranges) {
2857 struct mach_vm_range r;
2858
2859 vm_map_user_range_resolve(map, start, 1, &r);
2860 if (r.max_address == 0) {
2861 RETURN(KERN_INVALID_ADDRESS);
2862 }
2863 effective_min_offset = r.min_address;
2864 effective_max_offset = r.max_address;
2865 }
2866 #endif /* CONFIG_MAP_RANGES */
2867
2868 if ((startup_phase >= STARTUP_SUB_KMEM) && !is_submap &&
2869 (map == kernel_map)) {
2870 mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
2871 effective_min_offset = r->min_address;
2872 effective_max_offset = r->max_address;
2873 }
2874
2875 /*
2876 * ... the address is within bounds
2877 */
2878
2879 end = start + size;
2880
2881 if ((start < effective_min_offset) ||
2882 (end > effective_max_offset) ||
2883 (start >= end)) {
2884 RETURN(KERN_INVALID_ADDRESS);
2885 }
2886
2887 if (overwrite) {
2888 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE;
2889 kern_return_t remove_kr;
2890
2891 /*
2892 * Fixed mapping and "overwrite" flag: attempt to
2893 * remove all existing mappings in the specified
2894 * address range, saving them in our "zap_old_list".
2895 *
2896 * This avoids releasing the VM map lock in
2897 * vm_map_entry_delete() and allows atomicity
2898 * when we want to replace some mappings with a new one.
2899 * It also allows us to restore the old VM mappings if the
2900 * new mapping fails.
2901 */
2902 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2903
2904 if (vmk_flags.vmkf_overwrite_immutable) {
2905 /* we can overwrite immutable mappings */
2906 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2907 }
2908 if (vmk_flags.vmkf_remap_prot_copy) {
2909 remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2910 }
2911 remove_kr = vm_map_delete(map, start, end, remove_flags,
2912 KMEM_GUARD_NONE, &zap_old_list).kmr_return;
2913 if (remove_kr) {
2914 /* XXX FBDP restore zap_old_list? */
2915 RETURN(remove_kr);
2916 }
2917 }
2918
2919 /*
2920 * ... the starting address isn't allocated
2921 */
2922
2923 if (vm_map_lookup_entry(map, start, &entry)) {
2924 if (!(vmk_flags.vmkf_already)) {
2925 RETURN(KERN_NO_SPACE);
2926 }
2927 /*
2928 * Check if what's already there is what we want.
2929 */
2930 tmp_start = start;
2931 tmp_offset = offset;
2932 if (entry->vme_start < start) {
2933 tmp_start -= start - entry->vme_start;
2934 tmp_offset -= start - entry->vme_start;
2935 }
2936 for (; entry->vme_start < end;
2937 entry = entry->vme_next) {
2938 /*
2939 * Check if the mapping's attributes
2940 * match the existing map entry.
2941 */
2942 if (entry == vm_map_to_entry(map) ||
2943 entry->vme_start != tmp_start ||
2944 entry->is_sub_map != is_submap ||
2945 VME_OFFSET(entry) != tmp_offset ||
2946 entry->needs_copy != needs_copy ||
2947 entry->protection != cur_protection ||
2948 entry->max_protection != max_protection ||
2949 entry->inheritance != inheritance ||
2950 entry->iokit_acct != iokit_acct ||
2951 VME_ALIAS(entry) != alias) {
2952 /* not the same mapping ! */
2953 RETURN(KERN_NO_SPACE);
2954 }
2955 /*
2956 * Check if the same object is being mapped.
2957 */
2958 if (is_submap) {
2959 if (VME_SUBMAP(entry) !=
2960 (vm_map_t) object) {
2961 /* not the same submap */
2962 RETURN(KERN_NO_SPACE);
2963 }
2964 } else {
2965 if (VME_OBJECT(entry) != object) {
2966 /* not the same VM object... */
2967 vm_object_t obj2;
2968
2969 obj2 = VME_OBJECT(entry);
2970 if ((obj2 == VM_OBJECT_NULL ||
2971 obj2->internal) &&
2972 (object == VM_OBJECT_NULL ||
2973 object->internal)) {
2974 /*
2975 * ... but both are
2976 * anonymous memory,
2977 * so equivalent.
2978 */
2979 } else {
2980 RETURN(KERN_NO_SPACE);
2981 }
2982 }
2983 }
2984
2985 tmp_offset += entry->vme_end - entry->vme_start;
2986 tmp_start += entry->vme_end - entry->vme_start;
2987 if (entry->vme_end >= end) {
2988 /* reached the end of our mapping */
2989 break;
2990 }
2991 }
2992 /* it all matches: let's use what's already there ! */
2993 RETURN(KERN_MEMORY_PRESENT);
2994 }
2995
2996 /*
2997 * ... the next region doesn't overlap the
2998 * end point.
2999 */
3000
3001 if ((entry->vme_next != vm_map_to_entry(map)) &&
3002 (entry->vme_next->vme_start < end)) {
3003 RETURN(KERN_NO_SPACE);
3004 }
3005 }
3006
3007 /*
3008 * At this point,
3009 * "start" and "end" should define the endpoints of the
3010 * available new range, and
3011 * "entry" should refer to the region before the new
3012 * range, and
3013 *
3014 * the map should be locked.
3015 */
3016
3017 /*
3018 * See whether we can avoid creating a new entry (and object) by
3019 * extending one of our neighbors. [So far, we only attempt to
3020 * extend from below.] Note that we can never extend/join
3021 * purgable objects because they need to remain distinct
3022 * entities in order to implement their "volatile object"
3023 * semantics.
3024 */
3025
3026 if (purgable ||
3027 entry_for_jit ||
3028 entry_for_tpro ||
3029 vm_memory_malloc_no_cow(user_alias)) {
3030 if (object == VM_OBJECT_NULL) {
3031 object = vm_object_allocate(size);
3032 vm_object_lock(object);
3033 object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3034 VM_OBJECT_SET_TRUE_SHARE(object, FALSE);
3035 if (malloc_no_cow_except_fork &&
3036 !purgable &&
3037 !entry_for_jit &&
3038 !entry_for_tpro &&
3039 vm_memory_malloc_no_cow(user_alias)) {
3040 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY_FORK;
3041 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
3042 }
3043 if (purgable) {
3044 task_t owner;
3045 VM_OBJECT_SET_PURGABLE(object, VM_PURGABLE_NONVOLATILE);
3046 if (map->pmap == kernel_pmap) {
3047 /*
3048 * Purgeable mappings made in a kernel
3049 * map are "owned" by the kernel itself
3050 * rather than the current user task
3051 * because they're likely to be used by
3052 * more than this user task (see
3053 * execargs_purgeable_allocate(), for
3054 * example).
3055 */
3056 owner = kernel_task;
3057 } else {
3058 owner = current_task();
3059 }
3060 assert(object->vo_owner == NULL);
3061 assert(object->resident_page_count == 0);
3062 assert(object->wired_page_count == 0);
3063 vm_purgeable_nonvolatile_enqueue(object, owner);
3064 }
3065 vm_object_unlock(object);
3066 offset = (vm_object_offset_t)0;
3067 }
3068 } else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
3069 /* no coalescing if address space uses sub-pages */
3070 } else if ((is_submap == FALSE) &&
3071 (object == VM_OBJECT_NULL) &&
3072 (entry != vm_map_to_entry(map)) &&
3073 (entry->vme_end == start) &&
3074 (!entry->is_shared) &&
3075 (!entry->is_sub_map) &&
3076 (!entry->in_transition) &&
3077 (!entry->needs_wakeup) &&
3078 (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
3079 (entry->protection == cur_protection) &&
3080 (entry->max_protection == max_protection) &&
3081 (entry->inheritance == inheritance) &&
3082 ((user_alias == VM_MEMORY_REALLOC) ||
3083 (VME_ALIAS(entry) == alias)) &&
3084 (entry->no_cache == no_cache) &&
3085 (entry->vme_permanent == permanent) &&
3086 /* no coalescing for immutable executable mappings */
3087 !((entry->protection & VM_PROT_EXECUTE) &&
3088 entry->vme_permanent) &&
3089 (!entry->superpage_size && !superpage_size) &&
3090 /*
3091 * No coalescing if not map-aligned, to avoid propagating
3092 * that condition any further than needed:
3093 */
3094 (!entry->map_aligned || !clear_map_aligned) &&
3095 (!entry->zero_wired_pages) &&
3096 (!entry->used_for_jit && !entry_for_jit) &&
3097 #if __arm64e__
3098 (!entry->used_for_tpro && !entry_for_tpro) &&
3099 #endif
3100 (!entry->csm_associated) &&
3101 (entry->iokit_acct == iokit_acct) &&
3102 (!entry->vme_resilient_codesign) &&
3103 (!entry->vme_resilient_media) &&
3104 (!entry->vme_atomic) &&
3105 (entry->vme_no_copy_on_read == no_copy_on_read) &&
3106
3107 ((entry->vme_end - entry->vme_start) + size <=
3108 (user_alias == VM_MEMORY_REALLOC ?
3109 ANON_CHUNK_SIZE :
3110 NO_COALESCE_LIMIT)) &&
3111
3112 (entry->wired_count == 0)) { /* implies user_wired_count == 0 */
3113 if (vm_object_coalesce(VME_OBJECT(entry),
3114 VM_OBJECT_NULL,
3115 VME_OFFSET(entry),
3116 (vm_object_offset_t) 0,
3117 (vm_map_size_t)(entry->vme_end - entry->vme_start),
3118 (vm_map_size_t)(end - entry->vme_end))) {
3119 /*
3120 * Coalesced the two objects - can extend
3121 * the previous map entry to include the
3122 * new range.
3123 */
3124 map->size += (end - entry->vme_end);
3125 assert(entry->vme_start < end);
3126 assert(VM_MAP_PAGE_ALIGNED(end,
3127 VM_MAP_PAGE_MASK(map)));
3128 if (__improbable(vm_debug_events)) {
3129 DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
3130 }
3131 entry->vme_end = end;
3132 if (map->holelistenabled) {
3133 vm_map_store_update_first_free(map, entry, TRUE);
3134 } else {
3135 vm_map_store_update_first_free(map, map->first_free, TRUE);
3136 }
3137 new_mapping_established = TRUE;
3138 RETURN(KERN_SUCCESS);
3139 }
3140 }
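/*
 * Example of the coalesce fast path above: an anonymous allocation
 * placed immediately after a prior anonymous entry with identical
 * protections, inheritance and attributes typically just extends that
 * entry in place, provided the combined span stays under
 * NO_COALESCE_LIMIT (or ANON_CHUNK_SIZE for VM_MEMORY_REALLOC).
 */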
3141
3142 step = superpage_size ? SUPERPAGE_SIZE : (end - start);
3143 new_entry = NULL;
3144
3145 if (vmk_flags.vmkf_submap_adjust) {
3146 vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
3147 offset = start;
3148 }
3149
3150 for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3151 tmp2_end = tmp2_start + step;
3152 /*
3153 * Create a new entry
3154 *
3155 * XXX FBDP
3156 * The reserved "page zero" in each process's address space can
3157 * be arbitrarily large. Splitting it into separate objects and
3158 * therefore different VM map entries serves no purpose and just
3159 * slows down operations on the VM map, so let's not split the
3160 * allocation into chunks if the max protection is NONE. That
3161 * memory should never be accessible, so it will never get to the
3162 * default pager.
3163 */
3164 tmp_start = tmp2_start;
3165 if (!is_submap &&
3166 object == VM_OBJECT_NULL &&
3167 size > chunk_size &&
3168 max_protection != VM_PROT_NONE &&
3169 superpage_size == 0) {
3170 tmp_end = tmp_start + chunk_size;
3171 } else {
3172 tmp_end = tmp2_end;
3173 }
3174 do {
3175 if (!is_submap &&
3176 object != VM_OBJECT_NULL &&
3177 object->internal &&
3178 offset + (tmp_end - tmp_start) > object->vo_size) {
3179 // printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3180 DTRACE_VM5(vm_map_enter_overmap,
3181 vm_map_t, map,
3182 vm_map_address_t, tmp_start,
3183 vm_map_address_t, tmp_end,
3184 vm_object_offset_t, offset,
3185 vm_object_size_t, object->vo_size);
3186 }
3187 new_entry = vm_map_entry_insert(map,
3188 entry, tmp_start, tmp_end,
3189 object, offset, vmk_flags,
3190 needs_copy,
3191 cur_protection, max_protection,
3192 (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3193 VM_INHERIT_NONE : inheritance),
3194 clear_map_aligned);
3195
3196 assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias));
3197
3198 if (resilient_codesign) {
3199 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3200 if (!((cur_protection | max_protection) & reject_prot)) {
3201 new_entry->vme_resilient_codesign = TRUE;
3202 }
3203 }
3204
3205 if (resilient_media &&
3206 (object == VM_OBJECT_NULL ||
3207 object->internal)) {
3208 new_entry->vme_resilient_media = TRUE;
3209 }
3210
3211 assert(!new_entry->iokit_acct);
3212 if (!is_submap &&
3213 object != VM_OBJECT_NULL &&
3214 (object->purgable != VM_PURGABLE_DENY ||
3215 object->vo_ledger_tag)) {
3216 assert(new_entry->use_pmap);
3217 assert(!new_entry->iokit_acct);
3218 /*
3219 * Turn off pmap accounting since
3220 * purgeable (or tagged) objects have their
3221 * own ledgers.
3222 */
3223 new_entry->use_pmap = FALSE;
3224 } else if (!is_submap &&
3225 iokit_acct &&
3226 object != VM_OBJECT_NULL &&
3227 object->internal) {
3228 /* alternate accounting */
3229 assert(!new_entry->iokit_acct);
3230 assert(new_entry->use_pmap);
3231 new_entry->iokit_acct = TRUE;
3232 new_entry->use_pmap = FALSE;
3233 DTRACE_VM4(
3234 vm_map_iokit_mapped_region,
3235 vm_map_t, map,
3236 vm_map_offset_t, new_entry->vme_start,
3237 vm_map_offset_t, new_entry->vme_end,
3238 int, VME_ALIAS(new_entry));
3239 vm_map_iokit_mapped_region(
3240 map,
3241 (new_entry->vme_end -
3242 new_entry->vme_start));
3243 } else if (!is_submap) {
3244 assert(!new_entry->iokit_acct);
3245 assert(new_entry->use_pmap);
3246 }
3247
3248 if (is_submap) {
3249 vm_map_t submap;
3250 boolean_t submap_is_64bit;
3251 boolean_t use_pmap;
3252
3253 assert(new_entry->is_sub_map);
3254 assert(!new_entry->use_pmap);
3255 assert(!new_entry->iokit_acct);
3256 submap = (vm_map_t) object;
3257 submap_is_64bit = vm_map_is_64bit(submap);
3258 use_pmap = vmk_flags.vmkf_nested_pmap;
3259 #ifndef NO_NESTED_PMAP
3260 if (use_pmap && submap->pmap == NULL) {
3261 ledger_t ledger = map->pmap->ledger;
3262 /* we need a sub pmap to nest... */
3263 submap->pmap = pmap_create_options(ledger, 0,
3264 submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3265 if (submap->pmap == NULL) {
3266 /* let's proceed without nesting... */
3267 }
3268 #if defined(__arm64__)
3269 else {
3270 pmap_set_nested(submap->pmap);
3271 }
3272 #endif
3273 }
3274 if (use_pmap && submap->pmap != NULL) {
3275 if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3276 DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3277 kr = KERN_FAILURE;
3278 } else {
3279 kr = pmap_nest(map->pmap,
3280 submap->pmap,
3281 tmp_start,
3282 tmp_end - tmp_start);
3283 }
3284 if (kr != KERN_SUCCESS) {
3285 printf("vm_map_enter: "
3286 "pmap_nest(0x%llx,0x%llx) "
3287 "error 0x%x\n",
3288 (long long)tmp_start,
3289 (long long)tmp_end,
3290 kr);
3291 } else {
3292 /* we're now nested ! */
3293 new_entry->use_pmap = TRUE;
3294 pmap_empty = FALSE;
3295 }
3296 }
3297 #endif /* NO_NESTED_PMAP */
3298 }
3299 entry = new_entry;
3300
3301 if (superpage_size) {
3302 vm_page_t pages, m;
3303 vm_object_t sp_object;
3304 vm_object_offset_t sp_offset;
3305
3306 VME_OFFSET_SET(entry, 0);
3307
3308 /* allocate one superpage */
3309 kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3310 if (kr != KERN_SUCCESS) {
3311 /* deallocate whole range... */
3312 new_mapping_established = TRUE;
3313 /* ... but only up to "tmp_end" */
3314 size -= end - tmp_end;
3315 RETURN(kr);
3316 }
3317
3318 /* create one vm_object per superpage */
3319 sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3320 vm_object_lock(sp_object);
3321 sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3322 VM_OBJECT_SET_PHYS_CONTIGUOUS(sp_object, TRUE);
3323 sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3324 VME_OBJECT_SET(entry, sp_object, false, 0);
3325 assert(entry->use_pmap);
3326
3327 /* enter the base pages into the object */
3328 for (sp_offset = 0;
3329 sp_offset < SUPERPAGE_SIZE;
3330 sp_offset += PAGE_SIZE) {
3331 m = pages;
3332 pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3333 pages = NEXT_PAGE(m);
3334 *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3335 vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3336 }
3337 vm_object_unlock(sp_object);
3338 }
3339 } while (tmp_end != tmp2_end &&
3340 (tmp_start = tmp_end) &&
3341 (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3342 tmp_end + chunk_size : tmp2_end));
3343 }
3344
3345 new_mapping_established = TRUE;
3346
3347 BailOut:
3348 assert(map_locked == TRUE);
3349
3350 /*
3351 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3352 * If we have identified and possibly established the new mapping(s),
3353 * make sure we did not go beyond the address space limit.
3354 */
3355 if (result == KERN_SUCCESS) {
3356 if (map->size_limit != RLIM_INFINITY &&
3357 map->size > map->size_limit) {
3358 /*
3359 * Establishing the requested mappings would exceed
3360 * the process's RLIMIT_AS limit: fail with
3361 * KERN_NO_SPACE.
3362 */
3363 result = KERN_NO_SPACE;
3364 printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3365 proc_selfpid(),
3366 (get_bsdtask_info(current_task())
3367 ? proc_name_address(get_bsdtask_info(current_task()))
3368 : "?"),
3369 __FUNCTION__,
3370 (uint64_t) map->size,
3371 (uint64_t) map->size_limit);
3372 DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3373 vm_map_size_t, map->size,
3374 uint64_t, map->size_limit);
3375 vm_map_enter_RLIMIT_AS_count++;
3376 } else if (map->data_limit != RLIM_INFINITY &&
3377 map->size > map->data_limit) {
3378 /*
3379 * Establishing the requested mappings would exceed
3380 * the process's RLIMIT_DATA limit: fail with
3381 * KERN_NO_SPACE.
3382 */
3383 result = KERN_NO_SPACE;
3384 printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3385 proc_selfpid(),
3386 (get_bsdtask_info(current_task())
3387 ? proc_name_address(get_bsdtask_info(current_task()))
3388 : "?"),
3389 __FUNCTION__,
3390 (uint64_t) map->size,
3391 (uint64_t) map->data_limit);
3392 DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3393 vm_map_size_t, map->size,
3394 uint64_t, map->data_limit);
3395 vm_map_enter_RLIMIT_DATA_count++;
3396 }
3397 }
3398
3399 if (result == KERN_SUCCESS) {
3400 vm_prot_t pager_prot;
3401 memory_object_t pager;
3402
3403 #if DEBUG
3404 if (pmap_empty &&
3405 !(vmk_flags.vmkf_no_pmap_check)) {
3406 assert(pmap_is_empty(map->pmap,
3407 *address,
3408 *address + size));
3409 }
3410 #endif /* DEBUG */
3411
3412 /*
3413 * For "named" VM objects, let the pager know that the
3414 * memory object is being mapped. Some pagers need to keep
3415 * track of this, to know when they can reclaim the memory
3416 * object, for example.
3417 * VM calls memory_object_map() for each mapping (specifying
3418 * the protection of each mapping) and calls
3419 * memory_object_last_unmap() when all the mappings are gone.
3420 */
3421 pager_prot = max_protection;
3422 if (needs_copy) {
3423 /*
3424 * Copy-On-Write mapping: won't modify
3425 * the memory object.
3426 */
3427 pager_prot &= ~VM_PROT_WRITE;
3428 }
3429 if (!is_submap &&
3430 object != VM_OBJECT_NULL &&
3431 object->named &&
3432 object->pager != MEMORY_OBJECT_NULL) {
3433 vm_object_lock(object);
3434 pager = object->pager;
3435 if (object->named &&
3436 pager != MEMORY_OBJECT_NULL) {
3437 assert(object->pager_ready);
3438 vm_object_mapping_wait(object, THREAD_UNINT);
3439 vm_object_mapping_begin(object);
3440 vm_object_unlock(object);
3441
3442 kr = memory_object_map(pager, pager_prot);
3443 assert(kr == KERN_SUCCESS);
3444
3445 vm_object_lock(object);
3446 vm_object_mapping_end(object);
3447 }
3448 vm_object_unlock(object);
3449 }
3450 }
3451
3452 assert(map_locked == TRUE);
3453
3454 if (new_mapping_established) {
3455 /*
3456 * If we release the map lock for any reason below,
3457 * another thread could deallocate our new mapping,
3458 * releasing the caller's reference on "caller_object",
3459 * which was transferred to the mapping.
3460 * If this was the only reference, the object could be
3461 * destroyed.
3462 *
3463 * We need to take an extra reference on "caller_object"
3464 * to keep it alive if we need to return the caller's
3465 * reference to the caller in case of failure.
3466 */
3467 if (is_submap) {
3468 vm_map_reference((vm_map_t)caller_object);
3469 } else {
3470 vm_object_reference(caller_object);
3471 }
3472 }
3473
3474 if (!keep_map_locked) {
3475 vm_map_unlock(map);
3476 map_locked = FALSE;
3477 entry = VM_MAP_ENTRY_NULL;
3478 new_entry = VM_MAP_ENTRY_NULL;
3479 }
3480
3481 /*
3482 * We can't hold the map lock if we enter this block.
3483 */
3484
3485 if (result == KERN_SUCCESS) {
3486 /* Wire down the new entry if the user
3487 * requested all new map entries be wired.
3488 */
3489 if ((map->wiring_required) || (superpage_size)) {
3490 assert(!keep_map_locked);
3491 pmap_empty = FALSE; /* pmap won't be empty */
3492 kr = vm_map_wire_kernel(map, start, end,
3493 cur_protection, VM_KERN_MEMORY_MLOCK,
3494 TRUE);
3495 result = kr;
3496 }
3497
3498 }
3499
3500 if (result != KERN_SUCCESS) {
3501 if (new_mapping_established) {
3502 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3503
3504 /*
3505 * We have to get rid of the new mappings since we
3506 * won't make them available to the user.
3507 * Try to do that atomically, to minimize the risk
3508 * that someone else creates new mappings in that range.
3509 */
3510 if (!map_locked) {
3511 vm_map_lock(map);
3512 map_locked = TRUE;
3513 }
3514 remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3515 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3516 if (permanent) {
3517 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3518 }
3519 (void) vm_map_delete(map,
3520 *address, *address + size,
3521 remove_flags,
3522 KMEM_GUARD_NONE, &zap_new_list);
3523 }
3524
3525 if (vm_map_zap_first_entry(&zap_old_list)) {
3526 vm_map_entry_t entry1, entry2;
3527
3528 /*
3529 * The new mapping failed. Attempt to restore
3530 * the old mappings, saved in "zap_old_list".
3531 */
3532 if (!map_locked) {
3533 vm_map_lock(map);
3534 map_locked = TRUE;
3535 }
3536
3537 /* first check if the coast is still clear */
3538 start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3539 end = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3540
3541 if (vm_map_lookup_entry(map, start, &entry1) ||
3542 vm_map_lookup_entry(map, end, &entry2) ||
3543 entry1 != entry2) {
3544 /*
3545 * Part of that range has already been
3546 * re-mapped: we can't restore the old
3547 * mappings...
3548 */
3549 vm_map_enter_restore_failures++;
3550 } else {
3551 /*
3552 * Transfer the saved map entries from
3553 * "zap_old_map" to the original "map",
3554 * inserting them all after "entry1".
3555 */
3556 while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3557 vm_map_size_t entry_size;
3558
3559 entry_size = (entry2->vme_end -
3560 entry2->vme_start);
3561 vm_map_store_entry_link(map, entry1, entry2,
3562 VM_MAP_KERNEL_FLAGS_NONE);
3563 map->size += entry_size;
3564 entry1 = entry2;
3565 }
3566 if (map->wiring_required) {
3567 /*
3568 * XXX TODO: we should rewire the
3569 * old pages here...
3570 */
3571 }
3572 vm_map_enter_restore_successes++;
3573 }
3574 }
3575 }
3576
3577 /*
3578 * The caller is responsible for releasing the lock if it requested to
3579 * keep the map locked.
3580 */
3581 if (map_locked && !keep_map_locked) {
3582 vm_map_unlock(map);
3583 }
3584
3585 vm_map_zap_dispose(&zap_old_list);
3586 vm_map_zap_dispose(&zap_new_list);
3587
3588 if (new_mapping_established) {
3589 /*
3590 * The caller had a reference on "caller_object" and we
3591 * transferred that reference to the mapping.
3592 * We also took an extra reference on "caller_object" to keep
3593 * it alive while the map was unlocked.
3594 */
3595 if (result == KERN_SUCCESS) {
3596 /*
3597 * On success, the caller's reference on the object gets
3598 * transferred to the mapping.
3599 * Release our extra reference.
3600 */
3601 if (is_submap) {
3602 vm_map_deallocate((vm_map_t)caller_object);
3603 } else {
3604 vm_object_deallocate(caller_object);
3605 }
3606 } else {
3607 /*
3608 * On error, the caller expects to still have a
3609 * reference on the object it gave us.
3610 * Let's use our extra reference for that.
3611 */
3612 }
3613 }
3614
3615 return result;
3616
3617 #undef RETURN
3618 }
3619
3620 #if __arm64__
3621 extern const struct memory_object_pager_ops fourk_pager_ops;
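/*
 * vm_map_enter_fourk:
 *
 * Enter a mapping at 4K ("fourk") granularity in a map whose native
 * page size is larger (16K).  A dedicated "fourk" pager backs each
 * native page, assembling up to four 4K backing chunks into it; what
 * actually gets mapped is a copy-on-write copy of the pager's VM
 * object.  Only fixed-address mappings are handled here: "anywhere",
 * submap, purgable, JIT, overwrite and superpage requests all return
 * KERN_NOT_SUPPORTED.
 */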
3622 kern_return_t
3623 vm_map_enter_fourk(
3624 vm_map_t map,
3625 vm_map_offset_t *address, /* IN/OUT */
3626 vm_map_size_t size,
3627 vm_map_offset_t mask,
3628 vm_map_kernel_flags_t vmk_flags,
3629 vm_object_t object,
3630 vm_object_offset_t offset,
3631 boolean_t needs_copy,
3632 vm_prot_t cur_protection,
3633 vm_prot_t max_protection,
3634 vm_inherit_t inheritance)
3635 {
3636 vm_map_entry_t entry, new_entry;
3637 vm_map_offset_t start, fourk_start;
3638 vm_map_offset_t end, fourk_end;
3639 vm_map_size_t fourk_size;
3640 kern_return_t result = KERN_SUCCESS;
3641 boolean_t map_locked = FALSE;
3642 boolean_t pmap_empty = TRUE;
3643 boolean_t new_mapping_established = FALSE;
3644 const bool keep_map_locked = vmk_flags.vmkf_keep_map_locked;
3645 const bool anywhere = !vmk_flags.vmf_fixed;
3646 const bool purgable = vmk_flags.vmf_purgeable;
3647 const bool overwrite = vmk_flags.vmf_overwrite;
3648 const bool is_submap = vmk_flags.vmkf_submap;
3649 const bool entry_for_jit = vmk_flags.vmkf_map_jit;
3650 const unsigned int superpage_size = vmk_flags.vmf_superpage_size;
3651 vm_map_offset_t effective_min_offset, effective_max_offset;
3652 kern_return_t kr;
3653 boolean_t clear_map_aligned = FALSE;
3654 memory_object_t fourk_mem_obj;
3655 vm_object_t fourk_object;
3656 vm_map_offset_t fourk_pager_offset;
3657 int fourk_pager_index_start, fourk_pager_index_num;
3658 int cur_idx;
3659 boolean_t fourk_copy;
3660 vm_object_t copy_object;
3661 vm_object_offset_t copy_offset;
3662 VM_MAP_ZAP_DECLARE(zap_list);
3663
3664 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
3665 panic("%s:%d", __FUNCTION__, __LINE__);
3666 }
3667 fourk_mem_obj = MEMORY_OBJECT_NULL;
3668 fourk_object = VM_OBJECT_NULL;
3669
3670 if (superpage_size) {
3671 return KERN_NOT_SUPPORTED;
3672 }
3673
3674 if ((cur_protection & VM_PROT_WRITE) &&
3675 (cur_protection & VM_PROT_EXECUTE) &&
3676 #if XNU_TARGET_OS_OSX
3677 map->pmap != kernel_pmap &&
3678 (vm_map_cs_enforcement(map)
3679 #if __arm64__
3680 || !VM_MAP_IS_EXOTIC(map)
3681 #endif /* __arm64__ */
3682 ) &&
3683 #endif /* XNU_TARGET_OS_OSX */
3684 #if CODE_SIGNING_MONITOR
3685 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
3686 #endif
3687 !entry_for_jit) {
3688 DTRACE_VM3(cs_wx,
3689 uint64_t, 0,
3690 uint64_t, 0,
3691 vm_prot_t, cur_protection);
3692 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. "
3693 "turning off execute\n",
3694 proc_selfpid(),
3695 (get_bsdtask_info(current_task())
3696 ? proc_name_address(get_bsdtask_info(current_task()))
3697 : "?"),
3698 __FUNCTION__);
3699 cur_protection &= ~VM_PROT_EXECUTE;
3700 }
3701
3702 /*
3703 * If the task has requested executable lockdown,
3704 * deny any new executable mapping.
3705 */
3706 if (map->map_disallow_new_exec == TRUE) {
3707 if (cur_protection & VM_PROT_EXECUTE) {
3708 return KERN_PROTECTION_FAILURE;
3709 }
3710 }
3711
3712 if (is_submap) {
3713 return KERN_NOT_SUPPORTED;
3714 }
3715 if (vmk_flags.vmkf_already) {
3716 return KERN_NOT_SUPPORTED;
3717 }
3718 if (purgable || entry_for_jit) {
3719 return KERN_NOT_SUPPORTED;
3720 }
3721
3722 effective_min_offset = map->min_offset;
3723
3724 if (vmk_flags.vmkf_beyond_max) {
3725 return KERN_NOT_SUPPORTED;
3726 } else {
3727 effective_max_offset = map->max_offset;
3728 }
3729
3730 if (size == 0 ||
3731 (offset & FOURK_PAGE_MASK) != 0) {
3732 *address = 0;
3733 return KERN_INVALID_ARGUMENT;
3734 }
3735
3736 #define RETURN(value) { result = value; goto BailOut; }
3737
3738 assert(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK));
3739 assert(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK));
3740
3741 if (!anywhere && overwrite) {
3742 return KERN_NOT_SUPPORTED;
3743 }
3744
3745 fourk_start = *address;
3746 fourk_size = size;
3747 fourk_end = fourk_start + fourk_size;
3748
3749 start = vm_map_trunc_page(*address, VM_MAP_PAGE_MASK(map));
3750 end = vm_map_round_page(fourk_end, VM_MAP_PAGE_MASK(map));
3751 size = end - start;
3752
3753 if (anywhere) {
3754 return KERN_NOT_SUPPORTED;
3755 } else {
3756 /*
3757 * Verify that:
3758 * the address doesn't itself violate
3759 * the mask requirement.
3760 */
3761
3762 vm_map_lock(map);
3763 map_locked = TRUE;
3764 if ((start & mask) != 0) {
3765 RETURN(KERN_NO_SPACE);
3766 }
3767
3768 /*
3769 * ... the address is within bounds
3770 */
3771
3772 end = start + size;
3773
3774 if ((start < effective_min_offset) ||
3775 (end > effective_max_offset) ||
3776 (start >= end)) {
3777 RETURN(KERN_INVALID_ADDRESS);
3778 }
3779
3780 /*
3781 * ... the starting address isn't allocated
3782 */
3783 if (vm_map_lookup_entry(map, start, &entry)) {
3784 vm_object_t cur_object, shadow_object;
3785
3786 /*
3787 			 * We might already have some 4K mappings
3788 			 * in this 16K page.
3789 */
3790
3791 if (entry->vme_end - entry->vme_start
3792 != SIXTEENK_PAGE_SIZE) {
3793 RETURN(KERN_NO_SPACE);
3794 }
3795 if (entry->is_sub_map) {
3796 RETURN(KERN_NO_SPACE);
3797 }
3798 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
3799 RETURN(KERN_NO_SPACE);
3800 }
3801
3802 /* go all the way down the shadow chain */
3803 cur_object = VME_OBJECT(entry);
3804 vm_object_lock(cur_object);
3805 while (cur_object->shadow != VM_OBJECT_NULL) {
3806 shadow_object = cur_object->shadow;
3807 vm_object_lock(shadow_object);
3808 vm_object_unlock(cur_object);
3809 cur_object = shadow_object;
3810 shadow_object = VM_OBJECT_NULL;
3811 }
3812 if (cur_object->internal ||
3813 cur_object->pager == NULL) {
3814 vm_object_unlock(cur_object);
3815 RETURN(KERN_NO_SPACE);
3816 }
3817 if (cur_object->pager->mo_pager_ops
3818 != &fourk_pager_ops) {
3819 vm_object_unlock(cur_object);
3820 RETURN(KERN_NO_SPACE);
3821 }
3822 fourk_object = cur_object;
3823 fourk_mem_obj = fourk_object->pager;
3824
3825 /* keep the "4K" object alive */
3826 vm_object_reference_locked(fourk_object);
3827 memory_object_reference(fourk_mem_obj);
3828 vm_object_unlock(fourk_object);
3829
3830 /* merge permissions */
3831 entry->protection |= cur_protection;
3832 entry->max_protection |= max_protection;
3833
3834 if ((entry->protection & VM_PROT_WRITE) &&
3835 (entry->protection & VM_PROT_ALLEXEC) &&
3836 fourk_binary_compatibility_unsafe &&
3837 fourk_binary_compatibility_allow_wx) {
3838 /* write+execute: need to be "jit" */
3839 entry->used_for_jit = TRUE;
3840 }
3841 goto map_in_fourk_pager;
3842 }
3843
3844 /*
3845 * ... the next region doesn't overlap the
3846 * end point.
3847 */
3848
3849 if ((entry->vme_next != vm_map_to_entry(map)) &&
3850 (entry->vme_next->vme_start < end)) {
3851 RETURN(KERN_NO_SPACE);
3852 }
3853 }
3854
3855 /*
3856 		 * At this point:
3857 		 * "start" and "end" should define the endpoints of the
3858 		 * available new range,
3859 		 * "entry" should refer to the region before the new
3860 		 * range, and
3861 		 * the map should be locked.
3862 		 */
3864
3865 /* create a new "4K" pager */
3866 fourk_mem_obj = fourk_pager_create();
3867 fourk_object = fourk_pager_to_vm_object(fourk_mem_obj);
3868 assert(fourk_object);
3869
3870 /* keep the "4" object alive */
3871 vm_object_reference(fourk_object);
3872
3873 /* create a "copy" object, to map the "4K" object copy-on-write */
3874 fourk_copy = TRUE;
3875 result = vm_object_copy_strategically(fourk_object,
3876 0,
3877 end - start,
3878 false, /* forking */
3879 		    &copy_object,
3880 		    &copy_offset,
3881 &fourk_copy);
3882 assert(result == KERN_SUCCESS);
3883 assert(copy_object != VM_OBJECT_NULL);
3884 assert(copy_offset == 0);
3885
3886 /* map the "4K" pager's copy object */
3887 new_entry = vm_map_entry_insert(map,
3888 entry,
3889 vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)),
3890 vm_map_round_page(end, VM_MAP_PAGE_MASK(map)),
3891 copy_object,
3892 0, /* offset */
3893 vmk_flags,
3894 FALSE, /* needs_copy */
3895 cur_protection, max_protection,
3896 (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3897 VM_INHERIT_NONE : inheritance),
3898 clear_map_aligned);
3899 entry = new_entry;
3900
3901 #if VM_MAP_DEBUG_FOURK
3902 if (vm_map_debug_fourk) {
3903 printf("FOURK_PAGER: map %p [0x%llx:0x%llx] new pager %p\n",
3904 map,
3905 (uint64_t) entry->vme_start,
3906 (uint64_t) entry->vme_end,
3907 fourk_mem_obj);
3908 }
3909 #endif /* VM_MAP_DEBUG_FOURK */
3910
3911 new_mapping_established = TRUE;
3912
3913 map_in_fourk_pager:
3914 /* "map" the original "object" where it belongs in the "4K" pager */
3915 fourk_pager_offset = (fourk_start & SIXTEENK_PAGE_MASK);
3916 fourk_pager_index_start = (int) (fourk_pager_offset / FOURK_PAGE_SIZE);
3917 if (fourk_size > SIXTEENK_PAGE_SIZE) {
3918 fourk_pager_index_num = 4;
3919 } else {
3920 fourk_pager_index_num = (int) (fourk_size / FOURK_PAGE_SIZE);
3921 }
3922 if (fourk_pager_index_start + fourk_pager_index_num > 4) {
3923 fourk_pager_index_num = 4 - fourk_pager_index_start;
3924 }
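/*
 * Worked example: mapping fourk_size 0x2000 at a fourk_start that is
 * 0x1000 into its 16K page gives fourk_pager_offset == 0x1000, so
 * fourk_pager_index_start == 1 and fourk_pager_index_num == 2: 4K
 * slots 1 and 2 of the pager get populated by the loop below.
 */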
3925 for (cur_idx = 0;
3926 cur_idx < fourk_pager_index_num;
3927 cur_idx++) {
3928 vm_object_t old_object;
3929 vm_object_offset_t old_offset;
3930
3931 kr = fourk_pager_populate(fourk_mem_obj,
3932 TRUE, /* overwrite */
3933 fourk_pager_index_start + cur_idx,
3934 object,
3935 (object
3936 ? (offset +
3937 (cur_idx * FOURK_PAGE_SIZE))
3938 : 0),
3939 &old_object,
3940 &old_offset);
3941 #if VM_MAP_DEBUG_FOURK
3942 if (vm_map_debug_fourk) {
3943 if (old_object == (vm_object_t) -1 &&
3944 old_offset == (vm_object_offset_t) -1) {
3945 printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3946 "pager [%p:0x%llx] "
3947 "populate[%d] "
3948 "[object:%p,offset:0x%llx]\n",
3949 map,
3950 (uint64_t) entry->vme_start,
3951 (uint64_t) entry->vme_end,
3952 fourk_mem_obj,
3953 VME_OFFSET(entry),
3954 fourk_pager_index_start + cur_idx,
3955 object,
3956 (object
3957 ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3958 : 0));
3959 } else {
3960 printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3961 "pager [%p:0x%llx] "
3962 "populate[%d] [object:%p,offset:0x%llx] "
3963 "old [%p:0x%llx]\n",
3964 map,
3965 (uint64_t) entry->vme_start,
3966 (uint64_t) entry->vme_end,
3967 fourk_mem_obj,
3968 VME_OFFSET(entry),
3969 fourk_pager_index_start + cur_idx,
3970 object,
3971 (object
3972 ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3973 : 0),
3974 old_object,
3975 old_offset);
3976 }
3977 }
3978 #endif /* VM_MAP_DEBUG_FOURK */
3979
3980 assert(kr == KERN_SUCCESS);
3981 if (object != old_object &&
3982 object != VM_OBJECT_NULL &&
3983 object != (vm_object_t) -1) {
3984 vm_object_reference(object);
3985 }
3986 if (object != old_object &&
3987 old_object != VM_OBJECT_NULL &&
3988 old_object != (vm_object_t) -1) {
3989 vm_object_deallocate(old_object);
3990 }
3991 }
3992
3993 BailOut:
3994 assert(map_locked == TRUE);
3995
3996 if (result == KERN_SUCCESS) {
3997 vm_prot_t pager_prot;
3998 memory_object_t pager;
3999
4000 #if DEBUG
4001 if (pmap_empty &&
4002 !(vmk_flags.vmkf_no_pmap_check)) {
4003 assert(pmap_is_empty(map->pmap,
4004 *address,
4005 *address + size));
4006 }
4007 #endif /* DEBUG */
4008
4009 /*
4010 * For "named" VM objects, let the pager know that the
4011 * memory object is being mapped. Some pagers need to keep
4012 * track of this, to know when they can reclaim the memory
4013 * object, for example.
4014 * VM calls memory_object_map() for each mapping (specifying
4015 * the protection of each mapping) and calls
4016 * memory_object_last_unmap() when all the mappings are gone.
4017 */
4018 pager_prot = max_protection;
4019 if (needs_copy) {
4020 /*
4021 * Copy-On-Write mapping: won't modify
4022 * the memory object.
4023 */
4024 pager_prot &= ~VM_PROT_WRITE;
4025 }
4026 if (!is_submap &&
4027 object != VM_OBJECT_NULL &&
4028 object->named &&
4029 object->pager != MEMORY_OBJECT_NULL) {
4030 vm_object_lock(object);
4031 pager = object->pager;
4032 if (object->named &&
4033 pager != MEMORY_OBJECT_NULL) {
4034 assert(object->pager_ready);
4035 vm_object_mapping_wait(object, THREAD_UNINT);
4036 vm_object_mapping_begin(object);
4037 vm_object_unlock(object);
4038
4039 kr = memory_object_map(pager, pager_prot);
4040 assert(kr == KERN_SUCCESS);
4041
4042 vm_object_lock(object);
4043 vm_object_mapping_end(object);
4044 }
4045 vm_object_unlock(object);
4046 }
4047 if (!is_submap &&
4048 fourk_object != VM_OBJECT_NULL &&
4049 fourk_object->named &&
4050 fourk_object->pager != MEMORY_OBJECT_NULL) {
4051 vm_object_lock(fourk_object);
4052 pager = fourk_object->pager;
4053 if (fourk_object->named &&
4054 pager != MEMORY_OBJECT_NULL) {
4055 assert(fourk_object->pager_ready);
4056 vm_object_mapping_wait(fourk_object,
4057 THREAD_UNINT);
4058 vm_object_mapping_begin(fourk_object);
4059 vm_object_unlock(fourk_object);
4060
4061 kr = memory_object_map(pager, VM_PROT_READ);
4062 assert(kr == KERN_SUCCESS);
4063
4064 vm_object_lock(fourk_object);
4065 vm_object_mapping_end(fourk_object);
4066 }
4067 vm_object_unlock(fourk_object);
4068 }
4069 }
4070
4071 if (fourk_object != VM_OBJECT_NULL) {
4072 vm_object_deallocate(fourk_object);
4073 fourk_object = VM_OBJECT_NULL;
4074 memory_object_deallocate(fourk_mem_obj);
4075 fourk_mem_obj = MEMORY_OBJECT_NULL;
4076 }
4077
4078 assert(map_locked == TRUE);
4079
4080 if (!keep_map_locked) {
4081 vm_map_unlock(map);
4082 map_locked = FALSE;
4083 }
4084
4085 /*
4086 * We can't hold the map lock if we enter this block.
4087 */
4088
4089 if (result == KERN_SUCCESS) {
4090 /* Wire down the new entry if the user
4091 * requested all new map entries be wired.
4092 */
4093 if ((map->wiring_required) || (superpage_size)) {
4094 assert(!keep_map_locked);
4095 pmap_empty = FALSE; /* pmap won't be empty */
4096 kr = vm_map_wire_kernel(map, start, end,
4097 new_entry->protection, VM_KERN_MEMORY_MLOCK,
4098 TRUE);
4099 result = kr;
4100 }
4101
4102 }
4103
4104 if (result != KERN_SUCCESS) {
4105 if (new_mapping_established) {
4106 /*
4107 * We have to get rid of the new mappings since we
4108 * won't make them available to the user.
4109 			 * Try to do that atomically, to minimize the risk
4110 			 * that someone else creates new mappings in that range.
4111 */
4112
4113 if (!map_locked) {
4114 vm_map_lock(map);
4115 map_locked = TRUE;
4116 }
4117 (void)vm_map_delete(map, *address, *address + size,
4118 VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_NO_YIELD,
4119 KMEM_GUARD_NONE, &zap_list);
4120 }
4121 }
4122
4123 /*
4124 * The caller is responsible for releasing the lock if it requested to
4125 * keep the map locked.
4126 */
4127 if (map_locked && !keep_map_locked) {
4128 vm_map_unlock(map);
4129 }
4130
4131 vm_map_zap_dispose(&zap_list);
4132
4133 return result;
4134
4135 #undef RETURN
4136 }
4137 #endif /* __arm64__ */
4138
4139 /*
4140 * Counters for the prefault optimization.
4141 */
4142 int64_t vm_prefault_nb_pages = 0;
4143 int64_t vm_prefault_nb_bailout = 0;
4144
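/*
 * vm_map_enter_mem_object_helper:
 *
 * Common implementation for mapping into "target_map" whatever "port"
 * represents: a named entry (backed by a VM object, a submap or a
 * vm_map_copy) or a raw memory object.  When a "page_list" of
 * up-to-date UPL pages is supplied, those pages are prefaulted into
 * the new mapping.
 */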
4145 static kern_return_t
4146 vm_map_enter_mem_object_helper(
4147 vm_map_t target_map,
4148 vm_map_offset_t *address,
4149 vm_map_size_t initial_size,
4150 vm_map_offset_t mask,
4151 vm_map_kernel_flags_t vmk_flags,
4152 ipc_port_t port,
4153 vm_object_offset_t offset,
4154 boolean_t copy,
4155 vm_prot_t cur_protection,
4156 vm_prot_t max_protection,
4157 vm_inherit_t inheritance,
4158 upl_page_list_ptr_t page_list,
4159 unsigned int page_list_count)
4160 {
4161 vm_map_address_t map_addr;
4162 vm_map_size_t map_size;
4163 vm_object_t object;
4164 vm_object_size_t size;
4165 kern_return_t result;
4166 boolean_t mask_cur_protection, mask_max_protection;
4167 boolean_t kernel_prefault, try_prefault = (page_list_count != 0);
4168 vm_map_offset_t offset_in_mapping = 0;
4169 #if __arm64__
4170 boolean_t fourk = vmk_flags.vmkf_fourk;
4171 #endif /* __arm64__ */
4172
4173 if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4174 /* XXX TODO4K prefaulting depends on page size... */
4175 try_prefault = FALSE;
4176 }
4177
4178 assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
4179 vm_map_kernel_flags_update_range_id(&vmk_flags, target_map);
4180
4181 mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4182 mask_max_protection = max_protection & VM_PROT_IS_MASK;
4183 cur_protection &= ~VM_PROT_IS_MASK;
4184 max_protection &= ~VM_PROT_IS_MASK;
4185
4186 /*
4187 * Check arguments for validity
4188 */
4189 if ((target_map == VM_MAP_NULL) ||
4190 (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4191 (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4192 (inheritance > VM_INHERIT_LAST_VALID) ||
4193 (try_prefault && (copy || !page_list)) ||
4194 initial_size == 0) {
4195 return KERN_INVALID_ARGUMENT;
4196 }
4197
4198 if (__improbable((cur_protection & max_protection) != cur_protection)) {
4199 /* cur is more permissive than max */
4200 cur_protection &= max_protection;
4201 }
4202
4203 #if __arm64__
4204 if (cur_protection & VM_PROT_EXECUTE) {
4205 cur_protection |= VM_PROT_READ;
4206 }
4207
4208 if (fourk && VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4209 /* no "fourk" if map is using a sub-page page size */
4210 fourk = FALSE;
4211 }
4212 if (fourk) {
4213 map_addr = vm_map_trunc_page(*address, FOURK_PAGE_MASK);
4214 map_size = vm_map_round_page(initial_size, FOURK_PAGE_MASK);
4215 } else
4216 #endif /* __arm64__ */
4217 {
4218 map_addr = vm_map_trunc_page(*address,
4219 VM_MAP_PAGE_MASK(target_map));
4220 map_size = vm_map_round_page(initial_size,
4221 VM_MAP_PAGE_MASK(target_map));
4222 }
4223 if (map_size == 0) {
4224 return KERN_INVALID_ARGUMENT;
4225 }
4226 size = vm_object_round_page(initial_size);
4227
4228 /*
4229 * Find the vm object (if any) corresponding to this port.
4230 */
4231 if (!IP_VALID(port)) {
4232 object = VM_OBJECT_NULL;
4233 offset = 0;
4234 copy = FALSE;
4235 } else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4236 vm_named_entry_t named_entry;
4237 vm_object_offset_t data_offset;
4238
4239 named_entry = mach_memory_entry_from_port(port);
4240
4241 if (vmk_flags.vmf_return_data_addr ||
4242 vmk_flags.vmf_return_4k_data_addr) {
4243 data_offset = named_entry->data_offset;
4244 offset += named_entry->data_offset;
4245 } else {
4246 data_offset = 0;
4247 }
4248
4249 		/* a few checks to make sure the user is obeying the rules */
4250 if (mask_max_protection) {
4251 max_protection &= named_entry->protection;
4252 }
4253 if (mask_cur_protection) {
4254 cur_protection &= named_entry->protection;
4255 }
4256 if ((named_entry->protection & max_protection) !=
4257 max_protection) {
4258 return KERN_INVALID_RIGHT;
4259 }
4260 if ((named_entry->protection & cur_protection) !=
4261 cur_protection) {
4262 return KERN_INVALID_RIGHT;
4263 }
4264 if (offset + size <= offset) {
4265 /* overflow */
4266 return KERN_INVALID_ARGUMENT;
4267 }
4268 if (named_entry->size < (offset + initial_size)) {
4269 return KERN_INVALID_ARGUMENT;
4270 }
4271
4272 if (named_entry->is_copy) {
4273 /* for a vm_map_copy, we can only map it whole */
4274 if ((size != named_entry->size) &&
4275 (vm_map_round_page(size,
4276 VM_MAP_PAGE_MASK(target_map)) ==
4277 named_entry->size)) {
4278 /* XXX FBDP use the rounded size... */
4279 size = vm_map_round_page(
4280 size,
4281 VM_MAP_PAGE_MASK(target_map));
4282 }
4283 }
4284
4285 		/* the caller's "offset" parameter is relative to the start of */
4286 		/* the named entry; add the named entry's own offset in the object */
4287 offset = offset + named_entry->offset;
4288
4289 if (!VM_MAP_PAGE_ALIGNED(size,
4290 VM_MAP_PAGE_MASK(target_map))) {
4291 /*
4292 * Let's not map more than requested;
4293 * vm_map_enter() will handle this "not map-aligned"
4294 * case.
4295 */
4296 map_size = size;
4297 }
4298
4299 named_entry_lock(named_entry);
4300 if (named_entry->is_sub_map) {
4301 vm_map_t submap;
4302
4303 if (vmk_flags.vmf_return_data_addr ||
4304 vmk_flags.vmf_return_4k_data_addr) {
4305 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4306 }
4307
4308 submap = named_entry->backing.map;
4309 vm_map_reference(submap);
4310 named_entry_unlock(named_entry);
4311
4312 vmk_flags.vmkf_submap = TRUE;
4313
4314 result = vm_map_enter(target_map,
4315 &map_addr,
4316 map_size,
4317 mask,
4318 vmk_flags,
4319 (vm_object_t)(uintptr_t) submap,
4320 offset,
4321 copy,
4322 cur_protection,
4323 max_protection,
4324 inheritance);
4325 if (result != KERN_SUCCESS) {
4326 vm_map_deallocate(submap);
4327 } else {
4328 /*
4329 * No need to lock "submap" just to check its
4330 * "mapped" flag: that flag is never reset
4331 * once it's been set and if we race, we'll
4332 * just end up setting it twice, which is OK.
4333 */
4334 if (submap->mapped_in_other_pmaps == FALSE &&
4335 vm_map_pmap(submap) != PMAP_NULL &&
4336 vm_map_pmap(submap) !=
4337 vm_map_pmap(target_map)) {
4338 /*
4339 * This submap is being mapped in a map
4340 * that uses a different pmap.
4341 * Set its "mapped_in_other_pmaps" flag
4342 * to indicate that we now need to
4343 * remove mappings from all pmaps rather
4344 * than just the submap's pmap.
4345 */
4346 vm_map_lock(submap);
4347 submap->mapped_in_other_pmaps = TRUE;
4348 vm_map_unlock(submap);
4349 }
4350 *address = map_addr;
4351 }
4352 return result;
4353 } else if (named_entry->is_copy) {
4354 kern_return_t kr;
4355 vm_map_copy_t copy_map;
4356 vm_map_entry_t copy_entry;
4357 vm_map_offset_t copy_addr;
4358 vm_map_copy_t target_copy_map;
4359 vm_map_offset_t overmap_start, overmap_end;
4360 vm_map_offset_t trimmed_start;
4361 vm_map_size_t target_size;
4362
4363 if (!vm_map_kernel_flags_check_vmflags(vmk_flags,
4364 (VM_FLAGS_FIXED |
4365 VM_FLAGS_ANYWHERE |
4366 VM_FLAGS_OVERWRITE |
4367 VM_FLAGS_RETURN_4K_DATA_ADDR |
4368 VM_FLAGS_RETURN_DATA_ADDR))) {
4369 named_entry_unlock(named_entry);
4370 return KERN_INVALID_ARGUMENT;
4371 }
4372
4373 copy_map = named_entry->backing.copy;
4374 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4375 if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4376 /* unsupported type; should not happen */
4377 printf("vm_map_enter_mem_object: "
4378 "memory_entry->backing.copy "
4379 "unsupported type 0x%x\n",
4380 copy_map->type);
4381 named_entry_unlock(named_entry);
4382 return KERN_INVALID_ARGUMENT;
4383 }
4384
4385 if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4386 DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, offset, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4387 }
4388
4389 if (vmk_flags.vmf_return_data_addr ||
4390 vmk_flags.vmf_return_4k_data_addr) {
4391 offset_in_mapping = offset & VM_MAP_PAGE_MASK(target_map);
4392 if (vmk_flags.vmf_return_4k_data_addr) {
4393 offset_in_mapping &= ~((signed)(0xFFF));
4394 }
4395 }
4396
4397 target_copy_map = VM_MAP_COPY_NULL;
4398 target_size = copy_map->size;
4399 overmap_start = 0;
4400 overmap_end = 0;
4401 trimmed_start = 0;
4402 if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4403 DEBUG4K_ADJUST("adjusting...\n");
4404 kr = vm_map_copy_adjust_to_target(
4405 copy_map,
4406 offset /* includes data_offset */,
4407 initial_size,
4408 target_map,
4409 copy,
4410 &target_copy_map,
4411 &overmap_start,
4412 &overmap_end,
4413 &trimmed_start);
4414 if (kr != KERN_SUCCESS) {
4415 named_entry_unlock(named_entry);
4416 return kr;
4417 }
4418 target_size = target_copy_map->size;
4419 if (trimmed_start >= data_offset) {
4420 data_offset = offset & VM_MAP_PAGE_MASK(target_map);
4421 } else {
4422 data_offset -= trimmed_start;
4423 }
4424 } else {
4425 /*
4426 * Assert that the vm_map_copy is coming from the right
4427 * zone and hasn't been forged
4428 */
4429 vm_map_copy_require(copy_map);
4430 target_copy_map = copy_map;
4431 }
4432
4433 vm_map_kernel_flags_t rsv_flags = vmk_flags;
4434
4435 vm_map_kernel_flags_and_vmflags(&rsv_flags,
4436 (VM_FLAGS_FIXED |
4437 VM_FLAGS_ANYWHERE |
4438 VM_FLAGS_OVERWRITE |
4439 VM_FLAGS_RETURN_4K_DATA_ADDR |
4440 VM_FLAGS_RETURN_DATA_ADDR));
4441
4442 /* reserve a contiguous range */
4443 kr = vm_map_enter(target_map,
4444 &map_addr,
4445 vm_map_round_page(target_size, VM_MAP_PAGE_MASK(target_map)),
4446 mask,
4447 rsv_flags,
4448 VM_OBJECT_NULL,
4449 0,
4450 FALSE, /* copy */
4451 cur_protection,
4452 max_protection,
4453 inheritance);
4454 if (kr != KERN_SUCCESS) {
4455 DEBUG4K_ERROR("kr 0x%x\n", kr);
4456 if (target_copy_map != copy_map) {
4457 vm_map_copy_discard(target_copy_map);
4458 target_copy_map = VM_MAP_COPY_NULL;
4459 }
4460 named_entry_unlock(named_entry);
4461 return kr;
4462 }
4463
4464 copy_addr = map_addr;
4465
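			/*
			 * Over-map each entry of the vm_map_copy into the
			 * range reserved above, advancing "copy_addr"
			 * through the destination as we go.
			 */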
4466 for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4467 copy_entry != vm_map_copy_to_entry(target_copy_map);
4468 copy_entry = copy_entry->vme_next) {
4469 vm_map_t copy_submap = VM_MAP_NULL;
4470 vm_object_t copy_object = VM_OBJECT_NULL;
4471 vm_map_size_t copy_size;
4472 vm_object_offset_t copy_offset;
4473 boolean_t do_copy = false;
4474
4475 if (copy_entry->is_sub_map) {
4476 copy_submap = VME_SUBMAP(copy_entry);
4477 copy_object = (vm_object_t)copy_submap;
4478 } else {
4479 copy_object = VME_OBJECT(copy_entry);
4480 }
4481 copy_offset = VME_OFFSET(copy_entry);
4482 copy_size = (copy_entry->vme_end -
4483 copy_entry->vme_start);
4484
4485 /* sanity check */
4486 if ((copy_addr + copy_size) >
4487 (map_addr +
4488 overmap_start + overmap_end +
4489 named_entry->size /* XXX full size */)) {
4490 /* over-mapping too much !? */
4491 kr = KERN_INVALID_ARGUMENT;
4492 DEBUG4K_ERROR("kr 0x%x\n", kr);
4493 /* abort */
4494 break;
4495 }
4496
4497 /* take a reference on the object */
4498 if (copy_entry->is_sub_map) {
4499 vm_map_reference(copy_submap);
4500 } else {
4501 if (!copy &&
4502 copy_object != VM_OBJECT_NULL &&
4503 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4504 bool is_writable;
4505
4506 /*
4507 * We need to resolve our side of this
4508 * "symmetric" copy-on-write now; we
4509 * need a new object to map and share,
4510 * instead of the current one which
4511 * might still be shared with the
4512 * original mapping.
4513 *
4514 * Note: A "vm_map_copy_t" does not
4515 * have a lock but we're protected by
4516 * the named entry's lock here.
4517 */
4518 // assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4519 VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4520 assert(copy_object != VME_OBJECT(copy_entry));
4521 is_writable = false;
4522 if (copy_entry->protection & VM_PROT_WRITE) {
4523 is_writable = true;
4524 #if __arm64e__
4525 } else if (copy_entry->used_for_tpro) {
4526 is_writable = true;
4527 #endif /* __arm64e__ */
4528 }
4529 if (!copy_entry->needs_copy && is_writable) {
4530 vm_prot_t prot;
4531
4532 prot = copy_entry->protection & ~VM_PROT_WRITE;
4533 vm_object_pmap_protect(copy_object,
4534 copy_offset,
4535 copy_size,
4536 PMAP_NULL,
4537 PAGE_SIZE,
4538 0,
4539 prot);
4540 }
4541 copy_entry->needs_copy = FALSE;
4542 copy_entry->is_shared = TRUE;
4543 copy_object = VME_OBJECT(copy_entry);
4544 copy_offset = VME_OFFSET(copy_entry);
4545 vm_object_lock(copy_object);
4546 /* we're about to make a shared mapping of this object */
4547 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4548 VM_OBJECT_SET_TRUE_SHARE(copy_object, TRUE);
4549 vm_object_unlock(copy_object);
4550 }
4551
4552 if (copy_object != VM_OBJECT_NULL &&
4553 copy_object->named &&
4554 copy_object->pager != MEMORY_OBJECT_NULL &&
4555 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4556 memory_object_t pager;
4557 vm_prot_t pager_prot;
4558
4559 /*
4560 * For "named" VM objects, let the pager know that the
4561 * memory object is being mapped. Some pagers need to keep
4562 * track of this, to know when they can reclaim the memory
4563 * object, for example.
4564 * VM calls memory_object_map() for each mapping (specifying
4565 * the protection of each mapping) and calls
4566 * memory_object_last_unmap() when all the mappings are gone.
4567 */
4568 pager_prot = max_protection;
4569 if (copy) {
4570 /*
4571 * Copy-On-Write mapping: won't modify the
4572 * memory object.
4573 */
4574 pager_prot &= ~VM_PROT_WRITE;
4575 }
4576 vm_object_lock(copy_object);
4577 pager = copy_object->pager;
4578 if (copy_object->named &&
4579 pager != MEMORY_OBJECT_NULL &&
4580 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4581 assert(copy_object->pager_ready);
4582 vm_object_mapping_wait(copy_object, THREAD_UNINT);
4583 vm_object_mapping_begin(copy_object);
4584 vm_object_unlock(copy_object);
4585
4586 kr = memory_object_map(pager, pager_prot);
4587 assert(kr == KERN_SUCCESS);
4588
4589 vm_object_lock(copy_object);
4590 vm_object_mapping_end(copy_object);
4591 }
4592 vm_object_unlock(copy_object);
4593 }
4594
4595 /*
4596 * Perform the copy if requested
4597 */
4598
4599 if (copy && copy_object != VM_OBJECT_NULL) {
4600 vm_object_t new_object;
4601 vm_object_offset_t new_offset;
4602
4603 result = vm_object_copy_strategically(copy_object, copy_offset,
4604 copy_size,
4605 false, /* forking */
4606 &new_object, &new_offset,
4607 &do_copy);
4608
4609
4610 if (result == KERN_MEMORY_RESTART_COPY) {
4611 boolean_t success;
4612 boolean_t src_needs_copy;
4613
4614 /*
4615 * XXX
4616 * We currently ignore src_needs_copy.
4617 * This really is the issue of how to make
4618 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4619 * non-kernel users to use. Solution forthcoming.
4620 * In the meantime, since we don't allow non-kernel
4621 * memory managers to specify symmetric copy,
4622 * we won't run into problems here.
4623 */
4624 new_object = copy_object;
4625 new_offset = copy_offset;
4626 success = vm_object_copy_quickly(new_object,
4627 new_offset,
4628 copy_size,
4629 &src_needs_copy,
4630 &do_copy);
4631 assert(success);
4632 result = KERN_SUCCESS;
4633 }
4634 if (result != KERN_SUCCESS) {
4635 kr = result;
4636 break;
4637 }
4638
4639 copy_object = new_object;
4640 copy_offset = new_offset;
4641 /*
4642 * No extra object reference for the mapping:
4643 * the mapping should be the only thing keeping
4644 * this new object alive.
4645 */
4646 } else {
4647 /*
4648 * We already have the right object
4649 * to map.
4650 */
4651 copy_object = VME_OBJECT(copy_entry);
4652 /* take an extra ref for the mapping below */
4653 vm_object_reference(copy_object);
4654 }
4655 }
4656
4657 /*
4658 * If the caller does not want a specific
4659 * tag for this new mapping: use
4660 * the tag of the original mapping.
4661 */
4662 vm_map_kernel_flags_t vmk_remap_flags = {
4663 .vmkf_submap = copy_entry->is_sub_map,
4664 };
4665
4666 vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
4667 vm_map_kernel_flags_vmflags(vmk_flags),
4668 vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4669
4670 /* over-map the object into destination */
4671 vmk_remap_flags.vmf_fixed = true;
4672 vmk_remap_flags.vmf_overwrite = true;
4673
4674 if (!copy && !copy_entry->is_sub_map) {
4675 /*
4676 * copy-on-write should have been
4677 * resolved at this point, or we would
4678 * end up sharing instead of copying.
4679 */
4680 assert(!copy_entry->needs_copy);
4681 }
4682 #if XNU_TARGET_OS_OSX
4683 if (copy_entry->used_for_jit) {
4684 vmk_remap_flags.vmkf_map_jit = TRUE;
4685 }
4686 #endif /* XNU_TARGET_OS_OSX */
4687
4688 kr = vm_map_enter(target_map,
4689 			    &copy_addr,
4690 copy_size,
4691 (vm_map_offset_t) 0,
4692 vmk_remap_flags,
4693 copy_object,
4694 copy_offset,
4695 ((copy_object == NULL)
4696 ? FALSE
4697 : (copy || copy_entry->needs_copy)),
4698 cur_protection,
4699 max_protection,
4700 inheritance);
4701 if (kr != KERN_SUCCESS) {
4702 DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4703 if (copy_entry->is_sub_map) {
4704 vm_map_deallocate(copy_submap);
4705 } else {
4706 vm_object_deallocate(copy_object);
4707 }
4708 /* abort */
4709 break;
4710 }
4711
4712 /* next mapping */
4713 copy_addr += copy_size;
4714 }
4715
4716 if (kr == KERN_SUCCESS) {
4717 if (vmk_flags.vmf_return_data_addr ||
4718 vmk_flags.vmf_return_4k_data_addr) {
4719 *address = map_addr + offset_in_mapping;
4720 } else {
4721 *address = map_addr;
4722 }
4723 if (overmap_start) {
4724 *address += overmap_start;
4725 DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t) offset_in_mapping, (uint64_t)overmap_start, (uint64_t)*address);
4726 }
4727 }
4728 named_entry_unlock(named_entry);
4729 if (target_copy_map != copy_map) {
4730 vm_map_copy_discard(target_copy_map);
4731 target_copy_map = VM_MAP_COPY_NULL;
4732 }
4733
4734 if (kr != KERN_SUCCESS && !vmk_flags.vmf_overwrite) {
4735 /* deallocate the contiguous range */
4736 (void) vm_deallocate(target_map,
4737 map_addr,
4738 map_size);
4739 }
4740
4741 return kr;
4742 }
4743
4744 if (named_entry->is_object) {
4745 unsigned int access;
4746 unsigned int wimg_mode;
4747
4748 /* we are mapping a VM object */
4749
4750 access = named_entry->access;
4751
4752 if (vmk_flags.vmf_return_data_addr ||
4753 vmk_flags.vmf_return_4k_data_addr) {
4754 offset_in_mapping = offset - VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4755 if (vmk_flags.vmf_return_4k_data_addr) {
4756 offset_in_mapping &= ~((signed)(0xFFF));
4757 }
4758 offset = VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4759 map_size = VM_MAP_ROUND_PAGE((offset + offset_in_mapping + initial_size) - offset, VM_MAP_PAGE_MASK(target_map));
4760 }
4761
4762 object = vm_named_entry_to_vm_object(named_entry);
4763 assert(object != VM_OBJECT_NULL);
4764 vm_object_lock(object);
4765 named_entry_unlock(named_entry);
4766
4767 vm_object_reference_locked(object);
4768
4769 wimg_mode = object->wimg_bits;
4770 vm_prot_to_wimg(access, &wimg_mode);
4771 if (object->wimg_bits != wimg_mode) {
4772 vm_object_change_wimg_mode(object, wimg_mode);
4773 }
4774
4775 vm_object_unlock(object);
4776 } else {
4777 panic("invalid VM named entry %p", named_entry);
4778 }
4779 } else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4780 /*
4781 * JMM - This is temporary until we unify named entries
4782 * and raw memory objects.
4783 *
4784 * Detected fake ip_kotype for a memory object. In
4785 * this case, the port isn't really a port at all, but
4786 * instead is just a raw memory object.
4787 */
4788 if (vmk_flags.vmf_return_data_addr ||
4789 vmk_flags.vmf_return_4k_data_addr) {
4790 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4791 }
4792
4793 object = memory_object_to_vm_object((memory_object_t)port);
4794 if (object == VM_OBJECT_NULL) {
4795 return KERN_INVALID_OBJECT;
4796 }
4797 vm_object_reference(object);
4798
4799 /* wait for object (if any) to be ready */
4800 if (object != VM_OBJECT_NULL) {
4801 if (is_kernel_object(object)) {
4802 printf("Warning: Attempt to map kernel object"
4803 " by a non-private kernel entity\n");
4804 return KERN_INVALID_OBJECT;
4805 }
4806 if (!object->pager_ready) {
4807 vm_object_lock(object);
4808
4809 while (!object->pager_ready) {
4810 vm_object_wait(object,
4811 VM_OBJECT_EVENT_PAGER_READY,
4812 THREAD_UNINT);
4813 vm_object_lock(object);
4814 }
4815 vm_object_unlock(object);
4816 }
4817 }
4818 } else {
4819 return KERN_INVALID_OBJECT;
4820 }
4821
4822 if (object != VM_OBJECT_NULL &&
4823 object->named &&
4824 object->pager != MEMORY_OBJECT_NULL &&
4825 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4826 memory_object_t pager;
4827 vm_prot_t pager_prot;
4828 kern_return_t kr;
4829
4830 /*
4831 * For "named" VM objects, let the pager know that the
4832 * memory object is being mapped. Some pagers need to keep
4833 * track of this, to know when they can reclaim the memory
4834 * object, for example.
4835 * VM calls memory_object_map() for each mapping (specifying
4836 * the protection of each mapping) and calls
4837 * memory_object_last_unmap() when all the mappings are gone.
4838 */
4839 pager_prot = max_protection;
4840 if (copy) {
4841 /*
4842 * Copy-On-Write mapping: won't modify the
4843 * memory object.
4844 */
4845 pager_prot &= ~VM_PROT_WRITE;
4846 }
4847 vm_object_lock(object);
4848 pager = object->pager;
4849 if (object->named &&
4850 pager != MEMORY_OBJECT_NULL &&
4851 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4852 assert(object->pager_ready);
4853 vm_object_mapping_wait(object, THREAD_UNINT);
4854 vm_object_mapping_begin(object);
4855 vm_object_unlock(object);
4856
4857 kr = memory_object_map(pager, pager_prot);
4858 assert(kr == KERN_SUCCESS);
4859
4860 vm_object_lock(object);
4861 vm_object_mapping_end(object);
4862 }
4863 vm_object_unlock(object);
4864 }
4865
4866 /*
4867 * Perform the copy if requested
4868 */
4869
4870 if (copy) {
4871 vm_object_t new_object;
4872 vm_object_offset_t new_offset;
4873
4874 result = vm_object_copy_strategically(object, offset,
4875 map_size,
4876 false, /* forking */
4877 &new_object, &new_offset,
4878 		    &copy);
4879
4880
4881 if (result == KERN_MEMORY_RESTART_COPY) {
4882 boolean_t success;
4883 boolean_t src_needs_copy;
4884
4885 /*
4886 * XXX
4887 * We currently ignore src_needs_copy.
4888 * This really is the issue of how to make
4889 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4890 * non-kernel users to use. Solution forthcoming.
4891 * In the meantime, since we don't allow non-kernel
4892 * memory managers to specify symmetric copy,
4893 * we won't run into problems here.
4894 */
4895 new_object = object;
4896 new_offset = offset;
4897 success = vm_object_copy_quickly(new_object,
4898 new_offset,
4899 map_size,
4900 &src_needs_copy,
4901 			    &copy);
4902 assert(success);
4903 result = KERN_SUCCESS;
4904 }
4905 /*
4906 * Throw away the reference to the
4907 * original object, as it won't be mapped.
4908 */
4909
4910 vm_object_deallocate(object);
4911
4912 if (result != KERN_SUCCESS) {
4913 return result;
4914 }
4915
4916 object = new_object;
4917 offset = new_offset;
4918 }
4919
4920 /*
4921 * If non-kernel users want to try to prefault pages, the mapping and prefault
4922 	 * need to be atomic.
4923 */
4924 kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4925 vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
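	/*
	 * "vmkf_keep_map_locked" makes vm_map_enter() return with the map
	 * still locked, so the prefault loop below can run before anyone
	 * else touches the new range; we drop the lock once prefaulting
	 * is done.
	 */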
4926
4927 #if __arm64__
4928 if (fourk) {
4929 /* map this object in a "4K" pager */
4930 result = vm_map_enter_fourk(target_map,
4931 &map_addr,
4932 map_size,
4933 (vm_map_offset_t) mask,
4934 vmk_flags,
4935 object,
4936 offset,
4937 copy,
4938 cur_protection,
4939 max_protection,
4940 inheritance);
4941 } else
4942 #endif /* __arm64__ */
4943 {
4944 result = vm_map_enter(target_map,
4945 &map_addr, map_size,
4946 (vm_map_offset_t)mask,
4947 vmk_flags,
4948 object, offset,
4949 copy,
4950 cur_protection, max_protection,
4951 inheritance);
4952 }
4953 if (result != KERN_SUCCESS) {
4954 vm_object_deallocate(object);
4955 }
4956
4957 /*
4958 * Try to prefault, and do not forget to release the vm map lock.
4959 */
4960 if (result == KERN_SUCCESS && try_prefault) {
4961 mach_vm_address_t va = map_addr;
4962 kern_return_t kr = KERN_SUCCESS;
4963 unsigned int i = 0;
4964 int pmap_options;
4965
4966 pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4967 if (object->internal) {
4968 pmap_options |= PMAP_OPTIONS_INTERNAL;
4969 }
4970
4971 for (i = 0; i < page_list_count; ++i) {
4972 if (!UPL_VALID_PAGE(page_list, i)) {
4973 if (kernel_prefault) {
4974 assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4975 result = KERN_MEMORY_ERROR;
4976 break;
4977 }
4978 } else {
4979 /*
4980 				 * If this call fails, we stop trying to
4981 				 * optimize, since subsequent calls are likely
4982 				 * to fail too.
4983 				 *
4984 				 * We don't report an error for such a
4985 				 * failure, though: prefaulting is only an
4986 				 * optimization, not something critical.
4987 */
4988 kr = pmap_enter_options(target_map->pmap,
4989 va, UPL_PHYS_PAGE(page_list, i),
4990 cur_protection, VM_PROT_NONE,
4991 0, TRUE, pmap_options, NULL, PMAP_MAPPING_TYPE_INFER);
4992 if (kr != KERN_SUCCESS) {
4993 OSIncrementAtomic64(&vm_prefault_nb_bailout);
4994 if (kernel_prefault) {
4995 result = kr;
4996 }
4997 break;
4998 }
4999 OSIncrementAtomic64(&vm_prefault_nb_pages);
5000 }
5001
5002 /* Next virtual address */
5003 va += PAGE_SIZE;
5004 }
5005 if (vmk_flags.vmkf_keep_map_locked) {
5006 vm_map_unlock(target_map);
5007 }
5008 }
5009
5010 if (vmk_flags.vmf_return_data_addr ||
5011 vmk_flags.vmf_return_4k_data_addr) {
5012 *address = map_addr + offset_in_mapping;
5013 } else {
5014 *address = map_addr;
5015 }
5016 return result;
5017 }
5018
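/*
 * vm_map_enter_mem_object:
 *
 * Map the memory entry or memory object named by "port" into
 * "target_map", without prefaulting.  An illustrative call (the
 * variables, and the VM_MAP_KERNEL_FLAGS_ANYWHERE() initializer, are
 * assumptions here), mapping a memory entry read-only wherever there
 * is room:
 *
 *	vm_map_offset_t addr = 0;
 *	kr = vm_map_enter_mem_object(target_map, &addr, size, 0,
 *	    VM_MAP_KERNEL_FLAGS_ANYWHERE(), port, 0, FALSE,
 *	    VM_PROT_READ, VM_PROT_READ, VM_INHERIT_DEFAULT);
 */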
5019 kern_return_t
5020 vm_map_enter_mem_object(
5021 vm_map_t target_map,
5022 vm_map_offset_t *address,
5023 vm_map_size_t initial_size,
5024 vm_map_offset_t mask,
5025 vm_map_kernel_flags_t vmk_flags,
5026 ipc_port_t port,
5027 vm_object_offset_t offset,
5028 boolean_t copy,
5029 vm_prot_t cur_protection,
5030 vm_prot_t max_protection,
5031 vm_inherit_t inheritance)
5032 {
5033 kern_return_t ret;
5034
5035 /* range_id is set by vm_map_enter_mem_object_helper */
5036 ret = vm_map_enter_mem_object_helper(target_map,
5037 address,
5038 initial_size,
5039 mask,
5040 vmk_flags,
5041 port,
5042 offset,
5043 copy,
5044 cur_protection,
5045 max_protection,
5046 inheritance,
5047 NULL,
5048 0);
5049
5050 #if KASAN
5051 if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
5052 kasan_notify_address(*address, initial_size);
5053 }
5054 #endif
5055
5056 return ret;
5057 }
5058
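/*
 * vm_map_enter_mem_object_prefault:
 *
 * Same as vm_map_enter_mem_object(), but never copies, always uses
 * VM_INHERIT_DEFAULT, and prefaults the valid pages of "page_list"
 * into the new mapping (see the prefault loop in the helper above).
 */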
5059 kern_return_t
5060 vm_map_enter_mem_object_prefault(
5061 vm_map_t target_map,
5062 vm_map_offset_t *address,
5063 vm_map_size_t initial_size,
5064 vm_map_offset_t mask,
5065 vm_map_kernel_flags_t vmk_flags,
5066 ipc_port_t port,
5067 vm_object_offset_t offset,
5068 vm_prot_t cur_protection,
5069 vm_prot_t max_protection,
5070 upl_page_list_ptr_t page_list,
5071 unsigned int page_list_count)
5072 {
5073 kern_return_t ret;
5074
5075 /* range_id is set by vm_map_enter_mem_object_helper */
5076 ret = vm_map_enter_mem_object_helper(target_map,
5077 address,
5078 initial_size,
5079 mask,
5080 vmk_flags,
5081 port,
5082 offset,
5083 FALSE,
5084 cur_protection,
5085 max_protection,
5086 VM_INHERIT_DEFAULT,
5087 page_list,
5088 page_list_count);
5089
5090 #if KASAN
5091 if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
5092 kasan_notify_address(*address, initial_size);
5093 }
5094 #endif
5095
5096 return ret;
5097 }
5098
5099
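/*
 * vm_map_enter_mem_object_control:
 *
 * Same idea, but the object to map is designated by a memory object
 * "control" handle rather than a port, so there is no named-entry or
 * submap handling here.
 */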
5100 kern_return_t
5101 vm_map_enter_mem_object_control(
5102 vm_map_t target_map,
5103 vm_map_offset_t *address,
5104 vm_map_size_t initial_size,
5105 vm_map_offset_t mask,
5106 vm_map_kernel_flags_t vmk_flags,
5107 memory_object_control_t control,
5108 vm_object_offset_t offset,
5109 boolean_t copy,
5110 vm_prot_t cur_protection,
5111 vm_prot_t max_protection,
5112 vm_inherit_t inheritance)
5113 {
5114 vm_map_address_t map_addr;
5115 vm_map_size_t map_size;
5116 vm_object_t object;
5117 vm_object_size_t size;
5118 kern_return_t result;
5119 memory_object_t pager;
5120 vm_prot_t pager_prot;
5121 kern_return_t kr;
5122 #if __arm64__
5123 boolean_t fourk = vmk_flags.vmkf_fourk;
5124 #endif /* __arm64__ */
5125
5126 /*
5127 * Check arguments for validity
5128 */
5129 if ((target_map == VM_MAP_NULL) ||
5130 (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
5131 (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
5132 (inheritance > VM_INHERIT_LAST_VALID) ||
5133 initial_size == 0) {
5134 return KERN_INVALID_ARGUMENT;
5135 }
5136
5137 if (__improbable((cur_protection & max_protection) != cur_protection)) {
5138 /* cur is more permissive than max */
5139 cur_protection &= max_protection;
5140 }
5141
5142 #if __arm64__
5143 if (fourk && VM_MAP_PAGE_MASK(target_map) < PAGE_MASK) {
5144 fourk = FALSE;
5145 }
5146
5147 if (fourk) {
5148 map_addr = vm_map_trunc_page(*address,
5149 FOURK_PAGE_MASK);
5150 map_size = vm_map_round_page(initial_size,
5151 FOURK_PAGE_MASK);
5152 } else
5153 #endif /* __arm64__ */
5154 {
5155 map_addr = vm_map_trunc_page(*address,
5156 VM_MAP_PAGE_MASK(target_map));
5157 map_size = vm_map_round_page(initial_size,
5158 VM_MAP_PAGE_MASK(target_map));
5159 }
5160 size = vm_object_round_page(initial_size);
5161
5162 object = memory_object_control_to_vm_object(control);
5163
5164 if (object == VM_OBJECT_NULL) {
5165 return KERN_INVALID_OBJECT;
5166 }
5167
5168 if (is_kernel_object(object)) {
5169 printf("Warning: Attempt to map kernel object"
5170 " by a non-private kernel entity\n");
5171 return KERN_INVALID_OBJECT;
5172 }
5173
5174 vm_object_lock(object);
5175 object->ref_count++;
5176
5177 /*
5178 * For "named" VM objects, let the pager know that the
5179 * memory object is being mapped. Some pagers need to keep
5180 * track of this, to know when they can reclaim the memory
5181 * object, for example.
5182 * VM calls memory_object_map() for each mapping (specifying
5183 * the protection of each mapping) and calls
5184 * memory_object_last_unmap() when all the mappings are gone.
5185 */
5186 pager_prot = max_protection;
5187 if (copy) {
5188 pager_prot &= ~VM_PROT_WRITE;
5189 }
5190 pager = object->pager;
5191 if (object->named &&
5192 pager != MEMORY_OBJECT_NULL &&
5193 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
5194 assert(object->pager_ready);
5195 vm_object_mapping_wait(object, THREAD_UNINT);
5196 vm_object_mapping_begin(object);
5197 vm_object_unlock(object);
5198
5199 kr = memory_object_map(pager, pager_prot);
5200 assert(kr == KERN_SUCCESS);
5201
5202 vm_object_lock(object);
5203 vm_object_mapping_end(object);
5204 }
5205 vm_object_unlock(object);
5206
5207 /*
5208 * Perform the copy if requested
5209 */
5210
5211 if (copy) {
5212 vm_object_t new_object;
5213 vm_object_offset_t new_offset;
5214
5215 result = vm_object_copy_strategically(object, offset, size,
5216 false, /* forking */
5217 &new_object, &new_offset,
5218 		    &copy);
5219
5220
5221 if (result == KERN_MEMORY_RESTART_COPY) {
5222 boolean_t success;
5223 boolean_t src_needs_copy;
5224
5225 /*
5226 * XXX
5227 * We currently ignore src_needs_copy.
5228 * This really is the issue of how to make
5229 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
5230 * non-kernel users to use. Solution forthcoming.
5231 * In the meantime, since we don't allow non-kernel
5232 * memory managers to specify symmetric copy,
5233 * we won't run into problems here.
5234 */
5235 new_object = object;
5236 new_offset = offset;
5237 success = vm_object_copy_quickly(new_object,
5238 new_offset, size,
5239 &src_needs_copy,
5240 			    &copy);
5241 assert(success);
5242 result = KERN_SUCCESS;
5243 }
5244 /*
5245 * Throw away the reference to the
5246 * original object, as it won't be mapped.
5247 */
5248
5249 vm_object_deallocate(object);
5250
5251 if (result != KERN_SUCCESS) {
5252 return result;
5253 }
5254
5255 object = new_object;
5256 offset = new_offset;
5257 }
5258
5259 #if __arm64__
5260 if (fourk) {
5261 result = vm_map_enter_fourk(target_map,
5262 &map_addr,
5263 map_size,
5264 (vm_map_offset_t)mask,
5265 vmk_flags,
5266 object, offset,
5267 copy,
5268 cur_protection, max_protection,
5269 inheritance);
5270 } else
5271 #endif /* __arm64__ */
5272 {
5273 result = vm_map_enter(target_map,
5274 &map_addr, map_size,
5275 (vm_map_offset_t)mask,
5276 vmk_flags,
5277 object, offset,
5278 copy,
5279 cur_protection, max_protection,
5280 inheritance);
5281 }
5282 if (result != KERN_SUCCESS) {
5283 vm_object_deallocate(object);
5284 }
5285 *address = map_addr;
5286
5287 return result;
5288 }
5289
5290
5291 #if VM_CPM
5292
5293 #ifdef MACH_ASSERT
5294 extern pmap_paddr_t avail_start, avail_end;
5295 #endif
5296
5297 /*
5298 * Allocate memory in the specified map, with the caveat that
5299 * the memory is physically contiguous. This call may fail
5300 * if the system can't find sufficient contiguous memory.
5301 * This call may cause or lead to heart-stopping amounts of
5302 * paging activity.
5303 *
5304 * Memory obtained from this call should be freed in the
5305 * normal way, viz., via vm_deallocate.
5306 */
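/*
 * Illustrative use (the variables, and the
 * VM_MAP_KERNEL_FLAGS_ANYWHERE() initializer, are assumptions here):
 * allocate one wired, physically contiguous region anywhere in "map":
 *
 *	vm_map_offset_t addr = 0;
 *	kr = vm_map_enter_cpm(map, &addr, round_page(len),
 *	    VM_MAP_KERNEL_FLAGS_ANYWHERE());
 */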
5307 kern_return_t
5308 vm_map_enter_cpm(
5309 vm_map_t map,
5310 vm_map_offset_t *addr,
5311 vm_map_size_t size,
5312 vm_map_kernel_flags_t vmk_flags)
5313 {
5314 vm_object_t cpm_obj;
5315 pmap_t pmap;
5316 vm_page_t m, pages;
5317 kern_return_t kr;
5318 vm_map_offset_t va, start, end, offset;
5319 #if MACH_ASSERT
5320 vm_map_offset_t prev_addr = 0;
5321 #endif /* MACH_ASSERT */
5322 uint8_t object_lock_type = 0;
5323
5324 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
5325 /* XXX TODO4K do we need to support this? */
5326 *addr = 0;
5327 return KERN_NOT_SUPPORTED;
5328 }
5329
5330 if (size == 0) {
5331 *addr = 0;
5332 return KERN_SUCCESS;
5333 }
5334 if (vmk_flags.vmf_fixed) {
5335 *addr = vm_map_trunc_page(*addr,
5336 VM_MAP_PAGE_MASK(map));
5337 } else {
5338 *addr = vm_map_min(map);
5339 }
5340 size = vm_map_round_page(size,
5341 VM_MAP_PAGE_MASK(map));
5342
5343 /*
5344 * LP64todo - cpm_allocate should probably allow
5345 * allocations of >4GB, but not with the current
5346 * algorithm, so just cast down the size for now.
5347 */
5348 if (size > VM_MAX_ADDRESS) {
5349 return KERN_RESOURCE_SHORTAGE;
5350 }
5351 if ((kr = cpm_allocate(CAST_DOWN(vm_size_t, size),
5352 &pages, 0, 0, TRUE, flags)) != KERN_SUCCESS) {
5353 return kr;
5354 }
5355
5356 cpm_obj = vm_object_allocate((vm_object_size_t)size);
5357 assert(cpm_obj != VM_OBJECT_NULL);
5358 assert(cpm_obj->internal);
5359 assert(cpm_obj->vo_size == (vm_object_size_t)size);
5360 assert(cpm_obj->can_persist == FALSE);
5361 assert(cpm_obj->pager_created == FALSE);
5362 assert(cpm_obj->pageout == FALSE);
5363 assert(cpm_obj->shadow == VM_OBJECT_NULL);
5364
5365 /*
5366 * Insert pages into object.
5367 */
5368 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5369 vm_object_lock(cpm_obj);
5370 for (offset = 0; offset < size; offset += PAGE_SIZE) {
5371 m = pages;
5372 pages = NEXT_PAGE(m);
5373 *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
5374
5375 assert(!m->vmp_gobbled);
5376 assert(!m->vmp_wanted);
5377 assert(!m->vmp_pageout);
5378 assert(!m->vmp_tabled);
5379 assert(VM_PAGE_WIRED(m));
5380 assert(m->vmp_busy);
5381 assert(VM_PAGE_GET_PHYS_PAGE(m) >= (avail_start >> PAGE_SHIFT) && VM_PAGE_GET_PHYS_PAGE(m) <= (avail_end >> PAGE_SHIFT));
5382
5383 m->vmp_busy = FALSE;
5384 vm_page_insert(m, cpm_obj, offset);
5385 }
5386 assert(cpm_obj->resident_page_count == size / PAGE_SIZE);
5387 vm_object_unlock(cpm_obj);
5388
5389 /*
5390 * Hang onto a reference on the object in case a
5391 * multi-threaded application for some reason decides
5392 * to deallocate the portion of the address space into
5393 * which we will insert this object.
5394 *
5395 * Unfortunately, we must insert the object now before
5396 * we can talk to the pmap module about which addresses
5397 * must be wired down. Hence, the race with a multi-
5398 * threaded app.
5399 */
5400 vm_object_reference(cpm_obj);
5401
5402 /*
5403 * Insert object into map.
5404 */
5405
5406 kr = vm_map_enter(
5407 map,
5408 addr,
5409 size,
5410 (vm_map_offset_t)0,
5411 vmk_flags,
5412 cpm_obj,
5413 (vm_object_offset_t)0,
5414 FALSE,
5415 VM_PROT_ALL,
5416 VM_PROT_ALL,
5417 VM_INHERIT_DEFAULT);
5418
5419 if (kr != KERN_SUCCESS) {
5420 /*
5421 * A CPM object doesn't have can_persist set,
5422 * so all we have to do is deallocate it to
5423 * free up these pages.
5424 */
5425 assert(cpm_obj->pager_created == FALSE);
5426 assert(cpm_obj->can_persist == FALSE);
5427 assert(cpm_obj->pageout == FALSE);
5428 assert(cpm_obj->shadow == VM_OBJECT_NULL);
5429 vm_object_deallocate(cpm_obj); /* kill acquired ref */
5430 		vm_object_deallocate(cpm_obj); /* kill creation ref */
		return kr;
5431 	}
5432
5433 /*
5434 * Inform the physical mapping system that the
5435 * range of addresses may not fault, so that
5436 * page tables and such can be locked down as well.
5437 */
5438 start = *addr;
5439 end = start + size;
5440 pmap = vm_map_pmap(map);
5441 pmap_pageable(pmap, start, end, FALSE);
5442
5443 /*
5444 * Enter each page into the pmap, to avoid faults.
5445 * Note that this loop could be coded more efficiently,
5446 * if the need arose, rather than looking up each page
5447 * again.
5448 */
5449 for (offset = 0, va = start; offset < size;
5450 va += PAGE_SIZE, offset += PAGE_SIZE) {
5451 int type_of_fault;
5452
5453 vm_object_lock(cpm_obj);
5454 m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5455 assert(m != VM_PAGE_NULL);
5456
5457 vm_page_zero_fill(m);
5458
5459 type_of_fault = DBG_ZERO_FILL_FAULT;
5460
5461 vm_fault_enter(m, pmap, va,
5462 PAGE_SIZE, 0,
5463 VM_PROT_ALL, VM_PROT_WRITE,
5464 VM_PAGE_WIRED(m),
5465 FALSE, /* change_wiring */
5466 VM_KERN_MEMORY_NONE, /* tag - not wiring */
5467 FALSE, /* cs_bypass */
5468 0, /* user_tag */
5469 0, /* pmap_options */
5470 NULL, /* need_retry */
5471 &type_of_fault,
5472 &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/
5473
5474 vm_object_unlock(cpm_obj);
5475 }
5476
5477 #if MACH_ASSERT
5478 /*
5479 * Verify ordering in address space.
5480 */
5481 for (offset = 0; offset < size; offset += PAGE_SIZE) {
5482 vm_object_lock(cpm_obj);
5483 m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5484 vm_object_unlock(cpm_obj);
5485 if (m == VM_PAGE_NULL) {
5486 panic("vm_allocate_cpm: obj %p off 0x%llx no page",
5487 cpm_obj, (uint64_t)offset);
5488 }
5489 assert(m->vmp_tabled);
5490 assert(!m->vmp_busy);
5491 assert(!m->vmp_wanted);
5492 assert(!m->vmp_fictitious);
5493 assert(!m->vmp_private);
5494 assert(!m->vmp_absent);
5495 assert(!m->vmp_cleaning);
5496 assert(!m->vmp_laundry);
5497 assert(!m->vmp_precious);
5498 assert(!m->vmp_clustered);
5499 if (offset != 0) {
5500 if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
5501 printf("start 0x%llx end 0x%llx va 0x%llx\n",
5502 (uint64_t)start, (uint64_t)end, (uint64_t)va);
5503 printf("obj %p off 0x%llx\n", cpm_obj, (uint64_t)offset);
5504 printf("m %p prev_address 0x%llx\n", m, (uint64_t)prev_addr);
5505 panic("vm_allocate_cpm: pages not contig!");
5506 }
5507 }
5508 prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
5509 }
5510 #endif /* MACH_ASSERT */
5511
5512 vm_object_deallocate(cpm_obj); /* kill extra ref */
5513
5514 return kr;
5515 }
5516
5517
5518 #else /* VM_CPM */
5519
5520 /*
5521 * Interface is defined in all cases, but unless the kernel
5522 * is built explicitly for this option, the interface does
5523 * nothing.
5524 */
5525
5526 kern_return_t
5527 vm_map_enter_cpm(
5528 __unused vm_map_t map,
5529 __unused vm_map_offset_t *addr,
5530 __unused vm_map_size_t size,
5531 __unused vm_map_kernel_flags_t vmk_flags)
5532 {
5533 return KERN_FAILURE;
5534 }
5535 #endif /* VM_CPM */
5536
5537 /* Not used without nested pmaps */
5538 #ifndef NO_NESTED_PMAP
5539 /*
5540 * Clip and unnest a portion of a nested submap mapping.
5541 */
5542
5543
5544 static void
5545 vm_map_clip_unnest(
5546 vm_map_t map,
5547 vm_map_entry_t entry,
5548 vm_map_offset_t start_unnest,
5549 vm_map_offset_t end_unnest)
5550 {
5551 vm_map_offset_t old_start_unnest = start_unnest;
5552 vm_map_offset_t old_end_unnest = end_unnest;
5553
5554 assert(entry->is_sub_map);
5555 assert(VME_SUBMAP(entry) != NULL);
5556 assert(entry->use_pmap);
5557
5558 /*
5559 * Query the platform for the optimal unnest range.
5560 * DRK: There's some duplication of effort here, since
5561 * callers may have adjusted the range to some extent. This
5562 * routine was introduced to support 1GiB subtree nesting
5563 * for x86 platforms, which can also nest on 2MiB boundaries
5564 * depending on size/alignment.
5565 */
5566 if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
5567 assert(VME_SUBMAP(entry)->is_nested_map);
5568 assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
5569 log_unnest_badness(map,
5570 old_start_unnest,
5571 old_end_unnest,
5572 VME_SUBMAP(entry)->is_nested_map,
5573 (entry->vme_start +
5574 VME_SUBMAP(entry)->lowest_unnestable_start -
5575 VME_OFFSET(entry)));
5576 }
5577
5578 if (entry->vme_start > start_unnest ||
5579 entry->vme_end < end_unnest) {
5580 panic("vm_map_clip_unnest(0x%llx,0x%llx): "
5581 "bad nested entry: start=0x%llx end=0x%llx\n",
5582 (long long)start_unnest, (long long)end_unnest,
5583 (long long)entry->vme_start, (long long)entry->vme_end);
5584 }
5585
5586 if (start_unnest > entry->vme_start) {
5587 _vm_map_clip_start(&map->hdr,
5588 entry,
5589 start_unnest);
5590 if (map->holelistenabled) {
5591 vm_map_store_update_first_free(map, NULL, FALSE);
5592 } else {
5593 vm_map_store_update_first_free(map, map->first_free, FALSE);
5594 }
5595 }
5596 if (entry->vme_end > end_unnest) {
5597 _vm_map_clip_end(&map->hdr,
5598 entry,
5599 end_unnest);
5600 if (map->holelistenabled) {
5601 vm_map_store_update_first_free(map, NULL, FALSE);
5602 } else {
5603 vm_map_store_update_first_free(map, map->first_free, FALSE);
5604 }
5605 }
5606
5607 pmap_unnest(map->pmap,
5608 entry->vme_start,
5609 entry->vme_end - entry->vme_start);
5610 if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
5611 /* clean up parent map/maps */
5612 vm_map_submap_pmap_clean(
5613 map, entry->vme_start,
5614 entry->vme_end,
5615 VME_SUBMAP(entry),
5616 VME_OFFSET(entry));
5617 }
5618 entry->use_pmap = FALSE;
5619 if ((map->pmap != kernel_pmap) &&
5620 (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
5621 VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
5622 }
5623 }
5624 #endif /* NO_NESTED_PMAP */
5625
5626 __abortlike
5627 static void
5628 __vm_map_clip_atomic_entry_panic(
5629 vm_map_t map,
5630 vm_map_entry_t entry,
5631 vm_map_offset_t where)
5632 {
5633 panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
5634 "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
5635 (uint64_t)entry->vme_start,
5636 (uint64_t)entry->vme_end,
5637 (uint64_t)where);
5638 }
5639
5640 /*
5641 * vm_map_clip_start: [ internal use only ]
5642 *
5643 * Asserts that the given entry begins at or after
5644 * the specified address; if necessary,
5645 * it splits the entry into two.
5646 */
5647 void
5648 vm_map_clip_start(
5649 vm_map_t map,
5650 vm_map_entry_t entry,
5651 vm_map_offset_t startaddr)
5652 {
5653 #ifndef NO_NESTED_PMAP
5654 if (entry->is_sub_map &&
5655 entry->use_pmap &&
5656 startaddr >= entry->vme_start) {
5657 vm_map_offset_t start_unnest, end_unnest;
5658
5659 /*
5660 * Make sure "startaddr" is no longer in a nested range
5661 * before we clip. Unnest only the minimum range the platform
5662 * can handle.
5663 * vm_map_clip_unnest may perform additional adjustments to
5664 * the unnest range.
5665 */
5666 start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
5667 end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
5668 vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5669 }
5670 #endif /* NO_NESTED_PMAP */
5671 if (startaddr > entry->vme_start) {
5672 if (!entry->is_sub_map &&
5673 VME_OBJECT(entry) &&
5674 VME_OBJECT(entry)->phys_contiguous) {
5675 pmap_remove(map->pmap,
5676 (addr64_t)(entry->vme_start),
5677 (addr64_t)(entry->vme_end));
5678 }
5679 if (entry->vme_atomic) {
5680 __vm_map_clip_atomic_entry_panic(map, entry, startaddr);
5681 }
5682
5683 DTRACE_VM5(
5684 vm_map_clip_start,
5685 vm_map_t, map,
5686 vm_map_offset_t, entry->vme_start,
5687 vm_map_offset_t, entry->vme_end,
5688 vm_map_offset_t, startaddr,
5689 int, VME_ALIAS(entry));
5690
5691 _vm_map_clip_start(&map->hdr, entry, startaddr);
5692 if (map->holelistenabled) {
5693 vm_map_store_update_first_free(map, NULL, FALSE);
5694 } else {
5695 vm_map_store_update_first_free(map, map->first_free, FALSE);
5696 }
5697 }
5698 }
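
/*
 * Illustrative use (values hypothetical): with an entry covering
 * [0x100000, 0x500000) and startaddr == 0x300000, vm_map_clip_start()
 * leaves two entries, [0x100000, 0x300000) and [0x300000, 0x500000),
 * so a subsequent operation starting at 0x300000 affects only the
 * second entry:
 *
 *	vm_map_lock(map);
 *	if (vm_map_lookup_entry(map, 0x300000, &entry)) {
 *		vm_map_clip_start(map, entry, 0x300000);
 *		// entry->vme_start == 0x300000 from here on
 *	}
 *	vm_map_unlock(map);
 */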
5699
5700
5701 #define vm_map_copy_clip_start(copy, entry, startaddr) \
5702 MACRO_BEGIN \
5703 if ((startaddr) > (entry)->vme_start) \
5704 _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
5705 MACRO_END
5706
5707 /*
5708 * This routine is called only when it is known that
5709 * the entry must be split.
5710 */
5711 static void
5712 _vm_map_clip_start(
5713 struct vm_map_header *map_header,
5714 vm_map_entry_t entry,
5715 vm_map_offset_t start)
5716 {
5717 vm_map_entry_t new_entry;
5718
5719 /*
5720 * Split off the front portion --
5721 * note that we must insert the new
5722 * entry BEFORE this one, so that
5723 * this entry has the specified starting
5724 * address.
5725 */
5726
5727 if (entry->map_aligned) {
5728 assert(VM_MAP_PAGE_ALIGNED(start,
5729 VM_MAP_HDR_PAGE_MASK(map_header)));
5730 }
5731
5732 new_entry = _vm_map_entry_create(map_header);
5733 vm_map_entry_copy_full(new_entry, entry);
5734
5735 new_entry->vme_end = start;
5736 assert(new_entry->vme_start < new_entry->vme_end);
5737 VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
5738 if (__improbable(start >= entry->vme_end)) {
5739 panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start);
5740 }
5741 assert(start < entry->vme_end);
5742 entry->vme_start = start;
5743
5744 #if VM_BTLOG_TAGS
5745 if (new_entry->vme_kernel_object) {
5746 btref_retain(new_entry->vme_tag_btref);
5747 }
5748 #endif /* VM_BTLOG_TAGS */
5749
5750 _vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);
5751
5752 if (entry->is_sub_map) {
5753 vm_map_reference(VME_SUBMAP(new_entry));
5754 } else {
5755 vm_object_reference(VME_OBJECT(new_entry));
5756 }
5757 }
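
/*
 * Offset bookkeeping, worked through (illustrative values): splitting
 * an entry [0x1000, 0x5000) with VME_OFFSET == 0 at start == 0x3000
 * yields new_entry [0x1000, 0x3000) keeping offset 0, while the
 * original entry becomes [0x3000, 0x5000) with its offset advanced by
 * (start - vme_start) == 0x2000, so both halves still map the same
 * bytes of the backing object.
 */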
5758
5759
5760 /*
5761 * vm_map_clip_end: [ internal use only ]
5762 *
5763 * Asserts that the given entry ends at or before
5764 * the specified address; if necessary,
5765 * it splits the entry into two.
5766 */
5767 void
5768 vm_map_clip_end(
5769 vm_map_t map,
5770 vm_map_entry_t entry,
5771 vm_map_offset_t endaddr)
5772 {
5773 if (endaddr > entry->vme_end) {
5774 /*
5775 * Within the scope of this clipping, limit "endaddr" to
5776 * the end of this map entry...
5777 */
5778 endaddr = entry->vme_end;
5779 }
5780 #ifndef NO_NESTED_PMAP
5781 if (entry->is_sub_map && entry->use_pmap) {
5782 vm_map_offset_t start_unnest, end_unnest;
5783
5784 /*
5785 * Make sure the range between the start of this entry and
5786 * the new "endaddr" is no longer nested before we clip.
5787 * Unnest only the minimum range the platform can handle.
5788 * vm_map_clip_unnest may perform additional adjustments to
5789 * the unnest range.
5790 */
5791 start_unnest = entry->vme_start;
5792 end_unnest =
5793 (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
5794 ~(pmap_shared_region_size_min(map->pmap) - 1);
5795 vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5796 }
5797 #endif /* NO_NESTED_PMAP */
5798 if (endaddr < entry->vme_end) {
5799 if (!entry->is_sub_map &&
5800 VME_OBJECT(entry) &&
5801 VME_OBJECT(entry)->phys_contiguous) {
5802 pmap_remove(map->pmap,
5803 (addr64_t)(entry->vme_start),
5804 (addr64_t)(entry->vme_end));
5805 }
5806 if (entry->vme_atomic) {
5807 __vm_map_clip_atomic_entry_panic(map, entry, endaddr);
5808 }
5809 DTRACE_VM5(
5810 vm_map_clip_end,
5811 vm_map_t, map,
5812 vm_map_offset_t, entry->vme_start,
5813 vm_map_offset_t, entry->vme_end,
5814 vm_map_offset_t, endaddr,
5815 int, VME_ALIAS(entry));
5816
5817 _vm_map_clip_end(&map->hdr, entry, endaddr);
5818 if (map->holelistenabled) {
5819 vm_map_store_update_first_free(map, NULL, FALSE);
5820 } else {
5821 vm_map_store_update_first_free(map, map->first_free, FALSE);
5822 }
5823 }
5824 }
5825
5826
5827 #define vm_map_copy_clip_end(copy, entry, endaddr) \
5828 MACRO_BEGIN \
5829 if ((endaddr) < (entry)->vme_end) \
5830 _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
5831 MACRO_END
5832
5833 /*
5834 * This routine is called only when it is known that
5835 * the entry must be split.
5836 */
5837 static void
5838 _vm_map_clip_end(
5839 struct vm_map_header *map_header,
5840 vm_map_entry_t entry,
5841 vm_map_offset_t end)
5842 {
5843 vm_map_entry_t new_entry;
5844
5845 /*
5846 * Create a new entry and insert it
5847 * AFTER the specified entry
5848 */
5849
5850 if (entry->map_aligned) {
5851 assert(VM_MAP_PAGE_ALIGNED(end,
5852 VM_MAP_HDR_PAGE_MASK(map_header)));
5853 }
5854
5855 new_entry = _vm_map_entry_create(map_header);
5856 vm_map_entry_copy_full(new_entry, entry);
5857
5858 if (__improbable(end <= entry->vme_start)) {
5859 panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end);
5860 }
5861 assert(entry->vme_start < end);
5862 new_entry->vme_start = entry->vme_end = end;
5863 VME_OFFSET_SET(new_entry,
5864 VME_OFFSET(new_entry) + (end - entry->vme_start));
5865 assert(new_entry->vme_start < new_entry->vme_end);
5866
5867 #if VM_BTLOG_TAGS
5868 if (new_entry->vme_kernel_object) {
5869 btref_retain(new_entry->vme_tag_btref);
5870 }
5871 #endif /* VM_BTLOG_TAGS */
5872
5873 _vm_map_store_entry_link(map_header, entry, new_entry);
5874
5875 if (entry->is_sub_map) {
5876 vm_map_reference(VME_SUBMAP(new_entry));
5877 } else {
5878 vm_object_reference(VME_OBJECT(new_entry));
5879 }
5880 }
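
/*
 * Offset bookkeeping, worked through (illustrative values): splitting
 * an entry [0x1000, 0x5000) with VME_OFFSET == 0 at end == 0x3000
 * leaves the original entry as [0x1000, 0x3000) with offset 0 and
 * creates new_entry [0x3000, 0x5000) with its offset advanced by
 * (end - vme_start) == 0x2000 into the backing object.
 */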
5881
5882
5883 /*
5884 * VM_MAP_RANGE_CHECK: [ internal use only ]
5885 *
5886 * Asserts that the starting and ending region
5887 * addresses fall within the valid range of the map.
5888 */
5889 #define VM_MAP_RANGE_CHECK(map, start, end) \
5890 MACRO_BEGIN \
5891 if (start < vm_map_min(map)) \
5892 start = vm_map_min(map); \
5893 if (end > vm_map_max(map)) \
5894 end = vm_map_max(map); \
5895 if (start > end) \
5896 start = end; \
5897 MACRO_END
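
/*
 * Clamping behavior, by example (illustrative values): for a map with
 * vm_map_min == 0x1000 and vm_map_max == 0x10000:
 *
 *	start = 0x500;  end = 0x20000;
 *	VM_MAP_RANGE_CHECK(map, start, end);
 *	// now start == 0x1000 and end == 0x10000
 *
 * A range entirely outside the map collapses to an empty range
 * (start == end) rather than failing.
 */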
5898
5899 /*
5900 * vm_map_range_check: [ internal use only ]
5901 *
5902  *	Check that the region defined by the specified start and
5903  *	end addresses is wholly contained within a single map
5904  *	entry or set of adjacent map entries of the specified map,
5905 * i.e. the specified region contains no unmapped space.
5906 * If any or all of the region is unmapped, FALSE is returned.
5907 * Otherwise, TRUE is returned and if the output argument 'entry'
5908 * is not NULL it points to the map entry containing the start
5909 * of the region.
5910 *
5911 * The map is locked for reading on entry and is left locked.
5912 */
5913 static boolean_t
5914 vm_map_range_check(
5915 vm_map_t map,
5916 vm_map_offset_t start,
5917 vm_map_offset_t end,
5918 vm_map_entry_t *entry)
5919 {
5920 vm_map_entry_t cur;
5921 vm_map_offset_t prev;
5922
5923 /*
5924 * Basic sanity checks first
5925 */
5926 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5927 return FALSE;
5928 }
5929
5930 /*
5931 * Check first if the region starts within a valid
5932 * mapping for the map.
5933 */
5934 if (!vm_map_lookup_entry(map, start, &cur)) {
5935 return FALSE;
5936 }
5937
5938 /*
5939 * Optimize for the case that the region is contained
5940 * in a single map entry.
5941 */
5942 if (entry != (vm_map_entry_t *) NULL) {
5943 *entry = cur;
5944 }
5945 if (end <= cur->vme_end) {
5946 return TRUE;
5947 }
5948
5949 /*
5950 * If the region is not wholly contained within a
5951 * single entry, walk the entries looking for holes.
5952 */
5953 prev = cur->vme_end;
5954 cur = cur->vme_next;
5955 while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5956 if (end <= cur->vme_end) {
5957 return TRUE;
5958 }
5959 prev = cur->vme_end;
5960 cur = cur->vme_next;
5961 }
5962 return FALSE;
5963 }
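
/*
 * Usage sketch (hypothetical caller within this file, honoring the
 * locking contract above):
 *
 *	vm_map_entry_t entry;
 *
 *	vm_map_lock_read(map);
 *	if (vm_map_range_check(map, start, end, &entry)) {
 *		// [start, end) is fully mapped; "entry" covers "start"
 *	}
 *	vm_map_unlock_read(map);
 */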
5964
5965 /*
5966 * vm_map_protect:
5967 *
5968 * Sets the protection of the specified address
5969 * region in the target map. If "set_max" is
5970 * specified, the maximum protection is to be set;
5971 * otherwise, only the current protection is affected.
5972 */
5973 kern_return_t
5974 vm_map_protect(
5975 vm_map_t map,
5976 vm_map_offset_t start,
5977 vm_map_offset_t end,
5978 vm_prot_t new_prot,
5979 boolean_t set_max)
5980 {
5981 vm_map_entry_t current;
5982 vm_map_offset_t prev;
5983 vm_map_entry_t entry;
5984 vm_prot_t new_max;
5985 int pmap_options = 0;
5986 kern_return_t kr;
5987
5988 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
5989 return KERN_INVALID_ARGUMENT;
5990 }
5991
5992 if (new_prot & VM_PROT_COPY) {
5993 vm_map_offset_t new_start;
5994 vm_prot_t cur_prot, max_prot;
5995 vm_map_kernel_flags_t kflags;
5996
5997 /* LP64todo - see below */
5998 if (start >= map->max_offset) {
5999 return KERN_INVALID_ADDRESS;
6000 }
6001
6002 if ((new_prot & VM_PROT_ALLEXEC) &&
6003 map->pmap != kernel_pmap &&
6004 (vm_map_cs_enforcement(map)
6005 #if XNU_TARGET_OS_OSX && __arm64__
6006 || !VM_MAP_IS_EXOTIC(map)
6007 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
6008 ) &&
6009 VM_MAP_POLICY_WX_FAIL(map)) {
6010 DTRACE_VM3(cs_wx,
6011 uint64_t, (uint64_t) start,
6012 uint64_t, (uint64_t) end,
6013 vm_prot_t, new_prot);
6014 printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
6015 proc_selfpid(),
6016 (get_bsdtask_info(current_task())
6017 ? proc_name_address(get_bsdtask_info(current_task()))
6018 : "?"),
6019 __FUNCTION__, __LINE__,
6020 #if DEVELOPMENT || DEBUG
6021 (uint64_t)start,
6022 (uint64_t)end,
6023 #else /* DEVELOPMENT || DEBUG */
6024 (uint64_t)0,
6025 (uint64_t)0,
6026 #endif /* DEVELOPMENT || DEBUG */
6027 new_prot);
6028 return KERN_PROTECTION_FAILURE;
6029 }
6030
6031 /*
6032 * Let vm_map_remap_extract() know that it will need to:
6033 * + make a copy of the mapping
6034 * + add VM_PROT_WRITE to the max protections
6035 * + remove any protections that are no longer allowed from the
6036 * max protections (to avoid any WRITE/EXECUTE conflict, for
6037 * example).
6038 * Note that "max_prot" is an IN/OUT parameter only for this
6039 * specific (VM_PROT_COPY) case. It's usually an OUT parameter
6040 * only.
6041 */
6042 max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
6043 cur_prot = VM_PROT_NONE;
6044 kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
6045 kflags.vmkf_remap_prot_copy = true;
6046 kflags.vmkf_tpro_enforcement_override = !vm_map_tpro_enforcement(map);
6047 new_start = start;
6048 kr = vm_map_remap(map,
6049 &new_start,
6050 end - start,
6051 0, /* mask */
6052 kflags,
6053 map,
6054 start,
6055 TRUE, /* copy-on-write remapping! */
6056 &cur_prot, /* IN/OUT */
6057 &max_prot, /* IN/OUT */
6058 VM_INHERIT_DEFAULT);
6059 if (kr != KERN_SUCCESS) {
6060 return kr;
6061 }
6062 new_prot &= ~VM_PROT_COPY;
6063 }
6064
6065 vm_map_lock(map);
6066
6067 /* LP64todo - remove this check when vm_map_commpage64()
6068 * no longer has to stuff in a map_entry for the commpage
6069 * above the map's max_offset.
6070 */
6071 if (start >= map->max_offset) {
6072 vm_map_unlock(map);
6073 return KERN_INVALID_ADDRESS;
6074 }
6075
6076 while (1) {
6077 /*
6078 * Lookup the entry. If it doesn't start in a valid
6079 * entry, return an error.
6080 */
6081 if (!vm_map_lookup_entry(map, start, &entry)) {
6082 vm_map_unlock(map);
6083 return KERN_INVALID_ADDRESS;
6084 }
6085
6086 if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
6087 start = SUPERPAGE_ROUND_DOWN(start);
6088 continue;
6089 }
6090 break;
6091 }
6092 if (entry->superpage_size) {
6093 end = SUPERPAGE_ROUND_UP(end);
6094 }
6095
6096 /*
6097 * Make a first pass to check for protection and address
6098 * violations.
6099 */
6100
6101 current = entry;
6102 prev = current->vme_start;
6103 while ((current != vm_map_to_entry(map)) &&
6104 (current->vme_start < end)) {
6105 /*
6106 * If there is a hole, return an error.
6107 */
6108 if (current->vme_start != prev) {
6109 vm_map_unlock(map);
6110 return KERN_INVALID_ADDRESS;
6111 }
6112
6113 new_max = current->max_protection;
6114
6115 #if defined(__x86_64__)
6116 /* Allow max mask to include execute prot bits if this map doesn't enforce CS */
6117 if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
6118 new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
6119 }
6120 #elif CODE_SIGNING_MONITOR
6121 if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
6122 new_max |= VM_PROT_EXECUTE;
6123 }
6124 #endif
6125 if ((new_prot & new_max) != new_prot) {
6126 vm_map_unlock(map);
6127 return KERN_PROTECTION_FAILURE;
6128 }
6129
6130 if (current->used_for_jit &&
6131 pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
6132 vm_map_unlock(map);
6133 return KERN_PROTECTION_FAILURE;
6134 }
6135
6136 #if __arm64e__
6137 /* Disallow remapping hw assisted TPRO mappings */
6138 if (current->used_for_tpro) {
6139 vm_map_unlock(map);
6140 return KERN_PROTECTION_FAILURE;
6141 }
6142 #endif /* __arm64e__ */
6143
6144
6145 if ((new_prot & VM_PROT_WRITE) &&
6146 (new_prot & VM_PROT_ALLEXEC) &&
6147 #if XNU_TARGET_OS_OSX
6148 map->pmap != kernel_pmap &&
6149 (vm_map_cs_enforcement(map)
6150 #if __arm64__
6151 || !VM_MAP_IS_EXOTIC(map)
6152 #endif /* __arm64__ */
6153 ) &&
6154 #endif /* XNU_TARGET_OS_OSX */
6155 #if CODE_SIGNING_MONITOR
6156 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
6157 #endif
6158 !(current->used_for_jit)) {
6159 DTRACE_VM3(cs_wx,
6160 uint64_t, (uint64_t) current->vme_start,
6161 uint64_t, (uint64_t) current->vme_end,
6162 vm_prot_t, new_prot);
6163 printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
6164 proc_selfpid(),
6165 (get_bsdtask_info(current_task())
6166 ? proc_name_address(get_bsdtask_info(current_task()))
6167 : "?"),
6168 __FUNCTION__, __LINE__,
6169 #if DEVELOPMENT || DEBUG
6170 (uint64_t)current->vme_start,
6171 (uint64_t)current->vme_end,
6172 #else /* DEVELOPMENT || DEBUG */
6173 (uint64_t)0,
6174 (uint64_t)0,
6175 #endif /* DEVELOPMENT || DEBUG */
6176 new_prot);
6177 new_prot &= ~VM_PROT_ALLEXEC;
6178 if (VM_MAP_POLICY_WX_FAIL(map)) {
6179 vm_map_unlock(map);
6180 return KERN_PROTECTION_FAILURE;
6181 }
6182 }
6183
6184 /*
6185 * If the task has requested executable lockdown,
6186 * deny both:
6187 * - adding executable protections OR
6188 * - adding write protections to an existing executable mapping.
6189 */
6190 if (map->map_disallow_new_exec == TRUE) {
6191 if ((new_prot & VM_PROT_ALLEXEC) ||
6192 ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
6193 vm_map_unlock(map);
6194 return KERN_PROTECTION_FAILURE;
6195 }
6196 }
6197
6198 prev = current->vme_end;
6199 current = current->vme_next;
6200 }
6201
6202 #if __arm64__
6203 if (end > prev &&
6204 end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
6205 vm_map_entry_t prev_entry;
6206
6207 prev_entry = current->vme_prev;
6208 if (prev_entry != vm_map_to_entry(map) &&
6209 !prev_entry->map_aligned &&
6210 (vm_map_round_page(prev_entry->vme_end,
6211 VM_MAP_PAGE_MASK(map))
6212 == end)) {
6213 /*
6214 * The last entry in our range is not "map-aligned"
6215 * but it would have reached all the way to "end"
6216 * if it had been map-aligned, so this is not really
6217 * a hole in the range and we can proceed.
6218 */
6219 prev = end;
6220 }
6221 }
6222 #endif /* __arm64__ */
6223
6224 if (end > prev) {
6225 vm_map_unlock(map);
6226 return KERN_INVALID_ADDRESS;
6227 }
6228
6229 /*
6230 * Go back and fix up protections.
6231 * Clip to start here if the range starts within
6232 * the entry.
6233 */
6234
6235 current = entry;
6236 if (current != vm_map_to_entry(map)) {
6237 /* clip and unnest if necessary */
6238 vm_map_clip_start(map, current, start);
6239 }
6240
6241 while ((current != vm_map_to_entry(map)) &&
6242 (current->vme_start < end)) {
6243 vm_prot_t old_prot;
6244
6245 vm_map_clip_end(map, current, end);
6246
6247 #if DEVELOPMENT || DEBUG
6248 if (current->csm_associated && vm_log_xnu_user_debug) {
6249 printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
6250 proc_selfpid(),
6251 (get_bsdtask_info(current_task())
6252 ? proc_name_address(get_bsdtask_info(current_task()))
6253 : "?"),
6254 __FUNCTION__,
6255 (uint64_t)start,
6256 (uint64_t)end,
6257 new_prot,
6258 map, current,
6259 current->vme_start,
6260 current->vme_end,
6261 current->protection,
6262 current->max_protection);
6263 }
6264 #endif /* DEVELOPMENT || DEBUG */
6265
6266 if (current->is_sub_map) {
6267 /* clipping did unnest if needed */
6268 assert(!current->use_pmap);
6269 }
6270
6271 old_prot = current->protection;
6272
6273 if (set_max) {
6274 current->max_protection = new_prot;
6275 /* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
6276 current->protection = (new_prot & old_prot);
6277 } else {
6278 current->protection = new_prot;
6279 }
6280
6281 #if CODE_SIGNING_MONITOR
6282 if (!current->vme_xnu_user_debug &&
6283 /* a !csm_associated mapping becoming executable */
6284 ((!current->csm_associated &&
6285 !(old_prot & VM_PROT_EXECUTE) &&
6286 (current->protection & VM_PROT_EXECUTE))
6287 ||
6288 /* a csm_associated mapping becoming writable */
6289 (current->csm_associated &&
6290 !(old_prot & VM_PROT_WRITE) &&
6291 (current->protection & VM_PROT_WRITE)))) {
6292 /*
6293 * This mapping has not already been marked as
6294 * "user_debug" and it is either:
6295 * 1. not code-signing-monitored and becoming executable
6296 * 2. code-signing-monitored and becoming writable,
6297 * so inform the CodeSigningMonitor and mark the
6298 * mapping as "user_debug" if appropriate.
6299 */
6300 vm_map_kernel_flags_t vmk_flags;
6301 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
6302 /* pretend it's a vm_protect(VM_PROT_COPY)... */
6303 vmk_flags.vmkf_remap_prot_copy = true;
6304 kr = vm_map_entry_cs_associate(map, current, vmk_flags);
6305 #if DEVELOPMENT || DEBUG
6306 if (vm_log_xnu_user_debug) {
6307 printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
6308 proc_selfpid(),
6309 (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
6310 __FUNCTION__, __LINE__,
6311 map, current,
6312 current->vme_start, current->vme_end,
6313 old_prot, current->protection,
6314 kr, current->vme_xnu_user_debug);
6315 }
6316 #endif /* DEVELOPMENT || DEBUG */
6317 }
6318 #endif /* CODE_SIGNING_MONITOR */
6319
6320 /*
6321 * Update physical map if necessary.
6322 * If the request is to turn off write protection,
6323 * we won't do it for real (in pmap). This is because
6324 * it would cause copy-on-write to fail. We've already
6325 	 * set the new protection in the map, so if a
6326 	 * write-protect fault occurs, it will be fixed up
6327 * properly, COW or not.
6328 */
6329 if (current->protection != old_prot) {
6330 		/* Look one level in: we support nested pmaps */
6331 /* from mapped submaps which are direct entries */
6332 /* in our map */
6333
6334 vm_prot_t prot;
6335
6336 prot = current->protection;
6337 if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
6338 prot &= ~VM_PROT_WRITE;
6339 } else {
6340 assert(!VME_OBJECT(current)->code_signed);
6341 assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6342 if (prot & VM_PROT_WRITE) {
6343 /*
6344 * For write requests on the
6345 				 * compressor, we will ask the
6346 * pmap layer to prevent us from
6347 * taking a write fault when we
6348 * attempt to access the mapping
6349 * next.
6350 */
6351 pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
6352 }
6353 }
6354
6355 if (override_nx(map, VME_ALIAS(current)) && prot) {
6356 prot |= VM_PROT_EXECUTE;
6357 }
6358
6359 #if DEVELOPMENT || DEBUG
6360 if (!(old_prot & VM_PROT_EXECUTE) &&
6361 (prot & VM_PROT_EXECUTE) &&
6362 panic_on_unsigned_execute &&
6363 (proc_selfcsflags() & CS_KILL)) {
6364 panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
6365 }
6366 #endif /* DEVELOPMENT || DEBUG */
6367
6368 if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
6369 if (current->wired_count) {
6370 panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
6371 map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
6372 }
6373
6374 /* If the pmap layer cares about this
6375 * protection type, force a fault for
6376 * each page so that vm_fault will
6377 * repopulate the page with the full
6378 * set of protections.
6379 */
6380 /*
6381 * TODO: We don't seem to need this,
6382 * but this is due to an internal
6383 * implementation detail of
6384 * pmap_protect. Do we want to rely
6385 * on this?
6386 */
6387 prot = VM_PROT_NONE;
6388 }
6389
6390 if (current->is_sub_map && current->use_pmap) {
6391 pmap_protect(VME_SUBMAP(current)->pmap,
6392 current->vme_start,
6393 current->vme_end,
6394 prot);
6395 } else {
6396 pmap_protect_options(map->pmap,
6397 current->vme_start,
6398 current->vme_end,
6399 prot,
6400 pmap_options,
6401 NULL);
6402 }
6403 }
6404 current = current->vme_next;
6405 }
6406
6407 current = entry;
6408 while ((current != vm_map_to_entry(map)) &&
6409 (current->vme_start <= end)) {
6410 vm_map_simplify_entry(map, current);
6411 current = current->vme_next;
6412 }
6413
6414 vm_map_unlock(map);
6415 return KERN_SUCCESS;
6416 }
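
/*
 * Usage sketch (hypothetical caller): making a range read-only, the
 * way an mprotect(PROT_READ)-style request would reach this layer:
 *
 *	kern_return_t kr;
 *
 *	kr = vm_map_protect(map,
 *	    vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
 *	    vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)),
 *	    VM_PROT_READ,
 *	    FALSE);	// set current, not maximum, protection
 *
 * Passing set_max == TRUE instead lowers max_protection; since new
 * protections are always checked against the maximum, that lowering
 * cannot be undone for the range.
 */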
6417
6418 /*
6419 * vm_map_inherit:
6420 *
6421 * Sets the inheritance of the specified address
6422 * range in the target map. Inheritance
6423 * affects how the map will be shared with
6424 * child maps at the time of vm_map_fork.
6425 */
6426 kern_return_t
6427 vm_map_inherit(
6428 vm_map_t map,
6429 vm_map_offset_t start,
6430 vm_map_offset_t end,
6431 vm_inherit_t new_inheritance)
6432 {
6433 vm_map_entry_t entry;
6434 vm_map_entry_t temp_entry;
6435
6436 vm_map_lock(map);
6437
6438 VM_MAP_RANGE_CHECK(map, start, end);
6439
6440 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
6441 vm_map_unlock(map);
6442 return KERN_INVALID_ADDRESS;
6443 }
6444
6445 if (vm_map_lookup_entry(map, start, &temp_entry)) {
6446 entry = temp_entry;
6447 } else {
6448 temp_entry = temp_entry->vme_next;
6449 entry = temp_entry;
6450 }
6451
6452 /* first check entire range for submaps which can't support the */
6453 /* given inheritance. */
6454 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6455 if (entry->is_sub_map) {
6456 if (new_inheritance == VM_INHERIT_COPY) {
6457 vm_map_unlock(map);
6458 return KERN_INVALID_ARGUMENT;
6459 }
6460 }
6461
6462 entry = entry->vme_next;
6463 }
6464
6465 entry = temp_entry;
6466 if (entry != vm_map_to_entry(map)) {
6467 /* clip and unnest if necessary */
6468 vm_map_clip_start(map, entry, start);
6469 }
6470
6471 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6472 vm_map_clip_end(map, entry, end);
6473 if (entry->is_sub_map) {
6474 /* clip did unnest if needed */
6475 assert(!entry->use_pmap);
6476 }
6477
6478 entry->inheritance = new_inheritance;
6479
6480 entry = entry->vme_next;
6481 }
6482
6483 vm_map_unlock(map);
6484 return KERN_SUCCESS;
6485 }
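
/*
 * Usage sketch (hypothetical caller): keeping a range out of any child
 * created by vm_map_fork():
 *
 *	kern_return_t kr;
 *
 *	kr = vm_map_inherit(map, start, end, VM_INHERIT_NONE);
 *
 * Note that VM_INHERIT_COPY is rejected above for ranges containing
 * submaps, so such a request returns KERN_INVALID_ARGUMENT.
 */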
6486
6487 /*
6488 * Update the accounting for the amount of wired memory in this map. If the user has
6489 * exceeded the defined limits, then we fail. Wiring on behalf of the kernel never fails.
6490 */
6491
6492 static kern_return_t
6493 add_wire_counts(
6494 vm_map_t map,
6495 vm_map_entry_t entry,
6496 boolean_t user_wire)
6497 {
6498 vm_map_size_t size;
6499
6500 bool first_wire = entry->wired_count == 0 && entry->user_wired_count == 0;
6501
6502 if (user_wire) {
6503 unsigned int total_wire_count = vm_page_wire_count + vm_lopage_free_count;
6504
6505 /*
6506 * We're wiring memory at the request of the user. Check if this is the first time the user is wiring
6507 * this map entry.
6508 */
6509
6510 if (entry->user_wired_count == 0) {
6511 size = entry->vme_end - entry->vme_start;
6512
6513 /*
6514 * Since this is the first time the user is wiring this map entry, check to see if we're
6515 * exceeding the user wire limits. There is a per map limit which is the smaller of either
6516 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value. There is also
6517 * a system-wide limit on the amount of memory all users can wire. If the user is over either
6518 * limit, then we fail.
6519 */
6520
6521 if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6522 size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6523 if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6524 #if DEVELOPMENT || DEBUG
6525 if (panic_on_mlock_failure) {
6526 panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6527 }
6528 #endif /* DEVELOPMENT || DEBUG */
6529 os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6530 } else {
6531 os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6532 #if DEVELOPMENT || DEBUG
6533 if (panic_on_mlock_failure) {
6534 panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6535 }
6536 #endif /* DEVELOPMENT || DEBUG */
6537 }
6538 return KERN_RESOURCE_SHORTAGE;
6539 }
6540
6541 /*
6542 * The first time the user wires an entry, we also increment the wired_count and add this to
6543 * the total that has been wired in the map.
6544 */
6545
6546 if (entry->wired_count >= MAX_WIRE_COUNT) {
6547 return KERN_FAILURE;
6548 }
6549
6550 entry->wired_count++;
6551 map->user_wire_size += size;
6552 }
6553
6554 if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6555 return KERN_FAILURE;
6556 }
6557
6558 entry->user_wired_count++;
6559 } else {
6560 /*
6561 * The kernel's wiring the memory. Just bump the count and continue.
6562 */
6563
6564 if (entry->wired_count >= MAX_WIRE_COUNT) {
6565 panic("vm_map_wire: too many wirings");
6566 }
6567
6568 entry->wired_count++;
6569 }
6570
6571 if (first_wire) {
6572 vme_btref_consider_and_set(entry, __builtin_frame_address(0));
6573 }
6574
6575 return KERN_SUCCESS;
6576 }
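
/*
 * Worked example of the per-map user limit (illustrative numbers):
 * with map->user_wire_size == 48MB, an effective per-map limit of
 * 64MB, and a first-time request to wire a 32MB entry,
 * size + map->user_wire_size == 80MB exceeds the limit, so
 * vm_add_wire_count_over_user_limit is bumped and the caller sees
 * KERN_RESOURCE_SHORTAGE; no counts on the entry or map are changed.
 */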
6577
6578 /*
6579 * Update the memory wiring accounting now that the given map entry is being unwired.
6580 */
6581
6582 static void
6583 subtract_wire_counts(
6584 vm_map_t map,
6585 vm_map_entry_t entry,
6586 boolean_t user_wire)
6587 {
6588 if (user_wire) {
6589 /*
6590 * We're unwiring memory at the request of the user. See if we're removing the last user wire reference.
6591 */
6592
6593 if (entry->user_wired_count == 1) {
6594 /*
6595 * We're removing the last user wire reference. Decrement the wired_count and the total
6596 * user wired memory for this map.
6597 */
6598
6599 assert(entry->wired_count >= 1);
6600 entry->wired_count--;
6601 map->user_wire_size -= entry->vme_end - entry->vme_start;
6602 }
6603
6604 assert(entry->user_wired_count >= 1);
6605 entry->user_wired_count--;
6606 } else {
6607 /*
6608 * The kernel is unwiring the memory. Just update the count.
6609 */
6610
6611 assert(entry->wired_count >= 1);
6612 entry->wired_count--;
6613 }
6614
6615 vme_btref_consider_and_put(entry);
6616 }
6617
6618 int cs_executable_wire = 0;
6619
6620 /*
6621 * vm_map_wire:
6622 *
6623 * Sets the pageability of the specified address range in the
6624 * target map as wired. Regions specified as not pageable require
6625 * locked-down physical memory and physical page maps. The
6626 * access_type variable indicates types of accesses that must not
6627 * generate page faults. This is checked against protection of
6628 * memory being locked-down.
6629 *
6630 * The map must not be locked, but a reference must remain to the
6631 * map throughout the call.
6632 */
6633 static kern_return_t
6634 vm_map_wire_nested(
6635 vm_map_t map,
6636 vm_map_offset_t start,
6637 vm_map_offset_t end,
6638 vm_prot_t caller_prot,
6639 vm_tag_t tag,
6640 boolean_t user_wire,
6641 pmap_t map_pmap,
6642 vm_map_offset_t pmap_addr,
6643 ppnum_t *physpage_p)
6644 {
6645 vm_map_entry_t entry;
6646 vm_prot_t access_type;
6647 struct vm_map_entry *first_entry, tmp_entry;
6648 vm_map_t real_map;
6649 vm_map_offset_t s, e;
6650 kern_return_t rc;
6651 boolean_t need_wakeup;
6652 boolean_t main_map = FALSE;
6653 wait_interrupt_t interruptible_state;
6654 thread_t cur_thread;
6655 unsigned int last_timestamp;
6656 vm_map_size_t size;
6657 boolean_t wire_and_extract;
6658 vm_prot_t extra_prots;
6659
6660 extra_prots = VM_PROT_COPY;
6661 extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6662 #if XNU_TARGET_OS_OSX
6663 if (map->pmap == kernel_pmap ||
6664 !vm_map_cs_enforcement(map)) {
6665 extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6666 }
6667 #endif /* XNU_TARGET_OS_OSX */
6668 #if CODE_SIGNING_MONITOR
6669 if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
6670 extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6671 }
6672 #endif /* CODE_SIGNING_MONITOR */
6673
6674 access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6675
6676 wire_and_extract = FALSE;
6677 if (physpage_p != NULL) {
6678 /*
6679 * The caller wants the physical page number of the
6680 * wired page. We return only one physical page number
6681 * so this works for only one page at a time.
6682 */
6683 if ((end - start) != PAGE_SIZE) {
6684 return KERN_INVALID_ARGUMENT;
6685 }
6686 wire_and_extract = TRUE;
6687 *physpage_p = 0;
6688 }
6689
6690 vm_map_lock(map);
6691 if (map_pmap == NULL) {
6692 main_map = TRUE;
6693 }
6694 last_timestamp = map->timestamp;
6695
6696 VM_MAP_RANGE_CHECK(map, start, end);
6697 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6698 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6699
6700 if (start == end) {
6701 /* We wired what the caller asked for, zero pages */
6702 vm_map_unlock(map);
6703 return KERN_SUCCESS;
6704 }
6705
6706 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
6707 vm_map_unlock(map);
6708 return KERN_INVALID_ADDRESS;
6709 }
6710
6711 need_wakeup = FALSE;
6712 cur_thread = current_thread();
6713
6714 s = start;
6715 rc = KERN_SUCCESS;
6716
6717 if (vm_map_lookup_entry(map, s, &first_entry)) {
6718 entry = first_entry;
6719 /*
6720 * vm_map_clip_start will be done later.
6721 * We don't want to unnest any nested submaps here !
6722 */
6723 } else {
6724 /* Start address is not in map */
6725 rc = KERN_INVALID_ADDRESS;
6726 goto done;
6727 }
6728
6729 while ((entry != vm_map_to_entry(map)) && (s < end)) {
6730 /*
6731 * At this point, we have wired from "start" to "s".
6732 * We still need to wire from "s" to "end".
6733 *
6734 * "entry" hasn't been clipped, so it could start before "s"
6735 * and/or end after "end".
6736 */
6737
6738 /* "e" is how far we want to wire in this entry */
6739 e = entry->vme_end;
6740 if (e > end) {
6741 e = end;
6742 }
6743
6744 /*
6745 		 * If another thread is wiring/unwiring this entry, then
6746 		 * block after arranging for that thread to wake us up.
6747 */
6748 if (entry->in_transition) {
6749 wait_result_t wait_result;
6750
6751 /*
6752 * We have not clipped the entry. Make sure that
6753 * the start address is in range so that the lookup
6754 * below will succeed.
6755 * "s" is the current starting point: we've already
6756 * wired from "start" to "s" and we still have
6757 * to wire from "s" to "end".
6758 */
6759
6760 entry->needs_wakeup = TRUE;
6761
6762 /*
6763 * wake up anybody waiting on entries that we have
6764 * already wired.
6765 */
6766 if (need_wakeup) {
6767 vm_map_entry_wakeup(map);
6768 need_wakeup = FALSE;
6769 }
6770 /*
6771 * User wiring is interruptible
6772 */
6773 wait_result = vm_map_entry_wait(map,
6774 (user_wire) ? THREAD_ABORTSAFE :
6775 THREAD_UNINT);
6776 if (user_wire && wait_result == THREAD_INTERRUPTED) {
6777 /*
6778 * undo the wirings we have done so far
6779 * We do not clear the needs_wakeup flag,
6780 * because we cannot tell if we were the
6781 * only one waiting.
6782 */
6783 rc = KERN_FAILURE;
6784 goto done;
6785 }
6786
6787 /*
6788 			 * Cannot avoid a lookup here. Reset the timestamp.
6789 */
6790 last_timestamp = map->timestamp;
6791
6792 /*
6793 			 * The entry could have been clipped, so look it up again.
6794 			 * The worst that can happen is that it no longer exists.
6795 */
6796 if (!vm_map_lookup_entry(map, s, &first_entry)) {
6797 /*
6798 				 * User: undo everything up to the previous
6799 				 * entry. Let vm_map_unwire worry about
6800 * checking the validity of the range.
6801 */
6802 rc = KERN_FAILURE;
6803 goto done;
6804 }
6805 entry = first_entry;
6806 continue;
6807 }
6808
6809 if (entry->is_sub_map) {
6810 vm_map_offset_t sub_start;
6811 vm_map_offset_t sub_end;
6812 vm_map_offset_t local_start;
6813 vm_map_offset_t local_end;
6814 pmap_t pmap;
6815
6816 if (wire_and_extract) {
6817 /*
6818 * Wiring would result in copy-on-write
6819 * which would not be compatible with
6820 * the sharing we have with the original
6821 * provider of this memory.
6822 */
6823 rc = KERN_INVALID_ARGUMENT;
6824 goto done;
6825 }
6826
6827 vm_map_clip_start(map, entry, s);
6828 vm_map_clip_end(map, entry, end);
6829
6830 sub_start = VME_OFFSET(entry);
6831 sub_end = entry->vme_end;
6832 sub_end += VME_OFFSET(entry) - entry->vme_start;
6833
6834 local_end = entry->vme_end;
6835 if (map_pmap == NULL) {
6836 vm_object_t object;
6837 vm_object_offset_t offset;
6838 vm_prot_t prot;
6839 boolean_t wired;
6840 vm_map_entry_t local_entry;
6841 vm_map_version_t version;
6842 vm_map_t lookup_map;
6843
6844 if (entry->use_pmap) {
6845 pmap = VME_SUBMAP(entry)->pmap;
6846 					/* the ppc implementation requires that */
6847 					/* submap pmap address ranges line */
6848 					/* up with the parent map */
6849 #ifdef notdef
6850 pmap_addr = sub_start;
6851 #endif
6852 pmap_addr = s;
6853 } else {
6854 pmap = map->pmap;
6855 pmap_addr = s;
6856 }
6857
6858 if (entry->wired_count) {
6859 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6860 goto done;
6861 }
6862
6863 /*
6864 * The map was not unlocked:
6865 * no need to goto re-lookup.
6866 * Just go directly to next entry.
6867 */
6868 entry = entry->vme_next;
6869 s = entry->vme_start;
6870 continue;
6871 }
6872
6873 /* call vm_map_lookup_and_lock_object to */
6874 				/* cause any needs_copy to be */
6875 /* evaluated */
6876 local_start = entry->vme_start;
6877 lookup_map = map;
6878 vm_map_lock_write_to_read(map);
6879 rc = vm_map_lookup_and_lock_object(
6880 &lookup_map, local_start,
6881 (access_type | extra_prots),
6882 OBJECT_LOCK_EXCLUSIVE,
6883 &version, &object,
6884 &offset, &prot, &wired,
6885 NULL,
6886 &real_map, NULL);
6887 if (rc != KERN_SUCCESS) {
6888 vm_map_unlock_read(lookup_map);
6889 assert(map_pmap == NULL);
6890 vm_map_unwire(map, start,
6891 s, user_wire);
6892 return rc;
6893 }
6894 vm_object_unlock(object);
6895 if (real_map != lookup_map) {
6896 vm_map_unlock(real_map);
6897 }
6898 vm_map_unlock_read(lookup_map);
6899 vm_map_lock(map);
6900
6901 /* we unlocked, so must re-lookup */
6902 if (!vm_map_lookup_entry(map,
6903 local_start,
6904 &local_entry)) {
6905 rc = KERN_FAILURE;
6906 goto done;
6907 }
6908
6909 /*
6910 * entry could have been "simplified",
6911 * so re-clip
6912 */
6913 entry = local_entry;
6914 assert(s == local_start);
6915 vm_map_clip_start(map, entry, s);
6916 vm_map_clip_end(map, entry, end);
6917 /* re-compute "e" */
6918 e = entry->vme_end;
6919 if (e > end) {
6920 e = end;
6921 }
6922
6923 /* did we have a change of type? */
6924 if (!entry->is_sub_map) {
6925 last_timestamp = map->timestamp;
6926 continue;
6927 }
6928 } else {
6929 local_start = entry->vme_start;
6930 pmap = map_pmap;
6931 }
6932
6933 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6934 goto done;
6935 }
6936
6937 entry->in_transition = TRUE;
6938
6939 vm_map_unlock(map);
6940 rc = vm_map_wire_nested(VME_SUBMAP(entry),
6941 sub_start, sub_end,
6942 caller_prot, tag,
6943 user_wire, pmap, pmap_addr,
6944 NULL);
6945 vm_map_lock(map);
6946
6947 /*
6948 * Find the entry again. It could have been clipped
6949 * after we unlocked the map.
6950 */
6951 if (!vm_map_lookup_entry(map, local_start,
6952 &first_entry)) {
6953 panic("vm_map_wire: re-lookup failed");
6954 }
6955 entry = first_entry;
6956
6957 assert(local_start == s);
6958 /* re-compute "e" */
6959 e = entry->vme_end;
6960 if (e > end) {
6961 e = end;
6962 }
6963
6964 last_timestamp = map->timestamp;
6965 while ((entry != vm_map_to_entry(map)) &&
6966 (entry->vme_start < e)) {
6967 assert(entry->in_transition);
6968 entry->in_transition = FALSE;
6969 if (entry->needs_wakeup) {
6970 entry->needs_wakeup = FALSE;
6971 need_wakeup = TRUE;
6972 }
6973 if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6974 subtract_wire_counts(map, entry, user_wire);
6975 }
6976 entry = entry->vme_next;
6977 }
6978 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
6979 goto done;
6980 }
6981
6982 /* no need to relookup again */
6983 s = entry->vme_start;
6984 continue;
6985 }
6986
6987 /*
6988 * If this entry is already wired then increment
6989 * the appropriate wire reference count.
6990 */
6991 if (entry->wired_count) {
6992 if ((entry->protection & access_type) != access_type) {
6993 /* found a protection problem */
6994
6995 /*
6996 * XXX FBDP
6997 * We should always return an error
6998 * in this case but since we didn't
6999 * enforce it before, let's do
7000 * it only for the new "wire_and_extract"
7001 * code path for now...
7002 */
7003 if (wire_and_extract) {
7004 rc = KERN_PROTECTION_FAILURE;
7005 goto done;
7006 }
7007 }
7008
7009 /*
7010 * entry is already wired down, get our reference
7011 * after clipping to our range.
7012 */
7013 vm_map_clip_start(map, entry, s);
7014 vm_map_clip_end(map, entry, end);
7015
7016 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
7017 goto done;
7018 }
7019
7020 if (wire_and_extract) {
7021 vm_object_t object;
7022 vm_object_offset_t offset;
7023 vm_page_t m;
7024
7025 /*
7026 * We don't have to "wire" the page again
7027 				 * but we still have to "extract" its
7028 * physical page number, after some sanity
7029 * checks.
7030 */
7031 assert((entry->vme_end - entry->vme_start)
7032 == PAGE_SIZE);
7033 assert(!entry->needs_copy);
7034 assert(!entry->is_sub_map);
7035 assert(VME_OBJECT(entry));
7036 if (((entry->vme_end - entry->vme_start)
7037 != PAGE_SIZE) ||
7038 entry->needs_copy ||
7039 entry->is_sub_map ||
7040 VME_OBJECT(entry) == VM_OBJECT_NULL) {
7041 rc = KERN_INVALID_ARGUMENT;
7042 goto done;
7043 }
7044
7045 object = VME_OBJECT(entry);
7046 offset = VME_OFFSET(entry);
7047 /* need exclusive lock to update m->dirty */
7048 if (entry->protection & VM_PROT_WRITE) {
7049 vm_object_lock(object);
7050 } else {
7051 vm_object_lock_shared(object);
7052 }
7053 m = vm_page_lookup(object, offset);
7054 assert(m != VM_PAGE_NULL);
7055 assert(VM_PAGE_WIRED(m));
7056 if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
7057 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
7058 if (entry->protection & VM_PROT_WRITE) {
7059 vm_object_lock_assert_exclusive(
7060 object);
7061 m->vmp_dirty = TRUE;
7062 }
7063 } else {
7064 /* not already wired !? */
7065 *physpage_p = 0;
7066 }
7067 vm_object_unlock(object);
7068 }
7069
7070 /* map was not unlocked: no need to relookup */
7071 entry = entry->vme_next;
7072 s = entry->vme_start;
7073 continue;
7074 }
7075
7076 /*
7077 * Unwired entry or wire request transmitted via submap
7078 */
7079
7080 /*
7081 * Wiring would copy the pages to the shadow object.
7082 * The shadow object would not be code-signed so
7083 * attempting to execute code from these copied pages
7084 * would trigger a code-signing violation.
7085 */
7086
7087 if ((entry->protection & VM_PROT_EXECUTE)
7088 #if XNU_TARGET_OS_OSX
7089 &&
7090 map->pmap != kernel_pmap &&
7091 (vm_map_cs_enforcement(map)
7092 #if __arm64__
7093 || !VM_MAP_IS_EXOTIC(map)
7094 #endif /* __arm64__ */
7095 )
7096 #endif /* XNU_TARGET_OS_OSX */
7097 #if CODE_SIGNING_MONITOR
7098 &&
7099 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
7100 #endif
7101 ) {
7102 #if MACH_ASSERT
7103 printf("pid %d[%s] wiring executable range from "
7104 "0x%llx to 0x%llx: rejected to preserve "
7105 "code-signing\n",
7106 proc_selfpid(),
7107 (get_bsdtask_info(current_task())
7108 ? proc_name_address(get_bsdtask_info(current_task()))
7109 : "?"),
7110 (uint64_t) entry->vme_start,
7111 (uint64_t) entry->vme_end);
7112 #endif /* MACH_ASSERT */
7113 DTRACE_VM2(cs_executable_wire,
7114 uint64_t, (uint64_t)entry->vme_start,
7115 uint64_t, (uint64_t)entry->vme_end);
7116 cs_executable_wire++;
7117 rc = KERN_PROTECTION_FAILURE;
7118 goto done;
7119 }
7120
7121 /*
7122 * Perform actions of vm_map_lookup that need the write
7123 * lock on the map: create a shadow object for a
7124 * copy-on-write region, or an object for a zero-fill
7125 * region.
7126 */
7127 size = entry->vme_end - entry->vme_start;
7128 /*
7129 * If wiring a copy-on-write page, we need to copy it now
7130 * even if we're only (currently) requesting read access.
7131 * This is aggressive, but once it's wired we can't move it.
7132 */
7133 if (entry->needs_copy) {
7134 if (wire_and_extract) {
7135 /*
7136 * We're supposed to share with the original
7137 * provider so should not be "needs_copy"
7138 */
7139 rc = KERN_INVALID_ARGUMENT;
7140 goto done;
7141 }
7142
7143 VME_OBJECT_SHADOW(entry, size,
7144 vm_map_always_shadow(map));
7145 entry->needs_copy = FALSE;
7146 } else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
7147 if (wire_and_extract) {
7148 /*
7149 * We're supposed to share with the original
7150 * provider so should already have an object.
7151 */
7152 rc = KERN_INVALID_ARGUMENT;
7153 goto done;
7154 }
7155 VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
7156 VME_OFFSET_SET(entry, (vm_object_offset_t)0);
7157 assert(entry->use_pmap);
7158 } else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7159 if (wire_and_extract) {
7160 /*
7161 * We're supposed to share with the original
7162 * provider so should not be COPY_SYMMETRIC.
7163 */
7164 rc = KERN_INVALID_ARGUMENT;
7165 goto done;
7166 }
7167 /*
7168 * Force an unrequested "copy-on-write" but only for
7169 * the range we're wiring.
7170 */
7171 // printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
7172 vm_map_clip_start(map, entry, s);
7173 vm_map_clip_end(map, entry, end);
7174 /* recompute "size" */
7175 size = entry->vme_end - entry->vme_start;
7176 /* make a shadow object */
7177 vm_object_t orig_object;
7178 vm_object_offset_t orig_offset;
7179 orig_object = VME_OBJECT(entry);
7180 orig_offset = VME_OFFSET(entry);
7181 VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
7182 if (VME_OBJECT(entry) != orig_object) {
7183 /*
7184 * This mapping has not been shared (or it would be
7185 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
7186 * not been copied-on-write (or it would be marked
7187 * as "needs_copy" and would have been handled above
7188 * and also already write-protected).
7189 * We still need to write-protect here to prevent
7190 * other threads from modifying these pages while
7191 * we're in the process of copying and wiring
7192 * the copied pages.
7193 * Since the mapping is neither shared nor COWed,
7194 * we only need to write-protect the PTEs for this
7195 * mapping.
7196 */
7197 vm_object_pmap_protect(orig_object,
7198 orig_offset,
7199 size,
7200 map->pmap,
7201 VM_MAP_PAGE_SIZE(map),
7202 entry->vme_start,
7203 entry->protection & ~VM_PROT_WRITE);
7204 }
7205 }
7206 if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7207 /*
7208 * Make the object COPY_DELAY to get a stable object
7209 * to wire.
7210 * That should avoid creating long shadow chains while
7211 * wiring/unwiring the same range repeatedly.
7212 * That also prevents part of the object from being
7213 * wired while another part is "needs_copy", which
7214 * could result in conflicting rules wrt copy-on-write.
7215 */
7216 vm_object_t object;
7217
7218 object = VME_OBJECT(entry);
7219 vm_object_lock(object);
7220 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7221 assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
7222 "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
7223 object, (uint64_t)object->vo_size,
7224 entry,
7225 (uint64_t)entry->vme_start,
7226 (uint64_t)entry->vme_end,
7227 (uint64_t)VME_OFFSET(entry),
7228 (uint64_t)size);
7229 assertf(object->ref_count == 1,
7230 "object %p ref_count %d\n",
7231 object, object->ref_count);
7232 assertf(!entry->needs_copy,
7233 "entry %p\n", entry);
7234 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7235 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
7236 }
7237 vm_object_unlock(object);
7238 }
7239
7240 vm_map_clip_start(map, entry, s);
7241 vm_map_clip_end(map, entry, end);
7242
7243 /* re-compute "e" */
7244 e = entry->vme_end;
7245 if (e > end) {
7246 e = end;
7247 }
7248
7249 /*
7250 * Check for holes and protection mismatch.
7251 * Holes: Next entry should be contiguous unless this
7252 * is the end of the region.
7253 * Protection: Access requested must be allowed, unless
7254 * wiring is by protection class
7255 */
7256 if ((entry->vme_end < end) &&
7257 ((entry->vme_next == vm_map_to_entry(map)) ||
7258 (entry->vme_next->vme_start > entry->vme_end))) {
7259 /* found a hole */
7260 rc = KERN_INVALID_ADDRESS;
7261 goto done;
7262 }
7263 if ((entry->protection & access_type) != access_type) {
7264 /* found a protection problem */
7265 rc = KERN_PROTECTION_FAILURE;
7266 goto done;
7267 }
7268
7269 assert(entry->wired_count == 0 && entry->user_wired_count == 0);
7270
7271 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
7272 goto done;
7273 }
7274
7275 entry->in_transition = TRUE;
7276
7277 /*
7278 * This entry might get split once we unlock the map.
7279 * In vm_fault_wire(), we need the current range as
7280 * defined by this entry. In order for this to work
7281 * along with a simultaneous clip operation, we make a
7282 * temporary copy of this entry and use that for the
7283 * wiring. Note that the underlying objects do not
7284 * change during a clip.
7285 */
7286 tmp_entry = *entry;
7287
7288 /*
7289 		 * The in_transition state guarantees that the entry
7290 		 * (or entries for this range, if a split occurred) will be
7291 * there when the map lock is acquired for the second time.
7292 */
7293 vm_map_unlock(map);
7294
7295 if (!user_wire && cur_thread != THREAD_NULL) {
7296 interruptible_state = thread_interrupt_level(THREAD_UNINT);
7297 } else {
7298 interruptible_state = THREAD_UNINT;
7299 }
7300
7301 if (map_pmap) {
7302 rc = vm_fault_wire(map,
7303 &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
7304 physpage_p);
7305 } else {
7306 rc = vm_fault_wire(map,
7307 &tmp_entry, caller_prot, tag, map->pmap,
7308 tmp_entry.vme_start,
7309 physpage_p);
7310 }
7311
7312 if (!user_wire && cur_thread != THREAD_NULL) {
7313 thread_interrupt_level(interruptible_state);
7314 }
7315
7316 vm_map_lock(map);
7317
7318 if (last_timestamp + 1 != map->timestamp) {
7319 /*
7320 * Find the entry again. It could have been clipped
7321 * after we unlocked the map.
7322 */
7323 if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7324 &first_entry)) {
7325 panic("vm_map_wire: re-lookup failed");
7326 }
7327
7328 entry = first_entry;
7329 }
7330
7331 last_timestamp = map->timestamp;
7332
7333 while ((entry != vm_map_to_entry(map)) &&
7334 (entry->vme_start < tmp_entry.vme_end)) {
7335 assert(entry->in_transition);
7336 entry->in_transition = FALSE;
7337 if (entry->needs_wakeup) {
7338 entry->needs_wakeup = FALSE;
7339 need_wakeup = TRUE;
7340 }
7341 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
7342 subtract_wire_counts(map, entry, user_wire);
7343 }
7344 entry = entry->vme_next;
7345 }
7346
7347 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
7348 goto done;
7349 }
7350
7351 if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
7352 (tmp_entry.vme_end != end) && /* AND, we are not at the end of the requested range */
7353 (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
7354 /* found a "new" hole */
7355 s = tmp_entry.vme_end;
7356 rc = KERN_INVALID_ADDRESS;
7357 goto done;
7358 }
7359
7360 s = entry->vme_start;
7361 } /* end while loop through map entries */
7362
7363 done:
7364 if (rc == KERN_SUCCESS) {
7365 /* repair any damage we may have made to the VM map */
7366 vm_map_simplify_range(map, start, end);
7367 }
7368
7369 vm_map_unlock(map);
7370
7371 /*
7372 * wake up anybody waiting on entries we wired.
7373 */
7374 if (need_wakeup) {
7375 vm_map_entry_wakeup(map);
7376 }
7377
7378 if (rc != KERN_SUCCESS) {
7379 /* undo what has been wired so far */
7380 vm_map_unwire_nested(map, start, s, user_wire,
7381 map_pmap, pmap_addr);
7382 if (physpage_p) {
7383 *physpage_p = 0;
7384 }
7385 }
7386
7387 return rc;
7388 }
7389
7390 kern_return_t
7391 vm_map_wire_external(
7392 vm_map_t map,
7393 vm_map_offset_t start,
7394 vm_map_offset_t end,
7395 vm_prot_t caller_prot,
7396 boolean_t user_wire)
7397 {
7398 kern_return_t kret;
7399
7400 kret = vm_map_wire_nested(map, start, end, caller_prot, vm_tag_bt(),
7401 user_wire, (pmap_t)NULL, 0, NULL);
7402 return kret;
7403 }
7404
7405 kern_return_t
7406 vm_map_wire_kernel(
7407 vm_map_t map,
7408 vm_map_offset_t start,
7409 vm_map_offset_t end,
7410 vm_prot_t caller_prot,
7411 vm_tag_t tag,
7412 boolean_t user_wire)
7413 {
7414 kern_return_t kret;
7415
7416 kret = vm_map_wire_nested(map, start, end, caller_prot, tag,
7417 user_wire, (pmap_t)NULL, 0, NULL);
7418 return kret;
7419 }
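
/*
 * Usage sketch (hypothetical caller): wiring a range on behalf of the
 * kernel with an explicit allocation tag, so the wired pages are
 * attributed to that tag and the user wire limits do not apply:
 *
 *	kern_return_t kr;
 *
 *	kr = vm_map_wire_kernel(map, start, end,
 *	    VM_PROT_READ | VM_PROT_WRITE,
 *	    VM_KERN_MEMORY_FILE,	// illustrative tag
 *	    FALSE);			// kernel wire, not user wire
 */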
7420
7421 kern_return_t
7422 vm_map_wire_and_extract_external(
7423 vm_map_t map,
7424 vm_map_offset_t start,
7425 vm_prot_t caller_prot,
7426 boolean_t user_wire,
7427 ppnum_t *physpage_p)
7428 {
7429 kern_return_t kret;
7430
7431 kret = vm_map_wire_nested(map,
7432 start,
7433 start + VM_MAP_PAGE_SIZE(map),
7434 caller_prot,
7435 vm_tag_bt(),
7436 user_wire,
7437 (pmap_t)NULL,
7438 0,
7439 physpage_p);
7440 if (kret != KERN_SUCCESS &&
7441 physpage_p != NULL) {
7442 *physpage_p = 0;
7443 }
7444 return kret;
7445 }
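
/*
 * Usage sketch (hypothetical caller): wiring exactly one VM map page
 * and recovering its physical page number, e.g. to program a device:
 *
 *	ppnum_t pp = 0;
 *	kern_return_t kr;
 *
 *	kr = vm_map_wire_and_extract_external(map, addr,
 *	    VM_PROT_READ | VM_PROT_WRITE,
 *	    TRUE,	// user wire
 *	    &pp);
 *	if (kr == KERN_SUCCESS) {
 *		// pp is the physical page backing "addr"
 *	}
 *
 * Larger ranges must use vm_map_wire_external(), which cannot extract
 * a page number.
 */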
7446
7447 /*
7448 * vm_map_unwire:
7449 *
7450  *	Sets the pageability of the specified address range in the target
7451  *	map as pageable. Regions specified must have been wired previously.
7452 *
7453 * The map must not be locked, but a reference must remain to the map
7454 * throughout the call.
7455 *
7456  *	The kernel will panic on failure. User unwire ignores holes and
7457  *	unwired and in-transition entries to avoid losing memory by leaving
7458 * it unwired.
7459 */
7460 static kern_return_t
7461 vm_map_unwire_nested(
7462 vm_map_t map,
7463 vm_map_offset_t start,
7464 vm_map_offset_t end,
7465 boolean_t user_wire,
7466 pmap_t map_pmap,
7467 vm_map_offset_t pmap_addr)
7468 {
7469 vm_map_entry_t entry;
7470 struct vm_map_entry *first_entry, tmp_entry;
7471 boolean_t need_wakeup;
7472 boolean_t main_map = FALSE;
7473 unsigned int last_timestamp;
7474
7475 vm_map_lock(map);
7476 if (map_pmap == NULL) {
7477 main_map = TRUE;
7478 }
7479 last_timestamp = map->timestamp;
7480
7481 VM_MAP_RANGE_CHECK(map, start, end);
7482 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
7483 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
7484
7485 if (start == end) {
7486 /* We unwired what the caller asked for: zero pages */
7487 vm_map_unlock(map);
7488 return KERN_SUCCESS;
7489 }
7490
7491 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
7492 vm_map_unlock(map);
7493 return KERN_INVALID_ADDRESS;
7494 }
7495
7496 if (vm_map_lookup_entry(map, start, &first_entry)) {
7497 entry = first_entry;
7498 /*
7499 * vm_map_clip_start will be done later.
7500 * We don't want to unnest any nested sub maps here !
7501 */
7502 } else {
7503 if (!user_wire) {
7504 panic("vm_map_unwire: start not found");
7505 }
7506 /* Start address is not in map. */
7507 vm_map_unlock(map);
7508 return KERN_INVALID_ADDRESS;
7509 }
7510
7511 if (entry->superpage_size) {
7512 /* superpages are always wired */
7513 vm_map_unlock(map);
7514 return KERN_INVALID_ADDRESS;
7515 }
7516
7517 need_wakeup = FALSE;
7518 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
7519 if (entry->in_transition) {
7520 /*
7521 * 1)
7522 * Another thread is wiring down this entry. Note
7523 * that if it is not for the other thread we would
7524 * be unwiring an unwired entry. This is not
7525 * permitted. If we wait, we will be unwiring memory
7526 * we did not wire.
7527 *
7528 * 2)
7529 * Another thread is unwiring this entry. We did not
7530 * have a reference to it, because if we did, this
7531 * entry would not be getting unwired now.
7532 */
7533 if (!user_wire) {
7534 /*
7535 * XXX FBDP
7536 * This could happen: there could be some
7537 * overlapping vslock/vsunlock operations
7538 * going on.
7539 * We should probably just wait and retry,
7540 * but then we have to be careful that this
7541 * entry could get "simplified" after
7542 * "in_transition" gets unset and before
7543 * we re-lookup the entry, so we would
7544 * have to re-clip the entry to avoid
7545 * re-unwiring what we have already unwired...
7546 * See vm_map_wire_nested().
7547 *
7548 * Or we could just ignore "in_transition"
7549 * here and proceed to decrement the wired
7550 * count(s) on this entry. That should be fine
7551 * as long as "wired_count" doesn't drop all
7552 * the way to 0 (and we should panic if THAT
7553 * happens).
7554 */
7555 panic("vm_map_unwire: in_transition entry");
7556 }
7557
7558 entry = entry->vme_next;
7559 continue;
7560 }
7561
7562 if (entry->is_sub_map) {
7563 vm_map_offset_t sub_start;
7564 vm_map_offset_t sub_end;
7565 vm_map_offset_t local_end;
7566 pmap_t pmap;
7567
7568 vm_map_clip_start(map, entry, start);
7569 vm_map_clip_end(map, entry, end);
7570
7571 sub_start = VME_OFFSET(entry);
7572 sub_end = entry->vme_end - entry->vme_start;
7573 sub_end += VME_OFFSET(entry);
7574 local_end = entry->vme_end;
7575 if (map_pmap == NULL) {
7576 if (entry->use_pmap) {
7577 pmap = VME_SUBMAP(entry)->pmap;
7578 pmap_addr = sub_start;
7579 } else {
7580 pmap = map->pmap;
7581 pmap_addr = start;
7582 }
7583 if (entry->wired_count == 0 ||
7584 (user_wire && entry->user_wired_count == 0)) {
7585 if (!user_wire) {
7586 panic("vm_map_unwire: entry is unwired");
7587 }
7588 entry = entry->vme_next;
7589 continue;
7590 }
7591
7592 /*
7593 * Check for holes
7594 * Holes: Next entry should be contiguous unless
7595 * this is the end of the region.
7596 */
7597 if (((entry->vme_end < end) &&
7598 ((entry->vme_next == vm_map_to_entry(map)) ||
7599 (entry->vme_next->vme_start
7600 > entry->vme_end)))) {
7601 if (!user_wire) {
7602 panic("vm_map_unwire: non-contiguous region");
7603 }
7604 /*
7605 * entry = entry->vme_next;
7606 * continue;
7607 */
7608 }
7609
7610 subtract_wire_counts(map, entry, user_wire);
7611
7612 if (entry->wired_count != 0) {
7613 entry = entry->vme_next;
7614 continue;
7615 }
7616
7617 entry->in_transition = TRUE;
7618 tmp_entry = *entry;/* see comment in vm_map_wire() */
7619
7620 /*
7621 * We can unlock the map now. The in_transition state
7622 * guarantees existence of the entry.
7623 */
7624 vm_map_unlock(map);
7625 vm_map_unwire_nested(VME_SUBMAP(entry),
7626 sub_start, sub_end, user_wire, pmap, pmap_addr);
7627 vm_map_lock(map);
7628
7629 if (last_timestamp + 1 != map->timestamp) {
7630 /*
7631 * Find the entry again. It could have been
7632 * clipped or deleted after we unlocked the map.
7633 */
7634 if (!vm_map_lookup_entry(map,
7635 tmp_entry.vme_start,
7636 &first_entry)) {
7637 if (!user_wire) {
7638 panic("vm_map_unwire: re-lookup failed");
7639 }
7640 entry = first_entry->vme_next;
7641 } else {
7642 entry = first_entry;
7643 }
7644 }
7645 last_timestamp = map->timestamp;
7646
7647 /*
7648 * clear transition bit for all constituent entries
7649 * that were in the original entry (saved in
7650 * tmp_entry). Also check for waiters.
7651 */
7652 while ((entry != vm_map_to_entry(map)) &&
7653 (entry->vme_start < tmp_entry.vme_end)) {
7654 assert(entry->in_transition);
7655 entry->in_transition = FALSE;
7656 if (entry->needs_wakeup) {
7657 entry->needs_wakeup = FALSE;
7658 need_wakeup = TRUE;
7659 }
7660 entry = entry->vme_next;
7661 }
7662 continue;
7663 } else {
7664 tmp_entry = *entry;
7665 vm_map_unlock(map);
7666 vm_map_unwire_nested(VME_SUBMAP(entry),
7667 sub_start, sub_end, user_wire, map_pmap,
7668 pmap_addr);
7669 vm_map_lock(map);
7670
7671 if (last_timestamp + 1 != map->timestamp) {
7672 /*
7673 * Find the entry again. It could have been
7674 * clipped or deleted after we unlocked the map.
7675 */
7676 if (!vm_map_lookup_entry(map,
7677 tmp_entry.vme_start,
7678 &first_entry)) {
7679 if (!user_wire) {
7680 panic("vm_map_unwire: re-lookup failed");
7681 }
7682 entry = first_entry->vme_next;
7683 } else {
7684 entry = first_entry;
7685 }
7686 }
7687 last_timestamp = map->timestamp;
7688 }
7689 }
7690
7691
7692 if ((entry->wired_count == 0) ||
7693 (user_wire && entry->user_wired_count == 0)) {
7694 if (!user_wire) {
7695 panic("vm_map_unwire: entry is unwired");
7696 }
7697
7698 entry = entry->vme_next;
7699 continue;
7700 }
7701
7702 assert(entry->wired_count > 0 &&
7703 (!user_wire || entry->user_wired_count > 0));
7704
7705 vm_map_clip_start(map, entry, start);
7706 vm_map_clip_end(map, entry, end);
7707
7708 /*
7709 * Check for holes
7710 * Holes: Next entry should be contiguous unless
7711 * this is the end of the region.
7712 */
7713 if (((entry->vme_end < end) &&
7714 ((entry->vme_next == vm_map_to_entry(map)) ||
7715 (entry->vme_next->vme_start > entry->vme_end)))) {
7716 if (!user_wire) {
7717 panic("vm_map_unwire: non-contiguous region");
7718 }
7719 entry = entry->vme_next;
7720 continue;
7721 }
7722
7723 subtract_wire_counts(map, entry, user_wire);
7724
7725 if (entry->wired_count != 0) {
7726 entry = entry->vme_next;
7727 continue;
7728 }
7729
7730 if (entry->zero_wired_pages) {
7731 entry->zero_wired_pages = FALSE;
7732 }
7733
7734 entry->in_transition = TRUE;
7735 tmp_entry = *entry; /* see comment in vm_map_wire() */
7736
7737 /*
7738 * We can unlock the map now. The in_transition state
7739 * guarantees existence of the entry.
7740 */
7741 vm_map_unlock(map);
7742 if (map_pmap) {
7743 vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
7744 pmap_addr, tmp_entry.vme_end);
7745 } else {
7746 vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
7747 tmp_entry.vme_start, tmp_entry.vme_end);
7748 }
7749 vm_map_lock(map);
7750
7751 if (last_timestamp + 1 != map->timestamp) {
7752 /*
7753 * Find the entry again. It could have been clipped
7754 * or deleted after we unlocked the map.
7755 */
7756 if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7757 &first_entry)) {
7758 if (!user_wire) {
7759 panic("vm_map_unwire: re-lookup failed");
7760 }
7761 entry = first_entry->vme_next;
7762 } else {
7763 entry = first_entry;
7764 }
7765 }
7766 last_timestamp = map->timestamp;
7767
7768 /*
7769 * clear transition bit for all constituent entries that
7770 * were in the original entry (saved in tmp_entry). Also
7771 * check for waiters.
7772 */
7773 while ((entry != vm_map_to_entry(map)) &&
7774 (entry->vme_start < tmp_entry.vme_end)) {
7775 assert(entry->in_transition);
7776 entry->in_transition = FALSE;
7777 if (entry->needs_wakeup) {
7778 entry->needs_wakeup = FALSE;
7779 need_wakeup = TRUE;
7780 }
7781 entry = entry->vme_next;
7782 }
7783 }
7784
7785 /*
7786 * We might have fragmented the address space when we wired this
7787 * range of addresses. Attempt to re-coalesce these VM map entries
7788 * with their neighbors now that they're no longer wired.
7789 * Under some circumstances, address space fragmentation can
7790 * prevent VM object shadow chain collapsing, which can cause
7791 * swap space leaks.
7792 */
7793 vm_map_simplify_range(map, start, end);
7794
7795 vm_map_unlock(map);
7796 /*
7797 * wake up anybody waiting on entries that we have unwired.
7798 */
7799 if (need_wakeup) {
7800 vm_map_entry_wakeup(map);
7801 }
7802 return KERN_SUCCESS;
7803 }
7804
7805 kern_return_t
7806 vm_map_unwire(
7807 vm_map_t map,
7808 vm_map_offset_t start,
7809 vm_map_offset_t end,
7810 boolean_t user_wire)
7811 {
7812 return vm_map_unwire_nested(map, start, end,
7813 user_wire, (pmap_t)NULL, 0);
7814 }
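/*
 * Editorial sketch of the lock-drop protocol used throughout the
 * wiring/unwiring code above, condensed for illustration: snapshot the
 * entry, mark it "in_transition", drop the map lock for the slow work,
 * then compare map timestamps to decide whether a re-lookup is needed.
 */
#if 0 /* pattern sketch only */
	last_timestamp = map->timestamp;
	entry->in_transition = TRUE;
	tmp_entry = *entry;             /* stable copy of the range */
	vm_map_unlock(map);

	/* ... slow work, e.g. vm_fault_unwire(), runs unlocked ... */

	vm_map_lock(map);
	if (last_timestamp + 1 != map->timestamp) {
		/* the map changed while unlocked: find the entry again */
		(void)vm_map_lookup_entry(map, tmp_entry.vme_start, &first_entry);
	}
	/* then clear in_transition on all constituent entries */
#endif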
7815
7816
7817 /*
7818 * vm_map_entry_zap: [ internal use only ]
7819 *
7820 * Remove the entry from the target map
7821 * and put it on a zap list.
7822 */
7823 static void
7824 vm_map_entry_zap(
7825 vm_map_t map,
7826 vm_map_entry_t entry,
7827 vm_map_zap_t zap)
7828 {
7829 vm_map_offset_t s, e;
7830
7831 s = entry->vme_start;
7832 e = entry->vme_end;
7833 assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7834 assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7835 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7836 assert(page_aligned(s));
7837 assert(page_aligned(e));
7838 }
7839 if (entry->map_aligned == TRUE) {
7840 assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7841 assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7842 }
7843 assert(entry->wired_count == 0);
7844 assert(entry->user_wired_count == 0);
7845 assert(!entry->vme_permanent);
7846
7847 vm_map_store_entry_unlink(map, entry, false);
7848 map->size -= e - s;
7849
7850 vm_map_zap_append(zap, entry);
7851 }
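/*
 * Editorial note: zapped entries are collected on a caller-provided
 * list and reclaimed only after the map lock is dropped. A minimal
 * usage sketch (this is the shape vm_map_remove_and_unlock() uses
 * further below):
 */
#if 0 /* usage sketch only */
	VM_MAP_ZAP_DECLARE(zap);

	vm_map_lock(map);
	/* ... call vm_map_entry_zap(map, entry, &zap) on doomed entries ... */
	vm_map_unlock(map);

	vm_map_zap_dispose(&zap);       /* free the entries outside the lock */
#endif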
7852
7853 static void
7854 vm_map_submap_pmap_clean(
7855 vm_map_t map,
7856 vm_map_offset_t start,
7857 vm_map_offset_t end,
7858 vm_map_t sub_map,
7859 vm_map_offset_t offset)
7860 {
7861 vm_map_offset_t submap_start;
7862 vm_map_offset_t submap_end;
7863 vm_map_size_t remove_size;
7864 vm_map_entry_t entry;
7865
7866 submap_end = offset + (end - start);
7867 submap_start = offset;
7868
7869 vm_map_lock_read(sub_map);
7870 if (vm_map_lookup_entry(sub_map, offset, &entry)) {
7871 remove_size = (entry->vme_end - entry->vme_start);
7872 if (offset > entry->vme_start) {
7873 remove_size -= offset - entry->vme_start;
7874 }
7875
7876
7877 if (submap_end < entry->vme_end) {
7878 remove_size -=
7879 entry->vme_end - submap_end;
7880 }
7881 if (entry->is_sub_map) {
7882 vm_map_submap_pmap_clean(
7883 sub_map,
7884 start,
7885 start + remove_size,
7886 VME_SUBMAP(entry),
7887 VME_OFFSET(entry));
7888 } else {
7889 if (map->mapped_in_other_pmaps &&
7890 os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7891 VME_OBJECT(entry) != NULL) {
7892 vm_object_pmap_protect_options(
7893 VME_OBJECT(entry),
7894 (VME_OFFSET(entry) +
7895 offset -
7896 entry->vme_start),
7897 remove_size,
7898 PMAP_NULL,
7899 PAGE_SIZE,
7900 entry->vme_start,
7901 VM_PROT_NONE,
7902 PMAP_OPTIONS_REMOVE);
7903 } else {
7904 pmap_remove(map->pmap,
7905 (addr64_t)start,
7906 (addr64_t)(start + remove_size));
7907 }
7908 }
7909 }
7910
7911 entry = entry->vme_next;
7912
7913 while ((entry != vm_map_to_entry(sub_map))
7914 && (entry->vme_start < submap_end)) {
7915 remove_size = (entry->vme_end - entry->vme_start);
7916 if (submap_end < entry->vme_end) {
7917 remove_size -= entry->vme_end - submap_end;
7918 }
7919 if (entry->is_sub_map) {
7920 vm_map_submap_pmap_clean(
7921 sub_map,
7922 (start + entry->vme_start) - offset,
7923 ((start + entry->vme_start) - offset) + remove_size,
7924 VME_SUBMAP(entry),
7925 VME_OFFSET(entry));
7926 } else {
7927 if (map->mapped_in_other_pmaps &&
7928 os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7929 VME_OBJECT(entry) != NULL) {
7930 vm_object_pmap_protect_options(
7931 VME_OBJECT(entry),
7932 VME_OFFSET(entry),
7933 remove_size,
7934 PMAP_NULL,
7935 PAGE_SIZE,
7936 entry->vme_start,
7937 VM_PROT_NONE,
7938 PMAP_OPTIONS_REMOVE);
7939 } else {
7940 pmap_remove(map->pmap,
7941 (addr64_t)((start + entry->vme_start)
7942 - offset),
7943 (addr64_t)(((start + entry->vme_start)
7944 - offset) + remove_size));
7945 }
7946 }
7947 entry = entry->vme_next;
7948 }
7949 vm_map_unlock_read(sub_map);
7950 return;
7951 }
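/*
 * Editorial worked example of the translation above: a parent range
 * [start, end) backed by a submap at offset "offset" covers the submap
 * range [offset, offset + (end - start)). With start = 0x5000,
 * end = 0x8000 and offset = 0x1000, the submap range is
 * [0x1000, 0x4000); a submap entry spanning [0x0, 0x2000) then has its
 * remove_size clipped from 0x2000 down to 0x1000.
 */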
7952
7953 /*
7954 * virt_memory_guard_ast:
7955 *
7956 * Handle the AST callout for a virtual memory guard:
7957 * raise an EXC_GUARD exception and terminate the task
7958 * if configured to do so.
7959 */
7960 void
7961 virt_memory_guard_ast(
7962 thread_t thread,
7963 mach_exception_data_type_t code,
7964 mach_exception_data_type_t subcode)
7965 {
7966 task_t task = get_threadtask(thread);
7967 assert(task != kernel_task);
7968 assert(task == current_task());
7969 kern_return_t sync_exception_result;
7970 uint32_t behavior;
7971
7972 behavior = task->task_exc_guard;
7973
7974 /* Is delivery enabled */
7975 if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7976 return;
7977 }
7978
7979 /* If only once, make sure we're that once */
7980 while (behavior & TASK_EXC_GUARD_VM_ONCE) {
7981 uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;
7982
7983 if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
7984 break;
7985 }
7986 behavior = task->task_exc_guard;
7987 if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7988 return;
7989 }
7990 }
7991
7992 const bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
7993 /* Raise exception synchronously and see if handler claimed it */
7994 sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode, fatal);
7995
7996 if (fatal) {
7997 /*
7998 * If Synchronous EXC_GUARD delivery was successful then
7999 * kill the process and return, else kill the process
8000 * and deliver the exception via EXC_CORPSE_NOTIFY.
8001 */
8002 if (sync_exception_result == KERN_SUCCESS) {
8003 task_bsdtask_kill(current_task());
8004 } else {
8005 exit_with_guard_exception(current_proc(), code, subcode);
8006 }
8007 } else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
8008 /*
8009 * If the synchronous EXC_GUARD delivery was not successful,
8010 * raise a simulated crash.
8011 */
8012 if (sync_exception_result != KERN_SUCCESS) {
8013 task_violated_guard(code, subcode, NULL, FALSE);
8014 }
8015 }
8016 }
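/*
 * Editorial sketch of the compare-and-swap loop above, reduced to its
 * essence: atomically clear the "deliver" bit exactly once, reloading
 * and retrying if another thread updates the flags concurrently.
 * Variable names are illustrative.
 */
#if 0 /* pattern sketch only */
	uint32_t old_flags = task->task_exc_guard;

	while (old_flags & TASK_EXC_GUARD_VM_ONCE) {
		uint32_t new_flags = old_flags & ~TASK_EXC_GUARD_VM_DELIVER;

		if (OSCompareAndSwap(old_flags, new_flags,
		    &task->task_exc_guard)) {
			break;  /* we won the race: we are the "once" */
		}
		old_flags = task->task_exc_guard;       /* raced: reload */
		if ((old_flags & TASK_EXC_GUARD_VM_DELIVER) == 0) {
			return; /* another thread already delivered */
		}
	}
#endif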
8017
8018 /*
8019 * vm_map_guard_exception:
8020 *
8021 * Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
8022 *
8023 * Right now, we do this when we find nothing mapped, or a
8024 * gap in the mapping when a user address space deallocate
8025 * was requested. We report the address of the first gap found.
8026 */
8027 static void
8028 vm_map_guard_exception(
8029 vm_map_offset_t gap_start,
8030 unsigned reason)
8031 {
8032 mach_exception_code_t code = 0;
8033 unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
8034 unsigned int target = 0; /* should we pass in pid associated with map? */
8035 mach_exception_data_type_t subcode = (uint64_t)gap_start;
8036 boolean_t fatal = FALSE;
8037
8038 task_t task = current_task_early();
8039
8040 /* Can't deliver exceptions to a NULL task (early boot) or kernel task */
8041 if (task == NULL || task == kernel_task) {
8042 return;
8043 }
8044
8045 EXC_GUARD_ENCODE_TYPE(code, guard_type);
8046 EXC_GUARD_ENCODE_FLAVOR(code, reason);
8047 EXC_GUARD_ENCODE_TARGET(code, target);
8048
8049 if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
8050 fatal = TRUE;
8051 }
8052 thread_guard_violation(current_thread(), code, subcode, fatal);
8053 }
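/*
 * Editorial sketch: how an EXC_GUARD code/subcode pair is assembled
 * with the encode macros used above (the exact bit layout lives in
 * <kern/exc_guard.h>). kGUARD_EXC_DEALLOC_GAP is the flavor used by
 * vm_map_delete() further below.
 */
#if 0 /* encoding sketch only */
	mach_exception_code_t      code = 0;
	mach_exception_data_type_t subcode;

	EXC_GUARD_ENCODE_TYPE(code, GUARD_TYPE_VIRT_MEMORY);
	EXC_GUARD_ENCODE_FLAVOR(code, kGUARD_EXC_DEALLOC_GAP);
	EXC_GUARD_ENCODE_TARGET(code, 0);
	subcode = (uint64_t)gap_start;  /* first unmapped address found */

	thread_guard_violation(current_thread(), code, subcode, fatal);
#endif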
8054
8055 static kern_return_t
8056 vm_map_delete_submap_recurse(
8057 vm_map_t submap,
8058 vm_map_offset_t submap_start,
8059 vm_map_offset_t submap_end)
8060 {
8061 vm_map_entry_t submap_entry;
8062
8063 /*
8064 * Verify that the submap does not contain any "permanent" entries
8065 * within the specified range.
8066 * We do not care about gaps.
8067 */
8068
8069 vm_map_lock(submap);
8070
8071 if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
8072 submap_entry = submap_entry->vme_next;
8073 }
8074
8075 for (;
8076 submap_entry != vm_map_to_entry(submap) &&
8077 submap_entry->vme_start < submap_end;
8078 submap_entry = submap_entry->vme_next) {
8079 if (submap_entry->vme_permanent) {
8080 /* "permanent" entry -> fail */
8081 vm_map_unlock(submap);
8082 return KERN_PROTECTION_FAILURE;
8083 }
8084 }
8085 /* no "permanent" entries in the range -> success */
8086 vm_map_unlock(submap);
8087 return KERN_SUCCESS;
8088 }
8089
8090 __abortlike
8091 static void
8092 __vm_map_delete_misaligned_panic(
8093 vm_map_t map,
8094 vm_map_offset_t start,
8095 vm_map_offset_t end)
8096 {
8097 panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
8098 map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
8099 }
8100
8101 __abortlike
8102 static void
8103 __vm_map_delete_failed_panic(
8104 vm_map_t map,
8105 vm_map_offset_t start,
8106 vm_map_offset_t end,
8107 kern_return_t kr)
8108 {
8109 panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
8110 map, (uint64_t)start, (uint64_t)end, kr);
8111 }
8112
8113 __abortlike
8114 static void
8115 __vm_map_delete_gap_panic(
8116 vm_map_t map,
8117 vm_map_offset_t where,
8118 vm_map_offset_t start,
8119 vm_map_offset_t end)
8120 {
8121 panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
8122 map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
8123 }
8124
8125 __abortlike
8126 static void
8127 __vm_map_delete_permanent_panic(
8128 vm_map_t map,
8129 vm_map_offset_t start,
8130 vm_map_offset_t end,
8131 vm_map_entry_t entry)
8132 {
8133 panic("vm_map_delete(%p,0x%llx,0x%llx): "
8134 "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
8135 map, (uint64_t)start, (uint64_t)end, entry,
8136 (uint64_t)entry->vme_start,
8137 (uint64_t)entry->vme_end);
8138 }
8139
8140 __options_decl(vm_map_delete_state_t, uint32_t, {
8141 VMDS_NONE = 0x0000,
8142
8143 VMDS_FOUND_GAP = 0x0001,
8144 VMDS_GAPS_OK = 0x0002,
8145
8146 VMDS_KERNEL_PMAP = 0x0004,
8147 VMDS_NEEDS_LOOKUP = 0x0008,
8148 VMDS_NEEDS_WAKEUP = 0x0010,
8149 VMDS_KERNEL_KMEMPTR = 0x0020
8150 });
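/*
 * Editorial note: vm_map_delete() below threads these bits through a
 * single "state" variable, e.g.:
 *
 *	state |= VMDS_NEEDS_LOOKUP;             (lock was dropped, re-lookup)
 *	if (state & VMDS_KERNEL_PMAP) { ... }   (kernel map: gaps panic)
 *	state &= ~VMDS_NEEDS_WAKEUP;            (wakeup has been issued)
 */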
8151
8152 /*
8153 * vm_map_delete: [ internal use only ]
8154 *
8155 * Deallocates the given address range from the target map.
8156 * Removes all user wirings. Unwires one kernel wiring if
8157 * VM_MAP_REMOVE_KUNWIRE is set. Waits for kernel wirings to go
8158 * away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set. Sleeps
8159 * interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
8160 *
8161 *
8162 * When the map is a kernel map, any error in removing mappings
8163 * will lead to a panic so that clients do not have to repeat the panic
8164 * code at each call site. If VM_MAP_REMOVE_INTERRUPTIBLE
8165 * is also passed, then KERN_ABORTED will not lead to a panic.
8166 *
8167 * This routine is called with map locked and leaves map locked.
8168 */
8169 static kmem_return_t
8170 vm_map_delete(
8171 vm_map_t map,
8172 vm_map_offset_t start,
8173 vm_map_offset_t end,
8174 vmr_flags_t flags,
8175 kmem_guard_t guard,
8176 vm_map_zap_t zap_list)
8177 {
8178 vm_map_entry_t entry, next;
8179 int interruptible;
8180 vm_map_offset_t gap_start = 0;
8181 vm_map_offset_t clear_in_transition_end = 0;
8182 __unused vm_map_offset_t save_start = start;
8183 __unused vm_map_offset_t save_end = end;
8184 vm_map_delete_state_t state = VMDS_NONE;
8185 kmem_return_t ret = { };
8186 vm_map_range_id_t range_id = 0;
8187 struct kmem_page_meta *meta = NULL;
8188 uint32_t size_idx, slot_idx;
8189 struct mach_vm_range slot;
8190
8191 if (vm_map_pmap(map) == kernel_pmap) {
8192 state |= VMDS_KERNEL_PMAP;
8193 range_id = kmem_addr_get_range(start, end - start);
8194 if (kmem_is_ptr_range(range_id)) {
8195 state |= VMDS_KERNEL_KMEMPTR;
8196 slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
8197 &size_idx, &slot);
8198 }
8199 }
8200
8201 if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
8202 state |= VMDS_GAPS_OK;
8203 }
8204
8205 if (map->corpse_source &&
8206 !(flags & VM_MAP_REMOVE_TO_OVERWRITE) &&
8207 !map->terminated) {
8208 /*
8209 * The map is being used for corpse-related diagnostics.
8210 * So skip any entry removal to avoid perturbing the map state.
8211 * The cleanup will happen in task_terminate_internal after the
8212 * call to task_port_no_senders.
8213 */
8214 goto out;
8215 }
8216
8217 interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
8218 THREAD_ABORTSAFE : THREAD_UNINT;
8219
8220 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
8221 (start & VM_MAP_PAGE_MASK(map))) {
8222 __vm_map_delete_misaligned_panic(map, start, end);
8223 }
8224
8225 if ((state & VMDS_GAPS_OK) == 0) {
8226 /*
8227 * If the map isn't terminated then all deletions must have
8228 * no gaps, and be within the [min, max) of the map.
8229 *
8230 * We got here without VM_MAP_RANGE_CHECK() being called,
8231 * and hence must validate bounds manually.
8232 *
8233 * It is worth noting that because vm_deallocate() will
8234 * round_page() the deallocation size, it's possible for "end"
8235 * to be 0 here due to overflow. We hence must treat it as being
8236 * beyond vm_map_max(map).
8237 *
8238 * Similarly, end < start means some wrap around happened,
8239 * which should cause an error or panic.
8240 */
8241 if (end == 0 || end > vm_map_max(map)) {
8242 state |= VMDS_FOUND_GAP;
8243 gap_start = vm_map_max(map);
8244 if (state & VMDS_KERNEL_PMAP) {
8245 __vm_map_delete_gap_panic(map,
8246 gap_start, start, end);
8247 }
8248 goto out;
8249 }
8250
8251 if (end < start) {
8252 if (state & VMDS_KERNEL_PMAP) {
8253 __vm_map_delete_gap_panic(map,
8254 vm_map_max(map), start, end);
8255 }
8256 ret.kmr_return = KERN_INVALID_ARGUMENT;
8257 goto out;
8258 }
8259
8260 if (start < vm_map_min(map)) {
8261 state |= VMDS_FOUND_GAP;
8262 gap_start = start;
8263 if (state & VMDS_KERNEL_PMAP) {
8264 __vm_map_delete_gap_panic(map,
8265 gap_start, start, end);
8266 }
8267 goto out;
8268 }
8269 } else {
8270 /*
8271 * If the map is terminated, we must accept start/end
8272 * being beyond the boundaries of the map as this is
8273 * how some of the mappings like commpage mappings
8274 * can be destroyed (they're outside of those bounds).
8275 *
8276 * end < start is still something we can't cope with,
8277 * so just bail.
8278 */
8279 if (end < start) {
8280 goto out;
8281 }
8282 }
8283
8284
8285 /*
8286 * Find the start of the region.
8287 *
8288 * If in a superpage, extend the range
8289 * to include the start of the mapping.
8290 */
8291 while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8292 if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8293 start = SUPERPAGE_ROUND_DOWN(start);
8294 } else {
8295 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8296 break;
8297 }
8298 }
8299
8300 if (entry->superpage_size) {
8301 end = SUPERPAGE_ROUND_UP(end);
8302 }
8303
8304 /*
8305 * Step through all entries in this region
8306 */
8307 for (vm_map_offset_t s = start; s < end;) {
8308 /*
8309 * At this point, we have deleted all the memory entries
8310 * in [start, s) and are proceeding with the [s, end) range.
8311 *
8312 * This loop might drop the map lock, and it is possible that
8313 * some memory was already reallocated within [start, s)
8314 * and we don't want to mess with those entries.
8315 *
8316 * Some of those entries could even have been re-assembled
8317 * with an entry after "s" (in vm_map_simplify_entry()), so
8318 * we may have to vm_map_clip_start() again.
8319 *
8320 * When clear_in_transition_end is set, we had marked
8321 * [start, clear_in_transition_end) as "in_transition"
8322 * during a previous iteration and we need to clear it.
8323 */
8324
8325 /*
8326 * Step 1: If needed (because we dropped locks),
8327 * lookup the entry again.
8328 *
8329 * If we're coming back from unwiring (Step 5),
8330 * we also need to mark the entries as no longer
8331 * in transition after that.
8332 */
8333
8334 if (state & VMDS_NEEDS_LOOKUP) {
8335 state &= ~VMDS_NEEDS_LOOKUP;
8336
8337 if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8338 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8339 }
8340
8341 if (state & VMDS_KERNEL_KMEMPTR) {
8342 kmem_validate_slot(s, meta, size_idx, slot_idx);
8343 }
8344 }
8345
8346 if (clear_in_transition_end) {
8347 for (vm_map_entry_t it = entry;
8348 it != vm_map_to_entry(map) &&
8349 it->vme_start < clear_in_transition_end;
8350 it = it->vme_next) {
8351 assert(it->in_transition);
8352 it->in_transition = FALSE;
8353 if (it->needs_wakeup) {
8354 it->needs_wakeup = FALSE;
8355 state |= VMDS_NEEDS_WAKEUP;
8356 }
8357 }
8358
8359 clear_in_transition_end = 0;
8360 }
8361
8362
8363 /*
8364 * Step 2: Perform various policy checks
8365 * before we do _anything_ to this entry.
8366 */
8367
8368 if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8369 if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8370 /*
8371 * Either we found a gap already,
8372 * or we are tearing down a map,
8373 * keep going.
8374 */
8375 } else if (state & VMDS_KERNEL_PMAP) {
8376 __vm_map_delete_gap_panic(map, s, start, end);
8377 } else if (s < end) {
8378 state |= VMDS_FOUND_GAP;
8379 gap_start = s;
8380 }
8381
8382 if (entry == vm_map_to_entry(map) ||
8383 end <= entry->vme_start) {
8384 break;
8385 }
8386
8387 s = entry->vme_start;
8388 }
8389
8390 if (state & VMDS_KERNEL_PMAP) {
8391 /*
8392 * In the kernel map and its submaps,
8393 * permanent entries never die, even
8394 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8395 */
8396 if (entry->vme_permanent) {
8397 __vm_map_delete_permanent_panic(map, start, end, entry);
8398 }
8399
8400 if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8401 end = entry->vme_end;
8402 flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8403 }
8404
8405 /*
8406 * In the kernel map and its submaps,
8407 * the removal of an atomic/guarded entry is strict.
8408 *
8409 * An atomic entry is processed only if it was
8410 * specifically targeted.
8411 *
8412 * We might have deleted non-atomic entries before
8413 * we reach this point, however...
8414 */
8415 kmem_entry_validate_guard(map, entry,
8416 start, end - start, guard);
8417 }
8418
8419 /*
8420 * Step 2.1: handle "permanent" and "submap" entries
8421 * *before* clipping to avoid triggering some unnecessary
8422 * un-nesting of the shared region.
8423 */
8424 if (entry->vme_permanent && entry->is_sub_map) {
8425 // printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8426 /*
8427 * Un-mapping a "permanent" mapping of a user-space
8428 * submap is not allowed unless...
8429 */
8430 if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8431 /*
8432 * a. explicitly requested by the kernel caller.
8433 */
8434 // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8435 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8436 developer_mode_state()) {
8437 /*
8438 * b. we're in "developer" mode (for
8439 * breakpoints, dtrace probes, ...).
8440 */
8441 // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8442 } else if (map->terminated) {
8443 /*
8444 * c. this is the final address space cleanup.
8445 */
8446 // printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8447 } else {
8448 vm_map_offset_t submap_start, submap_end;
8449 kern_return_t submap_kr;
8450
8451 /*
8452 * Check if there are any "permanent" mappings
8453 * in this range in the submap.
8454 */
8455 if (entry->in_transition) {
8456 /* can that even happen ? */
8457 goto in_transition;
8458 }
8459 /* compute the clipped range in the submap */
8460 submap_start = s - entry->vme_start;
8461 submap_start += VME_OFFSET(entry);
8462 submap_end = end - entry->vme_start;
8463 submap_end += VME_OFFSET(entry);
8464 submap_kr = vm_map_delete_submap_recurse(
8465 VME_SUBMAP(entry),
8466 submap_start,
8467 submap_end);
8468 if (submap_kr != KERN_SUCCESS) {
8469 /*
8470 * There are some "permanent" mappings
8471 * in the submap: we are not allowed
8472 * to remove this range.
8473 */
8474 printf("%d[%s] removing permanent submap entry "
8475 "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8476 proc_selfpid(),
8477 (get_bsdtask_info(current_task())
8478 ? proc_name_address(get_bsdtask_info(current_task()))
8479 : "?"), entry,
8480 (uint64_t)entry->vme_start,
8481 (uint64_t)entry->vme_end,
8482 entry->protection,
8483 entry->max_protection);
8484 DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8485 vm_map_entry_t, entry,
8486 vm_map_offset_t, entry->vme_start,
8487 vm_map_offset_t, entry->vme_end,
8488 vm_prot_t, entry->protection,
8489 vm_prot_t, entry->max_protection,
8490 int, VME_ALIAS(entry));
8491 ret.kmr_return = KERN_PROTECTION_FAILURE;
8492 goto out;
8493 }
8494 /* no permanent mappings: proceed */
8495 }
8496 }
8497
8498 /*
8499 * Step 3: Perform any clipping needed.
8500 *
8501 * After this, "entry" starts at "s", ends before "end"
8502 */
8503
8504 if (entry->vme_start < s) {
8505 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8506 entry->map_aligned &&
8507 !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8508 /*
8509 * The entry will no longer be map-aligned
8510 * after clipping and the caller said it's OK.
8511 */
8512 entry->map_aligned = FALSE;
8513 }
8514 vm_map_clip_start(map, entry, s);
8515 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8516 }
8517
8518 if (end < entry->vme_end) {
8519 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8520 entry->map_aligned &&
8521 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8522 /*
8523 * The entry will no longer be map-aligned
8524 * after clipping and the caller said it's OK.
8525 */
8526 entry->map_aligned = FALSE;
8527 }
8528 vm_map_clip_end(map, entry, end);
8529 }
8530
8531 if (entry->vme_permanent && entry->is_sub_map) {
8532 /*
8533 * We already went through step 2.1 which did not deny
8534 * the removal of this "permanent" and "is_sub_map"
8535 * entry.
8536 * Now that we've clipped what we actually want to
8537 * delete, undo the "permanent" part to allow the
8538 * removal to proceed.
8539 */
8540 DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8541 vm_map_entry_t, entry,
8542 vm_map_offset_t, entry->vme_start,
8543 vm_map_offset_t, entry->vme_end,
8544 vm_prot_t, entry->protection,
8545 vm_prot_t, entry->max_protection,
8546 int, VME_ALIAS(entry));
8547 entry->vme_permanent = false;
8548 }
8549
8550 assert(s == entry->vme_start);
8551 assert(entry->vme_end <= end);
8552
8553
8554 /*
8555 * Step 4: If the entry is in flux, wait for this to resolve.
8556 */
8557
8558 if (entry->in_transition) {
8559 wait_result_t wait_result;
8560
8561 in_transition:
8562 /*
8563 * Another thread is wiring/unwiring this entry.
8564 * Let the other thread know we are waiting.
8565 */
8566
8567 entry->needs_wakeup = TRUE;
8568
8569 /*
8570 * wake up anybody waiting on entries that we have
8571 * already unwired/deleted.
8572 */
8573 if (state & VMDS_NEEDS_WAKEUP) {
8574 vm_map_entry_wakeup(map);
8575 state &= ~VMDS_NEEDS_WAKEUP;
8576 }
8577
8578 wait_result = vm_map_entry_wait(map, interruptible);
8579
8580 if (interruptible &&
8581 wait_result == THREAD_INTERRUPTED) {
8582 /*
8583 * We do not clear the needs_wakeup flag,
8584 * since we cannot tell if we were the only one.
8585 */
8586 ret.kmr_return = KERN_ABORTED;
8587 return ret;
8588 }
8589
8590 /*
8591 * The entry could have been clipped or it
8592 * may not exist anymore. Look it up again.
8593 */
8594 state |= VMDS_NEEDS_LOOKUP;
8595 continue;
8596 }
8597
8598
8599 /*
8600 * Step 5: Handle wiring
8601 */
8602
8603 if (entry->wired_count) {
8604 struct vm_map_entry tmp_entry;
8605 boolean_t user_wire;
8606 unsigned int last_timestamp;
8607
8608 user_wire = entry->user_wired_count > 0;
8609
8610 /*
8611 * Remove a kernel wiring if requested
8612 */
8613 if (flags & VM_MAP_REMOVE_KUNWIRE) {
8614 entry->wired_count--;
8615 vme_btref_consider_and_put(entry);
8616 }
8617
8618 /*
8619 * Remove all user wirings for proper accounting
8620 */
8621 while (entry->user_wired_count) {
8622 subtract_wire_counts(map, entry, user_wire);
8623 }
8624
8625 /*
8626 * All our DMA I/O operations in IOKit are currently
8627 * done by wiring through the map entries of the task
8628 * requesting the I/O.
8629 *
8630 * Because of this, we must always wait for kernel wirings
8631 * to go away on the entries before deleting them.
8632 *
8633 * Any caller who wants to actually remove a kernel wiring
8634 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8635 * properly remove one wiring instead of blasting through
8636 * them all.
8637 */
8638 if (entry->wired_count != 0) {
8639 assert(map != kernel_map);
8640 /*
8641 * Cannot continue. Typical case is when
8642 * a user thread has physical io pending
8643 * on this page. Either wait for the
8644 * kernel wiring to go away or return an
8645 * error.
8646 */
8647 wait_result_t wait_result;
8648
8649 entry->needs_wakeup = TRUE;
8650 wait_result = vm_map_entry_wait(map,
8651 interruptible);
8652
8653 if (interruptible &&
8654 wait_result == THREAD_INTERRUPTED) {
8655 /*
8656 * We do not clear the
8657 * needs_wakeup flag, since we
8658 * cannot tell if we were the
8659 * only one.
8660 */
8661 ret.kmr_return = KERN_ABORTED;
8662 return ret;
8663 }
8664
8665
8666 /*
8667 * The entry could have been clipped or
8668 * it may not exist anymore. Look it
8669 * up again.
8670 */
8671 state |= VMDS_NEEDS_LOOKUP;
8672 continue;
8673 }
8674
8675 /*
8676 * We can unlock the map now.
8677 *
8678 * The entry might be split once we unlock the map,
8679 * but we need the range as defined by this entry
8680 * to be stable. So we must make a local copy.
8681 *
8682 * The underlying objects do not change during clips,
8683 * and the in_transition state guarantees existence
8684 * of the entry.
8685 */
8686 last_timestamp = map->timestamp;
8687 entry->in_transition = TRUE;
8688 tmp_entry = *entry;
8689 vm_map_unlock(map);
8690
8691 if (tmp_entry.is_sub_map) {
8692 vm_map_t sub_map;
8693 vm_map_offset_t sub_start, sub_end;
8694 pmap_t pmap;
8695 vm_map_offset_t pmap_addr;
8696
8697
8698 sub_map = VME_SUBMAP(&tmp_entry);
8699 sub_start = VME_OFFSET(&tmp_entry);
8700 sub_end = sub_start + (tmp_entry.vme_end -
8701 tmp_entry.vme_start);
8702 if (tmp_entry.use_pmap) {
8703 pmap = sub_map->pmap;
8704 pmap_addr = tmp_entry.vme_start;
8705 } else {
8706 pmap = map->pmap;
8707 pmap_addr = tmp_entry.vme_start;
8708 }
8709 (void) vm_map_unwire_nested(sub_map,
8710 sub_start, sub_end,
8711 user_wire,
8712 pmap, pmap_addr);
8713 } else {
8714 vm_map_offset_t entry_end = tmp_entry.vme_end;
8715 vm_map_offset_t max_end;
8716
8717 if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8718 max_end = end - VM_MAP_PAGE_SIZE(map);
8719 if (entry_end > max_end) {
8720 entry_end = max_end;
8721 }
8722 }
8723
8724 if (tmp_entry.vme_kernel_object) {
8725 pmap_protect_options(
8726 map->pmap,
8727 tmp_entry.vme_start,
8728 entry_end,
8729 VM_PROT_NONE,
8730 PMAP_OPTIONS_REMOVE,
8731 NULL);
8732 }
8733 vm_fault_unwire(map, &tmp_entry,
8734 tmp_entry.vme_kernel_object, map->pmap,
8735 tmp_entry.vme_start, entry_end);
8736 }
8737
8738 vm_map_lock(map);
8739
8740 /*
8741 * Unwiring happened, we can now go back to deleting
8742 * them (after we clear the in_transition bit for the range).
8743 */
8744 if (last_timestamp + 1 != map->timestamp) {
8745 state |= VMDS_NEEDS_LOOKUP;
8746 }
8747 clear_in_transition_end = tmp_entry.vme_end;
8748 continue;
8749 }
8750
8751 assert(entry->wired_count == 0);
8752 assert(entry->user_wired_count == 0);
8753
8754
8755 /*
8756 * Step 6: Entry is unwired and ready for us to delete !
8757 */
8758
8759 if (!entry->vme_permanent) {
8760 /*
8761 * Typical case: the entry really shouldn't be permanent
8762 */
8763 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8764 (entry->protection & VM_PROT_EXECUTE) &&
8765 developer_mode_state()) {
8766 /*
8767 * Allow debuggers to undo executable mappings
8768 * when developer mode is on.
8769 */
8770 #if 0
8771 printf("FBDP %d[%s] removing permanent executable entry "
8772 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8773 proc_selfpid(),
8774 (current_task()->bsd_info
8775 ? proc_name_address(current_task()->bsd_info)
8776 : "?"), entry,
8777 (uint64_t)entry->vme_start,
8778 (uint64_t)entry->vme_end,
8779 entry->protection,
8780 entry->max_protection);
8781 #endif
8782 entry->vme_permanent = FALSE;
8783 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8784 #if 0
8785 printf("FBDP %d[%s] removing permanent entry "
8786 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8787 proc_selfpid(),
8788 (current_task()->bsd_info
8789 ? proc_name_address(current_task()->bsd_info)
8790 : "?"), entry,
8791 (uint64_t)entry->vme_start,
8792 (uint64_t)entry->vme_end,
8793 entry->protection,
8794 entry->max_protection);
8795 #endif
8796 entry->vme_permanent = FALSE;
8797 #if CODE_SIGNING_MONITOR
8798 } else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8799 entry->vme_permanent = FALSE;
8800
8801 printf("%d[%s] %s(0x%llx,0x%llx): "
8802 "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8803 "prot 0x%x/0x%x\n",
8804 proc_selfpid(),
8805 (get_bsdtask_info(current_task())
8806 ? proc_name_address(get_bsdtask_info(current_task()))
8807 : "?"),
8808 __FUNCTION__,
8809 (uint64_t)start,
8810 (uint64_t)end,
8811 (uint64_t)entry->vme_start,
8812 (uint64_t)entry->vme_end,
8813 entry->protection,
8814 entry->max_protection);
8815 #endif
8816 } else {
8817 DTRACE_VM6(vm_map_delete_permanent,
8818 vm_map_entry_t, entry,
8819 vm_map_offset_t, entry->vme_start,
8820 vm_map_offset_t, entry->vme_end,
8821 vm_prot_t, entry->protection,
8822 vm_prot_t, entry->max_protection,
8823 int, VME_ALIAS(entry));
8824 }
8825
8826 if (entry->is_sub_map) {
8827 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8828 "map %p (%d) entry %p submap %p (%d)\n",
8829 map, VM_MAP_PAGE_SHIFT(map), entry,
8830 VME_SUBMAP(entry),
8831 VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8832 if (entry->use_pmap) {
8833 #ifndef NO_NESTED_PMAP
8834 int pmap_flags;
8835
8836 if (map->terminated) {
8837 /*
8838 * This is the final cleanup of the
8839 * address space being terminated.
8840 * No new mappings are expected and
8841 * we don't really need to unnest the
8842 * shared region (and lose the "global"
8843 * pmap mappings, if applicable).
8844 *
8845 * Tell the pmap layer that we're
8846 * "clean" wrt nesting.
8847 */
8848 pmap_flags = PMAP_UNNEST_CLEAN;
8849 } else {
8850 /*
8851 * We're unmapping part of the nested
8852 * shared region, so we can't keep the
8853 * nested pmap.
8854 */
8855 pmap_flags = 0;
8856 }
8857 pmap_unnest_options(
8858 map->pmap,
8859 (addr64_t)entry->vme_start,
8860 entry->vme_end - entry->vme_start,
8861 pmap_flags);
8862 #endif /* NO_NESTED_PMAP */
8863 if (map->mapped_in_other_pmaps &&
8864 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8865 /* clean up parent map/maps */
8866 vm_map_submap_pmap_clean(
8867 map, entry->vme_start,
8868 entry->vme_end,
8869 VME_SUBMAP(entry),
8870 VME_OFFSET(entry));
8871 }
8872 } else {
8873 vm_map_submap_pmap_clean(
8874 map, entry->vme_start, entry->vme_end,
8875 VME_SUBMAP(entry),
8876 VME_OFFSET(entry));
8877 }
8878 } else if (entry->vme_kernel_object ||
8879 VME_OBJECT(entry) == compressor_object) {
8880 /*
8881 * nothing to do
8882 */
8883 } else if (map->mapped_in_other_pmaps &&
8884 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8885 vm_object_pmap_protect_options(
8886 VME_OBJECT(entry), VME_OFFSET(entry),
8887 entry->vme_end - entry->vme_start,
8888 PMAP_NULL,
8889 PAGE_SIZE,
8890 entry->vme_start,
8891 VM_PROT_NONE,
8892 PMAP_OPTIONS_REMOVE);
8893 } else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8894 (state & VMDS_KERNEL_PMAP)) {
8895 /* Remove translations associated
8896 * with this range unless the entry
8897 * does not have an object, or
8898 * it's the kernel map or a descendant
8899 * since the platform could potentially
8900 * create "backdoor" mappings invisible
8901 * to the VM. It is expected that
8902 * objectless, non-kernel ranges
8903 * do not have such VM invisible
8904 * translations.
8905 */
8906 pmap_remove_options(map->pmap,
8907 (addr64_t)entry->vme_start,
8908 (addr64_t)entry->vme_end,
8909 PMAP_OPTIONS_REMOVE);
8910 }
8911
8912 #if DEBUG
8913 /*
8914 * All pmap mappings for this map entry must have been
8915 * cleared by now.
8916 */
8917 assert(pmap_is_empty(map->pmap,
8918 entry->vme_start,
8919 entry->vme_end));
8920 #endif /* DEBUG */
8921
8922 if (entry->iokit_acct) {
8923 /* alternate accounting */
8924 DTRACE_VM4(vm_map_iokit_unmapped_region,
8925 vm_map_t, map,
8926 vm_map_offset_t, entry->vme_start,
8927 vm_map_offset_t, entry->vme_end,
8928 int, VME_ALIAS(entry));
8929 vm_map_iokit_unmapped_region(map,
8930 (entry->vme_end -
8931 entry->vme_start));
8932 entry->iokit_acct = FALSE;
8933 entry->use_pmap = FALSE;
8934 }
8935
8936 /* move "s" forward */
8937 s = entry->vme_end;
8938 next = entry->vme_next;
8939 if (!entry->map_aligned) {
8940 vm_map_offset_t rounded_s;
8941
8942 /*
8943 * Skip artificial gap due to mis-aligned entry
8944 * on devices with a page size smaller than the
8945 * map's page size (i.e. 16k task on a 4k device).
8946 */
8947 rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8948 if (next == vm_map_to_entry(map)) {
8949 s = rounded_s;
8950 } else if (s < rounded_s) {
8951 s = MIN(rounded_s, next->vme_start);
8952 }
8953 }
8954 ret.kmr_size += s - entry->vme_start;
8955
8956 if (entry->vme_permanent) {
8957 /*
8958 * A permanent entry can not be removed, so leave it
8959 * in place but remove all access permissions.
8960 */
8961 if (!entry->csm_associated) {
8962 printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8963 __FUNCTION__, __LINE__,
8964 proc_selfpid(),
8965 (get_bsdtask_info(current_task())
8966 ? proc_name_address(get_bsdtask_info(current_task()))
8967 : "?"),
8968 map,
8969 entry,
8970 (uint64_t)entry->vme_start,
8971 (uint64_t)entry->vme_end,
8972 entry->is_sub_map,
8973 entry->protection,
8974 entry->max_protection);
8975 }
8976 DTRACE_VM6(vm_map_delete_permanent_prot_none,
8977 vm_map_entry_t, entry,
8978 vm_map_offset_t, entry->vme_start,
8979 vm_map_offset_t, entry->vme_end,
8980 vm_prot_t, entry->protection,
8981 vm_prot_t, entry->max_protection,
8982 int, VME_ALIAS(entry));
8983 entry->protection = VM_PROT_NONE;
8984 entry->max_protection = VM_PROT_NONE;
8985 } else {
8986 vm_map_entry_zap(map, entry, zap_list);
8987 }
8988
8989 entry = next;
8990 next = VM_MAP_ENTRY_NULL;
8991
8992 if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8993 unsigned int last_timestamp = map->timestamp++;
8994
8995 if (lck_rw_lock_yield_exclusive(&map->lock,
8996 LCK_RW_YIELD_ANY_WAITER)) {
8997 if (last_timestamp != map->timestamp + 1) {
8998 state |= VMDS_NEEDS_LOOKUP;
8999 }
9000 } else {
9001 /* we didn't yield, undo our change */
9002 map->timestamp--;
9003 }
9004 }
9005 }
9006
9007 if (map->wait_for_space) {
9008 thread_wakeup((event_t) map);
9009 }
9010
9011 if (state & VMDS_NEEDS_WAKEUP) {
9012 vm_map_entry_wakeup(map);
9013 }
9014
9015 out:
9016 if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
9017 __vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
9018 }
9019
9020 if (state & VMDS_KERNEL_KMEMPTR) {
9021 kmem_free_space(start, end, range_id, &slot);
9022 }
9023
9024 if (state & VMDS_FOUND_GAP) {
9025 DTRACE_VM3(kern_vm_deallocate_gap,
9026 vm_map_offset_t, gap_start,
9027 vm_map_offset_t, save_start,
9028 vm_map_offset_t, save_end);
9029 if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
9030 ret.kmr_return = KERN_INVALID_VALUE;
9031 } else {
9032 vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
9033 }
9034 }
9035
9036 return ret;
9037 }
9038
9039 kmem_return_t
9040 vm_map_remove_and_unlock(
9041 vm_map_t map,
9042 vm_map_offset_t start,
9043 vm_map_offset_t end,
9044 vmr_flags_t flags,
9045 kmem_guard_t guard)
9046 {
9047 kmem_return_t ret;
9048 VM_MAP_ZAP_DECLARE(zap);
9049
9050 ret = vm_map_delete(map, start, end, flags, guard, &zap);
9051 vm_map_unlock(map);
9052
9053 vm_map_zap_dispose(&zap);
9054
9055 return ret;
9056 }
9057
9058 /*
9059 * vm_map_remove_guard:
9060 *
9061 * Remove the given address range from the target map.
9062 * This is the exported form of vm_map_delete.
9063 */
9064 kmem_return_t
9065 vm_map_remove_guard(
9066 vm_map_t map,
9067 vm_map_offset_t start,
9068 vm_map_offset_t end,
9069 vmr_flags_t flags,
9070 kmem_guard_t guard)
9071 {
9072 vm_map_lock(map);
9073 return vm_map_remove_and_unlock(map, start, end, flags, guard);
9074 }
9075
9076 /*
9077 * vm_map_terminate:
9078 *
9079 * Clean out a task's map.
9080 */
9081 kern_return_t
9082 vm_map_terminate(
9083 vm_map_t map)
9084 {
9085 vm_map_lock(map);
9086 map->terminated = TRUE;
9087 vm_map_disable_hole_optimization(map);
9088 (void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
9089 VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
9090 return KERN_SUCCESS;
9091 }
9092
9093 /*
9094 * Routine: vm_map_copy_allocate
9095 *
9096 * Description:
9097 * Allocates and initializes a map copy object.
9098 */
9099 static vm_map_copy_t
9100 vm_map_copy_allocate(uint16_t type)
9101 {
9102 vm_map_copy_t new_copy;
9103
9104 new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
9105 new_copy->type = type;
9106 if (type == VM_MAP_COPY_ENTRY_LIST) {
9107 new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
9108 vm_map_store_init(&new_copy->cpy_hdr);
9109 }
9110 return new_copy;
9111 }
9112
9113 /*
9114 * Routine: vm_map_copy_discard
9115 *
9116 * Description:
9117 * Dispose of a map copy object (returned by
9118 * vm_map_copyin).
9119 */
9120 void
9121 vm_map_copy_discard(
9122 vm_map_copy_t copy)
9123 {
9124 if (copy == VM_MAP_COPY_NULL) {
9125 return;
9126 }
9127
9128 /*
9129 * Assert that the vm_map_copy is coming from the right
9130 * zone and hasn't been forged
9131 */
9132 vm_map_copy_require(copy);
9133
9134 switch (copy->type) {
9135 case VM_MAP_COPY_ENTRY_LIST:
9136 while (vm_map_copy_first_entry(copy) !=
9137 vm_map_copy_to_entry(copy)) {
9138 vm_map_entry_t entry = vm_map_copy_first_entry(copy);
9139
9140 vm_map_copy_entry_unlink(copy, entry);
9141 if (entry->is_sub_map) {
9142 vm_map_deallocate(VME_SUBMAP(entry));
9143 } else {
9144 vm_object_deallocate(VME_OBJECT(entry));
9145 }
9146 vm_map_copy_entry_dispose(entry);
9147 }
9148 break;
9149 case VM_MAP_COPY_KERNEL_BUFFER:
9150
9151 /*
9152 * The vm_map_copy_t and possibly the data buffer were
9153 * allocated by a single call to kalloc_data(), i.e. the
9154 * vm_map_copy_t was not allocated out of the zone.
9155 */
9156 if (copy->size > msg_ool_size_small || copy->offset) {
9157 panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
9158 (long long)copy->size, (long long)copy->offset);
9159 }
9160 kfree_data(copy->cpy_kdata, copy->size);
9161 }
9162 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
9163 }
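/*
 * Editorial sketch of the copy-object lifecycle: allocate, populate,
 * then either hand the object off (routines such as
 * vm_map_copy_overwrite consume it on success) or discard it on
 * failure. "something_failed" is a placeholder for this example.
 */
#if 0 /* lifecycle sketch only */
	vm_map_copy_t copy;

	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
	/* ... link entries into copy->cpy_hdr ... */
	if (something_failed) {
		vm_map_copy_discard(copy);      /* frees the entries and copy */
	}
#endif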
9164
9165 #if XNU_PLATFORM_MacOSX
9166
9167 /*
9168 * Routine: vm_map_copy_copy
9169 *
9170 * Description:
9171 * Move the information in a map copy object to
9172 * a new map copy object, leaving the old one
9173 * empty.
9174 *
9175 * This is used by kernel routines that need
9176 * to look at out-of-line data (in copyin form)
9177 * before deciding whether to return SUCCESS.
9178 * If the routine returns FAILURE, the original
9179 * copy object will be deallocated; therefore,
9180 * these routines must make a copy of the copy
9181 * object and leave the original empty so that
9182 * deallocation will not fail.
9183 */
9184 vm_map_copy_t
9185 vm_map_copy_copy(
9186 vm_map_copy_t copy)
9187 {
9188 vm_map_copy_t new_copy;
9189
9190 if (copy == VM_MAP_COPY_NULL) {
9191 return VM_MAP_COPY_NULL;
9192 }
9193
9194 /*
9195 * Assert that the vm_map_copy is coming from the right
9196 * zone and hasn't been forged
9197 */
9198 vm_map_copy_require(copy);
9199
9200 /*
9201 * Allocate a new copy object, and copy the information
9202 * from the old one into it.
9203 */
9204
9205 new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9206 memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
9207 #if __has_feature(ptrauth_calls)
9208 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9209 new_copy->cpy_kdata = copy->cpy_kdata;
9210 }
9211 #endif
9212
9213 if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
9214 /*
9215 * The links in the entry chain must be
9216 * changed to point to the new copy object.
9217 */
9218 vm_map_copy_first_entry(copy)->vme_prev
9219 = vm_map_copy_to_entry(new_copy);
9220 vm_map_copy_last_entry(copy)->vme_next
9221 = vm_map_copy_to_entry(new_copy);
9222 }
9223
9224 /*
9225 * Change the old copy object into one that contains
9226 * nothing to be deallocated.
9227 */
9228 bzero(copy, sizeof(struct vm_map_copy));
9229 copy->type = VM_MAP_COPY_KERNEL_BUFFER;
9230
9231 /*
9232 * Return the new object.
9233 */
9234 return new_copy;
9235 }
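/*
 * Editorial sketch of the pattern described in the header comment
 * above: clone the copy object before inspecting the data so that the
 * caller's eventual deallocation of the (now empty) original cannot
 * free data we still need. "examine()" is a placeholder.
 */
#if 0 /* usage sketch only */
	vm_map_copy_t clone = vm_map_copy_copy(copy);   /* "copy" is now empty */

	if (examine(clone) != KERN_SUCCESS) {
		vm_map_copy_discard(clone);     /* drop our clone */
		return KERN_FAILURE;    /* discarding "copy" is now harmless */
	}
	/* success: keep using "clone" */
#endif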
9236
9237 #endif /* XNU_PLATFORM_MacOSX */
9238
9239 static boolean_t
9240 vm_map_entry_is_overwritable(
9241 vm_map_t dst_map __unused,
9242 vm_map_entry_t entry)
9243 {
9244 if (!(entry->protection & VM_PROT_WRITE)) {
9245 /* can't overwrite if not writable */
9246 return FALSE;
9247 }
9248 #if !__x86_64__
9249 if (entry->used_for_jit &&
9250 vm_map_cs_enforcement(dst_map) &&
9251 !dst_map->cs_debugged) {
9252 /*
9253 * Can't overwrite a JIT region while cs_enforced
9254 * and not cs_debugged.
9255 */
9256 return FALSE;
9257 }
9258
9259 #if __arm64e__
9260 /* Do not allow overwrite HW assisted TPRO entries */
9261 if (entry->used_for_tpro) {
9262 return FALSE;
9263 }
9264 #endif /* __arm64e__ */
9265
9266 if (entry->vme_permanent) {
9267 if (entry->is_sub_map) {
9268 /*
9269 * We can't tell if the submap contains "permanent"
9270 * entries within the range targeted by the caller.
9271 * The caller will have to check for that with
9272 * vm_map_overwrite_submap_recurse() for example.
9273 */
9274 } else {
9275 /*
9276 * Do not allow overwriting of a "permanent"
9277 * entry.
9278 */
9279 DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
9280 vm_map_entry_t, entry,
9281 vm_map_offset_t, entry->vme_start,
9282 vm_map_offset_t, entry->vme_end,
9283 vm_prot_t, entry->protection,
9284 vm_prot_t, entry->max_protection,
9285 int, VME_ALIAS(entry));
9286 return FALSE;
9287 }
9288 }
9289 #endif /* !__x86_64__ */
9290 return TRUE;
9291 }
9292
9293 static kern_return_t
9294 vm_map_overwrite_submap_recurse(
9295 vm_map_t dst_map,
9296 vm_map_offset_t dst_addr,
9297 vm_map_size_t dst_size)
9298 {
9299 vm_map_offset_t dst_end;
9300 vm_map_entry_t tmp_entry;
9301 vm_map_entry_t entry;
9302 kern_return_t result;
9303 boolean_t encountered_sub_map = FALSE;
9304
9305
9306
9307 /*
9308 * Verify that the destination is all writeable
9309 * initially. We have to trunc the destination
9310 * address and round the copy size or we'll end up
9311 * splitting entries in strange ways.
9312 */
9313
9314 dst_end = vm_map_round_page(dst_addr + dst_size,
9315 VM_MAP_PAGE_MASK(dst_map));
9316 vm_map_lock(dst_map);
9317
9318 start_pass_1:
9319 if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9320 vm_map_unlock(dst_map);
9321 return KERN_INVALID_ADDRESS;
9322 }
9323
9324 vm_map_clip_start(dst_map,
9325 tmp_entry,
9326 vm_map_trunc_page(dst_addr,
9327 VM_MAP_PAGE_MASK(dst_map)));
9328 if (tmp_entry->is_sub_map) {
9329 /* clipping did unnest if needed */
9330 assert(!tmp_entry->use_pmap);
9331 }
9332
9333 for (entry = tmp_entry;;) {
9334 vm_map_entry_t next;
9335
9336 next = entry->vme_next;
9337 while (entry->is_sub_map) {
9338 vm_map_offset_t sub_start;
9339 vm_map_offset_t sub_end;
9340 vm_map_offset_t local_end;
9341
9342 if (entry->in_transition) {
9343 /*
9344 * Say that we are waiting, and wait for entry.
9345 */
9346 entry->needs_wakeup = TRUE;
9347 vm_map_entry_wait(dst_map, THREAD_UNINT);
9348
9349 goto start_pass_1;
9350 }
9351
9352 encountered_sub_map = TRUE;
9353 sub_start = VME_OFFSET(entry);
9354
9355 if (entry->vme_end < dst_end) {
9356 sub_end = entry->vme_end;
9357 } else {
9358 sub_end = dst_end;
9359 }
9360 sub_end -= entry->vme_start;
9361 sub_end += VME_OFFSET(entry);
9362 local_end = entry->vme_end;
9363 vm_map_unlock(dst_map);
9364
9365 result = vm_map_overwrite_submap_recurse(
9366 VME_SUBMAP(entry),
9367 sub_start,
9368 sub_end - sub_start);
9369
9370 if (result != KERN_SUCCESS) {
9371 return result;
9372 }
9373 if (dst_end <= entry->vme_end) {
9374 return KERN_SUCCESS;
9375 }
9376 vm_map_lock(dst_map);
9377 if (!vm_map_lookup_entry(dst_map, local_end,
9378 &tmp_entry)) {
9379 vm_map_unlock(dst_map);
9380 return KERN_INVALID_ADDRESS;
9381 }
9382 entry = tmp_entry;
9383 next = entry->vme_next;
9384 }
9385
9386 if (!(entry->protection & VM_PROT_WRITE)) {
9387 vm_map_unlock(dst_map);
9388 return KERN_PROTECTION_FAILURE;
9389 }
9390
9391 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9392 vm_map_unlock(dst_map);
9393 return KERN_PROTECTION_FAILURE;
9394 }
9395
9396 /*
9397 * If the entry is in transition, we must wait
9398 * for it to exit that state. Anything could happen
9399 * when we unlock the map, so start over.
9400 */
9401 if (entry->in_transition) {
9402 /*
9403 * Say that we are waiting, and wait for entry.
9404 */
9405 entry->needs_wakeup = TRUE;
9406 vm_map_entry_wait(dst_map, THREAD_UNINT);
9407
9408 goto start_pass_1;
9409 }
9410
9411 /*
9412 * our range is contained completely within this map entry
9413 */
9414 if (dst_end <= entry->vme_end) {
9415 vm_map_unlock(dst_map);
9416 return KERN_SUCCESS;
9417 }
9418 /*
9419 * check that range specified is contiguous region
9420 */
9421 if ((next == vm_map_to_entry(dst_map)) ||
9422 (next->vme_start != entry->vme_end)) {
9423 vm_map_unlock(dst_map);
9424 return KERN_INVALID_ADDRESS;
9425 }
9426
9427 /*
9428 * Check for permanent objects in the destination.
9429 */
9430 if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9431 ((!VME_OBJECT(entry)->internal) ||
9432 (VME_OBJECT(entry)->true_share))) {
9433 if (encountered_sub_map) {
9434 vm_map_unlock(dst_map);
9435 return KERN_FAILURE;
9436 }
9437 }
9438
9439
9440 entry = next;
9441 }/* for */
9442 vm_map_unlock(dst_map);
9443 return KERN_SUCCESS;
9444 }
9445
9446 /*
9447 * Routine: vm_map_copy_overwrite
9448 *
9449 * Description:
9450 * Copy the memory described by the map copy
9451 * object (copy; returned by vm_map_copyin) onto
9452 * the specified destination region (dst_map, dst_addr).
9453 * The destination must be writeable.
9454 *
9455 * Unlike vm_map_copyout, this routine actually
9456 * writes over previously-mapped memory. If the
9457 * previous mapping was to a permanent (user-supplied)
9458 * memory object, it is preserved.
9459 *
9460 * The attributes (protection and inheritance) of the
9461 * destination region are preserved.
9462 *
9463 * If successful, consumes the copy object.
9464 * Otherwise, the caller is responsible for it.
9465 *
9466 * Implementation notes:
9467 * To overwrite aligned temporary virtual memory, it is
9468 * sufficient to remove the previous mapping and insert
9469 * the new copy. This replacement is done either on
9470 * the whole region (if no permanent virtual memory
9471 * objects are embedded in the destination region) or
9472 * in individual map entries.
9473 *
9474 * To overwrite permanent virtual memory, it is necessary
9475 * to copy each page, as the external memory management
9476 * interface currently does not provide any optimizations.
9477 *
9478 * Unaligned memory also has to be copied. It is possible
9479 * to use 'vm_trickery' to copy the aligned data. This is
9480 * not done but not hard to implement.
9481 *
9482 * Once a page of permanent memory has been overwritten,
9483 * it is impossible to interrupt this function; otherwise,
9484 * the call would be neither atomic nor location-independent.
9485 * The kernel-state portion of a user thread must be
9486 * interruptible.
9487 *
9488 * It may be expensive to forward all requests that might
9489 * overwrite permanent memory (vm_write, vm_copy) to
9490 * uninterruptible kernel threads. This routine may be
9491 * called by interruptible threads; however, success is
9492 * not guaranteed -- if the request cannot be performed
9493 * atomically and interruptibly, an error indication is
9494 * returned.
9495 *
9496 * Callers of this function must call vm_map_copy_require on
9497 * previously created vm_map_copy_t or pass a newly created
9498 * one to ensure that it hasn't been forged.
9499 */
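/*
 * A minimal usage sketch (illustrative only, not part of this file's
 * code): a typical in-kernel caller pairs vm_map_copyin() with
 * vm_map_copy_overwrite() and must discard the copy object itself on
 * failure, since the copy is only consumed on success.  The function
 * "example_overwrite" and its parameter names are hypothetical.
 */
#if 0
static kern_return_t
example_overwrite(
	vm_map_t         src_map,
	vm_map_address_t src_addr,
	vm_map_t         dst_map,
	vm_map_address_t dst_addr,
	vm_map_size_t    len)
{
	vm_map_copy_t copy;
	kern_return_t kr;

	kr = vm_map_copyin(src_map, src_addr, len,
	    FALSE,      /* don't destroy the source */
	    &copy);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	kr = vm_map_copy_overwrite(dst_map, dst_addr, copy,
	    len,        /* expected size, checked against copy->size */
	    TRUE);      /* interruptible */
	if (kr != KERN_SUCCESS) {
		/* on failure the copy object is NOT consumed */
		vm_map_copy_discard(copy);
	}
	return kr;
}
#endif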
9500 static kern_return_t
9501 vm_map_copy_overwrite_nested(
9502 vm_map_t dst_map,
9503 vm_map_address_t dst_addr,
9504 vm_map_copy_t copy,
9505 boolean_t interruptible,
9506 pmap_t pmap,
9507 boolean_t discard_on_success)
9508 {
9509 vm_map_offset_t dst_end;
9510 vm_map_entry_t tmp_entry;
9511 vm_map_entry_t entry;
9512 kern_return_t kr;
9513 boolean_t aligned = TRUE;
9514 boolean_t contains_permanent_objects = FALSE;
9515 boolean_t encountered_sub_map = FALSE;
9516 vm_map_offset_t base_addr;
9517 vm_map_size_t copy_size;
9518 vm_map_size_t total_size;
9519 uint16_t copy_page_shift;
9520
9521 /*
9522 * Check for special kernel buffer allocated
9523 * by new_ipc_kmsg_copyin.
9524 */
9525
9526 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9527 kr = vm_map_copyout_kernel_buffer(
9528 dst_map, &dst_addr,
9529 copy, copy->size, TRUE, discard_on_success);
9530 return kr;
9531 }
9532
9533 /*
9534 * Only works for entry lists at the moment. Will
9535 * support page lists later.
9536 */
9537
9538 assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9539
9540 if (copy->size == 0) {
9541 if (discard_on_success) {
9542 vm_map_copy_discard(copy);
9543 }
9544 return KERN_SUCCESS;
9545 }
9546
9547 copy_page_shift = copy->cpy_hdr.page_shift;
9548
9549 /*
9550 * Verify that the destination is all writeable
9551 * initially. We have to trunc the destination
9552 * address and round the copy size or we'll end up
9553 * splitting entries in strange ways.
9554 */
9555
9556 if (!VM_MAP_PAGE_ALIGNED(copy->size,
9557 VM_MAP_PAGE_MASK(dst_map)) ||
9558 !VM_MAP_PAGE_ALIGNED(copy->offset,
9559 VM_MAP_PAGE_MASK(dst_map)) ||
9560 !VM_MAP_PAGE_ALIGNED(dst_addr,
9561 VM_MAP_PAGE_MASK(dst_map)) ||
9562 copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9563 aligned = FALSE;
9564 dst_end = vm_map_round_page(dst_addr + copy->size,
9565 VM_MAP_PAGE_MASK(dst_map));
9566 } else {
9567 dst_end = dst_addr + copy->size;
9568 }
9569
9570 vm_map_lock(dst_map);
9571
9572 /* LP64todo - remove this check when vm_map_commpage64()
9573 * no longer has to stuff in a map_entry for the commpage
9574 * above the map's max_offset.
9575 */
9576 if (dst_addr >= dst_map->max_offset) {
9577 vm_map_unlock(dst_map);
9578 return KERN_INVALID_ADDRESS;
9579 }
9580
9581 start_pass_1:
9582 if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9583 vm_map_unlock(dst_map);
9584 return KERN_INVALID_ADDRESS;
9585 }
9586 vm_map_clip_start(dst_map,
9587 tmp_entry,
9588 vm_map_trunc_page(dst_addr,
9589 VM_MAP_PAGE_MASK(dst_map)));
9590 for (entry = tmp_entry;;) {
9591 vm_map_entry_t next = entry->vme_next;
9592
9593 while (entry->is_sub_map) {
9594 vm_map_offset_t sub_start;
9595 vm_map_offset_t sub_end;
9596 vm_map_offset_t local_end;
9597
9598 if (entry->in_transition) {
9599 /*
9600 * Say that we are waiting, and wait for entry.
9601 */
9602 entry->needs_wakeup = TRUE;
9603 vm_map_entry_wait(dst_map, THREAD_UNINT);
9604
9605 goto start_pass_1;
9606 }
9607
9608 local_end = entry->vme_end;
9609 if (!(entry->needs_copy)) {
9610 /* if needs_copy is set, we are a COW */
9611 /* submap; in such a case we just replace, */
9612 /* so there is no need for the */
9613 /* following check. */
9614 encountered_sub_map = TRUE;
9615 sub_start = VME_OFFSET(entry);
9616
9617 if (entry->vme_end < dst_end) {
9618 sub_end = entry->vme_end;
9619 } else {
9620 sub_end = dst_end;
9621 }
9622 sub_end -= entry->vme_start;
9623 sub_end += VME_OFFSET(entry);
9624 vm_map_unlock(dst_map);
9625
9626 kr = vm_map_overwrite_submap_recurse(
9627 VME_SUBMAP(entry),
9628 sub_start,
9629 sub_end - sub_start);
9630 if (kr != KERN_SUCCESS) {
9631 return kr;
9632 }
9633 vm_map_lock(dst_map);
9634 }
9635
9636 if (dst_end <= entry->vme_end) {
9637 goto start_overwrite;
9638 }
9639 if (!vm_map_lookup_entry(dst_map, local_end,
9640 &entry)) {
9641 vm_map_unlock(dst_map);
9642 return KERN_INVALID_ADDRESS;
9643 }
9644 next = entry->vme_next;
9645 }
9646
9647 if (!(entry->protection & VM_PROT_WRITE)) {
9648 vm_map_unlock(dst_map);
9649 return KERN_PROTECTION_FAILURE;
9650 }
9651
9652 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9653 vm_map_unlock(dst_map);
9654 return KERN_PROTECTION_FAILURE;
9655 }
9656
9657 /*
9658 * If the entry is in transition, we must wait
9659 * for it to exit that state. Anything could happen
9660 * when we unlock the map, so start over.
9661 */
9662 if (entry->in_transition) {
9663 /*
9664 * Say that we are waiting, and wait for entry.
9665 */
9666 entry->needs_wakeup = TRUE;
9667 vm_map_entry_wait(dst_map, THREAD_UNINT);
9668
9669 goto start_pass_1;
9670 }
9671
9672 /*
9673 * our range is contained completely within this map entry
9674 */
9675 if (dst_end <= entry->vme_end) {
9676 break;
9677 }
9678 /*
9679 * check that range specified is contiguous region
9680 */
9681 if ((next == vm_map_to_entry(dst_map)) ||
9682 (next->vme_start != entry->vme_end)) {
9683 vm_map_unlock(dst_map);
9684 return KERN_INVALID_ADDRESS;
9685 }
9686
9687
9688 /*
9689 * Check for permanent objects in the destination.
9690 */
9691 if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9692 ((!VME_OBJECT(entry)->internal) ||
9693 (VME_OBJECT(entry)->true_share))) {
9694 contains_permanent_objects = TRUE;
9695 }
9696
9697 entry = next;
9698 }/* for */
9699
9700 start_overwrite:
9701 /*
9702 * If there are permanent objects in the destination, then
9703 * the copy cannot be interrupted.
9704 */
9705
9706 if (interruptible && contains_permanent_objects) {
9707 vm_map_unlock(dst_map);
9708 return KERN_FAILURE; /* XXX */
9709 }
9710
9711 /*
9712 *
9713 * Make a second pass, overwriting the data.
9714 * At the beginning of each loop iteration,
9715 * the next entry to be overwritten is "tmp_entry"
9716 * (initially, the value returned from the lookup above),
9717 * and the starting address expected in that entry
9718 * is "start".
9719 */
9720
9721 total_size = copy->size;
9722 if (encountered_sub_map) {
9723 copy_size = 0;
9724 /* re-calculate tmp_entry since we've had the map */
9725 /* unlocked */
9726 if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9727 vm_map_unlock(dst_map);
9728 return KERN_INVALID_ADDRESS;
9729 }
9730 } else {
9731 copy_size = copy->size;
9732 }
9733
9734 base_addr = dst_addr;
9735 while (TRUE) {
9736 /* deconstruct the copy object and do in parts */
9737 /* only in sub_map, interruptible case */
9738 vm_map_entry_t copy_entry;
9739 vm_map_entry_t previous_prev = VM_MAP_ENTRY_NULL;
9740 vm_map_entry_t next_copy = VM_MAP_ENTRY_NULL;
9741 int nentries;
9742 int remaining_entries = 0;
9743 vm_map_offset_t new_offset = 0;
9744
9745 for (entry = tmp_entry; copy_size == 0;) {
9746 vm_map_entry_t next;
9747
9748 next = entry->vme_next;
9749
9750 /* tmp_entry and base address are moved along */
9751 /* each time we encounter a sub-map. Otherwise */
9752 /* entry can outpace tmp_entry, and the copy_size */
9753 /* may reflect the distance between them. */
9754 /* If the current entry is found to be in transition, */
9755 /* we will start over at the beginning or at the last */
9756 /* encountered sub-map, as dictated by base_addr, */
9757 /* and we will zero copy_size accordingly. */
9758 if (entry->in_transition) {
9759 /*
9760 * Say that we are waiting, and wait for entry.
9761 */
9762 entry->needs_wakeup = TRUE;
9763 vm_map_entry_wait(dst_map, THREAD_UNINT);
9764
9765 if (!vm_map_lookup_entry(dst_map, base_addr,
9766 &tmp_entry)) {
9767 vm_map_unlock(dst_map);
9768 return KERN_INVALID_ADDRESS;
9769 }
9770 copy_size = 0;
9771 entry = tmp_entry;
9772 continue;
9773 }
9774 if (entry->is_sub_map) {
9775 vm_map_offset_t sub_start;
9776 vm_map_offset_t sub_end;
9777 vm_map_offset_t local_end;
9778
9779 if (entry->needs_copy) {
9780 /* if this is a COW submap */
9781 /* just back the range with an */
9782 /* anonymous entry */
9783 assert(!entry->vme_permanent);
9784 if (entry->vme_end < dst_end) {
9785 sub_end = entry->vme_end;
9786 } else {
9787 sub_end = dst_end;
9788 }
9789 if (entry->vme_start < base_addr) {
9790 sub_start = base_addr;
9791 } else {
9792 sub_start = entry->vme_start;
9793 }
9794 vm_map_clip_end(
9795 dst_map, entry, sub_end);
9796 vm_map_clip_start(
9797 dst_map, entry, sub_start);
9798 assert(!entry->use_pmap);
9799 assert(!entry->iokit_acct);
9800 entry->use_pmap = TRUE;
9801 vm_map_deallocate(VME_SUBMAP(entry));
9802 assert(!entry->vme_permanent);
9803 VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9804 VME_OFFSET_SET(entry, 0);
9805 entry->is_shared = FALSE;
9806 entry->needs_copy = FALSE;
9807 entry->protection = VM_PROT_DEFAULT;
9808 entry->max_protection = VM_PROT_ALL;
9809 entry->wired_count = 0;
9810 entry->user_wired_count = 0;
9811 if (entry->inheritance
9812 == VM_INHERIT_SHARE) {
9813 entry->inheritance = VM_INHERIT_COPY;
9814 }
9815 continue;
9816 }
9817 /* first take care of any non-sub_map */
9818 /* entries to send */
9819 if (base_addr < entry->vme_start) {
9820 /* stuff to send */
9821 copy_size =
9822 entry->vme_start - base_addr;
9823 break;
9824 }
9825 sub_start = VME_OFFSET(entry);
9826
9827 if (entry->vme_end < dst_end) {
9828 sub_end = entry->vme_end;
9829 } else {
9830 sub_end = dst_end;
9831 }
9832 sub_end -= entry->vme_start;
9833 sub_end += VME_OFFSET(entry);
9834 local_end = entry->vme_end;
9835 vm_map_unlock(dst_map);
9836 copy_size = sub_end - sub_start;
9837
9838 /* adjust the copy object */
9839 if (total_size > copy_size) {
9840 vm_map_size_t local_size = 0;
9841 vm_map_size_t entry_size;
9842
9843 nentries = 1;
9844 new_offset = copy->offset;
9845 copy_entry = vm_map_copy_first_entry(copy);
9846 while (copy_entry !=
9847 vm_map_copy_to_entry(copy)) {
9848 entry_size = copy_entry->vme_end -
9849 copy_entry->vme_start;
9850 if ((local_size < copy_size) &&
9851 ((local_size + entry_size)
9852 >= copy_size)) {
9853 vm_map_copy_clip_end(copy,
9854 copy_entry,
9855 copy_entry->vme_start +
9856 (copy_size - local_size));
9857 entry_size = copy_entry->vme_end -
9858 copy_entry->vme_start;
9859 local_size += entry_size;
9860 new_offset += entry_size;
9861 }
9862 if (local_size >= copy_size) {
9863 next_copy = copy_entry->vme_next;
9864 copy_entry->vme_next =
9865 vm_map_copy_to_entry(copy);
9866 previous_prev =
9867 copy->cpy_hdr.links.prev;
9868 copy->cpy_hdr.links.prev = copy_entry;
9869 copy->size = copy_size;
9870 remaining_entries =
9871 copy->cpy_hdr.nentries;
9872 remaining_entries -= nentries;
9873 copy->cpy_hdr.nentries = nentries;
9874 break;
9875 } else {
9876 local_size += entry_size;
9877 new_offset += entry_size;
9878 nentries++;
9879 }
9880 copy_entry = copy_entry->vme_next;
9881 }
9882 }
9883
9884 if ((entry->use_pmap) && (pmap == NULL)) {
9885 kr = vm_map_copy_overwrite_nested(
9886 VME_SUBMAP(entry),
9887 sub_start,
9888 copy,
9889 interruptible,
9890 VME_SUBMAP(entry)->pmap,
9891 TRUE);
9892 } else if (pmap != NULL) {
9893 kr = vm_map_copy_overwrite_nested(
9894 VME_SUBMAP(entry),
9895 sub_start,
9896 copy,
9897 interruptible, pmap,
9898 TRUE);
9899 } else {
9900 kr = vm_map_copy_overwrite_nested(
9901 VME_SUBMAP(entry),
9902 sub_start,
9903 copy,
9904 interruptible,
9905 dst_map->pmap,
9906 TRUE);
9907 }
9908 if (kr != KERN_SUCCESS) {
9909 if (next_copy != NULL) {
9910 copy->cpy_hdr.nentries +=
9911 remaining_entries;
9912 copy->cpy_hdr.links.prev->vme_next =
9913 next_copy;
9914 copy->cpy_hdr.links.prev
9915 = previous_prev;
9916 copy->size = total_size;
9917 }
9918 return kr;
9919 }
9920 if (dst_end <= local_end) {
9921 return KERN_SUCCESS;
9922 }
9923 /* otherwise copy no longer exists, it was */
9924 /* destroyed after successful copy_overwrite */
9925 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9926 copy->offset = new_offset;
9927 copy->cpy_hdr.page_shift = copy_page_shift;
9928
9929 total_size -= copy_size;
9930 copy_size = 0;
9931 /* put back remainder of copy in container */
9932 if (next_copy != NULL) {
9933 copy->cpy_hdr.nentries = remaining_entries;
9934 copy->cpy_hdr.links.next = next_copy;
9935 copy->cpy_hdr.links.prev = previous_prev;
9936 copy->size = total_size;
9937 next_copy->vme_prev =
9938 vm_map_copy_to_entry(copy);
9939 next_copy = NULL;
9940 }
9941 base_addr = local_end;
9942 vm_map_lock(dst_map);
9943 if (!vm_map_lookup_entry(dst_map,
9944 local_end, &tmp_entry)) {
9945 vm_map_unlock(dst_map);
9946 return KERN_INVALID_ADDRESS;
9947 }
9948 entry = tmp_entry;
9949 continue;
9950 }
9951 if (dst_end <= entry->vme_end) {
9952 copy_size = dst_end - base_addr;
9953 break;
9954 }
9955
9956 if ((next == vm_map_to_entry(dst_map)) ||
9957 (next->vme_start != entry->vme_end)) {
9958 vm_map_unlock(dst_map);
9959 return KERN_INVALID_ADDRESS;
9960 }
9961
9962 entry = next;
9963 }/* for */
9964
9965 next_copy = NULL;
9966 nentries = 1;
9967
9968 /* adjust the copy object */
9969 if (total_size > copy_size) {
9970 vm_map_size_t local_size = 0;
9971 vm_map_size_t entry_size;
9972
9973 new_offset = copy->offset;
9974 copy_entry = vm_map_copy_first_entry(copy);
9975 while (copy_entry != vm_map_copy_to_entry(copy)) {
9976 entry_size = copy_entry->vme_end -
9977 copy_entry->vme_start;
9978 if ((local_size < copy_size) &&
9979 ((local_size + entry_size)
9980 >= copy_size)) {
9981 vm_map_copy_clip_end(copy, copy_entry,
9982 copy_entry->vme_start +
9983 (copy_size - local_size));
9984 entry_size = copy_entry->vme_end -
9985 copy_entry->vme_start;
9986 local_size += entry_size;
9987 new_offset += entry_size;
9988 }
9989 if (local_size >= copy_size) {
9990 next_copy = copy_entry->vme_next;
9991 copy_entry->vme_next =
9992 vm_map_copy_to_entry(copy);
9993 previous_prev =
9994 copy->cpy_hdr.links.prev;
9995 copy->cpy_hdr.links.prev = copy_entry;
9996 copy->size = copy_size;
9997 remaining_entries =
9998 copy->cpy_hdr.nentries;
9999 remaining_entries -= nentries;
10000 copy->cpy_hdr.nentries = nentries;
10001 break;
10002 } else {
10003 local_size += entry_size;
10004 new_offset += entry_size;
10005 nentries++;
10006 }
10007 copy_entry = copy_entry->vme_next;
10008 }
10009 }
10010
10011 if (aligned) {
10012 pmap_t local_pmap;
10013
10014 if (pmap) {
10015 local_pmap = pmap;
10016 } else {
10017 local_pmap = dst_map->pmap;
10018 }
10019
10020 if ((kr = vm_map_copy_overwrite_aligned(
10021 dst_map, tmp_entry, copy,
10022 base_addr, local_pmap)) != KERN_SUCCESS) {
10023 if (next_copy != NULL) {
10024 copy->cpy_hdr.nentries +=
10025 remaining_entries;
10026 copy->cpy_hdr.links.prev->vme_next =
10027 next_copy;
10028 copy->cpy_hdr.links.prev =
10029 previous_prev;
10030 copy->size += copy_size;
10031 }
10032 return kr;
10033 }
10034 vm_map_unlock(dst_map);
10035 } else {
10036 /*
10037 * Performance gain:
10038 *
10039 * if the copy and dst address are misaligned but have the same
10040 * offset within the page, we can copy the misaligned parts
10041 * unaligned and copy the rest aligned. If they are aligned
10042 * but len is unaligned, we simply need to copy
10043 * the end bit unaligned. We'll need to split the misaligned
10044 * bits of the region in this case!
10045 */
10046 /* ALWAYS UNLOCKS THE dst_map MAP */
10047 kr = vm_map_copy_overwrite_unaligned(
10048 dst_map,
10049 tmp_entry,
10050 copy,
10051 base_addr,
10052 discard_on_success);
10053 if (kr != KERN_SUCCESS) {
10054 if (next_copy != NULL) {
10055 copy->cpy_hdr.nentries +=
10056 remaining_entries;
10057 copy->cpy_hdr.links.prev->vme_next =
10058 next_copy;
10059 copy->cpy_hdr.links.prev =
10060 previous_prev;
10061 copy->size += copy_size;
10062 }
10063 return kr;
10064 }
10065 }
10066 total_size -= copy_size;
10067 if (total_size == 0) {
10068 break;
10069 }
10070 base_addr += copy_size;
10071 copy_size = 0;
10072 copy->offset = new_offset;
10073 if (next_copy != NULL) {
10074 copy->cpy_hdr.nentries = remaining_entries;
10075 copy->cpy_hdr.links.next = next_copy;
10076 copy->cpy_hdr.links.prev = previous_prev;
10077 next_copy->vme_prev = vm_map_copy_to_entry(copy);
10078 copy->size = total_size;
10079 }
10080 vm_map_lock(dst_map);
10081 while (TRUE) {
10082 if (!vm_map_lookup_entry(dst_map,
10083 base_addr, &tmp_entry)) {
10084 vm_map_unlock(dst_map);
10085 return KERN_INVALID_ADDRESS;
10086 }
10087 if (tmp_entry->in_transition) {
10088 entry->needs_wakeup = TRUE;
10089 vm_map_entry_wait(dst_map, THREAD_UNINT);
10090 } else {
10091 break;
10092 }
10093 }
10094 vm_map_clip_start(dst_map,
10095 tmp_entry,
10096 vm_map_trunc_page(base_addr,
10097 VM_MAP_PAGE_MASK(dst_map)));
10098
10099 entry = tmp_entry;
10100 } /* while */
10101
10102 /*
10103 * Throw away the vm_map_copy object
10104 */
10105 if (discard_on_success) {
10106 vm_map_copy_discard(copy);
10107 }
10108
10109 return KERN_SUCCESS;
10110 }/* vm_map_copy_overwrite */
10111
10112 kern_return_t
10113 vm_map_copy_overwrite(
10114 vm_map_t dst_map,
10115 vm_map_offset_t dst_addr,
10116 vm_map_copy_t copy,
10117 vm_map_size_t copy_size,
10118 boolean_t interruptible)
10119 {
10120 vm_map_size_t head_size, tail_size;
10121 vm_map_copy_t head_copy, tail_copy;
10122 vm_map_offset_t head_addr, tail_addr;
10123 vm_map_entry_t entry;
10124 kern_return_t kr;
10125 vm_map_offset_t effective_page_mask, effective_page_size;
10126 uint16_t copy_page_shift;
10127
10128 head_size = 0;
10129 tail_size = 0;
10130 head_copy = NULL;
10131 tail_copy = NULL;
10132 head_addr = 0;
10133 tail_addr = 0;
10134
10135 /*
10136 * Check for null copy object.
10137 */
10138 if (copy == VM_MAP_COPY_NULL) {
10139 return KERN_SUCCESS;
10140 }
10141
10142 if (__improbable(vm_map_range_overflows(dst_map, dst_addr, copy_size))) {
10143 return KERN_INVALID_ADDRESS;
10144 }
10145
10146 /*
10147 * Assert that the vm_map_copy is coming from the right
10148 * zone and hasn't been forged
10149 */
10150 vm_map_copy_require(copy);
10151
10152 if (interruptible ||
10153 copy->type != VM_MAP_COPY_ENTRY_LIST) {
10154 /*
10155 * We can't split the "copy" map if we're interruptible
10156 * or if we don't have a "copy" map...
10157 */
10158 blunt_copy:
10159 kr = vm_map_copy_overwrite_nested(dst_map,
10160 dst_addr,
10161 copy,
10162 interruptible,
10163 (pmap_t) NULL,
10164 TRUE);
10165 if (kr) {
10166 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), kr /* arg */);
10167 }
10168 return kr;
10169 }
10170
10171 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
10172 if (copy_page_shift < PAGE_SHIFT ||
10173 VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10174 goto blunt_copy;
10175 }
10176
10177 if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10178 effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
10179 } else {
10180 effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
10181 effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
10182 effective_page_mask);
10183 }
10184 effective_page_size = effective_page_mask + 1;
10185
10186 if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
10187 /*
10188 * Too small to bother with optimizing...
10189 */
10190 goto blunt_copy;
10191 }
10192
10193 if ((dst_addr & effective_page_mask) !=
10194 (copy->offset & effective_page_mask)) {
10195 /*
10196 * Incompatible mis-alignment of source and destination...
10197 */
10198 goto blunt_copy;
10199 }
10200
10201 /*
10202 * Proper alignment or identical mis-alignment at the beginning.
10203 * Let's try and do a small unaligned copy first (if needed)
10204 * and then an aligned copy for the rest.
10205 */
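	/*
	 * Worked example of the head/tail split below (illustrative;
	 * assumes 16KB effective pages and ignores the minimum-size
	 * threshold): with dst_addr = 0x10003800, copy->offset at the
	 * same page offset 0x3800, and copy_size = 0x9000:
	 */
#if 0
	head_addr = 0x10003800;                   /* == dst_addr */
	head_size = 0x4000 - 0x3800;              /* == 0x800, up to 0x10004000 */
	tail_size = (0x3800 + 0x9000) & 0x3fff;   /* == 0x800 */
	tail_addr = 0x10003800 + 0x9000 - 0x800;  /* == 0x1000c000 */
	/* leaving an aligned middle of 0x8000 bytes at 0x10004000 */
#endif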
10206 if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
10207 head_addr = dst_addr;
10208 head_size = (effective_page_size -
10209 (copy->offset & effective_page_mask));
10210 head_size = MIN(head_size, copy_size);
10211 }
10212 if (!vm_map_page_aligned(copy->offset + copy_size,
10213 effective_page_mask)) {
10214 /*
10215 * Mis-alignment at the end.
10216 * Do an aligned copy up to the last page and
10217 * then an unaligned copy for the remaining bytes.
10218 */
10219 tail_size = ((copy->offset + copy_size) &
10220 effective_page_mask);
10221 tail_size = MIN(tail_size, copy_size);
10222 tail_addr = dst_addr + copy_size - tail_size;
10223 assert(tail_addr >= head_addr + head_size);
10224 }
10225 assert(head_size + tail_size <= copy_size);
10226
10227 if (head_size + tail_size == copy_size) {
10228 /*
10229 * It's all unaligned, no optimization possible...
10230 */
10231 goto blunt_copy;
10232 }
10233
10234 /*
10235 * Can't optimize if there are any submaps in the
10236 * destination due to the way we free the "copy" map
10237 * progressively in vm_map_copy_overwrite_nested()
10238 * in that case.
10239 */
10240 vm_map_lock_read(dst_map);
10241 if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
10242 vm_map_unlock_read(dst_map);
10243 goto blunt_copy;
10244 }
10245 for (;
10246 (entry != vm_map_to_entry(dst_map) &&
10247 entry->vme_start < dst_addr + copy_size);
10248 entry = entry->vme_next) {
10249 if (entry->is_sub_map) {
10250 vm_map_unlock_read(dst_map);
10251 goto blunt_copy;
10252 }
10253 }
10254 vm_map_unlock_read(dst_map);
10255
10256 if (head_size) {
10257 /*
10258 * Unaligned copy of the first "head_size" bytes, to reach
10259 * a page boundary.
10260 */
10261
10262 /*
10263 * Extract "head_copy" out of "copy".
10264 */
10265 head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10266 head_copy->cpy_hdr.entries_pageable =
10267 copy->cpy_hdr.entries_pageable;
10268 head_copy->cpy_hdr.page_shift = copy_page_shift;
10269
10270 entry = vm_map_copy_first_entry(copy);
10271 if (entry->vme_end < copy->offset + head_size) {
10272 head_size = entry->vme_end - copy->offset;
10273 }
10274
10275 head_copy->offset = copy->offset;
10276 head_copy->size = head_size;
10277 copy->offset += head_size;
10278 copy->size -= head_size;
10279 copy_size -= head_size;
10280 assert(copy_size > 0);
10281
10282 vm_map_copy_clip_end(copy, entry, copy->offset);
10283 vm_map_copy_entry_unlink(copy, entry);
10284 vm_map_copy_entry_link(head_copy,
10285 vm_map_copy_to_entry(head_copy),
10286 entry);
10287
10288 /*
10289 * Do the unaligned copy.
10290 */
10291 kr = vm_map_copy_overwrite_nested(dst_map,
10292 head_addr,
10293 head_copy,
10294 interruptible,
10295 (pmap_t) NULL,
10296 FALSE);
10297 if (kr != KERN_SUCCESS) {
10298 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_HEAD_NESTED_ERROR), kr /* arg */);
10299 goto done;
10300 }
10301 }
10302
10303 if (tail_size) {
10304 /*
10305 * Extract "tail_copy" out of "copy".
10306 */
10307 tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10308 tail_copy->cpy_hdr.entries_pageable =
10309 copy->cpy_hdr.entries_pageable;
10310 tail_copy->cpy_hdr.page_shift = copy_page_shift;
10311
10312 tail_copy->offset = copy->offset + copy_size - tail_size;
10313 tail_copy->size = tail_size;
10314
10315 copy->size -= tail_size;
10316 copy_size -= tail_size;
10317 assert(copy_size > 0);
10318
10319 entry = vm_map_copy_last_entry(copy);
10320 vm_map_copy_clip_start(copy, entry, tail_copy->offset);
10321 entry = vm_map_copy_last_entry(copy);
10322 vm_map_copy_entry_unlink(copy, entry);
10323 vm_map_copy_entry_link(tail_copy,
10324 vm_map_copy_last_entry(tail_copy),
10325 entry);
10326 }
10327
10328 /*
10329 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
10330 * we want to avoid TOCTOU issues w.r.t. copy->size but
10331 * we don't need to change vm_map_copy_overwrite_nested()
10332 * and all other vm_map_copy_overwrite variants.
10333 *
10334 * So we assign the original copy_size that was passed into
10335 * this routine back to copy.
10336 *
10337 * This use of local 'copy_size' passed into this routine is
10338 * to try and protect against TOCTOU attacks where the kernel
10339 * has been exploited. We don't expect this to be an issue
10340 * during normal system operation.
10341 */
10342 assertf(copy->size == copy_size,
10343 "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
10344 copy->size = copy_size;
10345
10346 /*
10347 * Copy most (or possibly all) of the data.
10348 */
10349 kr = vm_map_copy_overwrite_nested(dst_map,
10350 dst_addr + head_size,
10351 copy,
10352 interruptible,
10353 (pmap_t) NULL,
10354 FALSE);
10355 if (kr != KERN_SUCCESS) {
10356 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_NESTED_ERROR), kr /* arg */);
10357 goto done;
10358 }
10359
10360 if (tail_size) {
10361 kr = vm_map_copy_overwrite_nested(dst_map,
10362 tail_addr,
10363 tail_copy,
10364 interruptible,
10365 (pmap_t) NULL,
10366 FALSE);
10367 if (kr) {
10368 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_TAIL_NESTED_ERROR), kr /* arg */);
10369 }
10370 }
10371
10372 done:
10373 assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
10374 if (kr == KERN_SUCCESS) {
10375 /*
10376 * Discard all the copy maps.
10377 */
10378 if (head_copy) {
10379 vm_map_copy_discard(head_copy);
10380 head_copy = NULL;
10381 }
10382 vm_map_copy_discard(copy);
10383 if (tail_copy) {
10384 vm_map_copy_discard(tail_copy);
10385 tail_copy = NULL;
10386 }
10387 } else {
10388 /*
10389 * Re-assemble the original copy map.
10390 */
10391 if (head_copy) {
10392 entry = vm_map_copy_first_entry(head_copy);
10393 vm_map_copy_entry_unlink(head_copy, entry);
10394 vm_map_copy_entry_link(copy,
10395 vm_map_copy_to_entry(copy),
10396 entry);
10397 copy->offset -= head_size;
10398 copy->size += head_size;
10399 vm_map_copy_discard(head_copy);
10400 head_copy = NULL;
10401 }
10402 if (tail_copy) {
10403 entry = vm_map_copy_last_entry(tail_copy);
10404 vm_map_copy_entry_unlink(tail_copy, entry);
10405 vm_map_copy_entry_link(copy,
10406 vm_map_copy_last_entry(copy),
10407 entry);
10408 copy->size += tail_size;
10409 vm_map_copy_discard(tail_copy);
10410 tail_copy = NULL;
10411 }
10412 }
10413 return kr;
10414 }
10415
10416
10417 /*
10418 * Routine: vm_map_copy_overwrite_unaligned [internal use only]
10419 *
10420 * Description:
10421 * Physically copy unaligned data
10422 *
10423 * Implementation:
10424 * Unaligned parts of pages have to be physically copied. We use
10425 * a modified form of vm_fault_copy (which understands non-aligned
10426 * page offsets and sizes) to do the copy. We attempt to copy as
10427 * much memory in one go as possible; however, vm_fault_copy copies
10428 * within 1 memory object, so we have to find the smallest of "amount left",
10429 * "source object data size" and "target object data size". With
10430 * unaligned data we don't need to split regions, therefore the source
10431 * (copy) object should be one map entry; the target range may be split
10432 * over multiple map entries, however. In any event we are pessimistic
10433 * about these assumptions.
10434 *
10435 * Callers of this function must call vm_map_copy_require on
10436 * previously created vm_map_copy_t or pass a newly created
10437 * one to ensure that it hasn't been forged.
10438 *
10439 * Assumptions:
10440 * dst_map is locked on entry and is return locked on success,
10441 * unlocked on error.
10442 */
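/*
 * Worked example of the initial source offset computed in the routine
 * below (illustrative; assumes 4KB copy pages): for copy->offset =
 * 0x12345, trunc_page_mask_64() yields 0x12000, so src_offset = 0x345,
 * i.e. the byte position of the unaligned data within its first page.
 */
#if 0
	src_offset = copy->offset -
	    trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
	/* 0x12345 - 0x12000 == 0x345 */
#endif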
10443
10444 static kern_return_t
10445 vm_map_copy_overwrite_unaligned(
10446 vm_map_t dst_map,
10447 vm_map_entry_t entry,
10448 vm_map_copy_t copy,
10449 vm_map_offset_t start,
10450 boolean_t discard_on_success)
10451 {
10452 vm_map_entry_t copy_entry;
10453 vm_map_entry_t copy_entry_next;
10454 vm_map_version_t version;
10455 vm_object_t dst_object;
10456 vm_object_offset_t dst_offset;
10457 vm_object_offset_t src_offset;
10458 vm_object_offset_t entry_offset;
10459 vm_map_offset_t entry_end;
10460 vm_map_size_t src_size,
10461 dst_size,
10462 copy_size,
10463 amount_left;
10464 kern_return_t kr = KERN_SUCCESS;
10465
10466
10467 copy_entry = vm_map_copy_first_entry(copy);
10468
10469 vm_map_lock_write_to_read(dst_map);
10470
10471 src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
10472 amount_left = copy->size;
10473 /*
10474 * unaligned, so we never clipped this entry; we need the offset into
10475 * the vm_object, not just into the data.
10476 */
10477 while (amount_left > 0) {
10478 if (entry == vm_map_to_entry(dst_map)) {
10479 vm_map_unlock_read(dst_map);
10480 return KERN_INVALID_ADDRESS;
10481 }
10482
10483 /* "start" must be within the current map entry */
10484 assert((start >= entry->vme_start) && (start < entry->vme_end));
10485
10486 /*
10487 * Check protection again
10488 */
10489 if (!(entry->protection & VM_PROT_WRITE)) {
10490 vm_map_unlock_read(dst_map);
10491 return KERN_PROTECTION_FAILURE;
10492 }
10493 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10494 vm_map_unlock_read(dst_map);
10495 return KERN_PROTECTION_FAILURE;
10496 }
10497
10498 /*
10499 * If the entry is in transition, we must wait
10500 * for it to exit that state. Anything could happen
10501 * when we unlock the map, so start over.
10502 */
10503 if (entry->in_transition) {
10504 /*
10505 * Say that we are waiting, and wait for entry.
10506 */
10507 entry->needs_wakeup = TRUE;
10508 vm_map_entry_wait(dst_map, THREAD_UNINT);
10509
10510 goto RetryLookup;
10511 }
10512
10513 dst_offset = start - entry->vme_start;
10514
10515 dst_size = entry->vme_end - start;
10516
10517 src_size = copy_entry->vme_end -
10518 (copy_entry->vme_start + src_offset);
10519
10520 if (dst_size < src_size) {
10521 /*
10522 * we can only copy dst_size bytes before
10523 * we have to get the next destination entry
10524 */
10525 copy_size = dst_size;
10526 } else {
10527 /*
10528 * we can only copy src_size bytes before
10529 * we have to get the next source copy entry
10530 */
10531 copy_size = src_size;
10532 }
10533
10534 if (copy_size > amount_left) {
10535 copy_size = amount_left;
10536 }
10537 /*
10538 * Entry needs copy: create a shadow object for the
10539 * copy-on-write region.
10540 */
10541 if (entry->needs_copy) {
10542 if (vm_map_lock_read_to_write(dst_map)) {
10543 vm_map_lock_read(dst_map);
10544 goto RetryLookup;
10545 }
10546 VME_OBJECT_SHADOW(entry,
10547 (vm_map_size_t)(entry->vme_end
10548 - entry->vme_start),
10549 vm_map_always_shadow(dst_map));
10550 entry->needs_copy = FALSE;
10551 vm_map_lock_write_to_read(dst_map);
10552 }
10553 dst_object = VME_OBJECT(entry);
10554 /*
10555 * unlike with the virtual (aligned) copy, we're going
10556 * to fault on it, therefore we need a target object.
10557 */
10558 if (dst_object == VM_OBJECT_NULL) {
10559 if (vm_map_lock_read_to_write(dst_map)) {
10560 vm_map_lock_read(dst_map);
10561 goto RetryLookup;
10562 }
10563 dst_object = vm_object_allocate((vm_map_size_t)
10564 entry->vme_end - entry->vme_start);
10565 VME_OBJECT_SET(entry, dst_object, false, 0);
10566 VME_OFFSET_SET(entry, 0);
10567 assert(entry->use_pmap);
10568 vm_map_lock_write_to_read(dst_map);
10569 }
10570 /*
10571 * Take an object reference and unlock map. The "entry" may
10572 * disappear or change when the map is unlocked.
10573 */
10574 vm_object_reference(dst_object);
10575 version.main_timestamp = dst_map->timestamp;
10576 entry_offset = VME_OFFSET(entry);
10577 entry_end = entry->vme_end;
10578 vm_map_unlock_read(dst_map);
10579 /*
10580 * Copy as much as possible in one pass
10581 */
10582 kr = vm_fault_copy(
10583 VME_OBJECT(copy_entry),
10584 VME_OFFSET(copy_entry) + src_offset,
10585 &copy_size,
10586 dst_object,
10587 entry_offset + dst_offset,
10588 dst_map,
10589 &version,
10590 THREAD_UNINT );
10591
10592 start += copy_size;
10593 src_offset += copy_size;
10594 amount_left -= copy_size;
10595 /*
10596 * Release the object reference
10597 */
10598 vm_object_deallocate(dst_object);
10599 /*
10600 * If a hard error occurred, return it now
10601 */
10602 if (kr != KERN_SUCCESS) {
10603 return kr;
10604 }
10605
10606 if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
10607 || amount_left == 0) {
10608 /*
10609 * all done with this copy entry, dispose.
10610 */
10611 copy_entry_next = copy_entry->vme_next;
10612
10613 if (discard_on_success) {
10614 vm_map_copy_entry_unlink(copy, copy_entry);
10615 assert(!copy_entry->is_sub_map);
10616 vm_object_deallocate(VME_OBJECT(copy_entry));
10617 vm_map_copy_entry_dispose(copy_entry);
10618 }
10619
10620 if (copy_entry_next == vm_map_copy_to_entry(copy) &&
10621 amount_left) {
10622 /*
10623 * not finished copying but ran out of source
10624 */
10625 return KERN_INVALID_ADDRESS;
10626 }
10627
10628 copy_entry = copy_entry_next;
10629
10630 src_offset = 0;
10631 }
10632
10633 if (amount_left == 0) {
10634 return KERN_SUCCESS;
10635 }
10636
10637 vm_map_lock_read(dst_map);
10638 if (version.main_timestamp == dst_map->timestamp) {
10639 if (start == entry_end) {
10640 /*
10641 * destination region is split. Use the version
10642 * information to avoid a lookup in the normal
10643 * case.
10644 */
10645 entry = entry->vme_next;
10646 /*
10647 * should be contiguous. Fail if we encounter
10648 * a hole in the destination.
10649 */
10650 if (start != entry->vme_start) {
10651 vm_map_unlock_read(dst_map);
10652 return KERN_INVALID_ADDRESS;
10653 }
10654 }
10655 } else {
10656 /*
10657 * Map version check failed.
10658 * we must look up the entry because somebody
10659 * might have changed the map behind our backs.
10660 */
10661 RetryLookup:
10662 if (!vm_map_lookup_entry(dst_map, start, &entry)) {
10663 vm_map_unlock_read(dst_map);
10664 return KERN_INVALID_ADDRESS;
10665 }
10666 }
10667 }/* while */
10668
10669 return KERN_SUCCESS;
10670 }/* vm_map_copy_overwrite_unaligned */
10671
10672 /*
10673 * Routine: vm_map_copy_overwrite_aligned [internal use only]
10674 *
10675 * Description:
10676 * Does all the vm_trickery possible for whole pages.
10677 *
10678 * Implementation:
10679 *
10680 * If there are no permanent objects in the destination,
10681 * and the source and destination map entry zones match,
10682 * and the destination map entry is not shared,
10683 * then the map entries can be deleted and replaced
10684 * with those from the copy. The following code is the
10685 * basic idea of what to do, but there are lots of annoying
10686 * little details about getting protection and inheritance
10687 * right. Should add protection, inheritance, and sharing checks
10688 * to the above pass and make sure that no wiring is involved.
10689 *
10690 * Callers of this function must call vm_map_copy_require on
10691 * previously created vm_map_copy_t or pass a newly created
10692 * one to ensure that it hasn't been forged.
10693 */
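/*
 * Condensed sketch of the fast-path test used in the routine below
 * (illustrative; omits the TPRO and pmap-protection-policy checks):
 * a destination entry may be replaced wholesale only when nothing else
 * can observe the old mapping and no special semantics apply.
 * "can_replace" is a hypothetical name for illustration.
 */
#if 0
	can_replace = !entry->is_shared &&
	    !entry->vme_permanent &&
	    !entry->used_for_jit &&
	    !(entry->protection & VM_PROT_EXECUTE) &&
	    (object == VM_OBJECT_NULL ||
	    (object->internal && !object->true_share &&
	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE));
#endif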
10694
10695 int vm_map_copy_overwrite_aligned_src_not_internal = 0;
10696 int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
10697 int vm_map_copy_overwrite_aligned_src_large = 0;
10698
10699 static kern_return_t
10700 vm_map_copy_overwrite_aligned(
10701 vm_map_t dst_map,
10702 vm_map_entry_t tmp_entry,
10703 vm_map_copy_t copy,
10704 vm_map_offset_t start,
10705 __unused pmap_t pmap)
10706 {
10707 vm_object_t object;
10708 vm_map_entry_t copy_entry;
10709 vm_map_size_t copy_size;
10710 vm_map_size_t size;
10711 vm_map_entry_t entry;
10712
10713 while ((copy_entry = vm_map_copy_first_entry(copy))
10714 != vm_map_copy_to_entry(copy)) {
10715 copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10716
10717 entry = tmp_entry;
10718 if (entry->is_sub_map) {
10719 /* unnested when clipped earlier */
10720 assert(!entry->use_pmap);
10721 }
10722 if (entry == vm_map_to_entry(dst_map)) {
10723 vm_map_unlock(dst_map);
10724 return KERN_INVALID_ADDRESS;
10725 }
10726 size = (entry->vme_end - entry->vme_start);
10727 /*
10728 * Make sure that no holes popped up in the
10729 * address map, and that the protection is
10730 * still valid, in case the map was unlocked
10731 * earlier.
10732 */
10733
10734 if ((entry->vme_start != start) || ((entry->is_sub_map)
10735 && !entry->needs_copy)) {
10736 vm_map_unlock(dst_map);
10737 return KERN_INVALID_ADDRESS;
10738 }
10739 assert(entry != vm_map_to_entry(dst_map));
10740
10741 /*
10742 * Check protection again
10743 */
10744
10745 if (!(entry->protection & VM_PROT_WRITE)) {
10746 vm_map_unlock(dst_map);
10747 return KERN_PROTECTION_FAILURE;
10748 }
10749
10750 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10751 vm_map_unlock(dst_map);
10752 return KERN_PROTECTION_FAILURE;
10753 }
10754
10755 /*
10756 * If the entry is in transition, we must wait
10757 * for it to exit that state. Anything could happen
10758 * when we unlock the map, so start over.
10759 */
10760 if (entry->in_transition) {
10761 /*
10762 * Say that we are waiting, and wait for entry.
10763 */
10764 entry->needs_wakeup = TRUE;
10765 vm_map_entry_wait(dst_map, THREAD_UNINT);
10766
10767 goto RetryLookup;
10768 }
10769
10770 /*
10771 * Adjust to source size first
10772 */
10773
10774 if (copy_size < size) {
10775 if (entry->map_aligned &&
10776 !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10777 VM_MAP_PAGE_MASK(dst_map))) {
10778 /* no longer map-aligned */
10779 entry->map_aligned = FALSE;
10780 }
10781 vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10782 size = copy_size;
10783 }
10784
10785 /*
10786 * Adjust to destination size
10787 */
10788
10789 if (size < copy_size) {
10790 vm_map_copy_clip_end(copy, copy_entry,
10791 copy_entry->vme_start + size);
10792 copy_size = size;
10793 }
10794
10795 assert((entry->vme_end - entry->vme_start) == size);
10796 assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10797 assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10798
10799 /*
10800 * If the destination contains temporary unshared memory,
10801 * we can perform the copy by throwing it away and
10802 * installing the source data.
10803 *
10804 * Exceptions for mappings with special semantics:
10805 * + "permanent" entries,
10806 * + JIT regions,
10807 * + TPRO regions,
10808 * + pmap-specific protection policies,
10809 * + VM objects with COPY_NONE copy strategy.
10810 */
10811
10812 object = VME_OBJECT(entry);
10813 if ((!entry->is_shared &&
10814 !entry->vme_permanent &&
10815 !entry->used_for_jit &&
10816 #if __arm64e__
10817 !entry->used_for_tpro &&
10818 #endif /* __arm64e__ */
10819 !(entry->protection & VM_PROT_EXECUTE) &&
10820 !pmap_has_prot_policy(dst_map->pmap, entry->translated_allow_execute, entry->protection) &&
10821 ((object == VM_OBJECT_NULL) ||
10822 (object->internal &&
10823 !object->true_share &&
10824 object->copy_strategy != MEMORY_OBJECT_COPY_NONE))) ||
10825 entry->needs_copy) {
10826 vm_object_t old_object = VME_OBJECT(entry);
10827 vm_object_offset_t old_offset = VME_OFFSET(entry);
10828 vm_object_offset_t offset;
10829
10830 /*
10831 * Ensure that the source and destination aren't
10832 * identical
10833 */
10834 if (old_object == VME_OBJECT(copy_entry) &&
10835 old_offset == VME_OFFSET(copy_entry)) {
10836 vm_map_copy_entry_unlink(copy, copy_entry);
10837 vm_map_copy_entry_dispose(copy_entry);
10838
10839 if (old_object != VM_OBJECT_NULL) {
10840 vm_object_deallocate(old_object);
10841 }
10842
10843 start = tmp_entry->vme_end;
10844 tmp_entry = tmp_entry->vme_next;
10845 continue;
10846 }
10847
10848 #if XNU_TARGET_OS_OSX
10849 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10850 #define __TRADEOFF1_COPY_SIZE (128 * 1024) /* 128 KB */
10851 if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10852 VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10853 copy_size <= __TRADEOFF1_COPY_SIZE) {
10854 /*
10855 * Virtual vs. Physical copy tradeoff #1.
10856 *
10857 * Copying only a few pages out of a large
10858 * object: do a physical copy instead of
10859 * a virtual copy, to avoid possibly keeping
10860 * the entire large object alive because of
10861 * those few copy-on-write pages.
10862 */
10863 vm_map_copy_overwrite_aligned_src_large++;
10864 goto slow_copy;
10865 }
10866 #endif /* XNU_TARGET_OS_OSX */
10867
10868 if ((dst_map->pmap != kernel_pmap) &&
10869 (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10870 (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10871 vm_object_t new_object, new_shadow;
10872
10873 /*
10874 * We're about to map something over a mapping
10875 * established by malloc()...
10876 */
10877 new_object = VME_OBJECT(copy_entry);
10878 if (new_object != VM_OBJECT_NULL) {
10879 vm_object_lock_shared(new_object);
10880 }
10881 while (new_object != VM_OBJECT_NULL &&
10882 #if XNU_TARGET_OS_OSX
10883 !new_object->true_share &&
10884 new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10885 #endif /* XNU_TARGET_OS_OSX */
10886 new_object->internal) {
10887 new_shadow = new_object->shadow;
10888 if (new_shadow == VM_OBJECT_NULL) {
10889 break;
10890 }
10891 vm_object_lock_shared(new_shadow);
10892 vm_object_unlock(new_object);
10893 new_object = new_shadow;
10894 }
10895 if (new_object != VM_OBJECT_NULL) {
10896 if (!new_object->internal) {
10897 /*
10898 * The new mapping is backed
10899 * by an external object. We
10900 * don't want malloc'ed memory
10901 * to be replaced with such a
10902 * non-anonymous mapping, so
10903 * let's go off the optimized
10904 * path...
10905 */
10906 vm_map_copy_overwrite_aligned_src_not_internal++;
10907 vm_object_unlock(new_object);
10908 goto slow_copy;
10909 }
10910 #if XNU_TARGET_OS_OSX
10911 if (new_object->true_share ||
10912 new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10913 /*
10914 * Same if there's a "true_share"
10915 * object in the shadow chain, or
10916 * an object with a non-default
10917 * (SYMMETRIC) copy strategy.
10918 */
10919 vm_map_copy_overwrite_aligned_src_not_symmetric++;
10920 vm_object_unlock(new_object);
10921 goto slow_copy;
10922 }
10923 #endif /* XNU_TARGET_OS_OSX */
10924 vm_object_unlock(new_object);
10925 }
10926 /*
10927 * The new mapping is still backed by
10928 * anonymous (internal) memory, so it's
10929 * OK to substitute it for the original
10930 * malloc() mapping.
10931 */
10932 }
10933
10934 if (old_object != VM_OBJECT_NULL) {
10935 assert(!entry->vme_permanent);
10936 if (entry->is_sub_map) {
10937 if (entry->use_pmap) {
10938 #ifndef NO_NESTED_PMAP
10939 pmap_unnest(dst_map->pmap,
10940 (addr64_t)entry->vme_start,
10941 entry->vme_end - entry->vme_start);
10942 #endif /* NO_NESTED_PMAP */
10943 if (dst_map->mapped_in_other_pmaps) {
10944 /* clean up parent */
10945 /* map/maps */
10946 vm_map_submap_pmap_clean(
10947 dst_map, entry->vme_start,
10948 entry->vme_end,
10949 VME_SUBMAP(entry),
10950 VME_OFFSET(entry));
10951 }
10952 } else {
10953 vm_map_submap_pmap_clean(
10954 dst_map, entry->vme_start,
10955 entry->vme_end,
10956 VME_SUBMAP(entry),
10957 VME_OFFSET(entry));
10958 }
10959 vm_map_deallocate(VME_SUBMAP(entry));
10960 } else {
10961 if (dst_map->mapped_in_other_pmaps) {
10962 vm_object_pmap_protect_options(
10963 VME_OBJECT(entry),
10964 VME_OFFSET(entry),
10965 entry->vme_end
10966 - entry->vme_start,
10967 PMAP_NULL,
10968 PAGE_SIZE,
10969 entry->vme_start,
10970 VM_PROT_NONE,
10971 PMAP_OPTIONS_REMOVE);
10972 } else {
10973 pmap_remove_options(
10974 dst_map->pmap,
10975 (addr64_t)(entry->vme_start),
10976 (addr64_t)(entry->vme_end),
10977 PMAP_OPTIONS_REMOVE);
10978 }
10979 vm_object_deallocate(old_object);
10980 }
10981 }
10982
10983 if (entry->iokit_acct) {
10984 /* keep using iokit accounting */
10985 entry->use_pmap = FALSE;
10986 } else {
10987 /* use pmap accounting */
10988 entry->use_pmap = TRUE;
10989 }
10990 assert(!entry->vme_permanent);
10991 VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
10992 object = VME_OBJECT(entry);
10993 entry->needs_copy = copy_entry->needs_copy;
10994 entry->wired_count = 0;
10995 entry->user_wired_count = 0;
10996 offset = VME_OFFSET(copy_entry);
10997 VME_OFFSET_SET(entry, offset);
10998
10999 vm_map_copy_entry_unlink(copy, copy_entry);
11000 vm_map_copy_entry_dispose(copy_entry);
11001
11002 /*
11003 * we could try to push pages into the pmap at this point, BUT
11004 * this optimization only saved on average 2 us per page if ALL
11005 * the pages in the source were currently mapped
11006 * and ALL the pages in the dest were touched; if fewer
11007 * than 2/3 of the pages were touched, this optimization actually cost more cycles.
11008 * It also puts a lot of pressure on the pmap layer w.r.t. mapping structures.
11009 */
11010
11011 /*
11012 * Set up for the next iteration. The map
11013 * has not been unlocked, so the next
11014 * address should be at the end of this
11015 * entry, and the next map entry should be
11016 * the one following it.
11017 */
11018
11019 start = tmp_entry->vme_end;
11020 tmp_entry = tmp_entry->vme_next;
11021 } else {
11022 vm_map_version_t version;
11023 vm_object_t dst_object;
11024 vm_object_offset_t dst_offset;
11025 kern_return_t r;
11026
11027 slow_copy:
11028 if (entry->needs_copy) {
11029 VME_OBJECT_SHADOW(entry,
11030 (entry->vme_end -
11031 entry->vme_start),
11032 vm_map_always_shadow(dst_map));
11033 entry->needs_copy = FALSE;
11034 }
11035
11036 dst_object = VME_OBJECT(entry);
11037 dst_offset = VME_OFFSET(entry);
11038
11039 /*
11040 * Take an object reference, and record
11041 * the map version information so that the
11042 * map can be safely unlocked.
11043 */
11044
11045 if (dst_object == VM_OBJECT_NULL) {
11046 /*
11047 * We would usually have just taken the
11048 * optimized path above if the destination
11049 * object has not been allocated yet. But we
11050 * now disable that optimization if the copy
11051 * entry's object is not backed by anonymous
11052 * memory to avoid replacing malloc'ed
11053 * (i.e. re-usable) anonymous memory with a
11054 * not-so-anonymous mapping.
11055 * So we have to handle this case here and
11056 * allocate a new VM object for this map entry.
11057 */
11058 dst_object = vm_object_allocate(
11059 entry->vme_end - entry->vme_start);
11060 dst_offset = 0;
11061 VME_OBJECT_SET(entry, dst_object, false, 0);
11062 VME_OFFSET_SET(entry, dst_offset);
11063 assert(entry->use_pmap);
11064 }
11065
11066 vm_object_reference(dst_object);
11067
11068 /* account for unlock bumping up timestamp */
11069 version.main_timestamp = dst_map->timestamp + 1;
11070
11071 vm_map_unlock(dst_map);
11072
11073 /*
11074 * Copy as much as possible in one pass
11075 */
11076
11077 copy_size = size;
11078 r = vm_fault_copy(
11079 VME_OBJECT(copy_entry),
11080 VME_OFFSET(copy_entry),
11081 &copy_size,
11082 dst_object,
11083 dst_offset,
11084 dst_map,
11085 &version,
11086 THREAD_UNINT );
11087
11088 /*
11089 * Release the object reference
11090 */
11091
11092 vm_object_deallocate(dst_object);
11093
11094 /*
11095 * If a hard error occurred, return it now
11096 */
11097
11098 if (r != KERN_SUCCESS) {
11099 return r;
11100 }
11101
11102 if (copy_size != 0) {
11103 /*
11104 * Dispose of the copied region
11105 */
11106
11107 vm_map_copy_clip_end(copy, copy_entry,
11108 copy_entry->vme_start + copy_size);
11109 vm_map_copy_entry_unlink(copy, copy_entry);
11110 vm_object_deallocate(VME_OBJECT(copy_entry));
11111 vm_map_copy_entry_dispose(copy_entry);
11112 }
11113
11114 /*
11115 * Pick up in the destination map where we left off.
11116 *
11117 * Use the version information to avoid a lookup
11118 * in the normal case.
11119 */
11120
11121 start += copy_size;
11122 vm_map_lock(dst_map);
11123 if (version.main_timestamp == dst_map->timestamp &&
11124 copy_size != 0) {
11125 /* We can safely use saved tmp_entry value */
11126
11127 if (tmp_entry->map_aligned &&
11128 !VM_MAP_PAGE_ALIGNED(
11129 start,
11130 VM_MAP_PAGE_MASK(dst_map))) {
11131 /* no longer map-aligned */
11132 tmp_entry->map_aligned = FALSE;
11133 }
11134 vm_map_clip_end(dst_map, tmp_entry, start);
11135 tmp_entry = tmp_entry->vme_next;
11136 } else {
11137 /* Must do lookup of tmp_entry */
11138
11139 RetryLookup:
11140 if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
11141 vm_map_unlock(dst_map);
11142 return KERN_INVALID_ADDRESS;
11143 }
11144 if (tmp_entry->map_aligned &&
11145 !VM_MAP_PAGE_ALIGNED(
11146 start,
11147 VM_MAP_PAGE_MASK(dst_map))) {
11148 /* no longer map-aligned */
11149 tmp_entry->map_aligned = FALSE;
11150 }
11151 vm_map_clip_start(dst_map, tmp_entry, start);
11152 }
11153 }
11154 }/* while */
11155
11156 return KERN_SUCCESS;
11157 }/* vm_map_copy_overwrite_aligned */
11158
11159 /*
11160 * Routine: vm_map_copyin_kernel_buffer [internal use only]
11161 *
11162 * Description:
11163 * Copy in data to a kernel buffer from space in the
11164 * source map. The original space may be optionally
11165 * deallocated.
11166 *
11167 * If successful, returns a new copy object.
11168 */
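/*
 * Path-selection sketch (hypothetical caller, not compiled): small
 * out-of-line data takes this kernel-buffer path; anything larger than
 * msg_ool_size_small must use the entry-list flavor via vm_map_copyin().
 */
#if 0
	if (len <= msg_ool_size_small) {
		kr = vm_map_copyin_kernel_buffer(src_map, src_addr, len,
		    src_destroy, &copy);    /* VM_MAP_COPY_KERNEL_BUFFER */
	} else {
		kr = vm_map_copyin(src_map, src_addr, len,
		    src_destroy, &copy);    /* VM_MAP_COPY_ENTRY_LIST */
	}
#endif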
11169 static kern_return_t
11170 vm_map_copyin_kernel_buffer(
11171 vm_map_t src_map,
11172 vm_map_offset_t src_addr,
11173 vm_map_size_t len,
11174 boolean_t src_destroy,
11175 vm_map_copy_t *copy_result)
11176 {
11177 kern_return_t kr;
11178 vm_map_copy_t copy;
11179 void *kdata;
11180
11181 if (len > msg_ool_size_small) {
11182 return KERN_INVALID_ARGUMENT;
11183 }
11184
11185 kdata = kalloc_data(len, Z_WAITOK);
11186 if (kdata == NULL) {
11187 return KERN_RESOURCE_SHORTAGE;
11188 }
11189 kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
11190 if (kr != KERN_SUCCESS) {
11191 kfree_data(kdata, len);
11192 return kr;
11193 }
11194
11195 copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
11196 copy->cpy_kdata = kdata;
11197 copy->size = len;
11198 copy->offset = 0;
11199
11200 if (src_destroy) {
11201 vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
11202
11203 if (src_map == kernel_map) {
11204 flags |= VM_MAP_REMOVE_KUNWIRE;
11205 }
11206
11207 (void)vm_map_remove_guard(src_map,
11208 vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11209 vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
11210 flags, KMEM_GUARD_NONE);
11211 }
11212
11213 *copy_result = copy;
11214 return KERN_SUCCESS;
11215 }
11216
11217 /*
11218 * Routine: vm_map_copyout_kernel_buffer [internal use only]
11219 *
11220 * Description:
11221 * Copy out data from a kernel buffer into space in the
11222 * destination map. The space may be optionally dynamically
11223 * allocated.
11224 *
11225 * If successful, consumes the copy object.
11226 * Otherwise, the caller is responsible for it.
11227 *
11228 * Callers of this function must call vm_map_copy_require on
11229 * previously created vm_map_copy_t or pass a newly created
11230 * one to ensure that it hasn't been forged.
11231 */
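/*
 * Mode sketch (illustrative): with overwrite == FALSE the routine
 * allocates fresh space in "map" and returns its address in *addr;
 * with overwrite == TRUE it copies onto an existing mapping at *addr.
 */
#if 0
	vm_map_address_t addr = 0;      /* OUT when not overwriting */
	kr = vm_map_copyout_kernel_buffer(map, &addr, copy, copy->size,
	    FALSE,      /* overwrite */
	    TRUE);      /* consume_on_success */
#endif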
11232 static int vm_map_copyout_kernel_buffer_failures = 0;
11233 static kern_return_t
11234 vm_map_copyout_kernel_buffer(
11235 vm_map_t map,
11236 vm_map_address_t *addr, /* IN/OUT */
11237 vm_map_copy_t copy,
11238 vm_map_size_t copy_size,
11239 boolean_t overwrite,
11240 boolean_t consume_on_success)
11241 {
11242 kern_return_t kr = KERN_SUCCESS;
11243 thread_t thread = current_thread();
11244
11245 assert(copy->size == copy_size);
11246
11247 /*
11248 * check for corrupted vm_map_copy structure
11249 */
11250 if (copy_size > msg_ool_size_small || copy->offset) {
11251 panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
11252 (long long)copy->size, (long long)copy->offset);
11253 }
11254
11255 if (!overwrite) {
11256 /*
11257 * Allocate space in the target map for the data
11258 */
11259 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11260
11261 if (map == kernel_map) {
11262 vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
11263 }
11264
11265 *addr = 0;
11266 kr = vm_map_enter(map,
11267 addr,
11268 vm_map_round_page(copy_size,
11269 VM_MAP_PAGE_MASK(map)),
11270 (vm_map_offset_t) 0,
11271 vmk_flags,
11272 VM_OBJECT_NULL,
11273 (vm_object_offset_t) 0,
11274 FALSE,
11275 VM_PROT_DEFAULT,
11276 VM_PROT_ALL,
11277 VM_INHERIT_DEFAULT);
11278 if (kr != KERN_SUCCESS) {
11279 return kr;
11280 }
11281 #if KASAN
11282 if (map->pmap == kernel_pmap) {
11283 kasan_notify_address(*addr, copy->size);
11284 }
11285 #endif
11286 }
11287
11288 /*
11289 * Copyout the data from the kernel buffer to the target map.
11290 */
11291 if (thread->map == map) {
11292 /*
11293 * If the target map is the current map, just do
11294 * the copy.
11295 */
11296 assert((vm_size_t)copy_size == copy_size);
11297 if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11298 kr = KERN_INVALID_ADDRESS;
11299 }
11300 } else {
11301 vm_map_t oldmap;
11302
11303 /*
11304 * If the target map is another map, assume the
11305 * target's address space identity for the duration
11306 * of the copy.
11307 */
11308 vm_map_reference(map);
11309 oldmap = vm_map_switch(map);
11310
11311 assert((vm_size_t)copy_size == copy_size);
11312 if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11313 vm_map_copyout_kernel_buffer_failures++;
11314 kr = KERN_INVALID_ADDRESS;
11315 }
11316
11317 (void) vm_map_switch(oldmap);
11318 vm_map_deallocate(map);
11319 }
11320
11321 if (kr != KERN_SUCCESS) {
11322 /* the copy failed, clean up */
11323 if (!overwrite) {
11324 /*
11325 * Deallocate the space we allocated in the target map.
11326 */
11327 (void) vm_map_remove(map,
11328 vm_map_trunc_page(*addr,
11329 VM_MAP_PAGE_MASK(map)),
11330 vm_map_round_page((*addr +
11331 vm_map_round_page(copy_size,
11332 VM_MAP_PAGE_MASK(map))),
11333 VM_MAP_PAGE_MASK(map)));
11334 *addr = 0;
11335 }
11336 } else {
11337 /* copy was successful, discard the copy structure */
11338 if (consume_on_success) {
11339 kfree_data(copy->cpy_kdata, copy_size);
11340 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11341 }
11342 }
11343
11344 return kr;
11345 }
11346
11347 /*
11348 * Routine: vm_map_copy_insert [internal use only]
11349 *
11350 * Description:
11351 * Link a copy chain ("copy") into a map at the
11352 * specified location (after "where").
11353 *
11354 * Callers of this function must call vm_map_copy_require on
11355 * previously created vm_map_copy_t or pass a newly created
11356 * one to ensure that it hasn't been forged.
11357 * Side effects:
11358 * The copy chain is destroyed.
11359 */
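/*
 * Illustrative note: vm_map_copy_insert() consumes "copy"; after the
 * call the structure has been freed and only the entries, now linked
 * into "map", remain valid.
 */
#if 0
	vm_map_copy_insert(map, after_where, copy);
	copy = VM_MAP_COPY_NULL;        /* must not be referenced again */
#endif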
11360 static void
11361 vm_map_copy_insert(
11362 vm_map_t map,
11363 vm_map_entry_t after_where,
11364 vm_map_copy_t copy)
11365 {
11366 vm_map_entry_t entry;
11367
11368 while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11369 entry = vm_map_copy_first_entry(copy);
11370 vm_map_copy_entry_unlink(copy, entry);
11371 vm_map_store_entry_link(map, after_where, entry,
11372 VM_MAP_KERNEL_FLAGS_NONE);
11373 after_where = entry;
11374 }
11375 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11376 }
11377
11378 /*
11379 * Callers of this function must call vm_map_copy_require on
11380 * previously created vm_map_copy_t or pass a newly created
11381 * one to ensure that it hasn't been forged.
11382 */
11383 void
11384 vm_map_copy_remap(
11385 vm_map_t map,
11386 vm_map_entry_t where,
11387 vm_map_copy_t copy,
11388 vm_map_offset_t adjustment,
11389 vm_prot_t cur_prot,
11390 vm_prot_t max_prot,
11391 vm_inherit_t inheritance)
11392 {
11393 vm_map_entry_t copy_entry, new_entry;
11394
11395 for (copy_entry = vm_map_copy_first_entry(copy);
11396 copy_entry != vm_map_copy_to_entry(copy);
11397 copy_entry = copy_entry->vme_next) {
11398 /* get a new VM map entry for the map */
11399 new_entry = vm_map_entry_create(map);
11400 /* copy the "copy entry" to the new entry */
11401 vm_map_entry_copy(map, new_entry, copy_entry);
11402 /* adjust "start" and "end" */
11403 new_entry->vme_start += adjustment;
11404 new_entry->vme_end += adjustment;
11405 /* clear some attributes */
11406 new_entry->inheritance = inheritance;
11407 new_entry->protection = cur_prot;
11408 new_entry->max_protection = max_prot;
11409 new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11410 /* take an extra reference on the entry's "object" */
11411 if (new_entry->is_sub_map) {
11412 assert(!new_entry->use_pmap); /* not nested */
11413 vm_map_reference(VME_SUBMAP(new_entry));
11414 } else {
11415 vm_object_reference(VME_OBJECT(new_entry));
11416 }
11417 /* insert the new entry in the map */
11418 vm_map_store_entry_link(map, where, new_entry,
11419 VM_MAP_KERNEL_FLAGS_NONE);
11420 /* continue inserting the "copy entries" after the new entry */
11421 where = new_entry;
11422 }
11423 }
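/*
 * Editorial note (sketch, not in the original source): the two helpers
 * above differ in ownership of "copy". vm_map_copy_insert() unlinks the
 * entries and frees the copy header, implementing the consume-on-success
 * path; vm_map_copy_remap() clones each entry and leaves "copy" intact
 * for callers that must preserve it. This is exactly the choice made in
 * vm_map_copyout_internal():
 *
 *	if (consume_on_success) {
 *		vm_map_copy_insert(dst_map, last, copy);	// "copy" is gone
 *	} else {
 *		vm_map_copy_remap(dst_map, last, copy, adjustment,
 *		    cur_protection, max_protection, inheritance);
 *		// "copy" remains owned by the caller
 *	}
 */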
11424
11425
11426 /*
11427 * Returns true if *size matches (or is in the range of) copy->size.
11428 * Upon returning true, the *size field is updated with the actual size of the
11429  * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types).
11430 */
11431 boolean_t
11432 vm_map_copy_validate_size(
11433 vm_map_t dst_map,
11434 vm_map_copy_t copy,
11435 vm_map_size_t *size)
11436 {
11437 if (copy == VM_MAP_COPY_NULL) {
11438 return FALSE;
11439 }
11440
11441 /*
11442 * Assert that the vm_map_copy is coming from the right
11443 * zone and hasn't been forged
11444 */
11445 vm_map_copy_require(copy);
11446
11447 vm_map_size_t copy_sz = copy->size;
11448 vm_map_size_t sz = *size;
11449 switch (copy->type) {
11450 case VM_MAP_COPY_KERNEL_BUFFER:
11451 if (sz == copy_sz) {
11452 return TRUE;
11453 }
11454 break;
11455 case VM_MAP_COPY_ENTRY_LIST:
11456 /*
11457 * potential page-size rounding prevents us from exactly
11458 * validating this flavor of vm_map_copy, but we can at least
11459 * assert that it's within a range.
11460 */
11461 if (copy_sz >= sz &&
11462 copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11463 *size = copy_sz;
11464 return TRUE;
11465 }
11466 break;
11467 default:
11468 break;
11469 }
11470 return FALSE;
11471 }
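/*
 * Usage sketch (editorial; assumes a 16K-page dst_map and hypothetical
 * locals): for entry-list copies the validated size may be rounded up,
 * so callers should use the updated *size, not their original request:
 *
 *	vm_map_size_t size = 0x3000;	// caller's unrounded size
 *
 *	if (vm_map_copy_validate_size(dst_map, copy, &size)) {
 *		// size may now be 0x4000 if copy->size was page-rounded
 *		kr = vm_map_copyout_size(dst_map, &dst_addr, copy, size);
 *	}
 */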
11472
11473 /*
11474 * Routine: vm_map_copyout_size
11475 *
11476 * Description:
11477 * Copy out a copy chain ("copy") into newly-allocated
11478 * space in the destination map. Uses a prevalidated
11479 * size for the copy object (vm_map_copy_validate_size).
11480 *
11481 * If successful, consumes the copy object.
11482 * Otherwise, the caller is responsible for it.
11483 */
11484 kern_return_t
11485 vm_map_copyout_size(
11486 vm_map_t dst_map,
11487 vm_map_address_t *dst_addr, /* OUT */
11488 vm_map_copy_t copy,
11489 vm_map_size_t copy_size)
11490 {
11491 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11492 TRUE, /* consume_on_success */
11493 VM_PROT_DEFAULT,
11494 VM_PROT_ALL,
11495 VM_INHERIT_DEFAULT);
11496 }
11497
11498 /*
11499 * Routine: vm_map_copyout
11500 *
11501 * Description:
11502 * Copy out a copy chain ("copy") into newly-allocated
11503 * space in the destination map.
11504 *
11505 * If successful, consumes the copy object.
11506 * Otherwise, the caller is responsible for it.
11507 */
11508 kern_return_t
11509 vm_map_copyout(
11510 vm_map_t dst_map,
11511 vm_map_address_t *dst_addr, /* OUT */
11512 vm_map_copy_t copy)
11513 {
11514 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11515 TRUE, /* consume_on_success */
11516 VM_PROT_DEFAULT,
11517 VM_PROT_ALL,
11518 VM_INHERIT_DEFAULT);
11519 }
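/*
 * Editorial sketch of the copyin/copyout round trip these routines
 * implement (hypothetical locals, error paths abbreviated). On success
 * the copy object is consumed by vm_map_copyout(); on failure the
 * caller must discard it:
 *
 *	vm_map_copy_t copy;
 *	vm_map_address_t dst_addr;
 *	kern_return_t kr;
 *
 *	kr = vm_map_copyin(src_map, src_addr, len, FALSE, &copy);
 *	if (kr != KERN_SUCCESS) {
 *		return kr;
 *	}
 *	kr = vm_map_copyout(dst_map, &dst_addr, copy);
 *	if (kr != KERN_SUCCESS) {
 *		vm_map_copy_discard(copy);	// not consumed on failure
 *	}
 */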
11520
11521 kern_return_t
11522 vm_map_copyout_internal(
11523 vm_map_t dst_map,
11524 vm_map_address_t *dst_addr, /* OUT */
11525 vm_map_copy_t copy,
11526 vm_map_size_t copy_size,
11527 boolean_t consume_on_success,
11528 vm_prot_t cur_protection,
11529 vm_prot_t max_protection,
11530 vm_inherit_t inheritance)
11531 {
11532 vm_map_size_t size;
11533 vm_map_size_t adjustment;
11534 vm_map_offset_t start;
11535 vm_object_offset_t vm_copy_start;
11536 vm_map_entry_t last;
11537 vm_map_entry_t entry;
11538 vm_map_copy_t original_copy;
11539 kern_return_t kr;
11540 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11541
11542 /*
11543 * Check for null copy object.
11544 */
11545
11546 if (copy == VM_MAP_COPY_NULL) {
11547 *dst_addr = 0;
11548 return KERN_SUCCESS;
11549 }
11550
11551 /*
11552 * Assert that the vm_map_copy is coming from the right
11553 * zone and hasn't been forged
11554 */
11555 vm_map_copy_require(copy);
11556
11557 if (copy->size != copy_size) {
11558 *dst_addr = 0;
11559 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR), KERN_FAILURE /* arg */);
11560 return KERN_FAILURE;
11561 }
11562
11563 /*
11564 * Check for special kernel buffer allocated
11565 * by new_ipc_kmsg_copyin.
11566 */
11567
11568 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
11569 kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr,
11570 copy, copy_size, FALSE,
11571 consume_on_success);
11572 if (kr) {
11573 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */);
11574 }
11575 return kr;
11576 }
11577
11578 original_copy = copy;
11579 if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
11580 vm_map_copy_t target_copy;
11581 vm_map_offset_t overmap_start, overmap_end, trimmed_start;
11582
11583 target_copy = VM_MAP_COPY_NULL;
11584 DEBUG4K_ADJUST("adjusting...\n");
11585 kr = vm_map_copy_adjust_to_target(
11586 copy,
11587 0, /* offset */
11588 copy->size, /* size */
11589 dst_map,
11590 TRUE, /* copy */
11591 &target_copy,
11592 &overmap_start,
11593 &overmap_end,
11594 &trimmed_start);
11595 if (kr != KERN_SUCCESS) {
11596 DEBUG4K_COPY("adjust failed 0x%x\n", kr);
11597 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */);
11598 return kr;
11599 }
11600 DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
11601 if (target_copy != copy) {
11602 copy = target_copy;
11603 }
11604 copy_size = copy->size;
11605 }
11606
11607 /*
11608 * Find space for the data
11609 */
11610
11611 vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
11612 VM_MAP_COPY_PAGE_MASK(copy));
11613 size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
11614 VM_MAP_COPY_PAGE_MASK(copy))
11615 - vm_copy_start;
11616
11617 vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map);
11618
11619 vm_map_lock(dst_map);
11620 kr = vm_map_locate_space(dst_map, size, 0, vmk_flags,
11621 &start, &last);
11622 if (kr != KERN_SUCCESS) {
11623 vm_map_unlock(dst_map);
11624 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */);
11625 return kr;
11626 }
11627
11628 adjustment = start - vm_copy_start;
11629 if (!consume_on_success) {
11630 /*
11631 * We're not allowed to consume "copy", so we'll have to
11632 * copy its map entries into the destination map below.
11633 * No need to re-allocate map entries from the correct
11634 * (pageable or not) zone, since we'll get new map entries
11635 * during the transfer.
11636 		 * We'll also adjust the map entries' "start" and "end"
11637 * during the transfer, to keep "copy"'s entries consistent
11638 * with its "offset".
11639 */
11640 goto after_adjustments;
11641 }
11642
11643 /*
11644 * Since we're going to just drop the map
11645 * entries from the copy into the destination
11646 * map, they must come from the same pool.
11647 */
11648
11649 if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
11650 /*
11651 * Mismatches occur when dealing with the default
11652 * pager.
11653 */
11654 vm_map_entry_t next, new;
11655
11656 /*
11657 * Find the zone that the copies were allocated from
11658 */
11659
11660 entry = vm_map_copy_first_entry(copy);
11661
11662 /*
11663 * Reinitialize the copy so that vm_map_copy_entry_link
11664 * will work.
11665 */
11666 vm_map_store_copy_reset(copy, entry);
11667 copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;
11668
11669 /*
11670 * Copy each entry.
11671 */
11672 while (entry != vm_map_copy_to_entry(copy)) {
11673 new = vm_map_copy_entry_create(copy);
11674 vm_map_entry_copy_full(new, entry);
11675 new->vme_no_copy_on_read = FALSE;
11676 assert(!new->iokit_acct);
11677 if (new->is_sub_map) {
11678 /* clr address space specifics */
11679 new->use_pmap = FALSE;
11680 }
11681 vm_map_copy_entry_link(copy,
11682 vm_map_copy_last_entry(copy),
11683 new);
11684 next = entry->vme_next;
11685 vm_map_entry_dispose(entry);
11686 entry = next;
11687 }
11688 }
11689
11690 /*
11691 * Adjust the addresses in the copy chain, and
11692 * reset the region attributes.
11693 */
11694
11695 for (entry = vm_map_copy_first_entry(copy);
11696 entry != vm_map_copy_to_entry(copy);
11697 entry = entry->vme_next) {
11698 if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
11699 /*
11700 * We're injecting this copy entry into a map that
11701 * has the standard page alignment, so clear
11702 * "map_aligned" (which might have been inherited
11703 * from the original map entry).
11704 */
11705 entry->map_aligned = FALSE;
11706 }
11707
11708 entry->vme_start += adjustment;
11709 entry->vme_end += adjustment;
11710
11711 if (entry->map_aligned) {
11712 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
11713 VM_MAP_PAGE_MASK(dst_map)));
11714 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
11715 VM_MAP_PAGE_MASK(dst_map)));
11716 }
11717
11718 entry->inheritance = VM_INHERIT_DEFAULT;
11719 entry->protection = VM_PROT_DEFAULT;
11720 entry->max_protection = VM_PROT_ALL;
11721 entry->behavior = VM_BEHAVIOR_DEFAULT;
11722
11723 /*
11724 * If the entry is now wired,
11725 * map the pages into the destination map.
11726 */
11727 if (entry->wired_count != 0) {
11728 vm_map_offset_t va;
11729 vm_object_offset_t offset;
11730 vm_object_t object;
11731 vm_prot_t prot;
11732 int type_of_fault;
11733 uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;
11734
11735 /* TODO4K would need to use actual page size */
11736 assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);
11737
11738 object = VME_OBJECT(entry);
11739 offset = VME_OFFSET(entry);
11740 va = entry->vme_start;
11741
11742 pmap_pageable(dst_map->pmap,
11743 entry->vme_start,
11744 entry->vme_end,
11745 TRUE);
11746
11747 while (va < entry->vme_end) {
11748 vm_page_t m;
11749 struct vm_object_fault_info fault_info = {};
11750
11751 /*
11752 * Look up the page in the object.
11753 * Assert that the page will be found in the
11754 * top object:
11755 * either
11756 * the object was newly created by
11757 * vm_object_copy_slowly, and has
11758 * copies of all of the pages from
11759 * the source object
11760 * or
11761 * the object was moved from the old
11762 * map entry; because the old map
11763 * entry was wired, all of the pages
11764 * were in the top-level object.
11765 * (XXX not true if we wire pages for
11766 * reading)
11767 */
11768 vm_object_lock(object);
11769
11770 m = vm_page_lookup(object, offset);
11771 if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
11772 m->vmp_absent) {
11773 panic("vm_map_copyout: wiring %p", m);
11774 }
11775
11776 prot = entry->protection;
11777
11778 if (override_nx(dst_map, VME_ALIAS(entry)) &&
11779 prot) {
11780 prot |= VM_PROT_EXECUTE;
11781 }
11782
11783 type_of_fault = DBG_CACHE_HIT_FAULT;
11784
11785 fault_info.user_tag = VME_ALIAS(entry);
11786 fault_info.pmap_options = 0;
11787 if (entry->iokit_acct ||
11788 (!entry->is_sub_map && !entry->use_pmap)) {
11789 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
11790 }
11791 if (entry->vme_xnu_user_debug &&
11792 !VM_PAGE_OBJECT(m)->code_signed) {
11793 /*
11794 * Modified code-signed executable
11795 * region: this page does not belong
11796 * to a code-signed VM object, so it
11797 * must have been copied and should
11798 * therefore be typed XNU_USER_DEBUG
11799 * rather than XNU_USER_EXEC.
11800 */
11801 fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
11802 }
11803
11804 vm_fault_enter(m,
11805 dst_map->pmap,
11806 va,
11807 PAGE_SIZE, 0,
11808 prot,
11809 prot,
11810 VM_PAGE_WIRED(m),
11811 FALSE, /* change_wiring */
11812 VM_KERN_MEMORY_NONE, /* tag - not wiring */
11813 &fault_info,
11814 NULL, /* need_retry */
11815 &type_of_fault,
11816 				    &object_lock_type); /* Exclusive mode lock. Will remain unchanged. */
11817
11818 vm_object_unlock(object);
11819
11820 offset += PAGE_SIZE_64;
11821 va += PAGE_SIZE;
11822 }
11823 }
11824 }
11825
11826 after_adjustments:
11827
11828 /*
11829 * Correct the page alignment for the result
11830 */
11831
11832 *dst_addr = start + (copy->offset - vm_copy_start);
11833
11834 #if KASAN
11835 kasan_notify_address(*dst_addr, size);
11836 #endif
11837
11838 /*
11839 * Update the hints and the map size
11840 */
11841
11842 if (consume_on_success) {
11843 SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
11844 } else {
11845 SAVE_HINT_MAP_WRITE(dst_map, last);
11846 }
11847
11848 dst_map->size += size;
11849
11850 /*
11851 * Link in the copy
11852 */
11853
11854 if (consume_on_success) {
11855 vm_map_copy_insert(dst_map, last, copy);
11856 if (copy != original_copy) {
11857 vm_map_copy_discard(original_copy);
11858 original_copy = VM_MAP_COPY_NULL;
11859 }
11860 } else {
11861 vm_map_copy_remap(dst_map, last, copy, adjustment,
11862 cur_protection, max_protection,
11863 inheritance);
11864 if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
11865 vm_map_copy_discard(copy);
11866 copy = original_copy;
11867 }
11868 }
11869
11870
11871 vm_map_unlock(dst_map);
11872
11873 /*
11874 * XXX If wiring_required, call vm_map_pageable
11875 */
11876
11877 return KERN_SUCCESS;
11878 }
11879
11880 /*
11881 * Routine: vm_map_copyin
11882 *
11883 * Description:
11884 * see vm_map_copyin_common. Exported via Unsupported.exports.
11885 *
11886 */
11887
11888 #undef vm_map_copyin
11889
11890 kern_return_t
11891 vm_map_copyin(
11892 vm_map_t src_map,
11893 vm_map_address_t src_addr,
11894 vm_map_size_t len,
11895 boolean_t src_destroy,
11896 vm_map_copy_t *copy_result) /* OUT */
11897 {
11898 return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11899 FALSE, copy_result, FALSE);
11900 }
11901
11902 /*
11903 * Routine: vm_map_copyin_common
11904 *
11905 * Description:
11906 * Copy the specified region (src_addr, len) from the
11907 * source address space (src_map), possibly removing
11908 * the region from the source address space (src_destroy).
11909 *
11910 * Returns:
11911 * A vm_map_copy_t object (copy_result), suitable for
11912 * insertion into another address space (using vm_map_copyout),
11913 * copying over another address space region (using
11914 * vm_map_copy_overwrite). If the copy is unused, it
11915 * should be destroyed (using vm_map_copy_discard).
11916 *
11917 * In/out conditions:
11918 * The source map should not be locked on entry.
11919 */
11920
11921 typedef struct submap_map {
11922 vm_map_t parent_map;
11923 vm_map_offset_t base_start;
11924 vm_map_offset_t base_end;
11925 vm_map_size_t base_len;
11926 struct submap_map *next;
11927 } submap_map_t;
11928
11929 kern_return_t
11930 vm_map_copyin_common(
11931 vm_map_t src_map,
11932 vm_map_address_t src_addr,
11933 vm_map_size_t len,
11934 boolean_t src_destroy,
11935 __unused boolean_t src_volatile,
11936 vm_map_copy_t *copy_result, /* OUT */
11937 boolean_t use_maxprot)
11938 {
11939 int flags;
11940
11941 flags = 0;
11942 if (src_destroy) {
11943 flags |= VM_MAP_COPYIN_SRC_DESTROY;
11944 }
11945 if (use_maxprot) {
11946 flags |= VM_MAP_COPYIN_USE_MAXPROT;
11947 }
11948 return vm_map_copyin_internal(src_map,
11949 src_addr,
11950 len,
11951 flags,
11952 copy_result);
11953 }
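/*
 * Editorial note: vm_map_copyin_common() is a thin translation layer;
 * its boolean arguments simply compose flags for
 * vm_map_copyin_internal(). A sketch of an equivalent direct call
 * (assuming locals "kr" and "copy"):
 *
 *	kr = vm_map_copyin_internal(src_map, src_addr, len,
 *	    VM_MAP_COPYIN_SRC_DESTROY | VM_MAP_COPYIN_USE_MAXPROT,
 *	    &copy);
 */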
11954 kern_return_t
11955 vm_map_copyin_internal(
11956 vm_map_t src_map,
11957 vm_map_address_t src_addr,
11958 vm_map_size_t len,
11959 int flags,
11960 vm_map_copy_t *copy_result) /* OUT */
11961 {
11962 vm_map_entry_t tmp_entry; /* Result of last map lookup --
11963 * in multi-level lookup, this
11964 * entry contains the actual
11965 * vm_object/offset.
11966 */
11967 vm_map_entry_t new_entry = VM_MAP_ENTRY_NULL; /* Map entry for copy */
11968
11969 vm_map_offset_t src_start; /* Start of current entry --
11970 * where copy is taking place now
11971 */
11972 vm_map_offset_t src_end; /* End of entire region to be
11973 * copied */
11974 vm_map_offset_t src_base;
11975 vm_map_t base_map = src_map;
11976 boolean_t map_share = FALSE;
11977 submap_map_t *parent_maps = NULL;
11978
11979 vm_map_copy_t copy; /* Resulting copy */
11980 vm_map_address_t copy_addr;
11981 vm_map_size_t copy_size;
11982 boolean_t src_destroy;
11983 boolean_t use_maxprot;
11984 boolean_t preserve_purgeable;
11985 boolean_t entry_was_shared;
11986 vm_map_entry_t saved_src_entry;
11987
11988
11989 if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
11990 return KERN_INVALID_ARGUMENT;
11991 }
11992
11993 #if CONFIG_KERNEL_TAGGING
11994 if (src_map->pmap == kernel_pmap) {
11995 src_addr = vm_memtag_canonicalize_address(src_addr);
11996 }
11997 #endif /* CONFIG_KERNEL_TAGGING */
11998
11999 src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
12000 use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
12001 preserve_purgeable =
12002 (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
12003
12004 /*
12005 * Check for copies of zero bytes.
12006 */
12007
12008 if (len == 0) {
12009 *copy_result = VM_MAP_COPY_NULL;
12010 return KERN_SUCCESS;
12011 }
12012
12013 /*
12014 * Check that the end address doesn't overflow
12015 */
12016 if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
12017 return KERN_INVALID_ADDRESS;
12018 }
12019 src_end = src_addr + len;
12020 if (src_end < src_addr) {
12021 return KERN_INVALID_ADDRESS;
12022 }
12023
12024 /*
12025 * Compute (page aligned) start and end of region
12026 */
12027 src_start = vm_map_trunc_page(src_addr,
12028 VM_MAP_PAGE_MASK(src_map));
12029 src_end = vm_map_round_page(src_end,
12030 VM_MAP_PAGE_MASK(src_map));
12031 if (src_end < src_addr) {
12032 return KERN_INVALID_ADDRESS;
12033 }
12034
12035 /*
12036 * If the copy is sufficiently small, use a kernel buffer instead
12037 * of making a virtual copy. The theory being that the cost of
12038 * setting up VM (and taking C-O-W faults) dominates the copy costs
12039 * for small regions.
12040 */
12041 if ((len <= msg_ool_size_small) &&
12042 !use_maxprot &&
12043 !preserve_purgeable &&
12044 !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
12045 /*
12046 * Since the "msg_ool_size_small" threshold was increased and
12047 * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
12048 * address space limits, we revert to doing a virtual copy if the
12049 * copied range goes beyond those limits. Otherwise, mach_vm_read()
12050 * of the commpage would now fail when it used to work.
12051 */
12052 (src_start >= vm_map_min(src_map) &&
12053 src_start < vm_map_max(src_map) &&
12054 src_end >= vm_map_min(src_map) &&
12055 src_end < vm_map_max(src_map))) {
12056 return vm_map_copyin_kernel_buffer(src_map, src_addr, len,
12057 src_destroy, copy_result);
12058 }
12059
12060 /*
12061 * Allocate a header element for the list.
12062 *
12063 * Use the start and end in the header to
12064 * remember the endpoints prior to rounding.
12065 */
12066
12067 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12068 copy->cpy_hdr.entries_pageable = TRUE;
12069 copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
12070 copy->offset = src_addr;
12071 copy->size = len;
12072
12073 new_entry = vm_map_copy_entry_create(copy);
12074
12075 #define RETURN(x) \
12076 MACRO_BEGIN \
12077 vm_map_unlock(src_map); \
12078 	if (src_map != base_map)                                \
12079 vm_map_deallocate(src_map); \
12080 if (new_entry != VM_MAP_ENTRY_NULL) \
12081 vm_map_copy_entry_dispose(new_entry); \
12082 vm_map_copy_discard(copy); \
12083 { \
12084 submap_map_t *_ptr; \
12085 \
12086 for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
12087 	                parent_maps = parent_maps->next;        \
12088 if (_ptr->parent_map != base_map) \
12089 vm_map_deallocate(_ptr->parent_map); \
12090 kfree_type(submap_map_t, _ptr); \
12091 } \
12092 } \
12093 MACRO_RETURN(x); \
12094 MACRO_END
12095
12096 /*
12097 * Find the beginning of the region.
12098 */
12099
12100 vm_map_lock(src_map);
12101
12102 /*
12103 * Lookup the original "src_addr" rather than the truncated
12104 * "src_start", in case "src_start" falls in a non-map-aligned
12105 * map entry *before* the map entry that contains "src_addr"...
12106 */
12107 if (!vm_map_lookup_entry(src_map, src_addr, &tmp_entry)) {
12108 RETURN(KERN_INVALID_ADDRESS);
12109 }
12110 if (!tmp_entry->is_sub_map) {
12111 /*
12112 * ... but clip to the map-rounded "src_start" rather than
12113 * "src_addr" to preserve map-alignment. We'll adjust the
12114 * first copy entry at the end, if needed.
12115 */
12116 vm_map_clip_start(src_map, tmp_entry, src_start);
12117 }
12118 if (src_start < tmp_entry->vme_start) {
12119 /*
12120 * Move "src_start" up to the start of the
12121 * first map entry to copy.
12122 */
12123 src_start = tmp_entry->vme_start;
12124 }
12125 /* set for later submap fix-up */
12126 copy_addr = src_start;
12127
12128 /*
12129 * Go through entries until we get to the end.
12130 */
12131
12132 while (TRUE) {
12133 vm_map_entry_t src_entry = tmp_entry; /* Top-level entry */
12134 vm_map_size_t src_size; /* Size of source
12135 * map entry (in both
12136 * maps)
12137 */
12138
12139 vm_object_t src_object; /* Object to copy */
12140 vm_object_offset_t src_offset;
12141
12142 		vm_object_t new_copy_object; /* vm_object_copy_* result */
12143
12144 boolean_t src_needs_copy; /* Should source map
12145 * be made read-only
12146 * for copy-on-write?
12147 */
12148
12149 boolean_t new_entry_needs_copy; /* Will new entry be COW? */
12150
12151 boolean_t was_wired; /* Was source wired? */
12152 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
12153 vm_map_version_t version; /* Version before locks
12154 * dropped to make copy
12155 */
12156 kern_return_t result; /* Return value from
12157 * copy_strategically.
12158 */
12159 while (tmp_entry->is_sub_map) {
12160 vm_map_size_t submap_len;
12161 submap_map_t *ptr;
12162
12163 ptr = kalloc_type(submap_map_t, Z_WAITOK);
12164 ptr->next = parent_maps;
12165 parent_maps = ptr;
12166 ptr->parent_map = src_map;
12167 ptr->base_start = src_start;
12168 ptr->base_end = src_end;
12169 submap_len = tmp_entry->vme_end - src_start;
12170 if (submap_len > (src_end - src_start)) {
12171 submap_len = src_end - src_start;
12172 }
12173 ptr->base_len = submap_len;
12174
12175 src_start -= tmp_entry->vme_start;
12176 src_start += VME_OFFSET(tmp_entry);
12177 src_end = src_start + submap_len;
12178 src_map = VME_SUBMAP(tmp_entry);
12179 vm_map_lock(src_map);
12180 /* keep an outstanding reference for all maps in */
12181 /* the parents tree except the base map */
12182 vm_map_reference(src_map);
12183 vm_map_unlock(ptr->parent_map);
12184 if (!vm_map_lookup_entry(
12185 src_map, src_start, &tmp_entry)) {
12186 RETURN(KERN_INVALID_ADDRESS);
12187 }
12188 map_share = TRUE;
12189 if (!tmp_entry->is_sub_map) {
12190 vm_map_clip_start(src_map, tmp_entry, src_start);
12191 }
12192 src_entry = tmp_entry;
12193 }
12194 /* we are now in the lowest level submap... */
12195
12196 if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
12197 (VME_OBJECT(tmp_entry)->phys_contiguous)) {
12198 			/* This is not supported for now. In the future */
12199 			/* we will need to detect the phys_contig */
12200 			/* condition and then upgrade copy_slowly */
12201 			/* to do a physical copy from the device-mem- */
12202 			/* based object. We can piggy-back off of */
12203 			/* the was_wired boolean to set up the */
12204 			/* proper handling. */
12205 RETURN(KERN_PROTECTION_FAILURE);
12206 }
12207 /*
12208 * Create a new address map entry to hold the result.
12209 * Fill in the fields from the appropriate source entries.
12210 * We must unlock the source map to do this if we need
12211 * to allocate a map entry.
12212 */
12213 if (new_entry == VM_MAP_ENTRY_NULL) {
12214 version.main_timestamp = src_map->timestamp;
12215 vm_map_unlock(src_map);
12216
12217 new_entry = vm_map_copy_entry_create(copy);
12218
12219 vm_map_lock(src_map);
12220 if ((version.main_timestamp + 1) != src_map->timestamp) {
12221 if (!vm_map_lookup_entry(src_map, src_start,
12222 &tmp_entry)) {
12223 RETURN(KERN_INVALID_ADDRESS);
12224 }
12225 if (!tmp_entry->is_sub_map) {
12226 vm_map_clip_start(src_map, tmp_entry, src_start);
12227 }
12228 continue; /* restart w/ new tmp_entry */
12229 }
12230 }
12231
12232 /*
12233 * Verify that the region can be read.
12234 */
12235 if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
12236 !use_maxprot) ||
12237 (src_entry->max_protection & VM_PROT_READ) == 0) {
12238 RETURN(KERN_PROTECTION_FAILURE);
12239 }
12240
12241 /*
12242 * Clip against the endpoints of the entire region.
12243 */
12244
12245 vm_map_clip_end(src_map, src_entry, src_end);
12246
12247 src_size = src_entry->vme_end - src_start;
12248 src_object = VME_OBJECT(src_entry);
12249 src_offset = VME_OFFSET(src_entry);
12250 was_wired = (src_entry->wired_count != 0);
12251
12252 vm_map_entry_copy(src_map, new_entry, src_entry);
12253 if (new_entry->is_sub_map) {
12254 /* clr address space specifics */
12255 new_entry->use_pmap = FALSE;
12256 } else {
12257 /*
12258 * We're dealing with a copy-on-write operation,
12259 * so the resulting mapping should not inherit the
12260 * original mapping's accounting settings.
12261 * "iokit_acct" should have been cleared in
12262 * vm_map_entry_copy().
12263 * "use_pmap" should be reset to its default (TRUE)
12264 * so that the new mapping gets accounted for in
12265 * the task's memory footprint.
12266 */
12267 assert(!new_entry->iokit_acct);
12268 new_entry->use_pmap = TRUE;
12269 }
12270
12271 /*
12272 * Attempt non-blocking copy-on-write optimizations.
12273 */
12274
12275 /*
12276 * If we are destroying the source, and the object
12277 * is internal, we could move the object reference
12278 * from the source to the copy. The copy is
12279 * copy-on-write only if the source is.
12280 * We make another reference to the object, because
12281 * destroying the source entry will deallocate it.
12282 *
12283 * This memory transfer has to be atomic, (to prevent
12284 * the VM object from being shared or copied while
12285 * it's being moved here), so we could only do this
12286 * if we won't have to unlock the VM map until the
12287 * original mapping has been fully removed.
12288 */
12289
12290 RestartCopy:
12291 if ((src_object == VM_OBJECT_NULL ||
12292 (!was_wired && !map_share && !tmp_entry->is_shared
12293 && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
12294 vm_object_copy_quickly(
12295 VME_OBJECT(new_entry),
12296 src_offset,
12297 src_size,
12298 &src_needs_copy,
12299 &new_entry_needs_copy)) {
12300 new_entry->needs_copy = new_entry_needs_copy;
12301
12302 /*
12303 * Handle copy-on-write obligations
12304 */
12305
12306 if (src_needs_copy && !tmp_entry->needs_copy) {
12307 vm_prot_t prot;
12308
12309 prot = src_entry->protection & ~VM_PROT_WRITE;
12310
12311 if (override_nx(src_map, VME_ALIAS(src_entry))
12312 && prot) {
12313 prot |= VM_PROT_EXECUTE;
12314 }
12315
12316 vm_object_pmap_protect(
12317 src_object,
12318 src_offset,
12319 src_size,
12320 (src_entry->is_shared ?
12321 PMAP_NULL
12322 : src_map->pmap),
12323 VM_MAP_PAGE_SIZE(src_map),
12324 src_entry->vme_start,
12325 prot);
12326
12327 assert(tmp_entry->wired_count == 0);
12328 tmp_entry->needs_copy = TRUE;
12329 }
12330
12331 /*
12332 * The map has never been unlocked, so it's safe
12333 * to move to the next entry rather than doing
12334 * another lookup.
12335 */
12336
12337 goto CopySuccessful;
12338 }
12339
12340 entry_was_shared = tmp_entry->is_shared;
12341
12342 /*
12343 * Take an object reference, so that we may
12344 * release the map lock(s).
12345 */
12346
12347 assert(src_object != VM_OBJECT_NULL);
12348 vm_object_reference(src_object);
12349
12350 /*
12351 * Record the timestamp for later verification.
12352 * Unlock the map.
12353 */
12354
12355 version.main_timestamp = src_map->timestamp;
12356 vm_map_unlock(src_map); /* Increments timestamp once! */
12357 saved_src_entry = src_entry;
12358 tmp_entry = VM_MAP_ENTRY_NULL;
12359 src_entry = VM_MAP_ENTRY_NULL;
12360
12361 /*
12362 * Perform the copy
12363 */
12364
12365 if (was_wired ||
12366 (src_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK &&
12367 !(flags & VM_MAP_COPYIN_FORK)) ||
12368 (debug4k_no_cow_copyin &&
12369 VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
12370 CopySlowly:
12371 vm_object_lock(src_object);
12372 result = vm_object_copy_slowly(
12373 src_object,
12374 src_offset,
12375 src_size,
12376 THREAD_UNINT,
12377 &new_copy_object);
12378 /* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
12379 saved_used_for_jit = new_entry->used_for_jit;
12380 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12381 new_entry->used_for_jit = saved_used_for_jit;
12382 VME_OFFSET_SET(new_entry,
12383 src_offset - vm_object_trunc_page(src_offset));
12384 new_entry->needs_copy = FALSE;
12385 } else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
12386 (entry_was_shared || map_share)) {
12387 vm_object_t new_object;
12388
12389 vm_object_lock_shared(src_object);
12390 new_object = vm_object_copy_delayed(
12391 src_object,
12392 src_offset,
12393 src_size,
12394 TRUE);
12395 if (new_object == VM_OBJECT_NULL) {
12396 goto CopySlowly;
12397 }
12398
12399 VME_OBJECT_SET(new_entry, new_object, false, 0);
12400 assert(new_entry->wired_count == 0);
12401 new_entry->needs_copy = TRUE;
12402 assert(!new_entry->iokit_acct);
12403 assert(new_object->purgable == VM_PURGABLE_DENY);
12404 assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
12405 result = KERN_SUCCESS;
12406 } else {
12407 vm_object_offset_t new_offset;
12408 new_offset = VME_OFFSET(new_entry);
12409 result = vm_object_copy_strategically(src_object,
12410 src_offset,
12411 src_size,
12412 (flags & VM_MAP_COPYIN_FORK),
12413 &new_copy_object,
12414 &new_offset,
12415 &new_entry_needs_copy);
12416 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
12417 saved_used_for_jit = new_entry->used_for_jit;
12418 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12419 new_entry->used_for_jit = saved_used_for_jit;
12420 if (new_offset != VME_OFFSET(new_entry)) {
12421 VME_OFFSET_SET(new_entry, new_offset);
12422 }
12423
12424 new_entry->needs_copy = new_entry_needs_copy;
12425 }
12426
12427 if (result == KERN_SUCCESS &&
12428 ((preserve_purgeable &&
12429 src_object->purgable != VM_PURGABLE_DENY) ||
12430 new_entry->used_for_jit)) {
12431 /*
12432 * Purgeable objects should be COPY_NONE, true share;
12433 			 * this should be propagated to the copy.
12434 *
12435 * Also force mappings the pmap specially protects to
12436 * be COPY_NONE; trying to COW these mappings would
12437 * change the effective protections, which could have
12438 * side effects if the pmap layer relies on the
12439 * specified protections.
12440 */
12441
12442 vm_object_t new_object;
12443
12444 new_object = VME_OBJECT(new_entry);
12445 assert(new_object != src_object);
12446 vm_object_lock(new_object);
12447 assert(new_object->ref_count == 1);
12448 assert(new_object->shadow == VM_OBJECT_NULL);
12449 assert(new_object->vo_copy == VM_OBJECT_NULL);
12450 assert(new_object->vo_owner == NULL);
12451
12452 new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
12453
12454 if (preserve_purgeable &&
12455 src_object->purgable != VM_PURGABLE_DENY) {
12456 VM_OBJECT_SET_TRUE_SHARE(new_object, TRUE);
12457
12458 /* start as non-volatile with no owner... */
12459 VM_OBJECT_SET_PURGABLE(new_object, VM_PURGABLE_NONVOLATILE);
12460 vm_purgeable_nonvolatile_enqueue(new_object, NULL);
12461 /* ... and move to src_object's purgeable state */
12462 if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
12463 int state;
12464 state = src_object->purgable;
12465 vm_object_purgable_control(
12466 new_object,
12467 VM_PURGABLE_SET_STATE_FROM_KERNEL,
12468 &state);
12469 }
12470 /* no pmap accounting for purgeable objects */
12471 new_entry->use_pmap = FALSE;
12472 }
12473
12474 vm_object_unlock(new_object);
12475 new_object = VM_OBJECT_NULL;
12476 }
12477
12478 if (result != KERN_SUCCESS &&
12479 result != KERN_MEMORY_RESTART_COPY) {
12480 vm_map_lock(src_map);
12481 RETURN(result);
12482 }
12483
12484 /*
12485 * Throw away the extra reference
12486 */
12487
12488 vm_object_deallocate(src_object);
12489
12490 /*
12491 * Verify that the map has not substantially
12492 * changed while the copy was being made.
12493 */
12494
12495 vm_map_lock(src_map);
12496
12497 if ((version.main_timestamp + 1) == src_map->timestamp) {
12498 /* src_map hasn't changed: src_entry is still valid */
12499 src_entry = saved_src_entry;
12500 goto VerificationSuccessful;
12501 }
12502
12503 /*
12504 * Simple version comparison failed.
12505 *
12506 * Retry the lookup and verify that the
12507 * same object/offset are still present.
12508 *
12509 * [Note: a memory manager that colludes with
12510 * the calling task can detect that we have
12511 * cheated. While the map was unlocked, the
12512 * mapping could have been changed and restored.]
12513 */
12514
12515 if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
12516 if (result != KERN_MEMORY_RESTART_COPY) {
12517 vm_object_deallocate(VME_OBJECT(new_entry));
12518 VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
12519 /* reset accounting state */
12520 new_entry->iokit_acct = FALSE;
12521 new_entry->use_pmap = TRUE;
12522 }
12523 RETURN(KERN_INVALID_ADDRESS);
12524 }
12525
12526 src_entry = tmp_entry;
12527 vm_map_clip_start(src_map, src_entry, src_start);
12528
12529 if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
12530 !use_maxprot) ||
12531 ((src_entry->max_protection & VM_PROT_READ) == 0)) {
12532 goto VerificationFailed;
12533 }
12534
12535 if (src_entry->vme_end < new_entry->vme_end) {
12536 /*
12537 * This entry might have been shortened
12538 * (vm_map_clip_end) or been replaced with
12539 * an entry that ends closer to "src_start"
12540 * than before.
12541 * Adjust "new_entry" accordingly; copying
12542 * less memory would be correct but we also
12543 * redo the copy (see below) if the new entry
12544 * no longer points at the same object/offset.
12545 */
12546 assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
12547 VM_MAP_COPY_PAGE_MASK(copy)));
12548 new_entry->vme_end = src_entry->vme_end;
12549 src_size = new_entry->vme_end - src_start;
12550 } else if (src_entry->vme_end > new_entry->vme_end) {
12551 /*
12552 * This entry might have been extended
12553 * (vm_map_entry_simplify() or coalesce)
12554 * or been replaced with an entry that ends farther
12555 * from "src_start" than before.
12556 *
12557 * We've called vm_object_copy_*() only on
12558 * the previous <start:end> range, so we can't
12559 * just extend new_entry. We have to re-do
12560 * the copy based on the new entry as if it was
12561 * pointing at a different object/offset (see
12562 * "Verification failed" below).
12563 */
12564 }
12565
12566 if ((VME_OBJECT(src_entry) != src_object) ||
12567 (VME_OFFSET(src_entry) != src_offset) ||
12568 (src_entry->vme_end > new_entry->vme_end)) {
12569 /*
12570 * Verification failed.
12571 *
12572 * Start over with this top-level entry.
12573 */
12574
12575 VerificationFailed: ;
12576
12577 vm_object_deallocate(VME_OBJECT(new_entry));
12578 tmp_entry = src_entry;
12579 continue;
12580 }
12581
12582 /*
12583 * Verification succeeded.
12584 */
12585
12586 VerificationSuccessful:;
12587
12588 if (result == KERN_MEMORY_RESTART_COPY) {
12589 goto RestartCopy;
12590 }
12591
12592 /*
12593 * Copy succeeded.
12594 */
12595
12596 CopySuccessful: ;
12597
12598 /*
12599 * Link in the new copy entry.
12600 */
12601
12602 vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
12603 new_entry);
12604
12605 /*
12606 * Determine whether the entire region
12607 * has been copied.
12608 */
12609 src_base = src_start;
12610 src_start = new_entry->vme_end;
12611 new_entry = VM_MAP_ENTRY_NULL;
12612 while ((src_start >= src_end) && (src_end != 0)) {
12613 submap_map_t *ptr;
12614
12615 if (src_map == base_map) {
12616 /* back to the top */
12617 break;
12618 }
12619
12620 ptr = parent_maps;
12621 assert(ptr != NULL);
12622 parent_maps = parent_maps->next;
12623
12624 /* fix up the damage we did in that submap */
12625 vm_map_simplify_range(src_map,
12626 src_base,
12627 src_end);
12628
12629 vm_map_unlock(src_map);
12630 vm_map_deallocate(src_map);
12631 vm_map_lock(ptr->parent_map);
12632 src_map = ptr->parent_map;
12633 src_base = ptr->base_start;
12634 src_start = ptr->base_start + ptr->base_len;
12635 src_end = ptr->base_end;
12636 if (!vm_map_lookup_entry(src_map,
12637 src_start,
12638 &tmp_entry) &&
12639 (src_end > src_start)) {
12640 RETURN(KERN_INVALID_ADDRESS);
12641 }
12642 kfree_type(submap_map_t, ptr);
12643 if (parent_maps == NULL) {
12644 map_share = FALSE;
12645 }
12646 src_entry = tmp_entry->vme_prev;
12647 }
12648
12649 if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
12650 (src_start >= src_addr + len) &&
12651 (src_addr + len != 0)) {
12652 /*
12653 * Stop copying now, even though we haven't reached
12654 * "src_end". We'll adjust the end of the last copy
12655 * entry at the end, if needed.
12656 *
12657 			 * If src_map's alignment is different from the
12658 * system's page-alignment, there could be
12659 * extra non-map-aligned map entries between
12660 * the original (non-rounded) "src_addr + len"
12661 * and the rounded "src_end".
12662 * We do not want to copy those map entries since
12663 * they're not part of the copied range.
12664 */
12665 break;
12666 }
12667
12668 if ((src_start >= src_end) && (src_end != 0)) {
12669 break;
12670 }
12671
12672 /*
12673 * Verify that there are no gaps in the region
12674 */
12675
12676 tmp_entry = src_entry->vme_next;
12677 if ((tmp_entry->vme_start != src_start) ||
12678 (tmp_entry == vm_map_to_entry(src_map))) {
12679 RETURN(KERN_INVALID_ADDRESS);
12680 }
12681 }
12682
12683 /*
12684 * If the source should be destroyed, do it now, since the
12685 * copy was successful.
12686 */
12687 if (src_destroy) {
12688 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
12689
12690 if (src_map == kernel_map) {
12691 remove_flags |= VM_MAP_REMOVE_KUNWIRE;
12692 }
12693 (void)vm_map_remove_and_unlock(src_map,
12694 vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
12695 src_end,
12696 remove_flags,
12697 KMEM_GUARD_NONE);
12698 } else {
12699 /* fix up the damage we did in the base map */
12700 vm_map_simplify_range(
12701 src_map,
12702 vm_map_trunc_page(src_addr,
12703 VM_MAP_PAGE_MASK(src_map)),
12704 vm_map_round_page(src_end,
12705 VM_MAP_PAGE_MASK(src_map)));
12706 vm_map_unlock(src_map);
12707 }
12708
12709 tmp_entry = VM_MAP_ENTRY_NULL;
12710
12711 if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
12712 VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12713 vm_map_offset_t original_start, original_offset, original_end;
12714
12715 assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12716
12717 /* adjust alignment of first copy_entry's "vme_start" */
12718 tmp_entry = vm_map_copy_first_entry(copy);
12719 if (tmp_entry != vm_map_copy_to_entry(copy)) {
12720 vm_map_offset_t adjustment;
12721
12722 original_start = tmp_entry->vme_start;
12723 original_offset = VME_OFFSET(tmp_entry);
12724
12725 /* map-align the start of the first copy entry... */
12726 adjustment = (tmp_entry->vme_start -
12727 vm_map_trunc_page(
12728 tmp_entry->vme_start,
12729 VM_MAP_PAGE_MASK(src_map)));
12730 tmp_entry->vme_start -= adjustment;
12731 VME_OFFSET_SET(tmp_entry,
12732 VME_OFFSET(tmp_entry) - adjustment);
12733 copy_addr -= adjustment;
12734 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12735 /* ... adjust for mis-aligned start of copy range */
12736 adjustment =
12737 (vm_map_trunc_page(copy->offset,
12738 PAGE_MASK) -
12739 vm_map_trunc_page(copy->offset,
12740 VM_MAP_PAGE_MASK(src_map)));
12741 if (adjustment) {
12742 assert(page_aligned(adjustment));
12743 assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12744 tmp_entry->vme_start += adjustment;
12745 VME_OFFSET_SET(tmp_entry,
12746 (VME_OFFSET(tmp_entry) +
12747 adjustment));
12748 copy_addr += adjustment;
12749 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12750 }
12751
12752 /*
12753 * Assert that the adjustments haven't exposed
12754 * more than was originally copied...
12755 */
12756 assert(tmp_entry->vme_start >= original_start);
12757 assert(VME_OFFSET(tmp_entry) >= original_offset);
12758 /*
12759 			 * ... and that it did not adjust outside of
12760 			 * a single 16K page.
12761 */
12762 assert(vm_map_trunc_page(tmp_entry->vme_start,
12763 VM_MAP_PAGE_MASK(src_map)) ==
12764 vm_map_trunc_page(original_start,
12765 VM_MAP_PAGE_MASK(src_map)));
12766 }
12767
12768 /* adjust alignment of last copy_entry's "vme_end" */
12769 tmp_entry = vm_map_copy_last_entry(copy);
12770 if (tmp_entry != vm_map_copy_to_entry(copy)) {
12771 vm_map_offset_t adjustment;
12772
12773 original_end = tmp_entry->vme_end;
12774
12775 /* map-align the end of the last copy entry... */
12776 tmp_entry->vme_end =
12777 vm_map_round_page(tmp_entry->vme_end,
12778 VM_MAP_PAGE_MASK(src_map));
12779 /* ... adjust for mis-aligned end of copy range */
12780 adjustment =
12781 (vm_map_round_page((copy->offset +
12782 copy->size),
12783 VM_MAP_PAGE_MASK(src_map)) -
12784 vm_map_round_page((copy->offset +
12785 copy->size),
12786 PAGE_MASK));
12787 if (adjustment) {
12788 assert(page_aligned(adjustment));
12789 assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12790 tmp_entry->vme_end -= adjustment;
12791 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12792 }
12793
12794 /*
12795 * Assert that the adjustments haven't exposed
12796 * more than was originally copied...
12797 */
12798 assert(tmp_entry->vme_end <= original_end);
12799 /*
12800 			 * ... and that it did not adjust outside of
12801 			 * a single 16K page.
12802 */
12803 assert(vm_map_round_page(tmp_entry->vme_end,
12804 VM_MAP_PAGE_MASK(src_map)) ==
12805 vm_map_round_page(original_end,
12806 VM_MAP_PAGE_MASK(src_map)));
12807 }
12808 }
12809
12810 /* Fix-up start and end points in copy. This is necessary */
12811 /* when the various entries in the copy object were picked */
12812 /* up from different sub-maps */
12813
12814 tmp_entry = vm_map_copy_first_entry(copy);
12815 copy_size = 0; /* compute actual size */
12816 while (tmp_entry != vm_map_copy_to_entry(copy)) {
12817 assert(VM_MAP_PAGE_ALIGNED(
12818 copy_addr + (tmp_entry->vme_end -
12819 tmp_entry->vme_start),
12820 MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12821 assert(VM_MAP_PAGE_ALIGNED(
12822 copy_addr,
12823 MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12824
12825 /*
12826 * The copy_entries will be injected directly into the
12827 * destination map and might not be "map aligned" there...
12828 */
12829 tmp_entry->map_aligned = FALSE;
12830
12831 tmp_entry->vme_end = copy_addr +
12832 (tmp_entry->vme_end - tmp_entry->vme_start);
12833 tmp_entry->vme_start = copy_addr;
12834 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12835 copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12836 copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12837 tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12838 }
12839
12840 if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12841 copy_size < copy->size) {
12842 /*
12843 * The actual size of the VM map copy is smaller than what
12844 * was requested by the caller. This must be because some
12845 * PAGE_SIZE-sized pages are missing at the end of the last
12846 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12847 * The caller might not have been aware of those missing
12848 * pages and might not want to be aware of it, which is
12849 * fine as long as they don't try to access (and crash on)
12850 * those missing pages.
12851 * Let's adjust the size of the "copy", to avoid failing
12852 * in vm_map_copyout() or vm_map_copy_overwrite().
12853 */
12854 assert(vm_map_round_page(copy_size,
12855 VM_MAP_PAGE_MASK(src_map)) ==
12856 vm_map_round_page(copy->size,
12857 VM_MAP_PAGE_MASK(src_map)));
12858 copy->size = copy_size;
12859 }
12860
12861 *copy_result = copy;
12862 return KERN_SUCCESS;
12863
12864 #undef RETURN
12865 }
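/*
 * Editorial sketch: forcing the entry-list path even for small regions.
 * Passing VM_MAP_COPYIN_ENTRY_LIST defeats the msg_ool_size_small
 * kernel-buffer shortcut above, which matters when the caller needs a
 * real entry-list copy (e.g. to remap entries rather than copy bytes):
 *
 *	kr = vm_map_copyin_internal(src_map, src_addr, len,
 *	    VM_MAP_COPYIN_ENTRY_LIST, &copy);
 *	// on success (len != 0), copy->type == VM_MAP_COPY_ENTRY_LIST
 */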
12866
12867 kern_return_t
12868 vm_map_copy_extract(
12869 vm_map_t src_map,
12870 vm_map_address_t src_addr,
12871 vm_map_size_t len,
12872 boolean_t do_copy,
12873 vm_map_copy_t *copy_result, /* OUT */
12874 vm_prot_t *cur_prot, /* IN/OUT */
12875 vm_prot_t *max_prot, /* IN/OUT */
12876 vm_inherit_t inheritance,
12877 vm_map_kernel_flags_t vmk_flags)
12878 {
12879 vm_map_copy_t copy;
12880 kern_return_t kr;
12881 vm_prot_t required_cur_prot, required_max_prot;
12882
12883 /*
12884 * Check for copies of zero bytes.
12885 */
12886
12887 if (len == 0) {
12888 *copy_result = VM_MAP_COPY_NULL;
12889 return KERN_SUCCESS;
12890 }
12891
12892 /*
12893 * Check that the end address doesn't overflow
12894 */
12895 if (src_addr + len < src_addr) {
12896 return KERN_INVALID_ADDRESS;
12897 }
12898 if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
12899 return KERN_INVALID_ADDRESS;
12900 }
12901
12902 if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12903 DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12904 }
12905
12906 required_cur_prot = *cur_prot;
12907 required_max_prot = *max_prot;
12908
12909 /*
12910 * Allocate a header element for the list.
12911 *
12912 * Use the start and end in the header to
12913 * remember the endpoints prior to rounding.
12914 */
12915
12916 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12917 copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
12918 copy->offset = 0;
12919 copy->size = len;
12920
12921 kr = vm_map_remap_extract(src_map,
12922 src_addr,
12923 len,
12924 do_copy, /* copy */
12925 copy,
12926 cur_prot, /* IN/OUT */
12927 max_prot, /* IN/OUT */
12928 inheritance,
12929 vmk_flags);
12930 if (kr != KERN_SUCCESS) {
12931 vm_map_copy_discard(copy);
12932 return kr;
12933 }
12934 if (required_cur_prot != VM_PROT_NONE) {
12935 assert((*cur_prot & required_cur_prot) == required_cur_prot);
12936 assert((*max_prot & required_max_prot) == required_max_prot);
12937 }
12938
12939 *copy_result = copy;
12940 return KERN_SUCCESS;
12941 }
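/*
 * Usage sketch (editorial, hypothetical locals): cur_prot/max_prot are
 * IN/OUT. On input they name the protections the extraction must find;
 * on output they report the actual protections of the extracted range.
 * Passing VM_PROT_NONE on input skips the requirement check:
 *
 *	vm_prot_t cur = VM_PROT_READ;	// require at least read access
 *	vm_prot_t max = VM_PROT_READ;
 *
 *	kr = vm_map_copy_extract(src_map, src_addr, len, TRUE, &copy,
 *	    &cur, &max, VM_INHERIT_DEFAULT, vmk_flags);
 *	// on success, cur and max each include VM_PROT_READ
 */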
12942
12943 static void
12944 vm_map_fork_share(
12945 vm_map_t old_map,
12946 vm_map_entry_t old_entry,
12947 vm_map_t new_map)
12948 {
12949 vm_object_t object;
12950 vm_map_entry_t new_entry;
12951
12952 /*
12953 * New sharing code. New map entry
12954 * references original object. Internal
12955 * objects use asynchronous copy algorithm for
12956 * future copies. First make sure we have
12957 * the right object. If we need a shadow,
12958 * or someone else already has one, then
12959 * make a new shadow and share it.
12960 */
12961
12962 if (!old_entry->is_sub_map) {
12963 object = VME_OBJECT(old_entry);
12964 }
12965
12966 if (old_entry->is_sub_map) {
12967 assert(old_entry->wired_count == 0);
12968 #ifndef NO_NESTED_PMAP
12969 #if !PMAP_FORK_NEST
12970 if (old_entry->use_pmap) {
12971 kern_return_t result;
12972
12973 result = pmap_nest(new_map->pmap,
12974 (VME_SUBMAP(old_entry))->pmap,
12975 (addr64_t)old_entry->vme_start,
12976 (uint64_t)(old_entry->vme_end - old_entry->vme_start));
12977 if (result) {
12978 panic("vm_map_fork_share: pmap_nest failed!");
12979 }
12980 }
12981 #endif /* !PMAP_FORK_NEST */
12982 #endif /* NO_NESTED_PMAP */
12983 } else if (object == VM_OBJECT_NULL) {
12984 object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
12985 old_entry->vme_start));
12986 VME_OFFSET_SET(old_entry, 0);
12987 VME_OBJECT_SET(old_entry, object, false, 0);
12988 old_entry->use_pmap = TRUE;
12989 // assert(!old_entry->needs_copy);
12990 } else if (object->copy_strategy !=
12991 MEMORY_OBJECT_COPY_SYMMETRIC) {
12992 /*
12993 * We are already using an asymmetric
12994 * copy, and therefore we already have
12995 * the right object.
12996 */
12997
12998 assert(!old_entry->needs_copy);
12999 } else if (old_entry->needs_copy || /* case 1 */
13000 object->shadowed || /* case 2 */
13001 (!object->true_share && /* case 3 */
13002 !old_entry->is_shared &&
13003 (object->vo_size >
13004 (vm_map_size_t)(old_entry->vme_end -
13005 old_entry->vme_start)))) {
13006 bool is_writable;
13007
13008 /*
13009 * We need to create a shadow.
13010 * There are three cases here.
13011 * In the first case, we need to
13012 * complete a deferred symmetrical
13013 * copy that we participated in.
13014 * In the second and third cases,
13015 * we need to create the shadow so
13016 * that changes that we make to the
13017 * object do not interfere with
13018 * any symmetrical copies which
13019 		 * have occurred (case 2) or which
13020 * might occur (case 3).
13021 *
13022 * The first case is when we had
13023 * deferred shadow object creation
13024 * via the entry->needs_copy mechanism.
13025 * This mechanism only works when
13026 * only one entry points to the source
13027 * object, and we are about to create
13028 * a second entry pointing to the
13029 * same object. The problem is that
13030 * there is no way of mapping from
13031 * an object to the entries pointing
13032 * to it. (Deferred shadow creation
13033 		 * works with one entry because it occurs
13034 * at fault time, and we walk from the
13035 * entry to the object when handling
13036 * the fault.)
13037 *
13038 * The second case is when the object
13039 * to be shared has already been copied
13040 * with a symmetric copy, but we point
13041 * directly to the object without
13042 * needs_copy set in our entry. (This
13043 * can happen because different ranges
13044 * of an object can be pointed to by
13045 * different entries. In particular,
13046 * a single entry pointing to an object
13047 * can be split by a call to vm_inherit,
13048 * which, combined with task_create, can
13049 * result in the different entries
13050 * having different needs_copy values.)
13051 * The shadowed flag in the object allows
13052 * us to detect this case. The problem
13053 * with this case is that if this object
13054 * has or will have shadows, then we
13055 * must not perform an asymmetric copy
13056 * of this object, since such a copy
13057 * allows the object to be changed, which
13058 * will break the previous symmetrical
13059 * copies (which rely upon the object
13060 * not changing). In a sense, the shadowed
13061 * flag says "don't change this object".
13062 * We fix this by creating a shadow
13063 * object for this object, and sharing
13064 * that. This works because we are free
13065 * to change the shadow object (and thus
13066 * to use an asymmetric copy strategy);
13067 * this is also semantically correct,
13068 * since this object is temporary, and
13069 * therefore a copy of the object is
13070 * as good as the object itself. (This
13071 * is not true for permanent objects,
13072 * since the pager needs to see changes,
13073 * which won't happen if the changes
13074 * are made to a copy.)
13075 *
13076 * The third case is when the object
13077 * to be shared has parts sticking
13078 * outside of the entry we're working
13079 * with, and thus may in the future
13080 * be subject to a symmetrical copy.
13081 * (This is a preemptive version of
13082 * case 2.)
13083 */
13084 VME_OBJECT_SHADOW(old_entry,
13085 (vm_map_size_t) (old_entry->vme_end -
13086 old_entry->vme_start),
13087 vm_map_always_shadow(old_map));
13088
13089 /*
13090 * If we're making a shadow for other than
13091 * copy on write reasons, then we have
13092 * to remove write permission.
13093 */
13094
13095 is_writable = false;
13096 if (old_entry->protection & VM_PROT_WRITE) {
13097 is_writable = true;
13098 #if __arm64e__
13099 } else if (old_entry->used_for_tpro) {
13100 is_writable = true;
13101 #endif /* __arm64e__ */
13102 }
13103 if (!old_entry->needs_copy && is_writable) {
13104 vm_prot_t prot;
13105
13106 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13107 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13108 __FUNCTION__, old_map, old_map->pmap,
13109 old_entry,
13110 (uint64_t)old_entry->vme_start,
13111 (uint64_t)old_entry->vme_end,
13112 old_entry->protection);
13113 }
13114
13115 prot = old_entry->protection & ~VM_PROT_WRITE;
13116
13117 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13118 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13119 __FUNCTION__, old_map, old_map->pmap,
13120 old_entry,
13121 (uint64_t)old_entry->vme_start,
13122 (uint64_t)old_entry->vme_end,
13123 prot);
13124 }
13125
13126 if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
13127 prot |= VM_PROT_EXECUTE;
13128 }
13129
13130
13131 if (old_map->mapped_in_other_pmaps) {
13132 vm_object_pmap_protect(
13133 VME_OBJECT(old_entry),
13134 VME_OFFSET(old_entry),
13135 (old_entry->vme_end -
13136 old_entry->vme_start),
13137 PMAP_NULL,
13138 PAGE_SIZE,
13139 old_entry->vme_start,
13140 prot);
13141 } else {
13142 pmap_protect(old_map->pmap,
13143 old_entry->vme_start,
13144 old_entry->vme_end,
13145 prot);
13146 }
13147 }
13148
13149 old_entry->needs_copy = FALSE;
13150 object = VME_OBJECT(old_entry);
13151 }
13152
13153
13154 /*
13155 * If object was using a symmetric copy strategy,
13156 * change its copy strategy to the default
13157 * asymmetric copy strategy, which is copy_delay
13158 * in the non-norma case and copy_call in the
13159 * norma case. Bump the reference count for the
13160 * new entry.
13161 */
13162
13163 if (old_entry->is_sub_map) {
13164 vm_map_reference(VME_SUBMAP(old_entry));
13165 } else {
13166 vm_object_lock(object);
13167 vm_object_reference_locked(object);
13168 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
13169 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
13170 }
13171 vm_object_unlock(object);
13172 }
13173
13174 /*
13175 * Clone the entry, using object ref from above.
13176 * Mark both entries as shared.
13177 */
13178
13179 new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
13180 vm_map_entry_copy(old_map, new_entry, old_entry);
13181 old_entry->is_shared = TRUE;
13182 new_entry->is_shared = TRUE;
13183
13184 /*
13185 * We're dealing with a shared mapping, so the resulting mapping
13186 * should inherit some of the original mapping's accounting settings.
13187 * "iokit_acct" should have been cleared in vm_map_entry_copy().
13188 * "use_pmap" should stay the same as before (if it hasn't been reset
13189 * to TRUE when we cleared "iokit_acct").
13190 */
13191 assert(!new_entry->iokit_acct);
13192
13193 /*
13194	 * If the old entry's inheritance is VM_INHERIT_NONE,
13195	 * the new entry is for a corpse fork: remove
13196	 * write permission from the new entry.
13197 */
13198 if (old_entry->inheritance == VM_INHERIT_NONE) {
13199 new_entry->protection &= ~VM_PROT_WRITE;
13200 new_entry->max_protection &= ~VM_PROT_WRITE;
13201 }
13202
13203 /*
13204 * Insert the entry into the new map -- we
13205 * know we're inserting at the end of the new
13206 * map.
13207 */
13208
13209 vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
13210 VM_MAP_KERNEL_FLAGS_NONE);
13211
13212 /*
13213 * Update the physical map
13214 */
13215
13216 if (old_entry->is_sub_map) {
13217 /* Bill Angell pmap support goes here */
13218 } else {
13219 pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
13220 old_entry->vme_end - old_entry->vme_start,
13221 old_entry->vme_start);
13222 }
13223 }
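
/*
 * Illustrative sketch (not compiled in): the decision tree that
 * vm_map_fork_share() applies, condensed. The helper name
 * "fork_share_cases" is hypothetical; the real logic is inline above.
 */
#if 0
static void
fork_share_cases(vm_map_t old_map, vm_map_entry_t old_entry)
{
	if (old_entry->is_sub_map) {
		/* submaps are shared by taking a map reference */
		vm_map_reference(VME_SUBMAP(old_entry));
	} else if (VME_OBJECT(old_entry) == VM_OBJECT_NULL) {
		/* case 1: no object yet -- allocate one big enough
		 * to fill the whole nested pmap chunk */
	} else if (VME_OBJECT(old_entry)->copy_strategy ==
	    MEMORY_OBJECT_COPY_SYMMETRIC) {
		/* cases 2 and 3: shadow the object so the shared copy
		 * can switch to an asymmetric (copy_delay) strategy,
		 * stripping write permission if the shadow wasn't made
		 * for copy-on-write reasons */
		VME_OBJECT_SHADOW(old_entry,
		    (vm_map_size_t)(old_entry->vme_end -
		    old_entry->vme_start),
		    vm_map_always_shadow(old_map));
	}
}
#endif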
13224
13225 static boolean_t
13226 vm_map_fork_copy(
13227 vm_map_t old_map,
13228 vm_map_entry_t *old_entry_p,
13229 vm_map_t new_map,
13230 int vm_map_copyin_flags)
13231 {
13232 vm_map_entry_t old_entry = *old_entry_p;
13233 vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
13234 vm_map_offset_t start = old_entry->vme_start;
13235 vm_map_copy_t copy;
13236 vm_map_entry_t last = vm_map_last_entry(new_map);
13237
13238 vm_map_unlock(old_map);
13239 /*
13240 * Use maxprot version of copyin because we
13241 * care about whether this memory can ever
13242 * be accessed, not just whether it's accessible
13243 * right now.
13244 */
13245 vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
13246 if (vm_map_copyin_internal(old_map, start, entry_size,
13247	    vm_map_copyin_flags, &copy)
13248 != KERN_SUCCESS) {
13249 /*
13250 * The map might have changed while it
13251 * was unlocked, check it again. Skip
13252 * any blank space or permanently
13253 * unreadable region.
13254 */
13255 vm_map_lock(old_map);
13256 if (!vm_map_lookup_entry(old_map, start, &last) ||
13257 (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
13258 last = last->vme_next;
13259 }
13260 *old_entry_p = last;
13261
13262 /*
13263 * XXX For some error returns, want to
13264 * XXX skip to the next element. Note
13265 * that INVALID_ADDRESS and
13266 * PROTECTION_FAILURE are handled above.
13267 */
13268
13269 return FALSE;
13270 }
13271
13272 /*
13273 * Assert that the vm_map_copy is coming from the right
13274 * zone and hasn't been forged
13275 */
13276 vm_map_copy_require(copy);
13277
13278 /*
13279 * Insert the copy into the new map
13280 */
13281 vm_map_copy_insert(new_map, last, copy);
13282
13283 /*
13284 * Pick up the traversal at the end of
13285 * the copied region.
13286 */
13287
13288 vm_map_lock(old_map);
13289 start += entry_size;
13290 if (!vm_map_lookup_entry(old_map, start, &last)) {
13291 last = last->vme_next;
13292 } else {
13293 if (last->vme_start == start) {
13294 /*
13295 * No need to clip here and we don't
13296 * want to cause any unnecessary
13297 * unnesting...
13298 */
13299 } else {
13300 vm_map_clip_start(old_map, last, start);
13301 }
13302 }
13303 *old_entry_p = last;
13304
13305 return TRUE;
13306 }
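
/*
 * A hedged sketch of the traversal contract above: vm_map_fork_copy()
 * re-locks "old_map" on every path and leaves *old_entry_p pointing at
 * the entry where the caller's walk should resume, so the caller simply
 * continues its loop whether or not the copy succeeded.
 */
#if 0
vm_map_entry_t cursor = vm_map_first_entry(old_map);
while (cursor != vm_map_to_entry(old_map)) {
	if (vm_map_fork_copy(old_map, &cursor, new_map,
	    VM_MAP_COPYIN_FORK)) {
		/* copied: cursor now follows the copied region */
	} else {
		/* failed: cursor skipped past the unreadable region */
	}
}
#endif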
13307
13308 #if PMAP_FORK_NEST
13309 #define PMAP_FORK_NEST_DEBUG 0
13310 static inline void
13311 vm_map_fork_unnest(
13312 pmap_t new_pmap,
13313 vm_map_offset_t pre_nested_start,
13314 vm_map_offset_t pre_nested_end,
13315 vm_map_offset_t start,
13316 vm_map_offset_t end)
13317 {
13318 kern_return_t kr;
13319 vm_map_offset_t nesting_mask, start_unnest, end_unnest;
13320
13321 assertf(pre_nested_start <= pre_nested_end,
13322 "pre_nested start 0x%llx end 0x%llx",
13323 (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13324 assertf(start <= end,
13325 "start 0x%llx end 0x%llx",
13326 (uint64_t) start, (uint64_t)end);
13327
13328 if (pre_nested_start == pre_nested_end) {
13329 /* nothing was pre-nested: done */
13330 return;
13331 }
13332 if (end <= pre_nested_start) {
13333 /* fully before pre-nested range: done */
13334 return;
13335 }
13336 if (start >= pre_nested_end) {
13337 /* fully after pre-nested range: done */
13338 return;
13339 }
13340 /* ignore parts of range outside of pre_nested range */
13341 if (start < pre_nested_start) {
13342 start = pre_nested_start;
13343 }
13344 if (end > pre_nested_end) {
13345 end = pre_nested_end;
13346 }
13347 nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
13348 start_unnest = start & ~nesting_mask;
13349 end_unnest = (end + nesting_mask) & ~nesting_mask;
13350 kr = pmap_unnest(new_pmap,
13351 (addr64_t)start_unnest,
13352 (uint64_t)(end_unnest - start_unnest));
13353 #if PMAP_FORK_NEST_DEBUG
13354 printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
13355 #endif /* PMAP_FORK_NEST_DEBUG */
13356 assertf(kr == KERN_SUCCESS,
13357 "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
13358 (uint64_t)start, (uint64_t)end, new_pmap,
13359 (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
13360 kr);
13361 }
13362 #endif /* PMAP_FORK_NEST */
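
/*
 * Worked example of the granule rounding in vm_map_fork_unnest(),
 * assuming a hypothetical 32MB nesting granule (the actual size comes
 * from pmap_shared_region_size_min()):
 */
#if 0
vm_map_offset_t mask = 0x2000000 - 1;                   /* 32MB - 1 */
vm_map_offset_t start_unnest = 0x181200000ULL & ~mask;  /* 0x180000000 */
vm_map_offset_t end_unnest =
    (0x181300000ULL + mask) & ~mask;                    /* 0x182000000 */
/* the [start, end) range is widened outward to whole granules,
 * because pmap_unnest() operates on nesting-granule boundaries */
#endif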
13363
13364 void
13365 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13366 {
13367 new_map->size_limit = old_map->size_limit;
13368 new_map->data_limit = old_map->data_limit;
13369 new_map->user_wire_limit = old_map->user_wire_limit;
13370 new_map->reserved_regions = old_map->reserved_regions;
13371 }
13372
13373 /*
13374 * vm_map_fork:
13375 *
13376 * Create and return a new map based on the old
13377 * map, according to the inheritance values on the
13378 * regions in that map and the options.
13379 *
13380 *	The source map must not be locked. (A usage sketch follows this function.)
13381 */
13382 vm_map_t
13383 vm_map_fork(
13384 ledger_t ledger,
13385 vm_map_t old_map,
13386 int options)
13387 {
13388 pmap_t new_pmap;
13389 vm_map_t new_map;
13390 vm_map_entry_t old_entry;
13391 vm_map_size_t new_size = 0, entry_size;
13392 vm_map_entry_t new_entry;
13393 boolean_t src_needs_copy;
13394 boolean_t new_entry_needs_copy;
13395 boolean_t pmap_is64bit;
13396 int vm_map_copyin_flags;
13397 vm_inherit_t old_entry_inheritance;
13398 int map_create_options;
13399 kern_return_t footprint_collect_kr;
13400
13401 if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
13402 VM_MAP_FORK_PRESERVE_PURGEABLE |
13403 VM_MAP_FORK_CORPSE_FOOTPRINT)) {
13404 /* unsupported option */
13405 return VM_MAP_NULL;
13406 }
13407
13408 pmap_is64bit =
13409 #if defined(__i386__) || defined(__x86_64__)
13410 old_map->pmap->pm_task_map != TASK_MAP_32BIT;
13411 #elif defined(__arm64__)
13412 old_map->pmap->is_64bit;
13413 #else
13414 #error Unknown architecture.
13415 #endif
13416
13417 unsigned int pmap_flags = 0;
13418 pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
13419 #if defined(HAS_APPLE_PAC)
13420 pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
13421 #endif
13422 #if CONFIG_ROSETTA
13423 pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
13424 #endif
13425 #if PMAP_CREATE_FORCE_4K_PAGES
13426 if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
13427 PAGE_SIZE != FOURK_PAGE_SIZE) {
13428 pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
13429 }
13430 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
13431 new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
13432 if (new_pmap == NULL) {
13433 return VM_MAP_NULL;
13434 }
13435
13436 vm_map_reference(old_map);
13437 vm_map_lock(old_map);
13438
13439 map_create_options = 0;
13440 if (old_map->hdr.entries_pageable) {
13441 map_create_options |= VM_MAP_CREATE_PAGEABLE;
13442 }
13443 if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13444 map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
13445 footprint_collect_kr = KERN_SUCCESS;
13446 }
13447 new_map = vm_map_create_options(new_pmap,
13448 old_map->min_offset,
13449 old_map->max_offset,
13450 map_create_options);
13451
13452 /* inherit cs_enforcement */
13453 vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
13454
13455 vm_map_lock(new_map);
13456 vm_commit_pagezero_status(new_map);
13457 /* inherit the parent map's page size */
13458 vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));
13459
13460 /* inherit the parent rlimits */
13461 vm_map_inherit_limits(new_map, old_map);
13462
13463 #if CONFIG_MAP_RANGES
13464 /* inherit the parent map's VM ranges */
13465 vm_map_range_fork(new_map, old_map);
13466 #endif
13467
13468 #if CODE_SIGNING_MONITOR
13469 /* Prepare the monitor for the fork */
13470 csm_fork_prepare(old_map->pmap, new_pmap);
13471 #endif
13472
13473 #if PMAP_FORK_NEST
13474 /*
13475 * Pre-nest the shared region's pmap.
13476 */
13477 vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
13478 pmap_fork_nest(old_map->pmap, new_pmap,
13479 &pre_nested_start, &pre_nested_end);
13480 #if PMAP_FORK_NEST_DEBUG
13481 printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13482 #endif /* PMAP_FORK_NEST_DEBUG */
13483 #endif /* PMAP_FORK_NEST */
13484
13485 for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
13486 /*
13487 * Abort any corpse collection if the system is shutting down.
13488 */
13489 if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13490 get_system_inshutdown()) {
13491 #if PMAP_FORK_NEST
13492 new_entry = vm_map_last_entry(new_map);
13493 if (new_entry == vm_map_to_entry(new_map)) {
13494 /* unnest all that was pre-nested */
13495 vm_map_fork_unnest(new_pmap,
13496 pre_nested_start, pre_nested_end,
13497 vm_map_min(new_map), vm_map_max(new_map));
13498 } else if (new_entry->vme_end < vm_map_max(new_map)) {
13499 /* unnest hole at the end, if pre-nested */
13500 vm_map_fork_unnest(new_pmap,
13501 pre_nested_start, pre_nested_end,
13502 new_entry->vme_end, vm_map_max(new_map));
13503 }
13504 #endif /* PMAP_FORK_NEST */
13505 vm_map_corpse_footprint_collect_done(new_map);
13506 vm_map_unlock(new_map);
13507 vm_map_unlock(old_map);
13508 vm_map_deallocate(new_map);
13509 vm_map_deallocate(old_map);
13510 printf("Aborting corpse map due to system shutdown\n");
13511 return VM_MAP_NULL;
13512 }
13513
13514 entry_size = old_entry->vme_end - old_entry->vme_start;
13515
13516 #if PMAP_FORK_NEST
13517 /*
13518 * Undo any unnecessary pre-nesting.
13519 */
13520 vm_map_offset_t prev_end;
13521 if (old_entry == vm_map_first_entry(old_map)) {
13522 prev_end = vm_map_min(old_map);
13523 } else {
13524 prev_end = old_entry->vme_prev->vme_end;
13525 }
13526 if (prev_end < old_entry->vme_start) {
13527 /* unnest hole before this entry, if pre-nested */
13528 vm_map_fork_unnest(new_pmap,
13529 pre_nested_start, pre_nested_end,
13530 prev_end, old_entry->vme_start);
13531 }
13532 if (old_entry->is_sub_map && old_entry->use_pmap) {
13533 /* keep this entry nested in the child */
13534 #if PMAP_FORK_NEST_DEBUG
13535 printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
13536 #endif /* PMAP_FORK_NEST_DEBUG */
13537 } else {
13538 /* undo nesting for this entry, if pre-nested */
13539 vm_map_fork_unnest(new_pmap,
13540 pre_nested_start, pre_nested_end,
13541 old_entry->vme_start, old_entry->vme_end);
13542 }
13543 #endif /* PMAP_FORK_NEST */
13544
13545 old_entry_inheritance = old_entry->inheritance;
13546 /*
13547		 * If the caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option,
13548 * share VM_INHERIT_NONE entries that are not backed by a
13549 * device pager.
13550 */
13551 if (old_entry_inheritance == VM_INHERIT_NONE &&
13552 (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
13553 (old_entry->protection & VM_PROT_READ) &&
13554 !(!old_entry->is_sub_map &&
13555 VME_OBJECT(old_entry) != NULL &&
13556 VME_OBJECT(old_entry)->pager != NULL &&
13557 is_device_pager_ops(
13558 VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
13559 old_entry_inheritance = VM_INHERIT_SHARE;
13560 }
13561
13562 if (old_entry_inheritance != VM_INHERIT_NONE &&
13563 (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13564 footprint_collect_kr == KERN_SUCCESS) {
13565 /*
13566 * The corpse won't have old_map->pmap to query
13567 * footprint information, so collect that data now
13568 * and store it in new_map->vmmap_corpse_footprint
13569 * for later autopsy.
13570 */
13571 footprint_collect_kr =
13572 vm_map_corpse_footprint_collect(old_map,
13573 old_entry,
13574 new_map);
13575 }
13576
13577 switch (old_entry_inheritance) {
13578 case VM_INHERIT_NONE:
13579 break;
13580
13581 case VM_INHERIT_SHARE:
13582 vm_map_fork_share(old_map, old_entry, new_map);
13583 new_size += entry_size;
13584 break;
13585
13586 case VM_INHERIT_COPY:
13587
13588 /*
13589 * Inline the copy_quickly case;
13590 * upon failure, fall back on call
13591 * to vm_map_fork_copy.
13592 */
13593
13594 if (old_entry->is_sub_map) {
13595 break;
13596 }
13597 if ((old_entry->wired_count != 0) ||
13598 ((VME_OBJECT(old_entry) != NULL) &&
13599 (VME_OBJECT(old_entry)->true_share))) {
13600 goto slow_vm_map_fork_copy;
13601 }
13602
13603 new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
13604 vm_map_entry_copy(old_map, new_entry, old_entry);
13605 if (old_entry->vme_permanent) {
13606 /* inherit "permanent" on fork() */
13607 new_entry->vme_permanent = TRUE;
13608 }
13609
13610 if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
13611 new_map->jit_entry_exists = TRUE;
13612 }
13613
13614 if (new_entry->is_sub_map) {
13615 /* clear address space specifics */
13616 new_entry->use_pmap = FALSE;
13617 } else {
13618 /*
13619 * We're dealing with a copy-on-write operation,
13620 * so the resulting mapping should not inherit
13621 * the original mapping's accounting settings.
13622 * "iokit_acct" should have been cleared in
13623 * vm_map_entry_copy().
13624 * "use_pmap" should be reset to its default
13625 * (TRUE) so that the new mapping gets
13626 * accounted for in the task's memory footprint.
13627 */
13628 assert(!new_entry->iokit_acct);
13629 new_entry->use_pmap = TRUE;
13630 }
13631
13632 if (!vm_object_copy_quickly(
13633 VME_OBJECT(new_entry),
13634 VME_OFFSET(old_entry),
13635 (old_entry->vme_end -
13636 old_entry->vme_start),
13637 &src_needs_copy,
13638 &new_entry_needs_copy)) {
13639 vm_map_entry_dispose(new_entry);
13640 goto slow_vm_map_fork_copy;
13641 }
13642
13643 /*
13644 * Handle copy-on-write obligations
13645 */
13646
13647 if (src_needs_copy && !old_entry->needs_copy) {
13648 vm_prot_t prot;
13649
13650 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13651 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13652 __FUNCTION__,
13653 old_map, old_map->pmap, old_entry,
13654 (uint64_t)old_entry->vme_start,
13655 (uint64_t)old_entry->vme_end,
13656 old_entry->protection);
13657 }
13658
13659 prot = old_entry->protection & ~VM_PROT_WRITE;
13660
13661 if (override_nx(old_map, VME_ALIAS(old_entry))
13662 && prot) {
13663 prot |= VM_PROT_EXECUTE;
13664 }
13665
13666 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13667 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13668 __FUNCTION__,
13669 old_map, old_map->pmap, old_entry,
13670 (uint64_t)old_entry->vme_start,
13671 (uint64_t)old_entry->vme_end,
13672 prot);
13673 }
13674
13675 vm_object_pmap_protect(
13676 VME_OBJECT(old_entry),
13677 VME_OFFSET(old_entry),
13678 (old_entry->vme_end -
13679 old_entry->vme_start),
13680 ((old_entry->is_shared
13681 || old_map->mapped_in_other_pmaps)
13682 ? PMAP_NULL :
13683 old_map->pmap),
13684 VM_MAP_PAGE_SIZE(old_map),
13685 old_entry->vme_start,
13686 prot);
13687
13688 assert(old_entry->wired_count == 0);
13689 old_entry->needs_copy = TRUE;
13690 }
13691 new_entry->needs_copy = new_entry_needs_copy;
13692
13693 /*
13694 * Insert the entry at the end
13695 * of the map.
13696 */
13697
13698 vm_map_store_entry_link(new_map,
13699 vm_map_last_entry(new_map),
13700 new_entry,
13701 VM_MAP_KERNEL_FLAGS_NONE);
13702 new_size += entry_size;
13703 break;
13704
13705 slow_vm_map_fork_copy:
13706 vm_map_copyin_flags = VM_MAP_COPYIN_FORK;
13707 if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
13708 vm_map_copyin_flags |=
13709 VM_MAP_COPYIN_PRESERVE_PURGEABLE;
13710 }
13711 if (vm_map_fork_copy(old_map,
13712 &old_entry,
13713 new_map,
13714 vm_map_copyin_flags)) {
13715 new_size += entry_size;
13716 }
13717 continue;
13718 }
13719 old_entry = old_entry->vme_next;
13720 }
13721
13722 #if PMAP_FORK_NEST
13723 new_entry = vm_map_last_entry(new_map);
13724 if (new_entry == vm_map_to_entry(new_map)) {
13725 /* unnest all that was pre-nested */
13726 vm_map_fork_unnest(new_pmap,
13727 pre_nested_start, pre_nested_end,
13728 vm_map_min(new_map), vm_map_max(new_map));
13729 } else if (new_entry->vme_end < vm_map_max(new_map)) {
13730 /* unnest hole at the end, if pre-nested */
13731 vm_map_fork_unnest(new_pmap,
13732 pre_nested_start, pre_nested_end,
13733 new_entry->vme_end, vm_map_max(new_map));
13734 }
13735 #endif /* PMAP_FORK_NEST */
13736
13737 #if defined(__arm64__)
13738 pmap_insert_commpage(new_map->pmap);
13739 #endif /* __arm64__ */
13740
13741 new_map->size = new_size;
13742
13743 if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13744 vm_map_corpse_footprint_collect_done(new_map);
13745 }
13746
13747 /* Propagate JIT entitlement for the pmap layer. */
13748 if (pmap_get_jit_entitled(old_map->pmap)) {
13749 /* Tell the pmap that it supports JIT. */
13750 pmap_set_jit_entitled(new_map->pmap);
13751 }
13752
13753 /* Propagate TPRO settings for the pmap layer */
13754 if (pmap_get_tpro(old_map->pmap)) {
13755 /* Tell the pmap that it supports TPRO */
13756 pmap_set_tpro(new_map->pmap);
13757 }
13758
13759
13760 vm_map_unlock(new_map);
13761 vm_map_unlock(old_map);
13762 vm_map_deallocate(old_map);
13763
13764 return new_map;
13765 }
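
/*
 * Usage sketch (hypothetical call site, error handling elided; "ledger"
 * and "task_map" stand in for the caller's task state): forking a map
 * for a corpse. The footprint option makes vm_map_fork() collect
 * per-entry footprint data up front, since the corpse keeps no live
 * pmap to query during a later autopsy.
 */
#if 0
vm_map_t corpse_map = vm_map_fork(ledger, task_map,
    VM_MAP_FORK_CORPSE_FOOTPRINT | VM_MAP_FORK_SHARE_IF_INHERIT_NONE);
if (corpse_map == VM_MAP_NULL) {
	/* unsupported option, pmap creation failure, or shutdown abort */
}
#endif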
13766
13767 /*
13768 * vm_map_exec:
13769 *
13770 * Setup the "new_map" with the proper execution environment according
13771 * to the type of executable (platform, 64bit, chroot environment).
13772 * Map the comm page and shared region, etc...
13773 */
13774 kern_return_t
13775 vm_map_exec(
13776 vm_map_t new_map,
13777 task_t task,
13778 boolean_t is64bit,
13779 void *fsroot,
13780 cpu_type_t cpu,
13781 cpu_subtype_t cpu_subtype,
13782 boolean_t reslide,
13783 boolean_t is_driverkit,
13784 uint32_t rsr_version)
13785 {
13786 SHARED_REGION_TRACE_DEBUG(
13787 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13788 (void *)VM_KERNEL_ADDRPERM(current_task()),
13789 (void *)VM_KERNEL_ADDRPERM(new_map),
13790 (void *)VM_KERNEL_ADDRPERM(task),
13791 (void *)VM_KERNEL_ADDRPERM(fsroot),
13792 cpu,
13793 cpu_subtype));
13794 (void) vm_commpage_enter(new_map, task, is64bit);
13795
13796 (void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13797
13798 SHARED_REGION_TRACE_DEBUG(
13799 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13800 (void *)VM_KERNEL_ADDRPERM(current_task()),
13801 (void *)VM_KERNEL_ADDRPERM(new_map),
13802 (void *)VM_KERNEL_ADDRPERM(task),
13803 (void *)VM_KERNEL_ADDRPERM(fsroot),
13804 cpu,
13805 cpu_subtype));
13806
13807 /*
13808 * Some devices have region(s) of memory that shouldn't get allocated by
13809 * user processes. The following code creates dummy vm_map_entry_t's for each
13810	 * of the regions that need to be reserved to prevent any allocations in
13811 * those regions.
13812 */
13813 kern_return_t kr = KERN_FAILURE;
13814 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
13815 vmk_flags.vmkf_beyond_max = true;
13816
13817 const struct vm_reserved_region *regions = NULL;
13818	size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
13819 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13820
13821 for (size_t i = 0; i < num_regions; ++i) {
13822 vm_map_offset_t address = regions[i].vmrr_addr;
13823
13824 kr = vm_map_enter(
13825 new_map,
13826 &address,
13827 regions[i].vmrr_size,
13828 (vm_map_offset_t)0,
13829 vmk_flags,
13830 VM_OBJECT_NULL,
13831 (vm_object_offset_t)0,
13832 FALSE,
13833 VM_PROT_NONE,
13834 VM_PROT_NONE,
13835 VM_INHERIT_COPY);
13836
13837 if (kr != KERN_SUCCESS) {
13838 panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
13839 }
13840 }
13841
13842 new_map->reserved_regions = (num_regions ? TRUE : FALSE);
13843
13844 return KERN_SUCCESS;
13845 }
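
/*
 * Hypothetical exec-time call, sketching the contract above: the
 * argument values are placeholders for what an exec path might pass
 * for a native arm64 binary.
 */
#if 0
kern_return_t kr = vm_map_exec(new_map, task, TRUE /* is64bit */,
    fsroot, CPU_TYPE_ARM64, CPU_SUBTYPE_ARM64_ALL,
    FALSE /* reslide */, FALSE /* is_driverkit */, 0 /* rsr_version */);
assert(kr == KERN_SUCCESS);
#endif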
13846
13847 uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
13848 uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
13849 uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
13850 uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
13851 uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
13852 uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
13853 uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
13854 uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
13855 uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
13856 uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
13857 uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
13858 uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
13859 uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13860 /*
13861 * vm_map_lookup_and_lock_object:
13862 *
13863 * Finds the VM object, offset, and
13864 * protection for a given virtual address in the
13865 * specified map, assuming a page fault of the
13866 * type specified.
13867 *
13868 * Returns the (object, offset, protection) for
13869 * this address, whether it is wired down, and whether
13870 * this map has the only reference to the data in question.
13871 * In order to later verify this lookup, a "version"
13872 * is returned.
13873 * If contended != NULL, *contended will be set to
13874 * true iff the thread had to spin or block to acquire
13875 * an exclusive lock.
13876 *
13877 * The map MUST be locked by the caller and WILL be
13878 * locked on exit. In order to guarantee the
13879 * existence of the returned object, it is returned
13880 * locked.
13881 *
13882 * If a lookup is requested with "write protection"
13883 * specified, the map may be changed to perform virtual
13884 * copying operations, although the data referenced will
13885 *	remain the same. (A usage sketch follows this function.)
13886 */
13887 kern_return_t
13888 vm_map_lookup_and_lock_object(
13889 vm_map_t *var_map, /* IN/OUT */
13890 vm_map_offset_t vaddr,
13891 vm_prot_t fault_type,
13892 int object_lock_type,
13893 vm_map_version_t *out_version, /* OUT */
13894 vm_object_t *object, /* OUT */
13895 vm_object_offset_t *offset, /* OUT */
13896 vm_prot_t *out_prot, /* OUT */
13897 boolean_t *wired, /* OUT */
13898 vm_object_fault_info_t fault_info, /* OUT */
13899 vm_map_t *real_map, /* OUT */
13900 bool *contended) /* OUT */
13901 {
13902 vm_map_entry_t entry;
13903 vm_map_t map = *var_map;
13904 vm_map_t old_map = *var_map;
13905 vm_map_t cow_sub_map_parent = VM_MAP_NULL;
13906 vm_map_offset_t cow_parent_vaddr = 0;
13907 vm_map_offset_t old_start = 0;
13908 vm_map_offset_t old_end = 0;
13909 vm_prot_t prot;
13910 boolean_t mask_protections;
13911 boolean_t force_copy;
13912 boolean_t no_force_copy_if_executable;
13913 boolean_t submap_needed_copy;
13914 vm_prot_t original_fault_type;
13915 vm_map_size_t fault_page_mask;
13916
13917 /*
13918 * VM_PROT_MASK means that the caller wants us to use "fault_type"
13919 * as a mask against the mapping's actual protections, not as an
13920 * absolute value.
13921 */
13922 mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
13923 force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
13924 no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
13925 fault_type &= VM_PROT_ALL;
13926 original_fault_type = fault_type;
13927 if (contended) {
13928 *contended = false;
13929 }
13930
13931 *real_map = map;
13932
13933 fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
13934 vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
13935
13936 RetryLookup:
13937 fault_type = original_fault_type;
13938
13939 /*
13940 * If the map has an interesting hint, try it before calling
13941 * full blown lookup routine.
13942 */
13943 entry = map->hint;
13944
13945 if ((entry == vm_map_to_entry(map)) ||
13946 (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
13947 vm_map_entry_t tmp_entry;
13948
13949 /*
13950 * Entry was either not a valid hint, or the vaddr
13951 * was not contained in the entry, so do a full lookup.
13952 */
13953 if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
13954 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13955 vm_map_unlock(cow_sub_map_parent);
13956 }
13957 if ((*real_map != map)
13958 && (*real_map != cow_sub_map_parent)) {
13959 vm_map_unlock(*real_map);
13960 }
13961 return KERN_INVALID_ADDRESS;
13962 }
13963
13964 entry = tmp_entry;
13965 }
13966 if (map == old_map) {
13967 old_start = entry->vme_start;
13968 old_end = entry->vme_end;
13969 }
13970
13971 /*
13972 * Handle submaps. Drop lock on upper map, submap is
13973 * returned locked.
13974 */
13975
13976 submap_needed_copy = FALSE;
13977 submap_recurse:
13978 if (entry->is_sub_map) {
13979 vm_map_offset_t local_vaddr;
13980 vm_map_offset_t end_delta;
13981 vm_map_offset_t start_delta;
13982 vm_map_offset_t top_entry_saved_start;
13983 vm_object_offset_t top_entry_saved_offset;
13984 vm_map_entry_t submap_entry, saved_submap_entry;
13985 vm_object_offset_t submap_entry_offset;
13986 vm_object_size_t submap_entry_size;
13987 vm_prot_t subentry_protection;
13988 vm_prot_t subentry_max_protection;
13989 boolean_t subentry_no_copy_on_read;
13990 boolean_t subentry_permanent;
13991 boolean_t subentry_csm_associated;
13992 #if __arm64e__
13993 boolean_t subentry_used_for_tpro;
13994 #endif /* __arm64e__ */
13995 boolean_t mapped_needs_copy = FALSE;
13996 vm_map_version_t version;
13997
13998 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
13999 "map %p (%d) entry %p submap %p (%d)\n",
14000 map, VM_MAP_PAGE_SHIFT(map), entry,
14001 VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
14002
14003 local_vaddr = vaddr;
14004 top_entry_saved_start = entry->vme_start;
14005 top_entry_saved_offset = VME_OFFSET(entry);
14006
14007 if ((entry->use_pmap &&
14008 !((fault_type & VM_PROT_WRITE) ||
14009 force_copy))) {
14010 /* if real_map equals map we unlock below */
14011 if ((*real_map != map) &&
14012 (*real_map != cow_sub_map_parent)) {
14013 vm_map_unlock(*real_map);
14014 }
14015 *real_map = VME_SUBMAP(entry);
14016 }
14017
14018 if (entry->needs_copy &&
14019 ((fault_type & VM_PROT_WRITE) ||
14020 force_copy)) {
14021 if (!mapped_needs_copy) {
14022 if (vm_map_lock_read_to_write(map)) {
14023 vm_map_lock_read(map);
14024 *real_map = map;
14025 goto RetryLookup;
14026 }
14027 vm_map_lock_read(VME_SUBMAP(entry));
14028 *var_map = VME_SUBMAP(entry);
14029 cow_sub_map_parent = map;
14030 /* reset base to map before cow object */
14031 /* this is the map which will accept */
14032 /* the new cow object */
14033 old_start = entry->vme_start;
14034 old_end = entry->vme_end;
14035 cow_parent_vaddr = vaddr;
14036 mapped_needs_copy = TRUE;
14037 } else {
14038 vm_map_lock_read(VME_SUBMAP(entry));
14039 *var_map = VME_SUBMAP(entry);
14040 if ((cow_sub_map_parent != map) &&
14041 (*real_map != map)) {
14042 vm_map_unlock(map);
14043 }
14044 }
14045 } else {
14046 if (entry->needs_copy) {
14047 submap_needed_copy = TRUE;
14048 }
14049 vm_map_lock_read(VME_SUBMAP(entry));
14050 *var_map = VME_SUBMAP(entry);
14051			/* leave the map locked if it is the target */
14052			/* cow sub_map from above; otherwise, just */
14053			/* follow the maps down to the object; */
14054			/* here we unlock, knowing we are not */
14055			/* revisiting the map. */
14056 if ((*real_map != map) && (map != cow_sub_map_parent)) {
14057 vm_map_unlock_read(map);
14058 }
14059 }
14060
14061 entry = NULL;
14062 map = *var_map;
14063
14064 /* calculate the offset in the submap for vaddr */
14065 local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
14066 assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
14067 "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
14068 (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);
14069
14070 RetrySubMap:
14071 if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
14072 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14073 vm_map_unlock(cow_sub_map_parent);
14074 }
14075 if ((*real_map != map)
14076 && (*real_map != cow_sub_map_parent)) {
14077 vm_map_unlock(*real_map);
14078 }
14079 *real_map = map;
14080 return KERN_INVALID_ADDRESS;
14081 }
14082
14083 /* find the attenuated shadow of the underlying object */
14084 /* on our target map */
14085
14086		/* In plain English: the submap object may extend beyond the */
14087		/* region mapped by the entry, or may only fill a portion */
14088		/* of it. For our purposes, we only care if the object */
14089		/* doesn't fill the region. In that case, the area which will */
14090		/* ultimately be clipped in the top map only needs */
14091		/* to be as big as the portion of the underlying entry */
14092		/* which is mapped. */
14093 start_delta = submap_entry->vme_start > top_entry_saved_offset ?
14094 submap_entry->vme_start - top_entry_saved_offset : 0;
14095
14096 end_delta =
14097 (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
14098 submap_entry->vme_end ?
14099 0 : (top_entry_saved_offset +
14100 (old_end - old_start))
14101 - submap_entry->vme_end;
14102
14103 old_start += start_delta;
14104 old_end -= end_delta;
14105
14106 if (submap_entry->is_sub_map) {
14107 entry = submap_entry;
14108 vaddr = local_vaddr;
14109 goto submap_recurse;
14110 }
14111
14112 if (((fault_type & VM_PROT_WRITE) ||
14113 force_copy)
14114 && cow_sub_map_parent) {
14115 vm_object_t sub_object, copy_object;
14116 vm_object_offset_t copy_offset;
14117 vm_map_offset_t local_start;
14118 vm_map_offset_t local_end;
14119 boolean_t object_copied = FALSE;
14120 vm_object_offset_t object_copied_offset = 0;
14121 boolean_t object_copied_needs_copy = FALSE;
14122 kern_return_t kr = KERN_SUCCESS;
14123
14124 if (vm_map_lock_read_to_write(map)) {
14125 vm_map_lock_read(map);
14126 old_start -= start_delta;
14127 old_end += end_delta;
14128 goto RetrySubMap;
14129 }
14130
14131
14132 sub_object = VME_OBJECT(submap_entry);
14133 if (sub_object == VM_OBJECT_NULL) {
14134 sub_object =
14135 vm_object_allocate(
14136 (vm_map_size_t)
14137 (submap_entry->vme_end -
14138 submap_entry->vme_start));
14139 VME_OBJECT_SET(submap_entry, sub_object, false, 0);
14140 VME_OFFSET_SET(submap_entry, 0);
14141 assert(!submap_entry->is_sub_map);
14142 assert(submap_entry->use_pmap);
14143 }
14144 local_start = local_vaddr -
14145 (cow_parent_vaddr - old_start);
14146 local_end = local_vaddr +
14147 (old_end - cow_parent_vaddr);
14148 vm_map_clip_start(map, submap_entry, local_start);
14149 vm_map_clip_end(map, submap_entry, local_end);
14150 if (submap_entry->is_sub_map) {
14151 /* unnesting was done when clipping */
14152 assert(!submap_entry->use_pmap);
14153 }
14154
14155			/* This is the COW case: let's connect */
14156 /* an entry in our space to the underlying */
14157 /* object in the submap, bypassing the */
14158 /* submap. */
14159 submap_entry_offset = VME_OFFSET(submap_entry);
14160 submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
14161
14162 if ((submap_entry->wired_count != 0 ||
14163 sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
14164 (submap_entry->protection & VM_PROT_EXECUTE) &&
14165 no_force_copy_if_executable) {
14166 // printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
14167 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14168 vm_map_unlock(cow_sub_map_parent);
14169 }
14170 if ((*real_map != map)
14171 && (*real_map != cow_sub_map_parent)) {
14172 vm_map_unlock(*real_map);
14173 }
14174 *real_map = map;
14175 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
14176 vm_map_lock_write_to_read(map);
14177 kr = KERN_PROTECTION_FAILURE;
14178 DTRACE_VM4(submap_no_copy_executable,
14179 vm_map_t, map,
14180 vm_object_offset_t, submap_entry_offset,
14181 vm_object_size_t, submap_entry_size,
14182 int, kr);
14183 return kr;
14184 }
14185
14186 if (submap_entry->wired_count != 0) {
14187 vm_object_reference(sub_object);
14188
14189 assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
14190 "submap_entry %p offset 0x%llx\n",
14191 submap_entry, VME_OFFSET(submap_entry));
14192
14193 DTRACE_VM6(submap_copy_slowly,
14194 vm_map_t, cow_sub_map_parent,
14195 vm_map_offset_t, vaddr,
14196 vm_map_t, map,
14197 vm_object_size_t, submap_entry_size,
14198 int, submap_entry->wired_count,
14199 int, sub_object->copy_strategy);
14200
14201 saved_submap_entry = submap_entry;
14202 version.main_timestamp = map->timestamp;
14203 vm_map_unlock(map); /* Increments timestamp by 1 */
14204 submap_entry = VM_MAP_ENTRY_NULL;
14205
14206 vm_object_lock(sub_object);
14207 kr = vm_object_copy_slowly(sub_object,
14208 submap_entry_offset,
14209 submap_entry_size,
14210 FALSE,
14211				    &copy_object);
14212 object_copied = TRUE;
14213 object_copied_offset = 0;
14214 /* 4k: account for extra offset in physical page */
14215 object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
14216 object_copied_needs_copy = FALSE;
14217 vm_object_deallocate(sub_object);
14218
14219 vm_map_lock(map);
14220
14221 if (kr != KERN_SUCCESS &&
14222 kr != KERN_MEMORY_RESTART_COPY) {
14223 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14224 vm_map_unlock(cow_sub_map_parent);
14225 }
14226 if ((*real_map != map)
14227 && (*real_map != cow_sub_map_parent)) {
14228 vm_map_unlock(*real_map);
14229 }
14230 *real_map = map;
14231 vm_object_deallocate(copy_object);
14232 copy_object = VM_OBJECT_NULL;
14233 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
14234 vm_map_lock_write_to_read(map);
14235 DTRACE_VM4(submap_copy_error_slowly,
14236 vm_object_t, sub_object,
14237 vm_object_offset_t, submap_entry_offset,
14238 vm_object_size_t, submap_entry_size,
14239 int, kr);
14240 vm_map_lookup_and_lock_object_copy_slowly_error++;
14241 return kr;
14242 }
14243
14244 if ((kr == KERN_SUCCESS) &&
14245 (version.main_timestamp + 1) == map->timestamp) {
14246 submap_entry = saved_submap_entry;
14247 } else {
14248 saved_submap_entry = NULL;
14249 old_start -= start_delta;
14250 old_end += end_delta;
14251 vm_object_deallocate(copy_object);
14252 copy_object = VM_OBJECT_NULL;
14253 vm_map_lock_write_to_read(map);
14254 vm_map_lookup_and_lock_object_copy_slowly_restart++;
14255 goto RetrySubMap;
14256 }
14257 vm_map_lookup_and_lock_object_copy_slowly_count++;
14258 vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
14259 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
14260 vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
14261 }
14262 } else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
14263 submap_entry_offset = VME_OFFSET(submap_entry);
14264 copy_object = VM_OBJECT_NULL;
14265 object_copied_offset = submap_entry_offset;
14266 object_copied_needs_copy = FALSE;
14267 DTRACE_VM6(submap_copy_strategically,
14268 vm_map_t, cow_sub_map_parent,
14269 vm_map_offset_t, vaddr,
14270 vm_map_t, map,
14271 vm_object_size_t, submap_entry_size,
14272 int, submap_entry->wired_count,
14273 int, sub_object->copy_strategy);
14274 kr = vm_object_copy_strategically(
14275 sub_object,
14276 submap_entry_offset,
14277 submap_entry->vme_end - submap_entry->vme_start,
14278 false, /* forking */
14279				    &copy_object,
14280 &object_copied_offset,
14281 &object_copied_needs_copy);
14282 if (kr == KERN_MEMORY_RESTART_COPY) {
14283 old_start -= start_delta;
14284 old_end += end_delta;
14285 vm_object_deallocate(copy_object);
14286 copy_object = VM_OBJECT_NULL;
14287 vm_map_lock_write_to_read(map);
14288 vm_map_lookup_and_lock_object_copy_strategically_restart++;
14289 goto RetrySubMap;
14290 }
14291 if (kr != KERN_SUCCESS) {
14292 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14293 vm_map_unlock(cow_sub_map_parent);
14294 }
14295 if ((*real_map != map)
14296 && (*real_map != cow_sub_map_parent)) {
14297 vm_map_unlock(*real_map);
14298 }
14299 *real_map = map;
14300 vm_object_deallocate(copy_object);
14301 copy_object = VM_OBJECT_NULL;
14302 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
14303 vm_map_lock_write_to_read(map);
14304 DTRACE_VM4(submap_copy_error_strategically,
14305 vm_object_t, sub_object,
14306 vm_object_offset_t, submap_entry_offset,
14307 vm_object_size_t, submap_entry_size,
14308 int, kr);
14309 vm_map_lookup_and_lock_object_copy_strategically_error++;
14310 return kr;
14311 }
14312 assert(copy_object != VM_OBJECT_NULL);
14313 assert(copy_object != sub_object);
14314 object_copied = TRUE;
14315 vm_map_lookup_and_lock_object_copy_strategically_count++;
14316 vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
14317 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
14318 vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
14319 }
14320 } else {
14321 /* set up shadow object */
14322 object_copied = FALSE;
14323 copy_object = sub_object;
14324 vm_object_lock(sub_object);
14325 vm_object_reference_locked(sub_object);
14326 VM_OBJECT_SET_SHADOWED(sub_object, TRUE);
14327 vm_object_unlock(sub_object);
14328
14329 assert(submap_entry->wired_count == 0);
14330 submap_entry->needs_copy = TRUE;
14331
14332 prot = submap_entry->protection;
14333 if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14334 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14335 __FUNCTION__,
14336 map, map->pmap, submap_entry,
14337 (uint64_t)submap_entry->vme_start,
14338 (uint64_t)submap_entry->vme_end,
14339 prot);
14340 }
14341 prot = prot & ~VM_PROT_WRITE;
14342 if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14343 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14344 __FUNCTION__,
14345 map, map->pmap, submap_entry,
14346 (uint64_t)submap_entry->vme_start,
14347 (uint64_t)submap_entry->vme_end,
14348 prot);
14349 }
14350
14351 if (override_nx(old_map,
14352 VME_ALIAS(submap_entry))
14353 && prot) {
14354 prot |= VM_PROT_EXECUTE;
14355 }
14356
14357 vm_object_pmap_protect(
14358 sub_object,
14359 VME_OFFSET(submap_entry),
14360 submap_entry->vme_end -
14361 submap_entry->vme_start,
14362 (submap_entry->is_shared
14363 || map->mapped_in_other_pmaps) ?
14364 PMAP_NULL : map->pmap,
14365 VM_MAP_PAGE_SIZE(map),
14366 submap_entry->vme_start,
14367 prot);
14368 vm_map_lookup_and_lock_object_copy_shadow_count++;
14369 vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
14370 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
14371 vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
14372 }
14373 }
14374
14375 /*
14376 * Adjust the fault offset to the submap entry.
14377 */
14378 copy_offset = (local_vaddr -
14379 submap_entry->vme_start +
14380 VME_OFFSET(submap_entry));
14381
14382			/* This works differently from the */
14383			/* normal submap case. We go back */
14384			/* to the parent of the cow map and */
14385			/* clip out the target portion of */
14386			/* the sub_map, substituting the */
14387			/* new copy object. */
14388
14389 subentry_protection = submap_entry->protection;
14390 subentry_max_protection = submap_entry->max_protection;
14391 subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
14392 subentry_permanent = submap_entry->vme_permanent;
14393 subentry_csm_associated = submap_entry->csm_associated;
14394 #if __arm64e__
14395 subentry_used_for_tpro = submap_entry->used_for_tpro;
14396 #endif // __arm64e__
14397 vm_map_unlock(map);
14398 submap_entry = NULL; /* not valid after map unlock */
14399
14400 local_start = old_start;
14401 local_end = old_end;
14402 map = cow_sub_map_parent;
14403 *var_map = cow_sub_map_parent;
14404 vaddr = cow_parent_vaddr;
14405 cow_sub_map_parent = NULL;
14406
14407 if (!vm_map_lookup_entry(map,
14408 vaddr, &entry)) {
14409 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14410 vm_map_unlock(cow_sub_map_parent);
14411 }
14412 if ((*real_map != map)
14413 && (*real_map != cow_sub_map_parent)) {
14414 vm_map_unlock(*real_map);
14415 }
14416 *real_map = map;
14417 vm_object_deallocate(
14418 copy_object);
14419 copy_object = VM_OBJECT_NULL;
14420 vm_map_lock_write_to_read(map);
14421 DTRACE_VM4(submap_lookup_post_unlock,
14422 uint64_t, (uint64_t)entry->vme_start,
14423 uint64_t, (uint64_t)entry->vme_end,
14424 vm_map_offset_t, vaddr,
14425 int, object_copied);
14426 return KERN_INVALID_ADDRESS;
14427 }
14428
14429 /* clip out the portion of space */
14430 /* mapped by the sub map which */
14431 /* corresponds to the underlying */
14432 /* object */
14433
14434 /*
14435 * Clip (and unnest) the smallest nested chunk
14436 * possible around the faulting address...
14437 */
14438 local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14439 local_end = local_start + pmap_shared_region_size_min(map->pmap);
14440 /*
14441 * ... but don't go beyond the "old_start" to "old_end"
14442 * range, to avoid spanning over another VM region
14443 * with a possibly different VM object and/or offset.
14444 */
14445 if (local_start < old_start) {
14446 local_start = old_start;
14447 }
14448 if (local_end > old_end) {
14449 local_end = old_end;
14450 }
14451 /*
14452 * Adjust copy_offset to the start of the range.
14453 */
14454 copy_offset -= (vaddr - local_start);
14455
14456 vm_map_clip_start(map, entry, local_start);
14457 vm_map_clip_end(map, entry, local_end);
14458 if (entry->is_sub_map) {
14459 /* unnesting was done when clipping */
14460 assert(!entry->use_pmap);
14461 }
14462
14463 /* substitute copy object for */
14464 /* shared map entry */
14465 vm_map_deallocate(VME_SUBMAP(entry));
14466 assert(!entry->iokit_acct);
14467 entry->use_pmap = TRUE;
14468 VME_OBJECT_SET(entry, copy_object, false, 0);
14469
14470 /* propagate the submap entry's protections */
14471 if (entry->protection != VM_PROT_READ) {
14472 /*
14473 * Someone has already altered the top entry's
14474 * protections via vm_protect(VM_PROT_COPY).
14475 * Respect these new values and ignore the
14476 * submap entry's protections.
14477 */
14478 } else {
14479 /*
14480 * Regular copy-on-write: propagate the submap
14481 * entry's protections to the top map entry.
14482 */
14483 entry->protection |= subentry_protection;
14484 }
14485 entry->max_protection |= subentry_max_protection;
14486 /* propagate some attributes from subentry */
14487 entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14488 entry->vme_permanent = subentry_permanent;
14489 entry->csm_associated = subentry_csm_associated;
14490 #if __arm64e__
14491 /* propagate TPRO iff the destination map has TPRO enabled */
14492 if (subentry_used_for_tpro && vm_map_tpro(map)) {
14493 entry->used_for_tpro = subentry_used_for_tpro;
14494 }
14495 #endif /* __arm64e__ */
14496 if ((entry->protection & VM_PROT_WRITE) &&
14497 (entry->protection & VM_PROT_EXECUTE) &&
14498 #if XNU_TARGET_OS_OSX
14499 map->pmap != kernel_pmap &&
14500 (vm_map_cs_enforcement(map)
14501 #if __arm64__
14502 || !VM_MAP_IS_EXOTIC(map)
14503 #endif /* __arm64__ */
14504 ) &&
14505 #endif /* XNU_TARGET_OS_OSX */
14506 #if CODE_SIGNING_MONITOR
14507 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
14508 #endif
14509 !(entry->used_for_jit) &&
14510 VM_MAP_POLICY_WX_STRIP_X(map)) {
14511 DTRACE_VM3(cs_wx,
14512 uint64_t, (uint64_t)entry->vme_start,
14513 uint64_t, (uint64_t)entry->vme_end,
14514 vm_prot_t, entry->protection);
14515 printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
14516 proc_selfpid(),
14517 (get_bsdtask_info(current_task())
14518 ? proc_name_address(get_bsdtask_info(current_task()))
14519 : "?"),
14520 __FUNCTION__, __LINE__,
14521 #if DEVELOPMENT || DEBUG
14522 (uint64_t)entry->vme_start,
14523 (uint64_t)entry->vme_end,
14524 #else /* DEVELOPMENT || DEBUG */
14525 (uint64_t)0,
14526 (uint64_t)0,
14527 #endif /* DEVELOPMENT || DEBUG */
14528 entry->protection);
14529 entry->protection &= ~VM_PROT_EXECUTE;
14530 }
14531
14532 if (object_copied) {
14533 VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14534 entry->needs_copy = object_copied_needs_copy;
14535 entry->is_shared = FALSE;
14536 } else {
14537 assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14538 assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14539 assert(entry->wired_count == 0);
14540 VME_OFFSET_SET(entry, copy_offset);
14541 entry->needs_copy = TRUE;
14542 if (map != old_map) {
14543 entry->is_shared = TRUE;
14544 }
14545 }
14546 if (entry->inheritance == VM_INHERIT_SHARE) {
14547 entry->inheritance = VM_INHERIT_COPY;
14548 }
14549
14550 vm_map_lock_write_to_read(map);
14551 } else {
14552 if ((cow_sub_map_parent)
14553 && (cow_sub_map_parent != *real_map)
14554 && (cow_sub_map_parent != map)) {
14555 vm_map_unlock(cow_sub_map_parent);
14556 }
14557 entry = submap_entry;
14558 vaddr = local_vaddr;
14559 }
14560 }
14561
14562 /*
14563 * Check whether this task is allowed to have
14564 * this page.
14565 */
14566
14567 prot = entry->protection;
14568
14569 if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14570 /*
14571 * HACK -- if not a stack, then allow execution
14572 */
14573 prot |= VM_PROT_EXECUTE;
14574 }
14575
14576 #if __arm64e__
14577 /*
14578 * If the entry we're dealing with is TPRO and we have a write
14579 * fault, inject VM_PROT_WRITE into protections. This allows us
14580 * to maintain RO permissions when not marked as TPRO.
14581 */
14582 if (entry->used_for_tpro && (fault_type & VM_PROT_WRITE)) {
14583 prot |= VM_PROT_WRITE;
14584 }
14585 #endif /* __arm64e__ */
14586 if (mask_protections) {
14587 fault_type &= prot;
14588 if (fault_type == VM_PROT_NONE) {
14589 goto protection_failure;
14590 }
14591 }
14592 if (((fault_type & prot) != fault_type)
14593 #if __arm64__
14594 /* prefetch abort in execute-only page */
14595 && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14596 #elif defined(__x86_64__)
14597 /* Consider the UEXEC bit when handling an EXECUTE fault */
14598 && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14599 #endif
14600 ) {
14601 protection_failure:
14602 if (*real_map != map) {
14603 vm_map_unlock(*real_map);
14604 }
14605 *real_map = map;
14606
14607 if ((fault_type & VM_PROT_EXECUTE) && prot) {
14608 log_stack_execution_failure((addr64_t)vaddr, prot);
14609 }
14610
14611 DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14612 DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14613 /*
14614 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14615 *
14616 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14617 */
14618 return KERN_PROTECTION_FAILURE;
14619 }
14620
14621 /*
14622 * If this page is not pageable, we have to get
14623 * it for all possible accesses.
14624 */
14625
14626 *wired = (entry->wired_count != 0);
14627 if (*wired) {
14628 fault_type = prot;
14629 }
14630
14631 /*
14632 * If the entry was copy-on-write, we either ...
14633 */
14634
14635 if (entry->needs_copy) {
14636 /*
14637 * If we want to write the page, we may as well
14638 * handle that now since we've got the map locked.
14639 *
14640 * If we don't need to write the page, we just
14641 * demote the permissions allowed.
14642 */
14643
14644 if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14645 /*
14646 * Make a new object, and place it in the
14647 * object chain. Note that no new references
14648 * have appeared -- one just moved from the
14649 * map to the new object.
14650 */
14651
14652 if (vm_map_lock_read_to_write(map)) {
14653 vm_map_lock_read(map);
14654 goto RetryLookup;
14655 }
14656
14657 if (VME_OBJECT(entry)->shadowed == FALSE) {
14658 vm_object_lock(VME_OBJECT(entry));
14659 VM_OBJECT_SET_SHADOWED(VME_OBJECT(entry), TRUE);
14660 vm_object_unlock(VME_OBJECT(entry));
14661 }
14662 VME_OBJECT_SHADOW(entry,
14663 (vm_map_size_t) (entry->vme_end -
14664 entry->vme_start),
14665 vm_map_always_shadow(map));
14666 entry->needs_copy = FALSE;
14667
14668 vm_map_lock_write_to_read(map);
14669 }
14670 if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14671 /*
14672 * We're attempting to read a copy-on-write
14673 * page -- don't allow writes.
14674 */
14675
14676 prot &= (~VM_PROT_WRITE);
14677 }
14678 }
14679
14680 if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14681 /*
14682 * We went through a "needs_copy" submap without triggering
14683 * a copy, so granting write access to the page would bypass
14684 * that submap's "needs_copy".
14685 */
14686 assert(!(fault_type & VM_PROT_WRITE));
14687 assert(!*wired);
14688 assert(!force_copy);
14689 // printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14690 prot &= ~VM_PROT_WRITE;
14691 }
14692
14693 /*
14694 * Create an object if necessary.
14695 */
14696 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14697 if (vm_map_lock_read_to_write(map)) {
14698 vm_map_lock_read(map);
14699 goto RetryLookup;
14700 }
14701
14702 VME_OBJECT_SET(entry,
14703 vm_object_allocate(
14704 (vm_map_size_t)(entry->vme_end -
14705 entry->vme_start)), false, 0);
14706 VME_OFFSET_SET(entry, 0);
14707 assert(entry->use_pmap);
14708 vm_map_lock_write_to_read(map);
14709 }
14710
14711 /*
14712 * Return the object/offset from this entry. If the entry
14713 * was copy-on-write or empty, it has been fixed up. Also
14714 * return the protection.
14715 */
14716
14717 *offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14718 *object = VME_OBJECT(entry);
14719 *out_prot = prot;
14720 KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14721
14722 if (fault_info) {
14723 fault_info->interruptible = THREAD_UNINT; /* for now... */
14724 /* ... the caller will change "interruptible" if needed */
14725 fault_info->cluster_size = 0;
14726 fault_info->user_tag = VME_ALIAS(entry);
14727 fault_info->pmap_options = 0;
14728 if (entry->iokit_acct ||
14729 (!entry->is_sub_map && !entry->use_pmap)) {
14730 fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14731 }
14732 fault_info->behavior = entry->behavior;
14733 fault_info->lo_offset = VME_OFFSET(entry);
14734 fault_info->hi_offset =
14735 (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14736 fault_info->no_cache = entry->no_cache;
14737 fault_info->stealth = FALSE;
14738 fault_info->io_sync = FALSE;
14739 if (entry->used_for_jit ||
14740 #if CODE_SIGNING_MONITOR
14741 (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
14742 #endif
14743 entry->vme_resilient_codesign) {
14744 fault_info->cs_bypass = TRUE;
14745 } else {
14746 fault_info->cs_bypass = FALSE;
14747 }
14748 fault_info->csm_associated = FALSE;
14749 #if CODE_SIGNING_MONITOR
14750 if (entry->csm_associated) {
14751 /*
14752 * The pmap layer will validate this page
14753 * before allowing it to be executed from.
14754 */
14755 fault_info->csm_associated = TRUE;
14756 }
14757 #endif
14758 fault_info->mark_zf_absent = FALSE;
14759 fault_info->batch_pmap_op = FALSE;
14760 fault_info->resilient_media = entry->vme_resilient_media;
14761 fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
14762 fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14763 #if __arm64e__
14764 fault_info->fi_used_for_tpro = entry->used_for_tpro;
14765 #else /* __arm64e__ */
14766 fault_info->fi_used_for_tpro = FALSE;
14767 #endif
14768 if (entry->translated_allow_execute) {
14769 fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14770 }
14771 }
14772
14773 /*
14774 * Lock the object to prevent it from disappearing
14775 */
14776 if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14777 if (contended == NULL) {
14778 vm_object_lock(*object);
14779 } else {
14780 *contended = vm_object_lock_check_contended(*object);
14781 }
14782 } else {
14783 vm_object_lock_shared(*object);
14784 }
14785
14786 /*
14787 * Save the version number
14788 */
14789
14790 out_version->main_timestamp = map->timestamp;
14791
14792 return KERN_SUCCESS;
14793 }
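
/*
 * Usage sketch for the lookup above (a fault-handler-style caller,
 * heavily simplified; the real callers live in vm_fault). Assumes
 * "map" is read-locked on entry, as the contract requires.
 */
#if 0
vm_map_version_t version;
vm_object_t object;
vm_object_offset_t offset;
vm_prot_t prot;
boolean_t wired;
vm_map_t real_map;

kern_return_t kr = vm_map_lookup_and_lock_object(&map, vaddr,
    VM_PROT_READ, OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset,
    &prot, &wired, NULL, &real_map, NULL);
if (kr == KERN_SUCCESS) {
	/* "object" comes back locked; "map" is still locked, and */
	/* "real_map" may be a submap that also needs unlocking */
	vm_object_unlock(object);
	if (real_map != map) {
		vm_map_unlock(real_map);
	}
	vm_map_unlock_read(map);
}
#endif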
14794
14795
14796 /*
14797 * vm_map_verify:
14798 *
14799 * Verifies that the map in question has not changed
14800 * since the given version. The map has to be locked
14801 * ("shared" mode is fine) before calling this function
14802 *	and it will be returned locked too. (A retry sketch follows.)
14803 */
14804 boolean_t
14805 vm_map_verify(
14806 vm_map_t map,
14807 vm_map_version_t *version) /* REF */
14808 {
14809 boolean_t result;
14810
14811 vm_map_lock_assert_held(map);
14812 result = (map->timestamp == version->main_timestamp);
14813
14814 return result;
14815 }
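
/*
 * Sketch of the retry pattern vm_map_verify() enables (assumptions:
 * "map", "vaddr" and friends declared as in the lookup sketch above;
 * the blocking work is elided). If the timestamp changed while the
 * lock was dropped, the lookup results are stale and must be redone.
 */
#if 0
for (;;) {
	kern_return_t kr = vm_map_lookup_and_lock_object(&map, vaddr,
	    fault_type, OBJECT_LOCK_EXCLUSIVE, &version, &object,
	    &offset, &prot, &wired, NULL, &real_map, NULL);
	if (kr != KERN_SUCCESS) {
		break;
	}
	vm_object_unlock(object);
	vm_map_unlock_read(map);     /* drop the lock to block */
	/* ... potentially blocking work against "object" ... */
	vm_map_lock_read(map);       /* re-take and re-validate */
	if (vm_map_verify(map, &version)) {
		break;               /* unchanged: results still valid */
	}
	/* map changed while unlocked: redo the lookup */
}
#endif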
14816
14817 /*
14818 * TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
14819 * Goes away after regular vm_region_recurse function migrates to
14820 * 64 bits
14821 * vm_region_recurse: A form of vm_region which follows the
14822 * submaps in a target map
14823 *
14824 */
14825
14826 kern_return_t
14827 vm_map_region_recurse_64(
14828 vm_map_t map,
14829 vm_map_offset_t *address, /* IN/OUT */
14830 vm_map_size_t *size, /* OUT */
14831 natural_t *nesting_depth, /* IN/OUT */
14832 vm_region_submap_info_64_t submap_info, /* IN/OUT */
14833 mach_msg_type_number_t *count) /* IN/OUT */
14834 {
14835 mach_msg_type_number_t original_count;
14836 vm_region_extended_info_data_t extended;
14837 vm_map_entry_t tmp_entry;
14838 vm_map_offset_t user_address;
14839 unsigned int user_max_depth;
14840
14841 /*
14842 * "curr_entry" is the VM map entry preceding or including the
14843 * address we're looking for.
14844 * "curr_map" is the map or sub-map containing "curr_entry".
14845 * "curr_address" is the equivalent of the top map's "user_address"
14846 * in the current map.
14847 * "curr_offset" is the cumulated offset of "curr_map" in the
14848 * target task's address space.
14849 * "curr_depth" is the depth of "curr_map" in the chain of
14850 * sub-maps.
14851 *
14852 * "curr_max_below" and "curr_max_above" limit the range (around
14853 * "curr_address") we should take into account in the current (sub)map.
14854 * They limit the range to what's visible through the map entries
14855 * we've traversed from the top map to the current map.
14856 *
14857 */
14858 vm_map_entry_t curr_entry;
14859 vm_map_address_t curr_address;
14860 vm_map_offset_t curr_offset;
14861 vm_map_t curr_map;
14862 unsigned int curr_depth;
14863 vm_map_offset_t curr_max_below, curr_max_above;
14864 vm_map_offset_t curr_skip;
14865
14866 /*
14867 * "next_" is the same as "curr_" but for the VM region immediately
14868 * after the address we're looking for. We need to keep track of this
14869 * too because we want to return info about that region if the
14870 * address we're looking for is not mapped.
14871 */
14872 vm_map_entry_t next_entry;
14873 vm_map_offset_t next_offset;
14874 vm_map_offset_t next_address;
14875 vm_map_t next_map;
14876 unsigned int next_depth;
14877 vm_map_offset_t next_max_below, next_max_above;
14878 vm_map_offset_t next_skip;
14879
14880 boolean_t look_for_pages;
14881 vm_region_submap_short_info_64_t short_info;
14882 boolean_t do_region_footprint;
14883 int effective_page_size, effective_page_shift;
14884 boolean_t submap_needed_copy;
14885
14886 if (map == VM_MAP_NULL) {
14887 /* no address space to work on */
14888 return KERN_INVALID_ARGUMENT;
14889 }
14890
14891 effective_page_shift = vm_self_region_page_shift(map);
14892 effective_page_size = (1 << effective_page_shift);
14893
14894 if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
14895 /*
14896 * "info" structure is not big enough and
14897 * would overflow
14898 */
14899 return KERN_INVALID_ARGUMENT;
14900 }
14901
14902 do_region_footprint = task_self_region_footprint();
14903 original_count = *count;
14904
14905 if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
14906 *count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
14907 look_for_pages = FALSE;
14908 short_info = (vm_region_submap_short_info_64_t) submap_info;
14909 submap_info = NULL;
14910 } else {
14911 look_for_pages = TRUE;
14912 *count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
14913 short_info = NULL;
14914
14915 if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
14916 *count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
14917 }
14918 if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
14919 *count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
14920 }
14921 }
14922
14923 user_address = *address;
14924 user_max_depth = *nesting_depth;
14925 submap_needed_copy = FALSE;
14926
14927 if (not_in_kdp) {
14928 vm_map_lock_read(map);
14929 }
14930
14931 recurse_again:
14932 curr_entry = NULL;
14933 curr_map = map;
14934 curr_address = user_address;
14935 curr_offset = 0;
14936 curr_skip = 0;
14937 curr_depth = 0;
14938 curr_max_above = ((vm_map_offset_t) -1) - curr_address;
14939 curr_max_below = curr_address;
14940
14941 next_entry = NULL;
14942 next_map = NULL;
14943 next_address = 0;
14944 next_offset = 0;
14945 next_skip = 0;
14946 next_depth = 0;
14947 next_max_above = (vm_map_offset_t) -1;
14948 next_max_below = (vm_map_offset_t) -1;
14949
14950 for (;;) {
14951 if (vm_map_lookup_entry(curr_map,
14952 curr_address,
14953 &tmp_entry)) {
14954 /* tmp_entry contains the address we're looking for */
14955 curr_entry = tmp_entry;
14956 } else {
14957 vm_map_offset_t skip;
14958 /*
14959 * The address is not mapped. "tmp_entry" is the
14960 * map entry preceding the address. We want the next
14961 * one, if it exists.
14962 */
14963 curr_entry = tmp_entry->vme_next;
14964
14965 if (curr_entry == vm_map_to_entry(curr_map) ||
14966 (curr_entry->vme_start >=
14967 curr_address + curr_max_above)) {
14968 /* no next entry at this level: stop looking */
14969 if (not_in_kdp) {
14970 vm_map_unlock_read(curr_map);
14971 }
14972 curr_entry = NULL;
14973 curr_map = NULL;
14974 curr_skip = 0;
14975 curr_offset = 0;
14976 curr_depth = 0;
14977 curr_max_above = 0;
14978 curr_max_below = 0;
14979 break;
14980 }
14981
14982 /* adjust current address and offset */
14983 skip = curr_entry->vme_start - curr_address;
14984 curr_address = curr_entry->vme_start;
14985 curr_skip += skip;
14986 curr_offset += skip;
14987 curr_max_above -= skip;
14988 curr_max_below = 0;
14989 }
14990
14991 /*
14992 * Is the next entry at this level closer to the address (or
14993 * deeper in the submap chain) than the one we had
14994 * so far?
14995 */
14996 tmp_entry = curr_entry->vme_next;
14997 if (tmp_entry == vm_map_to_entry(curr_map)) {
14998 /* no next entry at this level */
14999 } else if (tmp_entry->vme_start >=
15000 curr_address + curr_max_above) {
15001 /*
15002 * tmp_entry is beyond the scope of what we mapped of
15003 * this submap in the upper level: ignore it.
15004 */
15005 } else if ((next_entry == NULL) ||
15006 (tmp_entry->vme_start + curr_offset <=
15007 next_entry->vme_start + next_offset)) {
15008 /*
15009 * We didn't have a "next_entry" or this one is
15010 * closer to the address we're looking for:
15011 * use this "tmp_entry" as the new "next_entry".
15012 */
15013 if (next_entry != NULL) {
15014 /* unlock the last "next_map" */
15015 if (next_map != curr_map && not_in_kdp) {
15016 vm_map_unlock_read(next_map);
15017 }
15018 }
15019 next_entry = tmp_entry;
15020 next_map = curr_map;
15021 next_depth = curr_depth;
15022 next_address = next_entry->vme_start;
15023 next_skip = curr_skip;
15024 next_skip += (next_address - curr_address);
15025 next_offset = curr_offset;
15026 next_offset += (next_address - curr_address);
15027 next_max_above = MIN(next_max_above, curr_max_above);
15028 next_max_above = MIN(next_max_above,
15029 next_entry->vme_end - next_address);
15030 next_max_below = MIN(next_max_below, curr_max_below);
15031 next_max_below = MIN(next_max_below,
15032 next_address - next_entry->vme_start);
15033 }
15034
15035 /*
15036 * "curr_max_{above,below}" allow us to keep track of the
15037 * portion of the submap that is actually mapped at this level:
15038 * the rest of that submap is irrelevant to us, since it's not
15039 * mapped here.
15040 * The relevant portion of the submap starts at
15041 * "VME_OFFSET(curr_entry)" and extends for the size of "curr_entry".
15042 */
15043 curr_max_above = MIN(curr_max_above,
15044 curr_entry->vme_end - curr_address);
15045 curr_max_below = MIN(curr_max_below,
15046 curr_address - curr_entry->vme_start);
15047
15048 if (!curr_entry->is_sub_map ||
15049 curr_depth >= user_max_depth) {
15050 /*
15051 * We hit a leaf map or we reached the maximum depth
15052 * we could, so stop looking. Keep the current map
15053 * locked.
15054 */
15055 break;
15056 }
15057
15058 /*
15059 * Get down to the next submap level.
15060 */
15061
15062 if (curr_entry->needs_copy) {
15063 /* everything below this is effectively copy-on-write */
15064 submap_needed_copy = TRUE;
15065 }
15066
15067 /*
15068 * Lock the next level and unlock the current level,
15069 * unless we need to keep it locked to access the "next_entry"
15070 * later.
15071 */
15072 if (not_in_kdp) {
15073 vm_map_lock_read(VME_SUBMAP(curr_entry));
15074 }
15075 if (curr_map == next_map) {
15076 /* keep "next_map" locked in case we need it */
15077 } else {
15078 /* release this map */
15079 if (not_in_kdp) {
15080 vm_map_unlock_read(curr_map);
15081 }
15082 }
15083
15084 /*
15085 * Adjust the offset. "curr_entry" maps the submap
15086 * at relative address "curr_entry->vme_start" in the
15087 * curr_map but skips the first "VME_OFFSET(curr_entry)"
15088 * bytes of the submap.
15089 * "curr_offset" always represents the offset of a virtual
15090 * address in the curr_map relative to the absolute address
15091 * space (i.e. the top-level VM map).
15092 */
15093 curr_offset +=
15094 (VME_OFFSET(curr_entry) - curr_entry->vme_start);
15095 curr_address = user_address + curr_offset;
15096 /* switch to the submap */
15097 curr_map = VME_SUBMAP(curr_entry);
15098 curr_depth++;
15099 curr_entry = NULL;
15100 }
15101
15102 // LP64todo: all the current tools are 32-bit, so this never worked for 64-bit;
15103 // it should probably be a real 32-bit ID rather than a pointer.
15104 // Current users just check for equality.
15105
15106 if (curr_entry == NULL) {
15107 /* no VM region contains the address... */
15108
15109 if (do_region_footprint && /* we want footprint numbers */
15110 next_entry == NULL && /* & there are no more regions */
15111 /* & we haven't already provided our fake region: */
15112 user_address <= vm_map_last_entry(map)->vme_end) {
15113 ledger_amount_t ledger_resident, ledger_compressed;
15114
15115 /*
15116 * Add a fake memory region to account for
15117 * purgeable and/or ledger-tagged memory that
15118 * counts towards this task's memory footprint,
15119 * i.e. the resident/compressed pages of non-volatile
15120 * objects owned by that task.
15121 */
15122 task_ledgers_footprint(map->pmap->ledger,
15123 &ledger_resident,
15124 &ledger_compressed);
15125 if (ledger_resident + ledger_compressed == 0) {
15126 /* no purgeable memory usage to report */
15127 return KERN_INVALID_ADDRESS;
15128 }
15129 /* fake region to show nonvolatile footprint */
15130 if (look_for_pages) {
15131 submap_info->protection = VM_PROT_DEFAULT;
15132 submap_info->max_protection = VM_PROT_DEFAULT;
15133 submap_info->inheritance = VM_INHERIT_DEFAULT;
15134 submap_info->offset = 0;
15135 submap_info->user_tag = -1;
15136 submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
15137 submap_info->pages_shared_now_private = 0;
15138 submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
15139 submap_info->pages_dirtied = submap_info->pages_resident;
15140 submap_info->ref_count = 1;
15141 submap_info->shadow_depth = 0;
15142 submap_info->external_pager = 0;
15143 submap_info->share_mode = SM_PRIVATE;
15144 if (submap_needed_copy) {
15145 submap_info->share_mode = SM_COW;
15146 }
15147 submap_info->is_submap = 0;
15148 submap_info->behavior = VM_BEHAVIOR_DEFAULT;
15149 submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15150 submap_info->user_wired_count = 0;
15151 submap_info->pages_reusable = 0;
15152 } else {
15153 short_info->user_tag = -1;
15154 short_info->offset = 0;
15155 short_info->protection = VM_PROT_DEFAULT;
15156 short_info->inheritance = VM_INHERIT_DEFAULT;
15157 short_info->max_protection = VM_PROT_DEFAULT;
15158 short_info->behavior = VM_BEHAVIOR_DEFAULT;
15159 short_info->user_wired_count = 0;
15160 short_info->is_submap = 0;
15161 short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15162 short_info->external_pager = 0;
15163 short_info->shadow_depth = 0;
15164 short_info->share_mode = SM_PRIVATE;
15165 if (submap_needed_copy) {
15166 short_info->share_mode = SM_COW;
15167 }
15168 short_info->ref_count = 1;
15169 }
15170 *nesting_depth = 0;
15171 *size = (vm_map_size_t) (ledger_resident + ledger_compressed);
15172 // *address = user_address;
15173 *address = vm_map_last_entry(map)->vme_end;
15174 return KERN_SUCCESS;
15175 }
15176
15177 if (next_entry == NULL) {
15178 /* ... and no VM region follows it either */
15179 return KERN_INVALID_ADDRESS;
15180 }
15181 /* ... gather info about the next VM region */
15182 curr_entry = next_entry;
15183 curr_map = next_map; /* still locked ... */
15184 curr_address = next_address;
15185 curr_skip = next_skip;
15186 curr_offset = next_offset;
15187 curr_depth = next_depth;
15188 curr_max_above = next_max_above;
15189 curr_max_below = next_max_below;
15190 } else {
15191 /* we won't need "next_entry" after all */
15192 if (next_entry != NULL) {
15193 /* release "next_map" */
15194 if (next_map != curr_map && not_in_kdp) {
15195 vm_map_unlock_read(next_map);
15196 }
15197 }
15198 }
15199 next_entry = NULL;
15200 next_map = NULL;
15201 next_offset = 0;
15202 next_skip = 0;
15203 next_depth = 0;
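/*
 * Note: assigning -1 to these unsigned offsets wraps to the maximum
 * vm_map_offset_t, i.e. "no limit", matching the explicit
 * (vm_map_offset_t) -1 casts used at "recurse_again" above.
 */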
15204 next_max_below = -1;
15205 next_max_above = -1;
15206
15207 if (curr_entry->is_sub_map &&
15208 curr_depth < user_max_depth) {
15209 /*
15210 * We're not as deep as we could be: we must have
15211 * gone back up after not finding anything mapped
15212 * below the original top-level map entry.
15213 * Let's move "curr_address" forward and recurse again.
15214 */
15215 user_address = curr_address;
15216 goto recurse_again;
15217 }
15218
15219 *nesting_depth = curr_depth;
15220 *size = curr_max_above + curr_max_below;
15221 *address = user_address + curr_skip - curr_max_below;
15222
15223 if (look_for_pages) {
15224 submap_info->user_tag = VME_ALIAS(curr_entry);
15225 submap_info->offset = VME_OFFSET(curr_entry);
15226 submap_info->protection = curr_entry->protection;
15227 submap_info->inheritance = curr_entry->inheritance;
15228 submap_info->max_protection = curr_entry->max_protection;
15229 submap_info->behavior = curr_entry->behavior;
15230 submap_info->user_wired_count = curr_entry->user_wired_count;
15231 submap_info->is_submap = curr_entry->is_sub_map;
15232 if (curr_entry->is_sub_map) {
15233 submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15234 } else {
15235 submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15236 }
15237 } else {
15238 short_info->user_tag = VME_ALIAS(curr_entry);
15239 short_info->offset = VME_OFFSET(curr_entry);
15240 short_info->protection = curr_entry->protection;
15241 short_info->inheritance = curr_entry->inheritance;
15242 short_info->max_protection = curr_entry->max_protection;
15243 short_info->behavior = curr_entry->behavior;
15244 short_info->user_wired_count = curr_entry->user_wired_count;
15245 short_info->is_submap = curr_entry->is_sub_map;
15246 if (curr_entry->is_sub_map) {
15247 short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15248 } else {
15249 short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15250 }
15251 }
15252
15253 extended.pages_resident = 0;
15254 extended.pages_swapped_out = 0;
15255 extended.pages_shared_now_private = 0;
15256 extended.pages_dirtied = 0;
15257 extended.pages_reusable = 0;
15258 extended.external_pager = 0;
15259 extended.shadow_depth = 0;
15260 extended.share_mode = SM_EMPTY;
15261 extended.ref_count = 0;
15262
15263 if (not_in_kdp) {
15264 if (!curr_entry->is_sub_map) {
15265 vm_map_offset_t range_start, range_end;
15266 range_start = MAX((curr_address - curr_max_below),
15267 curr_entry->vme_start);
15268 range_end = MIN((curr_address + curr_max_above),
15269 curr_entry->vme_end);
15270 vm_map_region_walk(curr_map,
15271 range_start,
15272 curr_entry,
15273 (VME_OFFSET(curr_entry) +
15274 (range_start -
15275 curr_entry->vme_start)),
15276 range_end - range_start,
15277 &extended,
15278 look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
15279 if (extended.external_pager &&
15280 extended.ref_count == 2 &&
15281 extended.share_mode == SM_SHARED) {
15282 extended.share_mode = SM_PRIVATE;
15283 }
15284 if (submap_needed_copy) {
15285 extended.share_mode = SM_COW;
15286 }
15287 } else {
15288 if (curr_entry->use_pmap) {
15289 extended.share_mode = SM_TRUESHARED;
15290 } else {
15291 extended.share_mode = SM_PRIVATE;
15292 }
15293 extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
15294 }
15295 }
15296
15297 if (look_for_pages) {
15298 submap_info->pages_resident = extended.pages_resident;
15299 submap_info->pages_swapped_out = extended.pages_swapped_out;
15300 submap_info->pages_shared_now_private =
15301 extended.pages_shared_now_private;
15302 submap_info->pages_dirtied = extended.pages_dirtied;
15303 submap_info->external_pager = extended.external_pager;
15304 submap_info->shadow_depth = extended.shadow_depth;
15305 submap_info->share_mode = extended.share_mode;
15306 submap_info->ref_count = extended.ref_count;
15307
15308 if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15309 submap_info->pages_reusable = extended.pages_reusable;
15310 }
15311 if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15312 if (curr_entry->is_sub_map) {
15313 submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_SUBMAP(curr_entry));
15314 } else if (VME_OBJECT(curr_entry)) {
15315 submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_OBJECT(curr_entry));
15316 } else {
15317 submap_info->object_id_full = 0ull;
15318 }
15319 }
15320 } else {
15321 short_info->external_pager = extended.external_pager;
15322 short_info->shadow_depth = extended.shadow_depth;
15323 short_info->share_mode = extended.share_mode;
15324 short_info->ref_count = extended.ref_count;
15325 }
15326
15327 if (not_in_kdp) {
15328 vm_map_unlock_read(curr_map);
15329 }
15330
15331 return KERN_SUCCESS;
15332 }
15333
15334 /*
15335 * vm_region:
15336 *
15337 * User call to obtain information about a region in
15338 * a task's address map. Currently, only one flavor is
15339 * supported.
15340 *
15341 * XXX The reserved and behavior fields cannot be filled
15342 * in until the vm merge from the IK is completed, and
15343 * vm_reserve is implemented.
15344 */
15345
15346 kern_return_t
15347 vm_map_region(
15348 vm_map_t map,
15349 vm_map_offset_t *address, /* IN/OUT */
15350 vm_map_size_t *size, /* OUT */
15351 vm_region_flavor_t flavor, /* IN */
15352 vm_region_info_t info, /* OUT */
15353 mach_msg_type_number_t *count, /* IN/OUT */
15354 mach_port_t *object_name) /* OUT */
15355 {
15356 vm_map_entry_t tmp_entry;
15357 vm_map_entry_t entry;
15358 vm_map_offset_t start;
15359
15360 if (map == VM_MAP_NULL) {
15361 return KERN_INVALID_ARGUMENT;
15362 }
15363
15364 switch (flavor) {
15365 case VM_REGION_BASIC_INFO:
15366 /* legacy for old 32-bit objects info */
15367 {
15368 vm_region_basic_info_t basic;
15369
15370 if (*count < VM_REGION_BASIC_INFO_COUNT) {
15371 return KERN_INVALID_ARGUMENT;
15372 }
15373
15374 basic = (vm_region_basic_info_t) info;
15375 *count = VM_REGION_BASIC_INFO_COUNT;
15376
15377 vm_map_lock_read(map);
15378
15379 start = *address;
15380 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15381 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15382 vm_map_unlock_read(map);
15383 return KERN_INVALID_ADDRESS;
15384 }
15385 } else {
15386 entry = tmp_entry;
15387 }
15388
15389 start = entry->vme_start;
15390
15391 basic->offset = (uint32_t)VME_OFFSET(entry);
15392 basic->protection = entry->protection;
15393 basic->inheritance = entry->inheritance;
15394 basic->max_protection = entry->max_protection;
15395 basic->behavior = entry->behavior;
15396 basic->user_wired_count = entry->user_wired_count;
15397 basic->reserved = entry->is_sub_map;
15398 *address = start;
15399 *size = (entry->vme_end - start);
15400
15401 if (object_name) {
15402 *object_name = IP_NULL;
15403 }
15404 if (entry->is_sub_map) {
15405 basic->shared = FALSE;
15406 } else {
15407 basic->shared = entry->is_shared;
15408 }
15409
15410 vm_map_unlock_read(map);
15411 return KERN_SUCCESS;
15412 }
15413
15414 case VM_REGION_BASIC_INFO_64:
15415 {
15416 vm_region_basic_info_64_t basic;
15417
15418 if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
15419 return KERN_INVALID_ARGUMENT;
15420 }
15421
15422 basic = (vm_region_basic_info_64_t) info;
15423 *count = VM_REGION_BASIC_INFO_COUNT_64;
15424
15425 vm_map_lock_read(map);
15426
15427 start = *address;
15428 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15429 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15430 vm_map_unlock_read(map);
15431 return KERN_INVALID_ADDRESS;
15432 }
15433 } else {
15434 entry = tmp_entry;
15435 }
15436
15437 start = entry->vme_start;
15438
15439 basic->offset = VME_OFFSET(entry);
15440 basic->protection = entry->protection;
15441 basic->inheritance = entry->inheritance;
15442 basic->max_protection = entry->max_protection;
15443 basic->behavior = entry->behavior;
15444 basic->user_wired_count = entry->user_wired_count;
15445 basic->reserved = entry->is_sub_map;
15446 *address = start;
15447 *size = (entry->vme_end - start);
15448
15449 if (object_name) {
15450 *object_name = IP_NULL;
15451 }
15452 if (entry->is_sub_map) {
15453 basic->shared = FALSE;
15454 } else {
15455 basic->shared = entry->is_shared;
15456 }
15457
15458 vm_map_unlock_read(map);
15459 return KERN_SUCCESS;
15460 }
15461 case VM_REGION_EXTENDED_INFO:
15462 if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
15463 return KERN_INVALID_ARGUMENT;
15464 }
15465 OS_FALLTHROUGH;
15466 case VM_REGION_EXTENDED_INFO__legacy:
15467 if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
15468 return KERN_INVALID_ARGUMENT;
15469 }
15470
15471 {
15472 vm_region_extended_info_t extended;
15473 mach_msg_type_number_t original_count;
15474 int effective_page_size, effective_page_shift;
15475
15476 extended = (vm_region_extended_info_t) info;
15477
15478 effective_page_shift = vm_self_region_page_shift(map);
15479 effective_page_size = (1 << effective_page_shift);
15480
15481 vm_map_lock_read(map);
15482
15483 start = *address;
15484 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15485 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15486 vm_map_unlock_read(map);
15487 return KERN_INVALID_ADDRESS;
15488 }
15489 } else {
15490 entry = tmp_entry;
15491 }
15492 start = entry->vme_start;
15493
15494 extended->protection = entry->protection;
15495 extended->user_tag = VME_ALIAS(entry);
15496 extended->pages_resident = 0;
15497 extended->pages_swapped_out = 0;
15498 extended->pages_shared_now_private = 0;
15499 extended->pages_dirtied = 0;
15500 extended->external_pager = 0;
15501 extended->shadow_depth = 0;
15502
15503 original_count = *count;
15504 if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
15505 *count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
15506 } else {
15507 extended->pages_reusable = 0;
15508 *count = VM_REGION_EXTENDED_INFO_COUNT;
15509 }
15510
15511 vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
15512
15513 if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
15514 extended->share_mode = SM_PRIVATE;
15515 }
15516
15517 if (object_name) {
15518 *object_name = IP_NULL;
15519 }
15520 *address = start;
15521 *size = (entry->vme_end - start);
15522
15523 vm_map_unlock_read(map);
15524 return KERN_SUCCESS;
15525 }
15526 case VM_REGION_TOP_INFO:
15527 {
15528 vm_region_top_info_t top;
15529
15530 if (*count < VM_REGION_TOP_INFO_COUNT) {
15531 return KERN_INVALID_ARGUMENT;
15532 }
15533
15534 top = (vm_region_top_info_t) info;
15535 *count = VM_REGION_TOP_INFO_COUNT;
15536
15537 vm_map_lock_read(map);
15538
15539 start = *address;
15540 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15541 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15542 vm_map_unlock_read(map);
15543 return KERN_INVALID_ADDRESS;
15544 }
15545 } else {
15546 entry = tmp_entry;
15547 }
15548 start = entry->vme_start;
15549
15550 top->private_pages_resident = 0;
15551 top->shared_pages_resident = 0;
15552
15553 vm_map_region_top_walk(entry, top);
15554
15555 if (object_name) {
15556 *object_name = IP_NULL;
15557 }
15558 *address = start;
15559 *size = (entry->vme_end - start);
15560
15561 vm_map_unlock_read(map);
15562 return KERN_SUCCESS;
15563 }
15564 default:
15565 return KERN_INVALID_ARGUMENT;
15566 }
15567 }
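
/*
 * A hedged user-space sketch (not part of this file) of the
 * VM_REGION_BASIC_INFO_64 flavor handled above, reached via the
 * mach_vm_region() MIG call; "some_ptr" is a hypothetical address
 * inside the caller's own address space:
 *
 *	mach_vm_address_t addr = (mach_vm_address_t)(uintptr_t)some_ptr;
 *	mach_vm_size_t size = 0;
 *	vm_region_basic_info_data_64_t info;
 *	mach_msg_type_number_t count = VM_REGION_BASIC_INFO_COUNT_64;
 *	mach_port_t object_name = MACH_PORT_NULL;
 *
 *	kern_return_t kr = mach_vm_region(mach_task_self(), &addr, &size,
 *	    VM_REGION_BASIC_INFO_64, (vm_region_info_t)&info,
 *	    &count, &object_name);
 *	// On success, addr is the start of the containing region and
 *	// info.protection / info.max_protection etc. are filled in.
 */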
15568
15569 #define OBJ_RESIDENT_COUNT(obj, entry_size) \
15570 MIN((entry_size), \
15571 ((obj)->all_reusable ? \
15572 (obj)->wired_page_count : \
15573 (obj)->resident_page_count - (obj)->reusable_page_count))
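
/*
 * Worked example with illustrative numbers: an object with
 * resident_page_count = 10, reusable_page_count = 4 and
 * all_reusable == FALSE, seen through an entry of entry_size = 8 pages,
 * yields OBJ_RESIDENT_COUNT = MIN(8, 10 - 4) = 6; with
 * all_reusable == TRUE only the wired_page_count would be counted.
 */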
15574
15575 void
15576 vm_map_region_top_walk(
15577 vm_map_entry_t entry,
15578 vm_region_top_info_t top)
15579 {
15580 if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
15581 top->share_mode = SM_EMPTY;
15582 top->ref_count = 0;
15583 top->obj_id = 0;
15584 return;
15585 }
15586
15587 {
15588 struct vm_object *obj, *tmp_obj;
15589 int ref_count;
15590 uint32_t entry_size;
15591
15592 entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);
15593
15594 obj = VME_OBJECT(entry);
15595
15596 vm_object_lock(obj);
15597
15598 if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15599 ref_count--;
15600 }
15601
15602 assert(obj->reusable_page_count <= obj->resident_page_count);
15603 if (obj->shadow) {
15604 if (ref_count == 1) {
15605 top->private_pages_resident =
15606 OBJ_RESIDENT_COUNT(obj, entry_size);
15607 } else {
15608 top->shared_pages_resident =
15609 OBJ_RESIDENT_COUNT(obj, entry_size);
15610 }
15611 top->ref_count = ref_count;
15612 top->share_mode = SM_COW;
15613
15614 while ((tmp_obj = obj->shadow)) {
15615 vm_object_lock(tmp_obj);
15616 vm_object_unlock(obj);
15617 obj = tmp_obj;
15618
15619 if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15620 ref_count--;
15621 }
15622
15623 assert(obj->reusable_page_count <= obj->resident_page_count);
15624 top->shared_pages_resident +=
15625 OBJ_RESIDENT_COUNT(obj, entry_size);
15626 top->ref_count += ref_count - 1;
15627 }
15628 } else {
15629 if (entry->superpage_size) {
15630 top->share_mode = SM_LARGE_PAGE;
15631 top->shared_pages_resident = 0;
15632 top->private_pages_resident = entry_size;
15633 } else if (entry->needs_copy) {
15634 top->share_mode = SM_COW;
15635 top->shared_pages_resident =
15636 OBJ_RESIDENT_COUNT(obj, entry_size);
15637 } else {
15638 if (ref_count == 1 ||
15639 (ref_count == 2 && obj->named)) {
15640 top->share_mode = SM_PRIVATE;
15641 top->private_pages_resident =
15642 OBJ_RESIDENT_COUNT(obj,
15643 entry_size);
15644 } else {
15645 top->share_mode = SM_SHARED;
15646 top->shared_pages_resident =
15647 OBJ_RESIDENT_COUNT(obj,
15648 entry_size);
15649 }
15650 }
15651 top->ref_count = ref_count;
15652 }
15653
15654 vm_object_unlock(obj);
15655
15656 /* XXX K64: obj_id will be truncated */
15657 top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRHASH(obj);
15658 }
15659 }
15660
15661 void
15662 vm_map_region_walk(
15663 vm_map_t map,
15664 vm_map_offset_t va,
15665 vm_map_entry_t entry,
15666 vm_object_offset_t offset,
15667 vm_object_size_t range,
15668 vm_region_extended_info_t extended,
15669 boolean_t look_for_pages,
15670 mach_msg_type_number_t count)
15671 {
15672 struct vm_object *obj, *tmp_obj;
15673 vm_map_offset_t last_offset;
15674 int i;
15675 int ref_count;
15676 struct vm_object *shadow_object;
15677 unsigned short shadow_depth;
15678 boolean_t do_region_footprint;
15679 int effective_page_size, effective_page_shift;
15680 vm_map_offset_t effective_page_mask;
15681
15682 do_region_footprint = task_self_region_footprint();
15683
15684 if ((entry->is_sub_map) ||
15685 (VME_OBJECT(entry) == 0) ||
15686 (VME_OBJECT(entry)->phys_contiguous &&
15687 !entry->superpage_size)) {
15688 extended->share_mode = SM_EMPTY;
15689 extended->ref_count = 0;
15690 return;
15691 }
15692
15693 if (entry->superpage_size) {
15694 extended->shadow_depth = 0;
15695 extended->share_mode = SM_LARGE_PAGE;
15696 extended->ref_count = 1;
15697 extended->external_pager = 0;
15698
15699 /* TODO4K: Superpage in 4k mode? */
15700 extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
15701 extended->shadow_depth = 0;
15702 return;
15703 }
15704
15705 effective_page_shift = vm_self_region_page_shift(map);
15706 effective_page_size = (1 << effective_page_shift);
15707 effective_page_mask = effective_page_size - 1;
15708
15709 offset = vm_map_trunc_page(offset, effective_page_mask);
15710
15711 obj = VME_OBJECT(entry);
15712
15713 vm_object_lock(obj);
15714
15715 if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15716 ref_count--;
15717 }
15718
15719 if (look_for_pages) {
15720 for (last_offset = offset + range;
15721 offset < last_offset;
15722 offset += effective_page_size, va += effective_page_size) {
15723 if (do_region_footprint) {
15724 int disp;
15725
15726 disp = 0;
15727 if (map->has_corpse_footprint) {
15728 /*
15729 * Query the page info data we saved
15730 * while forking the corpse.
15731 */
15732 vm_map_corpse_footprint_query_page_info(
15733 map,
15734 va,
15735 &disp);
15736 } else {
15737 /*
15738 * Query the pmap.
15739 */
15740 vm_map_footprint_query_page_info(
15741 map,
15742 entry,
15743 va,
15744 &disp);
15745 }
15746 if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
15747 extended->pages_resident++;
15748 }
15749 if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
15750 extended->pages_reusable++;
15751 }
15752 if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
15753 extended->pages_dirtied++;
15754 }
15755 if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
15756 extended->pages_swapped_out++;
15757 }
15758 continue;
15759 }
15760
15761 vm_map_region_look_for_page(map, va, obj,
15762 vm_object_trunc_page(offset), ref_count,
15763 0, extended, count);
15764 }
15765
15766 if (do_region_footprint) {
15767 goto collect_object_info;
15768 }
15769 } else {
15770 collect_object_info:
15771 shadow_object = obj->shadow;
15772 shadow_depth = 0;
15773
15774 if (!(obj->internal)) {
15775 extended->external_pager = 1;
15776 }
15777
15778 if (shadow_object != VM_OBJECT_NULL) {
15779 vm_object_lock(shadow_object);
15780 for (;
15781 shadow_object != VM_OBJECT_NULL;
15782 shadow_depth++) {
15783 vm_object_t next_shadow;
15784
15785 if (!(shadow_object->internal)) {
15786 extended->external_pager = 1;
15787 }
15788
15789 next_shadow = shadow_object->shadow;
15790 if (next_shadow) {
15791 vm_object_lock(next_shadow);
15792 }
15793 vm_object_unlock(shadow_object);
15794 shadow_object = next_shadow;
15795 }
15796 }
15797 extended->shadow_depth = shadow_depth;
15798 }
15799
15800 if (extended->shadow_depth || entry->needs_copy) {
15801 extended->share_mode = SM_COW;
15802 } else {
15803 if (ref_count == 1) {
15804 extended->share_mode = SM_PRIVATE;
15805 } else {
15806 if (obj->true_share) {
15807 extended->share_mode = SM_TRUESHARED;
15808 } else {
15809 extended->share_mode = SM_SHARED;
15810 }
15811 }
15812 }
15813 extended->ref_count = ref_count - extended->shadow_depth;
15814
15815 for (i = 0; i < extended->shadow_depth; i++) {
15816 if ((tmp_obj = obj->shadow) == 0) {
15817 break;
15818 }
15819 vm_object_lock(tmp_obj);
15820 vm_object_unlock(obj);
15821
15822 if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) {
15823 ref_count--;
15824 }
15825
15826 extended->ref_count += ref_count;
15827 obj = tmp_obj;
15828 }
15829 vm_object_unlock(obj);
15830
15831 if (extended->share_mode == SM_SHARED) {
15832 vm_map_entry_t cur;
15833 vm_map_entry_t last;
15834 int my_refs;
15835
15836 obj = VME_OBJECT(entry);
15837 last = vm_map_to_entry(map);
15838 my_refs = 0;
15839
15840 if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15841 ref_count--;
15842 }
15843 for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
15844 my_refs += vm_map_region_count_obj_refs(cur, obj);
15845 }
15846
15847 if (my_refs == ref_count) {
15848 extended->share_mode = SM_PRIVATE_ALIASED;
15849 } else if (my_refs > 1) {
15850 extended->share_mode = SM_SHARED_ALIASED;
15851 }
15852 }
15853 }
15854
15855
15856 /* object is locked on entry and locked on return */
15857
15858
15859 static void
15860 vm_map_region_look_for_page(
15861 __unused vm_map_t map,
15862 __unused vm_map_offset_t va,
15863 vm_object_t object,
15864 vm_object_offset_t offset,
15865 int max_refcnt,
15866 unsigned short depth,
15867 vm_region_extended_info_t extended,
15868 mach_msg_type_number_t count)
15869 {
15870 vm_page_t p;
15871 vm_object_t shadow;
15872 int ref_count;
15873 vm_object_t caller_object;
15874
15875 shadow = object->shadow;
15876 caller_object = object;
15877
15878
15879 while (TRUE) {
15880 if (!(object->internal)) {
15881 extended->external_pager = 1;
15882 }
15883
15884 if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
15885 if (shadow && (max_refcnt == 1)) {
15886 extended->pages_shared_now_private++;
15887 }
15888
15889 if (!p->vmp_fictitious &&
15890 (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
15891 extended->pages_dirtied++;
15892 } else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
15893 if (p->vmp_reusable || object->all_reusable) {
15894 extended->pages_reusable++;
15895 }
15896 }
15897
15898 extended->pages_resident++;
15899
15900 if (object != caller_object) {
15901 vm_object_unlock(object);
15902 }
15903
15904 return;
15905 }
15906 if (object->internal &&
15907 object->alive &&
15908 !object->terminating &&
15909 object->pager_ready) {
15910 if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset)
15911 == VM_EXTERNAL_STATE_EXISTS) {
15912 /* the pager has that page */
15913 extended->pages_swapped_out++;
15914 if (object != caller_object) {
15915 vm_object_unlock(object);
15916 }
15917 return;
15918 }
15919 }
15920
15921 if (shadow) {
15922 vm_object_lock(shadow);
15923
15924 if ((ref_count = shadow->ref_count) > 1 && shadow->paging_in_progress) {
15925 ref_count--;
15926 }
15927
15928 if (++depth > extended->shadow_depth) {
15929 extended->shadow_depth = depth;
15930 }
15931
15932 if (ref_count > max_refcnt) {
15933 max_refcnt = ref_count;
15934 }
15935
15936 if (object != caller_object) {
15937 vm_object_unlock(object);
15938 }
15939
15940 offset = offset + object->vo_shadow_offset;
15941 object = shadow;
15942 shadow = object->shadow;
15943 continue;
15944 }
15945 if (object != caller_object) {
15946 vm_object_unlock(object);
15947 }
15948 break;
15949 }
15950 }
15951
15952 static int
15953 vm_map_region_count_obj_refs(
15954 vm_map_entry_t entry,
15955 vm_object_t object)
15956 {
15957 int ref_count;
15958 vm_object_t chk_obj;
15959 vm_object_t tmp_obj;
15960
15961 if (entry->is_sub_map || VME_OBJECT(entry) == VM_OBJECT_NULL) {
15962 return 0;
15963 }
15964
15965 ref_count = 0;
15966 chk_obj = VME_OBJECT(entry);
15967 vm_object_lock(chk_obj);
15968
15969 while (chk_obj) {
15970 if (chk_obj == object) {
15971 ref_count++;
15972 }
15973 tmp_obj = chk_obj->shadow;
15974 if (tmp_obj) {
15975 vm_object_lock(tmp_obj);
15976 }
15977 vm_object_unlock(chk_obj);
15978
15979 chk_obj = tmp_obj;
15980 }
15981
15982 return ref_count;
15983 }
15984
15985
15986 /*
15987 * Routine: vm_map_simplify
15988 *
15989 * Description:
15990 * Attempt to simplify the map representation in
15991 * the vicinity of the given starting address.
15992 * Note:
15993 * This routine is intended primarily to keep the
15994 * kernel maps more compact -- they generally don't
15995 * benefit from the "expand a map entry" technology
15996 * at allocation time because the adjacent entry
15997 * is often wired down.
15998 */
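
/*
 * Illustrative example (hypothetical addresses): two adjacent entries
 * [0x1000, 0x2000) at object offset 0 and [0x2000, 0x3000) at object
 * offset 0x1000, mapping the same object with identical protection,
 * inheritance, wiring and flags, satisfy the checks below and are
 * coalesced into a single entry [0x1000, 0x3000) at offset 0; nothing
 * about the mapping changes except the number of entries.
 */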
15999 void
16000 vm_map_simplify_entry(
16001 vm_map_t map,
16002 vm_map_entry_t this_entry)
16003 {
16004 vm_map_entry_t prev_entry;
16005
16006 prev_entry = this_entry->vme_prev;
16007
16008 if ((this_entry != vm_map_to_entry(map)) &&
16009 (prev_entry != vm_map_to_entry(map)) &&
16010
16011 (prev_entry->vme_end == this_entry->vme_start) &&
16012
16013 (prev_entry->is_sub_map == this_entry->is_sub_map) &&
16014 (prev_entry->vme_object_value == this_entry->vme_object_value) &&
16015 (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
16016 ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
16017 prev_entry->vme_start))
16018 == VME_OFFSET(this_entry)) &&
16019
16020 (prev_entry->behavior == this_entry->behavior) &&
16021 (prev_entry->needs_copy == this_entry->needs_copy) &&
16022 (prev_entry->protection == this_entry->protection) &&
16023 (prev_entry->max_protection == this_entry->max_protection) &&
16024 (prev_entry->inheritance == this_entry->inheritance) &&
16025 (prev_entry->use_pmap == this_entry->use_pmap) &&
16026 (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
16027 (prev_entry->no_cache == this_entry->no_cache) &&
16028 (prev_entry->vme_permanent == this_entry->vme_permanent) &&
16029 (prev_entry->map_aligned == this_entry->map_aligned) &&
16030 (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
16031 (prev_entry->used_for_jit == this_entry->used_for_jit) &&
16032 #if __arm64e__
16033 (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
16034 #endif
16035 (prev_entry->csm_associated == this_entry->csm_associated) &&
16036 (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
16037 (prev_entry->iokit_acct == this_entry->iokit_acct) &&
16038 (prev_entry->vme_resilient_codesign ==
16039 this_entry->vme_resilient_codesign) &&
16040 (prev_entry->vme_resilient_media ==
16041 this_entry->vme_resilient_media) &&
16042 (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
16043 (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&
16044
16045 (prev_entry->wired_count == this_entry->wired_count) &&
16046 (prev_entry->user_wired_count == this_entry->user_wired_count) &&
16047
16048 ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
16049 (prev_entry->in_transition == FALSE) &&
16050 (this_entry->in_transition == FALSE) &&
16051 (prev_entry->needs_wakeup == FALSE) &&
16052 (this_entry->needs_wakeup == FALSE) &&
16053 (prev_entry->is_shared == this_entry->is_shared) &&
16054 (prev_entry->superpage_size == FALSE) &&
16055 (this_entry->superpage_size == FALSE)
16056 ) {
16057 if (prev_entry->vme_permanent) {
16058 assert(this_entry->vme_permanent);
16059 prev_entry->vme_permanent = false;
16060 }
16061 vm_map_store_entry_unlink(map, prev_entry, true);
16062 assert(prev_entry->vme_start < this_entry->vme_end);
16063 if (prev_entry->map_aligned) {
16064 assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
16065 VM_MAP_PAGE_MASK(map)));
16066 }
16067 this_entry->vme_start = prev_entry->vme_start;
16068 VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));
16069
16070 if (map->holelistenabled) {
16071 vm_map_store_update_first_free(map, this_entry, TRUE);
16072 }
16073
16074 if (prev_entry->is_sub_map) {
16075 vm_map_deallocate(VME_SUBMAP(prev_entry));
16076 } else {
16077 vm_object_deallocate(VME_OBJECT(prev_entry));
16078 }
16079 vm_map_entry_dispose(prev_entry);
16080 SAVE_HINT_MAP_WRITE(map, this_entry);
16081 }
16082 }
16083
16084 void
16085 vm_map_simplify(
16086 vm_map_t map,
16087 vm_map_offset_t start)
16088 {
16089 vm_map_entry_t this_entry;
16090
16091 vm_map_lock(map);
16092 if (vm_map_lookup_entry(map, start, &this_entry)) {
16093 vm_map_simplify_entry(map, this_entry);
16094 vm_map_simplify_entry(map, this_entry->vme_next);
16095 }
16096 vm_map_unlock(map);
16097 }
16098
16099 static void
16100 vm_map_simplify_range(
16101 vm_map_t map,
16102 vm_map_offset_t start,
16103 vm_map_offset_t end)
16104 {
16105 vm_map_entry_t entry;
16106
16107 /*
16108 * The map should be locked (for "write") by the caller.
16109 */
16110
16111 if (start >= end) {
16112 /* invalid address range */
16113 return;
16114 }
16115
16116 start = vm_map_trunc_page(start,
16117 VM_MAP_PAGE_MASK(map));
16118 end = vm_map_round_page(end,
16119 VM_MAP_PAGE_MASK(map));
16120
16121 if (!vm_map_lookup_entry(map, start, &entry)) {
16122 /* "start" is not mapped and "entry" ends before "start" */
16123 if (entry == vm_map_to_entry(map)) {
16124 /* start with first entry in the map */
16125 entry = vm_map_first_entry(map);
16126 } else {
16127 /* start with next entry */
16128 entry = entry->vme_next;
16129 }
16130 }
16131
16132 while (entry != vm_map_to_entry(map) &&
16133 entry->vme_start <= end) {
16134 /* try and coalesce "entry" with its previous entry */
16135 vm_map_simplify_entry(map, entry);
16136 entry = entry->vme_next;
16137 }
16138 }
16139
16140
16141 /*
16142 * Routine: vm_map_machine_attribute
16143 * Purpose:
16144 * Provide machine-specific attributes to mappings,
16145 * such as cachability etc. for machines that provide
16146 * them. NUMA architectures and machines with big/strange
16147 * caches will use this.
16148 * Note:
16149 * Responsibilities for locking and checking are handled here,
16150 * everything else in the pmap module. If any non-volatile
16151 * information must be kept, the pmap module should handle
16152 * it itself. [This assumes that attributes do not
16153 * need to be inherited, which seems ok to me]
16154 */
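
/*
 * A hedged user-space sketch (not part of this file): this is typically
 * reached via the vm_machine_attribute() MIG call, e.g. to request a
 * cache flush over a range; "addr" and "size" are assumed to describe
 * an existing mapping:
 *
 *	vm_machine_attribute_val_t value = MATTR_VAL_CACHE_FLUSH;
 *	kern_return_t kr = vm_machine_attribute(mach_task_self(),
 *	    addr, size, MATTR_CACHE, &value);
 */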
16155 kern_return_t
16156 vm_map_machine_attribute(
16157 vm_map_t map,
16158 vm_map_offset_t start,
16159 vm_map_offset_t end,
16160 vm_machine_attribute_t attribute,
16161 vm_machine_attribute_val_t* value) /* IN/OUT */
16162 {
16163 kern_return_t ret;
16164 vm_map_size_t sync_size;
16165 vm_map_entry_t entry;
16166
16167 if (start < vm_map_min(map) || end > vm_map_max(map)) {
16168 return KERN_INVALID_ADDRESS;
16169 }
16170 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16171 return KERN_INVALID_ADDRESS;
16172 }
16173
16174 /* Figure how much memory we need to flush (in page increments) */
16175 sync_size = end - start;
16176
16177 vm_map_lock(map);
16178
16179 if (attribute != MATTR_CACHE) {
16180 /* If we don't have to find physical addresses, we */
16181 /* don't have to do an explicit traversal here. */
16182 ret = pmap_attribute(map->pmap, start, end - start,
16183 attribute, value);
16184 vm_map_unlock(map);
16185 return ret;
16186 }
16187
16188 ret = KERN_SUCCESS; /* Assume it all worked */
16189
16190 while (sync_size) {
16191 if (vm_map_lookup_entry(map, start, &entry)) {
16192 vm_map_size_t sub_size;
16193 if ((entry->vme_end - start) > sync_size) {
16194 sub_size = sync_size;
16195 sync_size = 0;
16196 } else {
16197 sub_size = entry->vme_end - start;
16198 sync_size -= sub_size;
16199 }
16200 if (entry->is_sub_map) {
16201 vm_map_offset_t sub_start;
16202 vm_map_offset_t sub_end;
16203
16204 sub_start = (start - entry->vme_start)
16205 + VME_OFFSET(entry);
16206 sub_end = sub_start + sub_size;
16207 vm_map_machine_attribute(
16208 VME_SUBMAP(entry),
16209 sub_start,
16210 sub_end,
16211 attribute, value);
16212 } else if (VME_OBJECT(entry)) {
16213 vm_page_t m;
16214 vm_object_t object;
16215 vm_object_t base_object;
16216 vm_object_t last_object;
16217 vm_object_offset_t offset;
16218 vm_object_offset_t base_offset;
16219 vm_map_size_t range;
16220 range = sub_size;
16221 offset = (start - entry->vme_start)
16222 + VME_OFFSET(entry);
16223 offset = vm_object_trunc_page(offset);
16224 base_offset = offset;
16225 object = VME_OBJECT(entry);
16226 base_object = object;
16227 last_object = NULL;
16228
16229 vm_object_lock(object);
16230
16231 while (range) {
16232 m = vm_page_lookup(
16233 object, offset);
16234
16235 if (m && !m->vmp_fictitious) {
16236 ret =
16237 pmap_attribute_cache_sync(
16238 VM_PAGE_GET_PHYS_PAGE(m),
16239 PAGE_SIZE,
16240 attribute, value);
16241 } else if (object->shadow) {
16242 offset = offset + object->vo_shadow_offset;
16243 last_object = object;
16244 object = object->shadow;
16245 vm_object_lock(last_object->shadow);
16246 vm_object_unlock(last_object);
16247 continue;
16248 }
16249 if (range < PAGE_SIZE) {
16250 range = 0;
16251 } else {
16252 range -= PAGE_SIZE;
16253 }
16254
16255 if (base_object != object) {
16256 vm_object_unlock(object);
16257 vm_object_lock(base_object);
16258 object = base_object;
16259 }
16260 /* Bump to the next page */
16261 base_offset += PAGE_SIZE;
16262 offset = base_offset;
16263 }
16264 vm_object_unlock(object);
16265 }
16266 start += sub_size;
16267 } else {
16268 vm_map_unlock(map);
16269 return KERN_FAILURE;
16270 }
16271 }
16272
16273 vm_map_unlock(map);
16274
16275 return ret;
16276 }
16277
16278 /*
16279 * vm_map_behavior_set:
16280 *
16281 * Sets the paging reference behavior of the specified address
16282 * range in the target map. Paging reference behavior affects
16283 * how pagein operations resulting from faults on the map will be
16284 * clustered.
16285 */
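
/*
 * A hedged user-space sketch (not part of this file): these behaviors
 * are normally set through madvise(2), which the BSD layer translates
 * into the VM_BEHAVIOR_* values handled below:
 *
 *	// Hint that "buf" will be read front to back, so fault
 *	// clustering can read ahead more aggressively.
 *	if (madvise(buf, len, MADV_SEQUENTIAL) != 0) {
 *		perror("madvise");
 *	}
 */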
16286 kern_return_t
16287 vm_map_behavior_set(
16288 vm_map_t map,
16289 vm_map_offset_t start,
16290 vm_map_offset_t end,
16291 vm_behavior_t new_behavior)
16292 {
16293 vm_map_entry_t entry;
16294 vm_map_entry_t temp_entry;
16295
16296 if (start > end ||
16297 start < vm_map_min(map) ||
16298 end > vm_map_max(map)) {
16299 return KERN_NO_SPACE;
16300 }
16301 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16302 return KERN_INVALID_ADDRESS;
16303 }
16304
16305 switch (new_behavior) {
16306 /*
16307 * This first block of behaviors all set a persistent state on the specified
16308 * memory range. All we have to do here is to record the desired behavior
16309 * in the vm_map_entry_t's.
16310 */
16311
16312 case VM_BEHAVIOR_DEFAULT:
16313 case VM_BEHAVIOR_RANDOM:
16314 case VM_BEHAVIOR_SEQUENTIAL:
16315 case VM_BEHAVIOR_RSEQNTL:
16316 case VM_BEHAVIOR_ZERO_WIRED_PAGES:
16317 vm_map_lock(map);
16318
16319 /*
16320 * The entire address range must be valid for the map.
16321 * Note that vm_map_range_check() does a
16322 * vm_map_lookup_entry() internally and returns the
16323 * entry containing the start of the address range if
16324 * the entire range is valid.
16325 */
16326 if (vm_map_range_check(map, start, end, &temp_entry)) {
16327 entry = temp_entry;
16328 vm_map_clip_start(map, entry, start);
16329 } else {
16330 vm_map_unlock(map);
16331 return KERN_INVALID_ADDRESS;
16332 }
16333
16334 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
16335 vm_map_clip_end(map, entry, end);
16336 if (entry->is_sub_map) {
16337 assert(!entry->use_pmap);
16338 }
16339
16340 if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16341 entry->zero_wired_pages = TRUE;
16342 } else {
16343 entry->behavior = new_behavior;
16344 }
16345 entry = entry->vme_next;
16346 }
16347
16348 vm_map_unlock(map);
16349 break;
16350
16351 /*
16352 * The rest of these are different from the above in that they cause
16353 * an immediate action to take place as opposed to setting a behavior that
16354 * affects future actions.
16355 */
16356
16357 case VM_BEHAVIOR_WILLNEED:
16358 return vm_map_willneed(map, start, end);
16359
16360 case VM_BEHAVIOR_DONTNEED:
16361 return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
16362
16363 case VM_BEHAVIOR_FREE:
16364 return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
16365
16366 case VM_BEHAVIOR_REUSABLE:
16367 return vm_map_reusable_pages(map, start, end);
16368
16369 case VM_BEHAVIOR_REUSE:
16370 return vm_map_reuse_pages(map, start, end);
16371
16372 case VM_BEHAVIOR_CAN_REUSE:
16373 return vm_map_can_reuse(map, start, end);
16374
16375 #if MACH_ASSERT
16376 case VM_BEHAVIOR_PAGEOUT:
16377 return vm_map_pageout(map, start, end);
16378 #endif /* MACH_ASSERT */
16379
16380 case VM_BEHAVIOR_ZERO:
16381 return vm_map_zero(map, start, end);
16382
16383 default:
16384 return KERN_INVALID_ARGUMENT;
16385 }
16386
16387 return KERN_SUCCESS;
16388 }
16389
16390
16391 /*
16392 * Internals for madvise(MADV_WILLNEED) system call.
16393 *
16394 * The implementation is to:
16395 * a) read ahead if the mapping corresponds to a mapped regular file;
16396 * b) otherwise, fault in the pages (zero-fill, decompress, etc.) for an anonymous mapping.
16397 */
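
/*
 * A hedged user-space sketch (not part of this file), assuming "addr"
 * and "len" describe an existing mapping:
 *
 *	if (madvise(addr, len, MADV_WILLNEED) != 0) {
 *		perror("madvise(MADV_WILLNEED)");
 *	}
 *	// For a file mapping this starts asynchronous read-ahead; for
 *	// anonymous memory the pages are faulted in (zero-filled or
 *	// decompressed) ahead of first touch.
 */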
16398
16399
16400 static kern_return_t
16401 vm_map_willneed(
16402 vm_map_t map,
16403 vm_map_offset_t start,
16404 vm_map_offset_t end
16405 )
16406 {
16407 vm_map_entry_t entry;
16408 vm_object_t object;
16409 memory_object_t pager;
16410 struct vm_object_fault_info fault_info = {};
16411 kern_return_t kr;
16412 vm_object_size_t len;
16413 vm_object_offset_t offset;
16414
16415 fault_info.interruptible = THREAD_UNINT; /* ignored value */
16416 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
16417 fault_info.stealth = TRUE;
16418
16419 /*
16420 * The MADV_WILLNEED operation doesn't require any changes to the
16421 * vm_map_entry_t's, so the read lock is sufficient.
16422 */
16423
16424 vm_map_lock_read(map);
16425
16426 /*
16427 * The madvise semantics require that the address range be fully
16428 * allocated with no holes. Otherwise, we're required to return
16429 * an error.
16430 */
16431
16432 if (!vm_map_range_check(map, start, end, &entry)) {
16433 vm_map_unlock_read(map);
16434 return KERN_INVALID_ADDRESS;
16435 }
16436
16437 /*
16438 * Examine each vm_map_entry_t in the range.
16439 */
16440 for (; entry != vm_map_to_entry(map) && start < end;) {
16441 /*
16442 * The first time through, the start address could be anywhere
16443 * within the vm_map_entry we found. So adjust the offset to
16444 * correspond. After that, the offset will always be zero to
16445 * correspond to the beginning of the current vm_map_entry.
16446 */
16447 offset = (start - entry->vme_start) + VME_OFFSET(entry);
16448
16449 /*
16450 * Set the length so we don't go beyond the end of the
16451 * map_entry or beyond the end of the range we were given.
16452 * This range could also span multiple map entries, all of which
16453 * map different files, so make sure we only do the right amount
16454 * of I/O for each object. Note that it's possible for there
16455 * to be multiple map entries all referring to the same object
16456 * but with different page permissions, but it's not worth
16457 * trying to optimize that case.
16458 */
16459 len = MIN(entry->vme_end - start, end - start);
16460
16461 if ((vm_size_t) len != len) {
16462 /* 32-bit overflow */
16463 len = (vm_size_t) (0 - PAGE_SIZE);
16464 }
16465 fault_info.cluster_size = (vm_size_t) len;
16466 fault_info.lo_offset = offset;
16467 fault_info.hi_offset = offset + len;
16468 fault_info.user_tag = VME_ALIAS(entry);
16469 fault_info.pmap_options = 0;
16470 if (entry->iokit_acct ||
16471 (!entry->is_sub_map && !entry->use_pmap)) {
16472 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
16473 }
16474 fault_info.fi_xnu_user_debug = entry->vme_xnu_user_debug;
16475
16476 /*
16477 * If the entry is a submap OR there's no read permission
16478 * to this mapping, then just skip it.
16479 */
16480 if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
16481 entry = entry->vme_next;
16482 start = entry->vme_start;
16483 continue;
16484 }
16485
16486 object = VME_OBJECT(entry);
16487
16488 if (object == NULL ||
16489 (object && object->internal)) {
16490 /*
16491 * Memory range backed by anonymous memory.
16492 */
16493 vm_size_t region_size = 0, effective_page_size = 0;
16494 vm_map_offset_t addr = 0, effective_page_mask = 0;
16495
16496 region_size = len;
16497 addr = start;
16498
16499 effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
16500 effective_page_size = effective_page_mask + 1;
16501
16502 vm_map_unlock_read(map);
16503
16504 while (region_size) {
16505 vm_pre_fault(
16506 vm_map_trunc_page(addr, effective_page_mask),
16507 VM_PROT_READ | VM_PROT_WRITE);
16508
16509 region_size -= effective_page_size;
16510 addr += effective_page_size;
16511 }
16512 } else {
16513 /*
16514 * Find the file object backing this map entry. If there is
16515 * none, then we simply ignore the "will need" advice for this
16516 * entry and go on to the next one.
16517 */
16518 if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
16519 entry = entry->vme_next;
16520 start = entry->vme_start;
16521 continue;
16522 }
16523
16524 vm_object_paging_begin(object);
16525 pager = object->pager;
16526 vm_object_unlock(object);
16527
16528 /*
16529 * The data_request() could take a long time, so let's
16530 * release the map lock to avoid blocking other threads.
16531 */
16532 vm_map_unlock_read(map);
16533
16534 /*
16535 * Get the data from the object asynchronously.
16536 *
16537 * Note that memory_object_data_request() places limits on the
16538 * amount of I/O it will do. Regardless of the len we
16539 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
16540 * silently truncates the len to that size. This isn't
16541 * necessarily bad since madvise shouldn't really be used to
16542 * page in unlimited amounts of data. Other Unix variants
16543 * limit the willneed case as well. If this turns out to be an
16544 * issue for developers, then we can always adjust the policy
16545 * here and still be backwards compatible since this is all
16546 * just "advice".
16547 */
16548 kr = memory_object_data_request(
16549 pager,
16550 vm_object_trunc_page(offset) + object->paging_offset,
16551 0, /* ignored */
16552 VM_PROT_READ,
16553 (memory_object_fault_info_t)&fault_info);
16554
16555 vm_object_lock(object);
16556 vm_object_paging_end(object);
16557 vm_object_unlock(object);
16558
16559 /*
16560 * If we couldn't do the I/O for some reason, just give up on
16561 * the madvise. We still return success to the user since
16562 * madvise isn't supposed to fail when the advice can't be
16563 * taken.
16564 */
16565
16566 if (kr != KERN_SUCCESS) {
16567 return KERN_SUCCESS;
16568 }
16569 }
16570
16571 start += len;
16572 if (start >= end) {
16573 /* done */
16574 return KERN_SUCCESS;
16575 }
16576
16577 /* look up next entry */
16578 vm_map_lock_read(map);
16579 if (!vm_map_lookup_entry(map, start, &entry)) {
16580 /*
16581 * There's a new hole in the address range.
16582 */
16583 vm_map_unlock_read(map);
16584 return KERN_INVALID_ADDRESS;
16585 }
16586 }
16587
16588 vm_map_unlock_read(map);
16589 return KERN_SUCCESS;
16590 }
16591
16592 static boolean_t
16593 vm_map_entry_is_reusable(
16594 vm_map_entry_t entry)
16595 {
16596 /* Only user map entries */
16597
16598 vm_object_t object;
16599
16600 if (entry->is_sub_map) {
16601 return FALSE;
16602 }
16603
16604 switch (VME_ALIAS(entry)) {
16605 case VM_MEMORY_MALLOC:
16606 case VM_MEMORY_MALLOC_SMALL:
16607 case VM_MEMORY_MALLOC_LARGE:
16608 case VM_MEMORY_REALLOC:
16609 case VM_MEMORY_MALLOC_TINY:
16610 case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16611 case VM_MEMORY_MALLOC_LARGE_REUSED:
16612 /*
16613 * This is a malloc() memory region: check if it's still
16614 * in its original state and can be re-used for more
16615 * malloc() allocations.
16616 */
16617 break;
16618 default:
16619 /*
16620 * Not a malloc() memory region: let the caller decide if
16621 * it's re-usable.
16622 */
16623 return TRUE;
16624 }
16625
16626 if (/*entry->is_shared ||*/
16627 entry->is_sub_map ||
16628 entry->in_transition ||
16629 entry->protection != VM_PROT_DEFAULT ||
16630 entry->max_protection != VM_PROT_ALL ||
16631 entry->inheritance != VM_INHERIT_DEFAULT ||
16632 entry->no_cache ||
16633 entry->vme_permanent ||
16634 entry->superpage_size != FALSE ||
16635 entry->zero_wired_pages ||
16636 entry->wired_count != 0 ||
16637 entry->user_wired_count != 0) {
16638 return FALSE;
16639 }
16640
16641 object = VME_OBJECT(entry);
16642 if (object == VM_OBJECT_NULL) {
16643 return TRUE;
16644 }
16645 if (
16646 #if 0
16647 /*
16648 * Let's proceed even if the VM object is potentially
16649 * shared.
16650 * We check for this later when processing the actual
16651 * VM pages, so the contents will be safe if shared.
16652 *
16653 * But we can still mark this memory region as "reusable" to
16654 * acknowledge that the caller did let us know that the memory
16655 * could be re-used and should not be penalized for holding
16656 * on to it. This allows its "resident size" to not include
16657 * the reusable range.
16658 */
16659 object->ref_count == 1 &&
16660 #endif
16661 object->vo_copy == VM_OBJECT_NULL &&
16662 object->shadow == VM_OBJECT_NULL &&
16663 object->internal &&
16664 object->purgable == VM_PURGABLE_DENY &&
16665 object->wimg_bits == VM_WIMG_USE_DEFAULT &&
16666 !object->code_signed) {
16667 return TRUE;
16668 }
16669 return FALSE;
16670 }
16671
16672 static kern_return_t
16673 vm_map_reuse_pages(
16674 vm_map_t map,
16675 vm_map_offset_t start,
16676 vm_map_offset_t end)
16677 {
16678 vm_map_entry_t entry;
16679 vm_object_t object;
16680 vm_object_offset_t start_offset, end_offset;
16681
16682 /*
16683 * The MADV_REUSE operation doesn't require any changes to the
16684 * vm_map_entry_t's, so the read lock is sufficient.
16685 */
16686
16687 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16688 /*
16689 * XXX TODO4K
16690 * need to figure out what reusable means for a
16691 * portion of a native page.
16692 */
16693 return KERN_SUCCESS;
16694 }
16695
16696 vm_map_lock_read(map);
16697 assert(map->pmap != kernel_pmap); /* protect alias access */
16698
16699 /*
16700 * The madvise semantics require that the address range be fully
16701 * allocated with no holes. Otherwise, we're required to return
16702 * an error.
16703 */
16704
16705 if (!vm_map_range_check(map, start, end, &entry)) {
16706 vm_map_unlock_read(map);
16707 vm_page_stats_reusable.reuse_pages_failure++;
16708 return KERN_INVALID_ADDRESS;
16709 }
16710
16711 /*
16712 * Examine each vm_map_entry_t in the range.
16713 */
16714 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16715 entry = entry->vme_next) {
16716 /*
16717 * Sanity check on the VM map entry.
16718 */
16719 if (!vm_map_entry_is_reusable(entry)) {
16720 vm_map_unlock_read(map);
16721 vm_page_stats_reusable.reuse_pages_failure++;
16722 return KERN_INVALID_ADDRESS;
16723 }
16724
16725 /*
16726 * The first time through, the start address could be anywhere
16727 * within the vm_map_entry we found. So adjust the offset to
16728 * correspond.
16729 */
16730 if (entry->vme_start < start) {
16731 start_offset = start - entry->vme_start;
16732 } else {
16733 start_offset = 0;
16734 }
16735 end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16736 start_offset += VME_OFFSET(entry);
16737 end_offset += VME_OFFSET(entry);
16738
16739 object = VME_OBJECT(entry);
16740 if (object != VM_OBJECT_NULL) {
16741 vm_object_lock(object);
16742 vm_object_reuse_pages(object, start_offset, end_offset,
16743 TRUE);
16744 vm_object_unlock(object);
16745 }
16746
16747 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
16748 /*
16749 * XXX
16750 * We do not hold the VM map exclusively here.
16751 * The "alias" field is not that critical, so it's
16752 * safe to update it here, as long as it is the only
16753 * one that can be modified while holding the VM map
16754 * "shared".
16755 */
16756 VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
16757 }
16758 }
16759
16760 vm_map_unlock_read(map);
16761 vm_page_stats_reusable.reuse_pages_success++;
16762 return KERN_SUCCESS;
16763 }
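
/*
 * Illustrative sketch (not part of the build), assuming Darwin's
 * MADV_FREE_REUSE advice is what lands here: user space re-activates
 * a range it previously marked reusable before touching it again.
 */
#if 0
#include <sys/mman.h>
#include <stddef.h>

static int
reuse_range(void *addr, size_t len)
{
	return madvise(addr, len, MADV_FREE_REUSE);
}
#endif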
16764
16765
16766 static kern_return_t
16767 vm_map_reusable_pages(
16768 vm_map_t map,
16769 vm_map_offset_t start,
16770 vm_map_offset_t end)
16771 {
16772 vm_map_entry_t entry;
16773 vm_object_t object;
16774 vm_object_offset_t start_offset, end_offset;
16775 vm_map_offset_t pmap_offset;
16776
16777 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16778 /*
16779 * XXX TODO4K
16780 * need to figure out what reusable means for a portion
16781 * of a native page.
16782 */
16783 return KERN_SUCCESS;
16784 }
16785
16786 /*
16787 * The MADV_REUSABLE operation doesn't require any changes to the
16788 * vm_map_entry_t's, so the read lock is sufficient.
16789 */
16790
16791 vm_map_lock_read(map);
16792 assert(map->pmap != kernel_pmap); /* protect alias access */
16793
16794 /*
16795 * The madvise semantics require that the address range be fully
16796 * allocated with no holes. Otherwise, we're required to return
16797 * an error.
16798 */
16799
16800 if (!vm_map_range_check(map, start, end, &entry)) {
16801 vm_map_unlock_read(map);
16802 vm_page_stats_reusable.reusable_pages_failure++;
16803 return KERN_INVALID_ADDRESS;
16804 }
16805
16806 /*
16807 * Examine each vm_map_entry_t in the range.
16808 */
16809 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16810 entry = entry->vme_next) {
16811 int kill_pages = 0;
16812 boolean_t reusable_no_write = FALSE;
16813
16814 /*
16815 * Sanity check on the VM map entry.
16816 */
16817 if (!vm_map_entry_is_reusable(entry)) {
16818 vm_map_unlock_read(map);
16819 vm_page_stats_reusable.reusable_pages_failure++;
16820 return KERN_INVALID_ADDRESS;
16821 }
16822
16823 if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
16824 #if __arm64e__
16825 && !entry->used_for_tpro
16826 #endif
16827 ) {
16828 /* not writable: can't discard contents */
16829 vm_map_unlock_read(map);
16830 vm_page_stats_reusable.reusable_nonwritable++;
16831 vm_page_stats_reusable.reusable_pages_failure++;
16832 return KERN_PROTECTION_FAILURE;
16833 }
16834
16835 /*
16836 * The first time through, the start address could be anywhere
16837 * within the vm_map_entry we found. So adjust the offset to
16838 * correspond.
16839 */
16840 if (entry->vme_start < start) {
16841 start_offset = start - entry->vme_start;
16842 pmap_offset = start;
16843 } else {
16844 start_offset = 0;
16845 pmap_offset = entry->vme_start;
16846 }
16847 end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16848 start_offset += VME_OFFSET(entry);
16849 end_offset += VME_OFFSET(entry);
16850
16851 object = VME_OBJECT(entry);
16852 if (object == VM_OBJECT_NULL) {
16853 continue;
16854 }
16855
16856 if (entry->protection & VM_PROT_EXECUTE) {
16857 /*
16858 * Executable mappings might be write-protected by
16859 * hardware, so do not attempt to write to these pages.
16860 */
16861 reusable_no_write = TRUE;
16862 }
16863
16864 if (entry->vme_xnu_user_debug) {
16865 /*
16866 * User debug pages might be write-protected by hardware,
16867 * so do not attempt to write to these pages.
16868 */
16869 reusable_no_write = TRUE;
16870 }
16871
16872 vm_object_lock(object);
16873 if (((object->ref_count == 1) ||
16874 (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
16875 object->vo_copy == VM_OBJECT_NULL)) &&
16876 object->shadow == VM_OBJECT_NULL &&
16877 /*
16878 * "iokit_acct" entries are billed for their virtual size
16879 * (rather than for their resident pages only), so they
16880 * wouldn't benefit from making pages reusable, and it
16881 * would be hard to keep track of pages that are both
16882 * "iokit_acct" and "reusable" in the pmap stats and
16883 * ledgers.
16884 */
16885 !(entry->iokit_acct ||
16886 (!entry->is_sub_map && !entry->use_pmap))) {
16887 if (object->ref_count != 1) {
16888 vm_page_stats_reusable.reusable_shared++;
16889 }
16890 kill_pages = 1;
16891 } else {
16892 kill_pages = -1;
16893 }
16894 if (kill_pages != -1) {
16895 vm_object_deactivate_pages(object,
16896 start_offset,
16897 end_offset - start_offset,
16898 kill_pages,
16899 TRUE /*reusable_pages*/,
16900 reusable_no_write,
16901 map->pmap,
16902 pmap_offset);
16903 } else {
16904 vm_page_stats_reusable.reusable_pages_shared++;
16905 DTRACE_VM4(vm_map_reusable_pages_shared,
16906 unsigned int, VME_ALIAS(entry),
16907 vm_map_t, map,
16908 vm_map_entry_t, entry,
16909 vm_object_t, object);
16910 }
16911 vm_object_unlock(object);
16912
16913 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
16914 VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
16915 /*
16916 * XXX
16917 * We do not hold the VM map exclusively here.
16918 * The "alias" field is not that critical, so it's
16919 * safe to update it here, as long as it is the only
16920 * one that can be modified while holding the VM map
16921 * "shared".
16922 */
16923 VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
16924 }
16925 }
16926
16927 vm_map_unlock_read(map);
16928 vm_page_stats_reusable.reusable_pages_success++;
16929 return KERN_SUCCESS;
16930 }
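
/*
 * Illustrative sketch (not part of the build), assuming Darwin's
 * MADV_FREE_REUSABLE advice is what lands here: a malloc-style
 * allocator can hand idle pages back to the VM without unmapping
 * them, keeping the range valid while removing it from the task's
 * resident size.
 */
#if 0
#include <sys/mman.h>
#include <stddef.h>

static int
mark_reusable(void *addr, size_t len)
{
	/* contents may be discarded; call madvise(MADV_FREE_REUSE)
	 * before reading or writing the range again */
	return madvise(addr, len, MADV_FREE_REUSABLE);
}
#endif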
16931
16932
16933 static kern_return_t
16934 vm_map_can_reuse(
16935 vm_map_t map,
16936 vm_map_offset_t start,
16937 vm_map_offset_t end)
16938 {
16939 vm_map_entry_t entry;
16940
16941 /*
16942 * The MADV_CAN_REUSE operation doesn't require any changes to the
16943 * vm_map_entry_t's, so the read lock is sufficient.
16944 */
16945
16946 vm_map_lock_read(map);
16947 assert(map->pmap != kernel_pmap); /* protect alias access */
16948
16949 /*
16950 * The madvise semantics require that the address range be fully
16951 * allocated with no holes. Otherwise, we're required to return
16952 * an error.
16953 */
16954
16955 if (!vm_map_range_check(map, start, end, &entry)) {
16956 vm_map_unlock_read(map);
16957 vm_page_stats_reusable.can_reuse_failure++;
16958 return KERN_INVALID_ADDRESS;
16959 }
16960
16961 /*
16962 * Examine each vm_map_entry_t in the range.
16963 */
16964 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16965 entry = entry->vme_next) {
16966 /*
16967 * Sanity check on the VM map entry.
16968 */
16969 if (!vm_map_entry_is_reusable(entry)) {
16970 vm_map_unlock_read(map);
16971 vm_page_stats_reusable.can_reuse_failure++;
16972 return KERN_INVALID_ADDRESS;
16973 }
16974 }
16975
16976 vm_map_unlock_read(map);
16977 vm_page_stats_reusable.can_reuse_success++;
16978 return KERN_SUCCESS;
16979 }
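
/*
 * Illustrative sketch (not part of the build), assuming Darwin's
 * MADV_CAN_REUSE advice maps to the check above: probe whether a
 * range would pass the reusability checks without modifying it.
 */
#if 0
#include <sys/mman.h>
#include <stdbool.h>
#include <stddef.h>

static bool
range_is_reusable_candidate(void *addr, size_t len)
{
	return madvise(addr, len, MADV_CAN_REUSE) == 0;
}
#endif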
16980
16981
16982 #if MACH_ASSERT
16983 static kern_return_t
16984 vm_map_pageout(
16985 vm_map_t map,
16986 vm_map_offset_t start,
16987 vm_map_offset_t end)
16988 {
16989 vm_map_entry_t entry;
16990
16991 /*
16992 * The MADV_PAGEOUT operation doesn't require any changes to the
16993 * vm_map_entry_t's, so the read lock is sufficient.
16994 */
16995
16996 vm_map_lock_read(map);
16997
16998 /*
16999 * The madvise semantics require that the address range be fully
17000 * allocated with no holes. Otherwise, we're required to return
17001 * an error.
17002 */
17003
17004 if (!vm_map_range_check(map, start, end, &entry)) {
17005 vm_map_unlock_read(map);
17006 return KERN_INVALID_ADDRESS;
17007 }
17008
17009 /*
17010 * Examine each vm_map_entry_t in the range.
17011 */
17012 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17013 entry = entry->vme_next) {
17014 vm_object_t object;
17015
17016 /*
17017 * Sanity check on the VM map entry.
17018 */
17019 if (entry->is_sub_map) {
17020 vm_map_t submap;
17021 vm_map_offset_t submap_start;
17022 vm_map_offset_t submap_end;
17023 vm_map_entry_t submap_entry;
17024
17025 submap = VME_SUBMAP(entry);
17026 submap_start = VME_OFFSET(entry);
17027 submap_end = submap_start + (entry->vme_end -
17028 entry->vme_start);
17029
17030 vm_map_lock_read(submap);
17031
17032 if (!vm_map_range_check(submap,
17033 submap_start,
17034 submap_end,
17035 &submap_entry)) {
17036 vm_map_unlock_read(submap);
17037 vm_map_unlock_read(map);
17038 return KERN_INVALID_ADDRESS;
17039 }
17040
17041 if (submap_entry->is_sub_map) {
17042 vm_map_unlock_read(submap);
17043 continue;
17044 }
17045
17046 object = VME_OBJECT(submap_entry);
17047 if (object == VM_OBJECT_NULL || !object->internal) {
17048 vm_map_unlock_read(submap);
17049 continue;
17050 }
17051
17052 vm_object_pageout(object);
17053
17054 vm_map_unlock_read(submap);
17055 submap = VM_MAP_NULL;
17056 submap_entry = VM_MAP_ENTRY_NULL;
17057 continue;
17058 }
17059
17060 object = VME_OBJECT(entry);
17061 if (object == VM_OBJECT_NULL || !object->internal) {
17062 continue;
17063 }
17064
17065 vm_object_pageout(object);
17066 }
17067
17068 vm_map_unlock_read(map);
17069 return KERN_SUCCESS;
17070 }
17071 #endif /* MACH_ASSERT */
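
/*
 * Illustrative sketch (not part of the build), assuming Darwin's
 * MADV_PAGEOUT advice is what reaches vm_map_pageout(): note that
 * the kernel-side support above is only compiled under MACH_ASSERT,
 * so the advice may be rejected on release kernels.
 */
#if 0
#include <sys/mman.h>
#include <stddef.h>

static int
pageout_range(void *addr, size_t len)
{
	return madvise(addr, len, MADV_PAGEOUT);
}
#endif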
17072
17073 /*
17074 * This function determines if the zero operation can be run on the
17075 * respective entry. Additional checks on the object are in
17076 * vm_object_zero_preflight.
17077 */
17078 static kern_return_t
17079 vm_map_zero_entry_preflight(vm_map_entry_t entry)
17080 {
17081 /*
17082 * Zeroing is restricted to writable non-executable entries and non-JIT
17083 * regions.
17084 */
17085 if (!(entry->protection & VM_PROT_WRITE) ||
17086 (entry->protection & VM_PROT_EXECUTE) ||
17087 entry->used_for_jit ||
17088 entry->vme_xnu_user_debug) {
17089 return KERN_PROTECTION_FAILURE;
17090 }
17091
17092 /*
17093 * Zeroing for copy on write isn't yet supported. Zeroing is also not
17094 * allowed for submaps.
17095 */
17096 if (entry->needs_copy || entry->is_sub_map) {
17097 return KERN_NO_ACCESS;
17098 }
17099
17100 return KERN_SUCCESS;
17101 }
17102
17103 /*
17104 * This function translates entry's start and end to offsets in the object
17105 */
17106 static void
17107 vm_map_get_bounds_in_object(
17108 vm_map_entry_t entry,
17109 vm_map_offset_t start,
17110 vm_map_offset_t end,
17111 vm_map_offset_t *start_offset,
17112 vm_map_offset_t *end_offset)
17113 {
17114 if (entry->vme_start < start) {
17115 *start_offset = start - entry->vme_start;
17116 } else {
17117 *start_offset = 0;
17118 }
17119 *end_offset = MIN(end, entry->vme_end) - entry->vme_start;
17120 *start_offset += VME_OFFSET(entry);
17121 *end_offset += VME_OFFSET(entry);
17122 }
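
/*
 * Worked example with hypothetical values: for an entry with
 * vme_start = 0x5000, vme_end = 0x9000 and VME_OFFSET = 0x2000,
 * clipping the request [start = 0x6000, end = 0x8000) yields
 *   *start_offset = (0x6000 - 0x5000) + 0x2000 = 0x3000
 *   *end_offset   = (0x8000 - 0x5000) + 0x2000 = 0x5000
 * i.e. object offsets [0x3000, 0x5000) back the requested range.
 */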
17123
17124 /*
17125 * This function iterates through the entries in the requested range
17126 * and zeroes any resident pages in the corresponding objects. Compressed
17127 * pages are dropped instead of being faulted in and zeroed.
17128 */
17129 static kern_return_t
17130 vm_map_zero(
17131 vm_map_t map,
17132 vm_map_offset_t start,
17133 vm_map_offset_t end)
17134 {
17135 vm_map_entry_t entry;
17136 vm_map_offset_t cur = start;
17137 kern_return_t ret;
17138
17139 /*
17140 * This operation isn't supported where the map page size is less than
17141 * the hardware page size. Caller will need to handle error and
17142 * explicitly zero memory if needed.
17143 */
17144 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17145 return KERN_NO_ACCESS;
17146 }
17147
17148 /*
17149 * The MADV_ZERO operation doesn't require any changes to the
17150 * vm_map_entry_t's, so the read lock is sufficient.
17151 */
17152 vm_map_lock_read(map);
17153 assert(map->pmap != kernel_pmap); /* protect alias access */
17154
17155 /*
17156 * The madvise semantics require that the address range be fully
17157 * allocated with no holes. Otherwise, we're required to return
17158 * an error. This check needs to be redone if the map has changed.
17159 */
17160 if (!vm_map_range_check(map, cur, end, &entry)) {
17161 vm_map_unlock_read(map);
17162 return KERN_INVALID_ADDRESS;
17163 }
17164
17165 /*
17166 * Examine each vm_map_entry_t in the range.
17167 */
17168 while (entry != vm_map_to_entry(map) && entry->vme_start < end) {
17169 vm_map_offset_t cur_offset;
17170 vm_map_offset_t end_offset;
17171 unsigned int last_timestamp = map->timestamp;
17172 vm_object_t object = VME_OBJECT(entry);
17173
17174 ret = vm_map_zero_entry_preflight(entry);
17175 if (ret != KERN_SUCCESS) {
17176 vm_map_unlock_read(map);
17177 return ret;
17178 }
17179
17180 if (object == VM_OBJECT_NULL) {
17181 entry = entry->vme_next;
17182 continue;
17183 }
17184
17185 vm_map_get_bounds_in_object(entry, cur, end, &cur_offset, &end_offset);
17186 vm_object_lock(object);
17187 /*
17188 * Take a reference on the object as vm_object_zero will drop the object
17189 * lock when it encounters a busy page.
17190 */
17191 vm_object_reference_locked(object);
17192 vm_map_unlock_read(map);
17193
17194 ret = vm_object_zero(object, cur_offset, end_offset);
17195 vm_object_unlock(object);
17196 vm_object_deallocate(object);
17197 if (ret != KERN_SUCCESS) {
17198 return ret;
17199 }
17200 /*
17201 * Update cur as vm_object_zero has succeeded.
17202 */
17203 cur += (end_offset - cur_offset);
17204 if (cur == end) {
17205 return KERN_SUCCESS;
17206 }
17207
17208 /*
17209 * If the map timestamp has changed, restart by relooking up cur in the
17210 * map
17211 */
17212 vm_map_lock_read(map);
17213 if (last_timestamp != map->timestamp) {
17214 /*
17215 * Relookup cur in the map
17216 */
17217 if (!vm_map_range_check(map, cur, end, &entry)) {
17218 vm_map_unlock_read(map);
17219 return KERN_INVALID_ADDRESS;
17220 }
17221 continue;
17222 }
17223 /*
17224 * If the map hasn't changed proceed with the next entry
17225 */
17226 entry = entry->vme_next;
17227 }
17228
17229 vm_map_unlock_read(map);
17230 return KERN_SUCCESS;
17231 }
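
/*
 * Illustrative sketch (not part of the build): on SDKs that define
 * it, MADV_ZERO is presumably the user-facing advice for the zeroing
 * path above; resident pages are zeroed in place and compressed
 * pages are dropped, which avoids faulting every page back in.
 */
#if 0
#include <sys/mman.h>
#include <stddef.h>

static int
zero_range(void *addr, size_t len)
{
	/* requires a writable, non-executable, non-JIT mapping; see
	 * vm_map_zero_entry_preflight() above */
	return madvise(addr, len, MADV_ZERO);
}
#endif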
17232
17233
17234 /*
17235 * Routine: vm_map_entry_insert
17236 *
17237 * Description: This routine inserts a new vm_entry in a locked map.
17238 */
17239 static vm_map_entry_t
17240 vm_map_entry_insert(
17241 vm_map_t map,
17242 vm_map_entry_t insp_entry,
17243 vm_map_offset_t start,
17244 vm_map_offset_t end,
17245 vm_object_t object,
17246 vm_object_offset_t offset,
17247 vm_map_kernel_flags_t vmk_flags,
17248 boolean_t needs_copy,
17249 vm_prot_t cur_protection,
17250 vm_prot_t max_protection,
17251 vm_inherit_t inheritance,
17252 boolean_t clear_map_aligned)
17253 {
17254 vm_map_entry_t new_entry;
17255 boolean_t map_aligned = FALSE;
17256
17257 assert(insp_entry != (vm_map_entry_t)0);
17258 vm_map_lock_assert_exclusive(map);
17259
17260 #if DEVELOPMENT || DEBUG
17261 vm_object_offset_t end_offset = 0;
17262 assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
17263 #endif /* DEVELOPMENT || DEBUG */
17264
17265 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
17266 map_aligned = TRUE;
17267 }
17268 if (clear_map_aligned &&
17269 (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
17270 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
17271 map_aligned = FALSE;
17272 }
17273 if (map_aligned) {
17274 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
17275 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
17276 } else {
17277 assert(page_aligned(start));
17278 assert(page_aligned(end));
17279 }
17280 assert(start < end);
17281
17282 new_entry = vm_map_entry_create(map);
17283
17284 new_entry->vme_start = start;
17285 new_entry->vme_end = end;
17286
17287 if (vmk_flags.vmkf_submap) {
17288 new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
17289 VME_SUBMAP_SET(new_entry, (vm_map_t)object);
17290 } else {
17291 VME_OBJECT_SET(new_entry, object, false, 0);
17292 }
17293 VME_OFFSET_SET(new_entry, offset);
17294 VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);
17295
17296 new_entry->map_aligned = map_aligned;
17297 new_entry->needs_copy = needs_copy;
17298 new_entry->inheritance = inheritance;
17299 new_entry->protection = cur_protection;
17300 new_entry->max_protection = max_protection;
17301 /*
17302 * submap: "use_pmap" means "nested".
17303 * default: false.
17304 *
17305 * object: "use_pmap" means "use pmap accounting" for footprint.
17306 * default: true.
17307 */
17308 new_entry->use_pmap = !vmk_flags.vmkf_submap;
17309 new_entry->no_cache = vmk_flags.vmf_no_cache;
17310 new_entry->vme_permanent = vmk_flags.vmf_permanent;
17311 new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
17312 new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
17313 new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);
17314
17315 if (vmk_flags.vmkf_map_jit) {
17316 if (!(map->jit_entry_exists) ||
17317 VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
17318 new_entry->used_for_jit = TRUE;
17319 map->jit_entry_exists = TRUE;
17320 }
17321 }
17322
17323 /*
17324 * Insert the new entry into the list.
17325 */
17326
17327 vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
17328 map->size += end - start;
17329
17330 /*
17331 * Update the free space hint and the lookup hint.
17332 */
17333
17334 SAVE_HINT_MAP_WRITE(map, new_entry);
17335 return new_entry;
17336 }
17337
17338 /*
17339 * Routine: vm_map_remap_extract
17340 *
17341 * Description: This routine returns a vm_entry list from a map.
17342 */
17343 static kern_return_t
17344 vm_map_remap_extract(
17345 vm_map_t map,
17346 vm_map_offset_t addr,
17347 vm_map_size_t size,
17348 boolean_t copy,
17349 vm_map_copy_t map_copy,
17350 vm_prot_t *cur_protection, /* IN/OUT */
17351 vm_prot_t *max_protection, /* IN/OUT */
17352 /* What, no behavior? */
17353 vm_inherit_t inheritance,
17354 vm_map_kernel_flags_t vmk_flags)
17355 {
17356 struct vm_map_header *map_header = &map_copy->cpy_hdr;
17357 kern_return_t result;
17358 vm_map_size_t mapped_size;
17359 vm_map_size_t tmp_size;
17360 vm_map_entry_t src_entry; /* result of last map lookup */
17361 vm_map_entry_t new_entry;
17362 vm_object_offset_t offset;
17363 vm_map_offset_t map_address;
17364 vm_map_offset_t src_start; /* start of entry to map */
17365 vm_map_offset_t src_end; /* end of region to be mapped */
17366 vm_object_t object;
17367 vm_map_version_t version;
17368 boolean_t src_needs_copy;
17369 boolean_t new_entry_needs_copy;
17370 vm_map_entry_t saved_src_entry;
17371 boolean_t src_entry_was_wired;
17372 vm_prot_t max_prot_for_prot_copy;
17373 vm_map_offset_t effective_page_mask;
17374 bool pageable, same_map;
17375 boolean_t vm_remap_legacy;
17376 vm_prot_t required_cur_prot, required_max_prot;
17377 vm_object_t new_copy_object; /* vm_object_copy_* result */
17378 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
17379
17380 pageable = vmk_flags.vmkf_copy_pageable;
17381 same_map = vmk_flags.vmkf_copy_same_map;
17382
17383 effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
17384
17385 assert(map != VM_MAP_NULL);
17386 assert(size != 0);
17387 assert(size == vm_map_round_page(size, effective_page_mask));
17388 assert(inheritance == VM_INHERIT_NONE ||
17389 inheritance == VM_INHERIT_COPY ||
17390 inheritance == VM_INHERIT_SHARE);
17391 assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17392 assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17393 assert((*cur_protection & *max_protection) == *cur_protection);
17394
17395 /*
17396 * Compute start and end of region.
17397 */
17398 src_start = vm_map_trunc_page(addr, effective_page_mask);
17399 src_end = vm_map_round_page(src_start + size, effective_page_mask);
17400
17401 /*
17402 * Initialize map_header.
17403 */
17404 map_header->nentries = 0;
17405 map_header->entries_pageable = pageable;
17406 // map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
17407 map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
17408 map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
17409 vm_map_store_init(map_header);
17410
17411 if (copy && vmk_flags.vmkf_remap_prot_copy) {
17412 /*
17413 * Special case for vm_map_protect(VM_PROT_COPY):
17414 * we want to set the new mappings' max protection to the
17415 * specified *max_protection...
17416 */
17417 max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
17418 /* ... but we want to use the vm_remap() legacy mode */
17419 *max_protection = VM_PROT_NONE;
17420 *cur_protection = VM_PROT_NONE;
17421 } else {
17422 max_prot_for_prot_copy = VM_PROT_NONE;
17423 }
17424
17425 if (*cur_protection == VM_PROT_NONE &&
17426 *max_protection == VM_PROT_NONE) {
17427 /*
17428 * vm_remap() legacy mode:
17429 * Extract all memory regions in the specified range and
17430 * collect the strictest set of protections allowed on the
17431 * entire range, so the caller knows what they can do with
17432 * the remapped range.
17433 * We start with VM_PROT_ALL and we'll remove the protections
17434 * missing from each memory region.
17435 */
17436 vm_remap_legacy = TRUE;
17437 *cur_protection = VM_PROT_ALL;
17438 *max_protection = VM_PROT_ALL;
17439 required_cur_prot = VM_PROT_NONE;
17440 required_max_prot = VM_PROT_NONE;
17441 } else {
17442 /*
17443 * vm_remap_new() mode:
17444 * Extract all memory regions in the specified range and
17445 * ensure that they have at least the protections specified
17446 * by the caller via *cur_protection and *max_protection.
17447 * The resulting mapping should have these protections.
17448 */
17449 vm_remap_legacy = FALSE;
17450 if (copy) {
17451 required_cur_prot = VM_PROT_NONE;
17452 required_max_prot = VM_PROT_READ;
17453 } else {
17454 required_cur_prot = *cur_protection;
17455 required_max_prot = *max_protection;
17456 }
17457 }
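
/*
 * Example of the two conventions with illustrative values: in legacy
 * mode, a caller passing VM_PROT_NONE for both protections might get
 * back *cur_protection = VM_PROT_READ if some sub-range was read-only;
 * in vm_remap_new() mode, a caller requiring
 * VM_PROT_READ | VM_PROT_WRITE gets KERN_PROTECTION_FAILURE if any
 * sub-range does not allow those protections.
 */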
17458
17459 map_address = 0;
17460 mapped_size = 0;
17461 result = KERN_SUCCESS;
17462
17463 /*
17464 * The specified source virtual space might correspond to
17465 * multiple map entries, need to loop on them.
17466 */
17467 vm_map_lock(map);
17468
17469 if (map->pmap == kernel_pmap) {
17470 map_copy->is_kernel_range = true;
17471 map_copy->orig_range = kmem_addr_get_range(addr, size);
17472 #if CONFIG_MAP_RANGES
17473 } else if (map->uses_user_ranges) {
17474 map_copy->is_user_range = true;
17475 map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
17476 #endif /* CONFIG_MAP_RANGES */
17477 }
17478
17479 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17480 /*
17481 * This address space uses sub-pages so the range might
17482 * not be re-mappable in an address space with larger
17483 * pages. Re-assemble any broken-up VM map entries to
17484 * improve our chances of making it work.
17485 */
17486 vm_map_simplify_range(map, src_start, src_end);
17487 }
17488 while (mapped_size != size) {
17489 vm_map_size_t entry_size;
17490
17491 /*
17492 * Find the beginning of the region.
17493 */
17494 if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
17495 result = KERN_INVALID_ADDRESS;
17496 break;
17497 }
17498
17499 if (src_start < src_entry->vme_start ||
17500 (mapped_size && src_start != src_entry->vme_start)) {
17501 result = KERN_INVALID_ADDRESS;
17502 break;
17503 }
17504
17505 tmp_size = size - mapped_size;
17506 if (src_end > src_entry->vme_end) {
17507 tmp_size -= (src_end - src_entry->vme_end);
17508 }
17509
17510 entry_size = (vm_map_size_t)(src_entry->vme_end -
17511 src_entry->vme_start);
17512
17513 if (src_entry->is_sub_map &&
17514 vmk_flags.vmkf_copy_single_object) {
17515 vm_map_t submap;
17516 vm_map_offset_t submap_start;
17517 vm_map_size_t submap_size;
17518 boolean_t submap_needs_copy;
17519
17520 /*
17521 * No check for "required protection" on "src_entry"
17522 * because the protections that matter are the ones
17523 * on the submap's VM map entry, which will be checked
17524 * during the call to vm_map_remap_extract() below.
17525 */
17526 submap_size = src_entry->vme_end - src_start;
17527 if (submap_size > size) {
17528 submap_size = size;
17529 }
17530 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17531 submap = VME_SUBMAP(src_entry);
17532 if (copy) {
17533 /*
17534 * The caller wants a copy-on-write re-mapping,
17535 * so let's extract from the submap accordingly.
17536 */
17537 submap_needs_copy = TRUE;
17538 } else if (src_entry->needs_copy) {
17539 /*
17540 * The caller wants a shared re-mapping but the
17541 * submap is mapped with "needs_copy", so its
17542 * contents can't be shared as is. Extract the
17543 * contents of the submap as "copy-on-write".
17544 * The re-mapping won't be shared with the
17545 * original mapping but this is equivalent to
17546 * what happened with the original "remap from
17547 * submap" code.
17548 * The shared region is mapped "needs_copy", for
17549 * example.
17550 */
17551 submap_needs_copy = TRUE;
17552 } else {
17553 /*
17554 * The caller wants a shared re-mapping and
17555 * this mapping can be shared (no "needs_copy"),
17556 * so let's extract from the submap accordingly.
17557 * Kernel submaps are mapped without
17558 * "needs_copy", for example.
17559 */
17560 submap_needs_copy = FALSE;
17561 }
17562 vm_map_reference(submap);
17563 vm_map_unlock(map);
17564 src_entry = NULL;
17565 if (vm_remap_legacy) {
17566 *cur_protection = VM_PROT_NONE;
17567 *max_protection = VM_PROT_NONE;
17568 }
17569
17570 DTRACE_VM7(remap_submap_recurse,
17571 vm_map_t, map,
17572 vm_map_offset_t, addr,
17573 vm_map_size_t, size,
17574 boolean_t, copy,
17575 vm_map_offset_t, submap_start,
17576 vm_map_size_t, submap_size,
17577 boolean_t, submap_needs_copy);
17578
17579 result = vm_map_remap_extract(submap,
17580 submap_start,
17581 submap_size,
17582 submap_needs_copy,
17583 map_copy,
17584 cur_protection,
17585 max_protection,
17586 inheritance,
17587 vmk_flags);
17588 vm_map_deallocate(submap);
17589
17590 if (result == KERN_SUCCESS &&
17591 submap_needs_copy &&
17592 !copy) {
17593 /*
17594 * We were asked for a "shared"
17595 * re-mapping but had to ask for a
17596 * "copy-on-write" remapping of the
17597 * submap's mapping to honor the
17598 * submap's "needs_copy".
17599 * We now need to resolve that
17600 * pending "copy-on-write" to
17601 * get something we can share.
17602 */
17603 vm_map_entry_t copy_entry;
17604 vm_object_offset_t copy_offset;
17605 vm_map_size_t copy_size;
17606 vm_object_t copy_object;
17607 copy_entry = vm_map_copy_first_entry(map_copy);
17608 copy_size = copy_entry->vme_end - copy_entry->vme_start;
17609 copy_object = VME_OBJECT(copy_entry);
17610 copy_offset = VME_OFFSET(copy_entry);
17611 if (copy_object == VM_OBJECT_NULL) {
17612 assert(copy_offset == 0);
17613 assert(!copy_entry->needs_copy);
17614 if (copy_entry->max_protection == VM_PROT_NONE) {
17615 assert(copy_entry->protection == VM_PROT_NONE);
17616 /* nothing to share */
17617 } else {
17618 assert(copy_offset == 0);
17619 copy_object = vm_object_allocate(copy_size);
17620 VME_OFFSET_SET(copy_entry, 0);
17621 VME_OBJECT_SET(copy_entry, copy_object, false, 0);
17622 assert(copy_entry->use_pmap);
17623 }
17624 } else if (copy_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17625 /* already shareable */
17626 assert(!copy_entry->needs_copy);
17627 } else if (copy_entry->needs_copy ||
17628 copy_object->shadowed ||
17629 (copy_object->internal &&
17630 !copy_object->true_share &&
17631 !copy_entry->is_shared &&
17632 copy_object->vo_size > copy_size)) {
17633 VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
17634 assert(copy_entry->use_pmap);
17635 if (copy_entry->needs_copy) {
17636 /* already write-protected */
17637 } else {
17638 vm_prot_t prot;
17639 prot = copy_entry->protection & ~VM_PROT_WRITE;
17640 vm_object_pmap_protect(copy_object,
17641 copy_offset,
17642 copy_size,
17643 PMAP_NULL,
17644 PAGE_SIZE,
17645 0,
17646 prot);
17647 }
17648 copy_entry->needs_copy = FALSE;
17649 }
17650 copy_object = VME_OBJECT(copy_entry);
17651 copy_offset = VME_OFFSET(copy_entry);
17652 if (copy_object &&
17653 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
17654 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
17655 copy_object->true_share = TRUE;
17656 }
17657 }
17658
17659 return result;
17660 }
17661
17662 if (src_entry->is_sub_map) {
17663 /* protections for submap mapping are irrelevant here */
17664 } else if (((src_entry->protection & required_cur_prot) !=
17665 required_cur_prot) ||
17666 ((src_entry->max_protection & required_max_prot) !=
17667 required_max_prot)) {
17668 if (vmk_flags.vmkf_copy_single_object &&
17669 mapped_size != 0) {
17670 /*
17671 * Single object extraction.
17672 * We can't extract more with the required
17673 * protection but we've extracted some, so
17674 * stop there and declare success.
17675 * The caller should check the size of
17676 * the copy entry we've extracted.
17677 */
17678 result = KERN_SUCCESS;
17679 } else {
17680 /*
17681 * VM range extraction.
17682 * Required protection is not available
17683 * for this part of the range: fail.
17684 */
17685 result = KERN_PROTECTION_FAILURE;
17686 }
17687 break;
17688 }
17689
17690 if (src_entry->is_sub_map) {
17691 vm_map_t submap;
17692 vm_map_offset_t submap_start;
17693 vm_map_size_t submap_size;
17694 vm_map_copy_t submap_copy;
17695 vm_prot_t submap_curprot, submap_maxprot;
17696 boolean_t submap_needs_copy;
17697
17698 /*
17699 * No check for "required protection" on "src_entry"
17700 * because the protections that matter are the ones
17701 * on the submap's VM map entry, which will be checked
17702 * during the call to vm_map_copy_extract() below.
17703 */
17704 object = VM_OBJECT_NULL;
17705 submap_copy = VM_MAP_COPY_NULL;
17706
17707 /* find equivalent range in the submap */
17708 submap = VME_SUBMAP(src_entry);
17709 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17710 submap_size = tmp_size;
17711 if (copy) {
17712 /*
17713 * The caller wants a copy-on-write re-mapping,
17714 * so let's extract from the submap accordingly.
17715 */
17716 submap_needs_copy = TRUE;
17717 } else if (src_entry->needs_copy) {
17718 /*
17719 * The caller wants a shared re-mapping but the
17720 * submap is mapped with "needs_copy", so its
17721 * contents can't be shared as is. Extract the
17722 * contents of the submap as "copy-on-write".
17723 * The re-mapping won't be shared with the
17724 * original mapping but this is equivalent to
17725 * what happened with the original "remap from
17726 * submap" code.
17727 * The shared region is mapped "needs_copy", for
17728 * example.
17729 */
17730 submap_needs_copy = TRUE;
17731 } else {
17732 /*
17733 * The caller wants a shared re-mapping and
17734 * this mapping can be shared (no "needs_copy"),
17735 * so let's extract from the submap accordingly.
17736 * Kernel submaps are mapped without
17737 * "needs_copy", for example.
17738 */
17739 submap_needs_copy = FALSE;
17740 }
17741 /* extra ref to keep submap alive */
17742 vm_map_reference(submap);
17743
17744 DTRACE_VM7(remap_submap_recurse,
17745 vm_map_t, map,
17746 vm_map_offset_t, addr,
17747 vm_map_size_t, size,
17748 boolean_t, copy,
17749 vm_map_offset_t, submap_start,
17750 vm_map_size_t, submap_size,
17751 boolean_t, submap_needs_copy);
17752
17753 /*
17754 * The map can be safely unlocked since we
17755 * already hold a reference on the submap.
17756 *
17757 * No timestamp since we don't care if the map
17758 * gets modified while we're down in the submap.
17759 * We'll resume the extraction at src_start + tmp_size
17760 * anyway.
17761 */
17762 vm_map_unlock(map);
17763 src_entry = NULL; /* not valid once map is unlocked */
17764
17765 if (vm_remap_legacy) {
17766 submap_curprot = VM_PROT_NONE;
17767 submap_maxprot = VM_PROT_NONE;
17768 if (max_prot_for_prot_copy) {
17769 submap_maxprot = max_prot_for_prot_copy;
17770 }
17771 } else {
17772 assert(!max_prot_for_prot_copy);
17773 submap_curprot = *cur_protection;
17774 submap_maxprot = *max_protection;
17775 }
17776 result = vm_map_copy_extract(submap,
17777 submap_start,
17778 submap_size,
17779 submap_needs_copy,
17780 &submap_copy,
17781 &submap_curprot,
17782 &submap_maxprot,
17783 inheritance,
17784 vmk_flags);
17785
17786 /* release extra ref on submap */
17787 vm_map_deallocate(submap);
17788 submap = VM_MAP_NULL;
17789
17790 if (result != KERN_SUCCESS) {
17791 vm_map_lock(map);
17792 break;
17793 }
17794
17795 /* transfer submap_copy entries to map_header */
17796 while (vm_map_copy_first_entry(submap_copy) !=
17797 vm_map_copy_to_entry(submap_copy)) {
17798 vm_map_entry_t copy_entry;
17799 vm_map_size_t copy_entry_size;
17800
17801 copy_entry = vm_map_copy_first_entry(submap_copy);
17802
17803 /*
17804 * Prevent kernel_object from being exposed to
17805 * user space.
17806 */
17807 if (__improbable(copy_entry->vme_kernel_object)) {
17808 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17809 proc_selfpid(),
17810 (get_bsdtask_info(current_task())
17811 ? proc_name_address(get_bsdtask_info(current_task()))
17812 : "?"));
17813 DTRACE_VM(extract_kernel_only);
17814 result = KERN_INVALID_RIGHT;
17815 vm_map_copy_discard(submap_copy);
17816 submap_copy = VM_MAP_COPY_NULL;
17817 vm_map_lock(map);
17818 break;
17819 }
17820
17821 #ifdef __arm64e__
17822 if (vmk_flags.vmkf_tpro_enforcement_override) {
17823 copy_entry->used_for_tpro = FALSE;
17824 }
17825 #endif /* __arm64e__ */
17826
17827 vm_map_copy_entry_unlink(submap_copy, copy_entry);
17828 copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
17829 copy_entry->vme_start = map_address;
17830 copy_entry->vme_end = map_address + copy_entry_size;
17831 map_address += copy_entry_size;
17832 mapped_size += copy_entry_size;
17833 src_start += copy_entry_size;
17834 assert(src_start <= src_end);
17835 _vm_map_store_entry_link(map_header,
17836 map_header->links.prev,
17837 copy_entry);
17838 }
17839 /* done with submap_copy */
17840 vm_map_copy_discard(submap_copy);
17841
17842 if (vm_remap_legacy) {
17843 *cur_protection &= submap_curprot;
17844 *max_protection &= submap_maxprot;
17845 }
17846
17847 /* re-acquire the map lock and continue to next entry */
17848 vm_map_lock(map);
17849 continue;
17850 } else {
17851 object = VME_OBJECT(src_entry);
17852
17853 /*
17854 * Prevent kernel_object from being exposed to
17855 * user space.
17856 */
17857 if (__improbable(is_kernel_object(object))) {
17858 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17859 proc_selfpid(),
17860 (get_bsdtask_info(current_task())
17861 ? proc_name_address(get_bsdtask_info(current_task()))
17862 : "?"));
17863 DTRACE_VM(extract_kernel_only);
17864 result = KERN_INVALID_RIGHT;
17865 break;
17866 }
17867
17868 if (src_entry->iokit_acct) {
17869 /*
17870 * This entry uses "IOKit accounting".
17871 */
17872 } else if (object != VM_OBJECT_NULL &&
17873 (object->purgable != VM_PURGABLE_DENY ||
17874 object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
17875 /*
17876 * Purgeable objects have their own accounting:
17877 * no pmap accounting for them.
17878 */
17879 assertf(!src_entry->use_pmap,
17880 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17881 map,
17882 src_entry,
17883 (uint64_t)src_entry->vme_start,
17884 (uint64_t)src_entry->vme_end,
17885 src_entry->protection,
17886 src_entry->max_protection,
17887 VME_ALIAS(src_entry));
17888 } else {
17889 /*
17890 * Not IOKit or purgeable:
17891 * must be accounted by pmap stats.
17892 */
17893 assertf(src_entry->use_pmap,
17894 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17895 map,
17896 src_entry,
17897 (uint64_t)src_entry->vme_start,
17898 (uint64_t)src_entry->vme_end,
17899 src_entry->protection,
17900 src_entry->max_protection,
17901 VME_ALIAS(src_entry));
17902 }
17903
17904 if (object == VM_OBJECT_NULL) {
17905 assert(!src_entry->needs_copy);
17906 if (src_entry->max_protection == VM_PROT_NONE) {
17907 assert(src_entry->protection == VM_PROT_NONE);
17908 /*
17909 * No VM object and no permissions:
17910 * this must be a reserved range with
17911 * nothing to share or copy.
17912 * There could also be all sorts of
17913 * pmap shenanigans within that reserved
17914 * range, so let's just copy the map
17915 * entry as is to remap a similar
17916 * reserved range.
17917 */
17918 offset = 0; /* no object => no offset */
17919 goto copy_src_entry;
17920 }
17921 object = vm_object_allocate(entry_size);
17922 VME_OFFSET_SET(src_entry, 0);
17923 VME_OBJECT_SET(src_entry, object, false, 0);
17924 assert(src_entry->use_pmap);
17925 assert(!map->mapped_in_other_pmaps);
17926 } else if (src_entry->wired_count ||
17927 object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17928 /*
17929 * A wired memory region should not have
17930 * any pending copy-on-write and needs to
17931 * keep pointing at the VM object that
17932 * contains the wired pages.
17933 * If we're sharing this memory (copy=false),
17934 * we'll share this VM object.
17935 * If we're copying this memory (copy=true),
17936 * we'll call vm_object_copy_slowly() below
17937 * and use the new VM object for the remapping.
17938 *
17939 * Or, we are already using an asymmetric
17940 * copy, and therefore we already have
17941 * the right object.
17942 */
17943 assert(!src_entry->needs_copy);
17944 } else if (src_entry->needs_copy || object->shadowed ||
17945 (object->internal && !object->true_share &&
17946 !src_entry->is_shared &&
17947 object->vo_size > entry_size)) {
17948 bool is_writable;
17949
17950 VME_OBJECT_SHADOW(src_entry, entry_size,
17951 vm_map_always_shadow(map));
17952 assert(src_entry->use_pmap);
17953
17954 is_writable = false;
17955 if (src_entry->protection & VM_PROT_WRITE) {
17956 is_writable = true;
17957 #if __arm64e__
17958 } else if (src_entry->used_for_tpro) {
17959 is_writable = true;
17960 #endif /* __arm64e__ */
17961 }
17962 if (!src_entry->needs_copy && is_writable) {
17963 vm_prot_t prot;
17964
17965 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
17966 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
17967 __FUNCTION__,
17968 map, map->pmap,
17969 src_entry,
17970 (uint64_t)src_entry->vme_start,
17971 (uint64_t)src_entry->vme_end,
17972 src_entry->protection);
17973 }
17974
17975 prot = src_entry->protection & ~VM_PROT_WRITE;
17976
17977 if (override_nx(map,
17978 VME_ALIAS(src_entry))
17979 && prot) {
17980 prot |= VM_PROT_EXECUTE;
17981 }
17982
17983 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
17984 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
17985 __FUNCTION__,
17986 map, map->pmap,
17987 src_entry,
17988 (uint64_t)src_entry->vme_start,
17989 (uint64_t)src_entry->vme_end,
17990 prot);
17991 }
17992
17993 if (map->mapped_in_other_pmaps) {
17994 vm_object_pmap_protect(
17995 VME_OBJECT(src_entry),
17996 VME_OFFSET(src_entry),
17997 entry_size,
17998 PMAP_NULL,
17999 PAGE_SIZE,
18000 src_entry->vme_start,
18001 prot);
18002 #if MACH_ASSERT
18003 } else if (__improbable(map->pmap == PMAP_NULL)) {
18004 extern boolean_t vm_tests_in_progress;
18005 assert(vm_tests_in_progress);
18006 /*
18007 * Some VM tests (in vm_tests.c)
18008 * sometimes want to use a VM
18009 * map without a pmap.
18010 * Otherwise, this should never
18011 * happen.
18012 */
18013 #endif /* MACH_ASSERT */
18014 } else {
18015 pmap_protect(vm_map_pmap(map),
18016 src_entry->vme_start,
18017 src_entry->vme_end,
18018 prot);
18019 }
18020 }
18021
18022 object = VME_OBJECT(src_entry);
18023 src_entry->needs_copy = FALSE;
18024 }
18025
18026
18027 vm_object_lock(object);
18028 vm_object_reference_locked(object); /* object ref. for new entry */
18029 assert(!src_entry->needs_copy);
18030 if (object->copy_strategy ==
18031 MEMORY_OBJECT_COPY_SYMMETRIC) {
18032 /*
18033 * If we want to share this object (copy==0),
18034 * it needs to be COPY_DELAY.
18035 * If we want to copy this object (copy==1),
18036 * we can't just set "needs_copy" on our side
18037 * and expect the other side to do the same
18038 * (symmetrically), so we can't let the object
18039 * stay COPY_SYMMETRIC.
18040 * So we always switch from COPY_SYMMETRIC to
18041 * COPY_DELAY.
18042 */
18043 object->copy_strategy =
18044 MEMORY_OBJECT_COPY_DELAY;
18045 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
18046 }
18047 vm_object_unlock(object);
18048 }
18049
18050 offset = (VME_OFFSET(src_entry) +
18051 (src_start - src_entry->vme_start));
18052
18053 copy_src_entry:
18054 new_entry = _vm_map_entry_create(map_header);
18055 vm_map_entry_copy(map, new_entry, src_entry);
18056 if (new_entry->is_sub_map) {
18057 /* clr address space specifics */
18058 new_entry->use_pmap = FALSE;
18059 } else if (copy) {
18060 /*
18061 * We're dealing with a copy-on-write operation,
18062 * so the resulting mapping should not inherit the
18063 * original mapping's accounting settings.
18064 * "use_pmap" should be reset to its default (TRUE)
18065 * so that the new mapping gets accounted for in
18066 * the task's memory footprint.
18067 */
18068 new_entry->use_pmap = TRUE;
18069 }
18070 /* "iokit_acct" was cleared in vm_map_entry_copy() */
18071 assert(!new_entry->iokit_acct);
18072
18073 new_entry->map_aligned = FALSE;
18074
18075 new_entry->vme_start = map_address;
18076 new_entry->vme_end = map_address + tmp_size;
18077 assert(new_entry->vme_start < new_entry->vme_end);
18078 if (copy && vmk_flags.vmkf_remap_prot_copy) {
18079 /* security: keep "permanent" and "csm_associated" */
18080 new_entry->vme_permanent = src_entry->vme_permanent;
18081 new_entry->csm_associated = src_entry->csm_associated;
18082 /*
18083 * Remapping for vm_map_protect(VM_PROT_COPY)
18084 * to convert a read-only mapping into a
18085 * copy-on-write version of itself but
18086 * with write access:
18087 * keep the original inheritance but let's not
18088 * add VM_PROT_WRITE to the max protection yet
18089 * since we want to do more security checks against
18090 * the target map.
18091 */
18092 new_entry->inheritance = src_entry->inheritance;
18093 new_entry->protection &= max_prot_for_prot_copy;
18094 } else {
18095 new_entry->inheritance = inheritance;
18096 if (!vm_remap_legacy) {
18097 new_entry->protection = *cur_protection;
18098 new_entry->max_protection = *max_protection;
18099 }
18100 }
18101 #ifdef __arm64e__
18102 if (copy && vmk_flags.vmkf_tpro_enforcement_override) {
18103 new_entry->used_for_tpro = FALSE;
18104 }
18105 #endif /* __arm64e__ */
18106 VME_OFFSET_SET(new_entry, offset);
18107
18108 /*
18109 * The new region has to be copied now if required.
18110 */
18111 RestartCopy:
18112 if (!copy) {
18113 if (src_entry->used_for_jit == TRUE) {
18114 if (same_map) {
18115 } else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
18116 /*
18117 * Cannot allow an entry describing a JIT
18118 * region to be shared across address spaces.
18119 */
18120 result = KERN_INVALID_ARGUMENT;
18121 vm_object_deallocate(object);
18122 vm_map_entry_dispose(new_entry);
18123 new_entry = VM_MAP_ENTRY_NULL;
18124 break;
18125 }
18126 }
18127
18128 src_entry->is_shared = TRUE;
18129 new_entry->is_shared = TRUE;
18130 if (!(new_entry->is_sub_map)) {
18131 new_entry->needs_copy = FALSE;
18132 }
18133 } else if (src_entry->is_sub_map) {
18134 /* make this a COW sub_map if not already */
18135 assert(new_entry->wired_count == 0);
18136 new_entry->needs_copy = TRUE;
18137 object = VM_OBJECT_NULL;
18138 } else if (src_entry->wired_count == 0 &&
18139 !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
18140 vm_object_copy_quickly(VME_OBJECT(new_entry),
18141 VME_OFFSET(new_entry),
18142 (new_entry->vme_end -
18143 new_entry->vme_start),
18144 &src_needs_copy,
18145 &new_entry_needs_copy)) {
18146 new_entry->needs_copy = new_entry_needs_copy;
18147 new_entry->is_shared = FALSE;
18148 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18149
18150 /*
18151 * Handle copy_on_write semantics.
18152 */
18153 if (src_needs_copy && !src_entry->needs_copy) {
18154 vm_prot_t prot;
18155
18156 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18157 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18158 __FUNCTION__,
18159 map, map->pmap, src_entry,
18160 (uint64_t)src_entry->vme_start,
18161 (uint64_t)src_entry->vme_end,
18162 src_entry->protection);
18163 }
18164
18165 prot = src_entry->protection & ~VM_PROT_WRITE;
18166
18167 if (override_nx(map,
18168 VME_ALIAS(src_entry))
18169 && prot) {
18170 prot |= VM_PROT_EXECUTE;
18171 }
18172
18173 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18174 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18175 __FUNCTION__,
18176 map, map->pmap, src_entry,
18177 (uint64_t)src_entry->vme_start,
18178 (uint64_t)src_entry->vme_end,
18179 prot);
18180 }
18181
18182 vm_object_pmap_protect(object,
18183 offset,
18184 entry_size,
18185 ((src_entry->is_shared
18186 || map->mapped_in_other_pmaps) ?
18187 PMAP_NULL : map->pmap),
18188 VM_MAP_PAGE_SIZE(map),
18189 src_entry->vme_start,
18190 prot);
18191
18192 assert(src_entry->wired_count == 0);
18193 src_entry->needs_copy = TRUE;
18194 }
18195 /*
18196 * Throw away the old object reference of the new entry.
18197 */
18198 vm_object_deallocate(object);
18199 } else {
18200 new_entry->is_shared = FALSE;
18201 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18202
18203 src_entry_was_wired = (src_entry->wired_count > 0);
18204 saved_src_entry = src_entry;
18205 src_entry = VM_MAP_ENTRY_NULL;
18206
18207 /*
18208 * The map can be safely unlocked since we
18209 * already hold a reference on the object.
18210 *
18211 * Record the timestamp of the map for later
18212 * verification, and unlock the map.
18213 */
18214 version.main_timestamp = map->timestamp;
18215 vm_map_unlock(map); /* Increments timestamp once! */
18216
18217 /*
18218 * Perform the copy.
18219 */
18220 if (src_entry_was_wired > 0 ||
18221 (debug4k_no_cow_copyin &&
18222 VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
18223 vm_object_lock(object);
18224 result = vm_object_copy_slowly(
18225 object,
18226 offset,
18227 (new_entry->vme_end -
18228 new_entry->vme_start),
18229 THREAD_UNINT,
18230 &new_copy_object);
18231 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18232 saved_used_for_jit = new_entry->used_for_jit;
18233 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18234 new_entry->used_for_jit = saved_used_for_jit;
18235 VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
18236 new_entry->needs_copy = FALSE;
18237 } else {
18238 vm_object_offset_t new_offset;
18239
18240 new_offset = VME_OFFSET(new_entry);
18241 result = vm_object_copy_strategically(
18242 object,
18243 offset,
18244 (new_entry->vme_end -
18245 new_entry->vme_start),
18246 false, /* forking */
18247 &new_copy_object,
18248 &new_offset,
18249 &new_entry_needs_copy);
18250 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18251 saved_used_for_jit = new_entry->used_for_jit;
18252 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18253 new_entry->used_for_jit = saved_used_for_jit;
18254 if (new_offset != VME_OFFSET(new_entry)) {
18255 VME_OFFSET_SET(new_entry, new_offset);
18256 }
18257
18258 new_entry->needs_copy = new_entry_needs_copy;
18259 }
18260
18261 /*
18262 * Throw away the old object reference of the new entry.
18263 */
18264 vm_object_deallocate(object);
18265
18266 if (result != KERN_SUCCESS &&
18267 result != KERN_MEMORY_RESTART_COPY) {
18268 vm_map_entry_dispose(new_entry);
18269 vm_map_lock(map);
18270 break;
18271 }
18272
18273 /*
18274 * Verify that the map has not substantially
18275 * changed while the copy was being made.
18276 */
18277
18278 vm_map_lock(map);
18279 if (version.main_timestamp + 1 != map->timestamp) {
18280 /*
18281 * Simple version comparison failed.
18282 *
18283 * Retry the lookup and verify that the
18284 * same object/offset are still present.
18285 */
18286 saved_src_entry = VM_MAP_ENTRY_NULL;
18287 vm_object_deallocate(VME_OBJECT(new_entry));
18288 vm_map_entry_dispose(new_entry);
18289 if (result == KERN_MEMORY_RESTART_COPY) {
18290 result = KERN_SUCCESS;
18291 }
18292 continue;
18293 }
18294 /* map hasn't changed: src_entry is still valid */
18295 src_entry = saved_src_entry;
18296 saved_src_entry = VM_MAP_ENTRY_NULL;
18297
18298 if (result == KERN_MEMORY_RESTART_COPY) {
18299 vm_object_reference(object);
18300 goto RestartCopy;
18301 }
18302 }
18303
18304 _vm_map_store_entry_link(map_header,
18305 map_header->links.prev, new_entry);
18306
18307 /* protections for submap mapping are irrelevant here */
18308 if (vm_remap_legacy && !src_entry->is_sub_map) {
18309 *cur_protection &= src_entry->protection;
18310 *max_protection &= src_entry->max_protection;
18311 }
18312
18313 map_address += tmp_size;
18314 mapped_size += tmp_size;
18315 src_start += tmp_size;
18316
18317 if (vmk_flags.vmkf_copy_single_object) {
18318 if (mapped_size != size) {
18319 DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
18320 map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
18321 if (src_entry->vme_next != vm_map_to_entry(map) &&
18322 src_entry->vme_next->vme_object_value ==
18323 src_entry->vme_object_value) {
18324 /* XXX TODO4K */
18325 DEBUG4K_ERROR("could have extended copy to next entry...\n");
18326 }
18327 }
18328 break;
18329 }
18330 } /* end while */
18331
18332 vm_map_unlock(map);
18333 if (result != KERN_SUCCESS) {
18334 /*
18335 * Free all allocated elements.
18336 */
18337 for (src_entry = map_header->links.next;
18338 src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
18339 src_entry = new_entry) {
18340 new_entry = src_entry->vme_next;
18341 _vm_map_store_entry_unlink(map_header, src_entry, false);
18342 if (src_entry->is_sub_map) {
18343 vm_map_deallocate(VME_SUBMAP(src_entry));
18344 } else {
18345 vm_object_deallocate(VME_OBJECT(src_entry));
18346 }
18347 vm_map_entry_dispose(src_entry);
18348 }
18349 }
18350 return result;
18351 }
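
/*
 * Illustrative sketch (not part of the build): a typical user-space
 * trigger for the extraction above is mach_vm_remap() in legacy mode,
 * where the protection arguments are returned rather than required
 * and come back as the strictest protections found across the range.
 */
#if 0
#include <mach/mach.h>
#include <mach/mach_vm.h>

static kern_return_t
remap_shared(mach_vm_address_t src, mach_vm_size_t size,
    mach_vm_address_t *dst)
{
	vm_prot_t cur_prot, max_prot;	/* out: strictest protections */

	*dst = 0;
	return mach_vm_remap(mach_task_self(), dst, size, 0,
	    VM_FLAGS_ANYWHERE, mach_task_self(), src,
	    FALSE /* copy */, &cur_prot, &max_prot,
	    VM_INHERIT_DEFAULT);
}
#endif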
18352
18353 bool
18354 vm_map_is_exotic(
18355 vm_map_t map)
18356 {
18357 return VM_MAP_IS_EXOTIC(map);
18358 }
18359
18360 bool
18361 vm_map_is_alien(
18362 vm_map_t map)
18363 {
18364 return VM_MAP_IS_ALIEN(map);
18365 }
18366
18367 #if XNU_TARGET_OS_OSX
18368 void
18369 vm_map_mark_alien(
18370 vm_map_t map)
18371 {
18372 vm_map_lock(map);
18373 map->is_alien = true;
18374 vm_map_unlock(map);
18375 }
18376
18377 void
18378 vm_map_single_jit(
18379 vm_map_t map)
18380 {
18381 vm_map_lock(map);
18382 map->single_jit = true;
18383 vm_map_unlock(map);
18384 }
18385 #endif /* XNU_TARGET_OS_OSX */
18386
18387
18388 /*
18389 * Callers of this function must call vm_map_copy_require on
18390 * previously created vm_map_copy_t or pass a newly created
18391 * one to ensure that it hasn't been forged.
18392 */
18393 static kern_return_t
18394 vm_map_copy_to_physcopy(
18395 vm_map_copy_t copy_map,
18396 vm_map_t target_map)
18397 {
18398 vm_map_size_t size;
18399 vm_map_entry_t entry;
18400 vm_map_entry_t new_entry;
18401 vm_object_t new_object;
18402 unsigned int pmap_flags;
18403 pmap_t new_pmap;
18404 vm_map_t new_map;
18405 vm_map_address_t src_start, src_end, src_cur;
18406 vm_map_address_t dst_start, dst_end, dst_cur;
18407 kern_return_t kr;
18408 void *kbuf;
18409
18410 /*
18411 * Perform the equivalent of vm_allocate() and memcpy().
18412 * Replace the mappings in "copy_map" with the newly allocated mapping.
18413 */
18414 DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18415
18416 assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
18417
18418 /* create a new pmap to map "copy_map" */
18419 pmap_flags = 0;
18420 assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
18421 #if PMAP_CREATE_FORCE_4K_PAGES
18422 pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
18423 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
18424 pmap_flags |= PMAP_CREATE_64BIT;
18425 new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
18426 if (new_pmap == NULL) {
18427 return KERN_RESOURCE_SHORTAGE;
18428 }
18429
18430 /* allocate new VM object */
18431 size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
18432 new_object = vm_object_allocate(size);
18433 assert(new_object);
18434
18435 /* allocate new VM map entry */
18436 new_entry = vm_map_copy_entry_create(copy_map);
18437 assert(new_entry);
18438
18439 /* finish initializing new VM map entry */
18440 new_entry->protection = VM_PROT_DEFAULT;
18441 new_entry->max_protection = VM_PROT_DEFAULT;
18442 new_entry->use_pmap = TRUE;
18443
18444 /* make new VM map entry point to new VM object */
18445 new_entry->vme_start = 0;
18446 new_entry->vme_end = size;
18447 VME_OBJECT_SET(new_entry, new_object, false, 0);
18448 VME_OFFSET_SET(new_entry, 0);
18449
18450 /* create a new pageable VM map to map "copy_map" */
18451 new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
18452 VM_MAP_CREATE_PAGEABLE);
18453 assert(new_map);
18454 vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
18455
18456 /* map "copy_map" in the new VM map */
18457 src_start = 0;
18458 kr = vm_map_copyout_internal(
18459 new_map,
18460 &src_start,
18461 copy_map,
18462 copy_map->size,
18463 FALSE, /* consume_on_success */
18464 VM_PROT_DEFAULT,
18465 VM_PROT_DEFAULT,
18466 VM_INHERIT_DEFAULT);
18467 assert(kr == KERN_SUCCESS);
18468 src_end = src_start + copy_map->size;
18469
18470 /* map "new_object" in the new VM map */
18471 vm_object_reference(new_object);
18472 dst_start = 0;
18473 kr = vm_map_enter(new_map,
18474 &dst_start,
18475 size,
18476 0, /* mask */
18477 VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
18478 new_object,
18479 0, /* offset */
18480 FALSE, /* needs copy */
18481 VM_PROT_DEFAULT,
18482 VM_PROT_DEFAULT,
18483 VM_INHERIT_DEFAULT);
18484 assert(kr == KERN_SUCCESS);
18485 dst_end = dst_start + size;
18486
18487 /* get a kernel buffer */
18488 kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
18489
18490 /* physically copy "copy_map" mappings to new VM object */
18491 for (src_cur = src_start, dst_cur = dst_start;
18492 src_cur < src_end;
18493 src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
18494 vm_size_t bytes;
18495
18496 bytes = PAGE_SIZE;
18497 if (src_cur + PAGE_SIZE > src_end) {
18498 /* partial copy for last page */
18499 bytes = src_end - src_cur;
18500 assert(bytes > 0 && bytes < PAGE_SIZE);
18501 /* rest of dst page should be zero-filled */
18502 }
18503 /* get bytes from src mapping */
18504 kr = copyinmap(new_map, src_cur, kbuf, bytes);
18505 if (kr != KERN_SUCCESS) {
18506 DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
18507 }
18508 /* put bytes in dst mapping */
18509 assert(dst_cur < dst_end);
18510 assert(dst_cur + bytes <= dst_end);
18511 kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
18512 if (kr != KERN_SUCCESS) {
18513 DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
18514 }
18515 }
18516
18517 /* free kernel buffer */
18518 kfree_data(kbuf, PAGE_SIZE);
18519
18520 /* destroy new map */
18521 vm_map_destroy(new_map);
18522 new_map = VM_MAP_NULL;
18523
18524 /* dispose of the old map entries in "copy_map" */
18525 while (vm_map_copy_first_entry(copy_map) !=
18526 vm_map_copy_to_entry(copy_map)) {
18527 entry = vm_map_copy_first_entry(copy_map);
18528 vm_map_copy_entry_unlink(copy_map, entry);
18529 if (entry->is_sub_map) {
18530 vm_map_deallocate(VME_SUBMAP(entry));
18531 } else {
18532 vm_object_deallocate(VME_OBJECT(entry));
18533 }
18534 vm_map_copy_entry_dispose(entry);
18535 }
18536
18537 /* change "copy_map"'s page_size to match "target_map" */
18538 copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18539 copy_map->offset = 0;
18540 copy_map->size = size;
18541
18542 /* insert new map entry in "copy_map" */
18543 assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
18544 vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
18545
18546 DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18547 return KERN_SUCCESS;
18548 }
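/*
 * Illustrative walk-through (not part of the build): assuming a 16K
 * kernel PAGE_SIZE and a 4K "copy_map" of size 0x5000, the physcopy
 * above proceeds as:
 *
 *	size = VM_MAP_ROUND_PAGE(0x5000, PAGE_MASK);	// 0x8000
 *	// loop: src_cur 0x0000 -> bytes 0x4000 (full page)
 *	//       src_cur 0x4000 -> bytes 0x1000 (partial last page;
 *	//       the rest of the dst page stays zero-filled because
 *	//       "new_object" starts out with no resident pages)
 */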
18549
18550 void
18551 vm_map_copy_adjust_get_target_copy_map(
18552 vm_map_copy_t copy_map,
18553 vm_map_copy_t *target_copy_map_p);
18554 void
18555 vm_map_copy_adjust_get_target_copy_map(
18556 vm_map_copy_t copy_map,
18557 vm_map_copy_t *target_copy_map_p)
18558 {
18559 vm_map_copy_t target_copy_map;
18560 vm_map_entry_t entry, target_entry;
18561
18562 if (*target_copy_map_p != VM_MAP_COPY_NULL) {
18563 /* the caller already has a "target_copy_map": use it */
18564 return;
18565 }
18566
18567 /* the caller wants us to create a new copy of "copy_map" */
18568 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18569 target_copy_map = vm_map_copy_allocate(copy_map->type);
18570 target_copy_map->offset = copy_map->offset;
18571 target_copy_map->size = copy_map->size;
18572 target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
18573 for (entry = vm_map_copy_first_entry(copy_map);
18574 entry != vm_map_copy_to_entry(copy_map);
18575 entry = entry->vme_next) {
18576 target_entry = vm_map_copy_entry_create(target_copy_map);
18577 vm_map_entry_copy_full(target_entry, entry);
18578 if (target_entry->is_sub_map) {
18579 vm_map_reference(VME_SUBMAP(target_entry));
18580 } else {
18581 vm_object_reference(VME_OBJECT(target_entry));
18582 }
18583 vm_map_copy_entry_link(
18584 target_copy_map,
18585 vm_map_copy_last_entry(target_copy_map),
18586 target_entry);
18587 }
18588 entry = VM_MAP_ENTRY_NULL;
18589 *target_copy_map_p = target_copy_map;
18590 }
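/*
 * Usage sketch (illustrative): callers pass VM_MAP_COPY_NULL the first
 * time and reuse the result afterwards, so the deep copy is only made
 * once adjustments become necessary:
 *
 *	vm_map_copy_t target = VM_MAP_COPY_NULL;
 *	vm_map_copy_adjust_get_target_copy_map(copy_map, &target);
 *	// "target" is now a private clone of "copy_map" (holding extra
 *	// submap/object references) that can be modified freely; a
 *	// second call with the same "target" is a no-op.
 */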
18591
18592 /*
18593 * Callers of this function must call vm_map_copy_require on
18594 * previously created vm_map_copy_t or pass a newly created
18595 * one to ensure that it hasn't been forged.
18596 */
18597 static void
18598 vm_map_copy_trim(
18599 vm_map_copy_t copy_map,
18600 uint16_t new_page_shift,
18601 vm_map_offset_t trim_start,
18602 vm_map_offset_t trim_end)
18603 {
18604 uint16_t copy_page_shift;
18605 vm_map_entry_t entry, next_entry;
18606
18607 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18608 assert(copy_map->cpy_hdr.nentries > 0);
18609
18610 trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
18611 trim_end += vm_map_copy_first_entry(copy_map)->vme_start;
18612
18613 /* use the new page_shift to do the clipping */
18614 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18615 copy_map->cpy_hdr.page_shift = new_page_shift;
18616
18617 for (entry = vm_map_copy_first_entry(copy_map);
18618 entry != vm_map_copy_to_entry(copy_map);
18619 entry = next_entry) {
18620 next_entry = entry->vme_next;
18621 if (entry->vme_end <= trim_start) {
18622 /* entry fully before trim range: skip */
18623 continue;
18624 }
18625 if (entry->vme_start >= trim_end) {
18626 /* entry fully after trim range: done */
18627 break;
18628 }
18629 /* clip entry if needed */
18630 vm_map_copy_clip_start(copy_map, entry, trim_start);
18631 vm_map_copy_clip_end(copy_map, entry, trim_end);
18632 /* dispose of entry */
18633 copy_map->size -= entry->vme_end - entry->vme_start;
18634 vm_map_copy_entry_unlink(copy_map, entry);
18635 if (entry->is_sub_map) {
18636 vm_map_deallocate(VME_SUBMAP(entry));
18637 } else {
18638 vm_object_deallocate(VME_OBJECT(entry));
18639 }
18640 vm_map_copy_entry_dispose(entry);
18641 entry = VM_MAP_ENTRY_NULL;
18642 }
18643
18644 /* restore copy_map's original page_shift */
18645 copy_map->cpy_hdr.page_shift = copy_page_shift;
18646 }
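/*
 * Worked example (illustrative): "trim_start"/"trim_end" are relative
 * to the first entry. If the first entry has vme_start 0x4000, then
 *
 *	vm_map_copy_trim(copy, 14, 0x8000, 0xC000);
 *
 * clips with 16K granularity and discards whatever lies in the
 * absolute range [0xC000, 0x10000), shrinking copy->size accordingly
 * while restoring copy's original page_shift afterwards.
 */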
18647
18648 /*
18649 * Make any necessary adjustments to "copy_map" to allow it to be
18650 * mapped into "target_map".
18651 * If no changes were necessary, "target_copy_map" points to the
18652 * untouched "copy_map".
18653 * If changes are necessary, changes will be made to "target_copy_map".
18654 * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
18655 * copy the original "copy_map" to it before applying the changes.
18656 * The caller should discard "target_copy_map" if it's not the same as
18657 * the original "copy_map".
18658 */
18659 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
18660 kern_return_t
18661 vm_map_copy_adjust_to_target(
18662 vm_map_copy_t src_copy_map,
18663 vm_map_offset_t offset,
18664 vm_map_size_t size,
18665 vm_map_t target_map,
18666 boolean_t copy,
18667 vm_map_copy_t *target_copy_map_p,
18668 vm_map_offset_t *overmap_start_p,
18669 vm_map_offset_t *overmap_end_p,
18670 vm_map_offset_t *trimmed_start_p)
18671 {
18672 vm_map_copy_t copy_map, target_copy_map;
18673 vm_map_size_t target_size;
18674 vm_map_size_t src_copy_map_size;
18675 vm_map_size_t overmap_start, overmap_end;
18676 int misalignments;
18677 vm_map_entry_t entry, target_entry;
18678 vm_map_offset_t addr_adjustment;
18679 vm_map_offset_t new_start, new_end;
18680 int copy_page_mask, target_page_mask;
18681 uint16_t copy_page_shift, target_page_shift;
18682 vm_map_offset_t trimmed_end;
18683
18684 /*
18685 * Assert that the vm_map_copy is coming from the right
18686 * zone and hasn't been forged
18687 */
18688 vm_map_copy_require(src_copy_map);
18689 assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18690
18691 /*
18692 * Start working with "src_copy_map" but we'll switch
18693 * to "target_copy_map" as soon as we start making adjustments.
18694 */
18695 copy_map = src_copy_map;
18696 src_copy_map_size = src_copy_map->size;
18697
18698 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18699 copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
18700 target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18701 target_page_mask = VM_MAP_PAGE_MASK(target_map);
18702
18703 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, *target_copy_map_p);
18704
18705 target_copy_map = *target_copy_map_p;
18706 if (target_copy_map != VM_MAP_COPY_NULL) {
18707 vm_map_copy_require(target_copy_map);
18708 }
18709
18710 if (offset + size > copy_map->size) {
18711 DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)offset, (uint64_t)size);
18712 return KERN_INVALID_ARGUMENT;
18713 }
18714
18715 /* trim the end */
18716 trimmed_end = 0;
18717 new_end = VM_MAP_ROUND_PAGE(offset + size, target_page_mask);
18718 if (new_end < copy_map->size) {
18719 trimmed_end = src_copy_map_size - new_end;
18720 DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
18721 /* get "target_copy_map" if needed and adjust it */
18722 vm_map_copy_adjust_get_target_copy_map(copy_map,
18723 &target_copy_map);
18724 copy_map = target_copy_map;
18725 vm_map_copy_trim(target_copy_map, target_page_shift,
18726 new_end, copy_map->size);
18727 }
18728
18729 /* trim the start */
18730 new_start = VM_MAP_TRUNC_PAGE(offset, target_page_mask);
18731 if (new_start != 0) {
18732 DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)0, (uint64_t)new_start);
18733 /* get "target_copy_map" if needed and adjust it */
18734 vm_map_copy_adjust_get_target_copy_map(copy_map,
18735 &target_copy_map);
18736 copy_map = target_copy_map;
18737 vm_map_copy_trim(target_copy_map, target_page_shift,
18738 0, new_start);
18739 }
18740 *trimmed_start_p = new_start;
18741
18742 /* target_size starts with what's left after trimming */
18743 target_size = copy_map->size;
18744 assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
18745 "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
18746 (uint64_t)target_size, (uint64_t)src_copy_map_size,
18747 (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);
18748
18749 /* check for misalignments but don't adjust yet */
18750 misalignments = 0;
18751 overmap_start = 0;
18752 overmap_end = 0;
18753 if (copy_page_shift < target_page_shift) {
18754 /*
18755 * Remapping from 4K to 16K: check the VM object alignments
18756 * throughout the range.
18757 * If the start and end of the range are mis-aligned, we can
18758 * over-map to re-align, and adjust the "overmap" start/end
18759 * and "target_size" of the range accordingly.
18760 * If there is any mis-alignment within the range:
18761 * if "copy":
18762 * we can do immediate-copy instead of copy-on-write,
18763 * else:
18764 * no way to remap and share; fail.
18765 */
18766 for (entry = vm_map_copy_first_entry(copy_map);
18767 entry != vm_map_copy_to_entry(copy_map);
18768 entry = entry->vme_next) {
18769 vm_object_offset_t object_offset_start, object_offset_end;
18770
18771 object_offset_start = VME_OFFSET(entry);
18772 object_offset_end = object_offset_start;
18773 object_offset_end += entry->vme_end - entry->vme_start;
18774 if (object_offset_start & target_page_mask) {
18775 if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
18776 overmap_start++;
18777 } else {
18778 misalignments++;
18779 }
18780 }
18781 if (object_offset_end & target_page_mask) {
18782 if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
18783 overmap_end++;
18784 } else {
18785 misalignments++;
18786 }
18787 }
18788 }
18789 }
18790 entry = VM_MAP_ENTRY_NULL;
18791
18792 /* decide how to deal with misalignments */
18793 assert(overmap_start <= 1);
18794 assert(overmap_end <= 1);
18795 if (!overmap_start && !overmap_end && !misalignments) {
18796 /* copy_map is properly aligned for target_map ... */
18797 if (*trimmed_start_p) {
18798 /* ... but we trimmed it, so still need to adjust */
18799 } else {
18800 /* ... and we didn't trim anything: we're done */
18801 if (target_copy_map == VM_MAP_COPY_NULL) {
18802 target_copy_map = copy_map;
18803 }
18804 *target_copy_map_p = target_copy_map;
18805 *overmap_start_p = 0;
18806 *overmap_end_p = 0;
18807 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18808 return KERN_SUCCESS;
18809 }
18810 } else if (misalignments && !copy) {
18811 /* can't "share" if misaligned */
18812 DEBUG4K_ADJUST("unsupported sharing\n");
18813 #if MACH_ASSERT
18814 if (debug4k_panic_on_misaligned_sharing) {
18815 panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
18816 }
18817 #endif /* MACH_ASSERT */
18818 DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
18819 return KERN_NOT_SUPPORTED;
18820 } else {
18821 /* can't virtual-copy if misaligned (but can physical-copy) */
18822 DEBUG4K_ADJUST("mis-aligned copying\n");
18823 }
18824
18825 /* get a "target_copy_map" if needed and switch to it */
18826 vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
18827 copy_map = target_copy_map;
18828
18829 if (misalignments && copy) {
18830 vm_map_size_t target_copy_map_size;
18831
18832 /*
18833 * Can't do copy-on-write with misaligned mappings.
18834 * Replace the mappings with a physical copy of the original
18835 * mappings' contents.
18836 */
18837 target_copy_map_size = target_copy_map->size;
18838 kern_return_t kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
18839 if (kr != KERN_SUCCESS) {
18840 return kr;
18841 }
18842 *target_copy_map_p = target_copy_map;
18843 *overmap_start_p = 0;
18844 *overmap_end_p = target_copy_map->size - target_copy_map_size;
18845 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18846 return KERN_SUCCESS;
18847 }
18848
18849 /* apply the adjustments */
18850 misalignments = 0;
18851 overmap_start = 0;
18852 overmap_end = 0;
18853 /* remove copy_map->offset, so that everything starts at offset 0 */
18854 addr_adjustment = copy_map->offset;
18855 /* also remove whatever we trimmed from the start */
18856 addr_adjustment += *trimmed_start_p;
18857 for (target_entry = vm_map_copy_first_entry(target_copy_map);
18858 target_entry != vm_map_copy_to_entry(target_copy_map);
18859 target_entry = target_entry->vme_next) {
18860 vm_object_offset_t object_offset_start, object_offset_end;
18861
18862 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18863 object_offset_start = VME_OFFSET(target_entry);
18864 if (object_offset_start & target_page_mask) {
18865 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18866 if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
18867 /*
18868 * start of 1st entry is mis-aligned:
18869 * re-adjust by over-mapping.
18870 */
18871 overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
18872 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
18873 VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
18874 } else {
18875 misalignments++;
18876 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
18877 assert(copy);
18878 }
18879 }
18880
18881 if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
18882 target_size += overmap_start;
18883 } else {
18884 target_entry->vme_start += overmap_start;
18885 }
18886 target_entry->vme_end += overmap_start;
18887
18888 object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
18889 if (object_offset_end & target_page_mask) {
18890 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18891 if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
18892 /*
18893 * end of last entry is mis-aligned: re-adjust by over-mapping.
18894 */
18895 overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
18896 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
18897 target_entry->vme_end += overmap_end;
18898 target_size += overmap_end;
18899 } else {
18900 misalignments++;
18901 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
18902 assert(copy);
18903 }
18904 }
18905 target_entry->vme_start -= addr_adjustment;
18906 target_entry->vme_end -= addr_adjustment;
18907 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18908 }
18909
18910 target_copy_map->size = target_size;
18911 target_copy_map->offset += overmap_start;
18912 target_copy_map->offset -= addr_adjustment;
18913 target_copy_map->cpy_hdr.page_shift = target_page_shift;
18914
18915 // assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
18916 // assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
18917 assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
18918 assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));
18919
18920 *target_copy_map_p = target_copy_map;
18921 *overmap_start_p = overmap_start;
18922 *overmap_end_p = overmap_end;
18923
18924 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18925 return KERN_SUCCESS;
18926 }
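/*
 * The over-mapping arithmetic above, in isolation (illustrative).
 * For a first entry whose object offset is 0x1000 under a 16K target
 * (target_page_mask 0x3FFF):
 *
 *	overmap_start = 0x1000 - trunc_page_mask_64(0x1000, 0x3FFF);
 *	// = 0x1000: the entry is re-based to object offset 0 and its
 *	// vme_end grows by 0x1000, deliberately over-mapping the
 *	// leading sub-page so the mapping starts on a 16K boundary.
 */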
18927
18928 kern_return_t
18929 vm_map_range_physical_size(
18930 vm_map_t map,
18931 vm_map_address_t start,
18932 mach_vm_size_t size,
18933 mach_vm_size_t * phys_size)
18934 {
18935 kern_return_t kr;
18936 vm_map_copy_t copy_map, target_copy_map;
18937 vm_map_offset_t adjusted_start, adjusted_end;
18938 vm_map_size_t adjusted_size;
18939 vm_prot_t cur_prot, max_prot;
18940 vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
18941 vm_map_kernel_flags_t vmk_flags;
18942
18943 if (size == 0) {
18944 DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
18945 *phys_size = 0;
18946 return KERN_SUCCESS;
18947 }
18948
18949 adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
18950 adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
18951 if (__improbable(os_add_overflow(start, size, &end) ||
18952 adjusted_end <= adjusted_start)) {
18953 /* wraparound */
18954 printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
18955 *phys_size = 0;
18956 return KERN_INVALID_ARGUMENT;
18957 }
18958 if (__improbable(vm_map_range_overflows(map, start, size))) {
18959 *phys_size = 0;
18960 return KERN_INVALID_ADDRESS;
18961 }
18962 assert(adjusted_end > adjusted_start);
18963 adjusted_size = adjusted_end - adjusted_start;
18964 *phys_size = adjusted_size;
18965 if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
18966 return KERN_SUCCESS;
18967 }
18968 if (start == 0) {
18969 adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
18970 adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
18971 if (__improbable(adjusted_end <= adjusted_start)) {
18972 /* wraparound */
18973 printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
18974 *phys_size = 0;
18975 return KERN_INVALID_ARGUMENT;
18976 }
18977 assert(adjusted_end > adjusted_start);
18978 adjusted_size = adjusted_end - adjusted_start;
18979 *phys_size = adjusted_size;
18980 return KERN_SUCCESS;
18981 }
18982
18983 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
18984 vmk_flags.vmkf_copy_pageable = TRUE;
18985 vmk_flags.vmkf_copy_same_map = TRUE;
18986 assert(adjusted_size != 0);
18987 cur_prot = VM_PROT_NONE; /* legacy mode */
18988 max_prot = VM_PROT_NONE; /* legacy mode */
18989 kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
18990 FALSE /* copy */,
18991 &copy_map,
18992 &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
18993 vmk_flags);
18994 if (kr != KERN_SUCCESS) {
18995 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
18996 //assert(0);
18997 *phys_size = 0;
18998 return kr;
18999 }
19000 assert(copy_map != VM_MAP_COPY_NULL);
19001 target_copy_map = copy_map;
19002 DEBUG4K_ADJUST("adjusting...\n");
19003 kr = vm_map_copy_adjust_to_target(
19004 copy_map,
19005 start - adjusted_start, /* offset */
19006 size, /* size */
19007 kernel_map,
19008 FALSE, /* copy */
19009 &target_copy_map,
19010 &overmap_start,
19011 &overmap_end,
19012 &trimmed_start);
19013 if (kr == KERN_SUCCESS) {
19014 if (target_copy_map->size != *phys_size) {
19015 DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
19016 }
19017 *phys_size = target_copy_map->size;
19018 } else {
19019 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19020 //assert(0);
19021 *phys_size = 0;
19022 }
19023 vm_map_copy_discard(copy_map);
19024 copy_map = VM_MAP_COPY_NULL;
19025
19026 return kr;
19027 }
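/*
 * Example (illustrative): for a 4K map queried from a 16K kernel, a
 * range like [0x5000, 0x7000) spans only 0x2000 bytes of 4K pages but
 * lies within a single 16K page, so once the extracted copy is
 * re-aligned for the kernel_map, *phys_size may come back as 0x4000
 * rather than 0x2000.
 */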
19028
19029
19030 kern_return_t
19031 memory_entry_check_for_adjustment(
19032 vm_map_t src_map,
19033 ipc_port_t port,
19034 vm_map_offset_t *overmap_start,
19035 vm_map_offset_t *overmap_end)
19036 {
19037 kern_return_t kr = KERN_SUCCESS;
19038 vm_map_copy_t copy_map = VM_MAP_COPY_NULL, target_copy_map = VM_MAP_COPY_NULL;
19039
19040 assert(port);
19041 assertf(ip_kotype(port) == IKOT_NAMED_ENTRY, "Port Type expected: %d...received:%d\n", IKOT_NAMED_ENTRY, ip_kotype(port));
19042
19043 vm_named_entry_t named_entry;
19044
19045 named_entry = mach_memory_entry_from_port(port);
19046 named_entry_lock(named_entry);
19047 copy_map = named_entry->backing.copy;
19048 target_copy_map = copy_map;
19049
19050 if (src_map && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT) {
19051 vm_map_offset_t trimmed_start;
19052
19053 trimmed_start = 0;
19054 DEBUG4K_ADJUST("adjusting...\n");
19055 kr = vm_map_copy_adjust_to_target(
19056 copy_map,
19057 0, /* offset */
19058 copy_map->size, /* size */
19059 src_map,
19060 FALSE, /* copy */
19061 &target_copy_map,
19062 overmap_start,
19063 overmap_end,
19064 &trimmed_start);
19065 assert(trimmed_start == 0);
19066 }
19067 named_entry_unlock(named_entry);
19068
19069 return kr;
19070 }
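/*
 * Call-site sketch (illustrative): before mapping a named entry on
 * behalf of a 4K task, ask how much over-mapping its copy_map would
 * need:
 *
 *	vm_map_offset_t ov_start = 0, ov_end = 0;
 *	kr = memory_entry_check_for_adjustment(src_map, port,
 *	    &ov_start, &ov_end);
 *	// nonzero ov_start/ov_end report the extra coverage needed at
 *	// each end to bridge the page-size difference.
 */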
19071
19072
19073 /*
19074 * Routine: vm_remap
19075 *
19076 * Map a portion of a task's address space.
19077 * The mapped region must not overlap more than
19078 * one vm memory object. Protection and
19079 * inheritance attributes remain the same
19080 * as in the original task and are returned as out parameters.
19081 * The source and target tasks can be identical.
19082 * Other attributes are identical as for vm_map().
19083 */
19084 kern_return_t
19085 vm_map_remap(
19086 vm_map_t target_map,
19087 vm_map_address_t *address,
19088 vm_map_size_t size,
19089 vm_map_offset_t mask,
19090 vm_map_kernel_flags_t vmk_flags,
19091 vm_map_t src_map,
19092 vm_map_offset_t memory_address,
19093 boolean_t copy,
19094 vm_prot_t *cur_protection, /* IN/OUT */
19095 vm_prot_t *max_protection, /* IN/OUT */
19096 vm_inherit_t inheritance)
19097 {
19098 kern_return_t result;
19099 vm_map_entry_t entry;
19100 vm_map_entry_t insp_entry = VM_MAP_ENTRY_NULL;
19101 vm_map_entry_t new_entry;
19102 vm_map_copy_t copy_map;
19103 vm_map_offset_t offset_in_mapping;
19104 vm_map_size_t target_size = 0;
19105 vm_map_size_t src_page_mask, target_page_mask;
19106 vm_map_offset_t overmap_start, overmap_end, trimmed_start;
19107 vm_map_offset_t initial_memory_address;
19108 vm_map_size_t initial_size;
19109 VM_MAP_ZAP_DECLARE(zap_list);
19110
19111 if (target_map == VM_MAP_NULL) {
19112 return KERN_INVALID_ARGUMENT;
19113 }
19114
19115 if (__improbable(vm_map_range_overflows(src_map, memory_address, size))) {
19116 return KERN_INVALID_ARGUMENT;
19117 }
19118
19119 if (__improbable((*cur_protection & *max_protection) != *cur_protection)) {
19120 /* cur is more permissive than max */
19121 return KERN_INVALID_ARGUMENT;
19122 }
19123
19124 initial_memory_address = memory_address;
19125 initial_size = size;
19126 src_page_mask = VM_MAP_PAGE_MASK(src_map);
19127 target_page_mask = VM_MAP_PAGE_MASK(target_map);
19128
19129 switch (inheritance) {
19130 case VM_INHERIT_NONE:
19131 case VM_INHERIT_COPY:
19132 case VM_INHERIT_SHARE:
19133 if (size != 0 && src_map != VM_MAP_NULL) {
19134 break;
19135 }
19136 OS_FALLTHROUGH;
19137 default:
19138 return KERN_INVALID_ARGUMENT;
19139 }
19140
19141 if (src_page_mask != target_page_mask) {
19142 if (copy) {
19143 DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19144 } else {
19145 DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19146 }
19147 }
19148
19149 /*
19150 * If the user is requesting that we return the address of the
19151 * first byte of the data (rather than the base of the page),
19152 * then we use different rounding semantics: specifically,
19153 * we assume that (memory_address, size) describes a region
19154 * all of whose pages we must cover, rather than a base to be truncated
19155 * down and a size to be added to that base. So we figure out
19156 * the highest page that the requested region includes and make
19157 * sure that the size will cover it.
19158 *
19159 * The key example we're worried about is of the form:
19160 *
19161 * memory_address = 0x1ff0, size = 0x20
19162 *
19163 * With the old semantics, we round down the memory_address to 0x1000
19164 * and round up the size to 0x1000, resulting in our covering *only*
19165 * page 0x1000. With the new semantics, we'd realize that the region covers
19166 * 0x1ff0-0x2010, and compute a size of 0x2000. Thus, we cover both page
19167 * 0x1000 and page 0x2000 in the region we remap.
19168 */
19169 if (vmk_flags.vmf_return_data_addr) {
19170 vm_map_offset_t range_start, range_end;
19171
19172 range_start = vm_map_trunc_page(memory_address, src_page_mask);
19173 range_end = vm_map_round_page(memory_address + size, src_page_mask);
19174 memory_address = range_start;
19175 size = range_end - range_start;
19176 offset_in_mapping = initial_memory_address - memory_address;
19177 } else {
19178 /*
19179 * IMPORTANT:
19180 * This legacy code path is broken: for the range mentioned
19181 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
19182 * two 4k pages, it yields [ memory_address = 0x1000,
19183 * size = 0x1000 ], which covers only the first 4k page.
19184 * BUT some code unfortunately depends on this bug, so we
19185 * can't fix it without breaking something.
19186 * New code is automatically opted into the new behavior
19187 * via the new VM_FLAGS_RETURN_DATA_ADDR flag.
19188 */
19189 offset_in_mapping = 0;
19190 memory_address = vm_map_trunc_page(memory_address, src_page_mask);
19191 size = vm_map_round_page(size, src_page_mask);
19192 initial_memory_address = memory_address;
19193 initial_size = size;
19194 }
19195
19196
19197 if (size == 0) {
19198 return KERN_INVALID_ARGUMENT;
19199 }
19200
19201 if (vmk_flags.vmf_resilient_media) {
19202 /* must be copy-on-write to be "media resilient" */
19203 if (!copy) {
19204 return KERN_INVALID_ARGUMENT;
19205 }
19206 }
19207
19208 vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
19209 vmk_flags.vmkf_copy_same_map = (src_map == target_map);
19210
19211 assert(size != 0);
19212 result = vm_map_copy_extract(src_map,
19213 memory_address,
19214 size,
19215 copy, &copy_map,
19216 cur_protection, /* IN/OUT */
19217 max_protection, /* IN/OUT */
19218 inheritance,
19219 vmk_flags);
19220 if (result != KERN_SUCCESS) {
19221 return result;
19222 }
19223 assert(copy_map != VM_MAP_COPY_NULL);
19224
19225 /*
19226 * Handle the policy for vm map ranges
19227 *
19228 * If the maps differ, the target_map policy applies like for vm_map()
19229 * For same mapping remaps, we preserve the range.
19230 */
19231 if (vmk_flags.vmkf_copy_same_map) {
19232 vmk_flags.vmkf_range_id = copy_map->orig_range;
19233 } else {
19234 vm_map_kernel_flags_update_range_id(&vmk_flags, target_map);
19235 }
19236
19237 overmap_start = 0;
19238 overmap_end = 0;
19239 trimmed_start = 0;
19240 target_size = size;
19241 if (src_page_mask != target_page_mask) {
19242 vm_map_copy_t target_copy_map;
19243
19244 target_copy_map = copy_map; /* can modify "copy_map" itself */
19245 DEBUG4K_ADJUST("adjusting...\n");
19246 result = vm_map_copy_adjust_to_target(
19247 copy_map,
19248 offset_in_mapping, /* offset */
19249 initial_size,
19250 target_map,
19251 copy,
19252 &target_copy_map,
19253 &overmap_start,
19254 &overmap_end,
19255 &trimmed_start);
19256 if (result != KERN_SUCCESS) {
19257 DEBUG4K_COPY("failed to adjust 0x%x\n", result);
19258 vm_map_copy_discard(copy_map);
19259 return result;
19260 }
19261 if (trimmed_start == 0) {
19262 /* nothing trimmed: no adjustment needed */
19263 } else if (trimmed_start >= offset_in_mapping) {
19264 /* trimmed more than offset_in_mapping: nothing left */
19265 assert(overmap_start == 0);
19266 assert(overmap_end == 0);
19267 offset_in_mapping = 0;
19268 } else {
19269 /* trimmed some of offset_in_mapping: adjust */
19270 assert(overmap_start == 0);
19271 assert(overmap_end == 0);
19272 offset_in_mapping -= trimmed_start;
19273 }
19274 offset_in_mapping += overmap_start;
19275 target_size = target_copy_map->size;
19276 }
19277
19278 /*
19279 * Allocate/check a range of free virtual address
19280 * space for the target
19281 */
19282 *address = vm_map_trunc_page(*address, target_page_mask);
19283 vm_map_lock(target_map);
19284 target_size = vm_map_round_page(target_size, target_page_mask);
19285 result = vm_map_remap_range_allocate(target_map, address,
19286 target_size, mask, vmk_flags,
19287 &insp_entry, &zap_list);
19288
19289 for (entry = vm_map_copy_first_entry(copy_map);
19290 entry != vm_map_copy_to_entry(copy_map);
19291 entry = new_entry) {
19292 new_entry = entry->vme_next;
19293 vm_map_copy_entry_unlink(copy_map, entry);
19294 if (result == KERN_SUCCESS) {
19295 if (vmk_flags.vmkf_remap_prot_copy) {
19296 /*
19297 * This vm_map_remap() is for a
19298 * vm_protect(VM_PROT_COPY), so the caller
19299 * expects to be allowed to add write access
19300 * to this new mapping. This is done by
19301 * adding VM_PROT_WRITE to each entry's
19302 * max_protection... unless some security
19303 * settings disallow it.
19304 */
19305 bool allow_write = false;
19306 if (entry->vme_permanent) {
19307 /* immutable mapping... */
19308 if ((entry->max_protection & VM_PROT_EXECUTE) &&
19309 developer_mode_state()) {
19310 /*
19311 * ... but executable and
19312 * possibly being debugged,
19313 * so let's allow it to become
19314 * writable, for breakpoints
19315 * and dtrace probes, for
19316 * example.
19317 */
19318 allow_write = true;
19319 } else {
19320 printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
19321 proc_selfpid(),
19322 (get_bsdtask_info(current_task())
19323 ? proc_name_address(get_bsdtask_info(current_task()))
19324 : "?"),
19325 (uint64_t)memory_address,
19326 (uint64_t)size,
19327 entry->protection,
19328 entry->max_protection,
19329 developer_mode_state());
19330 DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
19331 vm_map_entry_t, entry,
19332 vm_map_offset_t, entry->vme_start,
19333 vm_map_offset_t, entry->vme_end,
19334 vm_prot_t, entry->protection,
19335 vm_prot_t, entry->max_protection,
19336 int, VME_ALIAS(entry));
19337 }
19338 } else {
19339 allow_write = true;
19340 }
19341
19342 /*
19343 * VM_PROT_COPY: allow this mapping to become
19344 * writable, unless it was "permanent".
19345 */
19346 if (allow_write) {
19347 entry->max_protection |= VM_PROT_WRITE;
19348 }
19349 }
19350 if (vmk_flags.vmf_resilient_codesign) {
19351 /* no codesigning -> read-only access */
19352 entry->max_protection = VM_PROT_READ;
19353 entry->protection = VM_PROT_READ;
19354 entry->vme_resilient_codesign = TRUE;
19355 }
19356 entry->vme_start += *address;
19357 entry->vme_end += *address;
19358 assert(!entry->map_aligned);
19359 if (vmk_flags.vmf_resilient_media &&
19360 !entry->is_sub_map &&
19361 (VME_OBJECT(entry) == VM_OBJECT_NULL ||
19362 VME_OBJECT(entry)->internal)) {
19363 entry->vme_resilient_media = TRUE;
19364 }
19365 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
19366 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
19367 assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
19368 vm_map_store_entry_link(target_map, insp_entry, entry,
19369 vmk_flags);
19370 insp_entry = entry;
19371 } else {
19372 if (!entry->is_sub_map) {
19373 vm_object_deallocate(VME_OBJECT(entry));
19374 } else {
19375 vm_map_deallocate(VME_SUBMAP(entry));
19376 }
19377 vm_map_copy_entry_dispose(entry);
19378 }
19379 }
19380
19381 if (vmk_flags.vmf_resilient_codesign) {
19382 *cur_protection = VM_PROT_READ;
19383 *max_protection = VM_PROT_READ;
19384 }
19385
19386 if (result == KERN_SUCCESS) {
19387 target_map->size += target_size;
19388 SAVE_HINT_MAP_WRITE(target_map, insp_entry);
19389 }
19390 vm_map_unlock(target_map);
19391
19392 vm_map_zap_dispose(&zap_list);
19393
19394 if (result == KERN_SUCCESS && target_map->wiring_required) {
19395 result = vm_map_wire_kernel(target_map, *address,
19396 *address + size, *cur_protection, VM_KERN_MEMORY_MLOCK,
19397 TRUE);
19398 }
19399
19400 /*
19401 * If requested, return the address of the data pointed to by the
19402 * request, rather than the base of the resulting page.
19403 */
19404 if (vmk_flags.vmf_return_data_addr) {
19405 *address += offset_in_mapping;
19406 }
19407
19408 if (src_page_mask != target_page_mask) {
19409 DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)size, copy, target_map, (uint64_t)*address, (uint64_t)offset_in_mapping, result);
19410 }
19411 vm_map_copy_discard(copy_map);
19412 copy_map = VM_MAP_COPY_NULL;
19413
19414 return result;
19415 }
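/*
 * Minimal usage sketch (illustrative): share (not copy) a range of
 * "src_map" into "target_map" at an anywhere-allocated address,
 * assuming the VM_MAP_KERNEL_FLAGS_ANYWHERE() convenience initializer
 * used earlier in this file:
 *
 *	vm_map_address_t dst = 0;
 *	vm_prot_t cur = VM_PROT_NONE, max = VM_PROT_NONE;	// legacy mode
 *	kr = vm_map_remap(target_map, &dst, size, 0,
 *	    VM_MAP_KERNEL_FLAGS_ANYWHERE(), src_map, addr,
 *	    FALSE,	// copy: FALSE shares the original pages
 *	    &cur, &max, VM_INHERIT_DEFAULT);
 *	// on success, "cur"/"max" report the protections actually
 *	// carried over from the source mappings.
 */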
19416
19417 /*
19418 * Routine: vm_map_remap_range_allocate
19419 *
19420 * Description:
19421 * Allocate a range in the specified virtual address map.
19422 * returns the address and the map entry just before the allocated
19423 * range
19424 *
19425 * Map must be locked.
19426 */
19427
19428 static kern_return_t
19429 vm_map_remap_range_allocate(
19430 vm_map_t map,
19431 vm_map_address_t *address, /* IN/OUT */
19432 vm_map_size_t size,
19433 vm_map_offset_t mask,
19434 vm_map_kernel_flags_t vmk_flags,
19435 vm_map_entry_t *map_entry, /* OUT */
19436 vm_map_zap_t zap_list)
19437 {
19438 vm_map_entry_t entry;
19439 vm_map_offset_t start;
19440 kern_return_t kr;
19441
19442 start = *address;
19443
19444 if (!vmk_flags.vmf_fixed) {
19445 kr = vm_map_locate_space(map, size, mask, vmk_flags,
19446 &start, &entry);
19447 if (kr != KERN_SUCCESS) {
19448 return kr;
19449 }
19450 *address = start;
19451 } else {
19452 vm_map_offset_t effective_min_offset, effective_max_offset;
19453 vm_map_entry_t temp_entry;
19454 vm_map_offset_t end;
19455
19456 effective_min_offset = map->min_offset;
19457 effective_max_offset = map->max_offset;
19458
19459 /*
19460 * Verify that:
19461 * the address doesn't itself violate
19462 * the mask requirement.
19463 */
19464
19465 if ((start & mask) != 0) {
19466 return KERN_NO_SPACE;
19467 }
19468
19469 #if CONFIG_MAP_RANGES
19470 if (map->uses_user_ranges) {
19471 struct mach_vm_range r;
19472
19473 vm_map_user_range_resolve(map, start, 1, &r);
19474 if (r.max_address == 0) {
19475 return KERN_INVALID_ADDRESS;
19476 }
19477
19478 effective_min_offset = r.min_address;
19479 effective_max_offset = r.max_address;
19480 }
19481 #endif /* CONFIG_MAP_RANGES */
19482 if (map == kernel_map) {
19483 mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
19484 effective_min_offset = r->min_address;
19485 effective_max_offset = r->max_address;
19486 }
19487
19488 /*
19489 * ... the address is within bounds
19490 */
19491
19492 end = start + size;
19493
19494 if ((start < effective_min_offset) ||
19495 (end > effective_max_offset) ||
19496 (start >= end)) {
19497 return KERN_INVALID_ADDRESS;
19498 }
19499
19500 /*
19501 * If we're asked to overwrite whatever was mapped in that
19502 * range, first deallocate that range.
19503 */
19504 if (vmk_flags.vmf_overwrite) {
19505 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
19506
19507 /*
19508 * We use a "zap_list" to avoid having to unlock
19509 * the "map" in vm_map_delete(), which would compromise
19510 * the atomicity of the "deallocate" and then "remap"
19511 * combination.
19512 */
19513 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
19514
19515 if (vmk_flags.vmkf_overwrite_immutable) {
19516 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
19517 }
19518 if (vmk_flags.vmkf_remap_prot_copy) {
19519 remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
19520 }
19521 kr = vm_map_delete(map, start, end, remove_flags,
19522 KMEM_GUARD_NONE, zap_list).kmr_return;
19523 if (kr != KERN_SUCCESS) {
19524 /* XXX FBDP restore zap_list? */
19525 return kr;
19526 }
19527 }
19528
19529 /*
19530 * ... the starting address isn't allocated
19531 */
19532
19533 if (vm_map_lookup_entry(map, start, &temp_entry)) {
19534 return KERN_NO_SPACE;
19535 }
19536
19537 entry = temp_entry;
19538
19539 /*
19540 * ... the next region doesn't overlap the
19541 * end point.
19542 */
19543
19544 if ((entry->vme_next != vm_map_to_entry(map)) &&
19545 (entry->vme_next->vme_start < end)) {
19546 return KERN_NO_SPACE;
19547 }
19548 }
19549 *map_entry = entry;
19550 return KERN_SUCCESS;
19551 }
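/*
 * Sketch of the fixed-overwrite path (illustrative): with vmf_fixed
 * and vmf_overwrite set, the displaced mappings are parked on the
 * caller's zap list and only destroyed after the map lock is dropped,
 * as vm_map_remap() does above:
 *
 *	VM_MAP_ZAP_DECLARE(zap_list);
 *	vm_map_lock(map);
 *	kr = vm_map_remap_range_allocate(map, &addr, size, 0,
 *	    vmk_flags, &where, &zap_list);
 *	...
 *	vm_map_unlock(map);
 *	vm_map_zap_dispose(&zap_list);	// free old mappings, unlocked
 */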
19552
19553 /*
19554 * vm_map_switch:
19555 *
19556 * Set the address map for the current thread to the specified map
19557 */
19558
19559 vm_map_t
19560 vm_map_switch(
19561 vm_map_t map)
19562 {
19563 thread_t thread = current_thread();
19564 vm_map_t oldmap = thread->map;
19565
19566
19567 /*
19568 * Deactivate the current map and activate the requested map
19569 */
19570 mp_disable_preemption();
19571 PMAP_SWITCH_USER(thread, map, cpu_number());
19572 mp_enable_preemption();
19573 return oldmap;
19574 }
19575
19576
19577 /*
19578 * Routine: vm_map_write_user
19579 *
19580 * Description:
19581 * Copy out data from a kernel space into space in the
19582 * destination map. The space must already exist in the
19583 * destination map.
19584 * NOTE: This routine should only be called by threads
19585 * that can block on a page fault, i.e. kernel-mode user
19586 * threads.
19587 *
19588 */
19589 kern_return_t
19590 vm_map_write_user(
19591 vm_map_t map,
19592 void *src_p,
19593 vm_map_address_t dst_addr,
19594 vm_size_t size)
19595 {
19596 kern_return_t kr = KERN_SUCCESS;
19597
19598 if (__improbable(vm_map_range_overflows(map, dst_addr, size))) {
19599 return KERN_INVALID_ADDRESS;
19600 }
19601
19602 if (current_map() == map) {
19603 if (copyout(src_p, dst_addr, size)) {
19604 kr = KERN_INVALID_ADDRESS;
19605 }
19606 } else {
19607 vm_map_t oldmap;
19608
19609 /* take on the identity of the target map while doing */
19610 /* the transfer */
19611
19612 vm_map_reference(map);
19613 oldmap = vm_map_switch(map);
19614 if (copyout(src_p, dst_addr, size)) {
19615 kr = KERN_INVALID_ADDRESS;
19616 }
19617 vm_map_switch(oldmap);
19618 vm_map_deallocate(map);
19619 }
19620 return kr;
19621 }
19622
19623 /*
19624 * Routine: vm_map_read_user
19625 *
19626 * Description:
19627 * Copy in data from a user space source map into the
19628 * kernel map. The space must already exist in the
19629 * kernel map.
19630 * NOTE: This routine should only be called by threads
19631 * that can block on a page fault, i.e. kernel-mode user
19632 * threads.
19633 *
19634 */
19635 kern_return_t
19636 vm_map_read_user(
19637 vm_map_t map,
19638 vm_map_address_t src_addr,
19639 void *dst_p,
19640 vm_size_t size)
19641 {
19642 kern_return_t kr = KERN_SUCCESS;
19643
19644 if (__improbable(vm_map_range_overflows(map, src_addr, size))) {
19645 return KERN_INVALID_ADDRESS;
19646 }
19647
19648 if (current_map() == map) {
19649 if (copyin(src_addr, dst_p, size)) {
19650 kr = KERN_INVALID_ADDRESS;
19651 }
19652 } else {
19653 vm_map_t oldmap;
19654
19655 /* take on the identity of the target map while doing */
19656 /* the transfer */
19657
19658 vm_map_reference(map);
19659 oldmap = vm_map_switch(map);
19660 if (copyin(src_addr, dst_p, size)) {
19661 kr = KERN_INVALID_ADDRESS;
19662 }
19663 vm_map_switch(oldmap);
19664 vm_map_deallocate(map);
19665 }
19666 return kr;
19667 }
19668
19669
19670 /*
19671 * vm_map_check_protection:
19672 *
19673 * Assert that the target map allows the specified
19674 * privilege on the entire address region given.
19675 * The entire region must be allocated.
19676 */
19677 boolean_t
19678 vm_map_check_protection(vm_map_t map, vm_map_offset_t start,
19679 vm_map_offset_t end, vm_prot_t protection)
19680 {
19681 vm_map_entry_t entry;
19682 vm_map_entry_t tmp_entry;
19683
19684 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
19685 return FALSE;
19686 }
19687
19688 vm_map_lock(map);
19689
19690 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
19691 vm_map_unlock(map);
19692 return FALSE;
19693 }
19694
19695 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19696 vm_map_unlock(map);
19697 return FALSE;
19698 }
19699
19700 entry = tmp_entry;
19701
19702 while (start < end) {
19703 if (entry == vm_map_to_entry(map)) {
19704 vm_map_unlock(map);
19705 return FALSE;
19706 }
19707
19708 /*
19709 * No holes allowed!
19710 */
19711
19712 if (start < entry->vme_start) {
19713 vm_map_unlock(map);
19714 return FALSE;
19715 }
19716
19717 /*
19718 * Check protection associated with entry.
19719 */
19720
19721 if ((entry->protection & protection) != protection) {
19722 vm_map_unlock(map);
19723 return FALSE;
19724 }
19725
19726 /* go to next entry */
19727
19728 start = entry->vme_end;
19729 entry = entry->vme_next;
19730 }
19731 vm_map_unlock(map);
19732 return TRUE;
19733 }
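/*
 * Example (illustrative): reject a request unless an entire user
 * range is mapped with read access. Note the result is only valid at
 * the instant it is computed, since the map lock is dropped on return:
 *
 *	if (!vm_map_check_protection(map, start, end, VM_PROT_READ)) {
 *		return KERN_PROTECTION_FAILURE;
 *	}
 */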
19734
19735 kern_return_t
19736 vm_map_purgable_control(
19737 vm_map_t map,
19738 vm_map_offset_t address,
19739 vm_purgable_t control,
19740 int *state)
19741 {
19742 vm_map_entry_t entry;
19743 vm_object_t object;
19744 kern_return_t kr;
19745 boolean_t was_nonvolatile;
19746
19747 /*
19748 * Vet all the input parameters and current type and state of the
19749 * underlying object. Return with an error if anything is amiss.
19750 */
19751 if (map == VM_MAP_NULL) {
19752 return KERN_INVALID_ARGUMENT;
19753 }
19754
19755 if (control != VM_PURGABLE_SET_STATE &&
19756 control != VM_PURGABLE_GET_STATE &&
19757 control != VM_PURGABLE_PURGE_ALL &&
19758 control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
19759 return KERN_INVALID_ARGUMENT;
19760 }
19761
19762 if (control == VM_PURGABLE_PURGE_ALL) {
19763 vm_purgeable_object_purge_all();
19764 return KERN_SUCCESS;
19765 }
19766
19767 if ((control == VM_PURGABLE_SET_STATE ||
19768 control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
19769 (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
19770 ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
19771 return KERN_INVALID_ARGUMENT;
19772 }
19773
19774 vm_map_lock_read(map);
19775
19776 if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
19777 /*
19778 * Must pass a valid non-submap address.
19779 */
19780 vm_map_unlock_read(map);
19781 return KERN_INVALID_ADDRESS;
19782 }
19783
19784 if ((entry->protection & VM_PROT_WRITE) == 0 &&
19785 control != VM_PURGABLE_GET_STATE) {
19786 /*
19787 * Can't apply purgable controls to something you can't write.
19788 */
19789 vm_map_unlock_read(map);
19790 return KERN_PROTECTION_FAILURE;
19791 }
19792
19793 object = VME_OBJECT(entry);
19794 if (object == VM_OBJECT_NULL ||
19795 object->purgable == VM_PURGABLE_DENY) {
19796 /*
19797 * Object must already be present and be purgeable.
19798 */
19799 vm_map_unlock_read(map);
19800 return KERN_INVALID_ARGUMENT;
19801 }
19802
19803 vm_object_lock(object);
19804
19805 #if 00
19806 if (VME_OFFSET(entry) != 0 ||
19807 entry->vme_end - entry->vme_start != object->vo_size) {
19808 /*
19809 * Can only apply purgable controls to the whole (existing)
19810 * object at once.
19811 */
19812 vm_map_unlock_read(map);
19813 vm_object_unlock(object);
19814 return KERN_INVALID_ARGUMENT;
19815 }
19816 #endif
19817
19818 assert(!entry->is_sub_map);
19819 assert(!entry->use_pmap); /* purgeable has its own accounting */
19820
19821 vm_map_unlock_read(map);
19822
19823 was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);
19824
19825 kr = vm_object_purgable_control(object, control, state);
19826
19827 if (was_nonvolatile &&
19828 object->purgable != VM_PURGABLE_NONVOLATILE &&
19829 map->pmap == kernel_pmap) {
19830 #if DEBUG
19831 object->vo_purgeable_volatilizer = kernel_task;
19832 #endif /* DEBUG */
19833 }
19834
19835 vm_object_unlock(object);
19836
19837 return kr;
19838 }
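/*
 * Usage sketch (illustrative): make the purgeable object backing
 * "addr" volatile so it may be reclaimed under memory pressure:
 *
 *	int state = VM_PURGABLE_VOLATILE;
 *	kr = vm_map_purgable_control(map, addr,
 *	    VM_PURGABLE_SET_STATE, &state);
 *	// on success, the object's previous state is reported back
 *	// through "state".
 */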
19839
19840 void
19841 vm_map_footprint_query_page_info(
19842 vm_map_t map,
19843 vm_map_entry_t map_entry,
19844 vm_map_offset_t curr_s_offset,
19845 int *disposition_p)
19846 {
19847 int pmap_disp;
19848 vm_object_t object = VM_OBJECT_NULL;
19849 int disposition;
19850 int effective_page_size;
19851
19852 vm_map_lock_assert_held(map);
19853 assert(!map->has_corpse_footprint);
19854 assert(curr_s_offset >= map_entry->vme_start);
19855 assert(curr_s_offset < map_entry->vme_end);
19856
19857 if (map_entry->is_sub_map) {
19858 if (!map_entry->use_pmap) {
19859 /* nested pmap: no footprint */
19860 *disposition_p = 0;
19861 return;
19862 }
19863 } else {
19864 object = VME_OBJECT(map_entry);
19865 if (object == VM_OBJECT_NULL) {
19866 /* nothing mapped here: no need to ask */
19867 *disposition_p = 0;
19868 return;
19869 }
19870 }
19871
19872 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
19873
19874 pmap_disp = 0;
19875
19876 /*
19877 * Query the pmap.
19878 */
19879 pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);
19880
19881 /*
19882 * Compute this page's disposition.
19883 */
19884 disposition = 0;
19885
19886 /* deal with "alternate accounting" first */
19887 if (!map_entry->is_sub_map &&
19888 object->vo_no_footprint) {
19889 /* does not count in footprint */
19890 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19891 } else if (!map_entry->is_sub_map &&
19892 (object->purgable == VM_PURGABLE_NONVOLATILE ||
19893 (object->purgable == VM_PURGABLE_DENY &&
19894 object->vo_ledger_tag)) &&
19895 VM_OBJECT_OWNER(object) != NULL &&
19896 VM_OBJECT_OWNER(object)->map == map) {
19897 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19898 if ((((curr_s_offset
19899 - map_entry->vme_start
19900 + VME_OFFSET(map_entry))
19901 / effective_page_size) <
19902 (object->resident_page_count +
19903 vm_compressor_pager_get_count(object->pager)))) {
19904 /*
19905 * Non-volatile purgeable object owned
19906 * by this task: report the first
19907 * "#resident + #compressed" pages as
19908 * "resident" (to show that they
19909 * contribute to the footprint) but not
19910 * "dirty" (to avoid double-counting
19911 * with the fake "non-volatile" region
19912 * we'll report at the end of the
19913 * address space to account for all
19914 * (mapped or not) non-volatile memory
19915 * owned by this task.
19916 */
19917 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19918 }
19919 } else if (!map_entry->is_sub_map &&
19920 (object->purgable == VM_PURGABLE_VOLATILE ||
19921 object->purgable == VM_PURGABLE_EMPTY) &&
19922 VM_OBJECT_OWNER(object) != NULL &&
19923 VM_OBJECT_OWNER(object)->map == map) {
19924 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19925 if ((((curr_s_offset
19926 - map_entry->vme_start
19927 + VME_OFFSET(map_entry))
19928 / effective_page_size) <
19929 object->wired_page_count)) {
19930 /*
19931 * Volatile|empty purgeable object owned
19932 * by this task: report the first
19933 * "#wired" pages as "resident" (to
19934 * show that they contribute to the
19935 * footprint) but not "dirty" (to avoid
19936 * double-counting with the fake
19937 * "non-volatile" region we'll report
19938 * at the end of the address space to
19939 * account for all (mapped or not)
19940 * non-volatile memory owned by this
19941 * task.
19942 */
19943 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19944 }
19945 } else if (!map_entry->is_sub_map &&
19946 map_entry->iokit_acct &&
19947 object->internal &&
19948 object->purgable == VM_PURGABLE_DENY) {
19949 /*
19950 * Non-purgeable IOKit memory: phys_footprint
19951 * includes the entire virtual mapping.
19952 */
19953 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19954 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19955 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19956 } else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
19957 PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
19958 /* alternate accounting */
19959 #if __arm64__ && (DEVELOPMENT || DEBUG)
19960 if (map->pmap->footprint_was_suspended) {
19961 /*
19962 * The assertion below can fail if dyld
19963 * suspended footprint accounting
19964 * while doing some adjustments to
19965 * this page; the mapping would say
19966 * "use pmap accounting" but the page
19967 * would be marked "alternate
19968 * accounting".
19969 */
19970 } else
19971 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
19972 {
19973 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19974 }
19975 disposition = 0;
19976 } else {
19977 if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
19978 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19979 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19980 disposition |= VM_PAGE_QUERY_PAGE_REF;
19981 if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
19982 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19983 } else {
19984 disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
19985 }
19986 if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
19987 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
19988 }
19989 } else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
19990 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19991 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
19992 }
19993 }
19994
19995 *disposition_p = disposition;
19996 }
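/*
 * Interpretation sketch (illustrative): for an ordinary anonymous page
 * that is resident and dirty, the pmap-backed path above reports
 *
 *	VM_PAGE_QUERY_PAGE_PRESENT | VM_PAGE_QUERY_PAGE_REF |
 *	VM_PAGE_QUERY_PAGE_DIRTY
 *
 * while a page sitting in the compressor reports only
 * VM_PAGE_QUERY_PAGE_PAGED_OUT.
 */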
19997
19998 kern_return_t
19999 vm_map_page_query_internal(
20000 vm_map_t target_map,
20001 vm_map_offset_t offset,
20002 int *disposition,
20003 int *ref_count)
20004 {
20005 kern_return_t kr;
20006 vm_page_info_basic_data_t info;
20007 mach_msg_type_number_t count;
20008
20009 count = VM_PAGE_INFO_BASIC_COUNT;
20010 kr = vm_map_page_info(target_map,
20011 offset,
20012 VM_PAGE_INFO_BASIC,
20013 (vm_page_info_t) &info,
20014 &count);
20015 if (kr == KERN_SUCCESS) {
20016 *disposition = info.disposition;
20017 *ref_count = info.ref_count;
20018 } else {
20019 *disposition = 0;
20020 *ref_count = 0;
20021 }
20022
20023 return kr;
20024 }
20025
20026 kern_return_t
20027 vm_map_page_info(
20028 vm_map_t map,
20029 vm_map_offset_t offset,
20030 vm_page_info_flavor_t flavor,
20031 vm_page_info_t info,
20032 mach_msg_type_number_t *count)
20033 {
20034 return vm_map_page_range_info_internal(map,
20035 offset, /* start of range */
20036 (offset + 1), /* this will get rounded up to the page boundary in the call */
20037 (int)-1, /* effective_page_shift: unspecified */
20038 flavor,
20039 info,
20040 count);
20041 }
20042
20043 kern_return_t
20044 vm_map_page_range_info_internal(
20045 vm_map_t map,
20046 vm_map_offset_t start_offset,
20047 vm_map_offset_t end_offset,
20048 int effective_page_shift,
20049 vm_page_info_flavor_t flavor,
20050 vm_page_info_t info,
20051 mach_msg_type_number_t *count)
20052 {
20053 vm_map_entry_t map_entry = VM_MAP_ENTRY_NULL;
20054 vm_object_t object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
20055 vm_page_t m = VM_PAGE_NULL;
20056 kern_return_t retval = KERN_SUCCESS;
20057 int disposition = 0;
20058 int ref_count = 0;
20059 int depth = 0, info_idx = 0;
20060 vm_page_info_basic_t basic_info = 0;
20061 vm_map_offset_t offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
20062 vm_map_offset_t start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
20063 boolean_t do_region_footprint;
20064 ledger_amount_t ledger_resident, ledger_compressed;
20065 int effective_page_size;
20066 vm_map_offset_t effective_page_mask;
20067
20068 switch (flavor) {
20069 case VM_PAGE_INFO_BASIC:
20070 if (*count != VM_PAGE_INFO_BASIC_COUNT) {
20071 /*
20072 * The "vm_page_info_basic_data" structure was not
20073 * properly padded, so allow the size to be off by
20074 * one to maintain backwards binary compatibility...
20075 */
20076 if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
20077 return KERN_INVALID_ARGUMENT;
20078 }
20079 }
20080 break;
20081 default:
20082 return KERN_INVALID_ARGUMENT;
20083 }
20084
20085 if (effective_page_shift == -1) {
20086 effective_page_shift = vm_self_region_page_shift_safely(map);
20087 if (effective_page_shift == -1) {
20088 return KERN_INVALID_ARGUMENT;
20089 }
20090 }
20091 effective_page_size = (1 << effective_page_shift);
20092 effective_page_mask = effective_page_size - 1;
20093
20094 do_region_footprint = task_self_region_footprint();
20095 disposition = 0;
20096 ref_count = 0;
20097 depth = 0;
20098 info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
20099 retval = KERN_SUCCESS;
20100
20101 if (__improbable(vm_map_range_overflows(map, start_offset, end_offset - start_offset))) {
20102 return KERN_INVALID_ADDRESS;
20103 }
20104
20105 offset_in_page = start_offset & effective_page_mask;
20106 start = vm_map_trunc_page(start_offset, effective_page_mask);
20107 end = vm_map_round_page(end_offset, effective_page_mask);
20108
20109 if (end < start) {
20110 return KERN_INVALID_ARGUMENT;
20111 }
20112
20113 assert((end - start) <= MAX_PAGE_RANGE_QUERY);
20114
20115 vm_map_lock_read(map);
20116
20117 task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
20118
20119 for (curr_s_offset = start; curr_s_offset < end;) {
20120 /*
20121 * New lookup needs reset of these variables.
20122 */
20123 curr_object = object = VM_OBJECT_NULL;
20124 offset_in_object = 0;
20125 ref_count = 0;
20126 depth = 0;
20127
20128 if (do_region_footprint &&
20129 curr_s_offset >= vm_map_last_entry(map)->vme_end) {
20130 /*
20131 * Request for "footprint" info about a page beyond
20132 * the end of address space: this must be for
20133 * the fake region vm_map_region_recurse_64()
20134 * reported to account for non-volatile purgeable
20135 * memory owned by this task.
20136 */
20137 disposition = 0;
20138
20139 if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
20140 (unsigned) ledger_compressed) {
20141 /*
20142 * We haven't reported all the "non-volatile
20143 * compressed" pages yet, so report this fake
20144 * page as "compressed".
20145 */
20146 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20147 } else {
20148 /*
20149 * We've reported all the non-volatile
20150 * compressed pages but not all the non-volatile
20151 * pages, so report this fake page as
20152 * "resident dirty".
20153 */
20154 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20155 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20156 disposition |= VM_PAGE_QUERY_PAGE_REF;
20157 }
20158 switch (flavor) {
20159 case VM_PAGE_INFO_BASIC:
20160 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20161 basic_info->disposition = disposition;
20162 basic_info->ref_count = 1;
20163 basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20164 basic_info->offset = 0;
20165 basic_info->depth = 0;
20166
20167 info_idx++;
20168 break;
20169 }
20170 curr_s_offset += effective_page_size;
20171 continue;
20172 }
20173
20174 /*
20175 * First, find the map entry covering "curr_s_offset", going down
20176 * submaps if necessary.
20177 */
20178 if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
20179 /* no entry -> no object -> no page */
20180
20181 if (curr_s_offset < vm_map_min(map)) {
20182 /*
20183 * Illegal address that falls below map min.
20184 */
20185 curr_e_offset = MIN(end, vm_map_min(map));
20186 } else if (curr_s_offset >= vm_map_max(map)) {
20187 /*
20188 * Illegal address that falls on/after map max.
20189 */
20190 curr_e_offset = end;
20191 } else if (map_entry == vm_map_to_entry(map)) {
20192 /*
20193 * Hit a hole.
20194 */
20195 if (map_entry->vme_next == vm_map_to_entry(map)) {
20196 /*
20197 * Empty map.
20198 */
20199 curr_e_offset = MIN(map->max_offset, end);
20200 } else {
20201 /*
20202 * Hole at start of the map.
20203 */
20204 curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20205 }
20206 } else {
20207 if (map_entry->vme_next == vm_map_to_entry(map)) {
20208 /*
20209 * Hole at the end of the map.
20210 */
20211 curr_e_offset = MIN(map->max_offset, end);
20212 } else {
20213 curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20214 }
20215 }
20216
20217 assert(curr_e_offset >= curr_s_offset);
20218
20219 uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20220
20221 void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20222
20223 bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20224
20225 curr_s_offset = curr_e_offset;
20226
20227 info_idx += num_pages;
20228
20229 continue;
20230 }
20231
20232 /* compute offset from this map entry's start */
20233 offset_in_object = curr_s_offset - map_entry->vme_start;
20234
20235 /* compute offset into this map entry's object (or submap) */
20236 offset_in_object += VME_OFFSET(map_entry);
20237
20238 if (map_entry->is_sub_map) {
20239 vm_map_t sub_map = VM_MAP_NULL;
20240 vm_page_info_t submap_info = 0;
20241 vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
20242
20243 range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
20244
20245 submap_s_offset = offset_in_object;
20246 submap_e_offset = submap_s_offset + range_len;
20247
20248 sub_map = VME_SUBMAP(map_entry);
20249
20250 vm_map_reference(sub_map);
20251 vm_map_unlock_read(map);
20252
20253 submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20254
20255 assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
20256 "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
20257
20258 retval = vm_map_page_range_info_internal(sub_map,
20259 submap_s_offset,
20260 submap_e_offset,
20261 effective_page_shift,
20262 VM_PAGE_INFO_BASIC,
20263 (vm_page_info_t) submap_info,
20264 count);
20265
20266 assert(retval == KERN_SUCCESS);
20267
20268 vm_map_lock_read(map);
20269 vm_map_deallocate(sub_map);
20270
20271 /* Move the "info" index by the number of pages we inspected.*/
20272 info_idx += range_len >> effective_page_shift;
20273
20274 /* Move our current offset by the size of the range we inspected.*/
20275 curr_s_offset += range_len;
20276
20277 continue;
20278 }
20279
20280 object = VME_OBJECT(map_entry);
20281
20282 if (object == VM_OBJECT_NULL) {
20283 /*
20284 * We don't have an object here and, hence,
20285 * no pages to inspect. We'll fill up the
20286 * info structure appropriately.
20287 */
20288
20289 curr_e_offset = MIN(map_entry->vme_end, end);
20290
20291 uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20292
20293 void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20294
20295 bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20296
20297 curr_s_offset = curr_e_offset;
20298
20299 info_idx += num_pages;
20300
20301 continue;
20302 }
20303
20304 if (do_region_footprint) {
20305 disposition = 0;
20306 if (map->has_corpse_footprint) {
20307 /*
20308 * Query the page info data we saved
20309 * while forking the corpse.
20310 */
20311 vm_map_corpse_footprint_query_page_info(
20312 map,
20313 curr_s_offset,
20314 &disposition);
20315 } else {
20316 /*
20317 * Query the live pmap for footprint info
20318 * about this page.
20319 */
20320 vm_map_footprint_query_page_info(
20321 map,
20322 map_entry,
20323 curr_s_offset,
20324 &disposition);
20325 }
20326 switch (flavor) {
20327 case VM_PAGE_INFO_BASIC:
20328 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20329 basic_info->disposition = disposition;
20330 basic_info->ref_count = 1;
20331 basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20332 basic_info->offset = 0;
20333 basic_info->depth = 0;
20334
20335 info_idx++;
20336 break;
20337 }
20338 curr_s_offset += effective_page_size;
20339 continue;
20340 }
20341
20342 vm_object_reference(object);
20343 /*
20344 * Shared mode -- so we can allow other readers
20345 * to grab the lock too.
20346 */
20347 vm_object_lock_shared(object);
20348
20349 curr_e_offset = MIN(map_entry->vme_end, end);
20350
20351 vm_map_unlock_read(map);
20352
20353 map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
20354
20355 curr_object = object;
20356
20357 for (; curr_s_offset < curr_e_offset;) {
20358 if (object == curr_object) {
20359 ref_count = curr_object->ref_count - 1; /* account for our object reference above. */
20360 } else {
20361 ref_count = curr_object->ref_count;
20362 }
20363
20364 curr_offset_in_object = offset_in_object;
20365
20366 for (;;) {
20367 m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
20368
20369 if (m != VM_PAGE_NULL) {
20370 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20371 break;
20372 } else {
20373 if (curr_object->internal &&
20374 curr_object->alive &&
20375 !curr_object->terminating &&
20376 curr_object->pager_ready) {
20377 if (VM_COMPRESSOR_PAGER_STATE_GET(curr_object, vm_object_trunc_page(curr_offset_in_object))
20378 == VM_EXTERNAL_STATE_EXISTS) {
20379 /* the pager has that page */
20380 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20381 break;
20382 }
20383 }
20384
20385 /*
20386 * Go down the VM object shadow chain until we find the page
20387 * we're looking for.
20388 */
20389
20390 if (curr_object->shadow != VM_OBJECT_NULL) {
20391 vm_object_t shadow = VM_OBJECT_NULL;
20392
20393 curr_offset_in_object += curr_object->vo_shadow_offset;
20394 shadow = curr_object->shadow;
20395
20396 vm_object_lock_shared(shadow);
20397 vm_object_unlock(curr_object);
20398
20399 curr_object = shadow;
20400 depth++;
20401 continue;
20402 } else {
20403 break;
20404 }
20405 }
20406 }
20407
20408 /*
20409 * The ref_count is not strictly accurate: it counts the entities
20410 * holding a reference on the object, which may not be mapping the
20411 * object or the section holding the target page. It's still a
20412 * ballpark number and, though an overcount, it picks up the
20413 * copy-on-write cases.
20414 *
20415 * We could also get a picture of page sharing from pmap_attributes,
20416 * but that would undercount, as only faulted-in mappings show up. */
20417
20418 if ((curr_object == object) && curr_object->shadow) {
20419 disposition |= VM_PAGE_QUERY_PAGE_COPIED;
20420 }
20421
20422 if (!curr_object->internal) {
20423 disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20424 }
20425
20426 if (m != VM_PAGE_NULL) {
20427 if (m->vmp_fictitious) {
20428 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
20429 } else {
20430 if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
20431 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20432 }
20433
20434 if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
20435 disposition |= VM_PAGE_QUERY_PAGE_REF;
20436 }
20437
20438 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
20439 disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
20440 }
20441
20442 /*
20443 * XXX TODO4K:
20444 * when this routine deals with 4k
20445 * pages, check the appropriate CS bit
20446 * here.
20447 */
20448 if (m->vmp_cs_validated) {
20449 disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
20450 }
20451 if (m->vmp_cs_tainted) {
20452 disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
20453 }
20454 if (m->vmp_cs_nx) {
20455 disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
20456 }
20457 if (m->vmp_reusable || curr_object->all_reusable) {
20458 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20459 }
20460 }
20461 }
20462
20463 switch (flavor) {
20464 case VM_PAGE_INFO_BASIC:
20465 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20466 basic_info->disposition = disposition;
20467 basic_info->ref_count = ref_count;
20468 basic_info->object_id = (vm_object_id_t) (uintptr_t)
20469 VM_KERNEL_ADDRHASH(curr_object);
20470 basic_info->offset =
20471 (memory_object_offset_t) curr_offset_in_object + offset_in_page;
20472 basic_info->depth = depth;
20473
20474 info_idx++;
20475 break;
20476 }
20477
20478 disposition = 0;
20479 offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
20480
20481 /*
20482 * Move to next offset in the range and in our object.
20483 */
20484 curr_s_offset += effective_page_size;
20485 offset_in_object += effective_page_size;
20486 curr_offset_in_object = offset_in_object;
20487
20488 if (curr_object != object) {
20489 vm_object_unlock(curr_object);
20490
20491 curr_object = object;
20492
20493 vm_object_lock_shared(curr_object);
20494 } else {
20495 vm_object_lock_yield_shared(curr_object);
20496 }
20497 }
20498
20499 vm_object_unlock(curr_object);
20500 vm_object_deallocate(curr_object);
20501
20502 vm_map_lock_read(map);
20503 }
20504
20505 vm_map_unlock_read(map);
20506 return retval;
20507 }
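/*
 * Illustrative use (sketch): a caller inspecting a whole range with the
 * basic flavor supplies one vm_page_info_basic entry per effective page;
 * e.g., to count resident pages in [start, end):
 *
 *	mach_msg_type_number_t cnt = VM_PAGE_INFO_BASIC_COUNT;
 *	kr = vm_map_page_range_info_internal(map, start, end, -1,
 *	    VM_PAGE_INFO_BASIC, (vm_page_info_t)infos, &cnt);
 *	for (i = 0; kr == KERN_SUCCESS && i < npages; i++) {
 *		if (infos[i].disposition & VM_PAGE_QUERY_PAGE_PRESENT) {
 *			resident++;
 *		}
 *	}
 *
 * "infos" and "npages" here stand for the caller's buffer and its
 * page-sized entry count.
 */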
20508
20509 /*
20510 * vm_map_msync
20511 *
20512 * Synchronizes the specified memory range with its backing store
20513 * image by either flushing or cleaning the contents to the appropriate
20514 * memory manager, engaging in a memory object synchronize dialog with
20515 * the manager. The client doesn't return until the manager issues
20516 * an m_o_s_completed message. MIG magically converts the user task
20517 * parameter to the task's address map.
20518 *
20519 * interpretation of sync_flags
20520 * VM_SYNC_INVALIDATE - discard pages, only return precious
20521 * pages to manager.
20522 *
20523 * VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
20524 * - discard pages, write dirty or precious
20525 * pages back to memory manager.
20526 *
20527 * VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
20528 * - write dirty or precious pages back to
20529 * the memory manager.
20530 *
20531 * VM_SYNC_CONTIGUOUS - does everything normally, but if there
20532 * is a hole in the region, and we would
20533 * have returned KERN_SUCCESS, return
20534 * KERN_INVALID_ADDRESS instead.
20535 *
20536 * NOTE
20537 * The memory object attributes have not yet been implemented, so this
20538 * function will have to deal with the invalidate attribute.
20539 *
20540 * RETURNS
20541 * KERN_INVALID_TASK Bad task parameter
20542 * KERN_INVALID_ARGUMENT both sync and async were specified.
20543 * KERN_SUCCESS The usual.
20544 * KERN_INVALID_ADDRESS There was a hole in the region.
20545 */
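/*
 * For example (sketch; the actual BSD msync() translation lives
 * elsewhere), an MS_SYNC | MS_INVALIDATE style request would look like:
 *
 *	kr = vm_map_msync(map, addr, size,
 *	    VM_SYNC_SYNCHRONOUS | VM_SYNC_INVALIDATE | VM_SYNC_CONTIGUOUS);
 */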
20546
20547 kern_return_t
20548 vm_map_msync(
20549 vm_map_t map,
20550 vm_map_address_t address,
20551 vm_map_size_t size,
20552 vm_sync_t sync_flags)
20553 {
20554 vm_map_entry_t entry;
20555 vm_map_size_t amount_left;
20556 vm_object_offset_t offset;
20557 vm_object_offset_t start_offset, end_offset;
20558 boolean_t do_sync_req;
20559 boolean_t had_hole = FALSE;
20560 vm_map_offset_t pmap_offset;
20561
20562 if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
20563 (sync_flags & VM_SYNC_SYNCHRONOUS)) {
20564 return KERN_INVALID_ARGUMENT;
20565 }
20566
20567 if (__improbable(vm_map_range_overflows(map, address, size))) {
20568 return KERN_INVALID_ADDRESS;
20569 }
20570
20571 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20572 DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
20573 }
20574
20575 /*
20576 * align address and size on page boundaries
20577 */
20578 size = (vm_map_round_page(address + size,
20579 VM_MAP_PAGE_MASK(map)) -
20580 vm_map_trunc_page(address,
20581 VM_MAP_PAGE_MASK(map)));
20582 address = vm_map_trunc_page(address,
20583 VM_MAP_PAGE_MASK(map));
20584
20585 if (map == VM_MAP_NULL) {
20586 return KERN_INVALID_TASK;
20587 }
20588
20589 if (size == 0) {
20590 return KERN_SUCCESS;
20591 }
20592
20593 amount_left = size;
20594
20595 while (amount_left > 0) {
20596 vm_object_size_t flush_size;
20597 vm_object_t object;
20598
20599 vm_map_lock(map);
20600 if (!vm_map_lookup_entry(map,
20601 address,
20602 &entry)) {
20603 vm_map_size_t skip;
20604
20605 /*
20606 * hole in the address map.
20607 */
20608 had_hole = TRUE;
20609
20610 if (sync_flags & VM_SYNC_KILLPAGES) {
20611 /*
20612 * For VM_SYNC_KILLPAGES, there should be
20613 * no holes in the range, since we couldn't
20614 * prevent someone else from allocating in
20615 * that hole and we wouldn't want to "kill"
20616 * their pages.
20617 */
20618 vm_map_unlock(map);
20619 break;
20620 }
20621
20622 /*
20623 * Check for empty map.
20624 */
20625 if (entry == vm_map_to_entry(map) &&
20626 entry->vme_next == entry) {
20627 vm_map_unlock(map);
20628 break;
20629 }
20630 /*
20631 * Check that we don't wrap and that
20632 * we have at least one real map entry.
20633 */
20634 if ((map->hdr.nentries == 0) ||
20635 (entry->vme_next->vme_start < address)) {
20636 vm_map_unlock(map);
20637 break;
20638 }
20639 /*
20640 * Move up to the next entry if needed
20641 */
20642 skip = (entry->vme_next->vme_start - address);
20643 if (skip >= amount_left) {
20644 amount_left = 0;
20645 } else {
20646 amount_left -= skip;
20647 }
20648 address = entry->vme_next->vme_start;
20649 vm_map_unlock(map);
20650 continue;
20651 }
20652
20653 offset = address - entry->vme_start;
20654 pmap_offset = address;
20655
20656 /*
20657 * do we have more to flush than is contained in this
20658 * entry ?
20659 */
20660 if (amount_left + entry->vme_start + offset > entry->vme_end) {
20661 flush_size = entry->vme_end -
20662 (entry->vme_start + offset);
20663 } else {
20664 flush_size = amount_left;
20665 }
20666 amount_left -= flush_size;
20667 address += flush_size;
20668
20669 if (entry->is_sub_map == TRUE) {
20670 vm_map_t local_map;
20671 vm_map_offset_t local_offset;
20672
20673 local_map = VME_SUBMAP(entry);
20674 local_offset = VME_OFFSET(entry);
20675 vm_map_reference(local_map);
20676 vm_map_unlock(map);
20677 if (vm_map_msync(
20678 local_map,
20679 local_offset,
20680 flush_size,
20681 sync_flags) == KERN_INVALID_ADDRESS) {
20682 had_hole = TRUE;
20683 }
20684 vm_map_deallocate(local_map);
20685 continue;
20686 }
20687 object = VME_OBJECT(entry);
20688
20689 /*
20690 * We can't sync this object if the object has not been
20691 * created yet
20692 */
20693 if (object == VM_OBJECT_NULL) {
20694 vm_map_unlock(map);
20695 continue;
20696 }
20697 offset += VME_OFFSET(entry);
20698
20699 vm_object_lock(object);
20700
20701 if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
20702 int kill_pages = 0;
20703
20704 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20705 /*
20706 * This is a destructive operation and so we
20707 * err on the side of limiting the range of
20708 * the operation.
20709 */
20710 start_offset = vm_object_round_page(offset);
20711 end_offset = vm_object_trunc_page(offset + flush_size);
20712
20713 if (end_offset <= start_offset) {
20714 vm_object_unlock(object);
20715 vm_map_unlock(map);
20716 continue;
20717 }
20718
20719 pmap_offset += start_offset - offset;
20720 } else {
20721 start_offset = offset;
20722 end_offset = offset + flush_size;
20723 }
20724
20725 if (sync_flags & VM_SYNC_KILLPAGES) {
20726 if (((object->ref_count == 1) ||
20727 ((object->copy_strategy !=
20728 MEMORY_OBJECT_COPY_SYMMETRIC) &&
20729 (object->vo_copy == VM_OBJECT_NULL))) &&
20730 (object->shadow == VM_OBJECT_NULL)) {
20731 if (object->ref_count != 1) {
20732 vm_page_stats_reusable.free_shared++;
20733 }
20734 kill_pages = 1;
20735 } else {
20736 kill_pages = -1;
20737 }
20738 }
20739 if (kill_pages != -1) {
20740 vm_object_deactivate_pages(
20741 object,
20742 start_offset,
20743 (vm_object_size_t) (end_offset - start_offset),
20744 kill_pages,
20745 FALSE, /* reusable_pages */
20746 FALSE, /* reusable_no_write */
20747 map->pmap,
20748 pmap_offset);
20749 }
20750 vm_object_unlock(object);
20751 vm_map_unlock(map);
20752 continue;
20753 }
20754 /*
20755 * We can't sync this object if there isn't a pager.
20756 * Don't bother to sync internal objects, since there can't
20757 * be any "permanent" storage for these objects anyway.
20758 */
20759 if ((object->pager == MEMORY_OBJECT_NULL) ||
20760 (object->internal) || (object->private)) {
20761 vm_object_unlock(object);
20762 vm_map_unlock(map);
20763 continue;
20764 }
20765 /*
20766 * keep reference on the object until syncing is done
20767 */
20768 vm_object_reference_locked(object);
20769 vm_object_unlock(object);
20770
20771 vm_map_unlock(map);
20772
20773 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20774 start_offset = vm_object_trunc_page(offset);
20775 end_offset = vm_object_round_page(offset + flush_size);
20776 } else {
20777 start_offset = offset;
20778 end_offset = offset + flush_size;
20779 }
20780
20781 do_sync_req = vm_object_sync(object,
20782 start_offset,
20783 (end_offset - start_offset),
20784 sync_flags & VM_SYNC_INVALIDATE,
20785 ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
20786 (sync_flags & VM_SYNC_ASYNCHRONOUS)),
20787 sync_flags & VM_SYNC_SYNCHRONOUS);
20788
20789 if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
20790 /*
20791 * clear out the clustering and read-ahead hints
20792 */
20793 vm_object_lock(object);
20794
20795 object->pages_created = 0;
20796 object->pages_used = 0;
20797 object->sequential = 0;
20798 object->last_alloc = 0;
20799
20800 vm_object_unlock(object);
20801 }
20802 vm_object_deallocate(object);
20803 } /* while */
20804
20805 /* for proper msync() behaviour */
20806 if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
20807 return KERN_INVALID_ADDRESS;
20808 }
20809
20810 return KERN_SUCCESS;
20811 }/* vm_map_msync */
20812
20813 void
20814 vm_named_entry_associate_vm_object(
20815 vm_named_entry_t named_entry,
20816 vm_object_t object,
20817 vm_object_offset_t offset,
20818 vm_object_size_t size,
20819 vm_prot_t prot)
20820 {
20821 vm_map_copy_t copy;
20822 vm_map_entry_t copy_entry;
20823
20824 assert(!named_entry->is_sub_map);
20825 assert(!named_entry->is_copy);
20826 assert(!named_entry->is_object);
20827 assert(!named_entry->internal);
20828 assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
20829
20830 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
20831 copy->offset = offset;
20832 copy->size = size;
20833 copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
20834
20835 copy_entry = vm_map_copy_entry_create(copy);
20836 copy_entry->protection = prot;
20837 copy_entry->max_protection = prot;
20838 copy_entry->use_pmap = TRUE;
20839 copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
20840 copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
20841 VME_OBJECT_SET(copy_entry, object, false, 0);
20842 VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
20843 vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
20844
20845 named_entry->backing.copy = copy;
20846 named_entry->is_object = TRUE;
20847 if (object->internal) {
20848 named_entry->internal = TRUE;
20849 }
20850
20851 DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
20852 named_entry, copy, object, offset, size, prot);
20853 }
20854
20855 vm_object_t
20856 vm_named_entry_to_vm_object(
20857 vm_named_entry_t named_entry)
20858 {
20859 vm_map_copy_t copy;
20860 vm_map_entry_t copy_entry;
20861 vm_object_t object;
20862
20863 assert(!named_entry->is_sub_map);
20864 assert(!named_entry->is_copy);
20865 assert(named_entry->is_object);
20866 copy = named_entry->backing.copy;
20867 assert(copy != VM_MAP_COPY_NULL);
20868 /*
20869 * Assert that the vm_map_copy is coming from the right
20870 * zone and hasn't been forged
20871 */
20872 vm_map_copy_require(copy);
20873 assert(copy->cpy_hdr.nentries == 1);
20874 copy_entry = vm_map_copy_first_entry(copy);
20875 object = VME_OBJECT(copy_entry);
20876
20877 DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
20878
20879 return object;
20880 }
20881
20882 /*
20883 * Routine: convert_port_entry_to_map
20884 * Purpose:
20885 * Convert from a port specifying an entry or a task
20886 * to a map. Doesn't consume the port ref; produces a map ref,
20887 * which may be null. Unlike convert_port_to_map, the
20888 * port may be a task port or a named-entry port.
20889 * Conditions:
20890 * Nothing locked.
20891 */
20892
20893 vm_map_t
20894 convert_port_entry_to_map(
20895 ipc_port_t port)
20896 {
20897 vm_map_t map = VM_MAP_NULL;
20898 vm_named_entry_t named_entry;
20899
20900 if (!IP_VALID(port)) {
20901 return VM_MAP_NULL;
20902 }
20903
20904 if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
20905 return convert_port_to_map(port);
20906 }
20907
20908 named_entry = mach_memory_entry_from_port(port);
20909
20910 if ((named_entry->is_sub_map) &&
20911 (named_entry->protection & VM_PROT_WRITE)) {
20912 map = named_entry->backing.map;
20913 if (map->pmap != PMAP_NULL) {
20914 if (map->pmap == kernel_pmap) {
20915 panic("userspace has access "
20916 "to a kernel map %p", map);
20917 }
20918 pmap_require(map->pmap);
20919 }
20920 vm_map_reference(map);
20921 }
20922
20923 return map;
20924 }
20925
20926 /*
20927 * Export routines to other components for the things we access locally through
20928 * macros.
20929 */
20930 #undef current_map
20931 vm_map_t
20932 current_map(void)
20933 {
20934 return current_map_fast();
20935 }
20936
20937 /*
20938 * vm_map_reference:
20939 *
20940 * Takes a reference on the specified map.
20941 */
20942 void
20943 vm_map_reference(
20944 vm_map_t map)
20945 {
20946 if (__probable(map != VM_MAP_NULL)) {
20947 vm_map_require(map);
20948 os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
20949 }
20950 }
20951
20952 /*
20953 * vm_map_deallocate:
20954 *
20955 * Removes a reference from the specified map,
20956 * destroying it if no references remain.
20957 * The map should not be locked.
20958 */
20959 void
20960 vm_map_deallocate(
20961 vm_map_t map)
20962 {
20963 if (__probable(map != VM_MAP_NULL)) {
20964 vm_map_require(map);
20965 if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
20966 vm_map_destroy(map);
20967 }
20968 }
20969 }
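/*
 * Note: code that must keep a map alive across dropping its lock pairs
 * vm_map_reference() with vm_map_deallocate(), as the submap traversal
 * in vm_map_page_range_info_internal() above does.
 */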
20970
20971 void
20972 vm_map_inspect_deallocate(
20973 vm_map_inspect_t map)
20974 {
20975 vm_map_deallocate((vm_map_t)map);
20976 }
20977
20978 void
20979 vm_map_read_deallocate(
20980 vm_map_read_t map)
20981 {
20982 vm_map_deallocate((vm_map_t)map);
20983 }
20984
20985
20986 void
20987 vm_map_disable_NX(vm_map_t map)
20988 {
20989 if (map == NULL) {
20990 return;
20991 }
20992 if (map->pmap == NULL) {
20993 return;
20994 }
20995
20996 pmap_disable_NX(map->pmap);
20997 }
20998
20999 void
21000 vm_map_disallow_data_exec(vm_map_t map)
21001 {
21002 if (map == NULL) {
21003 return;
21004 }
21005
21006 map->map_disallow_data_exec = TRUE;
21007 }
21008
21009 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
21010 * more descriptive.
21011 */
21012 void
21013 vm_map_set_32bit(vm_map_t map)
21014 {
21015 #if defined(__arm64__)
21016 map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
21017 #else
21018 map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
21019 #endif
21020 }
21021
21022
21023 void
21024 vm_map_set_64bit(vm_map_t map)
21025 {
21026 #if defined(__arm64__)
21027 map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
21028 #else
21029 map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
21030 #endif
21031 }
21032
21033 /*
21034 * Expand the maximum size of an existing map to the maximum supported.
21035 */
21036 void
21037 vm_map_set_jumbo(vm_map_t map)
21038 {
21039 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
21040 vm_map_set_max_addr(map, ~0);
21041 #else /* arm64 */
21042 (void) map;
21043 #endif
21044 }
21045
21046 /*
21047 * This map has a JIT entitlement
21048 */
21049 void
21050 vm_map_set_jit_entitled(vm_map_t map)
21051 {
21052 #if defined (__arm64__)
21053 pmap_set_jit_entitled(map->pmap);
21054 #else /* arm64 */
21055 (void) map;
21056 #endif
21057 }
21058
21059 /*
21060 * Get the status of this map's TPRO flag
21061 */
21062 boolean_t
21063 vm_map_tpro(vm_map_t map)
21064 {
21065 #if defined (__arm64e__)
21066 return pmap_get_tpro(map->pmap);
21067 #else /* arm64e */
21068 (void) map;
21069 return FALSE;
21070 #endif
21071 }
21072
21073 /*
21074 * This map has TPRO enabled
21075 */
21076 void
21077 vm_map_set_tpro(vm_map_t map)
21078 {
21079 #if defined (__arm64e__)
21080 pmap_set_tpro(map->pmap);
21081 #else /* arm64e */
21082 (void) map;
21083 #endif
21084 }
21085
21086 /*
21087 * Does this map have TPRO enforcement enabled?
21088 */
21089 boolean_t
21090 vm_map_tpro_enforcement(vm_map_t map)
21091 {
21092 return map->tpro_enforcement;
21093 }
21094
21095 /*
21096 * Set TPRO enforcement for this map
21097 */
21098 void
21099 vm_map_set_tpro_enforcement(vm_map_t map)
21100 {
21101 if (vm_map_tpro(map)) {
21102 vm_map_lock(map);
21103 map->tpro_enforcement = TRUE;
21104 vm_map_unlock(map);
21105 }
21106 }
21107
21108 /*
21109 * Enable TPRO on the requested region
21110 *
21111 * Note:
21112 * This routine is primarily intended to be called during/soon after map
21113 * creation before the associated task has been released to run. It is only
21114 * currently safe when we have no resident pages.
21115 */
21116 boolean_t
21117 vm_map_set_tpro_range(
21118 __unused vm_map_t map,
21119 __unused vm_map_address_t start,
21120 __unused vm_map_address_t end)
21121 {
21122 return TRUE;
21123 }
21124
21125 /*
21126 * Expand the maximum size of an existing map.
21127 */
21128 void
21129 vm_map_set_max_addr(vm_map_t map, vm_map_offset_t new_max_offset)
21130 {
21131 #if defined(__arm64__)
21132 vm_map_offset_t max_supported_offset;
21133 vm_map_offset_t old_max_offset;
21134
21135 vm_map_lock(map);
21136
21137 old_max_offset = map->max_offset;
21138 max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), ARM_PMAP_MAX_OFFSET_JUMBO);
21139
21140 new_max_offset = trunc_page(new_max_offset);
21141
21142 /* The address space cannot be shrunk using this routine. */
21143 if (old_max_offset >= new_max_offset) {
21144 vm_map_unlock(map);
21145 return;
21146 }
21147
21148 if (max_supported_offset < new_max_offset) {
21149 new_max_offset = max_supported_offset;
21150 }
21151
21152 map->max_offset = new_max_offset;
21153
21154 if (map->holelistenabled) {
21155 if (map->holes_list->prev->vme_end == old_max_offset) {
21156 /*
21157 * There is already a hole at the end of the map; simply make it bigger.
21158 */
21159 map->holes_list->prev->vme_end = map->max_offset;
21160 } else {
21161 /*
21162 * There is no hole at the end, so we need to create a new hole
21163 * for the new empty space we're creating.
21164 */
21165 struct vm_map_links *new_hole;
21166
21167 new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
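/*
 * Splice the new hole in as the tail of the circular hole
 * list: it covers [old_max_offset, new max_offset) and links
 * between the previous tail and the list head.
 */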
21168 new_hole->start = old_max_offset;
21169 new_hole->end = map->max_offset;
21170 new_hole->prev = map->holes_list->prev;
21171 new_hole->next = (struct vm_map_entry *)map->holes_list;
21172 map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
21173 map->holes_list->prev = (struct vm_map_entry *)new_hole;
21174 }
21175 }
21176
21177 vm_map_unlock(map);
21178 #else
21179 (void)map;
21180 (void)new_max_offset;
21181 #endif
21182 }
21183
21184 vm_map_offset_t
21185 vm_compute_max_offset(boolean_t is64)
21186 {
21187 #if defined(__arm64__)
21188 return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
21189 #else
21190 return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
21191 #endif
21192 }
21193
21194 void
21195 vm_map_get_max_aslr_slide_section(
21196 vm_map_t map __unused,
21197 int64_t *max_sections,
21198 int64_t *section_size)
21199 {
21200 #if defined(__arm64__)
21201 *max_sections = 3;
21202 *section_size = ARM_TT_TWIG_SIZE;
21203 #else
21204 *max_sections = 1;
21205 *section_size = 0;
21206 #endif
21207 }
21208
21209 uint64_t
21210 vm_map_get_max_aslr_slide_pages(vm_map_t map)
21211 {
21212 #if defined(__arm64__)
21213 /* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
21214 * limited embedded address space; this is also meant to minimize pmap
21215 * memory usage on 16KB page systems.
21216 */
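/* e.g., with 16KB pages (VM_MAP_PAGE_SHIFT == 14): 1 << (24 - 14) == 1024 page-sized slide slots == 16MB. */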
21217 return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
21218 #else
21219 return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21220 #endif
21221 }
21222
21223 uint64_t
21224 vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
21225 {
21226 #if defined(__arm64__)
21227 /* We limit the loader slide to 4MB, in order to ensure at least 8 bits
21228 * of independent entropy on 16KB page systems.
21229 */
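/* e.g., with 16KB pages: 1 << (22 - 14) == 256 page-sized slots == 4MB, i.e. 8 bits of entropy. */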
21230 return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
21231 #else
21232 return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21233 #endif
21234 }
21235
21236 boolean_t
21237 vm_map_is_64bit(
21238 vm_map_t map)
21239 {
21240 return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
21241 }
21242
21243 boolean_t
21244 vm_map_has_hard_pagezero(
21245 vm_map_t map,
21246 vm_map_offset_t pagezero_size)
21247 {
21248 /*
21249 * XXX FBDP
21250 * We should lock the VM map (for read) here but we can get away
21251 * with it for now because there can't really be any race condition:
21252 * the VM map's min_offset is changed only when the VM map is created
21253 * and when the zero page is established (when the binary gets loaded),
21254 * and this routine gets called only when the task terminates and the
21255 * VM map is being torn down, and when a new map is created via
21256 * load_machfile()/execve().
21257 */
21258 return map->min_offset >= pagezero_size;
21259 }
21260
21261 /*
21262 * Raise a VM map's maximum offset.
21263 */
21264 kern_return_t
21265 vm_map_raise_max_offset(
21266 vm_map_t map,
21267 vm_map_offset_t new_max_offset)
21268 {
21269 kern_return_t ret;
21270
21271 vm_map_lock(map);
21272 ret = KERN_INVALID_ADDRESS;
21273
21274 if (new_max_offset >= map->max_offset) {
21275 if (!vm_map_is_64bit(map)) {
21276 if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
21277 map->max_offset = new_max_offset;
21278 ret = KERN_SUCCESS;
21279 }
21280 } else {
21281 if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
21282 map->max_offset = new_max_offset;
21283 ret = KERN_SUCCESS;
21284 }
21285 }
21286 }
21287
21288 vm_map_unlock(map);
21289 return ret;
21290 }
21291
21292
21293 /*
21294 * Raise a VM map's minimum offset.
21295 * To strictly enforce "page zero" reservation.
21296 */
21297 kern_return_t
21298 vm_map_raise_min_offset(
21299 vm_map_t map,
21300 vm_map_offset_t new_min_offset)
21301 {
21302 vm_map_entry_t first_entry;
21303
21304 new_min_offset = vm_map_round_page(new_min_offset,
21305 VM_MAP_PAGE_MASK(map));
21306
21307 vm_map_lock(map);
21308
21309 if (new_min_offset < map->min_offset) {
21310 /*
21311 * Can't move min_offset backwards, as that would expose
21312 * a part of the address space that was previously, and for
21313 * possibly good reasons, inaccessible.
21314 */
21315 vm_map_unlock(map);
21316 return KERN_INVALID_ADDRESS;
21317 }
21318 if (new_min_offset >= map->max_offset) {
21319 /* can't go beyond the end of the address space */
21320 vm_map_unlock(map);
21321 return KERN_INVALID_ADDRESS;
21322 }
21323
21324 first_entry = vm_map_first_entry(map);
21325 if (first_entry != vm_map_to_entry(map) &&
21326 first_entry->vme_start < new_min_offset) {
21327 /*
21328 * Some memory was already allocated below the new
21329 * minimum offset. It's too late to change it now...
21330 */
21331 vm_map_unlock(map);
21332 return KERN_NO_SPACE;
21333 }
21334
21335 map->min_offset = new_min_offset;
21336
21337 if (map->holelistenabled) {
21338 assert(map->holes_list);
21339 map->holes_list->start = new_min_offset;
21340 assert(new_min_offset < map->holes_list->end);
21341 }
21342
21343 vm_map_unlock(map);
21344
21345 return KERN_SUCCESS;
21346 }
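/*
 * Illustrative use (sketch): enforcing a 4GB hard page zero for a
 * 64-bit task could be done with:
 *
 *	kr = vm_map_raise_min_offset(map, (vm_map_offset_t)0x100000000ULL);
 */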
21347
21348 /*
21349 * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
21350 * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
21351 * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
21352 * have to reach over to the BSD data structures.
21353 */
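/*
 * For example (sketch, not the actual BSD call site): a
 * setrlimit(RLIMIT_AS) handler could mirror the new value here with
 * vm_map_set_size_limit(current_map(), lim.rlim_cur).
 */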
21354
21355 uint64_t vm_map_set_size_limit_count = 0;
21356 kern_return_t
21357 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
21358 {
21359 kern_return_t kr;
21360
21361 vm_map_lock(map);
21362 if (new_size_limit < map->size) {
21363 /* new limit should not be lower than its current size */
21364 DTRACE_VM2(vm_map_set_size_limit_fail,
21365 vm_map_size_t, map->size,
21366 uint64_t, new_size_limit);
21367 kr = KERN_FAILURE;
21368 } else if (new_size_limit == map->size_limit) {
21369 /* no change */
21370 kr = KERN_SUCCESS;
21371 } else {
21372 /* set new limit */
21373 DTRACE_VM2(vm_map_set_size_limit,
21374 vm_map_size_t, map->size,
21375 uint64_t, new_size_limit);
21376 if (new_size_limit != RLIM_INFINITY) {
21377 vm_map_set_size_limit_count++;
21378 }
21379 map->size_limit = new_size_limit;
21380 kr = KERN_SUCCESS;
21381 }
21382 vm_map_unlock(map);
21383 return kr;
21384 }
21385
21386 uint64_t vm_map_set_data_limit_count = 0;
21387 kern_return_t
21388 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
21389 {
21390 kern_return_t kr;
21391
21392 vm_map_lock(map);
21393 if (new_data_limit < map->size) {
21394 /* new limit should not be lower than its current size */
21395 DTRACE_VM2(vm_map_set_data_limit_fail,
21396 vm_map_size_t, map->size,
21397 uint64_t, new_data_limit);
21398 kr = KERN_FAILURE;
21399 } else if (new_data_limit == map->data_limit) {
21400 /* no change */
21401 kr = KERN_SUCCESS;
21402 } else {
21403 /* set new limit */
21404 DTRACE_VM2(vm_map_set_data_limit,
21405 vm_map_size_t, map->size,
21406 uint64_t, new_data_limit);
21407 if (new_data_limit != RLIM_INFINITY) {
21408 vm_map_set_data_limit_count++;
21409 }
21410 map->data_limit = new_data_limit;
21411 kr = KERN_SUCCESS;
21412 }
21413 vm_map_unlock(map);
21414 return kr;
21415 }
21416
21417 void
21418 vm_map_set_user_wire_limit(vm_map_t map,
21419 vm_size_t limit)
21420 {
21421 vm_map_lock(map);
21422 map->user_wire_limit = limit;
21423 vm_map_unlock(map);
21424 }
21425
21426
21427 void
21428 vm_map_switch_protect(vm_map_t map,
21429 boolean_t val)
21430 {
21431 vm_map_lock(map);
21432 map->switch_protect = val;
21433 vm_map_unlock(map);
21434 }
21435
21436 extern int cs_process_enforcement_enable;
21437 boolean_t
21438 vm_map_cs_enforcement(
21439 vm_map_t map)
21440 {
21441 if (cs_process_enforcement_enable) {
21442 return TRUE;
21443 }
21444 return map->cs_enforcement;
21445 }
21446
21447 kern_return_t
21448 vm_map_cs_wx_enable(
21449 __unused vm_map_t map)
21450 {
21451 #if CODE_SIGNING_MONITOR
21452 kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
21453 if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
21454 return KERN_SUCCESS;
21455 }
21456 return ret;
21457 #else
21458 /* The VM manages WX memory entirely on its own */
21459 return KERN_SUCCESS;
21460 #endif
21461 }
21462
21463 kern_return_t
21464 vm_map_csm_allow_jit(
21465 __unused vm_map_t map)
21466 {
21467 #if CODE_SIGNING_MONITOR
21468 return csm_allow_jit_region(vm_map_pmap(map));
21469 #else
21470 /* No code signing monitor to enforce JIT policy */
21471 return KERN_SUCCESS;
21472 #endif
21473 }
21474
21475 void
21476 vm_map_cs_debugged_set(
21477 vm_map_t map,
21478 boolean_t val)
21479 {
21480 vm_map_lock(map);
21481 map->cs_debugged = val;
21482 vm_map_unlock(map);
21483 }
21484
21485 void
21486 vm_map_cs_enforcement_set(
21487 vm_map_t map,
21488 boolean_t val)
21489 {
21490 vm_map_lock(map);
21491 map->cs_enforcement = val;
21492 pmap_set_vm_map_cs_enforced(map->pmap, val);
21493 vm_map_unlock(map);
21494 }
21495
21496 /*
21497 * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
21498 * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
21499 * bump both counters.
21500 */
21501 void
21502 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
21503 {
21504 pmap_t pmap = vm_map_pmap(map);
21505
21506 ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21507 ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21508 }
21509
21510 void
21511 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
21512 {
21513 pmap_t pmap = vm_map_pmap(map);
21514
21515 ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21516 ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21517 }
21518
21519 /* Add (generate) code signature for memory range */
21520 #if CONFIG_DYNAMIC_CODE_SIGNING
21521 kern_return_t
21522 vm_map_sign(vm_map_t map,
21523 vm_map_offset_t start,
21524 vm_map_offset_t end)
21525 {
21526 vm_map_entry_t entry;
21527 vm_page_t m;
21528 vm_object_t object;
21529
21530 /*
21531 * Vet all the input parameters and current type and state of the
21532 * underlying object. Return with an error if anything is amiss.
21533 */
21534 if (map == VM_MAP_NULL) {
21535 return KERN_INVALID_ARGUMENT;
21536 }
21537
21538 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
21539 return KERN_INVALID_ADDRESS;
21540 }
21541
21542 vm_map_lock_read(map);
21543
21544 if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
21545 /*
21546 * Must pass a valid non-submap address.
21547 */
21548 vm_map_unlock_read(map);
21549 return KERN_INVALID_ADDRESS;
21550 }
21551
21552 if ((entry->vme_start > start) || (entry->vme_end < end)) {
21553 /*
21554 * Map entry doesn't cover the requested range. Not handling
21555 * this situation currently.
21556 */
21557 vm_map_unlock_read(map);
21558 return KERN_INVALID_ARGUMENT;
21559 }
21560
21561 object = VME_OBJECT(entry);
21562 if (object == VM_OBJECT_NULL) {
21563 /*
21564 * Object must already be present or we can't sign.
21565 */
21566 vm_map_unlock_read(map);
21567 return KERN_INVALID_ARGUMENT;
21568 }
21569
21570 vm_object_lock(object);
21571 vm_map_unlock_read(map);
21572
21573 while (start < end) {
21574 uint32_t refmod;
21575
21576 m = vm_page_lookup(object,
21577 start - entry->vme_start + VME_OFFSET(entry));
21578 if (m == VM_PAGE_NULL) {
21579 /* should we try to fault in a page here? We can probably
21580 * demand that it exists and is locked for this request. */
21581 vm_object_unlock(object);
21582 return KERN_FAILURE;
21583 }
21584 /* deal with special page status */
21585 if (m->vmp_busy ||
21586 (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
21587 vm_object_unlock(object);
21588 return KERN_FAILURE;
21589 }
21590
21591 /* Page is OK... now "validate" it */
21592 /* This is the place where we'll call out to create a code
21593 * directory, later */
21594 /* XXX TODO4K: deal with 4k subpages individually? */
21595 m->vmp_cs_validated = VMP_CS_ALL_TRUE;
21596
21597 /* The page is now "clean" for codesigning purposes. That means
21598 * we don't consider it as modified (wpmapped) anymore. But
21599 * we'll disconnect the page so we note any future modification
21600 * attempts. */
21601 m->vmp_wpmapped = FALSE;
21602 refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
21603
21604 /* Pull the dirty status from the pmap, since we cleared the
21605 * wpmapped bit */
21606 if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
21607 SET_PAGE_DIRTY(m, FALSE);
21608 }
21609
21610 /* On to the next page */
21611 start += PAGE_SIZE;
21612 }
21613 vm_object_unlock(object);
21614
21615 return KERN_SUCCESS;
21616 }
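/*
 * Illustrative use (sketch): a dynamic code generator that has just
 * emitted instructions into [start, end) could have the range blessed
 * with:
 *
 *	kr = vm_map_sign(map, start, end);
 *
 * after which the pages are considered cs_validated until they are
 * modified again.
 */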
21617 #endif
21618
21619 kern_return_t
21620 vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
21621 {
21622 vm_map_entry_t entry = VM_MAP_ENTRY_NULL;
21623 vm_map_entry_t next_entry;
21624 kern_return_t kr = KERN_SUCCESS;
21625 VM_MAP_ZAP_DECLARE(zap_list);
21626
21627 vm_map_lock(map);
21628
21629 for (entry = vm_map_first_entry(map);
21630 entry != vm_map_to_entry(map);
21631 entry = next_entry) {
21632 next_entry = entry->vme_next;
21633
21634 if (!entry->is_sub_map &&
21635 VME_OBJECT(entry) &&
21636 (VME_OBJECT(entry)->internal == TRUE) &&
21637 (VME_OBJECT(entry)->ref_count == 1)) {
21638 *reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
21639 *reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);
21640
21641 (void)vm_map_delete(map, entry->vme_start,
21642 entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
21643 KMEM_GUARD_NONE, &zap_list);
21644 }
21645 }
21646
21647 vm_map_unlock(map);
21648
21649 vm_map_zap_dispose(&zap_list);
21650
21651 return kr;
21652 }
21653
21654
21655 #if DEVELOPMENT || DEBUG
21656
21657 int
21658 vm_map_disconnect_page_mappings(
21659 vm_map_t map,
21660 boolean_t do_unnest)
21661 {
21662 vm_map_entry_t entry;
21663 ledger_amount_t byte_count = 0;
21664
21665 if (do_unnest == TRUE) {
21666 #ifndef NO_NESTED_PMAP
21667 vm_map_lock(map);
21668
21669 for (entry = vm_map_first_entry(map);
21670 entry != vm_map_to_entry(map);
21671 entry = entry->vme_next) {
21672 if (entry->is_sub_map && entry->use_pmap) {
21673 /*
21674 * Make sure the range between the start of this entry and
21675 * the end of this entry is no longer nested, so that
21676 * we will only remove mappings from the pmap in use by this
21677 * this task
21678 */
21679 vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
21680 }
21681 }
21682 vm_map_unlock(map);
21683 #endif
21684 }
21685 vm_map_lock_read(map);
21686
21687 ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);
21688
21689 for (entry = vm_map_first_entry(map);
21690 entry != vm_map_to_entry(map);
21691 entry = entry->vme_next) {
21692 if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
21693 (VME_OBJECT(entry)->phys_contiguous))) {
21694 continue;
21695 }
21696 if (entry->is_sub_map) {
21697 assert(!entry->use_pmap);
21698 }
21699
21700 pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
21701 }
21702 vm_map_unlock_read(map);
21703
21704 return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
21705 }
21706
21707 kern_return_t
21708 vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
21709 {
21710 vm_object_t object = NULL;
21711 vm_object_offset_t offset;
21712 vm_prot_t prot;
21713 boolean_t wired;
21714 vm_map_version_t version;
21715 vm_map_t real_map;
21716 int result = KERN_FAILURE;
21717
21718 vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
21719 vm_map_lock(map);
21720
21721 result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
21722 OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
21723 NULL, &real_map, NULL);
21724 if (object == NULL) {
21725 result = KERN_MEMORY_ERROR;
21726 } else if (object->pager) {
21727 result = vm_compressor_pager_inject_error(object->pager,
21728 offset);
21729 } else {
21730 result = KERN_MEMORY_PRESENT;
21731 }
21732
21733 if (object != NULL) {
21734 vm_object_unlock(object);
21735 }
21736
21737 if (real_map != map) {
21738 vm_map_unlock(real_map);
21739 }
21740 vm_map_unlock(map);
21741
21742 return result;
21743 }
21744
21745 #endif
21746
21747
21748 #if CONFIG_FREEZE
21749
21750
21751 extern struct freezer_context freezer_context_global;
21752 AbsoluteTime c_freezer_last_yield_ts = 0;
21753
21754 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
21755 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
21756
21757 kern_return_t
21758 vm_map_freeze(
21759 task_t task,
21760 unsigned int *purgeable_count,
21761 unsigned int *wired_count,
21762 unsigned int *clean_count,
21763 unsigned int *dirty_count,
21764 unsigned int dirty_budget,
21765 unsigned int *shared_count,
21766 int *freezer_error_code,
21767 boolean_t eval_only)
21768 {
21769 vm_map_entry_t entry2 = VM_MAP_ENTRY_NULL;
21770 kern_return_t kr = KERN_SUCCESS;
21771 boolean_t evaluation_phase = TRUE;
21772 vm_object_t cur_shared_object = NULL;
21773 int cur_shared_obj_ref_cnt = 0;
21774 unsigned int dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;
21775
21776 *purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;
21777
21778 /*
21779 * We need the exclusive lock here so that we can
21780 * block any page faults or lookups while we are
21781 * in the middle of freezing this vm map.
21782 */
21783 vm_map_t map = task->map;
21784
21785 vm_map_lock(map);
21786
21787 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
21788
21789 if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
21790 if (vm_compressor_low_on_space()) {
21791 *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
21792 }
21793
21794 if (vm_swap_low_on_space()) {
21795 *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
21796 }
21797
21798 kr = KERN_NO_SPACE;
21799 goto done;
21800 }
21801
21802 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
21803 /*
21804 * In-memory compressor backing the freezer. No disk.
		 * So no need to do the evaluation phase.
		 */
		evaluation_phase = FALSE;

		if (eval_only == TRUE) {
			/*
			 * We don't support 'eval_only' mode
			 * in this non-swap config.
			 */
			*freezer_error_code = FREEZER_ERROR_GENERIC;
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);
	}
again:

	for (entry2 = vm_map_first_entry(map);
	    entry2 != vm_map_to_entry(map);
	    entry2 = entry2->vme_next) {
		vm_object_t src_object;

		if (entry2->is_sub_map) {
			continue;
		}

		src_object = VME_OBJECT(entry2);
		if (!src_object ||
		    src_object->phys_contiguous ||
		    !src_object->internal) {
			continue;
		}

		/* If eligible, scan the entry, moving eligible pages over to our parent object */

		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
			/*
			 * We skip purgeable objects during the evaluation phase only.
			 * If we decide to freeze this process, we'll explicitly
			 * purge these objects before we go around again with
			 * 'evaluation_phase' set to FALSE.
			 */

			if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
				/*
				 * We want to purge objects that may not belong to this task but are mapped
				 * in this task alone. Since we already purged this task's purgeable memory
				 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
				 * on this task's purgeable objects. Hence the check for only volatile objects.
				 */
				if (evaluation_phase ||
				    src_object->purgable != VM_PURGABLE_VOLATILE ||
				    src_object->ref_count != 1) {
					continue;
				}
				vm_object_lock(src_object);
				if (src_object->purgable == VM_PURGABLE_VOLATILE &&
				    src_object->ref_count == 1) {
					purgeable_q_t old_queue;

					/* object should be on a purgeable queue */
					assert(src_object->objq.next != NULL &&
					    src_object->objq.prev != NULL);
					/* move object from its volatile queue to the nonvolatile queue */
					old_queue = vm_purgeable_object_remove(src_object);
					assert(old_queue);
					if (src_object->purgeable_when_ripe) {
						/* remove a token from that volatile queue */
						vm_page_lock_queues();
						vm_purgeable_token_delete_first(old_queue);
						vm_page_unlock_queues();
					}
					/* purge the object */
					vm_object_purge(src_object, 0);
				}
				vm_object_unlock(src_object);
				continue;
			}

			/*
			 * Pages belonging to this object could be swapped to disk.
			 * Make sure it's not a shared object because we could end
			 * up just bringing it back in again.
			 *
			 * We try to optimize somewhat by checking for objects that are mapped
			 * more than once within our own map. But we don't do full searches,
			 * we just look at the entries following our current entry.
			 */

			if (src_object->ref_count > 1) {
				if (src_object != cur_shared_object) {
					obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
					dirty_shared_count += obj_pages_snapshot;

					cur_shared_object = src_object;
					cur_shared_obj_ref_cnt = 1;
					continue;
				} else {
					cur_shared_obj_ref_cnt++;
					if (src_object->ref_count == cur_shared_obj_ref_cnt) {
						/*
						 * Fall through to below and treat this object as private.
						 * So deduct its pages from our shared total and add them to the
						 * private total.
						 */

						dirty_shared_count -= obj_pages_snapshot;
						dirty_private_count += obj_pages_snapshot;
					} else {
						continue;
					}
				}
			}

			if (src_object->ref_count == 1) {
				dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
			}

			if (evaluation_phase == TRUE) {
				continue;
			}
		}

		uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
		*wired_count += src_object->wired_page_count;

		if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
			if (vm_compressor_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
			}

			if (vm_swap_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
			}

			kr = KERN_NO_SPACE;
			break;
		}
		if (paged_out_count >= dirty_budget) {
			break;
		}
		dirty_budget -= paged_out_count;
	}

	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
	if (evaluation_phase) {
		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;

		if (dirty_shared_count > shared_pages_threshold) {
			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
			kr = KERN_FAILURE;
			goto done;
		}

		if (dirty_shared_count &&
		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
			kr = KERN_FAILURE;
			goto done;
		}

		evaluation_phase = FALSE;
		dirty_shared_count = dirty_private_count = 0;

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);

		if (eval_only) {
			kr = KERN_SUCCESS;
			goto done;
		}

		vm_purgeable_purge_task_owned(task);

		goto again;
	} else {
		kr = KERN_SUCCESS;
	}

done:
	vm_map_unlock(map);

	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
		vm_object_compressed_freezer_done();
	}
	return kr;
}

#endif

/*
 * vm_map_entry_should_cow_for_true_share:
 *
 * Determines if the map entry should be clipped and set up for copy-on-write
 * to avoid applying "true_share" to a large VM object when only a subset is
 * targeted.
 *
 * For now, we target only the map entries created for the Objective C
 * Garbage Collector, which initially have the following properties:
 *	- alias == VM_MEMORY_MALLOC
 *	- wired_count == 0
 *	- !needs_copy
 * and a VM object with:
 *	- internal
 *	- copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
 *	- !true_share
 *	- vo_size == ANON_CHUNK_SIZE
 *
 * Only non-kernel map entries.
 */
boolean_t
vm_map_entry_should_cow_for_true_share(
	vm_map_entry_t  entry)
{
	vm_object_t     object;

	if (entry->is_sub_map) {
		/* entry does not point at a VM object */
		return FALSE;
	}

	if (entry->needs_copy) {
		/* already set for copy_on_write: done! */
		return FALSE;
	}

	if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
	    VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
		/* not a malloc heap or Obj-C Garbage Collector heap */
		return FALSE;
	}

	if (entry->wired_count) {
		/* wired: can't change the map entry... */
		vm_counters.should_cow_but_wired++;
		return FALSE;
	}

	object = VME_OBJECT(entry);

	if (object == VM_OBJECT_NULL) {
		/* no object yet... */
		return FALSE;
	}

	if (!object->internal) {
		/* not an internal object */
		return FALSE;
	}

	if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
		/* not the default copy strategy */
		return FALSE;
	}

	if (object->true_share) {
		/* already true_share: too late to avoid it */
		return FALSE;
	}

	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
	    object->vo_size != ANON_CHUNK_SIZE) {
		/* ... not an object created for the ObjC Garbage Collector */
		return FALSE;
	}

	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
	    object->vo_size != 2048 * 4096) {
		/* ... not a "MALLOC_SMALL" heap */
		return FALSE;
	}

	/*
	 * All the criteria match: we have a large object being targeted for "true_share".
	 * To limit the adverse side-effects linked with "true_share", tell the caller to
	 * try and avoid setting up the entire object for "true_share" by clipping the
	 * targeted range and setting it up for copy-on-write.
	 */
	return TRUE;
}
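
/*
 * Illustrative caller pattern (a sketch, not a verbatim call site): when the
 * function above returns TRUE, a caller typically clips the entry to the
 * targeted range and defers the copy via "needs_copy" instead of setting
 * "true_share" on the whole object:
 *
 *	if (vm_map_entry_should_cow_for_true_share(entry) &&
 *	    VME_OBJECT(entry)->vo_size > length) {
 *		vm_map_clip_start(map, entry,
 *		    vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)));
 *		vm_map_clip_end(map, entry,
 *		    vm_map_round_page(start + length, VM_MAP_PAGE_MASK(map)));
 *		entry->needs_copy = TRUE;
 *	}
 *
 * "start" and "length" stand for the targeted sub-range here.
 */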

uint64_t vm_map_range_overflows_count = 0;
TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE);
bool
vm_map_range_overflows(
	vm_map_t        map,
	vm_map_offset_t addr,
	vm_map_size_t   size)
{
	vm_map_offset_t start, end, sum;
	vm_map_offset_t pgmask;

	if (size == 0) {
		/* empty range -> no overflow */
		return false;
	}
	pgmask = vm_map_page_mask(map);
	start = vm_map_trunc_page_mask(addr, pgmask);
	end = vm_map_round_page_mask(addr + size, pgmask);
	if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) {
		vm_map_range_overflows_count++;
		if (vm_map_range_overflows_log) {
			printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n",
			    proc_selfpid(),
			    proc_best_name(current_proc()),
			    (uint64_t)addr,
			    (uint64_t)size,
			    (uint64_t)pgmask);
		}
		DTRACE_VM4(vm_map_range_overflows,
		    vm_map_t, map,
		    uint32_t, pgmask,
		    uint64_t, (uint64_t)addr,
		    uint64_t, (uint64_t)size);
		return true;
	}
	return false;
}
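
/*
 * Illustrative usage (a sketch, not a specific call site): entry points that
 * take a user-supplied range typically reject it up front:
 *
 *	if (vm_map_range_overflows(map, addr, size)) {
 *		return KERN_INVALID_ARGUMENT;
 *	}
 *
 * For example, addr = 0xfffffffffffff000 with size = 0x2000 makes
 * "addr + size" wrap around and is caught by the os_add_overflow() above.
 */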

vm_map_offset_t
vm_map_round_page_mask(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	return VM_MAP_ROUND_PAGE(offset, mask);
}

vm_map_offset_t
vm_map_trunc_page_mask(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	return VM_MAP_TRUNC_PAGE(offset, mask);
}
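
/*
 * Worked example (illustrative): with a 16K page mask (0x3fff),
 *	vm_map_round_page_mask(0x4001, 0x3fff) == 0x8000
 *	vm_map_trunc_page_mask(0x4001, 0x3fff) == 0x4000
 * i.e. rounding moves up to the next page boundary and truncating moves
 * down to the previous one.
 */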

boolean_t
vm_map_page_aligned(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	return ((offset) & mask) == 0;
}

int
vm_map_page_shift(
	vm_map_t map)
{
	return VM_MAP_PAGE_SHIFT(map);
}

int
vm_map_page_size(
	vm_map_t map)
{
	return VM_MAP_PAGE_SIZE(map);
}

vm_map_offset_t
vm_map_page_mask(
	vm_map_t map)
{
	return VM_MAP_PAGE_MASK(map);
}

kern_return_t
vm_map_set_page_shift(
	vm_map_t        map,
	int             pageshift)
{
	if (map->hdr.nentries != 0) {
		/* too late to change page size */
		return KERN_FAILURE;
	}

	map->hdr.page_shift = (uint16_t)pageshift;

	return KERN_SUCCESS;
}

kern_return_t
vm_map_query_volatile(
	vm_map_t        map,
	mach_vm_size_t  *volatile_virtual_size_p,
	mach_vm_size_t  *volatile_resident_size_p,
	mach_vm_size_t  *volatile_compressed_size_p,
	mach_vm_size_t  *volatile_pmap_size_p,
	mach_vm_size_t  *volatile_compressed_pmap_size_p)
{
	mach_vm_size_t  volatile_virtual_size;
	mach_vm_size_t  volatile_resident_count;
	mach_vm_size_t  volatile_compressed_count;
	mach_vm_size_t  volatile_pmap_count;
	mach_vm_size_t  volatile_compressed_pmap_count;
	mach_vm_size_t  resident_count;
	vm_map_entry_t  entry;
	vm_object_t     object;

	/* map should be locked by caller */

	volatile_virtual_size = 0;
	volatile_resident_count = 0;
	volatile_compressed_count = 0;
	volatile_pmap_count = 0;
	volatile_compressed_pmap_count = 0;

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		mach_vm_size_t  pmap_resident_bytes, pmap_compressed_bytes;

		if (entry->is_sub_map) {
			continue;
		}
		if (!(entry->protection & VM_PROT_WRITE)) {
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			continue;
		}
		if (object->purgable != VM_PURGABLE_VOLATILE &&
		    object->purgable != VM_PURGABLE_EMPTY) {
			continue;
		}
		if (VME_OFFSET(entry)) {
			/*
			 * If the map entry has been split and the object now
			 * appears several times in the VM map, we don't want
			 * to count the object's resident_page_count more than
			 * once. We count it only for the first one, starting
			 * at offset 0 and ignore the other VM map entries.
			 */
			continue;
		}
		resident_count = object->resident_page_count;
		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
			resident_count = 0;
		} else {
			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
		}

		volatile_virtual_size += entry->vme_end - entry->vme_start;
		volatile_resident_count += resident_count;
		if (object->pager) {
			volatile_compressed_count +=
			    vm_compressor_pager_get_count(object->pager);
		}
		pmap_compressed_bytes = 0;
		pmap_resident_bytes =
		    pmap_query_resident(map->pmap,
		    entry->vme_start,
		    entry->vme_end,
		    &pmap_compressed_bytes);
		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
		volatile_compressed_pmap_count += (pmap_compressed_bytes
		    / PAGE_SIZE);
	}

	/* map is still locked on return */

	*volatile_virtual_size_p = volatile_virtual_size;
	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;

	return KERN_SUCCESS;
}

void
vm_map_sizes(vm_map_t map,
    vm_map_size_t * psize,
    vm_map_size_t * pfree,
    vm_map_size_t * plargest_free)
{
	vm_map_entry_t  entry;
	vm_map_offset_t prev;
	vm_map_size_t   free, total_free, largest_free;
	boolean_t       end;

	if (!map) {
		*psize = *pfree = *plargest_free = 0;
		return;
	}
	total_free = largest_free = 0;

	vm_map_lock_read(map);
	if (psize) {
		*psize = map->max_offset - map->min_offset;
	}

	prev = map->min_offset;
	for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
		end = (entry == vm_map_to_entry(map));

		if (end) {
			free = entry->vme_end - prev;
		} else {
			free = entry->vme_start - prev;
		}

		total_free += free;
		if (free > largest_free) {
			largest_free = free;
		}

		if (end) {
			break;
		}
		prev = entry->vme_end;
	}
	vm_map_unlock_read(map);
	if (pfree) {
		*pfree = total_free;
	}
	if (plargest_free) {
		*plargest_free = largest_free;
	}
}

#if VM_SCAN_FOR_SHADOW_CHAIN
int vm_map_shadow_max(vm_map_t map);
int
vm_map_shadow_max(
	vm_map_t map)
{
	int             shadows, shadows_max;
	vm_map_entry_t  entry;
	vm_object_t     object, next_object;

	if (map == NULL) {
		return 0;
	}

	shadows_max = 0;

	vm_map_lock_read(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == NULL) {
			continue;
		}
		vm_object_lock_shared(object);
		for (shadows = 0;
		    object->shadow != NULL;
		    shadows++, object = next_object) {
			next_object = object->shadow;
			vm_object_lock_shared(next_object);
			vm_object_unlock(object);
		}
		vm_object_unlock(object);
		if (shadows > shadows_max) {
			shadows_max = shadows;
		}
	}

	vm_map_unlock_read(map);

	return shadows_max;
}
#endif /* VM_SCAN_FOR_SHADOW_CHAIN */

void
vm_commit_pagezero_status(vm_map_t lmap)
{
	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
}

#if __x86_64__
void
vm_map_set_high_start(
	vm_map_t        map,
	vm_map_offset_t high_start)
{
	map->vmmap_high_start = high_start;
}
#endif /* __x86_64__ */

#if CODE_SIGNING_MONITOR

kern_return_t
vm_map_entry_cs_associate(
	vm_map_t                map,
	vm_map_entry_t          entry,
	vm_map_kernel_flags_t   vmk_flags)
{
	vm_object_t cs_object, cs_shadow, backing_object;
	vm_object_offset_t cs_offset, backing_offset;
	void *cs_blobs;
	struct vnode *cs_vnode;
	kern_return_t cs_ret;

	if (map->pmap == NULL ||
	    entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
	    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
	    VME_OBJECT(entry) == VM_OBJECT_NULL) {
		return KERN_SUCCESS;
	}

	if (!(entry->protection & VM_PROT_EXECUTE)) {
		/*
		 * This memory region is not executable, so the code-signing
		 * monitor would usually not care about it...
		 */
		if (vmk_flags.vmkf_remap_prot_copy &&
		    (entry->max_protection & VM_PROT_EXECUTE)) {
			/*
			 * ... except if the memory region is being remapped
			 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
			 * which is what a debugger or dtrace would be doing
			 * to prepare to modify an executable page to insert
			 * a breakpoint or activate a probe.
			 * In that case, fall through so that we can mark
			 * this region as being "debugged" and no longer
			 * strictly code-signed.
			 */
		} else {
			/*
			 * Really not executable, so no need to tell the
			 * code-signing monitor.
			 */
			return KERN_SUCCESS;
		}
	}

	vm_map_lock_assert_exclusive(map);

	/*
	 * Check for a debug association mapping before we check for used_for_jit. This
	 * allows non-RWX JIT on macOS systems to masquerade their mappings as USER_DEBUG
	 * pages instead of USER_JIT. These non-RWX JIT pages cannot be marked as USER_JIT
	 * since they are mapped with RW or RX permissions, which the page table monitor
	 * denies on USER_JIT pages. Given that, if they're not mapped as USER_DEBUG,
	 * they will be mapped as USER_EXEC, and that will cause another page table monitor
	 * violation when those USER_EXEC pages are mapped as RW.
	 *
	 * Since these pages switch between RW and RX through mprotect, they mimic what
	 * we expect a debugger to do. As the code signing monitor does not enforce mappings
	 * on macOS systems, this works in our favor here and allows us to continue to
	 * support these legacy-programmed applications without sacrificing security on
	 * the page table or the code signing monitor. We don't need to explicitly check
	 * for entry_for_jit here and the mapping permissions. If the initial mapping is
	 * created with RX, then the application must map it as RW in order to first write
	 * to the page (MAP_JIT mappings must be private and anonymous). The switch to
	 * RX will cause vm_map_protect to mark the entry as vmkf_remap_prot_copy.
	 * Similarly, if the mapping was created as RW, and then switched to RX,
	 * vm_map_protect will again mark the entry as a copy, and both these cases
	 * lead to this if-statement being entered.
	 *
	 * For more information: rdar://115313336.
	 */
	if (vmk_flags.vmkf_remap_prot_copy) {
		cs_ret = csm_associate_debug_region(
			map->pmap,
			entry->vme_start,
			entry->vme_end - entry->vme_start);

		/*
		 * csm_associate_debug_region returns "not supported" when the code signing
		 * monitor is disabled. This is intentional, since cs_ret is checked towards
		 * the end of the function, and if it is not supported, then we still want the
		 * VM to perform code-signing enforcement on this entry. That said, if we don't
		 * mark this as a xnu_user_debug page when the code-signing monitor is disabled,
		 * then it never gets retyped to the XNU_USER_DEBUG frame type, which then causes
		 * an issue with debugging (since it'll be mapped in as XNU_USER_EXEC in some
		 * cases, which will cause a violation when attempted to be mapped as writable).
		 */
		if ((cs_ret == KERN_SUCCESS) || (cs_ret == KERN_NOT_SUPPORTED)) {
			entry->vme_xnu_user_debug = TRUE;
		}
#if DEVELOPMENT || DEBUG
		if (vm_log_xnu_user_debug) {
			printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug=%d cs_ret %d\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
			    __FUNCTION__, __LINE__,
			    map, entry,
			    (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
			    entry->vme_xnu_user_debug,
			    cs_ret);
		}
#endif /* DEVELOPMENT || DEBUG */
		goto done;
	}

	if (entry->used_for_jit) {
		cs_ret = csm_associate_jit_region(
			map->pmap,
			entry->vme_start,
			entry->vme_end - entry->vme_start);
		goto done;
	}

	cs_object = VME_OBJECT(entry);
	vm_object_lock_shared(cs_object);
	cs_offset = VME_OFFSET(entry);

	/* find the VM object backed by the code-signed vnode */
	for (;;) {
		/* go to the bottom of cs_object's shadow chain */
		for (;
		    cs_object->shadow != VM_OBJECT_NULL;
		    cs_object = cs_shadow) {
			cs_shadow = cs_object->shadow;
			cs_offset += cs_object->vo_shadow_offset;
			vm_object_lock_shared(cs_shadow);
			vm_object_unlock(cs_object);
		}
		if (cs_object->internal ||
		    cs_object->pager == MEMORY_OBJECT_NULL) {
			vm_object_unlock(cs_object);
			return KERN_SUCCESS;
		}

		cs_offset += cs_object->paging_offset;

		/*
		 * cs_object could be backed by a:
		 *	vnode_pager
		 *	apple_protect_pager
		 *	shared_region_pager
		 *	fourk_pager (multiple backing objects -> fail?)
		 * ask the pager if it has a backing VM object
		 */
		if (!memory_object_backing_object(cs_object->pager,
		    cs_offset,
		    &backing_object,
		    &backing_offset)) {
			/* no backing object: cs_object is it */
			break;
		}

		/* look down the backing object's shadow chain */
		vm_object_lock_shared(backing_object);
		vm_object_unlock(cs_object);
		cs_object = backing_object;
		cs_offset = backing_offset;
	}

	cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
	if (cs_vnode == NULL) {
		/* no vnode, no code signatures to associate */
		cs_ret = KERN_SUCCESS;
	} else {
		cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
		    &cs_blobs);
		assert(cs_ret == KERN_SUCCESS);
		cs_ret = cs_associate_blob_with_mapping(map->pmap,
		    entry->vme_start,
		    (entry->vme_end - entry->vme_start),
		    cs_offset,
		    cs_blobs);
	}
	vm_object_unlock(cs_object);
	cs_object = VM_OBJECT_NULL;

done:
	if (cs_ret == KERN_SUCCESS) {
		DTRACE_VM2(vm_map_entry_cs_associate_success,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end);
		if (vm_map_executable_immutable) {
			/*
			 * Prevent this executable
			 * mapping from being unmapped
			 * or modified.
			 */
			entry->vme_permanent = TRUE;
		}
		/*
		 * pmap says it will validate the
		 * code-signing validity of pages
		 * faulted in via this mapping, so
		 * this map entry should be marked so
		 * that vm_fault() bypasses code-signing
		 * validation for faults coming through
		 * this mapping.
		 */
		entry->csm_associated = TRUE;
	} else if (cs_ret == KERN_NOT_SUPPORTED) {
		/*
		 * pmap won't check the code-signing
		 * validity of pages faulted in via
		 * this mapping, so the VM should keep
		 * doing it.
		 */
		DTRACE_VM3(vm_map_entry_cs_associate_off,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end,
		    int, cs_ret);
	} else {
		/*
		 * A real error: do not allow
		 * execution in this mapping.
		 */
		DTRACE_VM3(vm_map_entry_cs_associate_failure,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end,
		    int, cs_ret);
		if (vmk_flags.vmkf_overwrite_immutable) {
			/*
			 * We can get here when we remap an apple_protect pager
			 * on top of an already cs_associated executable mapping
			 * with the same code signatures, so we don't want to
			 * lose VM_PROT_EXECUTE in that case...
			 */
		} else {
			entry->protection &= ~VM_PROT_ALLEXEC;
			entry->max_protection &= ~VM_PROT_ALLEXEC;
		}
	}

	return cs_ret;
}

#endif /* CODE_SIGNING_MONITOR */

inline bool
vm_map_is_corpse_source(vm_map_t map)
{
	bool status = false;
	if (map) {
		vm_map_lock_read(map);
		status = map->corpse_source;
		vm_map_unlock_read(map);
	}
	return status;
}

inline void
vm_map_set_corpse_source(vm_map_t map)
{
	if (map) {
		vm_map_lock(map);
		map->corpse_source = true;
		vm_map_unlock(map);
	}
}

inline void
vm_map_unset_corpse_source(vm_map_t map)
{
	if (map) {
		vm_map_lock(map);
		map->corpse_source = false;
		vm_map_unlock(map);
	}
}

/*
 * FORKED CORPSE FOOTPRINT
 *
 * A forked corpse gets a copy of the original VM map but its pmap is mostly
 * empty since it never ran and never got to fault in any pages.
 * Collecting footprint info (via "sysctl vm.self_region_footprint") for
 * a forked corpse would therefore return very little information.
 *
 * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
 * to vm_map_fork() to collect footprint information from the original VM map
 * and its pmap, and store it in the forked corpse's VM map. That information
 * is stored in place of the VM map's "hole list" since we'll never need to
 * look up holes in the corpse's map.
 *
 * The corpse's footprint info looks like this:
 *
 * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
 * as follows:
 *
 *		+---------------------------------------+
 *	header->| cf_size				|
 *		+-------------------+-------------------+
 *		| cf_last_region    | cf_last_zeroes	|
 *		+-------------------+-------------------+
 *	region1->| cfr_vaddr				|
 *		+-------------------+-------------------+
 *		| cfr_num_pages     | d0 | d1 | d2 | d3 |
 *		+---------------------------------------+
 *		| d4 | d5 | ...				|
 *		+---------------------------------------+
 *		| ...					|
 *		+-------------------+-------------------+
 *		| dy | dz | na | na | cfr_vaddr...	| <-region2
 *		+-------------------+-------------------+
 *		| cfr_vaddr (ctd)   | cfr_num_pages	|
 *		+---------------------------------------+
 *		| d0 | d1 ...				|
 *		+---------------------------------------+
 *			...
 *		+---------------------------------------+
 * last region->| cfr_vaddr				|
 *		+---------------------------------------+
 *		| cfr_num_pages     | d0 | d1 | d2 | d3 |
 *		+---------------------------------------+
 *			...
 *		+---------------------------------------+
 *		| dx | dy | dz | na | na | na | na | na |
 *		+---------------------------------------+
 *
 * where:
 *	cf_size:	total size of the buffer (rounded to page size)
 *	cf_last_region:	offset in the buffer of the last "region" sub-header
 *	cf_last_zeroes:	number of trailing "zero" dispositions at the end
 *			of the last region
 *	cfr_vaddr:	virtual address of the start of the covered "region"
 *	cfr_num_pages:	number of pages in the covered "region"
 *	d*:		disposition of the page at that virtual address
 * Regions in the buffer are word-aligned.
 *
 * We estimate the size of the buffer based on the number of memory regions
 * and the virtual size of the address space. While copying each memory region
 * during vm_map_fork(), we also collect the footprint info for that region
 * and store it in the buffer, packing it as much as possible (coalescing
 * contiguous memory regions to avoid having too many region headers, and
 * avoiding long streaks of "zero" page dispositions by splitting footprint
 * "regions"), so the number of regions in the footprint buffer might not match
 * the number of memory regions in the address space.
 *
 * We also have to copy the original task's "nonvolatile" ledgers since that's
 * part of the footprint and will need to be reported to any tool asking for
 * the footprint information of the forked corpse.
 */

uint64_t vm_map_corpse_footprint_count = 0;
uint64_t vm_map_corpse_footprint_size_avg = 0;
uint64_t vm_map_corpse_footprint_size_max = 0;
uint64_t vm_map_corpse_footprint_full = 0;
uint64_t vm_map_corpse_footprint_no_buf = 0;

struct vm_map_corpse_footprint_header {
	vm_size_t       cf_size;        /* allocated buffer size */
	uint32_t        cf_last_region; /* offset of last region in buffer */
	union {
		uint32_t cfu_last_zeroes; /* during creation:
		                           * number of "zero" dispositions at
		                           * end of last region */
		uint32_t cfu_hint_region; /* during lookup:
		                           * offset of last looked up region */
#define cf_last_zeroes cfu.cfu_last_zeroes
#define cf_hint_region cfu.cfu_hint_region
	} cfu;
};
typedef uint8_t cf_disp_t;
struct vm_map_corpse_footprint_region {
	vm_map_offset_t cfr_vaddr;      /* region start virtual address */
	uint32_t        cfr_num_pages;  /* number of pages in this "region" */
	cf_disp_t       cfr_disposition[0]; /* disposition of each page */
} __attribute__((packed));
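
/*
 * Illustrative traversal (a sketch mirroring the layout described above, not
 * a helper used below): stepping from one region to the next skips the region
 * header and one cf_disp_t per page, then re-aligns to a word boundary:
 *
 *	next_offset = cur_offset
 *	    + sizeof(struct vm_map_corpse_footprint_region)
 *	    + cur_region->cfr_num_pages * sizeof(cf_disp_t);
 *	next_offset = roundup(next_offset, sizeof(int));
 *
 * Both vm_map_corpse_footprint_new_region() and
 * vm_map_corpse_footprint_query_page_info() below apply these same rules.
 */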

static cf_disp_t
vm_page_disposition_to_cf_disp(
	int disposition)
{
	assert(sizeof(cf_disp_t) == 1);
	/* relocate bits that don't fit in a "uint8_t" */
	if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
		disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
	}
	/* cast gets rid of extra bits */
	return (cf_disp_t) disposition;
}

static int
vm_page_cf_disp_to_disposition(
	cf_disp_t cf_disp)
{
	int disposition;

	assert(sizeof(cf_disp_t) == 1);
	disposition = (int) cf_disp;
	/* move relocated bits back in place */
	if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
		disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
		disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
	}
	return disposition;
}
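
/*
 * Example (illustrative): VM_PAGE_QUERY_PAGE_REUSABLE does not fit in the
 * 8 bits of a cf_disp_t, so it is parked in the FICTITIOUS bit on the way
 * in and restored on the way out, on the assumption that fictitious pages
 * are not reported here. Under that assumption, for any disposition "d"
 * stored in the footprint buffer:
 *	vm_page_cf_disp_to_disposition(vm_page_disposition_to_cf_disp(d)) == d
 */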

/*
 * vm_map_corpse_footprint_new_region:
 *	closes the current footprint "region" and creates a new one
 *
 * Returns NULL if there's not enough space in the buffer for a new region.
 */
static struct vm_map_corpse_footprint_region *
vm_map_corpse_footprint_new_region(
	struct vm_map_corpse_footprint_header *footprint_header)
{
	uintptr_t       footprint_edge;
	uint32_t        new_region_offset;
	struct vm_map_corpse_footprint_region *footprint_region;
	struct vm_map_corpse_footprint_region *new_footprint_region;

	footprint_edge = ((uintptr_t)footprint_header +
	    footprint_header->cf_size);
	footprint_region = ((struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region));
	assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
	    footprint_edge);

	/* get rid of trailing zeroes in the last region */
	assert(footprint_region->cfr_num_pages >=
	    footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -=
	    footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	/* reuse this region if it's now empty */
	if (footprint_region->cfr_num_pages == 0) {
		return footprint_region;
	}

	/* compute offset of new region */
	new_region_offset = footprint_header->cf_last_region;
	new_region_offset += sizeof(*footprint_region);
	new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
	new_region_offset = roundup(new_region_offset, sizeof(int));

	/* check if we're going over the edge */
	if (((uintptr_t)footprint_header +
	    new_region_offset +
	    sizeof(*footprint_region)) >=
	    footprint_edge) {
		/* over the edge: no new region */
		return NULL;
	}

	/* adjust offset of last region in header */
	footprint_header->cf_last_region = new_region_offset;

	new_footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);
	new_footprint_region->cfr_vaddr = 0;
	new_footprint_region->cfr_num_pages = 0;
	/* caller needs to initialize new region */

	return new_footprint_region;
}

/*
 * vm_map_corpse_footprint_collect:
 *	collects footprint information for "old_entry" in "old_map" and
 *	stores it in "new_map"'s vmmap_footprint_info.
 */
kern_return_t
vm_map_corpse_footprint_collect(
	vm_map_t        old_map,
	vm_map_entry_t  old_entry,
	vm_map_t        new_map)
{
	vm_map_offset_t va;
	kern_return_t   kr;
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	struct vm_map_corpse_footprint_region *new_footprint_region;
	cf_disp_t       *next_disp_p;
	uintptr_t       footprint_edge;
	uint32_t        num_pages_tmp;
	int             effective_page_size;

	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));

	va = old_entry->vme_start;

	vm_map_lock_assert_exclusive(old_map);
	vm_map_lock_assert_exclusive(new_map);

	assert(new_map->has_corpse_footprint);
	assert(!old_map->has_corpse_footprint);
	if (!new_map->has_corpse_footprint ||
	    old_map->has_corpse_footprint) {
		/*
		 * This can only transfer footprint info from a
		 * map with a live pmap to a map with a corpse footprint.
		 */
		return KERN_NOT_SUPPORTED;
	}

	if (new_map->vmmap_corpse_footprint == NULL) {
		vm_offset_t     buf;
		vm_size_t       buf_size;

		buf = 0;
		buf_size = (sizeof(*footprint_header) +
		    (old_map->hdr.nentries
		    *
		    (sizeof(*footprint_region) +
		    +3))                        /* potential alignment for each region */
		    +
		    ((old_map->size / effective_page_size)
		    *
		    sizeof(cf_disp_t)));        /* disposition for each page */
		// printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
		buf_size = round_page(buf_size);

		/* limit buffer to 1 page to validate overflow detection */
		// buf_size = PAGE_SIZE;

		/* limit size to a somewhat sane amount */
#if XNU_TARGET_OS_OSX
#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (8*1024*1024)   /* 8MB */
#else /* XNU_TARGET_OS_OSX */
#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (256*1024)      /* 256KB */
#endif /* XNU_TARGET_OS_OSX */
		if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
			buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
		}

		/*
		 * Allocate the pageable buffer (with a trailing guard page).
		 * It will be zero-filled on demand.
		 */
		kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
		    KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
		    VM_KERN_MEMORY_DIAG);
		if (kr != KERN_SUCCESS) {
			vm_map_corpse_footprint_no_buf++;
			return kr;
		}

		/* initialize header and 1st region */
		footprint_header = (struct vm_map_corpse_footprint_header *)buf;
		new_map->vmmap_corpse_footprint = footprint_header;

		footprint_header->cf_size = buf_size;
		footprint_header->cf_last_region =
		    sizeof(*footprint_header);
		footprint_header->cf_last_zeroes = 0;

		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header +
		    footprint_header->cf_last_region);
		footprint_region->cfr_vaddr = 0;
		footprint_region->cfr_num_pages = 0;
	} else {
		/* retrieve header and last region */
		footprint_header = (struct vm_map_corpse_footprint_header *)
		    new_map->vmmap_corpse_footprint;
		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header +
		    footprint_header->cf_last_region);
	}
	footprint_edge = ((uintptr_t)footprint_header +
	    footprint_header->cf_size);

	if ((footprint_region->cfr_vaddr +
	    (((vm_map_offset_t)footprint_region->cfr_num_pages) *
	    effective_page_size))
	    != old_entry->vme_start) {
		uint64_t num_pages_delta, num_pages_delta_size;
		uint32_t region_offset_delta_size;

		/*
		 * Not the next contiguous virtual address:
		 * start a new region or store "zero" dispositions for
		 * the missing pages?
		 */
		/* size of gap in actual page dispositions */
		num_pages_delta = ((old_entry->vme_start -
		    footprint_region->cfr_vaddr) / effective_page_size)
		    - footprint_region->cfr_num_pages;
		num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
		/* size of gap as a new footprint region header */
		region_offset_delta_size =
		    (sizeof(*footprint_region) +
		    roundup(((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
		    sizeof(int)) -
		    ((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
		// printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
		if (region_offset_delta_size < num_pages_delta_size ||
		    os_add3_overflow(footprint_region->cfr_num_pages,
		    (uint32_t) num_pages_delta,
		    1,
		    &num_pages_tmp)) {
			/*
			 * Storing data for this gap would take more space
			 * than inserting a new footprint region header:
			 * let's start a new region and save space. If it's a
			 * tie, let's avoid using a new region, since that
			 * would require more region hops to find the right
			 * range during lookups.
			 *
			 * If the current region's cfr_num_pages would overflow
			 * if we added "zero" page dispositions for the gap,
			 * no choice but to start a new region.
			 */
			// printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
			new_footprint_region =
			    vm_map_corpse_footprint_new_region(footprint_header);
			/* check that we're not going over the edge */
			if (new_footprint_region == NULL) {
				goto over_the_edge;
			}
			footprint_region = new_footprint_region;
			/* initialize new region as empty */
			footprint_region->cfr_vaddr = old_entry->vme_start;
			footprint_region->cfr_num_pages = 0;
		} else {
			/*
			 * Store "zero" page dispositions for the missing
			 * pages.
			 */
			// printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
			for (; num_pages_delta > 0; num_pages_delta--) {
				next_disp_p = (cf_disp_t *)
				    ((uintptr_t) footprint_region +
				    sizeof(*footprint_region));
				next_disp_p += footprint_region->cfr_num_pages;
				/* check that we're not going over the edge */
				if ((uintptr_t)next_disp_p >= footprint_edge) {
					goto over_the_edge;
				}
				/* store "zero" disposition for this gap page */
				footprint_region->cfr_num_pages++;
				*next_disp_p = (cf_disp_t) 0;
				footprint_header->cf_last_zeroes++;
			}
		}
	}

	for (va = old_entry->vme_start;
	    va < old_entry->vme_end;
	    va += effective_page_size) {
		int             disposition;
		cf_disp_t       cf_disp;

		vm_map_footprint_query_page_info(old_map,
		    old_entry,
		    va,
		    &disposition);
		cf_disp = vm_page_disposition_to_cf_disp(disposition);

		// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);

		if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
			/*
			 * Ignore "zero" dispositions at start of
			 * region: just move start of region.
			 */
			footprint_region->cfr_vaddr += effective_page_size;
			continue;
		}

		/* would region's cfr_num_pages overflow? */
		if (os_add_overflow(footprint_region->cfr_num_pages, 1,
		    &num_pages_tmp)) {
			/* overflow: create a new region */
			new_footprint_region =
			    vm_map_corpse_footprint_new_region(
				footprint_header);
			if (new_footprint_region == NULL) {
				goto over_the_edge;
			}
			footprint_region = new_footprint_region;
			footprint_region->cfr_vaddr = va;
			footprint_region->cfr_num_pages = 0;
		}

		next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
		    sizeof(*footprint_region));
		next_disp_p += footprint_region->cfr_num_pages;
		/* check that we're not going over the edge */
		if ((uintptr_t)next_disp_p >= footprint_edge) {
			goto over_the_edge;
		}
		/* store this disposition */
		*next_disp_p = cf_disp;
		footprint_region->cfr_num_pages++;

		if (cf_disp != 0) {
			/* non-zero disp: break the current zero streak */
			footprint_header->cf_last_zeroes = 0;
			/* done */
			continue;
		}

		/* zero disp: add to the current streak of zeroes */
		footprint_header->cf_last_zeroes++;
		if ((footprint_header->cf_last_zeroes +
		    roundup(((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
		    (sizeof(int) - 1),
		    sizeof(int))) <
		    (sizeof(*footprint_header))) {
			/*
			 * There are not enough trailing "zero" dispositions
			 * (+ the extra padding we would need for the previous
			 * region); creating a new region would not save space
			 * at this point, so let's keep this "zero" disposition
			 * in this region and reconsider later.
			 */
			continue;
		}
		/*
		 * Create a new region to avoid having too many consecutive
		 * "zero" dispositions.
		 */
		new_footprint_region =
		    vm_map_corpse_footprint_new_region(footprint_header);
		if (new_footprint_region == NULL) {
			goto over_the_edge;
		}
		footprint_region = new_footprint_region;
		/* initialize the new region as empty ... */
		footprint_region->cfr_num_pages = 0;
		/* ... and skip this "zero" disp */
		footprint_region->cfr_vaddr = va + effective_page_size;
	}

	return KERN_SUCCESS;

over_the_edge:
	// printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
	vm_map_corpse_footprint_full++;
	return KERN_RESOURCE_SHORTAGE;
}

/*
 * vm_map_corpse_footprint_collect_done:
 *	completes the footprint collection by getting rid of any remaining
 *	trailing "zero" dispositions and trimming the unused part of the
 *	kernel buffer
 */
void
vm_map_corpse_footprint_collect_done(
	vm_map_t        new_map)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	vm_size_t       buf_size, actual_size;
	kern_return_t   kr;

	assert(new_map->has_corpse_footprint);
	if (!new_map->has_corpse_footprint ||
	    new_map->vmmap_corpse_footprint == NULL) {
		return;
	}

	footprint_header = (struct vm_map_corpse_footprint_header *)
	    new_map->vmmap_corpse_footprint;
	buf_size = footprint_header->cf_size;

	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);

	/* get rid of trailing zeroes in last region */
	assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	actual_size = (vm_size_t)(footprint_header->cf_last_region +
	    sizeof(*footprint_region) +
	    (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));

	// printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
	vm_map_corpse_footprint_size_avg =
	    (((vm_map_corpse_footprint_size_avg *
	    vm_map_corpse_footprint_count) +
	    actual_size) /
	    (vm_map_corpse_footprint_count + 1));
	vm_map_corpse_footprint_count++;
	if (actual_size > vm_map_corpse_footprint_size_max) {
		vm_map_corpse_footprint_size_max = actual_size;
	}

	actual_size = round_page(actual_size);
	if (buf_size > actual_size) {
		kr = vm_deallocate(kernel_map,
		    ((vm_address_t)footprint_header +
		    actual_size +
		    PAGE_SIZE),                 /* trailing guard page */
		    (buf_size - actual_size));
		assertf(kr == KERN_SUCCESS,
		    "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
		kr = vm_protect(kernel_map,
		    ((vm_address_t)footprint_header +
		    actual_size),
		    PAGE_SIZE,
		    FALSE,                      /* set_maximum */
		    VM_PROT_NONE);
		assertf(kr == KERN_SUCCESS,
		    "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
	}

	footprint_header->cf_size = actual_size;
}

/*
 * vm_map_corpse_footprint_query_page_info:
 *	retrieves the disposition of the page at virtual address "va"
 *	in the forked corpse's VM map
 *
 * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
 */
kern_return_t
vm_map_corpse_footprint_query_page_info(
	vm_map_t        map,
	vm_map_offset_t va,
	int             *disposition_p)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	uint32_t        footprint_region_offset;
	vm_map_offset_t region_start, region_end;
	int             disp_idx;
	kern_return_t   kr;
	int             effective_page_size;
	cf_disp_t       cf_disp;

	if (!map->has_corpse_footprint) {
		*disposition_p = 0;
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	footprint_header = map->vmmap_corpse_footprint;
	if (footprint_header == NULL) {
		*disposition_p = 0;
		// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	/* start looking at the hint ("cf_hint_region") */
	footprint_region_offset = footprint_header->cf_hint_region;

	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

lookup_again:
	if (footprint_region_offset < sizeof(*footprint_header)) {
		/* hint too low: start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	if (footprint_region_offset >= footprint_header->cf_last_region) {
		/* hint too high: re-start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header + footprint_region_offset);
	region_start = footprint_region->cfr_vaddr;
	region_end = (region_start +
	    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
	    effective_page_size));
	if (va < region_start &&
	    footprint_region_offset != sizeof(*footprint_header)) {
		/* our range starts before the hint region */

		/* reset the hint (in a racy way...) */
		footprint_header->cf_hint_region = sizeof(*footprint_header);
		/* lookup "va" again from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
		goto lookup_again;
	}

	while (va >= region_end) {
		if (footprint_region_offset >= footprint_header->cf_last_region) {
			break;
		}
		/* skip the region's header */
		footprint_region_offset += sizeof(*footprint_region);
		/* skip the region's page dispositions */
		footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
		/* align to next word boundary */
		footprint_region_offset =
		    roundup(footprint_region_offset,
		    sizeof(int));
		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header + footprint_region_offset);
		region_start = footprint_region->cfr_vaddr;
		region_end = (region_start +
		    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
		    effective_page_size));
	}
	if (va < region_start || va >= region_end) {
		/* page not found */
		*disposition_p = 0;
		// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_SUCCESS;
		goto done;
	}

	/* "va" found: set the lookup hint for the next lookup (in a racy way...) */
	footprint_header->cf_hint_region = footprint_region_offset;

	/* get page disposition for "va" in this region */
	disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
	cf_disp = footprint_region->cfr_disposition[disp_idx];
	*disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
	kr = KERN_SUCCESS;
done:
	// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
	/* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
	DTRACE_VM4(footprint_query_page_info,
	    vm_map_t, map,
	    vm_map_offset_t, va,
	    int, *disposition_p,
	    kern_return_t, kr);

	return kr;
}

void
vm_map_corpse_footprint_destroy(
	vm_map_t        map)
{
	if (map->has_corpse_footprint &&
	    map->vmmap_corpse_footprint != 0) {
		struct vm_map_corpse_footprint_header *footprint_header;
		vm_size_t       buf_size;
		kern_return_t   kr;

		footprint_header = map->vmmap_corpse_footprint;
		buf_size = footprint_header->cf_size;
		kr = vm_deallocate(kernel_map,
		    (vm_offset_t) map->vmmap_corpse_footprint,
		    ((vm_size_t) buf_size
		    + PAGE_SIZE));              /* trailing guard page */
		assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
		map->vmmap_corpse_footprint = 0;
		map->has_corpse_footprint = FALSE;
	}
}

/*
 * vm_map_copy_footprint_ledgers:
 *	copies any ledger that's relevant to the memory footprint of "old_task"
 *	into the forked corpse's task ("new_task")
 */
void
vm_map_copy_footprint_ledgers(
	task_t  old_task,
	task_t  new_task)
{
	vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
}

/*
 * vm_map_copy_ledger:
 *	copy a single ledger from "old_task" to "new_task"
 */
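/*
 * For example (illustrative numbers): if "old_task" has a phys_footprint
 * balance of 100MB and the freshly forked corpse has 4MB, the function
 * below credits the 96MB difference to "new_task" so that both ledgers
 * report the same balance afterwards.
 */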
void
vm_map_copy_ledger(
	task_t  old_task,
	task_t  new_task,
	int     ledger_entry)
{
	ledger_amount_t old_balance, new_balance, delta;

	assert(new_task->map->has_corpse_footprint);
	if (!new_task->map->has_corpse_footprint) {
		return;
	}

	/* turn off sanity checks for the ledger we're about to mess with */
	ledger_disable_panic_on_negative(new_task->ledger,
	    ledger_entry);

	/* adjust "new_task" to match "old_task" */
	ledger_get_balance(old_task->ledger,
	    ledger_entry,
	    &old_balance);
	ledger_get_balance(new_task->ledger,
	    ledger_entry,
	    &new_balance);
	if (new_balance == old_balance) {
		/* new == old: done */
	} else if (new_balance > old_balance) {
		/* new > old ==> new -= new - old */
		delta = new_balance - old_balance;
		ledger_debit(new_task->ledger,
		    ledger_entry,
		    delta);
	} else {
		/* new < old ==> new += old - new */
		delta = old_balance - new_balance;
		ledger_credit(new_task->ledger,
		    ledger_entry,
		    delta);
	}
}

/*
 * vm_map_get_pmap:
 *	returns the pmap associated with the vm_map
 */
pmap_t
vm_map_get_pmap(vm_map_t map)
{
	return vm_map_pmap(map);
}

#if CONFIG_MAP_RANGES
static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];

static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);

/*
 * vm_map_range_map_init:
 *	initializes the VM range ID map to enable index lookup
 *	of user VM ranges based on the VM tag from userspace.
 */
static void
vm_map_range_map_init(void)
{
	/*
	 * VM_MEMORY_MALLOC{,_NANO} are skipped on purpose:
	 * - the former is malloc metadata which should be kept separate
	 * - the latter has its own ranges
	 */
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_TCMALLOC);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LIBNETWORK);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOACCELERATOR);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOSURFACE);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IMAGEIO);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREGRAPHICS);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_CORESERVICES);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREDATA);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LAYERKIT);
}

static struct mach_vm_range
vm_map_range_random_uniform(
	vm_map_size_t   req_size,
	vm_map_offset_t min_addr,
	vm_map_offset_t max_addr,
	vm_map_offset_t offmask)
{
	vm_map_offset_t random_addr;
	struct mach_vm_range alloc;

	req_size = (req_size + offmask) & ~offmask;
	min_addr = (min_addr + offmask) & ~offmask;
	max_addr = max_addr & ~offmask;

	read_random(&random_addr, sizeof(random_addr));
	random_addr %= (max_addr - req_size - min_addr);
	random_addr &= ~offmask;

	alloc.min_address = min_addr + random_addr;
	alloc.max_address = min_addr + random_addr + req_size;
	return alloc;
}
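
/*
 * Worked example (illustrative): with req_size = 1T, min_addr = 0x61T,
 * max_addr = 0x7fT and offmask = 64G - 1, the function above returns a
 * 1T range starting at a random 64G-aligned address within [0x61T, 0x7eT],
 * so the whole allocation fits below max_addr.
 */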

static vm_map_offset_t
vm_map_range_offmask(void)
{
	uint32_t pte_depth;

	/*
	 * PTE optimizations
	 *
	 *
	 * 16k pages systems
	 * ~~~~~~~~~~~~~~~~~
	 *
	 * A single L1 (sub-)page covers the address space.
	 * - L2 pages cover 64G,
	 * - L3 pages cover 32M.
	 *
	 * On embedded, the dynamic VA range is 64G and uses a single L2 page.
	 * As a result, we really only need to align the ranges to 32M to avoid
	 * partial L3 pages.
	 *
	 * On macOS, the usage of L2 pages will increase, so as a result we will
	 * want to align ranges to 64G in order to utilize them fully.
	 *
	 *
	 * 4k pages systems
	 * ~~~~~~~~~~~~~~~~
	 *
	 * A single L0 (sub-)page covers the address space.
	 * - L1 pages cover 512G,
	 * - L2 pages cover 1G,
	 * - L3 pages cover 2M.
	 *
	 * The long tail of processes on a system will tend to have a VA usage
	 * (ignoring the shared regions) in the 100s of MB order of magnitude.
	 * This is achievable with a single L1 and a few L2s without
	 * randomization.
	 *
	 * However once randomization is introduced, the system will immediately
	 * need several L1s and many more L2s. As a result:
	 *
	 * - on embedded devices, the cost of these extra pages isn't
	 *   sustainable, and we just disable the feature entirely,
	 *
	 * - on macOS we align ranges to a 512G boundary so that the extra L1
	 *   pages can be used to their full potential.
	 */

	/*
	 * Note: this function assumes _non-exotic mappings_,
	 * which is why it uses the native kernel's PAGE_SHIFT.
	 */
#if XNU_PLATFORM_MacOSX
	pte_depth = PAGE_SHIFT > 12 ? 2 : 3;
#else /* !XNU_PLATFORM_MacOSX */
	pte_depth = PAGE_SHIFT > 12 ? 1 : 0;
#endif /* !XNU_PLATFORM_MacOSX */

	if (pte_depth == 0) {
		return 0;
	}

	return (1ull << ((PAGE_SHIFT - 3) * pte_depth + PAGE_SHIFT)) - 1;
}
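
/*
 * Sanity check of the formula above (illustrative arithmetic): a page
 * holds PAGE_SIZE / 8 PTEs, so each extra level covers PAGE_SHIFT - 3
 * additional VA bits.
 *
 * - macOS, 16k pages:    pte_depth = 2, (14 - 3) * 2 + 14 = 36 -> 64G - 1
 * - macOS, 4k pages:     pte_depth = 3, (12 - 3) * 3 + 12 = 39 -> 512G - 1
 * - embedded, 16k pages: pte_depth = 1, (14 - 3) * 1 + 14 = 25 -> 32M - 1
 *
 * which matches the 32M/64G/512G alignments called out in the block
 * comment above.
 */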

/*
 * vm_map_range_configure:
 *	configures the user vm_map ranges by increasing the maximum VA range
 *	of the map and carving out a range at the end of VA space (searching
 *	backwards in the newly expanded map).
 */
kern_return_t
vm_map_range_configure(vm_map_t map)
{
	const vm_map_offset_t offmask = vm_map_range_offmask();
	struct mach_vm_range data_range;
	vm_map_offset_t default_end;
	kern_return_t kr;

	if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) {
		/*
		 * No point doing VM ranges in a 32-bit or exotic address
		 * space, or when range alignment is disabled (offmask == 0).
		 */
		return KERN_NOT_SUPPORTED;
	}

	/* Should not be applying ranges to kernel map or kernel map submaps */
	assert(vm_map_pmap(map) != kernel_pmap);

#if XNU_PLATFORM_MacOSX

	/*
	 * On macOS, the address space is a massive 47 bits (128T),
	 * with several carve outs that processes can't use:
	 * - the shared region
	 * - the commpage region
	 * - the GPU carve out (if applicable)
	 *
	 * and when nano-malloc is in use it desires memory at the 96T mark.
	 *
	 * However, their location is architecture dependent:
	 * - On Intel, the shared region and commpage are
	 *   at the very end of the usable address space (above +127T),
	 *   there is no GPU carve out, and pthread wants to place
	 *   threads at the 112T mark (0x70T).
	 *
	 * - On arm64, these are in the same spot as on embedded devices:
	 *   o shared region:   [ 6G, 10G)  [ will likely grow over time ]
	 *   o commpage region: [63G, 64G)
	 *   o GPU carve out:   [64G, 448G)
	 *
	 * This is convenient because the mappings at the end of the address
	 * space (when they exist) are made by the kernel.
	 *
	 * The policy is to allocate a random 1T for the data heap
	 * at the end of the address space, in the:
	 * - [0x71, 0x7f) range on Intel (to leave space for pthread stacks)
	 * - [0x61, 0x7f) range on arm64 (to leave space for Nano malloc).
	 */

	/* see NANOZONE_SIGNATURE in libmalloc */
#if __x86_64__
	default_end = 0x71ull << 40;
#else
	default_end = 0x61ull << 40;
#endif
	data_range = vm_map_range_random_uniform(1ull << 40,
	    default_end, 0x7full << 40, offmask);

#else /* !XNU_PLATFORM_MacOSX */

	/*
	 * Embedded devices:
	 *
	 * The default VA size scales with the device physical memory.
	 *
	 * Out of that:
	 * - the "zero" page typically uses 4G + some slide
	 * - the shared region uses SHARED_REGION_SIZE bytes (4G)
	 *
	 * Without the use of jumbo or any adjustment to the address space,
	 * a default VM map typically looks like this:
	 *
	 *        0G -->╒════════════╕
	 *              │  pagezero  │
	 *              │  + slide   │
	 *       ~4G -->╞════════════╡<-- vm_map_min(map)
	 *              │            │
	 *        6G -->├────────────┤
	 *              │   shared   │
	 *              │   region   │
	 *       10G -->├────────────┤
	 *              │            │
	 *    max_va -->├────────────┤<-- vm_map_max(map)
	 *              │            │
	 *              ╎    jumbo   ╎
	 *              ╎            ╎
	 *              │            │
	 *       63G -->╞════════════╡<-- MACH_VM_MAX_ADDRESS
	 *              │  commpage  │
	 *       64G -->├────────────┤<-- MACH_VM_MIN_GPU_CARVEOUT_ADDRESS
	 *              │            │
	 *              ╎     GPU    ╎
	 *              ╎  carveout  ╎
	 *              │            │
	 *      448G -->├────────────┤<-- MACH_VM_MAX_GPU_CARVEOUT_ADDRESS
	 *              │            │
	 *              ╎            ╎
	 *              ╎            ╎
	 *              │            │
	 *      512G -->╘════════════╛<-- (1ull << ARM_16K_TT_L1_SHIFT)
	 *
	 * When this drawing was made, "max_va" was smaller than
	 * ARM64_MAX_OFFSET_DEVICE_LARGE (~15.5G), leaving shy of
	 * 12G of address space for the zero-page, slide, files,
	 * binaries, heap ...
	 *
	 * We will want to make a "heap/data" carve out inside
	 * the jumbo range of half of that usable space, assuming
	 * that this is less than a fourth of the jumbo range.
	 *
	 * The assert below intends to catch when max_va grows
	 * too large for this heuristic.
	 */

	vm_map_lock_read(map);
	default_end = vm_map_max(map);
	vm_map_unlock_read(map);

	/*
	 * Check that we're not already jumbo'd,
	 * or our address space was somehow modified.
	 *
	 * If so we cannot guarantee that we can set up the ranges
	 * safely without interfering with the existing map.
	 */
	if (default_end > vm_compute_max_offset(true)) {
		return KERN_NO_SPACE;
	}

	if (pmap_max_offset(true, ARM_PMAP_MAX_OFFSET_DEFAULT)) {
		/*
		 * An override boot-arg was set: disable user ranges.
		 *
		 * XXX: this is problematic because it means these boot-args
		 *      no longer test the behavior that changing the value
		 *      of ARM64_MAX_OFFSET_DEVICE_* would have.
		 */
		return KERN_NOT_SUPPORTED;
	}

	/* expand the default VM space to the largest possible address */
	vm_map_set_jumbo(map);

	assert3u(7 * GiB(10) / 2, <=, vm_map_max(map) - default_end);
	data_range = vm_map_range_random_uniform(GiB(10),
	    default_end + PAGE_SIZE, vm_map_max(map), offmask);

#endif /* !XNU_PLATFORM_MacOSX */

	/*
	 * Poke holes so that ASAN or people listing regions
	 * do not think this space is free.
	 */

	if (default_end != data_range.min_address) {
		kr = vm_map_enter(map, &default_end,
		    data_range.min_address - default_end,
		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
		assert(kr == KERN_SUCCESS);
	}

	if (data_range.max_address != vm_map_max(map)) {
		vm_map_entry_t entry;
		vm_size_t size;

		vm_map_lock_read(map);
		vm_map_lookup_entry_or_next(map, data_range.max_address, &entry);
		if (entry != vm_map_to_entry(map)) {
			size = entry->vme_start - data_range.max_address;
		} else {
			size = vm_map_max(map) - data_range.max_address;
		}
		vm_map_unlock_read(map);

		kr = vm_map_enter(map, &data_range.max_address, size,
		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
		assert(kr == KERN_SUCCESS);
	}

	vm_map_lock(map);
	map->default_range.min_address = vm_map_min(map);
	map->default_range.max_address = default_end;
	map->data_range = data_range;
	map->uses_user_ranges = true;
	vm_map_unlock(map);

	return KERN_SUCCESS;
}
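
/*
 * Net effect (illustrative): on macOS/x86_64, for instance, the map is
 * left with default_range = [vm_map_min(map), 0x71T) and a 1T data_range
 * placed at a random 512G-aligned spot inside [0x71T, 0x7fT), with the
 * leftover gaps plugged by permanent VM_PROT_NONE entries so nothing
 * else can land there.
 */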

/*
 * vm_map_range_fork:
 *	clones the array of ranges from old_map to new_map in support
 *	of a VM map fork.
 */
void
vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
{
	if (!old_map->uses_user_ranges) {
		/* nothing to do */
		return;
	}

	new_map->default_range = old_map->default_range;
	new_map->data_range = old_map->data_range;

	if (old_map->extra_ranges_count) {
		vm_map_user_range_t otable, ntable;
		uint16_t count;

		otable = old_map->extra_ranges;
		count = old_map->extra_ranges_count;
		ntable = kalloc_data(count * sizeof(struct vm_map_user_range),
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);
		memcpy(ntable, otable,
		    count * sizeof(struct vm_map_user_range));

		new_map->extra_ranges_count = count;
		new_map->extra_ranges = ntable;
	}

	new_map->uses_user_ranges = true;
}

/*
 * vm_map_get_user_range:
 *	copies the VM user range for the given VM map and range ID.
 */
kern_return_t
vm_map_get_user_range(
	vm_map_t                map,
	vm_map_range_id_t       range_id,
	mach_vm_range_t         range)
{
	if (map == NULL || !map->uses_user_ranges || range == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	switch (range_id) {
	case UMEM_RANGE_ID_DEFAULT:
		*range = map->default_range;
		return KERN_SUCCESS;

	case UMEM_RANGE_ID_HEAP:
		*range = map->data_range;
		return KERN_SUCCESS;

	default:
		return KERN_INVALID_ARGUMENT;
	}
}

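/*
 * vm_map_user_range_resolve:
 *	returns the range ID of the range containing [addr, addr + size),
 *	checking the default range first, then the data range, then any
 *	extra ranges; if nothing matches, returns UMEM_RANGE_ID_DEFAULT
 *	and zeroes *range.
 */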
static vm_map_range_id_t
vm_map_user_range_resolve(
	vm_map_t                map,
	mach_vm_address_t       addr,
	mach_vm_size_t          size,
	mach_vm_range_t         range)
{
	struct mach_vm_range tmp;

	vm_map_lock_assert_held(map);

	static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
	static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);

	if (mach_vm_range_contains(&map->default_range, addr, size)) {
		if (range) {
			*range = map->default_range;
		}
		return UMEM_RANGE_ID_DEFAULT;
	}

	if (mach_vm_range_contains(&map->data_range, addr, size)) {
		if (range) {
			*range = map->data_range;
		}
		return UMEM_RANGE_ID_HEAP;
	}

	for (size_t i = 0; i < map->extra_ranges_count; i++) {
		vm_map_user_range_t r = &map->extra_ranges[i];

		tmp.min_address = r->vmur_min_address;
		tmp.max_address = r->vmur_max_address;

		if (mach_vm_range_contains(&tmp, addr, size)) {
			if (range) {
				*range = tmp;
			}
			return r->vmur_range_id;
		}
	}

	if (range) {
		range->min_address = range->max_address = 0;
	}
	return UMEM_RANGE_ID_DEFAULT;
}

static int
vm_map_user_range_cmp(const void *e1, const void *e2)
{
	const struct vm_map_user_range *r1 = e1;
	const struct vm_map_user_range *r2 = e2;

	if (r1->vmur_min_address != r2->vmur_min_address) {
		return r1->vmur_min_address < r2->vmur_min_address ? -1 : 1;
	}

	return 0;
}

static int
mach_vm_range_recipe_v1_cmp(const void *e1, const void *e2)
{
	const mach_vm_range_recipe_v1_t *r1 = e1;
	const mach_vm_range_recipe_v1_t *r2 = e2;

	if (r1->range.min_address != r2->range.min_address) {
		return r1->range.min_address < r2->range.min_address ? -1 : 1;
	}

	return 0;
}

/*!
 * @function mach_vm_range_create_v1()
 *
 * @brief
 * Handle the backend for mach_vm_range_create() for the
 * MACH_VM_RANGE_FLAVOR_V1 flavor.
 *
 * @description
 * This call allows the creation of "ranges" in the map of a task
 * that have special semantics/policies around placement of
 * new allocations (in the vm_map_locate_space() sense).
 *
 * @returns
 * - KERN_SUCCESS on success
 * - KERN_INVALID_ARGUMENT for incorrect arguments
 * - KERN_NO_SPACE if the maximum amount of ranges would be exceeded
 * - KERN_MEMORY_PRESENT if any of the requested ranges
 *   overlaps with existing ranges or allocations in the map.
 */
static kern_return_t
mach_vm_range_create_v1(
	vm_map_t                        map,
	mach_vm_range_recipe_v1_t      *recipe,
	uint32_t                        new_count)
{
	const vm_offset_t mask = VM_MAP_PAGE_MASK(map);
	vm_map_user_range_t table;
	kern_return_t kr = KERN_SUCCESS;
	uint16_t count;

	struct mach_vm_range void1 = {
		.min_address = map->default_range.max_address,
		.max_address = map->data_range.min_address,
	};
	struct mach_vm_range void2 = {
		.min_address = map->data_range.max_address,
		.max_address = vm_map_max(map),
	};

	qsort(recipe, new_count, sizeof(mach_vm_range_recipe_v1_t),
	    mach_vm_range_recipe_v1_cmp);

	/*
	 * Step 1: Validate that the recipes have no intersections.
	 */

	for (size_t i = 0; i < new_count; i++) {
		mach_vm_range_t r = &recipe[i].range;
		mach_vm_size_t s;

		if (recipe[i].flags) {
			return KERN_INVALID_ARGUMENT;
		}

		static_assert(UMEM_RANGE_ID_FIXED == MACH_VM_RANGE_FIXED);
		switch (recipe[i].range_tag) {
		case MACH_VM_RANGE_FIXED:
			break;
		default:
			return KERN_INVALID_ARGUMENT;
		}

		if (!VM_MAP_PAGE_ALIGNED(r->min_address, mask) ||
		    !VM_MAP_PAGE_ALIGNED(r->max_address, mask) ||
		    r->min_address >= r->max_address) {
			return KERN_INVALID_ARGUMENT;
		}

		s = mach_vm_range_size(r);
		if (!mach_vm_range_contains(&void1, r->min_address, s) &&
		    !mach_vm_range_contains(&void2, r->min_address, s)) {
			return KERN_INVALID_ARGUMENT;
		}

		if (i > 0 && recipe[i - 1].range.max_address >
		    recipe[i].range.min_address) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	vm_map_lock(map);

	table = map->extra_ranges;
	count = map->extra_ranges_count;

	if (count + new_count > VM_MAP_EXTRA_RANGES_MAX) {
		kr = KERN_NO_SPACE;
		goto out_unlock;
	}

	/*
	 * Step 2: Check that there is no intersection with existing ranges.
	 */

	for (size_t i = 0, j = 0; i < new_count && j < count;) {
		mach_vm_range_t r1 = &recipe[i].range;
		vm_map_user_range_t r2 = &table[j];

		if (r1->max_address <= r2->vmur_min_address) {
			i++;
		} else if (r2->vmur_max_address <= r1->min_address) {
			j++;
		} else {
			kr = KERN_MEMORY_PRESENT;
			goto out_unlock;
		}
	}

	/*
	 * Step 3: Commit the new ranges.
	 */

	static_assert(VM_MAP_EXTRA_RANGES_MAX * sizeof(struct vm_map_user_range) <=
	    KALLOC_SAFE_ALLOC_SIZE);

	table = krealloc_data(table,
	    count * sizeof(struct vm_map_user_range),
	    (count + new_count) * sizeof(struct vm_map_user_range),
	    Z_ZERO | Z_WAITOK | Z_NOFAIL);

	for (size_t i = 0; i < new_count; i++) {
		static_assert(MACH_VM_MAX_ADDRESS < (1ull << 56));

		table[count + i] = (struct vm_map_user_range){
			.vmur_min_address = recipe[i].range.min_address,
			.vmur_max_address = recipe[i].range.max_address,
			.vmur_range_id = (vm_map_range_id_t)recipe[i].range_tag,
		};
	}

	qsort(table, count + new_count,
	    sizeof(struct vm_map_user_range), vm_map_user_range_cmp);

	map->extra_ranges_count += new_count;
	map->extra_ranges = table;

out_unlock:
	vm_map_unlock(map);

	if (kr == KERN_SUCCESS) {
		for (size_t i = 0; i < new_count; i++) {
			vm_map_kernel_flags_t vmk_flags = {
				.vmf_fixed = true,
				.vmf_overwrite = true,
				.vmkf_overwrite_immutable = true,
				.vm_tag = recipe[i].vm_tag,
			};
			__assert_only kern_return_t kr2;

			kr2 = vm_map_enter(map, &recipe[i].range.min_address,
			    mach_vm_range_size(&recipe[i].range),
			    0, vmk_flags, VM_OBJECT_NULL, 0, FALSE,
			    VM_PROT_NONE, VM_PROT_ALL,
			    VM_INHERIT_DEFAULT);
			assert(kr2 == KERN_SUCCESS);
		}
	}
	return kr;
}

kern_return_t
mach_vm_range_create(
	vm_map_t                        map,
	mach_vm_range_flavor_t          flavor,
	mach_vm_range_recipes_raw_t     recipe,
	natural_t                       size)
{
	if (map != current_map()) {
		return KERN_INVALID_ARGUMENT;
	}

	if (!map->uses_user_ranges) {
		return KERN_NOT_SUPPORTED;
	}

	if (size == 0) {
		return KERN_SUCCESS;
	}

	if (flavor == MACH_VM_RANGE_FLAVOR_V1) {
		mach_vm_range_recipe_v1_t *array;

		if (size % sizeof(mach_vm_range_recipe_v1_t)) {
			return KERN_INVALID_ARGUMENT;
		}

		size /= sizeof(mach_vm_range_recipe_v1_t);
		if (size > VM_MAP_EXTRA_RANGES_MAX) {
			return KERN_NO_SPACE;
		}

		array = (mach_vm_range_recipe_v1_t *)recipe;
		return mach_vm_range_create_v1(map, array, size);
	}

	return KERN_INVALID_ARGUMENT;
}
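
/*
 * Usage sketch (hypothetical: assumes the userspace MIG prototype mirrors
 * the kernel signature above, and elides the actual addresses): a task
 * could reserve a fixed range in one of the voids around its data range
 * with a single v1 recipe:
 *
 *	mach_vm_range_recipe_v1_t recipe = {
 *		.range_tag = MACH_VM_RANGE_FIXED,
 *		.vm_tag    = VM_MEMORY_APPLICATION_SPECIFIC_1,
 *		.range     = { .min_address = ..., .max_address = ... },
 *	};
 *
 *	kr = mach_vm_range_create(mach_task_self(),
 *	    MACH_VM_RANGE_FLAVOR_V1,
 *	    (mach_vm_range_recipes_raw_t)&recipe, sizeof(recipe));
 *
 * The addresses must be page aligned, the call only applies to the
 * current task's map, and an overlap with existing ranges or
 * allocations returns KERN_MEMORY_PRESENT.
 */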

#else /* !CONFIG_MAP_RANGES */

kern_return_t
mach_vm_range_create(
	vm_map_t                        map,
	mach_vm_range_flavor_t          flavor,
	mach_vm_range_recipes_raw_t     recipe,
	natural_t                       size)
{
#pragma unused(map, flavor, recipe, size)
	return KERN_NOT_SUPPORTED;
}

#endif /* !CONFIG_MAP_RANGES */

void
vm_map_kernel_flags_update_range_id(vm_map_kernel_flags_t *vmkf, vm_map_t map)
{
	if (map == kernel_map) {
		if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
			vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
		}
#if CONFIG_MAP_RANGES
	} else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
	    vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT &&
	    bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
		vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
#endif /* CONFIG_MAP_RANGES */
	}
}
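
/*
 * For example: an untagged kernel_map allocation defaults to
 * KMEM_RANGE_ID_DATA, while a user allocation tagged
 * VM_MEMORY_MALLOC_SMALL (one of the tags vm_map_range_map_init()
 * put in the heap bitmap) is steered into UMEM_RANGE_ID_HEAP.
 */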

/*
 * vm_map_entry_has_device_pager:
 *	Check if the vm map entry specified by the virtual address has a
 *	device pager. If the vm map entry does not exist or if the map is
 *	NULL, this returns FALSE.
 */
boolean_t
vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
{
	vm_map_entry_t entry;
	vm_object_t object;
	boolean_t result;

	if (map == NULL) {
		return FALSE;
	}

	vm_map_lock(map);
	while (TRUE) {
		if (!vm_map_lookup_entry(map, vaddr, &entry)) {
			result = FALSE;
			break;
		}
		if (entry->is_sub_map) {
			/*
			 * Descend into the submap, taking its lock before
			 * dropping the parent's (hand-over-hand locking).
			 */
			vm_map_t submap = VME_SUBMAP(entry);
			assert(submap != NULL);
			vm_map_lock(submap);
			vm_map_unlock(map);
			map = submap;
			continue;
		}
		object = VME_OBJECT(entry);
		if (object != NULL && object->pager != NULL &&
		    is_device_pager_ops(object->pager->mo_pager_ops)) {
			result = TRUE;
			break;
		}
		result = FALSE;
		break;
	}

	vm_map_unlock(map);
	return result;
}


#if MACH_ASSERT

extern int pmap_ledgers_panic;
extern int pmap_ledgers_panic_leeway;

#define LEDGER_DRIFT(__LEDGER)                          \
	int             __LEDGER##_over;                \
	ledger_amount_t __LEDGER##_over_total;          \
	ledger_amount_t __LEDGER##_over_max;            \
	int             __LEDGER##_under;               \
	ledger_amount_t __LEDGER##_under_total;         \
	ledger_amount_t __LEDGER##_under_max
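
/*
 * For instance, LEDGER_DRIFT(internal) expands to the six fields
 * internal_over, internal_over_total, internal_over_max, internal_under,
 * internal_under_total and internal_under_max, counting how often and by
 * how much the "internal" ledger drifted in each direction.
 */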

struct {
	uint64_t        num_pmaps_checked;

	LEDGER_DRIFT(phys_footprint);
	LEDGER_DRIFT(internal);
	LEDGER_DRIFT(internal_compressed);
	LEDGER_DRIFT(external);
	LEDGER_DRIFT(reusable);
	LEDGER_DRIFT(iokit_mapped);
	LEDGER_DRIFT(alternate_accounting);
	LEDGER_DRIFT(alternate_accounting_compressed);
	LEDGER_DRIFT(page_table);
	LEDGER_DRIFT(purgeable_volatile);
	LEDGER_DRIFT(purgeable_nonvolatile);
	LEDGER_DRIFT(purgeable_volatile_compressed);
	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
	LEDGER_DRIFT(tagged_nofootprint);
	LEDGER_DRIFT(tagged_footprint);
	LEDGER_DRIFT(tagged_nofootprint_compressed);
	LEDGER_DRIFT(tagged_footprint_compressed);
	LEDGER_DRIFT(network_volatile);
	LEDGER_DRIFT(network_nonvolatile);
	LEDGER_DRIFT(network_volatile_compressed);
	LEDGER_DRIFT(network_nonvolatile_compressed);
	LEDGER_DRIFT(media_nofootprint);
	LEDGER_DRIFT(media_footprint);
	LEDGER_DRIFT(media_nofootprint_compressed);
	LEDGER_DRIFT(media_footprint_compressed);
	LEDGER_DRIFT(graphics_nofootprint);
	LEDGER_DRIFT(graphics_footprint);
	LEDGER_DRIFT(graphics_nofootprint_compressed);
	LEDGER_DRIFT(graphics_footprint_compressed);
	LEDGER_DRIFT(neural_nofootprint);
	LEDGER_DRIFT(neural_footprint);
	LEDGER_DRIFT(neural_nofootprint_compressed);
	LEDGER_DRIFT(neural_footprint_compressed);
} pmap_ledgers_drift;

void
vm_map_pmap_check_ledgers(
	pmap_t          pmap,
	ledger_t        ledger,
	int             pid,
	char            *procname)
{
	ledger_amount_t bal;
	boolean_t       do_panic;

	do_panic = FALSE;

	pmap_ledgers_drift.num_pmaps_checked++;

#define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
MACRO_BEGIN                                                             \
	int panic_on_negative = TRUE;                                   \
	ledger_get_balance(ledger,                                      \
	    task_ledgers.__LEDGER,                                      \
	    &bal);                                                      \
	ledger_get_panic_on_negative(ledger,                            \
	    task_ledgers.__LEDGER,                                      \
	    &panic_on_negative);                                        \
	if (bal != 0) {                                                 \
		if (panic_on_negative ||                                \
		    (pmap_ledgers_panic &&                              \
		    pmap_ledgers_panic_leeway > 0 &&                    \
		    (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||   \
		    bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
			do_panic = TRUE;                                \
		}                                                       \
		printf("LEDGER BALANCE proc %d (%s) "                   \
		    "\"%s\" = %lld\n",                                  \
		    pid, procname, #__LEDGER, bal);                     \
		if (bal > 0) {                                          \
			pmap_ledgers_drift.__LEDGER##_over++;           \
			pmap_ledgers_drift.__LEDGER##_over_total += bal; \
			if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
				pmap_ledgers_drift.__LEDGER##_over_max = bal; \
			}                                               \
		} else if (bal < 0) {                                   \
			pmap_ledgers_drift.__LEDGER##_under++;          \
			pmap_ledgers_drift.__LEDGER##_under_total += bal; \
			if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
				pmap_ledgers_drift.__LEDGER##_under_max = bal; \
			}                                               \
		}                                                       \
	}                                                               \
MACRO_END

	LEDGER_CHECK_BALANCE(phys_footprint);
	LEDGER_CHECK_BALANCE(internal);
	LEDGER_CHECK_BALANCE(internal_compressed);
	LEDGER_CHECK_BALANCE(external);
	LEDGER_CHECK_BALANCE(reusable);
	LEDGER_CHECK_BALANCE(iokit_mapped);
	LEDGER_CHECK_BALANCE(alternate_accounting);
	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
	LEDGER_CHECK_BALANCE(page_table);
	LEDGER_CHECK_BALANCE(purgeable_volatile);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(tagged_nofootprint);
	LEDGER_CHECK_BALANCE(tagged_footprint);
	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
	LEDGER_CHECK_BALANCE(network_volatile);
	LEDGER_CHECK_BALANCE(network_nonvolatile);
	LEDGER_CHECK_BALANCE(network_volatile_compressed);
	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(media_nofootprint);
	LEDGER_CHECK_BALANCE(media_footprint);
	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(media_footprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_nofootprint);
	LEDGER_CHECK_BALANCE(graphics_footprint);
	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
	LEDGER_CHECK_BALANCE(neural_nofootprint);
	LEDGER_CHECK_BALANCE(neural_footprint);
	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(neural_footprint_compressed);

	if (do_panic) {
		if (pmap_ledgers_panic) {
			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
			    pmap, pid, procname);
		} else {
			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
			    pmap, pid, procname);
		}
	}
}

void
vm_map_pmap_set_process(
	vm_map_t        map,
	int             pid,
	char           *procname)
{
	pmap_set_process(vm_map_pmap(map), pid, procname);
}

#endif /* MACH_ASSERT */