1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_map.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * Virtual memory mapping module.
64 */
65
66 #include <mach/vm_types.h>
67 #include <mach_assert.h>
68
69 #include <vm/vm_options.h>
70
71 #include <libkern/OSAtomic.h>
72
73 #include <mach/kern_return.h>
74 #include <mach/port.h>
75 #include <mach/vm_attributes.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_behavior.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/memory_object.h>
80 #include <mach/mach_vm_server.h>
81 #include <machine/cpu_capabilities.h>
82 #include <mach/sdt.h>
83
84 #include <kern/assert.h>
85 #include <kern/backtrace.h>
86 #include <kern/counter.h>
87 #include <kern/exc_guard.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90 #include <kern/telemetry.h>
91 #include <kern/trap_telemetry.h>
92
93 #include <vm/cpm_internal.h>
94 #include <vm/memory_types.h>
95 #include <vm/vm_compressor_xnu.h>
96 #include <vm/vm_compressor_pager_internal.h>
97 #include <vm/vm_init_xnu.h>
98 #include <vm/vm_fault_internal.h>
99 #include <vm/vm_map_internal.h>
100 #include <vm/vm_object_internal.h>
101 #include <vm/vm_page_internal.h>
102 #include <vm/vm_pageout.h>
103 #include <vm/pmap.h>
104 #include <vm/vm_kern_internal.h>
105 #include <ipc/ipc_port.h>
106 #include <kern/sched_prim.h>
107 #include <kern/misc_protos.h>
108
109 #include <mach/vm_map_server.h>
110 #include <mach/mach_host_server.h>
111 #include <vm/vm_memtag.h>
112 #include <vm/vm_protos_internal.h>
113 #include <vm/vm_purgeable_internal.h>
114
115 #include <vm/vm_iokit.h>
116 #include <vm/vm_shared_region_internal.h>
117 #include <vm/vm_map_store_internal.h>
118 #include <vm/vm_memory_entry_xnu.h>
119 #include <vm/memory_object_internal.h>
120 #include <vm/vm_memory_entry.h>
121 #include <vm/vm_sanitize_internal.h>
122 #include <vm/vm_reclaim_xnu.h>
123 #if DEVELOPMENT || DEBUG
124 #include <vm/vm_compressor_info.h>
125 #endif /* DEVELOPMENT || DEBUG */
126 #include <san/kasan.h>
127
128 #include <sys/resource.h>
129 #include <sys/random.h>
130 #include <sys/codesign.h>
131 #include <sys/code_signing.h>
132 #include <sys/mman.h>
133 #include <sys/reboot.h>
134 #include <sys/kdebug_triage.h>
135 #include <sys/reason.h>
136
137 #include <os/log.h>
138
139 #include <libkern/section_keywords.h>
140
141 #include <os/hash.h>
142
143 #if DEVELOPMENT || DEBUG
144 extern int proc_selfcsflags(void);
145 int vm_log_xnu_user_debug = 0;
146 int panic_on_unsigned_execute = 0;
147 int panic_on_mlock_failure = 0;
148 #endif /* DEVELOPMENT || DEBUG */
149
150 #if DEVELOPMENT || DEBUG
151 int debug4k_filter = 0;
152 char debug4k_proc_name[1024] = "";
153 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
154 int debug4k_panic_on_misaligned_sharing = 0;
155 const char *debug4k_category_name[] = {
156 "error", /* 0 */
157 "life", /* 1 */
158 "load", /* 2 */
159 "fault", /* 3 */
160 "copy", /* 4 */
161 "share", /* 5 */
162 "adjust", /* 6 */
163 "pmap", /* 7 */
164 "mementry", /* 8 */
165 "iokit", /* 9 */
166 "upl", /* 10 */
167 "exc", /* 11 */
168 "vfs" /* 12 */
169 };
170 #endif /* DEVELOPMENT || DEBUG */
171 int debug4k_no_cow_copyin = 0;
172
173
174 #if __arm64__
175 extern const int fourk_binary_compatibility_unsafe;
176 #endif /* __arm64__ */
177 extern int proc_selfpid(void);
178 extern char *proc_name_address(void *p);
179 extern const char *proc_best_name(struct proc *p);
180
181 #if VM_MAP_DEBUG_APPLE_PROTECT
182 int vm_map_debug_apple_protect = 0;
183 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
184 #if VM_MAP_DEBUG_FOURK
185 int vm_map_debug_fourk = 0;
186 #endif /* VM_MAP_DEBUG_FOURK */
187
188 #if DEBUG || DEVELOPMENT
189 static TUNABLE(bool, vm_map_executable_immutable,
190 "vm_map_executable_immutable", true);
191 #else
192 #define vm_map_executable_immutable true
193 #endif
194
195 /** Do not enforce the kernel allocation size limit */
196 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_BYPASS (0)
197 /** Enforce the kernel allocation limit by refusing too large requests */
198 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_REJECT (1)
199 /** Enforce the kernel allocation limit by panicking on any too large request */
200 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_PANIC (2)
201 /** Do not enforce the kernel allocation limit but generate a telemetry trap */
202 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_TRAP (3)
203
204 #if DEVELOPMENT || DEBUG
205 static TUNABLE(int, vm_map_kernel_alloc_limit_mode,
206 "vm_map_kernel_alloc_limit_mode", VM_MAP_KERNEL_ALLOC_LIMIT_MODE_TRAP);
207 #else
208 #define vm_map_kernel_alloc_limit_mode VM_MAP_KERNEL_ALLOC_LIMIT_MODE_BYPASS
209 #endif /* DEVELOPMENT || DEBUG */
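/*
 * A minimal usage sketch (illustration only): on DEVELOPMENT/DEBUG kernels
 * the enforcement mode above is a boot-arg, so booting with
 * "vm_map_kernel_alloc_limit_mode=1" selects _REJECT and makes oversized
 * kernel allocation requests fail instead of only emitting telemetry,
 * while "=2" selects _PANIC. RELEASE kernels are pinned to _BYPASS.
 */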
210
211 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
212
213 extern u_int32_t random(void); /* from <libkern/libkern.h> */
214 /* Internal prototypes
215 */
216
217 typedef struct vm_map_zap {
218 vm_map_entry_t vmz_head;
219 vm_map_entry_t *vmz_tail;
220 } *vm_map_zap_t;
221
222 #define VM_MAP_ZAP_DECLARE(zap) \
223 struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
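/*
 * A minimal usage sketch (illustration only; "flags" and "guard" stand in
 * for caller-supplied values): a zap list collects entries removed while
 * the map lock is held so they can be disposed of after the lock is
 * dropped, roughly:
 *
 *	VM_MAP_ZAP_DECLARE(zap_list);
 *	vm_map_lock(map);
 *	(void)vm_map_delete(map, start, end, flags, guard, &zap_list);
 *	vm_map_unlock(map);
 *	// free the zapped entries outside the map lock
 */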
224
225 extern kern_return_t vm_map_wire_external(
226 vm_map_t map,
227 vm_map_offset_ut start_u,
228 vm_map_offset_ut end_u,
229 vm_prot_ut prot_u,
230 boolean_t user_wire) __exported;
231
232 #if XNU_PLATFORM_MacOSX
233 extern /* exported via Private.<arch>.MacOSX.exports on macOS */
234 #else
235 static
236 #endif
237 kern_return_t vm_map_copyin_common(
238 vm_map_t src_map,
239 vm_map_address_ut src_addr,
240 vm_map_size_ut len,
241 boolean_t src_destroy,
242 boolean_t src_volatile,
243 vm_map_copy_t *copy_result, /* OUT */
244 boolean_t use_maxprot);
245
246 static vm_map_entry_t vm_map_entry_insert(
247 vm_map_t map,
248 vm_map_entry_t insp_entry,
249 vm_map_offset_t start,
250 vm_map_offset_t end,
251 vm_object_t object,
252 vm_object_offset_t offset,
253 vm_map_kernel_flags_t vmk_flags,
254 boolean_t needs_copy,
255 vm_prot_t cur_protection,
256 vm_prot_t max_protection,
257 vm_inherit_t inheritance,
258 boolean_t clear_map_aligned);
259
260 static void vm_map_simplify_range(
261 vm_map_t map,
262 vm_map_offset_t start,
263 vm_map_offset_t end); /* forward */
264
265 static boolean_t vm_map_range_check(
266 vm_map_t map,
267 vm_map_offset_t start,
268 vm_map_offset_t end,
269 vm_map_entry_t *entry);
270
271 static void vm_map_submap_pmap_clean(
272 vm_map_t map,
273 vm_map_offset_t start,
274 vm_map_offset_t end,
275 vm_map_t sub_map,
276 vm_map_offset_t offset);
277
278 static void vm_map_pmap_enter(
279 vm_map_t map,
280 vm_map_offset_t addr,
281 vm_map_offset_t end_addr,
282 vm_object_t object,
283 vm_object_offset_t offset,
284 vm_prot_t protection);
285
286 static void _vm_map_clip_end(
287 struct vm_map_header *map_header,
288 vm_map_entry_t entry,
289 vm_map_offset_t end);
290
291 static void _vm_map_clip_start(
292 struct vm_map_header *map_header,
293 vm_map_entry_t entry,
294 vm_map_offset_t start);
295
296 static kmem_return_t vm_map_delete(
297 vm_map_t map,
298 vm_map_offset_t start,
299 vm_map_offset_t end,
300 vmr_flags_t flags,
301 kmem_guard_t guard,
302 vm_map_zap_t zap);
303
304 static void vm_map_copy_insert(
305 vm_map_t map,
306 vm_map_entry_t after_where,
307 vm_map_copy_t copy);
308
309 static kern_return_t vm_map_copy_overwrite_unaligned(
310 vm_map_t dst_map,
311 vm_map_entry_t entry,
312 vm_map_copy_t copy,
313 vm_map_address_t start,
314 boolean_t discard_on_success);
315
316 static kern_return_t vm_map_copy_overwrite_aligned(
317 vm_map_t dst_map,
318 vm_map_entry_t tmp_entry,
319 vm_map_copy_t copy,
320 vm_map_offset_t start,
321 pmap_t pmap);
322
323 static kern_return_t vm_map_copyin_kernel_buffer(
324 vm_map_t src_map,
325 vm_map_address_t src_addr,
326 vm_map_size_t len,
327 boolean_t src_destroy,
328 vm_map_copy_t *copy_result); /* OUT */
329
330 static kern_return_t vm_map_copyout_kernel_buffer(
331 vm_map_t map,
332 vm_map_address_t *addr, /* IN/OUT */
333 vm_map_copy_t copy,
334 vm_map_size_t copy_size,
335 boolean_t overwrite,
336 boolean_t consume_on_success);
337
338 static void vm_map_fork_share(
339 vm_map_t old_map,
340 vm_map_entry_t old_entry,
341 vm_map_t new_map);
342
343 static boolean_t vm_map_fork_copy(
344 vm_map_t old_map,
345 vm_map_entry_t *old_entry_p,
346 vm_map_t new_map,
347 int vm_map_copyin_flags);
348
349 static kern_return_t vm_map_wire_nested(
350 vm_map_t map,
351 vm_map_offset_t start,
352 vm_map_offset_t end,
353 vm_prot_t caller_prot,
354 vm_tag_t tag,
355 boolean_t user_wire,
356 pmap_t map_pmap,
357 vm_map_offset_t pmap_addr,
358 ppnum_t *physpage_p);
359
360 static kern_return_t vm_map_unwire_nested(
361 vm_map_t map,
362 vm_map_offset_t start,
363 vm_map_offset_t end,
364 boolean_t user_wire,
365 pmap_t map_pmap,
366 vm_map_offset_t pmap_addr);
367
368 static kern_return_t vm_map_overwrite_submap_recurse(
369 vm_map_t dst_map,
370 vm_map_offset_t dst_addr,
371 vm_map_size_t dst_size);
372
373 static kern_return_t vm_map_copy_overwrite_nested(
374 vm_map_t dst_map,
375 vm_map_offset_t dst_addr,
376 vm_map_copy_t copy,
377 boolean_t interruptible,
378 pmap_t pmap,
379 boolean_t discard_on_success);
380
381 static kern_return_t vm_map_remap_extract(
382 vm_map_t map,
383 vm_map_offset_t addr,
384 vm_map_size_t size,
385 boolean_t copy,
386 vm_map_copy_t map_copy,
387 vm_prot_t *cur_protection,
388 vm_prot_t *max_protection,
389 vm_inherit_t inheritance,
390 vm_map_kernel_flags_t vmk_flags);
391
392 static void vm_map_region_look_for_page(
393 vm_map_t map,
394 vm_map_offset_t va,
395 vm_object_t object,
396 vm_object_offset_t offset,
397 int max_refcnt,
398 unsigned short depth,
399 vm_region_extended_info_t extended,
400 mach_msg_type_number_t count);
401
402 static boolean_t vm_map_region_has_obj_ref(
403 vm_map_entry_t entry,
404 vm_object_t object);
405
406
407 static kern_return_t vm_map_willneed(
408 vm_map_t map,
409 vm_map_offset_t start,
410 vm_map_offset_t end);
411
412 static kern_return_t vm_map_reuse_pages(
413 vm_map_t map,
414 vm_map_offset_t start,
415 vm_map_offset_t end);
416
417 static kern_return_t vm_map_reusable_pages(
418 vm_map_t map,
419 vm_map_offset_t start,
420 vm_map_offset_t end);
421
422 static kern_return_t vm_map_can_reuse(
423 vm_map_t map,
424 vm_map_offset_t start,
425 vm_map_offset_t end);
426
427 static kern_return_t vm_map_zero(
428 vm_map_t map,
429 vm_map_offset_t start,
430 vm_map_offset_t end);
431
432 static kern_return_t vm_map_random_address_for_size(
433 vm_map_t map,
434 vm_map_offset_t *address,
435 vm_map_size_t size,
436 vm_map_kernel_flags_t vmk_flags);
437
438
439 #if CONFIG_MAP_RANGES
440
441 static vm_map_range_id_t vm_map_user_range_resolve(
442 vm_map_t map,
443 mach_vm_address_t addr,
444 mach_vm_address_t size,
445 mach_vm_range_t range);
446
447 #endif /* CONFIG_MAP_RANGES */
448 #if MACH_ASSERT
449 static kern_return_t vm_map_pageout(
450 vm_map_t map,
451 vm_map_offset_t start,
452 vm_map_offset_t end);
453 #endif /* MACH_ASSERT */
454
455 kern_return_t vm_map_corpse_footprint_collect(
456 vm_map_t old_map,
457 vm_map_entry_t old_entry,
458 vm_map_t new_map);
459 void vm_map_corpse_footprint_collect_done(
460 vm_map_t new_map);
461 void vm_map_corpse_footprint_destroy(
462 vm_map_t map);
463 kern_return_t vm_map_corpse_footprint_query_page_info(
464 vm_map_t map,
465 vm_map_offset_t va,
466 int *disposition_p);
467 void vm_map_footprint_query_page_info(
468 vm_map_t map,
469 vm_map_entry_t map_entry,
470 vm_map_offset_t curr_s_offset,
471 int *disposition_p);
472
473 #if CONFIG_MAP_RANGES
474 static void vm_map_range_map_init(void);
475 #endif /* CONFIG_MAP_RANGES */
476
477 pid_t find_largest_process_vm_map_entries(void);
478
479
480 __attribute__((always_inline))
481 int
482 vm_map_kernel_flags_vmflags(vm_map_kernel_flags_t vmk_flags)
483 {
484 int flags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;
485
486 /* in vmk flags the meaning of fixed/anywhere is inverted */
487 return flags ^ (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
488 }
489
490 __attribute__((always_inline, overloadable))
491 void
492 vm_map_kernel_flags_set_vmflags(
493 vm_map_kernel_flags_t *vmk_flags,
494 int vm_flags,
495 vm_tag_t vm_tag)
496 {
497 vm_flags ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
498 vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
499 vmk_flags->__vm_flags |= (vm_flags & VM_FLAGS_ANY_MASK);
500 vmk_flags->vm_tag = vm_tag;
501 }
502
503 __attribute__((always_inline, overloadable))
504 void
505 vm_map_kernel_flags_set_vmflags(
506 vm_map_kernel_flags_t *vmk_flags,
507 int vm_flags_and_tag)
508 {
509 vm_flags_and_tag ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
510 vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
511 vmk_flags->__vm_flags |= (vm_flags_and_tag & VM_FLAGS_ANY_MASK);
512 VM_GET_FLAGS_ALIAS(vm_flags_and_tag, vmk_flags->vm_tag);
513 }
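/*
 * A minimal sketch of the fixed/anywhere inversion noted above
 * (illustration only): the user-visible flags carry VM_FLAGS_ANYWHERE as
 * the "set" bit, while vmk_flags record "fixed", so the conversion flips
 * that bit in both directions:
 *
 *	vm_map_kernel_flags_t vmk = VM_MAP_KERNEL_FLAGS_NONE;
 *	vm_map_kernel_flags_set_vmflags(&vmk, VM_FLAGS_ANYWHERE, VM_KERN_MEMORY_NONE);
 *	assert(!vmk.vmf_fixed);
 *	assert(vm_map_kernel_flags_vmflags(vmk) == VM_FLAGS_ANYWHERE);
 */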
514
515 __attribute__((always_inline))
516 void
517 vm_map_kernel_flags_and_vmflags(
518 vm_map_kernel_flags_t *vmk_flags,
519 int vm_flags_mask)
520 {
521 /* this function doesn't handle the inverted FIXED/ANYWHERE */
522 assert(vm_flags_mask & VM_FLAGS_ANYWHERE);
523 vmk_flags->__vm_flags &= vm_flags_mask;
524 }
525
526 __attribute__((always_inline))
527 bool
528 vm_map_kernel_flags_check_vm_and_kflags(
529 vm_map_kernel_flags_t vmk_flags,
530 int vm_flags_mask)
531 {
532 return (vmk_flags.__vm_flags & ~vm_flags_mask) == 0;
533 }
534
535 bool
536 vm_map_kernel_flags_check_vmflags(
537 vm_map_kernel_flags_t vmk_flags,
538 int vm_flags_mask)
539 {
540 int vmflags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;
541
542 /* Note: up to 16 still has good calling conventions */
543 static_assert(sizeof(vm_map_kernel_flags_t) == 16);
544
545 #if DEBUG || DEVELOPMENT
546 /*
547 * All of this compiles to nothing if all checks pass.
548 */
549 #define check(field, value) ({ \
550 vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE; \
551 fl.__vm_flags = (value); \
552 fl.field = 0; \
553 assert(fl.__vm_flags == 0); \
554 })
555
556 /* bits 0-7 */
557 check(vmf_fixed, VM_FLAGS_ANYWHERE); // kind of a lie this is inverted
558 check(vmf_purgeable, VM_FLAGS_PURGABLE);
559 check(vmf_4gb_chunk, VM_FLAGS_4GB_CHUNK);
560 check(vmf_random_addr, VM_FLAGS_RANDOM_ADDR);
561 check(vmf_no_cache, VM_FLAGS_NO_CACHE);
562 check(vmf_resilient_codesign, VM_FLAGS_RESILIENT_CODESIGN);
563 check(vmf_resilient_media, VM_FLAGS_RESILIENT_MEDIA);
564 check(vmf_permanent, VM_FLAGS_PERMANENT);
565
566 /* bits 8-15 */
567 check(vmf_tpro, VM_FLAGS_TPRO);
568 check(vmf_overwrite, VM_FLAGS_OVERWRITE);
569
570 /* bits 16-23 */
571 check(vmf_superpage_size, VM_FLAGS_SUPERPAGE_MASK);
572 check(vmf_return_data_addr, VM_FLAGS_RETURN_DATA_ADDR);
573 check(vmf_return_4k_data_addr, VM_FLAGS_RETURN_4K_DATA_ADDR);
574
575 {
576 vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE;
577
578 /* check user tags will never clip */
579 fl.vm_tag = VM_MEMORY_COUNT - 1;
580 assert(fl.vm_tag == VM_MEMORY_COUNT - 1);
581
582 /* check kernel tags will never clip */
583 fl.vm_tag = VM_MAX_TAG_VALUE - 1;
584 assert(fl.vm_tag == VM_MAX_TAG_VALUE - 1);
585 }
586
587
588 #undef check
589 #endif /* DEBUG || DEVELOPMENT */
590
591 return (vmflags & ~vm_flags_mask) == 0;
592 }
593
594 /*
595 * Macros to copy a vm_map_entry. We must be careful to correctly
596 * manage the wired page count. vm_map_entry_copy() creates a new
597 * map entry to the same memory - the wired count in the new entry
598 * must be set to zero. vm_map_entry_copy_full() creates a new
599 * entry that is identical to the old entry. This preserves the
600 * wire count; it's used for map splitting and zone changing in
601 * vm_map_copyout.
602 */
603
604 static inline void
605 vm_map_entry_copy_csm_assoc(
606 vm_map_t map __unused,
607 vm_map_entry_t new __unused,
608 vm_map_entry_t old __unused)
609 {
610 #if CODE_SIGNING_MONITOR
611 /* when code signing monitor is enabled, we want to reset on copy */
612 new->csm_associated = FALSE;
613 #else
614 /* when code signing monitor is not enabled, assert as a sanity check */
615 assert(new->csm_associated == FALSE);
616 #endif
617 #if DEVELOPMENT || DEBUG
618 if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
619 printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug\n",
620 proc_selfpid(),
621 (get_bsdtask_info(current_task())
622 ? proc_name_address(get_bsdtask_info(current_task()))
623 : "?"),
624 __FUNCTION__, __LINE__,
625 map, new, new->vme_start, new->vme_end);
626 }
627 #endif /* DEVELOPMENT || DEBUG */
628 #if XNU_TARGET_OS_OSX
629 /*
630 * On macOS, entries with "vme_xnu_user_debug" can be copied during fork()
631 * and we want the child's entry to keep its "vme_xnu_user_debug" to avoid
632 * triggering CSM assertions when the child accesses its mapping.
633 */
634 #else /* XNU_TARGET_OS_OSX */
635 new->vme_xnu_user_debug = FALSE;
636 #endif /* XNU_TARGET_OS_OSX */
637 }
638
639 /*
640 * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
641 * But for security reasons on some platforms, we don't want the
642 * new mapping to be "used for jit", so we reset the flag here.
643 */
644 static inline void
645 vm_map_entry_copy_code_signing(
646 vm_map_t map,
647 vm_map_entry_t new,
648 vm_map_entry_t old __unused)
649 {
650 if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
651 assert(new->used_for_jit == old->used_for_jit);
652 } else {
653 if (old->used_for_jit) {
654 DTRACE_VM3(cs_wx,
655 uint64_t, new->vme_start,
656 uint64_t, new->vme_end,
657 vm_prot_t, new->protection);
658 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
659 proc_selfpid(),
660 (get_bsdtask_info(current_task())
661 ? proc_name_address(get_bsdtask_info(current_task()))
662 : "?"),
663 __FUNCTION__,
664 "removing execute access");
665 new->protection &= ~VM_PROT_EXECUTE;
666 new->max_protection &= ~VM_PROT_EXECUTE;
667 }
668 new->used_for_jit = FALSE;
669 }
670 }
671
672 static inline void
673 vm_map_entry_copy_full(
674 vm_map_entry_t new,
675 vm_map_entry_t old)
676 {
677 #if MAP_ENTRY_CREATION_DEBUG
678 btref_put(new->vme_creation_bt);
679 btref_retain(old->vme_creation_bt);
680 #endif
681 #if MAP_ENTRY_INSERTION_DEBUG
682 btref_put(new->vme_insertion_bt);
683 btref_retain(old->vme_insertion_bt);
684 #endif
685 #if VM_BTLOG_TAGS
686 /* Discard the btref that might be in the new entry */
687 if (new->vme_kernel_object) {
688 btref_put(new->vme_tag_btref);
689 }
690 /* Retain the btref in the old entry to account for its copy */
691 if (old->vme_kernel_object) {
692 btref_retain(old->vme_tag_btref);
693 }
694 #endif /* VM_BTLOG_TAGS */
695 *new = *old;
696 }
697
698 static inline void
699 vm_map_entry_copy(
700 vm_map_t map,
701 vm_map_entry_t new,
702 vm_map_entry_t old)
703 {
704 vm_map_entry_copy_full(new, old);
705
706 new->is_shared = FALSE;
707 new->needs_wakeup = FALSE;
708 new->in_transition = FALSE;
709 new->wired_count = 0;
710 new->user_wired_count = 0;
711 new->vme_permanent = FALSE;
712 vm_map_entry_copy_code_signing(map, new, old);
713 vm_map_entry_copy_csm_assoc(map, new, old);
714 if (new->iokit_acct) {
715 assertf(!new->use_pmap, "old %p new %p\n", old, new);
716 new->iokit_acct = FALSE;
717 new->use_pmap = TRUE;
718 }
719 new->vme_resilient_codesign = FALSE;
720 new->vme_resilient_media = FALSE;
721 new->vme_atomic = FALSE;
722 new->vme_no_copy_on_read = FALSE;
723 }
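/*
 * A minimal sketch of the distinction documented above (illustration only):
 * vm_map_entry_copy() never propagates wiring, while vm_map_entry_copy_full()
 * is a verbatim copy:
 *
 *	vm_map_entry_copy(map, new_entry, old_entry);
 *	assert(new_entry->wired_count == 0);
 *	assert(new_entry->user_wired_count == 0);
 *
 *	vm_map_entry_copy_full(full_entry, old_entry);
 *	assert(full_entry->wired_count == old_entry->wired_count);
 */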
724
725 /*
726 * Normal lock_read_to_write() returns FALSE/0 on failure.
727 * These functions evaluate to zero on success and non-zero value on failure.
728 */
729 __attribute__((always_inline))
730 int
731 vm_map_lock_read_to_write(vm_map_t map)
732 {
733 if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
734 DTRACE_VM(vm_map_lock_upgrade);
735 return 0;
736 }
737 return 1;
738 }
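/*
 * A minimal sketch of the intended calling pattern (illustration only):
 * when the upgrade fails the shared lock has already been dropped by the
 * underlying primitive, so the caller must re-acquire the lock and
 * re-validate whatever it looked up under the read lock:
 *
 *	if (vm_map_lock_read_to_write(map)) {
 *		vm_map_lock(map);
 *		// ... redo the lookup done under the read lock ...
 *	}
 */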
739
740 __attribute__((always_inline))
741 boolean_t
742 vm_map_try_lock(vm_map_t map)
743 {
744 if (lck_rw_try_lock_exclusive(&(map)->lock)) {
745 DTRACE_VM(vm_map_lock_w);
746 return TRUE;
747 }
748 return FALSE;
749 }
750
751 __attribute__((always_inline))
752 boolean_t
753 vm_map_try_lock_read(vm_map_t map)
754 {
755 if (lck_rw_try_lock_shared(&(map)->lock)) {
756 DTRACE_VM(vm_map_lock_r);
757 return TRUE;
758 }
759 return FALSE;
760 }
761
762 /*!
763 * @function kdp_vm_map_is_acquired_exclusive
764 *
765 * @abstract
766 * Checks if vm map is acquired exclusive.
767 *
768 * @discussion
769 * NOT SAFE: To be used only by kernel debugger.
770 *
771 * @param map map to check
772 *
773 * @returns TRUE if the map is acquired exclusively.
774 */
775 boolean_t
776 kdp_vm_map_is_acquired_exclusive(vm_map_t map)
777 {
778 return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
779 }
780
781 /*
782 * Routines to get the page size the caller should
783 * use while inspecting the target address space.
784 * Use the "_safely" variant if the caller is dealing with a user-provided
785 * array whose size depends on the page size, to avoid any overflow or
786 * underflow of a user-allocated buffer.
787 */
788 int
789 vm_self_region_page_shift_safely(
790 vm_map_t target_map)
791 {
792 int effective_page_shift = 0;
793
794 if (PAGE_SIZE == (4096)) {
795 /* x86_64 and 4k watches: always use 4k */
796 return PAGE_SHIFT;
797 }
798 /* did caller provide an explicit page size for this thread to use? */
799 effective_page_shift = thread_self_region_page_shift();
800 if (effective_page_shift) {
801 /* use the explicitly-provided page size */
802 return effective_page_shift;
803 }
804 /* no explicit page size: use the caller's page size... */
805 effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
806 if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
807 /* page size match: safe to use */
808 return effective_page_shift;
809 }
810 /* page size mismatch */
811 return -1;
812 }
813 int
814 vm_self_region_page_shift(
815 vm_map_t target_map)
816 {
817 int effective_page_shift;
818
819 effective_page_shift = vm_self_region_page_shift_safely(target_map);
820 if (effective_page_shift == -1) {
821 /* no safe value but OK to guess for caller */
822 effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
823 VM_MAP_PAGE_SHIFT(target_map));
824 }
825 return effective_page_shift;
826 }
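/*
 * A minimal usage sketch (illustration only): a caller sizing a
 * user-provided, page-indexed buffer should prefer the "_safely" variant
 * and fail on a mismatch rather than guess:
 *
 *	int shift = vm_self_region_page_shift_safely(target_map);
 *	if (shift == -1) {
 *		return KERN_INVALID_ARGUMENT;	// page size mismatch
 *	}
 *	npages = (unsigned int)(size >> shift);
 */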
827
828
829 /*
830 * Decide if we want to allow processes to execute from their data or stack areas.
831 * override_nx() returns true if we do. Data/stack execution can be enabled independently
832 * for 32 and 64 bit processes. Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
833 * or allow_stack_exec to enable data execution for that type of data area for that particular
834 * ABI (or both by or'ing the flags together). These are initialized in the architecture
835 * specific pmap files since the default behavior varies according to architecture. The
836 * main reason it varies is because of the need to provide binary compatibility with old
837 * applications that were written before these restrictions came into being. In the old
838 * days, an app could execute anything it could read, but this has slowly been tightened
839 * up over time. The default behavior is:
840 *
841 * 32-bit PPC apps may execute from both stack and data areas
842 * 32-bit Intel apps may execute from data areas but not stack
843 * 64-bit PPC/Intel apps may not execute from either data or stack
844 *
845 * An application on any architecture may override these defaults by explicitly
846 * adding PROT_EXEC permission to the page in question with the mprotect(2)
847 * system call. This code here just determines what happens when an app tries to
848 * execute from a page that lacks execute permission.
849 *
850 * Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
851 * default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
852 * a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
853 * execution from data areas for a particular binary even if the arch normally permits it. As
854 * a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
855 * to support some complicated use cases, notably browsers with out-of-process plugins that
856 * are not all NX-safe.
857 */
858
859 extern int allow_data_exec, allow_stack_exec;
860
861 int
862 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
863 {
864 int current_abi;
865
866 if (map->pmap == kernel_pmap) {
867 return FALSE;
868 }
869
870 /*
871 * Determine if the app is running in 32 or 64 bit mode.
872 */
873
874 if (vm_map_is_64bit(map)) {
875 current_abi = VM_ABI_64;
876 } else {
877 current_abi = VM_ABI_32;
878 }
879
880 /*
881 * Determine if we should allow the execution based on whether it's a
882 * stack or data area and the current architecture.
883 */
884
885 if (user_tag == VM_MEMORY_STACK) {
886 return allow_stack_exec & current_abi;
887 }
888
889 return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
890 }
891
892
893 /*
894 * Virtual memory maps provide for the mapping, protection,
895 * and sharing of virtual memory objects. In addition,
896 * this module provides for an efficient virtual copy of
897 * memory from one map to another.
898 *
899 * Synchronization is required prior to most operations.
900 *
901 * Maps consist of an ordered doubly-linked list of simple
902 * entries; a single hint is used to speed up lookups.
903 *
904 * Sharing maps have been deleted from this version of Mach.
905 * All shared objects are now mapped directly into the respective
906 * maps. This requires a change in the copy on write strategy;
907 * the asymmetric (delayed) strategy is used for shared temporary
908 * objects instead of the symmetric (shadow) strategy. All maps
909 * are now "top level" maps (either task map, kernel map or submap
910 * of the kernel map).
911 *
912 * Since portions of maps are specified by start/end addresses,
913 * which may not align with existing map entries, all
914 * routines merely "clip" entries to these start/end values.
915 * [That is, an entry is split into two, bordering at a
916 * start or end value.] Note that these clippings may not
917 * always be necessary (as the two resulting entries are then
918 * not changed); however, the clipping is done for convenience.
919 * No attempt is currently made to "glue back together" two
920 * abutting entries.
921 *
922 * The symmetric (shadow) copy strategy implements virtual copy
923 * by copying VM object references from one map to
924 * another, and then marking both regions as copy-on-write.
925 * It is important to note that only one writeable reference
926 * to a VM object region exists in any map when this strategy
927 * is used -- this means that shadow object creation can be
928 * delayed until a write operation occurs. The asymmetric (delayed)
929 * strategy allows multiple maps to have writeable references to
930 * the same region of a vm object, and hence cannot delay creating
931 * its copy objects. See vm_object_copy_quickly() in vm_object.c.
932 * Copying of permanent objects is completely different; see
933 * vm_object_copy_strategically() in vm_object.c.
934 */
935
936 ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);
937
938 #define VM_MAP_ZONE_NAME "maps"
939 #define VM_MAP_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
940
941 #define VM_MAP_ENTRY_ZONE_NAME "VM map entries"
942 #define VM_MAP_ENTRY_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
943
944 #define VM_MAP_HOLES_ZONE_NAME "VM map holes"
945 #define VM_MAP_HOLES_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
946
947 /*
948 * Asserts that a vm_map_copy object is coming from the
949 * vm_map_copy_zone to ensure that it isn't a fake constructed
950 * anywhere else.
951 */
952 void
953 vm_map_copy_require(struct vm_map_copy *copy)
954 {
955 zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
956 }
957
958 /*
959 * vm_map_require:
960 *
961 * Ensures that the argument is memory allocated from the genuine
962 * vm map zone. (See zone_id_require_allow_foreign).
963 */
964 void
965 vm_map_require(vm_map_t map)
966 {
967 zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
968 }
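/*
 * A minimal usage sketch (illustration only): code that receives a map or
 * copy object from a less trusted layer can assert provenance before
 * dereferencing it; both calls panic if the pointer did not come from the
 * corresponding zone:
 *
 *	vm_map_require(map);
 *	vm_map_copy_require(copy);
 */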
969
970 #define VM_MAP_EARLY_COUNT_MAX 16
971 static __startup_data vm_offset_t map_data;
972 static __startup_data vm_size_t map_data_size;
973 static __startup_data vm_offset_t kentry_data;
974 static __startup_data vm_size_t kentry_data_size;
975 static __startup_data vm_offset_t map_holes_data;
976 static __startup_data vm_size_t map_holes_data_size;
977 static __startup_data vm_map_t *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
978 static __startup_data uint32_t early_map_count;
979
980 #if XNU_TARGET_OS_OSX
981 #define NO_COALESCE_LIMIT ((1024 * 128) - 1)
982 #else /* XNU_TARGET_OS_OSX */
983 #define NO_COALESCE_LIMIT 0
984 #endif /* XNU_TARGET_OS_OSX */
985
986 /* Skip acquiring locks if we're in the midst of a kernel core dump */
987 unsigned int not_in_kdp = 1;
988
989 unsigned int vm_map_set_cache_attr_count = 0;
990
991 kern_return_t
992 vm_map_set_cache_attr(
993 vm_map_t map,
994 vm_map_offset_t va)
995 {
996 vm_map_entry_t map_entry;
997 vm_object_t object;
998 kern_return_t kr = KERN_SUCCESS;
999
1000 vm_map_lock_read(map);
1001
1002 if (!vm_map_lookup_entry(map, va, &map_entry) ||
1003 map_entry->is_sub_map) {
1004 /*
1005 * that memory is not properly mapped
1006 */
1007 kr = KERN_INVALID_ARGUMENT;
1008 goto done;
1009 }
1010 object = VME_OBJECT(map_entry);
1011
1012 if (object == VM_OBJECT_NULL) {
1013 /*
1014 * there should be a VM object here at this point
1015 */
1016 kr = KERN_INVALID_ARGUMENT;
1017 goto done;
1018 }
1019 vm_object_lock(object);
1020 object->set_cache_attr = TRUE;
1021 vm_object_unlock(object);
1022
1023 vm_map_set_cache_attr_count++;
1024 done:
1025 vm_map_unlock_read(map);
1026
1027 return kr;
1028 }
1029
1030
1031 #if CONFIG_CODE_DECRYPTION
1032 /*
1033 * vm_map_apple_protected:
1034 * This remaps the requested part of the object with an object backed by
1035 * the decrypting pager.
1036 * crypt_info contains entry points and session data for the crypt module.
1037 * The crypt_info block will be copied by vm_map_apple_protected. The data structures
1038 * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
1039 */
1040 kern_return_t
1041 vm_map_apple_protected(
1042 vm_map_t map,
1043 vm_map_offset_t start,
1044 vm_map_offset_t end,
1045 vm_object_offset_t crypto_backing_offset,
1046 struct pager_crypt_info *crypt_info,
1047 uint32_t cryptid)
1048 {
1049 boolean_t map_locked;
1050 kern_return_t kr;
1051 vm_map_entry_t map_entry;
1052 struct vm_map_entry tmp_entry;
1053 memory_object_t unprotected_mem_obj;
1054 vm_object_t protected_object;
1055 vm_map_offset_t map_addr;
1056 vm_map_offset_t start_aligned, end_aligned;
1057 vm_object_offset_t crypto_start, crypto_end;
1058 boolean_t cache_pager;
1059
1060 map_locked = FALSE;
1061 unprotected_mem_obj = MEMORY_OBJECT_NULL;
1062
1063 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
1064 return KERN_INVALID_ADDRESS;
1065 }
1066 start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
1067 end_aligned = vm_map_round_page(end, PAGE_MASK_64);
1068 start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
1069 end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));
1070
1071 #if __arm64__
1072 /*
1073 * "start" and "end" might be 4K-aligned but not 16K-aligned,
1074 * so we might have to loop and establish up to 3 mappings:
1075 *
1076 * + the first 16K-page, which might overlap with the previous
1077 * 4K-aligned mapping,
1078 * + the center,
1079 * + the last 16K-page, which might overlap with the next
1080 * 4K-aligned mapping.
1081 * Each of these mappings might be backed by a vnode pager (if
1082 * properly page-aligned) or a "fourk_pager", itself backed by a
1083 * vnode pager (if 4K-aligned but not page-aligned).
1084 */
1085 #endif /* __arm64__ */
1086
1087 map_addr = start_aligned;
1088 for (map_addr = start_aligned;
1089 map_addr < end;
1090 map_addr = tmp_entry.vme_end) {
1091 vm_map_lock(map);
1092 map_locked = TRUE;
1093
1094 /* lookup the protected VM object */
1095 if (!vm_map_lookup_entry(map,
1096 map_addr,
1097 &map_entry) ||
1098 map_entry->is_sub_map ||
1099 VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
1100 /* that memory is not properly mapped */
1101 kr = KERN_INVALID_ARGUMENT;
1102 goto done;
1103 }
1104
1105 /* ensure mapped memory is mapped as executable except
1106 * for model decryption flow */
1107 if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
1108 !(map_entry->protection & VM_PROT_EXECUTE)) {
1109 kr = KERN_INVALID_ARGUMENT;
1110 goto done;
1111 }
1112
1113 /* get the protected object to be decrypted */
1114 protected_object = VME_OBJECT(map_entry);
1115 if (protected_object == VM_OBJECT_NULL) {
1116 /* there should be a VM object here at this point */
1117 kr = KERN_INVALID_ARGUMENT;
1118 goto done;
1119 }
1120 /* ensure protected object stays alive while map is unlocked */
1121 vm_object_reference(protected_object);
1122
1123 /* limit the map entry to the area we want to cover */
1124 vm_map_clip_start(map, map_entry, start_aligned);
1125 vm_map_clip_end(map, map_entry, end_aligned);
1126
1127 tmp_entry = *map_entry;
1128 map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
1129 vm_map_unlock(map);
1130 map_locked = FALSE;
1131
1132 /*
1133 * This map entry might be only partially encrypted
1134 * (if not fully "page-aligned").
1135 */
1136 crypto_start = 0;
1137 crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
1138 if (tmp_entry.vme_start < start) {
1139 if (tmp_entry.vme_start != start_aligned) {
1140 kr = KERN_INVALID_ADDRESS;
1141 vm_object_deallocate(protected_object);
1142 goto done;
1143 }
1144 crypto_start += (start - tmp_entry.vme_start);
1145 }
1146 if (tmp_entry.vme_end > end) {
1147 if (tmp_entry.vme_end != end_aligned) {
1148 kr = KERN_INVALID_ADDRESS;
1149 vm_object_deallocate(protected_object);
1150 goto done;
1151 }
1152 crypto_end -= (tmp_entry.vme_end - end);
1153 }
1154
1155 /*
1156 * This "extra backing offset" is needed to get the decryption
1157 * routine to use the right key. It adjusts for the possibly
1158 * relative offset of an interposed "4K" pager...
1159 */
1160 if (crypto_backing_offset == (vm_object_offset_t) -1) {
1161 crypto_backing_offset = VME_OFFSET(&tmp_entry);
1162 }
1163
1164 cache_pager = TRUE;
1165 #if XNU_TARGET_OS_OSX
1166 if (vm_map_is_alien(map)) {
1167 cache_pager = FALSE;
1168 }
1169 #endif /* XNU_TARGET_OS_OSX */
1170
1171 /*
1172 * Lookup (and create if necessary) the protected memory object
1173 * matching that VM object.
1174 * If successful, this also grabs a reference on the memory object,
1175 * to guarantee that it doesn't go away before we get a chance to map
1176 * it.
1177 */
1178 unprotected_mem_obj = apple_protect_pager_setup(
1179 protected_object,
1180 VME_OFFSET(&tmp_entry),
1181 crypto_backing_offset,
1182 crypt_info,
1183 crypto_start,
1184 crypto_end,
1185 cache_pager);
1186
1187 /* release extra ref on protected object */
1188 vm_object_deallocate(protected_object);
1189
1190 if (unprotected_mem_obj == NULL) {
1191 kr = KERN_FAILURE;
1192 goto done;
1193 }
1194
1195 /* can overwrite an immutable mapping */
1196 vm_map_kernel_flags_t vmk_flags = {
1197 .vmf_fixed = true,
1198 .vmf_overwrite = true,
1199 .vmkf_overwrite_immutable = true,
1200 };
1201 /* make the new mapping as "permanent" as the one it replaces */
1202 vmk_flags.vmf_permanent = tmp_entry.vme_permanent;
1203
1204 /* map this memory object in place of the current one */
1205 map_addr = tmp_entry.vme_start;
1206 kr = mach_vm_map_kernel(map,
1207 vm_sanitize_wrap_addr_ref(&map_addr),
1208 (tmp_entry.vme_end -
1209 tmp_entry.vme_start),
1210 (mach_vm_offset_t) 0,
1211 vmk_flags,
1212 (ipc_port_t)(uintptr_t) unprotected_mem_obj,
1213 0,
1214 TRUE,
1215 tmp_entry.protection,
1216 tmp_entry.max_protection,
1217 tmp_entry.inheritance);
1218 assertf(kr == KERN_SUCCESS,
1219 "kr = 0x%x\n", kr);
1220 assertf(map_addr == tmp_entry.vme_start,
1221 "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
1222 (uint64_t)map_addr,
1223 (uint64_t) tmp_entry.vme_start,
1224 &tmp_entry);
1225
1226 #if VM_MAP_DEBUG_APPLE_PROTECT
1227 if (vm_map_debug_apple_protect) {
1228 printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
1229 " backing:[object:%p,offset:0x%llx,"
1230 "crypto_backing_offset:0x%llx,"
1231 "crypto_start:0x%llx,crypto_end:0x%llx]\n",
1232 map,
1233 (uint64_t) map_addr,
1234 (uint64_t) (map_addr + (tmp_entry.vme_end -
1235 tmp_entry.vme_start)),
1236 unprotected_mem_obj,
1237 protected_object,
1238 VME_OFFSET(&tmp_entry),
1239 crypto_backing_offset,
1240 crypto_start,
1241 crypto_end);
1242 }
1243 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1244
1245 /*
1246 * Release the reference obtained by
1247 * apple_protect_pager_setup().
1248 * The mapping (if it succeeded) is now holding a reference on
1249 * the memory object.
1250 */
1251 memory_object_deallocate(unprotected_mem_obj);
1252 unprotected_mem_obj = MEMORY_OBJECT_NULL;
1253
1254 /* continue with next map entry */
1255 crypto_backing_offset += (tmp_entry.vme_end -
1256 tmp_entry.vme_start);
1257 crypto_backing_offset -= crypto_start;
1258 }
1259 kr = KERN_SUCCESS;
1260
1261 done:
1262 if (map_locked) {
1263 vm_map_unlock(map);
1264 }
1265 return kr;
1266 }
1267 #endif /* CONFIG_CODE_DECRYPTION */
1268
1269
1270 LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
1271 LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
1272 LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);
1273
1274 #if XNU_TARGET_OS_OSX
1275 #define MALLOC_NO_COW_DEFAULT 1
1276 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 1
1277 #else /* XNU_TARGET_OS_OSX */
1278 #define MALLOC_NO_COW_DEFAULT 1
1279 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 0
1280 #endif /* XNU_TARGET_OS_OSX */
1281 TUNABLE(int, malloc_no_cow, "malloc_no_cow", MALLOC_NO_COW_DEFAULT);
1282 TUNABLE(int, malloc_no_cow_except_fork, "malloc_no_cow_except_fork", MALLOC_NO_COW_EXCEPT_FORK_DEFAULT);
1283 uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
1284 #if DEBUG
1285 int vm_check_map_sanity = 0;
1286 #endif
1287
1288 /*
1289 * vm_map_init:
1290 *
1291 * Initialize the vm_map module. Must be called before
1292 * any other vm_map routines.
1293 *
1294 * Map and entry structures are allocated from zones -- we must
1295 * initialize those zones.
1296 *
1297 * There are three zones of interest:
1298 *
1299 * vm_map_zone: used to allocate maps.
1300 * vm_map_entry_zone: used to allocate map entries.
1301 *
1302 * LP32:
1303 * vm_map_entry_reserved_zone: fallback zone for kernel map entries
1304 *
1305 * The kernel allocates map entries from a special zone that is initially
1306 * "crammed" with memory. It would be difficult (perhaps impossible) for
1307 * the kernel to allocate more memory to an entry zone when it became
1308 * empty since the very act of allocating memory implies the creation
1309 * of a new entry.
1310 */
1311 __startup_func
1312 void
1313 vm_map_init(void)
1314 {
1315
1316 #if MACH_ASSERT
1317 PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1318 sizeof(debug4k_filter));
1319 #endif /* MACH_ASSERT */
1320
1321 zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1322 VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1323
1324 /*
1325 * Don't quarantine because we always need elements available
1326 * Disallow GC on this zone... to aid the GC.
1327 */
1328 zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1329 sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1330 ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1331 z->z_elems_rsv = (uint16_t)(32 *
1332 (ml_early_cpu_max_number() + 1));
1333 });
1334
1335 zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1336 sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1337 ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1338 z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
1339 });
1340
1341 zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1342 ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);
1343
1344 /*
1345 * Add the stolen memory to zones, adjust zone size and stolen counts.
1346 */
1347 zone_cram_early(vm_map_zone, map_data, map_data_size);
1348 zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1349 zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1350 printf("VM boostrap: %d maps, %d entries and %d holes available\n",
1351 zone_count_free(vm_map_zone),
1352 zone_count_free(vm_map_entry_zone),
1353 zone_count_free(vm_map_holes_zone));
1354
1355 /*
1356 * Since these are covered by zones, remove them from stolen page accounting.
1357 */
1358 VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1359
1360 #if VM_MAP_DEBUG_APPLE_PROTECT
1361 PE_parse_boot_argn("vm_map_debug_apple_protect",
1362 &vm_map_debug_apple_protect,
1363 sizeof(vm_map_debug_apple_protect));
1364 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1365 #if VM_MAP_DEBUG_APPLE_FOURK
1366 PE_parse_boot_argn("vm_map_debug_fourk",
1367 &vm_map_debug_fourk,
1368 sizeof(vm_map_debug_fourk));
1369 #endif /* VM_MAP_DEBUG_FOURK */
1370
1371 if (malloc_no_cow) {
1372 vm_memory_malloc_no_cow_mask = 0ULL;
1373 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1374 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1375 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1376 #if XNU_TARGET_OS_OSX
1377 /*
1378 * On macOS, keep copy-on-write for MALLOC_LARGE because
1379 * realloc() may use vm_copy() to transfer the old contents
1380 * to the new location.
1381 */
1382 #else /* XNU_TARGET_OS_OSX */
1383 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1384 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1385 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1386 #endif /* XNU_TARGET_OS_OSX */
1387 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1388 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1389 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1390 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1391 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1392 PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1393 &vm_memory_malloc_no_cow_mask,
1394 sizeof(vm_memory_malloc_no_cow_mask));
1395 }
1396
1397 #if CONFIG_MAP_RANGES
1398 vm_map_range_map_init();
1399 #endif /* CONFIG_MAP_RANGES */
1400
1401 #if DEBUG
1402 PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1403 if (vm_check_map_sanity) {
1404 kprintf("VM sanity checking enabled\n");
1405 } else {
1406 kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1407 }
1408 #endif /* DEBUG */
1409
1410 #if DEVELOPMENT || DEBUG
1411 PE_parse_boot_argn("panic_on_unsigned_execute",
1412 &panic_on_unsigned_execute,
1413 sizeof(panic_on_unsigned_execute));
1414 PE_parse_boot_argn("panic_on_mlock_failure",
1415 &panic_on_mlock_failure,
1416 sizeof(panic_on_mlock_failure));
1417 #endif /* DEVELOPMENT || DEBUG */
1418 }
1419
1420 __startup_func
1421 static void
1422 vm_map_steal_memory(void)
1423 {
1424
1425 /*
1426 * We need to reserve enough memory to support bootstrapping VM maps
1427 * and the zone subsystem.
1428 *
1429 * The VM Maps that need to function before zones can support them
1430 * are the ones registered with vm_map_will_allocate_early_map(),
1431 * which are:
1432 * - the kernel map
1433 * - the various submaps used by zones (pgz, meta, ...)
1434 *
1435 * We also need enough entries and holes to support them
1436 * until zone_metadata_init() is called, which is when
1437 * the zone allocator becomes capable of expanding dynamically.
1438 *
1439 * We need:
1440 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
1441 * - To allow for 3-4 entries per map, but the kernel map
1442 * needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
1443 * to describe the submaps, so double it (and make it 8x too)
1444 * - To allow for holes between entries,
1445 * hence needs the same budget as entries
1446 */
1447 map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
1448 sizeof(struct _vm_map), VM_MAP_ZFLAGS,
1449 VM_MAP_EARLY_COUNT_MAX);
1450
1451 kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
1452 sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1453 8 * VM_MAP_EARLY_COUNT_MAX);
1454
1455 map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
1456 sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1457 8 * VM_MAP_EARLY_COUNT_MAX);
1458
1459 /*
1460 * Steal a contiguous range of memory so that a simple range check
1461 * can validate early addresses being freed/crammed to these
1462 * zones
1463 */
1464 map_data = zone_early_mem_init(map_data_size + kentry_data_size +
1465 map_holes_data_size);
1466 kentry_data = map_data + map_data_size;
1467 map_holes_data = kentry_data + kentry_data_size;
1468 }
1469 STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1470
1471 __startup_func
1472 static void
1473 vm_kernel_boostraped(void)
1474 {
1475 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
1476 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
1477 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);
1478
1479 printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1480 zone_count_free(vm_map_zone),
1481 zone_count_free(vm_map_entry_zone),
1482 zone_count_free(vm_map_holes_zone));
1483 }
1484 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1485
1486 void
1487 vm_map_disable_hole_optimization(vm_map_t map)
1488 {
1489 vm_map_entry_t head_entry, hole_entry, next_hole_entry;
1490
1491 if (map->holelistenabled) {
1492 head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1493
1494 while (hole_entry != NULL) {
1495 next_hole_entry = hole_entry->vme_next;
1496
1497 hole_entry->vme_next = NULL;
1498 hole_entry->vme_prev = NULL;
1499 zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1500
1501 if (next_hole_entry == head_entry) {
1502 hole_entry = NULL;
1503 } else {
1504 hole_entry = next_hole_entry;
1505 }
1506 }
1507
1508 map->holes_list = NULL;
1509 map->holelistenabled = FALSE;
1510
1511 map->first_free = vm_map_to_entry(map);
1512 SAVE_HINT_HOLE_WRITE(map, NULL);
1513 }
1514 }
1515
1516 boolean_t
1517 vm_kernel_map_is_kernel(vm_map_t map)
1518 {
1519 return map->pmap == kernel_pmap;
1520 }
1521
1522 /*
1523 * vm_map_create:
1524 *
1525 * Creates and returns a new empty VM map with
1526 * the given physical map structure, and having
1527 * the given lower and upper address bounds.
1528 */
1529
1530 extern vm_map_t vm_map_create_external(
1531 pmap_t pmap,
1532 vm_map_offset_t min_off,
1533 vm_map_offset_t max_off,
1534 boolean_t pageable);
1535
1536 vm_map_t
1537 vm_map_create_external(
1538 pmap_t pmap,
1539 vm_map_offset_t min,
1540 vm_map_offset_t max,
1541 boolean_t pageable)
1542 {
1543 vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1544
1545 if (pageable) {
1546 options |= VM_MAP_CREATE_PAGEABLE;
1547 }
1548 return vm_map_create_options(pmap, min, max, options);
1549 }
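/*
 * A minimal usage sketch (illustration only; the address bounds are
 * placeholders for the platform's user VA limits): creating a pageable
 * map over a given range reduces to
 *
 *	new_map = vm_map_create_options(pmap, min_address, max_address,
 *	    VM_MAP_CREATE_PAGEABLE);
 */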
1550
1551 __startup_func
1552 void
1553 vm_map_will_allocate_early_map(vm_map_t *owner)
1554 {
1555 if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1556 panic("VM_MAP_EARLY_COUNT_MAX is too low");
1557 }
1558
1559 early_map_owners[early_map_count++] = owner;
1560 }
1561
1562 __startup_func
1563 void
1564 vm_map_relocate_early_maps(vm_offset_t delta)
1565 {
1566 for (uint32_t i = 0; i < early_map_count; i++) {
1567 vm_address_t addr = (vm_address_t)*early_map_owners[i];
1568
1569 *early_map_owners[i] = (vm_map_t)(addr + delta);
1570 }
1571
1572 early_map_count = ~0u;
1573 }
1574
1575 /*
1576 * Routine: vm_map_relocate_early_elem
1577 *
1578 * Purpose:
1579 * Early zone elements are allocated in a temporary part
1580 * of the address space.
1581 *
1582 * Once the zones live in their final place, the early
1583 * VM maps, map entries and map holes need to be relocated.
1584 *
1585 * It involves rewriting any vm_map_t, vm_map_entry_t or
1586 * pointers to vm_map_links. Other pointers to other types
1587 * are fine.
1588 *
1589 * Fortunately, pointers to those types are self-contained
1590 * in those zones, _except_ for pointers to VM maps,
1591 * which are tracked during early boot and fixed with
1592 * vm_map_relocate_early_maps().
1593 */
1594 __startup_func
1595 void
1596 vm_map_relocate_early_elem(
1597 uint32_t zone_id,
1598 vm_offset_t new_addr,
1599 vm_offset_t delta)
1600 {
1601 #define relocate(type_t, field) ({ \
1602 typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field; \
1603 if (*__field) { \
1604 *__field = (typeof(*__field))((vm_offset_t)*__field + delta); \
1605 } \
1606 })
1607
1608 switch (zone_id) {
1609 case ZONE_ID_VM_MAP:
1610 case ZONE_ID_VM_MAP_ENTRY:
1611 case ZONE_ID_VM_MAP_HOLES:
1612 break;
1613
1614 default:
1615 panic("Unexpected zone ID %d", zone_id);
1616 }
1617
1618 if (zone_id == ZONE_ID_VM_MAP) {
1619 relocate(vm_map_t, hdr.links.prev);
1620 relocate(vm_map_t, hdr.links.next);
1621 ((vm_map_t)new_addr)->pmap = kernel_pmap;
1622 #ifdef VM_MAP_STORE_USE_RB
1623 relocate(vm_map_t, hdr.rb_head_store.rbh_root);
1624 #endif /* VM_MAP_STORE_USE_RB */
1625 relocate(vm_map_t, hint);
1626 relocate(vm_map_t, hole_hint);
1627 relocate(vm_map_t, first_free);
1628 return;
1629 }
1630
1631 relocate(struct vm_map_links *, prev);
1632 relocate(struct vm_map_links *, next);
1633
1634 if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
1635 #ifdef VM_MAP_STORE_USE_RB
1636 relocate(vm_map_entry_t, store.entry.rbe_left);
1637 relocate(vm_map_entry_t, store.entry.rbe_right);
1638 relocate(vm_map_entry_t, store.entry.rbe_parent);
1639 #endif /* VM_MAP_STORE_USE_RB */
1640 if (((vm_map_entry_t)new_addr)->is_sub_map) {
1641 /* no object to relocate because we haven't made any */
1642 ((vm_map_entry_t)new_addr)->vme_submap +=
1643 delta >> VME_SUBMAP_SHIFT;
1644 }
1645 #if MAP_ENTRY_CREATION_DEBUG
1646 relocate(vm_map_entry_t, vme_creation_maphdr);
1647 #endif /* MAP_ENTRY_CREATION_DEBUG */
1648 }
1649
1650 #undef relocate
1651 }
1652
1653 /*
1654 * Generate a serial ID to identify a newly allocated vm_map
1655 */
1656 static uintptr_t vm_map_serial_current = 0;
1657 vm_map_serial_t vm_map_serial_generate(void);
1658 void vm_map_assign_serial(vm_map_t, vm_map_serial_t);
1659
1660 vm_map_serial_t
1661 vm_map_serial_generate(void)
1662 {
1663 vm_map_serial_t serial = (void *)os_atomic_inc(&vm_map_serial_current, relaxed);
1664 return serial;
1665 }
1666
1667 void
1668 vm_map_assign_serial(vm_map_t map, vm_map_serial_t serial)
1669 {
1670 map->serial_id = serial;
1671 #if CONFIG_SPTM
1672 /* Copy through our ID to the pmap (only available on SPTM systems) */
1673 if (map->pmap) {
1674 map->pmap->associated_vm_map_serial_id = map->serial_id;
1675 }
1676 #endif /* CONFIG_SPTM */
1677 }
1678
1679 vm_map_t
1680 vm_map_create_options(
1681 pmap_t pmap,
1682 vm_map_offset_t min,
1683 vm_map_offset_t max,
1684 vm_map_create_options_t options)
1685 {
1686 vm_map_t result;
1687
1688 #if DEBUG || DEVELOPMENT
1689 if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
1690 if (early_map_count != ~0u && early_map_count !=
1691 zone_count_allocated(vm_map_zone) + 1) {
1692 panic("allocating %dth early map, owner not known",
1693 zone_count_allocated(vm_map_zone) + 1);
1694 }
1695 if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
1696 panic("allocating %dth early map for non kernel pmap",
1697 early_map_count);
1698 }
1699 }
1700 #endif /* DEBUG || DEVELOPMENT */
1701
1702 result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);
1703
1704 vm_map_store_init(&result->hdr);
1705 result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
1706 vm_map_set_page_shift(result, PAGE_SHIFT);
1707
1708 result->size_limit = RLIM_INFINITY; /* default unlimited */
1709 result->data_limit = RLIM_INFINITY; /* default unlimited */
1710 result->user_wire_limit = MACH_VM_MAX_ADDRESS; /* default limit is unlimited */
1711 os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
1712
1713 result->pmap = pmap;
1714
1715 /*
1716 * Immediately give ourselves an ID
1717 * Unless this map is being created as part of a fork, in which case
1718 * the caller will reassign the ID of the parent (so don't waste an
1719 * increment here).
1720 */
1721 if ((options & VM_MAP_CREATE_VIA_FORK) == 0) {
1722 vm_map_assign_serial(result, vm_map_serial_generate());
1723 }
1724
1725 result->min_offset = min;
1726 result->max_offset = max;
1727 result->first_free = vm_map_to_entry(result);
1728 result->hint = vm_map_to_entry(result);
1729
1730 if (options & VM_MAP_CREATE_NEVER_FAULTS) {
1731 assert(pmap == kernel_pmap);
1732 result->never_faults = true;
1733 }
1734
1735 /* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
1736 if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
1737 result->has_corpse_footprint = true;
1738 } else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
1739 struct vm_map_links *hole_entry;
1740
1741 hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
1742 hole_entry->start = min;
1743 /*
1744 * Holes can be used to track ranges all the way up to
1745 * MACH_VM_MAX_ADDRESS or more (e.g. kernel map).
1746 */
1747 hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1748 result->holes_list = result->hole_hint = hole_entry;
1749 hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
1750 result->holelistenabled = true;
1751 }
1752
1753 vm_map_lock_init(result);
1754
1755 return result;
1756 }
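/*
 * Example (hedged sketch, not lifted from a real call site): creating a
 * pageable map covering a task-sized address range on top of "pmap", and
 * dropping the creation reference once done:
 *
 *	vm_map_t map;
 *
 *	map = vm_map_create_options(pmap, MACH_VM_MIN_ADDRESS,
 *	    MACH_VM_MAX_ADDRESS, VM_MAP_CREATE_PAGEABLE);
 *	...
 *	vm_map_deallocate(map);
 */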
1757
1758 /*
1759 * Adjusts a submap that was made by kmem_suballoc()
1760 * before it knew where it would be mapped,
1761 * so that it has the right min/max offsets.
1762 *
1763 * We do not need to hold any locks:
1764 * only the caller knows about this map,
1765 * and it is not published on any entry yet.
1766 */
1767 static void
1768 vm_map_adjust_offsets(
1769 vm_map_t map,
1770 vm_map_offset_t min_off,
1771 vm_map_offset_t max_off)
1772 {
1773 assert(map->min_offset == 0);
1774 assert(map->max_offset == max_off - min_off);
1775 assert(map->hdr.nentries == 0);
1776 assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1777
1778 map->min_offset = min_off;
1779 map->max_offset = max_off;
1780
1781 if (map->holelistenabled) {
1782 struct vm_map_links *hole = map->holes_list;
1783
1784 hole->start = min_off;
1785 #if defined(__arm64__)
1786 hole->end = max_off;
1787 #else
1788 hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1789 #endif
1790 }
1791 }
1792
1793
1794 vm_map_size_t
1795 vm_map_adjusted_size(vm_map_t map)
1796 {
1797 const struct vm_reserved_region *regions = NULL;
1798 size_t num_regions = 0;
1799 mach_vm_size_t reserved_size = 0, map_size = 0;
1800
1801 if (map == NULL || (map->size == 0)) {
1802 return 0;
1803 }
1804
1805 map_size = map->size;
1806
1807 if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1808 /*
1809 * No special reserved regions or not an exotic map or the task
1810 * is terminating and these special regions might have already
1811 * been deallocated.
1812 */
1813 return map_size;
1814 }
1815
1816 num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1817 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1818
1819 while (num_regions) {
1820 reserved_size += regions[--num_regions].vmrr_size;
1821 }
1822
1823 /*
1824 * There are a few places where the map is being switched out due to
1825 * 'termination' without that bit being set (e.g. exec and corpse purging).
1826 * In those cases, we could have the map's regions being deallocated on
1827 * a core while some accounting process is trying to get the map's size.
1828 * So this assert can't be enabled till all those places are uniform in
1829 * their use of the 'map->terminated' bit.
1830 *
1831 * assert(map_size >= reserved_size);
1832 */
1833
1834 return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1835 }
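/*
 * In effect (informal summary): for an exotic, non-terminated map with
 * reserved regions R_i,
 *	vm_map_adjusted_size(map) = map->size - sum(R_i.vmrr_size)
 * and the raw map->size is returned whenever that subtraction would
 * underflow or no reserved regions apply.
 */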
1836
1837 /*
1838 * vm_map_entry_create: [ internal use only ]
1839 *
1840 * Allocates a VM map entry for insertion in the
1841 * given map (or map copy). No fields are filled.
1842 *
1843 * The VM entry will be zero initialized, except for:
1844 * - behavior set to VM_BEHAVIOR_DEFAULT
1845 * - inheritance set to VM_INHERIT_DEFAULT
1846 */
1847 #define vm_map_entry_create(map) _vm_map_entry_create(&(map)->hdr)
1848
1849 #define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)
1850
1851 static vm_map_entry_t
1852 _vm_map_entry_create(
1853 struct vm_map_header *map_header __unused)
1854 {
1855 vm_map_entry_t entry = NULL;
1856
1857 entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);
1858
1859 /*
1860 * Help the compiler with what we know to be true,
1861 * so that the further bitfields inits have good codegen.
1862 *
1863 * See rdar://87041299
1864 */
1865 __builtin_assume(entry->vme_object_value == 0);
1866 __builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
1867 __builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);
1868
1869 static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
1870 "VME_ALIAS_MASK covers tags");
1871
1872 static_assert(VM_BEHAVIOR_DEFAULT == 0,
1873 "can skip zeroing of the behavior field");
1874 entry->inheritance = VM_INHERIT_DEFAULT;
1875
1876 #if MAP_ENTRY_CREATION_DEBUG
1877 entry->vme_creation_maphdr = map_header;
1878 entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
1879 BTREF_GET_NOWAIT);
1880 #endif
1881 return entry;
1882 }
1883
1884 /*
1885 * vm_map_entry_dispose: [ internal use only ]
1886 *
1887 * Inverse of vm_map_entry_create.
1888 *
1889 * write map lock held so no need to
1890 * do anything special to ensure correctness
1891 * of the stores
1892 */
1893 static void
1894 vm_map_entry_dispose(
1895 vm_map_entry_t entry)
1896 {
1897 #if VM_BTLOG_TAGS
1898 if (entry->vme_kernel_object) {
1899 btref_put(entry->vme_tag_btref);
1900 }
1901 #endif /* VM_BTLOG_TAGS */
1902 #if MAP_ENTRY_CREATION_DEBUG
1903 btref_put(entry->vme_creation_bt);
1904 #endif
1905 #if MAP_ENTRY_INSERTION_DEBUG
1906 btref_put(entry->vme_insertion_bt);
1907 #endif
1908 zfree(vm_map_entry_zone, entry);
1909 }
1910
1911 #define vm_map_copy_entry_dispose(copy_entry) \
1912 vm_map_entry_dispose(copy_entry)
1913
1914 static vm_map_entry_t
1915 vm_map_zap_first_entry(
1916 vm_map_zap_t list)
1917 {
1918 return list->vmz_head;
1919 }
1920
1921 static vm_map_entry_t
1922 vm_map_zap_last_entry(
1923 vm_map_zap_t list)
1924 {
1925 assert(vm_map_zap_first_entry(list));
1926 return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
1927 }
1928
1929 static void
1930 vm_map_zap_append(
1931 vm_map_zap_t list,
1932 vm_map_entry_t entry)
1933 {
1934 entry->vme_next = VM_MAP_ENTRY_NULL;
1935 *list->vmz_tail = entry;
1936 list->vmz_tail = &entry->vme_next;
1937 }
1938
1939 static vm_map_entry_t
1940 vm_map_zap_pop(
1941 vm_map_zap_t list)
1942 {
1943 vm_map_entry_t head = list->vmz_head;
1944
1945 if (head != VM_MAP_ENTRY_NULL &&
1946 (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1947 list->vmz_tail = &list->vmz_head;
1948 }
1949
1950 return head;
1951 }
1952
1953 static void
1954 vm_map_zap_dispose(
1955 vm_map_zap_t list)
1956 {
1957 vm_map_entry_t entry;
1958
1959 while ((entry = vm_map_zap_pop(list))) {
1960 if (entry->is_sub_map) {
1961 vm_map_deallocate(VME_SUBMAP(entry));
1962 } else {
1963 vm_object_deallocate(VME_OBJECT(entry));
1964 }
1965
1966 vm_map_entry_dispose(entry);
1967 }
1968 }
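/*
 * Usage pattern (illustrative sketch, mirroring vm_map_destroy() below):
 * a zap list lets entries be unlinked while the map lock is held and
 * torn down only after the lock has been dropped:
 *
 *	VM_MAP_ZAP_DECLARE(zap);
 *
 *	vm_map_lock(map);
 *	(void)vm_map_delete(map, start, end, flags, KMEM_GUARD_NONE, &zap);
 *	vm_map_unlock(map);
 *	vm_map_zap_dispose(&zap);
 */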
1969
1970 #if MACH_ASSERT
1971 static boolean_t first_free_check = FALSE;
1972 boolean_t
1973 first_free_is_valid(
1974 vm_map_t map)
1975 {
1976 if (!first_free_check) {
1977 return TRUE;
1978 }
1979
1980 return first_free_is_valid_store( map );
1981 }
1982 #endif /* MACH_ASSERT */
1983
1984
1985 #define vm_map_copy_entry_link(copy, after_where, entry) \
1986 _vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))
1987
1988 #define vm_map_copy_entry_unlink(copy, entry) \
1989 _vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1990
1991 /*
1992 * vm_map_destroy:
1993 *
1994 * Actually destroy a map.
1995 */
1996 void
1997 vm_map_destroy(
1998 vm_map_t map)
1999 {
2000 /* final cleanup: this is not allowed to fail */
2001 vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;
2002
2003 VM_MAP_ZAP_DECLARE(zap);
2004
2005 vm_map_lock(map);
2006
2007 map->terminated = true;
2008 /* clean up regular map entries */
2009 (void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
2010 KMEM_GUARD_NONE, &zap);
2011 /* clean up leftover special mappings (commpage, GPU carveout, etc...) */
2012 (void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
2013 KMEM_GUARD_NONE, &zap);
2014
2015 vm_map_disable_hole_optimization(map);
2016 vm_map_corpse_footprint_destroy(map);
2017
2018 vm_map_unlock(map);
2019
2020 vm_map_zap_dispose(&zap);
2021
2022 assert(map->hdr.nentries == 0);
2023
2024 if (map->pmap) {
2025 pmap_destroy(map->pmap);
2026 }
2027
2028 lck_rw_destroy(&map->lock, &vm_map_lck_grp);
2029
2030 #if CONFIG_MAP_RANGES
2031 kfree_data(map->extra_ranges,
2032 map->extra_ranges_count * sizeof(struct vm_map_user_range));
2033 #endif
2034
2035 zfree_id(ZONE_ID_VM_MAP, map);
2036 }
2037
2038 /*
2039 * Returns pid of the task with the largest number of VM map entries.
2040 * Used in the zone-map-exhaustion jetsam path.
2041 */
2042 pid_t
2043 find_largest_process_vm_map_entries(void)
2044 {
2045 pid_t victim_pid = -1;
2046 int max_vm_map_entries = 0;
2047 task_t task = TASK_NULL;
2048 queue_head_t *task_list = &tasks;
2049
2050 lck_mtx_lock(&tasks_threads_lock);
2051 queue_iterate(task_list, task, task_t, tasks) {
2052 if (task == kernel_task || !task->active) {
2053 continue;
2054 }
2055
2056 vm_map_t task_map = task->map;
2057 if (task_map != VM_MAP_NULL) {
2058 int task_vm_map_entries = task_map->hdr.nentries;
2059 if (task_vm_map_entries > max_vm_map_entries) {
2060 max_vm_map_entries = task_vm_map_entries;
2061 victim_pid = pid_from_task(task);
2062 }
2063 }
2064 }
2065 lck_mtx_unlock(&tasks_threads_lock);
2066
2067 printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
2068 return victim_pid;
2069 }
2070
2071
2072 /*
2073 * vm_map_lookup_entry: [ internal use only ]
2074 *
2075 * Calls into the vm map store layer to find the map
2076 * entry containing (or immediately preceding) the
2077 * specified address in the given map; the entry is returned
2078 * in the "entry" parameter. The boolean
2079 * result indicates whether the address is
2080 * actually contained in the map.
2081 */
2082 boolean_t
2083 vm_map_lookup_entry(
2084 vm_map_t map,
2085 vm_map_offset_t address,
2086 vm_map_entry_t *entry) /* OUT */
2087 {
2088 bool result = false;
2089
2090 #if KASAN_TBI
2091 if (VM_KERNEL_ADDRESS(address)) {
2092 address = vm_memtag_canonicalize_kernel(address);
2093 }
2094 #endif /* KASAN_TBI */
2095
2096
2097 #if CONFIG_PROB_GZALLOC
2098 if (map->pmap == kernel_pmap) {
2099 assertf(!pgz_owned(address),
2100 "it is the responsibility of callers to unguard PGZ addresses");
2101 }
2102 #endif /* CONFIG_PROB_GZALLOC */
2103 result = vm_map_store_lookup_entry( map, address, entry );
2104
2105 return result;
2106 }
2107
2108 boolean_t
2109 vm_map_lookup_entry_or_next(
2110 vm_map_t map,
2111 vm_map_offset_t address,
2112 vm_map_entry_t *entry) /* OUT */
2113 {
2114 if (vm_map_lookup_entry(map, address, entry)) {
2115 return true;
2116 }
2117
2118 *entry = (*entry)->vme_next;
2119 return false;
2120 }
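/*
 * Illustrative sketch (map locked by the caller): walking every entry
 * that intersects [start, end), using the "or_next" variant so a hole at
 * "start" still yields the first relevant entry:
 *
 *	vm_map_entry_t entry;
 *
 *	vm_map_lookup_entry_or_next(map, start, &entry);
 *	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
 *	    entry = entry->vme_next) {
 *		...
 *	}
 */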
2121
2122 #if CONFIG_PROB_GZALLOC
2123 boolean_t
2124 vm_map_lookup_entry_allow_pgz(
2125 vm_map_t map,
2126 vm_map_offset_t address,
2127 vm_map_entry_t *entry) /* OUT */
2128 {
2129 return vm_map_store_lookup_entry( map, address, entry );
2130 }
2131 #endif /* CONFIG_PROB_GZALLOC */
2132
2133 /*
2134 * Routine: vm_map_range_invalid_panic
2135 * Purpose:
2136 * Panic on detection of an invalid range id.
2137 */
2138 __abortlike
2139 static void
2140 vm_map_range_invalid_panic(
2141 vm_map_t map,
2142 vm_map_range_id_t range_id)
2143 {
2144 panic("invalid range ID (%u) for map %p", range_id, map);
2145 }
2146
2147 /*
2148 * Routine: vm_map_get_range
2149 * Purpose:
2150 * Adjust bounds based on security policy.
2151 */
2152 static struct mach_vm_range
2153 vm_map_get_range(
2154 vm_map_t map,
2155 vm_map_address_t *address,
2156 vm_map_kernel_flags_t *vmk_flags,
2157 vm_map_size_t size,
2158 bool *is_ptr)
2159 {
2160 struct mach_vm_range effective_range = {};
2161 vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;
2162
2163 if (map == kernel_map) {
2164 effective_range = kmem_ranges[range_id];
2165
2166 if (startup_phase >= STARTUP_SUB_KMEM) {
2167 /*
2168 * Hint provided by caller is zeroed as the range is restricted to a
2169 * subset of the entire kernel_map VA, which could put the hint outside
2170 * the range, causing vm_map_store_find_space to fail.
2171 */
2172 *address = 0ull;
2173 /*
2174 * Ensure that range_id passed in by the caller is within meaningful
2175 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
2176 * to fail as the corresponding range is invalid. Range id larger than
2177 * KMEM_RANGE_ID_MAX will lead to an OOB access.
2178 */
2179 if ((range_id == KMEM_RANGE_ID_NONE) ||
2180 (range_id > KMEM_RANGE_ID_MAX)) {
2181 vm_map_range_invalid_panic(map, range_id);
2182 }
2183
2184 /*
2185 * Pointer ranges use kmem_locate_space to do allocations.
2186 *
2187 * Non pointer fronts look like [ Small | Large | Permanent ]
2188 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
2189 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
2190 * use the entire range.
2191 */
2192 if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
2193 *is_ptr = true;
2194 } else if (size >= KMEM_SMALLMAP_THRESHOLD) {
2195 effective_range = kmem_large_ranges[range_id];
2196 }
2197 }
2198 #if CONFIG_MAP_RANGES
2199 } else if (map->uses_user_ranges) {
2200 switch (range_id) {
2201 case UMEM_RANGE_ID_DEFAULT:
2202 effective_range = map->default_range;
2203 break;
2204 case UMEM_RANGE_ID_HEAP:
2205 effective_range = map->data_range;
2206 break;
2207 case UMEM_RANGE_ID_LARGE_FILE:
2208 if (map->large_file_range.min_address != map->large_file_range.max_address) {
2209 /* large file range is configured and should be used */
2210 effective_range = map->large_file_range;
2211 } else {
2212 /*
2213 * the user asking for this user range might not have the
2214 * permissions to use the large file range (i.e., it doesn't
2215 * hold the correct entitlement), so we give it the data range
2216 * instead
2217 */
2218 effective_range = map->data_range;
2219 }
2220 break;
2221 case UMEM_RANGE_ID_FIXED:
2222 /*
2223 * anywhere allocations with an address in "FIXED"
2224 * makes no sense, leave the range empty
2225 */
2226 break;
2227
2228 default:
2229 vm_map_range_invalid_panic(map, range_id);
2230 }
2231 #endif /* CONFIG_MAP_RANGES */
2232 } else {
2233 /*
2234 * If minimum is 0, bump it up by PAGE_SIZE. We want to limit
2235 * allocations of PAGEZERO to explicit requests since its
2236 * normal use is to catch dereferences of NULL and many
2237 * applications also treat pointers with a value of 0 as
2238 * special and suddenly having address 0 contain useable
2239 * memory would tend to confuse those applications.
2240 */
2241 effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
2242 effective_range.max_address = map->max_offset;
2243 }
2244
2245 return effective_range;
2246 }
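/*
 * Informal summary: kernel_map requests are clamped to the kmem range
 * selected by vmkf_range_id (large non-pointer sizes drop into
 * kmem_large_ranges), maps with user ranges use their per-map range
 * table, and all other maps fall back to
 * [MAX(min_offset, one map page), max_offset).
 */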
2247
2248 kern_return_t
2249 vm_map_locate_space_anywhere(
2250 vm_map_t map,
2251 vm_map_size_t size,
2252 vm_map_offset_t mask,
2253 vm_map_kernel_flags_t vmk_flags,
2254 vm_map_offset_t *start_inout,
2255 vm_map_entry_t *entry_out)
2256 {
2257 struct mach_vm_range effective_range = {};
2258 vm_map_size_t guard_offset;
2259 vm_map_offset_t hint, limit;
2260 vm_map_entry_t entry;
2261 bool is_kmem_ptr_range = false;
2262
2263 /*
2264 * Only supported by vm_map_enter() with a fixed address.
2265 */
2266 assert(!vmk_flags.vmf_fixed);
2267 assert(!vmk_flags.vmkf_beyond_max);
2268
2269 if (__improbable(map->wait_for_space)) {
2270 /*
2271 * support for "wait_for_space" is minimal,
2272 * its only consumer is the ipc_kernel_copy_map.
2273 */
2274 assert(!map->holelistenabled &&
2275 !vmk_flags.vmkf_last_free &&
2276 !vmk_flags.vmkf_keep_map_locked &&
2277 !vmk_flags.vmkf_map_jit &&
2278 !vmk_flags.vmf_random_addr &&
2279 *start_inout <= map->min_offset);
2280 } else if (vmk_flags.vmkf_last_free) {
2281 assert(!vmk_flags.vmkf_map_jit &&
2282 !vmk_flags.vmf_random_addr);
2283 }
2284
2285 if (vmk_flags.vmkf_guard_before) {
2286 guard_offset = VM_MAP_PAGE_SIZE(map);
2287 assert(size > guard_offset);
2288 size -= guard_offset;
2289 } else {
2290 assert(size != 0);
2291 guard_offset = 0;
2292 }
2293
2294 if (__improbable(!vm_map_is_map_size_valid(
2295 map, size, vmk_flags.vmkf_no_soft_limit))) {
2296 return KERN_NO_SPACE;
2297 }
2298
2299 /*
2300 * Validate range_id from flags and get associated range
2301 */
2302 effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
2303 &is_kmem_ptr_range);
2304
2305 if (is_kmem_ptr_range) {
2306 return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
2307 vmk_flags.vmkf_last_free, start_inout, entry_out);
2308 }
2309
2310 #if XNU_TARGET_OS_OSX
2311 if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2312 assert(map != kernel_map);
2313 effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2314 }
2315 #endif /* XNU_TARGET_OS_OSX */
2316
2317 again:
2318 if (vmk_flags.vmkf_last_free) {
2319 hint = *start_inout;
2320
2321 if (hint == 0 || hint > effective_range.max_address) {
2322 hint = effective_range.max_address;
2323 }
2324 if (hint <= effective_range.min_address) {
2325 return KERN_NO_SPACE;
2326 }
2327 limit = effective_range.min_address;
2328 } else {
2329 hint = *start_inout;
2330
2331 if (vmk_flags.vmkf_map_jit) {
2332 if (map->jit_entry_exists &&
2333 !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
2334 return KERN_INVALID_ARGUMENT;
2335 }
2336 if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
2337 vmk_flags.vmf_random_addr = true;
2338 }
2339 }
2340
2341 if (vmk_flags.vmf_random_addr) {
2342 kern_return_t kr;
2343
2344 kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
2345 if (kr != KERN_SUCCESS) {
2346 return kr;
2347 }
2348 }
2349 #if __x86_64__
2350 else if ((hint == 0 || hint == vm_map_min(map)) &&
2351 !map->disable_vmentry_reuse &&
2352 map->vmmap_high_start != 0) {
2353 hint = map->vmmap_high_start;
2354 }
2355 #endif /* __x86_64__ */
2356
2357 if (hint < effective_range.min_address) {
2358 hint = effective_range.min_address;
2359 }
2360 if (effective_range.max_address <= hint) {
2361 return KERN_NO_SPACE;
2362 }
2363
2364 limit = effective_range.max_address;
2365 }
2366 entry = vm_map_store_find_space(map,
2367 hint, limit, vmk_flags.vmkf_last_free,
2368 guard_offset, size, mask,
2369 start_inout);
2370
2371 if (__improbable(entry == NULL)) {
2372 if (map->wait_for_space &&
2373 guard_offset + size <=
2374 effective_range.max_address - effective_range.min_address) {
2375 assert_wait((event_t)map, THREAD_ABORTSAFE);
2376 vm_map_unlock(map);
2377 thread_block(THREAD_CONTINUE_NULL);
2378 vm_map_lock(map);
2379 goto again;
2380 }
2381 return KERN_NO_SPACE;
2382 }
2383
2384 if (entry_out) {
2385 *entry_out = entry;
2386 }
2387 return KERN_SUCCESS;
2388 }
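/*
 * Note for callers: when the map has "wait_for_space" set and no hole is
 * found, this routine drops the map lock around thread_block() and
 * retries, so any map state sampled before the call may be stale even on
 * a KERN_SUCCESS return.
 */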
2389
2390 /*!
2391 * @function vm_map_locate_space_fixed()
2392 *
2393 * @brief
2394 * Locate (no reservation) a range in the specified VM map at a fixed address.
2395 *
2396 * @param map the map to scan for memory, must be locked.
2397 * @param start the fixed address trying to be reserved
2398 * @param size the size of the allocation to make.
2399 * @param mask an alignment mask the allocation must respect,
2400 * @param vmk_flags the vm map kernel flags to influence this call.
2401 * vmk_flags.vmf_anywhere must not be set.
2402 * @param entry_out the entry right before the hole.
2403 * @param zap_list a zap list of entries to clean up after the call.
2404 *
2405 * @returns
2406 * - KERN_SUCCESS in case of success and no conflicting entry is found,
2407 * in which case entry_out is set to the entry before the hole.
2408 *
2409 * - KERN_MEMORY_PRESENT if a conflicting entry is found,
2410 * in which case entry_out is set to the conflicting entry,
2411 * the callers MUST handle this error explicitly.
2412 *
2413 * - KERN_INVALID_ADDRESS if the specified @c start or @c size
2414 * would result in a mapping outside of the map.
2415 *
2416 * - KERN_NO_SPACE for various cases of unrecoverable failures.
2417 */
2418 static kern_return_t
2419 vm_map_locate_space_fixed(
2420 vm_map_t map,
2421 vm_map_offset_t start,
2422 vm_map_size_t size,
2423 vm_map_offset_t mask,
2424 vm_map_kernel_flags_t vmk_flags,
2425 vm_map_entry_t *entry_out,
2426 vm_map_zap_t zap_list)
2427 {
2428 vm_map_offset_t effective_min_offset, effective_max_offset;
2429 vm_map_entry_t entry;
2430 vm_map_offset_t end;
2431
2432 assert(vmk_flags.vmf_fixed);
2433
2434 effective_min_offset = map->min_offset;
2435 effective_max_offset = map->max_offset;
2436
2437 if (vmk_flags.vmkf_beyond_max) {
2438 /*
2439 * Allow an insertion beyond the map's max offset.
2440 */
2441 effective_max_offset = 0x00000000FFFFF000ULL;
2442 if (vm_map_is_64bit(map)) {
2443 effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2444 }
2445 #if XNU_TARGET_OS_OSX
2446 } else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2447 effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2448 #endif /* XNU_TARGET_OS_OSX */
2449 }
2450
2451 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2452 !vmk_flags.vmf_overwrite &&
2453 map->pmap == kernel_pmap &&
2454 vmk_flags.vm_tag == VM_MEMORY_REALLOC) {
2455 /*
2456 * Force realloc() to switch to a new allocation,
2457 * to prevent 4k-fragmented virtual ranges.
2458 */
2459 // DEBUG4K_ERROR("no realloc in place");
2460 return KERN_NO_SPACE;
2461 }
2462
2463 /*
2464 * Verify that:
2465 * the address doesn't itself violate
2466 * the mask requirement.
2467 */
2468
2469 if ((start & mask) != 0) {
2470 return KERN_NO_SPACE;
2471 }
2472
2473 if (__improbable(!vm_map_is_map_size_valid(
2474 map, size, vmk_flags.vmkf_no_soft_limit))) {
2475 return KERN_NO_SPACE;
2476 }
2477
2478 #if CONFIG_MAP_RANGES
2479 if (map->uses_user_ranges) {
2480 struct mach_vm_range r;
2481
2482 vm_map_user_range_resolve(map, start, 1, &r);
2483 if (r.max_address == 0) {
2484 return KERN_INVALID_ADDRESS;
2485 }
2486 effective_min_offset = r.min_address;
2487 effective_max_offset = r.max_address;
2488 }
2489 #endif /* CONFIG_MAP_RANGES */
2490
2491 if ((startup_phase >= STARTUP_SUB_KMEM) && !vmk_flags.vmkf_submap &&
2492 (map == kernel_map)) {
2493 mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
2494 effective_min_offset = r->min_address;
2495 effective_max_offset = r->max_address;
2496 }
2497
2498 /*
2499 * ... the address is within bounds
2500 */
2501
2502 end = start + size;
2503
2504 if ((start < effective_min_offset) ||
2505 (end > effective_max_offset) ||
2506 (start >= end)) {
2507 return KERN_INVALID_ADDRESS;
2508 }
2509
2510 if (vmk_flags.vmf_overwrite) {
2511 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE;
2512 kern_return_t remove_kr;
2513
2514 /*
2515 * Fixed mapping and "overwrite" flag: attempt to
2516 * remove all existing mappings in the specified
2517 * address range, saving them in our "zap_list".
2518 *
2519 * This avoids releasing the VM map lock in
2520 * vm_map_entry_delete() and allows atomicity
2521 * when we want to replace some mappings with a new one.
2522 * It also allows us to restore the old VM mappings if the
2523 * new mapping fails.
2524 */
2525 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2526
2527 if (vmk_flags.vmkf_overwrite_immutable) {
2528 /* we can overwrite immutable mappings */
2529 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2530 }
2531 if (vmk_flags.vmkf_remap_prot_copy) {
2532 remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2533 }
2534 remove_kr = vm_map_delete(map, start, end, remove_flags,
2535 KMEM_GUARD_NONE, zap_list).kmr_return;
2536 if (remove_kr) {
2537 /* XXX FBDP restore zap_list? */
2538 return remove_kr;
2539 }
2540 }
2541
2542 /*
2543 * ... the starting address isn't allocated
2544 */
2545
2546 if (vm_map_lookup_entry(map, start, &entry)) {
2547 *entry_out = entry;
2548 return KERN_MEMORY_PRESENT;
2549 }
2550
2551 /*
2552 * ... the next region doesn't overlap the
2553 * end point.
2554 */
2555
2556 if ((entry->vme_next != vm_map_to_entry(map)) &&
2557 (entry->vme_next->vme_start < end)) {
2558 return KERN_NO_SPACE;
2559 }
2560
2561 *entry_out = entry;
2562 return KERN_SUCCESS;
2563 }
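/*
 * Illustrative sketch (vm_map_enter() is the canonical caller): the
 * KERN_MEMORY_PRESENT case must be handled explicitly, e.g. by treating
 * it as "no space" unless VM_FLAGS_ALREADY semantics were requested:
 *
 *	kr = vm_map_locate_space_fixed(map, start, size, mask,
 *	    vmk_flags, &entry, &zap_list);
 *	if (kr == KERN_MEMORY_PRESENT && !vmk_flags.vmkf_already) {
 *		kr = KERN_NO_SPACE;
 *	}
 */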
2564
2565 /*
2566 * Routine: vm_map_find_space
2567 * Purpose:
2568 * Allocate a range in the specified virtual address map,
2569 * returning the entry allocated for that range.
2570 * Used by kmem_alloc, etc.
2571 *
2572 * The map must NOT be locked. It will be returned locked
2573 * on KERN_SUCCESS, unlocked on failure.
2574 *
2575 * If an entry is allocated, the object/offset fields
2576 * are initialized to zero.
2577 */
2578 kern_return_t
2579 vm_map_find_space(
2580 vm_map_t map,
2581 vm_map_offset_t hint_address,
2582 vm_map_size_t size,
2583 vm_map_offset_t mask,
2584 vm_map_kernel_flags_t vmk_flags,
2585 vm_map_entry_t *o_entry) /* OUT */
2586 {
2587 vm_map_entry_t new_entry, entry;
2588 kern_return_t kr;
2589
2590 if (size == 0) {
2591 return KERN_INVALID_ARGUMENT;
2592 }
2593
2594 new_entry = vm_map_entry_create(map);
2595 new_entry->use_pmap = true;
2596 new_entry->protection = VM_PROT_DEFAULT;
2597 new_entry->max_protection = VM_PROT_ALL;
2598
2599 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
2600 new_entry->map_aligned = true;
2601 }
2602 if (vmk_flags.vmf_permanent) {
2603 new_entry->vme_permanent = true;
2604 }
2605
2606 vm_map_lock(map);
2607
2608 kr = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
2609 &hint_address, &entry);
2610 if (kr != KERN_SUCCESS) {
2611 vm_map_unlock(map);
2612 vm_map_entry_dispose(new_entry);
2613 return kr;
2614 }
2615 new_entry->vme_start = hint_address;
2616 new_entry->vme_end = hint_address + size;
2617
2618 /*
2619 * At this point,
2620 *
2621 * - new_entry's "vme_start" and "vme_end" should define
2622 * the endpoints of the available new range,
2623 *
2624 * - and "entry" should refer to the region before
2625 * the new range,
2626 *
2627 * - and the map should still be locked.
2628 */
2629
2630 assert(page_aligned(new_entry->vme_start));
2631 assert(page_aligned(new_entry->vme_end));
2632 assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
2633 assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));
2634
2635
2636 /*
2637 * Insert the new entry into the list
2638 */
2639
2640 vm_map_store_entry_link(map, entry, new_entry,
2641 VM_MAP_KERNEL_FLAGS_NONE);
2642 map->size += size;
2643
2644 /*
2645 * Update the lookup hint
2646 */
2647 SAVE_HINT_MAP_WRITE(map, new_entry);
2648
2649 *o_entry = new_entry;
2650 return KERN_SUCCESS;
2651 }
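/*
 * Usage sketch (hedged, loosely modeled on kmem-style callers): the map
 * comes back locked on success, so the caller typically installs its
 * object in the fresh entry and then unlocks:
 *
 *	vm_map_entry_t entry;
 *
 *	if (vm_map_find_space(kernel_map, 0, size, 0,
 *	    VM_MAP_KERNEL_FLAGS_NONE, &entry) == KERN_SUCCESS) {
 *		VME_OBJECT_SET(entry, object, false, 0);
 *		VME_OFFSET_SET(entry, 0);
 *		vm_map_unlock(kernel_map);
 *	}
 */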
2652
2653 int vm_map_pmap_enter_print = FALSE;
2654 int vm_map_pmap_enter_enable = FALSE;
2655
2656 /*
2657 * Routine: vm_map_pmap_enter [internal only]
2658 *
2659 * Description:
2660 * Force pages from the specified object to be entered into
2661 * the pmap at the specified address if they are present.
2662 * As soon as a page is not found in the object, the scan ends.
2663 *
2664 * Returns:
2665 * Nothing.
2666 *
2667 * In/out conditions:
2668 * The source map should not be locked on entry.
2669 */
2670 __unused static void
2671 vm_map_pmap_enter(
2672 vm_map_t map,
2673 vm_map_offset_t addr,
2674 vm_map_offset_t end_addr,
2675 vm_object_t object,
2676 vm_object_offset_t offset,
2677 vm_prot_t protection)
2678 {
2679 int type_of_fault;
2680 kern_return_t kr;
2681 uint8_t object_lock_type = 0;
2682 struct vm_object_fault_info fault_info = {
2683 .interruptible = THREAD_UNINT,
2684 };
2685
2686 if (map->pmap == 0) {
2687 return;
2688 }
2689
2690 assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);
2691
2692 while (addr < end_addr) {
2693 vm_page_t m;
2694
2695
2696 /*
2697 * TODO:
2698 * From vm_map_enter(), we come into this function without the map
2699 * lock held or the object lock held.
2700 * We haven't taken a reference on the object either.
2701 * We should do a proper lookup on the map to make sure
2702 * that things are sane before we go locking objects that
2703 * could have been deallocated from under us.
2704 */
2705
2706 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2707 vm_object_lock(object);
2708
2709 m = vm_page_lookup(object, offset);
2710
2711 if (m == VM_PAGE_NULL || m->vmp_busy || vm_page_is_fictitious(m) ||
2712 (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
2713 vm_object_unlock(object);
2714 return;
2715 }
2716
2717 if (vm_map_pmap_enter_print) {
2718 printf("vm_map_pmap_enter:");
2719 printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
2720 map, (unsigned long long)addr, object, (unsigned long long)offset);
2721 }
2722 type_of_fault = DBG_CACHE_HIT_FAULT;
2723 kr = vm_fault_enter(m, map->pmap,
2724 addr,
2725 PAGE_SIZE, 0,
2726 protection, protection,
2727 VM_PAGE_WIRED(m),
2728 VM_KERN_MEMORY_NONE, /* tag - not wiring */
2729 &fault_info,
2730 NULL, /* need_retry */
2731 &type_of_fault,
2732 &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/
2733
2734 vm_object_unlock(object);
2735
2736 offset += PAGE_SIZE_64;
2737 addr += PAGE_SIZE;
2738 }
2739 }
2740
2741 #define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
2742 static kern_return_t
2743 vm_map_random_address_for_size(
2744 vm_map_t map,
2745 vm_map_offset_t *address,
2746 vm_map_size_t size,
2747 vm_map_kernel_flags_t vmk_flags)
2748 {
2749 kern_return_t kr = KERN_SUCCESS;
2750 int tries = 0;
2751 vm_map_offset_t random_addr = 0;
2752 vm_map_offset_t hole_end;
2753
2754 vm_map_entry_t next_entry = VM_MAP_ENTRY_NULL;
2755 vm_map_entry_t prev_entry = VM_MAP_ENTRY_NULL;
2756 vm_map_size_t vm_hole_size = 0;
2757 vm_map_size_t addr_space_size;
2758 bool is_kmem_ptr;
2759 struct mach_vm_range effective_range;
2760
2761 effective_range = vm_map_get_range(map, address, &vmk_flags, size,
2762 &is_kmem_ptr);
2763
2764 addr_space_size = effective_range.max_address - effective_range.min_address;
2765 if (size >= addr_space_size) {
2766 return KERN_NO_SPACE;
2767 }
2768 addr_space_size -= size;
2769
2770 assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));
2771
2772 while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2773 if (startup_phase < STARTUP_SUB_ZALLOC) {
2774 random_addr = (vm_map_offset_t)early_random();
2775 } else {
2776 random_addr = (vm_map_offset_t)random();
2777 }
2778 random_addr <<= VM_MAP_PAGE_SHIFT(map);
2779 random_addr = vm_map_trunc_page(
2780 effective_range.min_address + (random_addr % addr_space_size),
2781 VM_MAP_PAGE_MASK(map));
2782
2783 #if CONFIG_PROB_GZALLOC
2784 if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
2785 continue;
2786 }
2787 #endif /* CONFIG_PROB_GZALLOC */
2788
2789 if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
2790 if (prev_entry == vm_map_to_entry(map)) {
2791 next_entry = vm_map_first_entry(map);
2792 } else {
2793 next_entry = prev_entry->vme_next;
2794 }
2795 if (next_entry == vm_map_to_entry(map)) {
2796 hole_end = vm_map_max(map);
2797 } else {
2798 hole_end = next_entry->vme_start;
2799 }
2800 vm_hole_size = hole_end - random_addr;
2801 if (vm_hole_size >= size) {
2802 *address = random_addr;
2803 break;
2804 }
2805 }
2806 tries++;
2807 }
2808
2809 if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2810 kr = KERN_NO_SPACE;
2811 }
2812 return kr;
2813 }
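/*
 * Informal note on the arithmetic above: each attempt draws a random
 * page-shifted value and folds it into the usable window, roughly
 *	addr = trunc_page(range.min + (r % (range_size - size)))
 * so the candidate always leaves room for "size" bytes below range.max;
 * the hole lookup then rejects candidates whose gap to the next entry is
 * smaller than "size".
 */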
2814
2815 static boolean_t
2816 vm_memory_malloc_no_cow(
2817 int alias)
2818 {
2819 uint64_t alias_mask;
2820
2821 if (!malloc_no_cow) {
2822 return FALSE;
2823 }
2824 if (alias > 63) {
2825 return FALSE;
2826 }
2827 alias_mask = 1ULL << alias;
2828 if (alias_mask & vm_memory_malloc_no_cow_mask) {
2829 return TRUE;
2830 }
2831 return FALSE;
2832 }
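/*
 * Informal example: with bit (1ULL << VM_MEMORY_MALLOC_SMALL) set in
 * vm_memory_malloc_no_cow_mask, mappings tagged VM_MEMORY_MALLOC_SMALL
 * are treated as no-copy-on-write; aliases above 63 never match.
 * (VM_MEMORY_MALLOC_SMALL is used purely for illustration here.)
 */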
2833
2834 uint64_t vm_map_enter_RLIMIT_AS_count = 0;
2835 uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2836 /*
2837 * Routine: vm_map_enter
2838 *
2839 * Description:
2840 * Allocate a range in the specified virtual address map.
2841 * The resulting range will refer to memory defined by
2842 * the given memory object and offset into that object.
2843 *
2844 * Arguments are as defined in the vm_map call.
2845 */
2846 static unsigned int vm_map_enter_restore_successes = 0;
2847 static unsigned int vm_map_enter_restore_failures = 0;
2848 kern_return_t
2849 vm_map_enter(
2850 vm_map_t map,
2851 vm_map_offset_t *address, /* IN/OUT */
2852 vm_map_size_t size,
2853 vm_map_offset_t mask,
2854 vm_map_kernel_flags_t vmk_flags,
2855 vm_object_t object,
2856 vm_object_offset_t offset,
2857 boolean_t needs_copy,
2858 vm_prot_t cur_protection,
2859 vm_prot_t max_protection,
2860 vm_inherit_t inheritance)
2861 {
2862 vm_map_entry_t entry, new_entry;
2863 vm_map_offset_t start, tmp_start, tmp_offset;
2864 vm_map_offset_t end, tmp_end;
2865 vm_map_offset_t tmp2_start, tmp2_end;
2866 vm_map_offset_t step;
2867 kern_return_t result = KERN_SUCCESS;
2868 bool map_locked = FALSE;
2869 bool pmap_empty = TRUE;
2870 bool new_mapping_established = FALSE;
2871 const bool keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2872 const bool anywhere = !vmk_flags.vmf_fixed;
2873 const bool purgable = vmk_flags.vmf_purgeable;
2874 const bool no_cache = vmk_flags.vmf_no_cache;
2875 const bool is_submap = vmk_flags.vmkf_submap;
2876 const bool permanent = vmk_flags.vmf_permanent;
2877 const bool no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2878 const bool entry_for_jit = vmk_flags.vmkf_map_jit;
2879 const bool iokit_acct = vmk_flags.vmkf_iokit_acct;
2880 const bool resilient_codesign = vmk_flags.vmf_resilient_codesign;
2881 const bool resilient_media = vmk_flags.vmf_resilient_media;
2882 const bool entry_for_tpro = vmk_flags.vmf_tpro;
2883 const unsigned int superpage_size = vmk_flags.vmf_superpage_size;
2884 const vm_tag_t alias = vmk_flags.vm_tag;
2885 vm_tag_t user_alias;
2886 kern_return_t kr;
2887 bool clear_map_aligned = FALSE;
2888 vm_map_size_t chunk_size = 0;
2889 vm_object_t caller_object;
2890 VM_MAP_ZAP_DECLARE(zap_old_list);
2891 VM_MAP_ZAP_DECLARE(zap_new_list);
2892
2893 caller_object = object;
2894
2895 assertf(vmk_flags.__vmkf_unused2 == 0, "vmk_flags unused2=0x%llx\n", vmk_flags.__vmkf_unused2);
2896
2897 if (vmk_flags.vmf_4gb_chunk) {
2898 #if defined(__LP64__)
2899 chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2900 #else /* __LP64__ */
2901 chunk_size = ANON_CHUNK_SIZE;
2902 #endif /* __LP64__ */
2903 } else {
2904 chunk_size = ANON_CHUNK_SIZE;
2905 }
2906
2907
2908
2909 if (superpage_size) {
2910 if (object != VM_OBJECT_NULL) {
2911 /* caller can't provide their own VM object */
2912 return KERN_INVALID_ARGUMENT;
2913 }
2914 switch (superpage_size) {
2915 /*
2916 * Note that the current implementation only supports
2917 * a single size for superpages, SUPERPAGE_SIZE, per
2918 * architecture. As soon as more sizes are supposed
2919 * to be supported, SUPERPAGE_SIZE has to be replaced
2920 * with a lookup of the size depending on superpage_size.
2921 */
2922 #ifdef __x86_64__
2923 case SUPERPAGE_SIZE_ANY:
2924 /* handle it like 2 MB and round up to page size */
2925 size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2926 OS_FALLTHROUGH;
2927 case SUPERPAGE_SIZE_2MB:
2928 break;
2929 #endif
2930 default:
2931 return KERN_INVALID_ARGUMENT;
2932 }
2933 mask = SUPERPAGE_SIZE - 1;
2934 if (size & (SUPERPAGE_SIZE - 1)) {
2935 return KERN_INVALID_ARGUMENT;
2936 }
2937 inheritance = VM_INHERIT_NONE; /* fork() children won't inherit superpages */
2938 }
2939
2940
2941 if ((cur_protection & VM_PROT_WRITE) &&
2942 (cur_protection & VM_PROT_EXECUTE) &&
2943 #if XNU_TARGET_OS_OSX
2944 map->pmap != kernel_pmap &&
2945 (cs_process_global_enforcement() ||
2946 (vmk_flags.vmkf_cs_enforcement_override
2947 ? vmk_flags.vmkf_cs_enforcement
2948 : (vm_map_cs_enforcement(map)
2949 #if __arm64__
2950 || !VM_MAP_IS_EXOTIC(map)
2951 #endif /* __arm64__ */
2952 ))) &&
2953 #endif /* XNU_TARGET_OS_OSX */
2954 #if CODE_SIGNING_MONITOR
2955 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2956 #endif
2957 (VM_MAP_POLICY_WX_FAIL(map) ||
2958 VM_MAP_POLICY_WX_STRIP_X(map)) &&
2959 !entry_for_jit) {
2960 boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2961
2962 DTRACE_VM3(cs_wx,
2963 uint64_t, 0,
2964 uint64_t, 0,
2965 vm_prot_t, cur_protection);
2966 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2967 proc_selfpid(),
2968 (get_bsdtask_info(current_task())
2969 ? proc_name_address(get_bsdtask_info(current_task()))
2970 : "?"),
2971 __FUNCTION__,
2972 (vm_protect_wx_fail ? "failing" : "turning off execute"));
2973 cur_protection &= ~VM_PROT_EXECUTE;
2974 if (vm_protect_wx_fail) {
2975 return KERN_PROTECTION_FAILURE;
2976 }
2977 }
2978
2979 if (entry_for_jit
2980 && cur_protection != VM_PROT_ALL) {
2981 /*
2982 * Native macOS processes and all non-macOS processes are
2983 * expected to create JIT regions via mmap(MAP_JIT, RWX) but
2984 * the RWX requirement was not enforced, and thus, we must live
2985 * with our sins. We are now dealing with a JIT mapping without
2986 * RWX.
2987 *
2988 * We deal with these by letting the MAP_JIT stick in order
2989 * to avoid CS violations when these pages are mapped executable
2990 * down the line. In order to appease the page table monitor (you
2991 * know what I'm talking about), these pages will end up being
2992 * marked as XNU_USER_DEBUG, which will be allowed because we
2993 * don't enforce the code signing monitor on macOS systems. If
2994 * the user-space application ever changes permissions to RWX,
2995 * which they are allowed to since the mapping was originally
2996 * created with MAP_JIT, then they'll switch over to using the
2997 * XNU_USER_JIT type, and won't be allowed to downgrade any
2998 * more after that.
2999 *
3000 * When not on macOS, a MAP_JIT mapping without VM_PROT_ALL is
3001 * strictly disallowed.
3002 */
3003
3004 #if XNU_TARGET_OS_OSX
3005 /*
3006 * Continue to allow non-RWX JIT
3007 */
3008 #else
3009 /* non-macOS: reject JIT regions without RWX */
3010 DTRACE_VM3(cs_wx,
3011 uint64_t, 0,
3012 uint64_t, 0,
3013 vm_prot_t, cur_protection);
3014 printf("CODE SIGNING: %d[%s] %s(%d): JIT requires RWX: failing. \n",
3015 proc_selfpid(),
3016 (get_bsdtask_info(current_task())
3017 ? proc_name_address(get_bsdtask_info(current_task()))
3018 : "?"),
3019 __FUNCTION__,
3020 cur_protection);
3021 return KERN_PROTECTION_FAILURE;
3022 #endif
3023 }
3024
3025 /*
3026 * If the task has requested executable lockdown,
3027 * deny any new executable mapping.
3028 */
3029 if (map->map_disallow_new_exec == TRUE) {
3030 if (cur_protection & VM_PROT_EXECUTE) {
3031 return KERN_PROTECTION_FAILURE;
3032 }
3033 }
3034
3035 if (resilient_codesign) {
3036 assert(!is_submap);
3037 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3038 if ((cur_protection | max_protection) & reject_prot) {
3039 return KERN_PROTECTION_FAILURE;
3040 }
3041 }
3042
3043 if (resilient_media) {
3044 assert(!is_submap);
3045 // assert(!needs_copy);
3046 if (object != VM_OBJECT_NULL &&
3047 !object->internal) {
3048 /*
3049 * This mapping is directly backed by an external
3050 * memory manager (e.g. a vnode pager for a file):
3051 * we would not have any safe place to inject
3052 * a zero-filled page if an actual page is not
3053 * available, without possibly impacting the actual
3054 * contents of the mapped object (e.g. the file),
3055 * so we can't provide any media resiliency here.
3056 */
3057 return KERN_INVALID_ARGUMENT;
3058 }
3059 }
3060
3061 if (entry_for_tpro) {
3062 /*
3063 * TPRO overrides the effective permissions of the region
3064 * and explicitly maps as RW. Ensure we have been passed
3065 * the expected permissions. We accept `cur_protections`
3066 * RO as that will be handled on fault.
3067 */
3068 if (!(max_protection & VM_PROT_READ) ||
3069 !(max_protection & VM_PROT_WRITE) ||
3070 !(cur_protection & VM_PROT_READ)) {
3071 return KERN_PROTECTION_FAILURE;
3072 }
3073
3074 /*
3075 * We can now downgrade the cur_protection to RO. This is a mild lie
3076 * to the VM layer. But TPRO will be responsible for toggling the
3077 * protections between RO/RW
3078 */
3079 cur_protection = VM_PROT_READ;
3080 }
3081
3082 if (is_submap) {
3083 vm_map_t submap;
3084 if (purgable) {
3085 /* submaps can not be purgeable */
3086 return KERN_INVALID_ARGUMENT;
3087 }
3088 if (object == VM_OBJECT_NULL) {
3089 /* submaps can not be created lazily */
3090 return KERN_INVALID_ARGUMENT;
3091 }
3092 submap = (vm_map_t) object;
3093 if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
3094 /* page size mismatch */
3095 return KERN_INVALID_ARGUMENT;
3096 }
3097 }
3098 if (vmk_flags.vmkf_already) {
3099 /*
3100 * VM_FLAGS_ALREADY says that it's OK if the same mapping
3101 * is already present. For it to be meaningful, the requested
3102 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
3103 * we shouldn't try and remove what was mapped there first
3104 * (!VM_FLAGS_OVERWRITE).
3105 */
3106 if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
3107 return KERN_INVALID_ARGUMENT;
3108 }
3109 }
3110
3111 if (size == 0 ||
3112 (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
3113 *address = 0;
3114 return KERN_INVALID_ARGUMENT;
3115 }
3116
3117 if (map->pmap == kernel_pmap) {
3118 user_alias = VM_KERN_MEMORY_NONE;
3119 } else {
3120 user_alias = alias;
3121 }
3122
3123 if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
3124 chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
3125 }
3126
3127 #define RETURN(value) { result = value; goto BailOut; }
3128
3129 assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
3130 assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
3131 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
3132 assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
3133 assertf(page_aligned(size), "0x%llx", (uint64_t)size);
3134 }
3135
3136 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3137 !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
3138 /*
3139 * In most cases, the caller rounds the size up to the
3140 * map's page size.
3141 * If we get a size that is explicitly not map-aligned here,
3142 * we'll have to respect the caller's wish and mark the
3143 * mapping as "not map-aligned" to avoid tripping the
3144 * map alignment checks later.
3145 */
3146 clear_map_aligned = TRUE;
3147 }
3148 if (!anywhere &&
3149 VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3150 !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
3151 /*
3152 * We've been asked to map at a fixed address and that
3153 * address is not aligned to the map's specific alignment.
3154 * The caller should know what it's doing (i.e. most likely
3155 * mapping some fragmented copy map, transferring memory from
3156 * a VM map with a different alignment), so clear map_aligned
3157 * for this new VM map entry and proceed.
3158 */
3159 clear_map_aligned = TRUE;
3160 }
3161
3162 /*
3163 * Only zero-fill objects are allowed to be purgable.
3164 * LP64todo - limit purgable objects to 32-bits for now
3165 */
3166 if (purgable &&
3167 (offset != 0 ||
3168 (object != VM_OBJECT_NULL &&
3169 (object->vo_size != size ||
3170 object->purgable == VM_PURGABLE_DENY))
3171 #if __LP64__
3172 || size > ANON_MAX_SIZE
3173 #endif
3174 )) {
3175 return KERN_INVALID_ARGUMENT;
3176 }
3177
3178 if (__improbable(!vm_map_is_map_size_valid(
3179 map, size, vmk_flags.vmkf_no_soft_limit))) {
3180 return KERN_NO_SPACE;
3181 }
3182
3183 vm_map_lock(map);
3184 map_locked = TRUE;
3185
3186
3187 if (anywhere) {
3188 result = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
3189 address, &entry);
3190 start = *address;
3191 } else {
3192 start = *address;
3193 result = vm_map_locate_space_fixed(map, start, size, mask,
3194 vmk_flags, &entry, &zap_old_list);
3195 }
3196
3197 end = start + size;
3198
3199 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
3200
3201 /*
3202 * Check if what's already there is what we want.
3203 */
3204 if (result == KERN_MEMORY_PRESENT) {
3205 assert(!anywhere);
3206 if (!(vmk_flags.vmkf_already)) {
3207 RETURN(KERN_NO_SPACE);
3208 }
3209 tmp_start = start;
3210 tmp_offset = offset;
3211 if (entry->vme_start < start) {
3212 tmp_start -= start - entry->vme_start;
3213 tmp_offset -= start - entry->vme_start;
3214 }
3215 for (; entry->vme_start < end;
3216 entry = entry->vme_next) {
3217 /*
3218 * Check if the mapping's attributes
3219 * match the existing map entry.
3220 */
3221 if (entry == vm_map_to_entry(map) ||
3222 entry->vme_start != tmp_start ||
3223 entry->is_sub_map != is_submap ||
3224 VME_OFFSET(entry) != tmp_offset ||
3225 entry->needs_copy != needs_copy ||
3226 entry->protection != cur_protection ||
3227 entry->max_protection != max_protection ||
3228 entry->inheritance != inheritance ||
3229 entry->iokit_acct != iokit_acct ||
3230 VME_ALIAS(entry) != alias) {
3231 /* not the same mapping ! */
3232 RETURN(KERN_NO_SPACE);
3233 }
3234 /*
3235 * Check if the same object is being mapped.
3236 */
3237 if (is_submap) {
3238 if (VME_SUBMAP(entry) !=
3239 (vm_map_t) object) {
3240 /* not the same submap */
3241 RETURN(KERN_NO_SPACE);
3242 }
3243 } else {
3244 if (VME_OBJECT(entry) != object) {
3245 /* not the same VM object... */
3246 vm_object_t obj2;
3247
3248 obj2 = VME_OBJECT(entry);
3249 if ((obj2 == VM_OBJECT_NULL || obj2->internal) &&
3250 (object == VM_OBJECT_NULL || object->internal)) {
3251 /*
3252 * ... but both are
3253 * anonymous memory,
3254 * so equivalent.
3255 */
3256 } else {
3257 RETURN(KERN_NO_SPACE);
3258 }
3259 }
3260 }
3261
3262 tmp_offset += entry->vme_end - entry->vme_start;
3263 tmp_start += entry->vme_end - entry->vme_start;
3264 if (entry->vme_end >= end) {
3265 /* reached the end of our mapping */
3266 break;
3267 }
3268 }
3269 /* it all matches: let's use what's already there ! */
3270 RETURN(KERN_MEMORY_PRESENT);
3271 }
3272
3273 if (result != KERN_SUCCESS) {
3274 goto BailOut;
3275 }
3276
3277
3278 /*
3279 * At this point,
3280 * "start" and "end" should define the endpoints of the
3281 * available new range, and
3282 * "entry" should refer to the region before the new
3283 * range, and
3284 *
3285 * the map should be locked.
3286 */
3287
3288 /*
3289 * See whether we can avoid creating a new entry (and object) by
3290 * extending one of our neighbors. [So far, we only attempt to
3291 * extend from below.] Note that we can never extend/join
3292 * purgable objects because they need to remain distinct
3293 * entities in order to implement their "volatile object"
3294 * semantics.
3295 */
3296
3297 if (purgable ||
3298 entry_for_jit ||
3299 entry_for_tpro ||
3300 vm_memory_malloc_no_cow(user_alias)) {
3301 if (superpage_size) {
3302 /*
3303 * For "super page" allocations, we will allocate
3304 * special physically-contiguous VM objects later on,
3305 * so we should not have flags instructing us to create
3306 * a differently special VM object here.
3307 */
3308 RETURN(KERN_INVALID_ARGUMENT);
3309 }
3310
3311 if (object == VM_OBJECT_NULL) {
3312 assert(!superpage_size);
3313 object = vm_object_allocate(size, map->serial_id);
3314 vm_object_lock(object);
3315 object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3316 VM_OBJECT_SET_TRUE_SHARE(object, FALSE);
3317 if (malloc_no_cow_except_fork &&
3318 !purgable &&
3319 !entry_for_jit &&
3320 !entry_for_tpro &&
3321 vm_memory_malloc_no_cow(user_alias)) {
3322 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY_FORK;
3323 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
3324 }
3325 if (entry_for_jit) {
3326 object->vo_inherit_copy_none = true;
3327 }
3328 if (purgable) {
3329 task_t owner;
3330 VM_OBJECT_SET_PURGABLE(object, VM_PURGABLE_NONVOLATILE);
3331 if (map->pmap == kernel_pmap) {
3332 /*
3333 * Purgeable mappings made in a kernel
3334 * map are "owned" by the kernel itself
3335 * rather than the current user task
3336 * because they're likely to be used by
3337 * more than this user task (see
3338 * execargs_purgeable_allocate(), for
3339 * example).
3340 */
3341 owner = kernel_task;
3342 } else {
3343 owner = current_task();
3344 }
3345 assert(object->vo_owner == NULL);
3346 assert(object->resident_page_count == 0);
3347 assert(object->wired_page_count == 0);
3348 vm_purgeable_nonvolatile_enqueue(object, owner);
3349 }
3350 vm_object_unlock(object);
3351 offset = (vm_object_offset_t)0;
3352 }
3353 } else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
3354 /* no coalescing if address space uses sub-pages */
3355 } else if ((is_submap == FALSE) &&
3356 (object == VM_OBJECT_NULL) &&
3357 (entry != vm_map_to_entry(map)) &&
3358 (entry->vme_end == start) &&
3359 (!entry->is_shared) &&
3360 (!entry->is_sub_map) &&
3361 (!entry->in_transition) &&
3362 (!entry->needs_wakeup) &&
3363 (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
3364 (entry->protection == cur_protection) &&
3365 (entry->max_protection == max_protection) &&
3366 (entry->inheritance == inheritance) &&
3367 ((user_alias == VM_MEMORY_REALLOC) ||
3368 (VME_ALIAS(entry) == alias)) &&
3369 (entry->no_cache == no_cache) &&
3370 (entry->vme_permanent == permanent) &&
3371 /* no coalescing for immutable executable mappings */
3372 !((entry->protection & VM_PROT_EXECUTE) &&
3373 entry->vme_permanent) &&
3374 (!entry->superpage_size && !superpage_size) &&
3375 /*
3376 * No coalescing if not map-aligned, to avoid propagating
3377 * that condition any further than needed:
3378 */
3379 (!entry->map_aligned || !clear_map_aligned) &&
3380 (!entry->zero_wired_pages) &&
3381 (!entry->used_for_jit && !entry_for_jit) &&
3382 #if __arm64e__
3383 (!entry->used_for_tpro && !entry_for_tpro) &&
3384 #endif
3385 (!entry->csm_associated) &&
3386 (entry->iokit_acct == iokit_acct) &&
3387 (!entry->vme_resilient_codesign) &&
3388 (!entry->vme_resilient_media) &&
3389 (!entry->vme_atomic) &&
3390 (entry->vme_no_copy_on_read == no_copy_on_read) &&
3391
3392 ((entry->vme_end - entry->vme_start) + size <=
3393 (user_alias == VM_MEMORY_REALLOC ?
3394 ANON_CHUNK_SIZE :
3395 NO_COALESCE_LIMIT)) &&
3396
3397 (entry->wired_count == 0)) { /* implies user_wired_count == 0 */
3398 if (vm_object_coalesce(VME_OBJECT(entry),
3399 VM_OBJECT_NULL,
3400 VME_OFFSET(entry),
3401 (vm_object_offset_t) 0,
3402 (vm_map_size_t)(entry->vme_end - entry->vme_start),
3403 (vm_map_size_t)(end - entry->vme_end))) {
3404 /*
3405 * Coalesced the two objects - can extend
3406 * the previous map entry to include the
3407 * new range.
3408 */
3409 map->size += (end - entry->vme_end);
3410 assert(entry->vme_start < end);
3411 assert(VM_MAP_PAGE_ALIGNED(end,
3412 VM_MAP_PAGE_MASK(map)));
3413 if (__improbable(vm_debug_events)) {
3414 DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
3415 }
3416 entry->vme_end = end;
3417 if (map->holelistenabled) {
3418 vm_map_store_update_first_free(map, entry, TRUE);
3419 } else {
3420 vm_map_store_update_first_free(map, map->first_free, TRUE);
3421 }
3422 new_mapping_established = TRUE;
3423 RETURN(KERN_SUCCESS);
3424 }
3425 }
3426
3427 step = superpage_size ? SUPERPAGE_SIZE : (end - start);
3428 new_entry = NULL;
3429
3430 if (vmk_flags.vmkf_submap_adjust) {
3431 vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
3432 offset = start;
3433 }
3434
3435 for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3436 tmp2_end = tmp2_start + step;
3437 /*
3438 * Create a new entry
3439 *
3440 * XXX FBDP
3441 * The reserved "page zero" in each process's address space can
3442 * be arbitrarily large. Splitting it into separate objects and
3443 * therefore different VM map entries serves no purpose and just
3444 * slows down operations on the VM map, so let's not split the
3445 * allocation into chunks if the max protection is NONE. That
3446 * memory should never be accessible, so it will never get to the
3447 * default pager.
3448 */
3449 tmp_start = tmp2_start;
3450 if (!is_submap &&
3451 object == VM_OBJECT_NULL &&
3452 size > chunk_size &&
3453 max_protection != VM_PROT_NONE &&
3454 superpage_size == 0) {
3455 tmp_end = tmp_start + chunk_size;
3456 } else {
3457 tmp_end = tmp2_end;
3458 }
3459 do {
3460 if (!is_submap &&
3461 object != VM_OBJECT_NULL &&
3462 object->internal &&
3463 offset + (tmp_end - tmp_start) > object->vo_size) {
3464 // printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3465 DTRACE_VM5(vm_map_enter_overmap,
3466 vm_map_t, map,
3467 vm_map_address_t, tmp_start,
3468 vm_map_address_t, tmp_end,
3469 vm_object_offset_t, offset,
3470 vm_object_size_t, object->vo_size);
3471 }
3472 new_entry = vm_map_entry_insert(map,
3473 entry, tmp_start, tmp_end,
3474 object, offset, vmk_flags,
3475 needs_copy,
3476 cur_protection, max_protection,
3477 (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3478 VM_INHERIT_NONE : inheritance),
3479 clear_map_aligned);
3480
3481 assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias));
3482
3483 if (resilient_codesign) {
3484 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3485 if (!((cur_protection | max_protection) & reject_prot)) {
3486 new_entry->vme_resilient_codesign = TRUE;
3487 }
3488 }
3489
3490 if (resilient_media &&
3491 (object == VM_OBJECT_NULL ||
3492 object->internal)) {
3493 new_entry->vme_resilient_media = TRUE;
3494 }
3495
3496 assert(!new_entry->iokit_acct);
3497 if (!is_submap &&
3498 object != VM_OBJECT_NULL &&
3499 object->internal &&
3500 (object->purgable != VM_PURGABLE_DENY ||
3501 object->vo_ledger_tag)) {
3502 assert(new_entry->use_pmap);
3503 assert(!new_entry->iokit_acct);
3504 /*
3505 * Turn off pmap accounting since
3506 * purgeable (or tagged) objects have their
3507 * own ledgers.
3508 */
3509 new_entry->use_pmap = FALSE;
3510 } else if (!is_submap &&
3511 iokit_acct &&
3512 object != VM_OBJECT_NULL &&
3513 object->internal) {
3514 /* alternate accounting */
3515 assert(!new_entry->iokit_acct);
3516 assert(new_entry->use_pmap);
3517 new_entry->iokit_acct = TRUE;
3518 new_entry->use_pmap = FALSE;
3519 DTRACE_VM4(
3520 vm_map_iokit_mapped_region,
3521 vm_map_t, map,
3522 vm_map_offset_t, new_entry->vme_start,
3523 vm_map_offset_t, new_entry->vme_end,
3524 int, VME_ALIAS(new_entry));
3525 vm_map_iokit_mapped_region(
3526 map,
3527 (new_entry->vme_end -
3528 new_entry->vme_start));
3529 } else if (!is_submap) {
3530 assert(!new_entry->iokit_acct);
3531 assert(new_entry->use_pmap);
3532 }
3533
3534 if (is_submap) {
3535 vm_map_t submap;
3536 boolean_t submap_is_64bit;
3537 boolean_t use_pmap;
3538
3539 assert(new_entry->is_sub_map);
3540 assert(!new_entry->use_pmap);
3541 assert(!new_entry->iokit_acct);
3542 submap = (vm_map_t) object;
3543 submap_is_64bit = vm_map_is_64bit(submap);
3544 use_pmap = vmk_flags.vmkf_nested_pmap;
3545 #ifndef NO_NESTED_PMAP
3546 if (use_pmap && submap->pmap == NULL) {
3547 ledger_t ledger = map->pmap->ledger;
3548 /* we need a sub pmap to nest... */
3549 submap->pmap = pmap_create_options(ledger, 0,
3550 submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3551 if (submap->pmap == NULL) {
3552 /* let's proceed without nesting... */
3553 }
3554 #if defined(__arm64__)
3555 else {
3556 pmap_set_nested(submap->pmap);
3557 }
3558 #endif
3559 }
3560 if (use_pmap && submap->pmap != NULL) {
3561 if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3562 DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3563 kr = KERN_FAILURE;
3564 } else {
3565 kr = pmap_nest(map->pmap,
3566 submap->pmap,
3567 tmp_start,
3568 tmp_end - tmp_start);
3569 }
3570 if (kr != KERN_SUCCESS) {
3571 printf("vm_map_enter: "
3572 "pmap_nest(0x%llx,0x%llx) "
3573 "error 0x%x\n",
3574 (long long)tmp_start,
3575 (long long)tmp_end,
3576 kr);
3577 } else {
3578 /* we're now nested ! */
3579 new_entry->use_pmap = TRUE;
3580 pmap_empty = FALSE;
3581 }
3582 }
3583 #endif /* NO_NESTED_PMAP */
3584 }
3585 entry = new_entry;
3586
3587 if (superpage_size) {
3588 vm_page_t pages, m;
3589 vm_object_t sp_object;
3590 vm_object_offset_t sp_offset;
3591
3592 assert(object == VM_OBJECT_NULL);
3593 VME_OFFSET_SET(entry, 0);
3594
3595 /* allocate one superpage */
3596 kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3597 if (kr != KERN_SUCCESS) {
3598 /* deallocate whole range... */
3599 new_mapping_established = TRUE;
3600 /* ... but only up to "tmp_end" */
3601 size -= end - tmp_end;
3602 RETURN(kr);
3603 }
3604
3605 /* create one vm_object per superpage */
3606 sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start), map->serial_id);
3607 vm_object_lock(sp_object);
3608 sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3609 VM_OBJECT_SET_PHYS_CONTIGUOUS(sp_object, TRUE);
3610 sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3611 VME_OBJECT_SET(entry, sp_object, false, 0);
3612 assert(entry->use_pmap);
3613
3614 /* enter the base pages into the object */
3615 for (sp_offset = 0;
3616 sp_offset < SUPERPAGE_SIZE;
3617 sp_offset += PAGE_SIZE) {
3618 m = pages;
3619 pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3620 pages = NEXT_PAGE(m);
3621 *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3622 vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3623 }
3624 vm_object_unlock(sp_object);
3625 }
3626 } while (tmp_end != tmp2_end &&
3627 (tmp_start = tmp_end) &&
3628 (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3629 tmp_end + chunk_size : tmp2_end));
3630 }
3631
3632 new_mapping_established = TRUE;
3633
3634
3635 BailOut:
3636 assert(map_locked == TRUE);
3637
3638 /*
3639 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3640 * If we have identified and possibly established the new mapping(s),
3641 * make sure we did not go beyond the address space limit.
3642 */
3643 if (result == KERN_SUCCESS) {
3644 if (map->size_limit != RLIM_INFINITY &&
3645 map->size > map->size_limit) {
3646 /*
3647 * Establishing the requested mappings would exceed
3648 * the process's RLIMIT_AS limit: fail with
3649 * KERN_NO_SPACE.
3650 */
3651 result = KERN_NO_SPACE;
3652 printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3653 proc_selfpid(),
3654 (get_bsdtask_info(current_task())
3655 ? proc_name_address(get_bsdtask_info(current_task()))
3656 : "?"),
3657 __FUNCTION__,
3658 (uint64_t) map->size,
3659 (uint64_t) map->size_limit);
3660 DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3661 vm_map_size_t, map->size,
3662 uint64_t, map->size_limit);
3663 vm_map_enter_RLIMIT_AS_count++;
3664 } else if (map->data_limit != RLIM_INFINITY &&
3665 map->size > map->data_limit) {
3666 /*
3667 * Establishing the requested mappings would exceed
3668 * the process's RLIMIT_DATA limit: fail with
3669 * KERN_NO_SPACE.
3670 */
3671 result = KERN_NO_SPACE;
3672 printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3673 proc_selfpid(),
3674 (get_bsdtask_info(current_task())
3675 ? proc_name_address(get_bsdtask_info(current_task()))
3676 : "?"),
3677 __FUNCTION__,
3678 (uint64_t) map->size,
3679 (uint64_t) map->data_limit);
3680 DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3681 vm_map_size_t, map->size,
3682 uint64_t, map->data_limit);
3683 vm_map_enter_RLIMIT_DATA_count++;
3684 }
3685 }
3686
3687 if (result == KERN_SUCCESS) {
3688 vm_prot_t pager_prot;
3689 memory_object_t pager;
3690
3691 #if DEBUG
3692 if (pmap_empty &&
3693 !(vmk_flags.vmkf_no_pmap_check)) {
3694 assert(pmap_is_empty(map->pmap,
3695 *address,
3696 *address + size));
3697 }
3698 #endif /* DEBUG */
3699
3700 /*
3701 * For "named" VM objects, let the pager know that the
3702 * memory object is being mapped. Some pagers need to keep
3703 * track of this, to know when they can reclaim the memory
3704 * object, for example.
3705 * VM calls memory_object_map() for each mapping (specifying
3706 * the protection of each mapping) and calls
3707 * memory_object_last_unmap() when all the mappings are gone.
3708 */
3709 pager_prot = max_protection;
3710 if (needs_copy) {
3711 /*
3712 * Copy-On-Write mapping: won't modify
3713 * the memory object.
3714 */
3715 pager_prot &= ~VM_PROT_WRITE;
3716 }
3717 if (!is_submap &&
3718 object != VM_OBJECT_NULL &&
3719 object->named &&
3720 object->pager != MEMORY_OBJECT_NULL) {
3721 vm_object_lock(object);
3722 pager = object->pager;
3723 if (object->named &&
3724 pager != MEMORY_OBJECT_NULL) {
3725 assert(object->pager_ready);
3726 vm_object_mapping_wait(object, THREAD_UNINT);
3727 /* object might have lost its pager while waiting */
3728 pager = object->pager;
3729 if (object->named && pager != MEMORY_OBJECT_NULL) {
3730 vm_object_mapping_begin(object);
3731 vm_object_unlock(object);
3732
3733 kr = memory_object_map(pager, pager_prot);
3734 assert(kr == KERN_SUCCESS);
3735
3736 vm_object_lock(object);
3737 vm_object_mapping_end(object);
3738 }
3739 }
3740 vm_object_unlock(object);
3741 }
3742 }
3743
3744 assert(map_locked == TRUE);
3745
3746 if (new_mapping_established) {
3747 /*
3748 * If we release the map lock for any reason below,
3749 * another thread could deallocate our new mapping,
3750 * releasing the caller's reference on "caller_object",
3751 * which was transferred to the mapping.
3752 * If this was the only reference, the object could be
3753 * destroyed.
3754 *
3755 * We need to take an extra reference on "caller_object"
3756 * to keep it alive if we need to return the caller's
3757 * reference to the caller in case of failure.
3758 */
3759 if (is_submap) {
3760 vm_map_reference((vm_map_t)caller_object);
3761 } else {
3762 vm_object_reference(caller_object);
3763 }
3764 }
3765
3766 if (!keep_map_locked) {
3767 vm_map_unlock(map);
3768 map_locked = FALSE;
3769 entry = VM_MAP_ENTRY_NULL;
3770 new_entry = VM_MAP_ENTRY_NULL;
3771 }
3772
3773 /*
3774 * We can't hold the map lock if we enter this block.
3775 */
3776
3777 if (result == KERN_SUCCESS) {
3778 /* Wire down the new entry if the user
3779 * requested all new map entries be wired.
3780 */
3781 if ((map->wiring_required) || (superpage_size)) {
3782 assert(!keep_map_locked);
3783 pmap_empty = FALSE; /* pmap won't be empty */
3784 kr = vm_map_wire_nested(map, start, end,
3785 cur_protection, VM_KERN_MEMORY_MLOCK,
3786 TRUE, PMAP_NULL, 0, NULL);
3787 result = kr;
3788 }
3789
3790 }
3791
3792 if (result != KERN_SUCCESS) {
3793 if (new_mapping_established) {
3794 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3795
3796 /*
3797 * We have to get rid of the new mappings since we
3798 * won't make them available to the user.
3799 * Try to do that atomically, to minimize the risk
3800 * that someone else creates new mappings in that range.
3801 */
3802 if (!map_locked) {
3803 vm_map_lock(map);
3804 map_locked = TRUE;
3805 }
3806 remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3807 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3808 if (permanent) {
3809 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3810 }
3811 (void) vm_map_delete(map,
3812 *address, *address + size,
3813 remove_flags,
3814 KMEM_GUARD_NONE, &zap_new_list);
3815 }
3816
3817 if (vm_map_zap_first_entry(&zap_old_list)) {
3818 vm_map_entry_t entry1, entry2;
3819
3820 /*
3821 * The new mapping failed. Attempt to restore
3822 * the old mappings, saved in the "zap_old_map".
3823 */
3824 if (!map_locked) {
3825 vm_map_lock(map);
3826 map_locked = TRUE;
3827 }
3828
3829 /* first check if the coast is still clear */
3830 start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3831 end = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3832
3833 if (vm_map_lookup_entry(map, start, &entry1) ||
3834 vm_map_lookup_entry(map, end, &entry2) ||
3835 entry1 != entry2) {
3836 /*
3837 * Part of that range has already been
3838 * re-mapped: we can't restore the old
3839 * mappings...
3840 */
3841 vm_map_enter_restore_failures++;
3842 } else {
3843 /*
3844 * Transfer the saved map entries from
3845 * "zap_old_map" to the original "map",
3846 * inserting them all after "entry1".
3847 */
3848 while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3849 vm_map_size_t entry_size;
3850
3851 entry_size = (entry2->vme_end -
3852 entry2->vme_start);
3853 vm_map_store_entry_link(map, entry1, entry2,
3854 VM_MAP_KERNEL_FLAGS_NONE);
3855 map->size += entry_size;
3856 entry1 = entry2;
3857 }
3858 if (map->wiring_required) {
3859 /*
3860 * XXX TODO: we should rewire the
3861 * old pages here...
3862 */
3863 }
3864 vm_map_enter_restore_successes++;
3865 }
3866 }
3867 }
3868
3869 /*
3870 * The caller is responsible for releasing the lock if it requested to
3871 * keep the map locked.
3872 */
3873 if (map_locked && !keep_map_locked) {
3874 vm_map_unlock(map);
3875 }
3876
3877 vm_map_zap_dispose(&zap_old_list);
3878 vm_map_zap_dispose(&zap_new_list);
3879
3880 if (new_mapping_established) {
3881 /*
3882 * The caller had a reference on "caller_object" and we
3883 * transferred that reference to the mapping.
3884 * We also took an extra reference on "caller_object" to keep
3885 * it alive while the map was unlocked.
3886 */
3887 if (result == KERN_SUCCESS) {
3888 /*
3889 * On success, the caller's reference on the object gets
3890 * transferred to the mapping.
3891 * Release our extra reference.
3892 */
3893 if (is_submap) {
3894 vm_map_deallocate((vm_map_t)caller_object);
3895 } else {
3896 vm_object_deallocate(caller_object);
3897 }
3898 } else {
3899 /*
3900 * On error, the caller expects to still have a
3901 * reference on the object it gave us.
3902 * Let's use our extra reference for that.
3903 */
3904 }
3905 }
3906
3907 return result;
3908
3909 #undef RETURN
3910 }
3911
3912 /*
3913 * Counters for the prefault optimization.
3914 */
3915 int64_t vm_prefault_nb_pages = 0;
3916 int64_t vm_prefault_nb_bailout = 0;
3917
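/*
 * vm_map_enter_adjust_offset: [ internal use only ]
 *
 * Shift both ends of an object offset window by "quantity".
 * Fails with KERN_INVALID_ARGUMENT if either addition overflows
 * or if the adjusted end rounds up to 0.
 */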
3918 static kern_return_t
3919 vm_map_enter_adjust_offset(
3920 vm_object_offset_t *obj_offs,
3921 vm_object_offset_t *obj_end,
3922 vm_object_offset_t quantity)
3923 {
3924 if (os_add_overflow(*obj_offs, quantity, obj_offs) ||
3925 os_add_overflow(*obj_end, quantity, obj_end) ||
3926 vm_map_round_page_mask(*obj_end, PAGE_MASK) == 0) {
3927 return KERN_INVALID_ARGUMENT;
3928 }
3929
3930 return KERN_SUCCESS;
3931 }
3932
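/*
 * vm_map_enter_mem_object_sanitize: [ internal use only ]
 *
 * Validate and unwrap the caller-provided address, size, mask,
 * offset, protection and inheritance values before they are used
 * by vm_map_enter_mem_object(). The sanitized values are returned
 * through the out parameters.
 */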
3933 static __attribute__((always_inline, warn_unused_result))
3934 kern_return_t
3935 vm_map_enter_mem_object_sanitize(
3936 vm_map_t target_map,
3937 vm_map_offset_ut address_u,
3938 vm_map_size_ut initial_size_u,
3939 vm_map_offset_ut mask_u,
3940 vm_object_offset_ut offset_u,
3941 vm_prot_ut cur_protection_u,
3942 vm_prot_ut max_protection_u,
3943 vm_inherit_ut inheritance_u,
3944 vm_map_kernel_flags_t vmk_flags,
3945 ipc_port_t port,
3946 vm_map_address_t *map_addr,
3947 vm_map_size_t *map_size,
3948 vm_map_offset_t *mask,
3949 vm_object_offset_t *obj_offs,
3950 vm_object_offset_t *obj_end,
3951 vm_object_size_t *obj_size,
3952 vm_prot_t *cur_protection,
3953 vm_prot_t *max_protection,
3954 vm_inherit_t *inheritance)
3955 {
3956 kern_return_t result;
3957
3958 result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
3959 VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
3960 VM_PROT_IS_MASK, cur_protection,
3961 max_protection);
3962 if (__improbable(result != KERN_SUCCESS)) {
3963 return result;
3964 }
3965
3966 result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3967 inheritance);
3968 if (__improbable(result != KERN_SUCCESS)) {
3969 return result;
3970 }
3971
3972 result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ, mask);
3973 if (__improbable(result != KERN_SUCCESS)) {
3974 return result;
3975 }
3976
3977 if (vmk_flags.vmf_fixed) {
3978 vm_map_address_t map_end;
3979
3980 result = vm_sanitize_addr_size(address_u, initial_size_u,
3981 VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3982 target_map,
3983 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS | VM_SANITIZE_FLAGS_REALIGN_START,
3984 map_addr, &map_end, map_size);
3985 if (__improbable(result != KERN_SUCCESS)) {
3986 return result;
3987 }
3988 } else {
3989 *map_addr = vm_sanitize_addr(target_map, address_u);
3990 result = vm_sanitize_size(0, initial_size_u,
3991 VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
3992 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
3993 if (__improbable(result != KERN_SUCCESS)) {
3994 return result;
3995 }
3996 }
3997
3998 *obj_size = vm_object_round_page(*map_size);
3999 if (__improbable(*obj_size == 0)) {
4000 return KERN_INVALID_ARGUMENT;
4001 }
4002
4003 if (IP_VALID(port)) {
4004 result = vm_sanitize_addr_size(offset_u, *obj_size,
4005 VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
4006 PAGE_MASK,
4007 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
4008 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
4009 obj_offs, obj_end, obj_size);
4010 if (__improbable(result != KERN_SUCCESS)) {
4011 return result;
4012 }
4013 } else {
4014 *obj_offs = 0;
4015 *obj_end = *obj_size;
4016 }
4017
4018 return KERN_SUCCESS;
4019 }
4020
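/*
 * vm_map_enter_mem_object:
 *
 * Map the memory referenced by "port" into "target_map". The port
 * may be a named entry (backed by a submap, a vm_map_copy or a VM
 * object), a raw memory object, or invalid (anonymous memory).
 * The pager is notified of the new mapping, a copy is performed if
 * requested, and the pages in "page_list" (if any) are prefaulted.
 */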
4021 kern_return_t
4022 vm_map_enter_mem_object(
4023 vm_map_t target_map,
4024 vm_map_offset_ut *address_u,
4025 vm_map_size_ut initial_size_u,
4026 vm_map_offset_ut mask_u,
4027 vm_map_kernel_flags_t vmk_flags,
4028 ipc_port_t port,
4029 vm_object_offset_ut offset_u,
4030 boolean_t copy,
4031 vm_prot_ut cur_protection_u,
4032 vm_prot_ut max_protection_u,
4033 vm_inherit_ut inheritance_u,
4034 upl_page_list_ptr_t page_list,
4035 unsigned int page_list_count)
4036 {
4037 vm_map_offset_t mask;
4038 vm_prot_t cur_protection;
4039 vm_prot_t max_protection;
4040 vm_inherit_t inheritance;
4041 vm_map_address_t map_addr, map_mask;
4042 vm_map_size_t map_size;
4043 vm_object_t object = VM_OBJECT_NULL;
4044 vm_object_offset_t obj_offs, obj_end;
4045 vm_object_size_t obj_size;
4046 kern_return_t result;
4047 boolean_t mask_cur_protection, mask_max_protection;
4048 boolean_t kernel_prefault, try_prefault = (page_list_count != 0);
4049 vm_map_offset_t offset_in_mapping = 0;
4050
4051 if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4052 /* XXX TODO4K prefaulting depends on page size... */
4053 try_prefault = FALSE;
4054 }
4055
4056 /*
4057 * Check arguments for validity
4058 */
4059 if ((target_map == VM_MAP_NULL) ||
4060 (try_prefault && (copy || !page_list))) {
4061 return KERN_INVALID_ARGUMENT;
4062 }
4063
4064 map_mask = vm_map_page_mask(target_map);
4065
4066 /*
4067 * Sanitize any input parameters that are addr/size/prot/inherit
4068 */
4069 result = vm_map_enter_mem_object_sanitize(
4070 target_map,
4071 *address_u,
4072 initial_size_u,
4073 mask_u,
4074 offset_u,
4075 cur_protection_u,
4076 max_protection_u,
4077 inheritance_u,
4078 vmk_flags,
4079 port,
4080 &map_addr,
4081 &map_size,
4082 &mask,
4083 &obj_offs,
4084 &obj_end,
4085 &obj_size,
4086 &cur_protection,
4087 &max_protection,
4088 &inheritance);
4089 if (__improbable(result != KERN_SUCCESS)) {
4090 return vm_sanitize_get_kr(result);
4091 }
4092
4093 assertf(vmk_flags.__vmkf_unused2 == 0, "vmk_flags unused2=0x%llx\n", vmk_flags.__vmkf_unused2);
4094 vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, map_size);
4095
4096 mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4097 mask_max_protection = max_protection & VM_PROT_IS_MASK;
4098 cur_protection &= ~VM_PROT_IS_MASK;
4099 max_protection &= ~VM_PROT_IS_MASK;
4100
4101 #if __arm64__
4102 if (cur_protection & VM_PROT_EXECUTE) {
4103 cur_protection |= VM_PROT_READ;
4104 }
4105 #endif /* __arm64__ */
4106
4107 /*
4108 * Find the vm object (if any) corresponding to this port.
4109 */
4110 if (!IP_VALID(port)) {
4111 object = VM_OBJECT_NULL;
4112 copy = FALSE;
4113 } else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4114 vm_named_entry_t named_entry;
4115 vm_object_size_t initial_size;
4116
4117 named_entry = mach_memory_entry_from_port(port);
4118
4119 if (vmk_flags.vmf_return_data_addr ||
4120 vmk_flags.vmf_return_4k_data_addr) {
4121 result = vm_map_enter_adjust_offset(&obj_offs,
4122 &obj_end, named_entry->data_offset);
4123 if (__improbable(result)) {
4124 return result;
4125 }
4126 }
4127
4128 /* a few checks to make sure user is obeying rules */
4129 if (mask_max_protection) {
4130 max_protection &= named_entry->protection;
4131 }
4132 if (mask_cur_protection) {
4133 cur_protection &= named_entry->protection;
4134 }
4135 if ((named_entry->protection & max_protection) !=
4136 max_protection) {
4137 return KERN_INVALID_RIGHT;
4138 }
4139 if ((named_entry->protection & cur_protection) !=
4140 cur_protection) {
4141 return KERN_INVALID_RIGHT;
4142 }
4143
4144 /*
4145 * unwrap is safe because we know obj_size is larger and doesn't
4146 * overflow
4147 */
4148 initial_size = VM_SANITIZE_UNSAFE_UNWRAP(initial_size_u);
4149 if (named_entry->size < obj_offs + initial_size) {
4150 return KERN_INVALID_ARGUMENT;
4151 }
4152
4153 /* for a vm_map_copy, we can only map it whole */
4154 if (named_entry->is_copy &&
4155 (obj_size != named_entry->size) &&
4156 (vm_map_round_page(obj_size, map_mask) == named_entry->size)) {
4157 /* XXX FBDP use the rounded size... */
4158 obj_end += named_entry->size - obj_size;
4159 obj_size = named_entry->size;
4160 }
4161
4162 if (named_entry->offset) {
4163 /*
4164 * the caller's "offset" parameter is relative to the start of
4165 * the named entry, which has its own offset within the object
4166 *
4167 * Because we checked above that
4168 * obj_offs + obj_size < named_entry_size
4169 * these overflow checks should be redundant...
4170 */
4171 result = vm_map_enter_adjust_offset(&obj_offs,
4172 &obj_end, named_entry->offset);
4173 if (__improbable(result)) {
4174 return result;
4175 }
4176 }
4177
4178 if (!VM_MAP_PAGE_ALIGNED(obj_size, map_mask)) {
4179 /*
4180 * Let's not map more than requested;
4181 * vm_map_enter() will handle this "not map-aligned"
4182 * case.
4183 */
4184 map_size = obj_size;
4185 }
4186
4187 named_entry_lock(named_entry);
4188
4189 // rdar://130307561 (Combine copy, object, and submap fields of vm_named_entry into an enum)
4190 assert(named_entry->is_copy || named_entry->is_object || named_entry->is_sub_map);
4191
4192 if (named_entry->is_sub_map) {
4193 vm_map_t submap;
4194
4195 assert(!named_entry->is_copy);
4196 assert(!named_entry->is_object);
4197
4198 if (vmk_flags.vmf_return_data_addr ||
4199 vmk_flags.vmf_return_4k_data_addr) {
4200 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4201 }
4202
4203 submap = named_entry->backing.map;
4204 vm_map_reference(submap);
4205 named_entry_unlock(named_entry);
4206
4207 vmk_flags.vmkf_submap = TRUE;
4208 result = vm_map_enter(target_map,
4209 &map_addr,
4210 map_size,
4211 mask,
4212 vmk_flags,
4213 (vm_object_t)(uintptr_t) submap,
4214 obj_offs,
4215 copy,
4216 cur_protection,
4217 max_protection,
4218 inheritance);
4219 if (result != KERN_SUCCESS) {
4220 vm_map_deallocate(submap);
4221 return result;
4222 }
4223 /*
4224 * No need to lock "submap" just to check its
4225 * "mapped" flag: that flag is never reset
4226 * once it's been set and if we race, we'll
4227 * just end up setting it twice, which is OK.
4228 */
4229 if (submap->mapped_in_other_pmaps == FALSE &&
4230 vm_map_pmap(submap) != PMAP_NULL &&
4231 vm_map_pmap(submap) !=
4232 vm_map_pmap(target_map)) {
4233 /*
4234 * This submap is being mapped in a map
4235 * that uses a different pmap.
4236 * Set its "mapped_in_other_pmaps" flag
4237 * to indicate that we now need to
4238 * remove mappings from all pmaps rather
4239 * than just the submap's pmap.
4240 */
4241 vm_map_lock(submap);
4242 submap->mapped_in_other_pmaps = TRUE;
4243 vm_map_unlock(submap);
4244 }
4245 goto out;
4246 }
4247
4248 if (named_entry->is_copy) {
4249 kern_return_t kr;
4250 vm_map_copy_t copy_map;
4251 vm_map_entry_t copy_entry;
4252 vm_map_offset_t copy_addr;
4253 vm_map_copy_t target_copy_map;
4254 vm_map_offset_t overmap_start, overmap_end;
4255 vm_map_offset_t trimmed_start;
4256 vm_map_size_t target_size;
4257
4258 assert(!named_entry->is_object);
4259 assert(!named_entry->is_sub_map);
4260
4261 int allowed_flags = VM_FLAGS_FIXED |
4262 VM_FLAGS_ANYWHERE |
4263 VM_FLAGS_OVERWRITE |
4264 VM_FLAGS_RETURN_4K_DATA_ADDR |
4265 VM_FLAGS_RETURN_DATA_ADDR;
4266
4267 if (!vm_map_kernel_flags_check_vmflags(vmk_flags, allowed_flags)) {
4268 named_entry_unlock(named_entry);
4269 return KERN_INVALID_ARGUMENT;
4270 }
4271
4272 copy_map = named_entry->backing.copy;
4273 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4274 if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4275 /* unsupported type; should not happen */
4276 printf("vm_map_enter_mem_object: "
4277 "memory_entry->backing.copy "
4278 "unsupported type 0x%x\n",
4279 copy_map->type);
4280 named_entry_unlock(named_entry);
4281 return KERN_INVALID_ARGUMENT;
4282 }
4283
4284 if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4285 DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, obj_offs, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4286 }
4287
4288 if (vmk_flags.vmf_return_data_addr ||
4289 vmk_flags.vmf_return_4k_data_addr) {
4290 offset_in_mapping = obj_offs & map_mask;
4291 if (vmk_flags.vmf_return_4k_data_addr) {
4292 offset_in_mapping &= ~((signed)(0xFFF));
4293 }
4294 }
4295
4296 target_copy_map = VM_MAP_COPY_NULL;
4297 target_size = copy_map->size;
4298 overmap_start = 0;
4299 overmap_end = 0;
4300 trimmed_start = 0;
4301 if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4302 DEBUG4K_ADJUST("adjusting...\n");
4303 kr = vm_map_copy_adjust_to_target(
4304 copy_map,
4305 obj_offs,
4306 initial_size,
4307 target_map,
4308 copy,
4309 &target_copy_map,
4310 &overmap_start,
4311 &overmap_end,
4312 &trimmed_start);
4313 if (kr != KERN_SUCCESS) {
4314 named_entry_unlock(named_entry);
4315 return kr;
4316 }
4317 target_size = target_copy_map->size;
4318 } else {
4319 /*
4320 * Assert that the vm_map_copy is coming from the right
4321 * zone and hasn't been forged
4322 */
4323 vm_map_copy_require(copy_map);
4324 target_copy_map = copy_map;
4325 }
4326
4327 vm_map_kernel_flags_t rsv_flags = vmk_flags;
4328
4329 vm_map_kernel_flags_and_vmflags(&rsv_flags,
4330 (VM_FLAGS_FIXED |
4331 VM_FLAGS_ANYWHERE |
4332 VM_FLAGS_OVERWRITE |
4333 VM_FLAGS_RETURN_4K_DATA_ADDR |
4334 VM_FLAGS_RETURN_DATA_ADDR));
4335
4336 /* reserve a contiguous range */
4337 kr = vm_map_enter(target_map,
4338 &map_addr,
4339 vm_map_round_page(target_size, map_mask),
4340 mask,
4341 rsv_flags,
4342 VM_OBJECT_NULL,
4343 0,
4344 FALSE, /* copy */
4345 cur_protection,
4346 max_protection,
4347 inheritance);
4348 if (kr != KERN_SUCCESS) {
4349 DEBUG4K_ERROR("kr 0x%x\n", kr);
4350 if (target_copy_map != copy_map) {
4351 vm_map_copy_discard(target_copy_map);
4352 target_copy_map = VM_MAP_COPY_NULL;
4353 }
4354 named_entry_unlock(named_entry);
4355 return kr;
4356 }
4357
4358 copy_addr = map_addr;
4359
4360 for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4361 copy_entry != vm_map_copy_to_entry(target_copy_map);
4362 copy_entry = copy_entry->vme_next) {
4363 vm_map_t copy_submap = VM_MAP_NULL;
4364 vm_object_t copy_object = VM_OBJECT_NULL;
4365 vm_map_size_t copy_size;
4366 vm_object_offset_t copy_offset;
4367 boolean_t do_copy = false;
4368
4369 if (copy_entry->is_sub_map) {
4370 copy_submap = VME_SUBMAP(copy_entry);
4371 copy_object = (vm_object_t)copy_submap;
4372 } else {
4373 copy_object = VME_OBJECT(copy_entry);
4374 }
4375 copy_offset = VME_OFFSET(copy_entry);
4376 copy_size = (copy_entry->vme_end -
4377 copy_entry->vme_start);
4378
4379 /* sanity check */
4380 if ((copy_addr + copy_size) >
4381 (map_addr +
4382 overmap_start + overmap_end +
4383 named_entry->size /* XXX full size */)) {
4384 /* over-mapping too much !? */
4385 kr = KERN_INVALID_ARGUMENT;
4386 DEBUG4K_ERROR("kr 0x%x\n", kr);
4387 /* abort */
4388 break;
4389 }
4390
4391 /* take a reference on the object */
4392 if (copy_entry->is_sub_map) {
4393 vm_map_reference(copy_submap);
4394 } else {
4395 if (!copy &&
4396 copy_object != VM_OBJECT_NULL &&
4397 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4398 bool is_writable;
4399
4400 /*
4401 * We need to resolve our side of this
4402 * "symmetric" copy-on-write now; we
4403 * need a new object to map and share,
4404 * instead of the current one which
4405 * might still be shared with the
4406 * original mapping.
4407 *
4408 * Note: A "vm_map_copy_t" does not
4409 * have a lock but we're protected by
4410 * the named entry's lock here.
4411 */
4412 // assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4413 VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4414 assert(copy_object != VME_OBJECT(copy_entry));
4415 is_writable = false;
4416 if (copy_entry->protection & VM_PROT_WRITE) {
4417 is_writable = true;
4418 #if __arm64e__
4419 } else if (copy_entry->used_for_tpro) {
4420 is_writable = true;
4421 #endif /* __arm64e__ */
4422 }
4423 if (!copy_entry->needs_copy && is_writable) {
4424 vm_prot_t prot;
4425
4426 prot = copy_entry->protection & ~VM_PROT_WRITE;
4427 vm_object_pmap_protect(copy_object,
4428 copy_offset,
4429 copy_size,
4430 PMAP_NULL,
4431 PAGE_SIZE,
4432 0,
4433 prot);
4434 }
4435 copy_entry->needs_copy = FALSE;
4436 copy_entry->is_shared = TRUE;
4437 copy_object = VME_OBJECT(copy_entry);
4438 copy_offset = VME_OFFSET(copy_entry);
4439 vm_object_lock(copy_object);
4440 /* we're about to make a shared mapping of this object */
4441 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4442 VM_OBJECT_SET_TRUE_SHARE(copy_object, TRUE);
4443 vm_object_unlock(copy_object);
4444 }
4445
4446 if (copy_object != VM_OBJECT_NULL &&
4447 copy_object->named &&
4448 copy_object->pager != MEMORY_OBJECT_NULL &&
4449 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4450 memory_object_t pager;
4451 vm_prot_t pager_prot;
4452
4453 /*
4454 * For "named" VM objects, let the pager know that the
4455 * memory object is being mapped. Some pagers need to keep
4456 * track of this, to know when they can reclaim the memory
4457 * object, for example.
4458 * VM calls memory_object_map() for each mapping (specifying
4459 * the protection of each mapping) and calls
4460 * memory_object_last_unmap() when all the mappings are gone.
4461 */
4462 pager_prot = max_protection;
4463 if (copy) {
4464 /*
4465 * Copy-On-Write mapping: won't modify the
4466 * memory object.
4467 */
4468 pager_prot &= ~VM_PROT_WRITE;
4469 }
4470 vm_object_lock(copy_object);
4471 pager = copy_object->pager;
4472 if (copy_object->named &&
4473 pager != MEMORY_OBJECT_NULL &&
4474 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4475 assert(copy_object->pager_ready);
4476 vm_object_mapping_wait(copy_object, THREAD_UNINT);
4477 /*
4478 * Object might have lost its pager
4479 * while waiting.
4480 */
4481 pager = copy_object->pager;
4482 if (copy_object->named &&
4483 pager != MEMORY_OBJECT_NULL) {
4484 vm_object_mapping_begin(copy_object);
4485 vm_object_unlock(copy_object);
4486
4487 kr = memory_object_map(pager, pager_prot);
4488 assert(kr == KERN_SUCCESS);
4489
4490 vm_object_lock(copy_object);
4491 vm_object_mapping_end(copy_object);
4492 }
4493 }
4494 vm_object_unlock(copy_object);
4495 }
4496
4497 /*
4498 * Perform the copy if requested
4499 */
4500
4501 if (copy && copy_object != VM_OBJECT_NULL) {
4502 vm_object_t new_object;
4503 vm_object_offset_t new_offset;
4504
4505 result = vm_object_copy_strategically(copy_object, copy_offset,
4506 copy_size,
4507 false, /* forking */
4508 &new_object, &new_offset,
4509 &do_copy);
4510
4511
4512 if (result == KERN_MEMORY_RESTART_COPY) {
4513 boolean_t success;
4514 boolean_t src_needs_copy;
4515
4516 /*
4517 * XXX
4518 * We currently ignore src_needs_copy.
4519 * This really is the issue of how to make
4520 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4521 * non-kernel users to use. Solution forthcoming.
4522 * In the meantime, since we don't allow non-kernel
4523 * memory managers to specify symmetric copy,
4524 * we won't run into problems here.
4525 */
4526 new_object = copy_object;
4527 new_offset = copy_offset;
4528 success = vm_object_copy_quickly(new_object,
4529 new_offset,
4530 copy_size,
4531 &src_needs_copy,
4532 &do_copy);
4533 assert(success);
4534 result = KERN_SUCCESS;
4535 }
4536 if (result != KERN_SUCCESS) {
4537 kr = result;
4538 break;
4539 }
4540
4541 copy_object = new_object;
4542 copy_offset = new_offset;
4543 /*
4544 * No extra object reference for the mapping:
4545 * the mapping should be the only thing keeping
4546 * this new object alive.
4547 */
4548 } else {
4549 /*
4550 * We already have the right object
4551 * to map.
4552 */
4553 copy_object = VME_OBJECT(copy_entry);
4554 /* take an extra ref for the mapping below */
4555 vm_object_reference(copy_object);
4556 }
4557 }
4558
4559 /*
4560 * If the caller does not want a specific
4561 * tag for this new mapping: use
4562 * the tag of the original mapping.
4563 */
4564 vm_map_kernel_flags_t vmk_remap_flags = {
4565 .vmkf_submap = copy_entry->is_sub_map,
4566 };
4567
4568 vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
4569 vm_map_kernel_flags_vmflags(vmk_flags),
4570 vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4571
4572 /* over-map the object into destination */
4573 vmk_remap_flags.vmf_fixed = true;
4574 vmk_remap_flags.vmf_overwrite = true;
4575
4576 if (!copy && !copy_entry->is_sub_map) {
4577 /*
4578 * copy-on-write should have been
4579 * resolved at this point, or we would
4580 * end up sharing instead of copying.
4581 */
4582 assert(!copy_entry->needs_copy);
4583 }
4584 #if XNU_TARGET_OS_OSX
4585 if (copy_entry->used_for_jit) {
4586 vmk_remap_flags.vmkf_map_jit = TRUE;
4587 }
4588 #endif /* XNU_TARGET_OS_OSX */
4589
4590 kr = vm_map_enter(target_map,
4591 &copy_addr,
4592 copy_size,
4593 (vm_map_offset_t) 0,
4594 vmk_remap_flags,
4595 copy_object,
4596 copy_offset,
4597 ((copy_object == NULL)
4598 ? FALSE
4599 : (copy || copy_entry->needs_copy)),
4600 cur_protection,
4601 max_protection,
4602 inheritance);
4603 if (kr != KERN_SUCCESS) {
4604 DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4605 if (copy_entry->is_sub_map) {
4606 vm_map_deallocate(copy_submap);
4607 } else {
4608 vm_object_deallocate(copy_object);
4609 }
4610 /* abort */
4611 break;
4612 }
4613
4614 /* next mapping */
4615 copy_addr += copy_size;
4616 }
4617
4618 named_entry_unlock(named_entry);
4619 if (target_copy_map != copy_map) {
4620 vm_map_copy_discard(target_copy_map);
4621 target_copy_map = VM_MAP_COPY_NULL;
4622 }
4623
4624 if (kr == KERN_SUCCESS) {
4625 if (overmap_start) {
4626 DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t)offset_in_mapping, (uint64_t)overmap_start, (uint64_t)(map_addr + offset_in_mapping + overmap_start));
4627 }
4628 offset_in_mapping += overmap_start;
4629 } else if (!vmk_flags.vmf_overwrite) {
4630 /* deallocate the contiguous range */
4631 vm_map_remove(target_map, map_addr,
4632 map_addr + map_size);
4633 }
4634 result = kr;
4635 goto out;
4636 }
4637
4638 if (named_entry->is_object) {
4639 unsigned int access;
4640 unsigned int wimg_mode;
4641
4642 assert(!named_entry->is_copy);
4643 assert(!named_entry->is_sub_map);
4644
4645 /* we are mapping a VM object */
4646
4647 access = named_entry->access;
4648
4649 if (vmk_flags.vmf_return_data_addr ||
4650 vmk_flags.vmf_return_4k_data_addr) {
4651 offset_in_mapping = obj_offs & map_mask;
4652 if (vmk_flags.vmf_return_4k_data_addr) {
4653 offset_in_mapping &= ~((signed)(0xFFF));
4654 }
4655 obj_offs -= offset_in_mapping;
4656 map_size = vm_map_round_page(initial_size +
4657 offset_in_mapping, map_mask);
4658 }
4659
4660 object = vm_named_entry_to_vm_object(named_entry);
4661 assert(object != VM_OBJECT_NULL);
4662 vm_object_lock(object);
4663 named_entry_unlock(named_entry);
4664
4665 wimg_mode = object->wimg_bits;
4666 vm_prot_to_wimg(access, &wimg_mode);
4667 if (object->wimg_bits != wimg_mode) {
4668 vm_object_change_wimg_mode(object, wimg_mode);
4669 }
4670
4671 vm_object_reference_locked(object);
4672 vm_object_unlock(object);
4673 } else {
4674 panic("invalid VM named entry %p", named_entry);
4675 }
4676 } else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4677 /*
4678 * JMM - This is temporary until we unify named entries
4679 * and raw memory objects.
4680 *
4681 * Detected fake ip_kotype for a memory object. In
4682 * this case, the port isn't really a port at all, but
4683 * instead is just a raw memory object.
4684 */
4685 if (vmk_flags.vmf_return_data_addr ||
4686 vmk_flags.vmf_return_4k_data_addr) {
4687 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4688 }
4689
4690 object = memory_object_to_vm_object((memory_object_t)port);
4691 if (object == VM_OBJECT_NULL) {
4692 return KERN_INVALID_OBJECT;
4693 }
4694 vm_object_reference(object);
4695
4696 /* wait for object (if any) to be ready */
4697 if (object != VM_OBJECT_NULL) {
4698 if (is_kernel_object(object)) {
4699 printf("Warning: Attempt to map kernel object"
4700 " by a non-private kernel entity\n");
4701 return KERN_INVALID_OBJECT;
4702 }
4703 if (!object->pager_ready) {
4704 vm_object_lock(object);
4705
4706 while (!object->pager_ready) {
4707 vm_object_sleep(object,
4708 VM_OBJECT_EVENT_PAGER_READY,
4709 THREAD_UNINT,
4710 LCK_SLEEP_EXCLUSIVE);
4711 }
4712 vm_object_unlock(object);
4713 }
4714 }
4715 } else {
4716 return KERN_INVALID_OBJECT;
4717 }
4718
4719 if (object != VM_OBJECT_NULL &&
4720 object->named &&
4721 object->pager != MEMORY_OBJECT_NULL &&
4722 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4723 memory_object_t pager;
4724 vm_prot_t pager_prot;
4725 kern_return_t kr;
4726
4727 /*
4728 * For "named" VM objects, let the pager know that the
4729 * memory object is being mapped. Some pagers need to keep
4730 * track of this, to know when they can reclaim the memory
4731 * object, for example.
4732 * VM calls memory_object_map() for each mapping (specifying
4733 * the protection of each mapping) and calls
4734 * memory_object_last_unmap() when all the mappings are gone.
4735 */
4736 pager_prot = max_protection;
4737 if (copy) {
4738 /*
4739 * Copy-On-Write mapping: won't modify the
4740 * memory object.
4741 */
4742 pager_prot &= ~VM_PROT_WRITE;
4743 }
4744 vm_object_lock(object);
4745 pager = object->pager;
4746 if (object->named &&
4747 pager != MEMORY_OBJECT_NULL &&
4748 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4749 assert(object->pager_ready);
4750 vm_object_mapping_wait(object, THREAD_UNINT);
4751 /* object might have lost its pager while waiting */
4752 pager = object->pager;
4753 if (object->named && pager != MEMORY_OBJECT_NULL) {
4754 vm_object_mapping_begin(object);
4755 vm_object_unlock(object);
4756
4757 kr = memory_object_map(pager, pager_prot);
4758 assert(kr == KERN_SUCCESS);
4759
4760 vm_object_lock(object);
4761 vm_object_mapping_end(object);
4762 }
4763 }
4764 vm_object_unlock(object);
4765 }
4766
4767 /*
4768 * Perform the copy if requested
4769 */
4770
4771 if (copy) {
4772 vm_object_t new_object;
4773 vm_object_offset_t new_offset;
4774
4775 result = vm_object_copy_strategically(object,
4776 obj_offs,
4777 map_size,
4778 false, /* forking */
4779 &new_object, &new_offset,
4780 &copy);
4781
4782
4783 if (result == KERN_MEMORY_RESTART_COPY) {
4784 boolean_t success;
4785 boolean_t src_needs_copy;
4786
4787 /*
4788 * XXX
4789 * We currently ignore src_needs_copy.
4790 * This really is the issue of how to make
4791 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4792 * non-kernel users to use. Solution forthcoming.
4793 * In the meantime, since we don't allow non-kernel
4794 * memory managers to specify symmetric copy,
4795 * we won't run into problems here.
4796 */
4797 new_object = object;
4798 new_offset = obj_offs;
4799 success = vm_object_copy_quickly(new_object,
4800 new_offset,
4801 map_size,
4802 &src_needs_copy,
4803 &copy);
4804 assert(success);
4805 result = KERN_SUCCESS;
4806 }
4807 /*
4808 * Throw away the reference to the
4809 * original object, as it won't be mapped.
4810 */
4811
4812 vm_object_deallocate(object);
4813
4814 if (result != KERN_SUCCESS) {
4815 return result;
4816 }
4817
4818 object = new_object;
4819 obj_offs = new_offset;
4820 }
4821
4822 /*
4823 * If non-kernel users want to try to prefault pages, the mapping and the
4824 * prefault need to be atomic.
4825 */
4826 kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4827 vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4828
4829 result = vm_map_enter(target_map,
4830 &map_addr, map_size,
4831 (vm_map_offset_t)mask,
4832 vmk_flags,
4833 object, obj_offs,
4834 copy,
4835 cur_protection, max_protection,
4836 inheritance);
4837 if (result != KERN_SUCCESS) {
4838 vm_object_deallocate(object);
4839 }
4840
4841 /*
4842 * Try to prefault, and do not forget to release the vm map lock.
4843 */
4844 if (result == KERN_SUCCESS && try_prefault) {
4845 mach_vm_address_t va = map_addr;
4846 kern_return_t kr = KERN_SUCCESS;
4847 unsigned int i = 0;
4848 int pmap_options;
4849
4850 pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4851
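/*
 * User-map prefaults use PMAP_OPTIONS_NOWAIT so this loop never
 * blocks; a failed prefault simply ends the optimization. Kernel
 * prefaults are expected to succeed, so their failures are
 * reported through "result".
 */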
4852 for (i = 0; i < page_list_count; ++i) {
4853 if (!UPL_VALID_PAGE(page_list, i)) {
4854 if (kernel_prefault) {
4855 assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4856 result = KERN_MEMORY_ERROR;
4857 break;
4858 }
4859 } else {
4860 /*
4861 * If this call fails, we should stop trying
4862 * to optimize, since the remaining calls are
4863 * likely to fail too.
4864 *
4865 * We do not report an error for such a failure,
4866 * though: prefaulting is an optimization, not
4867 * something critical.
4868 */
4869 kr = pmap_enter_object_options_check(target_map->pmap,
4870 va, 0, object, UPL_PHYS_PAGE(page_list, i),
4871 cur_protection, VM_PROT_NONE,
4872 TRUE, pmap_options);
4873 if (kr != KERN_SUCCESS) {
4874 OSIncrementAtomic64(&vm_prefault_nb_bailout);
4875 if (kernel_prefault) {
4876 result = kr;
4877 }
4878 break;
4879 }
4880 OSIncrementAtomic64(&vm_prefault_nb_pages);
4881 }
4882
4883 /* Next virtual address */
4884 va += PAGE_SIZE;
4885 }
4886 if (vmk_flags.vmkf_keep_map_locked) {
4887 vm_map_unlock(target_map);
4888 }
4889 }
4890
4891 out:
4892 if (result == KERN_SUCCESS) {
4893 #if KASAN
4894 if (target_map->pmap == kernel_pmap) {
4895 kasan_notify_address(map_addr, map_size);
4896 }
4897 #endif
4898 *address_u = vm_sanitize_wrap_addr(map_addr + offset_in_mapping);
4899 }
4900 return result;
4901 }
4902
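/*
 * vm_map_enter_mem_object_prefault:
 *
 * Convenience wrapper around vm_map_enter_mem_object() that requests
 * no copy-on-write and VM_INHERIT_DEFAULT inheritance, and passes a
 * UPL page list so the new mapping can be prefaulted.
 */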
4903 kern_return_t
4904 vm_map_enter_mem_object_prefault(
4905 vm_map_t target_map,
4906 vm_map_offset_ut *address,
4907 vm_map_size_ut initial_size,
4908 vm_map_offset_ut mask,
4909 vm_map_kernel_flags_t vmk_flags,
4910 ipc_port_t port,
4911 vm_object_offset_ut offset,
4912 vm_prot_ut cur_protection,
4913 vm_prot_ut max_protection,
4914 upl_page_list_ptr_t page_list,
4915 unsigned int page_list_count)
4916 {
4917 /* range_id is set by vm_map_enter_mem_object */
4918 return vm_map_enter_mem_object(target_map,
4919 address,
4920 initial_size,
4921 mask,
4922 vmk_flags,
4923 port,
4924 offset,
4925 FALSE,
4926 cur_protection,
4927 max_protection,
4928 VM_INHERIT_DEFAULT,
4929 page_list,
4930 page_list_count);
4931 }
4932
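/*
 * vm_map_enter_mem_object_control_sanitize: [ internal use only ]
 *
 * Same role as vm_map_enter_mem_object_sanitize(), but for the
 * memory_object_control path: validate and unwrap the caller-provided
 * address, size, mask, offset, protection and inheritance values.
 */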
4933 static __attribute__((always_inline, warn_unused_result))
4934 kern_return_t
4935 vm_map_enter_mem_object_control_sanitize(
4936 vm_map_t target_map,
4937 vm_map_offset_ut address_u,
4938 vm_map_size_ut initial_size_u,
4939 vm_map_offset_ut mask_u,
4940 vm_object_offset_ut offset_u,
4941 vm_prot_ut cur_protection_u,
4942 vm_prot_ut max_protection_u,
4943 vm_inherit_ut inheritance_u,
4944 vm_map_kernel_flags_t vmk_flags,
4945 vm_map_address_t *map_addr,
4946 vm_map_size_t *map_size,
4947 vm_map_offset_t *mask,
4948 vm_object_offset_t *obj_offs,
4949 vm_object_offset_t *obj_end,
4950 vm_object_size_t *obj_size,
4951 vm_prot_t *cur_protection,
4952 vm_prot_t *max_protection,
4953 vm_inherit_t *inheritance)
4954 {
4955 kern_return_t kr;
4956
4957 kr = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
4958 VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
4959 cur_protection, max_protection);
4960 if (__improbable(kr != KERN_SUCCESS)) {
4961 return kr;
4962 }
4963
4964 kr = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL,
4965 inheritance);
4966 if (__improbable(kr != KERN_SUCCESS)) {
4967 return kr;
4968 }
4969
4970 kr = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, mask);
4971 if (__improbable(kr != KERN_SUCCESS)) {
4972 return kr;
4973 }
4974 /*
4975 * Ensure arithmetic doesn't overflow in vm_object space (kernel
4976 * pages).
4977 * We keep unaligned values for now. The call we eventually make to
4978 * vm_map_enter does guarantee that offset_u is page aligned for EITHER
4979 * target_map pages or kernel pages. But this isn't enough to guarantee
4980 * kernel space alignment.
4981 */
4982 kr = vm_sanitize_addr_size(offset_u, initial_size_u,
4983 VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, PAGE_MASK,
4984 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
4985 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
4986 obj_offs, obj_end, obj_size);
4987 if (__improbable(kr != KERN_SUCCESS)) {
4988 return kr;
4989 }
4990
4991 /*
4992 * There is no vm_sanitize_addr_size variant that also adjusts for
4993 * a separate offset. Rather than create one for this one-off issue,
4994 * we sanitize map_addr and map_size individually, relying on
4995 * vm_sanitize_size to incorporate the offset. Then, we perform the
4996 * overflow check manually below.
4997 */
4998 *map_addr = vm_sanitize_addr(target_map, address_u);
4999 kr = vm_sanitize_size(offset_u, initial_size_u,
5000 VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
5001 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
5002 if (__improbable(kr != KERN_SUCCESS)) {
5003 return kr;
5004 }
5005
5006 /*
5007 * Ensure arithmetic doesn't overflow in target_map space.
5008 * The computation of map_size above accounts for the possibility that
5009 * offset_u might be unaligned in target_map space.
5010 */
5011 if (vmk_flags.vmf_fixed) {
5012 vm_map_address_t map_end;
5013
5014 if (__improbable(os_add_overflow(*map_addr, *map_size, &map_end))) {
5015 return KERN_INVALID_ARGUMENT;
5016 }
5017 }
5018
5019 return KERN_SUCCESS;
5020 }
5021
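/*
 * vm_map_enter_mem_object_control:
 *
 * Map the VM object associated with "control" into "target_map".
 * This mirrors vm_map_enter_mem_object() without the port/named-entry
 * resolution: notify the pager, perform a copy if requested, then
 * call vm_map_enter().
 */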
5022 kern_return_t
5023 vm_map_enter_mem_object_control(
5024 vm_map_t target_map,
5025 vm_map_offset_ut *address_u,
5026 vm_map_size_ut initial_size_u,
5027 vm_map_offset_ut mask_u,
5028 vm_map_kernel_flags_t vmk_flags,
5029 memory_object_control_t control,
5030 vm_object_offset_ut offset_u,
5031 boolean_t needs_copy,
5032 vm_prot_ut cur_protection_u,
5033 vm_prot_ut max_protection_u,
5034 vm_inherit_ut inheritance_u)
5035 {
5036 vm_map_offset_t mask;
5037 vm_prot_t cur_protection;
5038 vm_prot_t max_protection;
5039 vm_inherit_t inheritance;
5040 vm_map_address_t map_addr;
5041 vm_map_size_t map_size;
5042 vm_object_t object;
5043 vm_object_offset_t obj_offs, obj_end;
5044 vm_object_size_t obj_size;
5045 kern_return_t result;
5046 memory_object_t pager;
5047 vm_prot_t pager_prot;
5048 kern_return_t kr;
5049
5050 /*
5051 * Check arguments for validity
5052 */
5053 if (target_map == VM_MAP_NULL) {
5054 return KERN_INVALID_ARGUMENT;
5055 }
5056
5057 /*
5058 * We only support vmf_return_data_addr-like behavior.
5059 */
5060 vmk_flags.vmf_return_data_addr = true;
5061
5062 /*
5063 * Sanitize any input parameters that are addr/size/prot/inherit
5064 */
5065 kr = vm_map_enter_mem_object_control_sanitize(target_map,
5066 *address_u,
5067 initial_size_u,
5068 mask_u,
5069 offset_u,
5070 cur_protection_u,
5071 max_protection_u,
5072 inheritance_u,
5073 vmk_flags,
5074 &map_addr,
5075 &map_size,
5076 &mask,
5077 &obj_offs,
5078 &obj_end,
5079 &obj_size,
5080 &cur_protection,
5081 &max_protection,
5082 &inheritance);
5083 if (__improbable(kr != KERN_SUCCESS)) {
5084 return vm_sanitize_get_kr(kr);
5085 }
5086
5087 object = memory_object_control_to_vm_object(control);
5088
5089 if (object == VM_OBJECT_NULL) {
5090 return KERN_INVALID_OBJECT;
5091 }
5092
5093 if (is_kernel_object(object)) {
5094 printf("Warning: Attempt to map kernel object"
5095 " by a non-private kernel entity\n");
5096 return KERN_INVALID_OBJECT;
5097 }
5098
5099 vm_object_lock(object);
5100 os_ref_retain_locked_raw(&object->ref_count, &vm_object_refgrp);
5101
5102
5103 /*
5104 * For "named" VM objects, let the pager know that the
5105 * memory object is being mapped. Some pagers need to keep
5106 * track of this, to know when they can reclaim the memory
5107 * object, for example.
5108 * VM calls memory_object_map() for each mapping (specifying
5109 * the protection of each mapping) and calls
5110 * memory_object_last_unmap() when all the mappings are gone.
5111 */
5112 pager_prot = max_protection;
5113 if (needs_copy) {
5114 pager_prot &= ~VM_PROT_WRITE;
5115 }
5116 pager = object->pager;
5117 if (object->named &&
5118 pager != MEMORY_OBJECT_NULL &&
5119 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
5120 assert(object->pager_ready);
5121 vm_object_mapping_wait(object, THREAD_UNINT);
5122 /* object might have lost its pager while waiting */
5123 pager = object->pager;
5124 if (object->named && pager != MEMORY_OBJECT_NULL) {
5125 vm_object_mapping_begin(object);
5126 vm_object_unlock(object);
5127
5128 kr = memory_object_map(pager, pager_prot);
5129 assert(kr == KERN_SUCCESS);
5130
5131 vm_object_lock(object);
5132 vm_object_mapping_end(object);
5133 }
5134 }
5135 vm_object_unlock(object);
5136
5137 /*
5138 * Perform the copy if requested
5139 */
5140
5141 if (needs_copy) {
5142 vm_object_t new_object;
5143 vm_object_offset_t new_offset;
5144
5145 result = vm_object_copy_strategically(object, obj_offs, obj_size,
5146 false, /* forking */
5147 &new_object, &new_offset,
5148 &needs_copy);
5149
5150
5151 if (result == KERN_MEMORY_RESTART_COPY) {
5152 boolean_t success;
5153 boolean_t src_needs_copy;
5154
5155 /*
5156 * XXX
5157 * We currently ignore src_needs_copy.
5158 * This really is the issue of how to make
5159 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
5160 * non-kernel users to use. Solution forthcoming.
5161 * In the meantime, since we don't allow non-kernel
5162 * memory managers to specify symmetric copy,
5163 * we won't run into problems here.
5164 */
5165 new_object = object;
5166 new_offset = obj_offs;
5167 success = vm_object_copy_quickly(new_object,
5168 new_offset, obj_size,
5169 &src_needs_copy,
5170 &needs_copy);
5171 assert(success);
5172 result = KERN_SUCCESS;
5173 }
5174 /*
5175 * Throw away the reference to the
5176 * original object, as it won't be mapped.
5177 */
5178
5179 vm_object_deallocate(object);
5180
5181 if (result != KERN_SUCCESS) {
5182 return result;
5183 }
5184
5185 object = new_object;
5186 obj_offs = new_offset;
5187 }
5188
5189 result = vm_map_enter(target_map,
5190 &map_addr, map_size,
5191 (vm_map_offset_t)mask,
5192 vmk_flags,
5193 object,
5194 obj_offs,
5195 needs_copy,
5196 cur_protection, max_protection,
5197 inheritance);
5198
5199 if (result == KERN_SUCCESS) {
5200 *address_u = vm_sanitize_wrap_addr(
5201 map_addr + (obj_offs & vm_map_page_mask(target_map)));
5202 } else {
5203 vm_object_deallocate(object);
5204 }
5205
5206 return result;
5207 }
5208
5209
5210 /* Not used without nested pmaps */
5211 #ifndef NO_NESTED_PMAP
5212 /*
5213 * Clip and unnest a portion of a nested submap mapping.
5214 */
5215
5216
5217 static void
5218 vm_map_clip_unnest(
5219 vm_map_t map,
5220 vm_map_entry_t entry,
5221 vm_map_offset_t start_unnest,
5222 vm_map_offset_t end_unnest)
5223 {
5224 vm_map_offset_t old_start_unnest = start_unnest;
5225 vm_map_offset_t old_end_unnest = end_unnest;
5226
5227 assert(entry->is_sub_map);
5228 assert(VME_SUBMAP(entry) != NULL);
5229 assert(entry->use_pmap);
5230
5231 /*
5232 * Query the platform for the optimal unnest range.
5233 * DRK: There's some duplication of effort here, since
5234 * callers may have adjusted the range to some extent. This
5235 * routine was introduced to support 1GiB subtree nesting
5236 * for x86 platforms, which can also nest on 2MiB boundaries
5237 * depending on size/alignment.
5238 */
5239 if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
5240 assert(VME_SUBMAP(entry)->is_nested_map);
5241 assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
5242 log_unnest_badness(map,
5243 old_start_unnest,
5244 old_end_unnest,
5245 VME_SUBMAP(entry)->is_nested_map,
5246 (entry->vme_start +
5247 VME_SUBMAP(entry)->lowest_unnestable_start -
5248 VME_OFFSET(entry)));
5249 }
5250
5251 if (entry->vme_start > start_unnest ||
5252 entry->vme_end < end_unnest) {
5253 panic("vm_map_clip_unnest(0x%llx,0x%llx): "
5254 "bad nested entry: start=0x%llx end=0x%llx\n",
5255 (long long)start_unnest, (long long)end_unnest,
5256 (long long)entry->vme_start, (long long)entry->vme_end);
5257 }
5258
5259 if (start_unnest > entry->vme_start) {
5260 _vm_map_clip_start(&map->hdr,
5261 entry,
5262 start_unnest);
5263 if (map->holelistenabled) {
5264 vm_map_store_update_first_free(map, NULL, FALSE);
5265 } else {
5266 vm_map_store_update_first_free(map, map->first_free, FALSE);
5267 }
5268 }
5269 if (entry->vme_end > end_unnest) {
5270 _vm_map_clip_end(&map->hdr,
5271 entry,
5272 end_unnest);
5273 if (map->holelistenabled) {
5274 vm_map_store_update_first_free(map, NULL, FALSE);
5275 } else {
5276 vm_map_store_update_first_free(map, map->first_free, FALSE);
5277 }
5278 }
5279
5280 pmap_unnest(map->pmap,
5281 entry->vme_start,
5282 entry->vme_end - entry->vme_start);
5283 if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
5284 /* clean up parent map/maps */
5285 vm_map_submap_pmap_clean(
5286 map, entry->vme_start,
5287 entry->vme_end,
5288 VME_SUBMAP(entry),
5289 VME_OFFSET(entry));
5290 }
5291 entry->use_pmap = FALSE;
5292 if ((map->pmap != kernel_pmap) &&
5293 (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
5294 VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
5295 }
5296 }
5297 #endif /* NO_NESTED_PMAP */
5298
5299 __abortlike
5300 static void
5301 __vm_map_clip_atomic_entry_panic(
5302 vm_map_t map,
5303 vm_map_entry_t entry,
5304 vm_map_offset_t where)
5305 {
5306 panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
5307 "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
5308 (uint64_t)entry->vme_start,
5309 (uint64_t)entry->vme_end,
5310 (uint64_t)where);
5311 }
5312
5313 /*
5314 * vm_map_clip_start: [ internal use only ]
5315 *
5316 * Asserts that the given entry begins at or after
5317 * the specified address; if necessary,
5318 * it splits the entry into two.
5319 */
5320 void
5321 vm_map_clip_start(
5322 vm_map_t map,
5323 vm_map_entry_t entry,
5324 vm_map_offset_t startaddr)
5325 {
5326 #ifndef NO_NESTED_PMAP
5327 if (entry->is_sub_map &&
5328 entry->use_pmap &&
5329 startaddr >= entry->vme_start) {
5330 vm_map_offset_t start_unnest, end_unnest;
5331
5332 /*
5333 * Make sure "startaddr" is no longer in a nested range
5334 * before we clip. Unnest only the minimum range the platform
5335 * can handle.
5336 * vm_map_clip_unnest may perform additional adjustments to
5337 * the unnest range.
5338 */
5339 start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
5340 end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
5341 vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5342 }
5343 #endif /* NO_NESTED_PMAP */
5344 if (startaddr > entry->vme_start) {
5345 if (!entry->is_sub_map &&
5346 VME_OBJECT(entry) &&
5347 VME_OBJECT(entry)->phys_contiguous) {
5348 pmap_remove(map->pmap,
5349 (addr64_t)(entry->vme_start),
5350 (addr64_t)(entry->vme_end));
5351 }
5352 if (entry->vme_atomic) {
5353 __vm_map_clip_atomic_entry_panic(map, entry, startaddr);
5354 }
5355
5356 DTRACE_VM5(
5357 vm_map_clip_start,
5358 vm_map_t, map,
5359 vm_map_offset_t, entry->vme_start,
5360 vm_map_offset_t, entry->vme_end,
5361 vm_map_offset_t, startaddr,
5362 int, VME_ALIAS(entry));
5363
5364 _vm_map_clip_start(&map->hdr, entry, startaddr);
5365 if (map->holelistenabled) {
5366 vm_map_store_update_first_free(map, NULL, FALSE);
5367 } else {
5368 vm_map_store_update_first_free(map, map->first_free, FALSE);
5369 }
5370 }
5371 }
5372
5373
5374 #define vm_map_copy_clip_start(copy, entry, startaddr) \
5375 MACRO_BEGIN \
5376 if ((startaddr) > (entry)->vme_start) \
5377 _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
5378 MACRO_END
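/*
 * vm_map_copy_clip_start() clips an entry within a vm_map_copy header;
 * unlike vm_map_clip_start() above, there is no submap unnesting or
 * pmap work to do, so the entry is simply split when "startaddr" falls
 * strictly inside it.
 */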
5379
5380 /*
5381 * This routine is called only when it is known that
5382 * the entry must be split.
5383 */
5384 static void
5385 _vm_map_clip_start(
5386 struct vm_map_header *map_header,
5387 vm_map_entry_t entry,
5388 vm_map_offset_t start)
5389 {
5390 vm_map_entry_t new_entry;
5391
5392 /*
5393 * Split off the front portion --
5394 * note that we must insert the new
5395 * entry BEFORE this one, so that
5396 * this entry has the specified starting
5397 * address.
5398 */
5399
5400 if (entry->map_aligned) {
5401 assert(VM_MAP_PAGE_ALIGNED(start,
5402 VM_MAP_HDR_PAGE_MASK(map_header)));
5403 }
5404
5405 new_entry = _vm_map_entry_create(map_header);
5406 vm_map_entry_copy_full(new_entry, entry);
5407
5408 new_entry->vme_end = start;
5409 assert(new_entry->vme_start < new_entry->vme_end);
5410 VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
5411 if (__improbable(start >= entry->vme_end)) {
5412 panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start);
5413 }
5414 assert(start < entry->vme_end);
5415 entry->vme_start = start;
5416
5417 #if VM_BTLOG_TAGS
5418 if (new_entry->vme_kernel_object) {
5419 btref_retain(new_entry->vme_tag_btref);
5420 }
5421 #endif /* VM_BTLOG_TAGS */
5422
5423 _vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);
5424
5425 if (entry->is_sub_map) {
5426 vm_map_reference(VME_SUBMAP(new_entry));
5427 } else {
5428 vm_object_reference(VME_OBJECT(new_entry));
5429 }
5430 }
5431
5432
5433 /*
5434 * vm_map_clip_end: [ internal use only ]
5435 *
5436 * Asserts that the given entry ends at or before
5437 * the specified address; if necessary,
5438 * it splits the entry into two.
5439 */
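/*
 * Illustrative note (not from the original source): clipping an entry
 * spanning [0x1000, 0x5000) at endaddr 0x3000 shrinks "entry" to
 * [0x1000, 0x3000) and inserts a new entry [0x3000, 0x5000) after it,
 * the mirror image of vm_map_clip_start().
 */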
5440 void
5441 vm_map_clip_end(
5442 vm_map_t map,
5443 vm_map_entry_t entry,
5444 vm_map_offset_t endaddr)
5445 {
5446 if (endaddr > entry->vme_end) {
5447 /*
5448 * Within the scope of this clipping, limit "endaddr" to
5449 * the end of this map entry...
5450 */
5451 endaddr = entry->vme_end;
5452 }
5453 #ifndef NO_NESTED_PMAP
5454 if (entry->is_sub_map && entry->use_pmap) {
5455 vm_map_offset_t start_unnest, end_unnest;
5456
5457 /*
5458 * Make sure the range between the start of this entry and
5459 * the new "endaddr" is no longer nested before we clip.
5460 * Unnest only the minimum range the platform can handle.
5461 * vm_map_clip_unnest may perform additional adjustments to
5462 * the unnest range.
5463 */
5464 start_unnest = entry->vme_start;
5465 end_unnest =
5466 (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
5467 ~(pmap_shared_region_size_min(map->pmap) - 1);
5468 vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5469 }
5470 #endif /* NO_NESTED_PMAP */
5471 if (endaddr < entry->vme_end) {
5472 if (!entry->is_sub_map &&
5473 VME_OBJECT(entry) &&
5474 VME_OBJECT(entry)->phys_contiguous) {
5475 pmap_remove(map->pmap,
5476 (addr64_t)(entry->vme_start),
5477 (addr64_t)(entry->vme_end));
5478 }
5479 if (entry->vme_atomic) {
5480 __vm_map_clip_atomic_entry_panic(map, entry, endaddr);
5481 }
5482 DTRACE_VM5(
5483 vm_map_clip_end,
5484 vm_map_t, map,
5485 vm_map_offset_t, entry->vme_start,
5486 vm_map_offset_t, entry->vme_end,
5487 vm_map_offset_t, endaddr,
5488 int, VME_ALIAS(entry));
5489
5490 _vm_map_clip_end(&map->hdr, entry, endaddr);
5491 if (map->holelistenabled) {
5492 vm_map_store_update_first_free(map, NULL, FALSE);
5493 } else {
5494 vm_map_store_update_first_free(map, map->first_free, FALSE);
5495 }
5496 }
5497 }
5498
5499
5500 #define vm_map_copy_clip_end(copy, entry, endaddr) \
5501 MACRO_BEGIN \
5502 if ((endaddr) < (entry)->vme_end) \
5503 _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
5504 MACRO_END
5505
5506 /*
5507 * This routine is called only when it is known that
5508 * the entry must be split.
5509 */
5510 static void
5511 _vm_map_clip_end(
5512 struct vm_map_header *map_header,
5513 vm_map_entry_t entry,
5514 vm_map_offset_t end)
5515 {
5516 vm_map_entry_t new_entry;
5517
5518 /*
5519 * Create a new entry and insert it
5520 * AFTER the specified entry
5521 */
5522
5523 if (entry->map_aligned) {
5524 assert(VM_MAP_PAGE_ALIGNED(end,
5525 VM_MAP_HDR_PAGE_MASK(map_header)));
5526 }
5527
5528 new_entry = _vm_map_entry_create(map_header);
5529 vm_map_entry_copy_full(new_entry, entry);
5530
5531 if (__improbable(end <= entry->vme_start)) {
5532 panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end);
5533 }
5534 assert(entry->vme_start < end);
5535 new_entry->vme_start = entry->vme_end = end;
5536 VME_OFFSET_SET(new_entry,
5537 VME_OFFSET(new_entry) + (end - entry->vme_start));
5538 assert(new_entry->vme_start < new_entry->vme_end);
5539
5540 #if VM_BTLOG_TAGS
5541 if (new_entry->vme_kernel_object) {
5542 btref_retain(new_entry->vme_tag_btref);
5543 }
5544 #endif /* VM_BTLOG_TAGS */
5545
5546 _vm_map_store_entry_link(map_header, entry, new_entry);
5547
5548 if (entry->is_sub_map) {
5549 vm_map_reference(VME_SUBMAP(new_entry));
5550 } else {
5551 vm_object_reference(VME_OBJECT(new_entry));
5552 }
5553 }
5554
5555
5556 /*
5557 * VM_MAP_RANGE_CHECK: [ internal use only ]
5558 *
5559 * Asserts that the starting and ending region
5560 * addresses fall within the valid range of the map.
5561 */
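/*
 * Illustrative note (not from the original source): despite the word
 * "asserts", this macro clamps rather than panics. With a map range of
 * [0x1000, 0xF000), a request for [0x0, 0x2000) becomes
 * [0x1000, 0x2000), and an inverted request collapses to start == end.
 */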
5562 #define VM_MAP_RANGE_CHECK(map, start, end) \
5563 MACRO_BEGIN \
5564 if (start < vm_map_min(map)) \
5565 start = vm_map_min(map); \
5566 if (end > vm_map_max(map)) \
5567 end = vm_map_max(map); \
5568 if (start > end) \
5569 start = end; \
5570 MACRO_END
5571
5572 /*
5573 * vm_map_range_check: [ internal use only ]
5574 *
5575 * Check that the region defined by the specified start and
5576 * end addresses is wholly contained within a single map
5577 * entry or a set of adjacent map entries of the specified map,
5578 * i.e. the specified region contains no unmapped space.
5579 * If any or all of the region is unmapped, FALSE is returned.
5580 * Otherwise, TRUE is returned and if the output argument 'entry'
5581 * is not NULL it points to the map entry containing the start
5582 * of the region.
5583 *
5584 * The map is locked for reading on entry and is left locked.
5585 */
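/*
 * Illustrative note (not from the original source): a range covering
 * two entries [0x1000, 0x3000) and [0x3000, 0x6000) is accepted
 * because the entries abut exactly; any gap between one entry's
 * vme_end and the next entry's vme_start makes this return FALSE.
 */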
5586 static boolean_t
5587 vm_map_range_check(
5588 vm_map_t map,
5589 vm_map_offset_t start,
5590 vm_map_offset_t end,
5591 vm_map_entry_t *entry)
5592 {
5593 vm_map_entry_t cur;
5594 vm_map_offset_t prev;
5595
5596 /*
5597 * Basic sanity checks first
5598 */
5599 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5600 return FALSE;
5601 }
5602
5603 /*
5604 * Check first if the region starts within a valid
5605 * mapping for the map.
5606 */
5607 if (!vm_map_lookup_entry(map, start, &cur)) {
5608 return FALSE;
5609 }
5610
5611 /*
5612 * Optimize for the case that the region is contained
5613 * in a single map entry.
5614 */
5615 if (entry != (vm_map_entry_t *) NULL) {
5616 *entry = cur;
5617 }
5618 if (end <= cur->vme_end) {
5619 return TRUE;
5620 }
5621
5622 /*
5623 * If the region is not wholly contained within a
5624 * single entry, walk the entries looking for holes.
5625 */
5626 prev = cur->vme_end;
5627 cur = cur->vme_next;
5628 while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5629 if (end <= cur->vme_end) {
5630 return TRUE;
5631 }
5632 prev = cur->vme_end;
5633 cur = cur->vme_next;
5634 }
5635 return FALSE;
5636 }
5637
5638 static __attribute__((always_inline, warn_unused_result))
5639 kern_return_t
5640 vm_map_protect_sanitize(
5641 vm_map_t map,
5642 vm_map_offset_ut start_u,
5643 vm_map_offset_ut end_u,
5644 vm_prot_ut new_prot_u,
5645 vm_map_offset_t *start,
5646 vm_map_offset_t *end,
5647 vm_prot_t *new_prot)
5648 {
5649 kern_return_t kr;
5650 vm_map_size_t size;
5651 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS;
5652
5653
5654 kr = vm_sanitize_prot(new_prot_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT,
5655 map, VM_PROT_COPY, new_prot);
5656 if (__improbable(kr != KERN_SUCCESS)) {
5657 return kr;
5658 }
5659
5660 kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT,
5661 map, flags, start, end, &size);
5662 if (__improbable(kr != KERN_SUCCESS)) {
5663 return kr;
5664 }
5665
5666 return KERN_SUCCESS;
5667 }
5668
5669 /*
5670 * vm_map_protect:
5671 *
5672 * Sets the protection of the specified address
5673 * region in the target map. If "set_max" is
5674 * specified, the maximum protection is to be set;
5675 * otherwise, only the current protection is affected.
5676 */
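/*
 * Usage note (assumption, based on the Mach VM user interface): the
 * typical path into this routine is mach_vm_protect()/vm_protect()
 * from userspace. With set_max == FALSE only the current protection
 * changes; with set_max == TRUE the maximum protection is replaced and
 * the current protection is masked by it below.
 */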
5677 kern_return_t
5678 vm_map_protect(
5679 vm_map_t map,
5680 vm_map_offset_ut start_u,
5681 vm_map_offset_ut end_u,
5682 boolean_t set_max,
5683 vm_prot_ut new_prot_u)
5684 {
5685 vm_map_entry_t current;
5686 vm_map_offset_t prev;
5687 vm_map_entry_t entry;
5688 vm_prot_t new_prot;
5689 vm_prot_t new_max;
5690 int pmap_options = 0;
5691 kern_return_t kr;
5692 vm_map_offset_t start, original_start;
5693 vm_map_offset_t end;
5694
5695 kr = vm_map_protect_sanitize(map,
5696 start_u,
5697 end_u,
5698 new_prot_u,
5699 &start,
5700 &end,
5701 &new_prot);
5702 if (__improbable(kr != KERN_SUCCESS)) {
5703 return vm_sanitize_get_kr(kr);
5704 }
5705 original_start = start;
5706
5707 if (new_prot & VM_PROT_COPY) {
5708 vm_map_offset_t new_start;
5709 vm_prot_t cur_prot, max_prot;
5710 vm_map_kernel_flags_t kflags;
5711
5712 /* LP64todo - see below */
5713 if (start >= map->max_offset) {
5714 return KERN_INVALID_ADDRESS;
5715 }
5716
5717 if ((new_prot & VM_PROT_ALLEXEC) &&
5718 map->pmap != kernel_pmap &&
5719 (vm_map_cs_enforcement(map)
5720 #if XNU_TARGET_OS_OSX && __arm64__
5721 || !VM_MAP_IS_EXOTIC(map)
5722 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
5723 ) &&
5724 VM_MAP_POLICY_WX_FAIL(map)) {
5725 DTRACE_VM3(cs_wx,
5726 uint64_t, (uint64_t) start,
5727 uint64_t, (uint64_t) end,
5728 vm_prot_t, new_prot);
5729 printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5730 proc_selfpid(),
5731 (get_bsdtask_info(current_task())
5732 ? proc_name_address(get_bsdtask_info(current_task()))
5733 : "?"),
5734 __FUNCTION__, __LINE__,
5735 #if DEVELOPMENT || DEBUG
5736 (uint64_t)start,
5737 (uint64_t)end,
5738 #else /* DEVELOPMENT || DEBUG */
5739 (uint64_t)0,
5740 (uint64_t)0,
5741 #endif /* DEVELOPMENT || DEBUG */
5742 new_prot);
5743 return KERN_PROTECTION_FAILURE;
5744 }
5745
5746 /*
5747 * Let vm_map_remap_extract() know that it will need to:
5748 * + make a copy of the mapping
5749 * + add VM_PROT_WRITE to the max protections
5750 * + remove any protections that are no longer allowed from the
5751 * max protections (to avoid any WRITE/EXECUTE conflict, for
5752 * example).
5753 * Note that "max_prot" is an IN/OUT parameter only for this
5754 * specific (VM_PROT_COPY) case. It's usually an OUT parameter
5755 * only.
5756 */
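/*
 * Illustrative note (not from the original source): VM_PROT_COPY is
 * how a "make this read-only region privately writable" request is
 * expressed; the remap below replaces the mapping with a
 * copy-on-write copy before the plain protection change is applied.
 */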
5757 max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
5758 cur_prot = VM_PROT_NONE;
5759 kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
5760 kflags.vmkf_remap_prot_copy = true;
5761 kflags.vmkf_tpro_enforcement_override = !vm_map_tpro_enforcement(map);
5762 new_start = start;
5763 kr = vm_map_remap(map,
5764 vm_sanitize_wrap_addr_ref(&new_start),
5765 end - start,
5766 0, /* mask */
5767 kflags,
5768 map,
5769 start,
5770 TRUE, /* copy-on-write remapping! */
5771 vm_sanitize_wrap_prot_ref(&cur_prot), /* IN/OUT */
5772 vm_sanitize_wrap_prot_ref(&max_prot), /* IN/OUT */
5773 VM_INHERIT_DEFAULT);
5774 if (kr != KERN_SUCCESS) {
5775 return kr;
5776 }
5777 new_prot &= ~VM_PROT_COPY;
5778 }
5779
5780 vm_map_lock(map);
5781 restart_after_unlock:
5782
5783 /* LP64todo - remove this check when vm_map_commpage64()
5784 * no longer has to stuff in a map_entry for the commpage
5785 * above the map's max_offset.
5786 */
5787 if (start >= map->max_offset) {
5788 vm_map_unlock(map);
5789 return KERN_INVALID_ADDRESS;
5790 }
5791
5792 while (1) {
5793 /*
5794 * Lookup the entry. If it doesn't start in a valid
5795 * entry, return an error.
5796 */
5797 if (!vm_map_lookup_entry(map, start, &entry)) {
5798 vm_map_unlock(map);
5799 return KERN_INVALID_ADDRESS;
5800 }
5801
5802 if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
5803 start = SUPERPAGE_ROUND_DOWN(start);
5804 continue;
5805 }
5806 break;
5807 }
5808 if (entry->superpage_size) {
5809 end = SUPERPAGE_ROUND_UP(end);
5810 }
5811
5812 /*
5813 * Make a first pass to check for protection and address
5814 * violations.
5815 */
5816
5817 current = entry;
5818 prev = current->vme_start;
5819 while ((current != vm_map_to_entry(map)) &&
5820 (current->vme_start < end)) {
5821 /*
5822 * If there is a hole, return an error.
5823 */
5824 if (current->vme_start != prev) {
5825 vm_map_unlock(map);
5826 return KERN_INVALID_ADDRESS;
5827 }
5828
5829 new_max = current->max_protection;
5830
5831 #if defined(__x86_64__)
5832 /* Allow max mask to include execute prot bits if this map doesn't enforce CS */
5833 if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
5834 new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
5835 }
5836 #elif CODE_SIGNING_MONITOR
5837 if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
5838 new_max |= VM_PROT_EXECUTE;
5839 }
5840 #endif
5841 if ((new_prot & new_max) != new_prot) {
5842 vm_map_unlock(map);
5843 return KERN_PROTECTION_FAILURE;
5844 }
5845
5846 if (current->used_for_jit &&
5847 pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
5848 vm_map_unlock(map);
5849 return KERN_PROTECTION_FAILURE;
5850 }
5851
5852 #if __arm64e__
5853 /* Disallow protecting hw assisted TPRO mappings */
5854 if (current->used_for_tpro) {
5855 vm_map_unlock(map);
5856 return KERN_PROTECTION_FAILURE;
5857 }
5858 #endif /* __arm64e__ */
5859
5860
5861 if ((new_prot & VM_PROT_WRITE) &&
5862 (new_prot & VM_PROT_ALLEXEC) &&
5863 #if XNU_TARGET_OS_OSX
5864 map->pmap != kernel_pmap &&
5865 (vm_map_cs_enforcement(map)
5866 #if __arm64__
5867 || !VM_MAP_IS_EXOTIC(map)
5868 #endif /* __arm64__ */
5869 ) &&
5870 #endif /* XNU_TARGET_OS_OSX */
5871 #if CODE_SIGNING_MONITOR
5872 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
5873 #endif
5874 !(current->used_for_jit)) {
5875 DTRACE_VM3(cs_wx,
5876 uint64_t, (uint64_t) current->vme_start,
5877 uint64_t, (uint64_t) current->vme_end,
5878 vm_prot_t, new_prot);
5879 printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5880 proc_selfpid(),
5881 (get_bsdtask_info(current_task())
5882 ? proc_name_address(get_bsdtask_info(current_task()))
5883 : "?"),
5884 __FUNCTION__, __LINE__,
5885 #if DEVELOPMENT || DEBUG
5886 (uint64_t)current->vme_start,
5887 (uint64_t)current->vme_end,
5888 #else /* DEVELOPMENT || DEBUG */
5889 (uint64_t)0,
5890 (uint64_t)0,
5891 #endif /* DEVELOPMENT || DEBUG */
5892 new_prot);
5893 new_prot &= ~VM_PROT_ALLEXEC;
5894 if (VM_MAP_POLICY_WX_FAIL(map)) {
5895 vm_map_unlock(map);
5896 return KERN_PROTECTION_FAILURE;
5897 }
5898 }
5899
5900 /*
5901 * If the task has requested executable lockdown,
5902 * deny both:
5903 * - adding executable protections OR
5904 * - adding write protections to an existing executable mapping.
5905 */
5906 if (map->map_disallow_new_exec == TRUE) {
5907 if ((new_prot & VM_PROT_ALLEXEC) ||
5908 ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
5909 vm_map_unlock(map);
5910 return KERN_PROTECTION_FAILURE;
5911 }
5912 }
5913
5914 prev = current->vme_end;
5915 current = current->vme_next;
5916 }
5917
5918 #if __arm64__
5919 if (end > prev &&
5920 end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
5921 vm_map_entry_t prev_entry;
5922
5923 prev_entry = current->vme_prev;
5924 if (prev_entry != vm_map_to_entry(map) &&
5925 !prev_entry->map_aligned &&
5926 (vm_map_round_page(prev_entry->vme_end,
5927 VM_MAP_PAGE_MASK(map))
5928 == end)) {
5929 /*
5930 * The last entry in our range is not "map-aligned"
5931 * but it would have reached all the way to "end"
5932 * if it had been map-aligned, so this is not really
5933 * a hole in the range and we can proceed.
5934 */
5935 prev = end;
5936 }
5937 }
5938 #endif /* __arm64__ */
5939
5940 if (end > prev) {
5941 vm_map_unlock(map);
5942 return KERN_INVALID_ADDRESS;
5943 }
5944
5945 /*
5946 * Go back and fix up protections.
5947 * Clip to start here if the range starts within
5948 * the entry.
5949 */
5950
5951 current = entry;
5952 if (current != vm_map_to_entry(map)) {
5953 /* clip and unnest if necessary */
5954 vm_map_clip_start(map, current, start);
5955 }
5956
5957 while ((current != vm_map_to_entry(map)) &&
5958 (current->vme_start < end)) {
5959 vm_prot_t old_prot;
5960
5961 if (current->in_transition) {
5962 wait_result_t wait_result;
5963 vm_map_offset_t current_start;
5964
5965 /*
5966 * Another thread is wiring/unwiring this entry.
5967 * Let the other thread know we are waiting.
5968 */
5969 current_start = current->vme_start;
5970 current->needs_wakeup = true;
5971 /* wait for the other thread to be done */
5972 wait_result = vm_map_entry_wait(map, TH_UNINT);
5973 /*
5974 * We unlocked the map, so anything could have changed in the
5975 * range and we need to re-check from "current_start" to "end".
5976 * Our entries might no longer be valid.
5977 */
5978 current = NULL;
5979 entry = NULL;
5980 /*
5981 * Re-lookup and re-clip "current_start".
5982 * If it's no longer mapped, start with the next mapping.
5983 */
5984 vm_map_lookup_entry_or_next(map, current_start, &current);
5985 if (current != vm_map_to_entry(map)) {
5986 vm_map_clip_start(map, current, current_start);
5987 }
5988 /* restart from this point */
5989 start = current_start;
5990 goto restart_after_unlock;
5991 }
5992
5993 vm_map_clip_end(map, current, end);
5994
5995 #if DEVELOPMENT || DEBUG
5996 if (current->csm_associated && vm_log_xnu_user_debug) {
5997 printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
5998 proc_selfpid(),
5999 (get_bsdtask_info(current_task())
6000 ? proc_name_address(get_bsdtask_info(current_task()))
6001 : "?"),
6002 __FUNCTION__,
6003 (uint64_t)start,
6004 (uint64_t)end,
6005 new_prot,
6006 map, current,
6007 current->vme_start,
6008 current->vme_end,
6009 current->protection,
6010 current->max_protection);
6011 }
6012 #endif /* DEVELOPMENT || DEBUG */
6013
6014 if (current->is_sub_map) {
6015 /* clipping did unnest if needed */
6016 assert(!current->use_pmap);
6017 }
6018
6019 old_prot = current->protection;
6020
6021 if (set_max) {
6022 current->max_protection = new_prot;
6023 /* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
6024 current->protection = (new_prot & old_prot);
6025 } else {
6026 current->protection = new_prot;
6027 }
6028
6029 #if CODE_SIGNING_MONITOR
6030 if (/* a !csm_associated mapping becoming executable */
6031 ((!current->csm_associated &&
6032 !(old_prot & VM_PROT_EXECUTE) &&
6033 (current->protection & VM_PROT_EXECUTE))
6034 ||
6035 /* a csm_associated mapping becoming writable */
6036 (current->csm_associated &&
6037 !(old_prot & VM_PROT_WRITE) &&
6038 (current->protection & VM_PROT_WRITE)))) {
6039 /*
6040 * This mapping has not already been marked as
6041 * "user_debug" and it is either:
6042 * 1. not code-signing-monitored and becoming executable
6043 * 2. code-signing-monitored and becoming writable,
6044 * so inform the CodeSigningMonitor and mark the
6045 * mapping as "user_debug" if appropriate.
6046 */
6047 vm_map_kernel_flags_t vmk_flags;
6048 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
6049 /* pretend it's a vm_protect(VM_PROT_COPY)... */
6050 vmk_flags.vmkf_remap_prot_copy = true;
6051 kr = vm_map_entry_cs_associate(map, current, vmk_flags);
6052 #if DEVELOPMENT || DEBUG
6053 if (vm_log_xnu_user_debug) {
6054 printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
6055 proc_selfpid(),
6056 (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
6057 __FUNCTION__, __LINE__,
6058 map, current,
6059 current->vme_start, current->vme_end,
6060 old_prot, current->protection,
6061 kr, current->vme_xnu_user_debug);
6062 }
6063 #endif /* DEVELOPMENT || DEBUG */
6064 }
6065 #endif /* CODE_SIGNING_MONITOR */
6066
6067 /*
6068 * Update physical map if necessary.
6069 * If the request is to turn off write protection,
6070 * we won't do it for real (in pmap). This is because
6071 * it would cause copy-on-write to fail. We've already
6072 * set the new protection in the map, so if a
6073 * write-protect fault occurs, it will be fixed up
6074 * properly, COW or not.
6075 */
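/*
 * Illustrative note (not from the original source): adding
 * VM_PROT_WRITE to a copy-on-write mapping therefore only updates the
 * map entry; the pmap keeps the page read-only, the first store still
 * faults, and vm_fault performs the copy with the full new protection.
 */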
6076 if (current->protection != old_prot) {
6077 /* Look one level in: we support nested pmaps */
6078 /* from mapped submaps which are direct entries */
6079 /* in our map */
6080
6081 vm_prot_t prot;
6082
6083 prot = current->protection;
6084 if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
6085 prot &= ~VM_PROT_WRITE;
6086 } else {
6087 assert(!VME_OBJECT(current)->code_signed);
6088 assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6089 if (prot & VM_PROT_WRITE) {
6090 /*
6091 * For write requests on the
6092 * compressor, we will ask the
6093 * pmap layer to prevent us from
6094 * taking a write fault when we
6095 * attempt to access the mapping
6096 * next.
6097 */
6098 pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
6099 }
6100 }
6101
6102 if (override_nx(map, VME_ALIAS(current)) && prot) {
6103 prot |= VM_PROT_EXECUTE;
6104 }
6105
6106 #if DEVELOPMENT || DEBUG
6107 if (!(old_prot & VM_PROT_EXECUTE) &&
6108 (prot & VM_PROT_EXECUTE) &&
6109 panic_on_unsigned_execute &&
6110 (proc_selfcsflags() & CS_KILL)) {
6111 panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
6112 }
6113 #endif /* DEVELOPMENT || DEBUG */
6114
6115 if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
6116 if (current->wired_count) {
6117 panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
6118 map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
6119 }
6120
6121 /* If the pmap layer cares about this
6122 * protection type, force a fault for
6123 * each page so that vm_fault will
6124 * repopulate the page with the full
6125 * set of protections.
6126 */
6127 /*
6128 * TODO: We don't seem to need this,
6129 * but this is due to an internal
6130 * implementation detail of
6131 * pmap_protect. Do we want to rely
6132 * on this?
6133 */
6134 prot = VM_PROT_NONE;
6135 }
6136
6137 if (current->is_sub_map && current->use_pmap) {
6138 pmap_protect(VME_SUBMAP(current)->pmap,
6139 current->vme_start,
6140 current->vme_end,
6141 prot);
6142 } else {
6143 pmap_protect_options(map->pmap,
6144 current->vme_start,
6145 current->vme_end,
6146 prot,
6147 pmap_options,
6148 NULL);
6149 }
6150 }
6151 current = current->vme_next;
6152 }
6153
6154 if (entry == VM_MAP_ENTRY_NULL) {
6155 /*
6156 * Re-lookup the original start of our range.
6157 * If it's no longer mapped, start with the next mapping.
6158 */
6159 vm_map_lookup_entry_or_next(map, original_start, &entry);
6160 }
6161 current = entry;
6162 while ((current != vm_map_to_entry(map)) &&
6163 (current->vme_start <= end)) {
6164 vm_map_simplify_entry(map, current);
6165 current = current->vme_next;
6166 }
6167
6168 vm_map_unlock(map);
6169 return KERN_SUCCESS;
6170 }
6171
6172 static __attribute__((always_inline, warn_unused_result))
6173 kern_return_t
6174 vm_map_inherit_sanitize(
6175 vm_map_t map,
6176 vm_map_offset_ut start_u,
6177 vm_map_offset_ut end_u,
6178 vm_inherit_ut new_inheritance_u,
6179 vm_map_offset_t *start,
6180 vm_map_offset_t *end,
6181 vm_inherit_t *new_inheritance)
6182 {
6183 kern_return_t kr;
6184 vm_map_size_t size;
6185
6186 kr = vm_sanitize_inherit(new_inheritance_u,
6187 VM_SANITIZE_CALLER_VM_MAP_INHERIT, new_inheritance);
6188 if (__improbable(kr != KERN_SUCCESS)) {
6189 return kr;
6190 }
6191
6192 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS;
6193
6194
6195 kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_INHERIT,
6196 map, flags, start, end, &size);
6197 if (__improbable(kr != KERN_SUCCESS)) {
6198 return kr;
6199 }
6200
6201 return KERN_SUCCESS;
6202 }
6203
6204 /*
6205 * vm_map_inherit:
6206 *
6207 * Sets the inheritance of the specified address
6208 * range in the target map. Inheritance
6209 * affects how the map will be shared with
6210 * child maps at the time of vm_map_fork.
6211 */
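/*
 * Descriptive note: VM_INHERIT_SHARE keeps the range shared with the
 * child, VM_INHERIT_COPY gives the child a copy-on-write copy, and
 * VM_INHERIT_NONE leaves the range unmapped in the child; submap
 * entries reject VM_INHERIT_COPY below.
 */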
6212 kern_return_t
6213 vm_map_inherit(
6214 vm_map_t map,
6215 vm_map_offset_ut start_u,
6216 vm_map_offset_ut end_u,
6217 vm_inherit_ut new_inheritance_u)
6218 {
6219 vm_map_entry_t entry;
6220 vm_map_entry_t temp_entry;
6221 kern_return_t kr;
6222 vm_map_offset_t start;
6223 vm_map_offset_t end;
6224 vm_inherit_t new_inheritance;
6225
6226 kr = vm_map_inherit_sanitize(map,
6227 start_u,
6228 end_u,
6229 new_inheritance_u,
6230 &start,
6231 &end,
6232 &new_inheritance);
6233 if (__improbable(kr != KERN_SUCCESS)) {
6234 return vm_sanitize_get_kr(kr);
6235 }
6236
6237 vm_map_lock(map);
6238
6239 VM_MAP_RANGE_CHECK(map, start, end);
6240
6241 if (vm_map_lookup_entry(map, start, &temp_entry)) {
6242 entry = temp_entry;
6243 } else {
6244 temp_entry = temp_entry->vme_next;
6245 entry = temp_entry;
6246 }
6247
6248 /* first check entire range for entries which can't support the */
6249 /* given inheritance. */
6250 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6251 if (entry->is_sub_map) {
6252 if (new_inheritance == VM_INHERIT_COPY) {
6253 vm_map_unlock(map);
6254 return KERN_INVALID_ARGUMENT;
6255 }
6256 }
6257
6258 entry = entry->vme_next;
6259 }
6260
6261 entry = temp_entry;
6262 if (entry != vm_map_to_entry(map)) {
6263 /* clip and unnest if necessary */
6264 vm_map_clip_start(map, entry, start);
6265 }
6266
6267 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6268 vm_map_clip_end(map, entry, end);
6269 if (entry->is_sub_map) {
6270 /* clip did unnest if needed */
6271 assert(!entry->use_pmap);
6272 }
6273
6274 entry->inheritance = new_inheritance;
6275
6276 entry = entry->vme_next;
6277 }
6278
6279 vm_map_unlock(map);
6280 return KERN_SUCCESS;
6281 }
6282
6283 /*
6284 * Update the accounting for the amount of wired memory in this map. If the user has
6285 * exceeded the defined limits, then we fail. Wiring on behalf of the kernel never fails.
6286 */
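/*
 * Descriptive note: both limit checks below compare bytes.
 * map->user_wire_size accumulates this map's user-wired bytes against
 * MIN(map->user_wire_limit, vm_per_task_user_wire_limit), while
 * ptoa_64(vm_page_wire_count + vm_lopage_free_count) converts the
 * global wired page count to bytes for the system-wide limit.
 */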
6287
6288 static kern_return_t
6289 add_wire_counts(
6290 vm_map_t map,
6291 vm_map_entry_t entry,
6292 boolean_t user_wire)
6293 {
6294 vm_map_size_t size;
6295
6296 bool first_wire = entry->wired_count == 0 && entry->user_wired_count == 0;
6297
6298 if (user_wire) {
6299 unsigned int total_wire_count = vm_page_wire_count + vm_lopage_free_count;
6300
6301 /*
6302 * We're wiring memory at the request of the user. Check if this is the first time the user is wiring
6303 * this map entry.
6304 */
6305
6306 if (entry->user_wired_count == 0) {
6307 size = entry->vme_end - entry->vme_start;
6308
6309 /*
6310 * Since this is the first time the user is wiring this map entry, check to see if we're
6311 * exceeding the user wire limits. There is a per map limit which is the smaller of either
6312 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value. There is also
6313 * a system-wide limit on the amount of memory all users can wire. If the user is over either
6314 * limit, then we fail.
6315 */
6316
6317 if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6318 size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6319 if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6320 #if DEVELOPMENT || DEBUG
6321 if (panic_on_mlock_failure) {
6322 panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6323 }
6324 #endif /* DEVELOPMENT || DEBUG */
6325 os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6326 } else {
6327 os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6328 #if DEVELOPMENT || DEBUG
6329 if (panic_on_mlock_failure) {
6330 panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6331 }
6332 #endif /* DEVELOPMENT || DEBUG */
6333 }
6334 return KERN_RESOURCE_SHORTAGE;
6335 }
6336
6337 /*
6338 * The first time the user wires an entry, we also increment the wired_count and add this to
6339 * the total that has been wired in the map.
6340 */
6341
6342 if (entry->wired_count >= MAX_WIRE_COUNT) {
6343 return KERN_FAILURE;
6344 }
6345
6346 entry->wired_count++;
6347 map->user_wire_size += size;
6348 }
6349
6350 if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6351 return KERN_FAILURE;
6352 }
6353
6354 entry->user_wired_count++;
6355 } else {
6356 /*
6357 * The kernel's wiring the memory. Just bump the count and continue.
6358 */
6359
6360 if (entry->wired_count >= MAX_WIRE_COUNT) {
6361 panic("vm_map_wire: too many wirings");
6362 }
6363
6364 entry->wired_count++;
6365 }
6366
6367 if (first_wire) {
6368 vme_btref_consider_and_set(entry, __builtin_frame_address(0));
6369 }
6370
6371 return KERN_SUCCESS;
6372 }
6373
6374 /*
6375 * Update the memory wiring accounting now that the given map entry is being unwired.
6376 */
6377
6378 static void
6379 subtract_wire_counts(
6380 vm_map_t map,
6381 vm_map_entry_t entry,
6382 boolean_t user_wire)
6383 {
6384 if (user_wire) {
6385 /*
6386 * We're unwiring memory at the request of the user. See if we're removing the last user wire reference.
6387 */
6388
6389 if (entry->user_wired_count == 1) {
6390 /*
6391 * We're removing the last user wire reference. Decrement the wired_count and the total
6392 * user wired memory for this map.
6393 */
6394
6395 assert(entry->wired_count >= 1);
6396 entry->wired_count--;
6397 map->user_wire_size -= entry->vme_end - entry->vme_start;
6398 }
6399
6400 assert(entry->user_wired_count >= 1);
6401 entry->user_wired_count--;
6402 } else {
6403 /*
6404 * The kernel is unwiring the memory. Just update the count.
6405 */
6406
6407 assert(entry->wired_count >= 1);
6408 entry->wired_count--;
6409 }
6410
6411 vme_btref_consider_and_put(entry);
6412 }
6413
6414 int cs_executable_wire = 0;
6415
6416 static kern_return_t
6417 vm_map_wire_nested(
6418 vm_map_t map,
6419 vm_map_offset_t start,
6420 vm_map_offset_t end,
6421 vm_prot_t caller_prot,
6422 vm_tag_t tag,
6423 boolean_t user_wire,
6424 pmap_t map_pmap,
6425 vm_map_offset_t pmap_addr,
6426 ppnum_t *physpage_p)
6427 {
6428 vm_map_entry_t entry;
6429 vm_prot_t access_type;
6430 struct vm_map_entry *first_entry, tmp_entry;
6431 vm_map_t real_map;
6432 vm_map_offset_t s, e;
6433 kern_return_t rc;
6434 boolean_t need_wakeup;
6435 boolean_t main_map = FALSE;
6436 wait_interrupt_t interruptible_state;
6437 thread_t cur_thread;
6438 unsigned int last_timestamp;
6439 vm_map_size_t size;
6440 boolean_t wire_and_extract;
6441 vm_prot_t extra_prots;
6442
6443 extra_prots = VM_PROT_COPY;
6444 extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6445 #if XNU_TARGET_OS_OSX
6446 if (map->pmap == kernel_pmap ||
6447 !vm_map_cs_enforcement(map)) {
6448 extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6449 }
6450 #endif /* XNU_TARGET_OS_OSX */
6451 #if CODE_SIGNING_MONITOR
6452 if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
6453 extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6454 }
6455 #endif /* CODE_SIGNING_MONITOR */
6456
6457 access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6458
6459 wire_and_extract = FALSE;
6460 if (physpage_p != NULL) {
6461 /*
6462 * The caller wants the physical page number of the
6463 * wired page. We return only one physical page number
6464 * so this works for only one page at a time.
6465 *
6466 * The only caller (vm_map_wire_and_extract)
6467 * guarantees it.
6468 */
6469 assert(end - start == VM_MAP_PAGE_SIZE(map));
6470 wire_and_extract = TRUE;
6471 *physpage_p = 0;
6472 }
6473
6474 VM_MAP_RANGE_CHECK(map, start, end);
6475 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6476 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6477 if (start == end) {
6478 /* We wired what the caller asked for, zero pages */
6479 return KERN_SUCCESS;
6480 }
6481
6482 vm_map_lock(map);
6483 if (map_pmap == NULL) {
6484 main_map = TRUE;
6485 }
6486 last_timestamp = map->timestamp;
6487
6488 need_wakeup = FALSE;
6489 cur_thread = current_thread();
6490
6491 s = start;
6492 rc = KERN_SUCCESS;
6493
6494 if (vm_map_lookup_entry(map, s, &first_entry)) {
6495 entry = first_entry;
6496 /*
6497 * vm_map_clip_start will be done later.
6498 * We don't want to unnest any nested submaps here !
6499 */
6500 } else {
6501 /* Start address is not in map */
6502 rc = KERN_INVALID_ADDRESS;
6503 goto done;
6504 }
6505
6506 while ((entry != vm_map_to_entry(map)) && (s < end)) {
6507 /*
6508 * At this point, we have wired from "start" to "s".
6509 * We still need to wire from "s" to "end".
6510 *
6511 * "entry" hasn't been clipped, so it could start before "s"
6512 * and/or end after "end".
6513 */
6514
6515 /* "e" is how far we want to wire in this entry */
6516 e = entry->vme_end;
6517 if (e > end) {
6518 e = end;
6519 }
6520
6521 /*
6522 * If another thread is wiring/unwiring this entry then
6523 * block after informing other thread to wake us up.
6524 */
6525 if (entry->in_transition) {
6526 wait_result_t wait_result;
6527
6528 /*
6529 * We have not clipped the entry. Make sure that
6530 * the start address is in range so that the lookup
6531 * below will succeed.
6532 * "s" is the current starting point: we've already
6533 * wired from "start" to "s" and we still have
6534 * to wire from "s" to "end".
6535 */
6536
6537 entry->needs_wakeup = TRUE;
6538
6539 /*
6540 * wake up anybody waiting on entries that we have
6541 * already wired.
6542 */
6543 if (need_wakeup) {
6544 vm_map_entry_wakeup(map);
6545 need_wakeup = FALSE;
6546 }
6547 /*
6548 * User wiring is interruptible
6549 */
6550 wait_result = vm_map_entry_wait(map,
6551 (user_wire) ? THREAD_ABORTSAFE :
6552 THREAD_UNINT);
6553 if (user_wire && wait_result == THREAD_INTERRUPTED) {
6554 /*
6555 * undo the wirings we have done so far
6556 * We do not clear the needs_wakeup flag,
6557 * because we cannot tell if we were the
6558 * only one waiting.
6559 */
6560 rc = KERN_FAILURE;
6561 goto done;
6562 }
6563
6564 /*
6565 * Cannot avoid a lookup here. Reset timestamp.
6566 */
6567 last_timestamp = map->timestamp;
6568
6569 /*
6570 * The entry could have been clipped, look it up again.
6571 * Worst that can happen is it may not exist anymore.
6572 */
6573 if (!vm_map_lookup_entry(map, s, &first_entry)) {
6574 /*
6575 * User: undo everything up to the previous
6576 * entry. Let vm_map_unwire worry about
6577 * checking the validity of the range.
6578 */
6579 rc = KERN_FAILURE;
6580 goto done;
6581 }
6582 entry = first_entry;
6583 continue;
6584 }
6585
6586 if (entry->is_sub_map) {
6587 vm_map_offset_t sub_start;
6588 vm_map_offset_t sub_end;
6589 vm_map_offset_t local_start;
6590 vm_map_offset_t local_end;
6591 pmap_t pmap;
6592 vm_map_t sub_map = VM_MAP_NULL;
6593
6594 if (wire_and_extract) {
6595 /*
6596 * Wiring would result in copy-on-write
6597 * which would not be compatible with
6598 * the sharing we have with the original
6599 * provider of this memory.
6600 */
6601 rc = KERN_INVALID_ARGUMENT;
6602 goto done;
6603 }
6604
6605 vm_map_clip_start(map, entry, s);
6606 vm_map_clip_end(map, entry, end);
6607
6608 sub_start = VME_OFFSET(entry);
6609 sub_end = entry->vme_end;
6610 sub_end += VME_OFFSET(entry) - entry->vme_start;
6611
6612 local_end = entry->vme_end;
6613 if (map_pmap == NULL) {
6614 vm_object_t object;
6615 vm_object_offset_t offset;
6616 vm_prot_t prot;
6617 boolean_t wired;
6618 vm_map_entry_t local_entry;
6619 vm_map_version_t version;
6620 vm_map_t lookup_map;
6621
6622 if (entry->use_pmap) {
6623 pmap = VME_SUBMAP(entry)->pmap;
6624 /* ppc implementation requires that */
6625 /* submaps' pmap address ranges line */
6626 /* up with parent map */
6627 #ifdef notdef
6628 pmap_addr = sub_start;
6629 #endif
6630 pmap_addr = s;
6631 } else {
6632 pmap = map->pmap;
6633 pmap_addr = s;
6634 }
6635
6636 if (entry->wired_count) {
6637 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6638 goto done;
6639 }
6640
6641 /*
6642 * The map was not unlocked:
6643 * no need to goto re-lookup.
6644 * Just go directly to next entry.
6645 */
6646 entry = entry->vme_next;
6647 s = entry->vme_start;
6648 continue;
6649 }
6650
6651 /* call vm_map_lookup_and_lock_object to */
6652 /* cause any needs copy to be */
6653 /* evaluated */
6654 local_start = entry->vme_start;
6655 lookup_map = map;
6656 vm_map_lock_write_to_read(map);
6657 rc = vm_map_lookup_and_lock_object(
6658 &lookup_map, local_start,
6659 (access_type | extra_prots),
6660 OBJECT_LOCK_EXCLUSIVE,
6661 &version, &object,
6662 &offset, &prot, &wired,
6663 NULL,
6664 &real_map, NULL);
6665 if (rc != KERN_SUCCESS) {
6666 vm_map_unlock_read(lookup_map);
6667 assert(map_pmap == NULL);
6668 vm_map_unwire_nested(map, start,
6669 s, user_wire, PMAP_NULL, 0);
6670 return rc;
6671 }
6672 vm_object_unlock(object);
6673 if (real_map != lookup_map) {
6674 vm_map_unlock(real_map);
6675 }
6676 vm_map_unlock_read(lookup_map);
6677 vm_map_lock(map);
6678
6679 /* we unlocked, so must re-lookup */
6680 if (!vm_map_lookup_entry(map,
6681 local_start,
6682 &local_entry)) {
6683 rc = KERN_FAILURE;
6684 goto done;
6685 }
6686
6687 /*
6688 * entry could have been "simplified",
6689 * so re-clip
6690 */
6691 entry = local_entry;
6692 assert(s == local_start);
6693 vm_map_clip_start(map, entry, s);
6694 vm_map_clip_end(map, entry, end);
6695 /* re-compute "e" */
6696 e = entry->vme_end;
6697 if (e > end) {
6698 e = end;
6699 }
6700
6701 /* did we have a change of type? */
6702 if (!entry->is_sub_map) {
6703 last_timestamp = map->timestamp;
6704 continue;
6705 }
6706 } else {
6707 local_start = entry->vme_start;
6708 pmap = map_pmap;
6709 }
6710
6711 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6712 goto done;
6713 }
6714
6715 entry->in_transition = TRUE;
6716
6717 sub_map = VME_SUBMAP(entry);
6718 vm_map_reference(sub_map);
6719 vm_map_unlock(map);
6720 rc = vm_map_wire_nested(sub_map,
6721 sub_start, sub_end,
6722 caller_prot, tag,
6723 user_wire, pmap, pmap_addr,
6724 NULL);
6725 vm_map_deallocate(sub_map);
6726 sub_map = VM_MAP_NULL;
6727 vm_map_lock(map);
6728
6729 /*
6730 * Find the entry again. It could have been clipped
6731 * after we unlocked the map.
6732 */
6733 if (!vm_map_lookup_entry(map, local_start,
6734 &first_entry)) {
6735 panic("vm_map_wire: re-lookup failed");
6736 }
6737 entry = first_entry;
6738
6739 assert(local_start == s);
6740 /* re-compute "e" */
6741 e = entry->vme_end;
6742 if (e > end) {
6743 e = end;
6744 }
6745
6746 last_timestamp = map->timestamp;
6747 while ((entry != vm_map_to_entry(map)) &&
6748 (entry->vme_start < e)) {
6749 assert(entry->in_transition);
6750 entry->in_transition = FALSE;
6751 if (entry->needs_wakeup) {
6752 entry->needs_wakeup = FALSE;
6753 need_wakeup = TRUE;
6754 }
6755 if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6756 subtract_wire_counts(map, entry, user_wire);
6757 }
6758 entry = entry->vme_next;
6759 }
6760 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
6761 goto done;
6762 }
6763
6764 /* no need to relookup again */
6765 s = entry->vme_start;
6766 continue;
6767 }
6768
6769 /*
6770 * If this entry is already wired then increment
6771 * the appropriate wire reference count.
6772 */
6773 if (entry->wired_count) {
6774 if ((entry->protection & access_type) != access_type) {
6775 /* found a protection problem */
6776
6777 /*
6778 * XXX FBDP
6779 * We should always return an error
6780 * in this case but since we didn't
6781 * enforce it before, let's do
6782 * it only for the new "wire_and_extract"
6783 * code path for now...
6784 */
6785 if (wire_and_extract) {
6786 rc = KERN_PROTECTION_FAILURE;
6787 goto done;
6788 }
6789 }
6790
6791 /*
6792 * entry is already wired down, get our reference
6793 * after clipping to our range.
6794 */
6795 vm_map_clip_start(map, entry, s);
6796 vm_map_clip_end(map, entry, end);
6797
6798 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6799 goto done;
6800 }
6801
6802 if (wire_and_extract) {
6803 vm_object_t object;
6804 vm_object_offset_t offset;
6805 vm_page_t m;
6806
6807 /*
6808 * We don't have to "wire" the page again
6809 * but we still have to "extract" its
6810 * physical page number, after some sanity
6811 * checks.
6812 */
6813 assert((entry->vme_end - entry->vme_start)
6814 == PAGE_SIZE);
6815 assert(!entry->needs_copy);
6816 assert(!entry->is_sub_map);
6817 assert(VME_OBJECT(entry));
6818 if (((entry->vme_end - entry->vme_start)
6819 != PAGE_SIZE) ||
6820 entry->needs_copy ||
6821 entry->is_sub_map ||
6822 VME_OBJECT(entry) == VM_OBJECT_NULL) {
6823 rc = KERN_INVALID_ARGUMENT;
6824 goto done;
6825 }
6826
6827 object = VME_OBJECT(entry);
6828 offset = VME_OFFSET(entry);
6829 /* need exclusive lock to update m->dirty */
6830 if (entry->protection & VM_PROT_WRITE) {
6831 vm_object_lock(object);
6832 } else {
6833 vm_object_lock_shared(object);
6834 }
6835 m = vm_page_lookup(object, offset);
6836 assert(m != VM_PAGE_NULL);
6837 assert(VM_PAGE_WIRED(m));
6838 if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
6839 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6840 if (entry->protection & VM_PROT_WRITE) {
6841 vm_object_lock_assert_exclusive(
6842 object);
6843 m->vmp_dirty = TRUE;
6844 }
6845 } else {
6846 /* not already wired !? */
6847 *physpage_p = 0;
6848 }
6849 vm_object_unlock(object);
6850 }
6851
6852 /* map was not unlocked: no need to relookup */
6853 entry = entry->vme_next;
6854 s = entry->vme_start;
6855 continue;
6856 }
6857
6858 /*
6859 * Unwired entry or wire request transmitted via submap
6860 */
6861
6862 /*
6863 * Wiring would copy the pages to the shadow object.
6864 * The shadow object would not be code-signed so
6865 * attempting to execute code from these copied pages
6866 * would trigger a code-signing violation.
6867 */
6868
6869 if ((entry->protection & VM_PROT_EXECUTE)
6870 #if XNU_TARGET_OS_OSX
6871 &&
6872 map->pmap != kernel_pmap &&
6873 (vm_map_cs_enforcement(map)
6874 #if __arm64__
6875 || !VM_MAP_IS_EXOTIC(map)
6876 #endif /* __arm64__ */
6877 )
6878 #endif /* XNU_TARGET_OS_OSX */
6879 #if CODE_SIGNING_MONITOR
6880 &&
6881 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
6882 #endif
6883 ) {
6884 #if MACH_ASSERT
6885 printf("pid %d[%s] wiring executable range from "
6886 "0x%llx to 0x%llx: rejected to preserve "
6887 "code-signing\n",
6888 proc_selfpid(),
6889 (get_bsdtask_info(current_task())
6890 ? proc_name_address(get_bsdtask_info(current_task()))
6891 : "?"),
6892 (uint64_t) entry->vme_start,
6893 (uint64_t) entry->vme_end);
6894 #endif /* MACH_ASSERT */
6895 DTRACE_VM2(cs_executable_wire,
6896 uint64_t, (uint64_t)entry->vme_start,
6897 uint64_t, (uint64_t)entry->vme_end);
6898 cs_executable_wire++;
6899 rc = KERN_PROTECTION_FAILURE;
6900 goto done;
6901 }
6902
6903 /*
6904 * Perform actions of vm_map_lookup that need the write
6905 * lock on the map: create a shadow object for a
6906 * copy-on-write region, or an object for a zero-fill
6907 * region.
6908 */
6909 size = entry->vme_end - entry->vme_start;
6910 /*
6911 * If wiring a copy-on-write page, we need to copy it now
6912 * even if we're only (currently) requesting read access.
6913 * This is aggressive, but once it's wired we can't move it.
6914 */
6915 if (entry->needs_copy) {
6916 if (wire_and_extract) {
6917 /*
6918 * We're supposed to share with the original
6919 * provider so should not be "needs_copy"
6920 */
6921 rc = KERN_INVALID_ARGUMENT;
6922 goto done;
6923 }
6924
6925 VME_OBJECT_SHADOW(entry, size,
6926 vm_map_always_shadow(map));
6927 entry->needs_copy = FALSE;
6928 } else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6929 if (wire_and_extract) {
6930 /*
6931 * We're supposed to share with the original
6932 * provider so should already have an object.
6933 */
6934 rc = KERN_INVALID_ARGUMENT;
6935 goto done;
6936 }
6937 VME_OBJECT_SET(entry, vm_object_allocate(size, map->serial_id), false, 0);
6938 VME_OFFSET_SET(entry, (vm_object_offset_t)0);
6939 assert(entry->use_pmap);
6940 } else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6941 if (wire_and_extract) {
6942 /*
6943 * We're supposed to share with the original
6944 * provider so should not be COPY_SYMMETRIC.
6945 */
6946 rc = KERN_INVALID_ARGUMENT;
6947 goto done;
6948 }
6949 /*
6950 * Force an unrequested "copy-on-write" but only for
6951 * the range we're wiring.
6952 */
6953 // printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
6954 vm_map_clip_start(map, entry, s);
6955 vm_map_clip_end(map, entry, end);
6956 /* recompute "size" */
6957 size = entry->vme_end - entry->vme_start;
6958 /* make a shadow object */
6959 vm_object_t orig_object;
6960 vm_object_offset_t orig_offset;
6961 orig_object = VME_OBJECT(entry);
6962 orig_offset = VME_OFFSET(entry);
6963 VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
6964 if (VME_OBJECT(entry) != orig_object) {
6965 /*
6966 * This mapping has not been shared (or it would be
6967 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
6968 * not been copied-on-write (or it would be marked
6969 * as "needs_copy" and would have been handled above
6970 * and also already write-protected).
6971 * We still need to write-protect here to prevent
6972 * other threads from modifying these pages while
6973 * we're in the process of copying and wiring
6974 * the copied pages.
6975 * Since the mapping is neither shared nor COWed,
6976 * we only need to write-protect the PTEs for this
6977 * mapping.
6978 */
6979 vm_object_pmap_protect(orig_object,
6980 orig_offset,
6981 size,
6982 map->pmap,
6983 VM_MAP_PAGE_SIZE(map),
6984 entry->vme_start,
6985 entry->protection & ~VM_PROT_WRITE);
6986 }
6987 }
6988 if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6989 /*
6990 * Make the object COPY_DELAY to get a stable object
6991 * to wire.
6992 * That should avoid creating long shadow chains while
6993 * wiring/unwiring the same range repeatedly.
6994 * That also prevents part of the object from being
6995 * wired while another part is "needs_copy", which
6996 * could result in conflicting rules wrt copy-on-write.
6997 */
6998 vm_object_t object;
6999
7000 object = VME_OBJECT(entry);
7001 vm_object_lock(object);
7002 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7003 assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
7004 "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
7005 object, (uint64_t)object->vo_size,
7006 entry,
7007 (uint64_t)entry->vme_start,
7008 (uint64_t)entry->vme_end,
7009 (uint64_t)VME_OFFSET(entry),
7010 (uint64_t)size);
7011 assertf(os_ref_get_count_raw(&object->ref_count) == 1,
7012 "object %p ref_count %d\n",
7013 object, os_ref_get_count_raw(&object->ref_count));
7014 assertf(!entry->needs_copy,
7015 "entry %p\n", entry);
7016 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7017 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
7018 }
7019 vm_object_unlock(object);
7020 }
7021
7022 vm_map_clip_start(map, entry, s);
7023 vm_map_clip_end(map, entry, end);
7024
7025 /* re-compute "e" */
7026 e = entry->vme_end;
7027 if (e > end) {
7028 e = end;
7029 }
7030
7031 /*
7032 * Check for holes and protection mismatch.
7033 * Holes: Next entry should be contiguous unless this
7034 * is the end of the region.
7035 * Protection: Access requested must be allowed, unless
7036 * wiring is by protection class
7037 */
7038 if ((entry->vme_end < end) &&
7039 ((entry->vme_next == vm_map_to_entry(map)) ||
7040 (entry->vme_next->vme_start > entry->vme_end))) {
7041 /* found a hole */
7042 rc = KERN_INVALID_ADDRESS;
7043 goto done;
7044 }
7045 if ((entry->protection & access_type) != access_type) {
7046 /* found a protection problem */
7047 rc = KERN_PROTECTION_FAILURE;
7048 goto done;
7049 }
7050
7051 assert(entry->wired_count == 0 && entry->user_wired_count == 0);
7052
7053 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
7054 goto done;
7055 }
7056
7057 entry->in_transition = TRUE;
7058
7059 /*
7060 * This entry might get split once we unlock the map.
7061 * In vm_fault_wire(), we need the current range as
7062 * defined by this entry. In order for this to work
7063 * along with a simultaneous clip operation, we make a
7064 * temporary copy of this entry and use that for the
7065 * wiring. Note that the underlying objects do not
7066 * change during a clip.
7067 */
7068 tmp_entry = *entry;
7069
7070 /*
7071 * The in_transition state guarantees that the entry
7072 * (or entries for this range, if a split occurred) will be
7073 * there when the map lock is acquired for the second time.
7074 */
7075 vm_map_unlock(map);
7076
7077 if (!user_wire && cur_thread != THREAD_NULL) {
7078 interruptible_state = thread_interrupt_level(THREAD_UNINT);
7079 } else {
7080 interruptible_state = THREAD_UNINT;
7081 }
7082
7083 if (map_pmap) {
7084 rc = vm_fault_wire(map,
7085 &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
7086 physpage_p);
7087 } else {
7088 rc = vm_fault_wire(map,
7089 &tmp_entry, caller_prot, tag, map->pmap,
7090 tmp_entry.vme_start,
7091 physpage_p);
7092 }
7093
7094 if (!user_wire && cur_thread != THREAD_NULL) {
7095 thread_interrupt_level(interruptible_state);
7096 }
7097
7098 vm_map_lock(map);
7099
7100 if (last_timestamp + 1 != map->timestamp) {
7101 /*
7102 * Find the entry again. It could have been clipped
7103 * after we unlocked the map.
7104 */
7105 if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7106 &first_entry)) {
7107 panic("vm_map_wire: re-lookup failed");
7108 }
7109
7110 entry = first_entry;
7111 }
7112
7113 last_timestamp = map->timestamp;
7114
7115 while ((entry != vm_map_to_entry(map)) &&
7116 (entry->vme_start < tmp_entry.vme_end)) {
7117 assert(entry->in_transition);
7118 entry->in_transition = FALSE;
7119 if (entry->needs_wakeup) {
7120 entry->needs_wakeup = FALSE;
7121 need_wakeup = TRUE;
7122 }
7123 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
7124 subtract_wire_counts(map, entry, user_wire);
7125 }
7126 entry = entry->vme_next;
7127 }
7128
7129 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
7130 goto done;
7131 }
7132
7133 if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
7134 (tmp_entry.vme_end != end) && /* AND, we are not at the end of the requested range */
7135 (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
7136 /* found a "new" hole */
7137 s = tmp_entry.vme_end;
7138 rc = KERN_INVALID_ADDRESS;
7139 goto done;
7140 }
7141
7142 s = entry->vme_start;
7143 } /* end while loop through map entries */
7144
7145 done:
7146 if (rc == KERN_SUCCESS) {
7147 /* repair any damage we may have made to the VM map */
7148 vm_map_simplify_range(map, start, end);
7149 }
7150
7151 vm_map_unlock(map);
7152
7153 /*
7154 * wake up anybody waiting on entries we wired.
7155 */
7156 if (need_wakeup) {
7157 vm_map_entry_wakeup(map);
7158 }
7159
7160 if (rc != KERN_SUCCESS) {
7161 /* undo what has been wired so far */
7162 vm_map_unwire_nested(map, start, s, user_wire,
7163 map_pmap, pmap_addr);
7164 if (physpage_p) {
7165 *physpage_p = 0;
7166 }
7167 }
7168
7169 return rc;
7170 }
7171
7172 static __attribute__((always_inline, warn_unused_result))
7173 kern_return_t
7174 vm_map_wire_sanitize(
7175 vm_map_t map,
7176 vm_map_offset_ut start_u,
7177 vm_map_offset_ut end_u,
7178 vm_prot_ut prot_u,
7179 vm_sanitize_caller_t vm_sanitize_caller,
7180 vm_map_offset_t *start,
7181 vm_map_offset_t *end,
7182 vm_map_size_t *size,
7183 vm_prot_t *prot)
7184 {
7185 kern_return_t kr;
7186
7187 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS;
7188
7189
7190 kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7191 flags, start, end, size);
7192 if (__improbable(kr != KERN_SUCCESS)) {
7193 return kr;
7194 }
7195
7196 kr = vm_sanitize_prot(prot_u, vm_sanitize_caller, map, prot);
7197 if (__improbable(kr != KERN_SUCCESS)) {
7198 return kr;
7199 }
7200
7201 return KERN_SUCCESS;
7202 }
7203
7204 /*
7205 * Validation function for vm_map_wire_nested().
7206 */
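/*
 * Descriptive note: this is the common entry point for the wiring
 * wrappers below; it sanitizes the untrusted address/protection
 * arguments and then calls vm_map_wire_nested() with PMAP_NULL, i.e.
 * as a top-level (non-nested) wiring of "map" itself.
 */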
7207 kern_return_t
7208 vm_map_wire_impl(
7209 vm_map_t map,
7210 vm_map_offset_ut start_u,
7211 vm_map_offset_ut end_u,
7212 vm_prot_ut prot_u,
7213 vm_tag_t tag,
7214 boolean_t user_wire,
7215 ppnum_t *physpage_p,
7216 vm_sanitize_caller_t vm_sanitize_caller)
7217 {
7218 vm_map_offset_t start, end;
7219 vm_map_size_t size;
7220 vm_prot_t prot;
7221 kern_return_t kr;
7222
7223 /*
7224 * Sanitize any input parameters that are addr/size/prot/inherit
7225 */
7226 kr = vm_map_wire_sanitize(map,
7227 start_u,
7228 end_u,
7229 prot_u,
7230 vm_sanitize_caller,
7231 &start,
7232 &end,
7233 &size,
7234 &prot);
7235 if (__improbable(kr != KERN_SUCCESS)) {
7236 if (physpage_p) {
7237 *physpage_p = 0;
7238 }
7239 return vm_sanitize_get_kr(kr);
7240 }
7241
7242 return vm_map_wire_nested(map, start, end, prot, tag, user_wire,
7243 PMAP_NULL, 0, physpage_p);
7244 }
7245
7246 kern_return_t
7247 vm_map_wire_external(
7248 vm_map_t map,
7249 vm_map_offset_ut start_u,
7250 vm_map_offset_ut end_u,
7251 vm_prot_ut prot_u,
7252 boolean_t user_wire)
7253 {
7254 vm_tag_t tag = vm_tag_bt();
7255
7256 return vm_map_wire_kernel(map, start_u, end_u, prot_u, tag, user_wire);
7257 }
7258
7259 kern_return_t
7260 vm_map_wire_kernel(
7261 vm_map_t map,
7262 vm_map_offset_ut start_u,
7263 vm_map_offset_ut end_u,
7264 vm_prot_ut prot_u,
7265 vm_tag_t tag,
7266 boolean_t user_wire)
7267 {
7268 return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7269 user_wire, NULL, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7270 }
7271
7272 #if XNU_PLATFORM_MacOSX
7273
7274 kern_return_t
7275 vm_map_wire_and_extract(
7276 vm_map_t map,
7277 vm_map_offset_ut start_u,
7278 vm_prot_ut prot_u,
7279 boolean_t user_wire,
7280 ppnum_t *physpage_p)
7281 {
7282 vm_tag_t tag = vm_tag_bt();
7283 vm_map_size_ut size_u = vm_sanitize_wrap_size(VM_MAP_PAGE_SIZE(map));
7284 vm_map_offset_ut end_u = vm_sanitize_compute_ut_end(start_u, size_u);
7285
7286 return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7287 user_wire, physpage_p, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7288 }
7289
7290 #endif /* XNU_PLATFORM_MacOSX */
7291
7292 static kern_return_t
7293 vm_map_unwire_nested(
7294 vm_map_t map,
7295 vm_map_offset_t start,
7296 vm_map_offset_t end,
7297 boolean_t user_wire,
7298 pmap_t map_pmap,
7299 vm_map_offset_t pmap_addr)
7300 {
7301 vm_map_entry_t entry;
7302 struct vm_map_entry *first_entry, tmp_entry;
7303 boolean_t need_wakeup;
7304 boolean_t main_map = FALSE;
7305 unsigned int last_timestamp;
7306
7307 VM_MAP_RANGE_CHECK(map, start, end);
7308 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
7309 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
7310
7311 if (start == end) {
7312 /* We unwired what the caller asked for: zero pages */
7313 return KERN_SUCCESS;
7314 }
7315
7316 vm_map_lock(map);
7317 if (map_pmap == NULL) {
7318 main_map = TRUE;
7319 }
7320 last_timestamp = map->timestamp;
7321
7322 if (vm_map_lookup_entry(map, start, &first_entry)) {
7323 entry = first_entry;
7324 /*
7325 * vm_map_clip_start will be done later.
7326 * We don't want to unnest any nested sub maps here !
7327 */
7328 } else {
7329 if (!user_wire) {
7330 panic("vm_map_unwire: start not found");
7331 }
7332 /* Start address is not in map. */
7333 vm_map_unlock(map);
7334 return KERN_INVALID_ADDRESS;
7335 }
7336
7337 if (entry->superpage_size) {
7338 /* superpages are always wired */
7339 vm_map_unlock(map);
7340 return KERN_INVALID_ADDRESS;
7341 }
7342
7343 need_wakeup = FALSE;
7344 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
7345 if (entry->in_transition) {
7346 /*
7347 * 1)
7348 * Another thread is wiring down this entry. Note
7349 * that if it were not for the other thread we would
7350 * be unwiring an unwired entry. This is not
7351 * permitted. If we wait, we will be unwiring memory
7352 * we did not wire.
7353 *
7354 * 2)
7355 * Another thread is unwiring this entry. We did not
7356 * have a reference to it, because if we did, this
7357 * entry would not be getting unwired now.
7358 */
7359 if (!user_wire) {
7360 /*
7361 * XXX FBDP
7362 * This could happen: there could be some
7363 * overlapping vslock/vsunlock operations
7364 * going on.
7365 * We should probably just wait and retry,
7366 * but then we have to be careful that this
7367 * entry could get "simplified" after
7368 * "in_transition" gets unset and before
7369 * we re-lookup the entry, so we would
7370 * have to re-clip the entry to avoid
7371 * re-unwiring what we have already unwired...
7372 * See vm_map_wire_nested().
7373 *
7374 * Or we could just ignore "in_transition"
7375 * here and proceed to decrement the wired
7376 * count(s) on this entry. That should be fine
7377 * as long as "wired_count" doesn't drop all
7378 * the way to 0 (and we should panic if THAT
7379 * happens).
7380 */
7381 panic("vm_map_unwire: in_transition entry");
7382 }
7383
7384 entry = entry->vme_next;
7385 continue;
7386 }
7387
7388 if (entry->is_sub_map) {
7389 vm_map_offset_t sub_start;
7390 vm_map_offset_t sub_end;
7391 vm_map_offset_t local_end;
7392 pmap_t pmap;
7393 vm_map_t sub_map = VM_MAP_NULL;
7394
7395 vm_map_clip_start(map, entry, start);
7396 vm_map_clip_end(map, entry, end);
7397
7398 sub_start = VME_OFFSET(entry);
7399 sub_end = entry->vme_end - entry->vme_start;
7400 sub_end += VME_OFFSET(entry);
7401 local_end = entry->vme_end;
7402 if (map_pmap == NULL) {
7403 if (entry->use_pmap) {
7404 pmap = VME_SUBMAP(entry)->pmap;
7405 pmap_addr = sub_start;
7406 } else {
7407 pmap = map->pmap;
7408 pmap_addr = start;
7409 }
7410 if (entry->wired_count == 0 ||
7411 (user_wire && entry->user_wired_count == 0)) {
7412 if (!user_wire) {
7413 panic("vm_map_unwire: entry is unwired");
7414 }
7415 entry = entry->vme_next;
7416 continue;
7417 }
7418
7419 /*
7420 * Check for holes
7421 * Holes: Next entry should be contiguous unless
7422 * this is the end of the region.
7423 */
7424 if (((entry->vme_end < end) &&
7425 ((entry->vme_next == vm_map_to_entry(map)) ||
7426 (entry->vme_next->vme_start
7427 > entry->vme_end)))) {
7428 if (!user_wire) {
7429 panic("vm_map_unwire: non-contiguous region");
7430 }
7431 /*
7432 * entry = entry->vme_next;
7433 * continue;
7434 */
7435 }
7436
7437 subtract_wire_counts(map, entry, user_wire);
7438
7439 if (entry->wired_count != 0) {
7440 entry = entry->vme_next;
7441 continue;
7442 }
7443
7444 entry->in_transition = TRUE;
7445 tmp_entry = *entry;/* see comment in vm_map_wire() */
7446
7447 /*
7448 * We can unlock the map now. The in_transition state
7449 * guarantees existence of the entry.
7450 */
7451 sub_map = VME_SUBMAP(entry);
7452 vm_map_reference(sub_map);
7453 vm_map_unlock(map);
7454 vm_map_unwire_nested(sub_map,
7455 sub_start, sub_end, user_wire, pmap, pmap_addr);
7456 vm_map_deallocate(sub_map);
7457 sub_map = VM_MAP_NULL;
7458 vm_map_lock(map);
7459
7460 if (last_timestamp + 1 != map->timestamp) {
7461 /*
7462 * Find the entry again. It could have been
7463 * clipped or deleted after we unlocked the map.
7464 */
7465 if (!vm_map_lookup_entry(map,
7466 tmp_entry.vme_start,
7467 &first_entry)) {
7468 if (!user_wire) {
7469 panic("vm_map_unwire: re-lookup failed");
7470 }
7471 entry = first_entry->vme_next;
7472 } else {
7473 entry = first_entry;
7474 }
7475 }
7476 last_timestamp = map->timestamp;
7477
7478 /*
7479 * clear transition bit for all constituent entries
7480 * that were in the original entry (saved in
7481 * tmp_entry). Also check for waiters.
7482 */
7483 while ((entry != vm_map_to_entry(map)) &&
7484 (entry->vme_start < tmp_entry.vme_end)) {
7485 assert(entry->in_transition);
7486 entry->in_transition = FALSE;
7487 if (entry->needs_wakeup) {
7488 entry->needs_wakeup = FALSE;
7489 need_wakeup = TRUE;
7490 }
7491 entry = entry->vme_next;
7492 }
7493 continue;
7494 } else {
7495 tmp_entry = *entry;
7496 sub_map = VME_SUBMAP(entry);
7497 vm_map_reference(sub_map);
7498 vm_map_unlock(map);
7499 vm_map_unwire_nested(sub_map,
7500 sub_start, sub_end, user_wire, map_pmap,
7501 pmap_addr);
7502 vm_map_deallocate(sub_map);
7503 sub_map = VM_MAP_NULL;
7504 vm_map_lock(map);
7505
7506 if (last_timestamp + 1 != map->timestamp) {
7507 /*
7508 * Find the entry again. It could have been
7509 * clipped or deleted after we unlocked the map.
7510 */
7511 if (!vm_map_lookup_entry(map,
7512 tmp_entry.vme_start,
7513 &first_entry)) {
7514 if (!user_wire) {
7515 panic("vm_map_unwire: re-lookup failed");
7516 }
7517 entry = first_entry->vme_next;
7518 } else {
7519 entry = first_entry;
7520 }
7521 }
7522 last_timestamp = map->timestamp;
7523 }
7524 }
7525
7526
7527 if ((entry->wired_count == 0) ||
7528 (user_wire && entry->user_wired_count == 0)) {
7529 if (!user_wire) {
7530 panic("vm_map_unwire: entry is unwired");
7531 }
7532
7533 entry = entry->vme_next;
7534 continue;
7535 }
7536
7537 assert(entry->wired_count > 0 &&
7538 (!user_wire || entry->user_wired_count > 0));
7539
7540 vm_map_clip_start(map, entry, start);
7541 vm_map_clip_end(map, entry, end);
7542
7543 /*
7544 * Check for holes
7545 * Holes: Next entry should be contiguous unless
7546 * this is the end of the region.
7547 */
7548 if (((entry->vme_end < end) &&
7549 ((entry->vme_next == vm_map_to_entry(map)) ||
7550 (entry->vme_next->vme_start > entry->vme_end)))) {
7551 if (!user_wire) {
7552 panic("vm_map_unwire: non-contiguous region");
7553 }
7554 /*
7555 * entry = entry->vme_next;
7556 * continue;
7557 */
7558 }
7559
7560 subtract_wire_counts(map, entry, user_wire);
7561
7562 if (entry->wired_count != 0) {
7563 entry = entry->vme_next;
7564 continue;
7565 }
7566
7567 if (entry->zero_wired_pages) {
7568 entry->zero_wired_pages = FALSE;
7569 }
7570
7571 entry->in_transition = TRUE;
7572 tmp_entry = *entry; /* see comment in vm_map_wire() */
7573
7574 /*
7575 * We can unlock the map now. The in_transition state
7576 * guarantees existence of the entry.
7577 */
7578 vm_map_unlock(map);
7579 if (map_pmap) {
7580 vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
7581 pmap_addr, tmp_entry.vme_end);
7582 } else {
7583 vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
7584 tmp_entry.vme_start, tmp_entry.vme_end);
7585 }
7586 vm_map_lock(map);
7587
7588 if (last_timestamp + 1 != map->timestamp) {
7589 /*
7590 * Find the entry again. It could have been clipped
7591 * or deleted after we unlocked the map.
7592 */
7593 if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7594 &first_entry)) {
7595 if (!user_wire) {
7596 panic("vm_map_unwire: re-lookup failed");
7597 }
7598 entry = first_entry->vme_next;
7599 } else {
7600 entry = first_entry;
7601 }
7602 }
7603 last_timestamp = map->timestamp;
7604
7605 /*
7606 * clear transition bit for all constituent entries that
7607 * were in the original entry (saved in tmp_entry). Also
7608 * check for waiters.
7609 */
7610 while ((entry != vm_map_to_entry(map)) &&
7611 (entry->vme_start < tmp_entry.vme_end)) {
7612 assert(entry->in_transition);
7613 entry->in_transition = FALSE;
7614 if (entry->needs_wakeup) {
7615 entry->needs_wakeup = FALSE;
7616 need_wakeup = TRUE;
7617 }
7618 entry = entry->vme_next;
7619 }
7620 }
7621
7622 /*
7623 * We might have fragmented the address space when we wired this
7624 * range of addresses. Attempt to re-coalesce these VM map entries
7625 * with their neighbors now that they're no longer wired.
7626 * Under some circumstances, address space fragmentation can
7627 * prevent VM object shadow chain collapsing, which can cause
7628 * swap space leaks.
7629 */
7630 vm_map_simplify_range(map, start, end);
7631
7632 vm_map_unlock(map);
7633 /*
7634 * wake up anybody waiting on entries that we have unwired.
7635 */
7636 if (need_wakeup) {
7637 vm_map_entry_wakeup(map);
7638 }
7639 return KERN_SUCCESS;
7640 }
7641
7642 kern_return_t
7643 vm_map_unwire(
7644 vm_map_t map,
7645 vm_map_offset_ut start_u,
7646 vm_map_offset_ut end_u,
7647 boolean_t user_wire)
7648 {
7649 return vm_map_unwire_impl(map, start_u, end_u, user_wire,
7650 VM_SANITIZE_CALLER_VM_MAP_UNWIRE);
7651 }
7652
7653 static __attribute__((always_inline, warn_unused_result))
7654 kern_return_t
7655 vm_map_unwire_sanitize(
7656 vm_map_t map,
7657 vm_map_offset_ut start_u,
7658 vm_map_offset_ut end_u,
7659 vm_sanitize_caller_t vm_sanitize_caller,
7660 vm_map_offset_t *start,
7661 vm_map_offset_t *end,
7662 vm_map_size_t *size)
7663 {
7664 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS;
7665
7666
7667 return vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7668 flags, start, end, size);
7669 }
7670
7671 kern_return_t
7672 vm_map_unwire_impl(
7673 vm_map_t map,
7674 vm_map_offset_ut start_u,
7675 vm_map_offset_ut end_u,
7676 boolean_t user_wire,
7677 vm_sanitize_caller_t vm_sanitize_caller)
7678 {
7679 vm_map_offset_t start, end;
7680 vm_map_size_t size;
7681 kern_return_t kr;
7682
7683 /*
7684 * Sanitize any input parameters that are addr/size/prot/inherit
7685 */
7686 kr = vm_map_unwire_sanitize(
7687 map,
7688 start_u,
7689 end_u,
7690 vm_sanitize_caller,
7691 &start,
7692 &end,
7693 &size);
7694 if (__improbable(kr != KERN_SUCCESS)) {
7695 return vm_sanitize_get_kr(kr);
7696 }
7697
7698 return vm_map_unwire_nested(map, start, end,
7699 user_wire, (pmap_t)NULL, 0);
7700 }
7701
7702
7703 /*
7704 * vm_map_entry_zap: [ internal use only ]
7705 *
7706 * Remove the entry from the target map
7707 * and put it on a zap list.
7708 */
7709 static void
7710 vm_map_entry_zap(
7711 vm_map_t map,
7712 vm_map_entry_t entry,
7713 vm_map_zap_t zap)
7714 {
7715 vm_map_offset_t s, e;
7716
7717 s = entry->vme_start;
7718 e = entry->vme_end;
7719 assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7720 assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7721 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7722 assert(page_aligned(s));
7723 assert(page_aligned(e));
7724 }
7725 if (entry->map_aligned == TRUE) {
7726 assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7727 assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7728 }
7729 assert(entry->wired_count == 0);
7730 assert(entry->user_wired_count == 0);
7731 assert(!entry->vme_permanent);
7732
7733 vm_map_store_entry_unlink(map, entry, false);
7734 map->size -= e - s;
7735
7736 vm_map_zap_append(zap, entry);
7737 }
7738
7739 static void
7740 vm_map_submap_pmap_clean(
7741 vm_map_t map,
7742 vm_map_offset_t start,
7743 vm_map_offset_t end,
7744 vm_map_t sub_map,
7745 vm_map_offset_t offset)
7746 {
7747 vm_map_offset_t submap_start;
7748 vm_map_offset_t submap_end;
7749 vm_map_size_t remove_size;
7750 vm_map_entry_t entry;
7751
7752 submap_end = offset + (end - start);
7753 submap_start = offset;
7754
7755 vm_map_lock_read(sub_map);
7756 if (vm_map_lookup_entry(sub_map, offset, &entry)) {
7757 remove_size = (entry->vme_end - entry->vme_start);
7758 if (offset > entry->vme_start) {
7759 remove_size -= offset - entry->vme_start;
7760 }
7761
7762
7763 if (submap_end < entry->vme_end) {
7764 remove_size -=
7765 entry->vme_end - submap_end;
7766 }
7767 if (entry->is_sub_map) {
7768 vm_map_submap_pmap_clean(
7769 sub_map,
7770 start,
7771 start + remove_size,
7772 VME_SUBMAP(entry),
7773 VME_OFFSET(entry));
7774 } else {
7775 if (map->mapped_in_other_pmaps &&
7776 os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7777 VME_OBJECT(entry) != NULL) {
7778 vm_object_pmap_protect_options(
7779 VME_OBJECT(entry),
7780 (VME_OFFSET(entry) +
7781 offset -
7782 entry->vme_start),
7783 remove_size,
7784 PMAP_NULL,
7785 PAGE_SIZE,
7786 entry->vme_start,
7787 VM_PROT_NONE,
7788 PMAP_OPTIONS_REMOVE);
7789 } else {
7790 pmap_remove(map->pmap,
7791 (addr64_t)start,
7792 (addr64_t)(start + remove_size));
7793 }
7794 }
7795 }
7796
7797 entry = entry->vme_next;
7798
7799 while ((entry != vm_map_to_entry(sub_map))
7800 && (entry->vme_start < submap_end)) {
7801 remove_size = (entry->vme_end - entry->vme_start);
7802 if (submap_end < entry->vme_end) {
7803 remove_size -= entry->vme_end - submap_end;
7804 }
7805 if (entry->is_sub_map) {
7806 vm_map_submap_pmap_clean(
7807 sub_map,
7808 (start + entry->vme_start) - offset,
7809 ((start + entry->vme_start) - offset) + remove_size,
7810 VME_SUBMAP(entry),
7811 VME_OFFSET(entry));
7812 } else {
7813 if (map->mapped_in_other_pmaps &&
7814 os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7815 VME_OBJECT(entry) != NULL) {
7816 vm_object_pmap_protect_options(
7817 VME_OBJECT(entry),
7818 VME_OFFSET(entry),
7819 remove_size,
7820 PMAP_NULL,
7821 PAGE_SIZE,
7822 entry->vme_start,
7823 VM_PROT_NONE,
7824 PMAP_OPTIONS_REMOVE);
7825 } else {
7826 pmap_remove(map->pmap,
7827 (addr64_t)((start + entry->vme_start)
7828 - offset),
7829 (addr64_t)(((start + entry->vme_start)
7830 - offset) + remove_size));
7831 }
7832 }
7833 entry = entry->vme_next;
7834 }
7835 vm_map_unlock_read(sub_map);
7836 return;
7837 }
7838
7839 /*
7840 * virt_memory_guard_ast:
7841 *
7842 * Handle the AST callout for a virtual memory guard.
7843 * raise an EXC_GUARD exception and terminate the task
7844 * if configured to do so.
7845 */
7846 void
7847 virt_memory_guard_ast(
7848 thread_t thread,
7849 mach_exception_data_type_t code,
7850 mach_exception_data_type_t subcode)
7851 {
7852 task_t task = get_threadtask(thread);
7853 assert(task != kernel_task);
7854 assert(task == current_task());
7855 kern_return_t sync_exception_result;
7856 uint32_t behavior;
7857
7858 behavior = task->task_exc_guard;
7859
7860
7861 /* Is delivery enabled */
7862 if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7863 return;
7864 }
7865
7866 /* If only once, make sure we're that once */
7867 while (behavior & TASK_EXC_GUARD_VM_ONCE) {
7868 uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;
7869
7870 if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
7871 break;
7872 }
7873 behavior = task->task_exc_guard;
7874 if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7875 return;
7876 }
7877 }
7878
7879 const bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
7880 /* Raise exception synchronously and see if handler claimed it */
7881 sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode, fatal);
7882
7883 if (fatal) {
7884 /*
7885 * If Synchronous EXC_GUARD delivery was successful then
7886 * kill the process and return, else kill the process
7887 * and deliver the exception via EXC_CORPSE_NOTIFY.
7888 */
7889
7890
7891 int flags = PX_DEBUG_NO_HONOR;
7892 exception_info_t info = {
7893 .os_reason = OS_REASON_GUARD,
7894 .exception_type = EXC_GUARD,
7895 .mx_code = code,
7896 .mx_subcode = subcode
7897 };
7898
7899 if (sync_exception_result == KERN_SUCCESS) {
7900 flags |= PX_PSIGNAL;
7901 }
7902 exit_with_mach_exception(current_proc(), info, flags);
7903 } else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
7904 /*
7905 * If the synchronous EXC_GUARD delivery was not successful,
7906 * raise a simulated crash.
7907 */
7908 if (sync_exception_result != KERN_SUCCESS) {
7909 task_violated_guard(code, subcode, NULL, FALSE);
7910 }
7911 }
7912 }
7913
7914 /*
7915 * Validate policy for VM guard exceptions and encode the correct Mach exception
7916 * code and subcode if the policy allows delivering a guard exception here.
7917 */
7918 static bool
7919 vm_map_guard_exception_internal(
7920 vm_map_offset_t address,
7921 unsigned reason,
7922 mach_exception_code_t *code,
7923 mach_exception_data_type_t *subcode)
7924 {
7925 unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7926 unsigned int target = 0; /* should we pass in pid associated with map? */
7927
7928 task_t task = current_task_early();
7929
7930 /* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7931 if (task == NULL || task == kernel_task) {
7932 return false;
7933 }
7934
7935
7936 *code = 0;
7937 EXC_GUARD_ENCODE_TYPE(*code, guard_type);
7938 EXC_GUARD_ENCODE_FLAVOR(*code, reason);
7939 EXC_GUARD_ENCODE_TARGET(*code, target);
7940 *subcode = (uint64_t)address;
7941
7942 return true;
7943 }
7944
7945 /*
7946 * vm_map_guard_exception:
7947 *
7948 * Generate a GUARD_TYPE_VIRT_MEMORY EXC_GUARD exception.
7949 *
7950 * `reason` is kGUARD_EXC_DEALLOC_GAP when we find nothing mapped,
7951 * or when there is a gap in the mapping and a user address space
7952 * deallocation was requested. We report the address of the first gap found.
7953 */
7954
7955 void
7956 vm_map_guard_exception(
7957 vm_map_offset_t address,
7958 unsigned reason)
7959 {
7960 mach_exception_code_t code;
7961 mach_exception_data_type_t subcode;
7962 if (vm_map_guard_exception_internal(address, reason, &code, &subcode)) {
7963 task_t task = current_task();
7964 bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
7965
7966 thread_guard_violation(current_thread(), code, subcode, fatal);
7967 }
7968 }
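
/*
 * Usage sketch (a minimal illustration, mirroring vm_map_delete() below):
 * when a user deallocation request covers an unmapped gap and the caller
 * did not ask for gaps to fail outright, the gap is reported as:
 *
 *	vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
 *
 * The resulting thread guard violation is delivered later from the AST
 * by virt_memory_guard_ast() above.
 */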
7969
7970
7971 static kern_return_t
7972 vm_map_delete_submap_recurse(
7973 vm_map_t submap,
7974 vm_map_offset_t submap_start,
7975 vm_map_offset_t submap_end)
7976 {
7977 vm_map_entry_t submap_entry;
7978
7979 /*
7980 * Verify that the submap does not contain any "permanent" entries
7981 * within the specified range. We permit TPRO ranges to be overwritten
7982 * as we only reach this path if TPRO const protection is disabled for a
7983 * given map.
7984 *
7985 * We do not care about gaps.
7986 */
7987
7988 vm_map_lock(submap);
7989
7990 if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
7991 submap_entry = submap_entry->vme_next;
7992 }
7993
7994 for (;
7995 submap_entry != vm_map_to_entry(submap) &&
7996 submap_entry->vme_start < submap_end;
7997 submap_entry = submap_entry->vme_next) {
7998 if (submap_entry->vme_permanent
7999 #ifdef __arm64e__
8000 /* allow TPRO submap entries to be overwritten */
8001 && !submap_entry->used_for_tpro
8002 #endif
8003 ) {
8004 /* "permanent" entry -> fail */
8005 vm_map_unlock(submap);
8006 return KERN_PROTECTION_FAILURE;
8007 }
8008 }
8009 /* no "permanent" entries in the range -> success */
8010 vm_map_unlock(submap);
8011 return KERN_SUCCESS;
8012 }
8013
8014 __abortlike
8015 static void
8016 __vm_map_delete_misaligned_panic(
8017 vm_map_t map,
8018 vm_map_offset_t start,
8019 vm_map_offset_t end)
8020 {
8021 panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
8022 map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
8023 }
8024
8025 __abortlike
8026 static void
8027 __vm_map_delete_failed_panic(
8028 vm_map_t map,
8029 vm_map_offset_t start,
8030 vm_map_offset_t end,
8031 kern_return_t kr)
8032 {
8033 panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
8034 map, (uint64_t)start, (uint64_t)end, kr);
8035 }
8036
8037 __abortlike
8038 static void
8039 __vm_map_delete_gap_panic(
8040 vm_map_t map,
8041 vm_map_offset_t where,
8042 vm_map_offset_t start,
8043 vm_map_offset_t end)
8044 {
8045 panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
8046 map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
8047 }
8048
8049 __abortlike
8050 static void
8051 __vm_map_delete_permanent_panic(
8052 vm_map_t map,
8053 vm_map_offset_t start,
8054 vm_map_offset_t end,
8055 vm_map_entry_t entry)
8056 {
8057 panic("vm_map_delete(%p,0x%llx,0x%llx): "
8058 "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
8059 map, (uint64_t)start, (uint64_t)end, entry,
8060 (uint64_t)entry->vme_start,
8061 (uint64_t)entry->vme_end);
8062 }
8063
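/*
 * Internal state flags for vm_map_delete(), as used below:
 *
 *	VMDS_FOUND_GAP       a hole was found in the requested range
 *	VMDS_GAPS_OK         the map is being torn down; gaps are tolerated
 *	VMDS_KERNEL_PMAP     deleting from the kernel pmap; any error panics
 *	VMDS_NEEDS_LOOKUP    the map lock was dropped; re-lookup the entry
 *	VMDS_NEEDS_WAKEUP    wake up threads waiting on entries we touched
 *	VMDS_KERNEL_KMEMPTR  the range is in a kmem pointer range whose slot
 *	                     must be validated and freed
 */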
8064 __options_decl(vm_map_delete_state_t, uint32_t, {
8065 VMDS_NONE = 0x0000,
8066
8067 VMDS_FOUND_GAP = 0x0001,
8068 VMDS_GAPS_OK = 0x0002,
8069
8070 VMDS_KERNEL_PMAP = 0x0004,
8071 VMDS_NEEDS_LOOKUP = 0x0008,
8072 VMDS_NEEDS_WAKEUP = 0x0010,
8073 VMDS_KERNEL_KMEMPTR = 0x0020
8074 });
8075
8076 /*
8077 * vm_map_clamp_to_pmap(map, start, end)
8078 *
8079 * Modify *start and *end so they fall within the bounds of map->pmap.
8080 */
8081 #if MACH_ASSERT
8082 static void
8083 vm_map_clamp_to_pmap(vm_map_t map, vm_map_address_t *start, vm_map_address_t *end)
8084 {
8085 vm_map_address_t min;
8086 vm_map_address_t max;
8087
8088 #if __x86_64__
8089 /* x86_64 struct pmap does not have min and max fields */
8090 if (map->pmap == kernel_pmap) {
8091 min = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
8092 max = VM_MAX_KERNEL_ADDRESS;
8093 } else {
8094 min = VM_MAP_MIN_ADDRESS;
8095 max = VM_MAP_MAX_ADDRESS;
8096 }
8097 #else
8098 min = map->pmap->min;
8099 max = map->pmap->max;
8100 #endif
8101
8102 if (*start < min) {
8103 *start = min;
8104 } else if (*start > max) {
8105 *start = max;
8106 }
8107 if (*end < min) {
8108 *end = min;
8109 } else if (*end > max) {
8110 *end = max;
8111 }
8112 }
8113 #endif
8114
8115 int vm_log_map_delete_permanent_prot_none = 0;
8116 /*
8117 * vm_map_delete: [ internal use only ]
8118 *
8119 * Deallocates the given address range from the target map.
8120 * Removes all user wirings. Unwires one kernel wiring if
8121 * VM_MAP_REMOVE_KUNWIRE is set. Waits for kernel wirings to go
8122 * away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set. Sleeps
8123 * interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
8124 *
8125 *
8126 * When the map is a kernel map, any error in removing mappings
8127 * will lead to a panic so that clients do not have to repeat the panic
8128 * code at each call site. If VM_MAP_REMOVE_INTERRUPTIBLE
8129 * is also passed, then KERN_ABORTED will not lead to a panic.
8130 *
8131 * This routine is called with map locked and leaves map locked.
8132 */
8133 static kmem_return_t
8134 vm_map_delete(
8135 vm_map_t map,
8136 vm_map_offset_t start,
8137 vm_map_offset_t end,
8138 vmr_flags_t flags,
8139 kmem_guard_t guard,
8140 vm_map_zap_t zap_list)
8141 {
8142 vm_map_entry_t entry, next;
8143 int interruptible;
8144 vm_map_offset_t gap_start = 0;
8145 vm_map_offset_t clear_in_transition_end = 0;
8146 __unused vm_map_offset_t save_start = start;
8147 __unused vm_map_offset_t save_end = end;
8148 vm_map_delete_state_t state = VMDS_NONE;
8149 kmem_return_t ret = { };
8150 vm_map_range_id_t range_id = 0;
8151 struct kmem_page_meta *meta = NULL;
8152 uint32_t size_idx, slot_idx;
8153 struct mach_vm_range slot;
8154
8155 if (vm_map_pmap(map) == kernel_pmap) {
8156 state |= VMDS_KERNEL_PMAP;
8157 range_id = kmem_addr_get_range(start, end - start);
8158 if (kmem_is_ptr_range(range_id)) {
8159 state |= VMDS_KERNEL_KMEMPTR;
8160 slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
8161 &size_idx, &slot);
8162 }
8163 }
8164
8165 if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
8166 state |= VMDS_GAPS_OK;
8167 }
8168
8169 if (map->corpse_source &&
8170 !(flags & VM_MAP_REMOVE_TO_OVERWRITE) &&
8171 !map->terminated) {
8172 /*
8173 * The map is being used for corpses related diagnostics.
8174 * So skip any entry removal to avoid perturbing the map state.
8175 * The cleanup will happen in task_terminate_internal after the
8176 * call to task_port_no_senders.
8177 */
8178 goto out;
8179 }
8180
8181 interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
8182 THREAD_ABORTSAFE : THREAD_UNINT;
8183
8184 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
8185 (start & VM_MAP_PAGE_MASK(map))) {
8186 __vm_map_delete_misaligned_panic(map, start, end);
8187 }
8188
8189 if ((state & VMDS_GAPS_OK) == 0) {
8190 /*
8191 * If the map isn't terminated then all deletions must have
8192 * no gaps, and be within the [min, max) of the map.
8193 *
8194 * We got here without VM_MAP_RANGE_CHECK() being called,
8195 * and hence must validate bounds manually.
8196 *
8197 * It is worth noting that because vm_deallocate() will
8198 * round_page() the deallocation size, it's possible for "end"
8199 * to be 0 here due to overflow. We hence must treat it as being
8200 * beyond vm_map_max(map).
8201 *
8202 * Similarly, end < start means some wraparound happened,
8203 * which should cause an error or panic.
8204 */
8205 if (end == 0 || end > vm_map_max(map)) {
8206 state |= VMDS_FOUND_GAP;
8207 gap_start = vm_map_max(map);
8208 if (state & VMDS_KERNEL_PMAP) {
8209 __vm_map_delete_gap_panic(map,
8210 gap_start, start, end);
8211 }
8212 goto out;
8213 }
8214
8215 if (end < start) {
8216 if (state & VMDS_KERNEL_PMAP) {
8217 __vm_map_delete_gap_panic(map,
8218 vm_map_max(map), start, end);
8219 }
8220 ret.kmr_return = KERN_INVALID_ARGUMENT;
8221 goto out;
8222 }
8223
8224 if (start < vm_map_min(map)) {
8225 state |= VMDS_FOUND_GAP;
8226 gap_start = start;
8227 if (state & VMDS_KERNEL_PMAP) {
8228 __vm_map_delete_gap_panic(map,
8229 gap_start, start, end);
8230 }
8231 goto out;
8232 }
8233 } else {
8234 /*
8235 * If the map is terminated, we must accept start/end
8236 * being beyond the boundaries of the map as this is
8237 * how some of the mappings like commpage mappings
8238 * can be destroyed (they're outside of those bounds).
8239 *
8240 * end < start is still something we can't cope with,
8241 * so just bail.
8242 */
8243 if (end < start) {
8244 goto out;
8245 }
8246 }
8247
8248
8249 /*
8250 * Find the start of the region.
8251 *
8252 * If in a superpage, extend the range
8253 * to include the start of the mapping.
8254 */
8255 while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8256 if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8257 start = SUPERPAGE_ROUND_DOWN(start);
8258 } else {
8259 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8260 break;
8261 }
8262 }
8263
8264 if (entry->superpage_size) {
8265 end = SUPERPAGE_ROUND_UP(end);
8266 }
8267
8268 /*
8269 * Step through all entries in this region
8270 */
8271 for (vm_map_offset_t s = start; s < end;) {
8272 /*
8273 * At this point, we have deleted all the memory entries
8274 * in [start, s) and are proceeding with the [s, end) range.
8275 *
8276 * This loop might drop the map lock, and it is possible that
8277 * some memory was already reallocated within [start, s)
8278 * and we don't want to mess with those entries.
8279 *
8280 * Some of those entries could even have been re-assembled
8281 * with an entry after "s" (in vm_map_simplify_entry()), so
8282 * we may have to vm_map_clip_start() again.
8283 *
8284 * When clear_in_transition_end is set, we had marked
8285 * [start, clear_in_transition_end) as "in_transition"
8286 * during a previous iteration and we need to clear it.
8287 */
8288
8289 /*
8290 * Step 1: If needed (because we dropped locks),
8291 * lookup the entry again.
8292 *
8293 * If we're coming back from unwiring (Step 5),
8294 * we also need to mark the entries as no longer
8295 * in transition after that.
8296 */
8297
8298 if (state & VMDS_NEEDS_LOOKUP) {
8299 state &= ~VMDS_NEEDS_LOOKUP;
8300
8301 if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8302 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8303 }
8304
8305 if (state & VMDS_KERNEL_KMEMPTR) {
8306 kmem_validate_slot(s, meta, size_idx, slot_idx);
8307 }
8308 }
8309
8310 if (clear_in_transition_end) {
8311 for (vm_map_entry_t it = entry;
8312 it != vm_map_to_entry(map) &&
8313 it->vme_start < clear_in_transition_end;
8314 it = it->vme_next) {
8315 assert(it->in_transition);
8316 it->in_transition = FALSE;
8317 if (it->needs_wakeup) {
8318 it->needs_wakeup = FALSE;
8319 state |= VMDS_NEEDS_WAKEUP;
8320 }
8321 }
8322
8323 clear_in_transition_end = 0;
8324 }
8325
8326
8327 /*
8328 * Step 2: Perform various policy checks
8329 * before we do _anything_ to this entry.
8330 */
8331
8332 if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8333 if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8334 /*
8335 * Either we found a gap already,
8336 * or we are tearing down a map,
8337 * keep going.
8338 */
8339 } else if (state & VMDS_KERNEL_PMAP) {
8340 __vm_map_delete_gap_panic(map, s, start, end);
8341 } else if (s < end) {
8342 state |= VMDS_FOUND_GAP;
8343 gap_start = s;
8344 }
8345
8346 if (entry == vm_map_to_entry(map) ||
8347 end <= entry->vme_start) {
8348 break;
8349 }
8350
8351 s = entry->vme_start;
8352 }
8353
8354 if (state & VMDS_KERNEL_PMAP) {
8355 /*
8356 * In the kernel map and its submaps,
8357 * permanent entries never die, even
8358 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8359 */
8360 if (entry->vme_permanent) {
8361 __vm_map_delete_permanent_panic(map, start, end, entry);
8362 }
8363
8364 if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8365 end = entry->vme_end;
8366 flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8367 }
8368
8369 /*
8370 * In the kernel map and its submaps,
8371 * the removal of an atomic/guarded entry is strict.
8372 *
8373 * An atomic entry is processed only if it was
8374 * specifically targeted.
8375 *
8376 * We might have deleted non-atomic entries before
8377 * we reach this point, however...
8378 */
8379 kmem_entry_validate_guard(map, entry,
8380 start, end - start, guard);
8381 }
8382
8383 /*
8384 * Step 2.1: handle "permanent" and "submap" entries
8385 * *before* clipping to avoid triggering some unnecessary
8386 * un-nesting of the shared region.
8387 */
8388 if (entry->vme_permanent && entry->is_sub_map) {
8389 // printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8390 /*
8391 * Un-mapping a "permanent" mapping of a user-space
8392 * submap is not allowed unless...
8393 */
8394 if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8395 /*
8396 * a. explicitly requested by the kernel caller.
8397 */
8398 // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8399 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8400 developer_mode_state()) {
8401 /*
8402 * b. we're in "developer" mode (for
8403 * breakpoints, dtrace probes, ...).
8404 */
8405 // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8406 } else if (map->terminated) {
8407 /*
8408 * c. this is the final address space cleanup.
8409 */
8410 // printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8411 } else {
8412 vm_map_offset_t submap_start, submap_end;
8413 kern_return_t submap_kr;
8414
8415 /*
8416 * Check if there are any "permanent" mappings
8417 * in this range in the submap.
8418 */
8419 if (entry->in_transition) {
8420 /* can that even happen ? */
8421 goto in_transition;
8422 }
8423 /* compute the clipped range in the submap */
8424 submap_start = s - entry->vme_start;
8425 submap_start += VME_OFFSET(entry);
8426 submap_end = end - entry->vme_start;
8427 submap_end += VME_OFFSET(entry);
8428 submap_kr = vm_map_delete_submap_recurse(
8429 VME_SUBMAP(entry),
8430 submap_start,
8431 submap_end);
8432 if (submap_kr != KERN_SUCCESS) {
8433 /*
8434 * There are some "permanent" mappings
8435 * in the submap: we are not allowed
8436 * to remove this range.
8437 */
8438 printf("%d[%s] removing permanent submap entry "
8439 "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8440 proc_selfpid(),
8441 (get_bsdtask_info(current_task())
8442 ? proc_name_address(get_bsdtask_info(current_task()))
8443 : "?"), entry,
8444 (uint64_t)entry->vme_start,
8445 (uint64_t)entry->vme_end,
8446 entry->protection,
8447 entry->max_protection);
8448 DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8449 vm_map_entry_t, entry,
8450 vm_map_offset_t, entry->vme_start,
8451 vm_map_offset_t, entry->vme_end,
8452 vm_prot_t, entry->protection,
8453 vm_prot_t, entry->max_protection,
8454 int, VME_ALIAS(entry));
8455 ret.kmr_return = KERN_PROTECTION_FAILURE;
8456 goto out;
8457 }
8458 /* no permanent mappings: proceed */
8459 }
8460 }
8461
8462 /*
8463 * Step 3: Perform any clipping needed.
8464 *
8465 * After this, "entry" starts at "s", ends before "end"
8466 */
8467
8468 if (entry->vme_start < s) {
8469 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8470 entry->map_aligned &&
8471 !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8472 /*
8473 * The entry will no longer be map-aligned
8474 * after clipping and the caller said it's OK.
8475 */
8476 entry->map_aligned = FALSE;
8477 }
8478 vm_map_clip_start(map, entry, s);
8479 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8480 }
8481
8482 if (end < entry->vme_end) {
8483 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8484 entry->map_aligned &&
8485 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8486 /*
8487 * The entry will no longer be map-aligned
8488 * after clipping and the caller said it's OK.
8489 */
8490 entry->map_aligned = FALSE;
8491 }
8492 vm_map_clip_end(map, entry, end);
8493 }
8494
8495 if (entry->vme_permanent && entry->is_sub_map) {
8496 /*
8497 * We already went through step 2.1 which did not deny
8498 * the removal of this "permanent" and "is_sub_map"
8499 * entry.
8500 * Now that we've clipped what we actually want to
8501 * delete, undo the "permanent" part to allow the
8502 * removal to proceed.
8503 */
8504 DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8505 vm_map_entry_t, entry,
8506 vm_map_offset_t, entry->vme_start,
8507 vm_map_offset_t, entry->vme_end,
8508 vm_prot_t, entry->protection,
8509 vm_prot_t, entry->max_protection,
8510 int, VME_ALIAS(entry));
8511 entry->vme_permanent = false;
8512 }
8513
8514 assert(s == entry->vme_start);
8515 assert(entry->vme_end <= end);
8516
8517
8518 /*
8519 * Step 4: If the entry is in flux, wait for this to resolve.
8520 */
8521
8522 if (entry->in_transition) {
8523 wait_result_t wait_result;
8524
8525 in_transition:
8526 /*
8527 * Another thread is wiring/unwiring this entry.
8528 * Let the other thread know we are waiting.
8529 */
8530
8531 entry->needs_wakeup = TRUE;
8532
8533 /*
8534 * wake up anybody waiting on entries that we have
8535 * already unwired/deleted.
8536 */
8537 if (state & VMDS_NEEDS_WAKEUP) {
8538 vm_map_entry_wakeup(map);
8539 state &= ~VMDS_NEEDS_WAKEUP;
8540 }
8541
8542 wait_result = vm_map_entry_wait(map, interruptible);
8543
8544 if (interruptible &&
8545 wait_result == THREAD_INTERRUPTED) {
8546 /*
8547 * We do not clear the needs_wakeup flag,
8548 * since we cannot tell if we were the only one.
8549 */
8550 ret.kmr_return = KERN_ABORTED;
8551 return ret;
8552 }
8553
8554 /*
8555 * The entry could have been clipped or it
8556 * may not exist anymore. Look it up again.
8557 */
8558 state |= VMDS_NEEDS_LOOKUP;
8559 continue;
8560 }
8561
8562
8563 /*
8564 * Step 5: Handle wiring
8565 */
8566
8567 if (entry->wired_count) {
8568 struct vm_map_entry tmp_entry;
8569 boolean_t user_wire;
8570 unsigned int last_timestamp;
8571
8572 user_wire = entry->user_wired_count > 0;
8573
8574 /*
8575 * Remove a kernel wiring if requested
8576 */
8577 if (flags & VM_MAP_REMOVE_KUNWIRE) {
8578 entry->wired_count--;
8579 vme_btref_consider_and_put(entry);
8580 }
8581
8582 /*
8583 * Remove all user wirings for proper accounting
8584 */
8585 while (entry->user_wired_count) {
8586 subtract_wire_counts(map, entry, user_wire);
8587 }
8588
8589 /*
8590 * All our DMA I/O operations in IOKit are currently
8591 * done by wiring through the map entries of the task
8592 * requesting the I/O.
8593 *
8594 * Because of this, we must always wait for kernel wirings
8595 * to go away on the entries before deleting them.
8596 *
8597 * Any caller who wants to actually remove a kernel wiring
8598 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8599 * properly remove one wiring instead of blasting through
8600 * them all.
8601 */
8602 if (entry->wired_count != 0) {
8603 assert(map != kernel_map);
8604 /*
8605 * Cannot continue. Typical case is when
8606 * a user thread has physical I/O pending
8607 * on this page. Either wait for the
8608 * kernel wiring to go away or return an
8609 * error.
8610 */
8611 wait_result_t wait_result;
8612
8613 entry->needs_wakeup = TRUE;
8614 wait_result = vm_map_entry_wait(map,
8615 interruptible);
8616
8617 if (interruptible &&
8618 wait_result == THREAD_INTERRUPTED) {
8619 /*
8620 * We do not clear the
8621 * needs_wakeup flag, since we
8622 * cannot tell if we were the
8623 * only one.
8624 */
8625 ret.kmr_return = KERN_ABORTED;
8626 return ret;
8627 }
8628
8629
8630 /*
8631 * The entry could have been clipped or
8632 * it may not exist anymore. Look it
8633 * up again.
8634 */
8635 state |= VMDS_NEEDS_LOOKUP;
8636 continue;
8637 }
8638
8639 /*
8640 * We can unlock the map now.
8641 *
8642 * The entry might be split once we unlock the map,
8643 * but we need the range as defined by this entry
8644 * to be stable. So we must make a local copy.
8645 *
8646 * The underlying objects do not change during clips,
8647 * and the in_transition state guarantees existence
8648 * of the entry.
8649 */
8650 last_timestamp = map->timestamp;
8651 entry->in_transition = TRUE;
8652 tmp_entry = *entry;
8653 vm_map_unlock(map);
8654
8655 if (tmp_entry.is_sub_map) {
8656 vm_map_t sub_map;
8657 vm_map_offset_t sub_start, sub_end;
8658 pmap_t pmap;
8659 vm_map_offset_t pmap_addr;
8660
8661
8662 sub_map = VME_SUBMAP(&tmp_entry);
8663 sub_start = VME_OFFSET(&tmp_entry);
8664 sub_end = sub_start + (tmp_entry.vme_end -
8665 tmp_entry.vme_start);
8666 if (tmp_entry.use_pmap) {
8667 pmap = sub_map->pmap;
8668 pmap_addr = tmp_entry.vme_start;
8669 } else {
8670 pmap = map->pmap;
8671 pmap_addr = tmp_entry.vme_start;
8672 }
8673 (void) vm_map_unwire_nested(sub_map,
8674 sub_start, sub_end,
8675 user_wire,
8676 pmap, pmap_addr);
8677 } else {
8678 vm_map_offset_t entry_end = tmp_entry.vme_end;
8679 vm_map_offset_t max_end;
8680
8681 if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8682 max_end = end - VM_MAP_PAGE_SIZE(map);
8683 if (entry_end > max_end) {
8684 entry_end = max_end;
8685 }
8686 }
8687
8688 if (tmp_entry.vme_kernel_object) {
8689 pmap_protect_options(
8690 map->pmap,
8691 tmp_entry.vme_start,
8692 entry_end,
8693 VM_PROT_NONE,
8694 PMAP_OPTIONS_REMOVE,
8695 NULL);
8696 }
8697 vm_fault_unwire(map, &tmp_entry,
8698 tmp_entry.vme_kernel_object, map->pmap,
8699 tmp_entry.vme_start, entry_end);
8700 }
8701
8702 vm_map_lock(map);
8703
8704 /*
8705 * Unwiring happened, we can now go back to deleting
8706 * them (after we clear the in_transition bit for the range).
8707 */
8708 if (last_timestamp + 1 != map->timestamp) {
8709 state |= VMDS_NEEDS_LOOKUP;
8710 }
8711 clear_in_transition_end = tmp_entry.vme_end;
8712 continue;
8713 }
8714
8715 assert(entry->wired_count == 0);
8716 assert(entry->user_wired_count == 0);
8717
8718
8719 /*
8720 * Step 6: Entry is unwired and ready for us to delete !
8721 */
8722
8723 if (!entry->vme_permanent) {
8724 /*
8725 * Typical case: the entry really shouldn't be permanent
8726 */
8727 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8728 (entry->protection & VM_PROT_EXECUTE) &&
8729 developer_mode_state()) {
8730 /*
8731 * Allow debuggers to undo executable mappings
8732 * when developer mode is on.
8733 */
8734 #if 0
8735 printf("FBDP %d[%s] removing permanent executable entry "
8736 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8737 proc_selfpid(),
8738 (current_task()->bsd_info
8739 ? proc_name_address(current_task()->bsd_info)
8740 : "?"), entry,
8741 (uint64_t)entry->vme_start,
8742 (uint64_t)entry->vme_end,
8743 entry->protection,
8744 entry->max_protection);
8745 #endif
8746 entry->vme_permanent = FALSE;
8747 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8748 #if 0
8749 printf("FBDP %d[%s] removing permanent entry "
8750 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8751 proc_selfpid(),
8752 (current_task()->bsd_info
8753 ? proc_name_address(current_task()->bsd_info)
8754 : "?"), entry,
8755 (uint64_t)entry->vme_start,
8756 (uint64_t)entry->vme_end,
8757 entry->protection,
8758 entry->max_protection);
8759 #endif
8760 entry->vme_permanent = FALSE;
8761 #if CODE_SIGNING_MONITOR
8762 } else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8763 entry->vme_permanent = FALSE;
8764
8765 printf("%d[%s] %s(0x%llx,0x%llx): "
8766 "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8767 "prot 0x%x/0x%x\n",
8768 proc_selfpid(),
8769 (get_bsdtask_info(current_task())
8770 ? proc_name_address(get_bsdtask_info(current_task()))
8771 : "?"),
8772 __FUNCTION__,
8773 (uint64_t)start,
8774 (uint64_t)end,
8775 (uint64_t)entry->vme_start,
8776 (uint64_t)entry->vme_end,
8777 entry->protection,
8778 entry->max_protection);
8779 #endif
8780 } else {
8781 DTRACE_VM6(vm_map_delete_permanent,
8782 vm_map_entry_t, entry,
8783 vm_map_offset_t, entry->vme_start,
8784 vm_map_offset_t, entry->vme_end,
8785 vm_prot_t, entry->protection,
8786 vm_prot_t, entry->max_protection,
8787 int, VME_ALIAS(entry));
8788 }
8789
8790 if (entry->is_sub_map) {
8791 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8792 "map %p (%d) entry %p submap %p (%d)\n",
8793 map, VM_MAP_PAGE_SHIFT(map), entry,
8794 VME_SUBMAP(entry),
8795 VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8796 if (entry->use_pmap) {
8797 #ifndef NO_NESTED_PMAP
8798 int pmap_flags;
8799
8800 if (map->terminated) {
8801 /*
8802 * This is the final cleanup of the
8803 * address space being terminated.
8804 * No new mappings are expected and
8805 * we don't really need to unnest the
8806 * shared region (and lose the "global"
8807 * pmap mappings, if applicable).
8808 *
8809 * Tell the pmap layer that we're
8810 * "clean" wrt nesting.
8811 */
8812 pmap_flags = PMAP_UNNEST_CLEAN;
8813 } else {
8814 /*
8815 * We're unmapping part of the nested
8816 * shared region, so we can't keep the
8817 * nested pmap.
8818 */
8819 pmap_flags = 0;
8820 }
8821 pmap_unnest_options(
8822 map->pmap,
8823 (addr64_t)entry->vme_start,
8824 entry->vme_end - entry->vme_start,
8825 pmap_flags);
8826 #endif /* NO_NESTED_PMAP */
8827 if (map->mapped_in_other_pmaps &&
8828 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8829 /* clean up parent map/maps */
8830 vm_map_submap_pmap_clean(
8831 map, entry->vme_start,
8832 entry->vme_end,
8833 VME_SUBMAP(entry),
8834 VME_OFFSET(entry));
8835 }
8836 } else {
8837 vm_map_submap_pmap_clean(
8838 map, entry->vme_start, entry->vme_end,
8839 VME_SUBMAP(entry),
8840 VME_OFFSET(entry));
8841 }
8842 } else if (entry->vme_kernel_object ||
8843 VME_OBJECT(entry) == compressor_object) {
8844 /*
8845 * nothing to do
8846 */
8847 } else if (map->mapped_in_other_pmaps &&
8848 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8849 vm_object_pmap_protect_options(
8850 VME_OBJECT(entry), VME_OFFSET(entry),
8851 entry->vme_end - entry->vme_start,
8852 PMAP_NULL,
8853 PAGE_SIZE,
8854 entry->vme_start,
8855 VM_PROT_NONE,
8856 PMAP_OPTIONS_REMOVE);
8857 } else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8858 (state & VMDS_KERNEL_PMAP)) {
8859 /* Remove translations associated
8860 * with this range unless the entry
8861 * does not have an object, or
8862 * it's the kernel map or a descendant
8863 * since the platform could potentially
8864 * create "backdoor" mappings invisible
8865 * to the VM. It is expected that
8866 * objectless, non-kernel ranges
8867 * do not have such VM invisible
8868 * translations.
8869 */
8870 vm_map_address_t remove_start = entry->vme_start;
8871 vm_map_address_t remove_end = entry->vme_end;
8872 #if MACH_ASSERT
8873 /*
8874 * Prevent panics in pmap_remove() from some vm test code
8875 * which uses virtual address ranges that pmap disallows.
8876 */
8877 if (thread_get_test_option(test_option_vm_map_clamp_pmap_remove)) {
8878 vm_map_clamp_to_pmap(map, &remove_start, &remove_end);
8879 }
8880 #endif /* MACH_ASSERT */
8881 pmap_remove(map->pmap, remove_start, remove_end);
8882 }
8883
8884 #if DEBUG
8885 /*
8886 * All pmap mappings for this map entry must have been
8887 * cleared by now.
8888 */
8889 assert(pmap_is_empty(map->pmap,
8890 entry->vme_start,
8891 entry->vme_end));
8892 #endif /* DEBUG */
8893
8894 if (entry->iokit_acct) {
8895 /* alternate accounting */
8896 DTRACE_VM4(vm_map_iokit_unmapped_region,
8897 vm_map_t, map,
8898 vm_map_offset_t, entry->vme_start,
8899 vm_map_offset_t, entry->vme_end,
8900 int, VME_ALIAS(entry));
8901 vm_map_iokit_unmapped_region(map,
8902 (entry->vme_end -
8903 entry->vme_start));
8904 entry->iokit_acct = FALSE;
8905 entry->use_pmap = FALSE;
8906 }
8907
8908 /* move "s" forward */
8909 s = entry->vme_end;
8910 next = entry->vme_next;
8911 if (!entry->map_aligned) {
8912 vm_map_offset_t rounded_s;
8913
8914 /*
8915 * Skip artificial gap due to mis-aligned entry
8916 * on devices with a page size smaller than the
8917 * map's page size (i.e. 16k task on a 4k device).
8918 */
8919 rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8920 if (next == vm_map_to_entry(map)) {
8921 s = rounded_s;
8922 } else if (s < rounded_s) {
8923 s = MIN(rounded_s, next->vme_start);
8924 }
8925 }
8926 ret.kmr_size += s - entry->vme_start;
8927
8928 if (entry->vme_permanent) {
8929 /*
8930 * A permanent entry can not be removed, so leave it
8931 * in place but remove all access permissions.
8932 */
8933 if (__improbable(vm_log_map_delete_permanent_prot_none)) {
8934 printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8935 __FUNCTION__, __LINE__,
8936 proc_selfpid(),
8937 (get_bsdtask_info(current_task())
8938 ? proc_name_address(get_bsdtask_info(current_task()))
8939 : "?"),
8940 map,
8941 entry,
8942 (uint64_t)entry->vme_start,
8943 (uint64_t)entry->vme_end,
8944 entry->is_sub_map,
8945 entry->protection,
8946 entry->max_protection);
8947 }
8948 DTRACE_VM6(vm_map_delete_permanent_prot_none,
8949 vm_map_entry_t, entry,
8950 vm_map_offset_t, entry->vme_start,
8951 vm_map_offset_t, entry->vme_end,
8952 vm_prot_t, entry->protection,
8953 vm_prot_t, entry->max_protection,
8954 int, VME_ALIAS(entry));
8955 entry->protection = VM_PROT_NONE;
8956 entry->max_protection = VM_PROT_NONE;
8957 #ifdef __arm64e__
8958 entry->used_for_tpro = FALSE;
8959 #endif
8960 } else {
8961 vm_map_entry_zap(map, entry, zap_list);
8962 }
8963
8964 entry = next;
8965 next = VM_MAP_ENTRY_NULL;
8966
8967 if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8968 unsigned int last_timestamp = map->timestamp++;
8969
8970 if (lck_rw_lock_yield_exclusive(&map->lock,
8971 LCK_RW_YIELD_ANY_WAITER)) {
8972 if (last_timestamp != map->timestamp + 1) {
8973 state |= VMDS_NEEDS_LOOKUP;
8974 }
8975 } else {
8976 /* we didn't yield, undo our change */
8977 map->timestamp--;
8978 }
8979 }
8980 }
8981
8982 if (map->wait_for_space) {
8983 thread_wakeup((event_t) map);
8984 }
8985
8986 if (state & VMDS_NEEDS_WAKEUP) {
8987 vm_map_entry_wakeup(map);
8988 }
8989
8990 out:
8991 if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
8992 __vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8993 }
8994
8995 if (state & VMDS_KERNEL_KMEMPTR) {
8996 kmem_free_space(start, end, range_id, &slot);
8997 }
8998
8999 if (state & VMDS_FOUND_GAP) {
9000 DTRACE_VM3(kern_vm_deallocate_gap,
9001 vm_map_offset_t, gap_start,
9002 vm_map_offset_t, save_start,
9003 vm_map_offset_t, save_end);
9004 if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
9005 ret.kmr_return = KERN_INVALID_VALUE;
9006 } else {
9007 vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
9008 }
9009 }
9010
9011 return ret;
9012 }
9013
9014 kmem_return_t
9015 vm_map_remove_and_unlock(
9016 vm_map_t map,
9017 vm_map_offset_t start,
9018 vm_map_offset_t end,
9019 vmr_flags_t flags,
9020 kmem_guard_t guard)
9021 {
9022 kmem_return_t ret;
9023 VM_MAP_ZAP_DECLARE(zap);
9024
9025 ret = vm_map_delete(map, start, end, flags, guard, &zap);
9026 vm_map_unlock(map);
9027
9028 vm_map_zap_dispose(&zap);
9029
9030 return ret;
9031 }
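
/*
 * Typical removal pattern (a minimal sketch; the flag and guard values
 * are illustrative):
 *
 *	vm_map_lock(map);
 *	kmr = vm_map_remove_and_unlock(map, start, end,
 *	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
 *
 * The deleted entries are accumulated on a zap list while the map lock
 * is held and only torn down by vm_map_zap_dispose() after the lock is
 * dropped, which keeps object deallocation out of the map-locked path.
 */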
9032
9033 /*
9034 * vm_map_remove_guard:
9035 *
9036 * Remove the given address range from the target map.
9037 * This is the exported form of vm_map_delete.
9038 */
9039 kmem_return_t
9040 vm_map_remove_guard(
9041 vm_map_t map,
9042 vm_map_offset_t start,
9043 vm_map_offset_t end,
9044 vmr_flags_t flags,
9045 kmem_guard_t guard)
9046 {
9047 vm_map_lock(map);
9048 return vm_map_remove_and_unlock(map, start, end, flags, guard);
9049 }
9050
9051
9052 /*
9053 * vm_map_setup:
9054 *
9055 * Perform any required setup on a new task's map. Must be called before the task
9056 * is enabled for IPC access, since after this point other threads may be able
9057 * to look up the task port and make VM API calls.
9058 */
9059 void
9060 vm_map_setup(vm_map_t map, task_t task)
9061 {
9062 /*
9063 * map does NOT take a reference on owning_task. If the map has terminated,
9064 * it is possible that the pointer is NULL, so reads of owning_task must
9065 * happen under the map lock and explicitly check for NULL.
9066 */
9067 vm_map_lock(map);
9068 assert(!map->owning_task);
9069 map->owning_task = task;
9070 vm_map_unlock(map);
9071 #if CONFIG_DEFERRED_RECLAIM
9072 vm_deferred_reclamation_metadata_t vdrm = task->deferred_reclamation_metadata;
9073 if (vdrm) {
9074 vm_deferred_reclamation_task_fork_register(vdrm);
9075 }
9076 #endif /* CONFIG_DEFERRED_RECLAIM */
9077 }
9078
9079 /*
9080 * vm_map_terminate:
9081 *
9082 * Clean out a task's map.
9083 */
9084 kern_return_t
9085 vm_map_terminate(
9086 vm_map_t map)
9087 {
9088 vm_map_lock(map);
9089 map->terminated = TRUE;
9090 map->owning_task = NULL;
9091 vm_map_disable_hole_optimization(map);
9092 (void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
9093 VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
9094 return KERN_SUCCESS;
9095 }
9096
9097 /*
9098 * Routine: vm_map_copy_allocate
9099 *
9100 * Description:
9101 * Allocates and initializes a map copy object.
9102 */
9103 static vm_map_copy_t
9104 vm_map_copy_allocate(uint16_t type)
9105 {
9106 vm_map_copy_t new_copy;
9107
9108 new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
9109 new_copy->type = type;
9110 if (type == VM_MAP_COPY_ENTRY_LIST) {
9111 new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
9112 vm_map_store_init(&new_copy->cpy_hdr);
9113 }
9114 return new_copy;
9115 }
9116
9117 /*
9118 * Routine: vm_map_copy_discard
9119 *
9120 * Description:
9121 * Dispose of a map copy object (returned by
9122 * vm_map_copyin).
9123 */
9124 void
9125 vm_map_copy_discard(
9126 vm_map_copy_t copy)
9127 {
9128 if (copy == VM_MAP_COPY_NULL) {
9129 return;
9130 }
9131
9132 /*
9133 * Assert that the vm_map_copy is coming from the right
9134 * zone and hasn't been forged
9135 */
9136 vm_map_copy_require(copy);
9137
9138 switch (copy->type) {
9139 case VM_MAP_COPY_ENTRY_LIST:
9140 while (vm_map_copy_first_entry(copy) !=
9141 vm_map_copy_to_entry(copy)) {
9142 vm_map_entry_t entry = vm_map_copy_first_entry(copy);
9143
9144 vm_map_copy_entry_unlink(copy, entry);
9145 if (entry->is_sub_map) {
9146 vm_map_deallocate(VME_SUBMAP(entry));
9147 } else {
9148 vm_object_deallocate(VME_OBJECT(entry));
9149 }
9150 vm_map_copy_entry_dispose(entry);
9151 }
9152 break;
9153 case VM_MAP_COPY_KERNEL_BUFFER:
9154
9155 /*
9156 * The data buffer was allocated separately with kalloc_data();
9157 * free it here. The vm_map_copy_t itself came from its zone
9158 * and is returned to it below.
9159 */
9160 if (copy->size > msg_ool_size_small || copy->offset) {
9161 panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
9162 (long long)copy->size, (long long)copy->offset);
9163 }
9164 kfree_data(copy->cpy_kdata, copy->size);
9165 }
9166 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
9167 }
9168
9169 #if XNU_PLATFORM_MacOSX
9170
9171 __exported
9172 extern vm_map_copy_t vm_map_copy_copy(vm_map_copy_t copy);
9173
9174 /*
9175 * Routine: vm_map_copy_copy
9176 *
9177 * Description:
9178 * Move the information in a map copy object to
9179 * a new map copy object, leaving the old one
9180 * empty.
9181 *
9182 * This is used by kernel routines that need
9183 * to look at out-of-line data (in copyin form)
9184 * before deciding whether to return SUCCESS.
9185 * If the routine returns FAILURE, the original
9186 * copy object will be deallocated; therefore,
9187 * these routines must make a copy of the copy
9188 * object and leave the original empty so that
9189 * deallocation will not fail.
9190 */
9191 vm_map_copy_t
9192 vm_map_copy_copy(
9193 vm_map_copy_t copy)
9194 {
9195 vm_map_copy_t new_copy;
9196
9197 if (copy == VM_MAP_COPY_NULL) {
9198 return VM_MAP_COPY_NULL;
9199 }
9200
9201 /*
9202 * Assert that the vm_map_copy is coming from the right
9203 * zone and hasn't been forged
9204 */
9205 vm_map_copy_require(copy);
9206
9207 /*
9208 * Allocate a new copy object, and copy the information
9209 * from the old one into it.
9210 */
9211
9212 new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9213 memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
9214 #if __has_feature(ptrauth_calls)
9215 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9216 new_copy->cpy_kdata = copy->cpy_kdata;
9217 }
9218 #endif
9219
9220 if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
9221 /*
9222 * The links in the entry chain must be
9223 * changed to point to the new copy object.
9224 */
9225 vm_map_copy_first_entry(copy)->vme_prev
9226 = vm_map_copy_to_entry(new_copy);
9227 vm_map_copy_last_entry(copy)->vme_next
9228 = vm_map_copy_to_entry(new_copy);
9229 }
9230
9231 /*
9232 * Change the old copy object into one that contains
9233 * nothing to be deallocated.
9234 */
9235 bzero(copy, sizeof(struct vm_map_copy));
9236 copy->type = VM_MAP_COPY_KERNEL_BUFFER;
9237
9238 /*
9239 * Return the new object.
9240 */
9241 return new_copy;
9242 }
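
/*
 * Hypothetical usage sketch of the pattern described above (helper name
 * and condition are assumptions for illustration only):
 *
 *	// Take ownership of the out-of-line data before a possible
 *	// failure return:
 *	vm_map_copy_t my_copy = vm_map_copy_copy(copy);
 *
 *	if (validation_fails) {
 *		vm_map_copy_discard(my_copy);
 *		// The original "copy" is now empty, so the caller's
 *		// deallocation of it on failure is harmless.
 *		return KERN_FAILURE;
 *	}
 */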
9243
9244 #endif /* XNU_PLATFORM_MacOSX */
9245
9246 static boolean_t
9247 vm_map_entry_is_overwritable(
9248 vm_map_t dst_map __unused,
9249 vm_map_entry_t entry)
9250 {
9251 if (!(entry->protection & VM_PROT_WRITE)) {
9252 /* can't overwrite if not writable */
9253 return FALSE;
9254 }
9255 #if !__x86_64__
9256 if (entry->used_for_jit &&
9257 vm_map_cs_enforcement(dst_map) &&
9258 !dst_map->cs_debugged) {
9259 /*
9260 * Can't overwrite a JIT region while cs_enforced
9261 * and not cs_debugged.
9262 */
9263 return FALSE;
9264 }
9265
9266 #if __arm64e__
9267 /* Do not allow overwrite HW assisted TPRO entries */
9268 if (entry->used_for_tpro) {
9269 return FALSE;
9270 }
9271 #endif /* __arm64e__ */
9272
9273 if (entry->vme_permanent) {
9274 if (entry->is_sub_map) {
9275 /*
9276 * We can't tell if the submap contains "permanent"
9277 * entries within the range targeted by the caller.
9278 * The caller will have to check for that with
9279 * vm_map_overwrite_submap_recurse() for example.
9280 */
9281 } else {
9282 /*
9283 * Do not allow overwriting of a "permanent"
9284 * entry.
9285 */
9286 DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
9287 vm_map_entry_t, entry,
9288 vm_map_offset_t, entry->vme_start,
9289 vm_map_offset_t, entry->vme_end,
9290 vm_prot_t, entry->protection,
9291 vm_prot_t, entry->max_protection,
9292 int, VME_ALIAS(entry));
9293 return FALSE;
9294 }
9295 }
9296 #endif /* !__x86_64__ */
9297
9298 if (entry->is_sub_map) {
9299 /* remember not to assume every entry has a VM object... */
9300 }
9301
9302
9303 return TRUE;
9304 }
9305
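/*
 * Routine:	vm_map_overwrite_submap_recurse
 *
 * Description:
 *	Pre-flight check for overwriting [dst_addr, dst_addr + dst_size)
 *	in "dst_map": every entry in the range must be writable and
 *	overwritable, the range must be contiguous (no holes), and any
 *	submap encountered is checked recursively under its own lock.
 *	Returns KERN_SUCCESS if the overwrite can proceed, and an error
 *	(KERN_INVALID_ADDRESS, KERN_PROTECTION_FAILURE, or KERN_FAILURE
 *	for a non-temporary object found after crossing a submap)
 *	otherwise.
 */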
9306 static kern_return_t
9307 vm_map_overwrite_submap_recurse(
9308 vm_map_t dst_map,
9309 vm_map_offset_t dst_addr,
9310 vm_map_size_t dst_size)
9311 {
9312 vm_map_offset_t dst_end;
9313 vm_map_entry_t tmp_entry;
9314 vm_map_entry_t entry;
9315 kern_return_t result;
9316 boolean_t encountered_sub_map = FALSE;
9317
9318
9319
9320 /*
9321 * Verify that the destination is all writeable
9322 * initially. We have to trunc the destination
9323 * address and round the copy size or we'll end up
9324 * splitting entries in strange ways.
9325 */
9326
9327 dst_end = vm_map_round_page(dst_addr + dst_size,
9328 VM_MAP_PAGE_MASK(dst_map));
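	/*
	 * Illustrative numbers (assuming 4K pages, so the page mask is
	 * 0xFFF): dst_addr = 0x5432 and dst_size = 0x2000 give
	 * dst_end = round(0x7432) = 0x8000, while the clip below starts
	 * at trunc(0x5432) = 0x5000, so only whole pages are examined.
	 */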
9329 vm_map_lock(dst_map);
9330
9331 start_pass_1:
9332 if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9333 vm_map_unlock(dst_map);
9334 return KERN_INVALID_ADDRESS;
9335 }
9336
9337 vm_map_clip_start(dst_map,
9338 tmp_entry,
9339 vm_map_trunc_page(dst_addr,
9340 VM_MAP_PAGE_MASK(dst_map)));
9341 if (tmp_entry->is_sub_map) {
9342 /* clipping did unnest if needed */
9343 assert(!tmp_entry->use_pmap);
9344 }
9345
9346 for (entry = tmp_entry;;) {
9347 vm_map_entry_t next;
9348
9349 next = entry->vme_next;
9350 while (entry->is_sub_map) {
9351 vm_map_offset_t sub_start;
9352 vm_map_offset_t sub_end;
9353 vm_map_offset_t local_end;
9354 vm_map_t sub_map;
9355
9356 if (entry->in_transition) {
9357 /*
9358 * Say that we are waiting, and wait for entry.
9359 */
9360 entry->needs_wakeup = TRUE;
9361 vm_map_entry_wait(dst_map, THREAD_UNINT);
9362
9363 goto start_pass_1;
9364 }
9365
9366 encountered_sub_map = TRUE;
9367 sub_start = VME_OFFSET(entry);
9368
9369 if (entry->vme_end < dst_end) {
9370 sub_end = entry->vme_end;
9371 } else {
9372 sub_end = dst_end;
9373 }
9374 sub_end -= entry->vme_start;
9375 sub_end += VME_OFFSET(entry);
9376 local_end = entry->vme_end;
9377 sub_map = VME_SUBMAP(entry);
9378 vm_map_reference(sub_map);
9379 vm_map_unlock(dst_map);
9380
9381 result = vm_map_overwrite_submap_recurse(
9382 sub_map,
9383 sub_start,
9384 sub_end - sub_start);
9385
9386 vm_map_deallocate(sub_map);
9387 sub_map = VM_MAP_NULL;
9388
9389 if (result != KERN_SUCCESS) {
9390 return result;
9391 }
9392 if (dst_end <= entry->vme_end) {
9393 return KERN_SUCCESS;
9394 }
9395 vm_map_lock(dst_map);
9396 if (!vm_map_lookup_entry(dst_map, local_end,
9397 &tmp_entry)) {
9398 vm_map_unlock(dst_map);
9399 return KERN_INVALID_ADDRESS;
9400 }
9401 entry = tmp_entry;
9402 next = entry->vme_next;
9403 }
9404 assert(!entry->is_sub_map);
9405
9406 if (!(entry->protection & VM_PROT_WRITE)) {
9407 vm_map_unlock(dst_map);
9408 return KERN_PROTECTION_FAILURE;
9409 }
9410
9411 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9412 vm_map_unlock(dst_map);
9413 return KERN_PROTECTION_FAILURE;
9414 }
9415
9416 /*
9417 * If the entry is in transition, we must wait
9418 * for it to exit that state. Anything could happen
9419 * when we unlock the map, so start over.
9420 */
9421 if (entry->in_transition) {
9422 /*
9423 * Say that we are waiting, and wait for entry.
9424 */
9425 entry->needs_wakeup = TRUE;
9426 vm_map_entry_wait(dst_map, THREAD_UNINT);
9427
9428 goto start_pass_1;
9429 }
9430
9431 /*
9432 * our range is contained completely within this map entry
9433 */
9434 if (dst_end <= entry->vme_end) {
9435 vm_map_unlock(dst_map);
9436 return KERN_SUCCESS;
9437 }
9438 /*
9439 * check that range specified is contiguous region
9440 */
9441 if ((next == vm_map_to_entry(dst_map)) ||
9442 (next->vme_start != entry->vme_end)) {
9443 vm_map_unlock(dst_map);
9444 return KERN_INVALID_ADDRESS;
9445 }
9446
9447 /*
9448 * Check for permanent objects in the destination.
9449 */
9450 assert(!entry->is_sub_map);
9451 if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9452 ((!VME_OBJECT(entry)->internal) ||
9453 (VME_OBJECT(entry)->true_share))) {
9454 if (encountered_sub_map) {
9455 vm_map_unlock(dst_map);
9456 return KERN_FAILURE;
9457 }
9458 }
9459
9460
9461 entry = next;
9462 }/* for */
9463 vm_map_unlock(dst_map);
9464 return KERN_SUCCESS;
9465 }
9466
9467 /*
9468 * Routine: vm_map_copy_overwrite
9469 *
9470 * Description:
9471 * Copy the memory described by the map copy
9472 * object (copy; returned by vm_map_copyin) onto
9473 * the specified destination region (dst_map, dst_addr).
9474 * The destination must be writeable.
9475 *
9476 * Unlike vm_map_copyout, this routine actually
9477 * writes over previously-mapped memory. If the
9478 * previous mapping was to a permanent (user-supplied)
9479 * memory object, it is preserved.
9480 *
9481 * The attributes (protection and inheritance) of the
9482 * destination region are preserved.
9483 *
9484 * If successful, consumes the copy object.
9485 * Otherwise, the caller is responsible for it.
9486 *
9487 * Implementation notes:
9488 * To overwrite aligned temporary virtual memory, it is
9489 * sufficient to remove the previous mapping and insert
9490 * the new copy. This replacement is done either on
9491 * the whole region (if no permanent virtual memory
9492 * objects are embedded in the destination region) or
9493 * in individual map entries.
9494 *
9495  *	To overwrite permanent virtual memory, it is necessary
9496 * to copy each page, as the external memory management
9497 * interface currently does not provide any optimizations.
9498 *
9499 * Unaligned memory also has to be copied. It is possible
9500 * to use 'vm_trickery' to copy the aligned data. This is
9501 * not done but not hard to implement.
9502 *
9503 * Once a page of permanent memory has been overwritten,
9504 * it is impossible to interrupt this function; otherwise,
9505 * the call would be neither atomic nor location-independent.
9506 * The kernel-state portion of a user thread must be
9507 * interruptible.
9508 *
9509 * It may be expensive to forward all requests that might
9510 * overwrite permanent memory (vm_write, vm_copy) to
9511 * uninterruptible kernel threads. This routine may be
9512 * called by interruptible threads; however, success is
9513 * not guaranteed -- if the request cannot be performed
9514 * atomically and interruptibly, an error indication is
9515 * returned.
9516 *
9517 * Callers of this function must call vm_map_copy_require on
9518 * previously created vm_map_copy_t or pass a newly created
9519 * one to ensure that it hasn't been forged.
9520 */
9521 static kern_return_t
9522 vm_map_copy_overwrite_nested(
9523 vm_map_t dst_map,
9524 vm_map_address_t dst_addr,
9525 vm_map_copy_t copy,
9526 boolean_t interruptible,
9527 pmap_t pmap,
9528 boolean_t discard_on_success)
9529 {
9530 vm_map_offset_t dst_end;
9531 vm_map_entry_t tmp_entry;
9532 vm_map_entry_t entry;
9533 kern_return_t kr;
9534 boolean_t aligned = TRUE;
9535 boolean_t contains_permanent_objects = FALSE;
9536 boolean_t encountered_sub_map = FALSE;
9537 vm_map_offset_t base_addr;
9538 vm_map_size_t copy_size;
9539 vm_map_size_t total_size;
9540 uint16_t copy_page_shift;
9541
9542 /*
9543 * Check for special kernel buffer allocated
9544 * by new_ipc_kmsg_copyin.
9545 */
9546
9547 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9548 kr = vm_map_copyout_kernel_buffer(
9549 dst_map, &dst_addr,
9550 copy, copy->size, TRUE,
9551 discard_on_success);
9552 return kr;
9553 }
9554
9555 /*
9556 * Only works for entry lists at the moment. Will
9557 * support page lists later.
9558 */
9559
9560 assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9561
9562 if (copy->size == 0) {
9563 if (discard_on_success) {
9564 vm_map_copy_discard(copy);
9565 }
9566 return KERN_SUCCESS;
9567 }
9568
9569 copy_page_shift = copy->cpy_hdr.page_shift;
9570
9571 /*
9572 * Verify that the destination is all writeable
9573 * initially. We have to trunc the destination
9574 * address and round the copy size or we'll end up
9575 * splitting entries in strange ways.
9576 */
9577
9578 if (!VM_MAP_PAGE_ALIGNED(copy->size,
9579 VM_MAP_PAGE_MASK(dst_map)) ||
9580 !VM_MAP_PAGE_ALIGNED(copy->offset,
9581 VM_MAP_PAGE_MASK(dst_map)) ||
9582 !VM_MAP_PAGE_ALIGNED(dst_addr,
9583 VM_MAP_PAGE_MASK(dst_map)) ||
9584 copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9585 aligned = FALSE;
9586 dst_end = vm_map_round_page(dst_addr + copy->size,
9587 VM_MAP_PAGE_MASK(dst_map));
9588 } else {
9589 dst_end = dst_addr + copy->size;
9590 }
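	/*
	 * "aligned" means the copy's size and offset, the destination
	 * address, and the copy's page shift all agree with the
	 * destination map's page size; only then can the aligned path
	 * below replace whole map entries instead of copying pages.
	 */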
9591
9592 vm_map_lock(dst_map);
9593
9594 /* LP64todo - remove this check when vm_map_commpage64()
9595 * no longer has to stuff in a map_entry for the commpage
9596 * above the map's max_offset.
9597 */
9598 if (dst_addr >= dst_map->max_offset) {
9599 vm_map_unlock(dst_map);
9600 return KERN_INVALID_ADDRESS;
9601 }
9602
9603 start_pass_1:
9604 if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9605 vm_map_unlock(dst_map);
9606 return KERN_INVALID_ADDRESS;
9607 }
9608 vm_map_clip_start(dst_map,
9609 tmp_entry,
9610 vm_map_trunc_page(dst_addr,
9611 VM_MAP_PAGE_MASK(dst_map)));
9612 for (entry = tmp_entry;;) {
9613 vm_map_entry_t next = entry->vme_next;
9614
9615 while (entry->is_sub_map) {
9616 vm_map_offset_t sub_start;
9617 vm_map_offset_t sub_end;
9618 vm_map_offset_t local_end;
9619
9620 if (entry->in_transition) {
9621 /*
9622 * Say that we are waiting, and wait for entry.
9623 */
9624 entry->needs_wakeup = TRUE;
9625 vm_map_entry_wait(dst_map, THREAD_UNINT);
9626
9627 goto start_pass_1;
9628 }
9629
9630 local_end = entry->vme_end;
9631 if (!(entry->needs_copy)) {
9632 vm_map_t sub_map = VM_MAP_NULL;
9633
9634 /* if needs_copy we are a COW submap */
9635 /* in such a case we just replace so */
9636 /* there is no need for the follow- */
9637 /* ing check. */
9638 encountered_sub_map = TRUE;
9639 sub_start = VME_OFFSET(entry);
9640
9641 if (entry->vme_end < dst_end) {
9642 sub_end = entry->vme_end;
9643 } else {
9644 sub_end = dst_end;
9645 }
9646 sub_end -= entry->vme_start;
9647 sub_end += VME_OFFSET(entry);
9648 sub_map = VME_SUBMAP(entry);
9649 vm_map_reference(sub_map);
9650 vm_map_unlock(dst_map);
9651
9652 kr = vm_map_overwrite_submap_recurse(
9653 sub_map,
9654 sub_start,
9655 sub_end - sub_start);
9656
9657 vm_map_deallocate(sub_map);
9658 sub_map = VM_MAP_NULL;
9659 if (kr != KERN_SUCCESS) {
9660 return kr;
9661 }
9662 vm_map_lock(dst_map);
9663 }
9664
9665 if (dst_end <= entry->vme_end) {
9666 goto start_overwrite;
9667 }
9668 if (!vm_map_lookup_entry(dst_map, local_end,
9669 &entry)) {
9670 vm_map_unlock(dst_map);
9671 return KERN_INVALID_ADDRESS;
9672 }
9673 next = entry->vme_next;
9674 }
9675 assert(!entry->is_sub_map);
9676
9677 if (!(entry->protection & VM_PROT_WRITE)) {
9678 vm_map_unlock(dst_map);
9679 return KERN_PROTECTION_FAILURE;
9680 }
9681
9682 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9683 vm_map_unlock(dst_map);
9684 return KERN_PROTECTION_FAILURE;
9685 }
9686
9687 /*
9688 * If the entry is in transition, we must wait
9689 * for it to exit that state. Anything could happen
9690 * when we unlock the map, so start over.
9691 */
9692 if (entry->in_transition) {
9693 /*
9694 * Say that we are waiting, and wait for entry.
9695 */
9696 entry->needs_wakeup = TRUE;
9697 vm_map_entry_wait(dst_map, THREAD_UNINT);
9698
9699 goto start_pass_1;
9700 }
9701
9702 /*
9703 * our range is contained completely within this map entry
9704 */
9705 if (dst_end <= entry->vme_end) {
9706 break;
9707 }
9708 /*
9709 * check that range specified is contiguous region
9710 */
9711 if ((next == vm_map_to_entry(dst_map)) ||
9712 (next->vme_start != entry->vme_end)) {
9713 vm_map_unlock(dst_map);
9714 return KERN_INVALID_ADDRESS;
9715 }
9716
9717
9718 /*
9719 * Check for permanent objects in the destination.
9720 */
9721 assert(!entry->is_sub_map);
9722 if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9723 ((!VME_OBJECT(entry)->internal) ||
9724 (VME_OBJECT(entry)->true_share))) {
9725 contains_permanent_objects = TRUE;
9726 }
9727
9728 entry = next;
9729 }/* for */
9730
9731 start_overwrite:
9732 /*
9733 * If there are permanent objects in the destination, then
9734 * the copy cannot be interrupted.
9735 */
9736
9737 if (interruptible && contains_permanent_objects) {
9738 vm_map_unlock(dst_map);
9739 return KERN_FAILURE; /* XXX */
9740 }
9741
9742 /*
9743 *
9744 * Make a second pass, overwriting the data
9745 * At the beginning of each loop iteration,
9746 * the next entry to be overwritten is "tmp_entry"
9747 * (initially, the value returned from the lookup above),
9748 * and the starting address expected in that entry
9749 * is "start".
9750 */
9751
9752 total_size = copy->size;
9753 if (encountered_sub_map) {
9754 copy_size = 0;
9755 /* re-calculate tmp_entry since we've had the map */
9756 /* unlocked */
9757 if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9758 vm_map_unlock(dst_map);
9759 return KERN_INVALID_ADDRESS;
9760 }
9761 } else {
9762 copy_size = copy->size;
9763 }
9764
9765 base_addr = dst_addr;
9766 while (TRUE) {
9767 /* deconstruct the copy object and do in parts */
9768 		/* only in sub_map, interruptible case */
9769 vm_map_entry_t copy_entry;
9770 vm_map_entry_t previous_prev = VM_MAP_ENTRY_NULL;
9771 vm_map_entry_t next_copy = VM_MAP_ENTRY_NULL;
9772 int nentries;
9773 int remaining_entries = 0;
9774 vm_map_offset_t new_offset = 0;
9775
9776 for (entry = tmp_entry; copy_size == 0;) {
9777 vm_map_entry_t next;
9778
9779 next = entry->vme_next;
9780
9781 /* tmp_entry and base address are moved along */
9782 /* each time we encounter a sub-map. Otherwise */
9783 			/* entry can outpace tmp_entry, and the copy_size */
9784 			/* may reflect the distance between them. */
9785 			/* If the current entry is found to be in transition, */
9786 			/* we will start over at the beginning or at the last */
9787 			/* encounter of a submap, as dictated by base_addr, */
9788 			/* and we will zero copy_size accordingly. */
9789 if (entry->in_transition) {
9790 /*
9791 * Say that we are waiting, and wait for entry.
9792 */
9793 entry->needs_wakeup = TRUE;
9794 vm_map_entry_wait(dst_map, THREAD_UNINT);
9795
9796 if (!vm_map_lookup_entry(dst_map, base_addr,
9797 &tmp_entry)) {
9798 vm_map_unlock(dst_map);
9799 return KERN_INVALID_ADDRESS;
9800 }
9801 copy_size = 0;
9802 entry = tmp_entry;
9803 continue;
9804 }
9805 if (entry->is_sub_map) {
9806 vm_map_offset_t sub_start;
9807 vm_map_offset_t sub_end;
9808 vm_map_offset_t local_end;
9809 vm_map_t sub_map = VM_MAP_NULL;
9810 bool use_pmap;
9811
9812 if (entry->needs_copy) {
9813 /* if this is a COW submap */
9814 					/* just back the range with an */
9815 /* anonymous entry */
9816 assert(!entry->vme_permanent);
9817 if (entry->vme_end < dst_end) {
9818 sub_end = entry->vme_end;
9819 } else {
9820 sub_end = dst_end;
9821 }
9822 if (entry->vme_start < base_addr) {
9823 sub_start = base_addr;
9824 } else {
9825 sub_start = entry->vme_start;
9826 }
9827 vm_map_clip_end(
9828 dst_map, entry, sub_end);
9829 vm_map_clip_start(
9830 dst_map, entry, sub_start);
9831 assert(!entry->use_pmap);
9832 assert(!entry->iokit_acct);
9833 entry->use_pmap = TRUE;
9834 vm_map_deallocate(VME_SUBMAP(entry));
9835 assert(!entry->vme_permanent);
9836 VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9837 VME_OFFSET_SET(entry, 0);
9838 entry->is_shared = FALSE;
9839 entry->needs_copy = FALSE;
9840 entry->protection = VM_PROT_DEFAULT;
9841 entry->max_protection = VM_PROT_ALL;
9842 entry->wired_count = 0;
9843 entry->user_wired_count = 0;
9844 if (entry->inheritance
9845 == VM_INHERIT_SHARE) {
9846 entry->inheritance = VM_INHERIT_COPY;
9847 }
9848 continue;
9849 }
9850 /* first take care of any non-sub_map */
9851 /* entries to send */
9852 if (base_addr < entry->vme_start) {
9853 /* stuff to send */
9854 copy_size =
9855 entry->vme_start - base_addr;
9856 break;
9857 }
9858 sub_start = VME_OFFSET(entry);
9859
9860 if (entry->vme_end < dst_end) {
9861 sub_end = entry->vme_end;
9862 } else {
9863 sub_end = dst_end;
9864 }
9865 sub_end -= entry->vme_start;
9866 sub_end += VME_OFFSET(entry);
9867 local_end = entry->vme_end;
9868 use_pmap = entry->use_pmap;
9869 sub_map = VME_SUBMAP(entry);
9870 vm_map_reference(sub_map);
9871 vm_map_unlock(dst_map);
9872 copy_size = sub_end - sub_start;
9873
9874 /* adjust the copy object */
9875 if (total_size > copy_size) {
9876 vm_map_size_t local_size = 0;
9877 vm_map_size_t entry_size;
9878
9879 nentries = 1;
9880 new_offset = copy->offset;
9881 copy_entry = vm_map_copy_first_entry(copy);
9882 while (copy_entry !=
9883 vm_map_copy_to_entry(copy)) {
9884 entry_size = copy_entry->vme_end -
9885 copy_entry->vme_start;
9886 if ((local_size < copy_size) &&
9887 ((local_size + entry_size)
9888 >= copy_size)) {
9889 vm_map_copy_clip_end(copy,
9890 copy_entry,
9891 copy_entry->vme_start +
9892 (copy_size - local_size));
9893 entry_size = copy_entry->vme_end -
9894 copy_entry->vme_start;
9895 local_size += entry_size;
9896 new_offset += entry_size;
9897 }
9898 if (local_size >= copy_size) {
9899 next_copy = copy_entry->vme_next;
9900 copy_entry->vme_next =
9901 vm_map_copy_to_entry(copy);
9902 previous_prev =
9903 copy->cpy_hdr.links.prev;
9904 copy->cpy_hdr.links.prev = copy_entry;
9905 copy->size = copy_size;
9906 remaining_entries =
9907 copy->cpy_hdr.nentries;
9908 remaining_entries -= nentries;
9909 copy->cpy_hdr.nentries = nentries;
9910 break;
9911 } else {
9912 local_size += entry_size;
9913 new_offset += entry_size;
9914 nentries++;
9915 }
9916 copy_entry = copy_entry->vme_next;
9917 }
9918 }
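			/*
			 * If the copy was larger than this submap span, its
			 * entry list has just been truncated to cover only
			 * "copy_size" bytes; the remainder of the list and
			 * its entry count are stashed in "next_copy",
			 * "previous_prev" and "remaining_entries" so they can
			 * be re-attached afterwards (or restored on error).
			 */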
9919
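			/*
			 * Recurse into the submap.  The pmap argument picks
			 * where physical mappings get updated: the submap's
			 * own pmap for a nested (use_pmap) entry with no
			 * override, the pmap handed down by our caller if one
			 * was supplied, or the destination map's pmap.
			 */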
9920 if ((use_pmap) && (pmap == NULL)) {
9921 kr = vm_map_copy_overwrite_nested(
9922 sub_map,
9923 sub_start,
9924 copy,
9925 interruptible,
9926 sub_map->pmap,
9927 TRUE);
9928 } else if (pmap != NULL) {
9929 kr = vm_map_copy_overwrite_nested(
9930 sub_map,
9931 sub_start,
9932 copy,
9933 interruptible, pmap,
9934 TRUE);
9935 } else {
9936 kr = vm_map_copy_overwrite_nested(
9937 sub_map,
9938 sub_start,
9939 copy,
9940 interruptible,
9941 dst_map->pmap,
9942 TRUE);
9943 }
9944
9945 vm_map_deallocate(sub_map);
9946 sub_map = VM_MAP_NULL;
9947
9948 if (kr != KERN_SUCCESS) {
9949 if (next_copy != NULL) {
9950 copy->cpy_hdr.nentries +=
9951 remaining_entries;
9952 copy->cpy_hdr.links.prev->vme_next =
9953 next_copy;
9954 copy->cpy_hdr.links.prev
9955 = previous_prev;
9956 copy->size = total_size;
9957 }
9958 return kr;
9959 }
9960 if (dst_end <= local_end) {
9961 return KERN_SUCCESS;
9962 }
9963 /* otherwise copy no longer exists, it was */
9964 /* destroyed after successful copy_overwrite */
9965 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9966 copy->offset = new_offset;
9967 copy->cpy_hdr.page_shift = copy_page_shift;
9968
9969 total_size -= copy_size;
9970 copy_size = 0;
9971 /* put back remainder of copy in container */
9972 if (next_copy != NULL) {
9973 copy->cpy_hdr.nentries = remaining_entries;
9974 copy->cpy_hdr.links.next = next_copy;
9975 copy->cpy_hdr.links.prev = previous_prev;
9976 copy->size = total_size;
9977 next_copy->vme_prev =
9978 vm_map_copy_to_entry(copy);
9979 next_copy = NULL;
9980 }
9981 base_addr = local_end;
9982 vm_map_lock(dst_map);
9983 if (!vm_map_lookup_entry(dst_map,
9984 local_end, &tmp_entry)) {
9985 vm_map_unlock(dst_map);
9986 return KERN_INVALID_ADDRESS;
9987 }
9988 entry = tmp_entry;
9989 continue;
9990 }
9991 assert(!entry->is_sub_map);
9992
9993 if (dst_end <= entry->vme_end) {
9994 copy_size = dst_end - base_addr;
9995 break;
9996 }
9997
9998 if ((next == vm_map_to_entry(dst_map)) ||
9999 (next->vme_start != entry->vme_end)) {
10000 vm_map_unlock(dst_map);
10001 return KERN_INVALID_ADDRESS;
10002 }
10003
10004 entry = next;
10005 }/* for */
10006
10007 next_copy = NULL;
10008 nentries = 1;
10009
10010 /* adjust the copy object */
10011 if (total_size > copy_size) {
10012 vm_map_size_t local_size = 0;
10013 vm_map_size_t entry_size;
10014
10015 new_offset = copy->offset;
10016 copy_entry = vm_map_copy_first_entry(copy);
10017 while (copy_entry != vm_map_copy_to_entry(copy)) {
10018 entry_size = copy_entry->vme_end -
10019 copy_entry->vme_start;
10020 if ((local_size < copy_size) &&
10021 ((local_size + entry_size)
10022 >= copy_size)) {
10023 vm_map_copy_clip_end(copy, copy_entry,
10024 copy_entry->vme_start +
10025 (copy_size - local_size));
10026 entry_size = copy_entry->vme_end -
10027 copy_entry->vme_start;
10028 local_size += entry_size;
10029 new_offset += entry_size;
10030 }
10031 if (local_size >= copy_size) {
10032 next_copy = copy_entry->vme_next;
10033 copy_entry->vme_next =
10034 vm_map_copy_to_entry(copy);
10035 previous_prev =
10036 copy->cpy_hdr.links.prev;
10037 copy->cpy_hdr.links.prev = copy_entry;
10038 copy->size = copy_size;
10039 remaining_entries =
10040 copy->cpy_hdr.nentries;
10041 remaining_entries -= nentries;
10042 copy->cpy_hdr.nentries = nentries;
10043 break;
10044 } else {
10045 local_size += entry_size;
10046 new_offset += entry_size;
10047 nentries++;
10048 }
10049 copy_entry = copy_entry->vme_next;
10050 }
10051 }
10052
10053 if (aligned) {
10054 pmap_t local_pmap;
10055
10056 if (pmap) {
10057 local_pmap = pmap;
10058 } else {
10059 local_pmap = dst_map->pmap;
10060 }
10061
10062 if ((kr = vm_map_copy_overwrite_aligned(
10063 dst_map, tmp_entry, copy,
10064 base_addr, local_pmap)) != KERN_SUCCESS) {
10065 if (next_copy != NULL) {
10066 copy->cpy_hdr.nentries +=
10067 remaining_entries;
10068 copy->cpy_hdr.links.prev->vme_next =
10069 next_copy;
10070 copy->cpy_hdr.links.prev =
10071 previous_prev;
10072 copy->size += copy_size;
10073 }
10074 return kr;
10075 }
10076 vm_map_unlock(dst_map);
10077 } else {
10078 /*
10079 * Performance gain:
10080 *
10081 * if the copy and dst address are misaligned but the same
10082 * offset within the page we can copy_not_aligned the
10083 * misaligned parts and copy aligned the rest. If they are
10084 * aligned but len is unaligned we simply need to copy
10085 * the end bit unaligned. We'll need to split the misaligned
10086 * bits of the region in this case !
10087 */
10088 /* ALWAYS UNLOCKS THE dst_map MAP */
10089 kr = vm_map_copy_overwrite_unaligned(
10090 dst_map,
10091 tmp_entry,
10092 copy,
10093 base_addr,
10094 discard_on_success);
10095 if (kr != KERN_SUCCESS) {
10096 if (next_copy != NULL) {
10097 copy->cpy_hdr.nentries +=
10098 remaining_entries;
10099 copy->cpy_hdr.links.prev->vme_next =
10100 next_copy;
10101 copy->cpy_hdr.links.prev =
10102 previous_prev;
10103 copy->size += copy_size;
10104 }
10105 return kr;
10106 }
10107 }
10108 total_size -= copy_size;
10109 if (total_size == 0) {
10110 break;
10111 }
10112 base_addr += copy_size;
10113 copy_size = 0;
10114 copy->offset = new_offset;
10115 if (next_copy != NULL) {
10116 copy->cpy_hdr.nentries = remaining_entries;
10117 copy->cpy_hdr.links.next = next_copy;
10118 copy->cpy_hdr.links.prev = previous_prev;
10119 next_copy->vme_prev = vm_map_copy_to_entry(copy);
10120 copy->size = total_size;
10121 }
10122 vm_map_lock(dst_map);
10123 while (TRUE) {
10124 if (!vm_map_lookup_entry(dst_map,
10125 base_addr, &tmp_entry)) {
10126 vm_map_unlock(dst_map);
10127 return KERN_INVALID_ADDRESS;
10128 }
10129 if (tmp_entry->in_transition) {
10130 entry->needs_wakeup = TRUE;
10131 vm_map_entry_wait(dst_map, THREAD_UNINT);
10132 } else {
10133 break;
10134 }
10135 }
10136 vm_map_clip_start(dst_map,
10137 tmp_entry,
10138 vm_map_trunc_page(base_addr,
10139 VM_MAP_PAGE_MASK(dst_map)));
10140
10141 entry = tmp_entry;
10142 } /* while */
10143
10144 /*
10145 * Throw away the vm_map_copy object
10146 */
10147 if (discard_on_success) {
10148 vm_map_copy_discard(copy);
10149 }
10150
10151 return KERN_SUCCESS;
10152 }/* vm_map_copy_overwrite */
10153
10154 static __attribute__((always_inline, warn_unused_result))
10155 kern_return_t
10156 vm_map_copy_addr_size_sanitize(
10157 vm_map_t map,
10158 vm_map_offset_ut addr_u,
10159 vm_map_size_ut size_u,
10160 vm_sanitize_caller_t vm_sanitize_caller,
10161 vm_map_offset_t *addr,
10162 vm_map_offset_t *end,
10163 vm_map_size_t *size)
10164 {
10165 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
10166 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES |
10167 VM_SANITIZE_FLAGS_CHECK_ADDR_RANGE;
10168
10169 return vm_sanitize_addr_size(addr_u, size_u,
10170 vm_sanitize_caller, map,
10171 flags,
10172 addr, end, size);
10173 }
10174
10175 kern_return_t
10176 vm_map_copy_overwrite(
10177 vm_map_t dst_map,
10178 vm_map_offset_ut dst_addr_u,
10179 vm_map_copy_t copy,
10180 vm_map_size_ut copy_size_u,
10181 boolean_t interruptible)
10182 {
10183 vm_map_offset_t dst_addr, dst_end;
10184 vm_map_size_t copy_size;
10185 vm_map_size_t head_size, tail_size;
10186 vm_map_copy_t head_copy, tail_copy;
10187 vm_map_offset_t head_addr, tail_addr;
10188 vm_map_entry_t entry;
10189 kern_return_t kr;
10190 vm_map_offset_t effective_page_mask, effective_page_size;
10191 uint16_t copy_page_shift;
10192
10193 head_size = 0;
10194 tail_size = 0;
10195 head_copy = NULL;
10196 tail_copy = NULL;
10197 head_addr = 0;
10198 tail_addr = 0;
10199
10200 /*
10201 * Check for null copy object.
10202 */
10203 if (copy == VM_MAP_COPY_NULL) {
10204 return KERN_SUCCESS;
10205 }
10206
10207 /*
10208 * Sanitize any input parameters that are addr/size/prot/inherit
10209 */
10210 kr = vm_map_copy_addr_size_sanitize(
10211 dst_map,
10212 dst_addr_u,
10213 copy_size_u,
10214 VM_SANITIZE_CALLER_VM_MAP_COPY_OVERWRITE,
10215 &dst_addr,
10216 &dst_end,
10217 		&copy_size);
10218 if (__improbable(kr != KERN_SUCCESS)) {
10219 return vm_sanitize_get_kr(kr);
10220 }
10221
10222 /*
10223 * Assert that the vm_map_copy is coming from the right
10224 * zone and hasn't been forged
10225 */
10226 vm_map_copy_require(copy);
10227
10228 if (interruptible ||
10229 copy->type != VM_MAP_COPY_ENTRY_LIST) {
10230 /*
10231 * We can't split the "copy" map if we're interruptible
10232 * or if we don't have a "copy" map...
10233 */
10234 blunt_copy:
10235 kr = vm_map_copy_overwrite_nested(dst_map,
10236 dst_addr,
10237 copy,
10238 interruptible,
10239 (pmap_t) NULL,
10240 TRUE);
10241 if (kr) {
10242 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), kr /* arg */);
10243 }
10244 return kr;
10245 }
10246
10247 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
10248 if (copy_page_shift < PAGE_SHIFT ||
10249 VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10250 goto blunt_copy;
10251 }
10252
10253 if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10254 effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
10255 } else {
10256 effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
10257 effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
10258 effective_page_mask);
10259 }
10260 effective_page_size = effective_page_mask + 1;
10261
10262 if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
10263 /*
10264 * Too small to bother with optimizing...
10265 */
10266 goto blunt_copy;
10267 }
10268
10269 if ((dst_addr & effective_page_mask) !=
10270 (copy->offset & effective_page_mask)) {
10271 /*
10272 * Incompatible mis-alignment of source and destination...
10273 */
10274 goto blunt_copy;
10275 }
10276
10277 /*
10278 * Proper alignment or identical mis-alignment at the beginning.
10279 * Let's try and do a small unaligned copy first (if needed)
10280 * and then an aligned copy for the rest.
10281 */
10282 if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
10283 head_addr = dst_addr;
10284 head_size = (effective_page_size -
10285 (copy->offset & effective_page_mask));
10286 head_size = MIN(head_size, copy_size);
10287 }
10288 if (!vm_map_page_aligned(copy->offset + copy_size,
10289 effective_page_mask)) {
10290 /*
10291 * Mis-alignment at the end.
10292 * Do an aligned copy up to the last page and
10293 * then an unaligned copy for the remaining bytes.
10294 */
10295 tail_size = ((copy->offset + copy_size) &
10296 effective_page_mask);
10297 tail_size = MIN(tail_size, copy_size);
10298 tail_addr = dst_addr + copy_size - tail_size;
10299 assert(tail_addr >= head_addr + head_size);
10300 }
10301 assert(head_size + tail_size <= copy_size);
10302
10303 if (head_size + tail_size == copy_size) {
10304 /*
10305 * It's all unaligned, no optimization possible...
10306 */
10307 goto blunt_copy;
10308 }
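	/*
	 * Illustrative split (hypothetical values, 16K effective pages):
	 * with copy->offset misaligned by 0x800 and copy_size = 0x10800,
	 * head_size = 0x4000 - 0x800 = 0x3800 reaches the first page
	 * boundary, tail_size = (0x800 + 0x10800) & 0x3FFF = 0x1000
	 * covers the ragged end, and the remaining 0xC000 in the middle
	 * is handled by the aligned (entry-swapping) copy.
	 */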
10309
10310 /*
10311 * Can't optimize if there are any submaps in the
10312 * destination due to the way we free the "copy" map
10313 * progressively in vm_map_copy_overwrite_nested()
10314 * in that case.
10315 */
10316 vm_map_lock_read(dst_map);
10317 if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
10318 vm_map_unlock_read(dst_map);
10319 goto blunt_copy;
10320 }
10321 for (;
10322 (entry != vm_map_to_entry(dst_map) &&
10323 entry->vme_start < dst_addr + copy_size);
10324 entry = entry->vme_next) {
10325 if (entry->is_sub_map) {
10326 vm_map_unlock_read(dst_map);
10327 goto blunt_copy;
10328 }
10329 }
10330 vm_map_unlock_read(dst_map);
10331
10332 if (head_size) {
10333 /*
10334 * Unaligned copy of the first "head_size" bytes, to reach
10335 * a page boundary.
10336 */
10337
10338 /*
10339 * Extract "head_copy" out of "copy".
10340 */
10341 head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10342 head_copy->cpy_hdr.entries_pageable =
10343 copy->cpy_hdr.entries_pageable;
10344 head_copy->cpy_hdr.page_shift = copy_page_shift;
10345
10346 entry = vm_map_copy_first_entry(copy);
10347 if (entry->vme_end < copy->offset + head_size) {
10348 head_size = entry->vme_end - copy->offset;
10349 }
10350
10351 head_copy->offset = copy->offset;
10352 head_copy->size = head_size;
10353 copy->offset += head_size;
10354 copy->size -= head_size;
10355 copy_size -= head_size;
10356 assert(copy_size > 0);
10357
10358 vm_map_copy_clip_end(copy, entry, copy->offset);
10359 vm_map_copy_entry_unlink(copy, entry);
10360 vm_map_copy_entry_link(head_copy,
10361 vm_map_copy_to_entry(head_copy),
10362 entry);
10363
10364 /*
10365 * Do the unaligned copy.
10366 */
10367 kr = vm_map_copy_overwrite_nested(dst_map,
10368 head_addr,
10369 head_copy,
10370 interruptible,
10371 (pmap_t) NULL,
10372 FALSE);
10373 if (kr != KERN_SUCCESS) {
10374 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_HEAD_NESTED_ERROR), kr /* arg */);
10375 goto done;
10376 }
10377 }
10378
10379 if (tail_size) {
10380 /*
10381 * Extract "tail_copy" out of "copy".
10382 */
10383 tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10384 tail_copy->cpy_hdr.entries_pageable =
10385 copy->cpy_hdr.entries_pageable;
10386 tail_copy->cpy_hdr.page_shift = copy_page_shift;
10387
10388 tail_copy->offset = copy->offset + copy_size - tail_size;
10389 tail_copy->size = tail_size;
10390
10391 copy->size -= tail_size;
10392 copy_size -= tail_size;
10393 assert(copy_size > 0);
10394
10395 entry = vm_map_copy_last_entry(copy);
10396 vm_map_copy_clip_start(copy, entry, tail_copy->offset);
10397 entry = vm_map_copy_last_entry(copy);
10398 vm_map_copy_entry_unlink(copy, entry);
10399 vm_map_copy_entry_link(tail_copy,
10400 vm_map_copy_last_entry(tail_copy),
10401 entry);
10402 }
10403
10404 /*
10405 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
10406 * we want to avoid TOCTOU issues w.r.t copy->size but
10407 * we don't need to change vm_map_copy_overwrite_nested()
10408 * and all other vm_map_copy_overwrite variants.
10409 *
10410 * So we assign the original copy_size that was passed into
10411 * this routine back to copy.
10412 *
10413 * This use of local 'copy_size' passed into this routine is
10414 * to try and protect against TOCTOU attacks where the kernel
10415 * has been exploited. We don't expect this to be an issue
10416 * during normal system operation.
10417 */
10418 assertf(copy->size == copy_size,
10419 "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
10420 copy->size = copy_size;
10421
10422 /*
10423 * Copy most (or possibly all) of the data.
10424 */
10425 kr = vm_map_copy_overwrite_nested(dst_map,
10426 dst_addr + head_size,
10427 copy,
10428 interruptible,
10429 (pmap_t) NULL,
10430 FALSE);
10431 if (kr != KERN_SUCCESS) {
10432 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_NESTED_ERROR), kr /* arg */);
10433 goto done;
10434 }
10435
10436 if (tail_size) {
10437 kr = vm_map_copy_overwrite_nested(dst_map,
10438 tail_addr,
10439 tail_copy,
10440 interruptible,
10441 (pmap_t) NULL,
10442 FALSE);
10443 if (kr) {
10444 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_TAIL_NESTED_ERROR), kr /* arg */);
10445 }
10446 }
10447
10448 done:
10449 assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
10450 if (kr == KERN_SUCCESS) {
10451 /*
10452 * Discard all the copy maps.
10453 */
10454 if (head_copy) {
10455 vm_map_copy_discard(head_copy);
10456 head_copy = NULL;
10457 }
10458 vm_map_copy_discard(copy);
10459 if (tail_copy) {
10460 vm_map_copy_discard(tail_copy);
10461 tail_copy = NULL;
10462 }
10463 } else {
10464 /*
10465 * Re-assemble the original copy map.
10466 */
10467 if (head_copy) {
10468 entry = vm_map_copy_first_entry(head_copy);
10469 vm_map_copy_entry_unlink(head_copy, entry);
10470 vm_map_copy_entry_link(copy,
10471 vm_map_copy_to_entry(copy),
10472 entry);
10473 copy->offset -= head_size;
10474 copy->size += head_size;
10475 vm_map_copy_discard(head_copy);
10476 head_copy = NULL;
10477 }
10478 if (tail_copy) {
10479 entry = vm_map_copy_last_entry(tail_copy);
10480 vm_map_copy_entry_unlink(tail_copy, entry);
10481 vm_map_copy_entry_link(copy,
10482 vm_map_copy_last_entry(copy),
10483 entry);
10484 copy->size += tail_size;
10485 vm_map_copy_discard(tail_copy);
10486 tail_copy = NULL;
10487 }
10488 }
10489 return kr;
10490 }
10491
10492
10493 /*
10494 * Routine: vm_map_copy_overwrite_unaligned [internal use only]
10495 *
10496  * Description:
10497 * Physically copy unaligned data
10498 *
10499 * Implementation:
10500 * Unaligned parts of pages have to be physically copied. We use
10501  *	a modified form of vm_fault_copy (which understands non-aligned
10502  *	page offsets and sizes) to do the copy.  We attempt to copy as
10503  *	much memory in one go as possible, however vm_fault_copy copies
10504 * within 1 memory object so we have to find the smaller of "amount left"
10505 * "source object data size" and "target object data size". With
10506 * unaligned data we don't need to split regions, therefore the source
10507 * (copy) object should be one map entry, the target range may be split
10508 * over multiple map entries however. In any event we are pessimistic
10509 * about these assumptions.
10510 *
10511 * Callers of this function must call vm_map_copy_require on
10512 * previously created vm_map_copy_t or pass a newly created
10513 * one to ensure that it hasn't been forged.
10514 *
10515 * Assumptions:
10516 * dst_map is locked on entry and is return locked on success,
10517 * unlocked on error.
10518 */
10519
10520 static kern_return_t
10521 vm_map_copy_overwrite_unaligned(
10522 vm_map_t dst_map,
10523 vm_map_entry_t entry,
10524 vm_map_copy_t copy,
10525 vm_map_offset_t start,
10526 boolean_t discard_on_success)
10527 {
10528 vm_map_entry_t copy_entry;
10529 vm_map_entry_t copy_entry_next;
10530 vm_map_version_t version;
10531 vm_object_t dst_object;
10532 vm_object_offset_t dst_offset;
10533 vm_object_offset_t src_offset;
10534 vm_object_offset_t entry_offset;
10535 vm_map_offset_t entry_end;
10536 vm_map_size_t src_size,
10537 dst_size,
10538 copy_size,
10539 amount_left;
10540 kern_return_t kr = KERN_SUCCESS;
10541
10542
10543 copy_entry = vm_map_copy_first_entry(copy);
10544
10545 vm_map_lock_write_to_read(dst_map);
10546
10547 src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
10548 amount_left = copy->size;
10549 /*
10550 * unaligned so we never clipped this entry, we need the offset into
10551 * the vm_object not just the data.
10552 */
10553 while (amount_left > 0) {
10554 if (entry == vm_map_to_entry(dst_map)) {
10555 vm_map_unlock_read(dst_map);
10556 return KERN_INVALID_ADDRESS;
10557 }
10558
10559 /* "start" must be within the current map entry */
10560 assert((start >= entry->vme_start) && (start < entry->vme_end));
10561
10562 /*
10563 * Check protection again
10564 */
10565 if (!(entry->protection & VM_PROT_WRITE)) {
10566 vm_map_unlock_read(dst_map);
10567 return KERN_PROTECTION_FAILURE;
10568 }
10569 if (entry->is_sub_map) {
10570 /* not implemented... */
10571 vm_map_unlock_read(dst_map);
10572 return KERN_INVALID_ARGUMENT;
10573 }
10574 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10575 vm_map_unlock_read(dst_map);
10576 return KERN_PROTECTION_FAILURE;
10577 }
10578 /*
10579 * If the entry is in transition, we must wait
10580 * for it to exit that state. Anything could happen
10581 * when we unlock the map, so start over.
10582 */
10583 if (entry->in_transition) {
10584 /*
10585 * Say that we are waiting, and wait for entry.
10586 */
10587 entry->needs_wakeup = TRUE;
10588 vm_map_entry_wait(dst_map, THREAD_UNINT);
10589
10590 goto RetryLookup;
10591 }
10592
10593 dst_offset = start - entry->vme_start;
10594
10595 dst_size = entry->vme_end - start;
10596
10597 src_size = copy_entry->vme_end -
10598 (copy_entry->vme_start + src_offset);
10599
10600 if (dst_size < src_size) {
10601 /*
10602 * we can only copy dst_size bytes before
10603 * we have to get the next destination entry
10604 */
10605 copy_size = dst_size;
10606 } else {
10607 /*
10608 * we can only copy src_size bytes before
10609 * we have to get the next source copy entry
10610 */
10611 copy_size = src_size;
10612 }
10613
10614 if (copy_size > amount_left) {
10615 copy_size = amount_left;
10616 }
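		/*
		 * copy_size is now the smallest of: what is left in the
		 * destination entry, what is left in the current source
		 * copy entry, and the total amount left to copy, so the
		 * vm_fault_copy() below never crosses an entry boundary
		 * on either side.
		 */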
10617 /*
10618 		 * Entry needs copy, create a shadow object for the
10619 		 * copy-on-write region.
10620 */
10621 assert(!entry->is_sub_map);
10622 if (entry->needs_copy) {
10623 if (vm_map_lock_read_to_write(dst_map)) {
10624 vm_map_lock_read(dst_map);
10625 goto RetryLookup;
10626 }
10627 VME_OBJECT_SHADOW(entry,
10628 (vm_map_size_t)(entry->vme_end
10629 - entry->vme_start),
10630 vm_map_always_shadow(dst_map));
10631 entry->needs_copy = FALSE;
10632 vm_map_lock_write_to_read(dst_map);
10633 }
10634 dst_object = VME_OBJECT(entry);
10635 /*
10636 * unlike with the virtual (aligned) copy we're going
10637 * to fault on it therefore we need a target object.
10638 */
10639 if (dst_object == VM_OBJECT_NULL) {
10640 if (vm_map_lock_read_to_write(dst_map)) {
10641 vm_map_lock_read(dst_map);
10642 goto RetryLookup;
10643 }
10644 dst_object = vm_object_allocate((vm_map_size_t)
10645 entry->vme_end - entry->vme_start,
10646 dst_map->serial_id);
10647 VME_OBJECT_SET(entry, dst_object, false, 0);
10648 VME_OFFSET_SET(entry, 0);
10649 assert(entry->use_pmap);
10650 vm_map_lock_write_to_read(dst_map);
10651 }
10652 /*
10653 * Take an object reference and unlock map. The "entry" may
10654 * disappear or change when the map is unlocked.
10655 */
10656 vm_object_reference(dst_object);
10657 version.main_timestamp = dst_map->timestamp;
10658 entry_offset = VME_OFFSET(entry);
10659 entry_end = entry->vme_end;
10660 vm_map_unlock_read(dst_map);
10661 /*
10662 * Copy as much as possible in one pass
10663 */
10664 kr = vm_fault_copy(
10665 VME_OBJECT(copy_entry),
10666 VME_OFFSET(copy_entry) + src_offset,
10667 			&copy_size,
10668 dst_object,
10669 entry_offset + dst_offset,
10670 dst_map,
10671 &version,
10672 THREAD_UNINT );
10673
10674 start += copy_size;
10675 src_offset += copy_size;
10676 amount_left -= copy_size;
10677 /*
10678 * Release the object reference
10679 */
10680 vm_object_deallocate(dst_object);
10681 /*
10682 * If a hard error occurred, return it now
10683 */
10684 if (kr != KERN_SUCCESS) {
10685 return kr;
10686 }
10687
10688 if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
10689 || amount_left == 0) {
10690 /*
10691 * all done with this copy entry, dispose.
10692 */
10693 copy_entry_next = copy_entry->vme_next;
10694
10695 if (discard_on_success) {
10696 vm_map_copy_entry_unlink(copy, copy_entry);
10697 assert(!copy_entry->is_sub_map);
10698 vm_object_deallocate(VME_OBJECT(copy_entry));
10699 vm_map_copy_entry_dispose(copy_entry);
10700 }
10701
10702 if (copy_entry_next == vm_map_copy_to_entry(copy) &&
10703 amount_left) {
10704 /*
10705 * not finished copying but run out of source
10706 */
10707 return KERN_INVALID_ADDRESS;
10708 }
10709
10710 copy_entry = copy_entry_next;
10711
10712 src_offset = 0;
10713 }
10714
10715 if (amount_left == 0) {
10716 return KERN_SUCCESS;
10717 }
10718
10719 vm_map_lock_read(dst_map);
10720 if (version.main_timestamp == dst_map->timestamp) {
10721 if (start == entry_end) {
10722 /*
10723 * destination region is split. Use the version
10724 * information to avoid a lookup in the normal
10725 * case.
10726 */
10727 entry = entry->vme_next;
10728 /*
10729 * should be contiguous. Fail if we encounter
10730 * a hole in the destination.
10731 */
10732 if (start != entry->vme_start) {
10733 vm_map_unlock_read(dst_map);
10734 return KERN_INVALID_ADDRESS;
10735 }
10736 }
10737 } else {
10738 /*
10739 * Map version check failed.
10740 * we must lookup the entry because somebody
10741 * might have changed the map behind our backs.
10742 */
10743 RetryLookup:
10744 if (!vm_map_lookup_entry(dst_map, start, &entry)) {
10745 vm_map_unlock_read(dst_map);
10746 return KERN_INVALID_ADDRESS;
10747 }
10748 }
10749 }/* while */
10750
10751 return KERN_SUCCESS;
10752 }/* vm_map_copy_overwrite_unaligned */
10753
10754 /*
10755 * Routine: vm_map_copy_overwrite_aligned [internal use only]
10756 *
10757 * Description:
10758 * Does all the vm_trickery possible for whole pages.
10759 *
10760 * Implementation:
10761 *
10762 * If there are no permanent objects in the destination,
10763 * and the source and destination map entry zones match,
10764 * and the destination map entry is not shared,
10765 * then the map entries can be deleted and replaced
10766 * with those from the copy. The following code is the
10767 * basic idea of what to do, but there are lots of annoying
10768 * little details about getting protection and inheritance
10769 * right. Should add protection, inheritance, and sharing checks
10770 * to the above pass and make sure that no wiring is involved.
10771 *
10772 * Callers of this function must call vm_map_copy_require on
10773 * previously created vm_map_copy_t or pass a newly created
10774 * one to ensure that it hasn't been forged.
10775 */
10776
10777 int vm_map_copy_overwrite_aligned_src_not_internal = 0;
10778 int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
10779 int vm_map_copy_overwrite_aligned_src_large = 0;
10780
10781 static kern_return_t
10782 vm_map_copy_overwrite_aligned(
10783 vm_map_t dst_map,
10784 vm_map_entry_t tmp_entry,
10785 vm_map_copy_t copy,
10786 vm_map_offset_t start,
10787 __unused pmap_t pmap)
10788 {
10789 vm_object_t object;
10790 vm_map_entry_t copy_entry;
10791 vm_map_size_t copy_size;
10792 vm_map_size_t size;
10793 vm_map_entry_t entry;
10794
10795 while ((copy_entry = vm_map_copy_first_entry(copy))
10796 != vm_map_copy_to_entry(copy)) {
10797 copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10798
10799 entry = tmp_entry;
10800
10801 if (entry->is_sub_map) {
10802 /* unnested when clipped earlier */
10803 assert(!entry->use_pmap);
10804 }
10805 if (entry == vm_map_to_entry(dst_map)) {
10806 vm_map_unlock(dst_map);
10807 return KERN_INVALID_ADDRESS;
10808 }
10809 size = (entry->vme_end - entry->vme_start);
10810 /*
10811 * Make sure that no holes popped up in the
10812 * address map, and that the protection is
10813 * still valid, in case the map was unlocked
10814 * earlier.
10815 */
10816
10817 if ((entry->vme_start != start) || ((entry->is_sub_map)
10818 && !entry->needs_copy)) {
10819 vm_map_unlock(dst_map);
10820 return KERN_INVALID_ADDRESS;
10821 }
10822 assert(entry != vm_map_to_entry(dst_map));
10823
10824 /*
10825 * Check protection again
10826 */
10827
10828 if (!(entry->protection & VM_PROT_WRITE)) {
10829 vm_map_unlock(dst_map);
10830 return KERN_PROTECTION_FAILURE;
10831 }
10832
10833 if (entry->is_sub_map) {
10834 /* not properly implemented */
10835 vm_map_unlock(dst_map);
10836 return KERN_PROTECTION_FAILURE;
10837 }
10838
10839 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10840 vm_map_unlock(dst_map);
10841 return KERN_PROTECTION_FAILURE;
10842 }
10843
10844 /*
10845 * If the entry is in transition, we must wait
10846 * for it to exit that state. Anything could happen
10847 * when we unlock the map, so start over.
10848 */
10849 if (entry->in_transition) {
10850 /*
10851 * Say that we are waiting, and wait for entry.
10852 */
10853 entry->needs_wakeup = TRUE;
10854 vm_map_entry_wait(dst_map, THREAD_UNINT);
10855
10856 goto RetryLookup;
10857 }
10858
10859 /*
10860 * Adjust to source size first
10861 */
10862
10863 if (copy_size < size) {
10864 if (entry->map_aligned &&
10865 !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10866 VM_MAP_PAGE_MASK(dst_map))) {
10867 /* no longer map-aligned */
10868 entry->map_aligned = FALSE;
10869 }
10870 vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10871 size = copy_size;
10872 }
10873
10874 /*
10875 * Adjust to destination size
10876 */
10877
10878 if (size < copy_size) {
10879 vm_map_copy_clip_end(copy, copy_entry,
10880 copy_entry->vme_start + size);
10881 copy_size = size;
10882 }
10883
10884 assert((entry->vme_end - entry->vme_start) == size);
10885 assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10886 assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10887
10888 /*
10889 * If the destination contains temporary unshared memory,
10890 * we can perform the copy by throwing it away and
10891 * installing the source data.
10892 *
10893 * Exceptions for mappings with special semantics:
10894 * + "permanent" entries,
10895 * + JIT regions,
10896 * + TPRO regions,
10897 * + pmap-specific protection policies,
10898 * + VM objects with COPY_NONE copy strategy.
10899 */
10900
10901 object = VME_OBJECT(entry);
10902 if ((!entry->is_shared &&
10903 !entry->vme_permanent &&
10904 !entry->used_for_jit &&
10905 #if __arm64e__
10906 !entry->used_for_tpro &&
10907 #endif /* __arm64e__ */
10908 !(entry->protection & VM_PROT_EXECUTE) &&
10909 !pmap_has_prot_policy(dst_map->pmap, entry->translated_allow_execute, entry->protection) &&
10910 ((object == VM_OBJECT_NULL) ||
10911 (object->internal &&
10912 !object->true_share &&
10913 object->copy_strategy != MEMORY_OBJECT_COPY_NONE))) ||
10914 entry->needs_copy) {
10915 vm_object_t old_object = VME_OBJECT(entry);
10916 vm_object_offset_t old_offset = VME_OFFSET(entry);
10917 vm_object_offset_t offset;
10918
10919 assert(!entry->is_sub_map);
10920 /*
10921 * Ensure that the source and destination aren't
10922 * identical
10923 */
10924 if (old_object == VME_OBJECT(copy_entry) &&
10925 old_offset == VME_OFFSET(copy_entry)) {
10926 vm_map_copy_entry_unlink(copy, copy_entry);
10927 vm_map_copy_entry_dispose(copy_entry);
10928
10929 if (old_object != VM_OBJECT_NULL) {
10930 vm_object_deallocate(old_object);
10931 }
10932
10933 start = tmp_entry->vme_end;
10934 tmp_entry = tmp_entry->vme_next;
10935 continue;
10936 }
10937
10938 #if XNU_TARGET_OS_OSX
10939 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10940 #define __TRADEOFF1_COPY_SIZE (128 * 1024) /* 128 KB */
10941 if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10942 VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10943 copy_size <= __TRADEOFF1_COPY_SIZE) {
10944 /*
10945 * Virtual vs. Physical copy tradeoff #1.
10946 *
10947 * Copying only a few pages out of a large
10948 * object: do a physical copy instead of
10949 * a virtual copy, to avoid possibly keeping
10950 * the entire large object alive because of
10951 * those few copy-on-write pages.
10952 */
10953 vm_map_copy_overwrite_aligned_src_large++;
10954 goto slow_copy;
10955 }
10956 #endif /* XNU_TARGET_OS_OSX */
10957
10958 if ((dst_map->pmap != kernel_pmap) &&
10959 (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10960 (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10961 vm_object_t new_object, new_shadow;
10962
10963 /*
10964 * We're about to map something over a mapping
10965 * established by malloc()...
10966 */
10967 new_object = VME_OBJECT(copy_entry);
10968 if (new_object != VM_OBJECT_NULL) {
10969 vm_object_lock_shared(new_object);
10970 }
10971 while (new_object != VM_OBJECT_NULL &&
10972 #if XNU_TARGET_OS_OSX
10973 !new_object->true_share &&
10974 new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10975 #endif /* XNU_TARGET_OS_OSX */
10976 new_object->internal) {
10977 new_shadow = new_object->shadow;
10978 if (new_shadow == VM_OBJECT_NULL) {
10979 break;
10980 }
10981 vm_object_lock_shared(new_shadow);
10982 vm_object_unlock(new_object);
10983 new_object = new_shadow;
10984 }
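				/*
				 * The walk above descended the shadow chain to
				 * the object that actually provides backing
				 * store for the new mapping.  The checks below
				 * bail to the slow (physical copy) path if that
				 * backing store is not plain anonymous memory,
				 * so re-usable malloc'ed memory is not replaced
				 * by a less recyclable mapping.
				 */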
10985 if (new_object != VM_OBJECT_NULL) {
10986 if (!new_object->internal) {
10987 /*
10988 * The new mapping is backed
10989 * by an external object. We
10990 * don't want malloc'ed memory
10991 * to be replaced with such a
10992 * non-anonymous mapping, so
10993 * let's go off the optimized
10994 * path...
10995 */
10996 vm_map_copy_overwrite_aligned_src_not_internal++;
10997 vm_object_unlock(new_object);
10998 goto slow_copy;
10999 }
11000 #if XNU_TARGET_OS_OSX
11001 if (new_object->true_share ||
11002 new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
11003 /*
11004 * Same if there's a "true_share"
11005 * object in the shadow chain, or
11006 * an object with a non-default
11007 * (SYMMETRIC) copy strategy.
11008 */
11009 vm_map_copy_overwrite_aligned_src_not_symmetric++;
11010 vm_object_unlock(new_object);
11011 goto slow_copy;
11012 }
11013 #endif /* XNU_TARGET_OS_OSX */
11014 vm_object_unlock(new_object);
11015 }
11016 /*
11017 * The new mapping is still backed by
11018 * anonymous (internal) memory, so it's
11019 * OK to substitute it for the original
11020 * malloc() mapping.
11021 */
11022 }
11023
11024 if (old_object != VM_OBJECT_NULL) {
11025 assert(!entry->vme_permanent);
11026 if (entry->is_sub_map) {
11027 if (entry->use_pmap) {
11028 #ifndef NO_NESTED_PMAP
11029 pmap_unnest(dst_map->pmap,
11030 (addr64_t)entry->vme_start,
11031 entry->vme_end - entry->vme_start);
11032 #endif /* NO_NESTED_PMAP */
11033 if (dst_map->mapped_in_other_pmaps) {
11034 /* clean up parent */
11035 /* map/maps */
11036 vm_map_submap_pmap_clean(
11037 dst_map, entry->vme_start,
11038 entry->vme_end,
11039 VME_SUBMAP(entry),
11040 VME_OFFSET(entry));
11041 }
11042 } else {
11043 vm_map_submap_pmap_clean(
11044 dst_map, entry->vme_start,
11045 entry->vme_end,
11046 VME_SUBMAP(entry),
11047 VME_OFFSET(entry));
11048 }
11049 vm_map_deallocate(VME_SUBMAP(entry));
11050 } else {
11051 if (dst_map->mapped_in_other_pmaps) {
11052 vm_object_pmap_protect_options(
11053 VME_OBJECT(entry),
11054 VME_OFFSET(entry),
11055 entry->vme_end
11056 - entry->vme_start,
11057 PMAP_NULL,
11058 PAGE_SIZE,
11059 entry->vme_start,
11060 VM_PROT_NONE,
11061 PMAP_OPTIONS_REMOVE);
11062 } else {
11063 pmap_remove_options(
11064 dst_map->pmap,
11065 (addr64_t)(entry->vme_start),
11066 (addr64_t)(entry->vme_end),
11067 PMAP_OPTIONS_REMOVE);
11068 }
11069 vm_object_deallocate(old_object);
11070 }
11071 }
11072
11073 if (entry->iokit_acct) {
11074 /* keep using iokit accounting */
11075 entry->use_pmap = FALSE;
11076 } else {
11077 /* use pmap accounting */
11078 entry->use_pmap = TRUE;
11079 }
11080 assert(!entry->vme_permanent);
11081 VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
11082 object = VME_OBJECT(entry);
11083 entry->needs_copy = copy_entry->needs_copy;
11084 entry->wired_count = 0;
11085 entry->user_wired_count = 0;
11086 offset = VME_OFFSET(copy_entry);
11087 VME_OFFSET_SET(entry, offset);
11088
11089 vm_map_copy_entry_unlink(copy, copy_entry);
11090 vm_map_copy_entry_dispose(copy_entry);
11091
11092 /*
11093 * we could try to push pages into the pmap at this point, BUT
11094 * this optimization only saved on average 2 us per page if ALL
11095 * the pages in the source were currently mapped
11096 * and ALL the pages in the dest were touched, if there were fewer
11097 * than 2/3 of the pages touched, this optimization actually cost more cycles
11098 * it also puts a lot of pressure on the pmap layer w/r to mapping structures
11099 */
11100
11101 /*
11102 * Set up for the next iteration. The map
11103 * has not been unlocked, so the next
11104 * address should be at the end of this
11105 * entry, and the next map entry should be
11106 * the one following it.
11107 */
11108
11109 start = tmp_entry->vme_end;
11110 tmp_entry = tmp_entry->vme_next;
11111 } else {
11112 vm_map_version_t version;
11113 vm_object_t dst_object;
11114 vm_object_offset_t dst_offset;
11115 kern_return_t r;
11116
11117 slow_copy:
11118 if (entry->needs_copy) {
11119 VME_OBJECT_SHADOW(entry,
11120 (entry->vme_end -
11121 entry->vme_start),
11122 vm_map_always_shadow(dst_map));
11123 entry->needs_copy = FALSE;
11124 }
11125
11126 dst_object = VME_OBJECT(entry);
11127 dst_offset = VME_OFFSET(entry);
11128
11129 /*
11130 * Take an object reference, and record
11131 * the map version information so that the
11132 * map can be safely unlocked.
11133 */
11134
11135 if (dst_object == VM_OBJECT_NULL) {
11136 /*
11137 * We would usually have just taken the
11138 * optimized path above if the destination
11139 * object has not been allocated yet. But we
11140 * now disable that optimization if the copy
11141 * entry's object is not backed by anonymous
11142 * memory to avoid replacing malloc'ed
11143 * (i.e. re-usable) anonymous memory with a
11144 * not-so-anonymous mapping.
11145 * So we have to handle this case here and
11146 * allocate a new VM object for this map entry.
11147 */
11148 dst_object = vm_object_allocate(
11149 entry->vme_end - entry->vme_start,
11150 dst_map->serial_id
11151 );
11152 dst_offset = 0;
11153 VME_OBJECT_SET(entry, dst_object, false, 0);
11154 VME_OFFSET_SET(entry, dst_offset);
11155 assert(entry->use_pmap);
11156 }
11157
11158 vm_object_reference(dst_object);
11159
11160 /* account for unlock bumping up timestamp */
11161 version.main_timestamp = dst_map->timestamp + 1;
11162
11163 vm_map_unlock(dst_map);
11164
11165 /*
11166 * Copy as much as possible in one pass
11167 */
11168
11169 copy_size = size;
11170 r = vm_fault_copy(
11171 VME_OBJECT(copy_entry),
11172 VME_OFFSET(copy_entry),
11173 			&copy_size,
11174 dst_object,
11175 dst_offset,
11176 dst_map,
11177 &version,
11178 THREAD_UNINT );
11179
11180 /*
11181 * Release the object reference
11182 */
11183
11184 vm_object_deallocate(dst_object);
11185
11186 /*
11187 * If a hard error occurred, return it now
11188 */
11189
11190 if (r != KERN_SUCCESS) {
11191 return r;
11192 }
11193
11194 if (copy_size != 0) {
11195 /*
11196 * Dispose of the copied region
11197 */
11198
11199 vm_map_copy_clip_end(copy, copy_entry,
11200 copy_entry->vme_start + copy_size);
11201 vm_map_copy_entry_unlink(copy, copy_entry);
11202 vm_object_deallocate(VME_OBJECT(copy_entry));
11203 vm_map_copy_entry_dispose(copy_entry);
11204 }
11205
11206 /*
11207 * Pick up in the destination map where we left off.
11208 *
11209 * Use the version information to avoid a lookup
11210 * in the normal case.
11211 */
11212
11213 start += copy_size;
11214 vm_map_lock(dst_map);
11215 if (version.main_timestamp == dst_map->timestamp &&
11216 copy_size != 0) {
11217 /* We can safely use saved tmp_entry value */
11218
11219 if (tmp_entry->map_aligned &&
11220 !VM_MAP_PAGE_ALIGNED(
11221 start,
11222 VM_MAP_PAGE_MASK(dst_map))) {
11223 /* no longer map-aligned */
11224 tmp_entry->map_aligned = FALSE;
11225 }
11226 vm_map_clip_end(dst_map, tmp_entry, start);
11227 tmp_entry = tmp_entry->vme_next;
11228 } else {
11229 /* Must do lookup of tmp_entry */
11230
11231 RetryLookup:
11232 if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
11233 vm_map_unlock(dst_map);
11234 return KERN_INVALID_ADDRESS;
11235 }
11236 if (tmp_entry->map_aligned &&
11237 !VM_MAP_PAGE_ALIGNED(
11238 start,
11239 VM_MAP_PAGE_MASK(dst_map))) {
11240 /* no longer map-aligned */
11241 tmp_entry->map_aligned = FALSE;
11242 }
11243 vm_map_clip_start(dst_map, tmp_entry, start);
11244 }
11245 }
11246 }/* while */
11247
11248 return KERN_SUCCESS;
11249 }/* vm_map_copy_overwrite_aligned */
11250
11251 /*
11252 * Routine: vm_map_copyin_kernel_buffer [internal use only]
11253 *
11254 * Description:
11255 * Copy in data to a kernel buffer from space in the
11256 * source map. The original space may be optionally
11257 * deallocated.
11258 *
11259 * If successful, returns a new copy object.
11260 */
11261 static kern_return_t
11262 vm_map_copyin_kernel_buffer(
11263 vm_map_t src_map,
11264 vm_map_offset_t src_addr,
11265 vm_map_size_t len,
11266 boolean_t src_destroy,
11267 vm_map_copy_t *copy_result)
11268 {
11269 kern_return_t kr;
11270 vm_map_copy_t copy;
11271 void *kdata;
11272
11273 if (len > msg_ool_size_small) {
11274 return KERN_INVALID_ARGUMENT;
11275 }
11276
11277 kdata = kalloc_data(len, Z_WAITOK);
11278 if (kdata == NULL) {
11279 return KERN_RESOURCE_SHORTAGE;
11280 }
11281 kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
11282 if (kr != KERN_SUCCESS) {
11283 kfree_data(kdata, len);
11284 return kr;
11285 }
11286
11287 copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
11288 copy->cpy_kdata = kdata;
11289 copy->size = len;
11290 copy->offset = 0;
11291
11292 if (src_destroy) {
11293 vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
11294
11295 if (src_map == kernel_map) {
11296 flags |= VM_MAP_REMOVE_KUNWIRE;
11297 }
11298
11299 (void)vm_map_remove_guard(src_map,
11300 vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11301 vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
11302 flags, KMEM_GUARD_NONE);
11303 }
11304
11305 *copy_result = copy;
11306 return KERN_SUCCESS;
11307 }
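
#if 0
/*
 * Illustrative sketch (not compiled): what a consumer of the routine
 * above can rely on. A copy object built by the kernel-buffer path
 * carries its payload in a kalloc'ed buffer ("cpy_kdata") rather than
 * in a chain of map entries. The function name below is hypothetical.
 */
static void
example_inspect_kernel_buffer_copy(vm_map_copy_t copy)
{
	assert(copy->type == VM_MAP_COPY_KERNEL_BUFFER);
	assert(copy->size <= msg_ool_size_small);
	assert(copy->offset == 0);
	/* copy->cpy_kdata points at the copied bytes */
}
#endif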
11308
11309 /*
11310 * Routine: vm_map_copyout_kernel_buffer [internal use only]
11311 *
11312 * Description:
11313 * Copy out data from a kernel buffer into space in the
11314 * destination map. The space may optionally be dynamically
11315 * allocated.
11316 *
11317 * If successful, consumes the copy object.
11318 * Otherwise, the caller is responsible for it.
11319 *
11320 * Callers of this function must call vm_map_copy_require on
11321 * previously created vm_map_copy_t or pass a newly created
11322 * one to ensure that it hasn't been forged.
11323 */
11324 static int vm_map_copyout_kernel_buffer_failures = 0;
11325 static kern_return_t
11326 vm_map_copyout_kernel_buffer(
11327 vm_map_t map,
11328 vm_map_address_t *addr, /* IN/OUT */
11329 vm_map_copy_t copy,
11330 vm_map_size_t copy_size,
11331 boolean_t overwrite,
11332 boolean_t consume_on_success)
11333 {
11334 kern_return_t kr = KERN_SUCCESS;
11335 thread_t thread = current_thread();
11336
11337 assert(copy->size == copy_size);
11338
11339 /*
11340 * check for corrupted vm_map_copy structure
11341 */
11342 if (copy_size > msg_ool_size_small || copy->offset) {
11343 panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
11344 (long long)copy->size, (long long)copy->offset);
11345 }
11346
11347 if (!overwrite) {
11348 /*
11349 * Allocate space in the target map for the data
11350 */
11351 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11352
11353 if (map == kernel_map) {
11354 vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
11355 }
11356
11357 *addr = 0;
11358 kr = vm_map_enter(map,
11359 addr,
11360 vm_map_round_page(copy_size,
11361 VM_MAP_PAGE_MASK(map)),
11362 (vm_map_offset_t) 0,
11363 vmk_flags,
11364 VM_OBJECT_NULL,
11365 (vm_object_offset_t) 0,
11366 FALSE,
11367 VM_PROT_DEFAULT,
11368 VM_PROT_ALL,
11369 VM_INHERIT_DEFAULT);
11370 if (kr != KERN_SUCCESS) {
11371 return kr;
11372 }
11373 #if KASAN
11374 if (map->pmap == kernel_pmap) {
11375 kasan_notify_address(*addr, copy->size);
11376 }
11377 #endif
11378 }
11379
11380 /*
11381 * Copyout the data from the kernel buffer to the target map.
11382 */
11383 if (thread->map == map) {
11384 /*
11385 * If the target map is the current map, just do
11386 * the copy.
11387 */
11388 assert((vm_size_t)copy_size == copy_size);
11389 if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11390 kr = KERN_INVALID_ADDRESS;
11391 }
11392 } else {
11393 vm_map_switch_context_t switch_ctx;
11394
11395 /*
11396 * If the target map is another map, assume the
11397 * target's address space identity for the duration
11398 * of the copy.
11399 */
11400 vm_map_reference(map);
11401 switch_ctx = vm_map_switch_to(map);
11402
11403 assert((vm_size_t)copy_size == copy_size);
11404 if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11405 vm_map_copyout_kernel_buffer_failures++;
11406 kr = KERN_INVALID_ADDRESS;
11407 }
11408
11409 vm_map_switch_back(switch_ctx);
11410 vm_map_deallocate(map);
11411 }
11412
11413 if (kr != KERN_SUCCESS) {
11414 /* the copy failed, clean up */
11415 if (!overwrite) {
11416 /*
11417 * Deallocate the space we allocated in the target map.
11418 */
11419 (void) vm_map_remove(map,
11420 vm_map_trunc_page(*addr,
11421 VM_MAP_PAGE_MASK(map)),
11422 vm_map_round_page((*addr +
11423 vm_map_round_page(copy_size,
11424 VM_MAP_PAGE_MASK(map))),
11425 VM_MAP_PAGE_MASK(map)));
11426 *addr = 0;
11427 }
11428 } else {
11429 /* copy was successful, discard the copy structure */
11430 if (consume_on_success) {
11431 kfree_data(copy->cpy_kdata, copy_size);
11432 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11433 }
11434 }
11435
11436 return kr;
11437 }
11438
11439 /*
11440 * Routine: vm_map_copy_insert [internal use only]
11441 *
11442 * Description:
11443 * Link a copy chain ("copy") into a map at the
11444 * specified location (after "where").
11445 *
11446 * Callers of this function must call vm_map_copy_require on
11447 * previously created vm_map_copy_t or pass a newly created
11448 * one to ensure that it hasn't been forged.
11449 * Side effects:
11450 * The copy chain is destroyed.
11451 */
11452 static void
11453 vm_map_copy_insert(
11454 vm_map_t map,
11455 vm_map_entry_t after_where,
11456 vm_map_copy_t copy)
11457 {
11458 vm_map_entry_t entry;
11459
11460 while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11461 entry = vm_map_copy_first_entry(copy);
11462 vm_map_copy_entry_unlink(copy, entry);
11463 vm_map_store_entry_link(map, after_where, entry,
11464 VM_MAP_KERNEL_FLAGS_NONE);
11465 after_where = entry;
11466 }
11467 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11468 }
11469
11470 /*
11471 * Callers of this function must call vm_map_copy_require on
11472 * previously created vm_map_copy_t or pass a newly created
11473 * one to ensure that it hasn't been forged.
11474 */
11475 void
11476 vm_map_copy_remap(
11477 vm_map_t map,
11478 vm_map_entry_t where,
11479 vm_map_copy_t copy,
11480 vm_map_offset_t adjustment,
11481 vm_prot_t cur_prot,
11482 vm_prot_t max_prot,
11483 vm_inherit_t inheritance)
11484 {
11485 vm_map_entry_t copy_entry, new_entry;
11486
11487 for (copy_entry = vm_map_copy_first_entry(copy);
11488 copy_entry != vm_map_copy_to_entry(copy);
11489 copy_entry = copy_entry->vme_next) {
11490 /* get a new VM map entry for the map */
11491 new_entry = vm_map_entry_create(map);
11492 /* copy the "copy entry" to the new entry */
11493 vm_map_entry_copy(map, new_entry, copy_entry);
11494 /* adjust "start" and "end" */
11495 new_entry->vme_start += adjustment;
11496 new_entry->vme_end += adjustment;
11497 /* clear some attributes */
11498 new_entry->inheritance = inheritance;
11499 new_entry->protection = cur_prot;
11500 new_entry->max_protection = max_prot;
11501 new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11502 /* take an extra reference on the entry's "object" */
11503 if (new_entry->is_sub_map) {
11504 assert(!new_entry->use_pmap); /* not nested */
11505 vm_map_reference(VME_SUBMAP(new_entry));
11506 } else {
11507 vm_object_reference(VME_OBJECT(new_entry));
11508 }
11509 /* insert the new entry in the map */
11510 vm_map_store_entry_link(map, where, new_entry,
11511 VM_MAP_KERNEL_FLAGS_NONE);
11512 /* continue inserting the "copy entries" after the new entry */
11513 where = new_entry;
11514 }
11515 }
11516
11517
11518 /*
11519 * Returns true if *size matches (or is in the range of) copy->size.
11520 * Upon returning true, the *size field is updated with the actual size of the
11521 * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11522 */
11523 boolean_t
11524 vm_map_copy_validate_size(
11525 vm_map_t dst_map,
11526 vm_map_copy_t copy,
11527 vm_map_size_t *size)
11528 {
11529 if (copy == VM_MAP_COPY_NULL) {
11530 return FALSE;
11531 }
11532
11533 /*
11534 * Assert that the vm_map_copy is coming from the right
11535 * zone and hasn't been forged
11536 */
11537 vm_map_copy_require(copy);
11538
11539 vm_map_size_t copy_sz = copy->size;
11540 vm_map_size_t sz = *size;
11541 switch (copy->type) {
11542 case VM_MAP_COPY_KERNEL_BUFFER:
11543 if (sz == copy_sz) {
11544 return TRUE;
11545 }
11546 break;
11547 case VM_MAP_COPY_ENTRY_LIST:
11548 /*
11549 * potential page-size rounding prevents us from exactly
11550 * validating this flavor of vm_map_copy, but we can at least
11551 * assert that it's within a range.
11552 */
11553 if (copy_sz >= sz &&
11554 copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11555 *size = copy_sz;
11556 return TRUE;
11557 }
11558 break;
11559 default:
11560 break;
11561 }
11562 return FALSE;
11563 }
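
#if 0
/*
 * Illustrative sketch (not compiled): a hypothetical caller that was
 * handed a copy object together with an independently supplied size
 * validates the pair before copying out, and then uses the (possibly
 * page-rounded) size that vm_map_copy_validate_size() wrote back.
 * The unsafe-type wrapping of the size argument to
 * vm_map_copyout_size() is elided here for clarity.
 */
static kern_return_t
example_checked_copyout(
	vm_map_t                dst_map,
	vm_map_copy_t           copy,
	vm_map_size_t           claimed_size,
	vm_map_address_t        *dst_addr)
{
	vm_map_size_t sz = claimed_size;

	if (!vm_map_copy_validate_size(dst_map, copy, &sz)) {
		return KERN_FAILURE;
	}
	return vm_map_copyout_size(dst_map, dst_addr, copy, sz);
}
#endif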
11564
11565 static kern_return_t
11566 vm_map_copyout_internal(
11567 vm_map_t dst_map,
11568 vm_map_address_t *dst_addr, /* OUT */
11569 vm_map_copy_t copy,
11570 vm_map_size_ut copy_size_u,
11571 boolean_t consume_on_success,
11572 vm_prot_t cur_protection,
11573 vm_prot_t max_protection,
11574 vm_inherit_t inheritance)
11575 {
11576 vm_map_size_t size, copy_size;
11577 vm_map_size_t adjustment;
11578 vm_map_offset_t start;
11579 vm_object_offset_t vm_copy_start;
11580 vm_map_entry_t last;
11581 vm_map_entry_t entry;
11582 vm_map_copy_t original_copy;
11583 kern_return_t kr;
11584 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11585
11586 /*
11587 * Check for null copy object.
11588 */
11589
11590 if (copy == VM_MAP_COPY_NULL) {
11591 *dst_addr = 0;
11592 return KERN_SUCCESS;
11593 }
11594
11595 /*
11596 * Assert that the vm_map_copy is coming from the right
11597 * zone and hasn't been forged
11598 */
11599 vm_map_copy_require(copy);
11600
11601 if (!VM_SANITIZE_UNSAFE_IS_EQUAL(copy_size_u, copy->size)) {
11602 *dst_addr = 0;
11603 ktriage_record(thread_tid(current_thread()),
11604 KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
11605 KDBG_TRIAGE_RESERVED,
11606 KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR),
11607 KERN_FAILURE /* arg */);
11608 return KERN_FAILURE;
11609 }
11610 copy_size = copy->size;
11611
11612 /*
11613 * Check for special kernel buffer allocated
11614 * by new_ipc_kmsg_copyin.
11615 */
11616
11617 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
11618 kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr,
11619 copy, copy_size, FALSE,
11620 consume_on_success);
11621 if (kr) {
11622 ktriage_record(thread_tid(current_thread()),
11623 KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
11624 KDBG_TRIAGE_RESERVED,
11625 KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */);
11626 }
11627 return kr;
11628 }
11629
11630
11631 original_copy = copy;
11632 if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
11633 vm_map_copy_t target_copy;
11634 vm_map_offset_t overmap_start, overmap_end, trimmed_start;
11635
11636 target_copy = VM_MAP_COPY_NULL;
11637 DEBUG4K_ADJUST("adjusting...\n");
11638 kr = vm_map_copy_adjust_to_target(
11639 copy,
11640 0, /* offset */
11641 copy->size, /* size */
11642 dst_map,
11643 TRUE, /* copy */
11644 &target_copy,
11645 &overmap_start,
11646 &overmap_end,
11647 &trimmed_start);
11648 if (kr != KERN_SUCCESS) {
11649 DEBUG4K_COPY("adjust failed 0x%x\n", kr);
11650 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */);
11651 return kr;
11652 }
11653 DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
11654 if (target_copy != copy) {
11655 copy = target_copy;
11656 }
11657 copy_size = copy->size;
11658 }
11659
11660 /*
11661 * Find space for the data
11662 */
11663
11664 vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
11665 VM_MAP_COPY_PAGE_MASK(copy));
11666 size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
11667 VM_MAP_COPY_PAGE_MASK(copy))
11668 - vm_copy_start;
11669
11670 vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map, size);
11671
11672 vm_map_lock(dst_map);
11673 kr = vm_map_locate_space_anywhere(dst_map, size, 0, vmk_flags,
11674 &start, &last);
11675 if (kr != KERN_SUCCESS) {
11676 vm_map_unlock(dst_map);
11677 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */);
11678 return kr;
11679 }
11680
11681 adjustment = start - vm_copy_start;
11682 if (!consume_on_success) {
11683 /*
11684 * We're not allowed to consume "copy", so we'll have to
11685 * copy its map entries into the destination map below.
11686 * No need to re-allocate map entries from the correct
11687 * (pageable or not) zone, since we'll get new map entries
11688 * during the transfer.
11689 * We'll also adjust the map entries's "start" and "end"
11690 * during the transfer, to keep "copy"'s entries consistent
11691 * with its "offset".
11692 */
11693 goto after_adjustments;
11694 }
11695
11696 /*
11697 * Since we're going to just drop the map
11698 * entries from the copy into the destination
11699 * map, they must come from the same pool.
11700 */
11701
11702 if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
11703 /*
11704 * Mismatches occur when dealing with the default
11705 * pager.
11706 */
11707 vm_map_entry_t next, new;
11708
11709 /*
11710 * Find the zone that the copies were allocated from
11711 */
11712
11713 entry = vm_map_copy_first_entry(copy);
11714
11715 /*
11716 * Reinitialize the copy so that vm_map_copy_entry_link
11717 * will work.
11718 */
11719 vm_map_store_copy_reset(copy, entry);
11720 copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;
11721
11722 /*
11723 * Copy each entry.
11724 */
11725 while (entry != vm_map_copy_to_entry(copy)) {
11726 new = vm_map_copy_entry_create(copy);
11727 vm_map_entry_copy_full(new, entry);
11728 new->vme_no_copy_on_read = FALSE;
11729 assert(!new->iokit_acct);
11730 if (new->is_sub_map) {
11731 /* clr address space specifics */
11732 new->use_pmap = FALSE;
11733 }
11734 vm_map_copy_entry_link(copy,
11735 vm_map_copy_last_entry(copy),
11736 new);
11737 next = entry->vme_next;
11738 vm_map_entry_dispose(entry);
11739 entry = next;
11740 }
11741 }
11742
11743 /*
11744 * Adjust the addresses in the copy chain, and
11745 * reset the region attributes.
11746 */
11747
11748 for (entry = vm_map_copy_first_entry(copy);
11749 entry != vm_map_copy_to_entry(copy);
11750 entry = entry->vme_next) {
11751 if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
11752 /*
11753 * We're injecting this copy entry into a map that
11754 * has the standard page alignment, so clear
11755 * "map_aligned" (which might have been inherited
11756 * from the original map entry).
11757 */
11758 entry->map_aligned = FALSE;
11759 }
11760
11761 entry->vme_start += adjustment;
11762 entry->vme_end += adjustment;
11763
11764 if (entry->map_aligned) {
11765 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
11766 VM_MAP_PAGE_MASK(dst_map)));
11767 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
11768 VM_MAP_PAGE_MASK(dst_map)));
11769 }
11770
11771 entry->inheritance = VM_INHERIT_DEFAULT;
11772 entry->protection = VM_PROT_DEFAULT;
11773 entry->max_protection = VM_PROT_ALL;
11774 entry->behavior = VM_BEHAVIOR_DEFAULT;
11775
11776 /*
11777 * If the entry is now wired,
11778 * map the pages into the destination map.
11779 */
11780 if (entry->wired_count != 0) {
11781 vm_map_offset_t va;
11782 vm_object_offset_t offset;
11783 vm_object_t object;
11784 vm_prot_t prot;
11785 int type_of_fault;
11786 uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;
11787
11788 /* TODO4K would need to use actual page size */
11789 assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);
11790
11791 object = VME_OBJECT(entry);
11792 offset = VME_OFFSET(entry);
11793 va = entry->vme_start;
11794
11795 pmap_pageable(dst_map->pmap,
11796 entry->vme_start,
11797 entry->vme_end,
11798 TRUE);
11799
11800 while (va < entry->vme_end) {
11801 vm_page_t m;
11802 struct vm_object_fault_info fault_info = {
11803 .interruptible = THREAD_UNINT,
11804 };
11805
11806 /*
11807 * Look up the page in the object.
11808 * Assert that the page will be found in the
11809 * top object:
11810 * either
11811 * the object was newly created by
11812 * vm_object_copy_slowly, and has
11813 * copies of all of the pages from
11814 * the source object
11815 * or
11816 * the object was moved from the old
11817 * map entry; because the old map
11818 * entry was wired, all of the pages
11819 * were in the top-level object.
11820 * (XXX not true if we wire pages for
11821 * reading)
11822 */
11823 vm_object_lock(object);
11824
11825 m = vm_page_lookup(object, offset);
11826 if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
11827 m->vmp_absent) {
11828 panic("vm_map_copyout: wiring %p", m);
11829 }
11830
11831 prot = entry->protection;
11832
11833 if (override_nx(dst_map, VME_ALIAS(entry)) &&
11834 prot) {
11835 prot |= VM_PROT_EXECUTE;
11836 }
11837
11838 type_of_fault = DBG_CACHE_HIT_FAULT;
11839
11840 fault_info.user_tag = VME_ALIAS(entry);
11841 fault_info.pmap_options = 0;
11842 if (entry->iokit_acct ||
11843 (!entry->is_sub_map && !entry->use_pmap)) {
11844 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
11845 }
11846 if (entry->vme_xnu_user_debug &&
11847 !VM_PAGE_OBJECT(m)->code_signed) {
11848 /*
11849 * Modified code-signed executable
11850 * region: this page does not belong
11851 * to a code-signed VM object, so it
11852 * must have been copied and should
11853 * therefore be typed XNU_USER_DEBUG
11854 * rather than XNU_USER_EXEC.
11855 */
11856 fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
11857 }
11858
11859 vm_fault_enter(m,
11860 dst_map->pmap,
11861 va,
11862 PAGE_SIZE, 0,
11863 prot,
11864 prot,
11865 VM_PAGE_WIRED(m),
11866 VM_KERN_MEMORY_NONE, /* tag - not wiring */
11867 &fault_info,
11868 NULL, /* need_retry */
11869 &type_of_fault,
11870 &object_lock_type); /*Exclusive mode lock. Will remain unchanged.*/
11871
11872 vm_object_unlock(object);
11873
11874 offset += PAGE_SIZE_64;
11875 va += PAGE_SIZE;
11876 }
11877 }
11878 }
11879
11880 after_adjustments:
11881
11882 /*
11883 * Correct the page alignment for the result
11884 */
11885
11886 *dst_addr = start + (copy->offset - vm_copy_start);
11887
11888 #if KASAN
11889 kasan_notify_address(*dst_addr, size);
11890 #endif
11891
11892 /*
11893 * Update the hints and the map size
11894 */
11895
11896 if (consume_on_success) {
11897 SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
11898 } else {
11899 SAVE_HINT_MAP_WRITE(dst_map, last);
11900 }
11901
11902 dst_map->size += size;
11903
11904 /*
11905 * Link in the copy
11906 */
11907
11908 if (consume_on_success) {
11909 vm_map_copy_insert(dst_map, last, copy);
11910 if (copy != original_copy) {
11911 vm_map_copy_discard(original_copy);
11912 original_copy = VM_MAP_COPY_NULL;
11913 }
11914 } else {
11915 vm_map_copy_remap(dst_map, last, copy, adjustment,
11916 cur_protection, max_protection,
11917 inheritance);
11918 if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
11919 vm_map_copy_discard(copy);
11920 copy = original_copy;
11921 }
11922 }
11923
11924
11925 vm_map_unlock(dst_map);
11926
11927 /*
11928 * XXX If wiring_required, call vm_map_pageable
11929 */
11930
11931 return KERN_SUCCESS;
11932 }
11933
11934 /*
11935 * Routine: vm_map_copyout_size
11936 *
11937 * Description:
11938 * Copy out a copy chain ("copy") into newly-allocated
11939 * space in the destination map. Uses a prevalidated
11940 * size for the copy object (vm_map_copy_validate_size).
11941 *
11942 * If successful, consumes the copy object.
11943 * Otherwise, the caller is responsible for it.
11944 */
11945 kern_return_t
11946 vm_map_copyout_size(
11947 vm_map_t dst_map,
11948 vm_map_address_t *dst_addr, /* OUT */
11949 vm_map_copy_t copy,
11950 vm_map_size_ut copy_size)
11951 {
11952 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11953 TRUE, /* consume_on_success */
11954 VM_PROT_DEFAULT,
11955 VM_PROT_ALL,
11956 VM_INHERIT_DEFAULT);
11957 }
11958
11959 /*
11960 * Routine: vm_map_copyout
11961 *
11962 * Description:
11963 * Copy out a copy chain ("copy") into newly-allocated
11964 * space in the destination map.
11965 *
11966 * If successful, consumes the copy object.
11967 * Otherwise, the caller is responsible for it.
11968 */
11969 kern_return_t
11970 vm_map_copyout(
11971 vm_map_t dst_map,
11972 vm_map_address_t *dst_addr, /* OUT */
11973 vm_map_copy_t copy)
11974 {
11975 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11976 TRUE, /* consume_on_success */
11977 VM_PROT_DEFAULT,
11978 VM_PROT_ALL,
11979 VM_INHERIT_DEFAULT);
11980 }
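
#if 0
/*
 * Illustrative sketch (not compiled): the classic pairing of the two
 * entry points above -- pull a range out of one map as a copy object,
 * then drop it into another map at a newly allocated address. The
 * function name is hypothetical and the unsafe-type wrapping of the
 * address/length arguments to vm_map_copyin() is elided for clarity.
 */
static kern_return_t
example_transfer_range(
	vm_map_t                src_map,
	vm_map_t                dst_map,
	vm_map_address_t        src_addr,
	vm_map_size_t           len,
	vm_map_address_t        *dst_addr)
{
	vm_map_copy_t   copy;
	kern_return_t   kr;

	kr = vm_map_copyin(src_map, src_addr, len, FALSE, &copy);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	kr = vm_map_copyout(dst_map, dst_addr, copy);
	if (kr != KERN_SUCCESS) {
		/* copyout does not consume the copy object on failure */
		vm_map_copy_discard(copy);
	}
	return kr;
}
#endif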
11981
11982 /*
11983 * Routine: vm_map_copyin
11984 *
11985 * Description:
11986 * see vm_map_copyin_common. Exported via Unsupported.exports.
11987 *
11988 */
11989 kern_return_t
11990 vm_map_copyin(
11991 vm_map_t src_map,
11992 vm_map_address_ut src_addr,
11993 vm_map_size_ut len,
11994 boolean_t src_destroy,
11995 vm_map_copy_t *copy_result) /* OUT */
11996 {
11997 return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11998 FALSE, copy_result, FALSE);
11999 }
12000
12001 /*
12002 * Routine: vm_map_copyin_common
12003 *
12004 * Description:
12005 * Copy the specified region (src_addr, len) from the
12006 * source address space (src_map), possibly removing
12007 * the region from the source address space (src_destroy).
12008 *
12009 * Returns:
12010 * A vm_map_copy_t object (copy_result), suitable for
12011 * insertion into another address space (using vm_map_copyout),
12012 * copying over another address space region (using
12013 * vm_map_copy_overwrite). If the copy is unused, it
12014 * should be destroyed (using vm_map_copy_discard).
12015 *
12016 * In/out conditions:
12017 * The source map should not be locked on entry.
12018 */
12019
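/*
 * submap_map_t records one level of the submap traversal performed by
 * vm_map_copyin_internal(): the parent map and the range of the parent's
 * address space covered by the current descent, so that the copy loop
 * can pop back up to the parent once a submap has been fully copied.
 */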
12020 typedef struct submap_map {
12021 vm_map_t parent_map;
12022 vm_map_offset_t base_start;
12023 vm_map_offset_t base_end;
12024 vm_map_size_t base_len;
12025 struct submap_map *next;
12026 } submap_map_t;
12027
12028 kern_return_t
12029 vm_map_copyin_common(
12030 vm_map_t src_map,
12031 vm_map_address_ut src_addr,
12032 vm_map_size_ut len,
12033 boolean_t src_destroy,
12034 __unused boolean_t src_volatile,
12035 vm_map_copy_t *copy_result, /* OUT */
12036 boolean_t use_maxprot)
12037 {
12038 int flags;
12039
12040 flags = 0;
12041 if (src_destroy) {
12042 flags |= VM_MAP_COPYIN_SRC_DESTROY;
12043 }
12044 if (use_maxprot) {
12045 flags |= VM_MAP_COPYIN_USE_MAXPROT;
12046 }
12047 return vm_map_copyin_internal(src_map,
12048 src_addr,
12049 len,
12050 flags,
12051 copy_result);
12052 }
12053
12054 static __attribute__((always_inline, warn_unused_result))
12055 kern_return_t
12056 vm_map_copyin_sanitize(
12057 vm_map_t src_map,
12058 vm_map_address_ut src_addr_u,
12059 vm_map_size_ut len_u,
12060 vm_map_offset_t *src_start,
12061 vm_map_offset_t *src_end,
12062 vm_map_size_t *len,
12063 vm_map_offset_t *src_addr_unaligned)
12064 {
12065 kern_return_t kr;
12066 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS |
12067 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES |
12068 VM_SANITIZE_FLAGS_CHECK_ADDR_RANGE;
12069
12070 #if KASAN_TBI
12071 if (vm_kernel_map_is_kernel(src_map)) {
12072 flags |= VM_SANITIZE_FLAGS_CANONICALIZE;
12073 }
12074 #endif /* KASAN_TBI */
12075
12076 kr = vm_sanitize_addr_size(src_addr_u, len_u,
12077 VM_SANITIZE_CALLER_VM_MAP_COPYIN,
12078 src_map,
12079 flags,
12080 src_start, src_end, len);
12081 if (__improbable(kr != KERN_SUCCESS)) {
12082 return kr;
12083 }
12084
12085 /*
12086 * Compute (page aligned) start and end of region
12087 */
12088 *src_addr_unaligned = *src_start; /* remember unaligned value */
12089 *src_start = vm_map_trunc_page(*src_addr_unaligned,
12090 VM_MAP_PAGE_MASK(src_map));
12091 *src_end = vm_map_round_page(*src_end, VM_MAP_PAGE_MASK(src_map));
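	/*
	 * For example (assuming a 16K source map, so VM_MAP_PAGE_MASK(src_map)
	 * is 0x3fff): with src_addr 0x1006200 and len 0x100, sanitization
	 * yields *src_start 0x1006200 and *src_end 0x1006300; the truncation
	 * and rounding above then produce *src_start 0x1004000 and *src_end
	 * 0x1008000, while *src_addr_unaligned keeps the original 0x1006200.
	 */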
12092
12093
12094 return KERN_SUCCESS;
12095 }
12096
12097 kern_return_t
12098 vm_map_copyin_internal(
12099 vm_map_t src_map,
12100 vm_map_address_ut src_addr_u,
12101 vm_map_size_ut len_u,
12102 int flags,
12103 vm_map_copy_t *copy_result) /* OUT */
12104 {
12105 vm_map_entry_t tmp_entry; /* Result of last map lookup --
12106 * in multi-level lookup, this
12107 * entry contains the actual
12108 * vm_object/offset.
12109 */
12110 vm_map_entry_t new_entry = VM_MAP_ENTRY_NULL; /* Map entry for copy */
12111
12112 vm_map_offset_t src_start; /* Start of current entry --
12113 * where copy is taking place now
12114 */
12115 vm_map_offset_t src_end; /* End of entire region to be
12116 * copied */
12117 vm_map_offset_t src_addr_unaligned;
12118 vm_map_offset_t src_base;
12119 vm_map_size_t len;
12120 vm_map_t base_map = src_map;
12121 boolean_t map_share = FALSE;
12122 submap_map_t *parent_maps = NULL;
12123
12124 vm_map_copy_t copy; /* Resulting copy */
12125 vm_map_address_t copy_addr;
12126 vm_map_size_t copy_size;
12127 boolean_t src_destroy;
12128 boolean_t use_maxprot;
12129 boolean_t preserve_purgeable;
12130 boolean_t entry_was_shared;
12131 vm_map_entry_t saved_src_entry;
12132 kern_return_t kr;
12133
12134 if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
12135 return KERN_INVALID_ARGUMENT;
12136 }
12137
12138 /*
12139 * Check for copies of zero bytes.
12140 */
12141 if (VM_SANITIZE_UNSAFE_IS_ZERO(len_u)) {
12142 *copy_result = VM_MAP_COPY_NULL;
12143 return KERN_SUCCESS;
12144 }
12145
12146 /*
12147 * Sanitize any input parameters that are addr/size/prot/inherit
12148 */
12149 kr = vm_map_copyin_sanitize(
12150 src_map,
12151 src_addr_u,
12152 len_u,
12153 &src_start,
12154 &src_end,
12155 &len,
12156 &src_addr_unaligned);
12157 if (__improbable(kr != KERN_SUCCESS)) {
12158 return vm_sanitize_get_kr(kr);
12159 }
12160
12161 src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
12162 use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
12163 preserve_purgeable =
12164 (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
12165
12166 /*
12167 * If the copy is sufficiently small, use a kernel buffer instead
12168 * of making a virtual copy. The theory being that the cost of
12169 * setting up VM (and taking C-O-W faults) dominates the copy costs
12170 * for small regions.
12171 */
12172 if ((len <= msg_ool_size_small) &&
12173 !use_maxprot &&
12174 !preserve_purgeable &&
12175 !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
12176 /*
12177 * Since the "msg_ool_size_small" threshold was increased and
12178 * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
12179 * address space limits, we revert to doing a virtual copy if the
12180 * copied range goes beyond those limits. Otherwise, mach_vm_read()
12181 * of the commpage would now fail when it used to work.
12182 */
12183 (src_start >= vm_map_min(src_map) &&
12184 src_start < vm_map_max(src_map) &&
12185 src_end >= vm_map_min(src_map) &&
12186 src_end < vm_map_max(src_map))) {
12187 return vm_map_copyin_kernel_buffer(src_map, src_addr_unaligned, len,
12188 src_destroy, copy_result);
12189 }
12190
12191 /*
12192 * Allocate a header element for the list.
12193 *
12194 * Use the start and end in the header to
12195 * remember the endpoints prior to rounding.
12196 */
12197
12198 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12199 copy->cpy_hdr.entries_pageable = TRUE;
12200 copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
12201 copy->offset = src_addr_unaligned;
12202 copy->size = len;
12203
12204 new_entry = vm_map_copy_entry_create(copy);
12205
12206 #define RETURN(x) \
12207 MACRO_BEGIN \
12208 vm_map_unlock(src_map); \
12209 if(src_map != base_map) \
12210 vm_map_deallocate(src_map); \
12211 if (new_entry != VM_MAP_ENTRY_NULL) \
12212 vm_map_copy_entry_dispose(new_entry); \
12213 vm_map_copy_discard(copy); \
12214 { \
12215 submap_map_t *_ptr; \
12216 \
12217 for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
12218 parent_maps=parent_maps->next; \
12219 if (_ptr->parent_map != base_map) \
12220 vm_map_deallocate(_ptr->parent_map); \
12221 kfree_type(submap_map_t, _ptr); \
12222 } \
12223 } \
12224 MACRO_RETURN(x); \
12225 MACRO_END
12226
12227 /*
12228 * Find the beginning of the region.
12229 */
12230
12231 vm_map_lock(src_map);
12232
12233 /*
12234 * Lookup the original "src_addr_unaligned" rather than the truncated
12235 * "src_start", in case "src_start" falls in a non-map-aligned
12236 * map entry *before* the map entry that contains "src_addr_unaligned"...
12237 */
12238 if (!vm_map_lookup_entry(src_map, src_addr_unaligned, &tmp_entry)) {
12239 RETURN(KERN_INVALID_ADDRESS);
12240 }
12241 if (!tmp_entry->is_sub_map) {
12242 /*
12243 * ... but clip to the map-rounded "src_start" rather than
12244 * "src_addr_unaligned" to preserve map-alignment. We'll adjust the
12245 * first copy entry at the end, if needed.
12246 */
12247 vm_map_clip_start(src_map, tmp_entry, src_start);
12248 }
12249 if (src_start < tmp_entry->vme_start) {
12250 /*
12251 * Move "src_start" up to the start of the
12252 * first map entry to copy.
12253 */
12254 src_start = tmp_entry->vme_start;
12255 }
12256 /* set for later submap fix-up */
12257 copy_addr = src_start;
12258
12259 /*
12260 * Go through entries until we get to the end.
12261 */
12262
12263 while (TRUE) {
12264 vm_map_entry_t src_entry = tmp_entry; /* Top-level entry */
12265 vm_map_size_t src_size; /* Size of source
12266 * map entry (in both
12267 * maps)
12268 */
12269
12270 vm_object_t src_object; /* Object to copy */
12271 vm_object_offset_t src_offset;
12272
12273 vm_object_t new_copy_object;/* vm_object_copy_* result */
12274
12275 boolean_t src_needs_copy; /* Should source map
12276 * be made read-only
12277 * for copy-on-write?
12278 */
12279
12280 boolean_t new_entry_needs_copy; /* Will new entry be COW? */
12281
12282 boolean_t was_wired; /* Was source wired? */
12283 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
12284 vm_map_version_t version; /* Version before locks
12285 * dropped to make copy
12286 */
12287 kern_return_t result; /* Return value from
12288 * copy_strategically.
12289 */
12290 while (tmp_entry->is_sub_map) {
12291 vm_map_size_t submap_len;
12292 submap_map_t *ptr;
12293
12294 ptr = kalloc_type(submap_map_t, Z_WAITOK);
12295 ptr->next = parent_maps;
12296 parent_maps = ptr;
12297 ptr->parent_map = src_map;
12298 ptr->base_start = src_start;
12299 ptr->base_end = src_end;
12300 submap_len = tmp_entry->vme_end - src_start;
12301 if (submap_len > (src_end - src_start)) {
12302 submap_len = src_end - src_start;
12303 }
12304 ptr->base_len = submap_len;
12305
12306 src_start -= tmp_entry->vme_start;
12307 src_start += VME_OFFSET(tmp_entry);
12308 src_end = src_start + submap_len;
12309 src_map = VME_SUBMAP(tmp_entry);
12310 vm_map_lock(src_map);
12311 /* keep an outstanding reference for all maps in */
12312 /* the parents tree except the base map */
12313 vm_map_reference(src_map);
12314 vm_map_unlock(ptr->parent_map);
12315 if (!vm_map_lookup_entry(
12316 src_map, src_start, &tmp_entry)) {
12317 RETURN(KERN_INVALID_ADDRESS);
12318 }
12319 map_share = TRUE;
12320 if (!tmp_entry->is_sub_map) {
12321 vm_map_clip_start(src_map, tmp_entry, src_start);
12322 }
12323 src_entry = tmp_entry;
12324 }
12325 /* we are now in the lowest level submap... */
12326
12327 if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
12328 (VME_OBJECT(tmp_entry)->phys_contiguous)) {
12329 /* This is not supported for now. In future */
12330 /* we will need to detect the phys_contig */
12331 /* condition and then upgrade copy_slowly */
12332 /* to do physical copy from the device mem */
12333 /* based object. We can piggy-back off of */
12334 /* the was wired boolean to set-up the */
12335 /* proper handling */
12336 RETURN(KERN_PROTECTION_FAILURE);
12337 }
12338 /*
12339 * Create a new address map entry to hold the result.
12340 * Fill in the fields from the appropriate source entries.
12341 * We must unlock the source map to do this if we need
12342 * to allocate a map entry.
12343 */
12344 if (new_entry == VM_MAP_ENTRY_NULL) {
12345 version.main_timestamp = src_map->timestamp;
12346 vm_map_unlock(src_map);
12347
12348 new_entry = vm_map_copy_entry_create(copy);
12349
12350 vm_map_lock(src_map);
12351 if ((version.main_timestamp + 1) != src_map->timestamp) {
12352 if (!vm_map_lookup_entry(src_map, src_start,
12353 &tmp_entry)) {
12354 RETURN(KERN_INVALID_ADDRESS);
12355 }
12356 if (!tmp_entry->is_sub_map) {
12357 vm_map_clip_start(src_map, tmp_entry, src_start);
12358 }
12359 continue; /* restart w/ new tmp_entry */
12360 }
12361 }
12362
12363 /*
12364 * Verify that the region can be read.
12365 */
12366 if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
12367 !use_maxprot) ||
12368 (src_entry->max_protection & VM_PROT_READ) == 0) {
12369 RETURN(KERN_PROTECTION_FAILURE);
12370 }
12371
12372 src_object = VME_OBJECT(src_entry);
12373
12374
12375 /*
12376 * Clip against the endpoints of the entire region.
12377 */
12378
12379 vm_map_clip_end(src_map, src_entry, src_end);
12380
12381 src_size = src_entry->vme_end - src_start;
12382 src_offset = VME_OFFSET(src_entry);
12383 was_wired = (src_entry->wired_count != 0);
12384
12385 vm_map_entry_copy(src_map, new_entry, src_entry);
12386 if (new_entry->is_sub_map) {
12387 /* clr address space specifics */
12388 new_entry->use_pmap = FALSE;
12389 } else {
12390 /*
12391 * We're dealing with a copy-on-write operation,
12392 * so the resulting mapping should not inherit the
12393 * original mapping's accounting settings.
12394 * "iokit_acct" should have been cleared in
12395 * vm_map_entry_copy().
12396 * "use_pmap" should be reset to its default (TRUE)
12397 * so that the new mapping gets accounted for in
12398 * the task's memory footprint.
12399 */
12400 assert(!new_entry->iokit_acct);
12401 new_entry->use_pmap = TRUE;
12402 }
12403
12404 /*
12405 * Attempt non-blocking copy-on-write optimizations.
12406 */
12407
12408 /*
12409 * If we are destroying the source, and the object
12410 * is internal, we could move the object reference
12411 * from the source to the copy. The copy is
12412 * copy-on-write only if the source is.
12413 * We make another reference to the object, because
12414 * destroying the source entry will deallocate it.
12415 *
12416 * This memory transfer has to be atomic, (to prevent
12417 * the VM object from being shared or copied while
12418 * it's being moved here), so we could only do this
12419 * if we won't have to unlock the VM map until the
12420 * original mapping has been fully removed.
12421 */
12422
12423 RestartCopy:
12424 if ((src_object == VM_OBJECT_NULL ||
12425 (!was_wired && !map_share && !tmp_entry->is_shared
12426 && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
12427 vm_object_copy_quickly(
12428 VME_OBJECT(new_entry),
12429 src_offset,
12430 src_size,
12431 &src_needs_copy,
12432 &new_entry_needs_copy)) {
12433 new_entry->needs_copy = new_entry_needs_copy;
12434
12435 /*
12436 * Handle copy-on-write obligations
12437 */
12438
12439 if (src_needs_copy && !tmp_entry->needs_copy) {
12440 vm_prot_t prot;
12441
12442 prot = src_entry->protection & ~VM_PROT_WRITE;
12443
12444 if (override_nx(src_map, VME_ALIAS(src_entry))
12445 && prot) {
12446 prot |= VM_PROT_EXECUTE;
12447 }
12448
12449 vm_object_pmap_protect(
12450 src_object,
12451 src_offset,
12452 src_size,
12453 (src_entry->is_shared ?
12454 PMAP_NULL
12455 : src_map->pmap),
12456 VM_MAP_PAGE_SIZE(src_map),
12457 src_entry->vme_start,
12458 prot);
12459
12460 assert(tmp_entry->wired_count == 0);
12461 tmp_entry->needs_copy = TRUE;
12462 }
12463
12464 /*
12465 * The map has never been unlocked, so it's safe
12466 * to move to the next entry rather than doing
12467 * another lookup.
12468 */
12469
12470 goto CopySuccessful;
12471 }
12472
12473 entry_was_shared = tmp_entry->is_shared;
12474
12475 /*
12476 * Take an object reference, so that we may
12477 * release the map lock(s).
12478 */
12479
12480 assert(src_object != VM_OBJECT_NULL);
12481 vm_object_reference(src_object);
12482
12483 /*
12484 * Record the timestamp for later verification.
12485 * Unlock the map.
12486 */
12487
12488 version.main_timestamp = src_map->timestamp;
12489 vm_map_unlock(src_map); /* Increments timestamp once! */
12490 saved_src_entry = src_entry;
12491 tmp_entry = VM_MAP_ENTRY_NULL;
12492 src_entry = VM_MAP_ENTRY_NULL;
12493
12494 /*
12495 * Perform the copy
12496 */
12497
12498 if (was_wired ||
12499 (src_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK &&
12500 !(flags & VM_MAP_COPYIN_FORK)) ||
12501 (debug4k_no_cow_copyin &&
12502 VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
12503 CopySlowly:
12504 vm_object_lock(src_object);
12505 result = vm_object_copy_slowly(
12506 src_object,
12507 src_offset,
12508 src_size,
12509 THREAD_UNINT,
12510 &new_copy_object);
12511 /* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
12512 saved_used_for_jit = new_entry->used_for_jit;
12513 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12514 new_entry->used_for_jit = saved_used_for_jit;
12515 VME_OFFSET_SET(new_entry,
12516 src_offset - vm_object_trunc_page(src_offset));
12517 new_entry->needs_copy = FALSE;
12518 } else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
12519 (entry_was_shared || map_share)) {
12520 vm_object_t new_object;
12521
12522 vm_object_lock_shared(src_object);
12523 new_object = vm_object_copy_delayed(
12524 src_object,
12525 src_offset,
12526 src_size,
12527 TRUE);
12528 if (new_object == VM_OBJECT_NULL) {
12529 goto CopySlowly;
12530 }
12531
12532 VME_OBJECT_SET(new_entry, new_object, false, 0);
12533 assert(new_entry->wired_count == 0);
12534 new_entry->needs_copy = TRUE;
12535 assert(!new_entry->iokit_acct);
12536 assert(new_object->purgable == VM_PURGABLE_DENY);
12537 assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
12538 result = KERN_SUCCESS;
12539 } else {
12540 vm_object_offset_t new_offset;
12541 new_offset = VME_OFFSET(new_entry);
12542 result = vm_object_copy_strategically(src_object,
12543 src_offset,
12544 src_size,
12545 (flags & VM_MAP_COPYIN_FORK),
12546 &new_copy_object,
12547 &new_offset,
12548 &new_entry_needs_copy);
12549 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
12550 saved_used_for_jit = new_entry->used_for_jit;
12551 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12552 new_entry->used_for_jit = saved_used_for_jit;
12553 if (new_offset != VME_OFFSET(new_entry)) {
12554 VME_OFFSET_SET(new_entry, new_offset);
12555 }
12556
12557 new_entry->needs_copy = new_entry_needs_copy;
12558 }
12559
12560 if (result == KERN_SUCCESS &&
12561 ((preserve_purgeable &&
12562 src_object->purgable != VM_PURGABLE_DENY) ||
12563 new_entry->used_for_jit)) {
12564 /*
12565 * Purgeable objects should be COPY_NONE, true share;
12566 * this should be propagated to the copy.
12567 *
12568 * Also force mappings the pmap specially protects to
12569 * be COPY_NONE; trying to COW these mappings would
12570 * change the effective protections, which could have
12571 * side effects if the pmap layer relies on the
12572 * specified protections.
12573 */
12574
12575 vm_object_t new_object;
12576
12577 new_object = VME_OBJECT(new_entry);
12578 assert(new_object != src_object);
12579 vm_object_lock(new_object);
12580 assert(os_ref_get_count_raw(&new_object->ref_count) == 1);
12581 assert(new_object->shadow == VM_OBJECT_NULL);
12582 assert(new_object->vo_copy == VM_OBJECT_NULL);
12583 assert(new_object->vo_owner == NULL);
12584
12585 new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
12586
12587 if (preserve_purgeable &&
12588 src_object->purgable != VM_PURGABLE_DENY) {
12589 VM_OBJECT_SET_TRUE_SHARE(new_object, TRUE);
12590
12591 /* start as non-volatile with no owner... */
12592 VM_OBJECT_SET_PURGABLE(new_object, VM_PURGABLE_NONVOLATILE);
12593 vm_purgeable_nonvolatile_enqueue(new_object, NULL);
12594 /* ... and move to src_object's purgeable state */
12595 if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
12596 int state;
12597 state = src_object->purgable;
12598 vm_object_purgable_control(
12599 new_object,
12600 VM_PURGABLE_SET_STATE_FROM_KERNEL,
12601 &state);
12602 }
12603 /* no pmap accounting for purgeable objects */
12604 new_entry->use_pmap = FALSE;
12605 }
12606
12607 vm_object_unlock(new_object);
12608 new_object = VM_OBJECT_NULL;
12609 }
12610
12611 /*
12612 * Throw away the extra reference
12613 */
12614
12615 vm_object_deallocate(src_object);
12616
12617 if (result != KERN_SUCCESS &&
12618 result != KERN_MEMORY_RESTART_COPY) {
12619 vm_map_lock(src_map);
12620 RETURN(result);
12621 }
12622
12623 /*
12624 * Verify that the map has not substantially
12625 * changed while the copy was being made.
12626 */
12627
12628 vm_map_lock(src_map);
12629
12630 if ((version.main_timestamp + 1) == src_map->timestamp) {
12631 /* src_map hasn't changed: src_entry is still valid */
12632 src_entry = saved_src_entry;
12633 goto VerificationSuccessful;
12634 }
12635
12636 /*
12637 * Simple version comparison failed.
12638 *
12639 * Retry the lookup and verify that the
12640 * same object/offset are still present.
12641 *
12642 * [Note: a memory manager that colludes with
12643 * the calling task can detect that we have
12644 * cheated. While the map was unlocked, the
12645 * mapping could have been changed and restored.]
12646 */
12647
12648 if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
12649 if (result != KERN_MEMORY_RESTART_COPY) {
12650 vm_object_deallocate(VME_OBJECT(new_entry));
12651 VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
12652 /* reset accounting state */
12653 new_entry->iokit_acct = FALSE;
12654 new_entry->use_pmap = TRUE;
12655 }
12656 RETURN(KERN_INVALID_ADDRESS);
12657 }
12658
12659 src_entry = tmp_entry;
12660 vm_map_clip_start(src_map, src_entry, src_start);
12661
12662 if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
12663 !use_maxprot) ||
12664 ((src_entry->max_protection & VM_PROT_READ) == 0)) {
12665 goto VerificationFailed;
12666 }
12667
12668 if (src_entry->vme_end < new_entry->vme_end) {
12669 /*
12670 * This entry might have been shortened
12671 * (vm_map_clip_end) or been replaced with
12672 * an entry that ends closer to "src_start"
12673 * than before.
12674 * Adjust "new_entry" accordingly; copying
12675 * less memory would be correct but we also
12676 * redo the copy (see below) if the new entry
12677 * no longer points at the same object/offset.
12678 */
12679 assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
12680 VM_MAP_COPY_PAGE_MASK(copy)));
12681 new_entry->vme_end = src_entry->vme_end;
12682 src_size = new_entry->vme_end - src_start;
12683 } else if (src_entry->vme_end > new_entry->vme_end) {
12684 /*
12685 * This entry might have been extended
12686 * (vm_map_entry_simplify() or coalesce)
12687 * or been replaced with an entry that ends farther
12688 * from "src_start" than before.
12689 *
12690 * We've called vm_object_copy_*() only on
12691 * the previous <start:end> range, so we can't
12692 * just extend new_entry. We have to re-do
12693 * the copy based on the new entry as if it was
12694 * pointing at a different object/offset (see
12695 * "Verification failed" below).
12696 */
12697 }
12698
12699 if ((VME_OBJECT(src_entry) != src_object) ||
12700 (VME_OFFSET(src_entry) != src_offset) ||
12701 (src_entry->vme_end > new_entry->vme_end)) {
12702 /*
12703 * Verification failed.
12704 *
12705 * Start over with this top-level entry.
12706 */
12707
12708 VerificationFailed: ;
12709
12710 vm_object_deallocate(VME_OBJECT(new_entry));
12711 tmp_entry = src_entry;
12712 continue;
12713 }
12714
12715 /*
12716 * Verification succeeded.
12717 */
12718
12719 VerificationSuccessful:;
12720
12721 if (result == KERN_MEMORY_RESTART_COPY) {
12722 goto RestartCopy;
12723 }
12724
12725 /*
12726 * Copy succeeded.
12727 */
12728
12729 CopySuccessful: ;
12730
12731 /*
12732 * Link in the new copy entry.
12733 */
12734
12735 vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
12736 new_entry);
12737
12738 /*
12739 * Determine whether the entire region
12740 * has been copied.
12741 */
12742 src_base = src_start;
12743 src_start = new_entry->vme_end;
12744 new_entry = VM_MAP_ENTRY_NULL;
12745 while ((src_start >= src_end) && (src_end != 0)) {
12746 submap_map_t *ptr;
12747
12748 if (src_map == base_map) {
12749 /* back to the top */
12750 break;
12751 }
12752
12753 ptr = parent_maps;
12754 assert(ptr != NULL);
12755 parent_maps = parent_maps->next;
12756
12757 /* fix up the damage we did in that submap */
12758 vm_map_simplify_range(src_map,
12759 src_base,
12760 src_end);
12761
12762 vm_map_unlock(src_map);
12763 vm_map_deallocate(src_map);
12764 vm_map_lock(ptr->parent_map);
12765 src_map = ptr->parent_map;
12766 src_base = ptr->base_start;
12767 src_start = ptr->base_start + ptr->base_len;
12768 src_end = ptr->base_end;
12769 if (!vm_map_lookup_entry(src_map,
12770 src_start,
12771 &tmp_entry) &&
12772 (src_end > src_start)) {
12773 RETURN(KERN_INVALID_ADDRESS);
12774 }
12775 kfree_type(submap_map_t, ptr);
12776 if (parent_maps == NULL) {
12777 map_share = FALSE;
12778 }
12779 src_entry = tmp_entry->vme_prev;
12780 }
12781
12782 if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
12783 (src_start >= src_addr_unaligned + len) &&
12784 (src_addr_unaligned + len != 0)) {
12785 /*
12786 * Stop copying now, even though we haven't reached
12787 * "src_end". We'll adjust the end of the last copy
12788 * entry at the end, if needed.
12789 *
12790 * If src_map's alignment is different from the
12791 * system's page-alignment, there could be
12792 * extra non-map-aligned map entries between
12793 * the original (non-rounded) "src_addr_unaligned + len"
12794 * and the rounded "src_end".
12795 * We do not want to copy those map entries since
12796 * they're not part of the copied range.
12797 */
12798 break;
12799 }
12800
12801 if ((src_start >= src_end) && (src_end != 0)) {
12802 break;
12803 }
12804
12805 /*
12806 * Verify that there are no gaps in the region
12807 */
12808
12809 tmp_entry = src_entry->vme_next;
12810 if ((tmp_entry->vme_start != src_start) ||
12811 (tmp_entry == vm_map_to_entry(src_map))) {
12812 RETURN(KERN_INVALID_ADDRESS);
12813 }
12814 }
12815
12816 /*
12817 * If the source should be destroyed, do it now, since the
12818 * copy was successful.
12819 */
12820 if (src_destroy) {
12821 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
12822
12823 if (src_map == kernel_map) {
12824 remove_flags |= VM_MAP_REMOVE_KUNWIRE;
12825 }
12826 (void)vm_map_remove_and_unlock(src_map,
12827 vm_map_trunc_page(src_addr_unaligned, VM_MAP_PAGE_MASK(src_map)),
12828 src_end,
12829 remove_flags,
12830 KMEM_GUARD_NONE);
12831 } else {
12832 /* fix up the damage we did in the base map */
12833 vm_map_simplify_range(
12834 src_map,
12835 vm_map_trunc_page(src_addr_unaligned,
12836 VM_MAP_PAGE_MASK(src_map)),
12837 vm_map_round_page(src_end,
12838 VM_MAP_PAGE_MASK(src_map)));
12839 vm_map_unlock(src_map);
12840 }
12841
12842 tmp_entry = VM_MAP_ENTRY_NULL;
12843
12844 if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
12845 VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12846 vm_map_offset_t original_start, original_offset, original_end;
12847
12848 assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12849
12850 /* adjust alignment of first copy_entry's "vme_start" */
12851 tmp_entry = vm_map_copy_first_entry(copy);
12852 if (tmp_entry != vm_map_copy_to_entry(copy)) {
12853 vm_map_offset_t adjustment;
12854
12855 original_start = tmp_entry->vme_start;
12856 original_offset = VME_OFFSET(tmp_entry);
12857
12858 /* map-align the start of the first copy entry... */
12859 adjustment = (tmp_entry->vme_start -
12860 vm_map_trunc_page(
12861 tmp_entry->vme_start,
12862 VM_MAP_PAGE_MASK(src_map)));
12863 tmp_entry->vme_start -= adjustment;
12864 VME_OFFSET_SET(tmp_entry,
12865 VME_OFFSET(tmp_entry) - adjustment);
12866 copy_addr -= adjustment;
12867 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12868 /* ... adjust for mis-aligned start of copy range */
12869 adjustment =
12870 (vm_map_trunc_page(copy->offset,
12871 PAGE_MASK) -
12872 vm_map_trunc_page(copy->offset,
12873 VM_MAP_PAGE_MASK(src_map)));
12874 if (adjustment) {
12875 assert(page_aligned(adjustment));
12876 assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12877 tmp_entry->vme_start += adjustment;
12878 VME_OFFSET_SET(tmp_entry,
12879 (VME_OFFSET(tmp_entry) +
12880 adjustment));
12881 copy_addr += adjustment;
12882 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12883 }
12884
12885 /*
12886 * Assert that the adjustments haven't exposed
12887 * more than was originally copied...
12888 */
12889 assert(tmp_entry->vme_start >= original_start);
12890 assert(VME_OFFSET(tmp_entry) >= original_offset);
12891 /*
12892 * ... and that it did not adjust outside of
12893 * a single 16K page.
12894 */
12895 assert(vm_map_trunc_page(tmp_entry->vme_start,
12896 VM_MAP_PAGE_MASK(src_map)) ==
12897 vm_map_trunc_page(original_start,
12898 VM_MAP_PAGE_MASK(src_map)));
12899 }
12900
12901 /* adjust alignment of last copy_entry's "vme_end" */
12902 tmp_entry = vm_map_copy_last_entry(copy);
12903 if (tmp_entry != vm_map_copy_to_entry(copy)) {
12904 vm_map_offset_t adjustment;
12905
12906 original_end = tmp_entry->vme_end;
12907
12908 /* map-align the end of the last copy entry... */
12909 tmp_entry->vme_end =
12910 vm_map_round_page(tmp_entry->vme_end,
12911 VM_MAP_PAGE_MASK(src_map));
12912 /* ... adjust for mis-aligned end of copy range */
12913 adjustment =
12914 (vm_map_round_page((copy->offset +
12915 copy->size),
12916 VM_MAP_PAGE_MASK(src_map)) -
12917 vm_map_round_page((copy->offset +
12918 copy->size),
12919 PAGE_MASK));
12920 if (adjustment) {
12921 assert(page_aligned(adjustment));
12922 assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12923 tmp_entry->vme_end -= adjustment;
12924 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12925 }
12926
12927 /*
12928 * Assert that the adjustments haven't exposed
12929 * more than was originally copied...
12930 */
12931 assert(tmp_entry->vme_end <= original_end);
12932 /*
12933 * ... and that it did not adjust outside of
12934 * a single 16K page.
12935 */
12936 assert(vm_map_round_page(tmp_entry->vme_end,
12937 VM_MAP_PAGE_MASK(src_map)) ==
12938 vm_map_round_page(original_end,
12939 VM_MAP_PAGE_MASK(src_map)));
12940 }
12941 }
12942
12943 /* Fix-up start and end points in copy. This is necessary */
12944 /* when the various entries in the copy object were picked */
12945 /* up from different sub-maps */
12946
12947 tmp_entry = vm_map_copy_first_entry(copy);
12948 copy_size = 0; /* compute actual size */
12949 while (tmp_entry != vm_map_copy_to_entry(copy)) {
12950 assert(VM_MAP_PAGE_ALIGNED(
12951 copy_addr + (tmp_entry->vme_end -
12952 tmp_entry->vme_start),
12953 MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12954 assert(VM_MAP_PAGE_ALIGNED(
12955 copy_addr,
12956 MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12957
12958 /*
12959 * The copy_entries will be injected directly into the
12960 * destination map and might not be "map aligned" there...
12961 */
12962 tmp_entry->map_aligned = FALSE;
12963
12964 tmp_entry->vme_end = copy_addr +
12965 (tmp_entry->vme_end - tmp_entry->vme_start);
12966 tmp_entry->vme_start = copy_addr;
12967 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12968 copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12969 copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12970 tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12971 }
12972
12973 if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12974 copy_size < copy->size) {
12975 /*
12976 * The actual size of the VM map copy is smaller than what
12977 * was requested by the caller. This must be because some
12978 * PAGE_SIZE-sized pages are missing at the end of the last
12979 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12980 * The caller might not have been aware of those missing
12981 * pages and might not want to be aware of it, which is
12982 * fine as long as they don't try to access (and crash on)
12983 * those missing pages.
12984 * Let's adjust the size of the "copy", to avoid failing
12985 * in vm_map_copyout() or vm_map_copy_overwrite().
12986 */
12987 assert(vm_map_round_page(copy_size,
12988 VM_MAP_PAGE_MASK(src_map)) ==
12989 vm_map_round_page(copy->size,
12990 VM_MAP_PAGE_MASK(src_map)));
12991 copy->size = copy_size;
12992 }
12993
12994 *copy_result = copy;
12995 return KERN_SUCCESS;
12996
12997 #undef RETURN
12998 }
12999
13000 kern_return_t
13001 vm_map_copy_extract(
13002 vm_map_t src_map,
13003 vm_map_address_t src_addr,
13004 vm_map_size_t len,
13005 boolean_t do_copy,
13006 vm_map_copy_t *copy_result, /* OUT */
13007 vm_prot_t *cur_prot, /* IN/OUT */
13008 vm_prot_t *max_prot, /* IN/OUT */
13009 vm_inherit_t inheritance,
13010 vm_map_kernel_flags_t vmk_flags)
13011 {
13012 vm_map_copy_t copy;
13013 kern_return_t kr;
13014 vm_prot_t required_cur_prot, required_max_prot;
13015
13016 /*
13017 * Check for copies of zero bytes.
13018 */
13019
13020 if (len == 0) {
13021 *copy_result = VM_MAP_COPY_NULL;
13022 return KERN_SUCCESS;
13023 }
13024
13025 /*
13026 * Check that the end address doesn't overflow
13027 */
13028 if (src_addr + len < src_addr) {
13029 return KERN_INVALID_ADDRESS;
13030 }
13031 if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
13032 return KERN_INVALID_ADDRESS;
13033 }
13034
13035 if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
13036 DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
13037 }
13038
13039 required_cur_prot = *cur_prot;
13040 required_max_prot = *max_prot;
13041
13042 /*
13043 * Allocate a header element for the list.
13044 *
13045 * Use the start and end in the header to
13046 * remember the endpoints prior to rounding.
13047 */
13048
13049 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
13050 copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
13051 copy->offset = 0;
13052 copy->size = len;
13053
13054 kr = vm_map_remap_extract(src_map,
13055 src_addr,
13056 len,
13057 do_copy, /* copy */
13058 copy,
13059 cur_prot, /* IN/OUT */
13060 max_prot, /* IN/OUT */
13061 inheritance,
13062 vmk_flags);
13063 if (kr != KERN_SUCCESS) {
13064 vm_map_copy_discard(copy);
13065 if ((kr == KERN_INVALID_ADDRESS ||
13066 kr == KERN_INVALID_ARGUMENT) &&
13067 src_map->terminated) {
13068 /* tell the caller that this address space is gone */
13069 kr = KERN_TERMINATED;
13070 }
13071 return kr;
13072 }
13073 if (required_cur_prot != VM_PROT_NONE) {
13074 assert((*cur_prot & required_cur_prot) == required_cur_prot);
13075 assert((*max_prot & required_max_prot) == required_max_prot);
13076 }
13077
13078 *copy_result = copy;
13079 return KERN_SUCCESS;
13080 }
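
/*
 * Illustrative sketch (not part of the build) of a typical caller of
 * vm_map_copy_extract() above.  The address, length and required
 * protections are made-up values; the required protections are passed
 * IN through "cur_prot"/"max_prot" and the effective protections come
 * back OUT through the same pointers:
 *
 *	vm_map_copy_t copy;
 *	vm_prot_t cur_prot = VM_PROT_READ;
 *	vm_prot_t max_prot = VM_PROT_READ;
 *	kern_return_t kr;
 *
 *	kr = vm_map_copy_extract(src_map,
 *	    0x100000000ULL,             (hypothetical source address)
 *	    0x4000,                     (hypothetical length)
 *	    FALSE,                      (share rather than copy)
 *	    &copy,
 *	    &cur_prot,                  (IN/OUT)
 *	    &max_prot,                  (IN/OUT)
 *	    VM_INHERIT_DEFAULT,
 *	    VM_MAP_KERNEL_FLAGS_NONE);
 *	if (kr == KERN_SUCCESS) {
 *		(use "copy", e.g. via vm_map_copyout(), and release it
 *		 with vm_map_copy_discard() if it is not consumed)
 *	}
 */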
13081
13082 static void
13083 vm_map_fork_share(
13084 vm_map_t old_map,
13085 vm_map_entry_t old_entry,
13086 vm_map_t new_map)
13087 {
13088 vm_object_t object;
13089 vm_map_entry_t new_entry;
13090
13091 /*
13092 * New sharing code. New map entry
13093 * references original object. Internal
13094 * objects use asynchronous copy algorithm for
13095 * future copies. First make sure we have
13096 * the right object. If we need a shadow,
13097 * or someone else already has one, then
13098 * make a new shadow and share it.
13099 */
13100
13101 if (!old_entry->is_sub_map) {
13102 object = VME_OBJECT(old_entry);
13103 }
13104
13105 if (old_entry->is_sub_map) {
13106 assert(old_entry->wired_count == 0);
13107 #ifndef NO_NESTED_PMAP
13108 #if !PMAP_FORK_NEST
13109 if (old_entry->use_pmap) {
13110 kern_return_t result;
13111
13112 result = pmap_nest(new_map->pmap,
13113 (VME_SUBMAP(old_entry))->pmap,
13114 (addr64_t)old_entry->vme_start,
13115 (uint64_t)(old_entry->vme_end - old_entry->vme_start));
13116 if (result) {
13117 panic("vm_map_fork_share: pmap_nest failed!");
13118 }
13119 }
13120 #endif /* !PMAP_FORK_NEST */
13121 #endif /* NO_NESTED_PMAP */
13122 } else if (object == VM_OBJECT_NULL) {
13123 object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
13124 old_entry->vme_start), old_map->serial_id);
13125 VME_OFFSET_SET(old_entry, 0);
13126 VME_OBJECT_SET(old_entry, object, false, 0);
13127 old_entry->use_pmap = TRUE;
13128 // assert(!old_entry->needs_copy);
13129 } else if (object->copy_strategy !=
13130 MEMORY_OBJECT_COPY_SYMMETRIC) {
13131 /*
13132 * We are already using an asymmetric
13133 * copy, and therefore we already have
13134 * the right object.
13135 */
13136
13137 assert(!old_entry->needs_copy);
13138 } else if (old_entry->needs_copy || /* case 1 */
13139 object->shadowed || /* case 2 */
13140 (!object->true_share && /* case 3 */
13141 !old_entry->is_shared &&
13142 (object->vo_size >
13143 (vm_map_size_t)(old_entry->vme_end -
13144 old_entry->vme_start)))) {
13145 bool is_writable;
13146
13147 /*
13148 * We need to create a shadow.
13149 * There are three cases here.
13150 * In the first case, we need to
13151 * complete a deferred symmetrical
13152 * copy that we participated in.
13153 * In the second and third cases,
13154 * we need to create the shadow so
13155 * that changes that we make to the
13156 * object do not interfere with
13157 * any symmetrical copies which
13158 	 * have occurred (case 2) or which
13159 * might occur (case 3).
13160 *
13161 * The first case is when we had
13162 * deferred shadow object creation
13163 * via the entry->needs_copy mechanism.
13164 * This mechanism only works when
13165 * only one entry points to the source
13166 * object, and we are about to create
13167 * a second entry pointing to the
13168 * same object. The problem is that
13169 * there is no way of mapping from
13170 * an object to the entries pointing
13171 * to it. (Deferred shadow creation
13172 	 * works with one entry because it occurs
13173 * at fault time, and we walk from the
13174 * entry to the object when handling
13175 * the fault.)
13176 *
13177 * The second case is when the object
13178 * to be shared has already been copied
13179 * with a symmetric copy, but we point
13180 * directly to the object without
13181 * needs_copy set in our entry. (This
13182 * can happen because different ranges
13183 * of an object can be pointed to by
13184 * different entries. In particular,
13185 * a single entry pointing to an object
13186 * can be split by a call to vm_inherit,
13187 * which, combined with task_create, can
13188 * result in the different entries
13189 * having different needs_copy values.)
13190 * The shadowed flag in the object allows
13191 * us to detect this case. The problem
13192 * with this case is that if this object
13193 * has or will have shadows, then we
13194 * must not perform an asymmetric copy
13195 * of this object, since such a copy
13196 * allows the object to be changed, which
13197 * will break the previous symmetrical
13198 * copies (which rely upon the object
13199 * not changing). In a sense, the shadowed
13200 * flag says "don't change this object".
13201 * We fix this by creating a shadow
13202 * object for this object, and sharing
13203 * that. This works because we are free
13204 * to change the shadow object (and thus
13205 * to use an asymmetric copy strategy);
13206 * this is also semantically correct,
13207 * since this object is temporary, and
13208 * therefore a copy of the object is
13209 * as good as the object itself. (This
13210 * is not true for permanent objects,
13211 * since the pager needs to see changes,
13212 * which won't happen if the changes
13213 * are made to a copy.)
13214 *
13215 * The third case is when the object
13216 * to be shared has parts sticking
13217 * outside of the entry we're working
13218 * with, and thus may in the future
13219 * be subject to a symmetrical copy.
13220 * (This is a preemptive version of
13221 * case 2.)
13222 */
13223 VME_OBJECT_SHADOW(old_entry,
13224 (vm_map_size_t) (old_entry->vme_end -
13225 old_entry->vme_start),
13226 vm_map_always_shadow(old_map));
13227
13228 /*
13229 * If we're making a shadow for other than
13230 * copy on write reasons, then we have
13231 * to remove write permission.
13232 */
13233
13234 is_writable = false;
13235 if (old_entry->protection & VM_PROT_WRITE) {
13236 is_writable = true;
13237 #if __arm64e__
13238 } else if (old_entry->used_for_tpro) {
13239 is_writable = true;
13240 #endif /* __arm64e__ */
13241 }
13242 if (!old_entry->needs_copy && is_writable) {
13243 vm_prot_t prot;
13244
13245 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13246 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13247 __FUNCTION__, old_map, old_map->pmap,
13248 old_entry,
13249 (uint64_t)old_entry->vme_start,
13250 (uint64_t)old_entry->vme_end,
13251 old_entry->protection);
13252 }
13253
13254 prot = old_entry->protection & ~VM_PROT_WRITE;
13255
13256 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13257 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13258 __FUNCTION__, old_map, old_map->pmap,
13259 old_entry,
13260 (uint64_t)old_entry->vme_start,
13261 (uint64_t)old_entry->vme_end,
13262 prot);
13263 }
13264
13265 if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
13266 prot |= VM_PROT_EXECUTE;
13267 }
13268
13269
13270 if (old_map->mapped_in_other_pmaps) {
13271 vm_object_pmap_protect(
13272 VME_OBJECT(old_entry),
13273 VME_OFFSET(old_entry),
13274 (old_entry->vme_end -
13275 old_entry->vme_start),
13276 PMAP_NULL,
13277 PAGE_SIZE,
13278 old_entry->vme_start,
13279 prot);
13280 } else {
13281 pmap_protect(old_map->pmap,
13282 old_entry->vme_start,
13283 old_entry->vme_end,
13284 prot);
13285 }
13286 }
13287
13288 old_entry->needs_copy = FALSE;
13289 object = VME_OBJECT(old_entry);
13290 }
13291
13292
13293 /*
13294 * If object was using a symmetric copy strategy,
13295 * change its copy strategy to the default
13296 * asymmetric copy strategy, which is copy_delay
13297 * in the non-norma case and copy_call in the
13298 * norma case. Bump the reference count for the
13299 * new entry.
13300 */
13301
13302 if (old_entry->is_sub_map) {
13303 vm_map_reference(VME_SUBMAP(old_entry));
13304 } else {
13305 vm_object_lock(object);
13306 vm_object_reference_locked(object);
13307 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
13308 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
13309 }
13310 vm_object_unlock(object);
13311 }
13312
13313 /*
13314 * Clone the entry, using object ref from above.
13315 * Mark both entries as shared.
13316 */
13317
13318 new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
13319 vm_map_entry_copy(old_map, new_entry, old_entry);
13320 old_entry->is_shared = TRUE;
13321 new_entry->is_shared = TRUE;
13322
13323 /*
13324 * We're dealing with a shared mapping, so the resulting mapping
13325 * should inherit some of the original mapping's accounting settings.
13326 * "iokit_acct" should have been cleared in vm_map_entry_copy().
13327 * "use_pmap" should stay the same as before (if it hasn't been reset
13328 * to TRUE when we cleared "iokit_acct").
13329 */
13330 assert(!new_entry->iokit_acct);
13331
13332 /*
13333 	 * If the old entry's inheritance is VM_INHERIT_NONE,
13334 	 * the new entry is for a corpse fork; remove the
13335 	 * write permission from the new entry.
13336 */
13337 if (old_entry->inheritance == VM_INHERIT_NONE) {
13338 new_entry->protection &= ~VM_PROT_WRITE;
13339 new_entry->max_protection &= ~VM_PROT_WRITE;
13340 }
13341
13342 /*
13343 * Insert the entry into the new map -- we
13344 * know we're inserting at the end of the new
13345 * map.
13346 */
13347
13348 vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
13349 VM_MAP_KERNEL_FLAGS_NONE);
13350
13351 /*
13352 * Update the physical map
13353 */
13354
13355 if (old_entry->is_sub_map) {
13356 /* Bill Angell pmap support goes here */
13357 } else {
13358 pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
13359 old_entry->vme_end - old_entry->vme_start,
13360 old_entry->vme_start);
13361 }
13362 }
13363
13364 static boolean_t
13365 vm_map_fork_copy(
13366 vm_map_t old_map,
13367 vm_map_entry_t *old_entry_p,
13368 vm_map_t new_map,
13369 int vm_map_copyin_flags)
13370 {
13371 vm_map_entry_t old_entry = *old_entry_p;
13372 vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
13373 vm_map_offset_t start = old_entry->vme_start;
13374 vm_map_copy_t copy;
13375 vm_map_entry_t last = vm_map_last_entry(new_map);
13376
13377 vm_map_unlock(old_map);
13378 /*
13379 * Use maxprot version of copyin because we
13380 * care about whether this memory can ever
13381 * be accessed, not just whether it's accessible
13382 * right now.
13383 */
13384 vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
13385 if (vm_map_copyin_internal(old_map, start, entry_size,
13386 	    vm_map_copyin_flags, &copy)
13387 != KERN_SUCCESS) {
13388 /*
13389 * The map might have changed while it
13390 * was unlocked, check it again. Skip
13391 * any blank space or permanently
13392 * unreadable region.
13393 */
13394 vm_map_lock(old_map);
13395 if (!vm_map_lookup_entry(old_map, start, &last) ||
13396 (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
13397 last = last->vme_next;
13398 }
13399 *old_entry_p = last;
13400
13401 /*
13402 * XXX For some error returns, want to
13403 * XXX skip to the next element. Note
13404 * that INVALID_ADDRESS and
13405 * PROTECTION_FAILURE are handled above.
13406 */
13407
13408 return FALSE;
13409 }
13410
13411 /*
13412 * Assert that the vm_map_copy is coming from the right
13413 * zone and hasn't been forged
13414 */
13415 vm_map_copy_require(copy);
13416
13417 /*
13418 * Insert the copy into the new map
13419 */
13420 vm_map_copy_insert(new_map, last, copy);
13421
13422 /*
13423 * Pick up the traversal at the end of
13424 * the copied region.
13425 */
13426
13427 vm_map_lock(old_map);
13428 start += entry_size;
13429 if (!vm_map_lookup_entry(old_map, start, &last)) {
13430 last = last->vme_next;
13431 } else {
13432 if (last->vme_start == start) {
13433 /*
13434 * No need to clip here and we don't
13435 * want to cause any unnecessary
13436 * unnesting...
13437 */
13438 } else {
13439 vm_map_clip_start(old_map, last, start);
13440 }
13441 }
13442 *old_entry_p = last;
13443
13444 return TRUE;
13445 }
13446
13447 #if PMAP_FORK_NEST
13448 #define PMAP_FORK_NEST_DEBUG 0
13449 static inline void
13450 vm_map_fork_unnest(
13451 pmap_t new_pmap,
13452 vm_map_offset_t pre_nested_start,
13453 vm_map_offset_t pre_nested_end,
13454 vm_map_offset_t start,
13455 vm_map_offset_t end)
13456 {
13457 kern_return_t kr;
13458 vm_map_offset_t nesting_mask, start_unnest, end_unnest;
13459
13460 assertf(pre_nested_start <= pre_nested_end,
13461 "pre_nested start 0x%llx end 0x%llx",
13462 (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13463 assertf(start <= end,
13464 "start 0x%llx end 0x%llx",
13465 (uint64_t) start, (uint64_t)end);
13466
13467 if (pre_nested_start == pre_nested_end) {
13468 /* nothing was pre-nested: done */
13469 return;
13470 }
13471 if (end <= pre_nested_start) {
13472 /* fully before pre-nested range: done */
13473 return;
13474 }
13475 if (start >= pre_nested_end) {
13476 /* fully after pre-nested range: done */
13477 return;
13478 }
13479 /* ignore parts of range outside of pre_nested range */
13480 if (start < pre_nested_start) {
13481 start = pre_nested_start;
13482 }
13483 if (end > pre_nested_end) {
13484 end = pre_nested_end;
13485 }
13486 nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
13487 start_unnest = start & ~nesting_mask;
13488 end_unnest = (end + nesting_mask) & ~nesting_mask;
13489 kr = pmap_unnest(new_pmap,
13490 (addr64_t)start_unnest,
13491 (uint64_t)(end_unnest - start_unnest));
13492 #if PMAP_FORK_NEST_DEBUG
13493 printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
13494 #endif /* PMAP_FORK_NEST_DEBUG */
13495 assertf(kr == KERN_SUCCESS,
13496 "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
13497 (uint64_t)start, (uint64_t)end, new_pmap,
13498 (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
13499 kr);
13500 }
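
/*
 * Worked example of the clamping and alignment above, with assumed
 * values for illustration only: if pmap_shared_region_size_min() is
 * 32MB (0x2000000), then nesting_mask == 0x1ffffff and a request to
 * unnest [0x181000000, 0x183400000) becomes
 *	start_unnest = 0x181000000 & ~0x1ffffff               = 0x180000000
 *	end_unnest   = (0x183400000 + 0x1ffffff) & ~0x1ffffff = 0x184000000
 * i.e. every nesting granule overlapping the range is unnested, after
 * the range has first been clamped to [pre_nested_start, pre_nested_end).
 */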
13501 #endif /* PMAP_FORK_NEST */
13502
13503 void
13504 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13505 {
13506 new_map->size_limit = old_map->size_limit;
13507 new_map->data_limit = old_map->data_limit;
13508 new_map->user_wire_limit = old_map->user_wire_limit;
13509 new_map->reserved_regions = old_map->reserved_regions;
13510 }
13511
13512 /*
13513 * vm_map_fork:
13514 *
13515 * Create and return a new map based on the old
13516 * map, according to the inheritance values on the
13517 * regions in that map and the options.
13518 *
13519 * The source map must not be locked.
13520 */
13521 vm_map_t
13522 vm_map_fork(
13523 ledger_t ledger,
13524 vm_map_t old_map,
13525 int options)
13526 {
13527 pmap_t new_pmap;
13528 vm_map_t new_map;
13529 vm_map_entry_t old_entry;
13530 vm_map_size_t new_size = 0, entry_size;
13531 vm_map_entry_t new_entry;
13532 boolean_t src_needs_copy;
13533 boolean_t new_entry_needs_copy;
13534 boolean_t pmap_is64bit;
13535 int vm_map_copyin_flags;
13536 vm_inherit_t old_entry_inheritance;
13537 int map_create_options;
13538 kern_return_t footprint_collect_kr;
13539
13540 if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
13541 VM_MAP_FORK_PRESERVE_PURGEABLE |
13542 VM_MAP_FORK_CORPSE_FOOTPRINT |
13543 VM_MAP_FORK_SHARE_IF_OWNED)) {
13544 /* unsupported option */
13545 return VM_MAP_NULL;
13546 }
13547
13548 pmap_is64bit =
13549 #if defined(__i386__) || defined(__x86_64__)
13550 old_map->pmap->pm_task_map != TASK_MAP_32BIT;
13551 #elif defined(__arm64__)
13552 old_map->pmap->is_64bit;
13553 #else
13554 #error Unknown architecture.
13555 #endif
13556
13557 unsigned int pmap_flags = 0;
13558 pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
13559 #if defined(HAS_APPLE_PAC)
13560 pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
13561 #endif
13562 #if CONFIG_ROSETTA
13563 pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
13564 #endif
13565 #if PMAP_CREATE_FORCE_4K_PAGES
13566 if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
13567 PAGE_SIZE != FOURK_PAGE_SIZE) {
13568 pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
13569 }
13570 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
13571 new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
13572 if (new_pmap == NULL) {
13573 return VM_MAP_NULL;
13574 }
13575
13576 vm_map_reference(old_map);
13577 vm_map_lock(old_map);
13578
13579 /* Note that we're creating a map out of fork() */
13580 map_create_options = VM_MAP_CREATE_VIA_FORK;
13581 if (old_map->hdr.entries_pageable) {
13582 map_create_options |= VM_MAP_CREATE_PAGEABLE;
13583 }
13584 if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13585 map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
13586 footprint_collect_kr = KERN_SUCCESS;
13587 }
13588 new_map = vm_map_create_options(new_pmap,
13589 old_map->min_offset,
13590 old_map->max_offset,
13591 map_create_options);
13592
13593 /* Inherit our parent's ID. */
13594 vm_map_assign_serial(new_map, old_map->serial_id);
13595
13596 /* inherit cs_enforcement */
13597 vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
13598
13599 vm_map_lock(new_map);
13600 vm_commit_pagezero_status(new_map);
13601 /* inherit the parent map's page size */
13602 vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));
13603
13604 /* inherit the parent rlimits */
13605 vm_map_inherit_limits(new_map, old_map);
13606
13607 #if CONFIG_MAP_RANGES
13608 /* inherit the parent map's VM ranges */
13609 vm_map_range_fork(new_map, old_map);
13610 #endif
13611
13612 #if CODE_SIGNING_MONITOR
13613 /* Prepare the monitor for the fork */
13614 csm_fork_prepare(old_map->pmap, new_pmap);
13615 #endif
13616
13617 #if PMAP_FORK_NEST
13618 /*
13619 * Pre-nest the shared region's pmap.
13620 */
13621 vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
13622 pmap_fork_nest(old_map->pmap, new_pmap,
13623 &pre_nested_start, &pre_nested_end);
13624 #if PMAP_FORK_NEST_DEBUG
13625 printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13626 #endif /* PMAP_FORK_NEST_DEBUG */
13627 #endif /* PMAP_FORK_NEST */
13628
13629 for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
13630 /*
13631 * Abort any corpse collection if the system is shutting down.
13632 */
13633 if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13634 get_system_inshutdown()) {
13635 #if PMAP_FORK_NEST
13636 new_entry = vm_map_last_entry(new_map);
13637 if (new_entry == vm_map_to_entry(new_map)) {
13638 /* unnest all that was pre-nested */
13639 vm_map_fork_unnest(new_pmap,
13640 pre_nested_start, pre_nested_end,
13641 vm_map_min(new_map), vm_map_max(new_map));
13642 } else if (new_entry->vme_end < vm_map_max(new_map)) {
13643 /* unnest hole at the end, if pre-nested */
13644 vm_map_fork_unnest(new_pmap,
13645 pre_nested_start, pre_nested_end,
13646 new_entry->vme_end, vm_map_max(new_map));
13647 }
13648 #endif /* PMAP_FORK_NEST */
13649 vm_map_corpse_footprint_collect_done(new_map);
13650 vm_map_unlock(new_map);
13651 vm_map_unlock(old_map);
13652 vm_map_deallocate(new_map);
13653 vm_map_deallocate(old_map);
13654 printf("Aborting corpse map due to system shutdown\n");
13655 return VM_MAP_NULL;
13656 }
13657
13658 entry_size = old_entry->vme_end - old_entry->vme_start;
13659
13660 #if PMAP_FORK_NEST
13661 /*
13662 * Undo any unnecessary pre-nesting.
13663 */
13664 vm_map_offset_t prev_end;
13665 if (old_entry == vm_map_first_entry(old_map)) {
13666 prev_end = vm_map_min(old_map);
13667 } else {
13668 prev_end = old_entry->vme_prev->vme_end;
13669 }
13670 if (prev_end < old_entry->vme_start) {
13671 /* unnest hole before this entry, if pre-nested */
13672 vm_map_fork_unnest(new_pmap,
13673 pre_nested_start, pre_nested_end,
13674 prev_end, old_entry->vme_start);
13675 }
13676 if (old_entry->is_sub_map && old_entry->use_pmap) {
13677 /* keep this entry nested in the child */
13678 #if PMAP_FORK_NEST_DEBUG
13679 printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
13680 #endif /* PMAP_FORK_NEST_DEBUG */
13681 } else {
13682 /* undo nesting for this entry, if pre-nested */
13683 vm_map_fork_unnest(new_pmap,
13684 pre_nested_start, pre_nested_end,
13685 old_entry->vme_start, old_entry->vme_end);
13686 }
13687 #endif /* PMAP_FORK_NEST */
13688
13689 old_entry_inheritance = old_entry->inheritance;
13690
13691 /*
13692 	 * If the caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option,
13693 	 * share VM_INHERIT_NONE entries that are not backed by a
13694 	 * device pager.
13695 */
13696 if (old_entry_inheritance == VM_INHERIT_NONE &&
13697 (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
13698 (old_entry->protection & VM_PROT_READ) &&
13699 !(!old_entry->is_sub_map &&
13700 VME_OBJECT(old_entry) != NULL &&
13701 VME_OBJECT(old_entry)->pager != NULL &&
13702 is_device_pager_ops(
13703 VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
13704 old_entry_inheritance = VM_INHERIT_SHARE;
13705 }
13706 if (old_entry_inheritance == VM_INHERIT_COPY &&
13707 (options & VM_MAP_FORK_SHARE_IF_OWNED) &&
13708 !old_entry->is_sub_map &&
13709 VME_OBJECT(old_entry) != VM_OBJECT_NULL) {
13710 vm_object_t object;
13711 task_t owner;
13712 object = VME_OBJECT(old_entry);
13713 owner = VM_OBJECT_OWNER(object);
13714 if (owner != TASK_NULL &&
13715 owner->map == old_map) {
13716 /*
13717 * This mapping points at a VM object owned
13718 * by the task being forked.
13719 * Some tools reporting memory accounting
13720 * info rely on the object ID, so share this
13721 * mapping instead of copying, to make the
13722 * corpse look exactly like the original
13723 * task in that respect.
13724 */
13725 assert(object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC);
13726 old_entry_inheritance = VM_INHERIT_SHARE;
13727 }
13728 }
13729
13730 if (old_entry_inheritance != VM_INHERIT_NONE &&
13731 (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13732 footprint_collect_kr == KERN_SUCCESS) {
13733 /*
13734 * The corpse won't have old_map->pmap to query
13735 * footprint information, so collect that data now
13736 * and store it in new_map->vmmap_corpse_footprint
13737 * for later autopsy.
13738 */
13739 footprint_collect_kr =
13740 vm_map_corpse_footprint_collect(old_map,
13741 old_entry,
13742 new_map);
13743 }
13744
13745 switch (old_entry_inheritance) {
13746 case VM_INHERIT_NONE:
13747 break;
13748
13749 case VM_INHERIT_SHARE:
13750 vm_map_fork_share(old_map, old_entry, new_map);
13751 new_size += entry_size;
13752 break;
13753
13754 case VM_INHERIT_COPY:
13755
13756 /*
13757 * Inline the copy_quickly case;
13758 * upon failure, fall back on call
13759 * to vm_map_fork_copy.
13760 */
13761
13762 if (old_entry->is_sub_map) {
13763 break;
13764 }
13765 if ((old_entry->wired_count != 0) ||
13766 ((VME_OBJECT(old_entry) != NULL) &&
13767 (VME_OBJECT(old_entry)->true_share))) {
13768 goto slow_vm_map_fork_copy;
13769 }
13770
13771 new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
13772 vm_map_entry_copy(old_map, new_entry, old_entry);
13773 if (old_entry->vme_permanent) {
13774 /* inherit "permanent" on fork() */
13775 new_entry->vme_permanent = TRUE;
13776 }
13777
13778 if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
13779 new_map->jit_entry_exists = TRUE;
13780 }
13781
13782 if (new_entry->is_sub_map) {
13783 /* clear address space specifics */
13784 new_entry->use_pmap = FALSE;
13785 } else {
13786 /*
13787 * We're dealing with a copy-on-write operation,
13788 * so the resulting mapping should not inherit
13789 * the original mapping's accounting settings.
13790 * "iokit_acct" should have been cleared in
13791 * vm_map_entry_copy().
13792 * "use_pmap" should be reset to its default
13793 * (TRUE) so that the new mapping gets
13794 * accounted for in the task's memory footprint.
13795 */
13796 assert(!new_entry->iokit_acct);
13797 new_entry->use_pmap = TRUE;
13798 }
13799
13800 if (!vm_object_copy_quickly(
13801 VME_OBJECT(new_entry),
13802 VME_OFFSET(old_entry),
13803 (old_entry->vme_end -
13804 old_entry->vme_start),
13805 &src_needs_copy,
13806 &new_entry_needs_copy)) {
13807 vm_map_entry_dispose(new_entry);
13808 goto slow_vm_map_fork_copy;
13809 }
13810
13811 /*
13812 * Handle copy-on-write obligations
13813 */
13814
13815 if (src_needs_copy && !old_entry->needs_copy) {
13816 vm_prot_t prot;
13817
13818 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13819 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13820 __FUNCTION__,
13821 old_map, old_map->pmap, old_entry,
13822 (uint64_t)old_entry->vme_start,
13823 (uint64_t)old_entry->vme_end,
13824 old_entry->protection);
13825 }
13826
13827 prot = old_entry->protection & ~VM_PROT_WRITE;
13828
13829 if (override_nx(old_map, VME_ALIAS(old_entry))
13830 && prot) {
13831 prot |= VM_PROT_EXECUTE;
13832 }
13833
13834 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13835 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13836 __FUNCTION__,
13837 old_map, old_map->pmap, old_entry,
13838 (uint64_t)old_entry->vme_start,
13839 (uint64_t)old_entry->vme_end,
13840 prot);
13841 }
13842
13843 vm_object_pmap_protect(
13844 VME_OBJECT(old_entry),
13845 VME_OFFSET(old_entry),
13846 (old_entry->vme_end -
13847 old_entry->vme_start),
13848 ((old_entry->is_shared
13849 || old_map->mapped_in_other_pmaps)
13850 ? PMAP_NULL :
13851 old_map->pmap),
13852 VM_MAP_PAGE_SIZE(old_map),
13853 old_entry->vme_start,
13854 prot);
13855
13856 assert(old_entry->wired_count == 0);
13857 old_entry->needs_copy = TRUE;
13858 }
13859 new_entry->needs_copy = new_entry_needs_copy;
13860
13861 /*
13862 * Insert the entry at the end
13863 * of the map.
13864 */
13865
13866 vm_map_store_entry_link(new_map,
13867 vm_map_last_entry(new_map),
13868 new_entry,
13869 VM_MAP_KERNEL_FLAGS_NONE);
13870 new_size += entry_size;
13871 break;
13872
13873 slow_vm_map_fork_copy:
13874 vm_map_copyin_flags = VM_MAP_COPYIN_FORK;
13875 if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
13876 vm_map_copyin_flags |=
13877 VM_MAP_COPYIN_PRESERVE_PURGEABLE;
13878 }
13879 if (vm_map_fork_copy(old_map,
13880 &old_entry,
13881 new_map,
13882 vm_map_copyin_flags)) {
13883 new_size += entry_size;
13884 }
13885 continue;
13886 }
13887 old_entry = old_entry->vme_next;
13888 }
13889
13890 #if PMAP_FORK_NEST
13891 new_entry = vm_map_last_entry(new_map);
13892 if (new_entry == vm_map_to_entry(new_map)) {
13893 /* unnest all that was pre-nested */
13894 vm_map_fork_unnest(new_pmap,
13895 pre_nested_start, pre_nested_end,
13896 vm_map_min(new_map), vm_map_max(new_map));
13897 } else if (new_entry->vme_end < vm_map_max(new_map)) {
13898 /* unnest hole at the end, if pre-nested */
13899 vm_map_fork_unnest(new_pmap,
13900 pre_nested_start, pre_nested_end,
13901 new_entry->vme_end, vm_map_max(new_map));
13902 }
13903 #endif /* PMAP_FORK_NEST */
13904
13905 #if defined(__arm64__)
13906 pmap_insert_commpage(new_map->pmap);
13907 #endif /* __arm64__ */
13908
13909 new_map->size = new_size;
13910
13911 if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13912 vm_map_corpse_footprint_collect_done(new_map);
13913 }
13914
13915 /* Propagate JIT entitlement for the pmap layer. */
13916 if (pmap_get_jit_entitled(old_map->pmap)) {
13917 /* Tell the pmap that it supports JIT. */
13918 pmap_set_jit_entitled(new_map->pmap);
13919 }
13920
13921 /* Propagate TPRO settings for the pmap layer */
13922 if (pmap_get_tpro(old_map->pmap)) {
13923 /* Tell the pmap that it supports TPRO */
13924 pmap_set_tpro(new_map->pmap);
13925 }
13926
13927
13928 vm_map_unlock(new_map);
13929 vm_map_unlock(old_map);
13930 vm_map_deallocate(old_map);
13931
13932 return new_map;
13933 }
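
/*
 * Illustrative sketch (not part of the build) of how a corpse-style
 * caller might use vm_map_fork() above; "ledger" and "task" are
 * hypothetical variables standing in for the caller's context:
 *
 *	vm_map_t new_map;
 *
 *	new_map = vm_map_fork(ledger, task->map,
 *	    VM_MAP_FORK_CORPSE_FOOTPRINT |
 *	    VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
 *	    VM_MAP_FORK_PRESERVE_PURGEABLE);
 *	if (new_map == VM_MAP_NULL) {
 *		(unsupported option, pmap creation failure, or an abort
 *		 because the system is shutting down)
 *	}
 */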
13934
13935 /*
13936 * vm_map_exec:
13937 *
13938  * Set up the "new_map" with the proper execution environment according
13939 * to the type of executable (platform, 64bit, chroot environment).
13940 * Map the comm page and shared region, etc...
13941 */
13942 kern_return_t
13943 vm_map_exec(
13944 vm_map_t new_map,
13945 task_t task,
13946 boolean_t is64bit,
13947 void *fsroot,
13948 cpu_type_t cpu,
13949 cpu_subtype_t cpu_subtype,
13950 boolean_t reslide,
13951 boolean_t is_driverkit,
13952 uint32_t rsr_version)
13953 {
13954 SHARED_REGION_TRACE_DEBUG(
13955 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13956 (void *)VM_KERNEL_ADDRPERM(current_task()),
13957 (void *)VM_KERNEL_ADDRPERM(new_map),
13958 (void *)VM_KERNEL_ADDRPERM(task),
13959 (void *)VM_KERNEL_ADDRPERM(fsroot),
13960 cpu,
13961 cpu_subtype));
13962 (void) vm_commpage_enter(new_map, task, is64bit);
13963
13964 (void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13965
13966 SHARED_REGION_TRACE_DEBUG(
13967 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13968 (void *)VM_KERNEL_ADDRPERM(current_task()),
13969 (void *)VM_KERNEL_ADDRPERM(new_map),
13970 (void *)VM_KERNEL_ADDRPERM(task),
13971 (void *)VM_KERNEL_ADDRPERM(fsroot),
13972 cpu,
13973 cpu_subtype));
13974
13975 /*
13976 * Some devices have region(s) of memory that shouldn't get allocated by
13977 * user processes. The following code creates dummy vm_map_entry_t's for each
13978 	 * of the regions that need to be reserved to prevent any allocations in
13979 * those regions.
13980 */
13981 kern_return_t kr = KERN_FAILURE;
13982 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
13983 vmk_flags.vmkf_beyond_max = true;
13984
13985 const struct vm_reserved_region *regions = NULL;
13986 	size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
13987 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13988
13989 for (size_t i = 0; i < num_regions; ++i) {
13990 vm_map_offset_t address = regions[i].vmrr_addr;
13991
13992 kr = vm_map_enter(
13993 new_map,
13994 &address,
13995 regions[i].vmrr_size,
13996 (vm_map_offset_t)0,
13997 vmk_flags,
13998 VM_OBJECT_NULL,
13999 (vm_object_offset_t)0,
14000 FALSE,
14001 VM_PROT_NONE,
14002 VM_PROT_NONE,
14003 VM_INHERIT_COPY);
14004
14005 if (kr != KERN_SUCCESS) {
14006 os_log_error(OS_LOG_DEFAULT, "Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
14007 return KERN_FAILURE;
14008 }
14009 }
14010
14011 new_map->reserved_regions = (num_regions ? TRUE : FALSE);
14012
14013 return KERN_SUCCESS;
14014 }
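
/*
 * Illustrative sketch (not part of the build): during exec, a caller
 * would typically invoke vm_map_exec() above on the freshly created map
 * before any user mappings are established.  The CPU type/subtype and
 * flag values below are just an example:
 *
 *	kr = vm_map_exec(new_map,
 *	    task,
 *	    TRUE,                       (is64bit)
 *	    NULL,                       (fsroot: not chroot'ed)
 *	    CPU_TYPE_ARM64,
 *	    CPU_SUBTYPE_ARM64_ALL,
 *	    FALSE,                      (reslide)
 *	    FALSE,                      (is_driverkit)
 *	    0);                         (rsr_version)
 *	if (kr != KERN_SUCCESS) {
 *		(a reserved region could not be entered into the map)
 *	}
 */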
14015
14016 uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
14017 uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
14018 uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
14019 uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
14020 uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
14021 uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
14022 uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
14023 uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
14024 uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
14025 uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
14026 uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
14027 uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
14028 uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
14029 /*
14030 * vm_map_lookup_and_lock_object:
14031 *
14032 * Finds the VM object, offset, and
14033 * protection for a given virtual address in the
14034 * specified map, assuming a page fault of the
14035 * type specified.
14036 *
14037 * Returns the (object, offset, protection) for
14038 * this address, whether it is wired down, and whether
14039 * this map has the only reference to the data in question.
14040 * In order to later verify this lookup, a "version"
14041 * is returned.
14042 * If contended != NULL, *contended will be set to
14043 * true iff the thread had to spin or block to acquire
14044 * an exclusive lock.
14045 *
14046 * The map MUST be locked by the caller and WILL be
14047 * locked on exit. In order to guarantee the
14048 * existence of the returned object, it is returned
14049 * locked.
14050 *
14051 * If a lookup is requested with "write protection"
14052 * specified, the map may be changed to perform virtual
14053 * copying operations, although the data referenced will
14054 * remain the same.
14055 *
14056 * If fault_info is provided, then the information is
14057 * initialized according to the properties of the map entry
14058 * NB: only properties of the entry are initialized,
14059 * namely:
14060 * - user_tag
14061 * - pmap_options
14062 * - iokit_acct
14063 * - behavior
14064 * - lo_offset
14065 * - hi_offset
14066 * - no_cache
14067 * - cs_bypass
14068 * - csm_associated
14069 * - resilient_media
14070 * - vme_xnu_user_debug
14071 * - vme_no_copy_on_read
14072 * - used_for_tpro
14073 */
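/*
 * Illustrative sketch (not part of the build) of the typical call and
 * unlock pattern for a fault-style caller, assuming locals of the types
 * shown in the prototype below and an object lock type chosen by the
 * caller:
 *
 *	vm_map_lock_read(map);
 *	kr = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
 *	    object_lock_type, &version, &object, &offset, &prot, &wired,
 *	    NULL,               (no fault_info wanted)
 *	    &real_map,
 *	    NULL);              (don't care about lock contention)
 *	if (kr == KERN_SUCCESS) {
 *		(use "object", which is returned locked)
 *		vm_object_unlock(object);
 *		if (real_map != map) {
 *			vm_map_unlock(real_map);
 *		}
 *		vm_map_unlock_read(map);
 *	}
 */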
14074 kern_return_t
14075 vm_map_lookup_and_lock_object(
14076 vm_map_t *var_map, /* IN/OUT */
14077 vm_map_offset_t vaddr,
14078 vm_prot_t fault_type,
14079 int object_lock_type,
14080 vm_map_version_t *out_version, /* OUT */
14081 vm_object_t *object, /* OUT */
14082 vm_object_offset_t *offset, /* OUT */
14083 vm_prot_t *out_prot, /* OUT */
14084 boolean_t *wired, /* OUT */
14085 vm_object_fault_info_t fault_info, /* OUT */
14086 vm_map_t *real_map, /* OUT */
14087 bool *contended) /* OUT */
14088 {
14089 vm_map_entry_t entry;
14090 vm_map_t map = *var_map;
14091 vm_map_t old_map = *var_map;
14092 vm_map_t cow_sub_map_parent = VM_MAP_NULL;
14093 vm_map_offset_t cow_parent_vaddr = 0;
14094 vm_map_offset_t old_start = 0;
14095 vm_map_offset_t old_end = 0;
14096 vm_prot_t prot;
14097 boolean_t mask_protections;
14098 boolean_t force_copy;
14099 boolean_t no_force_copy_if_executable;
14100 boolean_t submap_needed_copy;
14101 vm_prot_t original_fault_type;
14102 vm_map_size_t fault_page_mask;
14103
14104 /*
14105 * VM_PROT_MASK means that the caller wants us to use "fault_type"
14106 * as a mask against the mapping's actual protections, not as an
14107 * absolute value.
14108 */
14109 mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
14110 force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
14111 no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
14112 fault_type &= VM_PROT_ALL;
14113 original_fault_type = fault_type;
14114 if (contended) {
14115 *contended = false;
14116 }
14117
14118 *real_map = map;
14119
14120 fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
14121 vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
14122
14123 RetryLookup:
14124 fault_type = original_fault_type;
14125
14126 /*
14127 * If the map has an interesting hint, try it before calling
14128 	 * the full-blown lookup routine.
14129 */
14130 entry = map->hint;
14131
14132 if ((entry == vm_map_to_entry(map)) ||
14133 (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
14134 vm_map_entry_t tmp_entry;
14135
14136 /*
14137 * Entry was either not a valid hint, or the vaddr
14138 * was not contained in the entry, so do a full lookup.
14139 */
14140 if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
14141 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14142 vm_map_unlock(cow_sub_map_parent);
14143 }
14144 if ((*real_map != map)
14145 && (*real_map != cow_sub_map_parent)) {
14146 vm_map_unlock(*real_map);
14147 }
14148 return KERN_INVALID_ADDRESS;
14149 }
14150
14151 entry = tmp_entry;
14152 }
14153 if (map == old_map) {
14154 old_start = entry->vme_start;
14155 old_end = entry->vme_end;
14156 }
14157
14158 /*
14159 * Handle submaps. Drop lock on upper map, submap is
14160 * returned locked.
14161 */
14162
14163 submap_needed_copy = FALSE;
14164 submap_recurse:
14165 if (entry->is_sub_map) {
14166 vm_map_offset_t local_vaddr;
14167 vm_map_offset_t end_delta;
14168 vm_map_offset_t start_delta;
14169 vm_map_offset_t top_entry_saved_start;
14170 vm_object_offset_t top_entry_saved_offset;
14171 vm_map_entry_t submap_entry, saved_submap_entry;
14172 vm_object_offset_t submap_entry_offset;
14173 vm_object_size_t submap_entry_size;
14174 vm_prot_t subentry_protection;
14175 vm_prot_t subentry_max_protection;
14176 boolean_t subentry_no_copy_on_read;
14177 boolean_t subentry_permanent;
14178 boolean_t subentry_csm_associated;
14179 #if __arm64e__
14180 boolean_t subentry_used_for_tpro;
14181 #endif /* __arm64e__ */
14182 boolean_t mapped_needs_copy = FALSE;
14183 vm_map_version_t version;
14184
14185 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
14186 "map %p (%d) entry %p submap %p (%d)\n",
14187 map, VM_MAP_PAGE_SHIFT(map), entry,
14188 VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
14189
14190 local_vaddr = vaddr;
14191 top_entry_saved_start = entry->vme_start;
14192 top_entry_saved_offset = VME_OFFSET(entry);
14193
14194 if ((entry->use_pmap &&
14195 !((fault_type & VM_PROT_WRITE) ||
14196 force_copy))) {
14197 /* if real_map equals map we unlock below */
14198 if ((*real_map != map) &&
14199 (*real_map != cow_sub_map_parent)) {
14200 vm_map_unlock(*real_map);
14201 }
14202 *real_map = VME_SUBMAP(entry);
14203 }
14204
14205 if (entry->needs_copy &&
14206 ((fault_type & VM_PROT_WRITE) ||
14207 force_copy)) {
14208 if (!mapped_needs_copy) {
14209 if (vm_map_lock_read_to_write(map)) {
14210 vm_map_lock_read(map);
14211 *real_map = map;
14212 goto RetryLookup;
14213 }
14214 vm_map_lock_read(VME_SUBMAP(entry));
14215 *var_map = VME_SUBMAP(entry);
14216 cow_sub_map_parent = map;
14217 /* reset base to map before cow object */
14218 /* this is the map which will accept */
14219 /* the new cow object */
14220 old_start = entry->vme_start;
14221 old_end = entry->vme_end;
14222 cow_parent_vaddr = vaddr;
14223 mapped_needs_copy = TRUE;
14224 } else {
14225 vm_map_lock_read(VME_SUBMAP(entry));
14226 *var_map = VME_SUBMAP(entry);
14227 if ((cow_sub_map_parent != map) &&
14228 (*real_map != map)) {
14229 vm_map_unlock(map);
14230 }
14231 }
14232 } else {
14233 if (entry->needs_copy) {
14234 submap_needed_copy = TRUE;
14235 }
14236 vm_map_lock_read(VME_SUBMAP(entry));
14237 *var_map = VME_SUBMAP(entry);
14238 			/* leave the map locked if it is the target */
14239 			/* cow sub_map above; otherwise, just */
14240 			/* follow the maps down to the object. */
14241 			/* Here we unlock, knowing we are not */
14242 			/* revisiting the map. */
14243 if ((*real_map != map) && (map != cow_sub_map_parent)) {
14244 vm_map_unlock_read(map);
14245 }
14246 }
14247
14248 entry = NULL;
14249 map = *var_map;
14250
14251 /* calculate the offset in the submap for vaddr */
14252 local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
14253 assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
14254 "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
14255 (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);
14256
14257 RetrySubMap:
14258 if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
14259 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14260 vm_map_unlock(cow_sub_map_parent);
14261 }
14262 if ((*real_map != map)
14263 && (*real_map != cow_sub_map_parent)) {
14264 vm_map_unlock(*real_map);
14265 }
14266 *real_map = map;
14267 return KERN_INVALID_ADDRESS;
14268 }
14269
14270 /* find the attenuated shadow of the underlying object */
14271 /* on our target map */
14272
14273 	/* In English: the submap object may extend beyond the */
14274 	/* region mapped by the entry, or may only fill a portion */
14275 	/* of it. For our purposes, we only care if the object */
14276 	/* doesn't fill it. In that case the area which will */
14277 	/* ultimately be clipped in the top map only needs */
14278 	/* to be as big as the portion of the underlying entry */
14279 	/* which is mapped. */
14280 start_delta = submap_entry->vme_start > top_entry_saved_offset ?
14281 submap_entry->vme_start - top_entry_saved_offset : 0;
14282
14283 end_delta =
14284 (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
14285 submap_entry->vme_end ?
14286 0 : (top_entry_saved_offset +
14287 (old_end - old_start))
14288 - submap_entry->vme_end;
14289
14290 old_start += start_delta;
14291 old_end -= end_delta;
14292
14293 if (submap_entry->is_sub_map) {
14294 entry = submap_entry;
14295 vaddr = local_vaddr;
14296 goto submap_recurse;
14297 }
14298
14299 if (((fault_type & VM_PROT_WRITE) ||
14300 force_copy)
14301 && cow_sub_map_parent) {
14302 vm_object_t sub_object, copy_object;
14303 vm_object_offset_t copy_offset;
14304 vm_map_offset_t local_start;
14305 vm_map_offset_t local_end;
14306 boolean_t object_copied = FALSE;
14307 vm_object_offset_t object_copied_offset = 0;
14308 boolean_t object_copied_needs_copy = FALSE;
14309 kern_return_t kr = KERN_SUCCESS;
14310
14311 if (vm_map_lock_read_to_write(map)) {
14312 vm_map_lock_read(map);
14313 old_start -= start_delta;
14314 old_end += end_delta;
14315 goto RetrySubMap;
14316 }
14317
14318
14319 sub_object = VME_OBJECT(submap_entry);
14320 if (sub_object == VM_OBJECT_NULL) {
14321 sub_object =
14322 vm_object_allocate(
14323 (vm_map_size_t)
14324 (submap_entry->vme_end -
14325 submap_entry->vme_start), map->serial_id);
14326 VME_OBJECT_SET(submap_entry, sub_object, false, 0);
14327 VME_OFFSET_SET(submap_entry, 0);
14328 assert(!submap_entry->is_sub_map);
14329 assert(submap_entry->use_pmap);
14330 }
14331 local_start = local_vaddr -
14332 (cow_parent_vaddr - old_start);
14333 local_end = local_vaddr +
14334 (old_end - cow_parent_vaddr);
14335 vm_map_clip_start(map, submap_entry, local_start);
14336 vm_map_clip_end(map, submap_entry, local_end);
14337 if (submap_entry->is_sub_map) {
14338 /* unnesting was done when clipping */
14339 assert(!submap_entry->use_pmap);
14340 }
14341
14342 	/* This is the COW case, let's connect */
14343 /* an entry in our space to the underlying */
14344 /* object in the submap, bypassing the */
14345 /* submap. */
14346 submap_entry_offset = VME_OFFSET(submap_entry);
14347 submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
14348
14349 if ((submap_entry->wired_count != 0 ||
14350 sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
14351 (submap_entry->protection & VM_PROT_EXECUTE) &&
14352 no_force_copy_if_executable) {
14353 // printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
14354 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14355 vm_map_unlock(cow_sub_map_parent);
14356 }
14357 if ((*real_map != map)
14358 && (*real_map != cow_sub_map_parent)) {
14359 vm_map_unlock(*real_map);
14360 }
14361 *real_map = map;
14362 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
14363 vm_map_lock_write_to_read(map);
14364 kr = KERN_PROTECTION_FAILURE;
14365 DTRACE_VM4(submap_no_copy_executable,
14366 vm_map_t, map,
14367 vm_object_offset_t, submap_entry_offset,
14368 vm_object_size_t, submap_entry_size,
14369 int, kr);
14370 return kr;
14371 }
14372
14373 if (submap_entry->wired_count != 0) {
14374 vm_object_reference(sub_object);
14375
14376 assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
14377 "submap_entry %p offset 0x%llx\n",
14378 submap_entry, VME_OFFSET(submap_entry));
14379
14380 DTRACE_VM6(submap_copy_slowly,
14381 vm_map_t, cow_sub_map_parent,
14382 vm_map_offset_t, vaddr,
14383 vm_map_t, map,
14384 vm_object_size_t, submap_entry_size,
14385 int, submap_entry->wired_count,
14386 int, sub_object->copy_strategy);
14387
14388 saved_submap_entry = submap_entry;
14389 version.main_timestamp = map->timestamp;
14390 vm_map_unlock(map); /* Increments timestamp by 1 */
14391 submap_entry = VM_MAP_ENTRY_NULL;
14392
14393 vm_object_lock(sub_object);
14394 kr = vm_object_copy_slowly(sub_object,
14395 submap_entry_offset,
14396 submap_entry_size,
14397 FALSE, /* interruptible */
14398 	    &copy_object);
14399 object_copied = TRUE;
14400 object_copied_offset = 0;
14401 /* 4k: account for extra offset in physical page */
14402 object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
14403 object_copied_needs_copy = FALSE;
14404 vm_object_deallocate(sub_object);
14405
14406 vm_map_lock(map);
14407
14408 if (kr != KERN_SUCCESS &&
14409 kr != KERN_MEMORY_RESTART_COPY) {
14410 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14411 vm_map_unlock(cow_sub_map_parent);
14412 }
14413 if ((*real_map != map)
14414 && (*real_map != cow_sub_map_parent)) {
14415 vm_map_unlock(*real_map);
14416 }
14417 *real_map = map;
14418 vm_object_deallocate(copy_object);
14419 copy_object = VM_OBJECT_NULL;
14420 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
14421 vm_map_lock_write_to_read(map);
14422 DTRACE_VM4(submap_copy_error_slowly,
14423 vm_object_t, sub_object,
14424 vm_object_offset_t, submap_entry_offset,
14425 vm_object_size_t, submap_entry_size,
14426 int, kr);
14427 vm_map_lookup_and_lock_object_copy_slowly_error++;
14428 return kr;
14429 }
14430
14431 if ((kr == KERN_SUCCESS) &&
14432 (version.main_timestamp + 1) == map->timestamp) {
14433 submap_entry = saved_submap_entry;
14434 } else {
14435 saved_submap_entry = NULL;
14436 old_start -= start_delta;
14437 old_end += end_delta;
14438 vm_object_deallocate(copy_object);
14439 copy_object = VM_OBJECT_NULL;
14440 vm_map_lock_write_to_read(map);
14441 vm_map_lookup_and_lock_object_copy_slowly_restart++;
14442 goto RetrySubMap;
14443 }
14444 vm_map_lookup_and_lock_object_copy_slowly_count++;
14445 vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
14446 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
14447 vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
14448 }
14449 } else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
14450 submap_entry_offset = VME_OFFSET(submap_entry);
14451 copy_object = VM_OBJECT_NULL;
14452 object_copied_offset = submap_entry_offset;
14453 object_copied_needs_copy = FALSE;
14454 DTRACE_VM6(submap_copy_strategically,
14455 vm_map_t, cow_sub_map_parent,
14456 vm_map_offset_t, vaddr,
14457 vm_map_t, map,
14458 vm_object_size_t, submap_entry_size,
14459 int, submap_entry->wired_count,
14460 int, sub_object->copy_strategy);
14461 kr = vm_object_copy_strategically(
14462 sub_object,
14463 submap_entry_offset,
14464 submap_entry->vme_end - submap_entry->vme_start,
14465 false, /* forking */
14466 	    &copy_object,
14467 &object_copied_offset,
14468 &object_copied_needs_copy);
14469 if (kr == KERN_MEMORY_RESTART_COPY) {
14470 old_start -= start_delta;
14471 old_end += end_delta;
14472 vm_object_deallocate(copy_object);
14473 copy_object = VM_OBJECT_NULL;
14474 vm_map_lock_write_to_read(map);
14475 vm_map_lookup_and_lock_object_copy_strategically_restart++;
14476 goto RetrySubMap;
14477 }
14478 if (kr != KERN_SUCCESS) {
14479 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14480 vm_map_unlock(cow_sub_map_parent);
14481 }
14482 if ((*real_map != map)
14483 && (*real_map != cow_sub_map_parent)) {
14484 vm_map_unlock(*real_map);
14485 }
14486 *real_map = map;
14487 vm_object_deallocate(copy_object);
14488 copy_object = VM_OBJECT_NULL;
14489 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
14490 vm_map_lock_write_to_read(map);
14491 DTRACE_VM4(submap_copy_error_strategically,
14492 vm_object_t, sub_object,
14493 vm_object_offset_t, submap_entry_offset,
14494 vm_object_size_t, submap_entry_size,
14495 int, kr);
14496 vm_map_lookup_and_lock_object_copy_strategically_error++;
14497 return kr;
14498 }
14499 assert(copy_object != VM_OBJECT_NULL);
14500 assert(copy_object != sub_object);
14501 object_copied = TRUE;
14502 vm_map_lookup_and_lock_object_copy_strategically_count++;
14503 vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
14504 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
14505 vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
14506 }
14507 } else {
14508 /* set up shadow object */
14509 object_copied = FALSE;
14510 copy_object = sub_object;
14511 vm_object_lock(sub_object);
14512 vm_object_reference_locked(sub_object);
14513 VM_OBJECT_SET_SHADOWED(sub_object, TRUE);
14514 vm_object_unlock(sub_object);
14515
14516 assert(submap_entry->wired_count == 0);
14517 submap_entry->needs_copy = TRUE;
14518
14519 prot = submap_entry->protection;
14520 if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14521 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14522 __FUNCTION__,
14523 map, map->pmap, submap_entry,
14524 (uint64_t)submap_entry->vme_start,
14525 (uint64_t)submap_entry->vme_end,
14526 prot);
14527 }
14528 prot = prot & ~VM_PROT_WRITE;
14529 if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14530 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14531 __FUNCTION__,
14532 map, map->pmap, submap_entry,
14533 (uint64_t)submap_entry->vme_start,
14534 (uint64_t)submap_entry->vme_end,
14535 prot);
14536 }
14537
14538 if (override_nx(old_map,
14539 VME_ALIAS(submap_entry))
14540 && prot) {
14541 prot |= VM_PROT_EXECUTE;
14542 }
14543
14544 vm_object_pmap_protect(
14545 sub_object,
14546 VME_OFFSET(submap_entry),
14547 submap_entry->vme_end -
14548 submap_entry->vme_start,
14549 (submap_entry->is_shared
14550 || map->mapped_in_other_pmaps) ?
14551 PMAP_NULL : map->pmap,
14552 VM_MAP_PAGE_SIZE(map),
14553 submap_entry->vme_start,
14554 prot);
14555 vm_map_lookup_and_lock_object_copy_shadow_count++;
14556 vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
14557 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
14558 vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
14559 }
14560 }
14561
14562 /*
14563 * Adjust the fault offset to the submap entry.
14564 */
14565 copy_offset = (local_vaddr -
14566 submap_entry->vme_start +
14567 VME_OFFSET(submap_entry));
14568
14569 	/* This works differently from the */
14570 	/* normal submap case. We go back */
14571 	/* to the parent of the cow map and */
14572 /* clip out the target portion of */
14573 /* the sub_map, substituting the */
14574 /* new copy object, */
14575
14576 subentry_protection = submap_entry->protection;
14577 subentry_max_protection = submap_entry->max_protection;
14578 subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
14579 subentry_permanent = submap_entry->vme_permanent;
14580 subentry_csm_associated = submap_entry->csm_associated;
14581 #if __arm64e__
14582 subentry_used_for_tpro = submap_entry->used_for_tpro;
14583 #endif // __arm64e__
14584 vm_map_unlock(map);
14585 submap_entry = NULL; /* not valid after map unlock */
14586
14587 local_start = old_start;
14588 local_end = old_end;
14589 map = cow_sub_map_parent;
14590 *var_map = cow_sub_map_parent;
14591 vaddr = cow_parent_vaddr;
14592 cow_sub_map_parent = NULL;
14593
14594 if (!vm_map_lookup_entry(map,
14595 vaddr, &entry)) {
14596 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14597 vm_map_unlock(cow_sub_map_parent);
14598 }
14599 if ((*real_map != map)
14600 && (*real_map != cow_sub_map_parent)) {
14601 vm_map_unlock(*real_map);
14602 }
14603 *real_map = map;
14604 vm_object_deallocate(
14605 copy_object);
14606 copy_object = VM_OBJECT_NULL;
14607 vm_map_lock_write_to_read(map);
14608 DTRACE_VM4(submap_lookup_post_unlock,
14609 uint64_t, (uint64_t)entry->vme_start,
14610 uint64_t, (uint64_t)entry->vme_end,
14611 vm_map_offset_t, vaddr,
14612 int, object_copied);
14613 return KERN_INVALID_ADDRESS;
14614 }
14615
14616 /* clip out the portion of space */
14617 /* mapped by the sub map which */
14618 /* corresponds to the underlying */
14619 /* object */
14620
14621 /*
14622 * Clip (and unnest) the smallest nested chunk
14623 * possible around the faulting address...
14624 */
14625 local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14626 local_end = local_start + pmap_shared_region_size_min(map->pmap);
14627 /*
14628 * ... but don't go beyond the "old_start" to "old_end"
14629 * range, to avoid spanning over another VM region
14630 * with a possibly different VM object and/or offset.
14631 */
14632 if (local_start < old_start) {
14633 local_start = old_start;
14634 }
14635 if (local_end > old_end) {
14636 local_end = old_end;
14637 }
14638 /*
14639 * Adjust copy_offset to the start of the range.
14640 */
14641 copy_offset -= (vaddr - local_start);
14642
14643 vm_map_clip_start(map, entry, local_start);
14644 vm_map_clip_end(map, entry, local_end);
14645 if (entry->is_sub_map) {
14646 /* unnesting was done when clipping */
14647 assert(!entry->use_pmap);
14648 }
14649
14650 /* substitute copy object for */
14651 /* shared map entry */
14652 vm_map_deallocate(VME_SUBMAP(entry));
14653 assert(!entry->iokit_acct);
14654 entry->use_pmap = TRUE;
14655 VME_OBJECT_SET(entry, copy_object, false, 0);
14656
14657 /* propagate the submap entry's protections */
14658 if (entry->protection != VM_PROT_READ) {
14659 /*
14660 * Someone has already altered the top entry's
14661 * protections via vm_protect(VM_PROT_COPY).
14662 * Respect these new values and ignore the
14663 * submap entry's protections.
14664 */
14665 } else {
14666 /*
14667 * Regular copy-on-write: propagate the submap
14668 * entry's protections to the top map entry.
14669 */
14670 entry->protection |= subentry_protection;
14671 }
14672 entry->max_protection |= subentry_max_protection;
14673 /* propagate some attributes from subentry */
14674 entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14675 entry->vme_permanent = subentry_permanent;
14676 entry->csm_associated = subentry_csm_associated;
14677 #if __arm64e__
14678 /* propagate TPRO iff the destination map has TPRO enabled */
14679 if (subentry_used_for_tpro) {
14680 if (vm_map_tpro(map)) {
14681 entry->used_for_tpro = subentry_used_for_tpro;
14682 } else {
14683 /* "permanent" came from being TPRO */
14684 entry->vme_permanent = FALSE;
14685 }
14686 }
14687 #endif /* __arm64e__ */
14688 if ((entry->protection & VM_PROT_WRITE) &&
14689 (entry->protection & VM_PROT_EXECUTE) &&
14690 #if XNU_TARGET_OS_OSX
14691 map->pmap != kernel_pmap &&
14692 (vm_map_cs_enforcement(map)
14693 #if __arm64__
14694 || !VM_MAP_IS_EXOTIC(map)
14695 #endif /* __arm64__ */
14696 ) &&
14697 #endif /* XNU_TARGET_OS_OSX */
14698 #if CODE_SIGNING_MONITOR
14699 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
14700 #endif
14701 !(entry->used_for_jit) &&
14702 VM_MAP_POLICY_WX_STRIP_X(map)) {
14703 DTRACE_VM3(cs_wx,
14704 uint64_t, (uint64_t)entry->vme_start,
14705 uint64_t, (uint64_t)entry->vme_end,
14706 vm_prot_t, entry->protection);
14707 printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
14708 proc_selfpid(),
14709 (get_bsdtask_info(current_task())
14710 ? proc_name_address(get_bsdtask_info(current_task()))
14711 : "?"),
14712 __FUNCTION__, __LINE__,
14713 #if DEVELOPMENT || DEBUG
14714 (uint64_t)entry->vme_start,
14715 (uint64_t)entry->vme_end,
14716 #else /* DEVELOPMENT || DEBUG */
14717 (uint64_t)0,
14718 (uint64_t)0,
14719 #endif /* DEVELOPMENT || DEBUG */
14720 entry->protection);
14721 entry->protection &= ~VM_PROT_EXECUTE;
14722 }
14723
14724 if (object_copied) {
14725 VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14726 entry->needs_copy = object_copied_needs_copy;
14727 entry->is_shared = FALSE;
14728 } else {
14729 assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14730 assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14731 assert(entry->wired_count == 0);
14732 VME_OFFSET_SET(entry, copy_offset);
14733 entry->needs_copy = TRUE;
14734 if (map != old_map) {
14735 entry->is_shared = TRUE;
14736 }
14737 }
14738 if (entry->inheritance == VM_INHERIT_SHARE) {
14739 entry->inheritance = VM_INHERIT_COPY;
14740 }
14741
14742 vm_map_lock_write_to_read(map);
14743 } else {
14744 if ((cow_sub_map_parent)
14745 && (cow_sub_map_parent != *real_map)
14746 && (cow_sub_map_parent != map)) {
14747 vm_map_unlock(cow_sub_map_parent);
14748 }
14749 entry = submap_entry;
14750 vaddr = local_vaddr;
14751 }
14752 }
14753
14754 /*
14755 * Check whether this task is allowed to have
14756 * this page.
14757 */
14758
14759 prot = entry->protection;
14760
14761 if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14762 /*
14763 * HACK -- if not a stack, then allow execution
14764 */
14765 prot |= VM_PROT_EXECUTE;
14766 }
14767
14768 #if __arm64e__
14769 /*
14770 * If the entry we're dealing with is TPRO and we have a write
14771 * fault, inject VM_PROT_WRITE into protections. This allows us
14772 * to maintain RO permissions when not marked as TPRO.
14773 */
14774 if (entry->used_for_tpro && (fault_type & VM_PROT_WRITE)) {
14775 prot |= VM_PROT_WRITE;
14776 }
14777 #endif /* __arm64e__ */
14778 if (mask_protections) {
14779 fault_type &= prot;
14780 if (fault_type == VM_PROT_NONE) {
14781 goto protection_failure;
14782 }
14783 }
14784 if (((fault_type & prot) != fault_type)
14785 #if __arm64__
14786 /* prefetch abort in execute-only page */
14787 && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14788 #elif defined(__x86_64__)
14789 /* Consider the UEXEC bit when handling an EXECUTE fault */
14790 && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14791 #endif
14792 ) {
14793 protection_failure:
14794 if (*real_map != map) {
14795 vm_map_unlock(*real_map);
14796 }
14797 *real_map = map;
14798
14799 if ((fault_type & VM_PROT_EXECUTE) && prot) {
14800 log_stack_execution_failure((addr64_t)vaddr, prot);
14801 }
14802
14803 DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14804 DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14805 /*
14806 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14807 *
14808 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14809 */
14810 return KERN_PROTECTION_FAILURE;
14811 }
14812
14813 /*
14814 * If this page is not pageable, we have to get
14815 * it for all possible accesses.
14816 */
14817
14818 *wired = (entry->wired_count != 0);
14819 if (*wired) {
14820 fault_type = prot;
14821 }
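/*
 * For example (illustrative): a read fault on a wired entry whose
 * protection is VM_PROT_READ|VM_PROT_WRITE is upgraded here to
 * fault_type = VM_PROT_READ|VM_PROT_WRITE, so the page is faulted in
 * for every access the mapping allows and never needs to be paged
 * back in while it remains wired.
 */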
14822
14823 /*
14824 * If the entry was copy-on-write, we either make the copy now or demote the permissions to read-only.
14825 */
14826
14827 if (entry->needs_copy) {
14828 /*
14829 * If we want to write the page, we may as well
14830 * handle that now since we've got the map locked.
14831 *
14832 * If we don't need to write the page, we just
14833 * demote the permissions allowed.
14834 */
14835
14836 if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14837 /*
14838 * Make a new object, and place it in the
14839 * object chain. Note that no new references
14840 * have appeared -- one just moved from the
14841 * map to the new object.
14842 */
14843
14844 if (vm_map_lock_read_to_write(map)) {
14845 vm_map_lock_read(map);
14846 goto RetryLookup;
14847 }
14848
14849 if (VME_OBJECT(entry)->shadowed == FALSE) {
14850 vm_object_lock(VME_OBJECT(entry));
14851 VM_OBJECT_SET_SHADOWED(VME_OBJECT(entry), TRUE);
14852 vm_object_unlock(VME_OBJECT(entry));
14853 }
14854 VME_OBJECT_SHADOW(entry,
14855 (vm_map_size_t) (entry->vme_end -
14856 entry->vme_start),
14857 vm_map_always_shadow(map));
14858 entry->needs_copy = FALSE;
14859
14860 vm_map_lock_write_to_read(map);
14861 }
14862 if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14863 /*
14864 * We're attempting to read a copy-on-write
14865 * page -- don't allow writes.
14866 */
14867
14868 prot &= (~VM_PROT_WRITE);
14869 }
14870 }
14871
14872 if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14873 /*
14874 * We went through a "needs_copy" submap without triggering
14875 * a copy, so granting write access to the page would bypass
14876 * that submap's "needs_copy".
14877 */
14878 assert(!(fault_type & VM_PROT_WRITE));
14879 assert(!*wired);
14880 assert(!force_copy);
14881 // printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14882 prot &= ~VM_PROT_WRITE;
14883 }
14884
14885 /*
14886 * Create an object if necessary.
14887 */
14888 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14889 if (vm_map_lock_read_to_write(map)) {
14890 vm_map_lock_read(map);
14891 goto RetryLookup;
14892 }
14893
14894 VME_OBJECT_SET(entry,
14895 vm_object_allocate(
14896 (vm_map_size_t)(entry->vme_end -
14897 entry->vme_start),
14898 map->serial_id
14899 ), false, 0);
14900 VME_OFFSET_SET(entry, 0);
14901 assert(entry->use_pmap);
14902 vm_map_lock_write_to_read(map);
14903 }
14904
14905 /*
14906 * Return the object/offset from this entry. If the entry
14907 * was copy-on-write or empty, it has been fixed up. Also
14908 * return the protection.
14909 */
14910
14911 *offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14912 *object = VME_OBJECT(entry);
14913 *out_prot = prot;
14914 KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14915
14916 if (fault_info) {
14917 /*
14918 * Initialize fault information according to the entry being faulted
14919 * from.
14920 */
14921 fault_info->user_tag = VME_ALIAS(entry);
14922 fault_info->pmap_options = 0;
14923 if (entry->iokit_acct ||
14924 (!entry->is_sub_map && !entry->use_pmap)) {
14925 fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14926 }
14927 if (fault_info->behavior == VM_BEHAVIOR_DEFAULT) {
14928 fault_info->behavior = entry->behavior;
14929 }
14930 fault_info->lo_offset = VME_OFFSET(entry);
14931 fault_info->hi_offset =
14932 (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14933 fault_info->no_cache = entry->no_cache;
14934 fault_info->io_sync = FALSE;
14935 fault_info->cs_bypass = (entry->used_for_jit ||
14936 #if CODE_SIGNING_MONITOR
14937 (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
14938 #endif
14939 entry->vme_resilient_codesign);
14940 fault_info->mark_zf_absent = FALSE;
14941 fault_info->batch_pmap_op = FALSE;
14942 /*
14943 * The pmap layer will validate this page
14944 * before allowing it to be executed from.
14945 */
14946 #if CODE_SIGNING_MONITOR
14947 fault_info->csm_associated = entry->csm_associated;
14948 #else
14949 fault_info->csm_associated = FALSE;
14950 #endif
14951
14952 fault_info->resilient_media = entry->vme_resilient_media;
14953 fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
14954 fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14955 #if __arm64e__
14956 fault_info->fi_used_for_tpro = entry->used_for_tpro;
14957 #else /* __arm64e__ */
14958 fault_info->fi_used_for_tpro = FALSE;
14959 #endif
14960 if (entry->translated_allow_execute) {
14961 fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14962 }
14963 }
14964
14965 /*
14966 * Lock the object to prevent it from disappearing
14967 */
14968 if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14969 if (contended == NULL) {
14970 vm_object_lock(*object);
14971 } else {
14972 *contended = vm_object_lock_check_contended(*object);
14973 }
14974 } else {
14975 vm_object_lock_shared(*object);
14976 }
14977
14978 /*
14979 * Save the version number
14980 */
14981
14982 out_version->main_timestamp = map->timestamp;
14983
14984 return KERN_SUCCESS;
14985 }
14986
14987
14988 /*
14989 * vm_map_verify:
14990 *
14991 * Verifies that the map in question has not changed
14992 * since the given version. The map has to be locked
14993 * ("shared" mode is fine) before calling this function
14994 * and it will be returned locked too.
14995 */
14996 boolean_t
14997 vm_map_verify(
14998 vm_map_t map,
14999 vm_map_version_t *version) /* REF */
15000 {
15001 boolean_t result;
15002
15003 vm_map_lock_assert_held(map);
15004 result = (map->timestamp == version->main_timestamp);
15005
15006 return result;
15007 }
15008
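/*
 * Illustrative use of vm_map_verify() (a sketch, not code from this
 * file): a caller saved a vm_map_version_t from a map lookup, dropped
 * the map lock to do some work, and now wants to know whether the
 * cached lookup results are still trustworthy.  The "retry_lookup"
 * label is hypothetical.
 *
 *	vm_map_lock_read(map);
 *	if (!vm_map_verify(map, &version)) {
 *		vm_map_unlock_read(map);
 *		goto retry_lookup;	// map changed while unlocked
 *	}
 *	// ... the previously returned object/offset are still valid ...
 *	vm_map_unlock_read(map);
 */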
15009
15010 /*
15011 * TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
15012 * Goes away after regular vm_region_recurse function migrates to
15013 * 64 bits
15014 * vm_region_recurse: A form of vm_region which follows the
15015 * submaps in a target map
15016 *
15017 */
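/*
 * For reference, the user-level counterpart that reaches this routine
 * looks roughly like the following (a sketch of the public Mach API,
 * not code from this file):
 *
 *	mach_vm_address_t addr = 0;
 *	mach_vm_size_t size = 0;
 *	natural_t depth = 0;	// how many submap levels may be descended
 *	vm_region_submap_info_data_64_t info;
 *	mach_msg_type_number_t cnt = VM_REGION_SUBMAP_INFO_COUNT_64;
 *
 *	kern_return_t kr = mach_vm_region_recurse(mach_task_self(),
 *	    &addr, &size, &depth,
 *	    (vm_region_recurse_info_t)&info, &cnt);
 *	// on success, [addr, addr + size) describes the region found at
 *	// or after the requested address, and "depth" reports how deep
 *	// in the submap chain it was found.
 */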
15018
15019 kern_return_t
15020 vm_map_region_recurse_64(
15021 vm_map_t map,
15022 vm_map_offset_ut *address_u, /* IN/OUT */
15023 vm_map_size_ut *size_u, /* OUT */
15024 natural_t *nesting_depth, /* IN/OUT */
15025 vm_region_submap_info_64_t submap_info, /* IN/OUT */
15026 mach_msg_type_number_t *count) /* IN/OUT */
15027 {
15028 mach_msg_type_number_t original_count;
15029 vm_region_extended_info_data_t extended;
15030 vm_map_entry_t tmp_entry;
15031 vm_map_offset_t user_address;
15032 unsigned int user_max_depth;
15033
15034 /*
15035 * "curr_entry" is the VM map entry preceding or including the
15036 * address we're looking for.
15037 * "curr_map" is the map or sub-map containing "curr_entry".
15038 * "curr_address" is the equivalent of the top map's "user_address"
15039 * in the current map.
15040 * "curr_offset" is the cumulated offset of "curr_map" in the
15041 * target task's address space.
15042 * "curr_depth" is the depth of "curr_map" in the chain of
15043 * sub-maps.
15044 *
15045 * "curr_max_below" and "curr_max_above" limit the range (around
15046 * "curr_address") we should take into account in the current (sub)map.
15047 * They limit the range to what's visible through the map entries
15048 * we've traversed from the top map to the current map.
15049 *
15050 */
15051 vm_map_entry_t curr_entry;
15052 vm_map_t curr_entry_submap;
15053 vm_map_address_t curr_entry_start;
15054 vm_object_offset_t curr_entry_offset;
15055 vm_map_address_t curr_address;
15056 vm_map_offset_t curr_offset;
15057 vm_map_t curr_map;
15058 unsigned int curr_depth;
15059 vm_map_offset_t curr_max_below, curr_max_above;
15060 vm_map_offset_t curr_skip;
15061
15062 /*
15063 * "next_" is the same as "curr_" but for the VM region immediately
15064 * after the address we're looking for. We need to keep track of this
15065 * too because we want to return info about that region if the
15066 * address we're looking for is not mapped.
15067 */
15068 vm_map_entry_t next_entry;
15069 vm_map_offset_t next_offset;
15070 vm_map_offset_t next_address;
15071 vm_map_t next_map;
15072 unsigned int next_depth;
15073 vm_map_offset_t next_max_below, next_max_above;
15074 vm_map_offset_t next_skip;
15075
15076 boolean_t look_for_pages;
15077 vm_region_submap_short_info_64_t short_info;
15078 boolean_t do_region_footprint;
15079 int effective_page_size, effective_page_shift;
15080 boolean_t submap_needed_copy;
15081
15082 if (map == VM_MAP_NULL) {
15083 /* no address space to work on */
15084 return KERN_INVALID_ARGUMENT;
15085 }
15086
15087 user_address = vm_sanitize_addr(map, *address_u);
15088
15089
15090 effective_page_shift = vm_self_region_page_shift(map);
15091 effective_page_size = (1 << effective_page_shift);
15092
15093 if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
15094 /*
15095 * "info" structure is not big enough and
15096 * would overflow
15097 */
15098 return KERN_INVALID_ARGUMENT;
15099 }
15100
15101 do_region_footprint = task_self_region_footprint();
15102 original_count = *count;
15103
15104 if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
15105 *count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
15106 look_for_pages = FALSE;
15107 short_info = (vm_region_submap_short_info_64_t) submap_info;
15108 submap_info = NULL;
15109 } else {
15110 look_for_pages = TRUE;
15111 *count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
15112 short_info = NULL;
15113
15114 if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15115 *count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
15116 }
15117 if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15118 *count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
15119 }
15120 }
15121
15122 user_max_depth = *nesting_depth;
15123 submap_needed_copy = FALSE;
15124
15125 if (not_in_kdp) {
15126 vm_map_lock_read(map);
15127 }
15128
15129 recurse_again:
15130 curr_entry = NULL;
15131 curr_map = map;
15132 curr_address = user_address;
15133 curr_offset = 0;
15134 curr_skip = 0;
15135 curr_depth = 0;
15136 curr_max_above = ((vm_map_offset_t) -1) - curr_address;
15137 curr_max_below = curr_address;
15138
15139 next_entry = NULL;
15140 next_map = NULL;
15141 next_address = 0;
15142 next_offset = 0;
15143 next_skip = 0;
15144 next_depth = 0;
15145 next_max_above = (vm_map_offset_t) -1;
15146 next_max_below = (vm_map_offset_t) -1;
15147
15148 for (;;) {
15149 if (vm_map_lookup_entry(curr_map,
15150 curr_address,
15151 &tmp_entry)) {
15152 /* tmp_entry contains the address we're looking for */
15153 curr_entry = tmp_entry;
15154 } else {
15155 vm_map_offset_t skip;
15156 /*
15157 * The address is not mapped. "tmp_entry" is the
15158 * map entry preceding the address. We want the next
15159 * one, if it exists.
15160 */
15161 curr_entry = tmp_entry->vme_next;
15162
15163 if (curr_entry == vm_map_to_entry(curr_map) ||
15164 (curr_entry->vme_start >=
15165 curr_address + curr_max_above)) {
15166 /* no next entry at this level: stop looking */
15167 if (not_in_kdp) {
15168 vm_map_unlock_read(curr_map);
15169 }
15170 curr_entry = NULL;
15171 curr_map = NULL;
15172 curr_skip = 0;
15173 curr_offset = 0;
15174 curr_depth = 0;
15175 curr_max_above = 0;
15176 curr_max_below = 0;
15177 break;
15178 }
15179
15180 /* adjust current address and offset */
15181 skip = curr_entry->vme_start - curr_address;
15182 curr_address = curr_entry->vme_start;
15183 curr_skip += skip;
15184 curr_offset += skip;
15185 curr_max_above -= skip;
15186 curr_max_below = 0;
15187 }
15188
15189 /*
15190 * Is the next entry at this level closer to the address (or
15191 * deeper in the submap chain) than the one we had
15192 * so far ?
15193 */
15194 tmp_entry = curr_entry->vme_next;
15195 if (tmp_entry == vm_map_to_entry(curr_map)) {
15196 /* no next entry at this level */
15197 } else if (tmp_entry->vme_start >=
15198 curr_address + curr_max_above) {
15199 /*
15200 * tmp_entry is beyond the scope of what we mapped of
15201 * this submap in the upper level: ignore it.
15202 */
15203 } else if ((next_entry == NULL) ||
15204 (tmp_entry->vme_start + curr_offset <=
15205 next_entry->vme_start + next_offset)) {
15206 /*
15207 * We didn't have a "next_entry" or this one is
15208 * closer to the address we're looking for:
15209 * use this "tmp_entry" as the new "next_entry".
15210 */
15211 if (next_entry != NULL) {
15212 /* unlock the last "next_map" */
15213 if (next_map != curr_map && not_in_kdp) {
15214 vm_map_unlock_read(next_map);
15215 }
15216 }
15217 next_entry = tmp_entry;
15218 next_map = curr_map;
15219 next_depth = curr_depth;
15220 next_address = next_entry->vme_start;
15221 next_skip = curr_skip;
15222 next_skip += (next_address - curr_address);
15223 next_offset = curr_offset;
15224 next_offset += (next_address - curr_address);
15225 next_max_above = MIN(next_max_above, curr_max_above);
15226 next_max_above = MIN(next_max_above,
15227 next_entry->vme_end - next_address);
15228 next_max_below = MIN(next_max_below, curr_max_below);
15229 next_max_below = MIN(next_max_below,
15230 next_address - next_entry->vme_start);
15231 }
15232
15233 /*
15234 * "curr_max_{above,below}" allow us to keep track of the
15235 * portion of the submap that is actually mapped at this level:
15236 * the rest of that submap is irrelevant to us, since it's not
15237 * mapped here.
15238 * The relevant portion of the map starts at
15239 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
15240 */
15241 curr_max_above = MIN(curr_max_above,
15242 curr_entry->vme_end - curr_address);
15243 curr_max_below = MIN(curr_max_below,
15244 curr_address - curr_entry->vme_start);
15245
15246 if (!curr_entry->is_sub_map ||
15247 curr_depth >= user_max_depth) {
15248 /*
15249 * We hit a leaf map or we reached the maximum depth
15250 * we could, so stop looking. Keep the current map
15251 * locked.
15252 */
15253 break;
15254 }
15255
15256 /*
15257 * Get down to the next submap level.
15258 */
15259
15260 if (curr_entry->needs_copy) {
15261 /* everything below this is effectively copy-on-write */
15262 submap_needed_copy = TRUE;
15263 }
15264
15265 /*
15266 * Lock the next level and unlock the current level,
15267 * unless we need to keep it locked to access the "next_entry"
15268 * later.
15269 */
15270 curr_entry_submap = VME_SUBMAP(curr_entry);
15271 curr_entry_start = curr_entry->vme_start;
15272 curr_entry_offset = VME_OFFSET(curr_entry);
15273 curr_entry = VM_MAP_ENTRY_NULL; /* no longer valid after unlocking the map */
15274 if (not_in_kdp) {
15275 vm_map_lock_read(curr_entry_submap);
15276 }
15277 if (curr_map == next_map) {
15278 /* keep "next_map" locked in case we need it */
15279 } else {
15280 /* release this map */
15281 if (not_in_kdp) {
15282 vm_map_unlock_read(curr_map);
15283 }
15284 }
15285
15286 /*
15287 * Adjust the offset. "curr_entry" mapped the submap
15288 * at relative address "curr_entry_start" in the
15289 * curr_map but skips the first "curr_entry_offset"
15290 * bytes of the submap.
15291 * "curr_offset" always represents the offset of a virtual
15292 * address in the curr_map relative to the absolute address
15293 * space (i.e. the top-level VM map).
15294 */
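/*
 * Small worked example (illustrative numbers only): if the submap is
 * mapped at curr_entry_start = 0x100000 in curr_map with
 * curr_entry_offset = 0x400000, an address X in curr_map corresponds
 * to X - 0x100000 + 0x400000 inside the submap.  So curr_offset grows
 * by 0x300000 and "curr_address = user_address + curr_offset" picks
 * the matching submap address on the next pass through the loop.
 */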
15295 curr_offset += curr_entry_offset - curr_entry_start;
15296 curr_address = user_address + curr_offset;
15297 /* switch to the submap */
15298 curr_map = curr_entry_submap;
15299 curr_depth++;
15300 }
15301
15302 // LP64todo: all the current tools are 32bit, obviously never worked for 64b
15303 // so probably should be a real 32b ID vs. ptr.
15304 // Current users just check for equality
15305
15306 if (curr_entry == NULL) {
15307 /* no VM region contains the address... */
15308
15309 if (do_region_footprint && /* we want footprint numbers */
15310 next_entry == NULL && /* & there are no more regions */
15311 /* & we haven't already provided our fake region: */
15312 user_address <= vm_map_last_entry(map)->vme_end) {
15313 ledger_amount_t ledger_resident, ledger_compressed;
15314
15315 /*
15316 * Add a fake memory region to account for
15317 * purgeable and/or ledger-tagged memory that
15318 * counts towards this task's memory footprint,
15319 * i.e. the resident/compressed pages of non-volatile
15320 * objects owned by that task.
15321 */
15322 task_ledgers_footprint(map->pmap->ledger,
15323 &ledger_resident,
15324 &ledger_compressed);
15325 if (ledger_resident + ledger_compressed == 0) {
15326 /* no purgeable memory usage to report */
15327 return KERN_INVALID_ADDRESS;
15328 }
15329 /* fake region to show nonvolatile footprint */
15330 if (look_for_pages) {
15331 submap_info->protection = VM_PROT_DEFAULT;
15332 submap_info->max_protection = VM_PROT_DEFAULT;
15333 submap_info->inheritance = VM_INHERIT_DEFAULT;
15334 submap_info->offset = 0;
15335 submap_info->user_tag = -1;
15336 submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
15337 submap_info->pages_shared_now_private = 0;
15338 submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
15339 submap_info->pages_dirtied = submap_info->pages_resident;
15340 submap_info->ref_count = 1;
15341 submap_info->shadow_depth = 0;
15342 submap_info->external_pager = 0;
15343 submap_info->share_mode = SM_PRIVATE;
15344 if (submap_needed_copy) {
15345 submap_info->share_mode = SM_COW;
15346 }
15347 submap_info->is_submap = 0;
15348 submap_info->behavior = VM_BEHAVIOR_DEFAULT;
15349 submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15350 submap_info->user_wired_count = 0;
15351 submap_info->pages_reusable = 0;
15352 } else {
15353 short_info->user_tag = -1;
15354 short_info->offset = 0;
15355 short_info->protection = VM_PROT_DEFAULT;
15356 short_info->inheritance = VM_INHERIT_DEFAULT;
15357 short_info->max_protection = VM_PROT_DEFAULT;
15358 short_info->behavior = VM_BEHAVIOR_DEFAULT;
15359 short_info->user_wired_count = 0;
15360 short_info->is_submap = 0;
15361 short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15362 short_info->external_pager = 0;
15363 short_info->shadow_depth = 0;
15364 short_info->share_mode = SM_PRIVATE;
15365 if (submap_needed_copy) {
15366 short_info->share_mode = SM_COW;
15367 }
15368 short_info->ref_count = 1;
15369 }
15370 *nesting_depth = 0;
15371 *address_u = vm_sanitize_wrap_addr(vm_map_last_entry(map)->vme_end);
15372 *size_u = vm_sanitize_wrap_size(ledger_resident + ledger_compressed);
15373 return KERN_SUCCESS;
15374 }
15375
15376 if (next_entry == NULL) {
15377 /* ... and no VM region follows it either */
15378 return KERN_INVALID_ADDRESS;
15379 }
15380 /* ... gather info about the next VM region */
15381 curr_entry = next_entry;
15382 curr_map = next_map; /* still locked ... */
15383 curr_address = next_address;
15384 curr_skip = next_skip;
15385 curr_offset = next_offset;
15386 curr_depth = next_depth;
15387 curr_max_above = next_max_above;
15388 curr_max_below = next_max_below;
15389 } else {
15390 /* we won't need "next_entry" after all */
15391 if (next_entry != NULL) {
15392 /* release "next_map" */
15393 if (next_map != curr_map && not_in_kdp) {
15394 vm_map_unlock_read(next_map);
15395 }
15396 }
15397 }
15398 next_entry = NULL;
15399 next_map = NULL;
15400 next_offset = 0;
15401 next_skip = 0;
15402 next_depth = 0;
15403 next_max_below = -1;
15404 next_max_above = -1;
15405
15406 if (curr_entry->is_sub_map &&
15407 curr_depth < user_max_depth) {
15408 /*
15409 * We're not as deep as we could be: we must have
15410 * gone back up after not finding anything mapped
15411 * below the original top-level map entry's range.
15412 * Let's move "curr_address" forward and recurse again.
15413 */
15414 user_address = curr_address;
15415 goto recurse_again;
15416 }
15417
15418 *nesting_depth = curr_depth;
15419 *address_u = vm_sanitize_wrap_addr(
15420 user_address + curr_skip - curr_max_below);
15421 *size_u = vm_sanitize_wrap_size(curr_max_above + curr_max_below);
15422
15423 if (look_for_pages) {
15424 submap_info->user_tag = VME_ALIAS(curr_entry);
15425 submap_info->offset = VME_OFFSET(curr_entry);
15426 submap_info->protection = curr_entry->protection;
15427 submap_info->inheritance = curr_entry->inheritance;
15428 submap_info->max_protection = curr_entry->max_protection;
15429 submap_info->behavior = curr_entry->behavior;
15430 submap_info->user_wired_count = curr_entry->user_wired_count;
15431 submap_info->is_submap = curr_entry->is_sub_map;
15432 if (curr_entry->is_sub_map) {
15433 submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15434 } else {
15435 submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15436 }
15437 } else {
15438 short_info->user_tag = VME_ALIAS(curr_entry);
15439 short_info->offset = VME_OFFSET(curr_entry);
15440 short_info->protection = curr_entry->protection;
15441 short_info->inheritance = curr_entry->inheritance;
15442 short_info->max_protection = curr_entry->max_protection;
15443 short_info->behavior = curr_entry->behavior;
15444 short_info->user_wired_count = curr_entry->user_wired_count;
15445 short_info->is_submap = curr_entry->is_sub_map;
15446 if (curr_entry->is_sub_map) {
15447 short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15448 } else {
15449 short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15450 }
15451 }
15452
15453 extended.pages_resident = 0;
15454 extended.pages_swapped_out = 0;
15455 extended.pages_shared_now_private = 0;
15456 extended.pages_dirtied = 0;
15457 extended.pages_reusable = 0;
15458 extended.external_pager = 0;
15459 extended.shadow_depth = 0;
15460 extended.share_mode = SM_EMPTY;
15461 extended.ref_count = 0;
15462
15463 if (not_in_kdp) {
15464 if (!curr_entry->is_sub_map) {
15465 vm_map_offset_t range_start, range_end;
15466 range_start = MAX((curr_address - curr_max_below),
15467 curr_entry->vme_start);
15468 range_end = MIN((curr_address + curr_max_above),
15469 curr_entry->vme_end);
15470 vm_map_region_walk(curr_map,
15471 range_start,
15472 curr_entry,
15473 (VME_OFFSET(curr_entry) +
15474 (range_start -
15475 curr_entry->vme_start)),
15476 range_end - range_start,
15477 &extended,
15478 look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
15479 if (submap_needed_copy) {
15480 extended.share_mode = SM_COW;
15481 }
15482 } else {
15483 if (curr_entry->use_pmap) {
15484 extended.share_mode = SM_TRUESHARED;
15485 } else {
15486 extended.share_mode = SM_PRIVATE;
15487 }
15488 extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
15489 }
15490 }
15491
15492 if (look_for_pages) {
15493 submap_info->pages_resident = extended.pages_resident;
15494 submap_info->pages_swapped_out = extended.pages_swapped_out;
15495 submap_info->pages_shared_now_private =
15496 extended.pages_shared_now_private;
15497 submap_info->pages_dirtied = extended.pages_dirtied;
15498 submap_info->external_pager = extended.external_pager;
15499 submap_info->shadow_depth = extended.shadow_depth;
15500 submap_info->share_mode = extended.share_mode;
15501 submap_info->ref_count = extended.ref_count;
15502
15503 if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15504 submap_info->pages_reusable = extended.pages_reusable;
15505 }
15506 if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15507 if (curr_entry->is_sub_map) {
15508 submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_SUBMAP(curr_entry));
15509 } else if (VME_OBJECT(curr_entry)) {
15510 submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_OBJECT(curr_entry));
15511 } else {
15512 submap_info->object_id_full = 0ull;
15513 }
15514 }
15515 } else {
15516 short_info->external_pager = extended.external_pager;
15517 short_info->shadow_depth = extended.shadow_depth;
15518 short_info->share_mode = extended.share_mode;
15519 short_info->ref_count = extended.ref_count;
15520 }
15521
15522 if (not_in_kdp) {
15523 vm_map_unlock_read(curr_map);
15524 }
15525
15526 return KERN_SUCCESS;
15527 }
15528
15529 /*
15530 * vm_region:
15531 *
15532 * User call to obtain information about a region in
15533 * a task's address map. Currently, only one flavor is
15534 * supported.
15535 *
15536 * XXX The reserved and behavior fields cannot be filled
15537 * in until the vm merge from the IK is completed, and
15538 * vm_reserve is implemented.
15539 */
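/*
 * Roughly how this is reached from user space (a sketch of the public
 * Mach interface, not code from this file):
 *
 *	mach_vm_address_t addr = some_address;	// hypothetical
 *	mach_vm_size_t size;
 *	vm_region_basic_info_data_64_t info;
 *	mach_msg_type_number_t cnt = VM_REGION_BASIC_INFO_COUNT_64;
 *	mach_port_t obj_name;	// not populated; returned as MACH_PORT_NULL
 *
 *	kern_return_t kr = mach_vm_region(mach_task_self(), &addr, &size,
 *	    VM_REGION_BASIC_INFO_64, (vm_region_info_t)&info, &cnt,
 *	    &obj_name);
 */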
15540
15541 kern_return_t
15542 vm_map_region(
15543 vm_map_t map,
15544 vm_map_offset_ut *address_u, /* IN/OUT */
15545 vm_map_size_ut *size_u, /* OUT */
15546 vm_region_flavor_t flavor, /* IN */
15547 vm_region_info_t info, /* OUT */
15548 mach_msg_type_number_t *count, /* IN/OUT */
15549 mach_port_t *object_name) /* OUT */
15550 {
15551 vm_map_entry_t tmp_entry;
15552 vm_map_entry_t entry;
15553 vm_map_offset_t start;
15554
15555 if (map == VM_MAP_NULL) {
15556 return KERN_INVALID_ARGUMENT;
15557 }
15558
15559 start = vm_sanitize_addr(map, *address_u);
15560
15561
15562 switch (flavor) {
15563 case VM_REGION_BASIC_INFO:
15564 /* legacy for old 32-bit objects info */
15565 {
15566 vm_region_basic_info_t basic;
15567
15568 if (*count < VM_REGION_BASIC_INFO_COUNT) {
15569 return KERN_INVALID_ARGUMENT;
15570 }
15571
15572 basic = (vm_region_basic_info_t) info;
15573 *count = VM_REGION_BASIC_INFO_COUNT;
15574
15575 vm_map_lock_read(map);
15576
15577 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15578 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15579 vm_map_unlock_read(map);
15580 return KERN_INVALID_ADDRESS;
15581 }
15582 } else {
15583 entry = tmp_entry;
15584 }
15585
15586 start = entry->vme_start;
15587
15588 basic->offset = (uint32_t)VME_OFFSET(entry);
15589 basic->protection = entry->protection;
15590 basic->inheritance = entry->inheritance;
15591 basic->max_protection = entry->max_protection;
15592 basic->behavior = entry->behavior;
15593 basic->user_wired_count = entry->user_wired_count;
15594 basic->reserved = entry->is_sub_map;
15595
15596 *address_u = vm_sanitize_wrap_addr(start);
15597 *size_u = vm_sanitize_wrap_size(entry->vme_end - start);
15598
15599 if (object_name) {
15600 *object_name = IP_NULL;
15601 }
15602 if (entry->is_sub_map) {
15603 basic->shared = FALSE;
15604 } else {
15605 basic->shared = entry->is_shared;
15606 }
15607
15608 vm_map_unlock_read(map);
15609 return KERN_SUCCESS;
15610 }
15611
15612 case VM_REGION_BASIC_INFO_64:
15613 {
15614 vm_region_basic_info_64_t basic;
15615
15616 if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
15617 return KERN_INVALID_ARGUMENT;
15618 }
15619
15620 basic = (vm_region_basic_info_64_t) info;
15621 *count = VM_REGION_BASIC_INFO_COUNT_64;
15622
15623 vm_map_lock_read(map);
15624
15625 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15626 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15627 vm_map_unlock_read(map);
15628 return KERN_INVALID_ADDRESS;
15629 }
15630 } else {
15631 entry = tmp_entry;
15632 }
15633
15634 start = entry->vme_start;
15635
15636 basic->offset = VME_OFFSET(entry);
15637 basic->protection = entry->protection;
15638 basic->inheritance = entry->inheritance;
15639 basic->max_protection = entry->max_protection;
15640 basic->behavior = entry->behavior;
15641 basic->user_wired_count = entry->user_wired_count;
15642 basic->reserved = entry->is_sub_map;
15643
15644 *address_u = vm_sanitize_wrap_addr(start);
15645 *size_u = vm_sanitize_wrap_size(entry->vme_end - start);
15646
15647 if (object_name) {
15648 *object_name = IP_NULL;
15649 }
15650 if (entry->is_sub_map) {
15651 basic->shared = FALSE;
15652 } else {
15653 basic->shared = entry->is_shared;
15654 }
15655
15656 vm_map_unlock_read(map);
15657 return KERN_SUCCESS;
15658 }
15659 case VM_REGION_EXTENDED_INFO:
15660 if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
15661 return KERN_INVALID_ARGUMENT;
15662 }
15663 OS_FALLTHROUGH;
15664 case VM_REGION_EXTENDED_INFO__legacy:
15665 {
15666 vm_region_extended_info_t extended;
15667 mach_msg_type_number_t original_count;
15668 int effective_page_size, effective_page_shift;
15669
15670 if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
15671 return KERN_INVALID_ARGUMENT;
15672 }
15673
15674 extended = (vm_region_extended_info_t) info;
15675
15676 effective_page_shift = vm_self_region_page_shift(map);
15677 effective_page_size = (1 << effective_page_shift);
15678
15679 vm_map_lock_read(map);
15680
15681 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15682 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15683 vm_map_unlock_read(map);
15684 return KERN_INVALID_ADDRESS;
15685 }
15686 } else {
15687 entry = tmp_entry;
15688 }
15689 start = entry->vme_start;
15690
15691 extended->protection = entry->protection;
15692 extended->user_tag = VME_ALIAS(entry);
15693 extended->pages_resident = 0;
15694 extended->pages_swapped_out = 0;
15695 extended->pages_shared_now_private = 0;
15696 extended->pages_dirtied = 0;
15697 extended->external_pager = 0;
15698 extended->shadow_depth = 0;
15699
15700 original_count = *count;
15701 if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
15702 *count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
15703 } else {
15704 extended->pages_reusable = 0;
15705 *count = VM_REGION_EXTENDED_INFO_COUNT;
15706 }
15707
15708 vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
15709
15710 if (object_name) {
15711 *object_name = IP_NULL;
15712 }
15713
15714 *address_u = vm_sanitize_wrap_addr(start);
15715 *size_u = vm_sanitize_wrap_size(entry->vme_end - start);
15716
15717 vm_map_unlock_read(map);
15718 return KERN_SUCCESS;
15719 }
15720 case VM_REGION_TOP_INFO:
15721 {
15722 vm_region_top_info_t top;
15723
15724 if (*count < VM_REGION_TOP_INFO_COUNT) {
15725 return KERN_INVALID_ARGUMENT;
15726 }
15727
15728 top = (vm_region_top_info_t) info;
15729 *count = VM_REGION_TOP_INFO_COUNT;
15730
15731 vm_map_lock_read(map);
15732
15733 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15734 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15735 vm_map_unlock_read(map);
15736 return KERN_INVALID_ADDRESS;
15737 }
15738 } else {
15739 entry = tmp_entry;
15740 }
15741 start = entry->vme_start;
15742
15743 top->private_pages_resident = 0;
15744 top->shared_pages_resident = 0;
15745
15746 vm_map_region_top_walk(entry, top);
15747
15748 if (object_name) {
15749 *object_name = IP_NULL;
15750 }
15751
15752 *address_u = vm_sanitize_wrap_addr(start);
15753 *size_u = vm_sanitize_wrap_size(entry->vme_end - start);
15754
15755 vm_map_unlock_read(map);
15756 return KERN_SUCCESS;
15757 }
15758 default:
15759 return KERN_INVALID_ARGUMENT;
15760 }
15761 }
15762
15763 #define OBJ_RESIDENT_COUNT(obj, entry_size) \
15764 MIN((entry_size), \
15765 ((obj)->all_reusable ? \
15766 (obj)->wired_page_count : \
15767 (obj)->resident_page_count - (obj)->reusable_page_count))
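/*
 * OBJ_RESIDENT_COUNT() reports how many of the object's pages should
 * count as resident for this entry: for an "all reusable" object only
 * the wired pages count, otherwise resident minus reusable pages, and
 * the result is capped at the number of pages spanned by the entry.
 * For example (illustrative numbers): an object with 100 resident
 * pages, 30 of them reusable, mapped by a 50-page entry yields
 * MIN(50, 100 - 30) = 50.
 */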
15768
15769 void
15770 vm_map_region_top_walk(
15771 vm_map_entry_t entry,
15772 vm_region_top_info_t top)
15773 {
15774 if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
15775 top->share_mode = SM_EMPTY;
15776 top->ref_count = 0;
15777 top->obj_id = 0;
15778 return;
15779 }
15780
15781 {
15782 struct vm_object *obj, *tmp_obj;
15783 int ref_count;
15784 uint32_t entry_size;
15785
15786 entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);
15787
15788 obj = VME_OBJECT(entry);
15789
15790 vm_object_lock(obj);
15791
15792 if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15793 obj->paging_in_progress) {
15794 ref_count--;
15795 }
15796
15797 assert(obj->reusable_page_count <= obj->resident_page_count);
15798 if (obj->shadow) {
15799 if (ref_count == 1) {
15800 top->private_pages_resident =
15801 OBJ_RESIDENT_COUNT(obj, entry_size);
15802 } else {
15803 top->shared_pages_resident =
15804 OBJ_RESIDENT_COUNT(obj, entry_size);
15805 }
15806 top->ref_count = ref_count;
15807 top->share_mode = SM_COW;
15808
15809 while ((tmp_obj = obj->shadow)) {
15810 vm_object_lock(tmp_obj);
15811 vm_object_unlock(obj);
15812 obj = tmp_obj;
15813
15814 if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15815 obj->paging_in_progress) {
15816 ref_count--;
15817 }
15818
15819 assert(obj->reusable_page_count <= obj->resident_page_count);
15820 top->shared_pages_resident +=
15821 OBJ_RESIDENT_COUNT(obj, entry_size);
15822 top->ref_count += ref_count - 1;
15823 }
15824 } else {
15825 if (entry->superpage_size) {
15826 top->share_mode = SM_LARGE_PAGE;
15827 top->shared_pages_resident = 0;
15828 top->private_pages_resident = entry_size;
15829 } else if (entry->needs_copy) {
15830 top->share_mode = SM_COW;
15831 top->shared_pages_resident =
15832 OBJ_RESIDENT_COUNT(obj, entry_size);
15833 } else {
15834 if (ref_count == 1 ||
15835 (ref_count == 2 && obj->named)) {
15836 top->share_mode = SM_PRIVATE;
15837 top->private_pages_resident =
15838 OBJ_RESIDENT_COUNT(obj,
15839 entry_size);
15840 } else {
15841 top->share_mode = SM_SHARED;
15842 top->shared_pages_resident =
15843 OBJ_RESIDENT_COUNT(obj,
15844 entry_size);
15845 }
15846 }
15847 top->ref_count = ref_count;
15848 }
15849
15850 vm_object_unlock(obj);
15851
15852 /* XXX K64: obj_id will be truncated */
15853 top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRHASH(obj);
15854 }
15855 }
15856
15857 void
15858 vm_map_region_walk(
15859 vm_map_t map,
15860 vm_map_offset_t va,
15861 vm_map_entry_t entry,
15862 vm_object_offset_t offset,
15863 vm_object_size_t range,
15864 vm_region_extended_info_t extended,
15865 boolean_t look_for_pages,
15866 mach_msg_type_number_t count)
15867 {
15868 struct vm_object *obj, *tmp_obj;
15869 vm_map_offset_t last_offset;
15870 int i;
15871 int ref_count;
15872 struct vm_object *shadow_object;
15873 unsigned short shadow_depth;
15874 boolean_t do_region_footprint;
15875 int effective_page_size, effective_page_shift;
15876 vm_map_offset_t effective_page_mask;
15877
15878 do_region_footprint = task_self_region_footprint();
15879
15880 if ((entry->is_sub_map) ||
15881 (VME_OBJECT(entry) == 0) ||
15882 (VME_OBJECT(entry)->phys_contiguous &&
15883 !entry->superpage_size)) {
15884 extended->share_mode = SM_EMPTY;
15885 extended->ref_count = 0;
15886 return;
15887 }
15888
15889 if (entry->superpage_size) {
15890 extended->shadow_depth = 0;
15891 extended->share_mode = SM_LARGE_PAGE;
15892 extended->ref_count = 1;
15893 extended->external_pager = 0;
15894
15895 /* TODO4K: Superpage in 4k mode? */
15896 extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
15897 extended->shadow_depth = 0;
15898 return;
15899 }
15900
15901 effective_page_shift = vm_self_region_page_shift(map);
15902 effective_page_size = (1 << effective_page_shift);
15903 effective_page_mask = effective_page_size - 1;
15904
15905 offset = vm_map_trunc_page(offset, effective_page_mask);
15906
15907 obj = VME_OBJECT(entry);
15908
15909 vm_object_lock(obj);
15910
15911 if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15912 obj->paging_in_progress) {
15913 ref_count--;
15914 }
15915
15916 if (look_for_pages) {
15917 for (last_offset = offset + range;
15918 offset < last_offset;
15919 offset += effective_page_size, va += effective_page_size) {
15920 if (do_region_footprint) {
15921 int disp;
15922
15923 disp = 0;
15924 if (map->has_corpse_footprint) {
15925 /*
15926 * Query the page info data we saved
15927 * while forking the corpse.
15928 */
15929 vm_map_corpse_footprint_query_page_info(
15930 map,
15931 va,
15932 &disp);
15933 } else {
15934 /*
15935 * Query the pmap.
15936 */
15937 vm_map_footprint_query_page_info(
15938 map,
15939 entry,
15940 va,
15941 &disp);
15942 }
15943 if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
15944 extended->pages_resident++;
15945 }
15946 if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
15947 extended->pages_reusable++;
15948 }
15949 if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
15950 extended->pages_dirtied++;
15951 }
15952 if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
15953 extended->pages_swapped_out++;
15954 }
15955 continue;
15956 }
15957
15958 vm_map_region_look_for_page(map, va, obj,
15959 vm_object_trunc_page(offset), ref_count,
15960 0, extended, count);
15961 }
15962
15963 if (do_region_footprint) {
15964 goto collect_object_info;
15965 }
15966 } else {
15967 collect_object_info:
15968 shadow_object = obj->shadow;
15969 shadow_depth = 0;
15970
15971 if (!(obj->internal)) {
15972 extended->external_pager = 1;
15973 }
15974
15975 if (shadow_object != VM_OBJECT_NULL) {
15976 vm_object_lock(shadow_object);
15977 for (;
15978 shadow_object != VM_OBJECT_NULL;
15979 shadow_depth++) {
15980 vm_object_t next_shadow;
15981
15982 if (!(shadow_object->internal)) {
15983 extended->external_pager = 1;
15984 }
15985
15986 next_shadow = shadow_object->shadow;
15987 if (next_shadow) {
15988 vm_object_lock(next_shadow);
15989 }
15990 vm_object_unlock(shadow_object);
15991 shadow_object = next_shadow;
15992 }
15993 }
15994 extended->shadow_depth = shadow_depth;
15995 }
15996
15997 if (extended->shadow_depth || entry->needs_copy) {
15998 extended->share_mode = SM_COW;
15999 } else {
16000 if (ref_count == 1) {
16001 extended->share_mode = SM_PRIVATE;
16002 } else {
16003 if (obj->true_share) {
16004 extended->share_mode = SM_TRUESHARED;
16005 } else {
16006 extended->share_mode = SM_SHARED;
16007 }
16008 }
16009 }
16010 extended->ref_count = ref_count - extended->shadow_depth;
16011
16012 for (i = 0; i < extended->shadow_depth; i++) {
16013 if ((tmp_obj = obj->shadow) == 0) {
16014 break;
16015 }
16016 vm_object_lock(tmp_obj);
16017 vm_object_unlock(obj);
16018
16019 if ((ref_count = os_ref_get_count_raw(&tmp_obj->ref_count)) > 1 &&
16020 tmp_obj->paging_in_progress) {
16021 ref_count--;
16022 }
16023
16024 extended->ref_count += ref_count;
16025 obj = tmp_obj;
16026 }
16027 vm_object_unlock(obj);
16028
16029 if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
16030 extended->share_mode = SM_PRIVATE;
16031 } else if (extended->share_mode == SM_SHARED && !(task_self_region_info_flags() & VM_REGION_INFO_FLAGS_NO_ALIASED)) {
16032 vm_map_entry_t cur;
16033 vm_map_entry_t last;
16034 int my_refs;
16035
16036 obj = VME_OBJECT(entry);
16037 last = vm_map_to_entry(map);
16038 my_refs = 0;
16039
16040 if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
16041 obj->paging_in_progress) {
16042 ref_count--;
16043 }
16044 for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
16045 if (vm_map_region_has_obj_ref(cur, obj)) {
16046 my_refs++;
16047 }
16048 }
16049
16050 if (my_refs == ref_count) {
16051 extended->share_mode = SM_PRIVATE_ALIASED;
16052 } else if (my_refs > 1) {
16053 extended->share_mode = SM_SHARED_ALIASED;
16054 }
16055 }
16056 }
16057
16058
16059 /* object is locked on entry and locked on return */
16060
16061
16062 static void
16063 vm_map_region_look_for_page(
16064 __unused vm_map_t map,
16065 __unused vm_map_offset_t va,
16066 vm_object_t object,
16067 vm_object_offset_t offset,
16068 int max_refcnt,
16069 unsigned short depth,
16070 vm_region_extended_info_t extended,
16071 mach_msg_type_number_t count)
16072 {
16073 vm_page_t p;
16074 vm_object_t shadow;
16075 int ref_count;
16076 vm_object_t caller_object;
16077
16078 shadow = object->shadow;
16079 caller_object = object;
16080
16081
16082 while (TRUE) {
16083 if (!(object->internal)) {
16084 extended->external_pager = 1;
16085 }
16086
16087 if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
16088 if (shadow && (max_refcnt == 1)) {
16089 extended->pages_shared_now_private++;
16090 }
16091
16092 if (!vm_page_is_fictitious(p) &&
16093 (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
16094 extended->pages_dirtied++;
16095 } else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
16096 if (p->vmp_reusable || object->all_reusable) {
16097 extended->pages_reusable++;
16098 }
16099 }
16100
16101 extended->pages_resident++;
16102
16103 if (object != caller_object) {
16104 vm_object_unlock(object);
16105 }
16106
16107 return;
16108 }
16109 if (object->internal &&
16110 object->alive &&
16111 !object->terminating &&
16112 object->pager_ready) {
16113 if (vm_object_compressor_pager_state_get(object, offset)
16114 == VM_EXTERNAL_STATE_EXISTS) {
16115 /* the pager has that page */
16116 extended->pages_swapped_out++;
16117 if (object != caller_object) {
16118 vm_object_unlock(object);
16119 }
16120 return;
16121 }
16122 }
16123
16124 if (shadow) {
16125 vm_object_lock(shadow);
16126 if ((ref_count = os_ref_get_count_raw(&shadow->ref_count)) > 1 &&
16127 shadow->paging_in_progress) {
16128 ref_count--;
16129 }
16130
16131 if (++depth > extended->shadow_depth) {
16132 extended->shadow_depth = depth;
16133 }
16134
16135 if (ref_count > max_refcnt) {
16136 max_refcnt = ref_count;
16137 }
16138
16139 if (object != caller_object) {
16140 vm_object_unlock(object);
16141 }
16142
16143 offset = offset + object->vo_shadow_offset;
16144 object = shadow;
16145 shadow = object->shadow;
16146 continue;
16147 }
16148 if (object != caller_object) {
16149 vm_object_unlock(object);
16150 }
16151 break;
16152 }
16153 }
16154
16155 static inline boolean_t
16156 vm_map_region_has_obj_ref(
16157 vm_map_entry_t entry,
16158 vm_object_t object)
16159 {
16160 vm_object_t cur_obj;
16161 vm_object_t shadow_obj;
16162
16163 if (entry->is_sub_map) {
16164 return FALSE;
16165 }
16166
16167 cur_obj = VME_OBJECT(entry);
16168 if (cur_obj == VM_OBJECT_NULL) {
16169 return FALSE;
16170 } else if (cur_obj == object) {
16171 return TRUE;
16172 }
16173
16174 /*
16175 * Avoid locks for first shadow check, otherwise diagnostic tools will
16176 * spend most of their time obtaining locks in this function when analyzing
16177 * processes with many VM entries which may commonly have no shadow chain.
16178 *
16179 * This is acceptable because:
16180 * - Shadow's fields are not accessed outside of its lock
16181 * - Objects are unlikely to be modified due to:
16182 * - Many diagnostic tools suspend the task
16183 * - VM map is locked
16184 * - The rare incorrect return from this function turns a guess into a
16185 * slightly worse guess
16186 * - Entire shadow chain is not locked as a whole, so can still change
16187 * while traversing, resulting in incorrect guess even with locking
16188 */
16189 shadow_obj = cur_obj->shadow;
16190 if (shadow_obj == VM_OBJECT_NULL) {
16191 return FALSE;
16192 } else if (shadow_obj == object) {
16193 return TRUE;
16194 }
16195
16196 vm_object_lock(cur_obj);
16197
16198 while ((shadow_obj = cur_obj->shadow)) {
16199 /* check if object was found before grabbing a lock */
16200 if (shadow_obj == object) {
16201 vm_object_unlock(cur_obj);
16202 return TRUE;
16203 }
16204
16205 vm_object_lock(shadow_obj);
16206 vm_object_unlock(cur_obj);
16207 cur_obj = shadow_obj;
16208 }
16209
16210 /* exhausted the shadow chain */
16211 vm_object_unlock(cur_obj);
16212 return FALSE;
16213 }
16214
16215
16216 /*
16217 * Routine: vm_map_simplify
16218 *
16219 * Description:
16220 * Attempt to simplify the map representation in
16221 * the vicinity of the given starting address.
16222 * Note:
16223 * This routine is intended primarily to keep the
16224 * kernel maps more compact -- they generally don't
16225 * benefit from the "expand a map entry" technology
16226 * at allocation time because the adjacent entry
16227 * is often wired down.
16228 */
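/*
 * For example (illustrative): two adjacent entries
 *
 *	[0x1000, 0x3000) -> object A, offset 0x0
 *	[0x3000, 0x5000) -> object A, offset 0x2000
 *
 * with identical protections, inheritance, wiring, etc. can be
 * collapsed by vm_map_simplify_entry() into a single entry
 * [0x1000, 0x5000) -> object A, offset 0x0, shrinking the map's
 * entry list without changing what is mapped.
 */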
16229 void
16230 vm_map_simplify_entry(
16231 vm_map_t map,
16232 vm_map_entry_t this_entry)
16233 {
16234 vm_map_entry_t prev_entry;
16235
16236 prev_entry = this_entry->vme_prev;
16237
16238 if ((this_entry != vm_map_to_entry(map)) &&
16239 (prev_entry != vm_map_to_entry(map)) &&
16240
16241 (prev_entry->vme_end == this_entry->vme_start) &&
16242
16243 (prev_entry->is_sub_map == this_entry->is_sub_map) &&
16244 (prev_entry->vme_object_value == this_entry->vme_object_value) &&
16245 (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
16246 ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
16247 prev_entry->vme_start))
16248 == VME_OFFSET(this_entry)) &&
16249
16250 (prev_entry->behavior == this_entry->behavior) &&
16251 (prev_entry->needs_copy == this_entry->needs_copy) &&
16252 (prev_entry->protection == this_entry->protection) &&
16253 (prev_entry->max_protection == this_entry->max_protection) &&
16254 (prev_entry->inheritance == this_entry->inheritance) &&
16255 (prev_entry->use_pmap == this_entry->use_pmap) &&
16256 (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
16257 (prev_entry->no_cache == this_entry->no_cache) &&
16258 (prev_entry->vme_permanent == this_entry->vme_permanent) &&
16259 (prev_entry->map_aligned == this_entry->map_aligned) &&
16260 (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
16261 (prev_entry->used_for_jit == this_entry->used_for_jit) &&
16262 #if __arm64e__
16263 (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
16264 #endif
16265 (prev_entry->csm_associated == this_entry->csm_associated) &&
16266 (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
16267 (prev_entry->iokit_acct == this_entry->iokit_acct) &&
16268 (prev_entry->vme_resilient_codesign ==
16269 this_entry->vme_resilient_codesign) &&
16270 (prev_entry->vme_resilient_media ==
16271 this_entry->vme_resilient_media) &&
16272 (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
16273 (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&
16274
16275 (prev_entry->wired_count == this_entry->wired_count) &&
16276 (prev_entry->user_wired_count == this_entry->user_wired_count) &&
16277
16278 ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
16279 (prev_entry->in_transition == FALSE) &&
16280 (this_entry->in_transition == FALSE) &&
16281 (prev_entry->needs_wakeup == FALSE) &&
16282 (this_entry->needs_wakeup == FALSE) &&
16283 (prev_entry->is_shared == this_entry->is_shared) &&
16284 (prev_entry->superpage_size == FALSE) &&
16285 (this_entry->superpage_size == FALSE)
16286 ) {
16287 if (prev_entry->vme_permanent) {
16288 assert(this_entry->vme_permanent);
16289 prev_entry->vme_permanent = false;
16290 }
16291 vm_map_store_entry_unlink(map, prev_entry, true);
16292 assert(prev_entry->vme_start < this_entry->vme_end);
16293 if (prev_entry->map_aligned) {
16294 assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
16295 VM_MAP_PAGE_MASK(map)));
16296 }
16297 this_entry->vme_start = prev_entry->vme_start;
16298 VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));
16299
16300 if (map->holelistenabled) {
16301 vm_map_store_update_first_free(map, this_entry, TRUE);
16302 }
16303
16304 if (prev_entry->is_sub_map) {
16305 vm_map_deallocate(VME_SUBMAP(prev_entry));
16306 } else {
16307 vm_object_deallocate(VME_OBJECT(prev_entry));
16308 }
16309 vm_map_entry_dispose(prev_entry);
16310 SAVE_HINT_MAP_WRITE(map, this_entry);
16311 }
16312 }
16313
16314 void
16315 vm_map_simplify(
16316 vm_map_t map,
16317 vm_map_offset_t start)
16318 {
16319 vm_map_entry_t this_entry;
16320
16321 vm_map_lock(map);
16322 if (vm_map_lookup_entry(map, start, &this_entry)) {
16323 vm_map_simplify_entry(map, this_entry);
16324 vm_map_simplify_entry(map, this_entry->vme_next);
16325 }
16326 vm_map_unlock(map);
16327 }
16328
16329 static void
16330 vm_map_simplify_range(
16331 vm_map_t map,
16332 vm_map_offset_t start,
16333 vm_map_offset_t end)
16334 {
16335 vm_map_entry_t entry;
16336
16337 /*
16338 * The map should be locked (for "write") by the caller.
16339 */
16340
16341 if (start >= end) {
16342 /* invalid address range */
16343 return;
16344 }
16345
16346 start = vm_map_trunc_page(start,
16347 VM_MAP_PAGE_MASK(map));
16348 end = vm_map_round_page(end,
16349 VM_MAP_PAGE_MASK(map));
16350
16351 if (!vm_map_lookup_entry(map, start, &entry)) {
16352 /* "start" is not mapped and "entry" ends before "start" */
16353 if (entry == vm_map_to_entry(map)) {
16354 /* start with first entry in the map */
16355 entry = vm_map_first_entry(map);
16356 } else {
16357 /* start with next entry */
16358 entry = entry->vme_next;
16359 }
16360 }
16361
16362 while (entry != vm_map_to_entry(map) &&
16363 entry->vme_start <= end) {
16364 /* try and coalesce "entry" with its previous entry */
16365 vm_map_simplify_entry(map, entry);
16366 entry = entry->vme_next;
16367 }
16368 }
16369
16370 static __attribute__((always_inline, warn_unused_result))
16371 kern_return_t
16372 vm_map_machine_attribute_sanitize(
16373 vm_map_t map,
16374 vm_map_offset_ut start_u,
16375 vm_map_offset_ut end_u,
16376 mach_vm_offset_t *start,
16377 mach_vm_offset_t *end,
16378 vm_map_size_t *size)
16379 {
16380 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS;
16381
16382
16383 return vm_sanitize_addr_end(start_u, end_u,
16384 VM_SANITIZE_CALLER_VM_MAP_MACHINE_ATTRIBUTE, map,
16385 flags, start, end, size);
16386 }
16387
16388
16389 /*
16390 * Routine: vm_map_machine_attribute
16391 * Purpose:
16392 * Provide machine-specific attributes to mappings,
16393 * such as cachability etc. for machines that provide
16394 * them. NUMA architectures and machines with big/strange
16395 * caches will use this.
16396 * Note:
16397 * Responsibilities for locking and checking are handled here,
16398 * everything else in the pmap module. If any non-volatile
16399 * information must be kept, the pmap module should handle
16400 * it itself. [This assumes that attributes do not
16401 * need to be inherited, which seems ok to me]
16402 */
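/*
 * A minimal caller sketch (illustrative only): flushing the CPU cache
 * for a range via the MATTR_CACHE attribute.  "start_u" and "end_u"
 * stand for the caller's unsanitized bounds, which are validated by
 * the sanitize helper above before any work is done.
 *
 *	vm_machine_attribute_val_t value = MATTR_VAL_CACHE_FLUSH;
 *	kern_return_t kr = vm_map_machine_attribute(map, start_u, end_u,
 *	    MATTR_CACHE, &value);
 */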
16403 kern_return_t
16404 vm_map_machine_attribute(
16405 vm_map_t map,
16406 vm_map_offset_ut start_u,
16407 vm_map_offset_ut end_u,
16408 vm_machine_attribute_t attribute,
16409 vm_machine_attribute_val_t *value) /* IN/OUT */
16410 {
16411 mach_vm_offset_t start, end;
16412 vm_map_size_t sync_size;
16413 kern_return_t ret;
16414 vm_map_entry_t entry;
16415
16416 ret = vm_map_machine_attribute_sanitize(map,
16417 start_u,
16418 end_u,
16419 &start,
16420 &end,
16421 &sync_size);
16422 if (__improbable(ret != KERN_SUCCESS)) {
16423 return vm_sanitize_get_kr(ret);
16424 }
16425
16426 if (start < vm_map_min(map) || end > vm_map_max(map)) {
16427 return KERN_INVALID_ADDRESS;
16428 }
16429
16430 vm_map_lock(map);
16431
16432 if (attribute != MATTR_CACHE) {
16433 /* If we don't have to find physical addresses, we */
16434 /* don't have to do an explicit traversal here. */
16435 ret = pmap_attribute(map->pmap, start, end - start,
16436 attribute, value);
16437 vm_map_unlock(map);
16438 return ret;
16439 }
16440
16441 ret = KERN_SUCCESS; /* Assume it all worked */
16442
16443 while (sync_size) {
16444 if (vm_map_lookup_entry(map, start, &entry)) {
16445 vm_map_size_t sub_size;
16446 if ((entry->vme_end - start) > sync_size) {
16447 sub_size = sync_size;
16448 sync_size = 0;
16449 } else {
16450 sub_size = entry->vme_end - start;
16451 sync_size -= sub_size;
16452 }
16453 if (entry->is_sub_map) {
16454 vm_map_offset_t sub_start;
16455 vm_map_offset_t sub_end;
16456
16457 sub_start = (start - entry->vme_start)
16458 + VME_OFFSET(entry);
16459 sub_end = sub_start + sub_size;
16460 vm_map_machine_attribute(
16461 VME_SUBMAP(entry),
16462 sub_start,
16463 sub_end,
16464 attribute, value);
16465 } else if (VME_OBJECT(entry)) {
16466 vm_page_t m;
16467 vm_object_t object;
16468 vm_object_t base_object;
16469 vm_object_t last_object;
16470 vm_object_offset_t offset;
16471 vm_object_offset_t base_offset;
16472 vm_map_size_t range;
16473 range = sub_size;
16474 offset = (start - entry->vme_start)
16475 + VME_OFFSET(entry);
16476 offset = vm_object_trunc_page(offset);
16477 base_offset = offset;
16478 object = VME_OBJECT(entry);
16479 base_object = object;
16480 last_object = NULL;
16481
16482 vm_object_lock(object);
16483
16484 while (range) {
16485 m = vm_page_lookup(
16486 object, offset);
16487
16488 if (m && !vm_page_is_fictitious(m)) {
16489 ret =
16490 pmap_attribute_cache_sync(
16491 VM_PAGE_GET_PHYS_PAGE(m),
16492 PAGE_SIZE,
16493 attribute, value);
16494 } else if (object->shadow) {
16495 offset = offset + object->vo_shadow_offset;
16496 last_object = object;
16497 object = object->shadow;
16498 vm_object_lock(last_object->shadow);
16499 vm_object_unlock(last_object);
16500 continue;
16501 }
16502 if (range < PAGE_SIZE) {
16503 range = 0;
16504 } else {
16505 range -= PAGE_SIZE;
16506 }
16507
16508 if (base_object != object) {
16509 vm_object_unlock(object);
16510 vm_object_lock(base_object);
16511 object = base_object;
16512 }
16513 /* Bump to the next page */
16514 base_offset += PAGE_SIZE;
16515 offset = base_offset;
16516 }
16517 vm_object_unlock(object);
16518 }
16519 start += sub_size;
16520 } else {
16521 vm_map_unlock(map);
16522 return KERN_FAILURE;
16523 }
16524 }
16525
16526 vm_map_unlock(map);
16527
16528 return ret;
16529 }
16530
16531 /*
16532 * vm_map_behavior_set:
16533 *
16534 * Sets the paging reference behavior of the specified address
16535 * range in the target map. Paging reference behavior affects
16536 * how pagein operations resulting from faults on the map will be
16537 * clustered.
16538 */
16539 kern_return_t
16540 vm_map_behavior_set(
16541 vm_map_t map,
16542 vm_map_offset_t start,
16543 vm_map_offset_t end,
16544 vm_behavior_t new_behavior)
16545 {
16546 vm_map_entry_t entry;
16547 vm_map_entry_t temp_entry;
16548
16549 if (start > end ||
16550 start < vm_map_min(map) ||
16551 end > vm_map_max(map)) {
16552 return KERN_NO_SPACE;
16553 }
16554 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16555 return KERN_INVALID_ADDRESS;
16556 }
16557
16558 switch (new_behavior) {
16559 /*
16560 * This first block of behaviors all set a persistent state on the specified
16561 * memory range. All we have to do here is to record the desired behavior
16562 * in the vm_map_entry_t's.
16563 */
16564
16565 case VM_BEHAVIOR_DEFAULT:
16566 case VM_BEHAVIOR_RANDOM:
16567 case VM_BEHAVIOR_SEQUENTIAL:
16568 case VM_BEHAVIOR_RSEQNTL:
16569 case VM_BEHAVIOR_ZERO_WIRED_PAGES:
16570 vm_map_lock(map);
16571
16572 /*
16573 * The entire address range must be valid for the map.
16574 * Note that vm_map_range_check() does a
16575 * vm_map_lookup_entry() internally and returns the
16576 * entry containing the start of the address range if
16577 * the entire range is valid.
16578 */
16579 if (vm_map_range_check(map, start, end, &temp_entry)) {
16580 entry = temp_entry;
16581 vm_map_clip_start(map, entry, start);
16582 } else {
16583 vm_map_unlock(map);
16584 return KERN_INVALID_ADDRESS;
16585 }
16586
16587 if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16588 /* zeroing requires write access */
16589 temp_entry = entry;
16590 for (;
16591 entry != vm_map_to_entry(map) && (entry->vme_start < end);
16592 entry = entry->vme_next) {
16593 if (!(entry->protection & VM_PROT_WRITE) ||
16594 #if __arm64e__
16595 entry->used_for_tpro ||
16596 #endif /* __arm64e__ */
16597 entry->used_for_jit) {
16598 vm_map_unlock(map);
16599 return KERN_PROTECTION_FAILURE;
16600 }
16601 }
16602 entry = temp_entry;
16603 }
16604
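/*
 * Clip the affected entries to the requested range and record either
 * the new behavior or the zero_wired_pages flag on each of them.
 */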
16605 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
16606 vm_map_clip_end(map, entry, end);
16607 if (entry->is_sub_map) {
16608 assert(!entry->use_pmap);
16609 }
16610
16611 if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16612 assert(entry->protection & VM_PROT_WRITE);
16613 #if __arm64e__
16614 assert(!entry->used_for_tpro);
16615 #endif /* __arm64e__ */
16616 assert(!entry->used_for_jit);
16617 entry->zero_wired_pages = TRUE;
16618 } else {
16619 entry->behavior = new_behavior;
16620 }
16621 entry = entry->vme_next;
16622 }
16623
16624 vm_map_unlock(map);
16625 break;
16626
16627 /*
16628 * The rest of these are different from the above in that they cause
16629 * an immediate action to take place as opposed to setting a behavior that
16630 * affects future actions.
16631 */
16632
16633 case VM_BEHAVIOR_WILLNEED:
16634 return vm_map_willneed(map, start, end);
16635
16636 case VM_BEHAVIOR_DONTNEED:
16637 return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
16638
16639 case VM_BEHAVIOR_FREE:
16640 return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
16641
16642 case VM_BEHAVIOR_REUSABLE:
16643 return vm_map_reusable_pages(map, start, end);
16644
16645 case VM_BEHAVIOR_REUSE:
16646 return vm_map_reuse_pages(map, start, end);
16647
16648 case VM_BEHAVIOR_CAN_REUSE:
16649 return vm_map_can_reuse(map, start, end);
16650
16651 #if MACH_ASSERT
16652 case VM_BEHAVIOR_PAGEOUT:
16653 return vm_map_pageout(map, start, end);
16654 #endif /* MACH_ASSERT */
16655
16656 case VM_BEHAVIOR_ZERO:
16657 return vm_map_zero(map, start, end);
16658
16659 default:
16660 return KERN_INVALID_ARGUMENT;
16661 }
16662
16663 return KERN_SUCCESS;
16664 }
16665
16666
16667 /*
16668 * Internals for madvise(MADV_WILLNEED) system call.
16669 *
16670 * The implementation will either:
16671 * a) issue read-ahead if the mapping corresponds to a mapped regular file,
16672 * b) or fault in the pages (zero-fill, decompress etc.) if it's an anonymous mapping.
16673 */
16674 static kern_return_t
16675 vm_map_willneed(
16676 vm_map_t map,
16677 vm_map_offset_t start,
16678 vm_map_offset_t end
16679 )
16680 {
16681 vm_map_entry_t entry;
16682 kern_return_t kr;
16683 vm_object_size_t len;
16684 vm_size_t region_size;
16685
16686 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_START,
16687 start, end);
16688 struct vm_object_fault_info fault_info = {
16689 .interruptible = THREAD_UNINT,
16690 .behavior = VM_BEHAVIOR_SEQUENTIAL,
16691 /* Do not activate pages after faulting */
16692 .stealth = true,
16693 /* Don't wait for busy pages */
16694 .fi_no_sleep = true,
16695 };
16696
16697 /*
16698 * The MADV_WILLNEED operation doesn't require any changes to the
16699 * vm_map_entry_t's, so the read lock is sufficient.
16700 */
16701
16702 vm_map_lock_read(map);
16703
16704 /*
16705 * The madvise semantics require that the address range be fully
16706 * allocated with no holes. Otherwise, we're required to return
16707 * an error.
16708 */
16709
16710 if (!vm_map_range_check(map, start, end, &entry)) {
16711 vm_map_unlock_read(map);
16712 kr = KERN_INVALID_ADDRESS;
16713 goto done;
16714 }
16715
16716 /*
16717 * Examine each vm_map_entry_t in the range.
16718 */
16719 while (start < end) {
16720 /*
16721 * Set the length so we don't go beyond the end of the
16722 * map_entry or beyond the end of the range we were given.
16723 * This range could also span multiple map entries, all of which
16724 * map different files, so make sure we only do the right amount
16725 * of I/O for each object. Note that it's possible for there
16726 * to be multiple map entries all referring to the same object
16727 * but with different page permissions, but it's not worth
16728 * trying to optimize that case.
16729 */
16730 len = MIN(entry->vme_end - start, end - start);
16731
16732 vm_map_offset_t addr = start;
16733
16734 vm_size_t effective_page_mask = MIN(vm_map_page_mask(map), PAGE_MASK);
16735 vm_map_offset_t effective_page_size = effective_page_mask + 1;
16736
16737 /*
16738 * Write-fault if the entry supports it to preclude subsequent soft-faults
16739 */
16740 vm_prot_t fault_prot = entry->protection & VM_PROT_WRITE ?
16741 VM_PROT_WRITE : VM_PROT_READ;
16742
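/*
 * Drop the map lock while pre-faulting this entry's range so it isn't
 * held across potentially long page-in operations; the entry is looked
 * up again below before the next iteration.
 */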
16743 vm_map_unlock_read(map);
16744
16745 region_size = len;
16746 while (region_size) {
16747 /*
16748 * Provide a hint for how much clustering we would like. Note that
16749 * each individual fault will limit the size of each request to
16750 * MAX_UPL_TRANSFER_BYTES.
16751 */
16752 fault_info.cluster_size = region_size;
16753 kr = vm_pre_fault_with_info(
16754 map,
16755 vm_map_trunc_page(addr, effective_page_mask),
16756 fault_prot,
16757 &fault_info);
16758 if (kr == KERN_ALREADY_WAITING) {
16759 /*
16760 * The page is busy being faulted/paged by another thread.
16761 */
16762 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_NONE,
16763 task_pid(current_task()), addr, kr);
16764 kr = KERN_SUCCESS;
16765 } else if (kr != KERN_SUCCESS) {
16766 goto done;
16767 }
16768 region_size -= effective_page_size;
16769 addr += effective_page_size;
16770 }
16771
16772 start += len;
16773 if (start >= end) {
16774 kr = KERN_SUCCESS;
16775 goto done;
16776 }
16777
16778 if (thread_should_abort(current_thread())) {
16779 kr = KERN_ABORTED;
16780 goto done;
16781 }
16782
16783 /* look up next entry */
16784 vm_map_lock_read(map);
16785 if (!vm_map_lookup_entry(map, start, &entry)) {
16786 /*
16787 * There's a new hole in the address range.
16788 */
16789 vm_map_unlock_read(map);
16790 kr = KERN_INVALID_ADDRESS;
16791 goto done;
16792 }
16793 }
16794
16795 vm_map_unlock_read(map);
16796 done:
16797 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16798 start, kr);
16799 return kr;
16800 }
16801
16802 static boolean_t
16803 vm_map_entry_is_reusable(
16804 vm_map_entry_t entry)
16805 {
16806 /* Only user map entries */
16807
16808 vm_object_t object;
16809
16810 if (entry->is_sub_map) {
16811 return FALSE;
16812 }
16813
16814 switch (VME_ALIAS(entry)) {
16815 case VM_MEMORY_MALLOC:
16816 case VM_MEMORY_MALLOC_SMALL:
16817 case VM_MEMORY_MALLOC_LARGE:
16818 case VM_MEMORY_REALLOC:
16819 case VM_MEMORY_MALLOC_TINY:
16820 case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16821 case VM_MEMORY_MALLOC_LARGE_REUSED:
16822 /*
16823 * This is a malloc() memory region: check if it's still
16824 * in its original state and can be re-used for more
16825 * malloc() allocations.
16826 */
16827 break;
16828 default:
16829 /*
16830 * Not a malloc() memory region: let the caller decide if
16831 * it's re-usable.
16832 */
16833 return TRUE;
16834 }
16835
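/*
 * A malloc region only remains reusable if the entry still has its
 * default attributes: no submap, default protections and inheritance,
 * not wired, not permanent, not in transition, etc.
 */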
16836 if (/*entry->is_shared ||*/
16837 entry->is_sub_map ||
16838 entry->in_transition ||
16839 entry->protection != VM_PROT_DEFAULT ||
16840 entry->max_protection != VM_PROT_ALL ||
16841 entry->inheritance != VM_INHERIT_DEFAULT ||
16842 entry->no_cache ||
16843 entry->vme_permanent ||
16844 entry->superpage_size != FALSE ||
16845 entry->zero_wired_pages ||
16846 entry->wired_count != 0 ||
16847 entry->user_wired_count != 0) {
16848 return FALSE;
16849 }
16850
16851 object = VME_OBJECT(entry);
16852 if (object == VM_OBJECT_NULL) {
16853 return TRUE;
16854 }
16855 if (
16856 #if 0
16857 /*
16858 * Let's proceed even if the VM object is potentially
16859 * shared.
16860 * We check for this later when processing the actual
16861 * VM pages, so the contents will be safe if shared.
16862 *
16863 * But we can still mark this memory region as "reusable" to
16864 * acknowledge that the caller did let us know that the memory
16865 * could be re-used and should not be penalized for holding
16866 * on to it. This allows its "resident size" to not include
16867 * the reusable range.
16868 */
16869 object->ref_count == 1 &&
16870 #endif
16871 object->vo_copy == VM_OBJECT_NULL &&
16872 object->shadow == VM_OBJECT_NULL &&
16873 object->internal &&
16874 object->purgable == VM_PURGABLE_DENY &&
16875 HAS_DEFAULT_CACHEABILITY(object->wimg_bits & VM_WIMG_MASK) &&
16876 !object->code_signed) {
16877 return TRUE;
16878 }
16879 return FALSE;
16880 }
16881
16882 static kern_return_t
16883 vm_map_reuse_pages(
16884 vm_map_t map,
16885 vm_map_offset_t start,
16886 vm_map_offset_t end)
16887 {
16888 vm_map_entry_t entry;
16889 vm_object_t object;
16890 vm_object_offset_t start_offset, end_offset;
16891
16892 /*
16893 * The MADV_REUSE operation doesn't require any changes to the
16894 * vm_map_entry_t's, so the read lock is sufficient.
16895 */
16896
16897 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16898 /*
16899 * XXX TODO4K
16900 * need to figure out what reusable means for a
16901 * portion of a native page.
16902 */
16903 return KERN_SUCCESS;
16904 }
16905
16906 vm_map_lock_read(map);
16907 assert(map->pmap != kernel_pmap); /* protect alias access */
16908
16909 /*
16910 * The madvise semantics require that the address range be fully
16911 * allocated with no holes. Otherwise, we're required to return
16912 * an error.
16913 */
16914
16915 if (!vm_map_range_check(map, start, end, &entry)) {
16916 vm_map_unlock_read(map);
16917 vm_page_stats_reusable.reuse_pages_failure++;
16918 return KERN_INVALID_ADDRESS;
16919 }
16920
16921 /*
16922 * Examine each vm_map_entry_t in the range.
16923 */
16924 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16925 entry = entry->vme_next) {
16926 /*
16927 * Sanity check on the VM map entry.
16928 */
16929 if (!vm_map_entry_is_reusable(entry)) {
16930 vm_map_unlock_read(map);
16931 vm_page_stats_reusable.reuse_pages_failure++;
16932 return KERN_INVALID_ADDRESS;
16933 }
16934
16935 /*
16936 * The first time through, the start address could be anywhere
16937 * within the vm_map_entry we found. So adjust the offset to
16938 * correspond.
16939 */
16940 if (entry->vme_start < start) {
16941 start_offset = start - entry->vme_start;
16942 } else {
16943 start_offset = 0;
16944 }
16945 end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16946 start_offset += VME_OFFSET(entry);
16947 end_offset += VME_OFFSET(entry);
16948
16949 object = VME_OBJECT(entry);
16950 if (object != VM_OBJECT_NULL) {
16951 vm_object_lock(object);
16952 vm_object_reuse_pages(object, start_offset, end_offset,
16953 TRUE);
16954 vm_object_unlock(object);
16955 }
16956
16957 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
16958 /*
16959 * XXX
16960 * We do not hold the VM map exclusively here.
16961 * The "alias" field is not that critical, so it's
16962 * safe to update it here, as long as it is the only
16963 * one that can be modified while holding the VM map
16964 * "shared".
16965 */
16966 VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
16967 }
16968 }
16969
16970 vm_map_unlock_read(map);
16971 vm_page_stats_reusable.reuse_pages_success++;
16972 return KERN_SUCCESS;
16973 }
16974
16975
16976 static kern_return_t
16977 vm_map_reusable_pages(
16978 vm_map_t map,
16979 vm_map_offset_t start,
16980 vm_map_offset_t end)
16981 {
16982 vm_map_entry_t entry;
16983 vm_object_t object;
16984 vm_object_offset_t start_offset, end_offset;
16985 vm_map_offset_t pmap_offset;
16986
16987 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16988 /*
16989 * XXX TODO4K
16990 * need to figure out what reusable means for a portion
16991 * of a native page.
16992 */
16993 return KERN_SUCCESS;
16994 }
16995
16996 /*
16997 * The MADV_REUSABLE operation doesn't require any changes to the
16998 * vm_map_entry_t's, so the read lock is sufficient.
16999 */
17000
17001 vm_map_lock_read(map);
17002 assert(map->pmap != kernel_pmap); /* protect alias access */
17003
17004 /*
17005 * The madvise semantics require that the address range be fully
17006 * allocated with no holes. Otherwise, we're required to return
17007 * an error.
17008 */
17009
17010 if (!vm_map_range_check(map, start, end, &entry)) {
17011 vm_map_unlock_read(map);
17012 vm_page_stats_reusable.reusable_pages_failure++;
17013 return KERN_INVALID_ADDRESS;
17014 }
17015
17016 /*
17017 * Examine each vm_map_entry_t in the range.
17018 */
17019 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17020 entry = entry->vme_next) {
17021 int kill_pages = 0;
17022 boolean_t kill_no_write = FALSE;
17023
17024 /*
17025 * Sanity check on the VM map entry.
17026 */
17027 if (!vm_map_entry_is_reusable(entry)) {
17028 vm_map_unlock_read(map);
17029 vm_page_stats_reusable.reusable_pages_failure++;
17030 return KERN_INVALID_ADDRESS;
17031 }
17032
17033 if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
17034 #if __arm64e__
17035 && !entry->used_for_tpro
17036 #endif
17037 ) {
17038 /* not writable: can't discard contents */
17039 vm_map_unlock_read(map);
17040 vm_page_stats_reusable.reusable_nonwritable++;
17041 vm_page_stats_reusable.reusable_pages_failure++;
17042 return KERN_PROTECTION_FAILURE;
17043 }
17044
17045 /*
17046 * The first time through, the start address could be anywhere
17047 * within the vm_map_entry we found. So adjust the offset to
17048 * correspond.
17049 */
17050 if (entry->vme_start < start) {
17051 start_offset = start - entry->vme_start;
17052 pmap_offset = start;
17053 } else {
17054 start_offset = 0;
17055 pmap_offset = entry->vme_start;
17056 }
17057 end_offset = MIN(end, entry->vme_end) - entry->vme_start;
17058 start_offset += VME_OFFSET(entry);
17059 end_offset += VME_OFFSET(entry);
17060
17061 object = VME_OBJECT(entry);
17062 if (object == VM_OBJECT_NULL) {
17063 continue;
17064 }
17065
17066 if ((entry->protection & VM_PROT_EXECUTE) ||
17067 entry->vme_xnu_user_debug) {
17068 /*
17069 * Executable or user debug pages might be write-protected by
17070 * hardware, so do not attempt to write to these pages.
17071 */
17072 kill_no_write = TRUE;
17073 }
17074
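/*
 * Only deactivate pages and discard their contents when it is safe to
 * do so: the object must not be shadowed or use IOKit/virtual-size
 * accounting, and must either have a single reference or use an
 * asymmetric copy strategy with no copy object. Otherwise the range
 * is only accounted as shared-reusable.
 */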
17075 vm_object_lock(object);
17076 if (((os_ref_get_count_raw(&object->ref_count) == 1) ||
17077 (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
17078 object->vo_copy == VM_OBJECT_NULL)) &&
17079 object->shadow == VM_OBJECT_NULL &&
17080 /*
17081 * "iokit_acct" entries are billed for their virtual size
17082 * (rather than for their resident pages only), so they
17083 * wouldn't benefit from making pages reusable, and it
17084 * would be hard to keep track of pages that are both
17085 * "iokit_acct" and "reusable" in the pmap stats and
17086 * ledgers.
17087 */
17088 !(entry->iokit_acct ||
17089 (!entry->is_sub_map && !entry->use_pmap))) {
17090 if (os_ref_get_count_raw(&object->ref_count) != 1) {
17091 vm_page_stats_reusable.reusable_shared++;
17092 }
17093 kill_pages = 1;
17094 } else {
17095 kill_pages = -1;
17096 }
17097 if (kill_pages != -1) {
17098 vm_object_deactivate_pages(object,
17099 start_offset,
17100 end_offset - start_offset,
17101 kill_pages,
17102 TRUE /*reusable_pages*/,
17103 kill_no_write,
17104 map->pmap,
17105 pmap_offset);
17106 } else {
17107 vm_page_stats_reusable.reusable_pages_shared++;
17108 DTRACE_VM4(vm_map_reusable_pages_shared,
17109 unsigned int, VME_ALIAS(entry),
17110 vm_map_t, map,
17111 vm_map_entry_t, entry,
17112 vm_object_t, object);
17113 }
17114 vm_object_unlock(object);
17115
17116 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
17117 VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
17118 /*
17119 * XXX
17120 * We do not hold the VM map exclusively here.
17121 * The "alias" field is not that critical, so it's
17122 * safe to update it here, as long as it is the only
17123 * one that can be modified while holding the VM map
17124 * "shared".
17125 */
17126 VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
17127 }
17128 }
17129
17130 vm_map_unlock_read(map);
17131 vm_page_stats_reusable.reusable_pages_success++;
17132 return KERN_SUCCESS;
17133 }
17134
17135
17136 static kern_return_t
17137 vm_map_can_reuse(
17138 vm_map_t map,
17139 vm_map_offset_t start,
17140 vm_map_offset_t end)
17141 {
17142 vm_map_entry_t entry;
17143
17144 /*
17145 * The MADV_CAN_REUSE operation doesn't require any changes to the
17146 * vm_map_entry_t's, so the read lock is sufficient.
17147 */
17148
17149 vm_map_lock_read(map);
17150 assert(map->pmap != kernel_pmap); /* protect alias access */
17151
17152 /*
17153 * The madvise semantics require that the address range be fully
17154 * allocated with no holes. Otherwise, we're required to return
17155 * an error.
17156 */
17157
17158 if (!vm_map_range_check(map, start, end, &entry)) {
17159 vm_map_unlock_read(map);
17160 vm_page_stats_reusable.can_reuse_failure++;
17161 return KERN_INVALID_ADDRESS;
17162 }
17163
17164 /*
17165 * Examine each vm_map_entry_t in the range.
17166 */
17167 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17168 entry = entry->vme_next) {
17169 /*
17170 * Sanity check on the VM map entry.
17171 */
17172 if (!vm_map_entry_is_reusable(entry)) {
17173 vm_map_unlock_read(map);
17174 vm_page_stats_reusable.can_reuse_failure++;
17175 return KERN_INVALID_ADDRESS;
17176 }
17177 }
17178
17179 vm_map_unlock_read(map);
17180 vm_page_stats_reusable.can_reuse_success++;
17181 return KERN_SUCCESS;
17182 }
17183
17184
17185 #if MACH_ASSERT
17186 static kern_return_t
17187 vm_map_pageout(
17188 vm_map_t map,
17189 vm_map_offset_t start,
17190 vm_map_offset_t end)
17191 {
17192 vm_map_entry_t entry;
17193
17194 /*
17195 * The MADV_PAGEOUT operation doesn't require any changes to the
17196 * vm_map_entry_t's, so the read lock is sufficient.
17197 */
17198
17199 vm_map_lock_read(map);
17200
17201 /*
17202 * The madvise semantics require that the address range be fully
17203 * allocated with no holes. Otherwise, we're required to return
17204 * an error.
17205 */
17206
17207 if (!vm_map_range_check(map, start, end, &entry)) {
17208 vm_map_unlock_read(map);
17209 return KERN_INVALID_ADDRESS;
17210 }
17211
17212 /*
17213 * Examine each vm_map_entry_t in the range.
17214 */
17215 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17216 entry = entry->vme_next) {
17217 vm_object_t object;
17218
17219 /*
17220 * Sanity check on the VM map entry.
17221 */
17222 if (entry->is_sub_map) {
17223 vm_map_t submap;
17224 vm_map_offset_t submap_start;
17225 vm_map_offset_t submap_end;
17226 vm_map_entry_t submap_entry;
17227
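/*
 * Entry maps a submap: look up the equivalent range in the submap and
 * page out the internal object backing its first entry.
 */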
17228 submap = VME_SUBMAP(entry);
17229 submap_start = VME_OFFSET(entry);
17230 submap_end = submap_start + (entry->vme_end -
17231 entry->vme_start);
17232
17233 vm_map_lock_read(submap);
17234
17235 if (!vm_map_range_check(submap,
17236 submap_start,
17237 submap_end,
17238 &submap_entry)) {
17239 vm_map_unlock_read(submap);
17240 vm_map_unlock_read(map);
17241 return KERN_INVALID_ADDRESS;
17242 }
17243
17244 if (submap_entry->is_sub_map) {
17245 vm_map_unlock_read(submap);
17246 continue;
17247 }
17248
17249 object = VME_OBJECT(submap_entry);
17250 if (object == VM_OBJECT_NULL || !object->internal) {
17251 vm_map_unlock_read(submap);
17252 continue;
17253 }
17254
17255 vm_object_pageout(object);
17256
17257 vm_map_unlock_read(submap);
17258 submap = VM_MAP_NULL;
17259 submap_entry = VM_MAP_ENTRY_NULL;
17260 continue;
17261 }
17262
17263 object = VME_OBJECT(entry);
17264 if (object == VM_OBJECT_NULL || !object->internal) {
17265 continue;
17266 }
17267
17268 vm_object_pageout(object);
17269 }
17270
17271 vm_map_unlock_read(map);
17272 return KERN_SUCCESS;
17273 }
17274 #endif /* MACH_ASSERT */
17275
17276 /*
17277 * This function determines if the zero operation can be run on the
17278 * respective entry. Additional checks on the object are in
17279 * vm_object_zero_preflight.
17280 */
17281 static kern_return_t
17282 vm_map_zero_entry_preflight(vm_map_entry_t entry)
17283 {
17284 /*
17285 * Zeroing is restricted to writable non-executable entries and non-JIT
17286 * regions.
17287 */
17288 if (!(entry->protection & VM_PROT_WRITE) ||
17289 (entry->protection & VM_PROT_EXECUTE) ||
17290 entry->used_for_jit ||
17291 entry->vme_xnu_user_debug) {
17292 return KERN_PROTECTION_FAILURE;
17293 }
17294
17295 /*
17296 * Zeroing for copy-on-write isn't yet supported. Zeroing is also not
17297 * allowed for submaps.
17298 */
17299 if (entry->needs_copy || entry->is_sub_map) {
17300 return KERN_NO_ACCESS;
17301 }
17302
17303 return KERN_SUCCESS;
17304 }
17305
17306 /*
17307 * This function translates entry's start and end to offsets in the object
17308 */
17309 static void
17310 vm_map_get_bounds_in_object(
17311 vm_map_entry_t entry,
17312 vm_map_offset_t start,
17313 vm_map_offset_t end,
17314 vm_map_offset_t *start_offset,
17315 vm_map_offset_t *end_offset)
17316 {
17317 if (entry->vme_start < start) {
17318 *start_offset = start - entry->vme_start;
17319 } else {
17320 *start_offset = 0;
17321 }
17322 *end_offset = MIN(end, entry->vme_end) - entry->vme_start;
17323 *start_offset += VME_OFFSET(entry);
17324 *end_offset += VME_OFFSET(entry);
17325 }
17326
17327 /*
17328 * This function iterates through the entries in the requested range
17329 * and zeroes any resident pages in the corresponding objects. Compressed
17330 * pages are dropped instead of being faulted in and zeroed.
17331 */
17332 static kern_return_t
17333 vm_map_zero(
17334 vm_map_t map,
17335 vm_map_offset_t start,
17336 vm_map_offset_t end)
17337 {
17338 vm_map_entry_t entry;
17339 vm_map_offset_t cur = start;
17340 kern_return_t ret;
17341
17342 /*
17343 * This operation isn't supported where the map page size is less than
17344 * the hardware page size. Caller will need to handle error and
17345 * explicitly zero memory if needed.
17346 */
17347 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17348 return KERN_NO_ACCESS;
17349 }
17350
17351 /*
17352 * The MADV_ZERO operation doesn't require any changes to the
17353 * vm_map_entry_t's, so the read lock is sufficient.
17354 */
17355 vm_map_lock_read(map);
17356 assert(map->pmap != kernel_pmap); /* protect alias access */
17357
17358 /*
17359 * The madvise semantics require that the address range be fully
17360 * allocated with no holes. Otherwise, we're required to return
17361 * an error. This check needs to be redone if the map has changed.
17362 */
17363 if (!vm_map_range_check(map, cur, end, &entry)) {
17364 vm_map_unlock_read(map);
17365 return KERN_INVALID_ADDRESS;
17366 }
17367
17368 /*
17369 * Examine each vm_map_entry_t in the range.
17370 */
17371 while (entry != vm_map_to_entry(map) && entry->vme_start < end) {
17372 vm_map_offset_t cur_offset;
17373 vm_map_offset_t end_offset;
17374 unsigned int last_timestamp = map->timestamp;
17375 vm_object_t object = VME_OBJECT(entry);
17376
17377 ret = vm_map_zero_entry_preflight(entry);
17378 if (ret != KERN_SUCCESS) {
17379 vm_map_unlock_read(map);
17380 return ret;
17381 }
17382
17383 if (object == VM_OBJECT_NULL) {
17384 entry = entry->vme_next;
17385 continue;
17386 }
17387
17388 vm_map_get_bounds_in_object(entry, cur, end, &cur_offset, &end_offset);
17389 vm_object_lock(object);
17390 /*
17391 * Take a reference on the object as vm_object_zero will drop the object
17392 * lock when it encounters a busy page.
17393 */
17394 vm_object_reference_locked(object);
17395 vm_map_unlock_read(map);
17396
17397 ret = vm_object_zero(object, cur_offset, end_offset);
17398 vm_object_unlock(object);
17399 vm_object_deallocate(object);
17400 if (ret != KERN_SUCCESS) {
17401 return ret;
17402 }
17403 /*
17404 * Update cur as vm_object_zero has succeeded.
17405 */
17406 cur += (end_offset - cur_offset);
17407 if (cur == end) {
17408 return KERN_SUCCESS;
17409 }
17410
17411 /*
17412 * If the map timestamp has changed, restart by relooking up cur in the
17413 * map
17414 */
17415 vm_map_lock_read(map);
17416 if (last_timestamp != map->timestamp) {
17417 /*
17418 * Relookup cur in the map
17419 */
17420 if (!vm_map_range_check(map, cur, end, &entry)) {
17421 vm_map_unlock_read(map);
17422 return KERN_INVALID_ADDRESS;
17423 }
17424 continue;
17425 }
17426 /*
17427 * If the map hasn't changed proceed with the next entry
17428 */
17429 entry = entry->vme_next;
17430 }
17431
17432 vm_map_unlock_read(map);
17433 return KERN_SUCCESS;
17434 }
17435
17436
17437 /*
17438 * Routine: vm_map_entry_insert
17439 *
17440 * Description: This routine inserts a new vm_map_entry in a locked map.
17441 */
17442 static vm_map_entry_t
17443 vm_map_entry_insert(
17444 vm_map_t map,
17445 vm_map_entry_t insp_entry,
17446 vm_map_offset_t start,
17447 vm_map_offset_t end,
17448 vm_object_t object,
17449 vm_object_offset_t offset,
17450 vm_map_kernel_flags_t vmk_flags,
17451 boolean_t needs_copy,
17452 vm_prot_t cur_protection,
17453 vm_prot_t max_protection,
17454 vm_inherit_t inheritance,
17455 boolean_t clear_map_aligned)
17456 {
17457 vm_map_entry_t new_entry;
17458 boolean_t map_aligned = FALSE;
17459
17460 assert(insp_entry != (vm_map_entry_t)0);
17461 vm_map_lock_assert_exclusive(map);
17462
17463 __assert_only vm_object_offset_t end_offset = 0;
17464 assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
17465
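/*
 * Decide whether the new entry can be marked "map_aligned": this only
 * applies when the map's page size differs from the kernel page size,
 * and only if both boundaries are aligned to the map's page mask.
 */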
17466 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
17467 map_aligned = TRUE;
17468 }
17469 if (clear_map_aligned &&
17470 (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
17471 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
17472 map_aligned = FALSE;
17473 }
17474 if (map_aligned) {
17475 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
17476 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
17477 } else {
17478 assert(page_aligned(start));
17479 assert(page_aligned(end));
17480 }
17481 assert(start < end);
17482
17483 new_entry = vm_map_entry_create(map);
17484
17485 new_entry->vme_start = start;
17486 new_entry->vme_end = end;
17487
17488 if (vmk_flags.vmkf_submap) {
17489 new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
17490 VME_SUBMAP_SET(new_entry, (vm_map_t)object);
17491 } else {
17492 VME_OBJECT_SET(new_entry, object, false, 0);
17493 }
17494 VME_OFFSET_SET(new_entry, offset);
17495 VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);
17496
17497 new_entry->map_aligned = map_aligned;
17498 new_entry->needs_copy = needs_copy;
17499 new_entry->inheritance = inheritance;
17500 new_entry->protection = cur_protection;
17501 new_entry->max_protection = max_protection;
17502 /*
17503 * submap: "use_pmap" means "nested".
17504 * default: false.
17505 *
17506 * object: "use_pmap" means "use pmap accounting" for footprint.
17507 * default: true.
17508 */
17509 new_entry->use_pmap = !vmk_flags.vmkf_submap;
17510 new_entry->no_cache = vmk_flags.vmf_no_cache;
17511 new_entry->vme_permanent = vmk_flags.vmf_permanent;
17512 new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
17513 new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
17514 new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);
17515
17516 if (vmk_flags.vmkf_map_jit) {
17517 if (!(map->jit_entry_exists) ||
17518 VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
17519 new_entry->used_for_jit = TRUE;
17520 map->jit_entry_exists = TRUE;
17521 }
17522 }
17523
17524 /*
17525 * Insert the new entry into the list.
17526 */
17527
17528 vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
17529 map->size += end - start;
17530
17531 /*
17532 * Update the free space hint and the lookup hint.
17533 */
17534
17535 SAVE_HINT_MAP_WRITE(map, new_entry);
17536 return new_entry;
17537 }
17538
17539 /*
17540 * Routine: vm_map_remap_extract
17541 *
17542 * Description: This routine extracts a list of vm_map_entry's from a map into the given vm_map_copy.
17543 */
17544 static kern_return_t
17545 vm_map_remap_extract(
17546 vm_map_t map,
17547 vm_map_offset_t addr,
17548 vm_map_size_t size,
17549 boolean_t copy,
17550 vm_map_copy_t map_copy,
17551 vm_prot_t *cur_protection, /* IN/OUT */
17552 vm_prot_t *max_protection, /* IN/OUT */
17553 /* What, no behavior? */
17554 vm_inherit_t inheritance,
17555 vm_map_kernel_flags_t vmk_flags)
17556 {
17557 struct vm_map_header *map_header = &map_copy->cpy_hdr;
17558 kern_return_t result;
17559 vm_map_size_t mapped_size;
17560 vm_map_size_t tmp_size;
17561 vm_map_entry_t src_entry; /* result of last map lookup */
17562 vm_map_entry_t new_entry;
17563 vm_object_offset_t offset;
17564 vm_map_offset_t map_address;
17565 vm_map_offset_t src_start; /* start of entry to map */
17566 vm_map_offset_t src_end; /* end of region to be mapped */
17567 vm_object_t object;
17568 vm_map_version_t version;
17569 boolean_t src_needs_copy;
17570 boolean_t new_entry_needs_copy;
17571 vm_map_entry_t saved_src_entry;
17572 boolean_t src_entry_was_wired;
17573 vm_prot_t max_prot_for_prot_copy;
17574 vm_map_offset_t effective_page_mask;
17575 bool pageable, same_map;
17576 boolean_t vm_remap_legacy;
17577 vm_prot_t required_cur_prot, required_max_prot;
17578 vm_object_t new_copy_object; /* vm_object_copy_* result */
17579 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
17580
17581 pageable = vmk_flags.vmkf_copy_pageable;
17582 same_map = vmk_flags.vmkf_copy_same_map;
17583
17584 effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
17585
17586 assert(map != VM_MAP_NULL);
17587 assert(size != 0);
17588 assert(size == vm_map_round_page(size, effective_page_mask));
17589 assert(inheritance == VM_INHERIT_NONE ||
17590 inheritance == VM_INHERIT_COPY ||
17591 inheritance == VM_INHERIT_SHARE);
17592 assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17593 assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17594 assert((*cur_protection & *max_protection) == *cur_protection);
17595
17596 /*
17597 * Compute start and end of region.
17598 */
17599 src_start = vm_map_trunc_page(addr, effective_page_mask);
17600 src_end = vm_map_round_page(src_start + size, effective_page_mask);
17601
17602 /*
17603 * Initialize map_header.
17604 */
17605 map_header->nentries = 0;
17606 map_header->entries_pageable = pageable;
17607 // map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
17608 map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
17609 map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
17610 vm_map_store_init(map_header);
17611
17612 if (copy && vmk_flags.vmkf_remap_prot_copy) {
17613 /*
17614 * Special case for vm_map_protect(VM_PROT_COPY):
17615 * we want to set the new mappings' max protection to the
17616 * specified *max_protection...
17617 */
17618 max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
17619 /* ... but we want to use the vm_remap() legacy mode */
17620 vmk_flags.vmkf_remap_legacy_mode = true;
17621 *max_protection = VM_PROT_NONE;
17622 *cur_protection = VM_PROT_NONE;
17623 } else {
17624 max_prot_for_prot_copy = VM_PROT_NONE;
17625 }
17626
17627 if (vmk_flags.vmkf_remap_legacy_mode) {
17628 /*
17629 * vm_remap() legacy mode:
17630 * Extract all memory regions in the specified range and
17631 * collect the strictest set of protections allowed on the
17632 * entire range, so the caller knows what they can do with
17633 * the remapped range.
17634 * We start with VM_PROT_ALL and we'll remove the protections
17635 * missing from each memory region.
17636 */
17637 vm_remap_legacy = TRUE;
17638 *cur_protection = VM_PROT_ALL;
17639 *max_protection = VM_PROT_ALL;
17640 required_cur_prot = VM_PROT_NONE;
17641 required_max_prot = VM_PROT_NONE;
17642 } else {
17643 /*
17644 * vm_remap_new() mode:
17645 * Extract all memory regions in the specified range and
17646 * ensure that they have at least the protections specified
17647 * by the caller via *cur_protection and *max_protection.
17648 * The resulting mapping should have these protections.
17649 */
17650 vm_remap_legacy = FALSE;
17651 if (copy) {
17652 required_cur_prot = VM_PROT_NONE;
17653 required_max_prot = VM_PROT_READ;
17654 } else {
17655 required_cur_prot = *cur_protection;
17656 required_max_prot = *max_protection;
17657 }
17658 }
17659
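/*
 * Walk the source range entry by entry, extracting each one into the
 * copy's entry list (recursing into submaps where necessary) until the
 * whole requested size has been mapped.
 */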
17660 map_address = 0;
17661 mapped_size = 0;
17662 result = KERN_SUCCESS;
17663
17664 /*
17665 * The specified source virtual space might correspond to
17666 * multiple map entries, need to loop on them.
17667 */
17668 vm_map_lock(map);
17669
17670 if (map->pmap == kernel_pmap) {
17671 map_copy->is_kernel_range = true;
17672 map_copy->orig_range = kmem_addr_get_range(addr, size);
17673 #if CONFIG_MAP_RANGES
17674 } else if (map->uses_user_ranges) {
17675 map_copy->is_user_range = true;
17676 map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
17677 #endif /* CONFIG_MAP_RANGES */
17678 }
17679
17680 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17681 /*
17682 * This address space uses sub-pages so the range might
17683 * not be re-mappable in an address space with larger
17684 * pages. Re-assemble any broken-up VM map entries to
17685 * improve our chances of making it work.
17686 */
17687 vm_map_simplify_range(map, src_start, src_end);
17688 }
17689 while (mapped_size != size) {
17690 vm_map_size_t entry_size;
17691
17692 /*
17693 * Find the beginning of the region.
17694 */
17695 if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
17696 result = KERN_INVALID_ADDRESS;
17697 break;
17698 }
17699
17700 if (src_start < src_entry->vme_start ||
17701 (mapped_size && src_start != src_entry->vme_start)) {
17702 result = KERN_INVALID_ADDRESS;
17703 break;
17704 }
17705
17706 tmp_size = size - mapped_size;
17707 if (src_end > src_entry->vme_end) {
17708 tmp_size -= (src_end - src_entry->vme_end);
17709 }
17710
17711 entry_size = (vm_map_size_t)(src_entry->vme_end -
17712 src_entry->vme_start);
17713
17714 if (src_entry->is_sub_map &&
17715 vmk_flags.vmkf_copy_single_object) {
17716 vm_map_t submap;
17717 vm_map_offset_t submap_start;
17718 vm_map_size_t submap_size;
17719 boolean_t submap_needs_copy;
17720
17721 /*
17722 * No check for "required protection" on "src_entry"
17723 * because the protections that matter are the ones
17724 * on the submap's VM map entry, which will be checked
17725 * during the call to vm_map_remap_extract() below.
17726 */
17727 object = VM_OBJECT_NULL;
17728
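/*
 * Single-object extraction from a submap: recurse directly into the
 * submap for the equivalent range and return its result; if the caller
 * asked for a shared mapping but the submap required copy-on-write,
 * resolve that pending copy before returning.
 */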
17729 submap_size = src_entry->vme_end - src_start;
17730 if (submap_size > size) {
17731 submap_size = size;
17732 }
17733 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17734 submap = VME_SUBMAP(src_entry);
17735 if (copy) {
17736 /*
17737 * The caller wants a copy-on-write re-mapping,
17738 * so let's extract from the submap accordingly.
17739 */
17740 submap_needs_copy = TRUE;
17741 } else if (src_entry->needs_copy) {
17742 /*
17743 * The caller wants a shared re-mapping but the
17744 * submap is mapped with "needs_copy", so its
17745 * contents can't be shared as is. Extract the
17746 * contents of the submap as "copy-on-write".
17747 * The re-mapping won't be shared with the
17748 * original mapping but this is equivalent to
17749 * what happened with the original "remap from
17750 * submap" code.
17751 * The shared region is mapped "needs_copy", for
17752 * example.
17753 */
17754 submap_needs_copy = TRUE;
17755 } else {
17756 /*
17757 * The caller wants a shared re-mapping and
17758 * this mapping can be shared (no "needs_copy"),
17759 * so let's extract from the submap accordingly.
17760 * Kernel submaps are mapped without
17761 * "needs_copy", for example.
17762 */
17763 submap_needs_copy = FALSE;
17764 }
17765 vm_map_reference(submap);
17766 vm_map_unlock(map);
17767 src_entry = NULL;
17768 if (vm_remap_legacy) {
17769 *cur_protection = VM_PROT_NONE;
17770 *max_protection = VM_PROT_NONE;
17771 }
17772
17773 DTRACE_VM7(remap_submap_recurse,
17774 vm_map_t, map,
17775 vm_map_offset_t, addr,
17776 vm_map_size_t, size,
17777 boolean_t, copy,
17778 vm_map_offset_t, submap_start,
17779 vm_map_size_t, submap_size,
17780 boolean_t, submap_needs_copy);
17781
17782 result = vm_map_remap_extract(submap,
17783 submap_start,
17784 submap_size,
17785 submap_needs_copy,
17786 map_copy,
17787 cur_protection,
17788 max_protection,
17789 inheritance,
17790 vmk_flags);
17791 vm_map_deallocate(submap);
17792
17793 if (result == KERN_SUCCESS &&
17794 submap_needs_copy &&
17795 !copy) {
17796 /*
17797 * We were asked for a "shared"
17798 * re-mapping but had to ask for a
17799 * "copy-on-write" remapping of the
17800 * submap's mapping to honor the
17801 * submap's "needs_copy".
17802 * We now need to resolve that
17803 * pending "copy-on-write" to
17804 * get something we can share.
17805 */
17806 vm_map_entry_t copy_entry;
17807 vm_object_offset_t copy_offset;
17808 vm_map_size_t copy_size;
17809 vm_object_t copy_object;
17810 copy_entry = vm_map_copy_first_entry(map_copy);
17811 copy_size = copy_entry->vme_end - copy_entry->vme_start;
17812 copy_object = VME_OBJECT(copy_entry);
17813 copy_offset = VME_OFFSET(copy_entry);
17814 if (copy_object == VM_OBJECT_NULL) {
17815 assert(copy_offset == 0);
17816 assert(!copy_entry->needs_copy);
17817 if (copy_entry->max_protection == VM_PROT_NONE) {
17818 assert(copy_entry->protection == VM_PROT_NONE);
17819 /* nothing to share */
17820 } else {
17821 assert(copy_offset == 0);
17822 copy_object = vm_object_allocate(copy_size, submap->serial_id);
17823 VME_OFFSET_SET(copy_entry, 0);
17824 VME_OBJECT_SET(copy_entry, copy_object, false, 0);
17825 assert(copy_entry->use_pmap);
17826 }
17827 } else if (copy_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17828 /* already shareable */
17829 assert(!copy_entry->needs_copy);
17830 } else if (copy_entry->needs_copy ||
17831 copy_object->shadowed ||
17832 (copy_object->internal &&
17833 !copy_object->true_share &&
17834 !copy_entry->is_shared &&
17835 copy_object->vo_size > copy_size)) {
17836 VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
17837 assert(copy_entry->use_pmap);
17838 if (copy_entry->needs_copy) {
17839 /* already write-protected */
17840 } else {
17841 vm_prot_t prot;
17842 prot = copy_entry->protection & ~VM_PROT_WRITE;
17843 vm_object_pmap_protect(copy_object,
17844 copy_offset,
17845 copy_size,
17846 PMAP_NULL,
17847 PAGE_SIZE,
17848 0,
17849 prot);
17850 }
17851 copy_entry->needs_copy = FALSE;
17852 }
17853 copy_object = VME_OBJECT(copy_entry);
17854 copy_offset = VME_OFFSET(copy_entry);
17855 if (copy_object &&
17856 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
17857 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
17858 copy_object->true_share = TRUE;
17859 }
17860 }
17861
17862 return result;
17863 }
17864
17865 if (src_entry->is_sub_map) {
17866 /* protections for submap mapping are irrelevant here */
17867 } else if (((src_entry->protection & required_cur_prot) !=
17868 required_cur_prot) ||
17869 ((src_entry->max_protection & required_max_prot) !=
17870 required_max_prot)) {
17871 if (vmk_flags.vmkf_copy_single_object &&
17872 mapped_size != 0) {
17873 /*
17874 * Single object extraction.
17875 * We can't extract more with the required
17876 * protection but we've extracted some, so
17877 * stop there and declare success.
17878 * The caller should check the size of
17879 * the copy entry we've extracted.
17880 */
17881 result = KERN_SUCCESS;
17882 } else {
17883 /*
17884 * VM range extraction.
17885 * Required protection is not available
17886 * for this part of the range: fail.
17887 */
17888 result = KERN_PROTECTION_FAILURE;
17889 }
17890 break;
17891 }
17892
17893 if (src_entry->is_sub_map) {
17894 vm_map_t submap;
17895 vm_map_offset_t submap_start;
17896 vm_map_size_t submap_size;
17897 vm_map_copy_t submap_copy;
17898 vm_prot_t submap_curprot, submap_maxprot;
17899 boolean_t submap_needs_copy;
17900
17901 /*
17902 * No check for "required protection" on "src_entry"
17903 * because the protections that matter are the ones
17904 * on the submap's VM map entry, which will be checked
17905 * during the call to vm_map_copy_extract() below.
17906 */
17907 object = VM_OBJECT_NULL;
17908 submap_copy = VM_MAP_COPY_NULL;
17909
17910 /* find equivalent range in the submap */
17911 submap = VME_SUBMAP(src_entry);
17912 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17913 submap_size = tmp_size;
17914 if (copy) {
17915 /*
17916 * The caller wants a copy-on-write re-mapping,
17917 * so let's extract from the submap accordingly.
17918 */
17919 submap_needs_copy = TRUE;
17920 } else if (src_entry->needs_copy) {
17921 /*
17922 * The caller wants a shared re-mapping but the
17923 * submap is mapped with "needs_copy", so its
17924 * contents can't be shared as is. Extract the
17925 * contents of the submap as "copy-on-write".
17926 * The re-mapping won't be shared with the
17927 * original mapping but this is equivalent to
17928 * what happened with the original "remap from
17929 * submap" code.
17930 * The shared region is mapped "needs_copy", for
17931 * example.
17932 */
17933 submap_needs_copy = TRUE;
17934 } else {
17935 /*
17936 * The caller wants a shared re-mapping and
17937 * this mapping can be shared (no "needs_copy"),
17938 * so let's extract from the submap accordingly.
17939 * Kernel submaps are mapped without
17940 * "needs_copy", for example.
17941 */
17942 submap_needs_copy = FALSE;
17943 }
17944 /* extra ref to keep submap alive */
17945 vm_map_reference(submap);
17946
17947 DTRACE_VM7(remap_submap_recurse,
17948 vm_map_t, map,
17949 vm_map_offset_t, addr,
17950 vm_map_size_t, size,
17951 boolean_t, copy,
17952 vm_map_offset_t, submap_start,
17953 vm_map_size_t, submap_size,
17954 boolean_t, submap_needs_copy);
17955
17956 /*
17957 * The map can be safely unlocked since we
17958 * already hold a reference on the submap.
17959 *
17960 * No timestamp since we don't care if the map
17961 * gets modified while we're down in the submap.
17962 * We'll resume the extraction at src_start + tmp_size
17963 * anyway.
17964 */
17965 vm_map_unlock(map);
17966 src_entry = NULL; /* not valid once map is unlocked */
17967
17968 if (vm_remap_legacy) {
17969 submap_curprot = VM_PROT_NONE;
17970 submap_maxprot = VM_PROT_NONE;
17971 if (max_prot_for_prot_copy) {
17972 submap_maxprot = max_prot_for_prot_copy;
17973 }
17974 } else {
17975 assert(!max_prot_for_prot_copy);
17976 submap_curprot = *cur_protection;
17977 submap_maxprot = *max_protection;
17978 }
17979 result = vm_map_copy_extract(submap,
17980 submap_start,
17981 submap_size,
17982 submap_needs_copy,
17983 &submap_copy,
17984 &submap_curprot,
17985 &submap_maxprot,
17986 inheritance,
17987 vmk_flags);
17988
17989 /* release extra ref on submap */
17990 vm_map_deallocate(submap);
17991 submap = VM_MAP_NULL;
17992
17993 if (result != KERN_SUCCESS) {
17994 vm_map_lock(map);
17995 break;
17996 }
17997
17998 /* transfer submap_copy entries to map_header */
17999 while (vm_map_copy_first_entry(submap_copy) !=
18000 vm_map_copy_to_entry(submap_copy)) {
18001 vm_map_entry_t copy_entry;
18002 vm_map_size_t copy_entry_size;
18003
18004 copy_entry = vm_map_copy_first_entry(submap_copy);
18005
18006 /*
18007 * Prevent kernel_object from being exposed to
18008 * user space.
18009 */
18010 if (__improbable(copy_entry->vme_kernel_object)) {
18011 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
18012 proc_selfpid(),
18013 (get_bsdtask_info(current_task())
18014 ? proc_name_address(get_bsdtask_info(current_task()))
18015 : "?"));
18016 DTRACE_VM(extract_kernel_only);
18017 result = KERN_INVALID_RIGHT;
18018 vm_map_copy_discard(submap_copy);
18019 submap_copy = VM_MAP_COPY_NULL;
18020 vm_map_lock(map);
18021 break;
18022 }
18023
18024 vm_map_copy_entry_unlink(submap_copy, copy_entry);
18025 copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
18026 copy_entry->vme_start = map_address;
18027 copy_entry->vme_end = map_address + copy_entry_size;
18028 map_address += copy_entry_size;
18029 mapped_size += copy_entry_size;
18030 src_start += copy_entry_size;
18031 assert(src_start <= src_end);
18032 _vm_map_store_entry_link(map_header,
18033 map_header->links.prev,
18034 copy_entry);
18035 }
18036 /* done with submap_copy */
18037 vm_map_copy_discard(submap_copy);
18038
18039 if (vm_remap_legacy) {
18040 *cur_protection &= submap_curprot;
18041 *max_protection &= submap_maxprot;
18042 }
18043
18044 /* re-acquire the map lock and continue to next entry */
18045 vm_map_lock(map);
18046 continue;
18047 } else {
18048 object = VME_OBJECT(src_entry);
18049
18050 /*
18051 * Prevent kernel_object from being exposed to
18052 * user space.
18053 */
18054 if (__improbable(is_kernel_object(object))) {
18055 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
18056 proc_selfpid(),
18057 (get_bsdtask_info(current_task())
18058 ? proc_name_address(get_bsdtask_info(current_task()))
18059 : "?"));
18060 DTRACE_VM(extract_kernel_only);
18061 result = KERN_INVALID_RIGHT;
18062 break;
18063 }
18064
18065 if (src_entry->iokit_acct) {
18066 /*
18067 * This entry uses "IOKit accounting".
18068 */
18069 } else if (object != VM_OBJECT_NULL &&
18070 object->internal &&
18071 (object->purgable != VM_PURGABLE_DENY ||
18072 object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
18073 /*
18074 * Purgeable objects have their own accounting:
18075 * no pmap accounting for them.
18076 */
18077 assertf(!src_entry->use_pmap,
18078 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
18079 map,
18080 src_entry,
18081 (uint64_t)src_entry->vme_start,
18082 (uint64_t)src_entry->vme_end,
18083 src_entry->protection,
18084 src_entry->max_protection,
18085 VME_ALIAS(src_entry));
18086 } else {
18087 /*
18088 * Not IOKit or purgeable:
18089 * must be accounted by pmap stats.
18090 */
18091 assertf(src_entry->use_pmap,
18092 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
18093 map,
18094 src_entry,
18095 (uint64_t)src_entry->vme_start,
18096 (uint64_t)src_entry->vme_end,
18097 src_entry->protection,
18098 src_entry->max_protection,
18099 VME_ALIAS(src_entry));
18100 }
18101
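/*
 * Make sure the source entry points at a VM object that can be shared
 * or copied: allocate one for anonymous entries that have none, keep
 * the existing object for wired or asymmetric-copy entries, and push a
 * shadow object (write-protecting the original pages) when a symmetric
 * copy-on-write split is needed.
 */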
18102 if (object == VM_OBJECT_NULL) {
18103 assert(!src_entry->needs_copy);
18104 if (src_entry->max_protection == VM_PROT_NONE) {
18105 assert(src_entry->protection == VM_PROT_NONE);
18106 /*
18107 * No VM object and no permissions:
18108 * this must be a reserved range with
18109 * nothing to share or copy.
18110 * There could also be all sorts of
18111 * pmap shenanigans within that reserved
18112 * range, so let's just copy the map
18113 * entry as is to remap a similar
18114 * reserved range.
18115 */
18116 offset = 0; /* no object => no offset */
18117 goto copy_src_entry;
18118 }
18119 object = vm_object_allocate(entry_size, map->serial_id);
18120 VME_OFFSET_SET(src_entry, 0);
18121 VME_OBJECT_SET(src_entry, object, false, 0);
18122 assert(src_entry->use_pmap);
18123 assert(!map->mapped_in_other_pmaps);
18124 } else if (src_entry->wired_count ||
18125 object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
18126 /*
18127 * A wired memory region should not have
18128 * any pending copy-on-write and needs to
18129 * keep pointing at the VM object that
18130 * contains the wired pages.
18131 * If we're sharing this memory (copy=false),
18132 * we'll share this VM object.
18133 * If we're copying this memory (copy=true),
18134 * we'll call vm_object_copy_slowly() below
18135 * and use the new VM object for the remapping.
18136 *
18137 * Or, we are already using an asymmetric
18138 * copy, and therefore we already have
18139 * the right object.
18140 */
18141 assert(!src_entry->needs_copy);
18142 } else if (src_entry->needs_copy || object->shadowed ||
18143 (object->internal && !object->true_share &&
18144 !src_entry->is_shared &&
18145 object->vo_size > entry_size)) {
18146 bool is_writable;
18147
18148 VME_OBJECT_SHADOW(src_entry, entry_size,
18149 vm_map_always_shadow(map));
18150 assert(src_entry->use_pmap);
18151
18152 is_writable = false;
18153 if (src_entry->protection & VM_PROT_WRITE) {
18154 is_writable = true;
18155 #if __arm64e__
18156 } else if (src_entry->used_for_tpro) {
18157 is_writable = true;
18158 #endif /* __arm64e__ */
18159 }
18160 if (!src_entry->needs_copy && is_writable) {
18161 vm_prot_t prot;
18162
18163 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18164 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18165 __FUNCTION__,
18166 map, map->pmap,
18167 src_entry,
18168 (uint64_t)src_entry->vme_start,
18169 (uint64_t)src_entry->vme_end,
18170 src_entry->protection);
18171 }
18172
18173 prot = src_entry->protection & ~VM_PROT_WRITE;
18174
18175 if (override_nx(map,
18176 VME_ALIAS(src_entry))
18177 && prot) {
18178 prot |= VM_PROT_EXECUTE;
18179 }
18180
18181 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18182 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18183 __FUNCTION__,
18184 map, map->pmap,
18185 src_entry,
18186 (uint64_t)src_entry->vme_start,
18187 (uint64_t)src_entry->vme_end,
18188 prot);
18189 }
18190
18191 if (map->mapped_in_other_pmaps) {
18192 vm_object_pmap_protect(
18193 VME_OBJECT(src_entry),
18194 VME_OFFSET(src_entry),
18195 entry_size,
18196 PMAP_NULL,
18197 PAGE_SIZE,
18198 src_entry->vme_start,
18199 prot);
18200 #if MACH_ASSERT
18201 } else if (__improbable(map->pmap == PMAP_NULL)) {
18202 /*
18203 * Some VM tests (in vm_tests.c)
18204 * sometimes want to use a VM
18205 * map without a pmap.
18206 * Otherwise, this should never
18207 * happen.
18208 */
18209 if (!thread_get_test_option(test_option_vm_map_allow_null_pmap)) {
18210 panic("null pmap");
18211 }
18212 #endif /* MACH_ASSERT */
18213 } else {
18214 pmap_protect(vm_map_pmap(map),
18215 src_entry->vme_start,
18216 src_entry->vme_end,
18217 prot);
18218 }
18219 }
18220
18221 object = VME_OBJECT(src_entry);
18222 src_entry->needs_copy = FALSE;
18223 }
18224
18225
18226 vm_object_lock(object);
18227 vm_object_reference_locked(object); /* object ref. for new entry */
18228 assert(!src_entry->needs_copy);
18229 if (object->copy_strategy ==
18230 MEMORY_OBJECT_COPY_SYMMETRIC) {
18231 /*
18232 * If we want to share this object (copy==0),
18233 * it needs to be COPY_DELAY.
18234 * If we want to copy this object (copy==1),
18235 * we can't just set "needs_copy" on our side
18236 * and expect the other side to do the same
18237 * (symmetrically), so we can't let the object
18238 * stay COPY_SYMMETRIC.
18239 * So we always switch from COPY_SYMMETRIC to
18240 * COPY_DELAY.
18241 */
18242 object->copy_strategy =
18243 MEMORY_OBJECT_COPY_DELAY;
18244 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
18245 }
18246 vm_object_unlock(object);
18247 }
18248
18249 offset = (VME_OFFSET(src_entry) +
18250 (src_start - src_entry->vme_start));
18251
18252 copy_src_entry:
18253
18254
18255 new_entry = _vm_map_entry_create(map_header);
18256 vm_map_entry_copy(map, new_entry, src_entry);
18257 if (new_entry->is_sub_map) {
18258 /* clr address space specifics */
18259 new_entry->use_pmap = FALSE;
18260 } else if (copy) {
18261 /*
18262 * We're dealing with a copy-on-write operation,
18263 * so the resulting mapping should not inherit the
18264 * original mapping's accounting settings.
18265 * "use_pmap" should be reset to its default (TRUE)
18266 * so that the new mapping gets accounted for in
18267 * the task's memory footprint.
18268 */
18269 new_entry->use_pmap = TRUE;
18270 }
18271 /* "iokit_acct" was cleared in vm_map_entry_copy() */
18272 assert(!new_entry->iokit_acct);
18273
18274 new_entry->map_aligned = FALSE;
18275
18276 new_entry->vme_start = map_address;
18277 new_entry->vme_end = map_address + tmp_size;
18278 assert(new_entry->vme_start < new_entry->vme_end);
18279 if (copy && vmk_flags.vmkf_remap_prot_copy) {
18280 /* security: keep "permanent" and "csm_associated" */
18281 new_entry->vme_permanent = src_entry->vme_permanent;
18282 new_entry->csm_associated = src_entry->csm_associated;
18283 /*
18284 * Remapping for vm_map_protect(VM_PROT_COPY)
18285 * to convert a read-only mapping into a
18286 * copy-on-write version of itself but
18287 * with write access:
18288 * keep the original inheritance but let's not
18289 * add VM_PROT_WRITE to the max protection yet
18290 * since we want to do more security checks against
18291 * the target map.
18292 */
18293 new_entry->inheritance = src_entry->inheritance;
18294 new_entry->protection &= max_prot_for_prot_copy;
18295
18296 #ifdef __arm64e__
18297 /*
18298 * Remapping for vm_map_protect(VM_PROT_COPY) to remap a TPRO
18299 * region to be explicitly writable without TPRO is only permitted
18300 * if TPRO enforcement has been overridden.
18301 *
18302 * In this case we ensure any entries reset the TPRO state
18303 * and we permit the region to be downgraded from permanent.
18304 */
18305 if (new_entry->used_for_tpro) {
18306 if (vmk_flags.vmkf_tpro_enforcement_override) {
18307 new_entry->used_for_tpro = FALSE;
18308 new_entry->vme_permanent = FALSE;
18309 } else {
18310 result = KERN_PROTECTION_FAILURE;
18311 vm_object_deallocate(object);
18312 vm_map_entry_dispose(new_entry);
18313 new_entry = VM_MAP_ENTRY_NULL;
18314 break;
18315 }
18316 }
18317 #endif
18318 } else {
18319 new_entry->inheritance = inheritance;
18320 if (!vm_remap_legacy) {
18321 new_entry->protection = *cur_protection;
18322 new_entry->max_protection = *max_protection;
18323 }
18324 }
18325
18326 VME_OFFSET_SET(new_entry, offset);
18327
18328 /*
18329 * The new region has to be copied now if required.
18330 */
18331 RestartCopy:
18332 if (!copy) {
18333 if (src_entry->used_for_jit == TRUE) {
18334 if (same_map) {
18335 } else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
18336 /*
18337 * Cannot allow an entry describing a JIT
18338 * region to be shared across address spaces.
18339 */
18340 result = KERN_INVALID_ARGUMENT;
18341 vm_object_deallocate(object);
18342 vm_map_entry_dispose(new_entry);
18343 new_entry = VM_MAP_ENTRY_NULL;
18344 break;
18345 }
18346 }
18347
18348 if (!src_entry->is_sub_map &&
18349 VME_OBJECT(src_entry) == VM_OBJECT_NULL) {
18350 /* no accessible memory; nothing to share */
18351 assert(src_entry->protection == VM_PROT_NONE);
18352 assert(src_entry->max_protection == VM_PROT_NONE);
18353 src_entry->is_shared = FALSE;
18354 } else {
18355 src_entry->is_shared = TRUE;
18356 }
18357 if (!new_entry->is_sub_map &&
18358 VME_OBJECT(new_entry) == VM_OBJECT_NULL) {
18359 /* no accessible memory; nothing to share */
18360 assert(new_entry->protection == VM_PROT_NONE);
18361 assert(new_entry->max_protection == VM_PROT_NONE);
18362 new_entry->is_shared = FALSE;
18363 } else {
18364 new_entry->is_shared = TRUE;
18365 }
18366 if (!(new_entry->is_sub_map)) {
18367 new_entry->needs_copy = FALSE;
18368 }
18369 } else if (src_entry->is_sub_map) {
18370 /* make this a COW sub_map if not already */
18371 assert(new_entry->wired_count == 0);
18372 new_entry->needs_copy = TRUE;
18373 object = VM_OBJECT_NULL;
18374 } else if (src_entry->wired_count == 0 &&
18375 !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
18376 vm_object_copy_quickly(VME_OBJECT(new_entry),
18377 VME_OFFSET(new_entry),
18378 (new_entry->vme_end -
18379 new_entry->vme_start),
18380 &src_needs_copy,
18381 &new_entry_needs_copy)) {
18382 new_entry->needs_copy = new_entry_needs_copy;
18383 new_entry->is_shared = FALSE;
18384 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18385
18386 /*
18387 * Handle copy_on_write semantics.
18388 */
18389 if (src_needs_copy && !src_entry->needs_copy) {
18390 vm_prot_t prot;
18391
18392 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18393 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18394 __FUNCTION__,
18395 map, map->pmap, src_entry,
18396 (uint64_t)src_entry->vme_start,
18397 (uint64_t)src_entry->vme_end,
18398 src_entry->protection);
18399 }
18400
18401 prot = src_entry->protection & ~VM_PROT_WRITE;
18402
18403 if (override_nx(map,
18404 VME_ALIAS(src_entry))
18405 && prot) {
18406 prot |= VM_PROT_EXECUTE;
18407 }
18408
18409 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18410 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18411 __FUNCTION__,
18412 map, map->pmap, src_entry,
18413 (uint64_t)src_entry->vme_start,
18414 (uint64_t)src_entry->vme_end,
18415 prot);
18416 }
18417
18418 vm_object_pmap_protect(object,
18419 offset,
18420 entry_size,
18421 ((src_entry->is_shared
18422 || map->mapped_in_other_pmaps) ?
18423 PMAP_NULL : map->pmap),
18424 VM_MAP_PAGE_SIZE(map),
18425 src_entry->vme_start,
18426 prot);
18427
18428 assert(src_entry->wired_count == 0);
18429 src_entry->needs_copy = TRUE;
18430 }
18431 /*
18432 * Throw away the old object reference of the new entry.
18433 */
18434 vm_object_deallocate(object);
18435 } else {
18436 new_entry->is_shared = FALSE;
18437 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18438
18439 src_entry_was_wired = (src_entry->wired_count > 0);
18440 saved_src_entry = src_entry;
18441 src_entry = VM_MAP_ENTRY_NULL;
18442
18443 /*
18444 * The map can be safely unlocked since we
18445 * already hold a reference on the object.
18446 *
18447 * Record the timestamp of the map for later
18448 * verification, and unlock the map.
18449 */
18450 version.main_timestamp = map->timestamp;
18451 vm_map_unlock(map); /* Increments timestamp once! */
18452
18453 /*
18454 * Perform the copy.
18455 */
18456 if (src_entry_was_wired > 0 ||
18457 (debug4k_no_cow_copyin &&
18458 VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
18459 vm_object_lock(object);
18460 result = vm_object_copy_slowly(
18461 object,
18462 offset,
18463 (new_entry->vme_end -
18464 new_entry->vme_start),
18465 THREAD_UNINT,
18466 &new_copy_object);
18467 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18468 saved_used_for_jit = new_entry->used_for_jit;
18469 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18470 new_entry->used_for_jit = saved_used_for_jit;
18471 VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
18472 new_entry->needs_copy = FALSE;
18473 } else {
18474 vm_object_offset_t new_offset;
18475
18476 new_offset = VME_OFFSET(new_entry);
18477 result = vm_object_copy_strategically(
18478 object,
18479 offset,
18480 (new_entry->vme_end -
18481 new_entry->vme_start),
18482 false, /* forking */
18483 &new_copy_object,
18484 &new_offset,
18485 &new_entry_needs_copy);
18486 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18487 saved_used_for_jit = new_entry->used_for_jit;
18488 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18489 new_entry->used_for_jit = saved_used_for_jit;
18490 if (new_offset != VME_OFFSET(new_entry)) {
18491 VME_OFFSET_SET(new_entry, new_offset);
18492 }
18493
18494 new_entry->needs_copy = new_entry_needs_copy;
18495 }
18496
18497 /*
18498 * Throw away the old object reference of the new entry.
18499 */
18500 vm_object_deallocate(object);
18501
18502 if (result != KERN_SUCCESS &&
18503 result != KERN_MEMORY_RESTART_COPY) {
18504 vm_map_entry_dispose(new_entry);
18505 vm_map_lock(map);
18506 break;
18507 }
18508
18509 /*
18510 * Verify that the map has not substantially
18511 * changed while the copy was being made.
18512 */
18513
18514 vm_map_lock(map);
18515 if (version.main_timestamp + 1 != map->timestamp) {
18516 /*
18517 * Simple version comparison failed.
18518 *
18519 * Retry the lookup and verify that the
18520 * same object/offset are still present.
18521 */
18522 saved_src_entry = VM_MAP_ENTRY_NULL;
18523 vm_object_deallocate(VME_OBJECT(new_entry));
18524 vm_map_entry_dispose(new_entry);
18525 if (result == KERN_MEMORY_RESTART_COPY) {
18526 result = KERN_SUCCESS;
18527 }
18528 continue;
18529 }
18530 /* map hasn't changed: src_entry is still valid */
18531 src_entry = saved_src_entry;
18532 saved_src_entry = VM_MAP_ENTRY_NULL;
18533
18534 if (result == KERN_MEMORY_RESTART_COPY) {
18535 vm_object_reference(object);
18536 goto RestartCopy;
18537 }
18538 }
18539
18540 _vm_map_store_entry_link(map_header,
18541 map_header->links.prev, new_entry);
18542
18543 /* protections for submap mapping are irrelevant here */
18544 if (vm_remap_legacy && !src_entry->is_sub_map) {
18545 *cur_protection &= src_entry->protection;
18546 *max_protection &= src_entry->max_protection;
18547 }
18548
18549 map_address += tmp_size;
18550 mapped_size += tmp_size;
18551 src_start += tmp_size;
18552
18553 if (vmk_flags.vmkf_copy_single_object) {
18554 if (mapped_size != size) {
18555 DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
18556 map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
18557 if (src_entry->vme_next != vm_map_to_entry(map) &&
18558 src_entry->vme_next->vme_object_value ==
18559 src_entry->vme_object_value) {
18560 /* XXX TODO4K */
18561 DEBUG4K_ERROR("could have extended copy to next entry...\n");
18562 }
18563 }
18564 break;
18565 }
18566 } /* end while */
18567
18568 vm_map_unlock(map);
18569 if (result != KERN_SUCCESS) {
18570 /*
18571 * Free all allocated elements.
18572 */
18573 for (src_entry = map_header->links.next;
18574 src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
18575 src_entry = new_entry) {
18576 new_entry = src_entry->vme_next;
18577 _vm_map_store_entry_unlink(map_header, src_entry, false);
18578 if (src_entry->is_sub_map) {
18579 vm_map_deallocate(VME_SUBMAP(src_entry));
18580 } else {
18581 vm_object_deallocate(VME_OBJECT(src_entry));
18582 }
18583 vm_map_entry_dispose(src_entry);
18584 }
18585 }
18586 return result;
18587 }
18588
18589 bool
18590 vm_map_is_exotic(
18591 vm_map_t map)
18592 {
18593 return VM_MAP_IS_EXOTIC(map);
18594 }
18595
18596 bool
18597 vm_map_is_alien(
18598 vm_map_t map)
18599 {
18600 return VM_MAP_IS_ALIEN(map);
18601 }
18602
18603 #if XNU_TARGET_OS_OSX
18604 void
18605 vm_map_mark_alien(
18606 vm_map_t map)
18607 {
18608 vm_map_lock(map);
18609 map->is_alien = true;
18610 vm_map_unlock(map);
18611 }
18612
18613 void
18614 vm_map_single_jit(
18615 vm_map_t map)
18616 {
18617 vm_map_lock(map);
18618 map->single_jit = true;
18619 vm_map_unlock(map);
18620 }
18621 #endif /* XNU_TARGET_OS_OSX */
18622
18623
18624 /*
18625 * Callers of this function must call vm_map_copy_require on
18626 * previously created vm_map_copy_t or pass a newly created
18627 * one to ensure that it hasn't been forged.
18628 */
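/*
 * Illustrative caller-side sketch (hypothetical; vm_map_copy_adjust_to_target()
 * below calls this routine following this pattern):
 *
 *	vm_map_copy_require(copy_map);	 (validate it came from the copy zone)
 *	kr = vm_map_copy_to_physcopy(copy_map, target_map);
 */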
18629 static kern_return_t
18630 vm_map_copy_to_physcopy(
18631 vm_map_copy_t copy_map,
18632 vm_map_t target_map)
18633 {
18634 vm_map_size_t size;
18635 vm_map_entry_t entry;
18636 vm_map_entry_t new_entry;
18637 vm_object_t new_object;
18638 unsigned int pmap_flags;
18639 pmap_t new_pmap;
18640 vm_map_t new_map;
18641 vm_map_address_t src_start, src_end, src_cur;
18642 vm_map_address_t dst_start, dst_end, dst_cur;
18643 kern_return_t kr;
18644 void *kbuf;
18645
18646 /*
18647 * Perform the equivalent of vm_allocate() and memcpy().
18648 * Replace the mappings in "copy_map" with the newly allocated mapping.
18649 */
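/*
 * Rough user-space analogue of the steps below, for intuition only
 * (the real work uses a temporary pageable kernel-created map so that
 * copyinmap()/copyoutmap() can fault pages in at both page sizes):
 *
 *	vm_allocate(task, &dst, round_page(len), VM_FLAGS_ANYWHERE);
 *	memcpy((void *)dst, (const void *)src, len);
 *	(then make "copy_map" describe the new [dst, dst + len) mapping)
 */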
18650 DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18651
18652 assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
18653
18654 /* create a new pmap to map "copy_map" */
18655 pmap_flags = 0;
18656 assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
18657 #if PMAP_CREATE_FORCE_4K_PAGES
18658 pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
18659 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
18660 pmap_flags |= PMAP_CREATE_64BIT;
18661 new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
18662 if (new_pmap == NULL) {
18663 return KERN_RESOURCE_SHORTAGE;
18664 }
18665
18666 /* allocate new VM object */
18667 size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
18668 new_object = vm_object_allocate(size, VM_MAP_SERIAL_NONE);
18669 assert(new_object);
18670
18671 /* allocate new VM map entry */
18672 new_entry = vm_map_copy_entry_create(copy_map);
18673 assert(new_entry);
18674
18675 /* finish initializing new VM map entry */
18676 new_entry->protection = VM_PROT_DEFAULT;
18677 new_entry->max_protection = VM_PROT_DEFAULT;
18678 new_entry->use_pmap = TRUE;
18679
18680 /* make new VM map entry point to new VM object */
18681 new_entry->vme_start = 0;
18682 new_entry->vme_end = size;
18683 VME_OBJECT_SET(new_entry, new_object, false, 0);
18684 VME_OFFSET_SET(new_entry, 0);
18685
18686 /* create a new pageable VM map to map "copy_map" */
18687 new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
18688 VM_MAP_CREATE_PAGEABLE);
18689 assert(new_map);
18690 vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
18691
18692 /* map "copy_map" in the new VM map */
18693 src_start = 0;
18694 kr = vm_map_copyout_internal(
18695 new_map,
18696 &src_start,
18697 copy_map,
18698 copy_map->size,
18699 FALSE, /* consume_on_success */
18700 VM_PROT_DEFAULT,
18701 VM_PROT_DEFAULT,
18702 VM_INHERIT_DEFAULT);
18703 assert(kr == KERN_SUCCESS);
18704 src_end = src_start + copy_map->size;
18705
18706 /* map "new_object" in the new VM map */
18707 vm_object_reference(new_object);
18708 dst_start = 0;
18709 kr = vm_map_enter(new_map,
18710 &dst_start,
18711 size,
18712 0, /* mask */
18713 VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
18714 new_object,
18715 0, /* offset */
18716 FALSE, /* needs copy */
18717 VM_PROT_DEFAULT,
18718 VM_PROT_DEFAULT,
18719 VM_INHERIT_DEFAULT);
18720 assert(kr == KERN_SUCCESS);
18721 dst_end = dst_start + size;
18722
18723 /* get a kernel buffer */
18724 kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
18725
18726 /* physically copy "copy_map" mappings to new VM object */
18727 for (src_cur = src_start, dst_cur = dst_start;
18728 src_cur < src_end;
18729 src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
18730 vm_size_t bytes;
18731
18732 bytes = PAGE_SIZE;
18733 if (src_cur + PAGE_SIZE > src_end) {
18734 /* partial copy for last page */
18735 bytes = src_end - src_cur;
18736 assert(bytes > 0 && bytes < PAGE_SIZE);
18737 /* rest of dst page should be zero-filled */
18738 }
18739 /* get bytes from src mapping */
18740 kr = copyinmap(new_map, src_cur, kbuf, bytes);
18741 if (kr != KERN_SUCCESS) {
18742 DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
18743 }
18744 /* put bytes in dst mapping */
18745 assert(dst_cur < dst_end);
18746 assert(dst_cur + bytes <= dst_end);
18747 kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
18748 if (kr != KERN_SUCCESS) {
18749 DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
18750 }
18751 }
18752
18753 /* free kernel buffer */
18754 kfree_data(kbuf, PAGE_SIZE);
18755
18756 /* destroy new map */
18757 vm_map_destroy(new_map);
18758 new_map = VM_MAP_NULL;
18759
18760 /* dispose of the old map entries in "copy_map" */
18761 while (vm_map_copy_first_entry(copy_map) !=
18762 vm_map_copy_to_entry(copy_map)) {
18763 entry = vm_map_copy_first_entry(copy_map);
18764 vm_map_copy_entry_unlink(copy_map, entry);
18765 if (entry->is_sub_map) {
18766 vm_map_deallocate(VME_SUBMAP(entry));
18767 } else {
18768 vm_object_deallocate(VME_OBJECT(entry));
18769 }
18770 vm_map_copy_entry_dispose(entry);
18771 }
18772
18773 /* change "copy_map"'s page_size to match "target_map" */
18774 copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18775 copy_map->offset = 0;
18776 copy_map->size = size;
18777
18778 /* insert new map entry in "copy_map" */
18779 assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
18780 vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
18781
18782 DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18783 return KERN_SUCCESS;
18784 }
18785
18786 void
18787 vm_map_copy_adjust_get_target_copy_map(
18788 vm_map_copy_t copy_map,
18789 vm_map_copy_t *target_copy_map_p);
18790 void
18791 vm_map_copy_adjust_get_target_copy_map(
18792 vm_map_copy_t copy_map,
18793 vm_map_copy_t *target_copy_map_p)
18794 {
18795 vm_map_copy_t target_copy_map;
18796 vm_map_entry_t entry, target_entry;
18797
18798 if (*target_copy_map_p != VM_MAP_COPY_NULL) {
18799 /* the caller already has a "target_copy_map": use it */
18800 return;
18801 }
18802
18803 /* the caller wants us to create a new copy of "copy_map" */
18804 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18805 target_copy_map = vm_map_copy_allocate(copy_map->type);
18806 target_copy_map->offset = copy_map->offset;
18807 target_copy_map->size = copy_map->size;
18808 target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
18809 for (entry = vm_map_copy_first_entry(copy_map);
18810 entry != vm_map_copy_to_entry(copy_map);
18811 entry = entry->vme_next) {
18812 target_entry = vm_map_copy_entry_create(target_copy_map);
18813 vm_map_entry_copy_full(target_entry, entry);
18814 if (target_entry->is_sub_map) {
18815 vm_map_reference(VME_SUBMAP(target_entry));
18816 } else {
18817 vm_object_reference(VME_OBJECT(target_entry));
18818 }
18819 vm_map_copy_entry_link(
18820 target_copy_map,
18821 vm_map_copy_last_entry(target_copy_map),
18822 target_entry);
18823 }
18824 entry = VM_MAP_ENTRY_NULL;
18825 *target_copy_map_p = target_copy_map;
18826 }
18827
18828 /*
18829 * Callers of this function must call vm_map_copy_require on
18830 * previously created vm_map_copy_t or pass a newly created
18831 * one to ensure that it hasn't been forged.
18832 */
18833 static void
18834 vm_map_copy_trim(
18835 vm_map_copy_t copy_map,
18836 uint16_t new_page_shift,
18837 vm_map_offset_t trim_start,
18838 vm_map_offset_t trim_end)
18839 {
18840 uint16_t copy_page_shift;
18841 vm_map_entry_t entry, next_entry;
18842
18843 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18844 assert(copy_map->cpy_hdr.nentries > 0);
18845
18846 trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
18847 trim_end += vm_map_copy_first_entry(copy_map)->vme_start;
18848
18849 /* use the new page_shift to do the clipping */
18850 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18851 copy_map->cpy_hdr.page_shift = new_page_shift;
18852
18853 for (entry = vm_map_copy_first_entry(copy_map);
18854 entry != vm_map_copy_to_entry(copy_map);
18855 entry = next_entry) {
18856 next_entry = entry->vme_next;
18857 if (entry->vme_end <= trim_start) {
18858 /* entry fully before trim range: skip */
18859 continue;
18860 }
18861 if (entry->vme_start >= trim_end) {
18862 /* entry fully after trim range: done */
18863 break;
18864 }
18865 /* clip entry if needed */
18866 vm_map_copy_clip_start(copy_map, entry, trim_start);
18867 vm_map_copy_clip_end(copy_map, entry, trim_end);
18868 /* dispose of entry */
18869 copy_map->size -= entry->vme_end - entry->vme_start;
18870 vm_map_copy_entry_unlink(copy_map, entry);
18871 if (entry->is_sub_map) {
18872 vm_map_deallocate(VME_SUBMAP(entry));
18873 } else {
18874 vm_object_deallocate(VME_OBJECT(entry));
18875 }
18876 vm_map_copy_entry_dispose(entry);
18877 entry = VM_MAP_ENTRY_NULL;
18878 }
18879
18880 /* restore copy_map's original page_shift */
18881 copy_map->cpy_hdr.page_shift = copy_page_shift;
18882 }
18883
18884 /*
18885 * Make any necessary adjustments to "copy_map" to allow it to be
18886 * mapped into "target_map".
18887 * If no changes were necessary, "target_copy_map" points to the
18888 * untouched "copy_map".
18889 * If changes are necessary, changes will be made to "target_copy_map".
18890 * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
18891 * copy the original "copy_map" to it before applying the changes.
18892 * The caller should discard "target_copy_map" if it's not the same as
18893 * the original "copy_map".
18894 */
18895 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
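/*
 * Typical call pattern, sketched from the in-file callers
 * (vm_map_range_physical_size() and vm_map_remap()); no new API is implied:
 *
 *	target_copy_map = copy_map;	 (or VM_MAP_COPY_NULL for a fresh copy)
 *	kr = vm_map_copy_adjust_to_target(copy_map, offset, size, target_map,
 *	    copy, &target_copy_map, &overmap_start, &overmap_end, &trimmed_start);
 *	if (kr == KERN_SUCCESS && target_copy_map != copy_map) {
 *		(use target_copy_map, then vm_map_copy_discard() it)
 *	}
 */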
18896 kern_return_t
18897 vm_map_copy_adjust_to_target(
18898 vm_map_copy_t src_copy_map,
18899 vm_map_offset_ut offset_u,
18900 vm_map_size_ut size_u,
18901 vm_map_t target_map,
18902 boolean_t copy,
18903 vm_map_copy_t *target_copy_map_p,
18904 vm_map_offset_t *overmap_start_p,
18905 vm_map_offset_t *overmap_end_p,
18906 vm_map_offset_t *trimmed_start_p)
18907 {
18908 vm_map_copy_t copy_map, target_copy_map;
18909 vm_map_size_t target_size;
18910 vm_map_size_t src_copy_map_size;
18911 vm_map_size_t overmap_start, overmap_end;
18912 int misalignments;
18913 vm_map_entry_t entry, target_entry;
18914 vm_map_offset_t addr_adjustment;
18915 vm_map_offset_t new_start, new_end;
18916 int copy_page_mask, target_page_mask;
18917 uint16_t copy_page_shift, target_page_shift;
18918 vm_map_offset_t trimmed_end;
18919 vm_map_size_t map_size;
18920 kern_return_t kr;
18921
18922 /*
18923 * Sanitize any input parameters that are addr/size/prot/inherit
18924 */
18925 kr = vm_map_copy_addr_size_sanitize(
18926 target_map,
18927 offset_u,
18928 size_u,
18929 VM_SANITIZE_CALLER_MACH_MEMORY_ENTRY_MAP_SIZE,
18930 &new_start,
18931 &new_end,
18932 &map_size);
18933 if (__improbable(kr != KERN_SUCCESS)) {
18934 return vm_sanitize_get_kr(kr);
18935 }
18936
18937 /*
18938 * Assert that the vm_map_copy is coming from the right
18939 * zone and hasn't been forged
18940 */
18941 vm_map_copy_require(src_copy_map);
18942 assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18943
18944 /*
18945 * Start working with "src_copy_map" but we'll switch
18946 * to "target_copy_map" as soon as we start making adjustments.
18947 */
18948 copy_map = src_copy_map;
18949 src_copy_map_size = src_copy_map->size;
18950
18951 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18952 copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
18953 target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18954 target_page_mask = VM_MAP_PAGE_MASK(target_map);
18955
18956 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), *target_copy_map_p);
18957
18958 target_copy_map = *target_copy_map_p;
18959 if (target_copy_map != VM_MAP_COPY_NULL) {
18960 vm_map_copy_require(target_copy_map);
18961 }
18962
18963 if (new_end > copy_map->size) {
18964 DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u));
18965 return KERN_INVALID_ARGUMENT;
18966 }
18967
18968 /* trim the end */
18969 trimmed_end = 0;
18970 new_end = VM_MAP_ROUND_PAGE(new_end, target_page_mask);
18971 if (new_end < copy_map->size) {
18972 trimmed_end = src_copy_map_size - new_end;
18973 DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
18974 /* get "target_copy_map" if needed and adjust it */
18975 vm_map_copy_adjust_get_target_copy_map(copy_map,
18976 &target_copy_map);
18977 copy_map = target_copy_map;
18978 vm_map_copy_trim(target_copy_map, target_page_shift,
18979 new_end, copy_map->size);
18980 }
18981
18982 /* trim the start */
18983 new_start = VM_MAP_TRUNC_PAGE(new_start, target_page_mask);
18984 if (new_start != 0) {
18985 DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)0, (uint64_t)new_start);
18986 /* get "target_copy_map" if needed and adjust it */
18987 vm_map_copy_adjust_get_target_copy_map(copy_map,
18988 &target_copy_map);
18989 copy_map = target_copy_map;
18990 vm_map_copy_trim(target_copy_map, target_page_shift,
18991 0, new_start);
18992 }
18993 *trimmed_start_p = new_start;
18994
18995 /* target_size starts with what's left after trimming */
18996 target_size = copy_map->size;
18997 assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
18998 "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
18999 (uint64_t)target_size, (uint64_t)src_copy_map_size,
19000 (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);
19001
19002 /* check for misalignments but don't adjust yet */
19003 misalignments = 0;
19004 overmap_start = 0;
19005 overmap_end = 0;
19006 if (copy_page_shift < target_page_shift) {
19007 /*
19008 * Remapping from 4K to 16K: check the VM object alignments
19009 * throughout the range.
19010 * If the start and end of the range are mis-aligned, we can
19011 * over-map to re-align, and adjust the "overmap" start/end
19012 * and "target_size" of the range accordingly.
19013 * If there is any mis-alignment within the range:
19014 * if "copy":
19015 * we can do immediate-copy instead of copy-on-write,
19016 * else:
19017 * no way to remap and share; fail.
19018 */
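/*
 * Worked example (hypothetical numbers): with 4K copy entries going
 * into a 16K target map, an entry whose object offset is 0x1000 is
 * not 16K-aligned.  If it is the first entry and we are sharing
 * (!copy), we can over-map 0x1000 extra bytes at the front and pull
 * the offset back to 0x0 (overmap_start); anywhere else in the range
 * it counts as a misalignment, which forces a physical copy (if
 * "copy") or a failure when sharing.
 */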
19019 for (entry = vm_map_copy_first_entry(copy_map);
19020 entry != vm_map_copy_to_entry(copy_map);
19021 entry = entry->vme_next) {
19022 vm_object_offset_t object_offset_start, object_offset_end;
19023
19024 object_offset_start = VME_OFFSET(entry);
19025 object_offset_end = object_offset_start;
19026 object_offset_end += entry->vme_end - entry->vme_start;
19027 if (object_offset_start & target_page_mask) {
19028 if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
19029 overmap_start++;
19030 } else {
19031 misalignments++;
19032 }
19033 }
19034 if (object_offset_end & target_page_mask) {
19035 if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
19036 overmap_end++;
19037 } else {
19038 misalignments++;
19039 }
19040 }
19041 }
19042 }
19043 entry = VM_MAP_ENTRY_NULL;
19044
19045 /* decide how to deal with misalignments */
19046 assert(overmap_start <= 1);
19047 assert(overmap_end <= 1);
19048 if (!overmap_start && !overmap_end && !misalignments) {
19049 /* copy_map is properly aligned for target_map ... */
19050 if (*trimmed_start_p) {
19051 /* ... but we trimmed it, so still need to adjust */
19052 } else {
19053 /* ... and we didn't trim anything: we're done */
19054 if (target_copy_map == VM_MAP_COPY_NULL) {
19055 target_copy_map = copy_map;
19056 }
19057 *target_copy_map_p = target_copy_map;
19058 *overmap_start_p = 0;
19059 *overmap_end_p = 0;
19060 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19061 return KERN_SUCCESS;
19062 }
19063 } else if (misalignments && !copy) {
19064 /* can't "share" if misaligned */
19065 DEBUG4K_ADJUST("unsupported sharing\n");
19066 #if MACH_ASSERT
19067 if (debug4k_panic_on_misaligned_sharing) {
19068 panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
19069 }
19070 #endif /* MACH_ASSERT */
19071 DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
19072 return KERN_NOT_SUPPORTED;
19073 } else {
19074 /* can't virtual-copy if misaligned (but can physical-copy) */
19075 DEBUG4K_ADJUST("mis-aligned copying\n");
19076 }
19077
19078 /* get a "target_copy_map" if needed and switch to it */
19079 vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
19080 copy_map = target_copy_map;
19081
19082 if (misalignments && copy) {
19083 vm_map_size_t target_copy_map_size;
19084
19085 /*
19086 * Can't do copy-on-write with misaligned mappings.
19087 * Replace the mappings with a physical copy of the original
19088 * mappings' contents.
19089 */
19090 target_copy_map_size = target_copy_map->size;
19091 kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
19092 if (kr != KERN_SUCCESS) {
19093 return kr;
19094 }
19095 *target_copy_map_p = target_copy_map;
19096 *overmap_start_p = 0;
19097 *overmap_end_p = target_copy_map->size - target_copy_map_size;
19098 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19099 return KERN_SUCCESS;
19100 }
19101
19102 /* apply the adjustments */
19103 misalignments = 0;
19104 overmap_start = 0;
19105 overmap_end = 0;
19106 /* remove copy_map->offset, so that everything starts at offset 0 */
19107 addr_adjustment = copy_map->offset;
19108 /* also remove whatever we trimmed from the start */
19109 addr_adjustment += *trimmed_start_p;
19110 for (target_entry = vm_map_copy_first_entry(target_copy_map);
19111 target_entry != vm_map_copy_to_entry(target_copy_map);
19112 target_entry = target_entry->vme_next) {
19113 vm_object_offset_t object_offset_start, object_offset_end;
19114
19115 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19116 object_offset_start = VME_OFFSET(target_entry);
19117 if (object_offset_start & target_page_mask) {
19118 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19119 if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
19120 /*
19121 * start of 1st entry is mis-aligned:
19122 * re-adjust by over-mapping.
19123 */
19124 overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
19125 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
19126 VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
19127 } else {
19128 misalignments++;
19129 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
19130 assert(copy);
19131 }
19132 }
19133
19134 if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
19135 target_size += overmap_start;
19136 } else {
19137 target_entry->vme_start += overmap_start;
19138 }
19139 target_entry->vme_end += overmap_start;
19140
19141 object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
19142 if (object_offset_end & target_page_mask) {
19143 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19144 if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
19145 /*
19146 * end of last entry is mis-aligned: re-adjust by over-mapping.
19147 */
19148 overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
19149 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
19150 target_entry->vme_end += overmap_end;
19151 target_size += overmap_end;
19152 } else {
19153 misalignments++;
19154 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
19155 assert(copy);
19156 }
19157 }
19158 target_entry->vme_start -= addr_adjustment;
19159 target_entry->vme_end -= addr_adjustment;
19160 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19161 }
19162
19163 target_copy_map->size = target_size;
19164 target_copy_map->offset += overmap_start;
19165 target_copy_map->offset -= addr_adjustment;
19166 target_copy_map->cpy_hdr.page_shift = target_page_shift;
19167
19168 // assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
19169 // assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
19170 assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
19171 assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));
19172
19173 *target_copy_map_p = target_copy_map;
19174 *overmap_start_p = overmap_start;
19175 *overmap_end_p = overmap_end;
19176
19177 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19178 return KERN_SUCCESS;
19179 }
19180
19181 kern_return_t
19182 vm_map_range_physical_size(
19183 vm_map_t map,
19184 vm_map_address_t start,
19185 mach_vm_size_t size,
19186 mach_vm_size_t * phys_size)
19187 {
19188 kern_return_t kr;
19189 vm_map_copy_t copy_map, target_copy_map;
19190 vm_map_offset_t adjusted_start, adjusted_end;
19191 vm_map_size_t adjusted_size;
19192 vm_prot_t cur_prot, max_prot;
19193 vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
19194 vm_map_kernel_flags_t vmk_flags;
19195
19196 if (size == 0) {
19197 DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
19198 *phys_size = 0;
19199 return KERN_SUCCESS;
19200 }
19201
19202 adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
19203 adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
19204 if (__improbable(os_add_overflow(start, size, &end) ||
19205 adjusted_end <= adjusted_start)) {
19206 /* wraparound */
19207 printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
19208 *phys_size = 0;
19209 return KERN_INVALID_ARGUMENT;
19210 }
19211 if (__improbable(vm_map_range_overflows(map, start, size))) {
19212 *phys_size = 0;
19213 return KERN_INVALID_ADDRESS;
19214 }
19215 assert(adjusted_end > adjusted_start);
19216 adjusted_size = adjusted_end - adjusted_start;
19217 *phys_size = adjusted_size;
19218 if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
19219 return KERN_SUCCESS;
19220 }
19221 if (start == 0) {
19222 adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
19223 adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
19224 if (__improbable(adjusted_end <= adjusted_start)) {
19225 /* wraparound */
19226 printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
19227 *phys_size = 0;
19228 return KERN_INVALID_ARGUMENT;
19229 }
19230 assert(adjusted_end > adjusted_start);
19231 adjusted_size = adjusted_end - adjusted_start;
19232 *phys_size = adjusted_size;
19233 return KERN_SUCCESS;
19234 }
19235
19236 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
19237 vmk_flags.vmkf_copy_pageable = TRUE;
19238 vmk_flags.vmkf_copy_same_map = TRUE;
19239 assert(adjusted_size != 0);
19240 cur_prot = VM_PROT_NONE; /* legacy mode */
19241 max_prot = VM_PROT_NONE; /* legacy mode */
19242 vmk_flags.vmkf_remap_legacy_mode = true;
19243 kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
19244 FALSE /* copy */,
19245 &copy_map,
19246 &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
19247 vmk_flags);
19248 if (kr != KERN_SUCCESS) {
19249 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19250 //assert(0);
19251 *phys_size = 0;
19252 return kr;
19253 }
19254 assert(copy_map != VM_MAP_COPY_NULL);
19255 target_copy_map = copy_map;
19256 DEBUG4K_ADJUST("adjusting...\n");
19257 kr = vm_map_copy_adjust_to_target(
19258 copy_map,
19259 start - adjusted_start, /* offset */
19260 size, /* size */
19261 kernel_map,
19262 FALSE, /* copy */
19263 &target_copy_map,
19264 &overmap_start,
19265 &overmap_end,
19266 &trimmed_start);
19267 if (kr == KERN_SUCCESS) {
19268 if (target_copy_map->size != *phys_size) {
19269 DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
19270 }
19271 *phys_size = target_copy_map->size;
19272 } else {
19273 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19274 //assert(0);
19275 *phys_size = 0;
19276 }
19277 vm_map_copy_discard(copy_map);
19278 copy_map = VM_MAP_COPY_NULL;
19279
19280 return kr;
19281 }
19282
19283 static __attribute__((always_inline, warn_unused_result))
19284 kern_return_t
19285 vm_map_remap_sanitize(
19286 vm_map_t src_map,
19287 vm_map_t target_map,
19288 vm_map_address_ut address_u,
19289 vm_map_size_ut size_u,
19290 vm_map_offset_ut mask_u,
19291 vm_map_offset_ut memory_address_u,
19292 vm_prot_ut cur_protection_u,
19293 vm_prot_ut max_protection_u,
19294 vm_inherit_ut inheritance_u,
19295 vm_map_kernel_flags_t vmk_flags,
19296 vm_map_address_t *target_addr,
19297 vm_map_address_t *mask,
19298 vm_map_offset_t *memory_address,
19299 vm_map_offset_t *memory_end,
19300 vm_map_size_t *memory_size,
19301 vm_prot_t *cur_protection,
19302 vm_prot_t *max_protection,
19303 vm_inherit_t *inheritance)
19304 {
19305 kern_return_t result;
19306 vm_sanitize_flags_t vm_sanitize_flags;
19307
19308 result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_VM_MAP_REMAP,
19309 inheritance);
19310 if (__improbable(result != KERN_SUCCESS)) {
19311 return result;
19312 }
19313
19314 result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
19315 VM_SANITIZE_CALLER_VM_MAP_REMAP, target_map,
19316 cur_protection, max_protection);
19317 if (__improbable(result != KERN_SUCCESS)) {
19318 return result;
19319 }
19320
19321 result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_VM_MAP_REMAP, mask);
19322 if (__improbable(result != KERN_SUCCESS)) {
19323 return result;
19324 }
19325
19326 /*
19327 * If the user is requesting that we return the address of the
19328 * first byte of the data (rather than the base of the page),
19329 * then we use different rounding semantics: specifically,
19330 * we assume that (memory_address, size) describes a region
19331 * all of whose pages we must cover, rather than a base to be truncated
19332 * down and a size to be added to that base. So we figure out
19333 * the highest page that the requested region includes and make
19334 * sure that the size will cover it.
19335 *
19336 * The key example we're worried about is of the form:
19337 *
19338 * memory_address = 0x1ff0, size = 0x20
19339 *
19340 * With the old semantics, we round down the memory_address to 0x1000
19341 * and round up the size to 0x1000, resulting in our covering *only*
19342 * page 0x1000. With the new semantics, we'd realize that the region covers
19343 * 0x1ff0-0x2010, and compute a size of 0x2000. Thus, we cover both page
19344 * 0x1000 and page 0x2000 in the region we remap.
19345 *
19346 * VM_SANITIZE_FLAGS_REALIGN_START asks for the old (broken) semantics.
19347 */
19348 vm_sanitize_flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS;
19349 if (!vmk_flags.vmf_return_data_addr) {
19350 vm_sanitize_flags |= VM_SANITIZE_FLAGS_REALIGN_START;
19351 }
19352
19353 result = vm_sanitize_addr_size(memory_address_u, size_u,
19354 VM_SANITIZE_CALLER_VM_MAP_REMAP, src_map,
19355 vm_sanitize_flags, memory_address, memory_end,
19356 memory_size);
19357 if (__improbable(result != KERN_SUCCESS)) {
19358 return result;
19359 }
19360
19361 *target_addr = vm_sanitize_addr(target_map, address_u);
19362 return KERN_SUCCESS;
19363 }
19364
19365 /*
19366 * Routine: vm_remap
19367 *
19368 * Map a portion of a task's address space.
19369 * Mapped region must not overlap more than
19370 * one vm memory object. Protections and
19371 * inheritance attributes remain the same
19372 * as in the original task and are out parameters.
19373 * Source and target tasks can be identical.
19374 * Other attributes are identical to those for vm_map().
19375 */
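/*
 * For intuition, the user-visible interface that ends up here is
 * mach_vm_remap(); a minimal user-space sketch (illustrative only,
 * error handling elided, "size" and "src_addr" supplied by the caller):
 *
 *	mach_vm_address_t target = 0;
 *	vm_prot_t cur = VM_PROT_NONE, max = VM_PROT_NONE;
 *	kr = mach_vm_remap(mach_task_self(), &target, size, 0,
 *	    VM_FLAGS_ANYWHERE | VM_FLAGS_RETURN_DATA_ADDR,
 *	    mach_task_self(), src_addr, FALSE,
 *	    &cur, &max, VM_INHERIT_DEFAULT);
 */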
19376 kern_return_t
19377 vm_map_remap(
19378 vm_map_t target_map,
19379 vm_map_address_ut *address_u,
19380 vm_map_size_ut size_u,
19381 vm_map_offset_ut mask_u,
19382 vm_map_kernel_flags_t vmk_flags,
19383 vm_map_t src_map,
19384 vm_map_offset_ut memory_address_u,
19385 boolean_t copy,
19386 vm_prot_ut *cur_protection_u, /* IN/OUT */
19387 vm_prot_ut *max_protection_u, /* IN/OUT */
19388 vm_inherit_ut inheritance_u)
19389 {
19390 vm_map_address_t target_addr, mask;
19391 vm_map_size_t target_size;
19392 vm_map_offset_t memory_address, memory_end;
19393 vm_map_size_t memory_size;
19394 vm_prot_t cur_protection, max_protection;
19395 vm_inherit_t inheritance;
19396 kern_return_t result;
19397 vm_map_entry_t insp_entry = VM_MAP_ENTRY_NULL;
19398 vm_map_copy_t copy_map;
19399 vm_map_offset_t offset_in_mapping;
19400 vm_map_size_t src_page_mask, target_page_mask;
19401 vm_map_size_t initial_size;
19402 VM_MAP_ZAP_DECLARE(zap_list);
19403
19404 if (target_map == VM_MAP_NULL || src_map == VM_MAP_NULL) {
19405 return KERN_INVALID_ARGUMENT;
19406 }
19407 src_page_mask = VM_MAP_PAGE_MASK(src_map);
19408 target_page_mask = VM_MAP_PAGE_MASK(target_map);
19409
19410 if (src_page_mask != target_page_mask) {
19411 if (copy) {
19412 DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19413 } else {
19414 DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19415 }
19416 }
19417
19418 /*
19419 * Sanitize any input parameters that are addr/size/prot/inherit
19420 */
19421 result = vm_map_remap_sanitize(src_map,
19422 target_map,
19423 *address_u,
19424 size_u,
19425 mask_u,
19426 memory_address_u,
19427 *cur_protection_u,
19428 *max_protection_u,
19429 inheritance_u,
19430 vmk_flags,
19431 &target_addr,
19432 &mask,
19433 &memory_address,
19434 &memory_end,
19435 &memory_size,
19436 &cur_protection,
19437 &max_protection,
19438 &inheritance);
19439 if (__improbable(result != KERN_SUCCESS)) {
19440 return vm_sanitize_get_kr(result);
19441 }
19442
19443 if (vmk_flags.vmf_return_data_addr) {
19444 /*
19445 * This is safe to unwrap now that the quantities
19446 * have been validated and rounded up normally.
19447 */
19448 offset_in_mapping = vm_sanitize_offset_in_page(src_map,
19449 memory_address_u);
19450 initial_size = VM_SANITIZE_UNSAFE_UNWRAP(size_u);
19451 } else {
19452 /*
19453 * IMPORTANT:
19454 * This legacy code path is broken: for the range mentioned
19455 * above [ memory_address = 0x1ff0, size = 0x20 ], which spans
19456 * two 4k pages, it yields [ memory_address = 0x1000,
19457 * size = 0x1000 ], which covers only the first 4k page.
19458 * BUT some code unfortunately depends on this bug, so we
19459 * can't fix it without breaking something.
19460 * New code should automatically get opted into the new
19461 * behavior by passing the new VM_FLAGS_RETURN_DATA_ADDR flag.
19462 */
19463 offset_in_mapping = 0;
19464 initial_size = memory_size;
19465 }
19466
19467 if (vmk_flags.vmf_resilient_media) {
19468 /* must be copy-on-write to be "media resilient" */
19469 if (!copy) {
19470 return KERN_INVALID_ARGUMENT;
19471 }
19472 }
19473
19474 vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
19475 vmk_flags.vmkf_copy_same_map = (src_map == target_map);
19476
19477 assert(memory_size != 0);
19478 result = vm_map_copy_extract(src_map,
19479 memory_address,
19480 memory_size,
19481 copy, &copy_map,
19482 &cur_protection, /* IN/OUT */
19483 &max_protection, /* IN/OUT */
19484 inheritance,
19485 vmk_flags);
19486 if (result != KERN_SUCCESS) {
19487 return result;
19488 }
19489 assert(copy_map != VM_MAP_COPY_NULL);
19490
19491 /*
19492 * Handle the policy for vm map ranges
19493 *
19494 * If the maps differ, the target_map policy applies like for vm_map()
19495 * For same mapping remaps, we preserve the range.
19496 */
19497 if (vmk_flags.vmkf_copy_same_map) {
19498 vmk_flags.vmkf_range_id = copy_map->orig_range;
19499 } else {
19500 vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, memory_size);
19501 }
19502
19503 target_size = memory_size;
19504 if (src_page_mask != target_page_mask) {
19505 vm_map_copy_t target_copy_map;
19506 vm_map_offset_t overmap_start = 0;
19507 vm_map_offset_t overmap_end = 0;
19508 vm_map_offset_t trimmed_start = 0;
19509
19510 target_copy_map = copy_map; /* can modify "copy_map" itself */
19511 DEBUG4K_ADJUST("adjusting...\n");
19512 result = vm_map_copy_adjust_to_target(
19513 copy_map,
19514 offset_in_mapping, /* offset */
19515 initial_size,
19516 target_map,
19517 copy,
19518 &target_copy_map,
19519 &overmap_start,
19520 &overmap_end,
19521 &trimmed_start);
19522 if (result != KERN_SUCCESS) {
19523 DEBUG4K_COPY("failed to adjust 0x%x\n", result);
19524 vm_map_copy_discard(copy_map);
19525 return result;
19526 }
19527 if (trimmed_start == 0) {
19528 /* nothing trimmed: no adjustment needed */
19529 } else if (trimmed_start >= offset_in_mapping) {
19530 /* trimmed more than offset_in_mapping: nothing left */
19531 assert(overmap_start == 0);
19532 assert(overmap_end == 0);
19533 offset_in_mapping = 0;
19534 } else {
19535 /* trimmed some of offset_in_mapping: adjust */
19536 assert(overmap_start == 0);
19537 assert(overmap_end == 0);
19538 offset_in_mapping -= trimmed_start;
19539 }
19540 offset_in_mapping += overmap_start;
19541 target_size = target_copy_map->size;
19542 }
19543
19544 /*
19545 * Allocate/check a range of free virtual address
19546 * space for the target
19547 */
19548 target_size = vm_map_round_page(target_size, target_page_mask);
19549
19550 if (target_size == 0) {
19551 vm_map_copy_discard(copy_map);
19552 return KERN_INVALID_ARGUMENT;
19553 }
19554
19555 if (__improbable(!vm_map_is_map_size_valid(
19556 target_map, target_size, vmk_flags.vmkf_no_soft_limit))) {
19557 vm_map_copy_discard(copy_map);
19558 return KERN_NO_SPACE;
19559 }
19560
19561 vm_map_lock(target_map);
19562
19563 if (!vmk_flags.vmf_fixed) {
19564 result = vm_map_locate_space_anywhere(target_map, target_size,
19565 mask, vmk_flags, &target_addr, &insp_entry);
19566 } else {
19567 /*
19568 * vm_map_locate_space_fixed will reject overflowing
19569 * target_addr + target_size values
19570 */
19571 result = vm_map_locate_space_fixed(target_map, target_addr,
19572 target_size, mask, vmk_flags, &insp_entry, &zap_list);
19573
19574 if (result == KERN_MEMORY_PRESENT) {
19575 assert(!vmk_flags.vmkf_already);
19576 insp_entry = VM_MAP_ENTRY_NULL;
19577 result = KERN_NO_SPACE;
19578 }
19579 }
19580
19581 if (result == KERN_SUCCESS) {
19582 while (vm_map_copy_first_entry(copy_map) !=
19583 vm_map_copy_to_entry(copy_map)) {
19584 vm_map_entry_t entry = vm_map_copy_first_entry(copy_map);
19585
19586 vm_map_copy_entry_unlink(copy_map, entry);
19587
19588 if (vmk_flags.vmkf_remap_prot_copy) {
19589 /*
19590 * This vm_map_remap() is for a
19591 * vm_protect(VM_PROT_COPY), so the caller
19592 * expects to be allowed to add write access
19593 * to this new mapping. This is done by
19594 * adding VM_PROT_WRITE to each entry's
19595 * max_protection... unless some security
19596 * settings disallow it.
19597 */
19598 bool allow_write = false;
19599 if (entry->vme_permanent) {
19600 /* immutable mapping... */
19601 if ((entry->max_protection & VM_PROT_EXECUTE) &&
19602 developer_mode_state()) {
19603 /*
19604 * ... but executable and
19605 * possibly being debugged,
19606 * so let's allow it to become
19607 * writable, for breakpoints
19608 * and dtrace probes, for
19609 * example.
19610 */
19611 allow_write = true;
19612 } else {
19613 printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
19614 proc_selfpid(),
19615 (get_bsdtask_info(current_task())
19616 ? proc_name_address(get_bsdtask_info(current_task()))
19617 : "?"),
19618 (uint64_t)memory_address,
19619 (uint64_t)memory_size,
19620 entry->protection,
19621 entry->max_protection,
19622 developer_mode_state());
19623 DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
19624 vm_map_entry_t, entry,
19625 vm_map_offset_t, entry->vme_start,
19626 vm_map_offset_t, entry->vme_end,
19627 vm_prot_t, entry->protection,
19628 vm_prot_t, entry->max_protection,
19629 int, VME_ALIAS(entry));
19630 }
19631 } else {
19632 allow_write = true;
19633 }
19634
19635 /*
19636 * VM_PROT_COPY: allow this mapping to become
19637 * writable, unless it was "permanent".
19638 */
19639 if (allow_write) {
19640 entry->max_protection |= VM_PROT_WRITE;
19641 }
19642 }
19643 if (vmk_flags.vmf_resilient_codesign) {
19644 /* no codesigning -> read-only access */
19645 entry->max_protection = VM_PROT_READ;
19646 entry->protection = VM_PROT_READ;
19647 entry->vme_resilient_codesign = TRUE;
19648 }
19649 entry->vme_start += target_addr;
19650 entry->vme_end += target_addr;
19651 assert(!entry->map_aligned);
19652 if (vmk_flags.vmf_resilient_media &&
19653 !entry->is_sub_map &&
19654 (VME_OBJECT(entry) == VM_OBJECT_NULL ||
19655 VME_OBJECT(entry)->internal)) {
19656 entry->vme_resilient_media = TRUE;
19657 }
19658 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
19659 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
19660 assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
19661 vm_map_store_entry_link(target_map, insp_entry, entry,
19662 vmk_flags);
19663 insp_entry = entry;
19664 }
19665 }
19666
19667 if (vmk_flags.vmf_resilient_codesign) {
19668 cur_protection = VM_PROT_READ;
19669 max_protection = VM_PROT_READ;
19670 }
19671
19672 if (result == KERN_SUCCESS) {
19673 target_map->size += target_size;
19674 SAVE_HINT_MAP_WRITE(target_map, insp_entry);
19675 }
19676 vm_map_unlock(target_map);
19677
19678 vm_map_zap_dispose(&zap_list);
19679
19680 if (result == KERN_SUCCESS && target_map->wiring_required) {
19681 result = vm_map_wire_nested(target_map, target_addr,
19682 target_addr + target_size, cur_protection, VM_KERN_MEMORY_MLOCK,
19683 TRUE, PMAP_NULL, 0, NULL);
19684 }
19685
19686 if (result == KERN_SUCCESS) {
19687 #if KASAN
19688 if (target_map->pmap == kernel_pmap) {
19689 kasan_notify_address(target_addr, target_size);
19690 }
19691 #endif
19692 /*
19693 * If requested, return the address of the data pointed to by the
19694 * request, rather than the base of the resulting page.
19695 */
19696 if (vmk_flags.vmf_return_data_addr) {
19697 target_addr += offset_in_mapping;
19698 }
19699
19700 /*
19701 * Update OUT parameters.
19702 */
19703 *address_u = vm_sanitize_wrap_addr(target_addr);
19704
19705 *cur_protection_u = vm_sanitize_wrap_prot(cur_protection);
19706 *max_protection_u = vm_sanitize_wrap_prot(max_protection);
19707 }
19708
19709 if (src_page_mask != target_page_mask) {
19710 DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)target_size, copy, target_map, (uint64_t)target_addr, (uint64_t)offset_in_mapping, result);
19711 }
19712 vm_map_copy_discard(copy_map);
19713 copy_map = VM_MAP_COPY_NULL;
19714
19715 return result;
19716 }
19717
19718 /*
19719 * vm_map_switch_to:
19720 *
19721 * Set the address map for the current thread to the specified map.
19722 * Returns a struct containing info about the previous map, which should be
19723 * restored with `vm_map_switch_back`
19724 */
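/*
 * Typical usage (a sketch mirroring vm_map_write_user()/vm_map_read_user()
 * below; no new API):
 *
 *	vm_map_reference(map);
 *	ctx = vm_map_switch_to(map);
 *	(copyin()/copyout() now operate against "map")
 *	vm_map_switch_back(ctx);
 *	vm_map_deallocate(map);
 */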
19725
19726 vm_map_switch_context_t
19727 vm_map_switch_to(vm_map_t map)
19728 {
19729 thread_t thread = current_thread();
19730 vm_map_t oldmap = thread->map;
19731
19732 /*
19733 * Deactivate the current map and activate the requested map
19734 */
19735 mp_disable_preemption();
19736 PMAP_SWITCH_USER(thread, map, cpu_number());
19737 mp_enable_preemption();
19738
19739 vm_map_lock(map);
19740 task_t task = map->owning_task;
19741 if (task) {
19742 task_reference(task);
19743 }
19744 vm_map_unlock(map);
19745
19746 return (vm_map_switch_context_t) { oldmap, task };
19747 }
19748
19749 void
19750 vm_map_switch_back(vm_map_switch_context_t ctx)
19751 {
19752 thread_t thread = current_thread();
19753 task_t task = ctx.task;
19754 vm_map_t map = ctx.map;
19755
19756 if (task) {
19757 task_deallocate(task);
19758 } else {
19759 /*
19760 * We want to make sure that vm_map_setup was not called while the
19761 * map was switched. This allows us to guarantee the property that
19762 * we always have a reference on current_map()->owning_task if it is
19763 * not NULL.
19764 */
19765 assert(!thread->map->owning_task);
19766 }
19767
19768 /*
19769 * Restore the original map from prior to vm_map_switch_to
19770 */
19771 mp_disable_preemption();
19772 PMAP_SWITCH_USER(thread, map, cpu_number());
19773 mp_enable_preemption();
19774 }
19775
19776 static __attribute__((always_inline, warn_unused_result))
19777 kern_return_t
19778 vm_map_rw_user_sanitize(
19779 vm_map_t map,
19780 vm_map_address_ut addr_u,
19781 vm_size_ut size_u,
19782 vm_sanitize_caller_t vm_sanitize_caller,
19783 vm_map_address_t *addr,
19784 vm_map_address_t *end,
19785 vm_map_size_t *size)
19786 {
19787 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
19788 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES |
19789 VM_SANITIZE_FLAGS_CHECK_ADDR_RANGE;
19790
19791 return vm_sanitize_addr_size(addr_u, size_u,
19792 vm_sanitize_caller, map,
19793 flags,
19794 addr, end, size);
19795 }
19796
19797 /*
19798 * Routine: vm_map_write_user
19799 *
19800 * Description:
19801 * Copy out data from a kernel space into space in the
19802 * destination map. The space must already exist in the
19803 * destination map.
19804 * NOTE: This routine should only be called by threads
19805 * which can block on a page fault, i.e. kernel-mode user
19806 * threads.
19807 *
19808 */
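/*
 * Hypothetical caller sketch (assumes "dst_addr_u" and "size_u" arrive
 * already wrapped from the trap/MIG layer):
 *
 *	uint32_t value = ...;
 *	kr = vm_map_write_user(task_map, &value, dst_addr_u, size_u);
 *	(KERN_INVALID_ADDRESS means the destination page was not writable)
 */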
19809 kern_return_t
19810 vm_map_write_user(
19811 vm_map_t map,
19812 void *src_p,
19813 vm_map_address_ut dst_addr_u,
19814 vm_size_ut size_u)
19815 {
19816 kern_return_t kr;
19817 vm_map_address_t dst_addr, dst_end;
19818 vm_map_size_t size;
19819
19820 /*
19821 * src_p isn't validated: [src_p, src_p + size_u)
19822 * is trusted kernel input.
19823 *
19824 * dst_addr_u and size_u are untrusted and need to be sanitized.
19825 */
19826 kr = vm_map_rw_user_sanitize(map,
19827 dst_addr_u,
19828 size_u,
19829 VM_SANITIZE_CALLER_VM_MAP_WRITE_USER,
19830 &dst_addr,
19831 &dst_end,
19832 &size);
19833 if (__improbable(kr != KERN_SUCCESS)) {
19834 return vm_sanitize_get_kr(kr);
19835 }
19836
19837 if (current_map() == map) {
19838 if (copyout(src_p, dst_addr, size)) {
19839 kr = KERN_INVALID_ADDRESS;
19840 }
19841 } else {
19842 vm_map_switch_context_t switch_ctx;
19843
19844 /* take on the identity of the target map while doing */
19845 /* the transfer */
19846
19847 vm_map_reference(map);
19848 switch_ctx = vm_map_switch_to(map);
19849 if (copyout(src_p, dst_addr, size)) {
19850 kr = KERN_INVALID_ADDRESS;
19851 }
19852 vm_map_switch_back(switch_ctx);
19853 vm_map_deallocate(map);
19854 }
19855 return kr;
19856 }
19857
19858 /*
19859 * Routine: vm_map_read_user
19860 *
19861 * Description:
19862 * Copy in data from a user space source map into the
19863 * kernel map. The space must already exist in the
19864 * kernel map.
19865 * NOTE: This routine should only be called by threads
19866  *	which can block on a page fault, i.e. kernel-mode user
19867 * threads.
19868 *
19869 */
19870 kern_return_t
19871 vm_map_read_user(
19872 vm_map_t map,
19873 vm_map_address_ut src_addr_u,
19874 void *dst_p,
19875 vm_size_ut size_u)
19876 {
19877 kern_return_t kr;
19878 vm_map_address_t src_addr, src_end;
19879 vm_map_size_t size;
19880
19881 /*
19882 * dst_p isn't validated: [dst_p, dst_p + size_u)
19883 * is trusted kernel input.
19884 *
19885 * src_addr_u and size_u are untrusted and need to be sanitized.
19886 */
19887 kr = vm_map_rw_user_sanitize(map,
19888 src_addr_u,
19889 size_u,
19890 VM_SANITIZE_CALLER_VM_MAP_READ_USER,
19891 &src_addr,
19892 &src_end,
19893 &size);
19894 if (__improbable(kr != KERN_SUCCESS)) {
19895 return vm_sanitize_get_kr(kr);
19896 }
19897
19898 if (current_map() == map) {
19899 if (copyin(src_addr, dst_p, size)) {
19900 kr = KERN_INVALID_ADDRESS;
19901 }
19902 } else {
19903 vm_map_switch_context_t switch_ctx;
19904
19905 /* take on the identity of the target map while doing */
19906 /* the transfer */
19907
19908 vm_map_reference(map);
19909 switch_ctx = vm_map_switch_to(map);
19910 if (copyin(src_addr, dst_p, size)) {
19911 kr = KERN_INVALID_ADDRESS;
19912 }
19913 vm_map_switch_back(switch_ctx);
19914 vm_map_deallocate(map);
19915 }
19916 return kr;
19917 }
19918
19919
19920 static __attribute__((always_inline, warn_unused_result))
19921 kern_return_t
19922 vm_map_check_protection_sanitize(
19923 vm_map_t map,
19924 vm_map_offset_ut start_u,
19925 vm_map_offset_ut end_u,
19926 vm_prot_ut protection_u,
19927 vm_sanitize_caller_t vm_sanitize_caller,
19928 vm_map_offset_t *start,
19929 vm_map_offset_t *end,
19930 vm_prot_t *protection)
19931 {
19932 kern_return_t kr;
19933 vm_map_size_t size;
19934
19935 kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
19936 VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start, end,
19937 &size);
19938 if (__improbable(kr != KERN_SUCCESS)) {
19939 return kr;
19940 }
19941
19942 /*
19943 * Given that the protection is used only for comparisons below
19944 * no sanitization is being applied on it.
19945 */
19946 *protection = VM_SANITIZE_UNSAFE_UNWRAP(protection_u);
19947
19948 return KERN_SUCCESS;
19949 }
19950
19951 /*
19952 * vm_map_check_protection:
19953 *
19954 * Assert that the target map allows the specified
19955 * privilege on the entire address region given.
19956 * The entire region must be allocated.
19957 */
19958 boolean_t
19959 vm_map_check_protection(
19960 vm_map_t map,
19961 vm_map_offset_ut start_u,
19962 vm_map_offset_ut end_u,
19963 vm_prot_ut protection_u,
19964 vm_sanitize_caller_t vm_sanitize_caller)
19965 {
19966 vm_map_entry_t entry;
19967 vm_map_entry_t tmp_entry;
19968 vm_map_offset_t start;
19969 vm_map_offset_t end;
19970 vm_prot_t protection;
19971 kern_return_t kr;
19972
19973 kr = vm_map_check_protection_sanitize(map,
19974 start_u,
19975 end_u,
19976 protection_u,
19977 vm_sanitize_caller,
19978 &start,
19979 &end,
19980 &protection);
19981 if (__improbable(kr != KERN_SUCCESS)) {
19982 kr = vm_sanitize_get_kr(kr);
19983 if (kr == KERN_SUCCESS) {
19984 return true;
19985 }
19986 return false;
19987 }
19988
19989 vm_map_lock(map);
19990
19991 if (start < vm_map_min(map) || end > vm_map_max(map)) {
19992 vm_map_unlock(map);
19993 return false;
19994 }
19995
19996 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19997 vm_map_unlock(map);
19998 return false;
19999 }
20000
20001 entry = tmp_entry;
20002
20003 while (start < end) {
20004 if (entry == vm_map_to_entry(map)) {
20005 vm_map_unlock(map);
20006 return false;
20007 }
20008
20009 /*
20010 * No holes allowed!
20011 */
20012
20013 if (start < entry->vme_start) {
20014 vm_map_unlock(map);
20015 return false;
20016 }
20017
20018 /*
20019 * Check protection associated with entry.
20020 */
20021
20022 if ((entry->protection & protection) != protection) {
20023 vm_map_unlock(map);
20024 return false;
20025 }
20026
20027 /* go to next entry */
20028
20029 start = entry->vme_end;
20030 entry = entry->vme_next;
20031 }
20032 vm_map_unlock(map);
20033 return true;
20034 }
20035
20036 kern_return_t
20037 vm_map_purgable_control(
20038 vm_map_t map,
20039 vm_map_offset_ut address_u,
20040 vm_purgable_t control,
20041 int *state)
20042 {
20043 vm_map_offset_t address;
20044 vm_map_entry_t entry;
20045 vm_object_t object;
20046 kern_return_t kr;
20047 boolean_t was_nonvolatile;
20048
20049 /*
20050 * Vet all the input parameters and current type and state of the
20051  * underlying object. Return with an error if anything is amiss.
20052 */
20053 if (map == VM_MAP_NULL) {
20054 return KERN_INVALID_ARGUMENT;
20055 }
20056
20057 if (control != VM_PURGABLE_SET_STATE &&
20058 control != VM_PURGABLE_GET_STATE &&
20059 control != VM_PURGABLE_PURGE_ALL &&
20060 control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
20061 return KERN_INVALID_ARGUMENT;
20062 }
20063
20064 if (control == VM_PURGABLE_PURGE_ALL) {
20065 vm_purgeable_object_purge_all();
20066 return KERN_SUCCESS;
20067 }
20068
20069 if ((control == VM_PURGABLE_SET_STATE ||
20070 control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
20071 (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
20072 ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
20073 return KERN_INVALID_ARGUMENT;
20074 }
20075
20076 address = vm_sanitize_addr(map, address_u);
20077
20078 vm_map_lock_read(map);
20079
20080 if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
20081 /*
20082 * Must pass a valid non-submap address.
20083 */
20084 vm_map_unlock_read(map);
20085 return KERN_INVALID_ADDRESS;
20086 }
20087
20088 if ((entry->protection & VM_PROT_WRITE) == 0 &&
20089 control != VM_PURGABLE_GET_STATE) {
20090 /*
20091 * Can't apply purgable controls to something you can't write.
20092 */
20093 vm_map_unlock_read(map);
20094 return KERN_PROTECTION_FAILURE;
20095 }
20096
20097 object = VME_OBJECT(entry);
20098 if (object == VM_OBJECT_NULL ||
20099 object->purgable == VM_PURGABLE_DENY) {
20100 /*
20101 * Object must already be present and be purgeable.
20102 */
20103 vm_map_unlock_read(map);
20104 return KERN_INVALID_ARGUMENT;
20105 }
20106
20107 vm_object_lock(object);
20108
20109 #if 00
20110 if (VME_OFFSET(entry) != 0 ||
20111 entry->vme_end - entry->vme_start != object->vo_size) {
20112 /*
20113 * Can only apply purgable controls to the whole (existing)
20114 * object at once.
20115 */
20116 vm_map_unlock_read(map);
20117 vm_object_unlock(object);
20118 return KERN_INVALID_ARGUMENT;
20119 }
20120 #endif
20121
20122 assert(!entry->is_sub_map);
20123 assert(!entry->use_pmap); /* purgeable has its own accounting */
20124
20125 vm_map_unlock_read(map);
20126
20127 was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);
20128
20129 kr = vm_object_purgable_control(object, control, state);
20130
20131 if (was_nonvolatile &&
20132 object->purgable != VM_PURGABLE_NONVOLATILE &&
20133 map->pmap == kernel_pmap) {
20134 #if DEBUG
20135 object->vo_purgeable_volatilizer = kernel_task;
20136 #endif /* DEBUG */
20137 }
20138
20139 vm_object_unlock(object);
20140
20141 return kr;
20142 }
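/*
 * Hedged usage sketch (not part of the original source): marking an existing
 * purgeable region volatile via vm_map_purgable_control(). The "address_u"
 * value (an unsanitized address arriving at the MIG/trap boundary) and the
 * in/out behaviour of "state" are assumptions for illustration.
 */
#if 0 /* illustrative example, not compiled */
	int state = VM_PURGABLE_VOLATILE;   /* requested state in, previous state out */
	kern_return_t kr;

	kr = vm_map_purgable_control(map, address_u, VM_PURGABLE_SET_STATE, &state);
	if (kr == KERN_SUCCESS) {
		/* the object backing address_u is now volatile */
	}
#endif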
20143
20144 void
20145 vm_map_footprint_query_page_info(
20146 vm_map_t map,
20147 vm_map_entry_t map_entry,
20148 vm_map_offset_t curr_s_offset,
20149 int *disposition_p)
20150 {
20151 int pmap_disp;
20152 vm_object_t object = VM_OBJECT_NULL;
20153 int disposition;
20154 int effective_page_size;
20155
20156 vm_map_lock_assert_held(map);
20157 assert(!map->has_corpse_footprint);
20158 assert(curr_s_offset >= map_entry->vme_start);
20159 assert(curr_s_offset < map_entry->vme_end);
20160
20161 if (map_entry->is_sub_map) {
20162 if (!map_entry->use_pmap) {
20163 /* nested pmap: no footprint */
20164 *disposition_p = 0;
20165 return;
20166 }
20167 } else {
20168 object = VME_OBJECT(map_entry);
20169 if (object == VM_OBJECT_NULL) {
20170 /* nothing mapped here: no need to ask */
20171 *disposition_p = 0;
20172 return;
20173 }
20174 }
20175
20176 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
20177
20178 pmap_disp = 0;
20179
20180 /*
20181 * Query the pmap.
20182 */
20183 pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);
20184
20185 /*
20186 * Compute this page's disposition.
20187 */
20188 disposition = 0;
20189
20190 /* deal with "alternate accounting" first */
20191 if (!map_entry->is_sub_map &&
20192 object->vo_no_footprint) {
20193 /* does not count in footprint */
20194 // assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20195 } else if (!map_entry->is_sub_map &&
20196 !object->internal &&
20197 object->vo_ledger_tag &&
20198 VM_OBJECT_OWNER(object) != NULL &&
20199 VM_OBJECT_OWNER(object)->map == map) {
20200 /* owned external object: wired pages count in footprint */
20201 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20202 if ((((curr_s_offset
20203 - map_entry->vme_start
20204 + VME_OFFSET(map_entry))
20205 / effective_page_size) <
20206 object->wired_page_count)) {
20207 /*
20208 * External object owned by this task: report the first
20209 * "#wired" pages as "resident" (to show that they
20210 * contribute to the footprint) but not "dirty"
20211 * (to avoid double-counting with the fake "owned"
20212 * region we'll report at the end of the address space
20213 			 * to account for all (mapped or not) memory
20214 			 * owned by this task).
20215 */
20216 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20217 }
20218 } else if (!map_entry->is_sub_map &&
20219 object->internal &&
20220 (object->purgable == VM_PURGABLE_NONVOLATILE ||
20221 (object->purgable == VM_PURGABLE_DENY &&
20222 object->vo_ledger_tag)) &&
20223 VM_OBJECT_OWNER(object) != NULL &&
20224 VM_OBJECT_OWNER(object)->map == map) {
20225 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20226 if ((((curr_s_offset
20227 - map_entry->vme_start
20228 + VME_OFFSET(map_entry))
20229 / effective_page_size) <
20230 (object->resident_page_count +
20231 vm_compressor_pager_get_count(object->pager)))) {
20232 /*
20233 * Non-volatile purgeable object owned
20234 * by this task: report the first
20235 * "#resident + #compressed" pages as
20236 * "resident" (to show that they
20237 * contribute to the footprint) but not
20238 * "dirty" (to avoid double-counting
20239 * with the fake "non-volatile" region
20240 * we'll report at the end of the
20241 * address space to account for all
20242 * (mapped or not) non-volatile memory
20243 			 * owned by this task).
20244 */
20245 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20246 }
20247 } else if (!map_entry->is_sub_map &&
20248 object->internal &&
20249 (object->purgable == VM_PURGABLE_VOLATILE ||
20250 object->purgable == VM_PURGABLE_EMPTY) &&
20251 VM_OBJECT_OWNER(object) != NULL &&
20252 VM_OBJECT_OWNER(object)->map == map) {
20253 if (object->internal) {
20254 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20255 }
20256 if ((((curr_s_offset
20257 - map_entry->vme_start
20258 + VME_OFFSET(map_entry))
20259 / effective_page_size) <
20260 object->wired_page_count)) {
20261 /*
20262 * Volatile|empty purgeable object owned
20263 * by this task: report the first
20264 * "#wired" pages as "resident" (to
20265 * show that they contribute to the
20266 * footprint) but not "dirty" (to avoid
20267 * double-counting with the fake
20268 * "non-volatile" region we'll report
20269 * at the end of the address space to
20270 * account for all (mapped or not)
20271 * non-volatile memory owned by this
20272 			 * task).
20273 */
20274 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20275 }
20276 } else if (!map_entry->is_sub_map &&
20277 map_entry->iokit_acct &&
20278 object->internal &&
20279 object->purgable == VM_PURGABLE_DENY) {
20280 /*
20281 * Non-purgeable IOKit memory: phys_footprint
20282 * includes the entire virtual mapping.
20283 */
20284 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20285 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20286 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20287 } else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
20288 PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
20289 /* alternate accounting */
20290 #if __arm64__ && (DEVELOPMENT || DEBUG)
20291 if (map->pmap->footprint_was_suspended) {
20292 /*
20293 * The assertion below can fail if dyld
20294 * suspended footprint accounting
20295 * while doing some adjustments to
20296 * this page; the mapping would say
20297 * "use pmap accounting" but the page
20298 * would be marked "alternate
20299 * accounting".
20300 */
20301 } else
20302 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
20303 {
20304 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20305 }
20306 disposition = 0;
20307 } else {
20308 if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
20309 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20310 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20311 disposition |= VM_PAGE_QUERY_PAGE_REF;
20312 if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
20313 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20314 } else {
20315 disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20316 }
20317 if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
20318 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20319 }
20320 } else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
20321 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20322 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20323 }
20324 }
20325
20326 *disposition_p = disposition;
20327 }
20328
20329 kern_return_t
20330 vm_map_page_info(
20331 vm_map_t map,
20332 vm_map_offset_ut offset_u,
20333 vm_page_info_flavor_t flavor,
20334 vm_page_info_t info,
20335 mach_msg_type_number_t *count)
20336 {
20337 return vm_map_page_range_info_internal(map,
20338 offset_u, /* start of range */
20339 vm_sanitize_compute_ut_end(offset_u, 1), /* this will get rounded in the call to the page boundary */
20340 (int)-1, /* effective_page_shift: unspecified */
20341 flavor,
20342 info,
20343 count);
20344 }
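/*
 * Hedged usage sketch (not part of the original source): querying the basic
 * page-level info for a single user address. The origin of "addr_u" (an
 * unsanitized address handed in at the MIG boundary) is an assumption.
 */
#if 0 /* illustrative example, not compiled */
	struct vm_page_info_basic basic;
	mach_msg_type_number_t count = VM_PAGE_INFO_BASIC_COUNT;
	kern_return_t kr;

	kr = vm_map_page_info(map, addr_u, VM_PAGE_INFO_BASIC,
	    (vm_page_info_t)&basic, &count);
	if (kr == KERN_SUCCESS && (basic.disposition & VM_PAGE_QUERY_PAGE_PRESENT)) {
		/* the page backing addr_u is resident */
	}
#endif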
20345
20346 static __attribute__((always_inline, warn_unused_result))
20347 kern_return_t
20348 vm_map_page_range_info_sanitize(
20349 vm_map_t map,
20350 vm_map_offset_ut start_offset_u,
20351 vm_map_offset_ut end_offset_u,
20352 vm_map_offset_t effective_page_mask,
20353 vm_map_offset_t *start,
20354 vm_map_offset_t *end,
20355 vm_map_offset_t *offset_in_page)
20356 {
20357 kern_return_t retval;
20358 vm_map_size_t size;
20359
20360 /*
20361 	 * Perform validation against the map's mask but don't align start/end,
20362 	 * as we need those to be aligned with respect to effective_page_mask.
20363 */
20364 retval = vm_sanitize_addr_end(start_offset_u, end_offset_u,
20365 VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, map,
20366 VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
20367 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES, start,
20368 end, &size);
20369 if (retval != KERN_SUCCESS) {
20370 return retval;
20371 }
20372
20373 retval = vm_sanitize_addr_end(start_offset_u, end_offset_u,
20374 VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, effective_page_mask,
20375 VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start,
20376 end, &size);
20377 if (retval != KERN_SUCCESS) {
20378 return retval;
20379 }
20380
20381 *offset_in_page = vm_sanitize_offset_in_page(effective_page_mask,
20382 start_offset_u);
20383
20384 return KERN_SUCCESS;
20385 }
20386
20387 kern_return_t
20388 vm_map_page_range_info_internal(
20389 vm_map_t map,
20390 vm_map_offset_ut start_offset_u,
20391 vm_map_offset_ut end_offset_u,
20392 int effective_page_shift,
20393 vm_page_info_flavor_t flavor,
20394 vm_page_info_t info,
20395 mach_msg_type_number_t *count)
20396 {
20397 vm_map_entry_t map_entry = VM_MAP_ENTRY_NULL;
20398 vm_object_t object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
20399 vm_page_t m = VM_PAGE_NULL;
20400 kern_return_t retval = KERN_SUCCESS;
20401 int disposition = 0;
20402 int ref_count = 0;
20403 int depth = 0, info_idx = 0;
20404 vm_page_info_basic_t basic_info = 0;
20405 vm_map_offset_t offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
20406 vm_map_offset_t start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
20407 boolean_t do_region_footprint;
20408 ledger_amount_t ledger_resident, ledger_compressed;
20409 int effective_page_size;
20410 vm_map_offset_t effective_page_mask;
20411
20412 switch (flavor) {
20413 case VM_PAGE_INFO_BASIC:
20414 if (*count != VM_PAGE_INFO_BASIC_COUNT) {
20415 /*
20416 * The "vm_page_info_basic_data" structure was not
20417 * properly padded, so allow the size to be off by
20418 * one to maintain backwards binary compatibility...
20419 */
20420 if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
20421 return KERN_INVALID_ARGUMENT;
20422 }
20423 }
20424 break;
20425 default:
20426 return KERN_INVALID_ARGUMENT;
20427 }
20428
20429 if (effective_page_shift == -1) {
20430 effective_page_shift = vm_self_region_page_shift_safely(map);
20431 if (effective_page_shift == -1) {
20432 return KERN_INVALID_ARGUMENT;
20433 }
20434 }
20435 effective_page_size = (1 << effective_page_shift);
20436 effective_page_mask = effective_page_size - 1;
20437
20438
20439 retval = vm_map_page_range_info_sanitize(map,
20440 start_offset_u,
20441 end_offset_u,
20442 effective_page_mask,
20443 &start,
20444 &end,
20445 &offset_in_page);
20446 if (retval != KERN_SUCCESS) {
20447 return vm_sanitize_get_kr(retval);
20448 }
20449
20450 assert((end - start) <= MAX_PAGE_RANGE_QUERY);
20451
20452 do_region_footprint = task_self_region_footprint();
20453 disposition = 0;
20454 ref_count = 0;
20455 depth = 0;
20456 info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
20457
20458 vm_map_lock_read(map);
20459 task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
20460
20461 for (curr_s_offset = start; curr_s_offset < end;) {
20462 /*
20463 * New lookup needs reset of these variables.
20464 */
20465 curr_object = object = VM_OBJECT_NULL;
20466 offset_in_object = 0;
20467 ref_count = 0;
20468 depth = 0;
20469
20470 if (do_region_footprint &&
20471 curr_s_offset >= vm_map_last_entry(map)->vme_end) {
20472 /*
20473 * Request for "footprint" info about a page beyond
20474 * the end of address space: this must be for
20475 * the fake region vm_map_region_recurse_64()
20476 * reported to account for non-volatile purgeable
20477 * memory owned by this task.
20478 */
20479 disposition = 0;
20480
20481 if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
20482 (unsigned) ledger_compressed) {
20483 /*
20484 * We haven't reported all the "non-volatile
20485 * compressed" pages yet, so report this fake
20486 * page as "compressed".
20487 */
20488 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20489 } else {
20490 /*
20491 * We've reported all the non-volatile
20492 				 * compressed pages but not all the non-volatile
20493 				 * pages, so report this fake page as
20494 * "resident dirty".
20495 */
20496 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20497 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20498 disposition |= VM_PAGE_QUERY_PAGE_REF;
20499 }
20500 switch (flavor) {
20501 case VM_PAGE_INFO_BASIC:
20502 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20503 basic_info->disposition = disposition;
20504 basic_info->ref_count = 1;
20505 basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20506 basic_info->offset = 0;
20507 basic_info->depth = 0;
20508
20509 info_idx++;
20510 break;
20511 }
20512 curr_s_offset += effective_page_size;
20513 continue;
20514 }
20515
20516 /*
20517 * First, find the map entry covering "curr_s_offset", going down
20518 * submaps if necessary.
20519 */
20520 if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
20521 /* no entry -> no object -> no page */
20522
20523 if (curr_s_offset < vm_map_min(map)) {
20524 /*
20525 * Illegal address that falls below map min.
20526 */
20527 curr_e_offset = MIN(end, vm_map_min(map));
20528 } else if (curr_s_offset >= vm_map_max(map)) {
20529 /*
20530 * Illegal address that falls on/after map max.
20531 */
20532 curr_e_offset = end;
20533 } else if (map_entry == vm_map_to_entry(map)) {
20534 /*
20535 * Hit a hole.
20536 */
20537 if (map_entry->vme_next == vm_map_to_entry(map)) {
20538 /*
20539 * Empty map.
20540 */
20541 curr_e_offset = MIN(map->max_offset, end);
20542 } else {
20543 /*
20544 * Hole at start of the map.
20545 */
20546 curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20547 }
20548 } else {
20549 if (map_entry->vme_next == vm_map_to_entry(map)) {
20550 /*
20551 * Hole at the end of the map.
20552 */
20553 curr_e_offset = MIN(map->max_offset, end);
20554 } else {
20555 curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20556 }
20557 }
20558
20559 assert(curr_e_offset >= curr_s_offset);
20560
20561 uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20562
20563 void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20564
20565 bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20566
20567 curr_s_offset = curr_e_offset;
20568
20569 info_idx += num_pages;
20570
20571 continue;
20572 }
20573
20574 /* compute offset from this map entry's start */
20575 offset_in_object = curr_s_offset - map_entry->vme_start;
20576
20577 /* compute offset into this map entry's object (or submap) */
20578 offset_in_object += VME_OFFSET(map_entry);
20579
20580 if (map_entry->is_sub_map) {
20581 vm_map_t sub_map = VM_MAP_NULL;
20582 vm_page_info_t submap_info = 0;
20583 vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
20584
20585 range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
20586
20587 submap_s_offset = offset_in_object;
20588 submap_e_offset = submap_s_offset + range_len;
20589
20590 sub_map = VME_SUBMAP(map_entry);
20591
20592 vm_map_reference(sub_map);
20593 vm_map_unlock_read(map);
20594
20595 submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20596
20597 assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
20598 "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
20599
20600 retval = vm_map_page_range_info_internal(sub_map,
20601 submap_s_offset,
20602 submap_e_offset,
20603 effective_page_shift,
20604 VM_PAGE_INFO_BASIC,
20605 (vm_page_info_t) submap_info,
20606 count);
20607
20608 assert(retval == KERN_SUCCESS);
20609
20610 vm_map_deallocate(sub_map);
20611 sub_map = VM_MAP_NULL;
20612 vm_map_lock_read(map);
20613
20614 /* Move the "info" index by the number of pages we inspected.*/
20615 info_idx += range_len >> effective_page_shift;
20616
20617 /* Move our current offset by the size of the range we inspected.*/
20618 curr_s_offset += range_len;
20619
20620 continue;
20621 }
20622
20623 object = VME_OBJECT(map_entry);
20624
20625 if (object == VM_OBJECT_NULL) {
20626 /*
20627 * We don't have an object here and, hence,
20628 * no pages to inspect. We'll fill up the
20629 * info structure appropriately.
20630 */
20631
20632 curr_e_offset = MIN(map_entry->vme_end, end);
20633
20634 uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20635
20636 void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20637
20638 bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20639
20640 curr_s_offset = curr_e_offset;
20641
20642 info_idx += num_pages;
20643
20644 continue;
20645 }
20646
20647 if (do_region_footprint) {
20648 disposition = 0;
20649 if (map->has_corpse_footprint) {
20650 /*
20651 * Query the page info data we saved
20652 * while forking the corpse.
20653 */
20654 vm_map_corpse_footprint_query_page_info(
20655 map,
20656 curr_s_offset,
20657 &disposition);
20658 } else {
20659 /*
20660 * Query the live pmap for footprint info
20661 * about this page.
20662 */
20663 vm_map_footprint_query_page_info(
20664 map,
20665 map_entry,
20666 curr_s_offset,
20667 &disposition);
20668 }
20669 switch (flavor) {
20670 case VM_PAGE_INFO_BASIC:
20671 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20672 basic_info->disposition = disposition;
20673 basic_info->ref_count = 1;
20674 basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20675 basic_info->offset = 0;
20676 basic_info->depth = 0;
20677
20678 info_idx++;
20679 break;
20680 }
20681 curr_s_offset += effective_page_size;
20682 continue;
20683 }
20684
20685 vm_object_reference(object);
20686 /*
20687 * Shared mode -- so we can allow other readers
20688 * to grab the lock too.
20689 */
20690 vm_object_lock_shared(object);
20691
20692 curr_e_offset = MIN(map_entry->vme_end, end);
20693
20694 vm_map_unlock_read(map);
20695
20696 map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
20697
20698 curr_object = object;
20699
20700 for (; curr_s_offset < curr_e_offset;) {
20701 if (object == curr_object) {
20702 /* account for our object reference above. */
20703 ref_count = os_ref_get_count_raw(&curr_object->ref_count) - 1;
20704 } else {
20705 ref_count = os_ref_get_count_raw(&curr_object->ref_count);
20706 }
20707
20708 curr_offset_in_object = offset_in_object;
20709
20710 for (;;) {
20711 m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
20712
20713 if (m != VM_PAGE_NULL) {
20714 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20715 break;
20716 } else {
20717 if (curr_object->internal &&
20718 curr_object->alive &&
20719 !curr_object->terminating &&
20720 curr_object->pager_ready) {
20721 if (vm_object_compressor_pager_state_get(curr_object, vm_object_trunc_page(curr_offset_in_object))
20722 == VM_EXTERNAL_STATE_EXISTS) {
20723 /* the pager has that page */
20724 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20725 break;
20726 }
20727 }
20728
20729 /*
20730 * Go down the VM object shadow chain until we find the page
20731 * we're looking for.
20732 */
20733
20734 if (curr_object->shadow != VM_OBJECT_NULL) {
20735 vm_object_t shadow = VM_OBJECT_NULL;
20736
20737 curr_offset_in_object += curr_object->vo_shadow_offset;
20738 shadow = curr_object->shadow;
20739
20740 vm_object_lock_shared(shadow);
20741 vm_object_unlock(curr_object);
20742
20743 curr_object = shadow;
20744 depth++;
20745 continue;
20746 } else {
20747 break;
20748 }
20749 }
20750 }
20751
20752 			/* The ref_count is not strictly accurate: it measures the number */
20753 			/* of entities holding a ref on the object; they may not be mapping */
20754 			/* the object, or may not be mapping the section holding the */
20755 			/* target page, but it's still a ballpark number and, though an over- */
20756 			/* count, it picks up the copy-on-write cases. */
20757
20758 /* We could also get a picture of page sharing from pmap_attributes */
20759 			/* but this would undercount, as only faulted-in mappings would */
20760 /* show up. */
20761
20762 if ((curr_object == object) && curr_object->shadow) {
20763 disposition |= VM_PAGE_QUERY_PAGE_COPIED;
20764 }
20765
20766 if (!curr_object->internal) {
20767 disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20768 }
20769
20770 if (m != VM_PAGE_NULL) {
20771 if (vm_page_is_fictitious(m)) {
20772 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
20773 } else {
20774 if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
20775 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20776 }
20777
20778 if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
20779 disposition |= VM_PAGE_QUERY_PAGE_REF;
20780 }
20781
20782 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
20783 disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
20784 }
20785
20786 /*
20787 * XXX TODO4K:
20788 * when this routine deals with 4k
20789 * pages, check the appropriate CS bit
20790 * here.
20791 */
20792 if (m->vmp_cs_validated) {
20793 disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
20794 }
20795 if (m->vmp_cs_tainted) {
20796 disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
20797 }
20798 if (m->vmp_cs_nx) {
20799 disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
20800 }
20801 if (m->vmp_reusable || curr_object->all_reusable) {
20802 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20803 }
20804 }
20805 }
20806
20807 switch (flavor) {
20808 case VM_PAGE_INFO_BASIC:
20809 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20810 basic_info->disposition = disposition;
20811 basic_info->ref_count = ref_count;
20812 basic_info->object_id = (vm_object_id_t) (uintptr_t)
20813 VM_KERNEL_ADDRHASH(curr_object);
20814 basic_info->offset =
20815 (memory_object_offset_t) curr_offset_in_object + offset_in_page;
20816 basic_info->depth = depth;
20817
20818 info_idx++;
20819 break;
20820 }
20821
20822 disposition = 0;
20823 offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
20824
20825 /*
20826 * Move to next offset in the range and in our object.
20827 */
20828 curr_s_offset += effective_page_size;
20829 offset_in_object += effective_page_size;
20830 curr_offset_in_object = offset_in_object;
20831
20832 if (curr_object != object) {
20833 vm_object_unlock(curr_object);
20834
20835 curr_object = object;
20836
20837 vm_object_lock_shared(curr_object);
20838 } else {
20839 vm_object_lock_yield_shared(curr_object);
20840 }
20841 }
20842
20843 vm_object_unlock(curr_object);
20844 vm_object_deallocate(curr_object);
20845
20846 vm_map_lock_read(map);
20847 }
20848
20849 vm_map_unlock_read(map);
20850 return retval;
20851 }
20852
20853 static __attribute__((always_inline, warn_unused_result))
20854 kern_return_t
20855 vm_map_msync_sanitize(
20856 vm_map_t map,
20857 vm_map_address_ut address_u,
20858 vm_map_size_ut size_u,
20859 vm_object_offset_t *address,
20860 vm_map_size_t *size)
20861 {
20862 vm_object_offset_t end;
20863 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS;
20864
20865
20866 return vm_sanitize_addr_size(address_u, size_u,
20867 VM_SANITIZE_CALLER_VM_MAP_MSYNC,
20868 map, flags, address, &end, size);
20869 }
20870
20871 /*
20872 * vm_map_msync
20873 *
20874  *	Synchronizes the specified memory range with its backing store
20875  *	image by either flushing or cleaning the contents to the appropriate
20876  *	memory manager, engaging in a memory object synchronize dialog with
20877  *	the manager. The client doesn't return until the manager issues the
20878  *	m_o_s_completed message. MIG magically converts the user task
20879  *	parameter to the task's address map.
20880 *
20881 * interpretation of sync_flags
20882 * VM_SYNC_INVALIDATE - discard pages, only return precious
20883 * pages to manager.
20884 *
20885 * VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
20886 * - discard pages, write dirty or precious
20887 * pages back to memory manager.
20888 *
20889 * VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
20890 * - write dirty or precious pages back to
20891 * the memory manager.
20892 *
20893 * VM_SYNC_CONTIGUOUS - does everything normally, but if there
20894 * is a hole in the region, and we would
20895 * have returned KERN_SUCCESS, return
20896 * KERN_INVALID_ADDRESS instead.
20897 *
20898 * NOTE
20899  *	The memory object attributes have not yet been implemented; this
20900  *	function will have to deal with the invalidate attribute.
20901 *
20902 * RETURNS
20903 * KERN_INVALID_TASK Bad task parameter
20904 * KERN_INVALID_ARGUMENT both sync and async were specified.
20905 * KERN_SUCCESS The usual.
20906 * KERN_INVALID_ADDRESS There was a hole in the region.
20907 */
20908
20909 kern_return_t
20910 vm_map_msync(
20911 vm_map_t map,
20912 vm_map_address_ut address_u,
20913 vm_map_size_ut size_u,
20914 vm_sync_t sync_flags)
20915 {
20916 vm_map_entry_t entry;
20917 vm_map_size_t size, amount_left;
20918 vm_object_offset_t address, offset;
20919 vm_object_offset_t start_offset, end_offset;
20920 boolean_t do_sync_req;
20921 boolean_t had_hole = FALSE;
20922 vm_map_offset_t pmap_offset;
20923 kern_return_t kr;
20924
20925 if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
20926 (sync_flags & VM_SYNC_SYNCHRONOUS)) {
20927 return KERN_INVALID_ARGUMENT;
20928 }
20929
20930 if (map == VM_MAP_NULL) {
20931 return KERN_INVALID_TASK;
20932 }
20933
20934 kr = vm_map_msync_sanitize(map,
20935 address_u,
20936 size_u,
20937 &address,
20938 &size);
20939 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20940 DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
20941 }
20942 if (__improbable(kr != KERN_SUCCESS)) {
20943 return vm_sanitize_get_kr(kr);
20944 }
20945
20946 amount_left = size;
20947
20948 while (amount_left > 0) {
20949 vm_object_size_t flush_size;
20950 vm_object_t object;
20951
20952 vm_map_lock(map);
20953 if (!vm_map_lookup_entry(map,
20954 address,
20955 &entry)) {
20956 vm_map_size_t skip;
20957
20958 /*
20959 * hole in the address map.
20960 */
20961 had_hole = TRUE;
20962
20963 if (sync_flags & VM_SYNC_KILLPAGES) {
20964 /*
20965 * For VM_SYNC_KILLPAGES, there should be
20966 * no holes in the range, since we couldn't
20967 * prevent someone else from allocating in
20968 * that hole and we wouldn't want to "kill"
20969 * their pages.
20970 */
20971 vm_map_unlock(map);
20972 break;
20973 }
20974
20975 /*
20976 * Check for empty map.
20977 */
20978 if (entry == vm_map_to_entry(map) &&
20979 entry->vme_next == entry) {
20980 vm_map_unlock(map);
20981 break;
20982 }
20983 /*
20984 * Check that we don't wrap and that
20985 * we have at least one real map entry.
20986 */
20987 if ((map->hdr.nentries == 0) ||
20988 (entry->vme_next->vme_start < address)) {
20989 vm_map_unlock(map);
20990 break;
20991 }
20992 /*
20993 * Move up to the next entry if needed
20994 */
20995 skip = (entry->vme_next->vme_start - address);
20996 if (skip >= amount_left) {
20997 amount_left = 0;
20998 } else {
20999 amount_left -= skip;
21000 }
21001 address = entry->vme_next->vme_start;
21002 vm_map_unlock(map);
21003 continue;
21004 }
21005
21006 offset = address - entry->vme_start;
21007 pmap_offset = address;
21008
21009 /*
21010 * do we have more to flush than is contained in this
21011 * entry ?
21012 */
21013 if (amount_left + entry->vme_start + offset > entry->vme_end) {
21014 flush_size = entry->vme_end -
21015 (entry->vme_start + offset);
21016 } else {
21017 flush_size = amount_left;
21018 }
21019 amount_left -= flush_size;
21020 address += flush_size;
21021
21022 if (entry->is_sub_map == TRUE) {
21023 vm_map_t local_map;
21024 vm_map_offset_t local_offset;
21025
21026 local_map = VME_SUBMAP(entry);
21027 local_offset = VME_OFFSET(entry);
21028 vm_map_reference(local_map);
21029 vm_map_unlock(map);
21030 if (vm_map_msync(
21031 local_map,
21032 local_offset,
21033 flush_size,
21034 sync_flags) == KERN_INVALID_ADDRESS) {
21035 had_hole = TRUE;
21036 }
21037 vm_map_deallocate(local_map);
21038 local_map = VM_MAP_NULL;
21039 continue;
21040 }
21041 object = VME_OBJECT(entry);
21042
21043 /*
21044 * We can't sync this object if the object has not been
21045 * created yet
21046 */
21047 if (object == VM_OBJECT_NULL) {
21048 vm_map_unlock(map);
21049 continue;
21050 }
21051 offset += VME_OFFSET(entry);
21052
21053 vm_object_lock(object);
21054
21055 if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
21056 int kill_pages = 0;
21057
21058 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
21059 /*
21060 * This is a destructive operation and so we
21061 * err on the side of limiting the range of
21062 * the operation.
21063 */
21064 start_offset = vm_object_round_page(offset);
21065 end_offset = vm_object_trunc_page(offset + flush_size);
21066
21067 if (end_offset <= start_offset) {
21068 vm_object_unlock(object);
21069 vm_map_unlock(map);
21070 continue;
21071 }
21072
21073 pmap_offset += start_offset - offset;
21074 } else {
21075 start_offset = offset;
21076 end_offset = offset + flush_size;
21077 }
21078
21079 if (sync_flags & VM_SYNC_KILLPAGES) {
21080 if (((os_ref_get_count_raw(&object->ref_count) == 1) ||
21081 ((object->copy_strategy !=
21082 MEMORY_OBJECT_COPY_SYMMETRIC) &&
21083 (object->vo_copy == VM_OBJECT_NULL))) &&
21084 (object->shadow == VM_OBJECT_NULL)) {
21085 if (os_ref_get_count_raw(&object->ref_count) != 1) {
21086 vm_page_stats_reusable.free_shared++;
21087 }
21088 kill_pages = 1;
21089 } else {
21090 kill_pages = -1;
21091 }
21092 }
21093 if (kill_pages != -1) {
21094 boolean_t kill_no_write = FALSE;
21095
21096 if ((entry->protection & VM_PROT_EXECUTE) ||
21097 entry->vme_xnu_user_debug) {
21098 /*
21099 * Executable or user debug pages might be write-protected by
21100 * hardware, so do not attempt to write to these pages.
21101 */
21102 kill_no_write = TRUE;
21103 }
21104 vm_object_deactivate_pages(
21105 object,
21106 start_offset,
21107 (vm_object_size_t) (end_offset - start_offset),
21108 kill_pages,
21109 FALSE, /* reusable_pages */
21110 kill_no_write,
21111 map->pmap,
21112 pmap_offset);
21113 }
21114 vm_object_unlock(object);
21115 vm_map_unlock(map);
21116 continue;
21117 }
21118 /*
21119 * We can't sync this object if there isn't a pager.
21120 * Don't bother to sync internal objects, since there can't
21121 * be any "permanent" storage for these objects anyway.
21122 */
21123 if ((object->pager == MEMORY_OBJECT_NULL) ||
21124 (object->internal) || (object->private)) {
21125 vm_object_unlock(object);
21126 vm_map_unlock(map);
21127 continue;
21128 }
21129 /*
21130 * keep reference on the object until syncing is done
21131 */
21132 vm_object_reference_locked(object);
21133 vm_object_unlock(object);
21134
21135 vm_map_unlock(map);
21136
21137 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
21138 start_offset = vm_object_trunc_page(offset);
21139 end_offset = vm_object_round_page(offset + flush_size);
21140 } else {
21141 start_offset = offset;
21142 end_offset = offset + flush_size;
21143 }
21144
21145 do_sync_req = vm_object_sync(object,
21146 start_offset,
21147 (end_offset - start_offset),
21148 sync_flags & VM_SYNC_INVALIDATE,
21149 ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
21150 (sync_flags & VM_SYNC_ASYNCHRONOUS)),
21151 sync_flags & VM_SYNC_SYNCHRONOUS);
21152
21153 if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
21154 /*
21155 * clear out the clustering and read-ahead hints
21156 */
21157 vm_object_lock(object);
21158
21159 object->pages_created = 0;
21160 object->pages_used = 0;
21161 object->sequential = 0;
21162 object->last_alloc = 0;
21163
21164 vm_object_unlock(object);
21165 }
21166 vm_object_deallocate(object);
21167 } /* while */
21168
21169 /* for proper msync() behaviour */
21170 if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
21171 return KERN_INVALID_ADDRESS;
21172 }
21173
21174 return KERN_SUCCESS;
21175 }/* vm_msync */
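/*
 * Hedged usage sketch (not part of the original source): a synchronous flush
 * that also invalidates cached pages and reports holes. The "address_u" and
 * "size_u" wrappers are assumed to arrive straight from the MIG boundary,
 * matching the _ut parameter types above.
 */
#if 0 /* illustrative example, not compiled */
	kern_return_t kr;

	kr = vm_map_msync(map, address_u, size_u,
	    VM_SYNC_SYNCHRONOUS | VM_SYNC_INVALIDATE | VM_SYNC_CONTIGUOUS);
	if (kr == KERN_INVALID_ADDRESS) {
		/* VM_SYNC_CONTIGUOUS: the range contained a hole */
	}
#endif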
21176
21177 void
21178 vm_named_entry_associate_vm_object(
21179 vm_named_entry_t named_entry,
21180 vm_object_t object,
21181 vm_object_offset_t offset,
21182 vm_object_size_t size,
21183 vm_prot_t prot)
21184 {
21185 vm_map_copy_t copy;
21186 vm_map_entry_t copy_entry;
21187
21188 assert(!named_entry->is_sub_map);
21189 assert(!named_entry->is_copy);
21190 assert(!named_entry->is_object);
21191 assert(!named_entry->internal);
21192 assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
21193
21194 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
21195 copy->offset = offset;
21196 copy->size = size;
21197 copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
21198
21199 copy_entry = vm_map_copy_entry_create(copy);
21200 copy_entry->protection = prot;
21201 copy_entry->max_protection = prot;
21202 copy_entry->use_pmap = TRUE;
21203 copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
21204 copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
21205 VME_OBJECT_SET(copy_entry, object, false, 0);
21206 VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
21207 vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
21208
21209 named_entry->backing.copy = copy;
21210 named_entry->is_object = TRUE;
21211 if (object->internal) {
21212 named_entry->internal = TRUE;
21213 }
21214
21215 DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
21216 named_entry, copy, object, offset, size, prot);
21217 }
21218
21219 vm_object_t
21220 vm_named_entry_to_vm_object(
21221 vm_named_entry_t named_entry)
21222 {
21223 vm_map_copy_t copy;
21224 vm_map_entry_t copy_entry;
21225 vm_object_t object;
21226
21227 assert(!named_entry->is_sub_map);
21228 assert(!named_entry->is_copy);
21229 assert(named_entry->is_object);
21230 copy = named_entry->backing.copy;
21231 assert(copy != VM_MAP_COPY_NULL);
21232 /*
21233 * Assert that the vm_map_copy is coming from the right
21234 * zone and hasn't been forged
21235 */
21236 vm_map_copy_require(copy);
21237 assert(copy->cpy_hdr.nentries == 1);
21238 copy_entry = vm_map_copy_first_entry(copy);
21239 object = VME_OBJECT(copy_entry);
21240
21241 DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
21242
21243 return object;
21244 }
21245
21246 /*
21247 * Routine: convert_port_entry_to_map
21248 * Purpose:
21249 * Convert from a port specifying an entry or a task
21250 * to a map. Doesn't consume the port ref; produces a map ref,
21251 * which may be null. Unlike convert_port_to_map, the
21252  *		port may be task-backed or named-entry-backed.
21253 * Conditions:
21254 * Nothing locked.
21255 */
21256
21257 vm_map_t
21258 convert_port_entry_to_map(
21259 ipc_port_t port)
21260 {
21261 vm_map_t map = VM_MAP_NULL;
21262 vm_named_entry_t named_entry;
21263
21264 if (!IP_VALID(port)) {
21265 return VM_MAP_NULL;
21266 }
21267
21268 if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
21269 return convert_port_to_map(port);
21270 }
21271
21272 named_entry = mach_memory_entry_from_port(port);
21273
21274 if ((named_entry->is_sub_map) &&
21275 (named_entry->protection & VM_PROT_WRITE)) {
21276 map = named_entry->backing.map;
21277 if (map->pmap != PMAP_NULL) {
21278 if (map->pmap == kernel_pmap) {
21279 panic("userspace has access "
21280 "to a kernel map %p", map);
21281 }
21282 pmap_require(map->pmap);
21283 }
21284 vm_map_reference(map);
21285 }
21286
21287 return map;
21288 }
21289
21290 /*
21291 * Export routines to other components for the things we access locally through
21292 * macros.
21293 */
21294 #undef current_map
21295 vm_map_t
21296 current_map(void)
21297 {
21298 return current_map_fast();
21299 }
21300
21301 /*
21302 * vm_map_reference:
21303 *
21304 * Takes a reference on the specified map.
21305 */
21306 void
21307 vm_map_reference(
21308 vm_map_t map)
21309 {
21310 if (__probable(map != VM_MAP_NULL)) {
21311 vm_map_require(map);
21312 os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
21313 }
21314 }
21315
21316 /*
21317 * vm_map_deallocate:
21318 *
21319 * Removes a reference from the specified map,
21320 * destroying it if no references remain.
21321 * The map should not be locked.
21322 */
21323 void
21324 vm_map_deallocate(
21325 vm_map_t map)
21326 {
21327 if (__probable(map != VM_MAP_NULL)) {
21328 vm_map_require(map);
21329 if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
21330 vm_map_destroy(map);
21331 }
21332 }
21333 }
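/*
 * Hedged usage sketch (not part of the original source): the reference and
 * deallocate calls are always paired; a subsystem that stashes a map pointer
 * must hold its own reference for as long as the pointer is kept around.
 */
#if 0 /* illustrative example, not compiled */
	vm_map_reference(map);      /* +1: keep the map from being destroyed */
	/* ... use "map" without relying on any other lifetime guarantee ... */
	vm_map_deallocate(map);     /* -1: may destroy the map if this was the last ref */
#endif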
21334
21335 void
21336 vm_map_inspect_deallocate(
21337 vm_map_inspect_t map)
21338 {
21339 vm_map_deallocate((vm_map_t)map);
21340 }
21341
21342 void
21343 vm_map_read_deallocate(
21344 vm_map_read_t map)
21345 {
21346 vm_map_deallocate((vm_map_t)map);
21347 }
21348
21349
21350 void
21351 vm_map_disable_NX(vm_map_t map)
21352 {
21353 if (map == NULL) {
21354 return;
21355 }
21356 if (map->pmap == NULL) {
21357 return;
21358 }
21359
21360 pmap_disable_NX(map->pmap);
21361 }
21362
21363 void
21364 vm_map_disallow_data_exec(vm_map_t map)
21365 {
21366 if (map == NULL) {
21367 return;
21368 }
21369
21370 map->map_disallow_data_exec = TRUE;
21371 }
21372
21373 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
21374 * more descriptive.
21375 */
21376 void
21377 vm_map_set_32bit(vm_map_t map)
21378 {
21379 #if defined(__arm64__)
21380 map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
21381 #else
21382 map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
21383 #endif
21384 }
21385
21386
21387 void
21388 vm_map_set_64bit(vm_map_t map)
21389 {
21390 #if defined(__arm64__)
21391 map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
21392 #else
21393 map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
21394 #endif
21395 }
21396
21397 /*
21398 * Expand the maximum size of an existing map to 64GB.
21399 */
21400 void
21401 vm_map_set_jumbo(vm_map_t map)
21402 {
21403 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
21404 vm_map_set_max_addr(map, ~0, false);
21405 #else /* arm64 */
21406 (void) map;
21407 #endif
21408 }
21409
21410 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
21411 /*
21412 * Expand the maximum size of an existing map to the maximum supported.
21413 */
21414 void
21415 vm_map_set_extra_jumbo(vm_map_t map)
21416 {
21417 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
21418 vm_map_set_max_addr(map, ~0, true);
21419 #else /* arm64 */
21420 (void) map;
21421 #endif
21422 }
21423 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
21424
21425 /*
21426 * This map has a JIT entitlement
21427 */
21428 void
21429 vm_map_set_jit_entitled(vm_map_t map)
21430 {
21431 #if defined (__arm64__)
21432 pmap_set_jit_entitled(map->pmap);
21433 #else /* arm64 */
21434 (void) map;
21435 #endif
21436 }
21437
21438 /*
21439  * Get the status of this map's TPRO flag.
21440 */
21441 boolean_t
21442 vm_map_tpro(vm_map_t map)
21443 {
21444 #if defined (__arm64e__)
21445 return pmap_get_tpro(map->pmap);
21446 #else /* arm64e */
21447 (void) map;
21448 return FALSE;
21449 #endif
21450 }
21451
21452 /*
21453 * This map has TPRO enabled
21454 */
21455 void
21456 vm_map_set_tpro(vm_map_t map)
21457 {
21458 #if defined (__arm64e__)
21459 pmap_set_tpro(map->pmap);
21460 #else /* arm64e */
21461 (void) map;
21462 #endif
21463 }
21464
21465
21466
21467 /*
21468  * Does this map have TPRO enforcement enabled?
21469 */
21470 boolean_t
21471 vm_map_tpro_enforcement(vm_map_t map)
21472 {
21473 return map->tpro_enforcement;
21474 }
21475
21476 /*
21477 * Set TPRO enforcement for this map
21478 */
21479 void
21480 vm_map_set_tpro_enforcement(vm_map_t map)
21481 {
21482 if (vm_map_tpro(map)) {
21483 vm_map_lock(map);
21484 map->tpro_enforcement = TRUE;
21485 vm_map_unlock(map);
21486 }
21487 }
21488
21489 /*
21490 * Enable TPRO on the requested region
21491 *
21492 * Note:
21493 * This routine is primarily intended to be called during/soon after map
21494 * creation before the associated task has been released to run. It is only
21495 * currently safe when we have no resident pages.
21496 */
21497 boolean_t
21498 vm_map_set_tpro_range(
21499 __unused vm_map_t map,
21500 __unused vm_map_address_t start,
21501 __unused vm_map_address_t end)
21502 {
21503 return TRUE;
21504 }
21505
21506 /*
21507 * Expand the maximum size of an existing map.
21508 */
21509 void
21510 vm_map_set_max_addr(
21511 vm_map_t map,
21512 vm_map_offset_t new_max_offset,
21513 __unused bool extra_jumbo)
21514 {
21515 #if defined(__arm64__)
21516 vm_map_offset_t max_supported_offset;
21517 vm_map_offset_t old_max_offset;
21518 unsigned int option = ARM_PMAP_MAX_OFFSET_JUMBO;
21519
21520 vm_map_lock(map);
21521
21522 old_max_offset = map->max_offset;
21523 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
21524 if (extra_jumbo) {
21525 option = ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO;
21526 }
21527 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
21528 max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), option);
21529
21530 new_max_offset = trunc_page(new_max_offset);
21531
21532 /* The address space cannot be shrunk using this routine. */
21533 if (old_max_offset >= new_max_offset) {
21534 vm_map_unlock(map);
21535 return;
21536 }
21537
21538 if (max_supported_offset < new_max_offset) {
21539 new_max_offset = max_supported_offset;
21540 }
21541
21542 map->max_offset = new_max_offset;
21543
21544 /*
21545 * Disable the following chunk of code that extends the "holes" list
21546 	 * to accommodate a larger VM map.
21547 * In `vm_map_create_options()`, we now set the end of the "holes" list to
21548 * max(map->max_offset, MACH_VM_MAX_ADDRESS) for all platforms.
21549 * MACH_VM_MAX_ADDRESS is the largest virtual address a userspace process
21550 * can map, so any `new_max_offset` value will be <= MACH_VM_MAX_ADDRESS.
21551 * The "holes" list does not need to be adjusted.
21552 */
21553 #if 0
21554 if (map->holelistenabled) {
21555 if (map->holes_list->prev->vme_end == old_max_offset) {
21556 /*
21557 * There is already a hole at the end of the map; simply make it bigger.
21558 */
21559 map->holes_list->prev->vme_end = map->max_offset;
21560 } else {
21561 /*
21562 * There is no hole at the end, so we need to create a new hole
21563 * for the new empty space we're creating.
21564 */
21565 struct vm_map_links *new_hole;
21566
21567 new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
21568 new_hole->start = old_max_offset;
21569 new_hole->end = map->max_offset;
21570 new_hole->prev = map->holes_list->prev;
21571 new_hole->next = (struct vm_map_entry *)map->holes_list;
21572 map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
21573 map->holes_list->prev = (struct vm_map_entry *)new_hole;
21574 }
21575 }
21576 #endif
21577
21578 vm_map_unlock(map);
21579 #else
21580 (void)map;
21581 (void)new_max_offset;
21582 #endif
21583 }
21584
21585 vm_map_offset_t
21586 vm_compute_max_offset(boolean_t is64)
21587 {
21588 #if defined(__arm64__)
21589 return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
21590 #else
21591 return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
21592 #endif
21593 }
21594
21595 void
21596 vm_map_get_max_aslr_slide_section(
21597 vm_map_t map __unused,
21598 int64_t *max_sections,
21599 int64_t *section_size)
21600 {
21601 #if defined(__arm64__)
21602 *max_sections = 3;
21603 *section_size = ARM_TT_TWIG_SIZE;
21604 #else
21605 *max_sections = 1;
21606 *section_size = 0;
21607 #endif
21608 }
21609
21610 uint64_t
21611 vm_map_get_max_aslr_slide_pages(vm_map_t map)
21612 {
21613 #if defined(__arm64__)
21614 /* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
21615 * limited embedded address space; this is also meant to minimize pmap
21616 * memory usage on 16KB page systems.
21617 */
21618 return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
21619 #else
21620 return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21621 #endif
21622 }
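/*
 * Worked example of the arm64 computation above (added for clarity, not part
 * of the original source): with 16KB pages, VM_MAP_PAGE_SHIFT(map) == 14, so
 * the slide is limited to 1 << (24 - 14) == 1024 pages, i.e. 1024 * 16KB = 16MB.
 * With 4KB pages the shift is 12, giving 1 << 12 == 4096 pages * 4KB = 16MB,
 * so the 16MB cap holds regardless of the map's page size.
 */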
21623
21624 uint64_t
21625 vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
21626 {
21627 #if defined(__arm64__)
21628 /* We limit the loader slide to 4MB, in order to ensure at least 8 bits
21629 * of independent entropy on 16KB page systems.
21630 */
21631 return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
21632 #else
21633 return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21634 #endif
21635 }
21636
21637 boolean_t
21638 vm_map_is_64bit(
21639 vm_map_t map)
21640 {
21641 return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
21642 }
21643
21644 boolean_t
21645 vm_map_has_hard_pagezero(
21646 vm_map_t map,
21647 vm_map_offset_t pagezero_size)
21648 {
21649 /*
21650 * XXX FBDP
21651 * We should lock the VM map (for read) here but we can get away
21652 * with it for now because there can't really be any race condition:
21653 * the VM map's min_offset is changed only when the VM map is created
21654 * and when the zero page is established (when the binary gets loaded),
21655 * and this routine gets called only when the task terminates and the
21656 * VM map is being torn down, and when a new map is created via
21657 * load_machfile()/execve().
21658 */
21659 return map->min_offset >= pagezero_size;
21660 }
21661
21662 /*
21663  * Raise a VM map's maximum offset.
21664 */
21665 kern_return_t
21666 vm_map_raise_max_offset(
21667 vm_map_t map,
21668 vm_map_offset_t new_max_offset)
21669 {
21670 kern_return_t ret;
21671
21672 vm_map_lock(map);
21673 ret = KERN_INVALID_ADDRESS;
21674
21675 if (new_max_offset >= map->max_offset) {
21676 if (!vm_map_is_64bit(map)) {
21677 if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
21678 map->max_offset = new_max_offset;
21679 ret = KERN_SUCCESS;
21680 }
21681 } else {
21682 if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
21683 map->max_offset = new_max_offset;
21684 ret = KERN_SUCCESS;
21685 }
21686 }
21687 }
21688
21689 vm_map_unlock(map);
21690 return ret;
21691 }
21692
21693
21694 /*
21695 * Raise a VM map's minimum offset.
21696 * To strictly enforce "page zero" reservation.
21697 */
21698 kern_return_t
21699 vm_map_raise_min_offset(
21700 vm_map_t map,
21701 vm_map_offset_t new_min_offset)
21702 {
21703 vm_map_entry_t first_entry;
21704
21705 new_min_offset = vm_map_round_page(new_min_offset,
21706 VM_MAP_PAGE_MASK(map));
21707
21708 vm_map_lock(map);
21709
21710 if (new_min_offset < map->min_offset) {
21711 /*
21712 * Can't move min_offset backwards, as that would expose
21713 * a part of the address space that was previously, and for
21714 * possibly good reasons, inaccessible.
21715 */
21716 vm_map_unlock(map);
21717 return KERN_INVALID_ADDRESS;
21718 }
21719 if (new_min_offset >= map->max_offset) {
21720 /* can't go beyond the end of the address space */
21721 vm_map_unlock(map);
21722 return KERN_INVALID_ADDRESS;
21723 }
21724
21725 first_entry = vm_map_first_entry(map);
21726 if (first_entry != vm_map_to_entry(map) &&
21727 first_entry->vme_start < new_min_offset) {
21728 /*
21729 * Some memory was already allocated below the new
21730 * minimum offset. It's too late to change it now...
21731 */
21732 vm_map_unlock(map);
21733 return KERN_NO_SPACE;
21734 }
21735
21736 map->min_offset = new_min_offset;
21737
21738 if (map->holelistenabled) {
21739 assert(map->holes_list);
21740 map->holes_list->start = new_min_offset;
21741 assert(new_min_offset < map->holes_list->end);
21742 }
21743
21744 vm_map_unlock(map);
21745
21746 return KERN_SUCCESS;
21747 }
21748
21749 /*
21750 * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
21751 * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit values maintained by the BSD
21752 * side of the kernel. The limits are checked on the Mach VM side, so we keep copies here to avoid
21753 * having to reach over to the BSD data structures.
21754 */
21755
21756 uint64_t vm_map_set_size_limit_count = 0;
21757 kern_return_t
21758 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
21759 {
21760 kern_return_t kr;
21761
21762 vm_map_lock(map);
21763 if (new_size_limit < map->size) {
21764 /* new limit should not be lower than its current size */
21765 DTRACE_VM2(vm_map_set_size_limit_fail,
21766 vm_map_size_t, map->size,
21767 uint64_t, new_size_limit);
21768 kr = KERN_FAILURE;
21769 } else if (new_size_limit == map->size_limit) {
21770 /* no change */
21771 kr = KERN_SUCCESS;
21772 } else {
21773 /* set new limit */
21774 DTRACE_VM2(vm_map_set_size_limit,
21775 vm_map_size_t, map->size,
21776 uint64_t, new_size_limit);
21777 if (new_size_limit != RLIM_INFINITY) {
21778 vm_map_set_size_limit_count++;
21779 }
21780 map->size_limit = new_size_limit;
21781 kr = KERN_SUCCESS;
21782 }
21783 vm_map_unlock(map);
21784 return kr;
21785 }
21786
21787 uint64_t vm_map_set_data_limit_count = 0;
21788 kern_return_t
21789 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
21790 {
21791 kern_return_t kr;
21792
21793 vm_map_lock(map);
21794 if (new_data_limit < map->size) {
21795 /* new limit should not be lower than its current size */
21796 DTRACE_VM2(vm_map_set_data_limit_fail,
21797 vm_map_size_t, map->size,
21798 uint64_t, new_data_limit);
21799 kr = KERN_FAILURE;
21800 } else if (new_data_limit == map->data_limit) {
21801 /* no change */
21802 kr = KERN_SUCCESS;
21803 } else {
21804 /* set new limit */
21805 DTRACE_VM2(vm_map_set_data_limit,
21806 vm_map_size_t, map->size,
21807 uint64_t, new_data_limit);
21808 if (new_data_limit != RLIM_INFINITY) {
21809 vm_map_set_data_limit_count++;
21810 }
21811 map->data_limit = new_data_limit;
21812 kr = KERN_SUCCESS;
21813 }
21814 vm_map_unlock(map);
21815 return kr;
21816 }
21817
21818 void
21819 vm_map_set_user_wire_limit(vm_map_t map,
21820 vm_size_t limit)
21821 {
21822 vm_map_lock(map);
21823 map->user_wire_limit = limit;
21824 vm_map_unlock(map);
21825 }
21826
21827
21828 void
21829 vm_map_switch_protect(vm_map_t map,
21830 boolean_t val)
21831 {
21832 vm_map_lock(map);
21833 map->switch_protect = val;
21834 vm_map_unlock(map);
21835 }
21836
21837 extern int cs_process_enforcement_enable;
21838 boolean_t
21839 vm_map_cs_enforcement(
21840 vm_map_t map)
21841 {
21842 if (cs_process_enforcement_enable) {
21843 return TRUE;
21844 }
21845 return map->cs_enforcement;
21846 }
21847
21848 kern_return_t
21849 vm_map_cs_wx_enable(
21850 __unused vm_map_t map)
21851 {
21852 #if CODE_SIGNING_MONITOR
21853 kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
21854 if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
21855 return KERN_SUCCESS;
21856 }
21857 return ret;
21858 #else
21859 /* The VM manages WX memory entirely on its own */
21860 return KERN_SUCCESS;
21861 #endif
21862 }
21863
21864 kern_return_t
21865 vm_map_csm_allow_jit(
21866 __unused vm_map_t map)
21867 {
21868 #if CODE_SIGNING_MONITOR
21869 return csm_allow_jit_region(vm_map_pmap(map));
21870 #else
21871 /* No code signing monitor to enforce JIT policy */
21872 return KERN_SUCCESS;
21873 #endif
21874 }
21875
21876 void
21877 vm_map_cs_debugged_set(
21878 vm_map_t map,
21879 boolean_t val)
21880 {
21881 vm_map_lock(map);
21882 map->cs_debugged = val;
21883 vm_map_unlock(map);
21884 }
21885
21886 void
21887 vm_map_cs_enforcement_set(
21888 vm_map_t map,
21889 boolean_t val)
21890 {
21891 vm_map_lock(map);
21892 map->cs_enforcement = val;
21893 pmap_set_vm_map_cs_enforced(map->pmap, val);
21894 vm_map_unlock(map);
21895 }
21896
21897 /*
21898 * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
21899 * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
21900 * bump both counters.
21901 */
21902 void
21903 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
21904 {
21905 pmap_t pmap = vm_map_pmap(map);
21906
21907 ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21908 ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21909 }
21910
21911 void
21912 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
21913 {
21914 pmap_t pmap = vm_map_pmap(map);
21915
21916 ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21917 ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21918 }
21919
21920 /* Add (generate) code signature for memory range */
21921 #if CONFIG_DYNAMIC_CODE_SIGNING
21922 kern_return_t
21923 vm_map_sign(vm_map_t map,
21924 vm_map_offset_t start,
21925 vm_map_offset_t end)
21926 {
21927 vm_map_entry_t entry;
21928 vm_map_offset_t entry_start;
21929 vm_object_offset_t entry_offset;
21930 vm_page_t m;
21931 vm_object_t object;
21932
21933 /*
21934 * Vet all the input parameters and current type and state of the
21935 * underlying object. Return with an error if anything is amiss.
21936 */
21937 if (map == VM_MAP_NULL) {
21938 return KERN_INVALID_ARGUMENT;
21939 }
21940
21941 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
21942 return KERN_INVALID_ADDRESS;
21943 }
21944
21945 vm_map_lock_read(map);
21946
21947 if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
21948 /*
21949 * Must pass a valid non-submap address.
21950 */
21951 vm_map_unlock_read(map);
21952 return KERN_INVALID_ADDRESS;
21953 }
21954
21955 if ((entry->vme_start > start) || (entry->vme_end < end)) {
21956 /*
21957 * Map entry doesn't cover the requested range. Not handling
21958 * this situation currently.
21959 */
21960 vm_map_unlock_read(map);
21961 return KERN_INVALID_ARGUMENT;
21962 }
21963
21964 object = VME_OBJECT(entry);
21965 if (object == VM_OBJECT_NULL) {
21966 /*
21967 * Object must already be present or we can't sign.
21968 */
21969 vm_map_unlock_read(map);
21970 return KERN_INVALID_ARGUMENT;
21971 }
21972
21973 vm_object_lock(object);
21974
21975 entry_start = entry->vme_start;
21976 entry_offset = VME_OFFSET(entry);
21977 vm_map_unlock_read(map);
21978 entry = VM_MAP_ENTRY_NULL; /* no longer valid after unlocking map */
21979
21980 while (start < end) {
21981 uint32_t refmod;
21982
21983 m = vm_page_lookup(object,
21984 start - entry_start + entry_offset);
21985 if (m == VM_PAGE_NULL) {
21986 /* should we try to fault a page here? we can probably
21987 * demand it exists and is locked for this request */
21988 vm_object_unlock(object);
21989 return KERN_FAILURE;
21990 }
21991 /* deal with special page status */
21992 if (m->vmp_busy ||
21993 (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart ||
21994 vm_page_is_private(m) || m->vmp_absent))) {
21995 vm_object_unlock(object);
21996 return KERN_FAILURE;
21997 }
21998
21999 /* Page is OK... now "validate" it */
22000 /* This is the place where we'll call out to create a code
22001 * directory, later */
22002 /* XXX TODO4K: deal with 4k subpages individually? */
22003 m->vmp_cs_validated = VMP_CS_ALL_TRUE;
22004
22005 /* The page is now "clean" for codesigning purposes. That means
22006 * we don't consider it as modified (wpmapped) anymore. But
22007 * we'll disconnect the page so we note any future modification
22008 * attempts. */
22009 m->vmp_wpmapped = FALSE;
22010 refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
22011
22012 /* Pull the dirty status from the pmap, since we cleared the
22013 * wpmapped bit */
22014 if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
22015 SET_PAGE_DIRTY(m, FALSE);
22016 }
22017
22018 /* On to the next page */
22019 start += PAGE_SIZE;
22020 }
22021 vm_object_unlock(object);
22022
22023 return KERN_SUCCESS;
22024 }
22025 #endif
22026
22027 kern_return_t
22028 vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
22029 {
22030 vm_map_entry_t entry = VM_MAP_ENTRY_NULL;
22031 vm_map_entry_t next_entry;
22032 kern_return_t kr = KERN_SUCCESS;
22033 VM_MAP_ZAP_DECLARE(zap_list);
22034
22035 vm_map_lock(map);
22036
22037 for (entry = vm_map_first_entry(map);
22038 entry != vm_map_to_entry(map);
22039 entry = next_entry) {
22040 next_entry = entry->vme_next;
22041
22042 if (!entry->is_sub_map &&
22043 VME_OBJECT(entry) &&
22044 (VME_OBJECT(entry)->internal == TRUE) &&
22045 (os_ref_get_count_raw(&VME_OBJECT(entry)->ref_count) == 1)) {
22046 *reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
22047 *reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);
22048
22049 (void)vm_map_delete(map, entry->vme_start,
22050 entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
22051 KMEM_GUARD_NONE, &zap_list);
22052 }
22053 }
22054
22055 vm_map_unlock(map);
22056
22057 vm_map_zap_dispose(&zap_list);
22058
22059 return kr;
22060 }
22061
22062
22063 #if DEVELOPMENT || DEBUG
22064
22065 int
22066 vm_map_disconnect_page_mappings(
22067 vm_map_t map,
22068 boolean_t do_unnest)
22069 {
22070 vm_map_entry_t entry;
22071 ledger_amount_t byte_count = 0;
22072
22073 if (do_unnest == TRUE) {
22074 #ifndef NO_NESTED_PMAP
22075 vm_map_lock(map);
22076
22077 for (entry = vm_map_first_entry(map);
22078 entry != vm_map_to_entry(map);
22079 entry = entry->vme_next) {
22080 if (entry->is_sub_map && entry->use_pmap) {
22081 /*
22082 * Make sure the range between the start of this entry and
22083 * the end of this entry is no longer nested, so that
22084 * we will only remove mappings from the pmap in use by
22085 * this task.
22086 */
22087 vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
22088 }
22089 }
22090 vm_map_unlock(map);
22091 #endif
22092 }
22093 vm_map_lock_read(map);
22094
22095 ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);
22096
22097 for (entry = vm_map_first_entry(map);
22098 entry != vm_map_to_entry(map);
22099 entry = entry->vme_next) {
22100 if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
22101 (VME_OBJECT(entry)->phys_contiguous))) {
22102 continue;
22103 }
22104 if (entry->is_sub_map) {
22105 assert(!entry->use_pmap);
22106 }
22107
22108 pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
22109 }
22110 vm_map_unlock_read(map);
22111
22112 return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
22113 }
22114
22115 kern_return_t
22116 vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
22117 {
22118 vm_object_t object = NULL;
22119 vm_object_offset_t offset;
22120 vm_prot_t prot;
22121 boolean_t wired;
22122 vm_map_version_t version;
22123 vm_map_t real_map;
22124 int result = KERN_FAILURE;
22125
22126 vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
22127 vm_map_lock(map);
22128
22129 result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
22130 OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
22131 NULL, &real_map, NULL);
22132 if (object == NULL) {
22133 result = KERN_MEMORY_ERROR;
22134 } else if (object->pager) {
22135 result = vm_compressor_pager_inject_error(object->pager,
22136 offset);
22137 } else {
22138 result = KERN_MEMORY_PRESENT;
22139 }
22140
22141 if (object != NULL) {
22142 vm_object_unlock(object);
22143 }
22144
22145 if (real_map != map) {
22146 vm_map_unlock(real_map);
22147 }
22148 vm_map_unlock(map);
22149
22150 return result;
22151 }
22152
22153 /* Iterate over map entries. Calls the first block with the number of entries and the second block once for every entry.
22154 * returns: KERN_SUCCESS if the iteration completed OK,
22155 * the callback's error code if a callback returned an error,
22156 * KERN_FAILURE if entries were added/removed concurrently during the iteration, so the number of entries
22157 * iterated differs from the count passed to the first block.
22158 */
22159 static kern_return_t
22160 vm_map_entries_foreach_locked(vm_map_t map, kern_return_t (^count_handler)(int nentries),
22161 kern_return_t (^entry_handler)(void* entry))
22162 {
22163 vm_map_lock_assert_held(map);
22164 int nentries = map->hdr.nentries;
22165 kern_return_t error = count_handler(nentries);
22166 if (error) {
22167 return error;
22168 }
22169
22170 /* iterate until we loop back to the map, see get_vmmap_entries() */
22171 vm_map_entry_t entry = vm_map_first_entry(map);
22172 int count = 0;
22173 while (entry != vm_map_to_entry(map)) {
22174 error = entry_handler(entry);
22175 if (error != KERN_SUCCESS) {
22176 return error;
22177 }
22178 entry = entry->vme_next;
22179 ++count;
22180 if (count > nentries) {
22181 /* nentries and entries iteration don't agree on how many entries there are, shouldn't really happen */
22182 return KERN_FAILURE;
22183 }
22184 }
22185 if (count < nentries) {
22186 return KERN_FAILURE;
22187 }
22188 return KERN_SUCCESS;
22189 }
22190
22191 kern_return_t
22192 vm_map_entries_foreach(vm_map_t map, kern_return_t (^count_handler)(int nentries),
22193 kern_return_t (^entry_handler)(void* entry))
22194 {
22195 vm_map_lock_read(map);
22196 kern_return_t error = vm_map_entries_foreach_locked(map, count_handler, entry_handler);
22197 vm_map_unlock_read(map);
22198 return error;
22199 }
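/*
 * Illustrative caller sketch (hypothetical, not part of this file): count the
 * non-submap entries of a map with the block-based iterator above. The
 * __block qualifier lets the entry block update the local counter.
 *
 *	__block int count = 0;
 *	kern_return_t kr = vm_map_entries_foreach(map,
 *	    ^kern_return_t (int nentries) {
 *	            (void)nentries;
 *	            return KERN_SUCCESS;
 *	    },
 *	    ^kern_return_t (void *e) {
 *	            if (!((vm_map_entry_t)e)->is_sub_map) {
 *	                    count++;
 *	            }
 *	            return KERN_SUCCESS;
 *	    });
 */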
22200
22201 /*
22202 * Dump info about the entry into the given buffer.
22203 * Returns KERN_SUCCESS on success, KERN_NO_SPACE if there was not enough space in the given buffer.
22204 * The size argument is in/out: on input, bytes free in the given buffer; on output, bytes written.
22205 */
22206 kern_return_t
22207 vm_map_dump_entry_and_compressor_pager(void* pentry, char *buf, size_t *size)
22208 {
22209 size_t insize = *size;
22210 kern_return_t kr;
22211 size_t offset = 0;
22212
22213 *size = 0;
22214 if (sizeof(struct vm_map_entry_info) > insize) {
22215 return KERN_NO_SPACE;
22216 }
22217
22218 vm_map_entry_t entry = (vm_map_entry_t)pentry;
22219 struct vm_map_entry_info *out_entry = (struct vm_map_entry_info*)buf;
22220 out_entry->vmei_start = entry->vme_start;
22221 out_entry->vmei_end = entry->vme_end;
22222 out_entry->vmei_alias = VME_ALIAS(entry);
22223 out_entry->vmei_offset = VME_OFFSET(entry);
22224 out_entry->vmei_is_sub_map = entry->is_sub_map;
22225 out_entry->vmei_protection = entry->protection;
22226 offset += sizeof(struct vm_map_entry_info);
22227
22228 out_entry->vmei_slot_mapping_count = 0;
22229 out_entry->vmei_is_compressor_pager = false;
22230 *size = offset;
22231 if (out_entry->vmei_is_sub_map) {
22232 return KERN_SUCCESS; // TODO: sub_map interrogation not supported yet
22233 }
22234 /* have a vm_object? */
22235 vm_object_t object = VME_OBJECT(entry);
22236 if (object == VM_OBJECT_NULL || !object->internal) {
22237 return KERN_SUCCESS;
22238 }
22239 /* does the object have a pager? */
22240 memory_object_t pager = object->pager;
22241 if (pager == MEMORY_OBJECT_NULL) {
22242 return KERN_SUCCESS;
22243 }
22244 bool is_compressor = false;
22245 unsigned int slot_mapping_count = 0;
22246 size_t pager_info_size = insize - offset;
22247 kr = vm_compressor_pager_dump(pager, buf + offset, &pager_info_size, &is_compressor, &slot_mapping_count);
22248 if (kr != KERN_SUCCESS) {
22249 /* didn't have enough space for everything we want to write, caller needs to retry */
22250 return kr;
22251 }
22252 offset += pager_info_size;
22253 /* if we got here, is_compressor should be true due to the object->internal check above, so this assignment
22254 * is just for sanity's sake */
22255 out_entry->vmei_is_compressor_pager = is_compressor;
22256 out_entry->vmei_slot_mapping_count = slot_mapping_count;
22257 *size = offset;
22258 return KERN_SUCCESS;
22259 }
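/*
 * Illustrative retry pattern for a hypothetical caller (not from this file):
 * the size argument is in/out, so on KERN_NO_SPACE the caller is expected to
 * grow its buffer and call again with the new capacity.
 *
 *	size_t size = buf_capacity;
 *	kr = vm_map_dump_entry_and_compressor_pager(entry, buf, &size);
 *	if (kr == KERN_NO_SPACE) {
 *	        // allocate a larger buf, set size to the new capacity, retry
 *	}
 */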
22260
22261
22262 #endif
22263
22264
22265 #if CONFIG_FREEZE
22266
22267
22268 extern struct freezer_context freezer_context_global;
22269 AbsoluteTime c_freezer_last_yield_ts = 0;
22270
22271 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
22272 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
22273
22274 kern_return_t
22275 vm_map_freeze(
22276 task_t task,
22277 unsigned int *purgeable_count,
22278 unsigned int *wired_count,
22279 unsigned int *clean_count,
22280 unsigned int *dirty_count,
22281 unsigned int dirty_budget,
22282 unsigned int *shared_count,
22283 int *freezer_error_code,
22284 boolean_t eval_only)
22285 {
22286 vm_map_entry_t entry2 = VM_MAP_ENTRY_NULL;
22287 kern_return_t kr = KERN_SUCCESS;
22288 boolean_t evaluation_phase = TRUE;
22289 vm_object_t cur_shared_object = NULL;
22290 int cur_shared_obj_ref_cnt = 0;
22291 unsigned int dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;
22292
22293 *purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;
22294
22295 /*
22296 * We need the exclusive lock here so that we can
22297 * block any page faults or lookups while we are
22298 * in the middle of freezing this vm map.
22299 */
22300 vm_map_t map = task->map;
22301
22302 vm_map_lock(map);
22303
22304 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
22305
22306 if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
22307 if (vm_compressor_low_on_space()) {
22308 *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
22309 }
22310
22311 if (vm_swap_low_on_space()) {
22312 *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
22313 }
22314
22315 kr = KERN_NO_SPACE;
22316 goto done;
22317 }
22318
22319 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
22320 /*
22321 * In-memory compressor backing the freezer. No disk.
22322 * So no need to do the evaluation phase.
22323 */
22324 evaluation_phase = FALSE;
22325
22326 if (eval_only == TRUE) {
22327 /*
22328 * We don't support 'eval_only' mode
22329 * in this non-swap config.
22330 */
22331 *freezer_error_code = FREEZER_ERROR_GENERIC;
22332 kr = KERN_INVALID_ARGUMENT;
22333 goto done;
22334 }
22335
22336 freezer_context_global.freezer_ctx_uncompressed_pages = 0;
22337 clock_get_uptime(&c_freezer_last_yield_ts);
22338 }
22339 again:
22340
22341 for (entry2 = vm_map_first_entry(map);
22342 entry2 != vm_map_to_entry(map);
22343 entry2 = entry2->vme_next) {
22344 vm_object_t src_object;
22345
22346 if (entry2->is_sub_map) {
22347 continue;
22348 }
22349
22350 src_object = VME_OBJECT(entry2);
22351 if (!src_object ||
22352 src_object->phys_contiguous ||
22353 !src_object->internal) {
22354 continue;
22355 }
22356
22357 /* If eligible, scan the entry, moving eligible pages over to our parent object */
22358
22359 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
22360 /*
22361 * We skip purgeable objects during evaluation phase only.
22362 * If we decide to freeze this process, we'll explicitly
22363 * purge these objects before we go around again with
22364 * 'evaluation_phase' set to FALSE.
22365 */
22366
22367 if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
22368 /*
22369 * We want to purge objects that may not belong to this task but are mapped
22370 * in this task alone. Since we already purged this task's purgeable memory
22371 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
22372 * on this task's purgeable objects. Hence the check for only volatile objects.
22373 */
22374 if (evaluation_phase ||
22375 src_object->purgable != VM_PURGABLE_VOLATILE ||
22376 os_ref_get_count_raw(&src_object->ref_count) != 1) {
22377 continue;
22378 }
22379 vm_object_lock(src_object);
22380 if (src_object->purgable == VM_PURGABLE_VOLATILE &&
22381 os_ref_get_count_raw(&src_object->ref_count) == 1) {
22382 purgeable_q_t old_queue;
22383
22384 /* object should be on a purgeable queue */
22385 assert(src_object->objq.next != NULL &&
22386 src_object->objq.prev != NULL);
22387 /* move object from its volatile queue to the nonvolatile queue */
22388 old_queue = vm_purgeable_object_remove(src_object);
22389 assert(old_queue);
22390 if (src_object->purgeable_when_ripe) {
22391 /* remove a token from that volatile queue */
22392 vm_page_lock_queues();
22393 vm_purgeable_token_delete_first(old_queue);
22394 vm_page_unlock_queues();
22395 }
22396 /* purge the object */
22397 vm_object_purge(src_object, 0);
22398 }
22399 vm_object_unlock(src_object);
22400 continue;
22401 }
22402
22403 /*
22404 * Pages belonging to this object could be swapped to disk.
22405 * Make sure it's not a shared object because we could end
22406 * up just bringing it back in again.
22407 *
22408 * We try to optimize somewhat by checking for objects that are mapped
22409 * more than once within our own map. But we don't do full searches,
22410 * we just look at the entries following our current entry.
22411 */
22412
22413 if (os_ref_get_count_raw(&src_object->ref_count) > 1) {
22414 if (src_object != cur_shared_object) {
22415 obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
22416 dirty_shared_count += obj_pages_snapshot;
22417
22418 cur_shared_object = src_object;
22419 cur_shared_obj_ref_cnt = 1;
22420 continue;
22421 } else {
22422 cur_shared_obj_ref_cnt++;
22423 if (os_ref_get_count_raw(&src_object->ref_count) == cur_shared_obj_ref_cnt) {
22424 /*
22425 * Fall through to below and treat this object as private.
22426 * So deduct its pages from our shared total and add it to the
22427 * private total.
22428 */
22429
22430 dirty_shared_count -= obj_pages_snapshot;
22431 dirty_private_count += obj_pages_snapshot;
22432 } else {
22433 continue;
22434 }
22435 }
22436 }
22437
22438
22439 if (os_ref_get_count_raw(&src_object->ref_count) == 1) {
22440 dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
22441 }
22442
22443 if (evaluation_phase == TRUE) {
22444 continue;
22445 }
22446 }
22447
22448 uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
22449 *wired_count += src_object->wired_page_count;
22450
22451 if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
22452 if (vm_compressor_low_on_space()) {
22453 *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
22454 }
22455
22456 if (vm_swap_low_on_space()) {
22457 *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
22458 }
22459
22460 kr = KERN_NO_SPACE;
22461 break;
22462 }
22463 if (paged_out_count >= dirty_budget) {
22464 break;
22465 }
22466 dirty_budget -= paged_out_count;
22467 }
22468
22469 *shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
22470 if (evaluation_phase) {
22471 unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;
22472
22473 if (dirty_shared_count > shared_pages_threshold) {
22474 *freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
22475 kr = KERN_FAILURE;
22476 goto done;
22477 }
22478
22479 if (dirty_shared_count &&
22480 ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
22481 *freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
22482 kr = KERN_FAILURE;
22483 goto done;
22484 }
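/*
 * Illustrative numbers (hypothetical, not from this file): if the ratio
 * tunable were 2, a task with 1000 dirty private pages and 600 dirty shared
 * pages would fail the check above, since 1000 / 600 == 1 by integer
 * division; 1200 private pages against the same 600 shared pages would pass.
 */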
22485
22486 evaluation_phase = FALSE;
22487 dirty_shared_count = dirty_private_count = 0;
22488
22489 freezer_context_global.freezer_ctx_uncompressed_pages = 0;
22490 clock_get_uptime(&c_freezer_last_yield_ts);
22491
22492 if (eval_only) {
22493 kr = KERN_SUCCESS;
22494 goto done;
22495 }
22496
22497 vm_purgeable_purge_task_owned(task);
22498
22499 goto again;
22500 } else {
22501 kr = KERN_SUCCESS;
22502 }
22503
22504 done:
22505 vm_map_unlock(map);
22506
22507 if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
22508 vm_object_compressed_freezer_done();
22509 }
22510 return kr;
22511 }
22512
22513 #endif
22514
22515 /*
22516 * vm_map_entry_should_cow_for_true_share:
22517 *
22518 * Determines if the map entry should be clipped and set up for copy-on-write
22519 * to avoid applying "true_share" to a large VM object when only a subset is
22520 * targeted.
22521 *
22522 * For now, we target only the map entries created for the Objective C
22523 * Garbage Collector, which initially have the following properties:
22524 * - alias == VM_MEMORY_MALLOC
22525 * - wired_count == 0
22526 * - !needs_copy
22527 * and a VM object with:
22528 * - internal
22529 * - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
22530 * - !true_share
22531 * - vo_size == ANON_CHUNK_SIZE
22532 *
22533 * Only non-kernel map entries.
22534 */
22535 boolean_t
22536 vm_map_entry_should_cow_for_true_share(
22537 vm_map_entry_t entry)
22538 {
22539 vm_object_t object;
22540
22541 if (entry->is_sub_map) {
22542 /* entry does not point at a VM object */
22543 return FALSE;
22544 }
22545
22546 if (entry->needs_copy) {
22547 /* already set for copy_on_write: done! */
22548 return FALSE;
22549 }
22550
22551 if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
22552 VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
22553 /* not a malloc heap or Obj-C Garbage Collector heap */
22554 return FALSE;
22555 }
22556
22557 if (entry->wired_count) {
22558 /* wired: can't change the map entry... */
22559 vm_counters.should_cow_but_wired++;
22560 return FALSE;
22561 }
22562
22563 object = VME_OBJECT(entry);
22564
22565 if (object == VM_OBJECT_NULL) {
22566 /* no object yet... */
22567 return FALSE;
22568 }
22569
22570 if (!object->internal) {
22571 /* not an internal object */
22572 return FALSE;
22573 }
22574
22575 if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
22576 /* not the default copy strategy */
22577 return FALSE;
22578 }
22579
22580 if (object->true_share) {
22581 /* already true_share: too late to avoid it */
22582 return FALSE;
22583 }
22584
22585 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
22586 object->vo_size != ANON_CHUNK_SIZE) {
22587 /* ... not an object created for the ObjC Garbage Collector */
22588 return FALSE;
22589 }
22590
22591 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
22592 object->vo_size != 2048 * 4096) {
22593 /* ... not a "MALLOC_SMALL" heap */
22594 return FALSE;
22595 }
22596
22597 /*
22598 * All the criteria match: we have a large object being targeted for "true_share".
22599 * To limit the adverse side-effects linked with "true_share", tell the caller to
22600 * try and avoid setting up the entire object for "true_share" by clipping the
22601 * targeted range and setting it up for copy-on-write.
22602 */
22603 return TRUE;
22604 }
22605
22606 uint64_t vm_map_range_overflows_count = 0;
22607 TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE);
22608 bool
22609 vm_map_range_overflows(
22610 vm_map_t map,
22611 vm_map_offset_t addr,
22612 vm_map_size_t size)
22613 {
22614 vm_map_offset_t start, end, sum;
22615 vm_map_offset_t pgmask;
22616
22617 if (size == 0) {
22618 /* empty range -> no overflow */
22619 return false;
22620 }
22621 pgmask = vm_map_page_mask(map);
22622 start = vm_map_trunc_page_mask(addr, pgmask);
22623 end = vm_map_round_page_mask(addr + size, pgmask);
22624 if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) {
22625 vm_map_range_overflows_count++;
22626 if (vm_map_range_overflows_log) {
22627 printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n",
22628 proc_selfpid(),
22629 proc_best_name(current_proc()),
22630 (uint64_t)addr,
22631 (uint64_t)size,
22632 (uint64_t)pgmask);
22633 }
22634 DTRACE_VM4(vm_map_range_overflows,
22635 vm_map_t, map,
22636 uint32_t, pgmask,
22637 uint64_t, (uint64_t)addr,
22638 uint64_t, (uint64_t)size);
22639 return true;
22640 }
22641 return false;
22642 }
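/*
 * Illustrative example (not from the original source): on a 4KB-page map
 * (pgmask 0xFFF), addr == 0xFFFFFFFFFFFFF000 with size == 0x2000 wraps past
 * the end of the address space, so os_add_overflow() reports overflow and
 * this returns true. A well-formed range such as addr == 0x1000 with
 * size == 0x2000 rounds to [0x1000, 0x3000) and returns false.
 */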
22643
22644 vm_map_offset_t
22645 vm_map_round_page_mask(
22646 vm_map_offset_t offset,
22647 vm_map_offset_t mask)
22648 {
22649 return VM_MAP_ROUND_PAGE(offset, mask);
22650 }
22651
22652 vm_map_offset_t
22653 vm_map_trunc_page_mask(
22654 vm_map_offset_t offset,
22655 vm_map_offset_t mask)
22656 {
22657 return VM_MAP_TRUNC_PAGE(offset, mask);
22658 }
22659
22660 boolean_t
22661 vm_map_page_aligned(
22662 vm_map_offset_t offset,
22663 vm_map_offset_t mask)
22664 {
22665 return ((offset) & mask) == 0;
22666 }
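/*
 * Illustrative examples (not from the original source), using the 4KB page
 * mask 0xFFF:
 *	vm_map_round_page_mask(0x1003, 0xFFF) == 0x2000
 *	vm_map_trunc_page_mask(0x1003, 0xFFF) == 0x1000
 *	vm_map_page_aligned(0x2000, 0xFFF)    == TRUE
 *	vm_map_page_aligned(0x1003, 0xFFF)    == FALSE
 */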
22667
22668 int
22669 vm_map_page_shift(
22670 vm_map_t map)
22671 {
22672 return VM_MAP_PAGE_SHIFT(map);
22673 }
22674
22675 int
22676 vm_map_page_size(
22677 vm_map_t map)
22678 {
22679 return VM_MAP_PAGE_SIZE(map);
22680 }
22681
22682 vm_map_offset_t
22683 vm_map_page_mask(
22684 vm_map_t map)
22685 {
22686 return VM_MAP_PAGE_MASK(map);
22687 }
22688
22689 kern_return_t
22690 vm_map_set_page_shift(
22691 vm_map_t map,
22692 int pageshift)
22693 {
22694 if (map->hdr.nentries != 0) {
22695 /* too late to change page size */
22696 return KERN_FAILURE;
22697 }
22698
22699 map->hdr.page_shift = (uint16_t)pageshift;
22700
22701 return KERN_SUCCESS;
22702 }
22703
22704 kern_return_t
22705 vm_map_query_volatile(
22706 vm_map_t map,
22707 mach_vm_size_t *volatile_virtual_size_p,
22708 mach_vm_size_t *volatile_resident_size_p,
22709 mach_vm_size_t *volatile_compressed_size_p,
22710 mach_vm_size_t *volatile_pmap_size_p,
22711 mach_vm_size_t *volatile_compressed_pmap_size_p)
22712 {
22713 mach_vm_size_t volatile_virtual_size;
22714 mach_vm_size_t volatile_resident_count;
22715 mach_vm_size_t volatile_compressed_count;
22716 mach_vm_size_t volatile_pmap_count;
22717 mach_vm_size_t volatile_compressed_pmap_count;
22718 mach_vm_size_t resident_count;
22719 vm_map_entry_t entry;
22720 vm_object_t object;
22721
22722 /* map should be locked by caller */
22723
22724 volatile_virtual_size = 0;
22725 volatile_resident_count = 0;
22726 volatile_compressed_count = 0;
22727 volatile_pmap_count = 0;
22728 volatile_compressed_pmap_count = 0;
22729
22730 for (entry = vm_map_first_entry(map);
22731 entry != vm_map_to_entry(map);
22732 entry = entry->vme_next) {
22733 mach_vm_size_t pmap_resident_bytes, pmap_compressed_bytes;
22734
22735 if (entry->is_sub_map) {
22736 continue;
22737 }
22738 if (!(entry->protection & VM_PROT_WRITE)) {
22739 continue;
22740 }
22741 object = VME_OBJECT(entry);
22742 if (object == VM_OBJECT_NULL) {
22743 continue;
22744 }
22745 if (object->purgable != VM_PURGABLE_VOLATILE &&
22746 object->purgable != VM_PURGABLE_EMPTY) {
22747 continue;
22748 }
22749 if (VME_OFFSET(entry)) {
22750 /*
22751 * If the map entry has been split and the object now
22752 * appears several times in the VM map, we don't want
22753 * to count the object's resident_page_count more than
22754 * once. We count it only for the first one, starting
22755 * at offset 0 and ignore the other VM map entries.
22756 */
22757 continue;
22758 }
22759 resident_count = object->resident_page_count;
22760 if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
22761 resident_count = 0;
22762 } else {
22763 resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
22764 }
22765
22766 volatile_virtual_size += entry->vme_end - entry->vme_start;
22767 volatile_resident_count += resident_count;
22768 if (object->pager) {
22769 volatile_compressed_count +=
22770 vm_compressor_pager_get_count(object->pager);
22771 }
22772 pmap_compressed_bytes = 0;
22773 pmap_resident_bytes =
22774 pmap_query_resident(map->pmap,
22775 entry->vme_start,
22776 entry->vme_end,
22777 &pmap_compressed_bytes);
22778 volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
22779 volatile_compressed_pmap_count += (pmap_compressed_bytes
22780 / PAGE_SIZE);
22781 }
22782
22783 /* map is still locked on return */
22784
22785 *volatile_virtual_size_p = volatile_virtual_size;
22786 *volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
22787 *volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
22788 *volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
22789 *volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;
22790
22791 return KERN_SUCCESS;
22792 }
22793
22794 void
22795 vm_map_sizes(vm_map_t map,
22796 vm_map_size_t * psize,
22797 vm_map_size_t * pfree,
22798 vm_map_size_t * plargest_free)
22799 {
22800 vm_map_entry_t entry;
22801 vm_map_offset_t prev;
22802 vm_map_size_t free, total_free, largest_free;
22803 boolean_t end;
22804
22805 if (!map) {
22806 *psize = *pfree = *plargest_free = 0;
22807 return;
22808 }
22809 total_free = largest_free = 0;
22810
22811 vm_map_lock_read(map);
22812 if (psize) {
22813 *psize = map->max_offset - map->min_offset;
22814 }
22815
22816 prev = map->min_offset;
22817 for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
22818 end = (entry == vm_map_to_entry(map));
22819
22820 if (end) {
22821 free = entry->vme_end - prev;
22822 } else {
22823 free = entry->vme_start - prev;
22824 }
22825
22826 total_free += free;
22827 if (free > largest_free) {
22828 largest_free = free;
22829 }
22830
22831 if (end) {
22832 break;
22833 }
22834 prev = entry->vme_end;
22835 }
22836 vm_map_unlock_read(map);
22837 if (pfree) {
22838 *pfree = total_free;
22839 }
22840 if (plargest_free) {
22841 *plargest_free = largest_free;
22842 }
22843 }
22844
22845 #if VM_SCAN_FOR_SHADOW_CHAIN
22846 int
22847 vm_map_shadow_max(
22848 vm_map_t map)
22849 {
22850 int shadows, shadows_max;
22851 vm_map_entry_t entry;
22852 vm_object_t object, next_object;
22853
22854 if (map == NULL) {
22855 return 0;
22856 }
22857
22858 shadows_max = 0;
22859
22860 vm_map_lock_read(map);
22861
22862 for (entry = vm_map_first_entry(map);
22863 entry != vm_map_to_entry(map);
22864 entry = entry->vme_next) {
22865 if (entry->is_sub_map) {
22866 continue;
22867 }
22868 object = VME_OBJECT(entry);
22869 if (object == NULL) {
22870 continue;
22871 }
22872 vm_object_lock_shared(object);
22873 for (shadows = 0;
22874 object->shadow != NULL;
22875 shadows++, object = next_object) {
22876 next_object = object->shadow;
22877 vm_object_lock_shared(next_object);
22878 vm_object_unlock(object);
22879 }
22880 vm_object_unlock(object);
22881 if (shadows > shadows_max) {
22882 shadows_max = shadows;
22883 }
22884 }
22885
22886 vm_map_unlock_read(map);
22887
22888 return shadows_max;
22889 }
22890 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
22891
22892 void
22893 vm_commit_pagezero_status(vm_map_t lmap)
22894 {
22895 pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
22896 }
22897
22898 #if __x86_64__
22899 void
22900 vm_map_set_high_start(
22901 vm_map_t map,
22902 vm_map_offset_t high_start)
22903 {
22904 map->vmmap_high_start = high_start;
22905 }
22906 #endif /* __x86_64__ */
22907
22908 #if CODE_SIGNING_MONITOR
22909
22910 kern_return_t
22911 vm_map_entry_cs_associate(
22912 vm_map_t map,
22913 vm_map_entry_t entry,
22914 vm_map_kernel_flags_t vmk_flags)
22915 {
22916 vm_object_t cs_object, cs_shadow, backing_object;
22917 vm_object_offset_t cs_offset, backing_offset;
22918 void *cs_blobs;
22919 struct vnode *cs_vnode;
22920 kern_return_t cs_ret;
22921
22922 if (map->pmap == NULL ||
22923 entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
22924 (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
22925 VME_OBJECT(entry) == VM_OBJECT_NULL) {
22926 return KERN_SUCCESS;
22927 }
22928
22929 if (!(entry->protection & VM_PROT_EXECUTE)) {
22930 /*
22931 * This memory region is not executable, so the code-signing
22932 * monitor would usually not care about it...
22933 */
22934 if (vmk_flags.vmkf_remap_prot_copy &&
22935 (entry->max_protection & VM_PROT_EXECUTE)) {
22936 /*
22937 * ... except if the memory region is being remapped
22938 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
22939 * which is what a debugger or dtrace would be doing
22940 * to prepare to modify an executable page to insert
22941 * a breakpoint or activate a probe.
22942 * In that case, fall through so that we can mark
22943 * this region as being "debugged" and no longer
22944 * strictly code-signed.
22945 */
22946 } else {
22947 /*
22948 * Really not executable, so no need to tell the
22949 * code-signing monitor.
22950 */
22951 return KERN_SUCCESS;
22952 }
22953 }
22954
22955 vm_map_lock_assert_exclusive(map);
22956
22957 /*
22958 * Check for a debug association mapping before we check for used_for_jit. This
22959 * allows non-RWX JIT on macOS systems to masquerade their mappings as USER_DEBUG
22960 * pages instead of USER_JIT. These non-RWX JIT pages cannot be marked as USER_JIT
22961 * since they are mapped with RW or RX permissions, which the page table monitor
22962 * denies on USER_JIT pages. Given that, if they're not mapped as USER_DEBUG,
22963 * they will be mapped as USER_EXEC, and that will cause another page table monitor
22964 * violation when those USER_EXEC pages are mapped as RW.
22965 *
22966 * Since these pages switch between RW and RX through mprotect, they mimic what
22967 * we expect a debugger to do. As the code signing monitor does not enforce mappings
22968 * on macOS systems, this works in our favor here and allows us to continue to
22969 * support these legacy-programmed applications without sacrificing security on
22970 * the page table or the code signing monitor. We don't need to explicitly check
22971 * for entry_for_jit here and the mapping permissions. If the initial mapping is
22972 * created with RX, then the application must map it as RW in order to first write
22973 * to the page (MAP_JIT mappings must be private and anonymous). The switch to
22974 * RX will cause vm_map_protect to mark the entry as vmkf_remap_prot_copy.
22975 * Similarly, if the mapping was created as RW, and then switched to RX,
22976 * vm_map_protect will again mark the entry as a copy, and both these cases
22977 * lead to this if-statement being entered.
22978 *
22979 * For more information: rdar://115313336.
22980 */
22981 if (vmk_flags.vmkf_remap_prot_copy) {
22982 cs_ret = csm_associate_debug_region(
22983 map->pmap,
22984 entry->vme_start,
22985 entry->vme_end - entry->vme_start);
22986
22987 /*
22988 * csm_associate_debug_region returns not supported when the code signing
22989 * monitor is disabled. This is intentional, since cs_ret is checked towards
22990 * the end of the function, and if it is not supported, then we still want the
22991 * VM to perform code-signing enforcement on this entry. That said, if we don't
22992 * mark this as a xnu_user_debug page when the code-signing monitor is disabled,
22993 * then it never gets retyped to XNU_USER_DEBUG frame type, which then causes
22994 * an issue with debugging (since it'll be mapped in as XNU_USER_EXEC in some
22995 * cases, which will cause a violation when attempted to be mapped as writable).
22996 */
22997 if ((cs_ret == KERN_SUCCESS) || (cs_ret == KERN_NOT_SUPPORTED)) {
22998 entry->vme_xnu_user_debug = TRUE;
22999 }
23000 #if DEVELOPMENT || DEBUG
23001 if (vm_log_xnu_user_debug) {
23002 printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug=%d cs_ret %d\n",
23003 proc_selfpid(),
23004 (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
23005 __FUNCTION__, __LINE__,
23006 map, entry,
23007 (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
23008 entry->vme_xnu_user_debug,
23009 cs_ret);
23010 }
23011 #endif /* DEVELOPMENT || DEBUG */
23012 goto done;
23013 }
23014
23015 if (entry->used_for_jit) {
23016 cs_ret = csm_associate_jit_region(
23017 map->pmap,
23018 entry->vme_start,
23019 entry->vme_end - entry->vme_start);
23020 goto done;
23021 }
23022
23023 cs_object = VME_OBJECT(entry);
23024 vm_object_lock_shared(cs_object);
23025 cs_offset = VME_OFFSET(entry);
23026
23027 /* find the VM object backed by the code-signed vnode */
23028 for (;;) {
23029 /* go to the bottom of cs_object's shadow chain */
23030 for (;
23031 cs_object->shadow != VM_OBJECT_NULL;
23032 cs_object = cs_shadow) {
23033 cs_shadow = cs_object->shadow;
23034 cs_offset += cs_object->vo_shadow_offset;
23035 vm_object_lock_shared(cs_shadow);
23036 vm_object_unlock(cs_object);
23037 }
23038 if (cs_object->internal ||
23039 cs_object->pager == MEMORY_OBJECT_NULL) {
23040 vm_object_unlock(cs_object);
23041 return KERN_SUCCESS;
23042 }
23043
23044 cs_offset += cs_object->paging_offset;
23045
23046 /*
23047 * cs_object could be backed by a:
23048 * vnode_pager
23049 * apple_protect_pager
23050 * shared_region_pager
23051 * fourk_pager (multiple backing objects -> fail?)
23052 * ask the pager if it has a backing VM object
23053 */
23054 if (!memory_object_backing_object(cs_object->pager,
23055 cs_offset,
23056 &backing_object,
23057 &backing_offset)) {
23058 /* no backing object: cs_object is it */
23059 break;
23060 }
23061
23062 /* look down the backing object's shadow chain */
23063 vm_object_lock_shared(backing_object);
23064 vm_object_unlock(cs_object);
23065 cs_object = backing_object;
23066 cs_offset = backing_offset;
23067 }
23068
23069 cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
23070 if (cs_vnode == NULL) {
23071 /* no vnode, no code signatures to associate */
23072 cs_ret = KERN_SUCCESS;
23073 } else {
23074 cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
23075 &cs_blobs);
23076 assert(cs_ret == KERN_SUCCESS);
23077 cs_ret = cs_associate_blob_with_mapping(map->pmap,
23078 entry->vme_start,
23079 (entry->vme_end - entry->vme_start),
23080 cs_offset,
23081 cs_blobs);
23082 }
23083 vm_object_unlock(cs_object);
23084 cs_object = VM_OBJECT_NULL;
23085
23086 done:
23087 if (cs_ret == KERN_SUCCESS) {
23088 DTRACE_VM2(vm_map_entry_cs_associate_success,
23089 vm_map_offset_t, entry->vme_start,
23090 vm_map_offset_t, entry->vme_end);
23091 if (vm_map_executable_immutable) {
23092 /*
23093 * Prevent this executable
23094 * mapping from being unmapped
23095 * or modified.
23096 */
23097 entry->vme_permanent = TRUE;
23098 }
23099 /*
23100 * pmap says it will validate the
23101 * code-signing validity of pages
23102 * faulted in via this mapping, so
23103 * this map entry should be marked so
23104 * that vm_fault() bypasses code-signing
23105 * validation for faults coming through
23106 * this mapping.
23107 */
23108 entry->csm_associated = TRUE;
23109 } else if (cs_ret == KERN_NOT_SUPPORTED) {
23110 /*
23111 * pmap won't check the code-signing
23112 * validity of pages faulted in via
23113 * this mapping, so VM should keep
23114 * doing it.
23115 */
23116 DTRACE_VM3(vm_map_entry_cs_associate_off,
23117 vm_map_offset_t, entry->vme_start,
23118 vm_map_offset_t, entry->vme_end,
23119 int, cs_ret);
23120 } else {
23121 /*
23122 * A real error: do not allow
23123 * execution in this mapping.
23124 */
23125 DTRACE_VM3(vm_map_entry_cs_associate_failure,
23126 vm_map_offset_t, entry->vme_start,
23127 vm_map_offset_t, entry->vme_end,
23128 int, cs_ret);
23129 if (vmk_flags.vmkf_overwrite_immutable) {
23130 /*
23131 * We can get here when we remap an apple_protect pager
23132 * on top of an already cs_associated executable mapping
23133 * with the same code signatures, so we don't want to
23134 * lose VM_PROT_EXECUTE in that case...
23135 */
23136 } else {
23137 entry->protection &= ~VM_PROT_ALLEXEC;
23138 entry->max_protection &= ~VM_PROT_ALLEXEC;
23139 }
23140 }
23141
23142 return cs_ret;
23143 }
23144
23145 #endif /* CODE_SIGNING_MONITOR */
23146
23147 inline bool
23148 vm_map_is_corpse_source(vm_map_t map)
23149 {
23150 bool status = false;
23151 if (map) {
23152 vm_map_lock_read(map);
23153 status = map->corpse_source;
23154 vm_map_unlock_read(map);
23155 }
23156 return status;
23157 }
23158
23159 inline void
23160 vm_map_set_corpse_source(vm_map_t map)
23161 {
23162 if (map) {
23163 vm_map_lock(map);
23164 map->corpse_source = true;
23165 vm_map_unlock(map);
23166 }
23167 }
23168
23169 inline void
23170 vm_map_unset_corpse_source(vm_map_t map)
23171 {
23172 if (map) {
23173 vm_map_lock(map);
23174 map->corpse_source = false;
23175 vm_map_unlock(map);
23176 }
23177 }
23178 /*
23179 * FORKED CORPSE FOOTPRINT
23180 *
23181 * A forked corpse gets a copy of the original VM map but its pmap is mostly
23182 * empty since it never ran and never got to fault in any pages.
23183 * Collecting footprint info (via "sysctl vm.self_region_footprint") for
23184 * a forked corpse would therefore return very little information.
23185 *
23186 * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
23187 * to vm_map_fork() to collect footprint information from the original VM map
23188 * and its pmap, and store it in the forked corpse's VM map. That information
23189 * is stored in place of the VM map's "hole list" since we'll never need to
23190 * lookup for holes in the corpse's map.
23191 *
23192 * The corpse's footprint info looks like this:
23193 *
23194 * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
23195 * as follows:
23196 * +---------------------------------------+
23197 * header-> | cf_size |
23198 * +-------------------+-------------------+
23199 * | cf_last_region | cf_last_zeroes |
23200 * +-------------------+-------------------+
23201 * region1-> | cfr_vaddr |
23202 * +-------------------+-------------------+
23203 * | cfr_num_pages | d0 | d1 | d2 | d3 |
23204 * +---------------------------------------+
23205 * | d4 | d5 | ... |
23206 * +---------------------------------------+
23207 * | ... |
23208 * +-------------------+-------------------+
23209 * | dy | dz | na | na | cfr_vaddr... | <-region2
23210 * +-------------------+-------------------+
23211 * | cfr_vaddr (ctd) | cfr_num_pages |
23212 * +---------------------------------------+
23213 * | d0 | d1 ... |
23214 * +---------------------------------------+
23215 * ...
23216 * +---------------------------------------+
23217 * last region-> | cfr_vaddr |
23218 * +---------------------------------------+
23219 * | cfr_num_pages | d0 | d1 | d2 | d3 |
23220 * +---------------------------------------+
23221 * ...
23222 * +---------------------------------------+
23223 * | dx | dy | dz | na | na | na | na | na |
23224 * +---------------------------------------+
23225 *
23226 * where:
23227 * cf_size: total size of the buffer (rounded to page size)
23228 * cf_last_region: offset in the buffer of the last "region" sub-header
23229 * cf_last_zeroes: number of trailing "zero" dispositions at the end
23230 * of last region
23231 * cfr_vaddr: virtual address of the start of the covered "region"
23232 * cfr_num_pages: number of pages in the covered "region"
23233 * d*: disposition of the page at that virtual address
23234 * Regions in the buffer are word-aligned.
23235 *
23236 * We estimate the size of the buffer based on the number of memory regions
23237 * and the virtual size of the address space. While copying each memory region
23238 * during vm_map_fork(), we also collect the footprint info for that region
23239 * and store it in the buffer, packing it as much as possible (coalescing
23240 * contiguous memory regions to avoid having too many region headers and
23241 * avoiding long streaks of "zero" page dispositions by splitting footprint
23242 * "regions", so the number of regions in the footprint buffer might not match
23243 * the number of memory regions in the address space.
23244 *
23245 * We also have to copy the original task's "nonvolatile" ledgers since that's
23246 * part of the footprint and will need to be reported to any tool asking for
23247 * the footprint information of the forked corpse.
23248 */
23249
23250 uint64_t vm_map_corpse_footprint_count = 0;
23251 uint64_t vm_map_corpse_footprint_size_avg = 0;
23252 uint64_t vm_map_corpse_footprint_size_max = 0;
23253 uint64_t vm_map_corpse_footprint_full = 0;
23254 uint64_t vm_map_corpse_footprint_no_buf = 0;
23255
23256 struct vm_map_corpse_footprint_header {
23257 vm_size_t cf_size; /* allocated buffer size */
23258 uint32_t cf_last_region; /* offset of last region in buffer */
23259 union {
23260 uint32_t cfu_last_zeroes; /* during creation:
23261 * number of "zero" dispositions at
23262 * end of last region */
23263 uint32_t cfu_hint_region; /* during lookup:
23264 * offset of last looked up region */
23265 #define cf_last_zeroes cfu.cfu_last_zeroes
23266 #define cf_hint_region cfu.cfu_hint_region
23267 } cfu;
23268 };
23269 typedef uint8_t cf_disp_t;
23270 struct vm_map_corpse_footprint_region {
23271 vm_map_offset_t cfr_vaddr; /* region start virtual address */
23272 uint32_t cfr_num_pages; /* number of pages in this "region" */
23273 cf_disp_t cfr_disposition[0]; /* disposition of each page */
23274 } __attribute__((packed));
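/*
 * Illustrative sizing (not from the original source): on an LP64 kernel the
 * packed region header is 8 + 4 == 12 bytes, so a region covering 8 pages
 * occupies 12 + 8 * sizeof(cf_disp_t) == 20 bytes in the buffer; the offset
 * of the next region is then rounded up to sizeof(int), see
 * vm_map_corpse_footprint_new_region().
 */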
23275
23276 static cf_disp_t
23277 vm_page_disposition_to_cf_disp(
23278 int disposition)
23279 {
23280 assert(sizeof(cf_disp_t) == 1);
23281 /* relocate bits that don't fit in a "uint8_t" */
23282 if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
23283 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
23284 }
23285 /* cast gets rid of extra bits */
23286 return (cf_disp_t) disposition;
23287 }
23288
23289 static int
23290 vm_page_cf_disp_to_disposition(
23291 cf_disp_t cf_disp)
23292 {
23293 int disposition;
23294
23295 assert(sizeof(cf_disp_t) == 1);
23296 disposition = (int) cf_disp;
23297 /* move relocated bits back in place */
23298 if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
23299 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
23300 disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
23301 }
23302 return disposition;
23303 }
23304
23305 static kmem_guard_t
23306 vm_map_corpse_footprint_guard(vm_map_t map)
23307 {
23308 return (kmem_guard_t){
23309 .kmg_atomic = true,
23310 .kmg_tag = VM_KERN_MEMORY_DIAG,
23311 .kmg_context = os_hash_kernel_pointer(&map->vmmap_corpse_footprint),
23312 };
23313 }
23314
23315 /*
23316 * vm_map_corpse_footprint_new_region:
23317 * closes the current footprint "region" and creates a new one
23318 *
23319 * Returns NULL if there's not enough space in the buffer for a new region.
23320 */
23321 static struct vm_map_corpse_footprint_region *
23322 vm_map_corpse_footprint_new_region(
23323 struct vm_map_corpse_footprint_header *footprint_header)
23324 {
23325 uintptr_t footprint_edge;
23326 uint32_t new_region_offset;
23327 struct vm_map_corpse_footprint_region *footprint_region;
23328 struct vm_map_corpse_footprint_region *new_footprint_region;
23329
23330 footprint_edge = ((uintptr_t)footprint_header +
23331 footprint_header->cf_size);
23332 footprint_region = ((struct vm_map_corpse_footprint_region *)
23333 ((char *)footprint_header +
23334 footprint_header->cf_last_region));
23335 assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
23336 footprint_edge);
23337
23338 /* get rid of trailing zeroes in the last region */
23339 assert(footprint_region->cfr_num_pages >=
23340 footprint_header->cf_last_zeroes);
23341 footprint_region->cfr_num_pages -=
23342 footprint_header->cf_last_zeroes;
23343 footprint_header->cf_last_zeroes = 0;
23344
23345 /* reuse this region if it's now empty */
23346 if (footprint_region->cfr_num_pages == 0) {
23347 return footprint_region;
23348 }
23349
23350 /* compute offset of new region */
23351 new_region_offset = footprint_header->cf_last_region;
23352 new_region_offset += sizeof(*footprint_region);
23353 new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
23354 new_region_offset = roundup(new_region_offset, sizeof(int));
23355
23356 /* check if we're going over the edge */
23357 if (((uintptr_t)footprint_header +
23358 new_region_offset +
23359 sizeof(*footprint_region)) >=
23360 footprint_edge) {
23361 /* over the edge: no new region */
23362 return NULL;
23363 }
23364
23365 /* adjust offset of last region in header */
23366 footprint_header->cf_last_region = new_region_offset;
23367
23368 new_footprint_region = (struct vm_map_corpse_footprint_region *)
23369 ((char *)footprint_header +
23370 footprint_header->cf_last_region);
23371 new_footprint_region->cfr_vaddr = 0;
23372 new_footprint_region->cfr_num_pages = 0;
23373 /* caller needs to initialize new region */
23374
23375 return new_footprint_region;
23376 }
23377
23378 /*
23379 * vm_map_corpse_footprint_collect:
23380 * collect footprint information for "old_entry" in "old_map" and
23381 * stores it in "new_map"'s vmmap_footprint_info.
23382 */
23383 kern_return_t
23384 vm_map_corpse_footprint_collect(
23385 vm_map_t old_map,
23386 vm_map_entry_t old_entry,
23387 vm_map_t new_map)
23388 {
23389 vm_map_offset_t va;
23390 kmem_return_t kmr;
23391 struct vm_map_corpse_footprint_header *footprint_header;
23392 struct vm_map_corpse_footprint_region *footprint_region;
23393 struct vm_map_corpse_footprint_region *new_footprint_region;
23394 cf_disp_t *next_disp_p;
23395 uintptr_t footprint_edge;
23396 uint32_t num_pages_tmp;
23397 int effective_page_size;
23398
23399 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
23400
23401 va = old_entry->vme_start;
23402
23403 vm_map_lock_assert_exclusive(old_map);
23404 vm_map_lock_assert_exclusive(new_map);
23405
23406 assert(new_map->has_corpse_footprint);
23407 assert(!old_map->has_corpse_footprint);
23408 if (!new_map->has_corpse_footprint ||
23409 old_map->has_corpse_footprint) {
23410 /*
23411 * This can only transfer footprint info from a
23412 * map with a live pmap to a map with a corpse footprint.
23413 */
23414 return KERN_NOT_SUPPORTED;
23415 }
23416
23417 if (new_map->vmmap_corpse_footprint == NULL) {
23418 vm_size_t buf_size;
23419
23420 buf_size = (sizeof(*footprint_header) +
23421 (old_map->hdr.nentries
23422 *
23423 (sizeof(*footprint_region) +
23424 +3)) /* potential alignment for each region */
23425 +
23426 ((old_map->size / effective_page_size)
23427 *
23428 sizeof(cf_disp_t))); /* disposition for each page */
23429 // printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
23430 buf_size = round_page(buf_size);
23431
23432 /* limit buffer to 1 page to validate overflow detection */
23433 // buf_size = PAGE_SIZE;
23434
23435 /* limit size to a somewhat sane amount */
23436 #if XNU_TARGET_OS_OSX
23437 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (8*1024*1024) /* 8MB */
23438 #else /* XNU_TARGET_OS_OSX */
23439 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (256*1024) /* 256KB */
23440 #endif /* XNU_TARGET_OS_OSX */
23441 if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
23442 buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
23443 }
23444 kmem_guard_t guard = vm_map_corpse_footprint_guard(new_map);
23445 kmr = kmem_alloc_guard(kernel_map, buf_size + PAGE_SIZE, 0,
23446 KMA_DATA | KMA_GUARD_LAST | KMA_KOBJECT | KMA_ZERO,
23447 guard);
23448 if (kmr.kmr_return != KERN_SUCCESS) {
23449 vm_map_corpse_footprint_no_buf++;
23450 return kmr.kmr_return;
23451 }
23452
23453 /* initialize header and 1st region */
23454 footprint_header = (struct vm_map_corpse_footprint_header *)kmr.kmr_ptr;
23455 assert3p(footprint_header, !=, NULL);
23456 new_map->vmmap_corpse_footprint = footprint_header;
23457
23458 footprint_header->cf_size = buf_size;
23459 footprint_header->cf_last_region =
23460 sizeof(*footprint_header);
23461 footprint_header->cf_last_zeroes = 0;
23462
23463 footprint_region = (struct vm_map_corpse_footprint_region *)
23464 ((char *)footprint_header +
23465 footprint_header->cf_last_region);
23466 footprint_region->cfr_vaddr = 0;
23467 footprint_region->cfr_num_pages = 0;
23468 } else {
23469 /* retrieve header and last region */
23470 footprint_header = (struct vm_map_corpse_footprint_header *)
23471 new_map->vmmap_corpse_footprint;
23472 footprint_region = (struct vm_map_corpse_footprint_region *)
23473 ((char *)footprint_header +
23474 footprint_header->cf_last_region);
23475 }
23476 footprint_edge = ((uintptr_t)footprint_header +
23477 footprint_header->cf_size);
23478
23479 if ((footprint_region->cfr_vaddr +
23480 (((vm_map_offset_t)footprint_region->cfr_num_pages) *
23481 effective_page_size))
23482 != old_entry->vme_start) {
23483 uint64_t num_pages_delta, num_pages_delta_size;
23484 uint32_t region_offset_delta_size;
23485
23486 /*
23487 * Not the next contiguous virtual address:
23488 * start a new region or store "zero" dispositions for
23489 * the missing pages?
23490 */
23491 /* size of gap in actual page dispositions */
23492 num_pages_delta = ((old_entry->vme_start -
23493 footprint_region->cfr_vaddr) / effective_page_size)
23494 - footprint_region->cfr_num_pages;
23495 num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
23496 /* size of gap as a new footprint region header */
23497 region_offset_delta_size =
23498 (sizeof(*footprint_region) +
23499 roundup(((footprint_region->cfr_num_pages -
23500 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
23501 sizeof(int)) -
23502 ((footprint_region->cfr_num_pages -
23503 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
23504 // printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
23505 if (region_offset_delta_size < num_pages_delta_size ||
23506 os_add3_overflow(footprint_region->cfr_num_pages,
23507 (uint32_t) num_pages_delta,
23508 1,
23509 &num_pages_tmp)) {
23510 /*
23511 * Storing data for this gap would take more space
23512 * than inserting a new footprint region header:
23513 * let's start a new region and save space. If it's a
23514 * tie, let's avoid using a new region, since that
23515 * would require more region hops to find the right
23516 * range during lookups.
23517 *
23518 * If the current region's cfr_num_pages would overflow
23519 * if we added "zero" page dispositions for the gap,
23520 * no choice but to start a new region.
23521 */
23522 // printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
23523 new_footprint_region =
23524 vm_map_corpse_footprint_new_region(footprint_header);
23525 /* check that we're not going over the edge */
23526 if (new_footprint_region == NULL) {
23527 goto over_the_edge;
23528 }
23529 footprint_region = new_footprint_region;
23530 /* initialize new region as empty */
23531 footprint_region->cfr_vaddr = old_entry->vme_start;
23532 footprint_region->cfr_num_pages = 0;
23533 } else {
23534 /*
23535 * Store "zero" page dispositions for the missing
23536 * pages.
23537 */
23538 // printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
23539 for (; num_pages_delta > 0; num_pages_delta--) {
23540 next_disp_p = (cf_disp_t *)
23541 ((uintptr_t) footprint_region +
23542 sizeof(*footprint_region));
23543 next_disp_p += footprint_region->cfr_num_pages;
23544 /* check that we're not going over the edge */
23545 if ((uintptr_t)next_disp_p >= footprint_edge) {
23546 goto over_the_edge;
23547 }
23548 /* store "zero" disposition for this gap page */
23549 footprint_region->cfr_num_pages++;
23550 *next_disp_p = (cf_disp_t) 0;
23551 footprint_header->cf_last_zeroes++;
23552 }
23553 }
23554 }
23555
23556 for (va = old_entry->vme_start;
23557 va < old_entry->vme_end;
23558 va += effective_page_size) {
23559 int disposition;
23560 cf_disp_t cf_disp;
23561
23562 vm_map_footprint_query_page_info(old_map,
23563 old_entry,
23564 va,
23565 &disposition);
23566 cf_disp = vm_page_disposition_to_cf_disp(disposition);
23567
23568 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
23569
23570 if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
23571 /*
23572 * Ignore "zero" dispositions at start of
23573 * region: just move start of region.
23574 */
23575 footprint_region->cfr_vaddr += effective_page_size;
23576 continue;
23577 }
23578
23579 /* would region's cfr_num_pages overflow? */
23580 if (os_add_overflow(footprint_region->cfr_num_pages, 1,
23581 &num_pages_tmp)) {
23582 /* overflow: create a new region */
23583 new_footprint_region =
23584 vm_map_corpse_footprint_new_region(
23585 footprint_header);
23586 if (new_footprint_region == NULL) {
23587 goto over_the_edge;
23588 }
23589 footprint_region = new_footprint_region;
23590 footprint_region->cfr_vaddr = va;
23591 footprint_region->cfr_num_pages = 0;
23592 }
23593
23594 next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
23595 sizeof(*footprint_region));
23596 next_disp_p += footprint_region->cfr_num_pages;
23597 /* check that we're not going over the edge */
23598 if ((uintptr_t)next_disp_p >= footprint_edge) {
23599 goto over_the_edge;
23600 }
23601 /* store this disposition */
23602 *next_disp_p = cf_disp;
23603 footprint_region->cfr_num_pages++;
23604
23605 if (cf_disp != 0) {
23606 /* non-zero disp: break the current zero streak */
23607 footprint_header->cf_last_zeroes = 0;
23608 /* done */
23609 continue;
23610 }
23611
23612 /* zero disp: add to the current streak of zeroes */
23613 footprint_header->cf_last_zeroes++;
23614 if ((footprint_header->cf_last_zeroes +
23615 roundup(((footprint_region->cfr_num_pages -
23616 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
23617 (sizeof(int) - 1),
23618 sizeof(int))) <
23619 (sizeof(*footprint_header))) {
23620 /*
23621 * There are not enough trailing "zero" dispositions
23622 * (+ the extra padding we would need for the previous
23623 * region); creating a new region would not save space
23624 * at this point, so let's keep this "zero" disposition
23625 * in this region and reconsider later.
23626 */
23627 continue;
23628 }
23629 /*
23630 * Create a new region to avoid having too many consecutive
23631 * "zero" dispositions.
23632 */
23633 new_footprint_region =
23634 vm_map_corpse_footprint_new_region(footprint_header);
23635 if (new_footprint_region == NULL) {
23636 goto over_the_edge;
23637 }
23638 footprint_region = new_footprint_region;
23639 /* initialize the new region as empty ... */
23640 footprint_region->cfr_num_pages = 0;
23641 /* ... and skip this "zero" disp */
23642 footprint_region->cfr_vaddr = va + effective_page_size;
23643 }
23644
23645 return KERN_SUCCESS;
23646
23647 over_the_edge:
23648 // printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
23649 vm_map_corpse_footprint_full++;
23650 return KERN_RESOURCE_SHORTAGE;
23651 }
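
/*
 * Minimal sketch of how a corpse fork can drive the collection above,
 * assuming both maps are held exclusively (the actual caller lives in the
 * vm_map forking code); everything other than the two routines named here
 * is illustrative:
 *
 *	vm_map_entry_t entry;
 *	kern_return_t kr;
 *
 *	for (entry = vm_map_first_entry(old_map);
 *	    entry != vm_map_to_entry(old_map);
 *	    entry = entry->vme_next) {
 *		kr = vm_map_corpse_footprint_collect(old_map, entry, new_map);
 *		if (kr != KERN_SUCCESS) {
 *			break;	// e.g. KERN_RESOURCE_SHORTAGE: buffer full
 *		}
 *	}
 *	vm_map_corpse_footprint_collect_done(new_map);
 */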
23652
23653 /*
23654 * vm_map_corpse_footprint_collect_done:
23655 * completes the footprint collection by getting rid of any remaining
23656 * trailing "zero" dispositions and trimming the unused part of the
23657 * kernel buffer
23658 */
23659 void
23660 vm_map_corpse_footprint_collect_done(
23661 vm_map_t new_map)
23662 {
23663 struct vm_map_corpse_footprint_header *footprint_header;
23664 struct vm_map_corpse_footprint_region *footprint_region;
23665 vm_size_t buf_size, actual_size;
23666
23667 assert(new_map->has_corpse_footprint);
23668 if (!new_map->has_corpse_footprint ||
23669 new_map->vmmap_corpse_footprint == NULL) {
23670 return;
23671 }
23672
23673 footprint_header = (struct vm_map_corpse_footprint_header *)
23674 new_map->vmmap_corpse_footprint;
23675 buf_size = footprint_header->cf_size;
23676
23677 footprint_region = (struct vm_map_corpse_footprint_region *)
23678 ((char *)footprint_header +
23679 footprint_header->cf_last_region);
23680
23681 /* get rid of trailing zeroes in last region */
23682 assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
23683 footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
23684 footprint_header->cf_last_zeroes = 0;
23685
23686 actual_size = (vm_size_t)(footprint_header->cf_last_region +
23687 sizeof(*footprint_region) +
23688 (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));
23689
23690 // printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
23691 vm_map_corpse_footprint_size_avg =
23692 (((vm_map_corpse_footprint_size_avg *
23693 vm_map_corpse_footprint_count) +
23694 actual_size) /
23695 (vm_map_corpse_footprint_count + 1));
23696 vm_map_corpse_footprint_count++;
23697 if (actual_size > vm_map_corpse_footprint_size_max) {
23698 vm_map_corpse_footprint_size_max = actual_size;
23699 }
23700
23701 actual_size = round_page(actual_size);
23702 assert3u(buf_size, >=, actual_size);
23703 if (buf_size > actual_size) {
23704 /*
23705 * Free unused space at the end of the buffer
23706 */
23707 kmem_guard_t guard = vm_map_corpse_footprint_guard(new_map);
23708 kmem_return_t kmr = kmem_realloc_guard(kernel_map,
23709 (vm_offset_t)footprint_header,
23710 /* Account for guard page */
23711 buf_size + PAGE_SIZE,
23712 actual_size + PAGE_SIZE,
23713 KMR_DATA | KMR_GUARD_LAST | KMR_FREEOLD | KMR_KOBJECT,
23714 guard);
23715 assertf(kmr.kmr_return == KERN_SUCCESS,
23716 "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
23717 footprint_header,
23718 (uint64_t) buf_size,
23719 (uint64_t) actual_size,
23720 kmr.kmr_return);
23721 footprint_header = (struct vm_map_corpse_footprint_header *)kmr.kmr_ptr;
23722 assert3p(footprint_header, !=, NULL);
23723 new_map->vmmap_corpse_footprint = footprint_header;
23724 footprint_region = NULL;
23725 }
23726
23727 footprint_header->cf_size = actual_size;
23728 }
23729
23730 /*
23731 * vm_map_corpse_footprint_query_page_info:
23732 * retrieves the disposition of the page at virtual address "vaddr"
23733 * in the forked corpse's VM map
23734 *
23735 * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
23736 */
23737 kern_return_t
23738 vm_map_corpse_footprint_query_page_info(
23739 vm_map_t map,
23740 vm_map_offset_t va,
23741 int *disposition_p)
23742 {
23743 struct vm_map_corpse_footprint_header *footprint_header;
23744 struct vm_map_corpse_footprint_region *footprint_region;
23745 uint32_t footprint_region_offset;
23746 vm_map_offset_t region_start, region_end;
23747 int disp_idx;
23748 kern_return_t kr;
23749 int effective_page_size;
23750 cf_disp_t cf_disp;
23751
23752 if (!map->has_corpse_footprint) {
23753 *disposition_p = 0;
23754 kr = KERN_INVALID_ARGUMENT;
23755 goto done;
23756 }
23757
23758 footprint_header = map->vmmap_corpse_footprint;
23759 if (footprint_header == NULL) {
23760 *disposition_p = 0;
23761 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23762 kr = KERN_INVALID_ARGUMENT;
23763 goto done;
23764 }
23765
23766 /* start looking at the hint ("cf_hint_region") */
23767 footprint_region_offset = footprint_header->cf_hint_region;
23768
23769 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
23770
23771 lookup_again:
23772 if (footprint_region_offset < sizeof(*footprint_header)) {
23773 /* hint too low: start from 1st region */
23774 footprint_region_offset = sizeof(*footprint_header);
23775 }
23776 if (footprint_region_offset > footprint_header->cf_last_region) {
23777 /* hint too high: re-start from 1st region */
23778 footprint_region_offset = sizeof(*footprint_header);
23779 }
23780 footprint_region = (struct vm_map_corpse_footprint_region *)
23781 ((char *)footprint_header + footprint_region_offset);
23782 region_start = footprint_region->cfr_vaddr;
23783 region_end = (region_start +
23784 ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23785 effective_page_size));
23786 if (va < region_start &&
23787 footprint_region_offset != sizeof(*footprint_header)) {
23788 /* our range starts before the hint region */
23789
23790 /* reset the hint (in a racy way...) */
23791 footprint_header->cf_hint_region = sizeof(*footprint_header);
23792 /* lookup "va" again from 1st region */
23793 footprint_region_offset = sizeof(*footprint_header);
23794 goto lookup_again;
23795 }
23796
23797 while (va >= region_end) {
23798 if (footprint_region_offset >= footprint_header->cf_last_region) {
23799 break;
23800 }
23801 /* skip the region's header */
23802 footprint_region_offset += sizeof(*footprint_region);
23803 /* skip the region's page dispositions */
23804 footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
23805 /* align to next word boundary */
23806 footprint_region_offset =
23807 roundup(footprint_region_offset,
23808 sizeof(int));
23809 footprint_region = (struct vm_map_corpse_footprint_region *)
23810 ((char *)footprint_header + footprint_region_offset);
23811 region_start = footprint_region->cfr_vaddr;
23812 region_end = (region_start +
23813 ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23814 effective_page_size));
23815 }
23816 if (va < region_start || va >= region_end) {
23817 /* page not found */
23818 *disposition_p = 0;
23819 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23820 kr = KERN_SUCCESS;
23821 goto done;
23822 }
23823
23824 /* "va" found: set the lookup hint for next lookup (in a racy way...) */
23825 footprint_header->cf_hint_region = footprint_region_offset;
23826
23827 /* get page disposition for "va" in this region */
23828 disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
23829 cf_disp = footprint_region->cfr_disposition[disp_idx];
23830 *disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
23831 kr = KERN_SUCCESS;
23832 done:
23833 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23834 /* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
23835 DTRACE_VM4(footprint_query_page_info,
23836 vm_map_t, map,
23837 vm_map_offset_t, va,
23838 int, *disposition_p,
23839 kern_return_t, kr);
23840
23841 return kr;
23842 }
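
/*
 * Minimal usage sketch, assuming "corpse_map" is a forked corpse map and
 * VM_PAGE_QUERY_PAGE_PRESENT is the disposition bit of interest (names
 * other than the routine above are illustrative):
 *
 *	int disp = 0;
 *
 *	if (vm_map_corpse_footprint_query_page_info(corpse_map, va,
 *	    &disp) == KERN_SUCCESS &&
 *	    (disp & VM_PAGE_QUERY_PAGE_PRESENT)) {
 *		// count this page toward the corpse's resident footprint
 *	}
 */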
23843
23844 void
23845 vm_map_corpse_footprint_destroy(
23846 vm_map_t map)
23847 {
23848 if (map->has_corpse_footprint &&
23849 map->vmmap_corpse_footprint != NULL) {
23850 struct vm_map_corpse_footprint_header *footprint_header;
23851 vm_size_t buf_size;
23852
23853 footprint_header = map->vmmap_corpse_footprint;
23854 buf_size = footprint_header->cf_size;
23855 kmem_guard_t guard = vm_map_corpse_footprint_guard(map);
23856 kmem_free_guard(kernel_map, (vm_offset_t)footprint_header,
23857 buf_size + PAGE_SIZE,
23858 KMF_GUARD_LAST, guard);
23859 map->vmmap_corpse_footprint = NULL;
23860 map->has_corpse_footprint = FALSE;
23861 }
23862 }
23863
23864 /*
23865 * vm_map_copy_footprint_ledgers:
23866 * copies any ledger that's relevant to the memory footprint of "old_task"
23867 * into the forked corpse's task ("new_task")
23868 */
23869 void
23870 vm_map_copy_footprint_ledgers(
23871 task_t old_task,
23872 task_t new_task)
23873 {
23874 vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
23875 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
23876 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
23877 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
23878 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
23879 vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
23880 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
23881 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
23882 vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
23883 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
23884 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
23885 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
23886 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
23887 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
23888 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
23889 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
23890 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
23891 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
23892 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
23893 vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
23894 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_nofootprint_total);
23895 }
23896
23897 /*
23898 * vm_map_copy_ledger:
23899 * copy a single ledger from "old_task" to "new_task"
23900 */
23901 void
23902 vm_map_copy_ledger(
23903 task_t old_task,
23904 task_t new_task,
23905 int ledger_entry)
23906 {
23907 ledger_amount_t old_balance, new_balance, delta;
23908
23909 assert(new_task->map->has_corpse_footprint);
23910 if (!new_task->map->has_corpse_footprint) {
23911 return;
23912 }
23913
23914 /* turn off sanity checks for the ledger we're about to mess with */
23915 ledger_disable_panic_on_negative(new_task->ledger,
23916 ledger_entry);
23917
23918 /* adjust "new_task" to match "old_task" */
23919 ledger_get_balance(old_task->ledger,
23920 ledger_entry,
23921 &old_balance);
23922 ledger_get_balance(new_task->ledger,
23923 ledger_entry,
23924 &new_balance);
23925 if (new_balance == old_balance) {
23926 /* new == old: done */
23927 } else if (new_balance > old_balance) {
23928 /* new > old ==> new -= new - old */
23929 delta = new_balance - old_balance;
23930 ledger_debit(new_task->ledger,
23931 ledger_entry,
23932 delta);
23933 } else {
23934 /* new < old ==> new += old - new */
23935 delta = old_balance - new_balance;
23936 ledger_credit(new_task->ledger,
23937 ledger_entry,
23938 delta);
23939 }
23940 }
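
/*
 * Worked example of the reconciliation above: if "old_task" has a
 * phys_footprint balance of 300 units and the corpse ("new_task") starts
 * at 20 units, the "new < old" branch credits delta = 280 units to the
 * corpse, so both ledgers end up reporting the same balance; if the corpse
 * were higher instead, the same delta logic would debit the excess.
 */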
23941
23942 /*
23943 * vm_map_get_pmap:
23944 * returns the pmap associated with the vm_map
23945 */
23946 pmap_t
23947 vm_map_get_pmap(vm_map_t map)
23948 {
23949 return vm_map_pmap(map);
23950 }
23951
23952 ppnum_t
23953 vm_map_get_phys_page(
23954 vm_map_t map,
23955 vm_offset_t addr)
23956 {
23957 vm_object_offset_t offset;
23958 vm_object_t object;
23959 vm_map_offset_t map_offset;
23960 vm_map_entry_t entry;
23961 ppnum_t phys_page = 0;
23962
23963 map_offset = vm_map_trunc_page(addr, PAGE_MASK);
23964
23965 vm_map_lock(map);
23966 while (vm_map_lookup_entry(map, map_offset, &entry)) {
23967 if (entry->is_sub_map) {
23968 vm_map_t old_map;
23969 vm_map_lock(VME_SUBMAP(entry));
23970 old_map = map;
23971 map = VME_SUBMAP(entry);
23972 map_offset = (VME_OFFSET(entry) +
23973 (map_offset - entry->vme_start));
23974 vm_map_unlock(old_map);
23975 continue;
23976 }
23977 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
23978 vm_map_unlock(map);
23979 return (ppnum_t) 0;
23980 }
23981 if (VME_OBJECT(entry)->phys_contiguous) {
23982 /* These are not standard pageable memory mappings */
23983 /* If they are not present in the object they will */
23984 /* have to be picked up from the pager through the */
23985 /* fault mechanism. */
23986 if (VME_OBJECT(entry)->vo_shadow_offset == 0) {
23987 /* need to call vm_fault */
23988 vm_map_unlock(map);
23989 vm_fault(map, map_offset, VM_PROT_NONE,
23990 FALSE /* change_wiring */, VM_KERN_MEMORY_NONE,
23991 THREAD_UNINT, NULL, 0);
23992 vm_map_lock(map);
23993 continue;
23994 }
23995 offset = (VME_OFFSET(entry) +
23996 (map_offset - entry->vme_start));
23997 phys_page = (ppnum_t)
23998 ((VME_OBJECT(entry)->vo_shadow_offset
23999 + offset) >> PAGE_SHIFT);
24000 break;
24001 }
24002 offset = (VME_OFFSET(entry) + (map_offset - entry->vme_start));
24003 object = VME_OBJECT(entry);
24004 vm_object_lock(object);
24005 while (TRUE) {
24006 vm_page_t dst_page = vm_page_lookup(object, offset);
24007 if (dst_page == VM_PAGE_NULL) {
24008 if (object->shadow) {
24009 vm_object_t old_object;
24010 vm_object_lock(object->shadow);
24011 old_object = object;
24012 offset = offset + object->vo_shadow_offset;
24013 object = object->shadow;
24014 vm_object_unlock(old_object);
24015 } else {
24016 vm_object_unlock(object);
24017 break;
24018 }
24019 } else {
24020 phys_page = (ppnum_t)(VM_PAGE_GET_PHYS_PAGE(dst_page));
24021 vm_object_unlock(object);
24022 break;
24023 }
24024 }
24025 break;
24026 }
24027
24028 vm_map_unlock(map);
24029 return phys_page;
24030 }
24031
24032 #if CONFIG_MAP_RANGES
24033 static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];
24034 static bitmap_t vm_map_user_range_large_file_map[BITMAP_LEN(VM_MEMORY_COUNT)];
24035
24036 static_assert((int)UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
24037 static_assert((int)UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
24038
24039 /*
24040 * vm_map_range_map_init:
24041 * initializes the VM range ID map to enable index lookup
24042 * of user VM ranges based on VM tag from userspace.
24043 */
24044 static void
24045 vm_map_range_map_init(void)
24046 {
24047 /*
24048 * VM_MEMORY_MALLOC{,_NANO} are skipped on purpose:
24049 * - the former is malloc metadata which should be kept separate
24050 * - the latter has its own ranges
24051 */
24052 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
24053 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
24054 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
24055 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
24056 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
24057 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
24058 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
24059 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_TCMALLOC);
24060 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LIBNETWORK);
24061 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOACCELERATOR);
24062 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOSURFACE);
24063 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IMAGEIO);
24064 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREGRAPHICS);
24065 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_CORESERVICES);
24066 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREDATA);
24067 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LAYERKIT);
24068 bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOACCELERATOR);
24069 bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOSURFACE);
24070 }
24071
24072 static struct mach_vm_range
24073 vm_map_range_random_uniform(
24074 vm_map_size_t req_size,
24075 vm_map_offset_t min_addr,
24076 vm_map_offset_t max_addr,
24077 vm_map_offset_t offmask)
24078 {
24079 vm_map_offset_t random_addr;
24080 struct mach_vm_range alloc;
24081
24082 req_size = (req_size + offmask) & ~offmask;
24083 min_addr = (min_addr + offmask) & ~offmask;
24084 max_addr = max_addr & ~offmask;
24085
24086 read_random(&random_addr, sizeof(random_addr));
24087 random_addr %= (max_addr - req_size - min_addr);
24088 random_addr &= ~offmask;
24089
24090 alloc.min_address = min_addr + random_addr;
24091 alloc.max_address = min_addr + random_addr + req_size;
24092 return alloc;
24093 }
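
/*
 * Worked example (illustrative numbers): with offmask = 32M - 1, asking
 * for req_size = 10G in [min_addr = 4G, max_addr = 64G) first rounds the
 * size and both bounds to 32M, then draws a random offset, reduces it
 * modulo (max_addr - req_size - min_addr) and truncates it to a 32M
 * boundary, yielding a 32M-aligned, 10G-wide range placed uniformly at
 * random within [4G, 64G).
 */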
24094
24095 static vm_map_offset_t
24096 vm_map_range_offmask(void)
24097 {
24098 uint32_t pte_depth;
24099
24100 /*
24101 * PTE optimizations
24102 *
24103 *
24104 * 16k pages systems
24105 * ~~~~~~~~~~~~~~~~~
24106 *
24107 * A single L1 (sub-)page covers the address space.
24108 * - L2 pages cover 64G,
24109 * - L3 pages cover 32M.
24110 *
24111 * On embedded, the dynamic VA range is 64G and uses a single L2 page.
24112 * As a result, we really only need to align the ranges to 32M to avoid
24113 * partial L3 pages.
24114 *
24115 * On macOS, the usage of L2 pages will increase, so as a result we will
24116 * want to align ranges to 64G in order to utilize them fully.
24117 *
24118 *
24119 * 4k pages systems
24120 * ~~~~~~~~~~~~~~~~
24121 *
24122 * A single L0 (sub-)page covers the address space.
24123 * - L1 pages cover 512G,
24124 * - L2 pages cover 1G,
24125 * - L3 pages cover 2M.
24126 *
24127 * The long tail of processes on a system will tend to have a VA usage
24128 * (ignoring the shared regions) in the 100s of MB order of magnitude.
24129 * This is achievable with a single L1 and a few L2s without
24130 * randomization.
24131 *
24132 * However once randomization is introduced, the system will immediately
24133 * need several L1s and many more L2s. As a result:
24134 *
24135 * - on embedded devices, the cost of these extra pages isn't
24136 * sustainable, and we just disable the feature entirely,
24137 *
24138 * - on macOS we align ranges to a 512G boundary so that the extra L1
24139 * pages can be used to their full potential.
24140 */
24141
24142 /*
24143 * note, this function assumes _non exotic mappings_
24144 * which is why it uses the native kernel's PAGE_SHIFT.
24145 */
24146 #if XNU_PLATFORM_MacOSX
24147 pte_depth = PAGE_SHIFT > 12 ? 2 : 3;
24148 #else /* !XNU_PLATFORM_MacOSX */
24149 pte_depth = PAGE_SHIFT > 12 ? 1 : 0;
24150 #endif /* !XNU_PLATFORM_MacOSX */
24151
24152 if (pte_depth == 0) {
24153 return 0;
24154 }
24155
24156 return (1ull << ((PAGE_SHIFT - 3) * pte_depth + PAGE_SHIFT)) - 1;
24157 }
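
/*
 * Worked examples of the value returned above:
 *  - embedded, 16k pages: PAGE_SHIFT = 14, pte_depth = 1
 *	(1ull << ((14 - 3) * 1 + 14)) - 1 = 32M - 1  (one L3 page of coverage)
 *  - macOS, 16k pages:    PAGE_SHIFT = 14, pte_depth = 2
 *	(1ull << ((14 - 3) * 2 + 14)) - 1 = 64G - 1  (one L2 page of coverage)
 *  - macOS, 4k pages:     PAGE_SHIFT = 12, pte_depth = 3
 *	(1ull << ((12 - 3) * 3 + 12)) - 1 = 512G - 1 (one L1 page of coverage)
 *  - embedded, 4k pages:  pte_depth = 0, so ranges are disabled entirely.
 */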
24158
24159 /*
24160 * vm_map_range_configure:
24161 * configures the user vm_map ranges by increasing the maximum VA range of
24162 * the map and carving out a range at the end of VA space (searching backwards
24163 * in the newly expanded map).
24164 */
24165 kern_return_t
24166 vm_map_range_configure(vm_map_t map, __unused bool needs_extra_jumbo_va)
24167 {
24168 const vm_map_offset_t offmask = vm_map_range_offmask();
24169 struct mach_vm_range data_range;
24170 vm_map_offset_t default_end;
24171 kern_return_t kr;
24172
24173 if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) {
24174 /*
24175 * No point doing vm ranges in a 32bit address space.
24176 */
24177 return KERN_NOT_SUPPORTED;
24178 }
24179
24180 /* Should not be applying ranges to kernel map or kernel map submaps */
24181 assert(vm_map_pmap(map) != kernel_pmap);
24182
24183 #if XNU_PLATFORM_MacOSX
24184
24185 /*
24186 * on macOS, the address space is a massive 47 bits (128T),
24187 * with several carve outs that processes can't use:
24188 * - the shared region
24189 * - the commpage region
24190 * - the GPU carve out (if applicable)
24191 *
24192 * and when nano-malloc is in use it desires memory at the 96T mark.
24193 *
24194 * However, their location is architecture dependent:
24195 * - On intel, the shared region and commpage are
24196 * at the very end of the usable address space (above +127T),
24197 * and there is no GPU carve out, and pthread wants to place
24198 * threads at the 112T mark (0x70T).
24199 *
24200 * - On arm64, these are in the same spot as on embedded devices:
24201 * o shared region: [ 6G, 10G) [ will likely grow over time ]
24202 * o commpage region: [63G, 64G)
24203 * o GPU carve out: [64G, 448G)
24204 *
24205 * This is convenient because the mappings at the end of the address
24206 * space (when they exist) are made by the kernel.
24207 *
24208 * The policy is to allocate a random 1T for the data heap
24209 * in the end of the address-space in the:
24210 * - [0x71, 0x7f) range on Intel (to leave space for pthread stacks)
24211 * - [0x61, 0x7f) range on ASM (to leave space for Nano malloc).
24212 */
24213
24214 /* see NANOZONE_SIGNATURE in libmalloc */
24215 #if __x86_64__
24216 default_end = 0x71ull << 40;
24217 #else
24218 default_end = 0x61ull << 40;
24219 #endif
24220 data_range = vm_map_range_random_uniform(1ull << 40,
24221 default_end, 0x7full << 40, offmask);
24222
24223 #else /* !XNU_PLATFORM_MacOSX */
24224
24225 /*
24226 * Embedded devices:
24227 *
24228 * The default VA Size scales with the device physical memory.
24229 *
24230 * Out of that:
24231 * - the "zero" page typically uses 4G + some slide
24232 * - the shared region uses SHARED_REGION_SIZE bytes (4G)
24233 *
24234 * Without the use of jumbo or any adjustment to the address space,
24235 * a default VM map typically looks like this:
24236 *
24237 * 0G -->╒════════════╕
24238 * │ pagezero │
24239 * │ + slide │
24240 * ~4G -->╞════════════╡<-- vm_map_min(map)
24241 * │ │
24242 * 6G -->├────────────┤
24243 * │ shared │
24244 * │ region │
24245 * 10G -->├────────────┤
24246 * │ │
24247 * max_va -->├────────────┤<-- vm_map_max(map)
24248 * │ │
24249 * ╎ jumbo ╎
24250 * ╎ ╎
24251 * │ │
24252 * 63G -->╞════════════╡<-- MACH_VM_MAX_ADDRESS
24253 * │ commpage │
24254 * 64G -->├────────────┤<-- MACH_VM_MIN_GPU_CARVEOUT_ADDRESS
24255 * │ │
24256 * ╎ GPU ╎
24257 * ╎ carveout ╎
24258 * │ │
24259 * 448G -->├────────────┤<-- MACH_VM_MAX_GPU_CARVEOUT_ADDRESS
24260 * │ │
24261 * ╎ ╎
24262 * ╎ ╎
24263 * │ │
24264 * 512G -->╘════════════╛<-- (1ull << ARM_16K_TT_L1_SHIFT)
24265 *
24266 * When this drawing was made, "max_va" was smaller than
24267 * ARM64_MAX_OFFSET_DEVICE_LARGE (~15.5G), leaving shy of
24268 * 12G of address space for the zero-page, slide, files,
24269 * binaries, heap ...
24270 *
24271 * We will want to make a "heap/data" carve out inside
24272 * the jumbo range of half of that usable space, assuming
24273 * that this is less than a fourth of the jumbo range.
24274 *
24275 * The assert below intends to catch when max_va grows
24276 * too large for this heuristic.
24277 */
24278
24279 vm_map_lock_read(map);
24280 default_end = vm_map_max(map);
24281 vm_map_unlock_read(map);
24282
24283 /*
24284 * Check that we're not already jumbo'd,
24285 * or our address space was somehow modified.
24286 *
24287 * If so we cannot guarantee that we can set up the ranges
24288 * safely without interfering with the existing map.
24289 */
24290 if (default_end > vm_compute_max_offset(true)) {
24291 return KERN_NO_SPACE;
24292 }
24293
24294 if (pmap_max_offset(true, ARM_PMAP_MAX_OFFSET_DEFAULT)) {
24295 /*
24296 * an override boot-arg was set, disable user-ranges
24297 *
24298 * XXX: this is problematic because it means these boot-args
24299 * no longer test the behavior that changing the value
24300 * of ARM64_MAX_OFFSET_DEVICE_* would have.
24301 */
24302 return KERN_NOT_SUPPORTED;
24303 }
24304
24305 /* expand the default VM space to 64GB */
24306 vm_map_set_jumbo(map);
24307
24308 assert3u(7 * GiB(10) / 2, <=, vm_map_max(map) - default_end);
24309 data_range = vm_map_range_random_uniform(GiB(10),
24310 default_end + PAGE_SIZE, vm_map_max(map), offmask);
24311
24312 #endif /* !XNU_PLATFORM_MacOSX */
24313
24314 /*
24315 * Poke holes so that ASAN or people listing regions
24316 * do not think this space is free.
24317 */
24318
24319 if (default_end != data_range.min_address) {
24320 kr = vm_map_enter(map, &default_end,
24321 data_range.min_address - default_end,
24322 0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
24323 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
24324 assert(kr == KERN_SUCCESS);
24325 }
24326
24327 if (data_range.max_address != vm_map_max(map)) {
24328 vm_map_entry_t entry;
24329 vm_size_t size;
24330
24331 /*
24332 * Extend the end of the hole to the next VM entry or the end of the map,
24333 * whichever comes first.
24334 */
24335 vm_map_lock_read(map);
24336 vm_map_lookup_entry_or_next(map, data_range.max_address, &entry);
24337 if (entry == vm_map_to_entry(map) || entry->vme_start > vm_map_max(map)) {
24338 size = vm_map_max(map) - data_range.max_address;
24339 } else {
24340 size = entry->vme_start - data_range.max_address;
24341 }
24342 vm_map_unlock_read(map);
24343
24344 kr = vm_map_enter(map, &data_range.max_address, size,
24345 0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
24346 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
24347 assert(kr == KERN_SUCCESS);
24348 }
24349
24350 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
24351 if (needs_extra_jumbo_va) {
24352 /* This will grow the address space to MACH_VM_MAX_ADDRESS */
24353 vm_map_set_extra_jumbo(map);
24354 }
24355 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
24356
24357 vm_map_lock(map);
24358 map->default_range.min_address = vm_map_min(map);
24359 map->default_range.max_address = default_end;
24360 map->data_range = data_range;
24361 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
24362 /* If process has "extra jumbo" entitlement, enable large file range */
24363 if (needs_extra_jumbo_va) {
24364 map->large_file_range = vm_map_range_random_uniform(TiB(1),
24365 MACH_VM_JUMBO_ADDRESS, MACH_VM_MAX_ADDRESS, offmask);
24366 }
24367 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
24368 map->uses_user_ranges = true;
24369 vm_map_unlock(map);
24370
24371 return KERN_SUCCESS;
24372 }
24373
24374 /*
24375 * vm_map_range_fork:
24376 * clones the array of ranges from old_map to new_map in support
24377 * of a VM map fork.
24378 */
24379 void
24380 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
24381 {
24382 if (!old_map->uses_user_ranges) {
24383 /* nothing to do */
24384 return;
24385 }
24386
24387 new_map->default_range = old_map->default_range;
24388 new_map->data_range = old_map->data_range;
24389
24390 if (old_map->extra_ranges_count) {
24391 vm_map_user_range_t otable, ntable;
24392 uint16_t count;
24393
24394 otable = old_map->extra_ranges;
24395 count = old_map->extra_ranges_count;
24396 ntable = kalloc_data(count * sizeof(struct vm_map_user_range),
24397 Z_WAITOK | Z_ZERO | Z_NOFAIL);
24398 memcpy(ntable, otable,
24399 count * sizeof(struct vm_map_user_range));
24400
24401 new_map->extra_ranges_count = count;
24402 new_map->extra_ranges = ntable;
24403 }
24404
24405 new_map->uses_user_ranges = true;
24406 }
24407
24408 /*
24409 * vm_map_get_user_range:
24410 * copy the VM user range for the given VM map and range ID.
24411 */
24412 kern_return_t
24413 vm_map_get_user_range(
24414 vm_map_t map,
24415 vm_map_range_id_t range_id,
24416 mach_vm_range_t range)
24417 {
24418 if (map == NULL || !map->uses_user_ranges || range == NULL) {
24419 return KERN_INVALID_ARGUMENT;
24420 }
24421
24422 switch (range_id) {
24423 case UMEM_RANGE_ID_DEFAULT:
24424 *range = map->default_range;
24425 return KERN_SUCCESS;
24426
24427 case UMEM_RANGE_ID_HEAP:
24428 *range = map->data_range;
24429 return KERN_SUCCESS;
24430
24431 case UMEM_RANGE_ID_LARGE_FILE:
24432 /*
24433 * Because this function tells a user-space process about the user
24434 * ranges in its VM map, this case communicates whether the large file
24435 * range is in use. Note that this is different from how the large file
24436 * range ID is handled in `vm_map_get_range()`: there, we "resolve" the
24437 * VA policy and return either the large file range or data range,
24438 * depending on whether the large file range is enabled.
24439 */
24440 if (map->large_file_range.min_address != map->large_file_range.max_address) {
24441 /* large file range is configured and should be used */
24442 *range = map->large_file_range;
24443 } else {
24444 return KERN_INVALID_ARGUMENT;
24445 }
24446 return KERN_SUCCESS;
24447
24448 default:
24449 return KERN_INVALID_ARGUMENT;
24450 }
24451 }
24452
24453 static vm_map_range_id_t
24454 vm_map_user_range_resolve(
24455 vm_map_t map,
24456 mach_vm_address_t addr,
24457 mach_vm_size_t size,
24458 mach_vm_range_t range)
24459 {
24460 struct mach_vm_range tmp;
24461
24462 vm_map_lock_assert_held(map);
24463
24464 static_assert((int)UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
24465 static_assert((int)UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
24466
24467 if (mach_vm_range_contains(&map->default_range, addr, size)) {
24468 if (range) {
24469 *range = map->default_range;
24470 }
24471 return UMEM_RANGE_ID_DEFAULT;
24472 }
24473
24474 if (mach_vm_range_contains(&map->data_range, addr, size)) {
24475 if (range) {
24476 *range = map->data_range;
24477 }
24478 return UMEM_RANGE_ID_HEAP;
24479 }
24480
24481 if (mach_vm_range_contains(&map->large_file_range, addr, size)) {
24482 if (range) {
24483 *range = map->large_file_range;
24484 }
24485 return UMEM_RANGE_ID_LARGE_FILE;
24486 }
24487
24488 for (size_t i = 0; i < map->extra_ranges_count; i++) {
24489 vm_map_user_range_t r = &map->extra_ranges[i];
24490
24491 tmp.min_address = r->vmur_min_address;
24492 tmp.max_address = r->vmur_max_address;
24493
24494 if (mach_vm_range_contains(&tmp, addr, size)) {
24495 if (range) {
24496 *range = tmp;
24497 }
24498 return r->vmur_range_id;
24499 }
24500 }
24501
24502 if (range) {
24503 range->min_address = range->max_address = 0;
24504 }
24505 return UMEM_RANGE_ID_DEFAULT;
24506 }
24507 #endif /* CONFIG_MAP_RANGES */
24508
24509 void
24510 vm_map_kernel_flags_update_range_id(
24511 vm_map_kernel_flags_t *vmkf,
24512 vm_map_t map,
24513 __unused vm_map_size_t size)
24514 {
24515 if (map == kernel_map) {
24516 if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
24517 vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
24518 }
24519 #if CONFIG_MAP_RANGES
24520 } else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
24521 vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT) {
24522 if (bitmap_test(vm_map_user_range_large_file_map, vmkf->vm_tag)
24523 || size >= VM_LARGE_FILE_THRESHOLD) {
24524 /*
24525 * if the map doesn't have the large file range configured,
24526 * the range will get resolved to the heap range in `vm_map_get_range`
24527 */
24528 vmkf->vmkf_range_id = UMEM_RANGE_ID_LARGE_FILE;
24529 } else if (bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
24530 vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
24531 }
24532 #endif /* CONFIG_MAP_RANGES */
24533 }
24534 }
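
/*
 * Illustrative examples of the steering above, based on the bitmaps set up
 * in vm_map_range_map_init(): a user mapping tagged VM_MEMORY_MALLOC_LARGE
 * resolves to UMEM_RANGE_ID_HEAP; a VM_MEMORY_IOSURFACE mapping, or any
 * mapping of at least VM_LARGE_FILE_THRESHOLD bytes, resolves to
 * UMEM_RANGE_ID_LARGE_FILE (falling back to the heap range if the map has
 * no large file range configured); kernel_map allocations with no explicit
 * range get KMEM_RANGE_ID_DATA.
 */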
24535
24536 /*
24537 * vm_map_entry_has_device_pager:
24538 * Check if the vm map entry specified by the virtual address has a device pager.
24539 * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
24540 */
24541 boolean_t
24542 vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
24543 {
24544 vm_map_entry_t entry;
24545 vm_object_t object;
24546 boolean_t result;
24547
24548 if (map == NULL) {
24549 return FALSE;
24550 }
24551
24552 vm_map_lock(map);
24553 while (TRUE) {
24554 if (!vm_map_lookup_entry(map, vaddr, &entry)) {
24555 result = FALSE;
24556 break;
24557 }
24558 if (entry->is_sub_map) {
24559 // Check the submap
24560 vm_map_t submap = VME_SUBMAP(entry);
24561 assert(submap != NULL);
24562 vm_map_lock(submap);
24563 vm_map_unlock(map);
24564 map = submap;
24565 continue;
24566 }
24567 object = VME_OBJECT(entry);
24568 if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
24569 result = TRUE;
24570 break;
24571 }
24572 result = FALSE;
24573 break;
24574 }
24575
24576 vm_map_unlock(map);
24577 return result;
24578 }
24579
24580 #if MACH_ASSERT
24581
24582 extern int pmap_ledgers_panic;
24583 extern int pmap_ledgers_panic_leeway;
24584
24585 #define LEDGER_DRIFT(__LEDGER) \
24586 int __LEDGER##_over; \
24587 ledger_amount_t __LEDGER##_over_total; \
24588 ledger_amount_t __LEDGER##_over_max; \
24589 int __LEDGER##_under; \
24590 ledger_amount_t __LEDGER##_under_total; \
24591 ledger_amount_t __LEDGER##_under_max
24592
24593 struct {
24594 uint64_t num_pmaps_checked;
24595
24596 LEDGER_DRIFT(phys_footprint);
24597 LEDGER_DRIFT(internal);
24598 LEDGER_DRIFT(internal_compressed);
24599 LEDGER_DRIFT(external);
24600 LEDGER_DRIFT(reusable);
24601 LEDGER_DRIFT(iokit_mapped);
24602 LEDGER_DRIFT(alternate_accounting);
24603 LEDGER_DRIFT(alternate_accounting_compressed);
24604 LEDGER_DRIFT(page_table);
24605 LEDGER_DRIFT(purgeable_volatile);
24606 LEDGER_DRIFT(purgeable_nonvolatile);
24607 LEDGER_DRIFT(purgeable_volatile_compressed);
24608 LEDGER_DRIFT(purgeable_nonvolatile_compressed);
24609 LEDGER_DRIFT(tagged_nofootprint);
24610 LEDGER_DRIFT(tagged_footprint);
24611 LEDGER_DRIFT(tagged_nofootprint_compressed);
24612 LEDGER_DRIFT(tagged_footprint_compressed);
24613 LEDGER_DRIFT(network_volatile);
24614 LEDGER_DRIFT(network_nonvolatile);
24615 LEDGER_DRIFT(network_volatile_compressed);
24616 LEDGER_DRIFT(network_nonvolatile_compressed);
24617 LEDGER_DRIFT(media_nofootprint);
24618 LEDGER_DRIFT(media_footprint);
24619 LEDGER_DRIFT(media_nofootprint_compressed);
24620 LEDGER_DRIFT(media_footprint_compressed);
24621 LEDGER_DRIFT(graphics_nofootprint);
24622 LEDGER_DRIFT(graphics_footprint);
24623 LEDGER_DRIFT(graphics_nofootprint_compressed);
24624 LEDGER_DRIFT(graphics_footprint_compressed);
24625 LEDGER_DRIFT(neural_nofootprint);
24626 LEDGER_DRIFT(neural_footprint);
24627 LEDGER_DRIFT(neural_nofootprint_compressed);
24628 LEDGER_DRIFT(neural_footprint_compressed);
24629 LEDGER_DRIFT(neural_nofootprint_total);
24630 } pmap_ledgers_drift;
24631
24632 void
24633 vm_map_pmap_check_ledgers(
24634 pmap_t pmap,
24635 ledger_t ledger,
24636 int pid,
24637 char *procname)
24638 {
24639 ledger_amount_t bal;
24640 boolean_t do_panic;
24641
24642 do_panic = FALSE;
24643
24644 pmap_ledgers_drift.num_pmaps_checked++;
24645
24646 #define LEDGER_CHECK_BALANCE(__LEDGER) \
24647 MACRO_BEGIN \
24648 int panic_on_negative = TRUE; \
24649 ledger_get_balance(ledger, \
24650 task_ledgers.__LEDGER, \
24651 &bal); \
24652 ledger_get_panic_on_negative(ledger, \
24653 task_ledgers.__LEDGER, \
24654 &panic_on_negative); \
24655 if (bal != 0) { \
24656 if (panic_on_negative || \
24657 (pmap_ledgers_panic && \
24658 pmap_ledgers_panic_leeway > 0 && \
24659 (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) || \
24660 bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
24661 do_panic = TRUE; \
24662 } \
24663 printf("LEDGER BALANCE proc %d (%s) " \
24664 "\"%s\" = %lld\n", \
24665 pid, procname, #__LEDGER, bal); \
24666 if (bal > 0) { \
24667 pmap_ledgers_drift.__LEDGER##_over++; \
24668 pmap_ledgers_drift.__LEDGER##_over_total += bal; \
24669 if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
24670 pmap_ledgers_drift.__LEDGER##_over_max = bal; \
24671 } \
24672 } else if (bal < 0) { \
24673 pmap_ledgers_drift.__LEDGER##_under++; \
24674 pmap_ledgers_drift.__LEDGER##_under_total += bal; \
24675 if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
24676 pmap_ledgers_drift.__LEDGER##_under_max = bal; \
24677 } \
24678 } \
24679 } \
24680 MACRO_END
24681
24682 LEDGER_CHECK_BALANCE(phys_footprint);
24683 LEDGER_CHECK_BALANCE(internal);
24684 LEDGER_CHECK_BALANCE(internal_compressed);
24685 LEDGER_CHECK_BALANCE(external);
24686 LEDGER_CHECK_BALANCE(reusable);
24687 LEDGER_CHECK_BALANCE(iokit_mapped);
24688 LEDGER_CHECK_BALANCE(alternate_accounting);
24689 LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
24690 LEDGER_CHECK_BALANCE(page_table);
24691 LEDGER_CHECK_BALANCE(purgeable_volatile);
24692 LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
24693 LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
24694 LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
24695 LEDGER_CHECK_BALANCE(tagged_nofootprint);
24696 LEDGER_CHECK_BALANCE(tagged_footprint);
24697 LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
24698 LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
24699 LEDGER_CHECK_BALANCE(network_volatile);
24700 LEDGER_CHECK_BALANCE(network_nonvolatile);
24701 LEDGER_CHECK_BALANCE(network_volatile_compressed);
24702 LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
24703 LEDGER_CHECK_BALANCE(media_nofootprint);
24704 LEDGER_CHECK_BALANCE(media_footprint);
24705 LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
24706 LEDGER_CHECK_BALANCE(media_footprint_compressed);
24707 LEDGER_CHECK_BALANCE(graphics_nofootprint);
24708 LEDGER_CHECK_BALANCE(graphics_footprint);
24709 LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
24710 LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
24711 LEDGER_CHECK_BALANCE(neural_nofootprint);
24712 LEDGER_CHECK_BALANCE(neural_footprint);
24713 LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
24714 LEDGER_CHECK_BALANCE(neural_footprint_compressed);
24715 LEDGER_CHECK_BALANCE(neural_nofootprint_total);
24716
24717 if (do_panic) {
24718 if (pmap_ledgers_panic) {
24719 panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
24720 pmap, pid, procname);
24721 } else {
24722 printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
24723 pmap, pid, procname);
24724 }
24725 }
24726 }
24727
24728 void
24729 vm_map_pmap_set_process(
24730 vm_map_t map,
24731 int pid,
24732 char *procname)
24733 {
24734 pmap_set_process(vm_map_pmap(map), pid, procname);
24735 }
24736
24737 #endif /* MACH_ASSERT */
24738
24739 /**
24740 * Check if a given map operation size is valid for the given map, taking
24741 * into account whether or not the map operation has overridden the soft limit.
24742 *
24743 * This function is meant to be inlined wherever possible as it can, in some
24744 * modes, generate telemetry events which capture shallow backtraces. To
24745 * maximize the usefulness of this backtrace, we want to minimize the depth at
24746 * which the backtrace is taken.
24747 */
24748 __attribute__((always_inline))
24749 bool
24750 vm_map_is_map_size_valid(
24751 vm_map_t target_map,
24752 vm_size_t size,
24753 bool no_soft_limit)
24754 {
24755 #ifdef __x86_64__
24756 // Do not enforce any additional limits on x64
24757 (void)target_map;
24758 (void)size;
24759 (void)no_soft_limit;
24760 return true;
24761 #else
24762 if (__probable(target_map->pmap != kernel_pmap ||
24763 size < VM_KERNEL_SIMPLE_MAX_SIZE || no_soft_limit)) {
24764 // Allocation size matches policy
24765 return true;
24766 }
24767
24768 switch (vm_map_kernel_alloc_limit_mode) {
24769 default:
24770 case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_BYPASS:
24771 return true;
24772 case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_TRAP:
24773 trap_telemetry_report_kernel_soft_error(
24774 TRAP_TELEMETRY_KERNEL_SOFT_ERROR_VM_KERNEL_MAX_ALLOC_SIZE,
24775 /* report_once_per_site */ false);
24776 return true;
24777 case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_REJECT:
24778 return false;
24779 case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_PANIC:
24780 panic("1,000,000K ought to be enough for anybody "
24781 "(requested %lu bytes)", size);
24782 }
24783 #endif /* __x86_64__ */
24784 }
24785
24786 vm_map_serial_t
24787 vm_map_maybe_serial_id(vm_map_t maybe_vm_map)
24788 {
24789 return maybe_vm_map != NULL ? maybe_vm_map->serial_id : VM_MAP_SERIAL_NONE;
24790 }
24791