1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_map.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * Virtual memory mapping module.
64 */
65
66 #include <mach/vm_types.h>
67 #include <mach_assert.h>
68
69 #include <vm/vm_options.h>
70
71 #include <libkern/OSAtomic.h>
72
73 #include <mach/kern_return.h>
74 #include <mach/port.h>
75 #include <mach/vm_attributes.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_behavior.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/memory_object.h>
80 #include <mach/mach_vm_server.h>
81 #include <machine/cpu_capabilities.h>
82 #include <mach/sdt.h>
83
84 #include <kern/assert.h>
85 #include <kern/backtrace.h>
86 #include <kern/counter.h>
87 #include <kern/exc_guard.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90 #include <kern/telemetry.h>
91 #include <kern/trap_telemetry.h>
92
93 #include <vm/cpm_internal.h>
94 #include <vm/memory_types.h>
95 #include <vm/vm_compressor_xnu.h>
96 #include <vm/vm_compressor_pager_internal.h>
97 #include <vm/vm_init_xnu.h>
98 #include <vm/vm_fault_internal.h>
99 #include <vm/vm_map_internal.h>
100 #include <vm/vm_object_internal.h>
101 #include <vm/vm_page_internal.h>
102 #include <vm/vm_pageout.h>
103 #include <vm/pmap.h>
104 #include <vm/vm_kern_internal.h>
105 #include <ipc/ipc_port.h>
106 #include <kern/sched_prim.h>
107 #include <kern/misc_protos.h>
108
109 #include <mach/vm_map_server.h>
110 #include <mach/mach_host_server.h>
111 #include <vm/vm_memtag.h>
112 #include <vm/vm_protos_internal.h>
113 #include <vm/vm_purgeable_internal.h>
114
115 #include <vm/vm_iokit.h>
116 #include <vm/vm_shared_region_internal.h>
117 #include <vm/vm_map_store_internal.h>
118 #include <vm/vm_memory_entry_xnu.h>
119 #include <vm/memory_object_internal.h>
120 #include <vm/vm_memory_entry.h>
121 #include <vm/vm_sanitize_internal.h>
122 #include <vm/vm_reclaim_xnu.h>
123 #if DEVELOPMENT || DEBUG
124 #include <vm/vm_compressor_info.h>
125 #endif /* DEVELOPMENT || DEBUG */
126 #include <san/kasan.h>
127
128 #include <sys/resource.h>
129 #include <sys/random.h>
130 #include <sys/codesign.h>
131 #include <sys/code_signing.h>
132 #include <sys/mman.h>
133 #include <sys/reboot.h>
134 #include <sys/kdebug_triage.h>
135 #include <sys/reason.h>
136
137 #include <os/log.h>
138
139 #include <libkern/section_keywords.h>
140
141 #include <os/hash.h>
142
143 #if DEVELOPMENT || DEBUG
144 extern int proc_selfcsflags(void);
145 int vm_log_xnu_user_debug = 0;
146 int panic_on_unsigned_execute = 0;
147 int panic_on_mlock_failure = 0;
148 #endif /* DEVELOPMENT || DEBUG */
149
150 #if DEVELOPMENT || DEBUG
151 int debug4k_filter = 0;
152 char debug4k_proc_name[1024] = "";
153 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
154 int debug4k_panic_on_misaligned_sharing = 0;
155 const char *debug4k_category_name[] = {
156 "error", /* 0 */
157 "life", /* 1 */
158 "load", /* 2 */
159 "fault", /* 3 */
160 "copy", /* 4 */
161 "share", /* 5 */
162 "adjust", /* 6 */
163 "pmap", /* 7 */
164 "mementry", /* 8 */
165 "iokit", /* 9 */
166 "upl", /* 10 */
167 "exc", /* 11 */
168 "vfs" /* 12 */
169 };
170 #endif /* DEVELOPMENT || DEBUG */
171 int debug4k_no_cow_copyin = 0;
172
173
174 #if __arm64__
175 extern const int fourk_binary_compatibility_unsafe;
176 #endif /* __arm64__ */
177 extern int proc_selfpid(void);
178 extern char *proc_name_address(void *p);
179 extern const char *proc_best_name(struct proc *p);
180
181 #if VM_MAP_DEBUG_APPLE_PROTECT
182 int vm_map_debug_apple_protect = 0;
183 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
184 #if VM_MAP_DEBUG_FOURK
185 int vm_map_debug_fourk = 0;
186 #endif /* VM_MAP_DEBUG_FOURK */
187
188 #if DEBUG || DEVELOPMENT
189 static TUNABLE(bool, vm_map_executable_immutable,
190 "vm_map_executable_immutable", true);
191 #else
192 #define vm_map_executable_immutable true
193 #endif
194
195 /** Do not enforce the kernel allocation size limit */
196 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_BYPASS (0)
197 /** Enforce the kernel allocation limit by refusing too large requests */
198 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_REJECT (1)
199 /** Enforce the kernel allocation limit by panicking on any too large request */
200 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_PANIC (2)
201 /** Do not enforce the kernel allocation limit but generate a telemetry trap */
202 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_TRAP (3)
203
204 #if DEVELOPMENT || DEBUG
205 static TUNABLE(int, vm_map_kernel_alloc_limit_mode,
206 "vm_map_kernel_alloc_limit_mode", VM_MAP_KERNEL_ALLOC_LIMIT_MODE_TRAP);
207 #else
208 #define vm_map_kernel_alloc_limit_mode VM_MAP_KERNEL_ALLOC_LIMIT_MODE_BYPASS
209 #endif /* DEVELOPMENT || DEBUG */
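/*
 * Illustrative note (not from the original source): on DEVELOPMENT/DEBUG
 * kernels the enforcement mode above is a boot-arg tunable, so booting with
 * e.g. "vm_map_kernel_alloc_limit_mode=1" selects the REJECT policy instead
 * of the default TRAP (telemetry) policy.
 */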
210
211 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
212
213 extern u_int32_t random(void); /* from <libkern/libkern.h> */
214 /* Internal prototypes
215 */
216
217 typedef struct vm_map_zap {
218 vm_map_entry_t vmz_head;
219 vm_map_entry_t *vmz_tail;
220 } *vm_map_zap_t;
221
222 #define VM_MAP_ZAP_DECLARE(zap) \
223 struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
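/*
 * Illustrative sketch (not from the original source): a "zap" list collects
 * the entries that vm_map_delete() unlinks while the map lock is held, so
 * that they can be freed once the lock is dropped.  The flags/guard values
 * below are placeholders.
 *
 *	VM_MAP_ZAP_DECLARE(zap);
 *
 *	vm_map_lock(map);
 *	(void)vm_map_delete(map, start, end, flags, guard, &zap);
 *	vm_map_unlock(map);
 *	... dispose of the entries collected on "zap" outside the lock ...
 */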
224
225 extern kern_return_t vm_map_wire_external(
226 vm_map_t map,
227 vm_map_offset_ut start_u,
228 vm_map_offset_ut end_u,
229 vm_prot_ut prot_u,
230 boolean_t user_wire) __exported;
231
232 #if XNU_PLATFORM_MacOSX
233 extern /* exported via Private.<arch>.MacOSX.exports on macOS */
234 #else
235 static
236 #endif
237 kern_return_t vm_map_copyin_common(
238 vm_map_t src_map,
239 vm_map_address_ut src_addr,
240 vm_map_size_ut len,
241 boolean_t src_destroy,
242 boolean_t src_volatile,
243 vm_map_copy_t *copy_result, /* OUT */
244 boolean_t use_maxprot);
245
246 static vm_map_entry_t vm_map_entry_insert(
247 vm_map_t map,
248 vm_map_entry_t insp_entry,
249 vm_map_offset_t start,
250 vm_map_offset_t end,
251 vm_object_t object,
252 vm_object_offset_t offset,
253 vm_map_kernel_flags_t vmk_flags,
254 boolean_t needs_copy,
255 vm_prot_t cur_protection,
256 vm_prot_t max_protection,
257 vm_inherit_t inheritance,
258 boolean_t clear_map_aligned);
259
260 static void vm_map_simplify_range(
261 vm_map_t map,
262 vm_map_offset_t start,
263 vm_map_offset_t end); /* forward */
264
265 static boolean_t vm_map_range_check(
266 vm_map_t map,
267 vm_map_offset_t start,
268 vm_map_offset_t end,
269 vm_map_entry_t *entry);
270
271 static void vm_map_submap_pmap_clean(
272 vm_map_t map,
273 vm_map_offset_t start,
274 vm_map_offset_t end,
275 vm_map_t sub_map,
276 vm_map_offset_t offset);
277
278 static void vm_map_pmap_enter(
279 vm_map_t map,
280 vm_map_offset_t addr,
281 vm_map_offset_t end_addr,
282 vm_object_t object,
283 vm_object_offset_t offset,
284 vm_prot_t protection);
285
286 static void _vm_map_clip_end(
287 struct vm_map_header *map_header,
288 vm_map_entry_t entry,
289 vm_map_offset_t end);
290
291 static void _vm_map_clip_start(
292 struct vm_map_header *map_header,
293 vm_map_entry_t entry,
294 vm_map_offset_t start);
295
296 static kmem_return_t vm_map_delete(
297 vm_map_t map,
298 vm_map_offset_t start,
299 vm_map_offset_t end,
300 vmr_flags_t flags,
301 kmem_guard_t guard,
302 vm_map_zap_t zap);
303
304 static void vm_map_copy_insert(
305 vm_map_t map,
306 vm_map_entry_t after_where,
307 vm_map_copy_t copy);
308
309 static kern_return_t vm_map_copy_overwrite_unaligned(
310 vm_map_t dst_map,
311 vm_map_entry_t entry,
312 vm_map_copy_t copy,
313 vm_map_address_t start,
314 boolean_t discard_on_success);
315
316 static kern_return_t vm_map_copy_overwrite_aligned(
317 vm_map_t dst_map,
318 vm_map_entry_t tmp_entry,
319 vm_map_copy_t copy,
320 vm_map_offset_t start,
321 pmap_t pmap);
322
323 static kern_return_t vm_map_copyin_kernel_buffer(
324 vm_map_t src_map,
325 vm_map_address_t src_addr,
326 vm_map_size_t len,
327 boolean_t src_destroy,
328 vm_map_copy_t *copy_result); /* OUT */
329
330 static kern_return_t vm_map_copyout_kernel_buffer(
331 vm_map_t map,
332 vm_map_address_t *addr, /* IN/OUT */
333 vm_map_copy_t copy,
334 vm_map_size_t copy_size,
335 boolean_t overwrite,
336 boolean_t consume_on_success);
337
338 static void vm_map_fork_share(
339 vm_map_t old_map,
340 vm_map_entry_t old_entry,
341 vm_map_t new_map);
342
343 static boolean_t vm_map_fork_copy(
344 vm_map_t old_map,
345 vm_map_entry_t *old_entry_p,
346 vm_map_t new_map,
347 int vm_map_copyin_flags);
348
349 static kern_return_t vm_map_wire_nested(
350 vm_map_t map,
351 vm_map_offset_t start,
352 vm_map_offset_t end,
353 vm_prot_t caller_prot,
354 vm_tag_t tag,
355 boolean_t user_wire,
356 pmap_t map_pmap,
357 vm_map_offset_t pmap_addr,
358 ppnum_t *physpage_p);
359
360 static kern_return_t vm_map_unwire_nested(
361 vm_map_t map,
362 vm_map_offset_t start,
363 vm_map_offset_t end,
364 boolean_t user_wire,
365 pmap_t map_pmap,
366 vm_map_offset_t pmap_addr);
367
368 static kern_return_t vm_map_overwrite_submap_recurse(
369 vm_map_t dst_map,
370 vm_map_offset_t dst_addr,
371 vm_map_size_t dst_size);
372
373 static kern_return_t vm_map_copy_overwrite_nested(
374 vm_map_t dst_map,
375 vm_map_offset_t dst_addr,
376 vm_map_copy_t copy,
377 boolean_t interruptible,
378 pmap_t pmap,
379 boolean_t discard_on_success);
380
381 static kern_return_t vm_map_remap_extract(
382 vm_map_t map,
383 vm_map_offset_t addr,
384 vm_map_size_t size,
385 boolean_t copy,
386 vm_map_copy_t map_copy,
387 vm_prot_t *cur_protection,
388 vm_prot_t *max_protection,
389 vm_inherit_t inheritance,
390 vm_map_kernel_flags_t vmk_flags);
391
392 static void vm_map_region_look_for_page(
393 vm_map_t map,
394 vm_map_offset_t va,
395 vm_object_t object,
396 vm_object_offset_t offset,
397 int max_refcnt,
398 unsigned short depth,
399 vm_region_extended_info_t extended,
400 mach_msg_type_number_t count);
401
402 static boolean_t vm_map_region_has_obj_ref(
403 vm_map_entry_t entry,
404 vm_object_t object);
405
406
407 static kern_return_t vm_map_willneed(
408 vm_map_t map,
409 vm_map_offset_t start,
410 vm_map_offset_t end);
411
412 static kern_return_t vm_map_reuse_pages(
413 vm_map_t map,
414 vm_map_offset_t start,
415 vm_map_offset_t end);
416
417 static kern_return_t vm_map_reusable_pages(
418 vm_map_t map,
419 vm_map_offset_t start,
420 vm_map_offset_t end);
421
422 static kern_return_t vm_map_can_reuse(
423 vm_map_t map,
424 vm_map_offset_t start,
425 vm_map_offset_t end);
426
427 static kern_return_t vm_map_zero(
428 vm_map_t map,
429 vm_map_offset_t start,
430 vm_map_offset_t end);
431
432 static kern_return_t vm_map_random_address_for_size(
433 vm_map_t map,
434 vm_map_offset_t *address,
435 vm_map_size_t size,
436 vm_map_kernel_flags_t vmk_flags);
437
438
439 #if CONFIG_MAP_RANGES
440
441 static vm_map_range_id_t vm_map_user_range_resolve(
442 vm_map_t map,
443 mach_vm_address_t addr,
444 mach_vm_address_t size,
445 mach_vm_range_t range);
446
447 #endif /* CONFIG_MAP_RANGES */
448 #if MACH_ASSERT
449 static kern_return_t vm_map_pageout(
450 vm_map_t map,
451 vm_map_offset_t start,
452 vm_map_offset_t end);
453 #endif /* MACH_ASSERT */
454
455 kern_return_t vm_map_corpse_footprint_collect(
456 vm_map_t old_map,
457 vm_map_entry_t old_entry,
458 vm_map_t new_map);
459 void vm_map_corpse_footprint_collect_done(
460 vm_map_t new_map);
461 void vm_map_corpse_footprint_destroy(
462 vm_map_t map);
463 kern_return_t vm_map_corpse_footprint_query_page_info(
464 vm_map_t map,
465 vm_map_offset_t va,
466 int *disposition_p);
467 void vm_map_footprint_query_page_info(
468 vm_map_t map,
469 vm_map_entry_t map_entry,
470 vm_map_offset_t curr_s_offset,
471 int *disposition_p);
472
473 #if CONFIG_MAP_RANGES
474 static void vm_map_range_map_init(void);
475 #endif /* CONFIG_MAP_RANGES */
476
477 pid_t find_largest_process_vm_map_entries(void);
478
479 __attribute__((always_inline))
480 int
481 vm_map_kernel_flags_vmflags(vm_map_kernel_flags_t vmk_flags)
482 {
483 int flags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;
484
485 /* in vmk flags the meaning of fixed/anywhere is inverted */
486 return flags ^ (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
487 }
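/*
 * Example (illustrative): because "vmf_fixed" is stored as the inverse of
 * VM_FLAGS_ANYWHERE, converting in either direction XORs the
 * (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE) bits, so values round-trip cleanly:
 *
 *	vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE;	// vmf_fixed == 0
 *	int vmflags = vm_map_kernel_flags_vmflags(fl);		// VM_FLAGS_ANYWHERE is set
 *	vm_map_kernel_flags_set_vmflags(&fl, vmflags, VM_KERN_MEMORY_NONE);
 *	// fl.vmf_fixed is still 0
 */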
488
489 __attribute__((always_inline, overloadable))
490 void
491 vm_map_kernel_flags_set_vmflags(
492 vm_map_kernel_flags_t *vmk_flags,
493 int vm_flags,
494 vm_tag_t vm_tag)
495 {
496 vm_flags ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
497 vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
498 vmk_flags->__vm_flags |= (vm_flags & VM_FLAGS_ANY_MASK);
499 vmk_flags->vm_tag = vm_tag;
500 }
501
502 __attribute__((always_inline, overloadable))
503 void
504 vm_map_kernel_flags_set_vmflags(
505 vm_map_kernel_flags_t *vmk_flags,
506 int vm_flags_and_tag)
507 {
508 vm_flags_and_tag ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
509 vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
510 vmk_flags->__vm_flags |= (vm_flags_and_tag & VM_FLAGS_ANY_MASK);
511 VM_GET_FLAGS_ALIAS(vm_flags_and_tag, vmk_flags->vm_tag);
512 }
513
514 __attribute__((always_inline))
515 void
516 vm_map_kernel_flags_and_vmflags(
517 vm_map_kernel_flags_t *vmk_flags,
518 int vm_flags_mask)
519 {
520 /* this function doesn't handle the inverted FIXED/ANYWHERE */
521 assert(vm_flags_mask & VM_FLAGS_ANYWHERE);
522 vmk_flags->__vm_flags &= vm_flags_mask;
523 }
524
525 __attribute__((always_inline))
526 bool
527 vm_map_kernel_flags_check_vm_and_kflags(
528 vm_map_kernel_flags_t vmk_flags,
529 int vm_flags_mask)
530 {
531 return (vmk_flags.__vm_flags & ~vm_flags_mask) == 0;
532 }
533
534 bool
535 vm_map_kernel_flags_check_vmflags(
536 vm_map_kernel_flags_t vmk_flags,
537 int vm_flags_mask)
538 {
539 int vmflags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;
540
541 /* Note: a size of up to 16 bytes still has good calling conventions */
542 static_assert(sizeof(vm_map_kernel_flags_t) == 16);
543
544 #if DEBUG || DEVELOPMENT
545 /*
546 * All of this compiles to nothing if all checks pass.
547 */
548 #define check(field, value) ({ \
549 vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE; \
550 fl.__vm_flags = (value); \
551 fl.field = 0; \
552 assert(fl.__vm_flags == 0); \
553 })
554
555 /* bits 0-7 */
556 check(vmf_fixed, VM_FLAGS_ANYWHERE); // intentionally inverted: vmf_fixed is the inverse of VM_FLAGS_ANYWHERE
557 check(vmf_purgeable, VM_FLAGS_PURGABLE);
558 check(vmf_4gb_chunk, VM_FLAGS_4GB_CHUNK);
559 check(vmf_random_addr, VM_FLAGS_RANDOM_ADDR);
560 check(vmf_no_cache, VM_FLAGS_NO_CACHE);
561 check(vmf_resilient_codesign, VM_FLAGS_RESILIENT_CODESIGN);
562 check(vmf_resilient_media, VM_FLAGS_RESILIENT_MEDIA);
563 check(vmf_permanent, VM_FLAGS_PERMANENT);
564
565 /* bits 8-15 */
566 check(vmf_tpro, VM_FLAGS_TPRO);
567 check(vmf_overwrite, VM_FLAGS_OVERWRITE);
568
569 /* bits 16-23 */
570 check(vmf_superpage_size, VM_FLAGS_SUPERPAGE_MASK);
571 check(vmf_return_data_addr, VM_FLAGS_RETURN_DATA_ADDR);
572 check(vmf_return_4k_data_addr, VM_FLAGS_RETURN_4K_DATA_ADDR);
573
574 {
575 vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE;
576
577 /* check user tags will never clip */
578 fl.vm_tag = VM_MEMORY_COUNT - 1;
579 assert(fl.vm_tag == VM_MEMORY_COUNT - 1);
580
581 /* check kernel tags will never clip */
582 fl.vm_tag = VM_MAX_TAG_VALUE - 1;
583 assert(fl.vm_tag == VM_MAX_TAG_VALUE - 1);
584 }
585
586
587 #undef check
588 #endif /* DEBUG || DEVELOPMENT */
589
590 return (vmflags & ~vm_flags_mask) == 0;
591 }
592
593 /*
594 * Routines to copy a vm_map_entry. We must be careful to correctly
595 * manage the wired page count. vm_map_entry_copy() creates a new
596 * map entry referring to the same memory - the wired count in the new entry
597 * must be set to zero. vm_map_entry_copy_full() creates a new
598 * entry that is identical to the old entry. This preserves the
599 * wire count; it's used for map splitting and zone changing in
600 * vm_map_copyout.
601 */
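/*
 * Illustrative sketch of the distinction above (argument names are
 * placeholders):
 *
 *	vm_map_entry_copy_full(new, old);	// new->wired_count == old->wired_count
 *	vm_map_entry_copy(map, new, old);	// new->wired_count == 0, unwired copy
 */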
602
603 static inline void
604 vm_map_entry_copy_csm_assoc(
605 vm_map_t map __unused,
606 vm_map_entry_t new __unused,
607 vm_map_entry_t old __unused)
608 {
609 #if CODE_SIGNING_MONITOR
610 /* when code signing monitor is enabled, we want to reset on copy */
611 new->csm_associated = FALSE;
612 #else
613 /* when code signing monitor is not enabled, assert as a sanity check */
614 assert(new->csm_associated == FALSE);
615 #endif
616 #if DEVELOPMENT || DEBUG
617 if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
618 printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug\n",
619 proc_selfpid(),
620 (get_bsdtask_info(current_task())
621 ? proc_name_address(get_bsdtask_info(current_task()))
622 : "?"),
623 __FUNCTION__, __LINE__,
624 map, new, new->vme_start, new->vme_end);
625 }
626 #endif /* DEVELOPMENT || DEBUG */
627 #if XNU_TARGET_OS_OSX
628 /*
629 * On macOS, entries with "vme_xnu_user_debug" can be copied during fork()
630 * and we want the child's entry to keep its "vme_xnu_user_debug" to avoid
631 * triggering CSM assertions when the child accesses its mapping.
632 */
633 #else /* XNU_TARGET_OS_OSX */
634 new->vme_xnu_user_debug = FALSE;
635 #endif /* XNU_TARGET_OS_OSX */
636 }
637
638 /*
639 * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
640 * But for security reasons on some platforms, we don't want the
641 * new mapping to be "used for jit", so we reset the flag here.
642 */
643 static inline void
644 vm_map_entry_copy_code_signing(
645 vm_map_t map,
646 vm_map_entry_t new,
647 vm_map_entry_t old __unused)
648 {
649 if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
650 assert(new->used_for_jit == old->used_for_jit);
651 } else {
652 if (old->used_for_jit) {
653 DTRACE_VM3(cs_wx,
654 uint64_t, new->vme_start,
655 uint64_t, new->vme_end,
656 vm_prot_t, new->protection);
657 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
658 proc_selfpid(),
659 (get_bsdtask_info(current_task())
660 ? proc_name_address(get_bsdtask_info(current_task()))
661 : "?"),
662 __FUNCTION__,
663 "removing execute access");
664 new->protection &= ~VM_PROT_EXECUTE;
665 new->max_protection &= ~VM_PROT_EXECUTE;
666 }
667 new->used_for_jit = FALSE;
668 }
669 }
670
671 static inline void
672 vm_map_entry_copy_full(
673 vm_map_entry_t new,
674 vm_map_entry_t old)
675 {
676 #if MAP_ENTRY_CREATION_DEBUG
677 btref_put(new->vme_creation_bt);
678 btref_retain(old->vme_creation_bt);
679 #endif
680 #if MAP_ENTRY_INSERTION_DEBUG
681 btref_put(new->vme_insertion_bt);
682 btref_retain(old->vme_insertion_bt);
683 #endif
684 #if VM_BTLOG_TAGS
685 /* Discard the btref that might be in the new entry */
686 if (new->vme_kernel_object) {
687 btref_put(new->vme_tag_btref);
688 }
689 /* Retain the btref in the old entry to account for its copy */
690 if (old->vme_kernel_object) {
691 btref_retain(old->vme_tag_btref);
692 }
693 #endif /* VM_BTLOG_TAGS */
694 *new = *old;
695 }
696
697 static inline void
698 vm_map_entry_copy(
699 vm_map_t map,
700 vm_map_entry_t new,
701 vm_map_entry_t old)
702 {
703 vm_map_entry_copy_full(new, old);
704
705 new->is_shared = FALSE;
706 new->needs_wakeup = FALSE;
707 new->in_transition = FALSE;
708 new->wired_count = 0;
709 new->user_wired_count = 0;
710 new->vme_permanent = FALSE;
711 vm_map_entry_copy_code_signing(map, new, old);
712 vm_map_entry_copy_csm_assoc(map, new, old);
713 if (new->iokit_acct) {
714 assertf(!new->use_pmap, "old %p new %p\n", old, new);
715 new->iokit_acct = FALSE;
716 new->use_pmap = TRUE;
717 }
718 new->vme_resilient_codesign = FALSE;
719 new->vme_resilient_media = FALSE;
720 new->vme_atomic = FALSE;
721 new->vme_no_copy_on_read = FALSE;
722 }
723
724 /*
725 * The underlying lck_rw_lock_shared_to_exclusive() returns FALSE (0) on failure.
726 * This wrapper inverts that: it evaluates to zero on success and non-zero on failure.
727 */
728 __attribute__((always_inline))
729 int
730 vm_map_lock_read_to_write(vm_map_t map)
731 {
732 if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
733 DTRACE_VM(vm_map_lock_upgrade);
734 return 0;
735 }
736 return 1;
737 }
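/*
 * Typical usage sketch (illustrative): if the shared-to-exclusive upgrade
 * fails, the read lock has already been dropped, so the caller must take
 * the lock again and re-validate anything it looked up under the read lock.
 *
 *	vm_map_lock_read(map);
 *	... lookup ...
 *	if (vm_map_lock_read_to_write(map)) {
 *		// upgrade failed: the lock was released, start over
 *		vm_map_lock(map);
 *		... re-validate the lookup ...
 *	}
 *	... modify the map ...
 *	vm_map_unlock(map);
 */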
738
739 __attribute__((always_inline))
740 boolean_t
741 vm_map_try_lock(vm_map_t map)
742 {
743 if (lck_rw_try_lock_exclusive(&(map)->lock)) {
744 DTRACE_VM(vm_map_lock_w);
745 return TRUE;
746 }
747 return FALSE;
748 }
749
750 __attribute__((always_inline))
751 boolean_t
752 vm_map_try_lock_read(vm_map_t map)
753 {
754 if (lck_rw_try_lock_shared(&(map)->lock)) {
755 DTRACE_VM(vm_map_lock_r);
756 return TRUE;
757 }
758 return FALSE;
759 }
760
761 /*!
762 * @function kdp_vm_map_is_acquired_exclusive
763 *
764 * @abstract
765 * Checks if vm map is acquired exclusive.
766 *
767 * @discussion
768 * NOT SAFE: To be used only by kernel debugger.
769 *
770 * @param map map to check
771 *
772 * @returns TRUE if the map is acquired exclusively.
773 */
774 boolean_t
775 kdp_vm_map_is_acquired_exclusive(vm_map_t map)
776 {
777 return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
778 }
779
780 /*
781 * Routines to get the page size the caller should
782 * use while inspecting the target address space.
783 * Use the "_safely" variant if the caller is dealing with a user-provided
784 * array whose size depends on the page size, to avoid any overflow or
785 * underflow of a user-allocated buffer.
786 */
787 int
788 vm_self_region_page_shift_safely(
789 vm_map_t target_map)
790 {
791 int effective_page_shift = 0;
792
793 if (PAGE_SIZE == (4096)) {
794 /* x86_64 and 4k watches: always use 4k */
795 return PAGE_SHIFT;
796 }
797 /* did caller provide an explicit page size for this thread to use? */
798 effective_page_shift = thread_self_region_page_shift();
799 if (effective_page_shift) {
800 /* use the explicitly-provided page size */
801 return effective_page_shift;
802 }
803 /* no explicit page size: use the caller's page size... */
804 effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
805 if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
806 /* page size match: safe to use */
807 return effective_page_shift;
808 }
809 /* page size mismatch */
810 return -1;
811 }
812 int
813 vm_self_region_page_shift(
814 vm_map_t target_map)
815 {
816 int effective_page_shift;
817
818 effective_page_shift = vm_self_region_page_shift_safely(target_map);
819 if (effective_page_shift == -1) {
820 /* no safe value but OK to guess for caller */
821 effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
822 VM_MAP_PAGE_SHIFT(target_map));
823 }
824 return effective_page_shift;
825 }
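/*
 * Usage sketch (illustrative): a caller sizing a user-visible array from a
 * region size should use the "_safely" variant and fail on a page-size
 * mismatch rather than guess:
 *
 *	int shift = vm_self_region_page_shift_safely(target_map);
 *	if (shift == -1) {
 *		return KERN_INVALID_ARGUMENT;	// page-size mismatch
 *	}
 *	count = (unsigned int)(region_size >> shift);	// pages covered by the region
 */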
826
827
828 /*
829 * Decide if we want to allow processes to execute from their data or stack areas.
830 * override_nx() returns true if we do. Data/stack execution can be enabled independently
831 * for 32 and 64 bit processes. Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
832 * or allow_stack_exec to enable data execution for that type of data area for that particular
833 * ABI (or both by or'ing the flags together). These are initialized in the architecture
834 * specific pmap files since the default behavior varies according to architecture. The
835 * main reason it varies is because of the need to provide binary compatibility with old
836 * applications that were written before these restrictions came into being. In the old
837 * days, an app could execute anything it could read, but this has slowly been tightened
838 * up over time. The default behavior is:
839 *
840 * 32-bit PPC apps may execute from both stack and data areas
841 * 32-bit Intel apps may execute from data areas but not stack
842 * 64-bit PPC/Intel apps may not execute from either data or stack
843 *
844 * An application on any architecture may override these defaults by explicitly
845 * adding PROT_EXEC permission to the page in question with the mprotect(2)
846 * system call. This code here just determines what happens when an app tries to
847 * execute from a page that lacks execute permission.
848 *
849 * Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
850 * default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
851 * a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
852 * execution from data areas for a particular binary even if the arch normally permits it. As
853 * a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
854 * to support some complicated use cases, notably browsers with out-of-process plugins that
855 * are not all NX-safe.
856 */
857
858 extern int allow_data_exec, allow_stack_exec;
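/*
 * Example (illustrative): allow_data_exec and allow_stack_exec are bitmasks
 * of VM_ABI_32 / VM_ABI_64.  With allow_data_exec == VM_ABI_32, a 32-bit
 * task may execute from a data area (unless map_disallow_data_exec is set),
 * while a 64-bit task may not:
 *
 *	// 32-bit task: override_nx(map, VM_MEMORY_MALLOC) is non-zero -> allowed
 *	// 64-bit task: override_nx(map, VM_MEMORY_MALLOC) is zero     -> denied
 */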
859
860 int
861 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
862 {
863 int current_abi;
864
865 if (map->pmap == kernel_pmap) {
866 return FALSE;
867 }
868
869 /*
870 * Determine if the app is running in 32 or 64 bit mode.
871 */
872
873 if (vm_map_is_64bit(map)) {
874 current_abi = VM_ABI_64;
875 } else {
876 current_abi = VM_ABI_32;
877 }
878
879 /*
880 * Determine if we should allow the execution based on whether it's a
881 * stack or data area and the current architecture.
882 */
883
884 if (user_tag == VM_MEMORY_STACK) {
885 return allow_stack_exec & current_abi;
886 }
887
888 return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
889 }
890
891
892 /*
893 * Virtual memory maps provide for the mapping, protection,
894 * and sharing of virtual memory objects. In addition,
895 * this module provides for an efficient virtual copy of
896 * memory from one map to another.
897 *
898 * Synchronization is required prior to most operations.
899 *
900 * Maps consist of an ordered doubly-linked list of simple
901 * entries; a single hint is used to speed up lookups.
902 *
903 * Sharing maps have been deleted from this version of Mach.
904 * All shared objects are now mapped directly into the respective
905 * maps. This requires a change in the copy on write strategy;
906 * the asymmetric (delayed) strategy is used for shared temporary
907 * objects instead of the symmetric (shadow) strategy. All maps
908 * are now "top level" maps (either task map, kernel map or submap
909 * of the kernel map).
910 *
911 * Since portions of maps are specified by start/end addresses,
912 * which may not align with existing map entries, all
913 * routines merely "clip" entries to these start/end values.
914 * [That is, an entry is split into two, bordering at a
915 * start or end value.] Note that these clippings may not
916 * always be necessary (as the two resulting entries are then
917 * not changed); however, the clipping is done for convenience.
918 * No attempt is currently made to "glue back together" two
919 * abutting entries.
920 *
921 * The symmetric (shadow) copy strategy implements virtual copy
922 * by copying VM object references from one map to
923 * another, and then marking both regions as copy-on-write.
924 * It is important to note that only one writeable reference
925 * to a VM object region exists in any map when this strategy
926 * is used -- this means that shadow object creation can be
927 * delayed until a write operation occurs. The asymmetric (delayed)
928 * strategy allows multiple maps to have writeable references to
929 * the same region of a vm object, and hence cannot delay creating
930 * its copy objects. See vm_object_copy_quickly() in vm_object.c.
931 * Copying of permanent objects is completely different; see
932 * vm_object_copy_strategically() in vm_object.c.
933 */
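/*
 * Worked example of clipping (illustrative): removing [0x5000, 0x7000) from
 * a single entry covering [0x4000, 0x8000) first clips it at 0x5000 and at
 * 0x7000, leaving three entries [0x4000,0x5000), [0x5000,0x7000) and
 * [0x7000,0x8000); only the middle one is then unlinked.  As noted above, no
 * attempt is made afterwards to glue the two survivors back together.
 */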
934
935 ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);
936
937 #define VM_MAP_ZONE_NAME "maps"
938 #define VM_MAP_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
939
940 #define VM_MAP_ENTRY_ZONE_NAME "VM map entries"
941 #define VM_MAP_ENTRY_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
942
943 #define VM_MAP_HOLES_ZONE_NAME "VM map holes"
944 #define VM_MAP_HOLES_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
945
946 /*
947 * Asserts that a vm_map_copy object is coming from the
948 * vm_map_copy_zone to ensure that it isn't a fake constructed
949 * anywhere else.
950 */
951 void
952 vm_map_copy_require(struct vm_map_copy *copy)
953 {
954 zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
955 }
956
957 /*
958 * vm_map_require:
959 *
960 * Ensures that the argument is memory allocated from the genuine
961 * vm map zone. (See zone_id_require().)
962 */
963 void
964 vm_map_require(vm_map_t map)
965 {
966 zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
967 }
968
969 #define VM_MAP_EARLY_COUNT_MAX 16
970 static __startup_data vm_offset_t map_data;
971 static __startup_data vm_size_t map_data_size;
972 static __startup_data vm_offset_t kentry_data;
973 static __startup_data vm_size_t kentry_data_size;
974 static __startup_data vm_offset_t map_holes_data;
975 static __startup_data vm_size_t map_holes_data_size;
976 static __startup_data vm_map_t *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
977 static __startup_data uint32_t early_map_count;
978
979 #if XNU_TARGET_OS_OSX
980 #define NO_COALESCE_LIMIT ((1024 * 128) - 1)
981 #else /* XNU_TARGET_OS_OSX */
982 #define NO_COALESCE_LIMIT 0
983 #endif /* XNU_TARGET_OS_OSX */
984
985 /* Skip acquiring locks if we're in the midst of a kernel core dump */
986 unsigned int not_in_kdp = 1;
987
988 unsigned int vm_map_set_cache_attr_count = 0;
989
990 kern_return_t
991 vm_map_set_cache_attr(
992 vm_map_t map,
993 vm_map_offset_t va)
994 {
995 vm_map_entry_t map_entry;
996 vm_object_t object;
997 kern_return_t kr = KERN_SUCCESS;
998
999 vm_map_lock_read(map);
1000
1001 if (!vm_map_lookup_entry(map, va, &map_entry) ||
1002 map_entry->is_sub_map) {
1003 /*
1004 * that memory is not properly mapped
1005 */
1006 kr = KERN_INVALID_ARGUMENT;
1007 goto done;
1008 }
1009 object = VME_OBJECT(map_entry);
1010
1011 if (object == VM_OBJECT_NULL) {
1012 /*
1013 * there should be a VM object here at this point
1014 */
1015 kr = KERN_INVALID_ARGUMENT;
1016 goto done;
1017 }
1018 vm_object_lock(object);
1019 object->set_cache_attr = TRUE;
1020 vm_object_unlock(object);
1021
1022 vm_map_set_cache_attr_count++;
1023 done:
1024 vm_map_unlock_read(map);
1025
1026 return kr;
1027 }
1028
1029
1030 #if CONFIG_CODE_DECRYPTION
1031 /*
1032 * vm_map_apple_protected:
1033 * This remaps the requested part of the object with an object backed by
1034 * the decrypting pager.
1035 * crypt_info contains entry points and session data for the crypt module.
1036 * The crypt_info block will be copied by vm_map_apple_protected. The data structures
1037 * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
1038 */
1039 kern_return_t
1040 vm_map_apple_protected(
1041 vm_map_t map,
1042 vm_map_offset_t start,
1043 vm_map_offset_t end,
1044 vm_object_offset_t crypto_backing_offset,
1045 struct pager_crypt_info *crypt_info,
1046 uint32_t cryptid)
1047 {
1048 boolean_t map_locked;
1049 kern_return_t kr;
1050 vm_map_entry_t map_entry;
1051 struct vm_map_entry tmp_entry;
1052 memory_object_t unprotected_mem_obj;
1053 vm_object_t protected_object;
1054 vm_map_offset_t map_addr;
1055 vm_map_offset_t start_aligned, end_aligned;
1056 vm_object_offset_t crypto_start, crypto_end;
1057 boolean_t cache_pager;
1058
1059 map_locked = FALSE;
1060 unprotected_mem_obj = MEMORY_OBJECT_NULL;
1061
1062 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
1063 return KERN_INVALID_ADDRESS;
1064 }
1065 start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
1066 end_aligned = vm_map_round_page(end, PAGE_MASK_64);
1067 start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
1068 end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));
1069
1070 #if __arm64__
1071 /*
1072 * "start" and "end" might be 4K-aligned but not 16K-aligned,
1073 * so we might have to loop and establish up to 3 mappings:
1074 *
1075 * + the first 16K-page, which might overlap with the previous
1076 * 4K-aligned mapping,
1077 * + the center,
1078 * + the last 16K-page, which might overlap with the next
1079 * 4K-aligned mapping.
1080 * Each of these mapping might be backed by a vnode pager (if
1081 * properly page-aligned) or a "fourk_pager", itself backed by a
1082 * vnode pager (if 4K-aligned but not page-aligned).
1083 */
1084 #endif /* __arm64__ */
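	/*
	 * Worked example (illustrative, 16K kernel page size): for
	 * start == 0x1000 and end == 0x9000 (4K-aligned only),
	 * start_aligned == 0x0 and end_aligned == 0xC000, so the loop below
	 * can end up remapping a leading partial 16K page, a fully covered
	 * middle 16K page, and a trailing partial 16K page as separate
	 * entries.
	 */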
1085
1086 map_addr = start_aligned;
1087 for (map_addr = start_aligned;
1088 map_addr < end;
1089 map_addr = tmp_entry.vme_end) {
1090 vm_map_lock(map);
1091 map_locked = TRUE;
1092
1093 /* lookup the protected VM object */
1094 if (!vm_map_lookup_entry(map,
1095 map_addr,
1096 &map_entry) ||
1097 map_entry->is_sub_map ||
1098 VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
1099 /* that memory is not properly mapped */
1100 kr = KERN_INVALID_ARGUMENT;
1101 goto done;
1102 }
1103
1104 /* ensure mapped memory is mapped as executable,
1105 * except for the model decryption flow */
1106 if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
1107 !(map_entry->protection & VM_PROT_EXECUTE)) {
1108 kr = KERN_INVALID_ARGUMENT;
1109 goto done;
1110 }
1111
1112 /* get the protected object to be decrypted */
1113 protected_object = VME_OBJECT(map_entry);
1114 if (protected_object == VM_OBJECT_NULL) {
1115 /* there should be a VM object here at this point */
1116 kr = KERN_INVALID_ARGUMENT;
1117 goto done;
1118 }
1119 /* ensure protected object stays alive while map is unlocked */
1120 vm_object_reference(protected_object);
1121
1122 /* limit the map entry to the area we want to cover */
1123 vm_map_clip_start(map, map_entry, start_aligned);
1124 vm_map_clip_end(map, map_entry, end_aligned);
1125
1126 tmp_entry = *map_entry;
1127 map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
1128 vm_map_unlock(map);
1129 map_locked = FALSE;
1130
1131 /*
1132 * This map entry might be only partially encrypted
1133 * (if not fully "page-aligned").
1134 */
1135 crypto_start = 0;
1136 crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
1137 if (tmp_entry.vme_start < start) {
1138 if (tmp_entry.vme_start != start_aligned) {
1139 kr = KERN_INVALID_ADDRESS;
1140 vm_object_deallocate(protected_object);
1141 goto done;
1142 }
1143 crypto_start += (start - tmp_entry.vme_start);
1144 }
1145 if (tmp_entry.vme_end > end) {
1146 if (tmp_entry.vme_end != end_aligned) {
1147 kr = KERN_INVALID_ADDRESS;
1148 vm_object_deallocate(protected_object);
1149 goto done;
1150 }
1151 crypto_end -= (tmp_entry.vme_end - end);
1152 }
1153
1154 /*
1155 * This "extra backing offset" is needed to get the decryption
1156 * routine to use the right key. It adjusts for the possibly
1157 * relative offset of an interposed "4K" pager...
1158 */
1159 if (crypto_backing_offset == (vm_object_offset_t) -1) {
1160 crypto_backing_offset = VME_OFFSET(&tmp_entry);
1161 }
1162
1163 cache_pager = TRUE;
1164 #if XNU_TARGET_OS_OSX
1165 if (vm_map_is_alien(map)) {
1166 cache_pager = FALSE;
1167 }
1168 #endif /* XNU_TARGET_OS_OSX */
1169
1170 /*
1171 * Lookup (and create if necessary) the protected memory object
1172 * matching that VM object.
1173 * If successful, this also grabs a reference on the memory object,
1174 * to guarantee that it doesn't go away before we get a chance to map
1175 * it.
1176 */
1177 unprotected_mem_obj = apple_protect_pager_setup(
1178 protected_object,
1179 VME_OFFSET(&tmp_entry),
1180 crypto_backing_offset,
1181 crypt_info,
1182 crypto_start,
1183 crypto_end,
1184 cache_pager);
1185
1186 /* release extra ref on protected object */
1187 vm_object_deallocate(protected_object);
1188
1189 if (unprotected_mem_obj == NULL) {
1190 kr = KERN_FAILURE;
1191 goto done;
1192 }
1193
1194 /* can overwrite an immutable mapping */
1195 vm_map_kernel_flags_t vmk_flags = {
1196 .vmf_fixed = true,
1197 .vmf_overwrite = true,
1198 .vmkf_overwrite_immutable = true,
1199 };
1200 /* make the new mapping as "permanent" as the one it replaces */
1201 vmk_flags.vmf_permanent = tmp_entry.vme_permanent;
1202
1203 /* map this memory object in place of the current one */
1204 map_addr = tmp_entry.vme_start;
1205 kr = mach_vm_map_kernel(map,
1206 vm_sanitize_wrap_addr_ref(&map_addr),
1207 (tmp_entry.vme_end -
1208 tmp_entry.vme_start),
1209 (mach_vm_offset_t) 0,
1210 vmk_flags,
1211 (ipc_port_t)(uintptr_t) unprotected_mem_obj,
1212 0,
1213 TRUE,
1214 tmp_entry.protection,
1215 tmp_entry.max_protection,
1216 tmp_entry.inheritance);
1217 assertf(kr == KERN_SUCCESS,
1218 "kr = 0x%x\n", kr);
1219 assertf(map_addr == tmp_entry.vme_start,
1220 "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
1221 (uint64_t)map_addr,
1222 (uint64_t) tmp_entry.vme_start,
1223 &tmp_entry);
1224
1225 #if VM_MAP_DEBUG_APPLE_PROTECT
1226 if (vm_map_debug_apple_protect) {
1227 printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
1228 " backing:[object:%p,offset:0x%llx,"
1229 "crypto_backing_offset:0x%llx,"
1230 "crypto_start:0x%llx,crypto_end:0x%llx]\n",
1231 map,
1232 (uint64_t) map_addr,
1233 (uint64_t) (map_addr + (tmp_entry.vme_end -
1234 tmp_entry.vme_start)),
1235 unprotected_mem_obj,
1236 protected_object,
1237 VME_OFFSET(&tmp_entry),
1238 crypto_backing_offset,
1239 crypto_start,
1240 crypto_end);
1241 }
1242 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1243
1244 /*
1245 * Release the reference obtained by
1246 * apple_protect_pager_setup().
1247 * The mapping (if it succeeded) is now holding a reference on
1248 * the memory object.
1249 */
1250 memory_object_deallocate(unprotected_mem_obj);
1251 unprotected_mem_obj = MEMORY_OBJECT_NULL;
1252
1253 /* continue with next map entry */
1254 crypto_backing_offset += (tmp_entry.vme_end -
1255 tmp_entry.vme_start);
1256 crypto_backing_offset -= crypto_start;
1257 }
1258 kr = KERN_SUCCESS;
1259
1260 done:
1261 if (map_locked) {
1262 vm_map_unlock(map);
1263 }
1264 return kr;
1265 }
1266 #endif /* CONFIG_CODE_DECRYPTION */
1267
1268
1269 LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
1270 LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
1271 LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);
1272
1273 #if XNU_TARGET_OS_OSX
1274 #define MALLOC_NO_COW_DEFAULT 1
1275 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 1
1276 #else /* XNU_TARGET_OS_OSX */
1277 #define MALLOC_NO_COW_DEFAULT 1
1278 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 0
1279 #endif /* XNU_TARGET_OS_OSX */
1280 TUNABLE(int, malloc_no_cow, "malloc_no_cow", MALLOC_NO_COW_DEFAULT);
1281 TUNABLE(int, malloc_no_cow_except_fork, "malloc_no_cow_except_fork", MALLOC_NO_COW_EXCEPT_FORK_DEFAULT);
1282 uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
1283 #if DEBUG
1284 int vm_check_map_sanity = 0;
1285 #endif
1286
1287 /*
1288 * vm_map_init:
1289 *
1290 * Initialize the vm_map module. Must be called before
1291 * any other vm_map routines.
1292 *
1293 * Map and entry structures are allocated from zones -- we must
1294 * initialize those zones.
1295 *
1296 * There are three zones of interest:
1297 *
1298 * vm_map_zone: used to allocate maps.
1299 * vm_map_entry_zone: used to allocate map entries.
1300 *
1301 * LP32:
1302 * vm_map_entry_reserved_zone: fallback zone for kernel map entries
1303 *
1304 * The kernel allocates map entries from a special zone that is initially
1305 * "crammed" with memory. It would be difficult (perhaps impossible) for
1306 * the kernel to allocate more memory to an entry zone when it became
1307 * empty since the very act of allocating memory implies the creation
1308 * of a new entry.
1309 */
1310 __startup_func
1311 void
1312 vm_map_init(void)
1313 {
1314
1315 #if MACH_ASSERT
1316 PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1317 sizeof(debug4k_filter));
1318 #endif /* MACH_ASSERT */
1319
1320 zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1321 VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1322
1323 /*
1324 * Don't quarantine because we always need elements available
1325 * Disallow GC on this zone... to aid the GC.
1326 */
1327 zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1328 sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1329 ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1330 z->z_elems_rsv = (uint16_t)(32 *
1331 (ml_early_cpu_max_number() + 1));
1332 });
1333
1334 zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1335 sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1336 ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1337 z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
1338 });
1339
1340 zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1341 ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);
1342
1343 /*
1344 * Add the stolen memory to zones, adjust zone size and stolen counts.
1345 */
1346 zone_cram_early(vm_map_zone, map_data, map_data_size);
1347 zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1348 zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1349 printf("VM boostrap: %d maps, %d entries and %d holes available\n",
1350 zone_count_free(vm_map_zone),
1351 zone_count_free(vm_map_entry_zone),
1352 zone_count_free(vm_map_holes_zone));
1353
1354 /*
1355 * Since these are covered by zones, remove them from stolen page accounting.
1356 */
1357 VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1358
1359 #if VM_MAP_DEBUG_APPLE_PROTECT
1360 PE_parse_boot_argn("vm_map_debug_apple_protect",
1361 &vm_map_debug_apple_protect,
1362 sizeof(vm_map_debug_apple_protect));
1363 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1364 #if VM_MAP_DEBUG_FOURK
1365 PE_parse_boot_argn("vm_map_debug_fourk",
1366 &vm_map_debug_fourk,
1367 sizeof(vm_map_debug_fourk));
1368 #endif /* VM_MAP_DEBUG_FOURK */
1369
1370 if (malloc_no_cow) {
1371 vm_memory_malloc_no_cow_mask = 0ULL;
1372 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1373 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1374 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1375 #if XNU_TARGET_OS_OSX
1376 /*
1377 * On macOS, keep copy-on-write for MALLOC_LARGE because
1378 * realloc() may use vm_copy() to transfer the old contents
1379 * to the new location.
1380 */
1381 #else /* XNU_TARGET_OS_OSX */
1382 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1383 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1384 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1385 #endif /* XNU_TARGET_OS_OSX */
1386 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1387 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1388 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1389 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1390 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1391 PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1392 &vm_memory_malloc_no_cow_mask,
1393 sizeof(vm_memory_malloc_no_cow_mask));
1394 }
1395
1396 #if CONFIG_MAP_RANGES
1397 vm_map_range_map_init();
1398 #endif /* CONFIG_MAP_RANGES */
1399
1400 #if DEBUG
1401 PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1402 if (vm_check_map_sanity) {
1403 kprintf("VM sanity checking enabled\n");
1404 } else {
1405 kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1406 }
1407 #endif /* DEBUG */
1408
1409 #if DEVELOPMENT || DEBUG
1410 PE_parse_boot_argn("panic_on_unsigned_execute",
1411 &panic_on_unsigned_execute,
1412 sizeof(panic_on_unsigned_execute));
1413 PE_parse_boot_argn("panic_on_mlock_failure",
1414 &panic_on_mlock_failure,
1415 sizeof(panic_on_mlock_failure));
1416 #endif /* DEVELOPMENT || DEBUG */
1417 }
1418
1419 __startup_func
1420 static void
1421 vm_map_steal_memory(void)
1422 {
1423
1424 /*
1425 * We need to reserve enough memory to support bootstrapping VM maps
1426 * and the zone subsystem.
1427 *
1428 * The VM Maps that need to function before zones can support them
1429 * are the ones registered with vm_map_will_allocate_early_map(),
1430 * which are:
1431 * - the kernel map
1432 * - the various submaps used by zones (pgz, meta, ...)
1433 *
1434 * We also need enough entries and holes to support them
1435 * until zone_metadata_init() is called, which is when
1436 * the zone allocator becomes capable of expanding dynamically.
1437 *
1438 * We need:
1439 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
1440 * - To allow for 3-4 entries per map, but the kernel map
1441 * needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
1442 * to describe the submaps, so double it (and make it 8x too)
1443 * - To allow for holes between entries,
1444 * hence needs the same budget as entries
1445 */
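	/*
	 * Worked example (illustrative): with VM_MAP_EARLY_COUNT_MAX == 16,
	 * this budgets for 16 early maps, 8 * 16 == 128 map entries and
	 * 128 map holes; zone_get_early_alloc_size() is assumed to round
	 * each budget up to whole chunks of zone elements.
	 */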
1446 map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
1447 sizeof(struct _vm_map), VM_MAP_ZFLAGS,
1448 VM_MAP_EARLY_COUNT_MAX);
1449
1450 kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
1451 sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1452 8 * VM_MAP_EARLY_COUNT_MAX);
1453
1454 map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
1455 sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1456 8 * VM_MAP_EARLY_COUNT_MAX);
1457
1458 /*
1459 * Steal a contiguous range of memory so that a simple range check
1460 * can validate early addresses being freed/crammed to these
1461 * zones
1462 */
1463 map_data = zone_early_mem_init(map_data_size + kentry_data_size +
1464 map_holes_data_size);
1465 kentry_data = map_data + map_data_size;
1466 map_holes_data = kentry_data + kentry_data_size;
1467 }
1468 STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1469
1470 __startup_func
1471 static void
1472 vm_kernel_bootstrapped(void)
1473 {
1474 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
1475 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
1476 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);
1477
1478 printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1479 zone_count_free(vm_map_zone),
1480 zone_count_free(vm_map_entry_zone),
1481 zone_count_free(vm_map_holes_zone));
1482 }
1483 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_bootstrapped);
1484
1485 void
1486 vm_map_disable_hole_optimization(vm_map_t map)
1487 {
1488 vm_map_entry_t head_entry, hole_entry, next_hole_entry;
1489
1490 if (map->holelistenabled) {
1491 head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1492
1493 while (hole_entry != NULL) {
1494 next_hole_entry = hole_entry->vme_next;
1495
1496 hole_entry->vme_next = NULL;
1497 hole_entry->vme_prev = NULL;
1498 zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1499
1500 if (next_hole_entry == head_entry) {
1501 hole_entry = NULL;
1502 } else {
1503 hole_entry = next_hole_entry;
1504 }
1505 }
1506
1507 map->holes_list = NULL;
1508 map->holelistenabled = FALSE;
1509
1510 map->first_free = vm_map_to_entry(map);
1511 SAVE_HINT_HOLE_WRITE(map, NULL);
1512 }
1513 }
1514
1515 boolean_t
1516 vm_kernel_map_is_kernel(vm_map_t map)
1517 {
1518 return map->pmap == kernel_pmap;
1519 }
1520
1521 /*
1522 * vm_map_create:
1523 *
1524 * Creates and returns a new empty VM map with
1525 * the given physical map structure, and having
1526 * the given lower and upper address bounds.
1527 */
1528
1529 extern vm_map_t vm_map_create_external(
1530 pmap_t pmap,
1531 vm_map_offset_t min_off,
1532 vm_map_offset_t max_off,
1533 boolean_t pageable);
1534
1535 vm_map_t
1536 vm_map_create_external(
1537 pmap_t pmap,
1538 vm_map_offset_t min,
1539 vm_map_offset_t max,
1540 boolean_t pageable)
1541 {
1542 vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1543
1544 if (pageable) {
1545 options |= VM_MAP_CREATE_PAGEABLE;
1546 }
1547 return vm_map_create_options(pmap, min, max, options);
1548 }
1549
1550 __startup_func
1551 void
1552 vm_map_will_allocate_early_map(vm_map_t *owner)
1553 {
1554 if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1555 panic("VM_MAP_EARLY_COUNT_MAX is too low");
1556 }
1557
1558 early_map_owners[early_map_count++] = owner;
1559 }
1560
1561 __startup_func
1562 void
1563 vm_map_relocate_early_maps(vm_offset_t delta)
1564 {
1565 for (uint32_t i = 0; i < early_map_count; i++) {
1566 vm_address_t addr = (vm_address_t)*early_map_owners[i];
1567
1568 *early_map_owners[i] = (vm_map_t)(addr + delta);
1569 }
1570
1571 early_map_count = ~0u;
1572 }
1573
1574 /*
1575 * Routine: vm_map_relocate_early_elem
1576 *
1577 * Purpose:
1578 * Early zone elements are allocated in a temporary part
1579 * of the address space.
1580 *
1581 * Once the zones live in their final place, the early
1582 * VM maps, map entries and map holes need to be relocated.
1583 *
1584 * It involves rewriting any vm_map_t, vm_map_entry_t or
1585 * pointers to vm_map_links. Other pointers to other types
1586 * are fine.
1587 *
1588 * Fortunately, pointers to those types are self-contained
1589 * in those zones, _except_ for pointers to VM maps,
1590 * which are tracked during early boot and fixed with
1591 * vm_map_relocate_early_maps().
1592 */
1593 __startup_func
1594 void
1595 vm_map_relocate_early_elem(
1596 uint32_t zone_id,
1597 vm_offset_t new_addr,
1598 vm_offset_t delta)
1599 {
1600 #define relocate(type_t, field) ({ \
1601 typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field; \
1602 if (*__field) { \
1603 *__field = (typeof(*__field))((vm_offset_t)*__field + delta); \
1604 } \
1605 })
1606
1607 switch (zone_id) {
1608 case ZONE_ID_VM_MAP:
1609 case ZONE_ID_VM_MAP_ENTRY:
1610 case ZONE_ID_VM_MAP_HOLES:
1611 break;
1612
1613 default:
1614 panic("Unexpected zone ID %d", zone_id);
1615 }
1616
1617 if (zone_id == ZONE_ID_VM_MAP) {
1618 relocate(vm_map_t, hdr.links.prev);
1619 relocate(vm_map_t, hdr.links.next);
1620 ((vm_map_t)new_addr)->pmap = kernel_pmap;
1621 #ifdef VM_MAP_STORE_USE_RB
1622 relocate(vm_map_t, hdr.rb_head_store.rbh_root);
1623 #endif /* VM_MAP_STORE_USE_RB */
1624 relocate(vm_map_t, hint);
1625 relocate(vm_map_t, hole_hint);
1626 relocate(vm_map_t, first_free);
1627 return;
1628 }
1629
1630 relocate(struct vm_map_links *, prev);
1631 relocate(struct vm_map_links *, next);
1632
1633 if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
1634 #ifdef VM_MAP_STORE_USE_RB
1635 relocate(vm_map_entry_t, store.entry.rbe_left);
1636 relocate(vm_map_entry_t, store.entry.rbe_right);
1637 relocate(vm_map_entry_t, store.entry.rbe_parent);
1638 #endif /* VM_MAP_STORE_USE_RB */
1639 if (((vm_map_entry_t)new_addr)->is_sub_map) {
1640 /* no object to relocate because we haven't made any */
1641 ((vm_map_entry_t)new_addr)->vme_submap +=
1642 delta >> VME_SUBMAP_SHIFT;
1643 }
1644 #if MAP_ENTRY_CREATION_DEBUG
1645 relocate(vm_map_entry_t, vme_creation_maphdr);
1646 #endif /* MAP_ENTRY_CREATION_DEBUG */
1647 }
1648
1649 #undef relocate
1650 }
1651
1652 /*
1653 * Generate a serial ID to identify a newly allocated vm_map
1654 */
1655 static uintptr_t vm_map_serial_current = 0;
1656 vm_map_serial_t vm_map_serial_generate(void);
1657 void vm_map_assign_serial(vm_map_t, vm_map_serial_t);
1658
1659 vm_map_serial_t
1660 vm_map_serial_generate(void)
1661 {
1662 vm_map_serial_t serial = (void *)os_atomic_inc(&vm_map_serial_current, relaxed);
1663 return serial;
1664 }
1665
1666 void
1667 vm_map_assign_serial(vm_map_t map, vm_map_serial_t serial)
1668 {
1669 map->serial_id = serial;
1670 #if CONFIG_SPTM
1671 /* Copy through our ID to the pmap (only available on SPTM systems) */
1672 if (map->pmap) {
1673 map->pmap->associated_vm_map_serial_id = map->serial_id;
1674 }
1675 #endif /* CONFIG_SPTM */
1676 }
1677
1678 vm_map_t
1679 vm_map_create_options(
1680 pmap_t pmap,
1681 vm_map_offset_t min,
1682 vm_map_offset_t max,
1683 vm_map_create_options_t options)
1684 {
1685 vm_map_t result;
1686
1687 #if DEBUG || DEVELOPMENT
1688 if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
1689 if (early_map_count != ~0u && early_map_count !=
1690 zone_count_allocated(vm_map_zone) + 1) {
1691 panic("allocating %dth early map, owner not known",
1692 zone_count_allocated(vm_map_zone) + 1);
1693 }
1694 if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
1695 panic("allocating %dth early map for non kernel pmap",
1696 early_map_count);
1697 }
1698 }
1699 #endif /* DEBUG || DEVELOPMENT */
1700
1701 result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);
1702
1703 vm_map_store_init(&result->hdr);
1704 result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
1705 vm_map_set_page_shift(result, PAGE_SHIFT);
1706
1707 result->size_limit = RLIM_INFINITY; /* default unlimited */
1708 result->data_limit = RLIM_INFINITY; /* default unlimited */
1709 result->user_wire_limit = MACH_VM_MAX_ADDRESS; /* default limit is unlimited */
1710 os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
1711
1712 result->pmap = pmap;
1713
1714 /*
1715 * Immediately give ourselves an ID,
1716 * unless this map is being created as part of a fork, in which case
1717 * the caller will reassign the ID of the parent (so don't waste an
1718 * increment here).
1719 */
1720 if ((options & VM_MAP_CREATE_VIA_FORK) == 0) {
1721 vm_map_assign_serial(result, vm_map_serial_generate());
1722 }
1723
1724 result->min_offset = min;
1725 result->max_offset = max;
1726 result->first_free = vm_map_to_entry(result);
1727 result->hint = vm_map_to_entry(result);
1728
1729 if (options & VM_MAP_CREATE_NEVER_FAULTS) {
1730 assert(pmap == kernel_pmap);
1731 result->never_faults = true;
1732 }
1733
1734 /* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
1735 if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
1736 result->has_corpse_footprint = true;
1737 } else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
1738 struct vm_map_links *hole_entry;
1739
1740 hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
1741 hole_entry->start = min;
1742 /*
1743 * Holes can be used to track ranges all the way up to
1744 * MACH_VM_MAX_ADDRESS or more (e.g. kernel map).
1745 */
1746 hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1747 result->holes_list = result->hole_hint = hole_entry;
1748 hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
1749 result->holelistenabled = true;
1750 }
1751
1752 vm_map_lock_init(result);
1753
1754 return result;
1755 }
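/*
 * Illustrative sketch, not part of the original source: a typical caller
 * creates a pageable map over a chosen VA range, for example
 *
 *	vm_map_t map;
 *
 *	map = vm_map_create_options(pmap,
 *	    VM_MIN_ADDRESS, VM_MAX_ADDRESS,
 *	    VM_MAP_CREATE_PAGEABLE);
 *
 * where "pmap" stands for a physical map the caller already set up; the
 * actual min/max bounds used by real callers vary by platform and map type.
 */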
1756
1757 /*
1758 * Adjusts a submap that was made by kmem_suballoc()
1759 * before it knew where it would be mapped,
1760 * so that it has the right min/max offsets.
1761 *
1762 * We do not need to hold any locks:
1763 * only the caller knows about this map,
1764 * and it is not published on any entry yet.
1765 */
1766 static void
1767 vm_map_adjust_offsets(
1768 vm_map_t map,
1769 vm_map_offset_t min_off,
1770 vm_map_offset_t max_off)
1771 {
1772 assert(map->min_offset == 0);
1773 assert(map->max_offset == max_off - min_off);
1774 assert(map->hdr.nentries == 0);
1775 assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1776
1777 map->min_offset = min_off;
1778 map->max_offset = max_off;
1779
1780 if (map->holelistenabled) {
1781 struct vm_map_links *hole = map->holes_list;
1782
1783 hole->start = min_off;
1784 #if defined(__arm64__)
1785 hole->end = max_off;
1786 #else
1787 hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1788 #endif
1789 }
1790 }
1791
1792
1793 vm_map_size_t
1794 vm_map_adjusted_size(vm_map_t map)
1795 {
1796 const struct vm_reserved_region *regions = NULL;
1797 size_t num_regions = 0;
1798 mach_vm_size_t reserved_size = 0, map_size = 0;
1799
1800 if (map == NULL || (map->size == 0)) {
1801 return 0;
1802 }
1803
1804 map_size = map->size;
1805
1806 if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1807 /*
1808 * No special reserved regions or not an exotic map or the task
1809 * is terminating and these special regions might have already
1810 * been deallocated.
1811 */
1812 return map_size;
1813 }
1814
1815 num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1816 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1817
1818 while (num_regions) {
1819 reserved_size += regions[--num_regions].vmrr_size;
1820 }
1821
1822 /*
1823 * There are a few places where the map is being switched out due to
1824 * 'termination' without that bit being set (e.g. exec and corpse purging).
1825 * In those cases, we could have the map's regions being deallocated on
1826 * a core while some accounting process is trying to get the map's size.
1827 * So this assert can't be enabled till all those places are uniform in
1828 * their use of the 'map->terminated' bit.
1829 *
1830 * assert(map_size >= reserved_size);
1831 */
1832
1833 return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1834 }
1835
1836 /*
1837 * vm_map_entry_create: [ internal use only ]
1838 *
1839 * Allocates a VM map entry for insertion in the
1840 * given map (or map copy). No fields are filled.
1841 *
1842 * The VM entry will be zero initialized, except for:
1843 * - behavior set to VM_BEHAVIOR_DEFAULT
1844 * - inheritance set to VM_INHERIT_DEFAULT
1845 */
1846 #define vm_map_entry_create(map) _vm_map_entry_create(&(map)->hdr)
1847
1848 #define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)
1849
1850 static vm_map_entry_t
1851 _vm_map_entry_create(
1852 struct vm_map_header *map_header __unused)
1853 {
1854 vm_map_entry_t entry = NULL;
1855
1856 entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);
1857
1858 /*
1859 * Help the compiler with what we know to be true,
1860 * so that the subsequent bitfield inits have good codegen.
1861 *
1862 * See rdar://87041299
1863 */
1864 __builtin_assume(entry->vme_object_value == 0);
1865 __builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
1866 __builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);
1867
1868 static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
1869 "VME_ALIAS_MASK covers tags");
1870
1871 static_assert(VM_BEHAVIOR_DEFAULT == 0,
1872 "can skip zeroing of the behavior field");
1873 entry->inheritance = VM_INHERIT_DEFAULT;
1874
1875 #if MAP_ENTRY_CREATION_DEBUG
1876 entry->vme_creation_maphdr = map_header;
1877 entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
1878 BTREF_GET_NOWAIT);
1879 #endif
1880 return entry;
1881 }
1882
1883 /*
1884 * vm_map_entry_dispose: [ internal use only ]
1885 *
1886 * Inverse of vm_map_entry_create.
1887 *
1888 * The write map lock is held, so there is no need to
1889 * do anything special to ensure correctness
1890 * of the stores.
1891 */
1892 static void
1893 vm_map_entry_dispose(
1894 vm_map_entry_t entry)
1895 {
1896 #if VM_BTLOG_TAGS
1897 if (entry->vme_kernel_object) {
1898 btref_put(entry->vme_tag_btref);
1899 }
1900 #endif /* VM_BTLOG_TAGS */
1901 #if MAP_ENTRY_CREATION_DEBUG
1902 btref_put(entry->vme_creation_bt);
1903 #endif
1904 #if MAP_ENTRY_INSERTION_DEBUG
1905 btref_put(entry->vme_insertion_bt);
1906 #endif
1907 zfree(vm_map_entry_zone, entry);
1908 }
1909
1910 #define vm_map_copy_entry_dispose(copy_entry) \
1911 vm_map_entry_dispose(copy_entry)
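/*
 * Illustrative sketch, not part of the original source: the create/dispose
 * pair is used around the map lock, roughly as vm_map_find_space() does:
 *
 *	vm_map_entry_t new_entry = vm_map_entry_create(map);
 *
 *	vm_map_lock(map);
 *	...locate space and link new_entry, or on failure...
 *	vm_map_unlock(map);
 *	vm_map_entry_dispose(new_entry);
 *
 * i.e. the entry is allocated before taking the lock and must be disposed
 * of if it never gets linked into the map.
 */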
1912
1913 static vm_map_entry_t
1914 vm_map_zap_first_entry(
1915 vm_map_zap_t list)
1916 {
1917 return list->vmz_head;
1918 }
1919
1920 static vm_map_entry_t
1921 vm_map_zap_last_entry(
1922 vm_map_zap_t list)
1923 {
1924 assert(vm_map_zap_first_entry(list));
1925 return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
1926 }
1927
1928 static void
1929 vm_map_zap_append(
1930 vm_map_zap_t list,
1931 vm_map_entry_t entry)
1932 {
1933 entry->vme_next = VM_MAP_ENTRY_NULL;
1934 *list->vmz_tail = entry;
1935 list->vmz_tail = &entry->vme_next;
1936 }
1937
1938 static vm_map_entry_t
1939 vm_map_zap_pop(
1940 vm_map_zap_t list)
1941 {
1942 vm_map_entry_t head = list->vmz_head;
1943
1944 if (head != VM_MAP_ENTRY_NULL &&
1945 (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1946 list->vmz_tail = &list->vmz_head;
1947 }
1948
1949 return head;
1950 }
1951
1952 static void
1953 vm_map_zap_dispose(
1954 vm_map_zap_t list)
1955 {
1956 vm_map_entry_t entry;
1957
1958 while ((entry = vm_map_zap_pop(list))) {
1959 if (entry->is_sub_map) {
1960 vm_map_deallocate(VME_SUBMAP(entry));
1961 } else {
1962 vm_object_deallocate(VME_OBJECT(entry));
1963 }
1964
1965 vm_map_entry_dispose(entry);
1966 }
1967 }
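/*
 * Illustrative sketch, not part of the original source: a zap list collects
 * entries unlinked while the map lock is held, so that their objects and
 * submaps are released only after the lock is dropped, e.g. as in
 * vm_map_destroy():
 *
 *	VM_MAP_ZAP_DECLARE(zap);
 *
 *	vm_map_lock(map);
 *	(void)vm_map_delete(map, start, end, flags, KMEM_GUARD_NONE, &zap);
 *	vm_map_unlock(map);
 *	vm_map_zap_dispose(&zap);
 */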
1968
1969 #if MACH_ASSERT
1970 static boolean_t first_free_check = FALSE;
1971 boolean_t
1972 first_free_is_valid(
1973 vm_map_t map)
1974 {
1975 if (!first_free_check) {
1976 return TRUE;
1977 }
1978
1979 return first_free_is_valid_store( map );
1980 }
1981 #endif /* MACH_ASSERT */
1982
1983
1984 #define vm_map_copy_entry_link(copy, after_where, entry) \
1985 _vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))
1986
1987 #define vm_map_copy_entry_unlink(copy, entry) \
1988 _vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1989
1990 /*
1991 * vm_map_destroy:
1992 *
1993 * Actually destroy a map.
1994 */
1995 void
1996 vm_map_destroy(
1997 vm_map_t map)
1998 {
1999 /* final cleanup: this is not allowed to fail */
2000 vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;
2001
2002 VM_MAP_ZAP_DECLARE(zap);
2003
2004 vm_map_lock(map);
2005
2006 map->terminated = true;
2007 /* clean up regular map entries */
2008 (void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
2009 KMEM_GUARD_NONE, &zap);
2010 /* clean up leftover special mappings (commpage, GPU carveout, etc...) */
2011 (void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
2012 KMEM_GUARD_NONE, &zap);
2013
2014 vm_map_disable_hole_optimization(map);
2015 vm_map_corpse_footprint_destroy(map);
2016
2017 vm_map_unlock(map);
2018
2019 vm_map_zap_dispose(&zap);
2020
2021 assert(map->hdr.nentries == 0);
2022
2023 if (map->pmap) {
2024 pmap_destroy(map->pmap);
2025 }
2026
2027 lck_rw_destroy(&map->lock, &vm_map_lck_grp);
2028
2029 #if CONFIG_MAP_RANGES
2030 kfree_data(map->extra_ranges,
2031 map->extra_ranges_count * sizeof(struct vm_map_user_range));
2032 #endif
2033
2034 zfree_id(ZONE_ID_VM_MAP, map);
2035 }
2036
2037 /*
2038 * Returns pid of the task with the largest number of VM map entries.
2039 * Used in the zone-map-exhaustion jetsam path.
2040 */
2041 pid_t
2042 find_largest_process_vm_map_entries(void)
2043 {
2044 pid_t victim_pid = -1;
2045 int max_vm_map_entries = 0;
2046 task_t task = TASK_NULL;
2047 queue_head_t *task_list = &tasks;
2048
2049 lck_mtx_lock(&tasks_threads_lock);
2050 queue_iterate(task_list, task, task_t, tasks) {
2051 if (task == kernel_task || !task->active) {
2052 continue;
2053 }
2054
2055 vm_map_t task_map = task->map;
2056 if (task_map != VM_MAP_NULL) {
2057 int task_vm_map_entries = task_map->hdr.nentries;
2058 if (task_vm_map_entries > max_vm_map_entries) {
2059 max_vm_map_entries = task_vm_map_entries;
2060 victim_pid = pid_from_task(task);
2061 }
2062 }
2063 }
2064 lck_mtx_unlock(&tasks_threads_lock);
2065
2066 printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
2067 return victim_pid;
2068 }
2069
2070
2071 /*
2072 * vm_map_lookup_entry: [ internal use only ]
2073 *
2074 * Calls into the vm map store layer to find the map
2075 * entry containing (or immediately preceding) the
2076 * specified address in the given map; the entry is returned
2077 * in the "entry" parameter. The boolean
2078 * result indicates whether the address is
2079 * actually contained in the map.
2080 */
2081 boolean_t
2082 vm_map_lookup_entry(
2083 vm_map_t map,
2084 vm_map_offset_t address,
2085 vm_map_entry_t *entry) /* OUT */
2086 {
2087 bool result = false;
2088
2089 #if CONFIG_KERNEL_TAGGING
2090 if (VM_KERNEL_ADDRESS(address)) {
2091 address = vm_memtag_canonicalize_kernel(address);
2092 }
2093 #endif /* CONFIG_KERNEL_TAGGING */
2094
2095 #if CONFIG_PROB_GZALLOC
2096 if (map->pmap == kernel_pmap) {
2097 assertf(!pgz_owned(address),
2098 "it is the responsibility of callers to unguard PGZ addresses");
2099 }
2100 #endif /* CONFIG_PROB_GZALLOC */
2101 result = vm_map_store_lookup_entry( map, address, entry );
2102
2103 return result;
2104 }
2105
2106 boolean_t
2107 vm_map_lookup_entry_or_next(
2108 vm_map_t map,
2109 vm_map_offset_t address,
2110 vm_map_entry_t *entry) /* OUT */
2111 {
2112 if (vm_map_lookup_entry(map, address, entry)) {
2113 return true;
2114 }
2115
2116 *entry = (*entry)->vme_next;
2117 return false;
2118 }
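/*
 * Illustrative sketch, not part of the original source: with the map locked,
 * callers typically use the boolean result to tell a hit from a miss:
 *
 *	vm_map_entry_t entry;
 *
 *	if (vm_map_lookup_entry(map, addr, &entry)) {
 *		// addr lies within [entry->vme_start, entry->vme_end)
 *	} else {
 *		// "entry" precedes addr; entry->vme_next is the first entry
 *		// above it (possibly vm_map_to_entry(map))
 *	}
 *
 * The map must stay locked for as long as "entry" is used.
 */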
2119
2120 #if CONFIG_PROB_GZALLOC
2121 boolean_t
2122 vm_map_lookup_entry_allow_pgz(
2123 vm_map_t map,
2124 vm_map_offset_t address,
2125 vm_map_entry_t *entry) /* OUT */
2126 {
2127 #if CONFIG_KERNEL_TAGGING
2128 if (VM_KERNEL_ADDRESS(address)) {
2129 address = vm_memtag_canonicalize_kernel(address);
2130 }
2131 #endif /* CONFIG_KERNEL_TAGGING */
2132
2133 return vm_map_store_lookup_entry( map, address, entry );
2134 }
2135 #endif /* CONFIG_PROB_GZALLOC */
2136
2137 /*
2138 * Routine: vm_map_range_invalid_panic
2139 * Purpose:
2140 * Panic on detection of an invalid range id.
2141 */
2142 __abortlike
2143 static void
2144 vm_map_range_invalid_panic(
2145 vm_map_t map,
2146 vm_map_range_id_t range_id)
2147 {
2148 panic("invalid range ID (%u) for map %p", range_id, map);
2149 }
2150
2151 /*
2152 * Routine: vm_map_get_range
2153 * Purpose:
2154 * Adjust bounds based on security policy.
2155 */
2156 static struct mach_vm_range
2157 vm_map_get_range(
2158 vm_map_t map,
2159 vm_map_address_t *address,
2160 vm_map_kernel_flags_t *vmk_flags,
2161 vm_map_size_t size,
2162 bool *is_ptr)
2163 {
2164 struct mach_vm_range effective_range = {};
2165 vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;
2166
2167 if (map == kernel_map) {
2168 effective_range = kmem_ranges[range_id];
2169
2170 if (startup_phase >= STARTUP_SUB_KMEM) {
2171 /*
2172 * Hint provided by caller is zeroed as the range is restricted to a
2173 * subset of the entire kernel_map VA, which could put the hint outside
2174 * the range, causing vm_map_store_find_space to fail.
2175 */
2176 *address = 0ull;
2177 /*
2178 * Ensure that range_id passed in by the caller is within meaningful
2179 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
2180 * to fail as the corresponding range is invalid. Range id larger than
2181 * KMEM_RANGE_ID_MAX will lead to an OOB access.
2182 */
2183 if ((range_id == KMEM_RANGE_ID_NONE) ||
2184 (range_id > KMEM_RANGE_ID_MAX)) {
2185 vm_map_range_invalid_panic(map, range_id);
2186 }
2187
2188 /*
2189 * Pointer ranges use kmem_locate_space to do allocations.
2190 *
2191 * Non pointer fronts look like [ Small | Large | Permanent ]
2192 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
2193 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
2194 * use the entire range.
2195 */
2196 if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
2197 *is_ptr = true;
2198 } else if (size >= KMEM_SMALLMAP_THRESHOLD) {
2199 effective_range = kmem_large_ranges[range_id];
2200 }
2201 }
2202 #if CONFIG_MAP_RANGES
2203 } else if (map->uses_user_ranges) {
2204 switch (range_id) {
2205 case UMEM_RANGE_ID_DEFAULT:
2206 effective_range = map->default_range;
2207 break;
2208 case UMEM_RANGE_ID_HEAP:
2209 effective_range = map->data_range;
2210 break;
2211 case UMEM_RANGE_ID_LARGE_FILE:
2212 if (map->large_file_range.min_address != map->large_file_range.max_address) {
2213 /* large file range is configured and should be used */
2214 effective_range = map->large_file_range;
2215 } else {
2216 /*
2217 * the user asking for this user range might not have the
2218 * permissions to use the large file range (i.e., it doesn't
2219 * hold the correct entitlement), so we give it the data range
2220 * instead
2221 */
2222 effective_range = map->data_range;
2223 }
2224 break;
2225 case UMEM_RANGE_ID_FIXED:
2226 /*
2227 * anywhere allocations with an address in "FIXED"
2228 * make no sense, so leave the range empty
2229 */
2230 break;
2231
2232 default:
2233 vm_map_range_invalid_panic(map, range_id);
2234 }
2235 #endif /* CONFIG_MAP_RANGES */
2236 } else {
2237 /*
2238 * If minimum is 0, bump it up by PAGE_SIZE. We want to limit
2239 * allocations of PAGEZERO to explicit requests, since its
2240 * normal use is to catch dereferences of NULL. Many
2241 * applications also treat pointers with a value of 0 as
2242 * special, so suddenly having address 0 contain usable
2243 * memory would tend to confuse those applications.
2244 */
2245 effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
2246 effective_range.max_address = map->max_offset;
2247 }
2248
2249 return effective_range;
2250 }
2251
2252 kern_return_t
2253 vm_map_locate_space_anywhere(
2254 vm_map_t map,
2255 vm_map_size_t size,
2256 vm_map_offset_t mask,
2257 vm_map_kernel_flags_t vmk_flags,
2258 vm_map_offset_t *start_inout,
2259 vm_map_entry_t *entry_out)
2260 {
2261 struct mach_vm_range effective_range = {};
2262 vm_map_size_t guard_offset;
2263 vm_map_offset_t hint, limit;
2264 vm_map_entry_t entry;
2265 bool is_kmem_ptr_range = false;
2266
2267 /*
2268 * Only supported by vm_map_enter() with a fixed address.
2269 */
2270 assert(!vmk_flags.vmf_fixed);
2271 assert(!vmk_flags.vmkf_beyond_max);
2272
2273 if (__improbable(map->wait_for_space)) {
2274 /*
2275 * support for "wait_for_space" is minimal,
2276 * its only consumer is the ipc_kernel_copy_map.
2277 */
2278 assert(!map->holelistenabled &&
2279 !vmk_flags.vmkf_last_free &&
2280 !vmk_flags.vmkf_keep_map_locked &&
2281 !vmk_flags.vmkf_map_jit &&
2282 !vmk_flags.vmf_random_addr &&
2283 *start_inout <= map->min_offset);
2284 } else if (vmk_flags.vmkf_last_free) {
2285 assert(!vmk_flags.vmkf_map_jit &&
2286 !vmk_flags.vmf_random_addr);
2287 }
2288
2289 if (vmk_flags.vmkf_guard_before) {
2290 guard_offset = VM_MAP_PAGE_SIZE(map);
2291 assert(size > guard_offset);
2292 size -= guard_offset;
2293 } else {
2294 assert(size != 0);
2295 guard_offset = 0;
2296 }
2297
2298 if (__improbable(!vm_map_is_map_size_valid(
2299 map, size, vmk_flags.vmkf_no_soft_limit))) {
2300 return KERN_NO_SPACE;
2301 }
2302
2303 /*
2304 * Validate range_id from flags and get associated range
2305 */
2306 effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
2307 &is_kmem_ptr_range);
2308
2309 if (is_kmem_ptr_range) {
2310 return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
2311 vmk_flags.vmkf_last_free, start_inout, entry_out);
2312 }
2313
2314 #if XNU_TARGET_OS_OSX
2315 if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2316 assert(map != kernel_map);
2317 effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2318 }
2319 #endif /* XNU_TARGET_OS_OSX */
2320
2321 again:
2322 if (vmk_flags.vmkf_last_free) {
2323 hint = *start_inout;
2324
2325 if (hint == 0 || hint > effective_range.max_address) {
2326 hint = effective_range.max_address;
2327 }
2328 if (hint <= effective_range.min_address) {
2329 return KERN_NO_SPACE;
2330 }
2331 limit = effective_range.min_address;
2332 } else {
2333 hint = *start_inout;
2334
2335 if (vmk_flags.vmkf_map_jit) {
2336 if (map->jit_entry_exists &&
2337 !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
2338 return KERN_INVALID_ARGUMENT;
2339 }
2340 if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
2341 vmk_flags.vmf_random_addr = true;
2342 }
2343 }
2344
2345 if (vmk_flags.vmf_random_addr) {
2346 kern_return_t kr;
2347
2348 kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
2349 if (kr != KERN_SUCCESS) {
2350 return kr;
2351 }
2352 }
2353 #if __x86_64__
2354 else if ((hint == 0 || hint == vm_map_min(map)) &&
2355 !map->disable_vmentry_reuse &&
2356 map->vmmap_high_start != 0) {
2357 hint = map->vmmap_high_start;
2358 }
2359 #endif /* __x86_64__ */
2360
2361 if (hint < effective_range.min_address) {
2362 hint = effective_range.min_address;
2363 }
2364 if (effective_range.max_address <= hint) {
2365 return KERN_NO_SPACE;
2366 }
2367
2368 limit = effective_range.max_address;
2369 }
2370 entry = vm_map_store_find_space(map,
2371 hint, limit, vmk_flags.vmkf_last_free,
2372 guard_offset, size, mask,
2373 start_inout);
2374
2375 if (__improbable(entry == NULL)) {
2376 if (map->wait_for_space &&
2377 guard_offset + size <=
2378 effective_range.max_address - effective_range.min_address) {
2379 assert_wait((event_t)map, THREAD_ABORTSAFE);
2380 vm_map_unlock(map);
2381 thread_block(THREAD_CONTINUE_NULL);
2382 vm_map_lock(map);
2383 goto again;
2384 }
2385 return KERN_NO_SPACE;
2386 }
2387
2388 if (entry_out) {
2389 *entry_out = entry;
2390 }
2391 return KERN_SUCCESS;
2392 }
2393
2394 /*!
2395 * @function vm_map_locate_space_fixed()
2396 *
2397 * @brief
2398 * Locate (no reservation) a range in the specified VM map at a fixed address.
2399 *
2400 * @param map the map to scan for memory, must be locked.
2401 * @param start the fixed address trying to be reserved
2402 * @param size the size of the allocation to make.
2403 * @param mask an alignment mask the allocation must respect,
2404 * @param vmk_flags the vm map kernel flags to influence this call.
2405 * vmk_flags.vmf_anywhere must not be set.
2406 * @param entry_out the entry right before the hole.
2407 * @param zap_list a zap list of entries to clean up after the call.
2408 *
2409 * @returns
2410 * - KERN_SUCCESS in case of success and no conflicting entry is found,
2411 * in which case entry_out is set to the entry before the hole.
2412 *
2413 * - KERN_MEMORY_PRESENT if a conflicting entry is found,
2414 * in which case entry_out is set to the conflicting entry,
2415 * the callers MUST handle this error explicitly.
2416 *
2417 * - KERN_INVALID_ADDRESS if the specified @c start or @c size
2418 * would result in a mapping outside of the map.
2419 *
2420 * - KERN_NO_SPACE for various cases of unrecoverable failures.
2421 */
2422 static kern_return_t
2423 vm_map_locate_space_fixed(
2424 vm_map_t map,
2425 vm_map_offset_t start,
2426 vm_map_size_t size,
2427 vm_map_offset_t mask,
2428 vm_map_kernel_flags_t vmk_flags,
2429 vm_map_entry_t *entry_out,
2430 vm_map_zap_t zap_list)
2431 {
2432 vm_map_offset_t effective_min_offset, effective_max_offset;
2433 vm_map_entry_t entry;
2434 vm_map_offset_t end;
2435
2436 assert(vmk_flags.vmf_fixed);
2437
2438 effective_min_offset = map->min_offset;
2439 effective_max_offset = map->max_offset;
2440
2441 if (vmk_flags.vmkf_beyond_max) {
2442 /*
2443 * Allow an insertion beyond the map's max offset.
2444 */
2445 effective_max_offset = 0x00000000FFFFF000ULL;
2446 if (vm_map_is_64bit(map)) {
2447 effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2448 }
2449 #if XNU_TARGET_OS_OSX
2450 } else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2451 effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2452 #endif /* XNU_TARGET_OS_OSX */
2453 }
2454
2455 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2456 !vmk_flags.vmf_overwrite &&
2457 map->pmap == kernel_pmap &&
2458 vmk_flags.vm_tag == VM_MEMORY_REALLOC) {
2459 /*
2460 * Force realloc() to switch to a new allocation,
2461 * to prevent 4k-fragmented virtual ranges.
2462 */
2463 // DEBUG4K_ERROR("no realloc in place");
2464 return KERN_NO_SPACE;
2465 }
2466
2467 /*
2468 * Verify that:
2469 * the address doesn't itself violate
2470 * the mask requirement.
2471 */
2472
2473 if ((start & mask) != 0) {
2474 return KERN_NO_SPACE;
2475 }
2476
2477 if (__improbable(!vm_map_is_map_size_valid(
2478 map, size, vmk_flags.vmkf_no_soft_limit))) {
2479 return KERN_NO_SPACE;
2480 }
2481
2482 #if CONFIG_MAP_RANGES
2483 if (map->uses_user_ranges) {
2484 struct mach_vm_range r;
2485
2486 vm_map_user_range_resolve(map, start, 1, &r);
2487 if (r.max_address == 0) {
2488 return KERN_INVALID_ADDRESS;
2489 }
2490 effective_min_offset = r.min_address;
2491 effective_max_offset = r.max_address;
2492 }
2493 #endif /* CONFIG_MAP_RANGES */
2494
2495 if ((startup_phase >= STARTUP_SUB_KMEM) && !vmk_flags.vmkf_submap &&
2496 (map == kernel_map)) {
2497 mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
2498 effective_min_offset = r->min_address;
2499 effective_max_offset = r->max_address;
2500 }
2501
2502 /*
2503 * ... the address is within bounds
2504 */
2505
2506 end = start + size;
2507
2508 if ((start < effective_min_offset) ||
2509 (end > effective_max_offset) ||
2510 (start >= end)) {
2511 return KERN_INVALID_ADDRESS;
2512 }
2513
2514 if (vmk_flags.vmf_overwrite) {
2515 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE;
2516 kern_return_t remove_kr;
2517
2518 /*
2519 * Fixed mapping and "overwrite" flag: attempt to
2520 * remove all existing mappings in the specified
2521 * address range, saving them in our "zap_list".
2522 *
2523 * This avoids releasing the VM map lock in
2524 * vm_map_entry_delete() and allows atomicity
2525 * when we want to replace some mappings with a new one.
2526 * It also allows us to restore the old VM mappings if the
2527 * new mapping fails.
2528 */
2529 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2530
2531 if (vmk_flags.vmkf_overwrite_immutable) {
2532 /* we can overwrite immutable mappings */
2533 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2534 }
2535 if (vmk_flags.vmkf_remap_prot_copy) {
2536 remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2537 }
2538 remove_kr = vm_map_delete(map, start, end, remove_flags,
2539 KMEM_GUARD_NONE, zap_list).kmr_return;
2540 if (remove_kr) {
2541 /* XXX FBDP restore zap_list? */
2542 return remove_kr;
2543 }
2544 }
2545
2546 /*
2547 * ... the starting address isn't allocated
2548 */
2549
2550 if (vm_map_lookup_entry(map, start, &entry)) {
2551 *entry_out = entry;
2552 return KERN_MEMORY_PRESENT;
2553 }
2554
2555 /*
2556 * ... the next region doesn't overlap the
2557 * end point.
2558 */
2559
2560 if ((entry->vme_next != vm_map_to_entry(map)) &&
2561 (entry->vme_next->vme_start < end)) {
2562 return KERN_NO_SPACE;
2563 }
2564
2565 *entry_out = entry;
2566 return KERN_SUCCESS;
2567 }
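/*
 * Illustrative sketch, not part of the original source: as the block comment
 * above notes, callers must handle KERN_MEMORY_PRESENT explicitly, roughly:
 *
 *	kr = vm_map_locate_space_fixed(map, start, size, mask,
 *	    vmk_flags, &entry, &zap_list);
 *	if (kr == KERN_MEMORY_PRESENT) {
 *		// "entry" is the conflicting mapping; either accept what is
 *		// already mapped there (see the vmkf_already handling in
 *		// vm_map_enter()) or treat this as KERN_NO_SPACE.
 *	}
 *
 * On KERN_SUCCESS, "entry" is the entry just before the requested hole.
 */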
2568
2569 /*
2570 * Routine: vm_map_find_space
2571 * Purpose:
2572 * Allocate a range in the specified virtual address map,
2573 * returning the entry allocated for that range.
2574 * Used by kmem_alloc, etc.
2575 *
2576 * The map must NOT be locked. It will be returned locked
2577 * on KERN_SUCCESS, unlocked on failure.
2578 *
2579 * If an entry is allocated, the object/offset fields
2580 * are initialized to zero.
2581 */
2582 kern_return_t
2583 vm_map_find_space(
2584 vm_map_t map,
2585 vm_map_offset_t hint_address,
2586 vm_map_size_t size,
2587 vm_map_offset_t mask,
2588 vm_map_kernel_flags_t vmk_flags,
2589 vm_map_entry_t *o_entry) /* OUT */
2590 {
2591 vm_map_entry_t new_entry, entry;
2592 kern_return_t kr;
2593
2594 if (size == 0) {
2595 return KERN_INVALID_ARGUMENT;
2596 }
2597
2598 new_entry = vm_map_entry_create(map);
2599 new_entry->use_pmap = true;
2600 new_entry->protection = VM_PROT_DEFAULT;
2601 new_entry->max_protection = VM_PROT_ALL;
2602
2603 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
2604 new_entry->map_aligned = true;
2605 }
2606 if (vmk_flags.vmf_permanent) {
2607 new_entry->vme_permanent = true;
2608 }
2609
2610 vm_map_lock(map);
2611
2612 kr = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
2613 &hint_address, &entry);
2614 if (kr != KERN_SUCCESS) {
2615 vm_map_unlock(map);
2616 vm_map_entry_dispose(new_entry);
2617 return kr;
2618 }
2619 new_entry->vme_start = hint_address;
2620 new_entry->vme_end = hint_address + size;
2621
2622 /*
2623 * At this point,
2624 *
2625 * - new_entry's "vme_start" and "vme_end" should define
2626 * the endpoints of the available new range,
2627 *
2628 * - and "entry" should refer to the region before
2629 * the new range,
2630 *
2631 * - and the map should still be locked.
2632 */
2633
2634 assert(page_aligned(new_entry->vme_start));
2635 assert(page_aligned(new_entry->vme_end));
2636 assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
2637 assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));
2638
2639
2640 /*
2641 * Insert the new entry into the list
2642 */
2643
2644 vm_map_store_entry_link(map, entry, new_entry,
2645 VM_MAP_KERNEL_FLAGS_NONE);
2646 map->size += size;
2647
2648 /*
2649 * Update the lookup hint
2650 */
2651 SAVE_HINT_MAP_WRITE(map, new_entry);
2652
2653 *o_entry = new_entry;
2654 return KERN_SUCCESS;
2655 }
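/*
 * Illustrative sketch, not part of the original source: a kmem-style caller
 * carves out a range, attaches its object, then unlocks the map itself,
 * since the map comes back locked only on success:
 *
 *	vm_map_entry_t entry;
 *	kern_return_t kr;
 *
 *	kr = vm_map_find_space(kernel_map, 0, size, 0, vmk_flags, &entry);
 *	if (kr == KERN_SUCCESS) {
 *		VME_OBJECT_SET(entry, object, false, 0);
 *		vm_map_unlock(kernel_map);
 *	}
 *
 * The new entry's object/offset fields start out zeroed, as noted in the
 * routine description above.
 */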
2656
2657 int vm_map_pmap_enter_print = FALSE;
2658 int vm_map_pmap_enter_enable = FALSE;
2659
2660 /*
2661 * Routine: vm_map_pmap_enter [internal only]
2662 *
2663 * Description:
2664 * Force pages from the specified object to be entered into
2665 * the pmap at the specified address if they are present.
2666 * As soon as a page is not found in the object, the scan ends.
2667 *
2668 * Returns:
2669 * Nothing.
2670 *
2671 * In/out conditions:
2672 * The source map should not be locked on entry.
2673 */
2674 __unused static void
2675 vm_map_pmap_enter(
2676 vm_map_t map,
2677 vm_map_offset_t addr,
2678 vm_map_offset_t end_addr,
2679 vm_object_t object,
2680 vm_object_offset_t offset,
2681 vm_prot_t protection)
2682 {
2683 int type_of_fault;
2684 kern_return_t kr;
2685 uint8_t object_lock_type = 0;
2686 struct vm_object_fault_info fault_info = {
2687 .interruptible = THREAD_UNINT,
2688 };
2689
2690 if (map->pmap == 0) {
2691 return;
2692 }
2693
2694 assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);
2695
2696 while (addr < end_addr) {
2697 vm_page_t m;
2698
2699
2700 /*
2701 * TODO:
2702 * From vm_map_enter(), we come into this function without the map
2703 * lock held or the object lock held.
2704 * We haven't taken a reference on the object either.
2705 * We should do a proper lookup on the map to make sure
2706 * that things are sane before we go locking objects that
2707 * could have been deallocated from under us.
2708 */
2709
2710 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2711 vm_object_lock(object);
2712
2713 m = vm_page_lookup(object, offset);
2714
2715 if (m == VM_PAGE_NULL || m->vmp_busy || vm_page_is_fictitious(m) ||
2716 (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
2717 vm_object_unlock(object);
2718 return;
2719 }
2720
2721 if (vm_map_pmap_enter_print) {
2722 printf("vm_map_pmap_enter:");
2723 printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
2724 map, (unsigned long long)addr, object, (unsigned long long)offset);
2725 }
2726 type_of_fault = DBG_CACHE_HIT_FAULT;
2727 kr = vm_fault_enter(m, map->pmap,
2728 addr,
2729 PAGE_SIZE, 0,
2730 protection, protection,
2731 VM_PAGE_WIRED(m),
2732 VM_KERN_MEMORY_NONE, /* tag - not wiring */
2733 &fault_info,
2734 NULL, /* need_retry */
2735 &type_of_fault,
2736 &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/
2737
2738 vm_object_unlock(object);
2739
2740 offset += PAGE_SIZE_64;
2741 addr += PAGE_SIZE;
2742 }
2743 }
2744
2745 #define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
2746 static kern_return_t
2747 vm_map_random_address_for_size(
2748 vm_map_t map,
2749 vm_map_offset_t *address,
2750 vm_map_size_t size,
2751 vm_map_kernel_flags_t vmk_flags)
2752 {
2753 kern_return_t kr = KERN_SUCCESS;
2754 int tries = 0;
2755 vm_map_offset_t random_addr = 0;
2756 vm_map_offset_t hole_end;
2757
2758 vm_map_entry_t next_entry = VM_MAP_ENTRY_NULL;
2759 vm_map_entry_t prev_entry = VM_MAP_ENTRY_NULL;
2760 vm_map_size_t vm_hole_size = 0;
2761 vm_map_size_t addr_space_size;
2762 bool is_kmem_ptr;
2763 struct mach_vm_range effective_range;
2764
2765 effective_range = vm_map_get_range(map, address, &vmk_flags, size,
2766 &is_kmem_ptr);
2767
2768 addr_space_size = effective_range.max_address - effective_range.min_address;
2769 if (size >= addr_space_size) {
2770 return KERN_NO_SPACE;
2771 }
2772 addr_space_size -= size;
2773
2774 assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));
2775
2776 while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2777 if (startup_phase < STARTUP_SUB_ZALLOC) {
2778 random_addr = (vm_map_offset_t)early_random();
2779 } else {
2780 random_addr = (vm_map_offset_t)random();
2781 }
2782 random_addr <<= VM_MAP_PAGE_SHIFT(map);
2783 random_addr = vm_map_trunc_page(
2784 effective_range.min_address + (random_addr % addr_space_size),
2785 VM_MAP_PAGE_MASK(map));
2786
2787 #if CONFIG_PROB_GZALLOC
2788 if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
2789 continue;
2790 }
2791 #endif /* CONFIG_PROB_GZALLOC */
2792
2793 if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
2794 if (prev_entry == vm_map_to_entry(map)) {
2795 next_entry = vm_map_first_entry(map);
2796 } else {
2797 next_entry = prev_entry->vme_next;
2798 }
2799 if (next_entry == vm_map_to_entry(map)) {
2800 hole_end = vm_map_max(map);
2801 } else {
2802 hole_end = next_entry->vme_start;
2803 }
2804 vm_hole_size = hole_end - random_addr;
2805 if (vm_hole_size >= size) {
2806 *address = random_addr;
2807 break;
2808 }
2809 }
2810 tries++;
2811 }
2812
2813 if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2814 kr = KERN_NO_SPACE;
2815 }
2816 return kr;
2817 }
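/*
 * Illustrative sketch, not part of the original source: each attempt picks a
 * page-aligned candidate uniformly inside the effective range and keeps it
 * only if the hole at that address is big enough, conceptually:
 *
 *	candidate = trunc_page(range.min_address + (random_bits % span));
 *	if (size of the hole containing candidate >= size) {
 *		*address = candidate;
 *	}
 *
 * where "random_bits" and "span" stand for the raw random value and the
 * range size minus the allocation size, so a winning candidate always
 * leaves room for "size" bytes below the top of the range.
 */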
2818
2819 static boolean_t
2820 vm_memory_malloc_no_cow(
2821 int alias)
2822 {
2823 uint64_t alias_mask;
2824
2825 if (!malloc_no_cow) {
2826 return FALSE;
2827 }
2828 if (alias > 63) {
2829 return FALSE;
2830 }
2831 alias_mask = 1ULL << alias;
2832 if (alias_mask & vm_memory_malloc_no_cow_mask) {
2833 return TRUE;
2834 }
2835 return FALSE;
2836 }
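/*
 * Illustrative note, not part of the original source: the decision is a
 * simple bit test against a 64-bit opt-in mask, e.g. for a hypothetical
 * alias value of 12:
 *
 *	(vm_memory_malloc_no_cow_mask & (1ULL << 12)) != 0
 *
 * is TRUE only when bit 12 is set in the mask (and the global malloc_no_cow
 * switch is enabled); alias values above 63 are never selected.
 */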
2837
2838 uint64_t vm_map_enter_RLIMIT_AS_count = 0;
2839 uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2840 /*
2841 * Routine: vm_map_enter
2842 *
2843 * Description:
2844 * Allocate a range in the specified virtual address map.
2845 * The resulting range will refer to memory defined by
2846 * the given memory object and offset into that object.
2847 *
2848 * Arguments are as defined in the vm_map call.
2849 */
2850 static unsigned int vm_map_enter_restore_successes = 0;
2851 static unsigned int vm_map_enter_restore_failures = 0;
2852 kern_return_t
2853 vm_map_enter(
2854 vm_map_t map,
2855 vm_map_offset_t *address, /* IN/OUT */
2856 vm_map_size_t size,
2857 vm_map_offset_t mask,
2858 vm_map_kernel_flags_t vmk_flags,
2859 vm_object_t object,
2860 vm_object_offset_t offset,
2861 boolean_t needs_copy,
2862 vm_prot_t cur_protection,
2863 vm_prot_t max_protection,
2864 vm_inherit_t inheritance)
2865 {
2866 vm_map_entry_t entry, new_entry;
2867 vm_map_offset_t start, tmp_start, tmp_offset;
2868 vm_map_offset_t end, tmp_end;
2869 vm_map_offset_t tmp2_start, tmp2_end;
2870 vm_map_offset_t step;
2871 kern_return_t result = KERN_SUCCESS;
2872 bool map_locked = FALSE;
2873 bool pmap_empty = TRUE;
2874 bool new_mapping_established = FALSE;
2875 const bool keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2876 const bool anywhere = !vmk_flags.vmf_fixed;
2877 const bool purgable = vmk_flags.vmf_purgeable;
2878 const bool no_cache = vmk_flags.vmf_no_cache;
2879 const bool is_submap = vmk_flags.vmkf_submap;
2880 const bool permanent = vmk_flags.vmf_permanent;
2881 const bool no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2882 const bool entry_for_jit = vmk_flags.vmkf_map_jit;
2883 const bool iokit_acct = vmk_flags.vmkf_iokit_acct;
2884 const bool resilient_codesign = vmk_flags.vmf_resilient_codesign;
2885 const bool resilient_media = vmk_flags.vmf_resilient_media;
2886 const bool entry_for_tpro = vmk_flags.vmf_tpro;
2887 const unsigned int superpage_size = vmk_flags.vmf_superpage_size;
2888 const vm_tag_t alias = vmk_flags.vm_tag;
2889 vm_tag_t user_alias;
2890 kern_return_t kr;
2891 bool clear_map_aligned = FALSE;
2892 vm_map_size_t chunk_size = 0;
2893 vm_object_t caller_object;
2894 VM_MAP_ZAP_DECLARE(zap_old_list);
2895 VM_MAP_ZAP_DECLARE(zap_new_list);
2896
2897 caller_object = object;
2898
2899 assertf(vmk_flags.__vmkf_unused2 == 0, "vmk_flags unused2=0x%llx\n", vmk_flags.__vmkf_unused2);
2900
2901 if (vmk_flags.vmf_4gb_chunk) {
2902 #if defined(__LP64__)
2903 chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2904 #else /* __LP64__ */
2905 chunk_size = ANON_CHUNK_SIZE;
2906 #endif /* __LP64__ */
2907 } else {
2908 chunk_size = ANON_CHUNK_SIZE;
2909 }
2910
2911
2912
2913 if (superpage_size) {
2914 if (object != VM_OBJECT_NULL) {
2915 /* caller can't provide their own VM object */
2916 return KERN_INVALID_ARGUMENT;
2917 }
2918 switch (superpage_size) {
2919 /*
2920 * Note that the current implementation only supports
2921 * a single size for superpages, SUPERPAGE_SIZE, per
2922 * architecture. As soon as more sizes need to be
2923 * supported, SUPERPAGE_SIZE will have to be replaced
2924 * with a lookup of the size based on superpage_size.
2925 */
2926 #ifdef __x86_64__
2927 case SUPERPAGE_SIZE_ANY:
2928 /* handle it like 2 MB and round up to page size */
2929 size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2930 OS_FALLTHROUGH;
2931 case SUPERPAGE_SIZE_2MB:
2932 break;
2933 #endif
2934 default:
2935 return KERN_INVALID_ARGUMENT;
2936 }
2937 mask = SUPERPAGE_SIZE - 1;
2938 if (size & (SUPERPAGE_SIZE - 1)) {
2939 return KERN_INVALID_ARGUMENT;
2940 }
2941 inheritance = VM_INHERIT_NONE; /* fork() children won't inherit superpages */
2942 }
2943
2944
2945 if ((cur_protection & VM_PROT_WRITE) &&
2946 (cur_protection & VM_PROT_EXECUTE) &&
2947 #if XNU_TARGET_OS_OSX
2948 map->pmap != kernel_pmap &&
2949 (cs_process_global_enforcement() ||
2950 (vmk_flags.vmkf_cs_enforcement_override
2951 ? vmk_flags.vmkf_cs_enforcement
2952 : (vm_map_cs_enforcement(map)
2953 #if __arm64__
2954 || !VM_MAP_IS_EXOTIC(map)
2955 #endif /* __arm64__ */
2956 ))) &&
2957 #endif /* XNU_TARGET_OS_OSX */
2958 #if CODE_SIGNING_MONITOR
2959 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2960 #endif
2961 (VM_MAP_POLICY_WX_FAIL(map) ||
2962 VM_MAP_POLICY_WX_STRIP_X(map)) &&
2963 !entry_for_jit) {
2964 boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2965
2966 DTRACE_VM3(cs_wx,
2967 uint64_t, 0,
2968 uint64_t, 0,
2969 vm_prot_t, cur_protection);
2970 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2971 proc_selfpid(),
2972 (get_bsdtask_info(current_task())
2973 ? proc_name_address(get_bsdtask_info(current_task()))
2974 : "?"),
2975 __FUNCTION__,
2976 (vm_protect_wx_fail ? "failing" : "turning off execute"));
2977 cur_protection &= ~VM_PROT_EXECUTE;
2978 if (vm_protect_wx_fail) {
2979 return KERN_PROTECTION_FAILURE;
2980 }
2981 }
2982
2983 if (entry_for_jit
2984 && cur_protection != VM_PROT_ALL) {
2985 /*
2986 * Native macOS processes and all non-macOS processes are
2987 * expected to create JIT regions via mmap(MAP_JIT, RWX) but
2988 * the RWX requirement was not enforced, and thus, we must live
2989 * with our sins. We are now dealing with a JIT mapping without
2990 * RWX.
2991 *
2992 * We deal with these by letting the MAP_JIT stick in order
2993 * to avoid CS violations when these pages are mapped executable
2994 * down the line. In order to appease the page table monitor (you
2995 * know what I'm talking about), these pages will end up being
2996 * marked as XNU_USER_DEBUG, which will be allowed because we
2997 * don't enforce the code signing monitor on macOS systems. If
2998 * the user-space application ever changes permissions to RWX,
2999 * which they are allowed to since the mapping was originally
3000 * created with MAP_JIT, then they'll switch over to using the
3001 * XNU_USER_JIT type, and won't be allowed to downgrade any
3002 * more after that.
3003 *
3004 * When not on macOS, a MAP_JIT mapping without VM_PROT_ALL is
3005 * strictly disallowed.
3006 */
3007
3008 #if XNU_TARGET_OS_OSX
3009 /*
3010 * Continue to allow non-RWX JIT
3011 */
3012 #else
3013 /* non-macOS: reject JIT regions without RWX */
3014 DTRACE_VM3(cs_wx,
3015 uint64_t, 0,
3016 uint64_t, 0,
3017 vm_prot_t, cur_protection);
3018 printf("CODE SIGNING: %d[%s] %s(%d): JIT requires RWX: failing. \n",
3019 proc_selfpid(),
3020 (get_bsdtask_info(current_task())
3021 ? proc_name_address(get_bsdtask_info(current_task()))
3022 : "?"),
3023 __FUNCTION__,
3024 cur_protection);
3025 return KERN_PROTECTION_FAILURE;
3026 #endif
3027 }
3028
3029 /*
3030 * If the task has requested executable lockdown,
3031 * deny any new executable mapping.
3032 */
3033 if (map->map_disallow_new_exec == TRUE) {
3034 if (cur_protection & VM_PROT_EXECUTE) {
3035 return KERN_PROTECTION_FAILURE;
3036 }
3037 }
3038
3039 if (resilient_codesign) {
3040 assert(!is_submap);
3041 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3042 if ((cur_protection | max_protection) & reject_prot) {
3043 return KERN_PROTECTION_FAILURE;
3044 }
3045 }
3046
3047 if (resilient_media) {
3048 assert(!is_submap);
3049 // assert(!needs_copy);
3050 if (object != VM_OBJECT_NULL &&
3051 !object->internal) {
3052 /*
3053 * This mapping is directly backed by an external
3054 * memory manager (e.g. a vnode pager for a file):
3055 * we would not have any safe place to inject
3056 * a zero-filled page if an actual page is not
3057 * available, without possibly impacting the actual
3058 * contents of the mapped object (e.g. the file),
3059 * so we can't provide any media resiliency here.
3060 */
3061 return KERN_INVALID_ARGUMENT;
3062 }
3063 }
3064
3065 if (entry_for_tpro) {
3066 /*
3067 * TPRO overrides the effective permissions of the region
3068 * and explicitly maps as RW. Ensure we have been passed
3069 * the expected permissions. We accept `cur_protections`
3070 * RO as that will be handled on fault.
3071 */
3072 if (!(max_protection & VM_PROT_READ) ||
3073 !(max_protection & VM_PROT_WRITE) ||
3074 !(cur_protection & VM_PROT_READ)) {
3075 return KERN_PROTECTION_FAILURE;
3076 }
3077
3078 /*
3079 * We can now downgrade the cur_protection to RO. This is a mild lie
3080 * to the VM layer. But TPRO will be responsible for toggling the
3081 * protections between RO/RW
3082 */
3083 cur_protection = VM_PROT_READ;
3084 }
3085
3086 if (is_submap) {
3087 vm_map_t submap;
3088 if (purgable) {
3089 /* submaps can not be purgeable */
3090 return KERN_INVALID_ARGUMENT;
3091 }
3092 if (object == VM_OBJECT_NULL) {
3093 /* submaps can not be created lazily */
3094 return KERN_INVALID_ARGUMENT;
3095 }
3096 submap = (vm_map_t) object;
3097 if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
3098 /* page size mismatch */
3099 return KERN_INVALID_ARGUMENT;
3100 }
3101 }
3102 if (vmk_flags.vmkf_already) {
3103 /*
3104 * VM_FLAGS_ALREADY says that it's OK if the same mapping
3105 * is already present. For it to be meaningful, the requested
3106 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
3107 * we shouldn't try and remove what was mapped there first
3108 * (!VM_FLAGS_OVERWRITE).
3109 */
3110 if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
3111 return KERN_INVALID_ARGUMENT;
3112 }
3113 }
3114
3115 if (size == 0 ||
3116 (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
3117 *address = 0;
3118 return KERN_INVALID_ARGUMENT;
3119 }
3120
3121 if (map->pmap == kernel_pmap) {
3122 user_alias = VM_KERN_MEMORY_NONE;
3123 } else {
3124 user_alias = alias;
3125 }
3126
3127 if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
3128 chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
3129 }
3130
3131 #define RETURN(value) { result = value; goto BailOut; }
3132
3133 assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
3134 assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
3135 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
3136 assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
3137 assertf(page_aligned(size), "0x%llx", (uint64_t)size);
3138 }
3139
3140 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3141 !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
3142 /*
3143 * In most cases, the caller rounds the size up to the
3144 * map's page size.
3145 * If we get a size that is explicitly not map-aligned here,
3146 * we'll have to respect the caller's wish and mark the
3147 * mapping as "not map-aligned" to avoid tripping the
3148 * map alignment checks later.
3149 */
3150 clear_map_aligned = TRUE;
3151 }
3152 if (!anywhere &&
3153 VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3154 !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
3155 /*
3156 * We've been asked to map at a fixed address and that
3157 * address is not aligned to the map's specific alignment.
3158 * The caller should know what it's doing (i.e. most likely
3159 * mapping some fragmented copy map, transferring memory from
3160 * a VM map with a different alignment), so clear map_aligned
3161 * for this new VM map entry and proceed.
3162 */
3163 clear_map_aligned = TRUE;
3164 }
3165
3166 /*
3167 * Only zero-fill objects are allowed to be purgable.
3168 * LP64todo - limit purgable objects to 32-bits for now
3169 */
3170 if (purgable &&
3171 (offset != 0 ||
3172 (object != VM_OBJECT_NULL &&
3173 (object->vo_size != size ||
3174 object->purgable == VM_PURGABLE_DENY))
3175 #if __LP64__
3176 || size > ANON_MAX_SIZE
3177 #endif
3178 )) {
3179 return KERN_INVALID_ARGUMENT;
3180 }
3181
3182 if (__improbable(!vm_map_is_map_size_valid(
3183 map, size, vmk_flags.vmkf_no_soft_limit))) {
3184 return KERN_NO_SPACE;
3185 }
3186
3187 vm_map_lock(map);
3188 map_locked = TRUE;
3189
3190
3191 if (anywhere) {
3192 result = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
3193 address, &entry);
3194 start = *address;
3195 } else {
3196 start = *address;
3197 result = vm_map_locate_space_fixed(map, start, size, mask,
3198 vmk_flags, &entry, &zap_old_list);
3199 }
3200
3201 end = start + size;
3202
3203 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
3204
3205 /*
3206 * Check if what's already there is what we want.
3207 */
3208 if (result == KERN_MEMORY_PRESENT) {
3209 assert(!anywhere);
3210 if (!(vmk_flags.vmkf_already)) {
3211 RETURN(KERN_NO_SPACE);
3212 }
3213 tmp_start = start;
3214 tmp_offset = offset;
3215 if (entry->vme_start < start) {
3216 tmp_start -= start - entry->vme_start;
3217 tmp_offset -= start - entry->vme_start;
3218 }
3219 for (; entry->vme_start < end;
3220 entry = entry->vme_next) {
3221 /*
3222 * Check if the mapping's attributes
3223 * match the existing map entry.
3224 */
3225 if (entry == vm_map_to_entry(map) ||
3226 entry->vme_start != tmp_start ||
3227 entry->is_sub_map != is_submap ||
3228 VME_OFFSET(entry) != tmp_offset ||
3229 entry->needs_copy != needs_copy ||
3230 entry->protection != cur_protection ||
3231 entry->max_protection != max_protection ||
3232 entry->inheritance != inheritance ||
3233 entry->iokit_acct != iokit_acct ||
3234 VME_ALIAS(entry) != alias) {
3235 /* not the same mapping ! */
3236 RETURN(KERN_NO_SPACE);
3237 }
3238 /*
3239 * Check if the same object is being mapped.
3240 */
3241 if (is_submap) {
3242 if (VME_SUBMAP(entry) !=
3243 (vm_map_t) object) {
3244 /* not the same submap */
3245 RETURN(KERN_NO_SPACE);
3246 }
3247 } else {
3248 if (VME_OBJECT(entry) != object) {
3249 /* not the same VM object... */
3250 vm_object_t obj2;
3251
3252 obj2 = VME_OBJECT(entry);
3253 if ((obj2 == VM_OBJECT_NULL || obj2->internal) &&
3254 (object == VM_OBJECT_NULL || object->internal)) {
3255 /*
3256 * ... but both are
3257 * anonymous memory,
3258 * so equivalent.
3259 */
3260 } else {
3261 RETURN(KERN_NO_SPACE);
3262 }
3263 }
3264 }
3265
3266 tmp_offset += entry->vme_end - entry->vme_start;
3267 tmp_start += entry->vme_end - entry->vme_start;
3268 if (entry->vme_end >= end) {
3269 /* reached the end of our mapping */
3270 break;
3271 }
3272 }
3273 /* it all matches: let's use what's already there ! */
3274 RETURN(KERN_MEMORY_PRESENT);
3275 }
3276
3277 if (result != KERN_SUCCESS) {
3278 goto BailOut;
3279 }
3280
3281
3282 /*
3283 * At this point,
3284 * "start" and "end" should define the endpoints of the
3285 * available new range, and
3286 * "entry" should refer to the region before the new
3287 * range, and
3288 *
3289 * the map should be locked.
3290 */
3291
3292 /*
3293 * See whether we can avoid creating a new entry (and object) by
3294 * extending one of our neighbors. [So far, we only attempt to
3295 * extend from below.] Note that we can never extend/join
3296 * purgable objects because they need to remain distinct
3297 * entities in order to implement their "volatile object"
3298 * semantics.
3299 */
3300
3301 if (purgable ||
3302 entry_for_jit ||
3303 entry_for_tpro ||
3304 vm_memory_malloc_no_cow(user_alias)) {
3305 if (superpage_size) {
3306 /*
3307 * For "super page" allocations, we will allocate
3308 * special physically-contiguous VM objects later on,
3309 * so we should not have flags instructing us to create
3310 * a differently special VM object here.
3311 */
3312 RETURN(KERN_INVALID_ARGUMENT);
3313 }
3314
3315 if (object == VM_OBJECT_NULL) {
3316 assert(!superpage_size);
3317 object = vm_object_allocate(size, map->serial_id);
3318 vm_object_lock(object);
3319 object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3320 VM_OBJECT_SET_TRUE_SHARE(object, FALSE);
3321 if (malloc_no_cow_except_fork &&
3322 !purgable &&
3323 !entry_for_jit &&
3324 !entry_for_tpro &&
3325 vm_memory_malloc_no_cow(user_alias)) {
3326 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY_FORK;
3327 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
3328 }
3329 if (entry_for_jit) {
3330 object->vo_inherit_copy_none = true;
3331 }
3332 if (purgable) {
3333 task_t owner;
3334 VM_OBJECT_SET_PURGABLE(object, VM_PURGABLE_NONVOLATILE);
3335 if (map->pmap == kernel_pmap) {
3336 /*
3337 * Purgeable mappings made in a kernel
3338 * map are "owned" by the kernel itself
3339 * rather than the current user task
3340 * because they're likely to be used by
3341 * more than this user task (see
3342 * execargs_purgeable_allocate(), for
3343 * example).
3344 */
3345 owner = kernel_task;
3346 } else {
3347 owner = current_task();
3348 }
3349 assert(object->vo_owner == NULL);
3350 assert(object->resident_page_count == 0);
3351 assert(object->wired_page_count == 0);
3352 vm_purgeable_nonvolatile_enqueue(object, owner);
3353 }
3354 vm_object_unlock(object);
3355 offset = (vm_object_offset_t)0;
3356 }
3357 } else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
3358 /* no coalescing if address space uses sub-pages */
3359 } else if ((is_submap == FALSE) &&
3360 (object == VM_OBJECT_NULL) &&
3361 (entry != vm_map_to_entry(map)) &&
3362 (entry->vme_end == start) &&
3363 (!entry->is_shared) &&
3364 (!entry->is_sub_map) &&
3365 (!entry->in_transition) &&
3366 (!entry->needs_wakeup) &&
3367 (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
3368 (entry->protection == cur_protection) &&
3369 (entry->max_protection == max_protection) &&
3370 (entry->inheritance == inheritance) &&
3371 ((user_alias == VM_MEMORY_REALLOC) ||
3372 (VME_ALIAS(entry) == alias)) &&
3373 (entry->no_cache == no_cache) &&
3374 (entry->vme_permanent == permanent) &&
3375 /* no coalescing for immutable executable mappings */
3376 !((entry->protection & VM_PROT_EXECUTE) &&
3377 entry->vme_permanent) &&
3378 (!entry->superpage_size && !superpage_size) &&
3379 /*
3380 * No coalescing if not map-aligned, to avoid propagating
3381 * that condition any further than needed:
3382 */
3383 (!entry->map_aligned || !clear_map_aligned) &&
3384 (!entry->zero_wired_pages) &&
3385 (!entry->used_for_jit && !entry_for_jit) &&
3386 #if __arm64e__
3387 (!entry->used_for_tpro && !entry_for_tpro) &&
3388 #endif
3389 (!entry->csm_associated) &&
3390 (entry->iokit_acct == iokit_acct) &&
3391 (!entry->vme_resilient_codesign) &&
3392 (!entry->vme_resilient_media) &&
3393 (!entry->vme_atomic) &&
3394 (entry->vme_no_copy_on_read == no_copy_on_read) &&
3395
3396 ((entry->vme_end - entry->vme_start) + size <=
3397 (user_alias == VM_MEMORY_REALLOC ?
3398 ANON_CHUNK_SIZE :
3399 NO_COALESCE_LIMIT)) &&
3400
3401 (entry->wired_count == 0)) { /* implies user_wired_count == 0 */
3402 if (vm_object_coalesce(VME_OBJECT(entry),
3403 VM_OBJECT_NULL,
3404 VME_OFFSET(entry),
3405 (vm_object_offset_t) 0,
3406 (vm_map_size_t)(entry->vme_end - entry->vme_start),
3407 (vm_map_size_t)(end - entry->vme_end))) {
3408 /*
3409 * Coalesced the two objects - can extend
3410 * the previous map entry to include the
3411 * new range.
3412 */
3413 map->size += (end - entry->vme_end);
3414 assert(entry->vme_start < end);
3415 assert(VM_MAP_PAGE_ALIGNED(end,
3416 VM_MAP_PAGE_MASK(map)));
3417 if (__improbable(vm_debug_events)) {
3418 DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
3419 }
3420 entry->vme_end = end;
3421 if (map->holelistenabled) {
3422 vm_map_store_update_first_free(map, entry, TRUE);
3423 } else {
3424 vm_map_store_update_first_free(map, map->first_free, TRUE);
3425 }
3426 new_mapping_established = TRUE;
3427 RETURN(KERN_SUCCESS);
3428 }
3429 }
3430
3431 step = superpage_size ? SUPERPAGE_SIZE : (end - start);
3432 new_entry = NULL;
3433
3434 if (vmk_flags.vmkf_submap_adjust) {
3435 vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
3436 offset = start;
3437 }
3438
3439 for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3440 tmp2_end = tmp2_start + step;
3441 /*
3442 * Create a new entry
3443 *
3444 * XXX FBDP
3445 * The reserved "page zero" in each process's address space can
3446 * be arbitrarily large. Splitting it into separate objects and
3447 * therefore different VM map entries serves no purpose and just
3448 * slows down operations on the VM map, so let's not split the
3449 * allocation into chunks if the max protection is NONE. That
3450 * memory should never be accessible, so it will never get to the
3451 * default pager.
3452 */
3453 tmp_start = tmp2_start;
3454 if (!is_submap &&
3455 object == VM_OBJECT_NULL &&
3456 size > chunk_size &&
3457 max_protection != VM_PROT_NONE &&
3458 superpage_size == 0) {
3459 tmp_end = tmp_start + chunk_size;
3460 } else {
3461 tmp_end = tmp2_end;
3462 }
3463 do {
3464 if (!is_submap &&
3465 object != VM_OBJECT_NULL &&
3466 object->internal &&
3467 offset + (tmp_end - tmp_start) > object->vo_size) {
3468 // printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3469 DTRACE_VM5(vm_map_enter_overmap,
3470 vm_map_t, map,
3471 vm_map_address_t, tmp_start,
3472 vm_map_address_t, tmp_end,
3473 vm_object_offset_t, offset,
3474 vm_object_size_t, object->vo_size);
3475 }
3476 new_entry = vm_map_entry_insert(map,
3477 entry, tmp_start, tmp_end,
3478 object, offset, vmk_flags,
3479 needs_copy,
3480 cur_protection, max_protection,
3481 (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3482 VM_INHERIT_NONE : inheritance),
3483 clear_map_aligned);
3484
3485 assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias));
3486
3487 if (resilient_codesign) {
3488 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3489 if (!((cur_protection | max_protection) & reject_prot)) {
3490 new_entry->vme_resilient_codesign = TRUE;
3491 }
3492 }
3493
3494 if (resilient_media &&
3495 (object == VM_OBJECT_NULL ||
3496 object->internal)) {
3497 new_entry->vme_resilient_media = TRUE;
3498 }
3499
3500 assert(!new_entry->iokit_acct);
3501 if (!is_submap &&
3502 object != VM_OBJECT_NULL &&
3503 object->internal &&
3504 (object->purgable != VM_PURGABLE_DENY ||
3505 object->vo_ledger_tag)) {
3506 assert(new_entry->use_pmap);
3507 assert(!new_entry->iokit_acct);
3508 /*
3509 * Turn off pmap accounting since
3510 * purgeable (or tagged) objects have their
3511 * own ledgers.
3512 */
3513 new_entry->use_pmap = FALSE;
3514 } else if (!is_submap &&
3515 iokit_acct &&
3516 object != VM_OBJECT_NULL &&
3517 object->internal) {
3518 /* alternate accounting */
3519 assert(!new_entry->iokit_acct);
3520 assert(new_entry->use_pmap);
3521 new_entry->iokit_acct = TRUE;
3522 new_entry->use_pmap = FALSE;
3523 DTRACE_VM4(
3524 vm_map_iokit_mapped_region,
3525 vm_map_t, map,
3526 vm_map_offset_t, new_entry->vme_start,
3527 vm_map_offset_t, new_entry->vme_end,
3528 int, VME_ALIAS(new_entry));
3529 vm_map_iokit_mapped_region(
3530 map,
3531 (new_entry->vme_end -
3532 new_entry->vme_start));
3533 } else if (!is_submap) {
3534 assert(!new_entry->iokit_acct);
3535 assert(new_entry->use_pmap);
3536 }
3537
3538 if (is_submap) {
3539 vm_map_t submap;
3540 boolean_t submap_is_64bit;
3541 boolean_t use_pmap;
3542
3543 assert(new_entry->is_sub_map);
3544 assert(!new_entry->use_pmap);
3545 assert(!new_entry->iokit_acct);
3546 submap = (vm_map_t) object;
3547 submap_is_64bit = vm_map_is_64bit(submap);
3548 use_pmap = vmk_flags.vmkf_nested_pmap;
3549 #ifndef NO_NESTED_PMAP
3550 if (use_pmap && submap->pmap == NULL) {
3551 ledger_t ledger = map->pmap->ledger;
3552 /* we need a sub pmap to nest... */
3553 submap->pmap = pmap_create_options(ledger, 0,
3554 submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3555 if (submap->pmap == NULL) {
3556 /* let's proceed without nesting... */
3557 }
3558 #if defined(__arm64__)
3559 else {
3560 pmap_set_nested(submap->pmap);
3561 }
3562 #endif
3563 }
3564 if (use_pmap && submap->pmap != NULL) {
3565 if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3566 DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3567 kr = KERN_FAILURE;
3568 } else {
3569 kr = pmap_nest(map->pmap,
3570 submap->pmap,
3571 tmp_start,
3572 tmp_end - tmp_start);
3573 }
3574 if (kr != KERN_SUCCESS) {
3575 printf("vm_map_enter: "
3576 "pmap_nest(0x%llx,0x%llx) "
3577 "error 0x%x\n",
3578 (long long)tmp_start,
3579 (long long)tmp_end,
3580 kr);
3581 } else {
3582 /* we're now nested ! */
3583 new_entry->use_pmap = TRUE;
3584 pmap_empty = FALSE;
3585 }
3586 }
3587 #endif /* NO_NESTED_PMAP */
3588 }
3589 entry = new_entry;
3590
3591 if (superpage_size) {
3592 vm_page_t pages, m;
3593 vm_object_t sp_object;
3594 vm_object_offset_t sp_offset;
3595
3596 assert(object == VM_OBJECT_NULL);
3597 VME_OFFSET_SET(entry, 0);
3598
3599 /* allocate one superpage */
3600 kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3601 if (kr != KERN_SUCCESS) {
3602 /* deallocate whole range... */
3603 new_mapping_established = TRUE;
3604 /* ... but only up to "tmp_end" */
3605 size -= end - tmp_end;
3606 RETURN(kr);
3607 }
3608
3609 /* create one vm_object per superpage */
3610 sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start), map->serial_id);
3611 vm_object_lock(sp_object);
3612 sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3613 VM_OBJECT_SET_PHYS_CONTIGUOUS(sp_object, TRUE);
3614 sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3615 VME_OBJECT_SET(entry, sp_object, false, 0);
3616 assert(entry->use_pmap);
3617
3618 /* enter the base pages into the object */
3619 for (sp_offset = 0;
3620 sp_offset < SUPERPAGE_SIZE;
3621 sp_offset += PAGE_SIZE) {
3622 m = pages;
3623 pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3624 pages = NEXT_PAGE(m);
3625 *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3626 vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3627 }
3628 vm_object_unlock(sp_object);
3629 }
3630 } while (tmp_end != tmp2_end &&
3631 (tmp_start = tmp_end) &&
3632 (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3633 tmp_end + chunk_size : tmp2_end));
3634 }
3635
3636 new_mapping_established = TRUE;
3637
3638
3639 BailOut:
3640 assert(map_locked == TRUE);
3641
3642 /*
3643 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3644 * If we have identified and possibly established the new mapping(s),
3645 * make sure we did not go beyond the address space limit.
3646 */
3647 if (result == KERN_SUCCESS) {
3648 if (map->size_limit != RLIM_INFINITY &&
3649 map->size > map->size_limit) {
3650 /*
3651 * Establishing the requested mappings would exceed
3652 * the process's RLIMIT_AS limit: fail with
3653 * KERN_NO_SPACE.
3654 */
3655 result = KERN_NO_SPACE;
3656 printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3657 proc_selfpid(),
3658 (get_bsdtask_info(current_task())
3659 ? proc_name_address(get_bsdtask_info(current_task()))
3660 : "?"),
3661 __FUNCTION__,
3662 (uint64_t) map->size,
3663 (uint64_t) map->size_limit);
3664 DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3665 vm_map_size_t, map->size,
3666 uint64_t, map->size_limit);
3667 vm_map_enter_RLIMIT_AS_count++;
3668 } else if (map->data_limit != RLIM_INFINITY &&
3669 map->size > map->data_limit) {
3670 /*
3671 * Establishing the requested mappings would exceed
3672 * the process's RLIMIT_DATA limit: fail with
3673 * KERN_NO_SPACE.
3674 */
3675 result = KERN_NO_SPACE;
3676 printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3677 proc_selfpid(),
3678 (get_bsdtask_info(current_task())
3679 ? proc_name_address(get_bsdtask_info(current_task()))
3680 : "?"),
3681 __FUNCTION__,
3682 (uint64_t) map->size,
3683 (uint64_t) map->data_limit);
3684 DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3685 vm_map_size_t, map->size,
3686 uint64_t, map->data_limit);
3687 vm_map_enter_RLIMIT_DATA_count++;
3688 }
3689 }
3690
3691 if (result == KERN_SUCCESS) {
3692 vm_prot_t pager_prot;
3693 memory_object_t pager;
3694
3695 #if DEBUG
3696 if (pmap_empty &&
3697 !(vmk_flags.vmkf_no_pmap_check)) {
3698 assert(pmap_is_empty(map->pmap,
3699 *address,
3700 *address + size));
3701 }
3702 #endif /* DEBUG */
3703
3704 /*
3705 * For "named" VM objects, let the pager know that the
3706 * memory object is being mapped. Some pagers need to keep
3707 * track of this, to know when they can reclaim the memory
3708 * object, for example.
3709 * VM calls memory_object_map() for each mapping (specifying
3710 * the protection of each mapping) and calls
3711 * memory_object_last_unmap() when all the mappings are gone.
3712 */
3713 pager_prot = max_protection;
3714 if (needs_copy) {
3715 /*
3716 * Copy-On-Write mapping: won't modify
3717 * the memory object.
3718 */
3719 pager_prot &= ~VM_PROT_WRITE;
3720 }
3721 if (!is_submap &&
3722 object != VM_OBJECT_NULL &&
3723 object->named &&
3724 object->pager != MEMORY_OBJECT_NULL) {
3725 vm_object_lock(object);
3726 pager = object->pager;
3727 if (object->named &&
3728 pager != MEMORY_OBJECT_NULL) {
3729 assert(object->pager_ready);
3730 vm_object_mapping_wait(object, THREAD_UNINT);
3731 /* object might have lost its pager while waiting */
3732 pager = object->pager;
3733 if (object->named && pager != MEMORY_OBJECT_NULL) {
3734 vm_object_mapping_begin(object);
3735 vm_object_unlock(object);
3736
3737 kr = memory_object_map(pager, pager_prot);
3738 assert(kr == KERN_SUCCESS);
3739
3740 vm_object_lock(object);
3741 vm_object_mapping_end(object);
3742 }
3743 }
3744 vm_object_unlock(object);
3745 }
3746 }
3747
3748 assert(map_locked == TRUE);
3749
3750 if (new_mapping_established) {
3751 /*
3752 * If we release the map lock for any reason below,
3753 * another thread could deallocate our new mapping,
3754 * releasing the caller's reference on "caller_object",
3755 * which was transferred to the mapping.
3756 * If this was the only reference, the object could be
3757 * destroyed.
3758 *
3759 * We need to take an extra reference on "caller_object"
3760 * to keep it alive if we need to return the caller's
3761 * reference to the caller in case of failure.
3762 */
3763 if (is_submap) {
3764 vm_map_reference((vm_map_t)caller_object);
3765 } else {
3766 vm_object_reference(caller_object);
3767 }
3768 }
3769
3770 if (!keep_map_locked) {
3771 vm_map_unlock(map);
3772 map_locked = FALSE;
3773 entry = VM_MAP_ENTRY_NULL;
3774 new_entry = VM_MAP_ENTRY_NULL;
3775 }
3776
3777 /*
3778 * We can't hold the map lock if we enter this block.
3779 */
3780
3781 if (result == KERN_SUCCESS) {
3782 /* Wire down the new entry if the user
3783 * requested all new map entries be wired.
3784 */
3785 if ((map->wiring_required) || (superpage_size)) {
3786 assert(!keep_map_locked);
3787 pmap_empty = FALSE; /* pmap won't be empty */
3788 kr = vm_map_wire_nested(map, start, end,
3789 cur_protection, VM_KERN_MEMORY_MLOCK,
3790 TRUE, PMAP_NULL, 0, NULL);
3791 result = kr;
3792 }
3793
3794 }
3795
3796 if (result != KERN_SUCCESS) {
3797 if (new_mapping_established) {
3798 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3799
3800 /*
3801 * We have to get rid of the new mappings since we
3802 * won't make them available to the user.
3803 * Try and do that atomically, to minimize the risk
3804 * that someone else creates new mappings in that range.
3805 */
3806 if (!map_locked) {
3807 vm_map_lock(map);
3808 map_locked = TRUE;
3809 }
3810 remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3811 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3812 if (permanent) {
3813 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3814 }
3815 (void) vm_map_delete(map,
3816 *address, *address + size,
3817 remove_flags,
3818 KMEM_GUARD_NONE, &zap_new_list);
3819 }
3820
3821 if (vm_map_zap_first_entry(&zap_old_list)) {
3822 vm_map_entry_t entry1, entry2;
3823
3824 /*
3825 * The new mapping failed. Attempt to restore
3826 * the old mappings, saved in the "zap_old_map".
3827 */
3828 if (!map_locked) {
3829 vm_map_lock(map);
3830 map_locked = TRUE;
3831 }
3832
3833 /* first check if the coast is still clear */
3834 start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3835 end = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3836
3837 if (vm_map_lookup_entry(map, start, &entry1) ||
3838 vm_map_lookup_entry(map, end, &entry2) ||
3839 entry1 != entry2) {
3840 /*
3841 * Part of that range has already been
3842 * re-mapped: we can't restore the old
3843 * mappings...
3844 */
3845 vm_map_enter_restore_failures++;
3846 } else {
3847 /*
3848 * Transfer the saved map entries from
3849 * "zap_old_map" to the original "map",
3850 * inserting them all after "entry1".
3851 */
3852 while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3853 vm_map_size_t entry_size;
3854
3855 entry_size = (entry2->vme_end -
3856 entry2->vme_start);
3857 vm_map_store_entry_link(map, entry1, entry2,
3858 VM_MAP_KERNEL_FLAGS_NONE);
3859 map->size += entry_size;
3860 entry1 = entry2;
3861 }
3862 if (map->wiring_required) {
3863 /*
3864 * XXX TODO: we should rewire the
3865 * old pages here...
3866 */
3867 }
3868 vm_map_enter_restore_successes++;
3869 }
3870 }
3871 }
3872
3873 /*
3874 * The caller is responsible for releasing the lock if it requested to
3875 * keep the map locked.
3876 */
3877 if (map_locked && !keep_map_locked) {
3878 vm_map_unlock(map);
3879 }
3880
3881 vm_map_zap_dispose(&zap_old_list);
3882 vm_map_zap_dispose(&zap_new_list);
3883
3884 if (new_mapping_established) {
3885 /*
3886 * The caller had a reference on "caller_object" and we
3887 * transferred that reference to the mapping.
3888 * We also took an extra reference on "caller_object" to keep
3889 * it alive while the map was unlocked.
3890 */
3891 if (result == KERN_SUCCESS) {
3892 /*
3893 * On success, the caller's reference on the object gets
3894 * transferred to the mapping.
3895 * Release our extra reference.
3896 */
3897 if (is_submap) {
3898 vm_map_deallocate((vm_map_t)caller_object);
3899 } else {
3900 vm_object_deallocate(caller_object);
3901 }
3902 } else {
3903 /*
3904 * On error, the caller expects to still have a
3905 * reference on the object it gave us.
3906 * Let's use our extra reference for that.
3907 */
3908 }
3909 }
3910
3911 return result;
3912
3913 #undef RETURN
3914 }
3915
3916 /*
3917 * Counters for the prefault optimization.
3918 */
3919 int64_t vm_prefault_nb_pages = 0;
3920 int64_t vm_prefault_nb_bailout = 0;
3921
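/*
 * Advance both the object offset and the object end by "quantity",
 * failing if either addition overflows or if page-rounding the new
 * end would wrap to zero.
 */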
3922 static kern_return_t
3923 vm_map_enter_adjust_offset(
3924 vm_object_offset_t *obj_offs,
3925 vm_object_offset_t *obj_end,
3926 vm_object_offset_t quantity)
3927 {
3928 if (os_add_overflow(*obj_offs, quantity, obj_offs) ||
3929 os_add_overflow(*obj_end, quantity, obj_end) ||
3930 vm_map_round_page_mask(*obj_end, PAGE_MASK) == 0) {
3931 return KERN_INVALID_ARGUMENT;
3932 }
3933
3934 return KERN_SUCCESS;
3935 }
3936
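/*
 * Sanitize the caller-provided address, size, mask, offset, protections
 * and inheritance for vm_map_enter_mem_object(), returning the
 * kernel-internal (unwrapped) values on success.
 */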
3937 static __attribute__((always_inline, warn_unused_result))
3938 kern_return_t
3939 vm_map_enter_mem_object_sanitize(
3940 vm_map_t target_map,
3941 vm_map_offset_ut address_u,
3942 vm_map_size_ut initial_size_u,
3943 vm_map_offset_ut mask_u,
3944 vm_object_offset_ut offset_u,
3945 vm_prot_ut cur_protection_u,
3946 vm_prot_ut max_protection_u,
3947 vm_inherit_ut inheritance_u,
3948 vm_map_kernel_flags_t vmk_flags,
3949 ipc_port_t port,
3950 vm_map_address_t *map_addr,
3951 vm_map_size_t *map_size,
3952 vm_map_offset_t *mask,
3953 vm_object_offset_t *obj_offs,
3954 vm_object_offset_t *obj_end,
3955 vm_object_size_t *obj_size,
3956 vm_prot_t *cur_protection,
3957 vm_prot_t *max_protection,
3958 vm_inherit_t *inheritance)
3959 {
3960 kern_return_t result;
3961
3962 result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
3963 VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
3964 VM_PROT_IS_MASK, cur_protection,
3965 max_protection);
3966 if (__improbable(result != KERN_SUCCESS)) {
3967 return result;
3968 }
3969
3970 result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3971 inheritance);
3972 if (__improbable(result != KERN_SUCCESS)) {
3973 return result;
3974 }
3975
3976 result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ, mask);
3977 if (__improbable(result != KERN_SUCCESS)) {
3978 return result;
3979 }
3980
3981 if (vmk_flags.vmf_fixed) {
3982 vm_map_address_t map_end;
3983
3984 result = vm_sanitize_addr_size(address_u, initial_size_u,
3985 VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3986 target_map,
3987 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS | VM_SANITIZE_FLAGS_REALIGN_START,
3988 map_addr, &map_end, map_size);
3989 if (__improbable(result != KERN_SUCCESS)) {
3990 return result;
3991 }
3992 } else {
3993 *map_addr = vm_sanitize_addr(target_map, address_u);
3994 result = vm_sanitize_size(0, initial_size_u,
3995 VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
3996 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
3997 if (__improbable(result != KERN_SUCCESS)) {
3998 return result;
3999 }
4000 }
4001
4002 *obj_size = vm_object_round_page(*map_size);
4003 if (__improbable(*obj_size == 0)) {
4004 return KERN_INVALID_ARGUMENT;
4005 }
4006
4007 if (IP_VALID(port)) {
4008 result = vm_sanitize_addr_size(offset_u, *obj_size,
4009 VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
4010 PAGE_MASK,
4011 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
4012 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
4013 obj_offs, obj_end, obj_size);
4014 if (__improbable(result != KERN_SUCCESS)) {
4015 return result;
4016 }
4017 } else {
4018 *obj_offs = 0;
4019 *obj_end = *obj_size;
4020 }
4021
4022 return KERN_SUCCESS;
4023 }
4024
4025 kern_return_t
4026 vm_map_enter_mem_object(
4027 vm_map_t target_map,
4028 vm_map_offset_ut *address_u,
4029 vm_map_size_ut initial_size_u,
4030 vm_map_offset_ut mask_u,
4031 vm_map_kernel_flags_t vmk_flags,
4032 ipc_port_t port,
4033 vm_object_offset_ut offset_u,
4034 boolean_t copy,
4035 vm_prot_ut cur_protection_u,
4036 vm_prot_ut max_protection_u,
4037 vm_inherit_ut inheritance_u,
4038 upl_page_list_ptr_t page_list,
4039 unsigned int page_list_count)
4040 {
4041 vm_map_offset_t mask;
4042 vm_prot_t cur_protection;
4043 vm_prot_t max_protection;
4044 vm_inherit_t inheritance;
4045 vm_map_address_t map_addr, map_mask;
4046 vm_map_size_t map_size;
4047 vm_object_t object = VM_OBJECT_NULL;
4048 vm_object_offset_t obj_offs, obj_end;
4049 vm_object_size_t obj_size;
4050 kern_return_t result;
4051 boolean_t mask_cur_protection, mask_max_protection;
4052 boolean_t kernel_prefault, try_prefault = (page_list_count != 0);
4053 vm_map_offset_t offset_in_mapping = 0;
4054
4055 if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4056 /* XXX TODO4K prefaulting depends on page size... */
4057 try_prefault = FALSE;
4058 }
4059
4060 /*
4061 * Check arguments for validity
4062 */
4063 if ((target_map == VM_MAP_NULL) ||
4064 (try_prefault && (copy || !page_list))) {
4065 return KERN_INVALID_ARGUMENT;
4066 }
4067
4068 map_mask = vm_map_page_mask(target_map);
4069
4070 /*
4071 * Sanitize any input parameters that are addr/size/prot/inherit
4072 */
4073 result = vm_map_enter_mem_object_sanitize(
4074 target_map,
4075 *address_u,
4076 initial_size_u,
4077 mask_u,
4078 offset_u,
4079 cur_protection_u,
4080 max_protection_u,
4081 inheritance_u,
4082 vmk_flags,
4083 port,
4084 &map_addr,
4085 &map_size,
4086 &mask,
4087 &obj_offs,
4088 &obj_end,
4089 &obj_size,
4090 &cur_protection,
4091 &max_protection,
4092 &inheritance);
4093 if (__improbable(result != KERN_SUCCESS)) {
4094 return vm_sanitize_get_kr(result);
4095 }
4096
4097 assertf(vmk_flags.__vmkf_unused2 == 0, "vmk_flags unused2=0x%llx\n", vmk_flags.__vmkf_unused2);
4098 vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, map_size);
4099
4100 mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4101 mask_max_protection = max_protection & VM_PROT_IS_MASK;
4102 cur_protection &= ~VM_PROT_IS_MASK;
4103 max_protection &= ~VM_PROT_IS_MASK;
4104
4105 #if __arm64__
4106 if (cur_protection & VM_PROT_EXECUTE) {
4107 cur_protection |= VM_PROT_READ;
4108 }
4109 #endif /* __arm64__ */
4110
4111 /*
4112 * Find the vm object (if any) corresponding to this port.
4113 */
4114 if (!IP_VALID(port)) {
4115 object = VM_OBJECT_NULL;
4116 copy = FALSE;
4117 } else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4118 vm_named_entry_t named_entry;
4119 vm_object_size_t initial_size;
4120
4121 named_entry = mach_memory_entry_from_port(port);
4122
4123 if (vmk_flags.vmf_return_data_addr ||
4124 vmk_flags.vmf_return_4k_data_addr) {
4125 result = vm_map_enter_adjust_offset(&obj_offs,
4126 &obj_end, named_entry->data_offset);
4127 if (__improbable(result)) {
4128 return result;
4129 }
4130 }
4131
4132 /* a few checks to make sure user is obeying rules */
4133 if (mask_max_protection) {
4134 max_protection &= named_entry->protection;
4135 }
4136 if (mask_cur_protection) {
4137 cur_protection &= named_entry->protection;
4138 }
4139 if ((named_entry->protection & max_protection) !=
4140 max_protection) {
4141 return KERN_INVALID_RIGHT;
4142 }
4143 if ((named_entry->protection & cur_protection) !=
4144 cur_protection) {
4145 return KERN_INVALID_RIGHT;
4146 }
4147
4148 /*
4149 * unwrap is safe because we know obj_size is larger and doesn't
4150 * overflow
4151 */
4152 initial_size = VM_SANITIZE_UNSAFE_UNWRAP(initial_size_u);
4153 if (named_entry->size < obj_offs + initial_size) {
4154 return KERN_INVALID_ARGUMENT;
4155 }
4156
4157 /* for a vm_map_copy, we can only map it whole */
4158 if (named_entry->is_copy &&
4159 (obj_size != named_entry->size) &&
4160 (vm_map_round_page(obj_size, map_mask) == named_entry->size)) {
4161 /* XXX FBDP use the rounded size... */
4162 obj_end += named_entry->size - obj_size;
4163 obj_size = named_entry->size;
4164 }
4165
4166 if (named_entry->offset) {
4167 /*
4168 * The caller's "offset" parameter is relative to the start of the
4169 * named entry, which itself starts at "named_entry->offset" in the object.
4170 *
4171 * Because we checked above that
4172 * obj_offs + obj_size < named_entry_size
4173 * these overflow checks should be redundant...
4174 */
4175 result = vm_map_enter_adjust_offset(&obj_offs,
4176 &obj_end, named_entry->offset);
4177 if (__improbable(result)) {
4178 return result;
4179 }
4180 }
4181
4182 if (!VM_MAP_PAGE_ALIGNED(obj_size, map_mask)) {
4183 /*
4184 * Let's not map more than requested;
4185 * vm_map_enter() will handle this "not map-aligned"
4186 * case.
4187 */
4188 map_size = obj_size;
4189 }
4190
4191 named_entry_lock(named_entry);
4192
4193 // rdar://130307561 (Combine copy, object, and submap fields of vm_named_entry into an enum)
4194 assert(named_entry->is_copy || named_entry->is_object || named_entry->is_sub_map);
4195
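/* Case 1: the named entry wraps a submap: enter it directly as a submap. */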
4196 if (named_entry->is_sub_map) {
4197 vm_map_t submap;
4198
4199 assert(!named_entry->is_copy);
4200 assert(!named_entry->is_object);
4201
4202 if (vmk_flags.vmf_return_data_addr ||
4203 vmk_flags.vmf_return_4k_data_addr) {
4204 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4205 }
4206
4207 submap = named_entry->backing.map;
4208 vm_map_reference(submap);
4209 named_entry_unlock(named_entry);
4210
4211 vmk_flags.vmkf_submap = TRUE;
4212 result = vm_map_enter(target_map,
4213 &map_addr,
4214 map_size,
4215 mask,
4216 vmk_flags,
4217 (vm_object_t)(uintptr_t) submap,
4218 obj_offs,
4219 copy,
4220 cur_protection,
4221 max_protection,
4222 inheritance);
4223 if (result != KERN_SUCCESS) {
4224 vm_map_deallocate(submap);
4225 return result;
4226 }
4227 /*
4228 * No need to lock "submap" just to check its
4229 * "mapped" flag: that flag is never reset
4230 * once it's been set and if we race, we'll
4231 * just end up setting it twice, which is OK.
4232 */
4233 if (submap->mapped_in_other_pmaps == FALSE &&
4234 vm_map_pmap(submap) != PMAP_NULL &&
4235 vm_map_pmap(submap) !=
4236 vm_map_pmap(target_map)) {
4237 /*
4238 * This submap is being mapped in a map
4239 * that uses a different pmap.
4240 * Set its "mapped_in_other_pmaps" flag
4241 * to indicate that we now need to
4242 * remove mappings from all pmaps rather
4243 * than just the submap's pmap.
4244 */
4245 vm_map_lock(submap);
4246 submap->mapped_in_other_pmaps = TRUE;
4247 vm_map_unlock(submap);
4248 }
4249 goto out;
4250 }
4251
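/*
 * Case 2: the named entry wraps a vm_map_copy: reserve a range in the
 * target map, then over-map each copy entry into that range.
 */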
4252 if (named_entry->is_copy) {
4253 kern_return_t kr;
4254 vm_map_copy_t copy_map;
4255 vm_map_entry_t copy_entry;
4256 vm_map_offset_t copy_addr;
4257 vm_map_copy_t target_copy_map;
4258 vm_map_offset_t overmap_start, overmap_end;
4259 vm_map_offset_t trimmed_start;
4260 vm_map_size_t target_size;
4261
4262 assert(!named_entry->is_object);
4263 assert(!named_entry->is_sub_map);
4264
4265 int allowed_flags = VM_FLAGS_FIXED |
4266 VM_FLAGS_ANYWHERE |
4267 VM_FLAGS_OVERWRITE |
4268 VM_FLAGS_RETURN_4K_DATA_ADDR |
4269 VM_FLAGS_RETURN_DATA_ADDR;
4270
4271 if (!vm_map_kernel_flags_check_vmflags(vmk_flags, allowed_flags)) {
4272 named_entry_unlock(named_entry);
4273 return KERN_INVALID_ARGUMENT;
4274 }
4275
4276 copy_map = named_entry->backing.copy;
4277 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4278 if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4279 /* unsupported type; should not happen */
4280 printf("vm_map_enter_mem_object: "
4281 "memory_entry->backing.copy "
4282 "unsupported type 0x%x\n",
4283 copy_map->type);
4284 named_entry_unlock(named_entry);
4285 return KERN_INVALID_ARGUMENT;
4286 }
4287
4288 if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4289 DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, obj_offs, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4290 }
4291
4292 if (vmk_flags.vmf_return_data_addr ||
4293 vmk_flags.vmf_return_4k_data_addr) {
4294 offset_in_mapping = obj_offs & map_mask;
4295 if (vmk_flags.vmf_return_4k_data_addr) {
4296 offset_in_mapping &= ~((signed)(0xFFF));
4297 }
4298 }
4299
4300 target_copy_map = VM_MAP_COPY_NULL;
4301 target_size = copy_map->size;
4302 overmap_start = 0;
4303 overmap_end = 0;
4304 trimmed_start = 0;
4305 if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4306 DEBUG4K_ADJUST("adjusting...\n");
4307 kr = vm_map_copy_adjust_to_target(
4308 copy_map,
4309 obj_offs,
4310 initial_size,
4311 target_map,
4312 copy,
4313 &target_copy_map,
4314 &overmap_start,
4315 &overmap_end,
4316 &trimmed_start);
4317 if (kr != KERN_SUCCESS) {
4318 named_entry_unlock(named_entry);
4319 return kr;
4320 }
4321 target_size = target_copy_map->size;
4322 } else {
4323 /*
4324 * Assert that the vm_map_copy is coming from the right
4325 * zone and hasn't been forged
4326 */
4327 vm_map_copy_require(copy_map);
4328 target_copy_map = copy_map;
4329 }
4330
4331 vm_map_kernel_flags_t rsv_flags = vmk_flags;
4332
4333 vm_map_kernel_flags_and_vmflags(&rsv_flags,
4334 (VM_FLAGS_FIXED |
4335 VM_FLAGS_ANYWHERE |
4336 VM_FLAGS_OVERWRITE |
4337 VM_FLAGS_RETURN_4K_DATA_ADDR |
4338 VM_FLAGS_RETURN_DATA_ADDR));
4339
4340 /* reserve a contiguous range */
4341 kr = vm_map_enter(target_map,
4342 &map_addr,
4343 vm_map_round_page(target_size, map_mask),
4344 mask,
4345 rsv_flags,
4346 VM_OBJECT_NULL,
4347 0,
4348 FALSE, /* copy */
4349 cur_protection,
4350 max_protection,
4351 inheritance);
4352 if (kr != KERN_SUCCESS) {
4353 DEBUG4K_ERROR("kr 0x%x\n", kr);
4354 if (target_copy_map != copy_map) {
4355 vm_map_copy_discard(target_copy_map);
4356 target_copy_map = VM_MAP_COPY_NULL;
4357 }
4358 named_entry_unlock(named_entry);
4359 return kr;
4360 }
4361
4362 copy_addr = map_addr;
4363
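/*
 * Walk the copy map's entries and map each one at the matching
 * offset ("copy_addr") within the range reserved above.
 */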
4364 for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4365 copy_entry != vm_map_copy_to_entry(target_copy_map);
4366 copy_entry = copy_entry->vme_next) {
4367 vm_map_t copy_submap = VM_MAP_NULL;
4368 vm_object_t copy_object = VM_OBJECT_NULL;
4369 vm_map_size_t copy_size;
4370 vm_object_offset_t copy_offset;
4371 boolean_t do_copy = false;
4372
4373 if (copy_entry->is_sub_map) {
4374 copy_submap = VME_SUBMAP(copy_entry);
4375 copy_object = (vm_object_t)copy_submap;
4376 } else {
4377 copy_object = VME_OBJECT(copy_entry);
4378 }
4379 copy_offset = VME_OFFSET(copy_entry);
4380 copy_size = (copy_entry->vme_end -
4381 copy_entry->vme_start);
4382
4383 /* sanity check */
4384 if ((copy_addr + copy_size) >
4385 (map_addr +
4386 overmap_start + overmap_end +
4387 named_entry->size /* XXX full size */)) {
4388 /* over-mapping too much !? */
4389 kr = KERN_INVALID_ARGUMENT;
4390 DEBUG4K_ERROR("kr 0x%x\n", kr);
4391 /* abort */
4392 break;
4393 }
4394
4395 /* take a reference on the object */
4396 if (copy_entry->is_sub_map) {
4397 vm_map_reference(copy_submap);
4398 } else {
4399 if (!copy &&
4400 copy_object != VM_OBJECT_NULL &&
4401 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4402 bool is_writable;
4403
4404 /*
4405 * We need to resolve our side of this
4406 * "symmetric" copy-on-write now; we
4407 * need a new object to map and share,
4408 * instead of the current one which
4409 * might still be shared with the
4410 * original mapping.
4411 *
4412 * Note: A "vm_map_copy_t" does not
4413 * have a lock but we're protected by
4414 * the named entry's lock here.
4415 */
4416 // assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4417 VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4418 assert(copy_object != VME_OBJECT(copy_entry));
4419 is_writable = false;
4420 if (copy_entry->protection & VM_PROT_WRITE) {
4421 is_writable = true;
4422 #if __arm64e__
4423 } else if (copy_entry->used_for_tpro) {
4424 is_writable = true;
4425 #endif /* __arm64e__ */
4426 }
4427 if (!copy_entry->needs_copy && is_writable) {
4428 vm_prot_t prot;
4429
4430 prot = copy_entry->protection & ~VM_PROT_WRITE;
4431 vm_object_pmap_protect(copy_object,
4432 copy_offset,
4433 copy_size,
4434 PMAP_NULL,
4435 PAGE_SIZE,
4436 0,
4437 prot);
4438 }
4439 copy_entry->needs_copy = FALSE;
4440 copy_entry->is_shared = TRUE;
4441 copy_object = VME_OBJECT(copy_entry);
4442 copy_offset = VME_OFFSET(copy_entry);
4443 vm_object_lock(copy_object);
4444 /* we're about to make a shared mapping of this object */
4445 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4446 VM_OBJECT_SET_TRUE_SHARE(copy_object, TRUE);
4447 vm_object_unlock(copy_object);
4448 }
4449
4450 if (copy_object != VM_OBJECT_NULL &&
4451 copy_object->named &&
4452 copy_object->pager != MEMORY_OBJECT_NULL &&
4453 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4454 memory_object_t pager;
4455 vm_prot_t pager_prot;
4456
4457 /*
4458 * For "named" VM objects, let the pager know that the
4459 * memory object is being mapped. Some pagers need to keep
4460 * track of this, to know when they can reclaim the memory
4461 * object, for example.
4462 * VM calls memory_object_map() for each mapping (specifying
4463 * the protection of each mapping) and calls
4464 * memory_object_last_unmap() when all the mappings are gone.
4465 */
4466 pager_prot = max_protection;
4467 if (copy) {
4468 /*
4469 * Copy-On-Write mapping: won't modify the
4470 * memory object.
4471 */
4472 pager_prot &= ~VM_PROT_WRITE;
4473 }
4474 vm_object_lock(copy_object);
4475 pager = copy_object->pager;
4476 if (copy_object->named &&
4477 pager != MEMORY_OBJECT_NULL &&
4478 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4479 assert(copy_object->pager_ready);
4480 vm_object_mapping_wait(copy_object, THREAD_UNINT);
4481 /*
4482 * Object might have lost its pager
4483 * while waiting.
4484 */
4485 pager = copy_object->pager;
4486 if (copy_object->named &&
4487 pager != MEMORY_OBJECT_NULL) {
4488 vm_object_mapping_begin(copy_object);
4489 vm_object_unlock(copy_object);
4490
4491 kr = memory_object_map(pager, pager_prot);
4492 assert(kr == KERN_SUCCESS);
4493
4494 vm_object_lock(copy_object);
4495 vm_object_mapping_end(copy_object);
4496 }
4497 }
4498 vm_object_unlock(copy_object);
4499 }
4500
4501 /*
4502 * Perform the copy if requested
4503 */
4504
4505 if (copy && copy_object != VM_OBJECT_NULL) {
4506 vm_object_t new_object;
4507 vm_object_offset_t new_offset;
4508
4509 result = vm_object_copy_strategically(copy_object, copy_offset,
4510 copy_size,
4511 false, /* forking */
4512 &new_object, &new_offset,
4513 &do_copy);
4514
4515
4516 if (result == KERN_MEMORY_RESTART_COPY) {
4517 boolean_t success;
4518 boolean_t src_needs_copy;
4519
4520 /*
4521 * XXX
4522 * We currently ignore src_needs_copy.
4523 * This really is the issue of how to make
4524 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4525 * non-kernel users to use. Solution forthcoming.
4526 * In the meantime, since we don't allow non-kernel
4527 * memory managers to specify symmetric copy,
4528 * we won't run into problems here.
4529 */
4530 new_object = copy_object;
4531 new_offset = copy_offset;
4532 success = vm_object_copy_quickly(new_object,
4533 new_offset,
4534 copy_size,
4535 &src_needs_copy,
4536 &do_copy);
4537 assert(success);
4538 result = KERN_SUCCESS;
4539 }
4540 if (result != KERN_SUCCESS) {
4541 kr = result;
4542 break;
4543 }
4544
4545 copy_object = new_object;
4546 copy_offset = new_offset;
4547 /*
4548 * No extra object reference for the mapping:
4549 * the mapping should be the only thing keeping
4550 * this new object alive.
4551 */
4552 } else {
4553 /*
4554 * We already have the right object
4555 * to map.
4556 */
4557 copy_object = VME_OBJECT(copy_entry);
4558 /* take an extra ref for the mapping below */
4559 vm_object_reference(copy_object);
4560 }
4561 }
4562
4563 /*
4564 * If the caller does not want a specific
4565 * tag for this new mapping: use
4566 * the tag of the original mapping.
4567 */
4568 vm_map_kernel_flags_t vmk_remap_flags = {
4569 .vmkf_submap = copy_entry->is_sub_map,
4570 };
4571
4572 vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
4573 vm_map_kernel_flags_vmflags(vmk_flags),
4574 vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4575
4576 /* over-map the object into destination */
4577 vmk_remap_flags.vmf_fixed = true;
4578 vmk_remap_flags.vmf_overwrite = true;
4579
4580 if (!copy && !copy_entry->is_sub_map) {
4581 /*
4582 * copy-on-write should have been
4583 * resolved at this point, or we would
4584 * end up sharing instead of copying.
4585 */
4586 assert(!copy_entry->needs_copy);
4587 }
4588 #if XNU_TARGET_OS_OSX
4589 if (copy_entry->used_for_jit) {
4590 vmk_remap_flags.vmkf_map_jit = TRUE;
4591 }
4592 #endif /* XNU_TARGET_OS_OSX */
4593
4594 kr = vm_map_enter(target_map,
4595 &copy_addr,
4596 copy_size,
4597 (vm_map_offset_t) 0,
4598 vmk_remap_flags,
4599 copy_object,
4600 copy_offset,
4601 ((copy_object == NULL)
4602 ? FALSE
4603 : (copy || copy_entry->needs_copy)),
4604 cur_protection,
4605 max_protection,
4606 inheritance);
4607 if (kr != KERN_SUCCESS) {
4608 DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4609 if (copy_entry->is_sub_map) {
4610 vm_map_deallocate(copy_submap);
4611 } else {
4612 vm_object_deallocate(copy_object);
4613 }
4614 /* abort */
4615 break;
4616 }
4617
4618 /* next mapping */
4619 copy_addr += copy_size;
4620 }
4621
4622 named_entry_unlock(named_entry);
4623 if (target_copy_map != copy_map) {
4624 vm_map_copy_discard(target_copy_map);
4625 target_copy_map = VM_MAP_COPY_NULL;
4626 }
4627
4628 if (kr == KERN_SUCCESS) {
4629 if (overmap_start) {
4630 DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t)offset_in_mapping, (uint64_t)overmap_start, (uint64_t)(map_addr + offset_in_mapping + overmap_start));
4631 }
4632 offset_in_mapping += overmap_start;
4633 } else if (!vmk_flags.vmf_overwrite) {
4634 /* deallocate the contiguous range */
4635 vm_map_remove(target_map, map_addr,
4636 map_addr + map_size);
4637 }
4638 result = kr;
4639 goto out;
4640 }
4641
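/*
 * Case 3: the named entry wraps a VM object: take a reference here;
 * the actual vm_map_enter() happens below.
 */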
4642 if (named_entry->is_object) {
4643 unsigned int access;
4644 unsigned int wimg_mode;
4645
4646 assert(!named_entry->is_copy);
4647 assert(!named_entry->is_sub_map);
4648
4649 /* we are mapping a VM object */
4650
4651 access = named_entry->access;
4652
4653 if (vmk_flags.vmf_return_data_addr ||
4654 vmk_flags.vmf_return_4k_data_addr) {
4655 offset_in_mapping = obj_offs & map_mask;
4656 if (vmk_flags.vmf_return_4k_data_addr) {
4657 offset_in_mapping &= ~((signed)(0xFFF));
4658 }
4659 obj_offs -= offset_in_mapping;
4660 map_size = vm_map_round_page(initial_size +
4661 offset_in_mapping, map_mask);
4662 }
4663
4664 object = vm_named_entry_to_vm_object(named_entry);
4665 assert(object != VM_OBJECT_NULL);
4666 vm_object_lock(object);
4667 named_entry_unlock(named_entry);
4668
4669 wimg_mode = object->wimg_bits;
4670 vm_prot_to_wimg(access, &wimg_mode);
4671 if (object->wimg_bits != wimg_mode) {
4672 vm_object_change_wimg_mode(object, wimg_mode);
4673 }
4674
4675 vm_object_reference_locked(object);
4676 vm_object_unlock(object);
4677 } else {
4678 panic("invalid VM named entry %p", named_entry);
4679 }
4680 } else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4681 /*
4682 * JMM - This is temporary until we unify named entries
4683 * and raw memory objects.
4684 *
4685 * Detected fake ip_kotype for a memory object. In
4686 * this case, the port isn't really a port at all, but
4687 * instead is just a raw memory object.
4688 */
4689 if (vmk_flags.vmf_return_data_addr ||
4690 vmk_flags.vmf_return_4k_data_addr) {
4691 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4692 }
4693
4694 object = memory_object_to_vm_object((memory_object_t)port);
4695 if (object == VM_OBJECT_NULL) {
4696 return KERN_INVALID_OBJECT;
4697 }
4698 vm_object_reference(object);
4699
4700 /* wait for object (if any) to be ready */
4701 if (object != VM_OBJECT_NULL) {
4702 if (is_kernel_object(object)) {
4703 printf("Warning: Attempt to map kernel object"
4704 " by a non-private kernel entity\n");
4705 return KERN_INVALID_OBJECT;
4706 }
4707 if (!object->pager_ready) {
4708 vm_object_lock(object);
4709
4710 while (!object->pager_ready) {
4711 vm_object_sleep(object,
4712 VM_OBJECT_EVENT_PAGER_READY,
4713 THREAD_UNINT,
4714 LCK_SLEEP_EXCLUSIVE);
4715 }
4716 vm_object_unlock(object);
4717 }
4718 }
4719 } else {
4720 return KERN_INVALID_OBJECT;
4721 }
4722
4723 if (object != VM_OBJECT_NULL &&
4724 object->named &&
4725 object->pager != MEMORY_OBJECT_NULL &&
4726 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4727 memory_object_t pager;
4728 vm_prot_t pager_prot;
4729 kern_return_t kr;
4730
4731 /*
4732 * For "named" VM objects, let the pager know that the
4733 * memory object is being mapped. Some pagers need to keep
4734 * track of this, to know when they can reclaim the memory
4735 * object, for example.
4736 * VM calls memory_object_map() for each mapping (specifying
4737 * the protection of each mapping) and calls
4738 * memory_object_last_unmap() when all the mappings are gone.
4739 */
4740 pager_prot = max_protection;
4741 if (copy) {
4742 /*
4743 * Copy-On-Write mapping: won't modify the
4744 * memory object.
4745 */
4746 pager_prot &= ~VM_PROT_WRITE;
4747 }
4748 vm_object_lock(object);
4749 pager = object->pager;
4750 if (object->named &&
4751 pager != MEMORY_OBJECT_NULL &&
4752 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4753 assert(object->pager_ready);
4754 vm_object_mapping_wait(object, THREAD_UNINT);
4755 /* object might have lost its pager while waiting */
4756 pager = object->pager;
4757 if (object->named && pager != MEMORY_OBJECT_NULL) {
4758 vm_object_mapping_begin(object);
4759 vm_object_unlock(object);
4760
4761 kr = memory_object_map(pager, pager_prot);
4762 assert(kr == KERN_SUCCESS);
4763
4764 vm_object_lock(object);
4765 vm_object_mapping_end(object);
4766 }
4767 }
4768 vm_object_unlock(object);
4769 }
4770
4771 /*
4772 * Perform the copy if requested
4773 */
4774
4775 if (copy) {
4776 vm_object_t new_object;
4777 vm_object_offset_t new_offset;
4778
4779 result = vm_object_copy_strategically(object,
4780 obj_offs,
4781 map_size,
4782 false, /* forking */
4783 &new_object, &new_offset,
4784 &copy);
4785
4786
4787 if (result == KERN_MEMORY_RESTART_COPY) {
4788 boolean_t success;
4789 boolean_t src_needs_copy;
4790
4791 /*
4792 * XXX
4793 * We currently ignore src_needs_copy.
4794 * This really is the issue of how to make
4795 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4796 * non-kernel users to use. Solution forthcoming.
4797 * In the meantime, since we don't allow non-kernel
4798 * memory managers to specify symmetric copy,
4799 * we won't run into problems here.
4800 */
4801 new_object = object;
4802 new_offset = obj_offs;
4803 success = vm_object_copy_quickly(new_object,
4804 new_offset,
4805 map_size,
4806 &src_needs_copy,
4807 &copy);
4808 assert(success);
4809 result = KERN_SUCCESS;
4810 }
4811 /*
4812 * Throw away the reference to the
4813 * original object, as it won't be mapped.
4814 */
4815
4816 vm_object_deallocate(object);
4817
4818 if (result != KERN_SUCCESS) {
4819 return result;
4820 }
4821
4822 object = new_object;
4823 obj_offs = new_offset;
4824 }
4825
4826 /*
4827 * If non-kernel users want to try to prefault pages, the mapping and
4828 * prefault need to be atomic.
4829 */
4830 kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4831 vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4832
4833 result = vm_map_enter(target_map,
4834 &map_addr, map_size,
4835 (vm_map_offset_t)mask,
4836 vmk_flags,
4837 object, obj_offs,
4838 copy,
4839 cur_protection, max_protection,
4840 inheritance);
4841 if (result != KERN_SUCCESS) {
4842 vm_object_deallocate(object);
4843 }
4844
4845 /*
4846 * Try to prefault, and do not forget to release the vm map lock.
4847 */
4848 if (result == KERN_SUCCESS && try_prefault) {
4849 mach_vm_address_t va = map_addr;
4850 kern_return_t kr = KERN_SUCCESS;
4851 unsigned int i = 0;
4852 int pmap_options;
4853
4854 pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4855
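/*
 * Pre-populate the pmap with the UPL's valid pages. A failure only
 * aborts the prefault optimization, not the mapping itself, unless
 * this is a kernel prefault, in which case the error is returned.
 */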
4856 for (i = 0; i < page_list_count; ++i) {
4857 if (!UPL_VALID_PAGE(page_list, i)) {
4858 if (kernel_prefault) {
4859 assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4860 result = KERN_MEMORY_ERROR;
4861 break;
4862 }
4863 } else {
4864 /*
4865 * If this call failed, we should stop trying to
4866 * optimize, as other calls are likely going to
4867 * fail too.
4868 *
4869 * We are not going to report an error for such a
4870 * failure though: this is an optimization, not
4871 * something critical.
4872 */
4873 kr = pmap_enter_object_options_check(target_map->pmap,
4874 va, 0, object, UPL_PHYS_PAGE(page_list, i),
4875 cur_protection, VM_PROT_NONE,
4876 TRUE, pmap_options);
4877 if (kr != KERN_SUCCESS) {
4878 OSIncrementAtomic64(&vm_prefault_nb_bailout);
4879 if (kernel_prefault) {
4880 result = kr;
4881 }
4882 break;
4883 }
4884 OSIncrementAtomic64(&vm_prefault_nb_pages);
4885 }
4886
4887 /* Next virtual address */
4888 va += PAGE_SIZE;
4889 }
4890 if (vmk_flags.vmkf_keep_map_locked) {
4891 vm_map_unlock(target_map);
4892 }
4893 }
4894
4895 out:
4896 if (result == KERN_SUCCESS) {
4897 #if KASAN
4898 if (target_map->pmap == kernel_pmap) {
4899 kasan_notify_address(map_addr, map_size);
4900 }
4901 #endif
4902 *address_u = vm_sanitize_wrap_addr(map_addr + offset_in_mapping);
4903 }
4904 return result;
4905 }
4906
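/*
 * Thin wrapper around vm_map_enter_mem_object() that always requests a
 * non-copy mapping with VM_INHERIT_DEFAULT and passes a prefault page list.
 */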
4907 kern_return_t
4908 vm_map_enter_mem_object_prefault(
4909 vm_map_t target_map,
4910 vm_map_offset_ut *address,
4911 vm_map_size_ut initial_size,
4912 vm_map_offset_ut mask,
4913 vm_map_kernel_flags_t vmk_flags,
4914 ipc_port_t port,
4915 vm_object_offset_ut offset,
4916 vm_prot_ut cur_protection,
4917 vm_prot_ut max_protection,
4918 upl_page_list_ptr_t page_list,
4919 unsigned int page_list_count)
4920 {
4921 /* range_id is set by vm_map_enter_mem_object */
4922 return vm_map_enter_mem_object(target_map,
4923 address,
4924 initial_size,
4925 mask,
4926 vmk_flags,
4927 port,
4928 offset,
4929 FALSE,
4930 cur_protection,
4931 max_protection,
4932 VM_INHERIT_DEFAULT,
4933 page_list,
4934 page_list_count);
4935 }
4936
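/*
 * Sanitize the caller-provided address, size, mask, offset, protections
 * and inheritance for vm_map_enter_mem_object_control(); offsets are
 * deliberately kept unaligned here (see the comments below).
 */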
4937 static __attribute__((always_inline, warn_unused_result))
4938 kern_return_t
4939 vm_map_enter_mem_object_control_sanitize(
4940 vm_map_t target_map,
4941 vm_map_offset_ut address_u,
4942 vm_map_size_ut initial_size_u,
4943 vm_map_offset_ut mask_u,
4944 vm_object_offset_ut offset_u,
4945 vm_prot_ut cur_protection_u,
4946 vm_prot_ut max_protection_u,
4947 vm_inherit_ut inheritance_u,
4948 vm_map_kernel_flags_t vmk_flags,
4949 vm_map_address_t *map_addr,
4950 vm_map_size_t *map_size,
4951 vm_map_offset_t *mask,
4952 vm_object_offset_t *obj_offs,
4953 vm_object_offset_t *obj_end,
4954 vm_object_size_t *obj_size,
4955 vm_prot_t *cur_protection,
4956 vm_prot_t *max_protection,
4957 vm_inherit_t *inheritance)
4958 {
4959 kern_return_t kr;
4960
4961 kr = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
4962 VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
4963 cur_protection, max_protection);
4964 if (__improbable(kr != KERN_SUCCESS)) {
4965 return kr;
4966 }
4967
4968 kr = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL,
4969 inheritance);
4970 if (__improbable(kr != KERN_SUCCESS)) {
4971 return kr;
4972 }
4973
4974 kr = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, mask);
4975 if (__improbable(kr != KERN_SUCCESS)) {
4976 return kr;
4977 }
4978 /*
4979 * Ensure arithmetic doesn't overflow in vm_object space (kernel
4980 * pages).
4981 * We keep unaligned values for now. The call we eventually make to
4982 * vm_map_enter does guarantee that offset_u is page aligned for EITHER
4983 * target_map pages or kernel pages. But this isn't enough to guarantee
4984 * kernel space alignment.
4985 */
4986 kr = vm_sanitize_addr_size(offset_u, initial_size_u,
4987 VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, PAGE_MASK,
4988 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
4989 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
4990 obj_offs, obj_end, obj_size);
4991 if (__improbable(kr != KERN_SUCCESS)) {
4992 return kr;
4993 }
4994
4995 /*
4996 * There is no vm_sanitize_addr_size variant that also adjusts for
4997 * a separate offset. Rather than create one for this one-off issue,
4998 * we sanitize map_addr and map_size individually, relying on
4999 * vm_sanitize_size to incorporate the offset. Then, we perform the
5000 * overflow check manually below.
5001 */
5002 *map_addr = vm_sanitize_addr(target_map, address_u);
5003 kr = vm_sanitize_size(offset_u, initial_size_u,
5004 VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
5005 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
5006 if (__improbable(kr != KERN_SUCCESS)) {
5007 return kr;
5008 }
5009
5010 /*
5011 * Ensure arithmetic doesn't overflow in target_map space.
5012 * The computation of map_size above accounts for the possibility that
5013 * offset_u might be unaligned in target_map space.
5014 */
5015 if (vmk_flags.vmf_fixed) {
5016 vm_map_address_t map_end;
5017
5018 if (__improbable(os_add_overflow(*map_addr, *map_size, &map_end))) {
5019 return KERN_INVALID_ARGUMENT;
5020 }
5021 }
5022
5023 return KERN_SUCCESS;
5024 }
5025
5026 kern_return_t
5027 vm_map_enter_mem_object_control(
5028 vm_map_t target_map,
5029 vm_map_offset_ut *address_u,
5030 vm_map_size_ut initial_size_u,
5031 vm_map_offset_ut mask_u,
5032 vm_map_kernel_flags_t vmk_flags,
5033 memory_object_control_t control,
5034 vm_object_offset_ut offset_u,
5035 boolean_t needs_copy,
5036 vm_prot_ut cur_protection_u,
5037 vm_prot_ut max_protection_u,
5038 vm_inherit_ut inheritance_u)
5039 {
5040 vm_map_offset_t mask;
5041 vm_prot_t cur_protection;
5042 vm_prot_t max_protection;
5043 vm_inherit_t inheritance;
5044 vm_map_address_t map_addr;
5045 vm_map_size_t map_size;
5046 vm_object_t object;
5047 vm_object_offset_t obj_offs, obj_end;
5048 vm_object_size_t obj_size;
5049 kern_return_t result;
5050 memory_object_t pager;
5051 vm_prot_t pager_prot;
5052 kern_return_t kr;
5053
5054 /*
5055 * Check arguments for validity
5056 */
5057 if (target_map == VM_MAP_NULL) {
5058 return KERN_INVALID_ARGUMENT;
5059 }
5060
5061 /*
5062 * We only support vmf_return_data_addr-like behavior.
5063 */
5064 vmk_flags.vmf_return_data_addr = true;
5065
5066 /*
5067 * Sanitize any input parameters that are addr/size/prot/inherit
5068 */
5069 kr = vm_map_enter_mem_object_control_sanitize(target_map,
5070 *address_u,
5071 initial_size_u,
5072 mask_u,
5073 offset_u,
5074 cur_protection_u,
5075 max_protection_u,
5076 inheritance_u,
5077 vmk_flags,
5078 &map_addr,
5079 &map_size,
5080 &mask,
5081 &obj_offs,
5082 &obj_end,
5083 &obj_size,
5084 &cur_protection,
5085 &max_protection,
5086 &inheritance);
5087 if (__improbable(kr != KERN_SUCCESS)) {
5088 return vm_sanitize_get_kr(kr);
5089 }
5090
5091 object = memory_object_control_to_vm_object(control);
5092
5093 if (object == VM_OBJECT_NULL) {
5094 return KERN_INVALID_OBJECT;
5095 }
5096
5097 if (is_kernel_object(object)) {
5098 printf("Warning: Attempt to map kernel object"
5099 " by a non-private kernel entity\n");
5100 return KERN_INVALID_OBJECT;
5101 }
5102
5103 vm_object_lock(object);
5104 os_ref_retain_locked_raw(&object->ref_count, &vm_object_refgrp);
5105
5106
5107 /*
5108 * For "named" VM objects, let the pager know that the
5109 * memory object is being mapped. Some pagers need to keep
5110 * track of this, to know when they can reclaim the memory
5111 * object, for example.
5112 * VM calls memory_object_map() for each mapping (specifying
5113 * the protection of each mapping) and calls
5114 * memory_object_last_unmap() when all the mappings are gone.
5115 */
5116 pager_prot = max_protection;
5117 if (needs_copy) {
5118 pager_prot &= ~VM_PROT_WRITE;
5119 }
5120 pager = object->pager;
5121 if (object->named &&
5122 pager != MEMORY_OBJECT_NULL &&
5123 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
5124 assert(object->pager_ready);
5125 vm_object_mapping_wait(object, THREAD_UNINT);
5126 /* object might have lost its pager while waiting */
5127 pager = object->pager;
5128 if (object->named && pager != MEMORY_OBJECT_NULL) {
5129 vm_object_mapping_begin(object);
5130 vm_object_unlock(object);
5131
5132 kr = memory_object_map(pager, pager_prot);
5133 assert(kr == KERN_SUCCESS);
5134
5135 vm_object_lock(object);
5136 vm_object_mapping_end(object);
5137 }
5138 }
5139 vm_object_unlock(object);
5140
5141 /*
5142 * Perform the copy if requested
5143 */
5144
5145 if (needs_copy) {
5146 vm_object_t new_object;
5147 vm_object_offset_t new_offset;
5148
5149 result = vm_object_copy_strategically(object, obj_offs, obj_size,
5150 false, /* forking */
5151 &new_object, &new_offset,
5152 &needs_copy);
5153
5154
5155 if (result == KERN_MEMORY_RESTART_COPY) {
5156 boolean_t success;
5157 boolean_t src_needs_copy;
5158
5159 /*
5160 * XXX
5161 * We currently ignore src_needs_copy.
5162 * This really is the issue of how to make
5163 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
5164 * non-kernel users to use. Solution forthcoming.
5165 * In the meantime, since we don't allow non-kernel
5166 * memory managers to specify symmetric copy,
5167 * we won't run into problems here.
5168 */
5169 new_object = object;
5170 new_offset = obj_offs;
5171 success = vm_object_copy_quickly(new_object,
5172 new_offset, obj_size,
5173 &src_needs_copy,
5174 &needs_copy);
5175 assert(success);
5176 result = KERN_SUCCESS;
5177 }
5178 /*
5179 * Throw away the reference to the
5180 * original object, as it won't be mapped.
5181 */
5182
5183 vm_object_deallocate(object);
5184
5185 if (result != KERN_SUCCESS) {
5186 return result;
5187 }
5188
5189 object = new_object;
5190 obj_offs = new_offset;
5191 }
5192
5193 result = vm_map_enter(target_map,
5194 &map_addr, map_size,
5195 (vm_map_offset_t)mask,
5196 vmk_flags,
5197 object,
5198 obj_offs,
5199 needs_copy,
5200 cur_protection, max_protection,
5201 inheritance);
5202
5203 if (result == KERN_SUCCESS) {
5204 *address_u = vm_sanitize_wrap_addr(
5205 map_addr + (obj_offs & vm_map_page_mask(target_map)));
5206 } else {
5207 vm_object_deallocate(object);
5208 }
5209
5210 return result;
5211 }
5212
5213
5214 /* Not used without nested pmaps */
5215 #ifndef NO_NESTED_PMAP
5216 /*
5217 * Clip and unnest a portion of a nested submap mapping.
5218 */
5219
5220
5221 static void
5222 vm_map_clip_unnest(
5223 vm_map_t map,
5224 vm_map_entry_t entry,
5225 vm_map_offset_t start_unnest,
5226 vm_map_offset_t end_unnest)
5227 {
5228 vm_map_offset_t old_start_unnest = start_unnest;
5229 vm_map_offset_t old_end_unnest = end_unnest;
5230
5231 assert(entry->is_sub_map);
5232 assert(VME_SUBMAP(entry) != NULL);
5233 assert(entry->use_pmap);
5234
5235 /*
5236 * Query the platform for the optimal unnest range.
5237 * DRK: There's some duplication of effort here, since
5238 * callers may have adjusted the range to some extent. This
5239 * routine was introduced to support 1GiB subtree nesting
5240 * for x86 platforms, which can also nest on 2MiB boundaries
5241 * depending on size/alignment.
5242 */
5243 if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
5244 assert(VME_SUBMAP(entry)->is_nested_map);
5245 assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
5246 log_unnest_badness(map,
5247 old_start_unnest,
5248 old_end_unnest,
5249 VME_SUBMAP(entry)->is_nested_map,
5250 (entry->vme_start +
5251 VME_SUBMAP(entry)->lowest_unnestable_start -
5252 VME_OFFSET(entry)));
5253 }
5254
5255 if (entry->vme_start > start_unnest ||
5256 entry->vme_end < end_unnest) {
5257 panic("vm_map_clip_unnest(0x%llx,0x%llx): "
5258 "bad nested entry: start=0x%llx end=0x%llx\n",
5259 (long long)start_unnest, (long long)end_unnest,
5260 (long long)entry->vme_start, (long long)entry->vme_end);
5261 }
5262
5263 if (start_unnest > entry->vme_start) {
5264 _vm_map_clip_start(&map->hdr,
5265 entry,
5266 start_unnest);
5267 if (map->holelistenabled) {
5268 vm_map_store_update_first_free(map, NULL, FALSE);
5269 } else {
5270 vm_map_store_update_first_free(map, map->first_free, FALSE);
5271 }
5272 }
5273 if (entry->vme_end > end_unnest) {
5274 _vm_map_clip_end(&map->hdr,
5275 entry,
5276 end_unnest);
5277 if (map->holelistenabled) {
5278 vm_map_store_update_first_free(map, NULL, FALSE);
5279 } else {
5280 vm_map_store_update_first_free(map, map->first_free, FALSE);
5281 }
5282 }
5283
5284 pmap_unnest(map->pmap,
5285 entry->vme_start,
5286 entry->vme_end - entry->vme_start);
5287 if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
5288 /* clean up parent map/maps */
5289 vm_map_submap_pmap_clean(
5290 map, entry->vme_start,
5291 entry->vme_end,
5292 VME_SUBMAP(entry),
5293 VME_OFFSET(entry));
5294 }
5295 entry->use_pmap = FALSE;
5296 if ((map->pmap != kernel_pmap) &&
5297 (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
5298 VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
5299 }
5300 }
5301 #endif /* NO_NESTED_PMAP */
5302
5303 __abortlike
5304 static void
5305 __vm_map_clip_atomic_entry_panic(
5306 vm_map_t map,
5307 vm_map_entry_t entry,
5308 vm_map_offset_t where)
5309 {
5310 panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
5311 "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
5312 (uint64_t)entry->vme_start,
5313 (uint64_t)entry->vme_end,
5314 (uint64_t)where);
5315 }
5316
5317 /*
5318 * vm_map_clip_start: [ internal use only ]
5319 *
5320 * Asserts that the given entry begins at or after
5321 * the specified address; if necessary,
5322 * it splits the entry into two.
5323 */
5324 void
5325 vm_map_clip_start(
5326 vm_map_t map,
5327 vm_map_entry_t entry,
5328 vm_map_offset_t startaddr)
5329 {
5330 #ifndef NO_NESTED_PMAP
5331 if (entry->is_sub_map &&
5332 entry->use_pmap &&
5333 startaddr >= entry->vme_start) {
5334 vm_map_offset_t start_unnest, end_unnest;
5335
5336 /*
5337 * Make sure "startaddr" is no longer in a nested range
5338 * before we clip. Unnest only the minimum range the platform
5339 * can handle.
5340 * vm_map_clip_unnest may perform additional adjustments to
5341 * the unnest range.
5342 */
5343 start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
5344 end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
5345 vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5346 }
5347 #endif /* NO_NESTED_PMAP */
5348 if (startaddr > entry->vme_start) {
5349 if (!entry->is_sub_map &&
5350 VME_OBJECT(entry) &&
5351 VME_OBJECT(entry)->phys_contiguous) {
5352 pmap_remove(map->pmap,
5353 (addr64_t)(entry->vme_start),
5354 (addr64_t)(entry->vme_end));
5355 }
5356 if (entry->vme_atomic) {
5357 __vm_map_clip_atomic_entry_panic(map, entry, startaddr);
5358 }
5359
5360 DTRACE_VM5(
5361 vm_map_clip_start,
5362 vm_map_t, map,
5363 vm_map_offset_t, entry->vme_start,
5364 vm_map_offset_t, entry->vme_end,
5365 vm_map_offset_t, startaddr,
5366 int, VME_ALIAS(entry));
5367
5368 _vm_map_clip_start(&map->hdr, entry, startaddr);
5369 if (map->holelistenabled) {
5370 vm_map_store_update_first_free(map, NULL, FALSE);
5371 } else {
5372 vm_map_store_update_first_free(map, map->first_free, FALSE);
5373 }
5374 }
5375 }
5376
5377
5378 #define vm_map_copy_clip_start(copy, entry, startaddr) \
5379 MACRO_BEGIN \
5380 if ((startaddr) > (entry)->vme_start) \
5381 _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
5382 MACRO_END
5383
5384 /*
5385 * This routine is called only when it is known that
5386 * the entry must be split.
5387 */
5388 static void
5389 _vm_map_clip_start(
5390 struct vm_map_header *map_header,
5391 vm_map_entry_t entry,
5392 vm_map_offset_t start)
5393 {
5394 vm_map_entry_t new_entry;
5395
5396 /*
5397 * Split off the front portion --
5398 * note that we must insert the new
5399 * entry BEFORE this one, so that
5400 * this entry has the specified starting
5401 * address.
5402 */
5403
5404 if (entry->map_aligned) {
5405 assert(VM_MAP_PAGE_ALIGNED(start,
5406 VM_MAP_HDR_PAGE_MASK(map_header)));
5407 }
5408
5409 new_entry = _vm_map_entry_create(map_header);
5410 vm_map_entry_copy_full(new_entry, entry);
5411
5412 new_entry->vme_end = start;
5413 assert(new_entry->vme_start < new_entry->vme_end);
5414 VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
5415 if (__improbable(start >= entry->vme_end)) {
5416 panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start);
5417 }
5418 assert(start < entry->vme_end);
5419 entry->vme_start = start;
5420
5421 #if VM_BTLOG_TAGS
5422 if (new_entry->vme_kernel_object) {
5423 btref_retain(new_entry->vme_tag_btref);
5424 }
5425 #endif /* VM_BTLOG_TAGS */
5426
5427 _vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);
5428
5429 if (entry->is_sub_map) {
5430 vm_map_reference(VME_SUBMAP(new_entry));
5431 } else {
5432 vm_object_reference(VME_OBJECT(new_entry));
5433 }
5434 }
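/*
 * Illustrative sketch (not part of the build, addresses made up for the
 * example): given an entry covering [0x1000, 0x5000) with object offset 0,
 * clipping at start = 0x3000 leaves two entries in the header:
 *
 *	new_entry:  [0x1000, 0x3000)   offset 0x0
 *	entry:      [0x3000, 0x5000)   offset 0x2000
 *
 * The new entry is linked BEFORE the original one, so the original entry
 * ends up starting exactly at the requested address, and both entries keep
 * their own reference on the same object or submap.
 */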
5435
5436
5437 /*
5438 * vm_map_clip_end: [ internal use only ]
5439 *
5440 * Asserts that the given entry ends at or before
5441 * the specified address; if necessary,
5442 * it splits the entry into two.
5443 */
5444 void
5445 vm_map_clip_end(
5446 vm_map_t map,
5447 vm_map_entry_t entry,
5448 vm_map_offset_t endaddr)
5449 {
5450 if (endaddr > entry->vme_end) {
5451 /*
5452 * Within the scope of this clipping, limit "endaddr" to
5453 * the end of this map entry...
5454 */
5455 endaddr = entry->vme_end;
5456 }
5457 #ifndef NO_NESTED_PMAP
5458 if (entry->is_sub_map && entry->use_pmap) {
5459 vm_map_offset_t start_unnest, end_unnest;
5460
5461 /*
5462 * Make sure the range between the start of this entry and
5463 * the new "endaddr" is no longer nested before we clip.
5464 * Unnest only the minimum range the platform can handle.
5465 * vm_map_clip_unnest may perform additional adjustments to
5466 * the unnest range.
5467 */
5468 start_unnest = entry->vme_start;
5469 end_unnest =
5470 (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
5471 ~(pmap_shared_region_size_min(map->pmap) - 1);
5472 vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5473 }
5474 #endif /* NO_NESTED_PMAP */
5475 if (endaddr < entry->vme_end) {
5476 if (!entry->is_sub_map &&
5477 VME_OBJECT(entry) &&
5478 VME_OBJECT(entry)->phys_contiguous) {
5479 pmap_remove(map->pmap,
5480 (addr64_t)(entry->vme_start),
5481 (addr64_t)(entry->vme_end));
5482 }
5483 if (entry->vme_atomic) {
5484 __vm_map_clip_atomic_entry_panic(map, entry, endaddr);
5485 }
5486 DTRACE_VM5(
5487 vm_map_clip_end,
5488 vm_map_t, map,
5489 vm_map_offset_t, entry->vme_start,
5490 vm_map_offset_t, entry->vme_end,
5491 vm_map_offset_t, endaddr,
5492 int, VME_ALIAS(entry));
5493
5494 _vm_map_clip_end(&map->hdr, entry, endaddr);
5495 if (map->holelistenabled) {
5496 vm_map_store_update_first_free(map, NULL, FALSE);
5497 } else {
5498 vm_map_store_update_first_free(map, map->first_free, FALSE);
5499 }
5500 }
5501 }
5502
5503
5504 #define vm_map_copy_clip_end(copy, entry, endaddr) \
5505 MACRO_BEGIN \
5506 if ((endaddr) < (entry)->vme_end) \
5507 _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
5508 MACRO_END
5509
5510 /*
5511 * This routine is called only when it is known that
5512 * the entry must be split.
5513 */
5514 static void
5515 _vm_map_clip_end(
5516 struct vm_map_header *map_header,
5517 vm_map_entry_t entry,
5518 vm_map_offset_t end)
5519 {
5520 vm_map_entry_t new_entry;
5521
5522 /*
5523 * Create a new entry and insert it
5524 * AFTER the specified entry
5525 */
5526
5527 if (entry->map_aligned) {
5528 assert(VM_MAP_PAGE_ALIGNED(end,
5529 VM_MAP_HDR_PAGE_MASK(map_header)));
5530 }
5531
5532 new_entry = _vm_map_entry_create(map_header);
5533 vm_map_entry_copy_full(new_entry, entry);
5534
5535 if (__improbable(end <= entry->vme_start)) {
5536 panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end);
5537 }
5538 assert(entry->vme_start < end);
5539 new_entry->vme_start = entry->vme_end = end;
5540 VME_OFFSET_SET(new_entry,
5541 VME_OFFSET(new_entry) + (end - entry->vme_start));
5542 assert(new_entry->vme_start < new_entry->vme_end);
5543
5544 #if VM_BTLOG_TAGS
5545 if (new_entry->vme_kernel_object) {
5546 btref_retain(new_entry->vme_tag_btref);
5547 }
5548 #endif /* VM_BTLOG_TAGS */
5549
5550 _vm_map_store_entry_link(map_header, entry, new_entry);
5551
5552 if (entry->is_sub_map) {
5553 vm_map_reference(VME_SUBMAP(new_entry));
5554 } else {
5555 vm_object_reference(VME_OBJECT(new_entry));
5556 }
5557 }
5558
5559
5560 /*
5561 * VM_MAP_RANGE_CHECK: [ internal use only ]
5562 *
5563 * Clamps the starting and ending region
5564 * addresses to the valid range of the map.
5565 */
5566 #define VM_MAP_RANGE_CHECK(map, start, end) \
5567 MACRO_BEGIN \
5568 if (start < vm_map_min(map)) \
5569 start = vm_map_min(map); \
5570 if (end > vm_map_max(map)) \
5571 end = vm_map_max(map); \
5572 if (start > end) \
5573 start = end; \
5574 MACRO_END
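/*
 * Example of the clamping behavior (values are illustrative only): with a
 * map whose valid range is [0x100000, 0x7fff0000), a caller passing
 * start = 0x0 and end = 0x80000000 ends up with start = 0x100000 and
 * end = 0x7fff0000. A request entirely outside the map degenerates to an
 * empty range (start == end) rather than an error; callers that need a
 * hard failure use vm_map_range_check() below instead.
 */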
5575
5576 /*
5577 * vm_map_range_check: [ internal use only ]
5578 *
5579 * Check that the region defined by the specified start and
5580 * end addresses is wholly contained within a single map
5581 * entry or set of adjacent map entries of the specified map,
5582 * i.e. the specified region contains no unmapped space.
5583 * If any or all of the region is unmapped, FALSE is returned.
5584 * Otherwise, TRUE is returned and if the output argument 'entry'
5585 * is not NULL it points to the map entry containing the start
5586 * of the region.
5587 *
5588 * The map is locked for reading on entry and is left locked.
5589 */
5590 static boolean_t
5591 vm_map_range_check(
5592 vm_map_t map,
5593 vm_map_offset_t start,
5594 vm_map_offset_t end,
5595 vm_map_entry_t *entry)
5596 {
5597 vm_map_entry_t cur;
5598 vm_map_offset_t prev;
5599
5600 /*
5601 * Basic sanity checks first
5602 */
5603 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5604 return FALSE;
5605 }
5606
5607 /*
5608 * Check first if the region starts within a valid
5609 * mapping for the map.
5610 */
5611 if (!vm_map_lookup_entry(map, start, &cur)) {
5612 return FALSE;
5613 }
5614
5615 /*
5616 * Optimize for the case that the region is contained
5617 * in a single map entry.
5618 */
5619 if (entry != (vm_map_entry_t *) NULL) {
5620 *entry = cur;
5621 }
5622 if (end <= cur->vme_end) {
5623 return TRUE;
5624 }
5625
5626 /*
5627 * If the region is not wholly contained within a
5628 * single entry, walk the entries looking for holes.
5629 */
5630 prev = cur->vme_end;
5631 cur = cur->vme_next;
5632 while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5633 if (end <= cur->vme_end) {
5634 return TRUE;
5635 }
5636 prev = cur->vme_end;
5637 cur = cur->vme_next;
5638 }
5639 return FALSE;
5640 }
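/*
 * Minimal usage sketch, compiled out: how a caller inside this file might
 * verify that a candidate range is fully backed by adjacent entries before
 * operating on it. The function name and range values are placeholders;
 * the point of the sketch is the locking discipline (hold the map lock
 * across the check and the subsequent use of "entry").
 */
#if 0 /* illustrative only */
static kern_return_t
example_check_range(vm_map_t map, vm_map_offset_t start, vm_map_offset_t end)
{
	vm_map_entry_t  entry;
	kern_return_t   kr = KERN_SUCCESS;

	vm_map_lock(map);
	if (!vm_map_range_check(map, start, end, &entry)) {
		/* some part of [start, end) is unmapped */
		kr = KERN_INVALID_ADDRESS;
	} else {
		/* "entry" covers "start"; walk forward from it as needed */
	}
	vm_map_unlock(map);
	return kr;
}
#endif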
5641
5642 static __attribute__((always_inline, warn_unused_result))
5643 kern_return_t
5644 vm_map_protect_sanitize(
5645 vm_map_t map,
5646 vm_map_offset_ut start_u,
5647 vm_map_offset_ut end_u,
5648 vm_prot_ut new_prot_u,
5649 vm_map_offset_t *start,
5650 vm_map_offset_t *end,
5651 vm_prot_t *new_prot)
5652 {
5653 kern_return_t kr;
5654 vm_map_size_t size;
5655
5656 kr = vm_sanitize_prot(new_prot_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT,
5657 map, VM_PROT_COPY, new_prot);
5658 if (__improbable(kr != KERN_SUCCESS)) {
5659 return kr;
5660 }
5661
5662 kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT,
5663 map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, &size);
5664 if (__improbable(kr != KERN_SUCCESS)) {
5665 return kr;
5666 }
5667
5668 return KERN_SUCCESS;
5669 }
5670
5671 /*
5672 * vm_map_protect:
5673 *
5674 * Sets the protection of the specified address
5675 * region in the target map. If "set_max" is
5676 * specified, the maximum protection is to be set;
5677 * otherwise, only the current protection is affected.
5678 */
5679 kern_return_t
5680 vm_map_protect(
5681 vm_map_t map,
5682 vm_map_offset_ut start_u,
5683 vm_map_offset_ut end_u,
5684 boolean_t set_max,
5685 vm_prot_ut new_prot_u)
5686 {
5687 vm_map_entry_t current;
5688 vm_map_offset_t prev;
5689 vm_map_entry_t entry;
5690 vm_prot_t new_prot;
5691 vm_prot_t new_max;
5692 int pmap_options = 0;
5693 kern_return_t kr;
5694 vm_map_offset_t start, original_start;
5695 vm_map_offset_t end;
5696
5697 kr = vm_map_protect_sanitize(map,
5698 start_u,
5699 end_u,
5700 new_prot_u,
5701 &start,
5702 &end,
5703 &new_prot);
5704 if (__improbable(kr != KERN_SUCCESS)) {
5705 return vm_sanitize_get_kr(kr);
5706 }
5707 original_start = start;
5708
5709 if (new_prot & VM_PROT_COPY) {
5710 vm_map_offset_t new_start;
5711 vm_prot_t cur_prot, max_prot;
5712 vm_map_kernel_flags_t kflags;
5713
5714 /* LP64todo - see below */
5715 if (start >= map->max_offset) {
5716 return KERN_INVALID_ADDRESS;
5717 }
5718
5719 if ((new_prot & VM_PROT_ALLEXEC) &&
5720 map->pmap != kernel_pmap &&
5721 (vm_map_cs_enforcement(map)
5722 #if XNU_TARGET_OS_OSX && __arm64__
5723 || !VM_MAP_IS_EXOTIC(map)
5724 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
5725 ) &&
5726 VM_MAP_POLICY_WX_FAIL(map)) {
5727 DTRACE_VM3(cs_wx,
5728 uint64_t, (uint64_t) start,
5729 uint64_t, (uint64_t) end,
5730 vm_prot_t, new_prot);
5731 printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5732 proc_selfpid(),
5733 (get_bsdtask_info(current_task())
5734 ? proc_name_address(get_bsdtask_info(current_task()))
5735 : "?"),
5736 __FUNCTION__, __LINE__,
5737 #if DEVELOPMENT || DEBUG
5738 (uint64_t)start,
5739 (uint64_t)end,
5740 #else /* DEVELOPMENT || DEBUG */
5741 (uint64_t)0,
5742 (uint64_t)0,
5743 #endif /* DEVELOPMENT || DEBUG */
5744 new_prot);
5745 return KERN_PROTECTION_FAILURE;
5746 }
5747
5748 /*
5749 * Let vm_map_remap_extract() know that it will need to:
5750 * + make a copy of the mapping
5751 * + add VM_PROT_WRITE to the max protections
5752 * + remove any protections that are no longer allowed from the
5753 * max protections (to avoid any WRITE/EXECUTE conflict, for
5754 * example).
5755 * Note that "max_prot" is an IN/OUT parameter only for this
5756 * specific (VM_PROT_COPY) case. It's usually an OUT parameter
5757 * only.
5758 */
5759 max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
5760 cur_prot = VM_PROT_NONE;
5761 kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
5762 kflags.vmkf_remap_prot_copy = true;
5763 kflags.vmkf_tpro_enforcement_override = !vm_map_tpro_enforcement(map);
5764 new_start = start;
5765 kr = vm_map_remap(map,
5766 vm_sanitize_wrap_addr_ref(&new_start),
5767 end - start,
5768 0, /* mask */
5769 kflags,
5770 map,
5771 start,
5772 TRUE, /* copy-on-write remapping! */
5773 vm_sanitize_wrap_prot_ref(&cur_prot), /* IN/OUT */
5774 vm_sanitize_wrap_prot_ref(&max_prot), /* IN/OUT */
5775 VM_INHERIT_DEFAULT);
5776 if (kr != KERN_SUCCESS) {
5777 return kr;
5778 }
5779 new_prot &= ~VM_PROT_COPY;
5780 }
5781
5782 vm_map_lock(map);
5783 restart_after_unlock:
5784
5785 /* LP64todo - remove this check when vm_map_commpage64()
5786 * no longer has to stuff in a map_entry for the commpage
5787 * above the map's max_offset.
5788 */
5789 if (start >= map->max_offset) {
5790 vm_map_unlock(map);
5791 return KERN_INVALID_ADDRESS;
5792 }
5793
5794 while (1) {
5795 /*
5796 * Lookup the entry. If it doesn't start in a valid
5797 * entry, return an error.
5798 */
5799 if (!vm_map_lookup_entry(map, start, &entry)) {
5800 vm_map_unlock(map);
5801 return KERN_INVALID_ADDRESS;
5802 }
5803
5804 if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
5805 start = SUPERPAGE_ROUND_DOWN(start);
5806 continue;
5807 }
5808 break;
5809 }
5810 if (entry->superpage_size) {
5811 end = SUPERPAGE_ROUND_UP(end);
5812 }
5813
5814 /*
5815 * Make a first pass to check for protection and address
5816 * violations.
5817 */
5818
5819 current = entry;
5820 prev = current->vme_start;
5821 while ((current != vm_map_to_entry(map)) &&
5822 (current->vme_start < end)) {
5823 /*
5824 * If there is a hole, return an error.
5825 */
5826 if (current->vme_start != prev) {
5827 vm_map_unlock(map);
5828 return KERN_INVALID_ADDRESS;
5829 }
5830
5831 new_max = current->max_protection;
5832
5833 #if defined(__x86_64__)
5834 /* Allow max mask to include execute prot bits if this map doesn't enforce CS */
5835 if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
5836 new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
5837 }
5838 #elif CODE_SIGNING_MONITOR
5839 if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
5840 new_max |= VM_PROT_EXECUTE;
5841 }
5842 #endif
5843 if ((new_prot & new_max) != new_prot) {
5844 vm_map_unlock(map);
5845 return KERN_PROTECTION_FAILURE;
5846 }
5847
5848 if (current->used_for_jit &&
5849 pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
5850 vm_map_unlock(map);
5851 return KERN_PROTECTION_FAILURE;
5852 }
5853
5854 #if __arm64e__
5855 /* Disallow protecting hw assisted TPRO mappings */
5856 if (current->used_for_tpro) {
5857 vm_map_unlock(map);
5858 return KERN_PROTECTION_FAILURE;
5859 }
5860 #endif /* __arm64e__ */
5861
5862
5863 if ((new_prot & VM_PROT_WRITE) &&
5864 (new_prot & VM_PROT_ALLEXEC) &&
5865 #if XNU_TARGET_OS_OSX
5866 map->pmap != kernel_pmap &&
5867 (vm_map_cs_enforcement(map)
5868 #if __arm64__
5869 || !VM_MAP_IS_EXOTIC(map)
5870 #endif /* __arm64__ */
5871 ) &&
5872 #endif /* XNU_TARGET_OS_OSX */
5873 #if CODE_SIGNING_MONITOR
5874 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
5875 #endif
5876 !(current->used_for_jit)) {
5877 DTRACE_VM3(cs_wx,
5878 uint64_t, (uint64_t) current->vme_start,
5879 uint64_t, (uint64_t) current->vme_end,
5880 vm_prot_t, new_prot);
5881 printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5882 proc_selfpid(),
5883 (get_bsdtask_info(current_task())
5884 ? proc_name_address(get_bsdtask_info(current_task()))
5885 : "?"),
5886 __FUNCTION__, __LINE__,
5887 #if DEVELOPMENT || DEBUG
5888 (uint64_t)current->vme_start,
5889 (uint64_t)current->vme_end,
5890 #else /* DEVELOPMENT || DEBUG */
5891 (uint64_t)0,
5892 (uint64_t)0,
5893 #endif /* DEVELOPMENT || DEBUG */
5894 new_prot);
5895 new_prot &= ~VM_PROT_ALLEXEC;
5896 if (VM_MAP_POLICY_WX_FAIL(map)) {
5897 vm_map_unlock(map);
5898 return KERN_PROTECTION_FAILURE;
5899 }
5900 }
5901
5902 /*
5903 * If the task has requested executable lockdown,
5904 * deny both:
5905 * - adding executable protections OR
5906 * - adding write protections to an existing executable mapping.
5907 */
5908 if (map->map_disallow_new_exec == TRUE) {
5909 if ((new_prot & VM_PROT_ALLEXEC) ||
5910 ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
5911 vm_map_unlock(map);
5912 return KERN_PROTECTION_FAILURE;
5913 }
5914 }
5915
5916 prev = current->vme_end;
5917 current = current->vme_next;
5918 }
5919
5920 #if __arm64__
5921 if (end > prev &&
5922 end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
5923 vm_map_entry_t prev_entry;
5924
5925 prev_entry = current->vme_prev;
5926 if (prev_entry != vm_map_to_entry(map) &&
5927 !prev_entry->map_aligned &&
5928 (vm_map_round_page(prev_entry->vme_end,
5929 VM_MAP_PAGE_MASK(map))
5930 == end)) {
5931 /*
5932 * The last entry in our range is not "map-aligned"
5933 * but it would have reached all the way to "end"
5934 * if it had been map-aligned, so this is not really
5935 * a hole in the range and we can proceed.
5936 */
5937 prev = end;
5938 }
5939 }
5940 #endif /* __arm64__ */
5941
5942 if (end > prev) {
5943 vm_map_unlock(map);
5944 return KERN_INVALID_ADDRESS;
5945 }
5946
5947 /*
5948 * Go back and fix up protections.
5949 * Clip to start here if the range starts within
5950 * the entry.
5951 */
5952
5953 current = entry;
5954 if (current != vm_map_to_entry(map)) {
5955 /* clip and unnest if necessary */
5956 vm_map_clip_start(map, current, start);
5957 }
5958
5959 while ((current != vm_map_to_entry(map)) &&
5960 (current->vme_start < end)) {
5961 vm_prot_t old_prot;
5962
5963 if (current->in_transition) {
5964 wait_result_t wait_result;
5965 vm_map_offset_t current_start;
5966
5967 /*
5968 * Another thread is wiring/unwiring this entry.
5969 * Let the other thread know we are waiting.
5970 */
5971 current_start = current->vme_start;
5972 current->needs_wakeup = true;
5973 /* wait for the other thread to be done */
5974 wait_result = vm_map_entry_wait(map, TH_UNINT);
5975 /*
5976 * We unlocked the map, so anything could have changed in the
5977 * range and we need to re-check from "current_start" to "end".
5978 * Our entries might no longer be valid.
5979 */
5980 current = NULL;
5981 entry = NULL;
5982 /*
5983 * Re-lookup and re-clip "current_start".
5984 * If it's no longer mapped,
5985 */
5986 vm_map_lookup_entry_or_next(map, current_start, &current);
5987 if (current != vm_map_to_entry(map)) {
5988 vm_map_clip_start(map, current, current_start);
5989 }
5990 /* restart from this point */
5991 start = current_start;
5992 goto restart_after_unlock;
5993 }
5994
5995 vm_map_clip_end(map, current, end);
5996
5997 #if DEVELOPMENT || DEBUG
5998 if (current->csm_associated && vm_log_xnu_user_debug) {
5999 printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
6000 proc_selfpid(),
6001 (get_bsdtask_info(current_task())
6002 ? proc_name_address(get_bsdtask_info(current_task()))
6003 : "?"),
6004 __FUNCTION__,
6005 (uint64_t)start,
6006 (uint64_t)end,
6007 new_prot,
6008 map, current,
6009 current->vme_start,
6010 current->vme_end,
6011 current->protection,
6012 current->max_protection);
6013 }
6014 #endif /* DEVELOPMENT || DEBUG */
6015
6016 if (current->is_sub_map) {
6017 /* clipping did unnest if needed */
6018 assert(!current->use_pmap);
6019 }
6020
6021 old_prot = current->protection;
6022
6023 if (set_max) {
6024 current->max_protection = new_prot;
6025 /* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
6026 current->protection = (new_prot & old_prot);
6027 } else {
6028 current->protection = new_prot;
6029 }
6030
6031 #if CODE_SIGNING_MONITOR
6032 if (/* a !csm_associated mapping becoming executable */
6033 ((!current->csm_associated &&
6034 !(old_prot & VM_PROT_EXECUTE) &&
6035 (current->protection & VM_PROT_EXECUTE))
6036 ||
6037 /* a csm_associated mapping becoming writable */
6038 (current->csm_associated &&
6039 !(old_prot & VM_PROT_WRITE) &&
6040 (current->protection & VM_PROT_WRITE)))) {
6041 /*
6042 * This mapping has not already been marked as
6043 * "user_debug" and it is either:
6044 * 1. not code-signing-monitored and becoming executable
6045 * 2. code-signing-monitored and becoming writable,
6046 * so inform the CodeSigningMonitor and mark the
6047 * mapping as "user_debug" if appropriate.
6048 */
6049 vm_map_kernel_flags_t vmk_flags;
6050 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
6051 /* pretend it's a vm_protect(VM_PROT_COPY)... */
6052 vmk_flags.vmkf_remap_prot_copy = true;
6053 kr = vm_map_entry_cs_associate(map, current, vmk_flags);
6054 #if DEVELOPMENT || DEBUG
6055 if (vm_log_xnu_user_debug) {
6056 printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
6057 proc_selfpid(),
6058 (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
6059 __FUNCTION__, __LINE__,
6060 map, current,
6061 current->vme_start, current->vme_end,
6062 old_prot, current->protection,
6063 kr, current->vme_xnu_user_debug);
6064 }
6065 #endif /* DEVELOPMENT || DEBUG */
6066 }
6067 #endif /* CODE_SIGNING_MONITOR */
6068
6069 /*
6070 * Update physical map if necessary.
6071 * If the request is to turn off write protection,
6072 * we won't do it for real (in pmap). This is because
6073 * it would cause copy-on-write to fail. We've already
6074 * set the new protection in the map, so if a
6075 * write-protect fault occurs, it will be fixed up
6076 * properly, COW or not.
6077 */
6078 if (current->protection != old_prot) {
6079 /* Look one level in; we support nested pmaps */
6080 /* from mapped submaps which are direct entries */
6081 /* in our map */
6082
6083 vm_prot_t prot;
6084
6085 prot = current->protection;
6086 if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
6087 prot &= ~VM_PROT_WRITE;
6088 } else {
6089 assert(!VME_OBJECT(current)->code_signed);
6090 assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6091 if (prot & VM_PROT_WRITE) {
6092 /*
6093 * For write requests on the
6094 * compressor, we will ask the
6095 * pmap layer to prevent us from
6096 * taking a write fault when we
6097 * attempt to access the mapping
6098 * next.
6099 */
6100 pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
6101 }
6102 }
6103
6104 if (override_nx(map, VME_ALIAS(current)) && prot) {
6105 prot |= VM_PROT_EXECUTE;
6106 }
6107
6108 #if DEVELOPMENT || DEBUG
6109 if (!(old_prot & VM_PROT_EXECUTE) &&
6110 (prot & VM_PROT_EXECUTE) &&
6111 panic_on_unsigned_execute &&
6112 (proc_selfcsflags() & CS_KILL)) {
6113 panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
6114 }
6115 #endif /* DEVELOPMENT || DEBUG */
6116
6117 if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
6118 if (current->wired_count) {
6119 panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
6120 map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
6121 }
6122
6123 /* If the pmap layer cares about this
6124 * protection type, force a fault for
6125 * each page so that vm_fault will
6126 * repopulate the page with the full
6127 * set of protections.
6128 */
6129 /*
6130 * TODO: We don't seem to need this,
6131 * but this is due to an internal
6132 * implementation detail of
6133 * pmap_protect. Do we want to rely
6134 * on this?
6135 */
6136 prot = VM_PROT_NONE;
6137 }
6138
6139 if (current->is_sub_map && current->use_pmap) {
6140 pmap_protect(VME_SUBMAP(current)->pmap,
6141 current->vme_start,
6142 current->vme_end,
6143 prot);
6144 } else {
6145 pmap_protect_options(map->pmap,
6146 current->vme_start,
6147 current->vme_end,
6148 prot,
6149 pmap_options,
6150 NULL);
6151 }
6152 }
6153 current = current->vme_next;
6154 }
6155
6156 if (entry == VM_MAP_ENTRY_NULL) {
6157 /*
6158 * Re-lookup the original start of our range.
6159 * If it's no longer mapped, start with the next mapping.
6160 */
6161 vm_map_lookup_entry_or_next(map, original_start, &entry);
6162 }
6163 current = entry;
6164 while ((current != vm_map_to_entry(map)) &&
6165 (current->vme_start <= end)) {
6166 vm_map_simplify_entry(map, current);
6167 current = current->vme_next;
6168 }
6169
6170 vm_map_unlock(map);
6171 return KERN_SUCCESS;
6172 }
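/*
 * Worked example of the set_max interaction (values are illustrative):
 * suppose an entry currently has protection VM_PROT_READ|VM_PROT_WRITE and
 * max_protection VM_PROT_ALL. A call with set_max == TRUE and
 * new_prot == VM_PROT_READ lowers max_protection to VM_PROT_READ and sets
 * the current protection to (VM_PROT_READ & (VM_PROT_READ|VM_PROT_WRITE))
 * == VM_PROT_READ. With set_max == FALSE the same call leaves
 * max_protection at VM_PROT_ALL and simply sets the current protection to
 * VM_PROT_READ.
 */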
6173
6174 static __attribute__((always_inline, warn_unused_result))
6175 kern_return_t
6176 vm_map_inherit_sanitize(
6177 vm_map_t map,
6178 vm_map_offset_ut start_u,
6179 vm_map_offset_ut end_u,
6180 vm_inherit_ut new_inheritance_u,
6181 vm_map_offset_t *start,
6182 vm_map_offset_t *end,
6183 vm_inherit_t *new_inheritance)
6184 {
6185 kern_return_t kr;
6186 vm_map_size_t size;
6187
6188 kr = vm_sanitize_inherit(new_inheritance_u,
6189 VM_SANITIZE_CALLER_VM_MAP_INHERIT, new_inheritance);
6190 if (__improbable(kr != KERN_SUCCESS)) {
6191 return kr;
6192 }
6193
6194 kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_INHERIT,
6195 map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, &size);
6196 if (__improbable(kr != KERN_SUCCESS)) {
6197 return kr;
6198 }
6199
6200 return KERN_SUCCESS;
6201 }
6202
6203 /*
6204 * vm_map_inherit:
6205 *
6206 * Sets the inheritance of the specified address
6207 * range in the target map. Inheritance
6208 * affects how the map will be shared with
6209 * child maps at the time of vm_map_fork.
6210 */
6211 kern_return_t
6212 vm_map_inherit(
6213 vm_map_t map,
6214 vm_map_offset_ut start_u,
6215 vm_map_offset_ut end_u,
6216 vm_inherit_ut new_inheritance_u)
6217 {
6218 vm_map_entry_t entry;
6219 vm_map_entry_t temp_entry;
6220 kern_return_t kr;
6221 vm_map_offset_t start;
6222 vm_map_offset_t end;
6223 vm_inherit_t new_inheritance;
6224
6225 kr = vm_map_inherit_sanitize(map,
6226 start_u,
6227 end_u,
6228 new_inheritance_u,
6229 &start,
6230 &end,
6231 &new_inheritance);
6232 if (__improbable(kr != KERN_SUCCESS)) {
6233 return vm_sanitize_get_kr(kr);
6234 }
6235
6236 vm_map_lock(map);
6237
6238 VM_MAP_RANGE_CHECK(map, start, end);
6239
6240 if (vm_map_lookup_entry(map, start, &temp_entry)) {
6241 entry = temp_entry;
6242 } else {
6243 temp_entry = temp_entry->vme_next;
6244 entry = temp_entry;
6245 }
6246
6247 /* first check entire range for entries which can't support the */
6248 /* given inheritance. */
6249 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6250 if (entry->is_sub_map) {
6251 if (new_inheritance == VM_INHERIT_COPY) {
6252 vm_map_unlock(map);
6253 return KERN_INVALID_ARGUMENT;
6254 }
6255 }
6256
6257 entry = entry->vme_next;
6258 }
6259
6260 entry = temp_entry;
6261 if (entry != vm_map_to_entry(map)) {
6262 /* clip and unnest if necessary */
6263 vm_map_clip_start(map, entry, start);
6264 }
6265
6266 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6267 vm_map_clip_end(map, entry, end);
6268 if (entry->is_sub_map) {
6269 /* clip did unnest if needed */
6270 assert(!entry->use_pmap);
6271 }
6272
6273 entry->inheritance = new_inheritance;
6274
6275 entry = entry->vme_next;
6276 }
6277
6278 vm_map_unlock(map);
6279 return KERN_SUCCESS;
6280 }
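/*
 * Example of the effect on fork (illustrative summary): a range marked
 * VM_INHERIT_NONE is left unmapped in the child, VM_INHERIT_SHARE makes
 * parent and child share the same pages, and VM_INHERIT_COPY (the common
 * default) gives the child a copy-on-write copy. This routine is the
 * kernel-side counterpart of the user-visible vm_inherit()/minherit()
 * interfaces.
 */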
6281
6282 /*
6283 * Update the accounting for the amount of wired memory in this map. If the user has
6284 * exceeded the defined limits, then we fail. Wiring on behalf of the kernel never fails.
6285 */
6286
6287 static kern_return_t
6288 add_wire_counts(
6289 vm_map_t map,
6290 vm_map_entry_t entry,
6291 boolean_t user_wire)
6292 {
6293 vm_map_size_t size;
6294
6295 bool first_wire = entry->wired_count == 0 && entry->user_wired_count == 0;
6296
6297 if (user_wire) {
6298 unsigned int total_wire_count = vm_page_wire_count + vm_lopage_free_count;
6299
6300 /*
6301 * We're wiring memory at the request of the user. Check if this is the first time the user is wiring
6302 * this map entry.
6303 */
6304
6305 if (entry->user_wired_count == 0) {
6306 size = entry->vme_end - entry->vme_start;
6307
6308 /*
6309 * Since this is the first time the user is wiring this map entry, check to see if we're
6310 * exceeding the user wire limits. There is a per map limit which is the smaller of either
6311 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value. There is also
6312 * a system-wide limit on the amount of memory all users can wire. If the user is over either
6313 * limit, then we fail.
6314 */
6315
6316 if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6317 size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6318 if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6319 #if DEVELOPMENT || DEBUG
6320 if (panic_on_mlock_failure) {
6321 panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6322 }
6323 #endif /* DEVELOPMENT || DEBUG */
6324 os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6325 } else {
6326 os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6327 #if DEVELOPMENT || DEBUG
6328 if (panic_on_mlock_failure) {
6329 panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6330 }
6331 #endif /* DEVELOPMENT || DEBUG */
6332 }
6333 return KERN_RESOURCE_SHORTAGE;
6334 }
6335
6336 /*
6337 * The first time the user wires an entry, we also increment the wired_count and add this to
6338 * the total that has been wired in the map.
6339 */
6340
6341 if (entry->wired_count >= MAX_WIRE_COUNT) {
6342 return KERN_FAILURE;
6343 }
6344
6345 entry->wired_count++;
6346 map->user_wire_size += size;
6347 }
6348
6349 if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6350 return KERN_FAILURE;
6351 }
6352
6353 entry->user_wired_count++;
6354 } else {
6355 /*
6356 * The kernel is wiring the memory. Just bump the count and continue.
6357 */
6358
6359 if (entry->wired_count >= MAX_WIRE_COUNT) {
6360 panic("vm_map_wire: too many wirings");
6361 }
6362
6363 entry->wired_count++;
6364 }
6365
6366 if (first_wire) {
6367 vme_btref_consider_and_set(entry, __builtin_frame_address(0));
6368 }
6369
6370 return KERN_SUCCESS;
6371 }
6372
6373 /*
6374 * Update the memory wiring accounting now that the given map entry is being unwired.
6375 */
6376
6377 static void
6378 subtract_wire_counts(
6379 vm_map_t map,
6380 vm_map_entry_t entry,
6381 boolean_t user_wire)
6382 {
6383 if (user_wire) {
6384 /*
6385 * We're unwiring memory at the request of the user. See if we're removing the last user wire reference.
6386 */
6387
6388 if (entry->user_wired_count == 1) {
6389 /*
6390 * We're removing the last user wire reference. Decrement the wired_count and the total
6391 * user wired memory for this map.
6392 */
6393
6394 assert(entry->wired_count >= 1);
6395 entry->wired_count--;
6396 map->user_wire_size -= entry->vme_end - entry->vme_start;
6397 }
6398
6399 assert(entry->user_wired_count >= 1);
6400 entry->user_wired_count--;
6401 } else {
6402 /*
6403 * The kernel is unwiring the memory. Just update the count.
6404 */
6405
6406 assert(entry->wired_count >= 1);
6407 entry->wired_count--;
6408 }
6409
6410 vme_btref_consider_and_put(entry);
6411 }
6412
6413 int cs_executable_wire = 0;
6414
6415 static kern_return_t
6416 vm_map_wire_nested(
6417 vm_map_t map,
6418 vm_map_offset_t start,
6419 vm_map_offset_t end,
6420 vm_prot_t caller_prot,
6421 vm_tag_t tag,
6422 boolean_t user_wire,
6423 pmap_t map_pmap,
6424 vm_map_offset_t pmap_addr,
6425 ppnum_t *physpage_p)
6426 {
6427 vm_map_entry_t entry;
6428 vm_prot_t access_type;
6429 struct vm_map_entry *first_entry, tmp_entry;
6430 vm_map_t real_map;
6431 vm_map_offset_t s, e;
6432 kern_return_t rc;
6433 boolean_t need_wakeup;
6434 boolean_t main_map = FALSE;
6435 wait_interrupt_t interruptible_state;
6436 thread_t cur_thread;
6437 unsigned int last_timestamp;
6438 vm_map_size_t size;
6439 boolean_t wire_and_extract;
6440 vm_prot_t extra_prots;
6441
6442 extra_prots = VM_PROT_COPY;
6443 extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6444 #if XNU_TARGET_OS_OSX
6445 if (map->pmap == kernel_pmap ||
6446 !vm_map_cs_enforcement(map)) {
6447 extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6448 }
6449 #endif /* XNU_TARGET_OS_OSX */
6450 #if CODE_SIGNING_MONITOR
6451 if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
6452 extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6453 }
6454 #endif /* CODE_SIGNING_MONITOR */
6455
6456 access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6457
6458 wire_and_extract = FALSE;
6459 if (physpage_p != NULL) {
6460 /*
6461 * The caller wants the physical page number of the
6462 * wired page. We return only one physical page number
6463 * so this works for only one page at a time.
6464 *
6465 * The only caller (vm_map_wire_and_extract)
6466 * guarantees it.
6467 */
6468 assert(end - start == VM_MAP_PAGE_SIZE(map));
6469 wire_and_extract = TRUE;
6470 *physpage_p = 0;
6471 }
6472
6473 VM_MAP_RANGE_CHECK(map, start, end);
6474 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6475 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6476 if (start == end) {
6477 /* We wired what the caller asked for, zero pages */
6478 return KERN_SUCCESS;
6479 }
6480
6481 vm_map_lock(map);
6482 if (map_pmap == NULL) {
6483 main_map = TRUE;
6484 }
6485 last_timestamp = map->timestamp;
6486
6487 need_wakeup = FALSE;
6488 cur_thread = current_thread();
6489
6490 s = start;
6491 rc = KERN_SUCCESS;
6492
6493 if (vm_map_lookup_entry(map, s, &first_entry)) {
6494 entry = first_entry;
6495 /*
6496 * vm_map_clip_start will be done later.
6497 * We don't want to unnest any nested submaps here !
6498 */
6499 } else {
6500 /* Start address is not in map */
6501 rc = KERN_INVALID_ADDRESS;
6502 goto done;
6503 }
6504
6505 while ((entry != vm_map_to_entry(map)) && (s < end)) {
6506 /*
6507 * At this point, we have wired from "start" to "s".
6508 * We still need to wire from "s" to "end".
6509 *
6510 * "entry" hasn't been clipped, so it could start before "s"
6511 * and/or end after "end".
6512 */
6513
6514 /* "e" is how far we want to wire in this entry */
6515 e = entry->vme_end;
6516 if (e > end) {
6517 e = end;
6518 }
6519
6520 /*
6521 * If another thread is wiring/unwiring this entry then
6522 * block after informing other thread to wake us up.
6523 */
6524 if (entry->in_transition) {
6525 wait_result_t wait_result;
6526
6527 /*
6528 * We have not clipped the entry. Make sure that
6529 * the start address is in range so that the lookup
6530 * below will succeed.
6531 * "s" is the current starting point: we've already
6532 * wired from "start" to "s" and we still have
6533 * to wire from "s" to "end".
6534 */
6535
6536 entry->needs_wakeup = TRUE;
6537
6538 /*
6539 * wake up anybody waiting on entries that we have
6540 * already wired.
6541 */
6542 if (need_wakeup) {
6543 vm_map_entry_wakeup(map);
6544 need_wakeup = FALSE;
6545 }
6546 /*
6547 * User wiring is interruptible
6548 */
6549 wait_result = vm_map_entry_wait(map,
6550 (user_wire) ? THREAD_ABORTSAFE :
6551 THREAD_UNINT);
6552 if (user_wire && wait_result == THREAD_INTERRUPTED) {
6553 /*
6554 * undo the wirings we have done so far
6555 * We do not clear the needs_wakeup flag,
6556 * because we cannot tell if we were the
6557 * only one waiting.
6558 */
6559 rc = KERN_FAILURE;
6560 goto done;
6561 }
6562
6563 /*
6564 * Cannot avoid a lookup here. Reset the timestamp.
6565 */
6566 last_timestamp = map->timestamp;
6567
6568 /*
6569 * The entry could have been clipped, look it up again.
6570 * The worst that can happen is that it no longer exists.
6571 */
6572 if (!vm_map_lookup_entry(map, s, &first_entry)) {
6573 /*
6574 * User: undo everything up to the previous
6575 * entry. Let vm_map_unwire worry about
6576 * checking the validity of the range.
6577 */
6578 rc = KERN_FAILURE;
6579 goto done;
6580 }
6581 entry = first_entry;
6582 continue;
6583 }
6584
6585 if (entry->is_sub_map) {
6586 vm_map_offset_t sub_start;
6587 vm_map_offset_t sub_end;
6588 vm_map_offset_t local_start;
6589 vm_map_offset_t local_end;
6590 pmap_t pmap;
6591 vm_map_t sub_map = VM_MAP_NULL;
6592
6593 if (wire_and_extract) {
6594 /*
6595 * Wiring would result in copy-on-write
6596 * which would not be compatible with
6597 * the sharing we have with the original
6598 * provider of this memory.
6599 */
6600 rc = KERN_INVALID_ARGUMENT;
6601 goto done;
6602 }
6603
6604 vm_map_clip_start(map, entry, s);
6605 vm_map_clip_end(map, entry, end);
6606
6607 sub_start = VME_OFFSET(entry);
6608 sub_end = entry->vme_end;
6609 sub_end += VME_OFFSET(entry) - entry->vme_start;
6610
6611 local_end = entry->vme_end;
6612 if (map_pmap == NULL) {
6613 vm_object_t object;
6614 vm_object_offset_t offset;
6615 vm_prot_t prot;
6616 boolean_t wired;
6617 vm_map_entry_t local_entry;
6618 vm_map_version_t version;
6619 vm_map_t lookup_map;
6620
6621 if (entry->use_pmap) {
6622 pmap = VME_SUBMAP(entry)->pmap;
6623 /* ppc implementation requires that */
6624 /* submaps pmap address ranges line */
6625 /* up with parent map */
6626 #ifdef notdef
6627 pmap_addr = sub_start;
6628 #endif
6629 pmap_addr = s;
6630 } else {
6631 pmap = map->pmap;
6632 pmap_addr = s;
6633 }
6634
6635 if (entry->wired_count) {
6636 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6637 goto done;
6638 }
6639
6640 /*
6641 * The map was not unlocked:
6642 * no need to goto re-lookup.
6643 * Just go directly to next entry.
6644 */
6645 entry = entry->vme_next;
6646 s = entry->vme_start;
6647 continue;
6648 }
6649
6650 /* call vm_map_lookup_and_lock_object to */
6651 /* cause any needs copy to be */
6652 /* evaluated */
6653 local_start = entry->vme_start;
6654 lookup_map = map;
6655 vm_map_lock_write_to_read(map);
6656 rc = vm_map_lookup_and_lock_object(
6657 &lookup_map, local_start,
6658 (access_type | extra_prots),
6659 OBJECT_LOCK_EXCLUSIVE,
6660 &version, &object,
6661 &offset, &prot, &wired,
6662 NULL,
6663 &real_map, NULL);
6664 if (rc != KERN_SUCCESS) {
6665 vm_map_unlock_read(lookup_map);
6666 assert(map_pmap == NULL);
6667 vm_map_unwire_nested(map, start,
6668 s, user_wire, PMAP_NULL, 0);
6669 return rc;
6670 }
6671 vm_object_unlock(object);
6672 if (real_map != lookup_map) {
6673 vm_map_unlock(real_map);
6674 }
6675 vm_map_unlock_read(lookup_map);
6676 vm_map_lock(map);
6677
6678 /* we unlocked, so must re-lookup */
6679 if (!vm_map_lookup_entry(map,
6680 local_start,
6681 &local_entry)) {
6682 rc = KERN_FAILURE;
6683 goto done;
6684 }
6685
6686 /*
6687 * entry could have been "simplified",
6688 * so re-clip
6689 */
6690 entry = local_entry;
6691 assert(s == local_start);
6692 vm_map_clip_start(map, entry, s);
6693 vm_map_clip_end(map, entry, end);
6694 /* re-compute "e" */
6695 e = entry->vme_end;
6696 if (e > end) {
6697 e = end;
6698 }
6699
6700 /* did we have a change of type? */
6701 if (!entry->is_sub_map) {
6702 last_timestamp = map->timestamp;
6703 continue;
6704 }
6705 } else {
6706 local_start = entry->vme_start;
6707 pmap = map_pmap;
6708 }
6709
6710 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6711 goto done;
6712 }
6713
6714 entry->in_transition = TRUE;
6715
6716 sub_map = VME_SUBMAP(entry);
6717 vm_map_reference(sub_map);
6718 vm_map_unlock(map);
6719 rc = vm_map_wire_nested(sub_map,
6720 sub_start, sub_end,
6721 caller_prot, tag,
6722 user_wire, pmap, pmap_addr,
6723 NULL);
6724 vm_map_deallocate(sub_map);
6725 sub_map = VM_MAP_NULL;
6726 vm_map_lock(map);
6727
6728 /*
6729 * Find the entry again. It could have been clipped
6730 * after we unlocked the map.
6731 */
6732 if (!vm_map_lookup_entry(map, local_start,
6733 &first_entry)) {
6734 panic("vm_map_wire: re-lookup failed");
6735 }
6736 entry = first_entry;
6737
6738 assert(local_start == s);
6739 /* re-compute "e" */
6740 e = entry->vme_end;
6741 if (e > end) {
6742 e = end;
6743 }
6744
6745 last_timestamp = map->timestamp;
6746 while ((entry != vm_map_to_entry(map)) &&
6747 (entry->vme_start < e)) {
6748 assert(entry->in_transition);
6749 entry->in_transition = FALSE;
6750 if (entry->needs_wakeup) {
6751 entry->needs_wakeup = FALSE;
6752 need_wakeup = TRUE;
6753 }
6754 if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6755 subtract_wire_counts(map, entry, user_wire);
6756 }
6757 entry = entry->vme_next;
6758 }
6759 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
6760 goto done;
6761 }
6762
6763 /* no need to relookup again */
6764 s = entry->vme_start;
6765 continue;
6766 }
6767
6768 /*
6769 * If this entry is already wired then increment
6770 * the appropriate wire reference count.
6771 */
6772 if (entry->wired_count) {
6773 if ((entry->protection & access_type) != access_type) {
6774 /* found a protection problem */
6775
6776 /*
6777 * XXX FBDP
6778 * We should always return an error
6779 * in this case but since we didn't
6780 * enforce it before, let's do
6781 * it only for the new "wire_and_extract"
6782 * code path for now...
6783 */
6784 if (wire_and_extract) {
6785 rc = KERN_PROTECTION_FAILURE;
6786 goto done;
6787 }
6788 }
6789
6790 /*
6791 * entry is already wired down, get our reference
6792 * after clipping to our range.
6793 */
6794 vm_map_clip_start(map, entry, s);
6795 vm_map_clip_end(map, entry, end);
6796
6797 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6798 goto done;
6799 }
6800
6801 if (wire_and_extract) {
6802 vm_object_t object;
6803 vm_object_offset_t offset;
6804 vm_page_t m;
6805
6806 /*
6807 * We don't have to "wire" the page again
6808 * but we still have to "extract" its
6809 * physical page number, after some sanity
6810 * checks.
6811 */
6812 assert((entry->vme_end - entry->vme_start)
6813 == PAGE_SIZE);
6814 assert(!entry->needs_copy);
6815 assert(!entry->is_sub_map);
6816 assert(VME_OBJECT(entry));
6817 if (((entry->vme_end - entry->vme_start)
6818 != PAGE_SIZE) ||
6819 entry->needs_copy ||
6820 entry->is_sub_map ||
6821 VME_OBJECT(entry) == VM_OBJECT_NULL) {
6822 rc = KERN_INVALID_ARGUMENT;
6823 goto done;
6824 }
6825
6826 object = VME_OBJECT(entry);
6827 offset = VME_OFFSET(entry);
6828 /* need exclusive lock to update m->dirty */
6829 if (entry->protection & VM_PROT_WRITE) {
6830 vm_object_lock(object);
6831 } else {
6832 vm_object_lock_shared(object);
6833 }
6834 m = vm_page_lookup(object, offset);
6835 assert(m != VM_PAGE_NULL);
6836 assert(VM_PAGE_WIRED(m));
6837 if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
6838 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6839 if (entry->protection & VM_PROT_WRITE) {
6840 vm_object_lock_assert_exclusive(
6841 object);
6842 m->vmp_dirty = TRUE;
6843 }
6844 } else {
6845 /* not already wired !? */
6846 *physpage_p = 0;
6847 }
6848 vm_object_unlock(object);
6849 }
6850
6851 /* map was not unlocked: no need to relookup */
6852 entry = entry->vme_next;
6853 s = entry->vme_start;
6854 continue;
6855 }
6856
6857 /*
6858 * Unwired entry or wire request transmitted via submap
6859 */
6860
6861 /*
6862 * Wiring would copy the pages to the shadow object.
6863 * The shadow object would not be code-signed so
6864 * attempting to execute code from these copied pages
6865 * would trigger a code-signing violation.
6866 */
6867
6868 if ((entry->protection & VM_PROT_EXECUTE)
6869 #if XNU_TARGET_OS_OSX
6870 &&
6871 map->pmap != kernel_pmap &&
6872 (vm_map_cs_enforcement(map)
6873 #if __arm64__
6874 || !VM_MAP_IS_EXOTIC(map)
6875 #endif /* __arm64__ */
6876 )
6877 #endif /* XNU_TARGET_OS_OSX */
6878 #if CODE_SIGNING_MONITOR
6879 &&
6880 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
6881 #endif
6882 ) {
6883 #if MACH_ASSERT
6884 printf("pid %d[%s] wiring executable range from "
6885 "0x%llx to 0x%llx: rejected to preserve "
6886 "code-signing\n",
6887 proc_selfpid(),
6888 (get_bsdtask_info(current_task())
6889 ? proc_name_address(get_bsdtask_info(current_task()))
6890 : "?"),
6891 (uint64_t) entry->vme_start,
6892 (uint64_t) entry->vme_end);
6893 #endif /* MACH_ASSERT */
6894 DTRACE_VM2(cs_executable_wire,
6895 uint64_t, (uint64_t)entry->vme_start,
6896 uint64_t, (uint64_t)entry->vme_end);
6897 cs_executable_wire++;
6898 rc = KERN_PROTECTION_FAILURE;
6899 goto done;
6900 }
6901
6902 /*
6903 * Perform actions of vm_map_lookup that need the write
6904 * lock on the map: create a shadow object for a
6905 * copy-on-write region, or an object for a zero-fill
6906 * region.
6907 */
6908 size = entry->vme_end - entry->vme_start;
6909 /*
6910 * If wiring a copy-on-write page, we need to copy it now
6911 * even if we're only (currently) requesting read access.
6912 * This is aggressive, but once it's wired we can't move it.
6913 */
6914 if (entry->needs_copy) {
6915 if (wire_and_extract) {
6916 /*
6917 * We're supposed to share with the original
6918 * provider so should not be "needs_copy"
6919 */
6920 rc = KERN_INVALID_ARGUMENT;
6921 goto done;
6922 }
6923
6924 VME_OBJECT_SHADOW(entry, size,
6925 vm_map_always_shadow(map));
6926 entry->needs_copy = FALSE;
6927 } else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6928 if (wire_and_extract) {
6929 /*
6930 * We're supposed to share with the original
6931 * provider so should already have an object.
6932 */
6933 rc = KERN_INVALID_ARGUMENT;
6934 goto done;
6935 }
6936 VME_OBJECT_SET(entry, vm_object_allocate(size, map->serial_id), false, 0);
6937 VME_OFFSET_SET(entry, (vm_object_offset_t)0);
6938 assert(entry->use_pmap);
6939 } else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6940 if (wire_and_extract) {
6941 /*
6942 * We're supposed to share with the original
6943 * provider so should not be COPY_SYMMETRIC.
6944 */
6945 rc = KERN_INVALID_ARGUMENT;
6946 goto done;
6947 }
6948 /*
6949 * Force an unrequested "copy-on-write" but only for
6950 * the range we're wiring.
6951 */
6952 // printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
6953 vm_map_clip_start(map, entry, s);
6954 vm_map_clip_end(map, entry, end);
6955 /* recompute "size" */
6956 size = entry->vme_end - entry->vme_start;
6957 /* make a shadow object */
6958 vm_object_t orig_object;
6959 vm_object_offset_t orig_offset;
6960 orig_object = VME_OBJECT(entry);
6961 orig_offset = VME_OFFSET(entry);
6962 VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
6963 if (VME_OBJECT(entry) != orig_object) {
6964 /*
6965 * This mapping has not been shared (or it would be
6966 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
6967 * not been copied-on-write (or it would be marked
6968 * as "needs_copy" and would have been handled above
6969 * and also already write-protected).
6970 * We still need to write-protect here to prevent
6971 * other threads from modifying these pages while
6972 * we're in the process of copying and wiring
6973 * the copied pages.
6974 * Since the mapping is neither shared nor COWed,
6975 * we only need to write-protect the PTEs for this
6976 * mapping.
6977 */
6978 vm_object_pmap_protect(orig_object,
6979 orig_offset,
6980 size,
6981 map->pmap,
6982 VM_MAP_PAGE_SIZE(map),
6983 entry->vme_start,
6984 entry->protection & ~VM_PROT_WRITE);
6985 }
6986 }
6987 if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6988 /*
6989 * Make the object COPY_DELAY to get a stable object
6990 * to wire.
6991 * That should avoid creating long shadow chains while
6992 * wiring/unwiring the same range repeatedly.
6993 * That also prevents part of the object from being
6994 * wired while another part is "needs_copy", which
6995 * could result in conflicting rules wrt copy-on-write.
6996 */
6997 vm_object_t object;
6998
6999 object = VME_OBJECT(entry);
7000 vm_object_lock(object);
7001 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7002 assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
7003 "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
7004 object, (uint64_t)object->vo_size,
7005 entry,
7006 (uint64_t)entry->vme_start,
7007 (uint64_t)entry->vme_end,
7008 (uint64_t)VME_OFFSET(entry),
7009 (uint64_t)size);
7010 assertf(os_ref_get_count_raw(&object->ref_count) == 1,
7011 "object %p ref_count %d\n",
7012 object, os_ref_get_count_raw(&object->ref_count));
7013 assertf(!entry->needs_copy,
7014 "entry %p\n", entry);
7015 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7016 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
7017 }
7018 vm_object_unlock(object);
7019 }
7020
7021 vm_map_clip_start(map, entry, s);
7022 vm_map_clip_end(map, entry, end);
7023
7024 /* re-compute "e" */
7025 e = entry->vme_end;
7026 if (e > end) {
7027 e = end;
7028 }
7029
7030 /*
7031 * Check for holes and protection mismatch.
7032 * Holes: Next entry should be contiguous unless this
7033 * is the end of the region.
7034 * Protection: Access requested must be allowed, unless
7035 * wiring is by protection class
7036 */
7037 if ((entry->vme_end < end) &&
7038 ((entry->vme_next == vm_map_to_entry(map)) ||
7039 (entry->vme_next->vme_start > entry->vme_end))) {
7040 /* found a hole */
7041 rc = KERN_INVALID_ADDRESS;
7042 goto done;
7043 }
7044 if ((entry->protection & access_type) != access_type) {
7045 /* found a protection problem */
7046 rc = KERN_PROTECTION_FAILURE;
7047 goto done;
7048 }
7049
7050 assert(entry->wired_count == 0 && entry->user_wired_count == 0);
7051
7052 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
7053 goto done;
7054 }
7055
7056 entry->in_transition = TRUE;
7057
7058 /*
7059 * This entry might get split once we unlock the map.
7060 * In vm_fault_wire(), we need the current range as
7061 * defined by this entry. In order for this to work
7062 * along with a simultaneous clip operation, we make a
7063 * temporary copy of this entry and use that for the
7064 * wiring. Note that the underlying objects do not
7065 * change during a clip.
7066 */
7067 tmp_entry = *entry;
7068
7069 /*
7070 * The in_transition state guarantees that the entry
7071 * (or entries for this range, if a split occurred) will be
7072 * there when the map lock is acquired for the second time.
7073 */
7074 vm_map_unlock(map);
7075
7076 if (!user_wire && cur_thread != THREAD_NULL) {
7077 interruptible_state = thread_interrupt_level(THREAD_UNINT);
7078 } else {
7079 interruptible_state = THREAD_UNINT;
7080 }
7081
7082 if (map_pmap) {
7083 rc = vm_fault_wire(map,
7084 &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
7085 physpage_p);
7086 } else {
7087 rc = vm_fault_wire(map,
7088 &tmp_entry, caller_prot, tag, map->pmap,
7089 tmp_entry.vme_start,
7090 physpage_p);
7091 }
7092
7093 if (!user_wire && cur_thread != THREAD_NULL) {
7094 thread_interrupt_level(interruptible_state);
7095 }
7096
7097 vm_map_lock(map);
7098
7099 if (last_timestamp + 1 != map->timestamp) {
7100 /*
7101 * Find the entry again. It could have been clipped
7102 * after we unlocked the map.
7103 */
7104 if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7105 &first_entry)) {
7106 panic("vm_map_wire: re-lookup failed");
7107 }
7108
7109 entry = first_entry;
7110 }
7111
7112 last_timestamp = map->timestamp;
7113
7114 while ((entry != vm_map_to_entry(map)) &&
7115 (entry->vme_start < tmp_entry.vme_end)) {
7116 assert(entry->in_transition);
7117 entry->in_transition = FALSE;
7118 if (entry->needs_wakeup) {
7119 entry->needs_wakeup = FALSE;
7120 need_wakeup = TRUE;
7121 }
7122 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
7123 subtract_wire_counts(map, entry, user_wire);
7124 }
7125 entry = entry->vme_next;
7126 }
7127
7128 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
7129 goto done;
7130 }
7131
7132 if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
7133 (tmp_entry.vme_end != end) && /* AND, we are not at the end of the requested range */
7134 (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
7135 /* found a "new" hole */
7136 s = tmp_entry.vme_end;
7137 rc = KERN_INVALID_ADDRESS;
7138 goto done;
7139 }
7140
7141 s = entry->vme_start;
7142 } /* end while loop through map entries */
7143
7144 done:
7145 if (rc == KERN_SUCCESS) {
7146 /* repair any damage we may have made to the VM map */
7147 vm_map_simplify_range(map, start, end);
7148 }
7149
7150 vm_map_unlock(map);
7151
7152 /*
7153 * wake up anybody waiting on entries we wired.
7154 */
7155 if (need_wakeup) {
7156 vm_map_entry_wakeup(map);
7157 }
7158
7159 if (rc != KERN_SUCCESS) {
7160 /* undo what has been wired so far */
7161 vm_map_unwire_nested(map, start, s, user_wire,
7162 map_pmap, pmap_addr);
7163 if (physpage_p) {
7164 *physpage_p = 0;
7165 }
7166 }
7167
7168 return rc;
7169 }
7170
7171 static __attribute__((always_inline, warn_unused_result))
7172 kern_return_t
7173 vm_map_wire_sanitize(
7174 vm_map_t map,
7175 vm_map_offset_ut start_u,
7176 vm_map_offset_ut end_u,
7177 vm_prot_ut prot_u,
7178 vm_sanitize_caller_t vm_sanitize_caller,
7179 vm_map_offset_t *start,
7180 vm_map_offset_t *end,
7181 vm_map_size_t *size,
7182 vm_prot_t *prot)
7183 {
7184 kern_return_t kr;
7185
7186 kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7187 VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
7188 size);
7189 if (__improbable(kr != KERN_SUCCESS)) {
7190 return kr;
7191 }
7192
7193 kr = vm_sanitize_prot(prot_u, vm_sanitize_caller, map, prot);
7194 if (__improbable(kr != KERN_SUCCESS)) {
7195 return kr;
7196 }
7197
7198 return KERN_SUCCESS;
7199 }
7200
7201 /*
7202 * Validation function for vm_map_wire_nested().
7203 */
7204 kern_return_t
7205 vm_map_wire_impl(
7206 vm_map_t map,
7207 vm_map_offset_ut start_u,
7208 vm_map_offset_ut end_u,
7209 vm_prot_ut prot_u,
7210 vm_tag_t tag,
7211 boolean_t user_wire,
7212 ppnum_t *physpage_p,
7213 vm_sanitize_caller_t vm_sanitize_caller)
7214 {
7215 vm_map_offset_t start, end;
7216 vm_map_size_t size;
7217 vm_prot_t prot;
7218 kern_return_t kr;
7219
7220 /*
7221 * Sanitize any input parameters that are addr/size/prot/inherit
7222 */
7223 kr = vm_map_wire_sanitize(map,
7224 start_u,
7225 end_u,
7226 prot_u,
7227 vm_sanitize_caller,
7228 &start,
7229 &end,
7230 &size,
7231 &prot);
7232 if (__improbable(kr != KERN_SUCCESS)) {
7233 if (physpage_p) {
7234 *physpage_p = 0;
7235 }
7236 return vm_sanitize_get_kr(kr);
7237 }
7238
7239 return vm_map_wire_nested(map, start, end, prot, tag, user_wire,
7240 PMAP_NULL, 0, physpage_p);
7241 }
7242
7243 kern_return_t
7244 vm_map_wire_external(
7245 vm_map_t map,
7246 vm_map_offset_ut start_u,
7247 vm_map_offset_ut end_u,
7248 vm_prot_ut prot_u,
7249 boolean_t user_wire)
7250 {
7251 vm_tag_t tag = vm_tag_bt();
7252
7253 return vm_map_wire_kernel(map, start_u, end_u, prot_u, tag, user_wire);
7254 }
7255
7256 kern_return_t
7257 vm_map_wire_kernel(
7258 vm_map_t map,
7259 vm_map_offset_ut start_u,
7260 vm_map_offset_ut end_u,
7261 vm_prot_ut prot_u,
7262 vm_tag_t tag,
7263 boolean_t user_wire)
7264 {
7265 return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7266 user_wire, NULL, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7267 }
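/*
 * Minimal usage sketch, compiled out: a kernel subsystem wiring a
 * page-aligned range it received from user space (hence the "unsafe",
 * to-be-sanitized _ut parameter types) so the pages can be touched without
 * faulting. The function name is a placeholder and the VM tag is just an
 * example choice; passing user_wire == FALSE is what charges the wiring to
 * the kernel rather than to the user's wire limits.
 */
#if 0 /* illustrative only */
static kern_return_t
example_wire_user_buffer(
	vm_map_t            map,
	vm_map_offset_ut    start_u,
	vm_map_offset_ut    end_u,
	vm_prot_ut          prot_u)
{
	/* kernel wiring, tagged so the wired memory shows up under OSFMK */
	return vm_map_wire_kernel(map, start_u, end_u, prot_u,
	    VM_KERN_MEMORY_OSFMK, FALSE);
}
#endif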
7268
7269 #if XNU_PLATFORM_MacOSX
7270
7271 kern_return_t
7272 vm_map_wire_and_extract(
7273 vm_map_t map,
7274 vm_map_offset_ut start_u,
7275 vm_prot_ut prot_u,
7276 boolean_t user_wire,
7277 ppnum_t *physpage_p)
7278 {
7279 vm_tag_t tag = vm_tag_bt();
7280 vm_map_size_ut size_u = vm_sanitize_wrap_size(VM_MAP_PAGE_SIZE(map));
7281 vm_map_offset_ut end_u = vm_sanitize_compute_ut_end(start_u, size_u);
7282
7283 return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7284 user_wire, physpage_p, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7285 }
7286
7287 #endif /* XNU_PLATFORM_MacOSX */
7288
7289 static kern_return_t
7290 vm_map_unwire_nested(
7291 vm_map_t map,
7292 vm_map_offset_t start,
7293 vm_map_offset_t end,
7294 boolean_t user_wire,
7295 pmap_t map_pmap,
7296 vm_map_offset_t pmap_addr)
7297 {
7298 vm_map_entry_t entry;
7299 struct vm_map_entry *first_entry, tmp_entry;
7300 boolean_t need_wakeup;
7301 boolean_t main_map = FALSE;
7302 unsigned int last_timestamp;
7303
7304 VM_MAP_RANGE_CHECK(map, start, end);
7305 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
7306 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
7307
7308 if (start == end) {
7309 /* We unwired what the caller asked for: zero pages */
7310 return KERN_SUCCESS;
7311 }
7312
7313 vm_map_lock(map);
7314 if (map_pmap == NULL) {
7315 main_map = TRUE;
7316 }
7317 last_timestamp = map->timestamp;
7318
7319 if (vm_map_lookup_entry(map, start, &first_entry)) {
7320 entry = first_entry;
7321 /*
7322 * vm_map_clip_start will be done later.
7323 * We don't want to unnest any nested sub maps here !
7324 */
7325 } else {
7326 if (!user_wire) {
7327 panic("vm_map_unwire: start not found");
7328 }
7329 /* Start address is not in map. */
7330 vm_map_unlock(map);
7331 return KERN_INVALID_ADDRESS;
7332 }
7333
7334 if (entry->superpage_size) {
7335 /* superpages are always wired */
7336 vm_map_unlock(map);
7337 return KERN_INVALID_ADDRESS;
7338 }
7339
7340 need_wakeup = FALSE;
7341 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
7342 if (entry->in_transition) {
7343 /*
7344 * 1)
7345 * Another thread is wiring down this entry. Note
7346 * that if it is not for the other thread we would
7347 * be unwiring an unwired entry. This is not
7348 * permitted. If we wait, we will be unwiring memory
7349 * we did not wire.
7350 *
7351 * 2)
7352 * Another thread is unwiring this entry. We did not
7353 * have a reference to it, because if we did, this
7354 * entry would not be getting unwired now.
7355 */
7356 if (!user_wire) {
7357 /*
7358 * XXX FBDP
7359 * This could happen: there could be some
7360 * overlapping vslock/vsunlock operations
7361 * going on.
7362 * We should probably just wait and retry,
7363 * but then we have to be careful that this
7364 * entry could get "simplified" after
7365 * "in_transition" gets unset and before
7366 * we re-lookup the entry, so we would
7367 * have to re-clip the entry to avoid
7368 * re-unwiring what we have already unwired...
7369 * See vm_map_wire_nested().
7370 *
7371 * Or we could just ignore "in_transition"
7372 * here and proceed to decrement the wired
7373 * count(s) on this entry. That should be fine
7374 * as long as "wired_count" doesn't drop all
7375 * the way to 0 (and we should panic if THAT
7376 * happens).
7377 */
7378 panic("vm_map_unwire: in_transition entry");
7379 }
7380
7381 entry = entry->vme_next;
7382 continue;
7383 }
7384
7385 if (entry->is_sub_map) {
7386 vm_map_offset_t sub_start;
7387 vm_map_offset_t sub_end;
7388 vm_map_offset_t local_end;
7389 pmap_t pmap;
7390 vm_map_t sub_map = VM_MAP_NULL;
7391
7392 vm_map_clip_start(map, entry, start);
7393 vm_map_clip_end(map, entry, end);
7394
7395 sub_start = VME_OFFSET(entry);
7396 sub_end = entry->vme_end - entry->vme_start;
7397 sub_end += VME_OFFSET(entry);
7398 local_end = entry->vme_end;
7399 if (map_pmap == NULL) {
7400 if (entry->use_pmap) {
7401 pmap = VME_SUBMAP(entry)->pmap;
7402 pmap_addr = sub_start;
7403 } else {
7404 pmap = map->pmap;
7405 pmap_addr = start;
7406 }
7407 if (entry->wired_count == 0 ||
7408 (user_wire && entry->user_wired_count == 0)) {
7409 if (!user_wire) {
7410 panic("vm_map_unwire: entry is unwired");
7411 }
7412 entry = entry->vme_next;
7413 continue;
7414 }
7415
7416 /*
7417 * Check for holes
7418 * Holes: Next entry should be contiguous unless
7419 * this is the end of the region.
7420 */
7421 if (((entry->vme_end < end) &&
7422 ((entry->vme_next == vm_map_to_entry(map)) ||
7423 (entry->vme_next->vme_start
7424 > entry->vme_end)))) {
7425 if (!user_wire) {
7426 panic("vm_map_unwire: non-contiguous region");
7427 }
7428 /*
7429 * entry = entry->vme_next;
7430 * continue;
7431 */
7432 }
7433
7434 subtract_wire_counts(map, entry, user_wire);
7435
7436 if (entry->wired_count != 0) {
7437 entry = entry->vme_next;
7438 continue;
7439 }
7440
7441 entry->in_transition = TRUE;
7442 tmp_entry = *entry;/* see comment in vm_map_wire() */
7443
7444 /*
7445 * We can unlock the map now. The in_transition state
7446 * guarantees existence of the entry.
7447 */
7448 sub_map = VME_SUBMAP(entry);
7449 vm_map_reference(sub_map);
7450 vm_map_unlock(map);
7451 vm_map_unwire_nested(sub_map,
7452 sub_start, sub_end, user_wire, pmap, pmap_addr);
7453 vm_map_deallocate(sub_map);
7454 sub_map = VM_MAP_NULL;
7455 vm_map_lock(map);
7456
7457 if (last_timestamp + 1 != map->timestamp) {
7458 /*
7459 * Find the entry again. It could have been
7460 * clipped or deleted after we unlocked the map.
7461 */
7462 if (!vm_map_lookup_entry(map,
7463 tmp_entry.vme_start,
7464 &first_entry)) {
7465 if (!user_wire) {
7466 panic("vm_map_unwire: re-lookup failed");
7467 }
7468 entry = first_entry->vme_next;
7469 } else {
7470 entry = first_entry;
7471 }
7472 }
7473 last_timestamp = map->timestamp;
7474
7475 /*
7476 * clear transition bit for all constituent entries
7477 * that were in the original entry (saved in
7478 * tmp_entry). Also check for waiters.
7479 */
7480 while ((entry != vm_map_to_entry(map)) &&
7481 (entry->vme_start < tmp_entry.vme_end)) {
7482 assert(entry->in_transition);
7483 entry->in_transition = FALSE;
7484 if (entry->needs_wakeup) {
7485 entry->needs_wakeup = FALSE;
7486 need_wakeup = TRUE;
7487 }
7488 entry = entry->vme_next;
7489 }
7490 continue;
7491 } else {
7492 tmp_entry = *entry;
7493 sub_map = VME_SUBMAP(entry);
7494 vm_map_reference(sub_map);
7495 vm_map_unlock(map);
7496 vm_map_unwire_nested(sub_map,
7497 sub_start, sub_end, user_wire, map_pmap,
7498 pmap_addr);
7499 vm_map_deallocate(sub_map);
7500 sub_map = VM_MAP_NULL;
7501 vm_map_lock(map);
7502
7503 if (last_timestamp + 1 != map->timestamp) {
7504 /*
7505 * Find the entry again. It could have been
7506 * clipped or deleted after we unlocked the map.
7507 */
7508 if (!vm_map_lookup_entry(map,
7509 tmp_entry.vme_start,
7510 &first_entry)) {
7511 if (!user_wire) {
7512 panic("vm_map_unwire: re-lookup failed");
7513 }
7514 entry = first_entry->vme_next;
7515 } else {
7516 entry = first_entry;
7517 }
7518 }
7519 last_timestamp = map->timestamp;
7520 }
7521 }
7522
7523
7524 if ((entry->wired_count == 0) ||
7525 (user_wire && entry->user_wired_count == 0)) {
7526 if (!user_wire) {
7527 panic("vm_map_unwire: entry is unwired");
7528 }
7529
7530 entry = entry->vme_next;
7531 continue;
7532 }
7533
7534 assert(entry->wired_count > 0 &&
7535 (!user_wire || entry->user_wired_count > 0));
7536
7537 vm_map_clip_start(map, entry, start);
7538 vm_map_clip_end(map, entry, end);
7539
7540 /*
7541 * Check for holes
7542 * Holes: Next entry should be contiguous unless
7543 * this is the end of the region.
7544 */
7545 if (((entry->vme_end < end) &&
7546 ((entry->vme_next == vm_map_to_entry(map)) ||
7547 (entry->vme_next->vme_start > entry->vme_end)))) {
7548 if (!user_wire) {
7549 panic("vm_map_unwire: non-contiguous region");
7550 }
7551 /*
7552 * entry = entry->vme_next;
7553 * continue;
7554 */
7555 }
7556
7557 subtract_wire_counts(map, entry, user_wire);
7558
7559 if (entry->wired_count != 0) {
7560 entry = entry->vme_next;
7561 continue;
7562 }
7563
7564 if (entry->zero_wired_pages) {
7565 entry->zero_wired_pages = FALSE;
7566 }
7567
7568 entry->in_transition = TRUE;
7569 tmp_entry = *entry; /* see comment in vm_map_wire() */
7570
7571 /*
7572 * We can unlock the map now. The in_transition state
7573 * guarantees existence of the entry.
7574 */
7575 vm_map_unlock(map);
7576 if (map_pmap) {
7577 vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
7578 pmap_addr, tmp_entry.vme_end);
7579 } else {
7580 vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
7581 tmp_entry.vme_start, tmp_entry.vme_end);
7582 }
7583 vm_map_lock(map);
7584
7585 if (last_timestamp + 1 != map->timestamp) {
7586 /*
7587 * Find the entry again. It could have been clipped
7588 * or deleted after we unlocked the map.
7589 */
7590 if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7591 &first_entry)) {
7592 if (!user_wire) {
7593 panic("vm_map_unwire: re-lookup failed");
7594 }
7595 entry = first_entry->vme_next;
7596 } else {
7597 entry = first_entry;
7598 }
7599 }
7600 last_timestamp = map->timestamp;
7601
7602 /*
7603 * clear transition bit for all constituent entries that
7604 * were in the original entry (saved in tmp_entry). Also
7605 * check for waiters.
7606 */
7607 while ((entry != vm_map_to_entry(map)) &&
7608 (entry->vme_start < tmp_entry.vme_end)) {
7609 assert(entry->in_transition);
7610 entry->in_transition = FALSE;
7611 if (entry->needs_wakeup) {
7612 entry->needs_wakeup = FALSE;
7613 need_wakeup = TRUE;
7614 }
7615 entry = entry->vme_next;
7616 }
7617 }
7618
7619 /*
7620 * We might have fragmented the address space when we wired this
7621 * range of addresses. Attempt to re-coalesce these VM map entries
7622 * with their neighbors now that they're no longer wired.
7623 * Under some circumstances, address space fragmentation can
7624 * prevent VM object shadow chain collapsing, which can cause
7625 * swap space leaks.
7626 */
7627 vm_map_simplify_range(map, start, end);
7628
7629 vm_map_unlock(map);
7630 /*
7631 * wake up anybody waiting on entries that we have unwired.
7632 */
7633 if (need_wakeup) {
7634 vm_map_entry_wakeup(map);
7635 }
7636 return KERN_SUCCESS;
7637 }
7638
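/*
 * vm_map_unwire:
 *
 *	Exported unwire entry point: forwards to vm_map_unwire_impl()
 *	with the VM_SANITIZE_CALLER_VM_MAP_UNWIRE caller tag.
 */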
7639 kern_return_t
7640 vm_map_unwire(
7641 vm_map_t map,
7642 vm_map_offset_ut start_u,
7643 vm_map_offset_ut end_u,
7644 boolean_t user_wire)
7645 {
7646 return vm_map_unwire_impl(map, start_u, end_u, user_wire,
7647 VM_SANITIZE_CALLER_VM_MAP_UNWIRE);
7648 }
7649
7650 static __attribute__((always_inline, warn_unused_result))
7651 kern_return_t
7652 vm_map_unwire_sanitize(
7653 vm_map_t map,
7654 vm_map_offset_ut start_u,
7655 vm_map_offset_ut end_u,
7656 vm_sanitize_caller_t vm_sanitize_caller,
7657 vm_map_offset_t *start,
7658 vm_map_offset_t *end,
7659 vm_map_size_t *size)
7660 {
7661 return vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7662 VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
7663 size);
7664 }
7665
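/*
 * vm_map_unwire_impl:
 *
 *	Sanitize the untrusted start/end addresses (a zero-sized range
 *	succeeds trivially) and then unwire the range in the map itself,
 *	with no alternate pmap and a pmap base address of 0.
 */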
7666 kern_return_t
7667 vm_map_unwire_impl(
7668 vm_map_t map,
7669 vm_map_offset_ut start_u,
7670 vm_map_offset_ut end_u,
7671 boolean_t user_wire,
7672 vm_sanitize_caller_t vm_sanitize_caller)
7673 {
7674 vm_map_offset_t start, end;
7675 vm_map_size_t size;
7676 kern_return_t kr;
7677
7678 /*
7679 * Sanitize any input parameters that are addr/size/prot/inherit
7680 */
7681 kr = vm_map_unwire_sanitize(
7682 map,
7683 start_u,
7684 end_u,
7685 vm_sanitize_caller,
7686 &start,
7687 &end,
7688 &size);
7689 if (__improbable(kr != KERN_SUCCESS)) {
7690 return vm_sanitize_get_kr(kr);
7691 }
7692
7693 return vm_map_unwire_nested(map, start, end,
7694 user_wire, (pmap_t)NULL, 0);
7695 }
7696
7697
7698 /*
7699 * vm_map_entry_zap: [ internal use only ]
7700 *
7701 * Remove the entry from the target map
7702 * and put it on a zap list.
7703 */
7704 static void
7705 vm_map_entry_zap(
7706 vm_map_t map,
7707 vm_map_entry_t entry,
7708 vm_map_zap_t zap)
7709 {
7710 vm_map_offset_t s, e;
7711
7712 s = entry->vme_start;
7713 e = entry->vme_end;
7714 assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7715 assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7716 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7717 assert(page_aligned(s));
7718 assert(page_aligned(e));
7719 }
7720 if (entry->map_aligned == TRUE) {
7721 assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7722 assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7723 }
7724 assert(entry->wired_count == 0);
7725 assert(entry->user_wired_count == 0);
7726 assert(!entry->vme_permanent);
7727
7728 vm_map_store_entry_unlink(map, entry, false);
7729 map->size -= e - s;
7730
7731 vm_map_zap_append(zap, entry);
7732 }
7733
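/*
 * vm_map_submap_pmap_clean:
 *
 *	Remove the physical mappings backing the portion of "sub_map"
 *	that is mapped at [start, end) in "map", starting at "offset"
 *	within the submap. Recurses for nested submaps. If the parent
 *	map is mapped in other pmaps, translations are removed through
 *	vm_object_pmap_protect_options(... PMAP_OPTIONS_REMOVE);
 *	otherwise pmap_remove() is used directly on the parent's pmap.
 */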
7734 static void
7735 vm_map_submap_pmap_clean(
7736 vm_map_t map,
7737 vm_map_offset_t start,
7738 vm_map_offset_t end,
7739 vm_map_t sub_map,
7740 vm_map_offset_t offset)
7741 {
7742 vm_map_offset_t submap_start;
7743 vm_map_offset_t submap_end;
7744 vm_map_size_t remove_size;
7745 vm_map_entry_t entry;
7746
7747 submap_end = offset + (end - start);
7748 submap_start = offset;
7749
7750 vm_map_lock_read(sub_map);
7751 if (vm_map_lookup_entry(sub_map, offset, &entry)) {
7752 remove_size = (entry->vme_end - entry->vme_start);
7753 if (offset > entry->vme_start) {
7754 remove_size -= offset - entry->vme_start;
7755 }
7756
7757
7758 if (submap_end < entry->vme_end) {
7759 remove_size -=
7760 entry->vme_end - submap_end;
7761 }
7762 if (entry->is_sub_map) {
7763 vm_map_submap_pmap_clean(
7764 sub_map,
7765 start,
7766 start + remove_size,
7767 VME_SUBMAP(entry),
7768 VME_OFFSET(entry));
7769 } else {
7770 if (map->mapped_in_other_pmaps &&
7771 os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7772 VME_OBJECT(entry) != NULL) {
7773 vm_object_pmap_protect_options(
7774 VME_OBJECT(entry),
7775 (VME_OFFSET(entry) +
7776 offset -
7777 entry->vme_start),
7778 remove_size,
7779 PMAP_NULL,
7780 PAGE_SIZE,
7781 entry->vme_start,
7782 VM_PROT_NONE,
7783 PMAP_OPTIONS_REMOVE);
7784 } else {
7785 pmap_remove(map->pmap,
7786 (addr64_t)start,
7787 (addr64_t)(start + remove_size));
7788 }
7789 }
7790 }
7791
7792 entry = entry->vme_next;
7793
7794 while ((entry != vm_map_to_entry(sub_map))
7795 && (entry->vme_start < submap_end)) {
7796 remove_size = (entry->vme_end - entry->vme_start);
7797 if (submap_end < entry->vme_end) {
7798 remove_size -= entry->vme_end - submap_end;
7799 }
7800 if (entry->is_sub_map) {
7801 vm_map_submap_pmap_clean(
7802 sub_map,
7803 (start + entry->vme_start) - offset,
7804 ((start + entry->vme_start) - offset) + remove_size,
7805 VME_SUBMAP(entry),
7806 VME_OFFSET(entry));
7807 } else {
7808 if (map->mapped_in_other_pmaps &&
7809 os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7810 VME_OBJECT(entry) != NULL) {
7811 vm_object_pmap_protect_options(
7812 VME_OBJECT(entry),
7813 VME_OFFSET(entry),
7814 remove_size,
7815 PMAP_NULL,
7816 PAGE_SIZE,
7817 entry->vme_start,
7818 VM_PROT_NONE,
7819 PMAP_OPTIONS_REMOVE);
7820 } else {
7821 pmap_remove(map->pmap,
7822 (addr64_t)((start + entry->vme_start)
7823 - offset),
7824 (addr64_t)(((start + entry->vme_start)
7825 - offset) + remove_size));
7826 }
7827 }
7828 entry = entry->vme_next;
7829 }
7830 vm_map_unlock_read(sub_map);
7831 return;
7832 }
7833
7834 /*
7835 * virt_memory_guard_ast:
7836 *
7837 * Handle the AST callout for a virtual memory guard.
7838 * Raise an EXC_GUARD exception and terminate the task
7839 * if configured to do so.
7840 */
7841 void
7842 virt_memory_guard_ast(
7843 thread_t thread,
7844 mach_exception_data_type_t code,
7845 mach_exception_data_type_t subcode)
7846 {
7847 task_t task = get_threadtask(thread);
7848 assert(task != kernel_task);
7849 assert(task == current_task());
7850 kern_return_t sync_exception_result;
7851 uint32_t behavior;
7852
7853 behavior = task->task_exc_guard;
7854
7855
7856 /* Is delivery enabled */
7857 if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7858 return;
7859 }
7860
7861 /* If only once, make sure we're that once */
7862 while (behavior & TASK_EXC_GUARD_VM_ONCE) {
7863 uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;
7864
7865 if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
7866 break;
7867 }
7868 behavior = task->task_exc_guard;
7869 if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7870 return;
7871 }
7872 }
7873
7874 const bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
7875 /* Raise exception synchronously and see if handler claimed it */
7876 sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode, fatal);
7877
7878 if (fatal) {
7879 /*
7880 * If Synchronous EXC_GUARD delivery was successful then
7881 * kill the process and return, else kill the process
7882 * and deliver the exception via EXC_CORPSE_NOTIFY.
7883 */
7884
7885
7886 int flags = PX_DEBUG_NO_HONOR;
7887 exception_info_t info = {
7888 .os_reason = OS_REASON_GUARD,
7889 .exception_type = EXC_GUARD,
7890 .mx_code = code,
7891 .mx_subcode = subcode
7892 };
7893
7894 if (sync_exception_result == KERN_SUCCESS) {
7895 flags |= PX_PSIGNAL;
7896 }
7897 exit_with_mach_exception(current_proc(), info, flags);
7898 } else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
7899 /*
7900 * If the synchronous EXC_GUARD delivery was not successful,
7901 * raise a simulated crash.
7902 */
7903 if (sync_exception_result != KERN_SUCCESS) {
7904 task_violated_guard(code, subcode, NULL, FALSE);
7905 }
7906 }
7907 }
7908
7909 /*
7910 * Validate policy for VM guard exceptions and encode the correct Mach exception
7911 * code and subcode if the policy allows delivering a guard exception here.
7912 */
7913 static bool
7914 vm_map_guard_exception_internal(
7915 vm_map_offset_t address,
7916 unsigned reason,
7917 mach_exception_code_t *code,
7918 mach_exception_data_type_t *subcode)
7919 {
7920 unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7921 unsigned int target = 0; /* should we pass in pid associated with map? */
7922
7923 task_t task = current_task_early();
7924
7925 /* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7926 if (task == NULL || task == kernel_task) {
7927 return false;
7928 }
7929
7930
7931 *code = 0;
7932 EXC_GUARD_ENCODE_TYPE(*code, guard_type);
7933 EXC_GUARD_ENCODE_FLAVOR(*code, reason);
7934 EXC_GUARD_ENCODE_TARGET(*code, target);
7935 *subcode = (uint64_t)address;
7936
7937 return true;
7938 }
7939
7940 /*
7941 * vm_map_guard_exception:
7942 *
7943 * Generate a GUARD_TYPE_VIRT_MEMORY EXC_GUARD exception.
7944 *
7945 * `reason` is kGUARD_EXC_DEALLOC_GAP when we find nothing mapped,
7946 * or if there is a gap in the mapping when a user address space
7947 * was requested. We report the address of the first gap found.
7948 */
7949
7950 void
7951 vm_map_guard_exception(
7952 vm_map_offset_t address,
7953 unsigned reason)
7954 {
7955 mach_exception_code_t code;
7956 mach_exception_data_type_t subcode;
7957 if (vm_map_guard_exception_internal(address, reason, &code, &subcode)) {
7958 task_t task = current_task();
7959 bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
7960
7961 thread_guard_violation(current_thread(), code, subcode, fatal);
7962 }
7963 }
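/*
 * Illustrative sketch (drawn from the deallocate-gap handling later in
 * this file): when a deallocation finds a hole in a user address space,
 * the guard exception is raised as
 *
 *	vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
 *
 * which encodes a GUARD_TYPE_VIRT_MEMORY EXC_GUARD code/subcode and
 * hands it to thread_guard_violation() for AST delivery.
 */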
7964
7965
7966 static kern_return_t
7967 vm_map_delete_submap_recurse(
7968 vm_map_t submap,
7969 vm_map_offset_t submap_start,
7970 vm_map_offset_t submap_end)
7971 {
7972 vm_map_entry_t submap_entry;
7973
7974 /*
7975 * Verify that the submap does not contain any "permanent" entries
7976 * within the specified range. We permit TPRO ranges to be overwritten
7977 * as we only reach this path if TPRO const protection is disabled for a
7978 * given map.
7979 *
7980 * We do not care about gaps.
7981 */
7982
7983 vm_map_lock(submap);
7984
7985 if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
7986 submap_entry = submap_entry->vme_next;
7987 }
7988
7989 for (;
7990 submap_entry != vm_map_to_entry(submap) &&
7991 submap_entry->vme_start < submap_end;
7992 submap_entry = submap_entry->vme_next) {
7993 if (submap_entry->vme_permanent
7994 #ifdef __arm64e__
7995 /* allow TPRO submap entries to be overwritten */
7996 && !submap_entry->used_for_tpro
7997 #endif
7998 ) {
7999 /* "permanent" entry -> fail */
8000 vm_map_unlock(submap);
8001 return KERN_PROTECTION_FAILURE;
8002 }
8003 }
8004 /* no "permanent" entries in the range -> success */
8005 vm_map_unlock(submap);
8006 return KERN_SUCCESS;
8007 }
8008
8009 __abortlike
8010 static void
8011 __vm_map_delete_misaligned_panic(
8012 vm_map_t map,
8013 vm_map_offset_t start,
8014 vm_map_offset_t end)
8015 {
8016 panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
8017 map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
8018 }
8019
8020 __abortlike
8021 static void
8022 __vm_map_delete_failed_panic(
8023 vm_map_t map,
8024 vm_map_offset_t start,
8025 vm_map_offset_t end,
8026 kern_return_t kr)
8027 {
8028 panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
8029 map, (uint64_t)start, (uint64_t)end, kr);
8030 }
8031
8032 __abortlike
8033 static void
8034 __vm_map_delete_gap_panic(
8035 vm_map_t map,
8036 vm_map_offset_t where,
8037 vm_map_offset_t start,
8038 vm_map_offset_t end)
8039 {
8040 panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
8041 map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
8042 }
8043
8044 __abortlike
8045 static void
8046 __vm_map_delete_permanent_panic(
8047 vm_map_t map,
8048 vm_map_offset_t start,
8049 vm_map_offset_t end,
8050 vm_map_entry_t entry)
8051 {
8052 panic("vm_map_delete(%p,0x%llx,0x%llx): "
8053 "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
8054 map, (uint64_t)start, (uint64_t)end, entry,
8055 (uint64_t)entry->vme_start,
8056 (uint64_t)entry->vme_end);
8057 }
8058
8059 __options_decl(vm_map_delete_state_t, uint32_t, {
8060 VMDS_NONE = 0x0000,
8061
8062 VMDS_FOUND_GAP = 0x0001,
8063 VMDS_GAPS_OK = 0x0002,
8064
8065 VMDS_KERNEL_PMAP = 0x0004,
8066 VMDS_NEEDS_LOOKUP = 0x0008,
8067 VMDS_NEEDS_WAKEUP = 0x0010,
8068 VMDS_KERNEL_KMEMPTR = 0x0020
8069 });
8070
8071 /*
8072 * vm_map_clamp_to_pmap(map, start, end)
8073 *
8074 * Modify *start and *end so they fall within the bounds of map->pmap.
8075 */
8076 #if MACH_ASSERT
8077 static void
8078 vm_map_clamp_to_pmap(vm_map_t map, vm_map_address_t *start, vm_map_address_t *end)
8079 {
8080 vm_map_address_t min;
8081 vm_map_address_t max;
8082
8083 #if __x86_64__
8084 /* x86_64 struct pmap does not have min and max fields */
8085 if (map->pmap == kernel_pmap) {
8086 min = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
8087 max = VM_MAX_KERNEL_ADDRESS;
8088 } else {
8089 min = VM_MAP_MIN_ADDRESS;
8090 max = VM_MAP_MAX_ADDRESS;
8091 }
8092 #else
8093 min = map->pmap->min;
8094 max = map->pmap->max;
8095 #endif
8096
8097 if (*start < min) {
8098 *start = min;
8099 } else if (*start > max) {
8100 *start = max;
8101 }
8102 if (*end < min) {
8103 *end = min;
8104 } else if (*end > max) {
8105 *end = max;
8106 }
8107 }
8108 #endif
8109
8110 int vm_log_map_delete_permanent_prot_none = 0;
8111 /*
8112 * vm_map_delete: [ internal use only ]
8113 *
8114 * Deallocates the given address range from the target map.
8115 * Removes all user wirings. Unwires one kernel wiring if
8116 * VM_MAP_REMOVE_KUNWIRE is set. Waits for kernel wirings to go
8117 * away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set. Sleeps
8118 * interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
8119 *
8120 *
8121 * When the map is a kernel map, then any error in removing mappings
8122 * will lead to a panic so that clients do not have to repeat the panic
8123 * code at each call site. If VM_MAP_REMOVE_INTERRUPTIBLE
8124 * is also passed, then KERN_ABORTED will not lead to a panic.
8125 *
8126 * This routine is called with map locked and leaves map locked.
8127 */
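/*
 * Overview of the deletion loop below (matching the "Step N" comments
 * in the body): (1) re-lookup the entry if the lock was dropped and
 * clear any pending "in_transition" marks, (2) policy checks (gaps,
 * permanent/atomic entries, guards), (3) clip the entry to [s, end),
 * (4) wait for entries that are in transition, (5) unwire wired
 * entries, dropping the map lock around vm_fault_unwire(), and
 * (6) unlink the entry onto the zap list -- or, for "permanent"
 * entries, leave them in place with all permissions removed.
 */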
8128 static kmem_return_t
8129 vm_map_delete(
8130 vm_map_t map,
8131 vm_map_offset_t start,
8132 vm_map_offset_t end,
8133 vmr_flags_t flags,
8134 kmem_guard_t guard,
8135 vm_map_zap_t zap_list)
8136 {
8137 vm_map_entry_t entry, next;
8138 int interruptible;
8139 vm_map_offset_t gap_start = 0;
8140 vm_map_offset_t clear_in_transition_end = 0;
8141 __unused vm_map_offset_t save_start = start;
8142 __unused vm_map_offset_t save_end = end;
8143 vm_map_delete_state_t state = VMDS_NONE;
8144 kmem_return_t ret = { };
8145 vm_map_range_id_t range_id = 0;
8146 struct kmem_page_meta *meta = NULL;
8147 uint32_t size_idx, slot_idx;
8148 struct mach_vm_range slot;
8149
8150 if (vm_map_pmap(map) == kernel_pmap) {
8151 state |= VMDS_KERNEL_PMAP;
8152 range_id = kmem_addr_get_range(start, end - start);
8153 if (kmem_is_ptr_range(range_id)) {
8154 state |= VMDS_KERNEL_KMEMPTR;
8155 slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
8156 &size_idx, &slot);
8157 }
8158 }
8159
8160 if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
8161 state |= VMDS_GAPS_OK;
8162 }
8163
8164 if (map->corpse_source &&
8165 !(flags & VM_MAP_REMOVE_TO_OVERWRITE) &&
8166 !map->terminated) {
8167 /*
8168 * The map is being used for corpse-related diagnostics.
8169 * So skip any entry removal to avoid perturbing the map state.
8170 * The cleanup will happen in task_terminate_internal after the
8171 * call to task_port_no_senders.
8172 */
8173 goto out;
8174 }
8175
8176 interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
8177 THREAD_ABORTSAFE : THREAD_UNINT;
8178
8179 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
8180 (start & VM_MAP_PAGE_MASK(map))) {
8181 __vm_map_delete_misaligned_panic(map, start, end);
8182 }
8183
8184 if ((state & VMDS_GAPS_OK) == 0) {
8185 /*
8186 * If the map isn't terminated then all deletions must have
8187 * no gaps, and be within the [min, max) of the map.
8188 *
8189 * We got here without VM_MAP_RANGE_CHECK() being called,
8190 * and hence must validate bounds manually.
8191 *
8192 * It is worth noting that because vm_deallocate() will
8193 * round_page() the deallocation size, it's possible for "end"
8194 * to be 0 here due to overflow. We hence must treat it as being
8195 * beyond vm_map_max(map).
8196 *
8197 * Similarly, end < start means some wrap-around happened,
8198 * which should cause an error or panic.
8199 */
8200 if (end == 0 || end > vm_map_max(map)) {
8201 state |= VMDS_FOUND_GAP;
8202 gap_start = vm_map_max(map);
8203 if (state & VMDS_KERNEL_PMAP) {
8204 __vm_map_delete_gap_panic(map,
8205 gap_start, start, end);
8206 }
8207 goto out;
8208 }
8209
8210 if (end < start) {
8211 if (state & VMDS_KERNEL_PMAP) {
8212 __vm_map_delete_gap_panic(map,
8213 vm_map_max(map), start, end);
8214 }
8215 ret.kmr_return = KERN_INVALID_ARGUMENT;
8216 goto out;
8217 }
8218
8219 if (start < vm_map_min(map)) {
8220 state |= VMDS_FOUND_GAP;
8221 gap_start = start;
8222 if (state & VMDS_KERNEL_PMAP) {
8223 __vm_map_delete_gap_panic(map,
8224 gap_start, start, end);
8225 }
8226 goto out;
8227 }
8228 } else {
8229 /*
8230 * If the map is terminated, we must accept start/end
8231 * being beyond the boundaries of the map as this is
8232 * how some of the mappings like commpage mappings
8233 * can be destroyed (they're outside of those bounds).
8234 *
8235 * end < start is still something we can't cope with,
8236 * so just bail.
8237 */
8238 if (end < start) {
8239 goto out;
8240 }
8241 }
8242
8243
8244 /*
8245 * Find the start of the region.
8246 *
8247 * If in a superpage, extend the range
8248 * to include the start of the mapping.
8249 */
8250 while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8251 if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8252 start = SUPERPAGE_ROUND_DOWN(start);
8253 } else {
8254 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8255 break;
8256 }
8257 }
8258
8259 if (entry->superpage_size) {
8260 end = SUPERPAGE_ROUND_UP(end);
8261 }
8262
8263 /*
8264 * Step through all entries in this region
8265 */
8266 for (vm_map_offset_t s = start; s < end;) {
8267 /*
8268 * At this point, we have deleted all the memory entries
8269 * in [start, s) and are proceeding with the [s, end) range.
8270 *
8271 * This loop might drop the map lock, and it is possible that
8272 * some memory was already reallocated within [start, s)
8273 * and we don't want to mess with those entries.
8274 *
8275 * Some of those entries could even have been re-assembled
8276 * with an entry after "s" (in vm_map_simplify_entry()), so
8277 * we may have to vm_map_clip_start() again.
8278 *
8279 * When clear_in_transition_end is set, we had marked
8280 * [start, clear_in_transition_end) as "in_transition"
8281 * during a previous iteration and we need to clear it.
8282 */
8283
8284 /*
8285 * Step 1: If needed (because we dropped locks),
8286 * lookup the entry again.
8287 *
8288 * If we're coming back from unwiring (Step 5),
8289 * we also need to mark the entries as no longer
8290 * in transition after that.
8291 */
8292
8293 if (state & VMDS_NEEDS_LOOKUP) {
8294 state &= ~VMDS_NEEDS_LOOKUP;
8295
8296 if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8297 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8298 }
8299
8300 if (state & VMDS_KERNEL_KMEMPTR) {
8301 kmem_validate_slot(s, meta, size_idx, slot_idx);
8302 }
8303 }
8304
8305 if (clear_in_transition_end) {
8306 for (vm_map_entry_t it = entry;
8307 it != vm_map_to_entry(map) &&
8308 it->vme_start < clear_in_transition_end;
8309 it = it->vme_next) {
8310 assert(it->in_transition);
8311 it->in_transition = FALSE;
8312 if (it->needs_wakeup) {
8313 it->needs_wakeup = FALSE;
8314 state |= VMDS_NEEDS_WAKEUP;
8315 }
8316 }
8317
8318 clear_in_transition_end = 0;
8319 }
8320
8321
8322 /*
8323 * Step 2: Perform various policy checks
8324 * before we do _anything_ to this entry.
8325 */
8326
8327 if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8328 if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8329 /*
8330 * Either we found a gap already,
8331 * or we are tearing down a map,
8332 * keep going.
8333 */
8334 } else if (state & VMDS_KERNEL_PMAP) {
8335 __vm_map_delete_gap_panic(map, s, start, end);
8336 } else if (s < end) {
8337 state |= VMDS_FOUND_GAP;
8338 gap_start = s;
8339 }
8340
8341 if (entry == vm_map_to_entry(map) ||
8342 end <= entry->vme_start) {
8343 break;
8344 }
8345
8346 s = entry->vme_start;
8347 }
8348
8349 if (state & VMDS_KERNEL_PMAP) {
8350 /*
8351 * In the kernel map and its submaps,
8352 * permanent entries never die, even
8353 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8354 */
8355 if (entry->vme_permanent) {
8356 __vm_map_delete_permanent_panic(map, start, end, entry);
8357 }
8358
8359 if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8360 end = entry->vme_end;
8361 flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8362 }
8363
8364 /*
8365 * In the kernel map and its submaps,
8366 * the removal of an atomic/guarded entry is strict.
8367 *
8368 * An atomic entry is processed only if it was
8369 * specifically targeted.
8370 *
8371 * We might have deleted non-atomic entries before
8372 * we reach this point, however...
8373 */
8374 kmem_entry_validate_guard(map, entry,
8375 start, end - start, guard);
8376 }
8377
8378 /*
8379 * Step 2.1: handle "permanent" and "submap" entries
8380 * *before* clipping to avoid triggering some unnecessary
8381 * un-nesting of the shared region.
8382 */
8383 if (entry->vme_permanent && entry->is_sub_map) {
8384 // printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8385 /*
8386 * Un-mapping a "permanent" mapping of a user-space
8387 * submap is not allowed unless...
8388 */
8389 if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8390 /*
8391 * a. explicitly requested by the kernel caller.
8392 */
8393 // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8394 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8395 developer_mode_state()) {
8396 /*
8397 * b. we're in "developer" mode (for
8398 * breakpoints, dtrace probes, ...).
8399 */
8400 // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8401 } else if (map->terminated) {
8402 /*
8403 * c. this is the final address space cleanup.
8404 */
8405 // printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8406 } else {
8407 vm_map_offset_t submap_start, submap_end;
8408 kern_return_t submap_kr;
8409
8410 /*
8411 * Check if there are any "permanent" mappings
8412 * in this range in the submap.
8413 */
8414 if (entry->in_transition) {
8415 /* can that even happen ? */
8416 goto in_transition;
8417 }
8418 /* compute the clipped range in the submap */
8419 submap_start = s - entry->vme_start;
8420 submap_start += VME_OFFSET(entry);
8421 submap_end = end - entry->vme_start;
8422 submap_end += VME_OFFSET(entry);
8423 submap_kr = vm_map_delete_submap_recurse(
8424 VME_SUBMAP(entry),
8425 submap_start,
8426 submap_end);
8427 if (submap_kr != KERN_SUCCESS) {
8428 /*
8429 * There are some "permanent" mappings
8430 * in the submap: we are not allowed
8431 * to remove this range.
8432 */
8433 printf("%d[%s] removing permanent submap entry "
8434 "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8435 proc_selfpid(),
8436 (get_bsdtask_info(current_task())
8437 ? proc_name_address(get_bsdtask_info(current_task()))
8438 : "?"), entry,
8439 (uint64_t)entry->vme_start,
8440 (uint64_t)entry->vme_end,
8441 entry->protection,
8442 entry->max_protection);
8443 DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8444 vm_map_entry_t, entry,
8445 vm_map_offset_t, entry->vme_start,
8446 vm_map_offset_t, entry->vme_end,
8447 vm_prot_t, entry->protection,
8448 vm_prot_t, entry->max_protection,
8449 int, VME_ALIAS(entry));
8450 ret.kmr_return = KERN_PROTECTION_FAILURE;
8451 goto out;
8452 }
8453 /* no permanent mappings: proceed */
8454 }
8455 }
8456
8457 /*
8458 * Step 3: Perform any clipping needed.
8459 *
8460 * After this, "entry" starts at "s", ends before "end"
8461 */
8462
8463 if (entry->vme_start < s) {
8464 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8465 entry->map_aligned &&
8466 !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8467 /*
8468 * The entry will no longer be map-aligned
8469 * after clipping and the caller said it's OK.
8470 */
8471 entry->map_aligned = FALSE;
8472 }
8473 vm_map_clip_start(map, entry, s);
8474 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8475 }
8476
8477 if (end < entry->vme_end) {
8478 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8479 entry->map_aligned &&
8480 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8481 /*
8482 * The entry will no longer be map-aligned
8483 * after clipping and the caller said it's OK.
8484 */
8485 entry->map_aligned = FALSE;
8486 }
8487 vm_map_clip_end(map, entry, end);
8488 }
8489
8490 if (entry->vme_permanent && entry->is_sub_map) {
8491 /*
8492 * We already went through step 2.1 which did not deny
8493 * the removal of this "permanent" and "is_sub_map"
8494 * entry.
8495 * Now that we've clipped what we actually want to
8496 * delete, undo the "permanent" part to allow the
8497 * removal to proceed.
8498 */
8499 DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8500 vm_map_entry_t, entry,
8501 vm_map_offset_t, entry->vme_start,
8502 vm_map_offset_t, entry->vme_end,
8503 vm_prot_t, entry->protection,
8504 vm_prot_t, entry->max_protection,
8505 int, VME_ALIAS(entry));
8506 entry->vme_permanent = false;
8507 }
8508
8509 assert(s == entry->vme_start);
8510 assert(entry->vme_end <= end);
8511
8512
8513 /*
8514 * Step 4: If the entry is in flux, wait for this to resolve.
8515 */
8516
8517 if (entry->in_transition) {
8518 wait_result_t wait_result;
8519
8520 in_transition:
8521 /*
8522 * Another thread is wiring/unwiring this entry.
8523 * Let the other thread know we are waiting.
8524 */
8525
8526 entry->needs_wakeup = TRUE;
8527
8528 /*
8529 * wake up anybody waiting on entries that we have
8530 * already unwired/deleted.
8531 */
8532 if (state & VMDS_NEEDS_WAKEUP) {
8533 vm_map_entry_wakeup(map);
8534 state &= ~VMDS_NEEDS_WAKEUP;
8535 }
8536
8537 wait_result = vm_map_entry_wait(map, interruptible);
8538
8539 if (interruptible &&
8540 wait_result == THREAD_INTERRUPTED) {
8541 /*
8542 * We do not clear the needs_wakeup flag,
8543 * since we cannot tell if we were the only one.
8544 */
8545 ret.kmr_return = KERN_ABORTED;
8546 return ret;
8547 }
8548
8549 /*
8550 * The entry could have been clipped or it
8551 * may not exist anymore. Look it up again.
8552 */
8553 state |= VMDS_NEEDS_LOOKUP;
8554 continue;
8555 }
8556
8557
8558 /*
8559 * Step 5: Handle wiring
8560 */
8561
8562 if (entry->wired_count) {
8563 struct vm_map_entry tmp_entry;
8564 boolean_t user_wire;
8565 unsigned int last_timestamp;
8566
8567 user_wire = entry->user_wired_count > 0;
8568
8569 /*
8570 * Remove a kernel wiring if requested
8571 */
8572 if (flags & VM_MAP_REMOVE_KUNWIRE) {
8573 entry->wired_count--;
8574 vme_btref_consider_and_put(entry);
8575 }
8576
8577 /*
8578 * Remove all user wirings for proper accounting
8579 */
8580 while (entry->user_wired_count) {
8581 subtract_wire_counts(map, entry, user_wire);
8582 }
8583
8584 /*
8585 * All our DMA I/O operations in IOKit are currently
8586 * done by wiring through the map entries of the task
8587 * requesting the I/O.
8588 *
8589 * Because of this, we must always wait for kernel wirings
8590 * to go away on the entries before deleting them.
8591 *
8592 * Any caller who wants to actually remove a kernel wiring
8593 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8594 * properly remove one wiring instead of blasting through
8595 * them all.
8596 */
8597 if (entry->wired_count != 0) {
8598 assert(map != kernel_map);
8599 /*
8600 * Cannot continue. Typical case is when
8601 * a user thread has physical I/O pending on
8602 * this page. Either wait for the
8603 * kernel wiring to go away or return an
8604 * error.
8605 */
8606 wait_result_t wait_result;
8607
8608 entry->needs_wakeup = TRUE;
8609 wait_result = vm_map_entry_wait(map,
8610 interruptible);
8611
8612 if (interruptible &&
8613 wait_result == THREAD_INTERRUPTED) {
8614 /*
8615 * We do not clear the
8616 * needs_wakeup flag, since we
8617 * cannot tell if we were the
8618 * only one.
8619 */
8620 ret.kmr_return = KERN_ABORTED;
8621 return ret;
8622 }
8623
8624
8625 /*
8626 * The entry could have been clipped or
8627 * it may not exist anymore. Look it
8628 * up again.
8629 */
8630 state |= VMDS_NEEDS_LOOKUP;
8631 continue;
8632 }
8633
8634 /*
8635 * We can unlock the map now.
8636 *
8637 * The entry might be split once we unlock the map,
8638 * but we need the range as defined by this entry
8639 * to be stable. So we must make a local copy.
8640 *
8641 * The underlying objects do not change during clips,
8642 * and the in_transition state guarantees existence
8643 * of the entry.
8644 */
8645 last_timestamp = map->timestamp;
8646 entry->in_transition = TRUE;
8647 tmp_entry = *entry;
8648 vm_map_unlock(map);
8649
8650 if (tmp_entry.is_sub_map) {
8651 vm_map_t sub_map;
8652 vm_map_offset_t sub_start, sub_end;
8653 pmap_t pmap;
8654 vm_map_offset_t pmap_addr;
8655
8656
8657 sub_map = VME_SUBMAP(&tmp_entry);
8658 sub_start = VME_OFFSET(&tmp_entry);
8659 sub_end = sub_start + (tmp_entry.vme_end -
8660 tmp_entry.vme_start);
8661 if (tmp_entry.use_pmap) {
8662 pmap = sub_map->pmap;
8663 pmap_addr = tmp_entry.vme_start;
8664 } else {
8665 pmap = map->pmap;
8666 pmap_addr = tmp_entry.vme_start;
8667 }
8668 (void) vm_map_unwire_nested(sub_map,
8669 sub_start, sub_end,
8670 user_wire,
8671 pmap, pmap_addr);
8672 } else {
8673 vm_map_offset_t entry_end = tmp_entry.vme_end;
8674 vm_map_offset_t max_end;
8675
8676 if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8677 max_end = end - VM_MAP_PAGE_SIZE(map);
8678 if (entry_end > max_end) {
8679 entry_end = max_end;
8680 }
8681 }
8682
8683 if (tmp_entry.vme_kernel_object) {
8684 pmap_protect_options(
8685 map->pmap,
8686 tmp_entry.vme_start,
8687 entry_end,
8688 VM_PROT_NONE,
8689 PMAP_OPTIONS_REMOVE,
8690 NULL);
8691 }
8692 vm_fault_unwire(map, &tmp_entry,
8693 tmp_entry.vme_kernel_object, map->pmap,
8694 tmp_entry.vme_start, entry_end);
8695 }
8696
8697 vm_map_lock(map);
8698
8699 /*
8700 * Unwiring happened, we can now go back to deleting
8701 * them (after we clear the in_transition bit for the range).
8702 */
8703 if (last_timestamp + 1 != map->timestamp) {
8704 state |= VMDS_NEEDS_LOOKUP;
8705 }
8706 clear_in_transition_end = tmp_entry.vme_end;
8707 continue;
8708 }
8709
8710 assert(entry->wired_count == 0);
8711 assert(entry->user_wired_count == 0);
8712
8713
8714 /*
8715 * Step 6: Entry is unwired and ready for us to delete !
8716 */
8717
8718 if (!entry->vme_permanent) {
8719 /*
8720 * Typical case: the entry really shouldn't be permanent
8721 */
8722 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8723 (entry->protection & VM_PROT_EXECUTE) &&
8724 developer_mode_state()) {
8725 /*
8726 * Allow debuggers to undo executable mappings
8727 * when developer mode is on.
8728 */
8729 #if 0
8730 printf("FBDP %d[%s] removing permanent executable entry "
8731 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8732 proc_selfpid(),
8733 (current_task()->bsd_info
8734 ? proc_name_address(current_task()->bsd_info)
8735 : "?"), entry,
8736 (uint64_t)entry->vme_start,
8737 (uint64_t)entry->vme_end,
8738 entry->protection,
8739 entry->max_protection);
8740 #endif
8741 entry->vme_permanent = FALSE;
8742 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8743 #if 0
8744 printf("FBDP %d[%s] removing permanent entry "
8745 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8746 proc_selfpid(),
8747 (current_task()->bsd_info
8748 ? proc_name_address(current_task()->bsd_info)
8749 : "?"), entry,
8750 (uint64_t)entry->vme_start,
8751 (uint64_t)entry->vme_end,
8752 entry->protection,
8753 entry->max_protection);
8754 #endif
8755 entry->vme_permanent = FALSE;
8756 #if CODE_SIGNING_MONITOR
8757 } else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8758 entry->vme_permanent = FALSE;
8759
8760 printf("%d[%s] %s(0x%llx,0x%llx): "
8761 "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8762 "prot 0x%x/0x%x\n",
8763 proc_selfpid(),
8764 (get_bsdtask_info(current_task())
8765 ? proc_name_address(get_bsdtask_info(current_task()))
8766 : "?"),
8767 __FUNCTION__,
8768 (uint64_t)start,
8769 (uint64_t)end,
8770 (uint64_t)entry->vme_start,
8771 (uint64_t)entry->vme_end,
8772 entry->protection,
8773 entry->max_protection);
8774 #endif
8775 } else {
8776 DTRACE_VM6(vm_map_delete_permanent,
8777 vm_map_entry_t, entry,
8778 vm_map_offset_t, entry->vme_start,
8779 vm_map_offset_t, entry->vme_end,
8780 vm_prot_t, entry->protection,
8781 vm_prot_t, entry->max_protection,
8782 int, VME_ALIAS(entry));
8783 }
8784
8785 if (entry->is_sub_map) {
8786 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8787 "map %p (%d) entry %p submap %p (%d)\n",
8788 map, VM_MAP_PAGE_SHIFT(map), entry,
8789 VME_SUBMAP(entry),
8790 VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8791 if (entry->use_pmap) {
8792 #ifndef NO_NESTED_PMAP
8793 int pmap_flags;
8794
8795 if (map->terminated) {
8796 /*
8797 * This is the final cleanup of the
8798 * address space being terminated.
8799 * No new mappings are expected and
8800 * we don't really need to unnest the
8801 * shared region (and lose the "global"
8802 * pmap mappings, if applicable).
8803 *
8804 * Tell the pmap layer that we're
8805 * "clean" wrt nesting.
8806 */
8807 pmap_flags = PMAP_UNNEST_CLEAN;
8808 } else {
8809 /*
8810 * We're unmapping part of the nested
8811 * shared region, so we can't keep the
8812 * nested pmap.
8813 */
8814 pmap_flags = 0;
8815 }
8816 pmap_unnest_options(
8817 map->pmap,
8818 (addr64_t)entry->vme_start,
8819 entry->vme_end - entry->vme_start,
8820 pmap_flags);
8821 #endif /* NO_NESTED_PMAP */
8822 if (map->mapped_in_other_pmaps &&
8823 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8824 /* clean up parent map/maps */
8825 vm_map_submap_pmap_clean(
8826 map, entry->vme_start,
8827 entry->vme_end,
8828 VME_SUBMAP(entry),
8829 VME_OFFSET(entry));
8830 }
8831 } else {
8832 vm_map_submap_pmap_clean(
8833 map, entry->vme_start, entry->vme_end,
8834 VME_SUBMAP(entry),
8835 VME_OFFSET(entry));
8836 }
8837 } else if (entry->vme_kernel_object ||
8838 VME_OBJECT(entry) == compressor_object) {
8839 /*
8840 * nothing to do
8841 */
8842 } else if (map->mapped_in_other_pmaps &&
8843 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8844 vm_object_pmap_protect_options(
8845 VME_OBJECT(entry), VME_OFFSET(entry),
8846 entry->vme_end - entry->vme_start,
8847 PMAP_NULL,
8848 PAGE_SIZE,
8849 entry->vme_start,
8850 VM_PROT_NONE,
8851 PMAP_OPTIONS_REMOVE);
8852 } else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8853 (state & VMDS_KERNEL_PMAP)) {
8854 /* Remove translations associated
8855 * with this range unless the entry
8856 * does not have an object, or
8857 * it's the kernel map or a descendant
8858 * since the platform could potentially
8859 * create "backdoor" mappings invisible
8860 * to the VM. It is expected that
8861 * objectless, non-kernel ranges
8862 * do not have such VM invisible
8863 * translations.
8864 */
8865 vm_map_address_t remove_start = entry->vme_start;
8866 vm_map_address_t remove_end = entry->vme_end;
8867 #if MACH_ASSERT
8868 /*
8869 * Prevent panics in pmap_remove() from some vm test code
8870 * which uses virtual address ranges that pmap disallows.
8871 */
8872 if (thread_get_test_option(test_option_vm_map_clamp_pmap_remove)) {
8873 vm_map_clamp_to_pmap(map, &remove_start, &remove_end);
8874 }
8875 #endif /* MACH_ASSERT */
8876 pmap_remove(map->pmap, remove_start, remove_end);
8877 }
8878
8879 #if DEBUG
8880 /*
8881 * All pmap mappings for this map entry must have been
8882 * cleared by now.
8883 */
8884 assert(pmap_is_empty(map->pmap,
8885 entry->vme_start,
8886 entry->vme_end));
8887 #endif /* DEBUG */
8888
8889 if (entry->iokit_acct) {
8890 /* alternate accounting */
8891 DTRACE_VM4(vm_map_iokit_unmapped_region,
8892 vm_map_t, map,
8893 vm_map_offset_t, entry->vme_start,
8894 vm_map_offset_t, entry->vme_end,
8895 int, VME_ALIAS(entry));
8896 vm_map_iokit_unmapped_region(map,
8897 (entry->vme_end -
8898 entry->vme_start));
8899 entry->iokit_acct = FALSE;
8900 entry->use_pmap = FALSE;
8901 }
8902
8903 /* move "s" forward */
8904 s = entry->vme_end;
8905 next = entry->vme_next;
8906 if (!entry->map_aligned) {
8907 vm_map_offset_t rounded_s;
8908
8909 /*
8910 * Skip artificial gap due to mis-aligned entry
8911 * on devices with a page size smaller than the
8912 * map's page size (e.g. a 16k task on a 4k device).
8913 */
8914 rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8915 if (next == vm_map_to_entry(map)) {
8916 s = rounded_s;
8917 } else if (s < rounded_s) {
8918 s = MIN(rounded_s, next->vme_start);
8919 }
8920 }
8921 ret.kmr_size += s - entry->vme_start;
8922
8923 if (entry->vme_permanent) {
8924 /*
8925 * A permanent entry cannot be removed, so leave it
8926 * in place but remove all access permissions.
8927 */
8928 if (__improbable(vm_log_map_delete_permanent_prot_none)) {
8929 printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8930 __FUNCTION__, __LINE__,
8931 proc_selfpid(),
8932 (get_bsdtask_info(current_task())
8933 ? proc_name_address(get_bsdtask_info(current_task()))
8934 : "?"),
8935 map,
8936 entry,
8937 (uint64_t)entry->vme_start,
8938 (uint64_t)entry->vme_end,
8939 entry->is_sub_map,
8940 entry->protection,
8941 entry->max_protection);
8942 }
8943 DTRACE_VM6(vm_map_delete_permanent_prot_none,
8944 vm_map_entry_t, entry,
8945 vm_map_offset_t, entry->vme_start,
8946 vm_map_offset_t, entry->vme_end,
8947 vm_prot_t, entry->protection,
8948 vm_prot_t, entry->max_protection,
8949 int, VME_ALIAS(entry));
8950 entry->protection = VM_PROT_NONE;
8951 entry->max_protection = VM_PROT_NONE;
8952 #ifdef __arm64e__
8953 entry->used_for_tpro = FALSE;
8954 #endif
8955 } else {
8956 vm_map_entry_zap(map, entry, zap_list);
8957 }
8958
8959 entry = next;
8960 next = VM_MAP_ENTRY_NULL;
8961
8962 if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8963 unsigned int last_timestamp = map->timestamp++;
8964
8965 if (lck_rw_lock_yield_exclusive(&map->lock,
8966 LCK_RW_YIELD_ANY_WAITER)) {
8967 if (last_timestamp != map->timestamp + 1) {
8968 state |= VMDS_NEEDS_LOOKUP;
8969 }
8970 } else {
8971 /* we didn't yield, undo our change */
8972 map->timestamp--;
8973 }
8974 }
8975 }
8976
8977 if (map->wait_for_space) {
8978 thread_wakeup((event_t) map);
8979 }
8980
8981 if (state & VMDS_NEEDS_WAKEUP) {
8982 vm_map_entry_wakeup(map);
8983 }
8984
8985 out:
8986 if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
8987 __vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8988 }
8989
8990 if (state & VMDS_KERNEL_KMEMPTR) {
8991 kmem_free_space(start, end, range_id, &slot);
8992 }
8993
8994 if (state & VMDS_FOUND_GAP) {
8995 DTRACE_VM3(kern_vm_deallocate_gap,
8996 vm_map_offset_t, gap_start,
8997 vm_map_offset_t, save_start,
8998 vm_map_offset_t, save_end);
8999 if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
9000 ret.kmr_return = KERN_INVALID_VALUE;
9001 } else {
9002 vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
9003 }
9004 }
9005
9006 return ret;
9007 }
9008
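/*
 * vm_map_remove_and_unlock:
 *
 *	Called with "map" locked. Performs the deletion, unlocks the map,
 *	and then disposes of the zapped entries outside the map lock.
 */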
9009 kmem_return_t
9010 vm_map_remove_and_unlock(
9011 vm_map_t map,
9012 vm_map_offset_t start,
9013 vm_map_offset_t end,
9014 vmr_flags_t flags,
9015 kmem_guard_t guard)
9016 {
9017 kmem_return_t ret;
9018 VM_MAP_ZAP_DECLARE(zap);
9019
9020 ret = vm_map_delete(map, start, end, flags, guard, &zap);
9021 vm_map_unlock(map);
9022
9023 vm_map_zap_dispose(&zap);
9024
9025 return ret;
9026 }
9027
9028 /*
9029 * vm_map_remove_guard:
9030 *
9031 * Remove the given address range from the target map.
9032 * This is the exported form of vm_map_delete.
9033 */
9034 kmem_return_t
9035 vm_map_remove_guard(
9036 vm_map_t map,
9037 vm_map_offset_t start,
9038 vm_map_offset_t end,
9039 vmr_flags_t flags,
9040 kmem_guard_t guard)
9041 {
9042 vm_map_lock(map);
9043 return vm_map_remove_and_unlock(map, start, end, flags, guard);
9044 }
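/*
 * Illustrative sketch (hypothetical caller): removing a range with no
 * special flags and no guard. VM_MAP_REMOVE_NO_FLAGS and KMEM_GUARD_NONE
 * are the same arguments used by vm_map_terminate() below; "map", "start"
 * and "end" are assumed to come from the caller.
 *
 *	kmem_return_t kmr;
 *
 *	kmr = vm_map_remove_guard(map, start, end,
 *	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
 *	if (kmr.kmr_return != KERN_SUCCESS) {
 *		... handle KERN_ABORTED, KERN_PROTECTION_FAILURE, ...
 *	}
 */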
9045
9046
9047 /*
9048 * vm_map_setup:
9049 *
9050 * Perform any required setup on a new task's map. Must be called before the task
9051 * is enabled for IPC access, since after this point other threads may be able
9052 * to look up the task port and make VM API calls.
9053 */
9054 void
9055 vm_map_setup(vm_map_t map, task_t task)
9056 {
9057 /*
9058 * map does NOT take a reference on owning_task. If the map has terminated,
9059 * it is possible that the pointer is NULL, so reads of owning_task must
9060 * happen under the map lock and explicitly check for NULL.
9061 */
9062 vm_map_lock(map);
9063 assert(!map->owning_task);
9064 map->owning_task = task;
9065 vm_map_unlock(map);
9066 #if CONFIG_DEFERRED_RECLAIM
9067 vm_deferred_reclamation_metadata_t vdrm = task->deferred_reclamation_metadata;
9068 if (vdrm) {
9069 vm_deferred_reclamation_task_fork_register(vdrm);
9070 }
9071 #endif /* CONFIG_DEFERRED_RECLAIM */
9072 }
9073
9074 /*
9075 * vm_map_terminate:
9076 *
9077 * Clean out a task's map.
9078 */
9079 kern_return_t
9080 vm_map_terminate(
9081 vm_map_t map)
9082 {
9083 vm_map_lock(map);
9084 map->terminated = TRUE;
9085 map->owning_task = NULL;
9086 vm_map_disable_hole_optimization(map);
9087 (void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
9088 VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
9089 return KERN_SUCCESS;
9090 }
9091
9092 /*
9093 * Routine: vm_map_copy_allocate
9094 *
9095 * Description:
9096 * Allocates and initializes a map copy object.
9097 */
9098 static vm_map_copy_t
9099 vm_map_copy_allocate(uint16_t type)
9100 {
9101 vm_map_copy_t new_copy;
9102
9103 new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
9104 new_copy->type = type;
9105 if (type == VM_MAP_COPY_ENTRY_LIST) {
9106 new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
9107 vm_map_store_init(&new_copy->cpy_hdr);
9108 }
9109 return new_copy;
9110 }
9111
9112 /*
9113 * Routine: vm_map_copy_discard
9114 *
9115 * Description:
9116 * Dispose of a map copy object (returned by
9117 * vm_map_copyin).
9118 */
9119 void
9120 vm_map_copy_discard(
9121 vm_map_copy_t copy)
9122 {
9123 if (copy == VM_MAP_COPY_NULL) {
9124 return;
9125 }
9126
9127 /*
9128 * Assert that the vm_map_copy is coming from the right
9129 * zone and hasn't been forged
9130 */
9131 vm_map_copy_require(copy);
9132
9133 switch (copy->type) {
9134 case VM_MAP_COPY_ENTRY_LIST:
9135 while (vm_map_copy_first_entry(copy) !=
9136 vm_map_copy_to_entry(copy)) {
9137 vm_map_entry_t entry = vm_map_copy_first_entry(copy);
9138
9139 vm_map_copy_entry_unlink(copy, entry);
9140 if (entry->is_sub_map) {
9141 vm_map_deallocate(VME_SUBMAP(entry));
9142 } else {
9143 vm_object_deallocate(VME_OBJECT(entry));
9144 }
9145 vm_map_copy_entry_dispose(entry);
9146 }
9147 break;
9148 case VM_MAP_COPY_KERNEL_BUFFER:
9149
9150 /*
9151 * The data buffer of a VM_MAP_COPY_KERNEL_BUFFER copy was
9152 * allocated separately with kalloc_data(); free it here before
9153 * the copy structure itself is returned to its zone below.
9154 */
9155 if (copy->size > msg_ool_size_small || copy->offset) {
9156 panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
9157 (long long)copy->size, (long long)copy->offset);
9158 }
9159 kfree_data(copy->cpy_kdata, copy->size);
9160 }
9161 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
9162 }
9163
9164 #if XNU_PLATFORM_MacOSX
9165
9166 __exported
9167 extern vm_map_copy_t vm_map_copy_copy(vm_map_copy_t copy);
9168
9169 /*
9170 * Routine: vm_map_copy_copy
9171 *
9172 * Description:
9173 * Move the information in a map copy object to
9174 * a new map copy object, leaving the old one
9175 * empty.
9176 *
9177 * This is used by kernel routines that need
9178 * to look at out-of-line data (in copyin form)
9179 * before deciding whether to return SUCCESS.
9180 * If the routine returns FAILURE, the original
9181 * copy object will be deallocated; therefore,
9182 * these routines must make a copy of the copy
9183 * object and leave the original empty so that
9184 * deallocation will not fail.
9185 */
9186 vm_map_copy_t
9187 vm_map_copy_copy(
9188 vm_map_copy_t copy)
9189 {
9190 vm_map_copy_t new_copy;
9191
9192 if (copy == VM_MAP_COPY_NULL) {
9193 return VM_MAP_COPY_NULL;
9194 }
9195
9196 /*
9197 * Assert that the vm_map_copy is coming from the right
9198 * zone and hasn't been forged
9199 */
9200 vm_map_copy_require(copy);
9201
9202 /*
9203 * Allocate a new copy object, and copy the information
9204 * from the old one into it.
9205 */
9206
9207 new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9208 memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
9209 #if __has_feature(ptrauth_calls)
9210 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9211 new_copy->cpy_kdata = copy->cpy_kdata;
9212 }
9213 #endif
9214
9215 if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
9216 /*
9217 * The links in the entry chain must be
9218 * changed to point to the new copy object.
9219 */
9220 vm_map_copy_first_entry(copy)->vme_prev
9221 = vm_map_copy_to_entry(new_copy);
9222 vm_map_copy_last_entry(copy)->vme_next
9223 = vm_map_copy_to_entry(new_copy);
9224 }
9225
9226 /*
9227 * Change the old copy object into one that contains
9228 * nothing to be deallocated.
9229 */
9230 bzero(copy, sizeof(struct vm_map_copy));
9231 copy->type = VM_MAP_COPY_KERNEL_BUFFER;
9232
9233 /*
9234 * Return the new object.
9235 */
9236 return new_copy;
9237 }
9238
9239 #endif /* XNU_PLATFORM_MacOSX */
9240
9241 static boolean_t
9242 vm_map_entry_is_overwritable(
9243 vm_map_t dst_map __unused,
9244 vm_map_entry_t entry)
9245 {
9246 if (!(entry->protection & VM_PROT_WRITE)) {
9247 /* can't overwrite if not writable */
9248 return FALSE;
9249 }
9250 #if !__x86_64__
9251 if (entry->used_for_jit &&
9252 vm_map_cs_enforcement(dst_map) &&
9253 !dst_map->cs_debugged) {
9254 /*
9255 * Can't overwrite a JIT region while cs_enforced
9256 * and not cs_debugged.
9257 */
9258 return FALSE;
9259 }
9260
9261 #if __arm64e__
9262 /* Do not allow overwrite HW assisted TPRO entries */
9263 if (entry->used_for_tpro) {
9264 return FALSE;
9265 }
9266 #endif /* __arm64e__ */
9267
9268 if (entry->vme_permanent) {
9269 if (entry->is_sub_map) {
9270 /*
9271 * We can't tell if the submap contains "permanent"
9272 * entries within the range targeted by the caller.
9273 * The caller will have to check for that with
9274 * vm_map_overwrite_submap_recurse() for example.
9275 */
9276 } else {
9277 /*
9278 * Do not allow overwriting of a "permanent"
9279 * entry.
9280 */
9281 DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
9282 vm_map_entry_t, entry,
9283 vm_map_offset_t, entry->vme_start,
9284 vm_map_offset_t, entry->vme_end,
9285 vm_prot_t, entry->protection,
9286 vm_prot_t, entry->max_protection,
9287 int, VME_ALIAS(entry));
9288 return FALSE;
9289 }
9290 }
9291 #endif /* !__x86_64__ */
9292
9293 if (entry->is_sub_map) {
9294 /* remember not to assume every entry has a VM object... */
9295 }
9296
9297
9298 return TRUE;
9299 }
9300
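/*
 * Routine:	vm_map_overwrite_submap_recurse
 *
 * Description:
 *	Verify that the range [dst_addr, dst_addr + dst_size) of the
 *	given (sub)map can be overwritten: every entry covering the range
 *	must be writable and overwritable, the range must be contiguous,
 *	and any nested submaps are checked recursively.  Returns
 *	KERN_SUCCESS if the entire range qualifies.
 */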
9301 static kern_return_t
9302 vm_map_overwrite_submap_recurse(
9303 vm_map_t dst_map,
9304 vm_map_offset_t dst_addr,
9305 vm_map_size_t dst_size)
9306 {
9307 vm_map_offset_t dst_end;
9308 vm_map_entry_t tmp_entry;
9309 vm_map_entry_t entry;
9310 kern_return_t result;
9311 boolean_t encountered_sub_map = FALSE;
9312
9313
9314
9315 /*
9316 * Verify that the destination is all writeable
9317 * initially. We have to trunc the destination
9318 * address and round the copy size or we'll end up
9319 * splitting entries in strange ways.
9320 */
9321
9322 dst_end = vm_map_round_page(dst_addr + dst_size,
9323 VM_MAP_PAGE_MASK(dst_map));
9324 vm_map_lock(dst_map);
9325
9326 start_pass_1:
9327 if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9328 vm_map_unlock(dst_map);
9329 return KERN_INVALID_ADDRESS;
9330 }
9331
9332 vm_map_clip_start(dst_map,
9333 tmp_entry,
9334 vm_map_trunc_page(dst_addr,
9335 VM_MAP_PAGE_MASK(dst_map)));
9336 if (tmp_entry->is_sub_map) {
9337 /* clipping did unnest if needed */
9338 assert(!tmp_entry->use_pmap);
9339 }
9340
9341 for (entry = tmp_entry;;) {
9342 vm_map_entry_t next;
9343
9344 next = entry->vme_next;
9345 while (entry->is_sub_map) {
9346 vm_map_offset_t sub_start;
9347 vm_map_offset_t sub_end;
9348 vm_map_offset_t local_end;
9349 vm_map_t sub_map;
9350
9351 if (entry->in_transition) {
9352 /*
9353 * Say that we are waiting, and wait for entry.
9354 */
9355 entry->needs_wakeup = TRUE;
9356 vm_map_entry_wait(dst_map, THREAD_UNINT);
9357
9358 goto start_pass_1;
9359 }
9360
9361 encountered_sub_map = TRUE;
9362 sub_start = VME_OFFSET(entry);
9363
9364 if (entry->vme_end < dst_end) {
9365 sub_end = entry->vme_end;
9366 } else {
9367 sub_end = dst_end;
9368 }
9369 sub_end -= entry->vme_start;
9370 sub_end += VME_OFFSET(entry);
9371 local_end = entry->vme_end;
9372 sub_map = VME_SUBMAP(entry);
9373 vm_map_reference(sub_map);
9374 vm_map_unlock(dst_map);
9375
9376 result = vm_map_overwrite_submap_recurse(
9377 sub_map,
9378 sub_start,
9379 sub_end - sub_start);
9380
9381 vm_map_deallocate(sub_map);
9382 sub_map = VM_MAP_NULL;
9383
9384 if (result != KERN_SUCCESS) {
9385 return result;
9386 }
9387 if (dst_end <= entry->vme_end) {
9388 return KERN_SUCCESS;
9389 }
9390 vm_map_lock(dst_map);
9391 if (!vm_map_lookup_entry(dst_map, local_end,
9392 &tmp_entry)) {
9393 vm_map_unlock(dst_map);
9394 return KERN_INVALID_ADDRESS;
9395 }
9396 entry = tmp_entry;
9397 next = entry->vme_next;
9398 }
9399 assert(!entry->is_sub_map);
9400
9401 if (!(entry->protection & VM_PROT_WRITE)) {
9402 vm_map_unlock(dst_map);
9403 return KERN_PROTECTION_FAILURE;
9404 }
9405
9406 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9407 vm_map_unlock(dst_map);
9408 return KERN_PROTECTION_FAILURE;
9409 }
9410
9411 /*
9412 * If the entry is in transition, we must wait
9413 * for it to exit that state. Anything could happen
9414 * when we unlock the map, so start over.
9415 */
9416 if (entry->in_transition) {
9417 /*
9418 * Say that we are waiting, and wait for entry.
9419 */
9420 entry->needs_wakeup = TRUE;
9421 vm_map_entry_wait(dst_map, THREAD_UNINT);
9422
9423 goto start_pass_1;
9424 }
9425
9426 /*
9427 * our range is contained completely within this map entry
9428 */
9429 if (dst_end <= entry->vme_end) {
9430 vm_map_unlock(dst_map);
9431 return KERN_SUCCESS;
9432 }
9433 /*
9434 * check that range specified is contiguous region
9435 */
9436 if ((next == vm_map_to_entry(dst_map)) ||
9437 (next->vme_start != entry->vme_end)) {
9438 vm_map_unlock(dst_map);
9439 return KERN_INVALID_ADDRESS;
9440 }
9441
9442 /*
9443 * Check for permanent objects in the destination.
9444 */
9445 assert(!entry->is_sub_map);
9446 if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9447 ((!VME_OBJECT(entry)->internal) ||
9448 (VME_OBJECT(entry)->true_share))) {
9449 if (encountered_sub_map) {
9450 vm_map_unlock(dst_map);
9451 return KERN_FAILURE;
9452 }
9453 }
9454
9455
9456 entry = next;
9457 }/* for */
9458 vm_map_unlock(dst_map);
9459 return KERN_SUCCESS;
9460 }
9461
9462 /*
9463 * Routine: vm_map_copy_overwrite
9464 *
9465 * Description:
9466 * Copy the memory described by the map copy
9467 * object (copy; returned by vm_map_copyin) onto
9468 * the specified destination region (dst_map, dst_addr).
9469 * The destination must be writeable.
9470 *
9471 * Unlike vm_map_copyout, this routine actually
9472 * writes over previously-mapped memory. If the
9473 * previous mapping was to a permanent (user-supplied)
9474 * memory object, it is preserved.
9475 *
9476 * The attributes (protection and inheritance) of the
9477 * destination region are preserved.
9478 *
9479 * If successful, consumes the copy object.
9480 * Otherwise, the caller is responsible for it.
9481 *
9482 * Implementation notes:
9483 * To overwrite aligned temporary virtual memory, it is
9484 * sufficient to remove the previous mapping and insert
9485 * the new copy. This replacement is done either on
9486 * the whole region (if no permanent virtual memory
9487 * objects are embedded in the destination region) or
9488 * in individual map entries.
9489 *
9490 * To overwrite permanent virtual memory, it is necessary
9491 * to copy each page, as the external memory management
9492 * interface currently does not provide any optimizations.
9493 *
9494 * Unaligned memory also has to be copied. It is possible
9495 * to use 'vm_trickery' to copy the aligned data. This is
9496 * not done but not hard to implement.
9497 *
9498 * Once a page of permanent memory has been overwritten,
9499 * it is impossible to interrupt this function; otherwise,
9500 * the call would be neither atomic nor location-independent.
9501 * The kernel-state portion of a user thread must be
9502 * interruptible.
9503 *
9504 * It may be expensive to forward all requests that might
9505 * overwrite permanent memory (vm_write, vm_copy) to
9506 * uninterruptible kernel threads. This routine may be
9507 * called by interruptible threads; however, success is
9508 * not guaranteed -- if the request cannot be performed
9509 * atomically and interruptibly, an error indication is
9510 * returned.
9511 *
9512 * Callers of this function must call vm_map_copy_require on
9513 * previously created vm_map_copy_t or pass a newly created
9514 * one to ensure that it hasn't been forged.
9515 */
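/*
 * Example (a minimal sketch of the caller-visible pattern, using the
 * public vm_map_copy_overwrite() wrapper defined further below; the
 * copy object is only consumed on success, so the failure path must
 * discard it explicitly):
 *
 *	vm_map_copy_t copy;
 *	kern_return_t kr;
 *
 *	kr = vm_map_copyin(src_map, src_addr, size, FALSE, &copy);
 *	if (kr == KERN_SUCCESS) {
 *		kr = vm_map_copy_overwrite(dst_map, dst_addr, copy,
 *		    size, FALSE);
 *		if (kr != KERN_SUCCESS) {
 *			vm_map_copy_discard(copy);  // not consumed on failure
 *		}
 *	}
 */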
9516 static kern_return_t
9517 vm_map_copy_overwrite_nested(
9518 vm_map_t dst_map,
9519 vm_map_address_t dst_addr,
9520 vm_map_copy_t copy,
9521 boolean_t interruptible,
9522 pmap_t pmap,
9523 boolean_t discard_on_success)
9524 {
9525 vm_map_offset_t dst_end;
9526 vm_map_entry_t tmp_entry;
9527 vm_map_entry_t entry;
9528 kern_return_t kr;
9529 boolean_t aligned = TRUE;
9530 boolean_t contains_permanent_objects = FALSE;
9531 boolean_t encountered_sub_map = FALSE;
9532 vm_map_offset_t base_addr;
9533 vm_map_size_t copy_size;
9534 vm_map_size_t total_size;
9535 uint16_t copy_page_shift;
9536
9537 /*
9538 * Check for special kernel buffer allocated
9539 * by new_ipc_kmsg_copyin.
9540 */
9541
9542 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9543 kr = vm_map_copyout_kernel_buffer(
9544 dst_map, &dst_addr,
9545 copy, copy->size, TRUE, discard_on_success);
9546 return kr;
9547 }
9548
9549 /*
9550 * Only works for entry lists at the moment. Will
9551 * support page lists later.
9552 */
9553
9554 assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9555
9556 if (copy->size == 0) {
9557 if (discard_on_success) {
9558 vm_map_copy_discard(copy);
9559 }
9560 return KERN_SUCCESS;
9561 }
9562
9563 copy_page_shift = copy->cpy_hdr.page_shift;
9564
9565 /*
9566 * Verify that the destination is all writeable
9567 * initially. We have to trunc the destination
9568 * address and round the copy size or we'll end up
9569 * splitting entries in strange ways.
9570 */
9571
9572 if (!VM_MAP_PAGE_ALIGNED(copy->size,
9573 VM_MAP_PAGE_MASK(dst_map)) ||
9574 !VM_MAP_PAGE_ALIGNED(copy->offset,
9575 VM_MAP_PAGE_MASK(dst_map)) ||
9576 !VM_MAP_PAGE_ALIGNED(dst_addr,
9577 VM_MAP_PAGE_MASK(dst_map)) ||
9578 copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9579 aligned = FALSE;
9580 dst_end = vm_map_round_page(dst_addr + copy->size,
9581 VM_MAP_PAGE_MASK(dst_map));
9582 } else {
9583 dst_end = dst_addr + copy->size;
9584 }
9585
9586 vm_map_lock(dst_map);
9587
9588 /* LP64todo - remove this check when vm_map_commpage64()
9589 * no longer has to stuff in a map_entry for the commpage
9590 * above the map's max_offset.
9591 */
9592 if (dst_addr >= dst_map->max_offset) {
9593 vm_map_unlock(dst_map);
9594 return KERN_INVALID_ADDRESS;
9595 }
9596
9597 start_pass_1:
9598 if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9599 vm_map_unlock(dst_map);
9600 return KERN_INVALID_ADDRESS;
9601 }
9602 vm_map_clip_start(dst_map,
9603 tmp_entry,
9604 vm_map_trunc_page(dst_addr,
9605 VM_MAP_PAGE_MASK(dst_map)));
9606 for (entry = tmp_entry;;) {
9607 vm_map_entry_t next = entry->vme_next;
9608
9609 while (entry->is_sub_map) {
9610 vm_map_offset_t sub_start;
9611 vm_map_offset_t sub_end;
9612 vm_map_offset_t local_end;
9613
9614 if (entry->in_transition) {
9615 /*
9616 * Say that we are waiting, and wait for entry.
9617 */
9618 entry->needs_wakeup = TRUE;
9619 vm_map_entry_wait(dst_map, THREAD_UNINT);
9620
9621 goto start_pass_1;
9622 }
9623
9624 local_end = entry->vme_end;
9625 if (!(entry->needs_copy)) {
9626 vm_map_t sub_map = VM_MAP_NULL;
9627
9628 /* if needs_copy is set, we are a COW submap; */
9629 /* in such a case we just replace it, so */
9630 /* there is no need for the */
9631 /* following check. */
9632 encountered_sub_map = TRUE;
9633 sub_start = VME_OFFSET(entry);
9634
9635 if (entry->vme_end < dst_end) {
9636 sub_end = entry->vme_end;
9637 } else {
9638 sub_end = dst_end;
9639 }
9640 sub_end -= entry->vme_start;
9641 sub_end += VME_OFFSET(entry);
9642 sub_map = VME_SUBMAP(entry);
9643 vm_map_reference(sub_map);
9644 vm_map_unlock(dst_map);
9645
9646 kr = vm_map_overwrite_submap_recurse(
9647 sub_map,
9648 sub_start,
9649 sub_end - sub_start);
9650
9651 vm_map_deallocate(sub_map);
9652 sub_map = VM_MAP_NULL;
9653 if (kr != KERN_SUCCESS) {
9654 return kr;
9655 }
9656 vm_map_lock(dst_map);
9657 }
9658
9659 if (dst_end <= entry->vme_end) {
9660 goto start_overwrite;
9661 }
9662 if (!vm_map_lookup_entry(dst_map, local_end,
9663 &entry)) {
9664 vm_map_unlock(dst_map);
9665 return KERN_INVALID_ADDRESS;
9666 }
9667 next = entry->vme_next;
9668 }
9669 assert(!entry->is_sub_map);
9670
9671 if (!(entry->protection & VM_PROT_WRITE)) {
9672 vm_map_unlock(dst_map);
9673 return KERN_PROTECTION_FAILURE;
9674 }
9675
9676 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9677 vm_map_unlock(dst_map);
9678 return KERN_PROTECTION_FAILURE;
9679 }
9680
9681 /*
9682 * If the entry is in transition, we must wait
9683 * for it to exit that state. Anything could happen
9684 * when we unlock the map, so start over.
9685 */
9686 if (entry->in_transition) {
9687 /*
9688 * Say that we are waiting, and wait for entry.
9689 */
9690 entry->needs_wakeup = TRUE;
9691 vm_map_entry_wait(dst_map, THREAD_UNINT);
9692
9693 goto start_pass_1;
9694 }
9695
9696 /*
9697 * our range is contained completely within this map entry
9698 */
9699 if (dst_end <= entry->vme_end) {
9700 break;
9701 }
9702 /*
9703 * check that range specified is contiguous region
9704 */
9705 if ((next == vm_map_to_entry(dst_map)) ||
9706 (next->vme_start != entry->vme_end)) {
9707 vm_map_unlock(dst_map);
9708 return KERN_INVALID_ADDRESS;
9709 }
9710
9711
9712 /*
9713 * Check for permanent objects in the destination.
9714 */
9715 assert(!entry->is_sub_map);
9716 if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9717 ((!VME_OBJECT(entry)->internal) ||
9718 (VME_OBJECT(entry)->true_share))) {
9719 contains_permanent_objects = TRUE;
9720 }
9721
9722 entry = next;
9723 }/* for */
9724
9725 start_overwrite:
9726 /*
9727 * If there are permanent objects in the destination, then
9728 * the copy cannot be interrupted.
9729 */
9730
9731 if (interruptible && contains_permanent_objects) {
9732 vm_map_unlock(dst_map);
9733 return KERN_FAILURE; /* XXX */
9734 }
9735
9736 /*
9737 *
9738 * Make a second pass, overwriting the data
9739 * At the beginning of each loop iteration,
9740 * the next entry to be overwritten is "tmp_entry"
9741 * (initially, the value returned from the lookup above),
9742 * and the starting address expected in that entry
9743 * is "start".
9744 */
9745
9746 total_size = copy->size;
9747 if (encountered_sub_map) {
9748 copy_size = 0;
9749 /* re-calculate tmp_entry since we've had the map */
9750 /* unlocked */
9751 if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9752 vm_map_unlock(dst_map);
9753 return KERN_INVALID_ADDRESS;
9754 }
9755 } else {
9756 copy_size = copy->size;
9757 }
9758
9759 base_addr = dst_addr;
9760 while (TRUE) {
9761 /* deconstruct the copy object and do in parts */
9762 /* only in the sub_map, interruptible case */
9763 vm_map_entry_t copy_entry;
9764 vm_map_entry_t previous_prev = VM_MAP_ENTRY_NULL;
9765 vm_map_entry_t next_copy = VM_MAP_ENTRY_NULL;
9766 int nentries;
9767 int remaining_entries = 0;
9768 vm_map_offset_t new_offset = 0;
9769
9770 for (entry = tmp_entry; copy_size == 0;) {
9771 vm_map_entry_t next;
9772
9773 next = entry->vme_next;
9774
9775 /* tmp_entry and the base address are moved along */
9776 /* each time we encounter a sub-map. Otherwise, */
9777 /* entry can outpace tmp_entry, and copy_size */
9778 /* may reflect the distance between them. */
9779 /* If the current entry is found to be in transition, */
9780 /* we will start over at the beginning or at the last */
9781 /* encountered submap, as dictated by base_addr, */
9782 /* and we will zero copy_size accordingly. */
9783 if (entry->in_transition) {
9784 /*
9785 * Say that we are waiting, and wait for entry.
9786 */
9787 entry->needs_wakeup = TRUE;
9788 vm_map_entry_wait(dst_map, THREAD_UNINT);
9789
9790 if (!vm_map_lookup_entry(dst_map, base_addr,
9791 &tmp_entry)) {
9792 vm_map_unlock(dst_map);
9793 return KERN_INVALID_ADDRESS;
9794 }
9795 copy_size = 0;
9796 entry = tmp_entry;
9797 continue;
9798 }
9799 if (entry->is_sub_map) {
9800 vm_map_offset_t sub_start;
9801 vm_map_offset_t sub_end;
9802 vm_map_offset_t local_end;
9803 vm_map_t sub_map = VM_MAP_NULL;
9804 bool use_pmap;
9805
9806 if (entry->needs_copy) {
9807 /* if this is a COW submap, */
9808 /* just back the range with an */
9809 /* anonymous entry */
9810 assert(!entry->vme_permanent);
9811 if (entry->vme_end < dst_end) {
9812 sub_end = entry->vme_end;
9813 } else {
9814 sub_end = dst_end;
9815 }
9816 if (entry->vme_start < base_addr) {
9817 sub_start = base_addr;
9818 } else {
9819 sub_start = entry->vme_start;
9820 }
9821 vm_map_clip_end(
9822 dst_map, entry, sub_end);
9823 vm_map_clip_start(
9824 dst_map, entry, sub_start);
9825 assert(!entry->use_pmap);
9826 assert(!entry->iokit_acct);
9827 entry->use_pmap = TRUE;
9828 vm_map_deallocate(VME_SUBMAP(entry));
9829 assert(!entry->vme_permanent);
9830 VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9831 VME_OFFSET_SET(entry, 0);
9832 entry->is_shared = FALSE;
9833 entry->needs_copy = FALSE;
9834 entry->protection = VM_PROT_DEFAULT;
9835 entry->max_protection = VM_PROT_ALL;
9836 entry->wired_count = 0;
9837 entry->user_wired_count = 0;
9838 if (entry->inheritance
9839 == VM_INHERIT_SHARE) {
9840 entry->inheritance = VM_INHERIT_COPY;
9841 }
9842 continue;
9843 }
9844 /* first take care of any non-sub_map */
9845 /* entries to send */
9846 if (base_addr < entry->vme_start) {
9847 /* stuff to send */
9848 copy_size =
9849 entry->vme_start - base_addr;
9850 break;
9851 }
9852 sub_start = VME_OFFSET(entry);
9853
9854 if (entry->vme_end < dst_end) {
9855 sub_end = entry->vme_end;
9856 } else {
9857 sub_end = dst_end;
9858 }
9859 sub_end -= entry->vme_start;
9860 sub_end += VME_OFFSET(entry);
9861 local_end = entry->vme_end;
9862 use_pmap = entry->use_pmap;
9863 sub_map = VME_SUBMAP(entry);
9864 vm_map_reference(sub_map);
9865 vm_map_unlock(dst_map);
9866 copy_size = sub_end - sub_start;
9867
9868 /* adjust the copy object */
9869 if (total_size > copy_size) {
9870 vm_map_size_t local_size = 0;
9871 vm_map_size_t entry_size;
9872
9873 nentries = 1;
9874 new_offset = copy->offset;
9875 copy_entry = vm_map_copy_first_entry(copy);
9876 while (copy_entry !=
9877 vm_map_copy_to_entry(copy)) {
9878 entry_size = copy_entry->vme_end -
9879 copy_entry->vme_start;
9880 if ((local_size < copy_size) &&
9881 ((local_size + entry_size)
9882 >= copy_size)) {
9883 vm_map_copy_clip_end(copy,
9884 copy_entry,
9885 copy_entry->vme_start +
9886 (copy_size - local_size));
9887 entry_size = copy_entry->vme_end -
9888 copy_entry->vme_start;
9889 local_size += entry_size;
9890 new_offset += entry_size;
9891 }
9892 if (local_size >= copy_size) {
9893 next_copy = copy_entry->vme_next;
9894 copy_entry->vme_next =
9895 vm_map_copy_to_entry(copy);
9896 previous_prev =
9897 copy->cpy_hdr.links.prev;
9898 copy->cpy_hdr.links.prev = copy_entry;
9899 copy->size = copy_size;
9900 remaining_entries =
9901 copy->cpy_hdr.nentries;
9902 remaining_entries -= nentries;
9903 copy->cpy_hdr.nentries = nentries;
9904 break;
9905 } else {
9906 local_size += entry_size;
9907 new_offset += entry_size;
9908 nentries++;
9909 }
9910 copy_entry = copy_entry->vme_next;
9911 }
9912 }
9913
9914 if ((use_pmap) && (pmap == NULL)) {
9915 kr = vm_map_copy_overwrite_nested(
9916 sub_map,
9917 sub_start,
9918 copy,
9919 interruptible,
9920 sub_map->pmap,
9921 TRUE);
9922 } else if (pmap != NULL) {
9923 kr = vm_map_copy_overwrite_nested(
9924 sub_map,
9925 sub_start,
9926 copy,
9927 interruptible, pmap,
9928 TRUE);
9929 } else {
9930 kr = vm_map_copy_overwrite_nested(
9931 sub_map,
9932 sub_start,
9933 copy,
9934 interruptible,
9935 dst_map->pmap,
9936 TRUE);
9937 }
9938
9939 vm_map_deallocate(sub_map);
9940 sub_map = VM_MAP_NULL;
9941
9942 if (kr != KERN_SUCCESS) {
9943 if (next_copy != NULL) {
9944 copy->cpy_hdr.nentries +=
9945 remaining_entries;
9946 copy->cpy_hdr.links.prev->vme_next =
9947 next_copy;
9948 copy->cpy_hdr.links.prev
9949 = previous_prev;
9950 copy->size = total_size;
9951 }
9952 return kr;
9953 }
9954 if (dst_end <= local_end) {
9955 return KERN_SUCCESS;
9956 }
9957 /* otherwise copy no longer exists, it was */
9958 /* destroyed after successful copy_overwrite */
9959 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9960 copy->offset = new_offset;
9961 copy->cpy_hdr.page_shift = copy_page_shift;
9962
9963 total_size -= copy_size;
9964 copy_size = 0;
9965 /* put back remainder of copy in container */
9966 if (next_copy != NULL) {
9967 copy->cpy_hdr.nentries = remaining_entries;
9968 copy->cpy_hdr.links.next = next_copy;
9969 copy->cpy_hdr.links.prev = previous_prev;
9970 copy->size = total_size;
9971 next_copy->vme_prev =
9972 vm_map_copy_to_entry(copy);
9973 next_copy = NULL;
9974 }
9975 base_addr = local_end;
9976 vm_map_lock(dst_map);
9977 if (!vm_map_lookup_entry(dst_map,
9978 local_end, &tmp_entry)) {
9979 vm_map_unlock(dst_map);
9980 return KERN_INVALID_ADDRESS;
9981 }
9982 entry = tmp_entry;
9983 continue;
9984 }
9985 assert(!entry->is_sub_map);
9986
9987 if (dst_end <= entry->vme_end) {
9988 copy_size = dst_end - base_addr;
9989 break;
9990 }
9991
9992 if ((next == vm_map_to_entry(dst_map)) ||
9993 (next->vme_start != entry->vme_end)) {
9994 vm_map_unlock(dst_map);
9995 return KERN_INVALID_ADDRESS;
9996 }
9997
9998 entry = next;
9999 }/* for */
10000
10001 next_copy = NULL;
10002 nentries = 1;
10003
10004 /* adjust the copy object */
10005 if (total_size > copy_size) {
10006 vm_map_size_t local_size = 0;
10007 vm_map_size_t entry_size;
10008
10009 new_offset = copy->offset;
10010 copy_entry = vm_map_copy_first_entry(copy);
10011 while (copy_entry != vm_map_copy_to_entry(copy)) {
10012 entry_size = copy_entry->vme_end -
10013 copy_entry->vme_start;
10014 if ((local_size < copy_size) &&
10015 ((local_size + entry_size)
10016 >= copy_size)) {
10017 vm_map_copy_clip_end(copy, copy_entry,
10018 copy_entry->vme_start +
10019 (copy_size - local_size));
10020 entry_size = copy_entry->vme_end -
10021 copy_entry->vme_start;
10022 local_size += entry_size;
10023 new_offset += entry_size;
10024 }
10025 if (local_size >= copy_size) {
10026 next_copy = copy_entry->vme_next;
10027 copy_entry->vme_next =
10028 vm_map_copy_to_entry(copy);
10029 previous_prev =
10030 copy->cpy_hdr.links.prev;
10031 copy->cpy_hdr.links.prev = copy_entry;
10032 copy->size = copy_size;
10033 remaining_entries =
10034 copy->cpy_hdr.nentries;
10035 remaining_entries -= nentries;
10036 copy->cpy_hdr.nentries = nentries;
10037 break;
10038 } else {
10039 local_size += entry_size;
10040 new_offset += entry_size;
10041 nentries++;
10042 }
10043 copy_entry = copy_entry->vme_next;
10044 }
10045 }
10046
10047 if (aligned) {
10048 pmap_t local_pmap;
10049
10050 if (pmap) {
10051 local_pmap = pmap;
10052 } else {
10053 local_pmap = dst_map->pmap;
10054 }
10055
10056 if ((kr = vm_map_copy_overwrite_aligned(
10057 dst_map, tmp_entry, copy,
10058 base_addr, local_pmap)) != KERN_SUCCESS) {
10059 if (next_copy != NULL) {
10060 copy->cpy_hdr.nentries +=
10061 remaining_entries;
10062 copy->cpy_hdr.links.prev->vme_next =
10063 next_copy;
10064 copy->cpy_hdr.links.prev =
10065 previous_prev;
10066 copy->size += copy_size;
10067 }
10068 return kr;
10069 }
10070 vm_map_unlock(dst_map);
10071 } else {
10072 /*
10073 * Performance gain:
10074 *
10075 * if the copy and dst addresses are misaligned but share the same
10076 * offset within the page, we can copy the misaligned parts
10077 * unaligned and copy the rest aligned. If they are
10078 * aligned but the length is unaligned, we simply need to copy
10079 * the end bit unaligned. We'll need to split off the misaligned
10080 * bits of the region in this case.
10081 */
10082 /* ALWAYS UNLOCKS THE dst_map MAP */
10083 kr = vm_map_copy_overwrite_unaligned(
10084 dst_map,
10085 tmp_entry,
10086 copy,
10087 base_addr,
10088 discard_on_success);
10089 if (kr != KERN_SUCCESS) {
10090 if (next_copy != NULL) {
10091 copy->cpy_hdr.nentries +=
10092 remaining_entries;
10093 copy->cpy_hdr.links.prev->vme_next =
10094 next_copy;
10095 copy->cpy_hdr.links.prev =
10096 previous_prev;
10097 copy->size += copy_size;
10098 }
10099 return kr;
10100 }
10101 }
10102 total_size -= copy_size;
10103 if (total_size == 0) {
10104 break;
10105 }
10106 base_addr += copy_size;
10107 copy_size = 0;
10108 copy->offset = new_offset;
10109 if (next_copy != NULL) {
10110 copy->cpy_hdr.nentries = remaining_entries;
10111 copy->cpy_hdr.links.next = next_copy;
10112 copy->cpy_hdr.links.prev = previous_prev;
10113 next_copy->vme_prev = vm_map_copy_to_entry(copy);
10114 copy->size = total_size;
10115 }
10116 vm_map_lock(dst_map);
10117 while (TRUE) {
10118 if (!vm_map_lookup_entry(dst_map,
10119 base_addr, &tmp_entry)) {
10120 vm_map_unlock(dst_map);
10121 return KERN_INVALID_ADDRESS;
10122 }
10123 if (tmp_entry->in_transition) {
10124 entry->needs_wakeup = TRUE;
10125 vm_map_entry_wait(dst_map, THREAD_UNINT);
10126 } else {
10127 break;
10128 }
10129 }
10130 vm_map_clip_start(dst_map,
10131 tmp_entry,
10132 vm_map_trunc_page(base_addr,
10133 VM_MAP_PAGE_MASK(dst_map)));
10134
10135 entry = tmp_entry;
10136 } /* while */
10137
10138 /*
10139 * Throw away the vm_map_copy object
10140 */
10141 if (discard_on_success) {
10142 vm_map_copy_discard(copy);
10143 }
10144
10145 return KERN_SUCCESS;
10146 }/* vm_map_copy_overwrite */
10147
10148 static __attribute__((always_inline, warn_unused_result))
10149 kern_return_t
10150 vm_map_copy_addr_size_sanitize(
10151 vm_map_t map,
10152 vm_map_offset_ut addr_u,
10153 vm_map_size_ut size_u,
10154 vm_sanitize_caller_t vm_sanitize_caller,
10155 vm_map_offset_t *addr,
10156 vm_map_offset_t *end,
10157 vm_map_size_t *size)
10158 {
10159 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
10160 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES |
10161 VM_SANITIZE_FLAGS_CHECK_ADDR_RANGE;
10162
10163 return vm_sanitize_addr_size(addr_u, size_u,
10164 vm_sanitize_caller, map,
10165 flags,
10166 addr, end, size);
10167 }
10168
10169 kern_return_t
10170 vm_map_copy_overwrite(
10171 vm_map_t dst_map,
10172 vm_map_offset_ut dst_addr_u,
10173 vm_map_copy_t copy,
10174 vm_map_size_ut copy_size_u,
10175 boolean_t interruptible)
10176 {
10177 vm_map_offset_t dst_addr, dst_end;
10178 vm_map_size_t copy_size;
10179 vm_map_size_t head_size, tail_size;
10180 vm_map_copy_t head_copy, tail_copy;
10181 vm_map_offset_t head_addr, tail_addr;
10182 vm_map_entry_t entry;
10183 kern_return_t kr;
10184 vm_map_offset_t effective_page_mask, effective_page_size;
10185 uint16_t copy_page_shift;
10186
10187 head_size = 0;
10188 tail_size = 0;
10189 head_copy = NULL;
10190 tail_copy = NULL;
10191 head_addr = 0;
10192 tail_addr = 0;
10193
10194 /*
10195 * Check for null copy object.
10196 */
10197 if (copy == VM_MAP_COPY_NULL) {
10198 return KERN_SUCCESS;
10199 }
10200
10201 /*
10202 * Sanitize the addr/size input parameters
10203 */
10204 kr = vm_map_copy_addr_size_sanitize(
10205 dst_map,
10206 dst_addr_u,
10207 copy_size_u,
10208 VM_SANITIZE_CALLER_VM_MAP_COPY_OVERWRITE,
10209 &dst_addr,
10210 &dst_end,
10211 ©_size);
10212 if (__improbable(kr != KERN_SUCCESS)) {
10213 return vm_sanitize_get_kr(kr);
10214 }
10215
10216 /*
10217 * Assert that the vm_map_copy is coming from the right
10218 * zone and hasn't been forged
10219 */
10220 vm_map_copy_require(copy);
10221
10222 if (interruptible ||
10223 copy->type != VM_MAP_COPY_ENTRY_LIST) {
10224 /*
10225 * We can't split the "copy" map if we're interruptible
10226 * or if we don't have a "copy" map...
10227 */
10228 blunt_copy:
10229 kr = vm_map_copy_overwrite_nested(dst_map,
10230 dst_addr,
10231 copy,
10232 interruptible,
10233 (pmap_t) NULL,
10234 TRUE);
10235 if (kr) {
10236 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), kr /* arg */);
10237 }
10238 return kr;
10239 }
10240
10241 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
10242 if (copy_page_shift < PAGE_SHIFT ||
10243 VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10244 goto blunt_copy;
10245 }
10246
10247 if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10248 effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
10249 } else {
10250 effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
10251 effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
10252 effective_page_mask);
10253 }
10254 effective_page_size = effective_page_mask + 1;
10255
10256 if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
10257 /*
10258 * Too small to bother with optimizing...
10259 */
10260 goto blunt_copy;
10261 }
10262
10263 if ((dst_addr & effective_page_mask) !=
10264 (copy->offset & effective_page_mask)) {
10265 /*
10266 * Incompatible mis-alignment of source and destination...
10267 */
10268 goto blunt_copy;
10269 }
10270
10271 /*
10272 * Proper alignment or identical mis-alignment at the beginning.
10273 * Let's try and do a small unaligned copy first (if needed)
10274 * and then an aligned copy for the rest.
10275 */
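/*
 * Illustrative numbers (assuming 4K pages, i.e. effective_page_size
 * is 0x1000): if dst_addr and copy->offset both end in 0x600, then
 * head_size = 0x1000 - 0x600 = 0xA00, so the unaligned "head" copy
 * covers [dst_addr, dst_addr + 0xA00) and ends exactly on a page
 * boundary, leaving the middle fully page-aligned.  If
 * copy->offset + copy_size ends in 0x300, then tail_size = 0x300 and
 * the unaligned "tail" copy handles those trailing bytes.
 */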
10276 if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
10277 head_addr = dst_addr;
10278 head_size = (effective_page_size -
10279 (copy->offset & effective_page_mask));
10280 head_size = MIN(head_size, copy_size);
10281 }
10282 if (!vm_map_page_aligned(copy->offset + copy_size,
10283 effective_page_mask)) {
10284 /*
10285 * Mis-alignment at the end.
10286 * Do an aligned copy up to the last page and
10287 * then an unaligned copy for the remaining bytes.
10288 */
10289 tail_size = ((copy->offset + copy_size) &
10290 effective_page_mask);
10291 tail_size = MIN(tail_size, copy_size);
10292 tail_addr = dst_addr + copy_size - tail_size;
10293 assert(tail_addr >= head_addr + head_size);
10294 }
10295 assert(head_size + tail_size <= copy_size);
10296
10297 if (head_size + tail_size == copy_size) {
10298 /*
10299 * It's all unaligned, no optimization possible...
10300 */
10301 goto blunt_copy;
10302 }
10303
10304 /*
10305 * Can't optimize if there are any submaps in the
10306 * destination due to the way we free the "copy" map
10307 * progressively in vm_map_copy_overwrite_nested()
10308 * in that case.
10309 */
10310 vm_map_lock_read(dst_map);
10311 if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
10312 vm_map_unlock_read(dst_map);
10313 goto blunt_copy;
10314 }
10315 for (;
10316 (entry != vm_map_to_entry(dst_map) &&
10317 entry->vme_start < dst_addr + copy_size);
10318 entry = entry->vme_next) {
10319 if (entry->is_sub_map) {
10320 vm_map_unlock_read(dst_map);
10321 goto blunt_copy;
10322 }
10323 }
10324 vm_map_unlock_read(dst_map);
10325
10326 if (head_size) {
10327 /*
10328 * Unaligned copy of the first "head_size" bytes, to reach
10329 * a page boundary.
10330 */
10331
10332 /*
10333 * Extract "head_copy" out of "copy".
10334 */
10335 head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10336 head_copy->cpy_hdr.entries_pageable =
10337 copy->cpy_hdr.entries_pageable;
10338 head_copy->cpy_hdr.page_shift = copy_page_shift;
10339
10340 entry = vm_map_copy_first_entry(copy);
10341 if (entry->vme_end < copy->offset + head_size) {
10342 head_size = entry->vme_end - copy->offset;
10343 }
10344
10345 head_copy->offset = copy->offset;
10346 head_copy->size = head_size;
10347 copy->offset += head_size;
10348 copy->size -= head_size;
10349 copy_size -= head_size;
10350 assert(copy_size > 0);
10351
10352 vm_map_copy_clip_end(copy, entry, copy->offset);
10353 vm_map_copy_entry_unlink(copy, entry);
10354 vm_map_copy_entry_link(head_copy,
10355 vm_map_copy_to_entry(head_copy),
10356 entry);
10357
10358 /*
10359 * Do the unaligned copy.
10360 */
10361 kr = vm_map_copy_overwrite_nested(dst_map,
10362 head_addr,
10363 head_copy,
10364 interruptible,
10365 (pmap_t) NULL,
10366 FALSE);
10367 if (kr != KERN_SUCCESS) {
10368 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_HEAD_NESTED_ERROR), kr /* arg */);
10369 goto done;
10370 }
10371 }
10372
10373 if (tail_size) {
10374 /*
10375 * Extract "tail_copy" out of "copy".
10376 */
10377 tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10378 tail_copy->cpy_hdr.entries_pageable =
10379 copy->cpy_hdr.entries_pageable;
10380 tail_copy->cpy_hdr.page_shift = copy_page_shift;
10381
10382 tail_copy->offset = copy->offset + copy_size - tail_size;
10383 tail_copy->size = tail_size;
10384
10385 copy->size -= tail_size;
10386 copy_size -= tail_size;
10387 assert(copy_size > 0);
10388
10389 entry = vm_map_copy_last_entry(copy);
10390 vm_map_copy_clip_start(copy, entry, tail_copy->offset);
10391 entry = vm_map_copy_last_entry(copy);
10392 vm_map_copy_entry_unlink(copy, entry);
10393 vm_map_copy_entry_link(tail_copy,
10394 vm_map_copy_last_entry(tail_copy),
10395 entry);
10396 }
10397
10398 /*
10399 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
10400 * we want to avoid TOCTOU issues w.r.t copy->size but
10401 * we don't need to change vm_map_copy_overwrite_nested()
10402 * and all other vm_map_copy_overwrite variants.
10403 *
10404 * So we assign the original copy_size that was passed into
10405 * this routine back to copy.
10406 *
10407 * This use of local 'copy_size' passed into this routine is
10408 * to try and protect against TOCTOU attacks where the kernel
10409 * has been exploited. We don't expect this to be an issue
10410 * during normal system operation.
10411 */
10412 assertf(copy->size == copy_size,
10413 "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
10414 copy->size = copy_size;
10415
10416 /*
10417 * Copy most (or possibly all) of the data.
10418 */
10419 kr = vm_map_copy_overwrite_nested(dst_map,
10420 dst_addr + head_size,
10421 copy,
10422 interruptible,
10423 (pmap_t) NULL,
10424 FALSE);
10425 if (kr != KERN_SUCCESS) {
10426 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_NESTED_ERROR), kr /* arg */);
10427 goto done;
10428 }
10429
10430 if (tail_size) {
10431 kr = vm_map_copy_overwrite_nested(dst_map,
10432 tail_addr,
10433 tail_copy,
10434 interruptible,
10435 (pmap_t) NULL,
10436 FALSE);
10437 if (kr) {
10438 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_TAIL_NESTED_ERROR), kr /* arg */);
10439 }
10440 }
10441
10442 done:
10443 assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
10444 if (kr == KERN_SUCCESS) {
10445 /*
10446 * Discard all the copy maps.
10447 */
10448 if (head_copy) {
10449 vm_map_copy_discard(head_copy);
10450 head_copy = NULL;
10451 }
10452 vm_map_copy_discard(copy);
10453 if (tail_copy) {
10454 vm_map_copy_discard(tail_copy);
10455 tail_copy = NULL;
10456 }
10457 } else {
10458 /*
10459 * Re-assemble the original copy map.
10460 */
10461 if (head_copy) {
10462 entry = vm_map_copy_first_entry(head_copy);
10463 vm_map_copy_entry_unlink(head_copy, entry);
10464 vm_map_copy_entry_link(copy,
10465 vm_map_copy_to_entry(copy),
10466 entry);
10467 copy->offset -= head_size;
10468 copy->size += head_size;
10469 vm_map_copy_discard(head_copy);
10470 head_copy = NULL;
10471 }
10472 if (tail_copy) {
10473 entry = vm_map_copy_last_entry(tail_copy);
10474 vm_map_copy_entry_unlink(tail_copy, entry);
10475 vm_map_copy_entry_link(copy,
10476 vm_map_copy_last_entry(copy),
10477 entry);
10478 copy->size += tail_size;
10479 vm_map_copy_discard(tail_copy);
10480 tail_copy = NULL;
10481 }
10482 }
10483 return kr;
10484 }
10485
10486
10487 /*
10488 * Routine: vm_map_copy_overwrite_unaligned [internal use only]
10489 *
10490 * Description:
10491 * Physically copy unaligned data
10492 *
10493 * Implementation:
10494 * Unaligned parts of pages have to be physically copied. We use
10495 * a modified form of vm_fault_copy (which understands non-aligned
10496 * page offsets and sizes) to do the copy. We attempt to copy as
10497 * much memory in one go as possible; however, vm_fault_copy copies
10498 * within one memory object, so we have to take the smallest of "amount
10499 * left", "source object data size" and "target object data size". With
10500 * unaligned data we don't need to split regions, therefore the source
10501 * (copy) object should be one map entry; the target range may, however,
10502 * be split over multiple map entries. In any event we are pessimistic
10503 * about these assumptions.
10504 *
10505 * Callers of this function must call vm_map_copy_require on
10506 * previously created vm_map_copy_t or pass a newly created
10507 * one to ensure that it hasn't been forged.
10508 *
10509 * Assumptions:
10510 * dst_map is locked on entry and is returned locked on success,
10511 * unlocked on error.
10512 */
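/*
 * Sketch of the per-iteration sizing done below: each vm_fault_copy()
 * call copies at most
 *
 *	copy_size = MIN(amount_left,
 *	    entry->vme_end - start,                                       // dst_size
 *	    copy_entry->vme_end - (copy_entry->vme_start + src_offset));  // src_size
 *
 * i.e. a single pass never crosses a destination map entry or a
 * source copy entry.
 */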
10513
10514 static kern_return_t
10515 vm_map_copy_overwrite_unaligned(
10516 vm_map_t dst_map,
10517 vm_map_entry_t entry,
10518 vm_map_copy_t copy,
10519 vm_map_offset_t start,
10520 boolean_t discard_on_success)
10521 {
10522 vm_map_entry_t copy_entry;
10523 vm_map_entry_t copy_entry_next;
10524 vm_map_version_t version;
10525 vm_object_t dst_object;
10526 vm_object_offset_t dst_offset;
10527 vm_object_offset_t src_offset;
10528 vm_object_offset_t entry_offset;
10529 vm_map_offset_t entry_end;
10530 vm_map_size_t src_size,
10531 dst_size,
10532 copy_size,
10533 amount_left;
10534 kern_return_t kr = KERN_SUCCESS;
10535
10536
10537 copy_entry = vm_map_copy_first_entry(copy);
10538
10539 vm_map_lock_write_to_read(dst_map);
10540
10541 src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
10542 amount_left = copy->size;
10543 /*
10544 * unaligned, so we never clipped this entry; we need the offset into
10545 * the vm_object, not just the data.
10546 */
10547 while (amount_left > 0) {
10548 if (entry == vm_map_to_entry(dst_map)) {
10549 vm_map_unlock_read(dst_map);
10550 return KERN_INVALID_ADDRESS;
10551 }
10552
10553 /* "start" must be within the current map entry */
10554 assert((start >= entry->vme_start) && (start < entry->vme_end));
10555
10556 /*
10557 * Check protection again
10558 */
10559 if (!(entry->protection & VM_PROT_WRITE)) {
10560 vm_map_unlock_read(dst_map);
10561 return KERN_PROTECTION_FAILURE;
10562 }
10563 if (entry->is_sub_map) {
10564 /* not implemented... */
10565 vm_map_unlock_read(dst_map);
10566 return KERN_INVALID_ARGUMENT;
10567 }
10568 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10569 vm_map_unlock_read(dst_map);
10570 return KERN_PROTECTION_FAILURE;
10571 }
10572 /*
10573 * If the entry is in transition, we must wait
10574 * for it to exit that state. Anything could happen
10575 * when we unlock the map, so start over.
10576 */
10577 if (entry->in_transition) {
10578 /*
10579 * Say that we are waiting, and wait for entry.
10580 */
10581 entry->needs_wakeup = TRUE;
10582 vm_map_entry_wait(dst_map, THREAD_UNINT);
10583
10584 goto RetryLookup;
10585 }
10586
10587 dst_offset = start - entry->vme_start;
10588
10589 dst_size = entry->vme_end - start;
10590
10591 src_size = copy_entry->vme_end -
10592 (copy_entry->vme_start + src_offset);
10593
10594 if (dst_size < src_size) {
10595 /*
10596 * we can only copy dst_size bytes before
10597 * we have to get the next destination entry
10598 */
10599 copy_size = dst_size;
10600 } else {
10601 /*
10602 * we can only copy src_size bytes before
10603 * we have to get the next source copy entry
10604 */
10605 copy_size = src_size;
10606 }
10607
10608 if (copy_size > amount_left) {
10609 copy_size = amount_left;
10610 }
10611 /*
10612 * If the entry needs a copy, create a shadow object for the
10613 * copy-on-write region.
10614 */
10615 assert(!entry->is_sub_map);
10616 if (entry->needs_copy) {
10617 if (vm_map_lock_read_to_write(dst_map)) {
10618 vm_map_lock_read(dst_map);
10619 goto RetryLookup;
10620 }
10621 VME_OBJECT_SHADOW(entry,
10622 (vm_map_size_t)(entry->vme_end
10623 - entry->vme_start),
10624 vm_map_always_shadow(dst_map));
10625 entry->needs_copy = FALSE;
10626 vm_map_lock_write_to_read(dst_map);
10627 }
10628 dst_object = VME_OBJECT(entry);
10629 /*
10630 * unlike with the virtual (aligned) copy, we're going
10631 * to fault on it, so we need a target object.
10632 */
10633 if (dst_object == VM_OBJECT_NULL) {
10634 if (vm_map_lock_read_to_write(dst_map)) {
10635 vm_map_lock_read(dst_map);
10636 goto RetryLookup;
10637 }
10638 dst_object = vm_object_allocate((vm_map_size_t)
10639 entry->vme_end - entry->vme_start,
10640 dst_map->serial_id);
10641 VME_OBJECT_SET(entry, dst_object, false, 0);
10642 VME_OFFSET_SET(entry, 0);
10643 assert(entry->use_pmap);
10644 vm_map_lock_write_to_read(dst_map);
10645 }
10646 /*
10647 * Take an object reference and unlock map. The "entry" may
10648 * disappear or change when the map is unlocked.
10649 */
10650 vm_object_reference(dst_object);
10651 version.main_timestamp = dst_map->timestamp;
10652 entry_offset = VME_OFFSET(entry);
10653 entry_end = entry->vme_end;
10654 vm_map_unlock_read(dst_map);
10655 /*
10656 * Copy as much as possible in one pass
10657 */
10658 kr = vm_fault_copy(
10659 VME_OBJECT(copy_entry),
10660 VME_OFFSET(copy_entry) + src_offset,
10661 ©_size,
10662 dst_object,
10663 entry_offset + dst_offset,
10664 dst_map,
10665 &version,
10666 THREAD_UNINT );
10667
10668 start += copy_size;
10669 src_offset += copy_size;
10670 amount_left -= copy_size;
10671 /*
10672 * Release the object reference
10673 */
10674 vm_object_deallocate(dst_object);
10675 /*
10676 * If a hard error occurred, return it now
10677 */
10678 if (kr != KERN_SUCCESS) {
10679 return kr;
10680 }
10681
10682 if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
10683 || amount_left == 0) {
10684 /*
10685 * all done with this copy entry, dispose.
10686 */
10687 copy_entry_next = copy_entry->vme_next;
10688
10689 if (discard_on_success) {
10690 vm_map_copy_entry_unlink(copy, copy_entry);
10691 assert(!copy_entry->is_sub_map);
10692 vm_object_deallocate(VME_OBJECT(copy_entry));
10693 vm_map_copy_entry_dispose(copy_entry);
10694 }
10695
10696 if (copy_entry_next == vm_map_copy_to_entry(copy) &&
10697 amount_left) {
10698 /*
10699 * not finished copying but run out of source
10700 */
10701 return KERN_INVALID_ADDRESS;
10702 }
10703
10704 copy_entry = copy_entry_next;
10705
10706 src_offset = 0;
10707 }
10708
10709 if (amount_left == 0) {
10710 return KERN_SUCCESS;
10711 }
10712
10713 vm_map_lock_read(dst_map);
10714 if (version.main_timestamp == dst_map->timestamp) {
10715 if (start == entry_end) {
10716 /*
10717 * destination region is split. Use the version
10718 * information to avoid a lookup in the normal
10719 * case.
10720 */
10721 entry = entry->vme_next;
10722 /*
10723 * should be contiguous. Fail if we encounter
10724 * a hole in the destination.
10725 */
10726 if (start != entry->vme_start) {
10727 vm_map_unlock_read(dst_map);
10728 return KERN_INVALID_ADDRESS;
10729 }
10730 }
10731 } else {
10732 /*
10733 * Map version check failed.
10734 * We must look up the entry because somebody
10735 * might have changed the map behind our backs.
10736 */
10737 RetryLookup:
10738 if (!vm_map_lookup_entry(dst_map, start, &entry)) {
10739 vm_map_unlock_read(dst_map);
10740 return KERN_INVALID_ADDRESS;
10741 }
10742 }
10743 }/* while */
10744
10745 return KERN_SUCCESS;
10746 }/* vm_map_copy_overwrite_unaligned */
10747
10748 /*
10749 * Routine: vm_map_copy_overwrite_aligned [internal use only]
10750 *
10751 * Description:
10752 * Does all the vm_trickery possible for whole pages.
10753 *
10754 * Implementation:
10755 *
10756 * If there are no permanent objects in the destination,
10757 * and the source and destination map entry zones match,
10758 * and the destination map entry is not shared,
10759 * then the map entries can be deleted and replaced
10760 * with those from the copy. The following code is the
10761 * basic idea of what to do, but there are lots of annoying
10762 * little details about getting protection and inheritance
10763 * right. Should add protection, inheritance, and sharing checks
10764 * to the above pass and make sure that no wiring is involved.
10765 *
10766 * Callers of this function must call vm_map_copy_require on
10767 * previously created vm_map_copy_t or pass a newly created
10768 * one to ensure that it hasn't been forged.
10769 */
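/*
 * In outline, for each copy entry the loop below either:
 *  - takes the fast path: if the destination entry maps temporary,
 *    unshared, non-permanent memory (and none of the special cases
 *    listed in the code apply), drop the old object reference and
 *    install the copy entry's object in its place (a purely virtual
 *    copy); or
 *  - takes the slow path ("slow_copy"): do a physical copy with
 *    vm_fault_copy() into the existing (or a freshly allocated)
 *    destination object.
 */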
10770
10771 int vm_map_copy_overwrite_aligned_src_not_internal = 0;
10772 int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
10773 int vm_map_copy_overwrite_aligned_src_large = 0;
10774
10775 static kern_return_t
10776 vm_map_copy_overwrite_aligned(
10777 vm_map_t dst_map,
10778 vm_map_entry_t tmp_entry,
10779 vm_map_copy_t copy,
10780 vm_map_offset_t start,
10781 __unused pmap_t pmap)
10782 {
10783 vm_object_t object;
10784 vm_map_entry_t copy_entry;
10785 vm_map_size_t copy_size;
10786 vm_map_size_t size;
10787 vm_map_entry_t entry;
10788
10789 while ((copy_entry = vm_map_copy_first_entry(copy))
10790 != vm_map_copy_to_entry(copy)) {
10791 copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10792
10793 entry = tmp_entry;
10794
10795 if (entry->is_sub_map) {
10796 /* unnested when clipped earlier */
10797 assert(!entry->use_pmap);
10798 }
10799 if (entry == vm_map_to_entry(dst_map)) {
10800 vm_map_unlock(dst_map);
10801 return KERN_INVALID_ADDRESS;
10802 }
10803 size = (entry->vme_end - entry->vme_start);
10804 /*
10805 * Make sure that no holes popped up in the
10806 * address map, and that the protection is
10807 * still valid, in case the map was unlocked
10808 * earlier.
10809 */
10810
10811 if ((entry->vme_start != start) || ((entry->is_sub_map)
10812 && !entry->needs_copy)) {
10813 vm_map_unlock(dst_map);
10814 return KERN_INVALID_ADDRESS;
10815 }
10816 assert(entry != vm_map_to_entry(dst_map));
10817
10818 /*
10819 * Check protection again
10820 */
10821
10822 if (!(entry->protection & VM_PROT_WRITE)) {
10823 vm_map_unlock(dst_map);
10824 return KERN_PROTECTION_FAILURE;
10825 }
10826
10827 if (entry->is_sub_map) {
10828 /* not properly implemented */
10829 vm_map_unlock(dst_map);
10830 return KERN_PROTECTION_FAILURE;
10831 }
10832
10833 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10834 vm_map_unlock(dst_map);
10835 return KERN_PROTECTION_FAILURE;
10836 }
10837
10838 /*
10839 * If the entry is in transition, we must wait
10840 * for it to exit that state. Anything could happen
10841 * when we unlock the map, so start over.
10842 */
10843 if (entry->in_transition) {
10844 /*
10845 * Say that we are waiting, and wait for entry.
10846 */
10847 entry->needs_wakeup = TRUE;
10848 vm_map_entry_wait(dst_map, THREAD_UNINT);
10849
10850 goto RetryLookup;
10851 }
10852
10853 /*
10854 * Adjust to source size first
10855 */
10856
10857 if (copy_size < size) {
10858 if (entry->map_aligned &&
10859 !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10860 VM_MAP_PAGE_MASK(dst_map))) {
10861 /* no longer map-aligned */
10862 entry->map_aligned = FALSE;
10863 }
10864 vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10865 size = copy_size;
10866 }
10867
10868 /*
10869 * Adjust to destination size
10870 */
10871
10872 if (size < copy_size) {
10873 vm_map_copy_clip_end(copy, copy_entry,
10874 copy_entry->vme_start + size);
10875 copy_size = size;
10876 }
10877
10878 assert((entry->vme_end - entry->vme_start) == size);
10879 assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10880 assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10881
10882 /*
10883 * If the destination contains temporary unshared memory,
10884 * we can perform the copy by throwing it away and
10885 * installing the source data.
10886 *
10887 * Exceptions for mappings with special semantics:
10888 * + "permanent" entries,
10889 * + JIT regions,
10890 * + TPRO regions,
10891 * + pmap-specific protection policies,
10892 * + VM objects with COPY_NONE copy strategy.
10893 */
10894
10895 object = VME_OBJECT(entry);
10896 if ((!entry->is_shared &&
10897 !entry->vme_permanent &&
10898 !entry->used_for_jit &&
10899 #if __arm64e__
10900 !entry->used_for_tpro &&
10901 #endif /* __arm64e__ */
10902 !(entry->protection & VM_PROT_EXECUTE) &&
10903 !pmap_has_prot_policy(dst_map->pmap, entry->translated_allow_execute, entry->protection) &&
10904 ((object == VM_OBJECT_NULL) ||
10905 (object->internal &&
10906 !object->true_share &&
10907 object->copy_strategy != MEMORY_OBJECT_COPY_NONE))) ||
10908 entry->needs_copy) {
10909 vm_object_t old_object = VME_OBJECT(entry);
10910 vm_object_offset_t old_offset = VME_OFFSET(entry);
10911 vm_object_offset_t offset;
10912
10913 assert(!entry->is_sub_map);
10914 /*
10915 * Ensure that the source and destination aren't
10916 * identical
10917 */
10918 if (old_object == VME_OBJECT(copy_entry) &&
10919 old_offset == VME_OFFSET(copy_entry)) {
10920 vm_map_copy_entry_unlink(copy, copy_entry);
10921 vm_map_copy_entry_dispose(copy_entry);
10922
10923 if (old_object != VM_OBJECT_NULL) {
10924 vm_object_deallocate(old_object);
10925 }
10926
10927 start = tmp_entry->vme_end;
10928 tmp_entry = tmp_entry->vme_next;
10929 continue;
10930 }
10931
10932 #if XNU_TARGET_OS_OSX
10933 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10934 #define __TRADEOFF1_COPY_SIZE (128 * 1024) /* 128 KB */
10935 if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10936 VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10937 copy_size <= __TRADEOFF1_COPY_SIZE) {
10938 /*
10939 * Virtual vs. Physical copy tradeoff #1.
10940 *
10941 * Copying only a few pages out of a large
10942 * object: do a physical copy instead of
10943 * a virtual copy, to avoid possibly keeping
10944 * the entire large object alive because of
10945 * those few copy-on-write pages.
10946 */
10947 vm_map_copy_overwrite_aligned_src_large++;
10948 goto slow_copy;
10949 }
10950 #endif /* XNU_TARGET_OS_OSX */
10951
10952 if ((dst_map->pmap != kernel_pmap) &&
10953 (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10954 (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10955 vm_object_t new_object, new_shadow;
10956
10957 /*
10958 * We're about to map something over a mapping
10959 * established by malloc()...
10960 */
10961 new_object = VME_OBJECT(copy_entry);
10962 if (new_object != VM_OBJECT_NULL) {
10963 vm_object_lock_shared(new_object);
10964 }
10965 while (new_object != VM_OBJECT_NULL &&
10966 #if XNU_TARGET_OS_OSX
10967 !new_object->true_share &&
10968 new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10969 #endif /* XNU_TARGET_OS_OSX */
10970 new_object->internal) {
10971 new_shadow = new_object->shadow;
10972 if (new_shadow == VM_OBJECT_NULL) {
10973 break;
10974 }
10975 vm_object_lock_shared(new_shadow);
10976 vm_object_unlock(new_object);
10977 new_object = new_shadow;
10978 }
10979 if (new_object != VM_OBJECT_NULL) {
10980 if (!new_object->internal) {
10981 /*
10982 * The new mapping is backed
10983 * by an external object. We
10984 * don't want malloc'ed memory
10985 * to be replaced with such a
10986 * non-anonymous mapping, so
10987 * let's go off the optimized
10988 * path...
10989 */
10990 vm_map_copy_overwrite_aligned_src_not_internal++;
10991 vm_object_unlock(new_object);
10992 goto slow_copy;
10993 }
10994 #if XNU_TARGET_OS_OSX
10995 if (new_object->true_share ||
10996 new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10997 /*
10998 * Same if there's a "true_share"
10999 * object in the shadow chain, or
11000 * an object with a non-default
11001 * (SYMMETRIC) copy strategy.
11002 */
11003 vm_map_copy_overwrite_aligned_src_not_symmetric++;
11004 vm_object_unlock(new_object);
11005 goto slow_copy;
11006 }
11007 #endif /* XNU_TARGET_OS_OSX */
11008 vm_object_unlock(new_object);
11009 }
11010 /*
11011 * The new mapping is still backed by
11012 * anonymous (internal) memory, so it's
11013 * OK to substitute it for the original
11014 * malloc() mapping.
11015 */
11016 }
11017
11018 if (old_object != VM_OBJECT_NULL) {
11019 assert(!entry->vme_permanent);
11020 if (entry->is_sub_map) {
11021 if (entry->use_pmap) {
11022 #ifndef NO_NESTED_PMAP
11023 pmap_unnest(dst_map->pmap,
11024 (addr64_t)entry->vme_start,
11025 entry->vme_end - entry->vme_start);
11026 #endif /* NO_NESTED_PMAP */
11027 if (dst_map->mapped_in_other_pmaps) {
11028 /* clean up parent */
11029 /* map/maps */
11030 vm_map_submap_pmap_clean(
11031 dst_map, entry->vme_start,
11032 entry->vme_end,
11033 VME_SUBMAP(entry),
11034 VME_OFFSET(entry));
11035 }
11036 } else {
11037 vm_map_submap_pmap_clean(
11038 dst_map, entry->vme_start,
11039 entry->vme_end,
11040 VME_SUBMAP(entry),
11041 VME_OFFSET(entry));
11042 }
11043 vm_map_deallocate(VME_SUBMAP(entry));
11044 } else {
11045 if (dst_map->mapped_in_other_pmaps) {
11046 vm_object_pmap_protect_options(
11047 VME_OBJECT(entry),
11048 VME_OFFSET(entry),
11049 entry->vme_end
11050 - entry->vme_start,
11051 PMAP_NULL,
11052 PAGE_SIZE,
11053 entry->vme_start,
11054 VM_PROT_NONE,
11055 PMAP_OPTIONS_REMOVE);
11056 } else {
11057 pmap_remove_options(
11058 dst_map->pmap,
11059 (addr64_t)(entry->vme_start),
11060 (addr64_t)(entry->vme_end),
11061 PMAP_OPTIONS_REMOVE);
11062 }
11063 vm_object_deallocate(old_object);
11064 }
11065 }
11066
11067 if (entry->iokit_acct) {
11068 /* keep using iokit accounting */
11069 entry->use_pmap = FALSE;
11070 } else {
11071 /* use pmap accounting */
11072 entry->use_pmap = TRUE;
11073 }
11074 assert(!entry->vme_permanent);
11075 VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
11076 object = VME_OBJECT(entry);
11077 entry->needs_copy = copy_entry->needs_copy;
11078 entry->wired_count = 0;
11079 entry->user_wired_count = 0;
11080 offset = VME_OFFSET(copy_entry);
11081 VME_OFFSET_SET(entry, offset);
11082
11083 vm_map_copy_entry_unlink(copy, copy_entry);
11084 vm_map_copy_entry_dispose(copy_entry);
11085
11086 /*
11087 * we could try to push pages into the pmap at this point, BUT
11088 * this optimization only saved on average 2 us per page if ALL
11089 * the pages in the source were currently mapped
11090 * and ALL the pages in the dest were touched; if fewer
11091 * than 2/3 of the pages were touched, this optimization actually cost more cycles.
11092 * It also puts a lot of pressure on the pmap layer w.r.t. mapping structures.
11093 */
11094
11095 /*
11096 * Set up for the next iteration. The map
11097 * has not been unlocked, so the next
11098 * address should be at the end of this
11099 * entry, and the next map entry should be
11100 * the one following it.
11101 */
11102
11103 start = tmp_entry->vme_end;
11104 tmp_entry = tmp_entry->vme_next;
11105 } else {
11106 vm_map_version_t version;
11107 vm_object_t dst_object;
11108 vm_object_offset_t dst_offset;
11109 kern_return_t r;
11110
11111 slow_copy:
11112 if (entry->needs_copy) {
11113 VME_OBJECT_SHADOW(entry,
11114 (entry->vme_end -
11115 entry->vme_start),
11116 vm_map_always_shadow(dst_map));
11117 entry->needs_copy = FALSE;
11118 }
11119
11120 dst_object = VME_OBJECT(entry);
11121 dst_offset = VME_OFFSET(entry);
11122
11123 /*
11124 * Take an object reference, and record
11125 * the map version information so that the
11126 * map can be safely unlocked.
11127 */
11128
11129 if (dst_object == VM_OBJECT_NULL) {
11130 /*
11131 * We would usually have just taken the
11132 * optimized path above if the destination
11133 * object has not been allocated yet. But we
11134 * now disable that optimization if the copy
11135 * entry's object is not backed by anonymous
11136 * memory to avoid replacing malloc'ed
11137 * (i.e. re-usable) anonymous memory with a
11138 * not-so-anonymous mapping.
11139 * So we have to handle this case here and
11140 * allocate a new VM object for this map entry.
11141 */
11142 dst_object = vm_object_allocate(
11143 entry->vme_end - entry->vme_start,
11144 dst_map->serial_id
11145 );
11146 dst_offset = 0;
11147 VME_OBJECT_SET(entry, dst_object, false, 0);
11148 VME_OFFSET_SET(entry, dst_offset);
11149 assert(entry->use_pmap);
11150 }
11151
11152 vm_object_reference(dst_object);
11153
11154 /* account for unlock bumping up timestamp */
11155 version.main_timestamp = dst_map->timestamp + 1;
11156
11157 vm_map_unlock(dst_map);
11158
11159 /*
11160 * Copy as much as possible in one pass
11161 */
11162
11163 copy_size = size;
11164 r = vm_fault_copy(
11165 VME_OBJECT(copy_entry),
11166 VME_OFFSET(copy_entry),
11167 ©_size,
11168 dst_object,
11169 dst_offset,
11170 dst_map,
11171 &version,
11172 THREAD_UNINT );
11173
11174 /*
11175 * Release the object reference
11176 */
11177
11178 vm_object_deallocate(dst_object);
11179
11180 /*
11181 * If a hard error occurred, return it now
11182 */
11183
11184 if (r != KERN_SUCCESS) {
11185 return r;
11186 }
11187
11188 if (copy_size != 0) {
11189 /*
11190 * Dispose of the copied region
11191 */
11192
11193 vm_map_copy_clip_end(copy, copy_entry,
11194 copy_entry->vme_start + copy_size);
11195 vm_map_copy_entry_unlink(copy, copy_entry);
11196 vm_object_deallocate(VME_OBJECT(copy_entry));
11197 vm_map_copy_entry_dispose(copy_entry);
11198 }
11199
11200 /*
11201 * Pick up in the destination map where we left off.
11202 *
11203 * Use the version information to avoid a lookup
11204 * in the normal case.
11205 */
11206
11207 start += copy_size;
11208 vm_map_lock(dst_map);
11209 if (version.main_timestamp == dst_map->timestamp &&
11210 copy_size != 0) {
11211 /* We can safely use saved tmp_entry value */
11212
11213 if (tmp_entry->map_aligned &&
11214 !VM_MAP_PAGE_ALIGNED(
11215 start,
11216 VM_MAP_PAGE_MASK(dst_map))) {
11217 /* no longer map-aligned */
11218 tmp_entry->map_aligned = FALSE;
11219 }
11220 vm_map_clip_end(dst_map, tmp_entry, start);
11221 tmp_entry = tmp_entry->vme_next;
11222 } else {
11223 /* Must do lookup of tmp_entry */
11224
11225 RetryLookup:
11226 if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
11227 vm_map_unlock(dst_map);
11228 return KERN_INVALID_ADDRESS;
11229 }
11230 if (tmp_entry->map_aligned &&
11231 !VM_MAP_PAGE_ALIGNED(
11232 start,
11233 VM_MAP_PAGE_MASK(dst_map))) {
11234 /* no longer map-aligned */
11235 tmp_entry->map_aligned = FALSE;
11236 }
11237 vm_map_clip_start(dst_map, tmp_entry, start);
11238 }
11239 }
11240 }/* while */
11241
11242 return KERN_SUCCESS;
11243 }/* vm_map_copy_overwrite_aligned */
11244
11245 /*
11246 * Routine: vm_map_copyin_kernel_buffer [internal use only]
11247 *
11248 * Description:
11249 * Copy in data to a kernel buffer from space in the
11250 * source map. The original space may be optionally
11251 * deallocated.
11252 *
11253 * If successful, returns a new copy object.
11254 */
11255 static kern_return_t
11256 vm_map_copyin_kernel_buffer(
11257 vm_map_t src_map,
11258 vm_map_offset_t src_addr,
11259 vm_map_size_t len,
11260 boolean_t src_destroy,
11261 vm_map_copy_t *copy_result)
11262 {
11263 kern_return_t kr;
11264 vm_map_copy_t copy;
11265 void *kdata;
11266
11267 if (len > msg_ool_size_small) {
11268 return KERN_INVALID_ARGUMENT;
11269 }
11270
11271 kdata = kalloc_data(len, Z_WAITOK);
11272 if (kdata == NULL) {
11273 return KERN_RESOURCE_SHORTAGE;
11274 }
11275 kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
11276 if (kr != KERN_SUCCESS) {
11277 kfree_data(kdata, len);
11278 return kr;
11279 }
11280
11281 copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
11282 copy->cpy_kdata = kdata;
11283 copy->size = len;
11284 copy->offset = 0;
11285
11286 if (src_destroy) {
11287 vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
11288
11289 if (src_map == kernel_map) {
11290 flags |= VM_MAP_REMOVE_KUNWIRE;
11291 }
11292
11293 (void)vm_map_remove_guard(src_map,
11294 vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11295 vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
11296 flags, KMEM_GUARD_NONE);
11297 }
11298
11299 *copy_result = copy;
11300 return KERN_SUCCESS;
11301 }
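/*
 * Illustrative sketch (annotation; not part of the original source):
 * a hypothetical in-kernel caller exercising the small-copy path above.
 * For payloads of at most "msg_ool_size_small" bytes that stay within the
 * source map's limits, vm_map_copyin() (defined later in this file) ends up
 * in vm_map_copyin_kernel_buffer(), so the resulting copy object is of type
 * VM_MAP_COPY_KERNEL_BUFFER and carries its data in "cpy_kdata" rather than
 * in a chain of map entries. Sanitized wrapper types and error handling are
 * simplified here; "user_addr" and "user_len" are hypothetical inputs.
 *
 *	vm_map_copy_t copy = VM_MAP_COPY_NULL;
 *	kern_return_t kr;
 *
 *	kr = vm_map_copyin(current_map(), user_addr, user_len,
 *	    FALSE, &copy);
 *	if (kr == KERN_SUCCESS && copy != VM_MAP_COPY_NULL) {
 *		(small copies arrive as copy->cpy_kdata; consume the data,
 *		 then release the copy object)
 *		vm_map_copy_discard(copy);
 *	}
 */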
11302
11303 /*
11304 * Routine: vm_map_copyout_kernel_buffer [internal use only]
11305 *
11306 * Description:
11307 * Copy out data from a kernel buffer into space in the
11308  * destination map. The space may optionally be dynamically
11309 * allocated.
11310 *
11311 * If successful, consumes the copy object.
11312 * Otherwise, the caller is responsible for it.
11313 *
11314 * Callers of this function must call vm_map_copy_require on
11315 * previously created vm_map_copy_t or pass a newly created
11316 * one to ensure that it hasn't been forged.
11317 */
11318 static int vm_map_copyout_kernel_buffer_failures = 0;
11319 static kern_return_t
11320 vm_map_copyout_kernel_buffer(
11321 vm_map_t map,
11322 vm_map_address_t *addr, /* IN/OUT */
11323 vm_map_copy_t copy,
11324 vm_map_size_t copy_size,
11325 boolean_t overwrite,
11326 boolean_t consume_on_success)
11327 {
11328 kern_return_t kr = KERN_SUCCESS;
11329 thread_t thread = current_thread();
11330
11331 assert(copy->size == copy_size);
11332
11333 /*
11334 * check for corrupted vm_map_copy structure
11335 */
11336 if (copy_size > msg_ool_size_small || copy->offset) {
11337 panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
11338 (long long)copy->size, (long long)copy->offset);
11339 }
11340
11341 if (!overwrite) {
11342 /*
11343 * Allocate space in the target map for the data
11344 */
11345 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11346
11347 if (map == kernel_map) {
11348 vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
11349 }
11350
11351 *addr = 0;
11352 kr = vm_map_enter(map,
11353 addr,
11354 vm_map_round_page(copy_size,
11355 VM_MAP_PAGE_MASK(map)),
11356 (vm_map_offset_t) 0,
11357 vmk_flags,
11358 VM_OBJECT_NULL,
11359 (vm_object_offset_t) 0,
11360 FALSE,
11361 VM_PROT_DEFAULT,
11362 VM_PROT_ALL,
11363 VM_INHERIT_DEFAULT);
11364 if (kr != KERN_SUCCESS) {
11365 return kr;
11366 }
11367 #if KASAN
11368 if (map->pmap == kernel_pmap) {
11369 kasan_notify_address(*addr, copy->size);
11370 }
11371 #endif
11372 }
11373
11374 /*
11375 * Copyout the data from the kernel buffer to the target map.
11376 */
11377 if (thread->map == map) {
11378 /*
11379 * If the target map is the current map, just do
11380 * the copy.
11381 */
11382 assert((vm_size_t)copy_size == copy_size);
11383 if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11384 kr = KERN_INVALID_ADDRESS;
11385 }
11386 } else {
11387 vm_map_switch_context_t switch_ctx;
11388
11389 /*
11390 * If the target map is another map, assume the
11391 * target's address space identity for the duration
11392 * of the copy.
11393 */
11394 vm_map_reference(map);
11395 switch_ctx = vm_map_switch_to(map);
11396
11397 assert((vm_size_t)copy_size == copy_size);
11398 if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11399 vm_map_copyout_kernel_buffer_failures++;
11400 kr = KERN_INVALID_ADDRESS;
11401 }
11402
11403 vm_map_switch_back(switch_ctx);
11404 vm_map_deallocate(map);
11405 }
11406
11407 if (kr != KERN_SUCCESS) {
11408 /* the copy failed, clean up */
11409 if (!overwrite) {
11410 /*
11411 * Deallocate the space we allocated in the target map.
11412 */
11413 (void) vm_map_remove(map,
11414 vm_map_trunc_page(*addr,
11415 VM_MAP_PAGE_MASK(map)),
11416 vm_map_round_page((*addr +
11417 vm_map_round_page(copy_size,
11418 VM_MAP_PAGE_MASK(map))),
11419 VM_MAP_PAGE_MASK(map)));
11420 *addr = 0;
11421 }
11422 } else {
11423 		/* copy was successful, discard the copy structure */
11424 if (consume_on_success) {
11425 kfree_data(copy->cpy_kdata, copy_size);
11426 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11427 }
11428 }
11429
11430 return kr;
11431 }
11432
11433 /*
11434 * Routine: vm_map_copy_insert [internal use only]
11435 *
11436 * Description:
11437 * Link a copy chain ("copy") into a map at the
11438 * specified location (after "where").
11439 *
11440 * Callers of this function must call vm_map_copy_require on
11441 * previously created vm_map_copy_t or pass a newly created
11442 * one to ensure that it hasn't been forged.
11443 * Side effects:
11444 * The copy chain is destroyed.
11445 */
11446 static void
11447 vm_map_copy_insert(
11448 vm_map_t map,
11449 vm_map_entry_t after_where,
11450 vm_map_copy_t copy)
11451 {
11452 vm_map_entry_t entry;
11453
11454 while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11455 entry = vm_map_copy_first_entry(copy);
11456 vm_map_copy_entry_unlink(copy, entry);
11457 vm_map_store_entry_link(map, after_where, entry,
11458 VM_MAP_KERNEL_FLAGS_NONE);
11459 after_where = entry;
11460 }
11461 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11462 }
11463
11464 /*
11465 * Callers of this function must call vm_map_copy_require on
11466 * previously created vm_map_copy_t or pass a newly created
11467 * one to ensure that it hasn't been forged.
11468 */
11469 void
11470 vm_map_copy_remap(
11471 vm_map_t map,
11472 vm_map_entry_t where,
11473 vm_map_copy_t copy,
11474 vm_map_offset_t adjustment,
11475 vm_prot_t cur_prot,
11476 vm_prot_t max_prot,
11477 vm_inherit_t inheritance)
11478 {
11479 vm_map_entry_t copy_entry, new_entry;
11480
11481 for (copy_entry = vm_map_copy_first_entry(copy);
11482 copy_entry != vm_map_copy_to_entry(copy);
11483 copy_entry = copy_entry->vme_next) {
11484 /* get a new VM map entry for the map */
11485 new_entry = vm_map_entry_create(map);
11486 /* copy the "copy entry" to the new entry */
11487 vm_map_entry_copy(map, new_entry, copy_entry);
11488 /* adjust "start" and "end" */
11489 new_entry->vme_start += adjustment;
11490 new_entry->vme_end += adjustment;
11491 /* clear some attributes */
11492 new_entry->inheritance = inheritance;
11493 new_entry->protection = cur_prot;
11494 new_entry->max_protection = max_prot;
11495 new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11496 /* take an extra reference on the entry's "object" */
11497 if (new_entry->is_sub_map) {
11498 assert(!new_entry->use_pmap); /* not nested */
11499 vm_map_reference(VME_SUBMAP(new_entry));
11500 } else {
11501 vm_object_reference(VME_OBJECT(new_entry));
11502 }
11503 /* insert the new entry in the map */
11504 vm_map_store_entry_link(map, where, new_entry,
11505 VM_MAP_KERNEL_FLAGS_NONE);
11506 /* continue inserting the "copy entries" after the new entry */
11507 where = new_entry;
11508 }
11509 }
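/*
 * Note (annotation; not part of the original source): vm_map_copy_insert()
 * and vm_map_copy_remap() are the two ways vm_map_copyout_internal() links a
 * copy chain into the destination map. The former moves the copy's own
 * entries into the map and frees the copy header (the "consume on success"
 * path); the latter clones each copy entry and takes an extra reference on
 * the underlying object or submap, leaving the original copy chain intact
 * for the caller.
 */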
11510
11511
11512 /*
11513 * Returns true if *size matches (or is in the range of) copy->size.
11514 * Upon returning true, the *size field is updated with the actual size of the
11515 * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11516 */
11517 boolean_t
11518 vm_map_copy_validate_size(
11519 vm_map_t dst_map,
11520 vm_map_copy_t copy,
11521 vm_map_size_t *size)
11522 {
11523 if (copy == VM_MAP_COPY_NULL) {
11524 return FALSE;
11525 }
11526
11527 /*
11528 * Assert that the vm_map_copy is coming from the right
11529 * zone and hasn't been forged
11530 */
11531 vm_map_copy_require(copy);
11532
11533 vm_map_size_t copy_sz = copy->size;
11534 vm_map_size_t sz = *size;
11535 switch (copy->type) {
11536 case VM_MAP_COPY_KERNEL_BUFFER:
11537 if (sz == copy_sz) {
11538 return TRUE;
11539 }
11540 break;
11541 case VM_MAP_COPY_ENTRY_LIST:
11542 /*
11543 * potential page-size rounding prevents us from exactly
11544 * validating this flavor of vm_map_copy, but we can at least
11545 * assert that it's within a range.
11546 */
11547 if (copy_sz >= sz &&
11548 copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11549 *size = copy_sz;
11550 return TRUE;
11551 }
11552 break;
11553 default:
11554 break;
11555 }
11556 return FALSE;
11557 }
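/*
 * Illustrative sketch (annotation; not part of the original source): the
 * intended caller pattern is to validate an untrusted size against the copy
 * object before handing both to vm_map_copyout_size(). Hypothetical caller,
 * simplified, with sanitized wrapper types elided:
 *
 *	vm_map_size_t size = claimed_size;	(untrusted input)
 *
 *	if (!vm_map_copy_validate_size(dst_map, copy, &size)) {
 *		return KERN_FAILURE;
 *	}
 *	(on success, "size" now holds the actual size of the copy object)
 *	kr = vm_map_copyout_size(dst_map, &dst_addr, copy, size);
 */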
11558
11559 static kern_return_t
11560 vm_map_copyout_internal(
11561 vm_map_t dst_map,
11562 vm_map_address_t *dst_addr, /* OUT */
11563 vm_map_copy_t copy,
11564 vm_map_size_ut copy_size_u,
11565 boolean_t consume_on_success,
11566 vm_prot_t cur_protection,
11567 vm_prot_t max_protection,
11568 vm_inherit_t inheritance)
11569 {
11570 vm_map_size_t size, copy_size;
11571 vm_map_size_t adjustment;
11572 vm_map_offset_t start;
11573 vm_object_offset_t vm_copy_start;
11574 vm_map_entry_t last;
11575 vm_map_entry_t entry;
11576 vm_map_copy_t original_copy;
11577 kern_return_t kr;
11578 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11579
11580 /*
11581 * Check for null copy object.
11582 */
11583
11584 if (copy == VM_MAP_COPY_NULL) {
11585 *dst_addr = 0;
11586 return KERN_SUCCESS;
11587 }
11588
11589 /*
11590 * Assert that the vm_map_copy is coming from the right
11591 * zone and hasn't been forged
11592 */
11593 vm_map_copy_require(copy);
11594
11595 if (!VM_SANITIZE_UNSAFE_IS_EQUAL(copy_size_u, copy->size)) {
11596 *dst_addr = 0;
11597 ktriage_record(thread_tid(current_thread()),
11598 KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
11599 KDBG_TRIAGE_RESERVED,
11600 KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR),
11601 KERN_FAILURE /* arg */);
11602 return KERN_FAILURE;
11603 }
11604 copy_size = copy->size;
11605
11606 /*
11607 * Check for special kernel buffer allocated
11608 * by new_ipc_kmsg_copyin.
11609 */
11610
11611 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
11612 kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr,
11613 copy, copy_size, FALSE,
11614 consume_on_success);
11615 if (kr) {
11616 ktriage_record(thread_tid(current_thread()),
11617 KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
11618 KDBG_TRIAGE_RESERVED,
11619 KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */);
11620 }
11621 return kr;
11622 }
11623
11624
11625 original_copy = copy;
11626 if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
11627 vm_map_copy_t target_copy;
11628 vm_map_offset_t overmap_start, overmap_end, trimmed_start;
11629
11630 target_copy = VM_MAP_COPY_NULL;
11631 DEBUG4K_ADJUST("adjusting...\n");
11632 kr = vm_map_copy_adjust_to_target(
11633 copy,
11634 0, /* offset */
11635 copy->size, /* size */
11636 dst_map,
11637 TRUE, /* copy */
11638 &target_copy,
11639 &overmap_start,
11640 &overmap_end,
11641 &trimmed_start);
11642 if (kr != KERN_SUCCESS) {
11643 DEBUG4K_COPY("adjust failed 0x%x\n", kr);
11644 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */);
11645 return kr;
11646 }
11647 DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
11648 if (target_copy != copy) {
11649 copy = target_copy;
11650 }
11651 copy_size = copy->size;
11652 }
11653
11654 /*
11655 * Find space for the data
11656 */
11657
11658 vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
11659 VM_MAP_COPY_PAGE_MASK(copy));
11660 size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
11661 VM_MAP_COPY_PAGE_MASK(copy))
11662 - vm_copy_start;
11663
11664 vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map, size);
11665
11666 vm_map_lock(dst_map);
11667 kr = vm_map_locate_space_anywhere(dst_map, size, 0, vmk_flags,
11668 &start, &last);
11669 if (kr != KERN_SUCCESS) {
11670 vm_map_unlock(dst_map);
11671 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */);
11672 return kr;
11673 }
11674
11675 adjustment = start - vm_copy_start;
11676 if (!consume_on_success) {
11677 /*
11678 * We're not allowed to consume "copy", so we'll have to
11679 * copy its map entries into the destination map below.
11680 * No need to re-allocate map entries from the correct
11681 * (pageable or not) zone, since we'll get new map entries
11682 * during the transfer.
11683 		 * We'll also adjust the map entries' "start" and "end"
11684 * during the transfer, to keep "copy"'s entries consistent
11685 * with its "offset".
11686 */
11687 goto after_adjustments;
11688 }
11689
11690 /*
11691 * Since we're going to just drop the map
11692 * entries from the copy into the destination
11693 * map, they must come from the same pool.
11694 */
11695
11696 if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
11697 /*
11698 * Mismatches occur when dealing with the default
11699 * pager.
11700 */
11701 vm_map_entry_t next, new;
11702
11703 /*
11704 * Find the zone that the copies were allocated from
11705 */
11706
11707 entry = vm_map_copy_first_entry(copy);
11708
11709 /*
11710 * Reinitialize the copy so that vm_map_copy_entry_link
11711 * will work.
11712 */
11713 vm_map_store_copy_reset(copy, entry);
11714 copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;
11715
11716 /*
11717 * Copy each entry.
11718 */
11719 while (entry != vm_map_copy_to_entry(copy)) {
11720 new = vm_map_copy_entry_create(copy);
11721 vm_map_entry_copy_full(new, entry);
11722 new->vme_no_copy_on_read = FALSE;
11723 assert(!new->iokit_acct);
11724 if (new->is_sub_map) {
11725 /* clr address space specifics */
11726 new->use_pmap = FALSE;
11727 }
11728 vm_map_copy_entry_link(copy,
11729 vm_map_copy_last_entry(copy),
11730 new);
11731 next = entry->vme_next;
11732 vm_map_entry_dispose(entry);
11733 entry = next;
11734 }
11735 }
11736
11737 /*
11738 * Adjust the addresses in the copy chain, and
11739 * reset the region attributes.
11740 */
11741
11742 for (entry = vm_map_copy_first_entry(copy);
11743 entry != vm_map_copy_to_entry(copy);
11744 entry = entry->vme_next) {
11745 if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
11746 /*
11747 * We're injecting this copy entry into a map that
11748 * has the standard page alignment, so clear
11749 * "map_aligned" (which might have been inherited
11750 * from the original map entry).
11751 */
11752 entry->map_aligned = FALSE;
11753 }
11754
11755 entry->vme_start += adjustment;
11756 entry->vme_end += adjustment;
11757
11758 if (entry->map_aligned) {
11759 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
11760 VM_MAP_PAGE_MASK(dst_map)));
11761 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
11762 VM_MAP_PAGE_MASK(dst_map)));
11763 }
11764
11765 entry->inheritance = VM_INHERIT_DEFAULT;
11766 entry->protection = VM_PROT_DEFAULT;
11767 entry->max_protection = VM_PROT_ALL;
11768 entry->behavior = VM_BEHAVIOR_DEFAULT;
11769
11770 /*
11771 * If the entry is now wired,
11772 * map the pages into the destination map.
11773 */
11774 if (entry->wired_count != 0) {
11775 vm_map_offset_t va;
11776 vm_object_offset_t offset;
11777 vm_object_t object;
11778 vm_prot_t prot;
11779 int type_of_fault;
11780 uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;
11781
11782 /* TODO4K would need to use actual page size */
11783 assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);
11784
11785 object = VME_OBJECT(entry);
11786 offset = VME_OFFSET(entry);
11787 va = entry->vme_start;
11788
11789 pmap_pageable(dst_map->pmap,
11790 entry->vme_start,
11791 entry->vme_end,
11792 TRUE);
11793
11794 while (va < entry->vme_end) {
11795 vm_page_t m;
11796 struct vm_object_fault_info fault_info = {
11797 .interruptible = THREAD_UNINT,
11798 };
11799
11800 /*
11801 * Look up the page in the object.
11802 * Assert that the page will be found in the
11803 * top object:
11804 * either
11805 * the object was newly created by
11806 * vm_object_copy_slowly, and has
11807 * copies of all of the pages from
11808 * the source object
11809 * or
11810 * the object was moved from the old
11811 * map entry; because the old map
11812 * entry was wired, all of the pages
11813 * were in the top-level object.
11814 * (XXX not true if we wire pages for
11815 * reading)
11816 */
11817 vm_object_lock(object);
11818
11819 m = vm_page_lookup(object, offset);
11820 if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
11821 m->vmp_absent) {
11822 panic("vm_map_copyout: wiring %p", m);
11823 }
11824
11825 prot = entry->protection;
11826
11827 if (override_nx(dst_map, VME_ALIAS(entry)) &&
11828 prot) {
11829 prot |= VM_PROT_EXECUTE;
11830 }
11831
11832 type_of_fault = DBG_CACHE_HIT_FAULT;
11833
11834 fault_info.user_tag = VME_ALIAS(entry);
11835 fault_info.pmap_options = 0;
11836 if (entry->iokit_acct ||
11837 (!entry->is_sub_map && !entry->use_pmap)) {
11838 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
11839 }
11840 if (entry->vme_xnu_user_debug &&
11841 !VM_PAGE_OBJECT(m)->code_signed) {
11842 /*
11843 * Modified code-signed executable
11844 * region: this page does not belong
11845 * to a code-signed VM object, so it
11846 * must have been copied and should
11847 * therefore be typed XNU_USER_DEBUG
11848 * rather than XNU_USER_EXEC.
11849 */
11850 fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
11851 }
11852
11853 vm_fault_enter(m,
11854 dst_map->pmap,
11855 va,
11856 PAGE_SIZE, 0,
11857 prot,
11858 prot,
11859 VM_PAGE_WIRED(m),
11860 VM_KERN_MEMORY_NONE, /* tag - not wiring */
11861 &fault_info,
11862 NULL, /* need_retry */
11863 &type_of_fault,
11864 &object_lock_type); /*Exclusive mode lock. Will remain unchanged.*/
11865
11866 vm_object_unlock(object);
11867
11868 offset += PAGE_SIZE_64;
11869 va += PAGE_SIZE;
11870 }
11871 }
11872 }
11873
11874 after_adjustments:
11875
11876 /*
11877 * Correct the page alignment for the result
11878 */
11879
11880 *dst_addr = start + (copy->offset - vm_copy_start);
11881
11882 #if KASAN
11883 kasan_notify_address(*dst_addr, size);
11884 #endif
11885
11886 /*
11887 * Update the hints and the map size
11888 */
11889
11890 if (consume_on_success) {
11891 SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
11892 } else {
11893 SAVE_HINT_MAP_WRITE(dst_map, last);
11894 }
11895
11896 dst_map->size += size;
11897
11898 /*
11899 * Link in the copy
11900 */
11901
11902 if (consume_on_success) {
11903 vm_map_copy_insert(dst_map, last, copy);
11904 if (copy != original_copy) {
11905 vm_map_copy_discard(original_copy);
11906 original_copy = VM_MAP_COPY_NULL;
11907 }
11908 } else {
11909 vm_map_copy_remap(dst_map, last, copy, adjustment,
11910 cur_protection, max_protection,
11911 inheritance);
11912 if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
11913 vm_map_copy_discard(copy);
11914 copy = original_copy;
11915 }
11916 }
11917
11918
11919 vm_map_unlock(dst_map);
11920
11921 /*
11922 * XXX If wiring_required, call vm_map_pageable
11923 */
11924
11925 return KERN_SUCCESS;
11926 }
11927
11928 /*
11929 * Routine: vm_map_copyout_size
11930 *
11931 * Description:
11932 * Copy out a copy chain ("copy") into newly-allocated
11933 * space in the destination map. Uses a prevalidated
11934 * size for the copy object (vm_map_copy_validate_size).
11935 *
11936 * If successful, consumes the copy object.
11937 * Otherwise, the caller is responsible for it.
11938 */
11939 kern_return_t
11940 vm_map_copyout_size(
11941 vm_map_t dst_map,
11942 vm_map_address_t *dst_addr, /* OUT */
11943 vm_map_copy_t copy,
11944 vm_map_size_ut copy_size)
11945 {
11946 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11947 TRUE, /* consume_on_success */
11948 VM_PROT_DEFAULT,
11949 VM_PROT_ALL,
11950 VM_INHERIT_DEFAULT);
11951 }
11952
11953 /*
11954 * Routine: vm_map_copyout
11955 *
11956 * Description:
11957 * Copy out a copy chain ("copy") into newly-allocated
11958 * space in the destination map.
11959 *
11960 * If successful, consumes the copy object.
11961 * Otherwise, the caller is responsible for it.
11962 */
11963 kern_return_t
11964 vm_map_copyout(
11965 vm_map_t dst_map,
11966 vm_map_address_t *dst_addr, /* OUT */
11967 vm_map_copy_t copy)
11968 {
11969 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11970 TRUE, /* consume_on_success */
11971 VM_PROT_DEFAULT,
11972 VM_PROT_ALL,
11973 VM_INHERIT_DEFAULT);
11974 }
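/*
 * Note (annotation; not part of the original source): both wrappers above
 * consume the copy object on success and install the defaults
 * VM_PROT_DEFAULT / VM_PROT_ALL / VM_INHERIT_DEFAULT on the new mappings;
 * a caller needing different protections, inheritance, or a non-consuming
 * copyout would have to go through vm_map_copyout_internal() directly.
 */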
11975
11976 /*
11977 * Routine: vm_map_copyin
11978 *
11979 * Description:
11980 * see vm_map_copyin_common. Exported via Unsupported.exports.
11981 *
11982 */
11983 kern_return_t
11984 vm_map_copyin(
11985 vm_map_t src_map,
11986 vm_map_address_ut src_addr,
11987 vm_map_size_ut len,
11988 boolean_t src_destroy,
11989 vm_map_copy_t *copy_result) /* OUT */
11990 {
11991 return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11992 FALSE, copy_result, FALSE);
11993 }
11994
11995 /*
11996 * Routine: vm_map_copyin_common
11997 *
11998 * Description:
11999 * Copy the specified region (src_addr, len) from the
12000 * source address space (src_map), possibly removing
12001 * the region from the source address space (src_destroy).
12002 *
12003 * Returns:
12004 * A vm_map_copy_t object (copy_result), suitable for
12005 * insertion into another address space (using vm_map_copyout),
12006 * copying over another address space region (using
12007 * vm_map_copy_overwrite). If the copy is unused, it
12008 * should be destroyed (using vm_map_copy_discard).
12009 *
12010 * In/out conditions:
12011 * The source map should not be locked on entry.
12012 */
12013
12014 typedef struct submap_map {
12015 vm_map_t parent_map;
12016 vm_map_offset_t base_start;
12017 vm_map_offset_t base_end;
12018 vm_map_size_t base_len;
12019 struct submap_map *next;
12020 } submap_map_t;
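/*
 * Note (annotation; not part of the original source): vm_map_copyin_internal()
 * pushes one submap_map_t per submap level it descends into, recording the
 * parent map and the portion of the base-map range covered by that level.
 * Once a submap's share of the range has been copied, it pops the stack,
 * restores "src_start"/"src_end" in the parent map and continues from there.
 */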
12021
12022 kern_return_t
12023 vm_map_copyin_common(
12024 vm_map_t src_map,
12025 vm_map_address_ut src_addr,
12026 vm_map_size_ut len,
12027 boolean_t src_destroy,
12028 __unused boolean_t src_volatile,
12029 vm_map_copy_t *copy_result, /* OUT */
12030 boolean_t use_maxprot)
12031 {
12032 int flags;
12033
12034 flags = 0;
12035 if (src_destroy) {
12036 flags |= VM_MAP_COPYIN_SRC_DESTROY;
12037 }
12038 if (use_maxprot) {
12039 flags |= VM_MAP_COPYIN_USE_MAXPROT;
12040 }
12041 return vm_map_copyin_internal(src_map,
12042 src_addr,
12043 len,
12044 flags,
12045 copy_result);
12046 }
12047
12048 static __attribute__((always_inline, warn_unused_result))
12049 kern_return_t
12050 vm_map_copyin_sanitize(
12051 vm_map_t src_map,
12052 vm_map_address_ut src_addr_u,
12053 vm_map_size_ut len_u,
12054 vm_map_offset_t *src_start,
12055 vm_map_offset_t *src_end,
12056 vm_map_size_t *len,
12057 vm_map_offset_t *src_addr_unaligned)
12058 {
12059 kern_return_t kr;
12060 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS |
12061 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES |
12062 VM_SANITIZE_FLAGS_CHECK_ADDR_RANGE;
12063
12064 #if KASAN_TBI
12065 if (vm_kernel_map_is_kernel(src_map)) {
12066 flags |= VM_SANITIZE_FLAGS_CANONICALIZE;
12067 }
12068 #endif /* KASAN_TBI */
12069
12070 kr = vm_sanitize_addr_size(src_addr_u, len_u,
12071 VM_SANITIZE_CALLER_VM_MAP_COPYIN,
12072 src_map,
12073 flags,
12074 src_start, src_end, len);
12075 if (__improbable(kr != KERN_SUCCESS)) {
12076 return kr;
12077 }
12078
12079 /*
12080 * Compute (page aligned) start and end of region
12081 */
12082 *src_addr_unaligned = *src_start; /* remember unaligned value */
12083 *src_start = vm_map_trunc_page(*src_addr_unaligned,
12084 VM_MAP_PAGE_MASK(src_map));
12085 *src_end = vm_map_round_page(*src_end, VM_MAP_PAGE_MASK(src_map));
12086 return KERN_SUCCESS;
12087 }
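/*
 * Worked example (annotation; not part of the original source), assuming a
 * 16K source map (VM_MAP_PAGE_MASK == 0x3fff), src_addr == 0x10100 and
 * len == 0x2000:
 *
 *	*src_addr_unaligned = 0x10100			(original, unrounded)
 *	*src_start = trunc_page(0x10100)  = 0x10000
 *	*src_end   = round_page(0x12100)  = 0x14000
 *	*len       = 0x2000				(unaligned length)
 */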
12088
12089 kern_return_t
12090 vm_map_copyin_internal(
12091 vm_map_t src_map,
12092 vm_map_address_ut src_addr_u,
12093 vm_map_size_ut len_u,
12094 int flags,
12095 vm_map_copy_t *copy_result) /* OUT */
12096 {
12097 vm_map_entry_t tmp_entry; /* Result of last map lookup --
12098 * in multi-level lookup, this
12099 * entry contains the actual
12100 * vm_object/offset.
12101 */
12102 vm_map_entry_t new_entry = VM_MAP_ENTRY_NULL; /* Map entry for copy */
12103
12104 vm_map_offset_t src_start; /* Start of current entry --
12105 * where copy is taking place now
12106 */
12107 vm_map_offset_t src_end; /* End of entire region to be
12108 * copied */
12109 vm_map_offset_t src_addr_unaligned;
12110 vm_map_offset_t src_base;
12111 vm_map_size_t len;
12112 vm_map_t base_map = src_map;
12113 boolean_t map_share = FALSE;
12114 submap_map_t *parent_maps = NULL;
12115
12116 vm_map_copy_t copy; /* Resulting copy */
12117 vm_map_address_t copy_addr;
12118 vm_map_size_t copy_size;
12119 boolean_t src_destroy;
12120 boolean_t use_maxprot;
12121 boolean_t preserve_purgeable;
12122 boolean_t entry_was_shared;
12123 vm_map_entry_t saved_src_entry;
12124 kern_return_t kr;
12125
12126 if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
12127 return KERN_INVALID_ARGUMENT;
12128 }
12129
12130 /*
12131 * Check for copies of zero bytes.
12132 */
12133 if (VM_SANITIZE_UNSAFE_IS_ZERO(len_u)) {
12134 *copy_result = VM_MAP_COPY_NULL;
12135 return KERN_SUCCESS;
12136 }
12137
12138 /*
12139 * Sanitize any input parameters that are addr/size/prot/inherit
12140 */
12141 kr = vm_map_copyin_sanitize(
12142 src_map,
12143 src_addr_u,
12144 len_u,
12145 &src_start,
12146 &src_end,
12147 &len,
12148 &src_addr_unaligned);
12149 if (__improbable(kr != KERN_SUCCESS)) {
12150 return vm_sanitize_get_kr(kr);
12151 }
12152
12153
12154 src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
12155 use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
12156 preserve_purgeable =
12157 (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
12158
12159 /*
12160 * If the copy is sufficiently small, use a kernel buffer instead
12161 * of making a virtual copy. The theory being that the cost of
12162 * setting up VM (and taking C-O-W faults) dominates the copy costs
12163 * for small regions.
12164 */
12165 if ((len <= msg_ool_size_small) &&
12166 !use_maxprot &&
12167 !preserve_purgeable &&
12168 !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
12169 /*
12170 * Since the "msg_ool_size_small" threshold was increased and
12171 * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
12172 * address space limits, we revert to doing a virtual copy if the
12173 * copied range goes beyond those limits. Otherwise, mach_vm_read()
12174 * of the commpage would now fail when it used to work.
12175 */
12176 (src_start >= vm_map_min(src_map) &&
12177 src_start < vm_map_max(src_map) &&
12178 src_end >= vm_map_min(src_map) &&
12179 src_end < vm_map_max(src_map))) {
12180 return vm_map_copyin_kernel_buffer(src_map, src_addr_unaligned, len,
12181 src_destroy, copy_result);
12182 }
12183
12184 /*
12185 * Allocate a header element for the list.
12186 *
12187 * Use the start and end in the header to
12188 * remember the endpoints prior to rounding.
12189 */
12190
12191 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12192 copy->cpy_hdr.entries_pageable = TRUE;
12193 copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
12194 copy->offset = src_addr_unaligned;
12195 copy->size = len;
12196
12197 new_entry = vm_map_copy_entry_create(copy);
12198
12199 #define RETURN(x) \
12200 MACRO_BEGIN \
12201 vm_map_unlock(src_map); \
12202 if(src_map != base_map) \
12203 vm_map_deallocate(src_map); \
12204 if (new_entry != VM_MAP_ENTRY_NULL) \
12205 vm_map_copy_entry_dispose(new_entry); \
12206 vm_map_copy_discard(copy); \
12207 { \
12208 submap_map_t *_ptr; \
12209 \
12210 for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
12211 parent_maps=parent_maps->next; \
12212 if (_ptr->parent_map != base_map) \
12213 vm_map_deallocate(_ptr->parent_map); \
12214 kfree_type(submap_map_t, _ptr); \
12215 } \
12216 } \
12217 MACRO_RETURN(x); \
12218 MACRO_END
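/*
 * Note (annotation; not part of the original source): the RETURN() macro
 * centralizes error-path cleanup for the copy loop below: it unlocks the
 * current source map, drops the reference taken on any submap, disposes of a
 * pending "new_entry", discards the partially built copy object and releases
 * the submap_map_t chain before returning the given error code.
 */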
12219
12220 /*
12221 * Find the beginning of the region.
12222 */
12223
12224 vm_map_lock(src_map);
12225
12226 /*
12227 * Lookup the original "src_addr_unaligned" rather than the truncated
12228 * "src_start", in case "src_start" falls in a non-map-aligned
12229 * map entry *before* the map entry that contains "src_addr_unaligned"...
12230 */
12231 if (!vm_map_lookup_entry(src_map, src_addr_unaligned, &tmp_entry)) {
12232 RETURN(KERN_INVALID_ADDRESS);
12233 }
12234 if (!tmp_entry->is_sub_map) {
12235 /*
12236 * ... but clip to the map-rounded "src_start" rather than
12237 * "src_addr_unaligned" to preserve map-alignment. We'll adjust the
12238 * first copy entry at the end, if needed.
12239 */
12240 vm_map_clip_start(src_map, tmp_entry, src_start);
12241 }
12242 if (src_start < tmp_entry->vme_start) {
12243 /*
12244 * Move "src_start" up to the start of the
12245 * first map entry to copy.
12246 */
12247 src_start = tmp_entry->vme_start;
12248 }
12249 /* set for later submap fix-up */
12250 copy_addr = src_start;
12251
12252 /*
12253 * Go through entries until we get to the end.
12254 */
12255
12256 while (TRUE) {
12257 vm_map_entry_t src_entry = tmp_entry; /* Top-level entry */
12258 vm_map_size_t src_size; /* Size of source
12259 * map entry (in both
12260 * maps)
12261 */
12262
12263 vm_object_t src_object; /* Object to copy */
12264 vm_object_offset_t src_offset;
12265
12266 vm_object_t new_copy_object;/* vm_object_copy_* result */
12267
12268 boolean_t src_needs_copy; /* Should source map
12269 * be made read-only
12270 * for copy-on-write?
12271 */
12272
12273 boolean_t new_entry_needs_copy; /* Will new entry be COW? */
12274
12275 boolean_t was_wired; /* Was source wired? */
12276 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
12277 vm_map_version_t version; /* Version before locks
12278 * dropped to make copy
12279 */
12280 kern_return_t result; /* Return value from
12281 * copy_strategically.
12282 */
12283 while (tmp_entry->is_sub_map) {
12284 vm_map_size_t submap_len;
12285 submap_map_t *ptr;
12286
12287 ptr = kalloc_type(submap_map_t, Z_WAITOK);
12288 ptr->next = parent_maps;
12289 parent_maps = ptr;
12290 ptr->parent_map = src_map;
12291 ptr->base_start = src_start;
12292 ptr->base_end = src_end;
12293 submap_len = tmp_entry->vme_end - src_start;
12294 if (submap_len > (src_end - src_start)) {
12295 submap_len = src_end - src_start;
12296 }
12297 ptr->base_len = submap_len;
12298
12299 src_start -= tmp_entry->vme_start;
12300 src_start += VME_OFFSET(tmp_entry);
12301 src_end = src_start + submap_len;
12302 src_map = VME_SUBMAP(tmp_entry);
12303 vm_map_lock(src_map);
12304 /* keep an outstanding reference for all maps in */
12305 /* the parents tree except the base map */
12306 vm_map_reference(src_map);
12307 vm_map_unlock(ptr->parent_map);
12308 if (!vm_map_lookup_entry(
12309 src_map, src_start, &tmp_entry)) {
12310 RETURN(KERN_INVALID_ADDRESS);
12311 }
12312 map_share = TRUE;
12313 if (!tmp_entry->is_sub_map) {
12314 vm_map_clip_start(src_map, tmp_entry, src_start);
12315 }
12316 src_entry = tmp_entry;
12317 }
12318 /* we are now in the lowest level submap... */
12319
12320 if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
12321 (VME_OBJECT(tmp_entry)->phys_contiguous)) {
12322 			/* This is not supported for now. In the future */
12323 /* we will need to detect the phys_contig */
12324 /* condition and then upgrade copy_slowly */
12325 /* to do physical copy from the device mem */
12326 /* based object. We can piggy-back off of */
12327 /* the was wired boolean to set-up the */
12328 /* proper handling */
12329 RETURN(KERN_PROTECTION_FAILURE);
12330 }
12331 /*
12332 * Create a new address map entry to hold the result.
12333 * Fill in the fields from the appropriate source entries.
12334 * We must unlock the source map to do this if we need
12335 * to allocate a map entry.
12336 */
12337 if (new_entry == VM_MAP_ENTRY_NULL) {
12338 version.main_timestamp = src_map->timestamp;
12339 vm_map_unlock(src_map);
12340
12341 new_entry = vm_map_copy_entry_create(copy);
12342
12343 vm_map_lock(src_map);
12344 if ((version.main_timestamp + 1) != src_map->timestamp) {
12345 if (!vm_map_lookup_entry(src_map, src_start,
12346 &tmp_entry)) {
12347 RETURN(KERN_INVALID_ADDRESS);
12348 }
12349 if (!tmp_entry->is_sub_map) {
12350 vm_map_clip_start(src_map, tmp_entry, src_start);
12351 }
12352 continue; /* restart w/ new tmp_entry */
12353 }
12354 }
12355
12356 /*
12357 * Verify that the region can be read.
12358 */
12359 if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
12360 !use_maxprot) ||
12361 (src_entry->max_protection & VM_PROT_READ) == 0) {
12362 RETURN(KERN_PROTECTION_FAILURE);
12363 }
12364
12365 src_object = VME_OBJECT(src_entry);
12366
12367
12368 /*
12369 * Clip against the endpoints of the entire region.
12370 */
12371
12372 vm_map_clip_end(src_map, src_entry, src_end);
12373
12374 src_size = src_entry->vme_end - src_start;
12375 src_offset = VME_OFFSET(src_entry);
12376 was_wired = (src_entry->wired_count != 0);
12377
12378 vm_map_entry_copy(src_map, new_entry, src_entry);
12379 if (new_entry->is_sub_map) {
12380 /* clr address space specifics */
12381 new_entry->use_pmap = FALSE;
12382 } else {
12383 /*
12384 * We're dealing with a copy-on-write operation,
12385 * so the resulting mapping should not inherit the
12386 * original mapping's accounting settings.
12387 * "iokit_acct" should have been cleared in
12388 * vm_map_entry_copy().
12389 * "use_pmap" should be reset to its default (TRUE)
12390 * so that the new mapping gets accounted for in
12391 * the task's memory footprint.
12392 */
12393 assert(!new_entry->iokit_acct);
12394 new_entry->use_pmap = TRUE;
12395 }
12396
12397 /*
12398 * Attempt non-blocking copy-on-write optimizations.
12399 */
12400
12401 /*
12402 * If we are destroying the source, and the object
12403 * is internal, we could move the object reference
12404 * from the source to the copy. The copy is
12405 * copy-on-write only if the source is.
12406 * We make another reference to the object, because
12407 * destroying the source entry will deallocate it.
12408 *
12409 * This memory transfer has to be atomic, (to prevent
12410 * the VM object from being shared or copied while
12411 * it's being moved here), so we could only do this
12412 * if we won't have to unlock the VM map until the
12413 * original mapping has been fully removed.
12414 */
12415
12416 RestartCopy:
12417 if ((src_object == VM_OBJECT_NULL ||
12418 (!was_wired && !map_share && !tmp_entry->is_shared
12419 && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
12420 vm_object_copy_quickly(
12421 VME_OBJECT(new_entry),
12422 src_offset,
12423 src_size,
12424 &src_needs_copy,
12425 &new_entry_needs_copy)) {
12426 new_entry->needs_copy = new_entry_needs_copy;
12427
12428 /*
12429 * Handle copy-on-write obligations
12430 */
12431
12432 if (src_needs_copy && !tmp_entry->needs_copy) {
12433 vm_prot_t prot;
12434
12435 prot = src_entry->protection & ~VM_PROT_WRITE;
12436
12437 if (override_nx(src_map, VME_ALIAS(src_entry))
12438 && prot) {
12439 prot |= VM_PROT_EXECUTE;
12440 }
12441
12442 vm_object_pmap_protect(
12443 src_object,
12444 src_offset,
12445 src_size,
12446 (src_entry->is_shared ?
12447 PMAP_NULL
12448 : src_map->pmap),
12449 VM_MAP_PAGE_SIZE(src_map),
12450 src_entry->vme_start,
12451 prot);
12452
12453 assert(tmp_entry->wired_count == 0);
12454 tmp_entry->needs_copy = TRUE;
12455 }
12456
12457 /*
12458 * The map has never been unlocked, so it's safe
12459 * to move to the next entry rather than doing
12460 * another lookup.
12461 */
12462
12463 goto CopySuccessful;
12464 }
12465
12466 entry_was_shared = tmp_entry->is_shared;
12467
12468 /*
12469 * Take an object reference, so that we may
12470 * release the map lock(s).
12471 */
12472
12473 assert(src_object != VM_OBJECT_NULL);
12474 vm_object_reference(src_object);
12475
12476 /*
12477 * Record the timestamp for later verification.
12478 * Unlock the map.
12479 */
12480
12481 version.main_timestamp = src_map->timestamp;
12482 vm_map_unlock(src_map); /* Increments timestamp once! */
12483 saved_src_entry = src_entry;
12484 tmp_entry = VM_MAP_ENTRY_NULL;
12485 src_entry = VM_MAP_ENTRY_NULL;
12486
12487 /*
12488 * Perform the copy
12489 */
12490
12491 if (was_wired ||
12492 (src_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK &&
12493 !(flags & VM_MAP_COPYIN_FORK)) ||
12494 (debug4k_no_cow_copyin &&
12495 VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
12496 CopySlowly:
12497 vm_object_lock(src_object);
12498 result = vm_object_copy_slowly(
12499 src_object,
12500 src_offset,
12501 src_size,
12502 THREAD_UNINT,
12503 &new_copy_object);
12504 /* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
12505 saved_used_for_jit = new_entry->used_for_jit;
12506 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12507 new_entry->used_for_jit = saved_used_for_jit;
12508 VME_OFFSET_SET(new_entry,
12509 src_offset - vm_object_trunc_page(src_offset));
12510 new_entry->needs_copy = FALSE;
12511 } else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
12512 (entry_was_shared || map_share)) {
12513 vm_object_t new_object;
12514
12515 vm_object_lock_shared(src_object);
12516 new_object = vm_object_copy_delayed(
12517 src_object,
12518 src_offset,
12519 src_size,
12520 TRUE);
12521 if (new_object == VM_OBJECT_NULL) {
12522 goto CopySlowly;
12523 }
12524
12525 VME_OBJECT_SET(new_entry, new_object, false, 0);
12526 assert(new_entry->wired_count == 0);
12527 new_entry->needs_copy = TRUE;
12528 assert(!new_entry->iokit_acct);
12529 assert(new_object->purgable == VM_PURGABLE_DENY);
12530 assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
12531 result = KERN_SUCCESS;
12532 } else {
12533 vm_object_offset_t new_offset;
12534 new_offset = VME_OFFSET(new_entry);
12535 result = vm_object_copy_strategically(src_object,
12536 src_offset,
12537 src_size,
12538 (flags & VM_MAP_COPYIN_FORK),
12539 &new_copy_object,
12540 &new_offset,
12541 &new_entry_needs_copy);
12542 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
12543 saved_used_for_jit = new_entry->used_for_jit;
12544 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12545 new_entry->used_for_jit = saved_used_for_jit;
12546 if (new_offset != VME_OFFSET(new_entry)) {
12547 VME_OFFSET_SET(new_entry, new_offset);
12548 }
12549
12550 new_entry->needs_copy = new_entry_needs_copy;
12551 }
12552
12553 if (result == KERN_SUCCESS &&
12554 ((preserve_purgeable &&
12555 src_object->purgable != VM_PURGABLE_DENY) ||
12556 new_entry->used_for_jit)) {
12557 /*
12558 * Purgeable objects should be COPY_NONE, true share;
12559 			 * this should be propagated to the copy.
12560 *
12561 * Also force mappings the pmap specially protects to
12562 * be COPY_NONE; trying to COW these mappings would
12563 * change the effective protections, which could have
12564 * side effects if the pmap layer relies on the
12565 * specified protections.
12566 */
12567
12568 vm_object_t new_object;
12569
12570 new_object = VME_OBJECT(new_entry);
12571 assert(new_object != src_object);
12572 vm_object_lock(new_object);
12573 assert(os_ref_get_count_raw(&new_object->ref_count) == 1);
12574 assert(new_object->shadow == VM_OBJECT_NULL);
12575 assert(new_object->vo_copy == VM_OBJECT_NULL);
12576 assert(new_object->vo_owner == NULL);
12577
12578 new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
12579
12580 if (preserve_purgeable &&
12581 src_object->purgable != VM_PURGABLE_DENY) {
12582 VM_OBJECT_SET_TRUE_SHARE(new_object, TRUE);
12583
12584 /* start as non-volatile with no owner... */
12585 VM_OBJECT_SET_PURGABLE(new_object, VM_PURGABLE_NONVOLATILE);
12586 vm_purgeable_nonvolatile_enqueue(new_object, NULL);
12587 /* ... and move to src_object's purgeable state */
12588 if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
12589 int state;
12590 state = src_object->purgable;
12591 vm_object_purgable_control(
12592 new_object,
12593 VM_PURGABLE_SET_STATE_FROM_KERNEL,
12594 &state);
12595 }
12596 /* no pmap accounting for purgeable objects */
12597 new_entry->use_pmap = FALSE;
12598 }
12599
12600 vm_object_unlock(new_object);
12601 new_object = VM_OBJECT_NULL;
12602 }
12603
12604 /*
12605 * Throw away the extra reference
12606 */
12607
12608 vm_object_deallocate(src_object);
12609
12610 if (result != KERN_SUCCESS &&
12611 result != KERN_MEMORY_RESTART_COPY) {
12612 vm_map_lock(src_map);
12613 RETURN(result);
12614 }
12615
12616 /*
12617 * Verify that the map has not substantially
12618 * changed while the copy was being made.
12619 */
12620
12621 vm_map_lock(src_map);
12622
12623 if ((version.main_timestamp + 1) == src_map->timestamp) {
12624 /* src_map hasn't changed: src_entry is still valid */
12625 src_entry = saved_src_entry;
12626 goto VerificationSuccessful;
12627 }
12628
12629 /*
12630 * Simple version comparison failed.
12631 *
12632 * Retry the lookup and verify that the
12633 * same object/offset are still present.
12634 *
12635 * [Note: a memory manager that colludes with
12636 * the calling task can detect that we have
12637 * cheated. While the map was unlocked, the
12638 * mapping could have been changed and restored.]
12639 */
12640
12641 if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
12642 if (result != KERN_MEMORY_RESTART_COPY) {
12643 vm_object_deallocate(VME_OBJECT(new_entry));
12644 VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
12645 /* reset accounting state */
12646 new_entry->iokit_acct = FALSE;
12647 new_entry->use_pmap = TRUE;
12648 }
12649 RETURN(KERN_INVALID_ADDRESS);
12650 }
12651
12652 src_entry = tmp_entry;
12653 vm_map_clip_start(src_map, src_entry, src_start);
12654
12655 if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
12656 !use_maxprot) ||
12657 ((src_entry->max_protection & VM_PROT_READ) == 0)) {
12658 goto VerificationFailed;
12659 }
12660
12661 if (src_entry->vme_end < new_entry->vme_end) {
12662 /*
12663 * This entry might have been shortened
12664 * (vm_map_clip_end) or been replaced with
12665 * an entry that ends closer to "src_start"
12666 * than before.
12667 * Adjust "new_entry" accordingly; copying
12668 * less memory would be correct but we also
12669 * redo the copy (see below) if the new entry
12670 * no longer points at the same object/offset.
12671 */
12672 assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
12673 VM_MAP_COPY_PAGE_MASK(copy)));
12674 new_entry->vme_end = src_entry->vme_end;
12675 src_size = new_entry->vme_end - src_start;
12676 } else if (src_entry->vme_end > new_entry->vme_end) {
12677 /*
12678 * This entry might have been extended
12679 * (vm_map_entry_simplify() or coalesce)
12680 * or been replaced with an entry that ends farther
12681 * from "src_start" than before.
12682 *
12683 * We've called vm_object_copy_*() only on
12684 * the previous <start:end> range, so we can't
12685 * just extend new_entry. We have to re-do
12686 * the copy based on the new entry as if it was
12687 * pointing at a different object/offset (see
12688 * "Verification failed" below).
12689 */
12690 }
12691
12692 if ((VME_OBJECT(src_entry) != src_object) ||
12693 (VME_OFFSET(src_entry) != src_offset) ||
12694 (src_entry->vme_end > new_entry->vme_end)) {
12695 /*
12696 * Verification failed.
12697 *
12698 * Start over with this top-level entry.
12699 */
12700
12701 VerificationFailed: ;
12702
12703 vm_object_deallocate(VME_OBJECT(new_entry));
12704 tmp_entry = src_entry;
12705 continue;
12706 }
12707
12708 /*
12709 * Verification succeeded.
12710 */
12711
12712 VerificationSuccessful:;
12713
12714 if (result == KERN_MEMORY_RESTART_COPY) {
12715 goto RestartCopy;
12716 }
12717
12718 /*
12719 * Copy succeeded.
12720 */
12721
12722 CopySuccessful: ;
12723
12724 /*
12725 * Link in the new copy entry.
12726 */
12727
12728 vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
12729 new_entry);
12730
12731 /*
12732 * Determine whether the entire region
12733 * has been copied.
12734 */
12735 src_base = src_start;
12736 src_start = new_entry->vme_end;
12737 new_entry = VM_MAP_ENTRY_NULL;
12738 while ((src_start >= src_end) && (src_end != 0)) {
12739 submap_map_t *ptr;
12740
12741 if (src_map == base_map) {
12742 /* back to the top */
12743 break;
12744 }
12745
12746 ptr = parent_maps;
12747 assert(ptr != NULL);
12748 parent_maps = parent_maps->next;
12749
12750 /* fix up the damage we did in that submap */
12751 vm_map_simplify_range(src_map,
12752 src_base,
12753 src_end);
12754
12755 vm_map_unlock(src_map);
12756 vm_map_deallocate(src_map);
12757 vm_map_lock(ptr->parent_map);
12758 src_map = ptr->parent_map;
12759 src_base = ptr->base_start;
12760 src_start = ptr->base_start + ptr->base_len;
12761 src_end = ptr->base_end;
12762 if (!vm_map_lookup_entry(src_map,
12763 src_start,
12764 &tmp_entry) &&
12765 (src_end > src_start)) {
12766 RETURN(KERN_INVALID_ADDRESS);
12767 }
12768 kfree_type(submap_map_t, ptr);
12769 if (parent_maps == NULL) {
12770 map_share = FALSE;
12771 }
12772 src_entry = tmp_entry->vme_prev;
12773 }
12774
12775 if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
12776 (src_start >= src_addr_unaligned + len) &&
12777 (src_addr_unaligned + len != 0)) {
12778 /*
12779 * Stop copying now, even though we haven't reached
12780 * "src_end". We'll adjust the end of the last copy
12781 * entry at the end, if needed.
12782 *
12783 			 * If src_map's alignment is different from the
12784 * system's page-alignment, there could be
12785 * extra non-map-aligned map entries between
12786 * the original (non-rounded) "src_addr_unaligned + len"
12787 * and the rounded "src_end".
12788 * We do not want to copy those map entries since
12789 * they're not part of the copied range.
12790 */
12791 break;
12792 }
12793
12794 if ((src_start >= src_end) && (src_end != 0)) {
12795 break;
12796 }
12797
12798 /*
12799 * Verify that there are no gaps in the region
12800 */
12801
12802 tmp_entry = src_entry->vme_next;
12803 if ((tmp_entry->vme_start != src_start) ||
12804 (tmp_entry == vm_map_to_entry(src_map))) {
12805 RETURN(KERN_INVALID_ADDRESS);
12806 }
12807 }
12808
12809 /*
12810 * If the source should be destroyed, do it now, since the
12811 * copy was successful.
12812 */
12813 if (src_destroy) {
12814 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
12815
12816 if (src_map == kernel_map) {
12817 remove_flags |= VM_MAP_REMOVE_KUNWIRE;
12818 }
12819 (void)vm_map_remove_and_unlock(src_map,
12820 vm_map_trunc_page(src_addr_unaligned, VM_MAP_PAGE_MASK(src_map)),
12821 src_end,
12822 remove_flags,
12823 KMEM_GUARD_NONE);
12824 } else {
12825 /* fix up the damage we did in the base map */
12826 vm_map_simplify_range(
12827 src_map,
12828 vm_map_trunc_page(src_addr_unaligned,
12829 VM_MAP_PAGE_MASK(src_map)),
12830 vm_map_round_page(src_end,
12831 VM_MAP_PAGE_MASK(src_map)));
12832 vm_map_unlock(src_map);
12833 }
12834
12835 tmp_entry = VM_MAP_ENTRY_NULL;
12836
12837 if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
12838 VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12839 vm_map_offset_t original_start, original_offset, original_end;
12840
12841 assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12842
12843 /* adjust alignment of first copy_entry's "vme_start" */
12844 tmp_entry = vm_map_copy_first_entry(copy);
12845 if (tmp_entry != vm_map_copy_to_entry(copy)) {
12846 vm_map_offset_t adjustment;
12847
12848 original_start = tmp_entry->vme_start;
12849 original_offset = VME_OFFSET(tmp_entry);
12850
12851 /* map-align the start of the first copy entry... */
12852 adjustment = (tmp_entry->vme_start -
12853 vm_map_trunc_page(
12854 tmp_entry->vme_start,
12855 VM_MAP_PAGE_MASK(src_map)));
12856 tmp_entry->vme_start -= adjustment;
12857 VME_OFFSET_SET(tmp_entry,
12858 VME_OFFSET(tmp_entry) - adjustment);
12859 copy_addr -= adjustment;
12860 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12861 /* ... adjust for mis-aligned start of copy range */
12862 adjustment =
12863 (vm_map_trunc_page(copy->offset,
12864 PAGE_MASK) -
12865 vm_map_trunc_page(copy->offset,
12866 VM_MAP_PAGE_MASK(src_map)));
12867 if (adjustment) {
12868 assert(page_aligned(adjustment));
12869 assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12870 tmp_entry->vme_start += adjustment;
12871 VME_OFFSET_SET(tmp_entry,
12872 (VME_OFFSET(tmp_entry) +
12873 adjustment));
12874 copy_addr += adjustment;
12875 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12876 }
12877
12878 /*
12879 * Assert that the adjustments haven't exposed
12880 * more than was originally copied...
12881 */
12882 assert(tmp_entry->vme_start >= original_start);
12883 assert(VME_OFFSET(tmp_entry) >= original_offset);
12884 /*
12885 			 * ... and that it did not adjust outside of
12886 * a single 16K page.
12887 */
12888 assert(vm_map_trunc_page(tmp_entry->vme_start,
12889 VM_MAP_PAGE_MASK(src_map)) ==
12890 vm_map_trunc_page(original_start,
12891 VM_MAP_PAGE_MASK(src_map)));
12892 }
12893
12894 /* adjust alignment of last copy_entry's "vme_end" */
12895 tmp_entry = vm_map_copy_last_entry(copy);
12896 if (tmp_entry != vm_map_copy_to_entry(copy)) {
12897 vm_map_offset_t adjustment;
12898
12899 original_end = tmp_entry->vme_end;
12900
12901 /* map-align the end of the last copy entry... */
12902 tmp_entry->vme_end =
12903 vm_map_round_page(tmp_entry->vme_end,
12904 VM_MAP_PAGE_MASK(src_map));
12905 /* ... adjust for mis-aligned end of copy range */
12906 adjustment =
12907 (vm_map_round_page((copy->offset +
12908 copy->size),
12909 VM_MAP_PAGE_MASK(src_map)) -
12910 vm_map_round_page((copy->offset +
12911 copy->size),
12912 PAGE_MASK));
12913 if (adjustment) {
12914 assert(page_aligned(adjustment));
12915 assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12916 tmp_entry->vme_end -= adjustment;
12917 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12918 }
12919
12920 /*
12921 * Assert that the adjustments haven't exposed
12922 * more than was originally copied...
12923 */
12924 assert(tmp_entry->vme_end <= original_end);
12925 /*
12926 			 * ... and that it did not adjust outside of
12927 * a single 16K page.
12928 */
12929 assert(vm_map_round_page(tmp_entry->vme_end,
12930 VM_MAP_PAGE_MASK(src_map)) ==
12931 vm_map_round_page(original_end,
12932 VM_MAP_PAGE_MASK(src_map)));
12933 }
12934 }
12935
12936 /* Fix-up start and end points in copy. This is necessary */
12937 /* when the various entries in the copy object were picked */
12938 /* up from different sub-maps */
12939
12940 tmp_entry = vm_map_copy_first_entry(copy);
12941 copy_size = 0; /* compute actual size */
12942 while (tmp_entry != vm_map_copy_to_entry(copy)) {
12943 assert(VM_MAP_PAGE_ALIGNED(
12944 copy_addr + (tmp_entry->vme_end -
12945 tmp_entry->vme_start),
12946 MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12947 assert(VM_MAP_PAGE_ALIGNED(
12948 copy_addr,
12949 MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12950
12951 /*
12952 * The copy_entries will be injected directly into the
12953 * destination map and might not be "map aligned" there...
12954 */
12955 tmp_entry->map_aligned = FALSE;
12956
12957 tmp_entry->vme_end = copy_addr +
12958 (tmp_entry->vme_end - tmp_entry->vme_start);
12959 tmp_entry->vme_start = copy_addr;
12960 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12961 copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12962 copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12963 tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12964 }
12965
12966 if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12967 copy_size < copy->size) {
12968 /*
12969 * The actual size of the VM map copy is smaller than what
12970 * was requested by the caller. This must be because some
12971 * PAGE_SIZE-sized pages are missing at the end of the last
12972 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12973 * The caller might not have been aware of those missing
12974 * pages and might not want to be aware of it, which is
12975 * fine as long as they don't try to access (and crash on)
12976 * those missing pages.
12977 * Let's adjust the size of the "copy", to avoid failing
12978 * in vm_map_copyout() or vm_map_copy_overwrite().
12979 */
12980 assert(vm_map_round_page(copy_size,
12981 VM_MAP_PAGE_MASK(src_map)) ==
12982 vm_map_round_page(copy->size,
12983 VM_MAP_PAGE_MASK(src_map)));
12984 copy->size = copy_size;
12985 }
12986
12987 *copy_result = copy;
12988 return KERN_SUCCESS;
12989
12990 #undef RETURN
12991 }
12992
12993 kern_return_t
12994 vm_map_copy_extract(
12995 vm_map_t src_map,
12996 vm_map_address_t src_addr,
12997 vm_map_size_t len,
12998 boolean_t do_copy,
12999 vm_map_copy_t *copy_result, /* OUT */
13000 vm_prot_t *cur_prot, /* IN/OUT */
13001 vm_prot_t *max_prot, /* IN/OUT */
13002 vm_inherit_t inheritance,
13003 vm_map_kernel_flags_t vmk_flags)
13004 {
13005 vm_map_copy_t copy;
13006 kern_return_t kr;
13007 vm_prot_t required_cur_prot, required_max_prot;
13008
13009 /*
13010 * Check for copies of zero bytes.
13011 */
13012
13013 if (len == 0) {
13014 *copy_result = VM_MAP_COPY_NULL;
13015 return KERN_SUCCESS;
13016 }
13017
13018 /*
13019 * Check that the end address doesn't overflow
13020 */
13021 if (src_addr + len < src_addr) {
13022 return KERN_INVALID_ADDRESS;
13023 }
13024 if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
13025 return KERN_INVALID_ADDRESS;
13026 }
13027
13028 if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
13029 DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
13030 }
13031
13032 required_cur_prot = *cur_prot;
13033 required_max_prot = *max_prot;
13034
13035 /*
13036 * Allocate a header element for the list.
13037 *
13038 * Use the start and end in the header to
13039 * remember the endpoints prior to rounding.
13040 */
13041
13042 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
13043 copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
13044 copy->offset = 0;
13045 copy->size = len;
13046
13047 kr = vm_map_remap_extract(src_map,
13048 src_addr,
13049 len,
13050 do_copy, /* copy */
13051 copy,
13052 cur_prot, /* IN/OUT */
13053 max_prot, /* IN/OUT */
13054 inheritance,
13055 vmk_flags);
13056 if (kr != KERN_SUCCESS) {
13057 vm_map_copy_discard(copy);
13058 if ((kr == KERN_INVALID_ADDRESS ||
13059 kr == KERN_INVALID_ARGUMENT) &&
13060 src_map->terminated) {
13061 /* tell the caller that this address space is gone */
13062 kr = KERN_TERMINATED;
13063 }
13064 return kr;
13065 }
13066 if (required_cur_prot != VM_PROT_NONE) {
13067 assert((*cur_prot & required_cur_prot) == required_cur_prot);
13068 assert((*max_prot & required_max_prot) == required_max_prot);
13069 }
13070
13071 *copy_result = copy;
13072 return KERN_SUCCESS;
13073 }
13074
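/*
 * A minimal, non-compiled sketch of how a kernel-internal caller might use
 * vm_map_copy_extract() above.  The "cur_prot"/"max_prot" arguments are
 * IN/OUT: on entry they name the protections the caller requires, on success
 * they report the protections actually granted.  The helper name and the
 * specific flag choices are hypothetical.
 */
#if 0 /* illustrative sketch only, not compiled */
static kern_return_t
example_extract_readable_copy(
	vm_map_t         src_map,
	vm_map_address_t src_addr,
	vm_map_size_t    len,
	vm_map_copy_t   *copy_result)
{
	vm_prot_t cur_prot = VM_PROT_READ;   /* required current protection */
	vm_prot_t max_prot = VM_PROT_READ;   /* required maximum protection */
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;

	vmk_flags.vmkf_copy_pageable = TRUE; /* copy entries may be pageable */

	return vm_map_copy_extract(src_map, src_addr, len,
	    TRUE /* do_copy */, copy_result,
	    &cur_prot, &max_prot,            /* IN/OUT */
	    VM_INHERIT_DEFAULT, vmk_flags);
}
#endif
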
13075 static void
13076 vm_map_fork_share(
13077 vm_map_t old_map,
13078 vm_map_entry_t old_entry,
13079 vm_map_t new_map)
13080 {
13081 vm_object_t object;
13082 vm_map_entry_t new_entry;
13083
13084 /*
13085 * New sharing code. New map entry
13086 * references original object. Internal
13087 * objects use asynchronous copy algorithm for
13088 * future copies. First make sure we have
13089 * the right object. If we need a shadow,
13090 * or someone else already has one, then
13091 * make a new shadow and share it.
13092 */
13093
13094 if (!old_entry->is_sub_map) {
13095 object = VME_OBJECT(old_entry);
13096 }
13097
13098 if (old_entry->is_sub_map) {
13099 assert(old_entry->wired_count == 0);
13100 #ifndef NO_NESTED_PMAP
13101 #if !PMAP_FORK_NEST
13102 if (old_entry->use_pmap) {
13103 kern_return_t result;
13104
13105 result = pmap_nest(new_map->pmap,
13106 (VME_SUBMAP(old_entry))->pmap,
13107 (addr64_t)old_entry->vme_start,
13108 (uint64_t)(old_entry->vme_end - old_entry->vme_start));
13109 if (result) {
13110 panic("vm_map_fork_share: pmap_nest failed!");
13111 }
13112 }
13113 #endif /* !PMAP_FORK_NEST */
13114 #endif /* NO_NESTED_PMAP */
13115 } else if (object == VM_OBJECT_NULL) {
13116 object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
13117 old_entry->vme_start), old_map->serial_id);
13118 VME_OFFSET_SET(old_entry, 0);
13119 VME_OBJECT_SET(old_entry, object, false, 0);
13120 old_entry->use_pmap = TRUE;
13121 // assert(!old_entry->needs_copy);
13122 } else if (object->copy_strategy !=
13123 MEMORY_OBJECT_COPY_SYMMETRIC) {
13124 /*
13125 * We are already using an asymmetric
13126 * copy, and therefore we already have
13127 * the right object.
13128 */
13129
13130 assert(!old_entry->needs_copy);
13131 } else if (old_entry->needs_copy || /* case 1 */
13132 object->shadowed || /* case 2 */
13133 (!object->true_share && /* case 3 */
13134 !old_entry->is_shared &&
13135 (object->vo_size >
13136 (vm_map_size_t)(old_entry->vme_end -
13137 old_entry->vme_start)))) {
13138 bool is_writable;
13139
13140 /*
13141 * We need to create a shadow.
13142 * There are three cases here.
13143 * In the first case, we need to
13144 * complete a deferred symmetrical
13145 * copy that we participated in.
13146 * In the second and third cases,
13147 * we need to create the shadow so
13148 * that changes that we make to the
13149 * object do not interfere with
13150 * any symmetrical copies which
13151 * have occurred (case 2) or which
13152 * might occur (case 3).
13153 *
13154 * The first case is when we had
13155 * deferred shadow object creation
13156 * via the entry->needs_copy mechanism.
13157 * This mechanism only works when
13158 * only one entry points to the source
13159 * object, and we are about to create
13160 * a second entry pointing to the
13161 * same object. The problem is that
13162 * there is no way of mapping from
13163 * an object to the entries pointing
13164 * to it. (Deferred shadow creation
13165 * works with one entry because it occurs
13166 * at fault time, and we walk from the
13167 * entry to the object when handling
13168 * the fault.)
13169 *
13170 * The second case is when the object
13171 * to be shared has already been copied
13172 * with a symmetric copy, but we point
13173 * directly to the object without
13174 * needs_copy set in our entry. (This
13175 * can happen because different ranges
13176 * of an object can be pointed to by
13177 * different entries. In particular,
13178 * a single entry pointing to an object
13179 * can be split by a call to vm_inherit,
13180 * which, combined with task_create, can
13181 * result in the different entries
13182 * having different needs_copy values.)
13183 * The shadowed flag in the object allows
13184 * us to detect this case. The problem
13185 * with this case is that if this object
13186 * has or will have shadows, then we
13187 * must not perform an asymmetric copy
13188 * of this object, since such a copy
13189 * allows the object to be changed, which
13190 * will break the previous symmetrical
13191 * copies (which rely upon the object
13192 * not changing). In a sense, the shadowed
13193 * flag says "don't change this object".
13194 * We fix this by creating a shadow
13195 * object for this object, and sharing
13196 * that. This works because we are free
13197 * to change the shadow object (and thus
13198 * to use an asymmetric copy strategy);
13199 * this is also semantically correct,
13200 * since this object is temporary, and
13201 * therefore a copy of the object is
13202 * as good as the object itself. (This
13203 * is not true for permanent objects,
13204 * since the pager needs to see changes,
13205 * which won't happen if the changes
13206 * are made to a copy.)
13207 *
13208 * The third case is when the object
13209 * to be shared has parts sticking
13210 * outside of the entry we're working
13211 * with, and thus may in the future
13212 * be subject to a symmetrical copy.
13213 * (This is a preemptive version of
13214 * case 2.)
13215 */
13216 VME_OBJECT_SHADOW(old_entry,
13217 (vm_map_size_t) (old_entry->vme_end -
13218 old_entry->vme_start),
13219 vm_map_always_shadow(old_map));
13220
13221 /*
13222 * If we're making a shadow for reasons other
13223 * than copy-on-write, then we have
13224 * to remove write permission.
13225 */
13226
13227 is_writable = false;
13228 if (old_entry->protection & VM_PROT_WRITE) {
13229 is_writable = true;
13230 #if __arm64e__
13231 } else if (old_entry->used_for_tpro) {
13232 is_writable = true;
13233 #endif /* __arm64e__ */
13234 }
13235 if (!old_entry->needs_copy && is_writable) {
13236 vm_prot_t prot;
13237
13238 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13239 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13240 __FUNCTION__, old_map, old_map->pmap,
13241 old_entry,
13242 (uint64_t)old_entry->vme_start,
13243 (uint64_t)old_entry->vme_end,
13244 old_entry->protection);
13245 }
13246
13247 prot = old_entry->protection & ~VM_PROT_WRITE;
13248
13249 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13250 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13251 __FUNCTION__, old_map, old_map->pmap,
13252 old_entry,
13253 (uint64_t)old_entry->vme_start,
13254 (uint64_t)old_entry->vme_end,
13255 prot);
13256 }
13257
13258 if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
13259 prot |= VM_PROT_EXECUTE;
13260 }
13261
13262
13263 if (old_map->mapped_in_other_pmaps) {
13264 vm_object_pmap_protect(
13265 VME_OBJECT(old_entry),
13266 VME_OFFSET(old_entry),
13267 (old_entry->vme_end -
13268 old_entry->vme_start),
13269 PMAP_NULL,
13270 PAGE_SIZE,
13271 old_entry->vme_start,
13272 prot);
13273 } else {
13274 pmap_protect(old_map->pmap,
13275 old_entry->vme_start,
13276 old_entry->vme_end,
13277 prot);
13278 }
13279 }
13280
13281 old_entry->needs_copy = FALSE;
13282 object = VME_OBJECT(old_entry);
13283 }
13284
13285
13286 /*
13287 * If object was using a symmetric copy strategy,
13288 * change its copy strategy to the default
13289 * asymmetric copy strategy, which is copy_delay
13290 * in the non-norma case and copy_call in the
13291 * norma case. Bump the reference count for the
13292 * new entry.
13293 */
13294
13295 if (old_entry->is_sub_map) {
13296 vm_map_reference(VME_SUBMAP(old_entry));
13297 } else {
13298 vm_object_lock(object);
13299 vm_object_reference_locked(object);
13300 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
13301 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
13302 }
13303 vm_object_unlock(object);
13304 }
13305
13306 /*
13307 * Clone the entry, using object ref from above.
13308 * Mark both entries as shared.
13309 */
13310
13311 new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
13312 vm_map_entry_copy(old_map, new_entry, old_entry);
13313 old_entry->is_shared = TRUE;
13314 new_entry->is_shared = TRUE;
13315
13316 /*
13317 * We're dealing with a shared mapping, so the resulting mapping
13318 * should inherit some of the original mapping's accounting settings.
13319 * "iokit_acct" should have been cleared in vm_map_entry_copy().
13320 * "use_pmap" should stay the same as before (if it hasn't been reset
13321 * to TRUE when we cleared "iokit_acct").
13322 */
13323 assert(!new_entry->iokit_acct);
13324
13325 /*
13326 * If the old entry's inheritance is VM_INHERIT_NONE,
13327 * the new entry is for a corpse fork, so remove the
13328 * write permission from the new entry.
13329 */
13330 if (old_entry->inheritance == VM_INHERIT_NONE) {
13331 new_entry->protection &= ~VM_PROT_WRITE;
13332 new_entry->max_protection &= ~VM_PROT_WRITE;
13333 }
13334
13335 /*
13336 * Insert the entry into the new map -- we
13337 * know we're inserting at the end of the new
13338 * map.
13339 */
13340
13341 vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
13342 VM_MAP_KERNEL_FLAGS_NONE);
13343
13344 /*
13345 * Update the physical map
13346 */
13347
13348 if (old_entry->is_sub_map) {
13349 /* Bill Angell pmap support goes here */
13350 } else {
13351 pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
13352 old_entry->vme_end - old_entry->vme_start,
13353 old_entry->vme_start);
13354 }
13355 }
13356
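/*
 * The shadow-creation test in vm_map_fork_share() above is easier to read
 * when restated on its own.  The non-compiled helper below is a sketch of
 * the same three cases (deferred copy, already-shadowed object, object
 * larger than the mapping); the helper name is hypothetical and the
 * symmetric-copy-strategy precondition is assumed.
 */
#if 0 /* illustrative sketch only, not compiled */
static bool
example_fork_share_needs_shadow(vm_map_entry_t entry, vm_object_t object)
{
	return entry->needs_copy ||          /* case 1: deferred symmetric copy */
	       object->shadowed ||           /* case 2: already copied symmetrically */
	       (!object->true_share &&       /* case 3: object extends beyond the entry */
	       !entry->is_shared &&
	       object->vo_size > (vm_map_size_t)(entry->vme_end - entry->vme_start));
}
#endif
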
13357 static boolean_t
13358 vm_map_fork_copy(
13359 vm_map_t old_map,
13360 vm_map_entry_t *old_entry_p,
13361 vm_map_t new_map,
13362 int vm_map_copyin_flags)
13363 {
13364 vm_map_entry_t old_entry = *old_entry_p;
13365 vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
13366 vm_map_offset_t start = old_entry->vme_start;
13367 vm_map_copy_t copy;
13368 vm_map_entry_t last = vm_map_last_entry(new_map);
13369
13370 vm_map_unlock(old_map);
13371 /*
13372 * Use maxprot version of copyin because we
13373 * care about whether this memory can ever
13374 * be accessed, not just whether it's accessible
13375 * right now.
13376 */
13377 vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
13378 if (vm_map_copyin_internal(old_map, start, entry_size,
13379 vm_map_copyin_flags, &copy)
13380 != KERN_SUCCESS) {
13381 /*
13382 * The map might have changed while it
13383 * was unlocked, check it again. Skip
13384 * any blank space or permanently
13385 * unreadable region.
13386 */
13387 vm_map_lock(old_map);
13388 if (!vm_map_lookup_entry(old_map, start, &last) ||
13389 (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
13390 last = last->vme_next;
13391 }
13392 *old_entry_p = last;
13393
13394 /*
13395 * XXX For some error returns, want to
13396 * XXX skip to the next element. Note
13397 * that INVALID_ADDRESS and
13398 * PROTECTION_FAILURE are handled above.
13399 */
13400
13401 return FALSE;
13402 }
13403
13404 /*
13405 * Assert that the vm_map_copy is coming from the right
13406 * zone and hasn't been forged
13407 */
13408 vm_map_copy_require(copy);
13409
13410 /*
13411 * Insert the copy into the new map
13412 */
13413 vm_map_copy_insert(new_map, last, copy);
13414
13415 /*
13416 * Pick up the traversal at the end of
13417 * the copied region.
13418 */
13419
13420 vm_map_lock(old_map);
13421 start += entry_size;
13422 if (!vm_map_lookup_entry(old_map, start, &last)) {
13423 last = last->vme_next;
13424 } else {
13425 if (last->vme_start == start) {
13426 /*
13427 * No need to clip here and we don't
13428 * want to cause any unnecessary
13429 * unnesting...
13430 */
13431 } else {
13432 vm_map_clip_start(old_map, last, start);
13433 }
13434 }
13435 *old_entry_p = last;
13436
13437 return TRUE;
13438 }
13439
13440 #if PMAP_FORK_NEST
13441 #define PMAP_FORK_NEST_DEBUG 0
13442 static inline void
13443 vm_map_fork_unnest(
13444 pmap_t new_pmap,
13445 vm_map_offset_t pre_nested_start,
13446 vm_map_offset_t pre_nested_end,
13447 vm_map_offset_t start,
13448 vm_map_offset_t end)
13449 {
13450 kern_return_t kr;
13451 vm_map_offset_t nesting_mask, start_unnest, end_unnest;
13452
13453 assertf(pre_nested_start <= pre_nested_end,
13454 "pre_nested start 0x%llx end 0x%llx",
13455 (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13456 assertf(start <= end,
13457 "start 0x%llx end 0x%llx",
13458 (uint64_t) start, (uint64_t)end);
13459
13460 if (pre_nested_start == pre_nested_end) {
13461 /* nothing was pre-nested: done */
13462 return;
13463 }
13464 if (end <= pre_nested_start) {
13465 /* fully before pre-nested range: done */
13466 return;
13467 }
13468 if (start >= pre_nested_end) {
13469 /* fully after pre-nested range: done */
13470 return;
13471 }
13472 /* ignore parts of range outside of pre_nested range */
13473 if (start < pre_nested_start) {
13474 start = pre_nested_start;
13475 }
13476 if (end > pre_nested_end) {
13477 end = pre_nested_end;
13478 }
13479 nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
13480 start_unnest = start & ~nesting_mask;
13481 end_unnest = (end + nesting_mask) & ~nesting_mask;
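	/*
	 * Worked example (illustrative only, assuming a 32MB shared-region
	 * nesting granularity): for start 0x180100000 and end 0x181f00000,
	 * nesting_mask is 0x1ffffff, so start_unnest rounds down to
	 * 0x180000000 and end_unnest rounds up to 0x182000000 before the
	 * pmap_unnest() call below.
	 */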
13482 kr = pmap_unnest(new_pmap,
13483 (addr64_t)start_unnest,
13484 (uint64_t)(end_unnest - start_unnest));
13485 #if PMAP_FORK_NEST_DEBUG
13486 printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
13487 #endif /* PMAP_FORK_NEST_DEBUG */
13488 assertf(kr == KERN_SUCCESS,
13489 "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
13490 (uint64_t)start, (uint64_t)end, new_pmap,
13491 (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
13492 kr);
13493 }
13494 #endif /* PMAP_FORK_NEST */
13495
13496 void
13497 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13498 {
13499 new_map->size_limit = old_map->size_limit;
13500 new_map->data_limit = old_map->data_limit;
13501 new_map->user_wire_limit = old_map->user_wire_limit;
13502 new_map->reserved_regions = old_map->reserved_regions;
13503 }
13504
13505 /*
13506 * vm_map_fork:
13507 *
13508 * Create and return a new map based on the old
13509 * map, according to the inheritance values on the
13510 * regions in that map and the options.
13511 *
13512 * The source map must not be locked.
13513 */
13514 vm_map_t
13515 vm_map_fork(
13516 ledger_t ledger,
13517 vm_map_t old_map,
13518 int options)
13519 {
13520 pmap_t new_pmap;
13521 vm_map_t new_map;
13522 vm_map_entry_t old_entry;
13523 vm_map_size_t new_size = 0, entry_size;
13524 vm_map_entry_t new_entry;
13525 boolean_t src_needs_copy;
13526 boolean_t new_entry_needs_copy;
13527 boolean_t pmap_is64bit;
13528 int vm_map_copyin_flags;
13529 vm_inherit_t old_entry_inheritance;
13530 int map_create_options;
13531 kern_return_t footprint_collect_kr;
13532
13533 if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
13534 VM_MAP_FORK_PRESERVE_PURGEABLE |
13535 VM_MAP_FORK_CORPSE_FOOTPRINT |
13536 VM_MAP_FORK_SHARE_IF_OWNED)) {
13537 /* unsupported option */
13538 return VM_MAP_NULL;
13539 }
13540
13541 pmap_is64bit =
13542 #if defined(__i386__) || defined(__x86_64__)
13543 old_map->pmap->pm_task_map != TASK_MAP_32BIT;
13544 #elif defined(__arm64__)
13545 old_map->pmap->is_64bit;
13546 #else
13547 #error Unknown architecture.
13548 #endif
13549
13550 unsigned int pmap_flags = 0;
13551 pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
13552 #if defined(HAS_APPLE_PAC)
13553 pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
13554 #endif
13555 #if CONFIG_ROSETTA
13556 pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
13557 #endif
13558 #if PMAP_CREATE_FORCE_4K_PAGES
13559 if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
13560 PAGE_SIZE != FOURK_PAGE_SIZE) {
13561 pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
13562 }
13563 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
13564 new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
13565 if (new_pmap == NULL) {
13566 return VM_MAP_NULL;
13567 }
13568
13569 vm_map_reference(old_map);
13570 vm_map_lock(old_map);
13571
13572 /* Note that we're creating a map out of fork() */
13573 map_create_options = VM_MAP_CREATE_VIA_FORK;
13574 if (old_map->hdr.entries_pageable) {
13575 map_create_options |= VM_MAP_CREATE_PAGEABLE;
13576 }
13577 if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13578 map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
13579 footprint_collect_kr = KERN_SUCCESS;
13580 }
13581 new_map = vm_map_create_options(new_pmap,
13582 old_map->min_offset,
13583 old_map->max_offset,
13584 map_create_options);
13585
13586 /* Inherit our parent's ID. */
13587 vm_map_assign_serial(new_map, old_map->serial_id);
13588
13589 /* inherit cs_enforcement */
13590 vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
13591
13592 vm_map_lock(new_map);
13593 vm_commit_pagezero_status(new_map);
13594 /* inherit the parent map's page size */
13595 vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));
13596
13597 /* inherit the parent rlimits */
13598 vm_map_inherit_limits(new_map, old_map);
13599
13600 #if CONFIG_MAP_RANGES
13601 /* inherit the parent map's VM ranges */
13602 vm_map_range_fork(new_map, old_map);
13603 #endif
13604
13605 #if CODE_SIGNING_MONITOR
13606 /* Prepare the monitor for the fork */
13607 csm_fork_prepare(old_map->pmap, new_pmap);
13608 #endif
13609
13610 #if PMAP_FORK_NEST
13611 /*
13612 * Pre-nest the shared region's pmap.
13613 */
13614 vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
13615 pmap_fork_nest(old_map->pmap, new_pmap,
13616 &pre_nested_start, &pre_nested_end);
13617 #if PMAP_FORK_NEST_DEBUG
13618 printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13619 #endif /* PMAP_FORK_NEST_DEBUG */
13620 #endif /* PMAP_FORK_NEST */
13621
13622 for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
13623 /*
13624 * Abort any corpse collection if the system is shutting down.
13625 */
13626 if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13627 get_system_inshutdown()) {
13628 #if PMAP_FORK_NEST
13629 new_entry = vm_map_last_entry(new_map);
13630 if (new_entry == vm_map_to_entry(new_map)) {
13631 /* unnest all that was pre-nested */
13632 vm_map_fork_unnest(new_pmap,
13633 pre_nested_start, pre_nested_end,
13634 vm_map_min(new_map), vm_map_max(new_map));
13635 } else if (new_entry->vme_end < vm_map_max(new_map)) {
13636 /* unnest hole at the end, if pre-nested */
13637 vm_map_fork_unnest(new_pmap,
13638 pre_nested_start, pre_nested_end,
13639 new_entry->vme_end, vm_map_max(new_map));
13640 }
13641 #endif /* PMAP_FORK_NEST */
13642 vm_map_corpse_footprint_collect_done(new_map);
13643 vm_map_unlock(new_map);
13644 vm_map_unlock(old_map);
13645 vm_map_deallocate(new_map);
13646 vm_map_deallocate(old_map);
13647 printf("Aborting corpse map due to system shutdown\n");
13648 return VM_MAP_NULL;
13649 }
13650
13651 entry_size = old_entry->vme_end - old_entry->vme_start;
13652
13653 #if PMAP_FORK_NEST
13654 /*
13655 * Undo any unnecessary pre-nesting.
13656 */
13657 vm_map_offset_t prev_end;
13658 if (old_entry == vm_map_first_entry(old_map)) {
13659 prev_end = vm_map_min(old_map);
13660 } else {
13661 prev_end = old_entry->vme_prev->vme_end;
13662 }
13663 if (prev_end < old_entry->vme_start) {
13664 /* unnest hole before this entry, if pre-nested */
13665 vm_map_fork_unnest(new_pmap,
13666 pre_nested_start, pre_nested_end,
13667 prev_end, old_entry->vme_start);
13668 }
13669 if (old_entry->is_sub_map && old_entry->use_pmap) {
13670 /* keep this entry nested in the child */
13671 #if PMAP_FORK_NEST_DEBUG
13672 printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
13673 #endif /* PMAP_FORK_NEST_DEBUG */
13674 } else {
13675 /* undo nesting for this entry, if pre-nested */
13676 vm_map_fork_unnest(new_pmap,
13677 pre_nested_start, pre_nested_end,
13678 old_entry->vme_start, old_entry->vme_end);
13679 }
13680 #endif /* PMAP_FORK_NEST */
13681
13682 old_entry_inheritance = old_entry->inheritance;
13683
13684 /*
13685 * If the caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option,
13686 * share VM_INHERIT_NONE entries that are not backed by a
13687 * device pager.
13688 */
13689 if (old_entry_inheritance == VM_INHERIT_NONE &&
13690 (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
13691 (old_entry->protection & VM_PROT_READ) &&
13692 !(!old_entry->is_sub_map &&
13693 VME_OBJECT(old_entry) != NULL &&
13694 VME_OBJECT(old_entry)->pager != NULL &&
13695 is_device_pager_ops(
13696 VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
13697 old_entry_inheritance = VM_INHERIT_SHARE;
13698 }
13699 if (old_entry_inheritance == VM_INHERIT_COPY &&
13700 (options & VM_MAP_FORK_SHARE_IF_OWNED) &&
13701 !old_entry->is_sub_map &&
13702 VME_OBJECT(old_entry) != VM_OBJECT_NULL) {
13703 vm_object_t object;
13704 task_t owner;
13705 object = VME_OBJECT(old_entry);
13706 owner = VM_OBJECT_OWNER(object);
13707 if (owner != TASK_NULL &&
13708 owner->map == old_map) {
13709 /*
13710 * This mapping points at a VM object owned
13711 * by the task being forked.
13712 * Some tools reporting memory accounting
13713 * info rely on the object ID, so share this
13714 * mapping instead of copying, to make the
13715 * corpse look exactly like the original
13716 * task in that respect.
13717 */
13718 assert(object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC);
13719 old_entry_inheritance = VM_INHERIT_SHARE;
13720 }
13721 }
13722
13723 if (old_entry_inheritance != VM_INHERIT_NONE &&
13724 (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13725 footprint_collect_kr == KERN_SUCCESS) {
13726 /*
13727 * The corpse won't have old_map->pmap to query
13728 * footprint information, so collect that data now
13729 * and store it in new_map->vmmap_corpse_footprint
13730 * for later autopsy.
13731 */
13732 footprint_collect_kr =
13733 vm_map_corpse_footprint_collect(old_map,
13734 old_entry,
13735 new_map);
13736 }
13737
13738 switch (old_entry_inheritance) {
13739 case VM_INHERIT_NONE:
13740 break;
13741
13742 case VM_INHERIT_SHARE:
13743 vm_map_fork_share(old_map, old_entry, new_map);
13744 new_size += entry_size;
13745 break;
13746
13747 case VM_INHERIT_COPY:
13748
13749 /*
13750 * Inline the copy_quickly case;
13751 * upon failure, fall back on call
13752 * to vm_map_fork_copy.
13753 */
13754
13755 if (old_entry->is_sub_map) {
13756 break;
13757 }
13758 if ((old_entry->wired_count != 0) ||
13759 ((VME_OBJECT(old_entry) != NULL) &&
13760 (VME_OBJECT(old_entry)->true_share))) {
13761 goto slow_vm_map_fork_copy;
13762 }
13763
13764 new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
13765 vm_map_entry_copy(old_map, new_entry, old_entry);
13766 if (old_entry->vme_permanent) {
13767 /* inherit "permanent" on fork() */
13768 new_entry->vme_permanent = TRUE;
13769 }
13770
13771 if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
13772 new_map->jit_entry_exists = TRUE;
13773 }
13774
13775 if (new_entry->is_sub_map) {
13776 /* clear address space specifics */
13777 new_entry->use_pmap = FALSE;
13778 } else {
13779 /*
13780 * We're dealing with a copy-on-write operation,
13781 * so the resulting mapping should not inherit
13782 * the original mapping's accounting settings.
13783 * "iokit_acct" should have been cleared in
13784 * vm_map_entry_copy().
13785 * "use_pmap" should be reset to its default
13786 * (TRUE) so that the new mapping gets
13787 * accounted for in the task's memory footprint.
13788 */
13789 assert(!new_entry->iokit_acct);
13790 new_entry->use_pmap = TRUE;
13791 }
13792
13793 if (!vm_object_copy_quickly(
13794 VME_OBJECT(new_entry),
13795 VME_OFFSET(old_entry),
13796 (old_entry->vme_end -
13797 old_entry->vme_start),
13798 &src_needs_copy,
13799 &new_entry_needs_copy)) {
13800 vm_map_entry_dispose(new_entry);
13801 goto slow_vm_map_fork_copy;
13802 }
13803
13804 /*
13805 * Handle copy-on-write obligations
13806 */
13807
13808 if (src_needs_copy && !old_entry->needs_copy) {
13809 vm_prot_t prot;
13810
13811 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13812 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13813 __FUNCTION__,
13814 old_map, old_map->pmap, old_entry,
13815 (uint64_t)old_entry->vme_start,
13816 (uint64_t)old_entry->vme_end,
13817 old_entry->protection);
13818 }
13819
13820 prot = old_entry->protection & ~VM_PROT_WRITE;
13821
13822 if (override_nx(old_map, VME_ALIAS(old_entry))
13823 && prot) {
13824 prot |= VM_PROT_EXECUTE;
13825 }
13826
13827 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13828 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13829 __FUNCTION__,
13830 old_map, old_map->pmap, old_entry,
13831 (uint64_t)old_entry->vme_start,
13832 (uint64_t)old_entry->vme_end,
13833 prot);
13834 }
13835
13836 vm_object_pmap_protect(
13837 VME_OBJECT(old_entry),
13838 VME_OFFSET(old_entry),
13839 (old_entry->vme_end -
13840 old_entry->vme_start),
13841 ((old_entry->is_shared
13842 || old_map->mapped_in_other_pmaps)
13843 ? PMAP_NULL :
13844 old_map->pmap),
13845 VM_MAP_PAGE_SIZE(old_map),
13846 old_entry->vme_start,
13847 prot);
13848
13849 assert(old_entry->wired_count == 0);
13850 old_entry->needs_copy = TRUE;
13851 }
13852 new_entry->needs_copy = new_entry_needs_copy;
13853
13854 /*
13855 * Insert the entry at the end
13856 * of the map.
13857 */
13858
13859 vm_map_store_entry_link(new_map,
13860 vm_map_last_entry(new_map),
13861 new_entry,
13862 VM_MAP_KERNEL_FLAGS_NONE);
13863 new_size += entry_size;
13864 break;
13865
13866 slow_vm_map_fork_copy:
13867 vm_map_copyin_flags = VM_MAP_COPYIN_FORK;
13868 if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
13869 vm_map_copyin_flags |=
13870 VM_MAP_COPYIN_PRESERVE_PURGEABLE;
13871 }
13872 if (vm_map_fork_copy(old_map,
13873 &old_entry,
13874 new_map,
13875 vm_map_copyin_flags)) {
13876 new_size += entry_size;
13877 }
13878 continue;
13879 }
13880 old_entry = old_entry->vme_next;
13881 }
13882
13883 #if PMAP_FORK_NEST
13884 new_entry = vm_map_last_entry(new_map);
13885 if (new_entry == vm_map_to_entry(new_map)) {
13886 /* unnest all that was pre-nested */
13887 vm_map_fork_unnest(new_pmap,
13888 pre_nested_start, pre_nested_end,
13889 vm_map_min(new_map), vm_map_max(new_map));
13890 } else if (new_entry->vme_end < vm_map_max(new_map)) {
13891 /* unnest hole at the end, if pre-nested */
13892 vm_map_fork_unnest(new_pmap,
13893 pre_nested_start, pre_nested_end,
13894 new_entry->vme_end, vm_map_max(new_map));
13895 }
13896 #endif /* PMAP_FORK_NEST */
13897
13898 #if defined(__arm64__)
13899 pmap_insert_commpage(new_map->pmap);
13900 #endif /* __arm64__ */
13901
13902 new_map->size = new_size;
13903
13904 if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13905 vm_map_corpse_footprint_collect_done(new_map);
13906 }
13907
13908 /* Propagate JIT entitlement for the pmap layer. */
13909 if (pmap_get_jit_entitled(old_map->pmap)) {
13910 /* Tell the pmap that it supports JIT. */
13911 pmap_set_jit_entitled(new_map->pmap);
13912 }
13913
13914 /* Propagate TPRO settings for the pmap layer */
13915 if (pmap_get_tpro(old_map->pmap)) {
13916 /* Tell the pmap that it supports TPRO */
13917 pmap_set_tpro(new_map->pmap);
13918 }
13919
13920
13921 vm_map_unlock(new_map);
13922 vm_map_unlock(old_map);
13923 vm_map_deallocate(old_map);
13924
13925 return new_map;
13926 }
13927
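/*
 * A non-compiled sketch of how a corpse-style caller might combine the
 * vm_map_fork() options handled above.  The wrapper name is hypothetical;
 * the option constants are the ones vm_map_fork() accepts.
 */
#if 0 /* illustrative sketch only, not compiled */
static vm_map_t
example_fork_for_corpse(ledger_t ledger, vm_map_t parent_map)
{
	int options = VM_MAP_FORK_CORPSE_FOOTPRINT |  /* collect footprint data */
	    VM_MAP_FORK_SHARE_IF_INHERIT_NONE |       /* share rather than omit */
	    VM_MAP_FORK_SHARE_IF_OWNED |              /* preserve owned object IDs */
	    VM_MAP_FORK_PRESERVE_PURGEABLE;           /* keep purgeable state */

	return vm_map_fork(ledger, parent_map, options);
}
#endif
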
13928 /*
13929 * vm_map_exec:
13930 *
13931 * Set up the "new_map" with the proper execution environment according
13932 * to the type of executable (platform, 64bit, chroot environment).
13933 * Map the comm page and shared region, etc...
13934 */
13935 kern_return_t
13936 vm_map_exec(
13937 vm_map_t new_map,
13938 task_t task,
13939 boolean_t is64bit,
13940 void *fsroot,
13941 cpu_type_t cpu,
13942 cpu_subtype_t cpu_subtype,
13943 boolean_t reslide,
13944 boolean_t is_driverkit,
13945 uint32_t rsr_version)
13946 {
13947 SHARED_REGION_TRACE_DEBUG(
13948 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13949 (void *)VM_KERNEL_ADDRPERM(current_task()),
13950 (void *)VM_KERNEL_ADDRPERM(new_map),
13951 (void *)VM_KERNEL_ADDRPERM(task),
13952 (void *)VM_KERNEL_ADDRPERM(fsroot),
13953 cpu,
13954 cpu_subtype));
13955 (void) vm_commpage_enter(new_map, task, is64bit);
13956
13957 (void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13958
13959 SHARED_REGION_TRACE_DEBUG(
13960 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13961 (void *)VM_KERNEL_ADDRPERM(current_task()),
13962 (void *)VM_KERNEL_ADDRPERM(new_map),
13963 (void *)VM_KERNEL_ADDRPERM(task),
13964 (void *)VM_KERNEL_ADDRPERM(fsroot),
13965 cpu,
13966 cpu_subtype));
13967
13968 /*
13969 * Some devices have region(s) of memory that shouldn't get allocated by
13970 * user processes. The following code creates dummy vm_map_entry_t's for each
13971 * of the regions that need to be reserved, to prevent any allocations in
13972 * those regions.
13973 */
13974 kern_return_t kr = KERN_FAILURE;
13975 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
13976 vmk_flags.vmkf_beyond_max = true;
13977
13978 const struct vm_reserved_region *regions = NULL;
13979 size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
13980 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13981
13982 for (size_t i = 0; i < num_regions; ++i) {
13983 vm_map_offset_t address = regions[i].vmrr_addr;
13984
13985 kr = vm_map_enter(
13986 new_map,
13987 &address,
13988 regions[i].vmrr_size,
13989 (vm_map_offset_t)0,
13990 vmk_flags,
13991 VM_OBJECT_NULL,
13992 (vm_object_offset_t)0,
13993 FALSE,
13994 VM_PROT_NONE,
13995 VM_PROT_NONE,
13996 VM_INHERIT_COPY);
13997
13998 if (kr != KERN_SUCCESS) {
13999 os_log_error(OS_LOG_DEFAULT, "Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
14000 return KERN_FAILURE;
14001 }
14002 }
14003
14004 new_map->reserved_regions = (num_regions ? TRUE : FALSE);
14005
14006 return KERN_SUCCESS;
14007 }
14008
14009 uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
14010 uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
14011 uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
14012 uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
14013 uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
14014 uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
14015 uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
14016 uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
14017 uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
14018 uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
14019 uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
14020 uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
14021 uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
14022 /*
14023 * vm_map_lookup_and_lock_object:
14024 *
14025 * Finds the VM object, offset, and
14026 * protection for a given virtual address in the
14027 * specified map, assuming a page fault of the
14028 * type specified.
14029 *
14030 * Returns the (object, offset, protection) for
14031 * this address, whether it is wired down, and whether
14032 * this map has the only reference to the data in question.
14033 * In order to later verify this lookup, a "version"
14034 * is returned.
14035 * If contended != NULL, *contended will be set to
14036 * true iff the thread had to spin or block to acquire
14037 * an exclusive lock.
14038 *
14039 * The map MUST be locked by the caller and WILL be
14040 * locked on exit. In order to guarantee the
14041 * existence of the returned object, it is returned
14042 * locked.
14043 *
14044 * If a lookup is requested with "write protection"
14045 * specified, the map may be changed to perform virtual
14046 * copying operations, although the data referenced will
14047 * remain the same.
14048 *
14049 * If fault_info is provided, then the information is
14050 * initialized according to the properties of the map entry
14051 * NB: only properties of the entry are initialized,
14052 * namely:
14053 * - user_tag
14054 * - pmap_options
14055 * - iokit_acct
14056 * - behavior
14057 * - lo_offset
14058 * - hi_offset
14059 * - no_cache
14060 * - cs_bypass
14061 * - csm_associated
14062 * - resilient_media
14063 * - vme_xnu_user_debug
14064 * - vme_no_copy_on_read
14065 * - used_for_tpro
14066 */
14067 kern_return_t
14068 vm_map_lookup_and_lock_object(
14069 vm_map_t *var_map, /* IN/OUT */
14070 vm_map_offset_t vaddr,
14071 vm_prot_t fault_type,
14072 int object_lock_type,
14073 vm_map_version_t *out_version, /* OUT */
14074 vm_object_t *object, /* OUT */
14075 vm_object_offset_t *offset, /* OUT */
14076 vm_prot_t *out_prot, /* OUT */
14077 boolean_t *wired, /* OUT */
14078 vm_object_fault_info_t fault_info, /* OUT */
14079 vm_map_t *real_map, /* OUT */
14080 bool *contended) /* OUT */
14081 {
14082 vm_map_entry_t entry;
14083 vm_map_t map = *var_map;
14084 vm_map_t old_map = *var_map;
14085 vm_map_t cow_sub_map_parent = VM_MAP_NULL;
14086 vm_map_offset_t cow_parent_vaddr = 0;
14087 vm_map_offset_t old_start = 0;
14088 vm_map_offset_t old_end = 0;
14089 vm_prot_t prot;
14090 boolean_t mask_protections;
14091 boolean_t force_copy;
14092 boolean_t no_force_copy_if_executable;
14093 boolean_t submap_needed_copy;
14094 vm_prot_t original_fault_type;
14095 vm_map_size_t fault_page_mask;
14096
14097 /*
14098 * VM_PROT_MASK means that the caller wants us to use "fault_type"
14099 * as a mask against the mapping's actual protections, not as an
14100 * absolute value.
14101 */
14102 mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
14103 force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
14104 no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
14105 fault_type &= VM_PROT_ALL;
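	/*
	 * Example (illustrative only): a caller passing
	 * (VM_PROT_READ | VM_PROT_COPY | VM_PROT_COPY_FAIL_IF_EXECUTABLE)
	 * decomposes here into fault_type == VM_PROT_READ, with force_copy
	 * and no_force_copy_if_executable both TRUE and mask_protections FALSE.
	 */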
14106 original_fault_type = fault_type;
14107 if (contended) {
14108 *contended = false;
14109 }
14110
14111 *real_map = map;
14112
14113 fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
14114 vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
14115
14116 RetryLookup:
14117 fault_type = original_fault_type;
14118
14119 /*
14120 * If the map has an interesting hint, try it before calling
14121 * the full-blown lookup routine.
14122 */
14123 entry = map->hint;
14124
14125 if ((entry == vm_map_to_entry(map)) ||
14126 (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
14127 vm_map_entry_t tmp_entry;
14128
14129 /*
14130 * Entry was either not a valid hint, or the vaddr
14131 * was not contained in the entry, so do a full lookup.
14132 */
14133 if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
14134 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14135 vm_map_unlock(cow_sub_map_parent);
14136 }
14137 if ((*real_map != map)
14138 && (*real_map != cow_sub_map_parent)) {
14139 vm_map_unlock(*real_map);
14140 }
14141 return KERN_INVALID_ADDRESS;
14142 }
14143
14144 entry = tmp_entry;
14145 }
14146 if (map == old_map) {
14147 old_start = entry->vme_start;
14148 old_end = entry->vme_end;
14149 }
14150
14151 /*
14152 * Handle submaps. Drop lock on upper map, submap is
14153 * returned locked.
14154 */
14155
14156 submap_needed_copy = FALSE;
14157 submap_recurse:
14158 if (entry->is_sub_map) {
14159 vm_map_offset_t local_vaddr;
14160 vm_map_offset_t end_delta;
14161 vm_map_offset_t start_delta;
14162 vm_map_offset_t top_entry_saved_start;
14163 vm_object_offset_t top_entry_saved_offset;
14164 vm_map_entry_t submap_entry, saved_submap_entry;
14165 vm_object_offset_t submap_entry_offset;
14166 vm_object_size_t submap_entry_size;
14167 vm_prot_t subentry_protection;
14168 vm_prot_t subentry_max_protection;
14169 boolean_t subentry_no_copy_on_read;
14170 boolean_t subentry_permanent;
14171 boolean_t subentry_csm_associated;
14172 #if __arm64e__
14173 boolean_t subentry_used_for_tpro;
14174 #endif /* __arm64e__ */
14175 boolean_t mapped_needs_copy = FALSE;
14176 vm_map_version_t version;
14177
14178 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
14179 "map %p (%d) entry %p submap %p (%d)\n",
14180 map, VM_MAP_PAGE_SHIFT(map), entry,
14181 VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
14182
14183 local_vaddr = vaddr;
14184 top_entry_saved_start = entry->vme_start;
14185 top_entry_saved_offset = VME_OFFSET(entry);
14186
14187 if ((entry->use_pmap &&
14188 !((fault_type & VM_PROT_WRITE) ||
14189 force_copy))) {
14190 /* if real_map equals map we unlock below */
14191 if ((*real_map != map) &&
14192 (*real_map != cow_sub_map_parent)) {
14193 vm_map_unlock(*real_map);
14194 }
14195 *real_map = VME_SUBMAP(entry);
14196 }
14197
14198 if (entry->needs_copy &&
14199 ((fault_type & VM_PROT_WRITE) ||
14200 force_copy)) {
14201 if (!mapped_needs_copy) {
14202 if (vm_map_lock_read_to_write(map)) {
14203 vm_map_lock_read(map);
14204 *real_map = map;
14205 goto RetryLookup;
14206 }
14207 vm_map_lock_read(VME_SUBMAP(entry));
14208 *var_map = VME_SUBMAP(entry);
14209 cow_sub_map_parent = map;
14210 /* reset base to map before cow object */
14211 /* this is the map which will accept */
14212 /* the new cow object */
14213 old_start = entry->vme_start;
14214 old_end = entry->vme_end;
14215 cow_parent_vaddr = vaddr;
14216 mapped_needs_copy = TRUE;
14217 } else {
14218 vm_map_lock_read(VME_SUBMAP(entry));
14219 *var_map = VME_SUBMAP(entry);
14220 if ((cow_sub_map_parent != map) &&
14221 (*real_map != map)) {
14222 vm_map_unlock(map);
14223 }
14224 }
14225 } else {
14226 if (entry->needs_copy) {
14227 submap_needed_copy = TRUE;
14228 }
14229 vm_map_lock_read(VME_SUBMAP(entry));
14230 *var_map = VME_SUBMAP(entry);
14231 /* leave map locked if it is a target */
14232 /* cow sub_map above otherwise, just */
14233 /* follow the maps down to the object */
14234 /* here we unlock knowing we are not */
14235 /* revisiting the map. */
14236 if ((*real_map != map) && (map != cow_sub_map_parent)) {
14237 vm_map_unlock_read(map);
14238 }
14239 }
14240
14241 entry = NULL;
14242 map = *var_map;
14243
14244 /* calculate the offset in the submap for vaddr */
14245 local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
14246 assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
14247 "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
14248 (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);
14249
14250 RetrySubMap:
14251 if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
14252 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14253 vm_map_unlock(cow_sub_map_parent);
14254 }
14255 if ((*real_map != map)
14256 && (*real_map != cow_sub_map_parent)) {
14257 vm_map_unlock(*real_map);
14258 }
14259 *real_map = map;
14260 return KERN_INVALID_ADDRESS;
14261 }
14262
14263 /* find the attenuated shadow of the underlying object */
14264 /* on our target map */
14265
14266 /* in English, the submap object may extend beyond the */
14267 /* region mapped by the entry or, may only fill a portion */
14268 /* of it. For our purposes, we only care if the object */
14269 /* doesn't fill. In this case the area which will */
14270 /* ultimately be clipped in the top map will only need */
14271 /* to be as big as the portion of the underlying entry */
14272 /* which is mapped */
14273 start_delta = submap_entry->vme_start > top_entry_saved_offset ?
14274 submap_entry->vme_start - top_entry_saved_offset : 0;
14275
14276 end_delta =
14277 (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
14278 submap_entry->vme_end ?
14279 0 : (top_entry_saved_offset +
14280 (old_end - old_start))
14281 - submap_entry->vme_end;
14282
14283 old_start += start_delta;
14284 old_end -= end_delta;
14285
14286 if (submap_entry->is_sub_map) {
14287 entry = submap_entry;
14288 vaddr = local_vaddr;
14289 goto submap_recurse;
14290 }
14291
14292 if (((fault_type & VM_PROT_WRITE) ||
14293 force_copy)
14294 && cow_sub_map_parent) {
14295 vm_object_t sub_object, copy_object;
14296 vm_object_offset_t copy_offset;
14297 vm_map_offset_t local_start;
14298 vm_map_offset_t local_end;
14299 boolean_t object_copied = FALSE;
14300 vm_object_offset_t object_copied_offset = 0;
14301 boolean_t object_copied_needs_copy = FALSE;
14302 kern_return_t kr = KERN_SUCCESS;
14303
14304 if (vm_map_lock_read_to_write(map)) {
14305 vm_map_lock_read(map);
14306 old_start -= start_delta;
14307 old_end += end_delta;
14308 goto RetrySubMap;
14309 }
14310
14311
14312 sub_object = VME_OBJECT(submap_entry);
14313 if (sub_object == VM_OBJECT_NULL) {
14314 sub_object =
14315 vm_object_allocate(
14316 (vm_map_size_t)
14317 (submap_entry->vme_end -
14318 submap_entry->vme_start), map->serial_id);
14319 VME_OBJECT_SET(submap_entry, sub_object, false, 0);
14320 VME_OFFSET_SET(submap_entry, 0);
14321 assert(!submap_entry->is_sub_map);
14322 assert(submap_entry->use_pmap);
14323 }
14324 local_start = local_vaddr -
14325 (cow_parent_vaddr - old_start);
14326 local_end = local_vaddr +
14327 (old_end - cow_parent_vaddr);
14328 vm_map_clip_start(map, submap_entry, local_start);
14329 vm_map_clip_end(map, submap_entry, local_end);
14330 if (submap_entry->is_sub_map) {
14331 /* unnesting was done when clipping */
14332 assert(!submap_entry->use_pmap);
14333 }
14334
14335 /* This is the COW case, let's connect */
14336 /* an entry in our space to the underlying */
14337 /* object in the submap, bypassing the */
14338 /* submap. */
14339 submap_entry_offset = VME_OFFSET(submap_entry);
14340 submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
14341
14342 if ((submap_entry->wired_count != 0 ||
14343 sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
14344 (submap_entry->protection & VM_PROT_EXECUTE) &&
14345 no_force_copy_if_executable) {
14346 // printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
14347 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14348 vm_map_unlock(cow_sub_map_parent);
14349 }
14350 if ((*real_map != map)
14351 && (*real_map != cow_sub_map_parent)) {
14352 vm_map_unlock(*real_map);
14353 }
14354 *real_map = map;
14355 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
14356 vm_map_lock_write_to_read(map);
14357 kr = KERN_PROTECTION_FAILURE;
14358 DTRACE_VM4(submap_no_copy_executable,
14359 vm_map_t, map,
14360 vm_object_offset_t, submap_entry_offset,
14361 vm_object_size_t, submap_entry_size,
14362 int, kr);
14363 return kr;
14364 }
14365
14366 if (submap_entry->wired_count != 0) {
14367 vm_object_reference(sub_object);
14368
14369 assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
14370 "submap_entry %p offset 0x%llx\n",
14371 submap_entry, VME_OFFSET(submap_entry));
14372
14373 DTRACE_VM6(submap_copy_slowly,
14374 vm_map_t, cow_sub_map_parent,
14375 vm_map_offset_t, vaddr,
14376 vm_map_t, map,
14377 vm_object_size_t, submap_entry_size,
14378 int, submap_entry->wired_count,
14379 int, sub_object->copy_strategy);
14380
14381 saved_submap_entry = submap_entry;
14382 version.main_timestamp = map->timestamp;
14383 vm_map_unlock(map); /* Increments timestamp by 1 */
14384 submap_entry = VM_MAP_ENTRY_NULL;
14385
14386 vm_object_lock(sub_object);
14387 kr = vm_object_copy_slowly(sub_object,
14388 submap_entry_offset,
14389 submap_entry_size,
14390 FALSE, /* interruptible */
14391 &copy_object);
14392 object_copied = TRUE;
14393 object_copied_offset = 0;
14394 /* 4k: account for extra offset in physical page */
14395 object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
14396 object_copied_needs_copy = FALSE;
14397 vm_object_deallocate(sub_object);
14398
14399 vm_map_lock(map);
14400
14401 if (kr != KERN_SUCCESS &&
14402 kr != KERN_MEMORY_RESTART_COPY) {
14403 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14404 vm_map_unlock(cow_sub_map_parent);
14405 }
14406 if ((*real_map != map)
14407 && (*real_map != cow_sub_map_parent)) {
14408 vm_map_unlock(*real_map);
14409 }
14410 *real_map = map;
14411 vm_object_deallocate(copy_object);
14412 copy_object = VM_OBJECT_NULL;
14413 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
14414 vm_map_lock_write_to_read(map);
14415 DTRACE_VM4(submap_copy_error_slowly,
14416 vm_object_t, sub_object,
14417 vm_object_offset_t, submap_entry_offset,
14418 vm_object_size_t, submap_entry_size,
14419 int, kr);
14420 vm_map_lookup_and_lock_object_copy_slowly_error++;
14421 return kr;
14422 }
14423
14424 if ((kr == KERN_SUCCESS) &&
14425 (version.main_timestamp + 1) == map->timestamp) {
14426 submap_entry = saved_submap_entry;
14427 } else {
14428 saved_submap_entry = NULL;
14429 old_start -= start_delta;
14430 old_end += end_delta;
14431 vm_object_deallocate(copy_object);
14432 copy_object = VM_OBJECT_NULL;
14433 vm_map_lock_write_to_read(map);
14434 vm_map_lookup_and_lock_object_copy_slowly_restart++;
14435 goto RetrySubMap;
14436 }
14437 vm_map_lookup_and_lock_object_copy_slowly_count++;
14438 vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
14439 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
14440 vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
14441 }
14442 } else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
14443 submap_entry_offset = VME_OFFSET(submap_entry);
14444 copy_object = VM_OBJECT_NULL;
14445 object_copied_offset = submap_entry_offset;
14446 object_copied_needs_copy = FALSE;
14447 DTRACE_VM6(submap_copy_strategically,
14448 vm_map_t, cow_sub_map_parent,
14449 vm_map_offset_t, vaddr,
14450 vm_map_t, map,
14451 vm_object_size_t, submap_entry_size,
14452 int, submap_entry->wired_count,
14453 int, sub_object->copy_strategy);
14454 kr = vm_object_copy_strategically(
14455 sub_object,
14456 submap_entry_offset,
14457 submap_entry->vme_end - submap_entry->vme_start,
14458 false, /* forking */
14459 &copy_object,
14460 &object_copied_offset,
14461 &object_copied_needs_copy);
14462 if (kr == KERN_MEMORY_RESTART_COPY) {
14463 old_start -= start_delta;
14464 old_end += end_delta;
14465 vm_object_deallocate(copy_object);
14466 copy_object = VM_OBJECT_NULL;
14467 vm_map_lock_write_to_read(map);
14468 vm_map_lookup_and_lock_object_copy_strategically_restart++;
14469 goto RetrySubMap;
14470 }
14471 if (kr != KERN_SUCCESS) {
14472 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14473 vm_map_unlock(cow_sub_map_parent);
14474 }
14475 if ((*real_map != map)
14476 && (*real_map != cow_sub_map_parent)) {
14477 vm_map_unlock(*real_map);
14478 }
14479 *real_map = map;
14480 vm_object_deallocate(copy_object);
14481 copy_object = VM_OBJECT_NULL;
14482 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
14483 vm_map_lock_write_to_read(map);
14484 DTRACE_VM4(submap_copy_error_strategically,
14485 vm_object_t, sub_object,
14486 vm_object_offset_t, submap_entry_offset,
14487 vm_object_size_t, submap_entry_size,
14488 int, kr);
14489 vm_map_lookup_and_lock_object_copy_strategically_error++;
14490 return kr;
14491 }
14492 assert(copy_object != VM_OBJECT_NULL);
14493 assert(copy_object != sub_object);
14494 object_copied = TRUE;
14495 vm_map_lookup_and_lock_object_copy_strategically_count++;
14496 vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
14497 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
14498 vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
14499 }
14500 } else {
14501 /* set up shadow object */
14502 object_copied = FALSE;
14503 copy_object = sub_object;
14504 vm_object_lock(sub_object);
14505 vm_object_reference_locked(sub_object);
14506 VM_OBJECT_SET_SHADOWED(sub_object, TRUE);
14507 vm_object_unlock(sub_object);
14508
14509 assert(submap_entry->wired_count == 0);
14510 submap_entry->needs_copy = TRUE;
14511
14512 prot = submap_entry->protection;
14513 if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14514 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14515 __FUNCTION__,
14516 map, map->pmap, submap_entry,
14517 (uint64_t)submap_entry->vme_start,
14518 (uint64_t)submap_entry->vme_end,
14519 prot);
14520 }
14521 prot = prot & ~VM_PROT_WRITE;
14522 if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14523 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14524 __FUNCTION__,
14525 map, map->pmap, submap_entry,
14526 (uint64_t)submap_entry->vme_start,
14527 (uint64_t)submap_entry->vme_end,
14528 prot);
14529 }
14530
14531 if (override_nx(old_map,
14532 VME_ALIAS(submap_entry))
14533 && prot) {
14534 prot |= VM_PROT_EXECUTE;
14535 }
14536
14537 vm_object_pmap_protect(
14538 sub_object,
14539 VME_OFFSET(submap_entry),
14540 submap_entry->vme_end -
14541 submap_entry->vme_start,
14542 (submap_entry->is_shared
14543 || map->mapped_in_other_pmaps) ?
14544 PMAP_NULL : map->pmap,
14545 VM_MAP_PAGE_SIZE(map),
14546 submap_entry->vme_start,
14547 prot);
14548 vm_map_lookup_and_lock_object_copy_shadow_count++;
14549 vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
14550 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
14551 vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
14552 }
14553 }
14554
14555 /*
14556 * Adjust the fault offset to the submap entry.
14557 */
14558 copy_offset = (local_vaddr -
14559 submap_entry->vme_start +
14560 VME_OFFSET(submap_entry));
14561
14562 /* This works differently from the */
14563 /* normal submap case. We go back */
14564 /* to the parent of the cow map and */
14565 /* clip out the target portion of */
14566 /* the sub_map, substituting the */
14567 /* new copy object. */
14568
14569 subentry_protection = submap_entry->protection;
14570 subentry_max_protection = submap_entry->max_protection;
14571 subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
14572 subentry_permanent = submap_entry->vme_permanent;
14573 subentry_csm_associated = submap_entry->csm_associated;
14574 #if __arm64e__
14575 subentry_used_for_tpro = submap_entry->used_for_tpro;
14576 #endif // __arm64e__
14577 vm_map_unlock(map);
14578 submap_entry = NULL; /* not valid after map unlock */
14579
14580 local_start = old_start;
14581 local_end = old_end;
14582 map = cow_sub_map_parent;
14583 *var_map = cow_sub_map_parent;
14584 vaddr = cow_parent_vaddr;
14585 cow_sub_map_parent = NULL;
14586
14587 if (!vm_map_lookup_entry(map,
14588 vaddr, &entry)) {
14589 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14590 vm_map_unlock(cow_sub_map_parent);
14591 }
14592 if ((*real_map != map)
14593 && (*real_map != cow_sub_map_parent)) {
14594 vm_map_unlock(*real_map);
14595 }
14596 *real_map = map;
14597 vm_object_deallocate(
14598 copy_object);
14599 copy_object = VM_OBJECT_NULL;
14600 vm_map_lock_write_to_read(map);
14601 DTRACE_VM4(submap_lookup_post_unlock,
14602 uint64_t, (uint64_t)entry->vme_start,
14603 uint64_t, (uint64_t)entry->vme_end,
14604 vm_map_offset_t, vaddr,
14605 int, object_copied);
14606 return KERN_INVALID_ADDRESS;
14607 }
14608
14609 /* clip out the portion of space */
14610 /* mapped by the sub map which */
14611 /* corresponds to the underlying */
14612 /* object */
14613
14614 /*
14615 * Clip (and unnest) the smallest nested chunk
14616 * possible around the faulting address...
14617 */
14618 local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14619 local_end = local_start + pmap_shared_region_size_min(map->pmap);
14620 /*
14621 * ... but don't go beyond the "old_start" to "old_end"
14622 * range, to avoid spanning over another VM region
14623 * with a possibly different VM object and/or offset.
14624 */
14625 if (local_start < old_start) {
14626 local_start = old_start;
14627 }
14628 if (local_end > old_end) {
14629 local_end = old_end;
14630 }
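			/*
			 * Worked example (illustrative only, assuming a 32MB
			 * nesting granularity): a fault at vaddr 0x181234000
			 * gives local_start 0x180000000 and local_end
			 * 0x182000000, which are then clamped to the
			 * [old_start, old_end] range computed earlier.
			 */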
14631 /*
14632 * Adjust copy_offset to the start of the range.
14633 */
14634 copy_offset -= (vaddr - local_start);
14635
14636 vm_map_clip_start(map, entry, local_start);
14637 vm_map_clip_end(map, entry, local_end);
14638 if (entry->is_sub_map) {
14639 /* unnesting was done when clipping */
14640 assert(!entry->use_pmap);
14641 }
14642
14643 /* substitute copy object for */
14644 /* shared map entry */
14645 vm_map_deallocate(VME_SUBMAP(entry));
14646 assert(!entry->iokit_acct);
14647 entry->use_pmap = TRUE;
14648 VME_OBJECT_SET(entry, copy_object, false, 0);
14649
14650 /* propagate the submap entry's protections */
14651 if (entry->protection != VM_PROT_READ) {
14652 /*
14653 * Someone has already altered the top entry's
14654 * protections via vm_protect(VM_PROT_COPY).
14655 * Respect these new values and ignore the
14656 * submap entry's protections.
14657 */
14658 } else {
14659 /*
14660 * Regular copy-on-write: propagate the submap
14661 * entry's protections to the top map entry.
14662 */
14663 entry->protection |= subentry_protection;
14664 }
14665 entry->max_protection |= subentry_max_protection;
14666 /* propagate some attributes from subentry */
14667 entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14668 entry->vme_permanent = subentry_permanent;
14669 entry->csm_associated = subentry_csm_associated;
14670 #if __arm64e__
14671 /* propagate TPRO iff the destination map has TPRO enabled */
14672 if (subentry_used_for_tpro) {
14673 if (vm_map_tpro(map)) {
14674 entry->used_for_tpro = subentry_used_for_tpro;
14675 } else {
14676 /* "permanent" came from being TPRO */
14677 entry->vme_permanent = FALSE;
14678 }
14679 }
14680 #endif /* __arm64e__ */
14681 if ((entry->protection & VM_PROT_WRITE) &&
14682 (entry->protection & VM_PROT_EXECUTE) &&
14683 #if XNU_TARGET_OS_OSX
14684 map->pmap != kernel_pmap &&
14685 (vm_map_cs_enforcement(map)
14686 #if __arm64__
14687 || !VM_MAP_IS_EXOTIC(map)
14688 #endif /* __arm64__ */
14689 ) &&
14690 #endif /* XNU_TARGET_OS_OSX */
14691 #if CODE_SIGNING_MONITOR
14692 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
14693 #endif
14694 !(entry->used_for_jit) &&
14695 VM_MAP_POLICY_WX_STRIP_X(map)) {
14696 DTRACE_VM3(cs_wx,
14697 uint64_t, (uint64_t)entry->vme_start,
14698 uint64_t, (uint64_t)entry->vme_end,
14699 vm_prot_t, entry->protection);
14700 printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
14701 proc_selfpid(),
14702 (get_bsdtask_info(current_task())
14703 ? proc_name_address(get_bsdtask_info(current_task()))
14704 : "?"),
14705 __FUNCTION__, __LINE__,
14706 #if DEVELOPMENT || DEBUG
14707 (uint64_t)entry->vme_start,
14708 (uint64_t)entry->vme_end,
14709 #else /* DEVELOPMENT || DEBUG */
14710 (uint64_t)0,
14711 (uint64_t)0,
14712 #endif /* DEVELOPMENT || DEBUG */
14713 entry->protection);
14714 entry->protection &= ~VM_PROT_EXECUTE;
14715 }
14716
14717 if (object_copied) {
14718 VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14719 entry->needs_copy = object_copied_needs_copy;
14720 entry->is_shared = FALSE;
14721 } else {
14722 assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14723 assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14724 assert(entry->wired_count == 0);
14725 VME_OFFSET_SET(entry, copy_offset);
14726 entry->needs_copy = TRUE;
14727 if (map != old_map) {
14728 entry->is_shared = TRUE;
14729 }
14730 }
14731 if (entry->inheritance == VM_INHERIT_SHARE) {
14732 entry->inheritance = VM_INHERIT_COPY;
14733 }
14734
14735 vm_map_lock_write_to_read(map);
14736 } else {
14737 if ((cow_sub_map_parent)
14738 && (cow_sub_map_parent != *real_map)
14739 && (cow_sub_map_parent != map)) {
14740 vm_map_unlock(cow_sub_map_parent);
14741 }
14742 entry = submap_entry;
14743 vaddr = local_vaddr;
14744 }
14745 }
14746
14747 /*
14748 * Check whether this task is allowed to have
14749 * this page.
14750 */
14751
14752 prot = entry->protection;
14753
14754 if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14755 /*
14756 * HACK -- if not a stack, then allow execution
14757 */
14758 prot |= VM_PROT_EXECUTE;
14759 }
14760
14761 #if __arm64e__
14762 /*
14763 * If the entry we're dealing with is TPRO and we have a write
14764 * fault, inject VM_PROT_WRITE into protections. This allows us
14765 * to maintain RO permissions when not marked as TPRO.
14766 */
14767 if (entry->used_for_tpro && (fault_type & VM_PROT_WRITE)) {
14768 prot |= VM_PROT_WRITE;
14769 }
14770 #endif /* __arm64e__ */
14771 if (mask_protections) {
14772 fault_type &= prot;
14773 if (fault_type == VM_PROT_NONE) {
14774 goto protection_failure;
14775 }
14776 }
14777 if (((fault_type & prot) != fault_type)
14778 #if __arm64__
14779 /* prefetch abort in execute-only page */
14780 && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14781 #elif defined(__x86_64__)
14782 /* Consider the UEXEC bit when handling an EXECUTE fault */
14783 && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14784 #endif
14785 ) {
14786 protection_failure:
14787 if (*real_map != map) {
14788 vm_map_unlock(*real_map);
14789 }
14790 *real_map = map;
14791
14792 if ((fault_type & VM_PROT_EXECUTE) && prot) {
14793 log_stack_execution_failure((addr64_t)vaddr, prot);
14794 }
14795
14796 DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14797 DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14798 /*
14799 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14800 *
14801 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14802 */
14803 return KERN_PROTECTION_FAILURE;
14804 }
14805
14806 /*
14807 * If this page is not pageable, we have to get
14808 * it for all possible accesses.
14809 */
14810
14811 *wired = (entry->wired_count != 0);
14812 if (*wired) {
14813 fault_type = prot;
14814 }
14815
14816 /*
14817 * If the entry was copy-on-write, we either shadow it now (for a write fault) or demote the permissions (for a read fault).
14818 */
14819
14820 if (entry->needs_copy) {
14821 /*
14822 * If we want to write the page, we may as well
14823 * handle that now since we've got the map locked.
14824 *
14825 * If we don't need to write the page, we just
14826 * demote the permissions allowed.
14827 */
14828
14829 if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14830 /*
14831 * Make a new object, and place it in the
14832 * object chain. Note that no new references
14833 * have appeared -- one just moved from the
14834 * map to the new object.
14835 */
14836
14837 if (vm_map_lock_read_to_write(map)) {
14838 vm_map_lock_read(map);
14839 goto RetryLookup;
14840 }
14841
14842 if (VME_OBJECT(entry)->shadowed == FALSE) {
14843 vm_object_lock(VME_OBJECT(entry));
14844 VM_OBJECT_SET_SHADOWED(VME_OBJECT(entry), TRUE);
14845 vm_object_unlock(VME_OBJECT(entry));
14846 }
14847 VME_OBJECT_SHADOW(entry,
14848 (vm_map_size_t) (entry->vme_end -
14849 entry->vme_start),
14850 vm_map_always_shadow(map));
14851 entry->needs_copy = FALSE;
14852
14853 vm_map_lock_write_to_read(map);
14854 }
14855 if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14856 /*
14857 * We're attempting to read a copy-on-write
14858 * page -- don't allow writes.
14859 */
14860
14861 prot &= (~VM_PROT_WRITE);
14862 }
14863 }
14864
14865 if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14866 /*
14867 * We went through a "needs_copy" submap without triggering
14868 * a copy, so granting write access to the page would bypass
14869 * that submap's "needs_copy".
14870 */
14871 assert(!(fault_type & VM_PROT_WRITE));
14872 assert(!*wired);
14873 assert(!force_copy);
14874 // printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14875 prot &= ~VM_PROT_WRITE;
14876 }
14877
14878 /*
14879 * Create an object if necessary.
14880 */
14881 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14882 if (vm_map_lock_read_to_write(map)) {
14883 vm_map_lock_read(map);
14884 goto RetryLookup;
14885 }
14886
14887 VME_OBJECT_SET(entry,
14888 vm_object_allocate(
14889 (vm_map_size_t)(entry->vme_end -
14890 entry->vme_start),
14891 map->serial_id
14892 ), false, 0);
14893 VME_OFFSET_SET(entry, 0);
14894 assert(entry->use_pmap);
14895 vm_map_lock_write_to_read(map);
14896 }
14897
14898 /*
14899 * Return the object/offset from this entry. If the entry
14900 * was copy-on-write or empty, it has been fixed up. Also
14901 * return the protection.
14902 */
14903
14904 *offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14905 *object = VME_OBJECT(entry);
14906 *out_prot = prot;
14907 KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14908
14909 if (fault_info) {
14910 /*
14911 * Initialize fault information according to the entry being faulted
14912 * from.
14913 */
14914 fault_info->user_tag = VME_ALIAS(entry);
14915 fault_info->pmap_options = 0;
14916 if (entry->iokit_acct ||
14917 (!entry->is_sub_map && !entry->use_pmap)) {
14918 fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14919 }
14920 if (fault_info->behavior == VM_BEHAVIOR_DEFAULT) {
14921 fault_info->behavior = entry->behavior;
14922 }
14923 fault_info->lo_offset = VME_OFFSET(entry);
14924 fault_info->hi_offset =
14925 (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14926 fault_info->no_cache = entry->no_cache;
14927 fault_info->io_sync = FALSE;
14928 fault_info->cs_bypass = (entry->used_for_jit ||
14929 #if CODE_SIGNING_MONITOR
14930 (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
14931 #endif
14932 entry->vme_resilient_codesign);
14933 fault_info->mark_zf_absent = FALSE;
14934 fault_info->batch_pmap_op = FALSE;
14935 /*
14936 * The pmap layer will validate this page
14937 * before allowing it to be executed from.
14938 */
14939 #if CODE_SIGNING_MONITOR
14940 fault_info->csm_associated = entry->csm_associated;
14941 #else
14942 fault_info->csm_associated = FALSE;
14943 #endif
14944
14945 fault_info->resilient_media = entry->vme_resilient_media;
14946 fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
14947 fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14948 #if __arm64e__
14949 fault_info->fi_used_for_tpro = entry->used_for_tpro;
14950 #else /* __arm64e__ */
14951 fault_info->fi_used_for_tpro = FALSE;
14952 #endif
14953 if (entry->translated_allow_execute) {
14954 fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14955 }
14956 }
14957
14958 /*
14959 * Lock the object to prevent it from disappearing
14960 */
14961 if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14962 if (contended == NULL) {
14963 vm_object_lock(*object);
14964 } else {
14965 *contended = vm_object_lock_check_contended(*object);
14966 }
14967 } else {
14968 vm_object_lock_shared(*object);
14969 }
14970
14971 /*
14972 * Save the version number
14973 */
14974
14975 out_version->main_timestamp = map->timestamp;
14976
14977 return KERN_SUCCESS;
14978 }
14979
14980
14981 /*
14982 * vm_map_verify:
14983 *
14984 * Verifies that the map in question has not changed
14985 * since the given version. The map has to be locked
14986 * ("shared" mode is fine) before calling this function
14987 * and it will be returned locked too.
14988 */
14989 boolean_t
14990 vm_map_verify(
14991 vm_map_t map,
14992 vm_map_version_t *version) /* REF */
14993 {
14994 boolean_t result;
14995
14996 vm_map_lock_assert_held(map);
14997 result = (map->timestamp == version->main_timestamp);
14998
14999 return result;
15000 }
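
/*
 * Illustrative sketch (not compiled): the lookup / verify retry
 * pattern that callers of vm_map_lookup_and_lock_object() are
 * expected to follow.  The argument lists are elided and the
 * "RetryFault" label is hypothetical; only the ordering of the
 * locking calls matters.
 *
 *	vm_map_version_t version;
 *
 * RetryFault:
 *	kr = vm_map_lookup_and_lock_object(&map, vaddr, fault_type, ...,
 *	    &version, &object, &offset, &prot, &wired, ..., &real_map, ...);
 *	if (kr != KERN_SUCCESS) {
 *		return kr;
 *	}
 *	vm_map_unlock_read(map);   -- drop the map lock to do the real work
 *	... resolve the fault against "object" / "offset" ...
 *	vm_map_lock_read(map);     -- re-take it before committing the result
 *	if (!vm_map_verify(map, &version)) {
 *		vm_map_unlock_read(map);   -- map changed underneath us
 *		goto RetryFault;           -- look everything up again
 *	}
 */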
15001
15002
15003 /*
15004 * TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
15005 * Goes away after regular vm_region_recurse function migrates to
15006 * 64 bits
15007 * vm_region_recurse: A form of vm_region which follows the
15008 * submaps in a target map
15009 *
15010 */
15011
15012 kern_return_t
15013 vm_map_region_recurse_64(
15014 vm_map_t map,
15015 vm_map_offset_ut *address_u, /* IN/OUT */
15016 vm_map_size_ut *size_u, /* OUT */
15017 natural_t *nesting_depth, /* IN/OUT */
15018 vm_region_submap_info_64_t submap_info, /* IN/OUT */
15019 mach_msg_type_number_t *count) /* IN/OUT */
15020 {
15021 mach_msg_type_number_t original_count;
15022 vm_region_extended_info_data_t extended;
15023 vm_map_entry_t tmp_entry;
15024 vm_map_offset_t user_address;
15025 unsigned int user_max_depth;
15026
15027 /*
15028 * "curr_entry" is the VM map entry preceding or including the
15029 * address we're looking for.
15030 * "curr_map" is the map or sub-map containing "curr_entry".
15031 * "curr_address" is the equivalent of the top map's "user_address"
15032 * in the current map.
15033 * "curr_offset" is the cumulated offset of "curr_map" in the
15034 * target task's address space.
15035 * "curr_depth" is the depth of "curr_map" in the chain of
15036 * sub-maps.
15037 *
15038 * "curr_max_below" and "curr_max_above" limit the range (around
15039 * "curr_address") we should take into account in the current (sub)map.
15040 * They limit the range to what's visible through the map entries
15041 * we've traversed from the top map to the current map.
15042 *
15043 */
15044 vm_map_entry_t curr_entry;
15045 vm_map_t curr_entry_submap;
15046 vm_map_address_t curr_entry_start;
15047 vm_object_offset_t curr_entry_offset;
15048 vm_map_address_t curr_address;
15049 vm_map_offset_t curr_offset;
15050 vm_map_t curr_map;
15051 unsigned int curr_depth;
15052 vm_map_offset_t curr_max_below, curr_max_above;
15053 vm_map_offset_t curr_skip;
15054
15055 /*
15056 * "next_" is the same as "curr_" but for the VM region immediately
15057 * after the address we're looking for. We need to keep track of this
15058 * too because we want to return info about that region if the
15059 * address we're looking for is not mapped.
15060 */
15061 vm_map_entry_t next_entry;
15062 vm_map_offset_t next_offset;
15063 vm_map_offset_t next_address;
15064 vm_map_t next_map;
15065 unsigned int next_depth;
15066 vm_map_offset_t next_max_below, next_max_above;
15067 vm_map_offset_t next_skip;
15068
15069 boolean_t look_for_pages;
15070 vm_region_submap_short_info_64_t short_info;
15071 boolean_t do_region_footprint;
15072 int effective_page_size, effective_page_shift;
15073 boolean_t submap_needed_copy;
15074
15075 if (map == VM_MAP_NULL) {
15076 /* no address space to work on */
15077 return KERN_INVALID_ARGUMENT;
15078 }
15079
15080 user_address = vm_sanitize_addr(map, *address_u);
15081
15082 effective_page_shift = vm_self_region_page_shift(map);
15083 effective_page_size = (1 << effective_page_shift);
15084
15085 if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
15086 /*
15087 * "info" structure is not big enough and
15088 * would overflow
15089 */
15090 return KERN_INVALID_ARGUMENT;
15091 }
15092
15093 do_region_footprint = task_self_region_footprint();
15094 original_count = *count;
15095
15096 if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
15097 *count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
15098 look_for_pages = FALSE;
15099 short_info = (vm_region_submap_short_info_64_t) submap_info;
15100 submap_info = NULL;
15101 } else {
15102 look_for_pages = TRUE;
15103 *count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
15104 short_info = NULL;
15105
15106 if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15107 *count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
15108 }
15109 if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15110 *count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
15111 }
15112 }
15113
15114 user_max_depth = *nesting_depth;
15115 submap_needed_copy = FALSE;
15116
15117 if (not_in_kdp) {
15118 vm_map_lock_read(map);
15119 }
15120
15121 recurse_again:
15122 curr_entry = NULL;
15123 curr_map = map;
15124 curr_address = user_address;
15125 curr_offset = 0;
15126 curr_skip = 0;
15127 curr_depth = 0;
15128 curr_max_above = ((vm_map_offset_t) -1) - curr_address;
15129 curr_max_below = curr_address;
15130
15131 next_entry = NULL;
15132 next_map = NULL;
15133 next_address = 0;
15134 next_offset = 0;
15135 next_skip = 0;
15136 next_depth = 0;
15137 next_max_above = (vm_map_offset_t) -1;
15138 next_max_below = (vm_map_offset_t) -1;
15139
15140 for (;;) {
15141 if (vm_map_lookup_entry(curr_map,
15142 curr_address,
15143 &tmp_entry)) {
15144 /* tmp_entry contains the address we're looking for */
15145 curr_entry = tmp_entry;
15146 } else {
15147 vm_map_offset_t skip;
15148 /*
15149 * The address is not mapped. "tmp_entry" is the
15150 * map entry preceding the address. We want the next
15151 * one, if it exists.
15152 */
15153 curr_entry = tmp_entry->vme_next;
15154
15155 if (curr_entry == vm_map_to_entry(curr_map) ||
15156 (curr_entry->vme_start >=
15157 curr_address + curr_max_above)) {
15158 /* no next entry at this level: stop looking */
15159 if (not_in_kdp) {
15160 vm_map_unlock_read(curr_map);
15161 }
15162 curr_entry = NULL;
15163 curr_map = NULL;
15164 curr_skip = 0;
15165 curr_offset = 0;
15166 curr_depth = 0;
15167 curr_max_above = 0;
15168 curr_max_below = 0;
15169 break;
15170 }
15171
15172 /* adjust current address and offset */
15173 skip = curr_entry->vme_start - curr_address;
15174 curr_address = curr_entry->vme_start;
15175 curr_skip += skip;
15176 curr_offset += skip;
15177 curr_max_above -= skip;
15178 curr_max_below = 0;
15179 }
15180
15181 /*
15182 * Is the next entry at this level closer to the address (or
15183 * deeper in the submap chain) than the one we had
15184 * so far?
15185 */
15186 tmp_entry = curr_entry->vme_next;
15187 if (tmp_entry == vm_map_to_entry(curr_map)) {
15188 /* no next entry at this level */
15189 } else if (tmp_entry->vme_start >=
15190 curr_address + curr_max_above) {
15191 /*
15192 * tmp_entry is beyond the scope of what we mapped of
15193 * this submap in the upper level: ignore it.
15194 */
15195 } else if ((next_entry == NULL) ||
15196 (tmp_entry->vme_start + curr_offset <=
15197 next_entry->vme_start + next_offset)) {
15198 /*
15199 * We didn't have a "next_entry" or this one is
15200 * closer to the address we're looking for:
15201 * use this "tmp_entry" as the new "next_entry".
15202 */
15203 if (next_entry != NULL) {
15204 /* unlock the last "next_map" */
15205 if (next_map != curr_map && not_in_kdp) {
15206 vm_map_unlock_read(next_map);
15207 }
15208 }
15209 next_entry = tmp_entry;
15210 next_map = curr_map;
15211 next_depth = curr_depth;
15212 next_address = next_entry->vme_start;
15213 next_skip = curr_skip;
15214 next_skip += (next_address - curr_address);
15215 next_offset = curr_offset;
15216 next_offset += (next_address - curr_address);
15217 next_max_above = MIN(next_max_above, curr_max_above);
15218 next_max_above = MIN(next_max_above,
15219 next_entry->vme_end - next_address);
15220 next_max_below = MIN(next_max_below, curr_max_below);
15221 next_max_below = MIN(next_max_below,
15222 next_address - next_entry->vme_start);
15223 }
15224
15225 /*
15226 * "curr_max_{above,below}" allow us to keep track of the
15227 * portion of the submap that is actually mapped at this level:
15228 * the rest of that submap is irrelevant to us, since it's not
15229 * mapped here.
15230 * The relevant portion of the map starts at
15231 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
15232 */
15233 curr_max_above = MIN(curr_max_above,
15234 curr_entry->vme_end - curr_address);
15235 curr_max_below = MIN(curr_max_below,
15236 curr_address - curr_entry->vme_start);
15237
15238 if (!curr_entry->is_sub_map ||
15239 curr_depth >= user_max_depth) {
15240 /*
15241 * We hit a leaf map or we reached the maximum depth
15242 * we could, so stop looking. Keep the current map
15243 * locked.
15244 */
15245 break;
15246 }
15247
15248 /*
15249 * Get down to the next submap level.
15250 */
15251
15252 if (curr_entry->needs_copy) {
15253 /* everything below this is effectively copy-on-write */
15254 submap_needed_copy = TRUE;
15255 }
15256
15257 /*
15258 * Lock the next level and unlock the current level,
15259 * unless we need to keep it locked to access the "next_entry"
15260 * later.
15261 */
15262 curr_entry_submap = VME_SUBMAP(curr_entry);
15263 curr_entry_start = curr_entry->vme_start;
15264 curr_entry_offset = VME_OFFSET(curr_entry);
15265 curr_entry = VM_MAP_ENTRY_NULL; /* no longer valid after unlocking the map */
15266 if (not_in_kdp) {
15267 vm_map_lock_read(curr_entry_submap);
15268 }
15269 if (curr_map == next_map) {
15270 /* keep "next_map" locked in case we need it */
15271 } else {
15272 /* release this map */
15273 if (not_in_kdp) {
15274 vm_map_unlock_read(curr_map);
15275 }
15276 }
15277
15278 /*
15279 * Adjust the offset. "curr_entry" mapped the submap
15280 * at relative address "curr_entry_start" in the
15281 * curr_map but skips the first "curr_entry_offset"
15282 * bytes of the submap.
15283 * "curr_offset" always represents the offset of a virtual
15284 * address in the curr_map relative to the absolute address
15285 * space (i.e. the top-level VM map).
15286 */
15287 curr_offset += curr_entry_offset - curr_entry_start;
15288 curr_address = user_address + curr_offset;
15289 /* switch to the submap */
15290 curr_map = curr_entry_submap;
15291 curr_depth++;
15292 }
15293
15294 // LP64todo: all the current tools are 32bit, obviously never worked for 64b
15295 // so probably should be a real 32b ID vs. ptr.
15296 // Current users just check for equality
15297
15298 if (curr_entry == NULL) {
15299 /* no VM region contains the address... */
15300
15301 if (do_region_footprint && /* we want footprint numbers */
15302 next_entry == NULL && /* & there are no more regions */
15303 /* & we haven't already provided our fake region: */
15304 user_address <= vm_map_last_entry(map)->vme_end) {
15305 ledger_amount_t ledger_resident, ledger_compressed;
15306
15307 /*
15308 * Add a fake memory region to account for
15309 * purgeable and/or ledger-tagged memory that
15310 * counts towards this task's memory footprint,
15311 * i.e. the resident/compressed pages of non-volatile
15312 * objects owned by that task.
15313 */
15314 task_ledgers_footprint(map->pmap->ledger,
15315 &ledger_resident,
15316 &ledger_compressed);
15317 if (ledger_resident + ledger_compressed == 0) {
15318 /* no purgeable memory usage to report */
15319 return KERN_INVALID_ADDRESS;
15320 }
15321 /* fake region to show nonvolatile footprint */
15322 if (look_for_pages) {
15323 submap_info->protection = VM_PROT_DEFAULT;
15324 submap_info->max_protection = VM_PROT_DEFAULT;
15325 submap_info->inheritance = VM_INHERIT_DEFAULT;
15326 submap_info->offset = 0;
15327 submap_info->user_tag = -1;
15328 submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
15329 submap_info->pages_shared_now_private = 0;
15330 submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
15331 submap_info->pages_dirtied = submap_info->pages_resident;
15332 submap_info->ref_count = 1;
15333 submap_info->shadow_depth = 0;
15334 submap_info->external_pager = 0;
15335 submap_info->share_mode = SM_PRIVATE;
15336 if (submap_needed_copy) {
15337 submap_info->share_mode = SM_COW;
15338 }
15339 submap_info->is_submap = 0;
15340 submap_info->behavior = VM_BEHAVIOR_DEFAULT;
15341 submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15342 submap_info->user_wired_count = 0;
15343 submap_info->pages_reusable = 0;
15344 } else {
15345 short_info->user_tag = -1;
15346 short_info->offset = 0;
15347 short_info->protection = VM_PROT_DEFAULT;
15348 short_info->inheritance = VM_INHERIT_DEFAULT;
15349 short_info->max_protection = VM_PROT_DEFAULT;
15350 short_info->behavior = VM_BEHAVIOR_DEFAULT;
15351 short_info->user_wired_count = 0;
15352 short_info->is_submap = 0;
15353 short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15354 short_info->external_pager = 0;
15355 short_info->shadow_depth = 0;
15356 short_info->share_mode = SM_PRIVATE;
15357 if (submap_needed_copy) {
15358 short_info->share_mode = SM_COW;
15359 }
15360 short_info->ref_count = 1;
15361 }
15362 *nesting_depth = 0;
15363 *address_u = vm_sanitize_wrap_addr(vm_map_last_entry(map)->vme_end);
15364 *size_u = vm_sanitize_wrap_size(ledger_resident + ledger_compressed);
15365 return KERN_SUCCESS;
15366 }
15367
15368 if (next_entry == NULL) {
15369 /* ... and no VM region follows it either */
15370 return KERN_INVALID_ADDRESS;
15371 }
15372 /* ... gather info about the next VM region */
15373 curr_entry = next_entry;
15374 curr_map = next_map; /* still locked ... */
15375 curr_address = next_address;
15376 curr_skip = next_skip;
15377 curr_offset = next_offset;
15378 curr_depth = next_depth;
15379 curr_max_above = next_max_above;
15380 curr_max_below = next_max_below;
15381 } else {
15382 /* we won't need "next_entry" after all */
15383 if (next_entry != NULL) {
15384 /* release "next_map" */
15385 if (next_map != curr_map && not_in_kdp) {
15386 vm_map_unlock_read(next_map);
15387 }
15388 }
15389 }
15390 next_entry = NULL;
15391 next_map = NULL;
15392 next_offset = 0;
15393 next_skip = 0;
15394 next_depth = 0;
15395 next_max_below = -1;
15396 next_max_above = -1;
15397
15398 if (curr_entry->is_sub_map &&
15399 curr_depth < user_max_depth) {
15400 /*
15401 * We're not as deep as we could be: we must have
15402 * gone back up after not finding anything mapped
15403 * below the original top-level map entry.
15404 * Let's move "curr_address" forward and recurse again.
15405 */
15406 user_address = curr_address;
15407 goto recurse_again;
15408 }
15409
15410 *nesting_depth = curr_depth;
15411 *address_u = vm_sanitize_wrap_addr(
15412 user_address + curr_skip - curr_max_below);
15413 *size_u = vm_sanitize_wrap_size(curr_max_above + curr_max_below);
15414
15415 if (look_for_pages) {
15416 submap_info->user_tag = VME_ALIAS(curr_entry);
15417 submap_info->offset = VME_OFFSET(curr_entry);
15418 submap_info->protection = curr_entry->protection;
15419 submap_info->inheritance = curr_entry->inheritance;
15420 submap_info->max_protection = curr_entry->max_protection;
15421 submap_info->behavior = curr_entry->behavior;
15422 submap_info->user_wired_count = curr_entry->user_wired_count;
15423 submap_info->is_submap = curr_entry->is_sub_map;
15424 if (curr_entry->is_sub_map) {
15425 submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15426 } else {
15427 submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15428 }
15429 } else {
15430 short_info->user_tag = VME_ALIAS(curr_entry);
15431 short_info->offset = VME_OFFSET(curr_entry);
15432 short_info->protection = curr_entry->protection;
15433 short_info->inheritance = curr_entry->inheritance;
15434 short_info->max_protection = curr_entry->max_protection;
15435 short_info->behavior = curr_entry->behavior;
15436 short_info->user_wired_count = curr_entry->user_wired_count;
15437 short_info->is_submap = curr_entry->is_sub_map;
15438 if (curr_entry->is_sub_map) {
15439 short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15440 } else {
15441 short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15442 }
15443 }
15444
15445 extended.pages_resident = 0;
15446 extended.pages_swapped_out = 0;
15447 extended.pages_shared_now_private = 0;
15448 extended.pages_dirtied = 0;
15449 extended.pages_reusable = 0;
15450 extended.external_pager = 0;
15451 extended.shadow_depth = 0;
15452 extended.share_mode = SM_EMPTY;
15453 extended.ref_count = 0;
15454
15455 if (not_in_kdp) {
15456 if (!curr_entry->is_sub_map) {
15457 vm_map_offset_t range_start, range_end;
15458 range_start = MAX((curr_address - curr_max_below),
15459 curr_entry->vme_start);
15460 range_end = MIN((curr_address + curr_max_above),
15461 curr_entry->vme_end);
15462 vm_map_region_walk(curr_map,
15463 range_start,
15464 curr_entry,
15465 (VME_OFFSET(curr_entry) +
15466 (range_start -
15467 curr_entry->vme_start)),
15468 range_end - range_start,
15469 &extended,
15470 look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
15471 if (submap_needed_copy) {
15472 extended.share_mode = SM_COW;
15473 }
15474 } else {
15475 if (curr_entry->use_pmap) {
15476 extended.share_mode = SM_TRUESHARED;
15477 } else {
15478 extended.share_mode = SM_PRIVATE;
15479 }
15480 extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
15481 }
15482 }
15483
15484 if (look_for_pages) {
15485 submap_info->pages_resident = extended.pages_resident;
15486 submap_info->pages_swapped_out = extended.pages_swapped_out;
15487 submap_info->pages_shared_now_private =
15488 extended.pages_shared_now_private;
15489 submap_info->pages_dirtied = extended.pages_dirtied;
15490 submap_info->external_pager = extended.external_pager;
15491 submap_info->shadow_depth = extended.shadow_depth;
15492 submap_info->share_mode = extended.share_mode;
15493 submap_info->ref_count = extended.ref_count;
15494
15495 if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15496 submap_info->pages_reusable = extended.pages_reusable;
15497 }
15498 if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15499 if (curr_entry->is_sub_map) {
15500 submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_SUBMAP(curr_entry));
15501 } else if (VME_OBJECT(curr_entry)) {
15502 submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_OBJECT(curr_entry));
15503 } else {
15504 submap_info->object_id_full = 0ull;
15505 }
15506 }
15507 } else {
15508 short_info->external_pager = extended.external_pager;
15509 short_info->shadow_depth = extended.shadow_depth;
15510 short_info->share_mode = extended.share_mode;
15511 short_info->ref_count = extended.ref_count;
15512 }
15513
15514 if (not_in_kdp) {
15515 vm_map_unlock_read(curr_map);
15516 }
15517
15518 return KERN_SUCCESS;
15519 }
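
/*
 * Illustrative sketch (not compiled, user level): how a client such as
 * vmmap(1) typically drives this routine through the
 * mach_vm_region_recurse() MIG call.  The nesting depth is bumped
 * whenever the returned region is a submap, so the same address is
 * re-queried one level deeper; otherwise the scan advances past the
 * returned region.
 *
 *	mach_vm_address_t addr = 0;
 *	mach_vm_size_t size;
 *	natural_t depth = 0;
 *	vm_region_submap_info_data_64_t info;
 *	mach_msg_type_number_t count;
 *	kern_return_t kr;
 *
 *	for (;;) {
 *		count = VM_REGION_SUBMAP_INFO_COUNT_64;
 *		kr = mach_vm_region_recurse(task, &addr, &size, &depth,
 *		    (vm_region_recurse_info_t)&info, &count);
 *		if (kr != KERN_SUCCESS) {
 *			break;      -- KERN_INVALID_ADDRESS: no more regions
 *		}
 *		if (info.is_submap) {
 *			depth++;    -- descend and re-query the same address
 *			continue;
 *		}
 *		... report addr / size / info ...
 *		addr += size;       -- move on to the next region
 *	}
 */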
15520
15521 /*
15522 * vm_region:
15523 *
15524 * User call to obtain information about a region in
15525 * a task's address map. Currently, only one flavor is
15526 * supported.
15527 *
15528 * XXX The reserved and behavior fields cannot be filled
15529 * in until the vm merge from the IK is completed, and
15530 * vm_reserve is implemented.
15531 */
15532
15533 kern_return_t
15534 vm_map_region(
15535 vm_map_t map,
15536 vm_map_offset_ut *address_u, /* IN/OUT */
15537 vm_map_size_ut *size_u, /* OUT */
15538 vm_region_flavor_t flavor, /* IN */
15539 vm_region_info_t info, /* OUT */
15540 mach_msg_type_number_t *count, /* IN/OUT */
15541 mach_port_t *object_name) /* OUT */
15542 {
15543 vm_map_entry_t tmp_entry;
15544 vm_map_entry_t entry;
15545 vm_map_offset_t start;
15546
15547 if (map == VM_MAP_NULL) {
15548 return KERN_INVALID_ARGUMENT;
15549 }
15550
15551 start = vm_sanitize_addr(map, *address_u);
15552
15553 switch (flavor) {
15554 case VM_REGION_BASIC_INFO:
15555 /* legacy for old 32-bit objects info */
15556 {
15557 vm_region_basic_info_t basic;
15558
15559 if (*count < VM_REGION_BASIC_INFO_COUNT) {
15560 return KERN_INVALID_ARGUMENT;
15561 }
15562
15563 basic = (vm_region_basic_info_t) info;
15564 *count = VM_REGION_BASIC_INFO_COUNT;
15565
15566 vm_map_lock_read(map);
15567
15568 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15569 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15570 vm_map_unlock_read(map);
15571 return KERN_INVALID_ADDRESS;
15572 }
15573 } else {
15574 entry = tmp_entry;
15575 }
15576
15577 start = entry->vme_start;
15578
15579 basic->offset = (uint32_t)VME_OFFSET(entry);
15580 basic->protection = entry->protection;
15581 basic->inheritance = entry->inheritance;
15582 basic->max_protection = entry->max_protection;
15583 basic->behavior = entry->behavior;
15584 basic->user_wired_count = entry->user_wired_count;
15585 basic->reserved = entry->is_sub_map;
15586
15587 *address_u = vm_sanitize_wrap_addr(start);
15588 *size_u = vm_sanitize_wrap_size(entry->vme_end - start);
15589
15590 if (object_name) {
15591 *object_name = IP_NULL;
15592 }
15593 if (entry->is_sub_map) {
15594 basic->shared = FALSE;
15595 } else {
15596 basic->shared = entry->is_shared;
15597 }
15598
15599 vm_map_unlock_read(map);
15600 return KERN_SUCCESS;
15601 }
15602
15603 case VM_REGION_BASIC_INFO_64:
15604 {
15605 vm_region_basic_info_64_t basic;
15606
15607 if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
15608 return KERN_INVALID_ARGUMENT;
15609 }
15610
15611 basic = (vm_region_basic_info_64_t) info;
15612 *count = VM_REGION_BASIC_INFO_COUNT_64;
15613
15614 vm_map_lock_read(map);
15615
15616 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15617 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15618 vm_map_unlock_read(map);
15619 return KERN_INVALID_ADDRESS;
15620 }
15621 } else {
15622 entry = tmp_entry;
15623 }
15624
15625 start = entry->vme_start;
15626
15627 basic->offset = VME_OFFSET(entry);
15628 basic->protection = entry->protection;
15629 basic->inheritance = entry->inheritance;
15630 basic->max_protection = entry->max_protection;
15631 basic->behavior = entry->behavior;
15632 basic->user_wired_count = entry->user_wired_count;
15633 basic->reserved = entry->is_sub_map;
15634
15635 *address_u = vm_sanitize_wrap_addr(start);
15636 *size_u = vm_sanitize_wrap_size(entry->vme_end - start);
15637
15638 if (object_name) {
15639 *object_name = IP_NULL;
15640 }
15641 if (entry->is_sub_map) {
15642 basic->shared = FALSE;
15643 } else {
15644 basic->shared = entry->is_shared;
15645 }
15646
15647 vm_map_unlock_read(map);
15648 return KERN_SUCCESS;
15649 }
15650 case VM_REGION_EXTENDED_INFO:
15651 if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
15652 return KERN_INVALID_ARGUMENT;
15653 }
15654 OS_FALLTHROUGH;
15655 case VM_REGION_EXTENDED_INFO__legacy:
15656 {
15657 vm_region_extended_info_t extended;
15658 mach_msg_type_number_t original_count;
15659 int effective_page_size, effective_page_shift;
15660
15661 if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
15662 return KERN_INVALID_ARGUMENT;
15663 }
15664
15665 extended = (vm_region_extended_info_t) info;
15666
15667 effective_page_shift = vm_self_region_page_shift(map);
15668 effective_page_size = (1 << effective_page_shift);
15669
15670 vm_map_lock_read(map);
15671
15672 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15673 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15674 vm_map_unlock_read(map);
15675 return KERN_INVALID_ADDRESS;
15676 }
15677 } else {
15678 entry = tmp_entry;
15679 }
15680 start = entry->vme_start;
15681
15682 extended->protection = entry->protection;
15683 extended->user_tag = VME_ALIAS(entry);
15684 extended->pages_resident = 0;
15685 extended->pages_swapped_out = 0;
15686 extended->pages_shared_now_private = 0;
15687 extended->pages_dirtied = 0;
15688 extended->external_pager = 0;
15689 extended->shadow_depth = 0;
15690
15691 original_count = *count;
15692 if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
15693 *count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
15694 } else {
15695 extended->pages_reusable = 0;
15696 *count = VM_REGION_EXTENDED_INFO_COUNT;
15697 }
15698
15699 vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
15700
15701 if (object_name) {
15702 *object_name = IP_NULL;
15703 }
15704
15705 *address_u = vm_sanitize_wrap_addr(start);
15706 *size_u = vm_sanitize_wrap_size(entry->vme_end - start);
15707
15708 vm_map_unlock_read(map);
15709 return KERN_SUCCESS;
15710 }
15711 case VM_REGION_TOP_INFO:
15712 {
15713 vm_region_top_info_t top;
15714
15715 if (*count < VM_REGION_TOP_INFO_COUNT) {
15716 return KERN_INVALID_ARGUMENT;
15717 }
15718
15719 top = (vm_region_top_info_t) info;
15720 *count = VM_REGION_TOP_INFO_COUNT;
15721
15722 vm_map_lock_read(map);
15723
15724 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15725 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15726 vm_map_unlock_read(map);
15727 return KERN_INVALID_ADDRESS;
15728 }
15729 } else {
15730 entry = tmp_entry;
15731 }
15732 start = entry->vme_start;
15733
15734 top->private_pages_resident = 0;
15735 top->shared_pages_resident = 0;
15736
15737 vm_map_region_top_walk(entry, top);
15738
15739 if (object_name) {
15740 *object_name = IP_NULL;
15741 }
15742
15743 *address_u = vm_sanitize_wrap_addr(start);
15744 *size_u = vm_sanitize_wrap_size(entry->vme_end - start);
15745
15746 vm_map_unlock_read(map);
15747 return KERN_SUCCESS;
15748 }
15749 default:
15750 return KERN_INVALID_ARGUMENT;
15751 }
15752 }
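
/*
 * Illustrative sketch (not compiled, user level): querying a single
 * region with the VM_REGION_BASIC_INFO_64 flavor through the
 * mach_vm_region() MIG call.  The locals are hypothetical.  On
 * success, "addr" is moved to the start of the region containing (or
 * following) the address passed in, "size" covers that region, and
 * "object_name" is always returned as MACH_PORT_NULL by the
 * implementation above.
 *
 *	mach_vm_address_t addr = lookup_addr;
 *	mach_vm_size_t size;
 *	vm_region_basic_info_data_64_t info;
 *	mach_msg_type_number_t count = VM_REGION_BASIC_INFO_COUNT_64;
 *	mach_port_t object_name;
 *	kern_return_t kr;
 *
 *	kr = mach_vm_region(task, &addr, &size, VM_REGION_BASIC_INFO_64,
 *	    (vm_region_info_t)&info, &count, &object_name);
 *	if (kr == KERN_SUCCESS) {
 *		... info.protection, info.max_protection, info.shared ...
 *	}
 */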
15753
15754 #define OBJ_RESIDENT_COUNT(obj, entry_size) \
15755 MIN((entry_size), \
15756 ((obj)->all_reusable ? \
15757 (obj)->wired_page_count : \
15758 (obj)->resident_page_count - (obj)->reusable_page_count))
15759
15760 void
15761 vm_map_region_top_walk(
15762 vm_map_entry_t entry,
15763 vm_region_top_info_t top)
15764 {
15765 if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
15766 top->share_mode = SM_EMPTY;
15767 top->ref_count = 0;
15768 top->obj_id = 0;
15769 return;
15770 }
15771
15772 {
15773 struct vm_object *obj, *tmp_obj;
15774 int ref_count;
15775 uint32_t entry_size;
15776
15777 entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);
15778
15779 obj = VME_OBJECT(entry);
15780
15781 vm_object_lock(obj);
15782
15783 if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15784 obj->paging_in_progress) {
15785 ref_count--;
15786 }
15787
15788 assert(obj->reusable_page_count <= obj->resident_page_count);
15789 if (obj->shadow) {
15790 if (ref_count == 1) {
15791 top->private_pages_resident =
15792 OBJ_RESIDENT_COUNT(obj, entry_size);
15793 } else {
15794 top->shared_pages_resident =
15795 OBJ_RESIDENT_COUNT(obj, entry_size);
15796 }
15797 top->ref_count = ref_count;
15798 top->share_mode = SM_COW;
15799
15800 while ((tmp_obj = obj->shadow)) {
15801 vm_object_lock(tmp_obj);
15802 vm_object_unlock(obj);
15803 obj = tmp_obj;
15804
15805 if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15806 obj->paging_in_progress) {
15807 ref_count--;
15808 }
15809
15810 assert(obj->reusable_page_count <= obj->resident_page_count);
15811 top->shared_pages_resident +=
15812 OBJ_RESIDENT_COUNT(obj, entry_size);
15813 top->ref_count += ref_count - 1;
15814 }
15815 } else {
15816 if (entry->superpage_size) {
15817 top->share_mode = SM_LARGE_PAGE;
15818 top->shared_pages_resident = 0;
15819 top->private_pages_resident = entry_size;
15820 } else if (entry->needs_copy) {
15821 top->share_mode = SM_COW;
15822 top->shared_pages_resident =
15823 OBJ_RESIDENT_COUNT(obj, entry_size);
15824 } else {
15825 if (ref_count == 1 ||
15826 (ref_count == 2 && obj->named)) {
15827 top->share_mode = SM_PRIVATE;
15828 top->private_pages_resident =
15829 OBJ_RESIDENT_COUNT(obj,
15830 entry_size);
15831 } else {
15832 top->share_mode = SM_SHARED;
15833 top->shared_pages_resident =
15834 OBJ_RESIDENT_COUNT(obj,
15835 entry_size);
15836 }
15837 }
15838 top->ref_count = ref_count;
15839 }
15840
15841 vm_object_unlock(obj);
15842
15843 /* XXX K64: obj_id will be truncated */
15844 top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRHASH(obj);
15845 }
15846 }
15847
15848 void
15849 vm_map_region_walk(
15850 vm_map_t map,
15851 vm_map_offset_t va,
15852 vm_map_entry_t entry,
15853 vm_object_offset_t offset,
15854 vm_object_size_t range,
15855 vm_region_extended_info_t extended,
15856 boolean_t look_for_pages,
15857 mach_msg_type_number_t count)
15858 {
15859 struct vm_object *obj, *tmp_obj;
15860 vm_map_offset_t last_offset;
15861 int i;
15862 int ref_count;
15863 struct vm_object *shadow_object;
15864 unsigned short shadow_depth;
15865 boolean_t do_region_footprint;
15866 int effective_page_size, effective_page_shift;
15867 vm_map_offset_t effective_page_mask;
15868
15869 do_region_footprint = task_self_region_footprint();
15870
15871 if ((entry->is_sub_map) ||
15872 (VME_OBJECT(entry) == 0) ||
15873 (VME_OBJECT(entry)->phys_contiguous &&
15874 !entry->superpage_size)) {
15875 extended->share_mode = SM_EMPTY;
15876 extended->ref_count = 0;
15877 return;
15878 }
15879
15880 if (entry->superpage_size) {
15881 extended->shadow_depth = 0;
15882 extended->share_mode = SM_LARGE_PAGE;
15883 extended->ref_count = 1;
15884 extended->external_pager = 0;
15885
15886 /* TODO4K: Superpage in 4k mode? */
15887 extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
15888 extended->shadow_depth = 0;
15889 return;
15890 }
15891
15892 effective_page_shift = vm_self_region_page_shift(map);
15893 effective_page_size = (1 << effective_page_shift);
15894 effective_page_mask = effective_page_size - 1;
15895
15896 offset = vm_map_trunc_page(offset, effective_page_mask);
15897
15898 obj = VME_OBJECT(entry);
15899
15900 vm_object_lock(obj);
15901
15902 if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15903 obj->paging_in_progress) {
15904 ref_count--;
15905 }
15906
15907 if (look_for_pages) {
15908 for (last_offset = offset + range;
15909 offset < last_offset;
15910 offset += effective_page_size, va += effective_page_size) {
15911 if (do_region_footprint) {
15912 int disp;
15913
15914 disp = 0;
15915 if (map->has_corpse_footprint) {
15916 /*
15917 * Query the page info data we saved
15918 * while forking the corpse.
15919 */
15920 vm_map_corpse_footprint_query_page_info(
15921 map,
15922 va,
15923 &disp);
15924 } else {
15925 /*
15926 * Query the pmap.
15927 */
15928 vm_map_footprint_query_page_info(
15929 map,
15930 entry,
15931 va,
15932 &disp);
15933 }
15934 if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
15935 extended->pages_resident++;
15936 }
15937 if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
15938 extended->pages_reusable++;
15939 }
15940 if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
15941 extended->pages_dirtied++;
15942 }
15943 if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
15944 extended->pages_swapped_out++;
15945 }
15946 continue;
15947 }
15948
15949 vm_map_region_look_for_page(map, va, obj,
15950 vm_object_trunc_page(offset), ref_count,
15951 0, extended, count);
15952 }
15953
15954 if (do_region_footprint) {
15955 goto collect_object_info;
15956 }
15957 } else {
15958 collect_object_info:
15959 shadow_object = obj->shadow;
15960 shadow_depth = 0;
15961
15962 if (!(obj->internal)) {
15963 extended->external_pager = 1;
15964 }
15965
15966 if (shadow_object != VM_OBJECT_NULL) {
15967 vm_object_lock(shadow_object);
15968 for (;
15969 shadow_object != VM_OBJECT_NULL;
15970 shadow_depth++) {
15971 vm_object_t next_shadow;
15972
15973 if (!(shadow_object->internal)) {
15974 extended->external_pager = 1;
15975 }
15976
15977 next_shadow = shadow_object->shadow;
15978 if (next_shadow) {
15979 vm_object_lock(next_shadow);
15980 }
15981 vm_object_unlock(shadow_object);
15982 shadow_object = next_shadow;
15983 }
15984 }
15985 extended->shadow_depth = shadow_depth;
15986 }
15987
15988 if (extended->shadow_depth || entry->needs_copy) {
15989 extended->share_mode = SM_COW;
15990 } else {
15991 if (ref_count == 1) {
15992 extended->share_mode = SM_PRIVATE;
15993 } else {
15994 if (obj->true_share) {
15995 extended->share_mode = SM_TRUESHARED;
15996 } else {
15997 extended->share_mode = SM_SHARED;
15998 }
15999 }
16000 }
16001 extended->ref_count = ref_count - extended->shadow_depth;
16002
16003 for (i = 0; i < extended->shadow_depth; i++) {
16004 if ((tmp_obj = obj->shadow) == 0) {
16005 break;
16006 }
16007 vm_object_lock(tmp_obj);
16008 vm_object_unlock(obj);
16009
16010 if ((ref_count = os_ref_get_count_raw(&tmp_obj->ref_count)) > 1 &&
16011 tmp_obj->paging_in_progress) {
16012 ref_count--;
16013 }
16014
16015 extended->ref_count += ref_count;
16016 obj = tmp_obj;
16017 }
16018 vm_object_unlock(obj);
16019
16020 if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
16021 extended->share_mode = SM_PRIVATE;
16022 } else if (extended->share_mode == SM_SHARED && !(task_self_region_info_flags() & VM_REGION_INFO_FLAGS_NO_ALIASED)) {
16023 vm_map_entry_t cur;
16024 vm_map_entry_t last;
16025 int my_refs;
16026
16027 obj = VME_OBJECT(entry);
16028 last = vm_map_to_entry(map);
16029 my_refs = 0;
16030
16031 if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
16032 obj->paging_in_progress) {
16033 ref_count--;
16034 }
16035 for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
16036 if (vm_map_region_has_obj_ref(cur, obj)) {
16037 my_refs++;
16038 }
16039 }
16040
16041 if (my_refs == ref_count) {
16042 extended->share_mode = SM_PRIVATE_ALIASED;
16043 } else if (my_refs > 1) {
16044 extended->share_mode = SM_SHARED_ALIASED;
16045 }
16046 }
16047 }
16048
16049
16050 /* object is locked on entry and locked on return */
16051
16052
16053 static void
16054 vm_map_region_look_for_page(
16055 __unused vm_map_t map,
16056 __unused vm_map_offset_t va,
16057 vm_object_t object,
16058 vm_object_offset_t offset,
16059 int max_refcnt,
16060 unsigned short depth,
16061 vm_region_extended_info_t extended,
16062 mach_msg_type_number_t count)
16063 {
16064 vm_page_t p;
16065 vm_object_t shadow;
16066 int ref_count;
16067 vm_object_t caller_object;
16068
16069 shadow = object->shadow;
16070 caller_object = object;
16071
16072
16073 while (TRUE) {
16074 if (!(object->internal)) {
16075 extended->external_pager = 1;
16076 }
16077
16078 if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
16079 if (shadow && (max_refcnt == 1)) {
16080 extended->pages_shared_now_private++;
16081 }
16082
16083 if (!vm_page_is_fictitious(p) &&
16084 (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
16085 extended->pages_dirtied++;
16086 } else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
16087 if (p->vmp_reusable || object->all_reusable) {
16088 extended->pages_reusable++;
16089 }
16090 }
16091
16092 extended->pages_resident++;
16093
16094 if (object != caller_object) {
16095 vm_object_unlock(object);
16096 }
16097
16098 return;
16099 }
16100 if (object->internal &&
16101 object->alive &&
16102 !object->terminating &&
16103 object->pager_ready) {
16104 if (vm_object_compressor_pager_state_get(object, offset)
16105 == VM_EXTERNAL_STATE_EXISTS) {
16106 /* the pager has that page */
16107 extended->pages_swapped_out++;
16108 if (object != caller_object) {
16109 vm_object_unlock(object);
16110 }
16111 return;
16112 }
16113 }
16114
16115 if (shadow) {
16116 vm_object_lock(shadow);
16117 if ((ref_count = os_ref_get_count_raw(&shadow->ref_count)) > 1 &&
16118 shadow->paging_in_progress) {
16119 ref_count--;
16120 }
16121
16122 if (++depth > extended->shadow_depth) {
16123 extended->shadow_depth = depth;
16124 }
16125
16126 if (ref_count > max_refcnt) {
16127 max_refcnt = ref_count;
16128 }
16129
16130 if (object != caller_object) {
16131 vm_object_unlock(object);
16132 }
16133
16134 offset = offset + object->vo_shadow_offset;
16135 object = shadow;
16136 shadow = object->shadow;
16137 continue;
16138 }
16139 if (object != caller_object) {
16140 vm_object_unlock(object);
16141 }
16142 break;
16143 }
16144 }
16145
16146 static inline boolean_t
16147 vm_map_region_has_obj_ref(
16148 vm_map_entry_t entry,
16149 vm_object_t object)
16150 {
16151 vm_object_t cur_obj;
16152 vm_object_t shadow_obj;
16153
16154 if (entry->is_sub_map) {
16155 return FALSE;
16156 }
16157
16158 cur_obj = VME_OBJECT(entry);
16159 if (cur_obj == VM_OBJECT_NULL) {
16160 return FALSE;
16161 } else if (cur_obj == object) {
16162 return TRUE;
16163 }
16164
16165 /*
16166 * Avoid locks for first shadow check, otherwise diagnostic tools will
16167 * spend most of their time obtaining locks in this function when analyzing
16168 * processes with many VM entries which may commonly have no shadow chain.
16169 *
16170 * This is acceptable because:
16171 * - Shadow's fields are not accessed outside of its lock
16172 * - Objects are unlikely to be modified due to:
16173 * - Many diagnostic tools suspend the task
16174 * - VM map is locked
16175 * - The rare incorrect return from this function turns a guess into a
16176 * slightly worse guess
16177 * - Entire shadow chain is not locked as a whole, so can still change
16178 * while traversing, resulting in incorrect guess even with locking
16179 */
16180 shadow_obj = cur_obj->shadow;
16181 if (shadow_obj == VM_OBJECT_NULL) {
16182 return FALSE;
16183 } else if (shadow_obj == object) {
16184 return TRUE;
16185 }
16186
16187 vm_object_lock(cur_obj);
16188
16189 while ((shadow_obj = cur_obj->shadow)) {
16190 /* check if object was found before grabbing a lock */
16191 if (shadow_obj == object) {
16192 vm_object_unlock(cur_obj);
16193 return TRUE;
16194 }
16195
16196 vm_object_lock(shadow_obj);
16197 vm_object_unlock(cur_obj);
16198 cur_obj = shadow_obj;
16199 }
16200
16201 /* exhausted the shadow chain */
16202 vm_object_unlock(cur_obj);
16203 return FALSE;
16204 }
16205
16206
16207 /*
16208 * Routine: vm_map_simplify
16209 *
16210 * Description:
16211 * Attempt to simplify the map representation in
16212 * the vicinity of the given starting address.
16213 * Note:
16214 * This routine is intended primarily to keep the
16215 * kernel maps more compact -- they generally don't
16216 * benefit from the "expand a map entry" technology
16217 * at allocation time because the adjacent entry
16218 * is often wired down.
16219 */
16220 void
16221 vm_map_simplify_entry(
16222 vm_map_t map,
16223 vm_map_entry_t this_entry)
16224 {
16225 vm_map_entry_t prev_entry;
16226
16227 prev_entry = this_entry->vme_prev;
16228
16229 if ((this_entry != vm_map_to_entry(map)) &&
16230 (prev_entry != vm_map_to_entry(map)) &&
16231
16232 (prev_entry->vme_end == this_entry->vme_start) &&
16233
16234 (prev_entry->is_sub_map == this_entry->is_sub_map) &&
16235 (prev_entry->vme_object_value == this_entry->vme_object_value) &&
16236 (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
16237 ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
16238 prev_entry->vme_start))
16239 == VME_OFFSET(this_entry)) &&
16240
16241 (prev_entry->behavior == this_entry->behavior) &&
16242 (prev_entry->needs_copy == this_entry->needs_copy) &&
16243 (prev_entry->protection == this_entry->protection) &&
16244 (prev_entry->max_protection == this_entry->max_protection) &&
16245 (prev_entry->inheritance == this_entry->inheritance) &&
16246 (prev_entry->use_pmap == this_entry->use_pmap) &&
16247 (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
16248 (prev_entry->no_cache == this_entry->no_cache) &&
16249 (prev_entry->vme_permanent == this_entry->vme_permanent) &&
16250 (prev_entry->map_aligned == this_entry->map_aligned) &&
16251 (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
16252 (prev_entry->used_for_jit == this_entry->used_for_jit) &&
16253 #if __arm64e__
16254 (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
16255 #endif
16256 (prev_entry->csm_associated == this_entry->csm_associated) &&
16257 (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
16258 (prev_entry->iokit_acct == this_entry->iokit_acct) &&
16259 (prev_entry->vme_resilient_codesign ==
16260 this_entry->vme_resilient_codesign) &&
16261 (prev_entry->vme_resilient_media ==
16262 this_entry->vme_resilient_media) &&
16263 (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
16264 (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&
16265
16266 (prev_entry->wired_count == this_entry->wired_count) &&
16267 (prev_entry->user_wired_count == this_entry->user_wired_count) &&
16268
16269 ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
16270 (prev_entry->in_transition == FALSE) &&
16271 (this_entry->in_transition == FALSE) &&
16272 (prev_entry->needs_wakeup == FALSE) &&
16273 (this_entry->needs_wakeup == FALSE) &&
16274 (prev_entry->is_shared == this_entry->is_shared) &&
16275 (prev_entry->superpage_size == FALSE) &&
16276 (this_entry->superpage_size == FALSE)
16277 ) {
16278 if (prev_entry->vme_permanent) {
16279 assert(this_entry->vme_permanent);
16280 prev_entry->vme_permanent = false;
16281 }
16282 vm_map_store_entry_unlink(map, prev_entry, true);
16283 assert(prev_entry->vme_start < this_entry->vme_end);
16284 if (prev_entry->map_aligned) {
16285 assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
16286 VM_MAP_PAGE_MASK(map)));
16287 }
16288 this_entry->vme_start = prev_entry->vme_start;
16289 VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));
16290
16291 if (map->holelistenabled) {
16292 vm_map_store_update_first_free(map, this_entry, TRUE);
16293 }
16294
16295 if (prev_entry->is_sub_map) {
16296 vm_map_deallocate(VME_SUBMAP(prev_entry));
16297 } else {
16298 vm_object_deallocate(VME_OBJECT(prev_entry));
16299 }
16300 vm_map_entry_dispose(prev_entry);
16301 SAVE_HINT_MAP_WRITE(map, this_entry);
16302 }
16303 }
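
/*
 * Worked example (hypothetical addresses): two adjacent entries that
 * map the same object at contiguous offsets and agree on every
 * attribute checked above collapse into a single entry:
 *
 *	before:	[0x1000, 0x2000)  object X, offset 0
 *		[0x2000, 0x3000)  object X, offset 0x1000
 *	after:	[0x1000, 0x3000)  object X, offset 0
 *
 * A single mismatch -- a different alias tag, wired count, protection,
 * inheritance, etc. -- keeps the entries separate.
 */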
16304
16305 void
16306 vm_map_simplify(
16307 vm_map_t map,
16308 vm_map_offset_t start)
16309 {
16310 vm_map_entry_t this_entry;
16311
16312 vm_map_lock(map);
16313 if (vm_map_lookup_entry(map, start, &this_entry)) {
16314 vm_map_simplify_entry(map, this_entry);
16315 vm_map_simplify_entry(map, this_entry->vme_next);
16316 }
16317 vm_map_unlock(map);
16318 }
16319
16320 static void
16321 vm_map_simplify_range(
16322 vm_map_t map,
16323 vm_map_offset_t start,
16324 vm_map_offset_t end)
16325 {
16326 vm_map_entry_t entry;
16327
16328 /*
16329 * The map should be locked (for "write") by the caller.
16330 */
16331
16332 if (start >= end) {
16333 /* invalid address range */
16334 return;
16335 }
16336
16337 start = vm_map_trunc_page(start,
16338 VM_MAP_PAGE_MASK(map));
16339 end = vm_map_round_page(end,
16340 VM_MAP_PAGE_MASK(map));
16341
16342 if (!vm_map_lookup_entry(map, start, &entry)) {
16343 /* "start" is not mapped and "entry" ends before "start" */
16344 if (entry == vm_map_to_entry(map)) {
16345 /* start with first entry in the map */
16346 entry = vm_map_first_entry(map);
16347 } else {
16348 /* start with next entry */
16349 entry = entry->vme_next;
16350 }
16351 }
16352
16353 while (entry != vm_map_to_entry(map) &&
16354 entry->vme_start <= end) {
16355 /* try and coalesce "entry" with its previous entry */
16356 vm_map_simplify_entry(map, entry);
16357 entry = entry->vme_next;
16358 }
16359 }
16360
16361 static __attribute__((always_inline, warn_unused_result))
16362 kern_return_t
16363 vm_map_machine_attribute_sanitize(
16364 vm_map_t map,
16365 vm_map_offset_ut start_u,
16366 vm_map_offset_ut end_u,
16367 mach_vm_offset_t *start,
16368 mach_vm_offset_t *end,
16369 vm_map_size_t *size)
16370 {
16371 return vm_sanitize_addr_end(start_u, end_u,
16372 VM_SANITIZE_CALLER_VM_MAP_MACHINE_ATTRIBUTE, map,
16373 VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
16374 size);
16375 }
16376
16377
16378 /*
16379 * Routine: vm_map_machine_attribute
16380 * Purpose:
16381 * Provide machine-specific attributes to mappings,
16382 * such as cacheability, etc., for machines that provide
16383 * them. NUMA architectures and machines with big/strange
16384 * caches will use this.
16385 * Note:
16386 * Responsibilities for locking and checking are handled here,
16387 * everything else in the pmap module. If any non-volatile
16388 * information must be kept, the pmap module should handle
16389 * it itself. [This assumes that attributes do not
16390 * need to be inherited, which seems ok to me]
16391 */
16392 kern_return_t
16393 vm_map_machine_attribute(
16394 vm_map_t map,
16395 vm_map_offset_ut start_u,
16396 vm_map_offset_ut end_u,
16397 vm_machine_attribute_t attribute,
16398 vm_machine_attribute_val_t *value) /* IN/OUT */
16399 {
16400 mach_vm_offset_t start, end;
16401 vm_map_size_t sync_size;
16402 kern_return_t ret;
16403 vm_map_entry_t entry;
16404
16405 ret = vm_map_machine_attribute_sanitize(map,
16406 start_u,
16407 end_u,
16408 &start,
16409 &end,
16410 &sync_size);
16411 if (__improbable(ret != KERN_SUCCESS)) {
16412 return vm_sanitize_get_kr(ret);
16413 }
16414
16415 if (start < vm_map_min(map) || end > vm_map_max(map)) {
16416 return KERN_INVALID_ADDRESS;
16417 }
16418
16419 vm_map_lock(map);
16420
16421 if (attribute != MATTR_CACHE) {
16422 /* If we don't have to find physical addresses, we */
16423 /* don't have to do an explicit traversal here. */
16424 ret = pmap_attribute(map->pmap, start, end - start,
16425 attribute, value);
16426 vm_map_unlock(map);
16427 return ret;
16428 }
16429
16430 ret = KERN_SUCCESS; /* Assume it all worked */
16431
16432 while (sync_size) {
16433 if (vm_map_lookup_entry(map, start, &entry)) {
16434 vm_map_size_t sub_size;
16435 if ((entry->vme_end - start) > sync_size) {
16436 sub_size = sync_size;
16437 sync_size = 0;
16438 } else {
16439 sub_size = entry->vme_end - start;
16440 sync_size -= sub_size;
16441 }
16442 if (entry->is_sub_map) {
16443 vm_map_offset_t sub_start;
16444 vm_map_offset_t sub_end;
16445
16446 sub_start = (start - entry->vme_start)
16447 + VME_OFFSET(entry);
16448 sub_end = sub_start + sub_size;
16449 vm_map_machine_attribute(
16450 VME_SUBMAP(entry),
16451 sub_start,
16452 sub_end,
16453 attribute, value);
16454 } else if (VME_OBJECT(entry)) {
16455 vm_page_t m;
16456 vm_object_t object;
16457 vm_object_t base_object;
16458 vm_object_t last_object;
16459 vm_object_offset_t offset;
16460 vm_object_offset_t base_offset;
16461 vm_map_size_t range;
16462 range = sub_size;
16463 offset = (start - entry->vme_start)
16464 + VME_OFFSET(entry);
16465 offset = vm_object_trunc_page(offset);
16466 base_offset = offset;
16467 object = VME_OBJECT(entry);
16468 base_object = object;
16469 last_object = NULL;
16470
16471 vm_object_lock(object);
16472
16473 while (range) {
16474 m = vm_page_lookup(
16475 object, offset);
16476
16477 if (m && !vm_page_is_fictitious(m)) {
16478 ret =
16479 pmap_attribute_cache_sync(
16480 VM_PAGE_GET_PHYS_PAGE(m),
16481 PAGE_SIZE,
16482 attribute, value);
16483 } else if (object->shadow) {
16484 offset = offset + object->vo_shadow_offset;
16485 last_object = object;
16486 object = object->shadow;
16487 vm_object_lock(last_object->shadow);
16488 vm_object_unlock(last_object);
16489 continue;
16490 }
16491 if (range < PAGE_SIZE) {
16492 range = 0;
16493 } else {
16494 range -= PAGE_SIZE;
16495 }
16496
16497 if (base_object != object) {
16498 vm_object_unlock(object);
16499 vm_object_lock(base_object);
16500 object = base_object;
16501 }
16502 /* Bump to the next page */
16503 base_offset += PAGE_SIZE;
16504 offset = base_offset;
16505 }
16506 vm_object_unlock(object);
16507 }
16508 start += sub_size;
16509 } else {
16510 vm_map_unlock(map);
16511 return KERN_FAILURE;
16512 }
16513 }
16514
16515 vm_map_unlock(map);
16516
16517 return ret;
16518 }
16519
16520 /*
16521 * vm_map_behavior_set:
16522 *
16523 * Sets the paging reference behavior of the specified address
16524 * range in the target map. Paging reference behavior affects
16525 * how pagein operations resulting from faults on the map will be
16526 * clustered.
16527 */
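/*
 * Illustrative sketch (not part of the original source): user code
 * typically reaches this routine indirectly, e.g. through madvise(2)
 * or mach_vm_behavior_set(). The translation of advice values to
 * VM_BEHAVIOR_* constants lives outside this file, so the mapping
 * shown here is an assumption for illustration only:
 *
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
 *	madvise(buf, len, MADV_SEQUENTIAL);	// ~ VM_BEHAVIOR_SEQUENTIAL
 *	madvise(buf, len, MADV_WILLNEED);	// ~ VM_BEHAVIOR_WILLNEED
 *	madvise(buf, len, MADV_FREE);		// ~ VM_BEHAVIOR_FREE
 */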
16528 kern_return_t
16529 vm_map_behavior_set(
16530 vm_map_t map,
16531 vm_map_offset_t start,
16532 vm_map_offset_t end,
16533 vm_behavior_t new_behavior)
16534 {
16535 vm_map_entry_t entry;
16536 vm_map_entry_t temp_entry;
16537
16538 if (start > end ||
16539 start < vm_map_min(map) ||
16540 end > vm_map_max(map)) {
16541 return KERN_NO_SPACE;
16542 }
16543 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16544 return KERN_INVALID_ADDRESS;
16545 }
16546
16547 switch (new_behavior) {
16548 /*
16549 * The behaviors in this first block all set a persistent state on the specified
16550 * memory range. All we have to do here is to record the desired behavior
16551 * in the vm_map_entry_t's.
16552 */
16553
16554 case VM_BEHAVIOR_DEFAULT:
16555 case VM_BEHAVIOR_RANDOM:
16556 case VM_BEHAVIOR_SEQUENTIAL:
16557 case VM_BEHAVIOR_RSEQNTL:
16558 case VM_BEHAVIOR_ZERO_WIRED_PAGES:
16559 vm_map_lock(map);
16560
16561 /*
16562 * The entire address range must be valid for the map.
16563 * Note that vm_map_range_check() does a
16564 * vm_map_lookup_entry() internally and returns the
16565 * entry containing the start of the address range if
16566 * the entire range is valid.
16567 */
16568 if (vm_map_range_check(map, start, end, &temp_entry)) {
16569 entry = temp_entry;
16570 vm_map_clip_start(map, entry, start);
16571 } else {
16572 vm_map_unlock(map);
16573 return KERN_INVALID_ADDRESS;
16574 }
16575
16576 if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16577 /* zeroing requires write access */
16578 temp_entry = entry;
16579 for (;
16580 entry != vm_map_to_entry(map) && (entry->vme_start < end);
16581 entry = entry->vme_next) {
16582 if (!(entry->protection & VM_PROT_WRITE) ||
16583 #if __arm64e__
16584 entry->used_for_tpro ||
16585 #endif /* __arm64e__ */
16586 entry->used_for_jit) {
16587 vm_map_unlock(map);
16588 return KERN_PROTECTION_FAILURE;
16589 }
16590 }
16591 entry = temp_entry;
16592 }
16593
16594 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
16595 vm_map_clip_end(map, entry, end);
16596 if (entry->is_sub_map) {
16597 assert(!entry->use_pmap);
16598 }
16599
16600 if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16601 assert(entry->protection & VM_PROT_WRITE);
16602 #if __arm64e__
16603 assert(!entry->used_for_tpro);
16604 #endif /* __arm64e__ */
16605 assert(!entry->used_for_jit);
16606 entry->zero_wired_pages = TRUE;
16607 } else {
16608 entry->behavior = new_behavior;
16609 }
16610 entry = entry->vme_next;
16611 }
16612
16613 vm_map_unlock(map);
16614 break;
16615
16616 /*
16617 * The rest of these are different from the above in that they cause
16618 * an immediate action to take place as opposed to setting a behavior that
16619 * affects future actions.
16620 */
16621
16622 case VM_BEHAVIOR_WILLNEED:
16623 return vm_map_willneed(map, start, end);
16624
16625 case VM_BEHAVIOR_DONTNEED:
16626 return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
16627
16628 case VM_BEHAVIOR_FREE:
16629 return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
16630
16631 case VM_BEHAVIOR_REUSABLE:
16632 return vm_map_reusable_pages(map, start, end);
16633
16634 case VM_BEHAVIOR_REUSE:
16635 return vm_map_reuse_pages(map, start, end);
16636
16637 case VM_BEHAVIOR_CAN_REUSE:
16638 return vm_map_can_reuse(map, start, end);
16639
16640 #if MACH_ASSERT
16641 case VM_BEHAVIOR_PAGEOUT:
16642 return vm_map_pageout(map, start, end);
16643 #endif /* MACH_ASSERT */
16644
16645 case VM_BEHAVIOR_ZERO:
16646 return vm_map_zero(map, start, end);
16647
16648 default:
16649 return KERN_INVALID_ARGUMENT;
16650 }
16651
16652 return KERN_SUCCESS;
16653 }
16654
16655
16656 /*
16657 * Internals for madvise(MADV_WILLNEED) system call.
16658 *
16659 * The implementation does one of the following:
16660 * a) read-ahead, if the mapping corresponds to a mapped regular file
16661 * b) fault in the pages (zero-fill, decompress, etc.), if it's an anonymous mapping
16662 */
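/*
 * Illustrative sketch (assumption, not part of the original source):
 * a user-space caller would typically trigger this path with
 *
 *	madvise(addr, len, MADV_WILLNEED);
 *
 * The loop below then pre-faults each page: file-backed ranges get
 * read-ahead, while anonymous ranges are zero-filled or decompressed
 * ahead of first use.
 */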
16663 static kern_return_t
16664 vm_map_willneed(
16665 vm_map_t map,
16666 vm_map_offset_t start,
16667 vm_map_offset_t end
16668 )
16669 {
16670 vm_map_entry_t entry;
16671 kern_return_t kr;
16672 vm_object_size_t len;
16673 vm_size_t region_size;
16674
16675 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_START,
16676 start, end);
16677 struct vm_object_fault_info fault_info = {
16678 .interruptible = THREAD_UNINT,
16679 .behavior = VM_BEHAVIOR_SEQUENTIAL,
16680 /* Do not activate pages after faulting */
16681 .stealth = true,
16682 /* Don't wait for busy pages */
16683 .fi_no_sleep = true,
16684 };
16685
16686 /*
16687 * The MADV_WILLNEED operation doesn't require any changes to the
16688 * vm_map_entry_t's, so the read lock is sufficient.
16689 */
16690
16691 vm_map_lock_read(map);
16692
16693 /*
16694 * The madvise semantics require that the address range be fully
16695 * allocated with no holes. Otherwise, we're required to return
16696 * an error.
16697 */
16698
16699 if (!vm_map_range_check(map, start, end, &entry)) {
16700 vm_map_unlock_read(map);
16701 kr = KERN_INVALID_ADDRESS;
16702 goto done;
16703 }
16704
16705 /*
16706 * Examine each vm_map_entry_t in the range.
16707 */
16708 while (start < end) {
16709 /*
16710 * Set the length so we don't go beyond the end of the
16711 * map_entry or beyond the end of the range we were given.
16712 * This range could also span multiple map entries, all of which
16713 * map different files, so make sure we only do the right amount
16714 * of I/O for each object. Note that it's possible for there
16715 * to be multiple map entries all referring to the same object
16716 * but with different page permissions, but it's not worth
16717 * trying to optimize that case.
16718 */
16719 len = MIN(entry->vme_end - start, end - start);
16720
16721 vm_map_offset_t addr = start;
16722
16723 vm_size_t effective_page_mask = MIN(vm_map_page_mask(map), PAGE_MASK);
16724 vm_map_offset_t effective_page_size = effective_page_mask + 1;
16725
16726 /*
16727 * Write-fault if the entry supports it to preclude subsequent soft-faults
16728 */
16729 vm_prot_t fault_prot = entry->protection & VM_PROT_WRITE ?
16730 VM_PROT_WRITE : VM_PROT_READ;
16731
16732 vm_map_unlock_read(map);
16733
16734 region_size = len;
16735 while (region_size) {
16736 /*
16737 * Provide a hint for how much clustering we would like. Note that
16738 * each individual fault will limit the size of each request to
16739 * MAX_UPL_TRANSFER_BYTES.
16740 */
16741 fault_info.cluster_size = region_size;
16742 kr = vm_pre_fault_with_info(
16743 map,
16744 vm_map_trunc_page(addr, effective_page_mask),
16745 fault_prot,
16746 &fault_info);
16747 if (kr == KERN_ALREADY_WAITING) {
16748 /*
16749 * The page is busy being faulted/paged by another thread.
16750 */
16751 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_NONE,
16752 task_pid(current_task()), addr, kr);
16753 kr = KERN_SUCCESS;
16754 } else if (kr != KERN_SUCCESS) {
16755 goto done;
16756 }
16757 region_size -= effective_page_size;
16758 addr += effective_page_size;
16759 }
16760
16761 start += len;
16762 if (start >= end) {
16763 kr = KERN_SUCCESS;
16764 goto done;
16765 }
16766
16767 if (thread_should_abort(current_thread())) {
16768 kr = KERN_ABORTED;
16769 goto done;
16770 }
16771
16772 /* look up next entry */
16773 vm_map_lock_read(map);
16774 if (!vm_map_lookup_entry(map, start, &entry)) {
16775 /*
16776 * There's a new hole in the address range.
16777 */
16778 vm_map_unlock_read(map);
16779 kr = KERN_INVALID_ADDRESS;
16780 goto done;
16781 }
16782 }
16783
16784 vm_map_unlock_read(map);
16785 done:
16786 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16787 start, kr);
16788 return kr;
16789 }
16790
16791 static boolean_t
16792 vm_map_entry_is_reusable(
16793 vm_map_entry_t entry)
16794 {
16795 /* Only user map entries */
16796
16797 vm_object_t object;
16798
16799 if (entry->is_sub_map) {
16800 return FALSE;
16801 }
16802
16803 switch (VME_ALIAS(entry)) {
16804 case VM_MEMORY_MALLOC:
16805 case VM_MEMORY_MALLOC_SMALL:
16806 case VM_MEMORY_MALLOC_LARGE:
16807 case VM_MEMORY_REALLOC:
16808 case VM_MEMORY_MALLOC_TINY:
16809 case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16810 case VM_MEMORY_MALLOC_LARGE_REUSED:
16811 /*
16812 * This is a malloc() memory region: check if it's still
16813 * in its original state and can be re-used for more
16814 * malloc() allocations.
16815 */
16816 break;
16817 default:
16818 /*
16819 * Not a malloc() memory region: let the caller decide if
16820 * it's re-usable.
16821 */
16822 return TRUE;
16823 }
16824
16825 if (/*entry->is_shared ||*/
16826 entry->is_sub_map ||
16827 entry->in_transition ||
16828 entry->protection != VM_PROT_DEFAULT ||
16829 entry->max_protection != VM_PROT_ALL ||
16830 entry->inheritance != VM_INHERIT_DEFAULT ||
16831 entry->no_cache ||
16832 entry->vme_permanent ||
16833 entry->superpage_size != FALSE ||
16834 entry->zero_wired_pages ||
16835 entry->wired_count != 0 ||
16836 entry->user_wired_count != 0) {
16837 return FALSE;
16838 }
16839
16840 object = VME_OBJECT(entry);
16841 if (object == VM_OBJECT_NULL) {
16842 return TRUE;
16843 }
16844 if (
16845 #if 0
16846 /*
16847 * Let's proceed even if the VM object is potentially
16848 * shared.
16849 * We check for this later when processing the actual
16850 * VM pages, so the contents will be safe if shared.
16851 *
16852 * But we can still mark this memory region as "reusable" to
16853 * acknowledge that the caller did let us know that the memory
16854 * could be re-used and should not be penalized for holding
16855 * on to it. This allows its "resident size" to not include
16856 * the reusable range.
16857 */
16858 object->ref_count == 1 &&
16859 #endif
16860 object->vo_copy == VM_OBJECT_NULL &&
16861 object->shadow == VM_OBJECT_NULL &&
16862 object->internal &&
16863 object->purgable == VM_PURGABLE_DENY &&
16864 HAS_DEFAULT_CACHEABILITY(object->wimg_bits & VM_WIMG_MASK) &&
16865 !object->code_signed) {
16866 return TRUE;
16867 }
16868 return FALSE;
16869 }
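/*
 * Illustrative note (assumption): a typical entry that passes all of the
 * checks above is a plain anonymous malloc() region with
 * VM_PROT_DEFAULT/VM_PROT_ALL protections, default inheritance, no
 * wiring and no superpages, backed by an internal, non-shadowed,
 * non-purgeable object. Anything more exotic falls out through one of
 * the FALSE cases.
 */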
16870
16871 static kern_return_t
16872 vm_map_reuse_pages(
16873 vm_map_t map,
16874 vm_map_offset_t start,
16875 vm_map_offset_t end)
16876 {
16877 vm_map_entry_t entry;
16878 vm_object_t object;
16879 vm_object_offset_t start_offset, end_offset;
16880
16881 /*
16882 * The MADV_REUSE operation doesn't require any changes to the
16883 * vm_map_entry_t's, so the read lock is sufficient.
16884 */
16885
16886 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16887 /*
16888 * XXX TODO4K
16889 * need to figure out what reusable means for a
16890 * portion of a native page.
16891 */
16892 return KERN_SUCCESS;
16893 }
16894
16895 vm_map_lock_read(map);
16896 assert(map->pmap != kernel_pmap); /* protect alias access */
16897
16898 /*
16899 * The madvise semantics require that the address range be fully
16900 * allocated with no holes. Otherwise, we're required to return
16901 * an error.
16902 */
16903
16904 if (!vm_map_range_check(map, start, end, &entry)) {
16905 vm_map_unlock_read(map);
16906 vm_page_stats_reusable.reuse_pages_failure++;
16907 return KERN_INVALID_ADDRESS;
16908 }
16909
16910 /*
16911 * Examine each vm_map_entry_t in the range.
16912 */
16913 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16914 entry = entry->vme_next) {
16915 /*
16916 * Sanity check on the VM map entry.
16917 */
16918 if (!vm_map_entry_is_reusable(entry)) {
16919 vm_map_unlock_read(map);
16920 vm_page_stats_reusable.reuse_pages_failure++;
16921 return KERN_INVALID_ADDRESS;
16922 }
16923
16924 /*
16925 * The first time through, the start address could be anywhere
16926 * within the vm_map_entry we found. So adjust the offset to
16927 * correspond.
16928 */
16929 if (entry->vme_start < start) {
16930 start_offset = start - entry->vme_start;
16931 } else {
16932 start_offset = 0;
16933 }
16934 end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16935 start_offset += VME_OFFSET(entry);
16936 end_offset += VME_OFFSET(entry);
16937
16938 object = VME_OBJECT(entry);
16939 if (object != VM_OBJECT_NULL) {
16940 vm_object_lock(object);
16941 vm_object_reuse_pages(object, start_offset, end_offset,
16942 TRUE);
16943 vm_object_unlock(object);
16944 }
16945
16946 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
16947 /*
16948 * XXX
16949 * We do not hold the VM map exclusively here.
16950 * The "alias" field is not that critical, so it's
16951 * safe to update it here, as long as it is the only
16952 * one that can be modified while holding the VM map
16953 * "shared".
16954 */
16955 VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
16956 }
16957 }
16958
16959 vm_map_unlock_read(map);
16960 vm_page_stats_reusable.reuse_pages_success++;
16961 return KERN_SUCCESS;
16962 }
16963
16964
16965 static kern_return_t
16966 vm_map_reusable_pages(
16967 vm_map_t map,
16968 vm_map_offset_t start,
16969 vm_map_offset_t end)
16970 {
16971 vm_map_entry_t entry;
16972 vm_object_t object;
16973 vm_object_offset_t start_offset, end_offset;
16974 vm_map_offset_t pmap_offset;
16975
16976 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16977 /*
16978 * XXX TODO4K
16979 * need to figure out what reusable means for a portion
16980 * of a native page.
16981 */
16982 return KERN_SUCCESS;
16983 }
16984
16985 /*
16986 * The MADV_REUSABLE operation doesn't require any changes to the
16987 * vm_map_entry_t's, so the read lock is sufficient.
16988 */
16989
16990 vm_map_lock_read(map);
16991 assert(map->pmap != kernel_pmap); /* protect alias access */
16992
16993 /*
16994 * The madvise semantics require that the address range be fully
16995 * allocated with no holes. Otherwise, we're required to return
16996 * an error.
16997 */
16998
16999 if (!vm_map_range_check(map, start, end, &entry)) {
17000 vm_map_unlock_read(map);
17001 vm_page_stats_reusable.reusable_pages_failure++;
17002 return KERN_INVALID_ADDRESS;
17003 }
17004
17005 /*
17006 * Examine each vm_map_entry_t in the range.
17007 */
17008 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17009 entry = entry->vme_next) {
17010 int kill_pages = 0;
17011 boolean_t kill_no_write = FALSE;
17012
17013 /*
17014 * Sanity check on the VM map entry.
17015 */
17016 if (!vm_map_entry_is_reusable(entry)) {
17017 vm_map_unlock_read(map);
17018 vm_page_stats_reusable.reusable_pages_failure++;
17019 return KERN_INVALID_ADDRESS;
17020 }
17021
17022 if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
17023 #if __arm64e__
17024 && !entry->used_for_tpro
17025 #endif
17026 ) {
17027 /* not writable: can't discard contents */
17028 vm_map_unlock_read(map);
17029 vm_page_stats_reusable.reusable_nonwritable++;
17030 vm_page_stats_reusable.reusable_pages_failure++;
17031 return KERN_PROTECTION_FAILURE;
17032 }
17033
17034 /*
17035 * The first time through, the start address could be anywhere
17036 * within the vm_map_entry we found. So adjust the offset to
17037 * correspond.
17038 */
17039 if (entry->vme_start < start) {
17040 start_offset = start - entry->vme_start;
17041 pmap_offset = start;
17042 } else {
17043 start_offset = 0;
17044 pmap_offset = entry->vme_start;
17045 }
17046 end_offset = MIN(end, entry->vme_end) - entry->vme_start;
17047 start_offset += VME_OFFSET(entry);
17048 end_offset += VME_OFFSET(entry);
17049
17050 object = VME_OBJECT(entry);
17051 if (object == VM_OBJECT_NULL) {
17052 continue;
17053 }
17054
17055 if ((entry->protection & VM_PROT_EXECUTE) ||
17056 entry->vme_xnu_user_debug) {
17057 /*
17058 * Executable or user debug pages might be write-protected by
17059 * hardware, so do not attempt to write to these pages.
17060 */
17061 kill_no_write = TRUE;
17062 }
17063
17064 vm_object_lock(object);
17065 if (((os_ref_get_count_raw(&object->ref_count) == 1) ||
17066 (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
17067 object->vo_copy == VM_OBJECT_NULL)) &&
17068 object->shadow == VM_OBJECT_NULL &&
17069 /*
17070 * "iokit_acct" entries are billed for their virtual size
17071 * (rather than for their resident pages only), so they
17072 * wouldn't benefit from making pages reusable, and it
17073 * would be hard to keep track of pages that are both
17074 * "iokit_acct" and "reusable" in the pmap stats and
17075 * ledgers.
17076 */
17077 !(entry->iokit_acct ||
17078 (!entry->is_sub_map && !entry->use_pmap))) {
17079 if (os_ref_get_count_raw(&object->ref_count) != 1) {
17080 vm_page_stats_reusable.reusable_shared++;
17081 }
17082 kill_pages = 1;
17083 } else {
17084 kill_pages = -1;
17085 }
17086 if (kill_pages != -1) {
17087 vm_object_deactivate_pages(object,
17088 start_offset,
17089 end_offset - start_offset,
17090 kill_pages,
17091 TRUE /*reusable_pages*/,
17092 kill_no_write,
17093 map->pmap,
17094 pmap_offset);
17095 } else {
17096 vm_page_stats_reusable.reusable_pages_shared++;
17097 DTRACE_VM4(vm_map_reusable_pages_shared,
17098 unsigned int, VME_ALIAS(entry),
17099 vm_map_t, map,
17100 vm_map_entry_t, entry,
17101 vm_object_t, object);
17102 }
17103 vm_object_unlock(object);
17104
17105 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
17106 VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
17107 /*
17108 * XXX
17109 * We do not hold the VM map exclusively here.
17110 * The "alias" field is not that critical, so it's
17111 * safe to update it here, as long as it is the only
17112 * one that can be modified while holding the VM map
17113 * "shared".
17114 */
17115 VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
17116 }
17117 }
17118
17119 vm_map_unlock_read(map);
17120 vm_page_stats_reusable.reusable_pages_success++;
17121 return KERN_SUCCESS;
17122 }
17123
17124
17125 static kern_return_t
17126 vm_map_can_reuse(
17127 vm_map_t map,
17128 vm_map_offset_t start,
17129 vm_map_offset_t end)
17130 {
17131 vm_map_entry_t entry;
17132
17133 /*
17134 * The MADV_REUSABLE operation doesn't require any changes to the
17135 * vm_map_entry_t's, so the read lock is sufficient.
17136 */
17137
17138 vm_map_lock_read(map);
17139 assert(map->pmap != kernel_pmap); /* protect alias access */
17140
17141 /*
17142 * The madvise semantics require that the address range be fully
17143 * allocated with no holes. Otherwise, we're required to return
17144 * an error.
17145 */
17146
17147 if (!vm_map_range_check(map, start, end, &entry)) {
17148 vm_map_unlock_read(map);
17149 vm_page_stats_reusable.can_reuse_failure++;
17150 return KERN_INVALID_ADDRESS;
17151 }
17152
17153 /*
17154 * Examine each vm_map_entry_t in the range.
17155 */
17156 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17157 entry = entry->vme_next) {
17158 /*
17159 * Sanity check on the VM map entry.
17160 */
17161 if (!vm_map_entry_is_reusable(entry)) {
17162 vm_map_unlock_read(map);
17163 vm_page_stats_reusable.can_reuse_failure++;
17164 return KERN_INVALID_ADDRESS;
17165 }
17166 }
17167
17168 vm_map_unlock_read(map);
17169 vm_page_stats_reusable.can_reuse_success++;
17170 return KERN_SUCCESS;
17171 }
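/*
 * Illustrative sketch (assumption, not part of the original source):
 * the three routines above are the kernel side of Darwin's reusable
 * madvise variants; a malloc-style allocator would typically use them as
 *
 *	madvise(chunk, size, MADV_FREE_REUSABLE);  // vm_map_reusable_pages()
 *	...
 *	madvise(chunk, size, MADV_FREE_REUSE);     // vm_map_reuse_pages()
 *
 * with MADV_CAN_REUSE (vm_map_can_reuse()) as a cheap preflight check.
 */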
17172
17173
17174 #if MACH_ASSERT
17175 static kern_return_t
17176 vm_map_pageout(
17177 vm_map_t map,
17178 vm_map_offset_t start,
17179 vm_map_offset_t end)
17180 {
17181 vm_map_entry_t entry;
17182
17183 /*
17184 * The MADV_PAGEOUT operation doesn't require any changes to the
17185 * vm_map_entry_t's, so the read lock is sufficient.
17186 */
17187
17188 vm_map_lock_read(map);
17189
17190 /*
17191 * The madvise semantics require that the address range be fully
17192 * allocated with no holes. Otherwise, we're required to return
17193 * an error.
17194 */
17195
17196 if (!vm_map_range_check(map, start, end, &entry)) {
17197 vm_map_unlock_read(map);
17198 return KERN_INVALID_ADDRESS;
17199 }
17200
17201 /*
17202 * Examine each vm_map_entry_t in the range.
17203 */
17204 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17205 entry = entry->vme_next) {
17206 vm_object_t object;
17207
17208 /*
17209 * Sanity check on the VM map entry.
17210 */
17211 if (entry->is_sub_map) {
17212 vm_map_t submap;
17213 vm_map_offset_t submap_start;
17214 vm_map_offset_t submap_end;
17215 vm_map_entry_t submap_entry;
17216
17217 submap = VME_SUBMAP(entry);
17218 submap_start = VME_OFFSET(entry);
17219 submap_end = submap_start + (entry->vme_end -
17220 entry->vme_start);
17221
17222 vm_map_lock_read(submap);
17223
17224 if (!vm_map_range_check(submap,
17225 submap_start,
17226 submap_end,
17227 &submap_entry)) {
17228 vm_map_unlock_read(submap);
17229 vm_map_unlock_read(map);
17230 return KERN_INVALID_ADDRESS;
17231 }
17232
17233 if (submap_entry->is_sub_map) {
17234 vm_map_unlock_read(submap);
17235 continue;
17236 }
17237
17238 object = VME_OBJECT(submap_entry);
17239 if (object == VM_OBJECT_NULL || !object->internal) {
17240 vm_map_unlock_read(submap);
17241 continue;
17242 }
17243
17244 vm_object_pageout(object);
17245
17246 vm_map_unlock_read(submap);
17247 submap = VM_MAP_NULL;
17248 submap_entry = VM_MAP_ENTRY_NULL;
17249 continue;
17250 }
17251
17252 object = VME_OBJECT(entry);
17253 if (object == VM_OBJECT_NULL || !object->internal) {
17254 continue;
17255 }
17256
17257 vm_object_pageout(object);
17258 }
17259
17260 vm_map_unlock_read(map);
17261 return KERN_SUCCESS;
17262 }
17263 #endif /* MACH_ASSERT */
17264
17265 /*
17266 * This function determines if the zero operation can be run on the
17267 * respective entry. Additional checks on the object are in
17268 * vm_object_zero_preflight.
17269 */
17270 static kern_return_t
17271 vm_map_zero_entry_preflight(vm_map_entry_t entry)
17272 {
17273 /*
17274 * Zeroing is restricted to writable non-executable entries and non-JIT
17275 * regions.
17276 */
17277 if (!(entry->protection & VM_PROT_WRITE) ||
17278 (entry->protection & VM_PROT_EXECUTE) ||
17279 entry->used_for_jit ||
17280 entry->vme_xnu_user_debug) {
17281 return KERN_PROTECTION_FAILURE;
17282 }
17283
17284 /*
17285 * Zeroing for copy on write isn't yet supported. Zeroing is also not
17286 * allowed for submaps.
17287 */
17288 if (entry->needs_copy || entry->is_sub_map) {
17289 return KERN_NO_ACCESS;
17290 }
17291
17292 return KERN_SUCCESS;
17293 }
17294
17295 /*
17296 * This function translates entry's start and end to offsets in the object
17297 */
17298 static void
17299 vm_map_get_bounds_in_object(
17300 vm_map_entry_t entry,
17301 vm_map_offset_t start,
17302 vm_map_offset_t end,
17303 vm_map_offset_t *start_offset,
17304 vm_map_offset_t *end_offset)
17305 {
17306 if (entry->vme_start < start) {
17307 *start_offset = start - entry->vme_start;
17308 } else {
17309 *start_offset = 0;
17310 }
17311 *end_offset = MIN(end, entry->vme_end) - entry->vme_start;
17312 *start_offset += VME_OFFSET(entry);
17313 *end_offset += VME_OFFSET(entry);
17314 }
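/*
 * Worked example (illustrative values only): for an entry covering
 * [0x10000, 0x20000) with VME_OFFSET(entry) == 0x5000, a request for
 * [0x12000, 0x30000) yields
 *
 *	*start_offset = (0x12000 - 0x10000) + 0x5000 = 0x7000
 *	*end_offset   = (0x20000 - 0x10000) + 0x5000 = 0x15000
 *
 * i.e. the clipped range expressed as offsets into the backing object.
 */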
17315
17316 /*
17317 * This function iterates through the entries in the requested range
17318 * and zeroes any resident pages in the corresponding objects. Compressed
17319 * pages are dropped instead of being faulted in and zeroed.
17320 */
17321 static kern_return_t
17322 vm_map_zero(
17323 vm_map_t map,
17324 vm_map_offset_t start,
17325 vm_map_offset_t end)
17326 {
17327 vm_map_entry_t entry;
17328 vm_map_offset_t cur = start;
17329 kern_return_t ret;
17330
17331 /*
17332 * This operation isn't supported where the map page size is less than
17333 * the hardware page size. Caller will need to handle error and
17334 * explicitly zero memory if needed.
17335 */
17336 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17337 return KERN_NO_ACCESS;
17338 }
17339
17340 /*
17341 * The MADV_ZERO operation doesn't require any changes to the
17342 * vm_map_entry_t's, so the read lock is sufficient.
17343 */
17344 vm_map_lock_read(map);
17345 assert(map->pmap != kernel_pmap); /* protect alias access */
17346
17347 /*
17348 * The madvise semantics require that the address range be fully
17349 * allocated with no holes. Otherwise, we're required to return
17350 * an error. This check needs to be redone if the map has changed.
17351 */
17352 if (!vm_map_range_check(map, cur, end, &entry)) {
17353 vm_map_unlock_read(map);
17354 return KERN_INVALID_ADDRESS;
17355 }
17356
17357 /*
17358 * Examine each vm_map_entry_t in the range.
17359 */
17360 while (entry != vm_map_to_entry(map) && entry->vme_start < end) {
17361 vm_map_offset_t cur_offset;
17362 vm_map_offset_t end_offset;
17363 unsigned int last_timestamp = map->timestamp;
17364 vm_object_t object = VME_OBJECT(entry);
17365
17366 ret = vm_map_zero_entry_preflight(entry);
17367 if (ret != KERN_SUCCESS) {
17368 vm_map_unlock_read(map);
17369 return ret;
17370 }
17371
17372 if (object == VM_OBJECT_NULL) {
17373 entry = entry->vme_next;
17374 continue;
17375 }
17376
17377 vm_map_get_bounds_in_object(entry, cur, end, &cur_offset, &end_offset);
17378 vm_object_lock(object);
17379 /*
17380 * Take a reference on the object as vm_object_zero will drop the object
17381 * lock when it encounters a busy page.
17382 */
17383 vm_object_reference_locked(object);
17384 vm_map_unlock_read(map);
17385
17386 ret = vm_object_zero(object, cur_offset, end_offset);
17387 vm_object_unlock(object);
17388 vm_object_deallocate(object);
17389 if (ret != KERN_SUCCESS) {
17390 return ret;
17391 }
17392 /*
17393 * Update cur as vm_object_zero has succeeded.
17394 */
17395 cur += (end_offset - cur_offset);
17396 if (cur == end) {
17397 return KERN_SUCCESS;
17398 }
17399
17400 /*
17401 * If the map timestamp has changed, restart by relooking up cur in the
17402 * map
17403 */
17404 vm_map_lock_read(map);
17405 if (last_timestamp != map->timestamp) {
17406 /*
17407 * Relookup cur in the map
17408 */
17409 if (!vm_map_range_check(map, cur, end, &entry)) {
17410 vm_map_unlock_read(map);
17411 return KERN_INVALID_ADDRESS;
17412 }
17413 continue;
17414 }
17415 /*
17416 * If the map hasn't changed, proceed with the next entry.
17417 */
17418 entry = entry->vme_next;
17419 }
17420
17421 vm_map_unlock_read(map);
17422 return KERN_SUCCESS;
17423 }
17424
17425
17426 /*
17427 * Routine: vm_map_entry_insert
17428 *
17429 * Description: This routine inserts a new vm_entry in a locked map.
17430 */
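/*
 * Note (assumption based on the code below): the map must be locked
 * exclusively by the caller, and the new entry is linked immediately
 * after "insp_entry", e.g. the entry returned by a prior
 * vm_map_lookup_entry() for the insertion address.
 */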
17431 static vm_map_entry_t
17432 vm_map_entry_insert(
17433 vm_map_t map,
17434 vm_map_entry_t insp_entry,
17435 vm_map_offset_t start,
17436 vm_map_offset_t end,
17437 vm_object_t object,
17438 vm_object_offset_t offset,
17439 vm_map_kernel_flags_t vmk_flags,
17440 boolean_t needs_copy,
17441 vm_prot_t cur_protection,
17442 vm_prot_t max_protection,
17443 vm_inherit_t inheritance,
17444 boolean_t clear_map_aligned)
17445 {
17446 vm_map_entry_t new_entry;
17447 boolean_t map_aligned = FALSE;
17448
17449 assert(insp_entry != (vm_map_entry_t)0);
17450 vm_map_lock_assert_exclusive(map);
17451
17452 __assert_only vm_object_offset_t end_offset = 0;
17453 assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
17454
17455 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
17456 map_aligned = TRUE;
17457 }
17458 if (clear_map_aligned &&
17459 (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
17460 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
17461 map_aligned = FALSE;
17462 }
17463 if (map_aligned) {
17464 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
17465 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
17466 } else {
17467 assert(page_aligned(start));
17468 assert(page_aligned(end));
17469 }
17470 assert(start < end);
17471
17472 new_entry = vm_map_entry_create(map);
17473
17474 new_entry->vme_start = start;
17475 new_entry->vme_end = end;
17476
17477 if (vmk_flags.vmkf_submap) {
17478 new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
17479 VME_SUBMAP_SET(new_entry, (vm_map_t)object);
17480 } else {
17481 VME_OBJECT_SET(new_entry, object, false, 0);
17482 }
17483 VME_OFFSET_SET(new_entry, offset);
17484 VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);
17485
17486 new_entry->map_aligned = map_aligned;
17487 new_entry->needs_copy = needs_copy;
17488 new_entry->inheritance = inheritance;
17489 new_entry->protection = cur_protection;
17490 new_entry->max_protection = max_protection;
17491 /*
17492 * submap: "use_pmap" means "nested".
17493 * default: false.
17494 *
17495 * object: "use_pmap" means "use pmap accounting" for footprint.
17496 * default: true.
17497 */
17498 new_entry->use_pmap = !vmk_flags.vmkf_submap;
17499 new_entry->no_cache = vmk_flags.vmf_no_cache;
17500 new_entry->vme_permanent = vmk_flags.vmf_permanent;
17501 new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
17502 new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
17503 new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);
17504
17505 if (vmk_flags.vmkf_map_jit) {
17506 if (!(map->jit_entry_exists) ||
17507 VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
17508 new_entry->used_for_jit = TRUE;
17509 map->jit_entry_exists = TRUE;
17510 }
17511 }
17512
17513 /*
17514 * Insert the new entry into the list.
17515 */
17516
17517 vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
17518 map->size += end - start;
17519
17520 /*
17521 * Update the free space hint and the lookup hint.
17522 */
17523
17524 SAVE_HINT_MAP_WRITE(map, new_entry);
17525 return new_entry;
17526 }
17527
17528 /*
17529 * Routine: vm_map_remap_extract
17530 *
17531 * Description: This routine returns a vm_entry list from a map.
17532 */
17533 static kern_return_t
17534 vm_map_remap_extract(
17535 vm_map_t map,
17536 vm_map_offset_t addr,
17537 vm_map_size_t size,
17538 boolean_t copy,
17539 vm_map_copy_t map_copy,
17540 vm_prot_t *cur_protection, /* IN/OUT */
17541 vm_prot_t *max_protection, /* IN/OUT */
17542 /* What, no behavior? */
17543 vm_inherit_t inheritance,
17544 vm_map_kernel_flags_t vmk_flags)
17545 {
17546 struct vm_map_header *map_header = &map_copy->cpy_hdr;
17547 kern_return_t result;
17548 vm_map_size_t mapped_size;
17549 vm_map_size_t tmp_size;
17550 vm_map_entry_t src_entry; /* result of last map lookup */
17551 vm_map_entry_t new_entry;
17552 vm_object_offset_t offset;
17553 vm_map_offset_t map_address;
17554 vm_map_offset_t src_start; /* start of entry to map */
17555 vm_map_offset_t src_end; /* end of region to be mapped */
17556 vm_object_t object;
17557 vm_map_version_t version;
17558 boolean_t src_needs_copy;
17559 boolean_t new_entry_needs_copy;
17560 vm_map_entry_t saved_src_entry;
17561 boolean_t src_entry_was_wired;
17562 vm_prot_t max_prot_for_prot_copy;
17563 vm_map_offset_t effective_page_mask;
17564 bool pageable, same_map;
17565 boolean_t vm_remap_legacy;
17566 vm_prot_t required_cur_prot, required_max_prot;
17567 vm_object_t new_copy_object; /* vm_object_copy_* result */
17568 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
17569
17570 pageable = vmk_flags.vmkf_copy_pageable;
17571 same_map = vmk_flags.vmkf_copy_same_map;
17572
17573 effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
17574
17575 assert(map != VM_MAP_NULL);
17576 assert(size != 0);
17577 assert(size == vm_map_round_page(size, effective_page_mask));
17578 assert(inheritance == VM_INHERIT_NONE ||
17579 inheritance == VM_INHERIT_COPY ||
17580 inheritance == VM_INHERIT_SHARE);
17581 assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17582 assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17583 assert((*cur_protection & *max_protection) == *cur_protection);
17584
17585 /*
17586 * Compute start and end of region.
17587 */
17588 src_start = vm_map_trunc_page(addr, effective_page_mask);
17589 src_end = vm_map_round_page(src_start + size, effective_page_mask);
17590
17591 /*
17592 * Initialize map_header.
17593 */
17594 map_header->nentries = 0;
17595 map_header->entries_pageable = pageable;
17596 // map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
17597 map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
17598 map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
17599 vm_map_store_init(map_header);
17600
17601 if (copy && vmk_flags.vmkf_remap_prot_copy) {
17602 /*
17603 * Special case for vm_map_protect(VM_PROT_COPY):
17604 * we want to set the new mappings' max protection to the
17605 * specified *max_protection...
17606 */
17607 max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
17608 /* ... but we want to use the vm_remap() legacy mode */
17609 vmk_flags.vmkf_remap_legacy_mode = true;
17610 *max_protection = VM_PROT_NONE;
17611 *cur_protection = VM_PROT_NONE;
17612 } else {
17613 max_prot_for_prot_copy = VM_PROT_NONE;
17614 }
17615
17616 if (vmk_flags.vmkf_remap_legacy_mode) {
17617 /*
17618 * vm_remap() legacy mode:
17619 * Extract all memory regions in the specified range and
17620 * collect the strictest set of protections allowed on the
17621 * entire range, so the caller knows what they can do with
17622 * the remapped range.
17623 * We start with VM_PROT_ALL and we'll remove the protections
17624 * missing from each memory region.
17625 */
17626 vm_remap_legacy = TRUE;
17627 *cur_protection = VM_PROT_ALL;
17628 *max_protection = VM_PROT_ALL;
17629 required_cur_prot = VM_PROT_NONE;
17630 required_max_prot = VM_PROT_NONE;
17631 } else {
17632 /*
17633 * vm_remap_new() mode:
17634 * Extract all memory regions in the specified range and
17635 * ensure that they have at least the protections specified
17636 * by the caller via *cur_protection and *max_protection.
17637 * The resulting mapping should have these protections.
17638 */
17639 vm_remap_legacy = FALSE;
17640 if (copy) {
17641 required_cur_prot = VM_PROT_NONE;
17642 required_max_prot = VM_PROT_READ;
17643 } else {
17644 required_cur_prot = *cur_protection;
17645 required_max_prot = *max_protection;
17646 }
17647 }
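/*
 * Illustrative example (assumption): in legacy mode, remapping a range
 * that spans one VM_PROT_READ|VM_PROT_WRITE entry and one VM_PROT_READ
 * entry intersects down to *cur_protection == VM_PROT_READ, telling the
 * caller what the whole remapped range can safely be used for. In
 * vm_remap_new() mode the same request would instead fail with
 * KERN_PROTECTION_FAILURE unless the caller only required VM_PROT_READ.
 */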
17648
17649 map_address = 0;
17650 mapped_size = 0;
17651 result = KERN_SUCCESS;
17652
17653 /*
17654 * The specified source virtual space might correspond to
17655 * multiple map entries, need to loop on them.
17656 */
17657 vm_map_lock(map);
17658
17659 if (map->pmap == kernel_pmap) {
17660 map_copy->is_kernel_range = true;
17661 map_copy->orig_range = kmem_addr_get_range(addr, size);
17662 #if CONFIG_MAP_RANGES
17663 } else if (map->uses_user_ranges) {
17664 map_copy->is_user_range = true;
17665 map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
17666 #endif /* CONFIG_MAP_RANGES */
17667 }
17668
17669 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17670 /*
17671 * This address space uses sub-pages so the range might
17672 * not be re-mappable in an address space with larger
17673 * pages. Re-assemble any broken-up VM map entries to
17674 * improve our chances of making it work.
17675 */
17676 vm_map_simplify_range(map, src_start, src_end);
17677 }
17678 while (mapped_size != size) {
17679 vm_map_size_t entry_size;
17680
17681 /*
17682 * Find the beginning of the region.
17683 */
17684 if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
17685 result = KERN_INVALID_ADDRESS;
17686 break;
17687 }
17688
17689 if (src_start < src_entry->vme_start ||
17690 (mapped_size && src_start != src_entry->vme_start)) {
17691 result = KERN_INVALID_ADDRESS;
17692 break;
17693 }
17694
17695 tmp_size = size - mapped_size;
17696 if (src_end > src_entry->vme_end) {
17697 tmp_size -= (src_end - src_entry->vme_end);
17698 }
17699
17700 entry_size = (vm_map_size_t)(src_entry->vme_end -
17701 src_entry->vme_start);
17702
17703 if (src_entry->is_sub_map &&
17704 vmk_flags.vmkf_copy_single_object) {
17705 vm_map_t submap;
17706 vm_map_offset_t submap_start;
17707 vm_map_size_t submap_size;
17708 boolean_t submap_needs_copy;
17709
17710 /*
17711 * No check for "required protection" on "src_entry"
17712 * because the protections that matter are the ones
17713 * on the submap's VM map entry, which will be checked
17714 * during the call to vm_map_remap_extract() below.
17715 */
17716 object = VM_OBJECT_NULL;
17717
17718 submap_size = src_entry->vme_end - src_start;
17719 if (submap_size > size) {
17720 submap_size = size;
17721 }
17722 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17723 submap = VME_SUBMAP(src_entry);
17724 if (copy) {
17725 /*
17726 * The caller wants a copy-on-write re-mapping,
17727 * so let's extract from the submap accordingly.
17728 */
17729 submap_needs_copy = TRUE;
17730 } else if (src_entry->needs_copy) {
17731 /*
17732 * The caller wants a shared re-mapping but the
17733 * submap is mapped with "needs_copy", so its
17734 * contents can't be shared as is. Extract the
17735 * contents of the submap as "copy-on-write".
17736 * The re-mapping won't be shared with the
17737 * original mapping but this is equivalent to
17738 * what happened with the original "remap from
17739 * submap" code.
17740 * The shared region is mapped "needs_copy", for
17741 * example.
17742 */
17743 submap_needs_copy = TRUE;
17744 } else {
17745 /*
17746 * The caller wants a shared re-mapping and
17747 * this mapping can be shared (no "needs_copy"),
17748 * so let's extract from the submap accordingly.
17749 * Kernel submaps are mapped without
17750 * "needs_copy", for example.
17751 */
17752 submap_needs_copy = FALSE;
17753 }
17754 vm_map_reference(submap);
17755 vm_map_unlock(map);
17756 src_entry = NULL;
17757 if (vm_remap_legacy) {
17758 *cur_protection = VM_PROT_NONE;
17759 *max_protection = VM_PROT_NONE;
17760 }
17761
17762 DTRACE_VM7(remap_submap_recurse,
17763 vm_map_t, map,
17764 vm_map_offset_t, addr,
17765 vm_map_size_t, size,
17766 boolean_t, copy,
17767 vm_map_offset_t, submap_start,
17768 vm_map_size_t, submap_size,
17769 boolean_t, submap_needs_copy);
17770
17771 result = vm_map_remap_extract(submap,
17772 submap_start,
17773 submap_size,
17774 submap_needs_copy,
17775 map_copy,
17776 cur_protection,
17777 max_protection,
17778 inheritance,
17779 vmk_flags);
17780 vm_map_deallocate(submap);
17781
17782 if (result == KERN_SUCCESS &&
17783 submap_needs_copy &&
17784 !copy) {
17785 /*
17786 * We were asked for a "shared"
17787 * re-mapping but had to ask for a
17788 * "copy-on-write" remapping of the
17789 * submap's mapping to honor the
17790 * submap's "needs_copy".
17791 * We now need to resolve that
17792 * pending "copy-on-write" to
17793 * get something we can share.
17794 */
17795 vm_map_entry_t copy_entry;
17796 vm_object_offset_t copy_offset;
17797 vm_map_size_t copy_size;
17798 vm_object_t copy_object;
17799 copy_entry = vm_map_copy_first_entry(map_copy);
17800 copy_size = copy_entry->vme_end - copy_entry->vme_start;
17801 copy_object = VME_OBJECT(copy_entry);
17802 copy_offset = VME_OFFSET(copy_entry);
17803 if (copy_object == VM_OBJECT_NULL) {
17804 assert(copy_offset == 0);
17805 assert(!copy_entry->needs_copy);
17806 if (copy_entry->max_protection == VM_PROT_NONE) {
17807 assert(copy_entry->protection == VM_PROT_NONE);
17808 /* nothing to share */
17809 } else {
17810 assert(copy_offset == 0);
17811 copy_object = vm_object_allocate(copy_size, submap->serial_id);
17812 VME_OFFSET_SET(copy_entry, 0);
17813 VME_OBJECT_SET(copy_entry, copy_object, false, 0);
17814 assert(copy_entry->use_pmap);
17815 }
17816 } else if (copy_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17817 /* already shareable */
17818 assert(!copy_entry->needs_copy);
17819 } else if (copy_entry->needs_copy ||
17820 copy_object->shadowed ||
17821 (copy_object->internal &&
17822 !copy_object->true_share &&
17823 !copy_entry->is_shared &&
17824 copy_object->vo_size > copy_size)) {
17825 VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
17826 assert(copy_entry->use_pmap);
17827 if (copy_entry->needs_copy) {
17828 /* already write-protected */
17829 } else {
17830 vm_prot_t prot;
17831 prot = copy_entry->protection & ~VM_PROT_WRITE;
17832 vm_object_pmap_protect(copy_object,
17833 copy_offset,
17834 copy_size,
17835 PMAP_NULL,
17836 PAGE_SIZE,
17837 0,
17838 prot);
17839 }
17840 copy_entry->needs_copy = FALSE;
17841 }
17842 copy_object = VME_OBJECT(copy_entry);
17843 copy_offset = VME_OFFSET(copy_entry);
17844 if (copy_object &&
17845 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
17846 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
17847 copy_object->true_share = TRUE;
17848 }
17849 }
17850
17851 return result;
17852 }
17853
17854 if (src_entry->is_sub_map) {
17855 /* protections for submap mapping are irrelevant here */
17856 } else if (((src_entry->protection & required_cur_prot) !=
17857 required_cur_prot) ||
17858 ((src_entry->max_protection & required_max_prot) !=
17859 required_max_prot)) {
17860 if (vmk_flags.vmkf_copy_single_object &&
17861 mapped_size != 0) {
17862 /*
17863 * Single object extraction.
17864 * We can't extract more with the required
17865 * protection but we've extracted some, so
17866 * stop there and declare success.
17867 * The caller should check the size of
17868 * the copy entry we've extracted.
17869 */
17870 result = KERN_SUCCESS;
17871 } else {
17872 /*
17873 * VM range extraction.
17874 * Required protection is not available
17875 * for this part of the range: fail.
17876 */
17877 result = KERN_PROTECTION_FAILURE;
17878 }
17879 break;
17880 }
17881
17882 if (src_entry->is_sub_map) {
17883 vm_map_t submap;
17884 vm_map_offset_t submap_start;
17885 vm_map_size_t submap_size;
17886 vm_map_copy_t submap_copy;
17887 vm_prot_t submap_curprot, submap_maxprot;
17888 boolean_t submap_needs_copy;
17889
17890 /*
17891 * No check for "required protection" on "src_entry"
17892 * because the protections that matter are the ones
17893 * on the submap's VM map entry, which will be checked
17894 * during the call to vm_map_copy_extract() below.
17895 */
17896 object = VM_OBJECT_NULL;
17897 submap_copy = VM_MAP_COPY_NULL;
17898
17899 /* find equivalent range in the submap */
17900 submap = VME_SUBMAP(src_entry);
17901 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17902 submap_size = tmp_size;
17903 if (copy) {
17904 /*
17905 * The caller wants a copy-on-write re-mapping,
17906 * so let's extract from the submap accordingly.
17907 */
17908 submap_needs_copy = TRUE;
17909 } else if (src_entry->needs_copy) {
17910 /*
17911 * The caller wants a shared re-mapping but the
17912 * submap is mapped with "needs_copy", so its
17913 * contents can't be shared as is. Extract the
17914 * contents of the submap as "copy-on-write".
17915 * The re-mapping won't be shared with the
17916 * original mapping but this is equivalent to
17917 * what happened with the original "remap from
17918 * submap" code.
17919 * The shared region is mapped "needs_copy", for
17920 * example.
17921 */
17922 submap_needs_copy = TRUE;
17923 } else {
17924 /*
17925 * The caller wants a shared re-mapping and
17926 * this mapping can be shared (no "needs_copy"),
17927 * so let's extract from the submap accordingly.
17928 * Kernel submaps are mapped without
17929 * "needs_copy", for example.
17930 */
17931 submap_needs_copy = FALSE;
17932 }
17933 /* extra ref to keep submap alive */
17934 vm_map_reference(submap);
17935
17936 DTRACE_VM7(remap_submap_recurse,
17937 vm_map_t, map,
17938 vm_map_offset_t, addr,
17939 vm_map_size_t, size,
17940 boolean_t, copy,
17941 vm_map_offset_t, submap_start,
17942 vm_map_size_t, submap_size,
17943 boolean_t, submap_needs_copy);
17944
17945 /*
17946 * The map can be safely unlocked since we
17947 * already hold a reference on the submap.
17948 *
17949 * No timestamp since we don't care if the map
17950 * gets modified while we're down in the submap.
17951 * We'll resume the extraction at src_start + tmp_size
17952 * anyway.
17953 */
17954 vm_map_unlock(map);
17955 src_entry = NULL; /* not valid once map is unlocked */
17956
17957 if (vm_remap_legacy) {
17958 submap_curprot = VM_PROT_NONE;
17959 submap_maxprot = VM_PROT_NONE;
17960 if (max_prot_for_prot_copy) {
17961 submap_maxprot = max_prot_for_prot_copy;
17962 }
17963 } else {
17964 assert(!max_prot_for_prot_copy);
17965 submap_curprot = *cur_protection;
17966 submap_maxprot = *max_protection;
17967 }
17968 result = vm_map_copy_extract(submap,
17969 submap_start,
17970 submap_size,
17971 submap_needs_copy,
17972 &submap_copy,
17973 &submap_curprot,
17974 &submap_maxprot,
17975 inheritance,
17976 vmk_flags);
17977
17978 /* release extra ref on submap */
17979 vm_map_deallocate(submap);
17980 submap = VM_MAP_NULL;
17981
17982 if (result != KERN_SUCCESS) {
17983 vm_map_lock(map);
17984 break;
17985 }
17986
17987 /* transfer submap_copy entries to map_header */
17988 while (vm_map_copy_first_entry(submap_copy) !=
17989 vm_map_copy_to_entry(submap_copy)) {
17990 vm_map_entry_t copy_entry;
17991 vm_map_size_t copy_entry_size;
17992
17993 copy_entry = vm_map_copy_first_entry(submap_copy);
17994
17995 /*
17996 * Prevent kernel_object from being exposed to
17997 * user space.
17998 */
17999 if (__improbable(copy_entry->vme_kernel_object)) {
18000 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
18001 proc_selfpid(),
18002 (get_bsdtask_info(current_task())
18003 ? proc_name_address(get_bsdtask_info(current_task()))
18004 : "?"));
18005 DTRACE_VM(extract_kernel_only);
18006 result = KERN_INVALID_RIGHT;
18007 vm_map_copy_discard(submap_copy);
18008 submap_copy = VM_MAP_COPY_NULL;
18009 vm_map_lock(map);
18010 break;
18011 }
18012
18013 vm_map_copy_entry_unlink(submap_copy, copy_entry);
18014 copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
18015 copy_entry->vme_start = map_address;
18016 copy_entry->vme_end = map_address + copy_entry_size;
18017 map_address += copy_entry_size;
18018 mapped_size += copy_entry_size;
18019 src_start += copy_entry_size;
18020 assert(src_start <= src_end);
18021 _vm_map_store_entry_link(map_header,
18022 map_header->links.prev,
18023 copy_entry);
18024 }
18025 /* done with submap_copy */
18026 vm_map_copy_discard(submap_copy);
18027
18028 if (vm_remap_legacy) {
18029 *cur_protection &= submap_curprot;
18030 *max_protection &= submap_maxprot;
18031 }
18032
18033 /* re-acquire the map lock and continue to next entry */
18034 vm_map_lock(map);
18035 continue;
18036 } else {
18037 object = VME_OBJECT(src_entry);
18038
18039 /*
18040 * Prevent kernel_object from being exposed to
18041 * user space.
18042 */
18043 if (__improbable(is_kernel_object(object))) {
18044 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
18045 proc_selfpid(),
18046 (get_bsdtask_info(current_task())
18047 ? proc_name_address(get_bsdtask_info(current_task()))
18048 : "?"));
18049 DTRACE_VM(extract_kernel_only);
18050 result = KERN_INVALID_RIGHT;
18051 break;
18052 }
18053
18054 if (src_entry->iokit_acct) {
18055 /*
18056 * This entry uses "IOKit accounting".
18057 */
18058 } else if (object != VM_OBJECT_NULL &&
18059 object->internal &&
18060 (object->purgable != VM_PURGABLE_DENY ||
18061 object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
18062 /*
18063 * Purgeable objects have their own accounting:
18064 * no pmap accounting for them.
18065 */
18066 assertf(!src_entry->use_pmap,
18067 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
18068 map,
18069 src_entry,
18070 (uint64_t)src_entry->vme_start,
18071 (uint64_t)src_entry->vme_end,
18072 src_entry->protection,
18073 src_entry->max_protection,
18074 VME_ALIAS(src_entry));
18075 } else {
18076 /*
18077 * Not IOKit or purgeable:
18078 * must be accounted by pmap stats.
18079 */
18080 assertf(src_entry->use_pmap,
18081 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
18082 map,
18083 src_entry,
18084 (uint64_t)src_entry->vme_start,
18085 (uint64_t)src_entry->vme_end,
18086 src_entry->protection,
18087 src_entry->max_protection,
18088 VME_ALIAS(src_entry));
18089 }
18090
18091 if (object == VM_OBJECT_NULL) {
18092 assert(!src_entry->needs_copy);
18093 if (src_entry->max_protection == VM_PROT_NONE) {
18094 assert(src_entry->protection == VM_PROT_NONE);
18095 /*
18096 * No VM object and no permissions:
18097 * this must be a reserved range with
18098 * nothing to share or copy.
18099 * There could also be all sorts of
18100 * pmap shenanigans within that reserved
18101 * range, so let's just copy the map
18102 * entry as is to remap a similar
18103 * reserved range.
18104 */
18105 offset = 0; /* no object => no offset */
18106 goto copy_src_entry;
18107 }
18108 object = vm_object_allocate(entry_size, map->serial_id);
18109 VME_OFFSET_SET(src_entry, 0);
18110 VME_OBJECT_SET(src_entry, object, false, 0);
18111 assert(src_entry->use_pmap);
18112 assert(!map->mapped_in_other_pmaps);
18113 } else if (src_entry->wired_count ||
18114 object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
18115 /*
18116 * A wired memory region should not have
18117 * any pending copy-on-write and needs to
18118 * keep pointing at the VM object that
18119 * contains the wired pages.
18120 * If we're sharing this memory (copy=false),
18121 * we'll share this VM object.
18122 * If we're copying this memory (copy=true),
18123 * we'll call vm_object_copy_slowly() below
18124 * and use the new VM object for the remapping.
18125 *
18126 * Or, we are already using an asymmetric
18127 * copy, and therefore we already have
18128 * the right object.
18129 */
18130 assert(!src_entry->needs_copy);
18131 } else if (src_entry->needs_copy || object->shadowed ||
18132 (object->internal && !object->true_share &&
18133 !src_entry->is_shared &&
18134 object->vo_size > entry_size)) {
18135 bool is_writable;
18136
18137 VME_OBJECT_SHADOW(src_entry, entry_size,
18138 vm_map_always_shadow(map));
18139 assert(src_entry->use_pmap);
18140
18141 is_writable = false;
18142 if (src_entry->protection & VM_PROT_WRITE) {
18143 is_writable = true;
18144 #if __arm64e__
18145 } else if (src_entry->used_for_tpro) {
18146 is_writable = true;
18147 #endif /* __arm64e__ */
18148 }
18149 if (!src_entry->needs_copy && is_writable) {
18150 vm_prot_t prot;
18151
18152 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18153 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18154 __FUNCTION__,
18155 map, map->pmap,
18156 src_entry,
18157 (uint64_t)src_entry->vme_start,
18158 (uint64_t)src_entry->vme_end,
18159 src_entry->protection);
18160 }
18161
18162 prot = src_entry->protection & ~VM_PROT_WRITE;
18163
18164 if (override_nx(map,
18165 VME_ALIAS(src_entry))
18166 && prot) {
18167 prot |= VM_PROT_EXECUTE;
18168 }
18169
18170 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18171 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18172 __FUNCTION__,
18173 map, map->pmap,
18174 src_entry,
18175 (uint64_t)src_entry->vme_start,
18176 (uint64_t)src_entry->vme_end,
18177 prot);
18178 }
18179
18180 if (map->mapped_in_other_pmaps) {
18181 vm_object_pmap_protect(
18182 VME_OBJECT(src_entry),
18183 VME_OFFSET(src_entry),
18184 entry_size,
18185 PMAP_NULL,
18186 PAGE_SIZE,
18187 src_entry->vme_start,
18188 prot);
18189 #if MACH_ASSERT
18190 } else if (__improbable(map->pmap == PMAP_NULL)) {
18191 /*
18192 * Some VM tests (in vm_tests.c)
18193 * sometimes want to use a VM
18194 * map without a pmap.
18195 * Otherwise, this should never
18196 * happen.
18197 */
18198 if (!thread_get_test_option(test_option_vm_map_allow_null_pmap)) {
18199 panic("null pmap");
18200 }
18201 #endif /* MACH_ASSERT */
18202 } else {
18203 pmap_protect(vm_map_pmap(map),
18204 src_entry->vme_start,
18205 src_entry->vme_end,
18206 prot);
18207 }
18208 }
18209
18210 object = VME_OBJECT(src_entry);
18211 src_entry->needs_copy = FALSE;
18212 }
18213
18214
18215 vm_object_lock(object);
18216 vm_object_reference_locked(object); /* object ref. for new entry */
18217 assert(!src_entry->needs_copy);
18218 if (object->copy_strategy ==
18219 MEMORY_OBJECT_COPY_SYMMETRIC) {
18220 /*
18221 * If we want to share this object (copy==0),
18222 * it needs to be COPY_DELAY.
18223 * If we want to copy this object (copy==1),
18224 * we can't just set "needs_copy" on our side
18225 * and expect the other side to do the same
18226 * (symmetrically), so we can't let the object
18227 * stay COPY_SYMMETRIC.
18228 * So we always switch from COPY_SYMMETRIC to
18229 * COPY_DELAY.
18230 */
18231 object->copy_strategy =
18232 MEMORY_OBJECT_COPY_DELAY;
18233 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
18234 }
18235 vm_object_unlock(object);
18236 }
18237
18238 offset = (VME_OFFSET(src_entry) +
18239 (src_start - src_entry->vme_start));
18240
18241 copy_src_entry:
18242
18243
18244 new_entry = _vm_map_entry_create(map_header);
18245 vm_map_entry_copy(map, new_entry, src_entry);
18246 if (new_entry->is_sub_map) {
18247 /* clear address space specifics */
18248 new_entry->use_pmap = FALSE;
18249 } else if (copy) {
18250 /*
18251 * We're dealing with a copy-on-write operation,
18252 * so the resulting mapping should not inherit the
18253 * original mapping's accounting settings.
18254 * "use_pmap" should be reset to its default (TRUE)
18255 * so that the new mapping gets accounted for in
18256 * the task's memory footprint.
18257 */
18258 new_entry->use_pmap = TRUE;
18259 }
18260 /* "iokit_acct" was cleared in vm_map_entry_copy() */
18261 assert(!new_entry->iokit_acct);
18262
18263 new_entry->map_aligned = FALSE;
18264
18265 new_entry->vme_start = map_address;
18266 new_entry->vme_end = map_address + tmp_size;
18267 assert(new_entry->vme_start < new_entry->vme_end);
18268 if (copy && vmk_flags.vmkf_remap_prot_copy) {
18269 /* security: keep "permanent" and "csm_associated" */
18270 new_entry->vme_permanent = src_entry->vme_permanent;
18271 new_entry->csm_associated = src_entry->csm_associated;
18272 /*
18273 * Remapping for vm_map_protect(VM_PROT_COPY)
18274 * to convert a read-only mapping into a
18275 * copy-on-write version of itself but
18276 * with write access:
18277 * keep the original inheritance but let's not
18278 * add VM_PROT_WRITE to the max protection yet
18279 * since we want to do more security checks against
18280 * the target map.
18281 */
18282 new_entry->inheritance = src_entry->inheritance;
18283 new_entry->protection &= max_prot_for_prot_copy;
18284
18285 #ifdef __arm64e__
18286 /*
18287 * Remapping for vm_map_protect(VM_PROT_COPY) to remap a TPRO
18288 * region to be explicitly writable without TPRO is only permitted
18289 * if TPRO enforcement has been overridden.
18290 *
18291 * In this case we ensure any entries reset the TPRO state
18292 * and we permit the region to be downgraded from permanent.
18293 */
18294 if (new_entry->used_for_tpro) {
18295 if (vmk_flags.vmkf_tpro_enforcement_override) {
18296 new_entry->used_for_tpro = FALSE;
18297 new_entry->vme_permanent = FALSE;
18298 } else {
18299 result = KERN_PROTECTION_FAILURE;
18300 vm_object_deallocate(object);
18301 vm_map_entry_dispose(new_entry);
18302 new_entry = VM_MAP_ENTRY_NULL;
18303 break;
18304 }
18305 }
18306 #endif
18307 } else {
18308 new_entry->inheritance = inheritance;
18309 if (!vm_remap_legacy) {
18310 new_entry->protection = *cur_protection;
18311 new_entry->max_protection = *max_protection;
18312 }
18313 }
18314
18315 VME_OFFSET_SET(new_entry, offset);
18316
18317 /*
18318 * The new region has to be copied now if required.
18319 */
18320 RestartCopy:
18321 if (!copy) {
18322 if (src_entry->used_for_jit == TRUE) {
18323 if (same_map) {
18324 } else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
18325 /*
18326 * Cannot allow an entry describing a JIT
18327 * region to be shared across address spaces.
18328 */
18329 result = KERN_INVALID_ARGUMENT;
18330 vm_object_deallocate(object);
18331 vm_map_entry_dispose(new_entry);
18332 new_entry = VM_MAP_ENTRY_NULL;
18333 break;
18334 }
18335 }
18336
18337 if (!src_entry->is_sub_map &&
18338 VME_OBJECT(src_entry) == VM_OBJECT_NULL) {
18339 /* no accessible memory; nothing to share */
18340 assert(src_entry->protection == VM_PROT_NONE);
18341 assert(src_entry->max_protection == VM_PROT_NONE);
18342 src_entry->is_shared = FALSE;
18343 } else {
18344 src_entry->is_shared = TRUE;
18345 }
18346 if (!new_entry->is_sub_map &&
18347 VME_OBJECT(new_entry) == VM_OBJECT_NULL) {
18348 /* no accessible memory; nothing to share */
18349 assert(new_entry->protection == VM_PROT_NONE);
18350 assert(new_entry->max_protection == VM_PROT_NONE);
18351 new_entry->is_shared = FALSE;
18352 } else {
18353 new_entry->is_shared = TRUE;
18354 }
18355 if (!(new_entry->is_sub_map)) {
18356 new_entry->needs_copy = FALSE;
18357 }
18358 } else if (src_entry->is_sub_map) {
18359 /* make this a COW sub_map if not already */
18360 assert(new_entry->wired_count == 0);
18361 new_entry->needs_copy = TRUE;
18362 object = VM_OBJECT_NULL;
18363 } else if (src_entry->wired_count == 0 &&
18364 !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
18365 vm_object_copy_quickly(VME_OBJECT(new_entry),
18366 VME_OFFSET(new_entry),
18367 (new_entry->vme_end -
18368 new_entry->vme_start),
18369 &src_needs_copy,
18370 &new_entry_needs_copy)) {
18371 new_entry->needs_copy = new_entry_needs_copy;
18372 new_entry->is_shared = FALSE;
18373 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18374
18375 /*
18376 * Handle copy_on_write semantics.
18377 */
18378 if (src_needs_copy && !src_entry->needs_copy) {
18379 vm_prot_t prot;
18380
18381 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18382 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18383 __FUNCTION__,
18384 map, map->pmap, src_entry,
18385 (uint64_t)src_entry->vme_start,
18386 (uint64_t)src_entry->vme_end,
18387 src_entry->protection);
18388 }
18389
18390 prot = src_entry->protection & ~VM_PROT_WRITE;
18391
18392 if (override_nx(map,
18393 VME_ALIAS(src_entry))
18394 && prot) {
18395 prot |= VM_PROT_EXECUTE;
18396 }
18397
18398 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18399 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18400 __FUNCTION__,
18401 map, map->pmap, src_entry,
18402 (uint64_t)src_entry->vme_start,
18403 (uint64_t)src_entry->vme_end,
18404 prot);
18405 }
18406
18407 vm_object_pmap_protect(object,
18408 offset,
18409 entry_size,
18410 ((src_entry->is_shared
18411 || map->mapped_in_other_pmaps) ?
18412 PMAP_NULL : map->pmap),
18413 VM_MAP_PAGE_SIZE(map),
18414 src_entry->vme_start,
18415 prot);
18416
18417 assert(src_entry->wired_count == 0);
18418 src_entry->needs_copy = TRUE;
18419 }
18420 /*
18421 * Throw away the old object reference of the new entry.
18422 */
18423 vm_object_deallocate(object);
18424 } else {
18425 new_entry->is_shared = FALSE;
18426 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18427
18428 src_entry_was_wired = (src_entry->wired_count > 0);
18429 saved_src_entry = src_entry;
18430 src_entry = VM_MAP_ENTRY_NULL;
18431
18432 /*
18433 * The map can be safely unlocked since we
18434 * already hold a reference on the object.
18435 *
18436 * Record the timestamp of the map for later
18437 * verification, and unlock the map.
18438 */
18439 version.main_timestamp = map->timestamp;
18440 vm_map_unlock(map); /* Increments timestamp once! */
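/*
 * Illustrative example (hypothetical values, assuming only a writer's
 * vm_map_unlock() bumps the timestamp, per the comment above): if
 * "version.main_timestamp" recorded 42, this unlock bumps the map's
 * timestamp to 43.  The "version.main_timestamp + 1 != map->timestamp"
 * verification below then succeeds (no re-lookup) only if the timestamp
 * is still 43; any intervening writer's unlock pushes it past 43 and
 * forces a re-lookup of the source entry.
 */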
18441
18442 /*
18443 * Perform the copy.
18444 */
18445 if (src_entry_was_wired > 0 ||
18446 (debug4k_no_cow_copyin &&
18447 VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
18448 vm_object_lock(object);
18449 result = vm_object_copy_slowly(
18450 object,
18451 offset,
18452 (new_entry->vme_end -
18453 new_entry->vme_start),
18454 THREAD_UNINT,
18455 &new_copy_object);
18456 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18457 saved_used_for_jit = new_entry->used_for_jit;
18458 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18459 new_entry->used_for_jit = saved_used_for_jit;
18460 VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
18461 new_entry->needs_copy = FALSE;
18462 } else {
18463 vm_object_offset_t new_offset;
18464
18465 new_offset = VME_OFFSET(new_entry);
18466 result = vm_object_copy_strategically(
18467 object,
18468 offset,
18469 (new_entry->vme_end -
18470 new_entry->vme_start),
18471 false, /* forking */
18472 &new_copy_object,
18473 &new_offset,
18474 &new_entry_needs_copy);
18475 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18476 saved_used_for_jit = new_entry->used_for_jit;
18477 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18478 new_entry->used_for_jit = saved_used_for_jit;
18479 if (new_offset != VME_OFFSET(new_entry)) {
18480 VME_OFFSET_SET(new_entry, new_offset);
18481 }
18482
18483 new_entry->needs_copy = new_entry_needs_copy;
18484 }
18485
18486 /*
18487 * Throw away the old object reference of the new entry.
18488 */
18489 vm_object_deallocate(object);
18490
18491 if (result != KERN_SUCCESS &&
18492 result != KERN_MEMORY_RESTART_COPY) {
18493 vm_map_entry_dispose(new_entry);
18494 vm_map_lock(map);
18495 break;
18496 }
18497
18498 /*
18499 * Verify that the map has not substantially
18500 * changed while the copy was being made.
18501 */
18502
18503 vm_map_lock(map);
18504 if (version.main_timestamp + 1 != map->timestamp) {
18505 /*
18506 * Simple version comparison failed.
18507 *
18508 * Retry the lookup and verify that the
18509 * same object/offset are still present.
18510 */
18511 saved_src_entry = VM_MAP_ENTRY_NULL;
18512 vm_object_deallocate(VME_OBJECT(new_entry));
18513 vm_map_entry_dispose(new_entry);
18514 if (result == KERN_MEMORY_RESTART_COPY) {
18515 result = KERN_SUCCESS;
18516 }
18517 continue;
18518 }
18519 /* map hasn't changed: src_entry is still valid */
18520 src_entry = saved_src_entry;
18521 saved_src_entry = VM_MAP_ENTRY_NULL;
18522
18523 if (result == KERN_MEMORY_RESTART_COPY) {
18524 vm_object_reference(object);
18525 goto RestartCopy;
18526 }
18527 }
18528
18529 _vm_map_store_entry_link(map_header,
18530 map_header->links.prev, new_entry);
18531
18532 /* protections for submap mapping are irrelevant here */
18533 if (vm_remap_legacy && !src_entry->is_sub_map) {
18534 *cur_protection &= src_entry->protection;
18535 *max_protection &= src_entry->max_protection;
18536 }
18537
18538 map_address += tmp_size;
18539 mapped_size += tmp_size;
18540 src_start += tmp_size;
18541
18542 if (vmk_flags.vmkf_copy_single_object) {
18543 if (mapped_size != size) {
18544 DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
18545 map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
18546 if (src_entry->vme_next != vm_map_to_entry(map) &&
18547 src_entry->vme_next->vme_object_value ==
18548 src_entry->vme_object_value) {
18549 /* XXX TODO4K */
18550 DEBUG4K_ERROR("could have extended copy to next entry...\n");
18551 }
18552 }
18553 break;
18554 }
18555 } /* end while */
18556
18557 vm_map_unlock(map);
18558 if (result != KERN_SUCCESS) {
18559 /*
18560 * Free all allocated elements.
18561 */
18562 for (src_entry = map_header->links.next;
18563 src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
18564 src_entry = new_entry) {
18565 new_entry = src_entry->vme_next;
18566 _vm_map_store_entry_unlink(map_header, src_entry, false);
18567 if (src_entry->is_sub_map) {
18568 vm_map_deallocate(VME_SUBMAP(src_entry));
18569 } else {
18570 vm_object_deallocate(VME_OBJECT(src_entry));
18571 }
18572 vm_map_entry_dispose(src_entry);
18573 }
18574 }
18575 return result;
18576 }
18577
18578 bool
18579 vm_map_is_exotic(
18580 vm_map_t map)
18581 {
18582 return VM_MAP_IS_EXOTIC(map);
18583 }
18584
18585 bool
18586 vm_map_is_alien(
18587 vm_map_t map)
18588 {
18589 return VM_MAP_IS_ALIEN(map);
18590 }
18591
18592 #if XNU_TARGET_OS_OSX
18593 void
18594 vm_map_mark_alien(
18595 vm_map_t map)
18596 {
18597 vm_map_lock(map);
18598 map->is_alien = true;
18599 vm_map_unlock(map);
18600 }
18601
18602 void
18603 vm_map_single_jit(
18604 vm_map_t map)
18605 {
18606 vm_map_lock(map);
18607 map->single_jit = true;
18608 vm_map_unlock(map);
18609 }
18610 #endif /* XNU_TARGET_OS_OSX */
18611
18612
18613 /*
18614 * Callers of this function must call vm_map_copy_require on
18615 * previously created vm_map_copy_t or pass a newly created
18616 * one to ensure that it hasn't been forged.
18617 */
18618 static kern_return_t
18619 vm_map_copy_to_physcopy(
18620 vm_map_copy_t copy_map,
18621 vm_map_t target_map)
18622 {
18623 vm_map_size_t size;
18624 vm_map_entry_t entry;
18625 vm_map_entry_t new_entry;
18626 vm_object_t new_object;
18627 unsigned int pmap_flags;
18628 pmap_t new_pmap;
18629 vm_map_t new_map;
18630 vm_map_address_t src_start, src_end, src_cur;
18631 vm_map_address_t dst_start, dst_end, dst_cur;
18632 kern_return_t kr;
18633 void *kbuf;
18634
18635 /*
18636 * Perform the equivalent of vm_allocate() and memcpy().
18637 * Replace the mappings in "copy_map" with the newly allocated mapping.
18638 */
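/*
 * Roughly: create a temporary VM map (backed by a fresh 4K-page pmap),
 * copy "copy_map" out into it with vm_map_copyout_internal(), also map a
 * newly allocated VM object in that map, copy the contents page by page
 * through a kernel buffer (copyinmap()/copyoutmap()), destroy the
 * temporary map, then replace "copy_map"'s entries with a single entry
 * backed by the new object, at "target_map"'s page size.
 */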
18639 DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18640
18641 assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
18642
18643 /* create a new pmap to map "copy_map" */
18644 pmap_flags = 0;
18645 assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
18646 #if PMAP_CREATE_FORCE_4K_PAGES
18647 pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
18648 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
18649 pmap_flags |= PMAP_CREATE_64BIT;
18650 new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
18651 if (new_pmap == NULL) {
18652 return KERN_RESOURCE_SHORTAGE;
18653 }
18654
18655 /* allocate new VM object */
18656 size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
18657 new_object = vm_object_allocate(size, VM_MAP_SERIAL_NONE);
18658 assert(new_object);
18659
18660 /* allocate new VM map entry */
18661 new_entry = vm_map_copy_entry_create(copy_map);
18662 assert(new_entry);
18663
18664 /* finish initializing new VM map entry */
18665 new_entry->protection = VM_PROT_DEFAULT;
18666 new_entry->max_protection = VM_PROT_DEFAULT;
18667 new_entry->use_pmap = TRUE;
18668
18669 /* make new VM map entry point to new VM object */
18670 new_entry->vme_start = 0;
18671 new_entry->vme_end = size;
18672 VME_OBJECT_SET(new_entry, new_object, false, 0);
18673 VME_OFFSET_SET(new_entry, 0);
18674
18675 /* create a new pageable VM map to map "copy_map" */
18676 new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
18677 VM_MAP_CREATE_PAGEABLE);
18678 assert(new_map);
18679 vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
18680
18681 /* map "copy_map" in the new VM map */
18682 src_start = 0;
18683 kr = vm_map_copyout_internal(
18684 new_map,
18685 &src_start,
18686 copy_map,
18687 copy_map->size,
18688 FALSE, /* consume_on_success */
18689 VM_PROT_DEFAULT,
18690 VM_PROT_DEFAULT,
18691 VM_INHERIT_DEFAULT);
18692 assert(kr == KERN_SUCCESS);
18693 src_end = src_start + copy_map->size;
18694
18695 /* map "new_object" in the new VM map */
18696 vm_object_reference(new_object);
18697 dst_start = 0;
18698 kr = vm_map_enter(new_map,
18699 &dst_start,
18700 size,
18701 0, /* mask */
18702 VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
18703 new_object,
18704 0, /* offset */
18705 FALSE, /* needs copy */
18706 VM_PROT_DEFAULT,
18707 VM_PROT_DEFAULT,
18708 VM_INHERIT_DEFAULT);
18709 assert(kr == KERN_SUCCESS);
18710 dst_end = dst_start + size;
18711
18712 /* get a kernel buffer */
18713 kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
18714
18715 /* physically copy "copy_map" mappings to new VM object */
18716 for (src_cur = src_start, dst_cur = dst_start;
18717 src_cur < src_end;
18718 src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
18719 vm_size_t bytes;
18720
18721 bytes = PAGE_SIZE;
18722 if (src_cur + PAGE_SIZE > src_end) {
18723 /* partial copy for last page */
18724 bytes = src_end - src_cur;
18725 assert(bytes > 0 && bytes < PAGE_SIZE);
18726 /* rest of dst page should be zero-filled */
18727 }
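/*
 * Illustrative example (hypothetical values): with PAGE_SIZE == 0x4000
 * and copy_map->size == 0x5000, the first iteration copies a full page
 * (0x4000 bytes) and the final iteration copies only the remaining
 * 0x1000 bytes, leaving the rest of the last destination page
 * zero-filled.
 */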
18728 /* get bytes from src mapping */
18729 kr = copyinmap(new_map, src_cur, kbuf, bytes);
18730 if (kr != KERN_SUCCESS) {
18731 DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
18732 }
18733 /* put bytes in dst mapping */
18734 assert(dst_cur < dst_end);
18735 assert(dst_cur + bytes <= dst_end);
18736 kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
18737 if (kr != KERN_SUCCESS) {
18738 DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
18739 }
18740 }
18741
18742 /* free kernel buffer */
18743 kfree_data(kbuf, PAGE_SIZE);
18744
18745 /* destroy new map */
18746 vm_map_destroy(new_map);
18747 new_map = VM_MAP_NULL;
18748
18749 /* dispose of the old map entries in "copy_map" */
18750 while (vm_map_copy_first_entry(copy_map) !=
18751 vm_map_copy_to_entry(copy_map)) {
18752 entry = vm_map_copy_first_entry(copy_map);
18753 vm_map_copy_entry_unlink(copy_map, entry);
18754 if (entry->is_sub_map) {
18755 vm_map_deallocate(VME_SUBMAP(entry));
18756 } else {
18757 vm_object_deallocate(VME_OBJECT(entry));
18758 }
18759 vm_map_copy_entry_dispose(entry);
18760 }
18761
18762 /* change "copy_map"'s page_size to match "target_map" */
18763 copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18764 copy_map->offset = 0;
18765 copy_map->size = size;
18766
18767 /* insert new map entry in "copy_map" */
18768 assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
18769 vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
18770
18771 DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18772 return KERN_SUCCESS;
18773 }
18774
18775 void
18776 vm_map_copy_adjust_get_target_copy_map(
18777 vm_map_copy_t copy_map,
18778 vm_map_copy_t *target_copy_map_p);
18779 void
18780 vm_map_copy_adjust_get_target_copy_map(
18781 vm_map_copy_t copy_map,
18782 vm_map_copy_t *target_copy_map_p)
18783 {
18784 vm_map_copy_t target_copy_map;
18785 vm_map_entry_t entry, target_entry;
18786
18787 if (*target_copy_map_p != VM_MAP_COPY_NULL) {
18788 /* the caller already has a "target_copy_map": use it */
18789 return;
18790 }
18791
18792 /* the caller wants us to create a new copy of "copy_map" */
18793 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18794 target_copy_map = vm_map_copy_allocate(copy_map->type);
18795 target_copy_map->offset = copy_map->offset;
18796 target_copy_map->size = copy_map->size;
18797 target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
18798 for (entry = vm_map_copy_first_entry(copy_map);
18799 entry != vm_map_copy_to_entry(copy_map);
18800 entry = entry->vme_next) {
18801 target_entry = vm_map_copy_entry_create(target_copy_map);
18802 vm_map_entry_copy_full(target_entry, entry);
18803 if (target_entry->is_sub_map) {
18804 vm_map_reference(VME_SUBMAP(target_entry));
18805 } else {
18806 vm_object_reference(VME_OBJECT(target_entry));
18807 }
18808 vm_map_copy_entry_link(
18809 target_copy_map,
18810 vm_map_copy_last_entry(target_copy_map),
18811 target_entry);
18812 }
18813 entry = VM_MAP_ENTRY_NULL;
18814 *target_copy_map_p = target_copy_map;
18815 }
18816
18817 /*
18818 * Callers of this function must call vm_map_copy_require on
18819 * previously created vm_map_copy_t or pass a newly created
18820 * one to ensure that it hasn't been forged.
18821 */
18822 static void
18823 vm_map_copy_trim(
18824 vm_map_copy_t copy_map,
18825 uint16_t new_page_shift,
18826 vm_map_offset_t trim_start,
18827 vm_map_offset_t trim_end)
18828 {
18829 uint16_t copy_page_shift;
18830 vm_map_entry_t entry, next_entry;
18831
18832 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18833 assert(copy_map->cpy_hdr.nentries > 0);
18834
18835 trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
18836 trim_end += vm_map_copy_first_entry(copy_map)->vme_start;
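/*
 * Illustrative example (hypothetical values): the caller passes
 * trim_start/trim_end as offsets relative to the start of the copy, so
 * if the first entry's vme_start is 0x8000 and the caller asked to trim
 * [0x0, 0x1000), the clipping below operates on the absolute entry
 * range [0x8000, 0x9000).
 */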
18837
18838 /* use the new page_shift to do the clipping */
18839 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18840 copy_map->cpy_hdr.page_shift = new_page_shift;
18841
18842 for (entry = vm_map_copy_first_entry(copy_map);
18843 entry != vm_map_copy_to_entry(copy_map);
18844 entry = next_entry) {
18845 next_entry = entry->vme_next;
18846 if (entry->vme_end <= trim_start) {
18847 /* entry fully before trim range: skip */
18848 continue;
18849 }
18850 if (entry->vme_start >= trim_end) {
18851 /* entry fully after trim range: done */
18852 break;
18853 }
18854 /* clip entry if needed */
18855 vm_map_copy_clip_start(copy_map, entry, trim_start);
18856 vm_map_copy_clip_end(copy_map, entry, trim_end);
18857 /* dispose of entry */
18858 copy_map->size -= entry->vme_end - entry->vme_start;
18859 vm_map_copy_entry_unlink(copy_map, entry);
18860 if (entry->is_sub_map) {
18861 vm_map_deallocate(VME_SUBMAP(entry));
18862 } else {
18863 vm_object_deallocate(VME_OBJECT(entry));
18864 }
18865 vm_map_copy_entry_dispose(entry);
18866 entry = VM_MAP_ENTRY_NULL;
18867 }
18868
18869 /* restore copy_map's original page_shift */
18870 copy_map->cpy_hdr.page_shift = copy_page_shift;
18871 }
18872
18873 /*
18874 * Make any necessary adjustments to "copy_map" to allow it to be
18875 * mapped into "target_map".
18876 * If no changes are necessary, "target_copy_map" points to the
18877 * untouched "copy_map".
18878 * If changes are necessary, changes will be made to "target_copy_map".
18879 * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
18880 * copy the original "copy_map" to it before applying the changes.
18881 * The caller should discard "target_copy_map" if it's not the same as
18882 * the original "copy_map".
18883 */
18884 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
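/*
 * Illustrative call pattern (mirrors the callers below,
 * vm_map_range_physical_size() and vm_map_remap()):
 *
 *	target_copy_map = copy_map;
 *	kr = vm_map_copy_adjust_to_target(copy_map,
 *	    offset_in_mapping, initial_size, target_map, copy,
 *	    &target_copy_map, &overmap_start, &overmap_end,
 *	    &trimmed_start);
 */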
18885 kern_return_t
18886 vm_map_copy_adjust_to_target(
18887 vm_map_copy_t src_copy_map,
18888 vm_map_offset_ut offset_u,
18889 vm_map_size_ut size_u,
18890 vm_map_t target_map,
18891 boolean_t copy,
18892 vm_map_copy_t *target_copy_map_p,
18893 vm_map_offset_t *overmap_start_p,
18894 vm_map_offset_t *overmap_end_p,
18895 vm_map_offset_t *trimmed_start_p)
18896 {
18897 vm_map_copy_t copy_map, target_copy_map;
18898 vm_map_size_t target_size;
18899 vm_map_size_t src_copy_map_size;
18900 vm_map_size_t overmap_start, overmap_end;
18901 int misalignments;
18902 vm_map_entry_t entry, target_entry;
18903 vm_map_offset_t addr_adjustment;
18904 vm_map_offset_t new_start, new_end;
18905 int copy_page_mask, target_page_mask;
18906 uint16_t copy_page_shift, target_page_shift;
18907 vm_map_offset_t trimmed_end;
18908 vm_map_size_t map_size;
18909 kern_return_t kr;
18910
18911 /*
18912 * Sanitize any input parameters that are addr/size/prot/inherit
18913 */
18914 kr = vm_map_copy_addr_size_sanitize(
18915 target_map,
18916 offset_u,
18917 size_u,
18918 VM_SANITIZE_CALLER_MACH_MEMORY_ENTRY_MAP_SIZE,
18919 &new_start,
18920 &new_end,
18921 &map_size);
18922 if (__improbable(kr != KERN_SUCCESS)) {
18923 return vm_sanitize_get_kr(kr);
18924 }
18925
18926 /*
18927 * Assert that the vm_map_copy is coming from the right
18928 * zone and hasn't been forged
18929 */
18930 vm_map_copy_require(src_copy_map);
18931 assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18932
18933 /*
18934 * Start working with "src_copy_map" but we'll switch
18935 * to "target_copy_map" as soon as we start making adjustments.
18936 */
18937 copy_map = src_copy_map;
18938 src_copy_map_size = src_copy_map->size;
18939
18940 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18941 copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
18942 target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18943 target_page_mask = VM_MAP_PAGE_MASK(target_map);
18944
18945 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), *target_copy_map_p);
18946
18947 target_copy_map = *target_copy_map_p;
18948 if (target_copy_map != VM_MAP_COPY_NULL) {
18949 vm_map_copy_require(target_copy_map);
18950 }
18951
18952 if (new_end > copy_map->size) {
18953 DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u));
18954 return KERN_INVALID_ARGUMENT;
18955 }
18956
18957 /* trim the end */
18958 trimmed_end = 0;
18959 new_end = VM_MAP_ROUND_PAGE(new_end, target_page_mask);
18960 if (new_end < copy_map->size) {
18961 trimmed_end = src_copy_map_size - new_end;
18962 DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
18963 /* get "target_copy_map" if needed and adjust it */
18964 vm_map_copy_adjust_get_target_copy_map(copy_map,
18965 &target_copy_map);
18966 copy_map = target_copy_map;
18967 vm_map_copy_trim(target_copy_map, target_page_shift,
18968 new_end, copy_map->size);
18969 }
18970
18971 /* trim the start */
18972 new_start = VM_MAP_TRUNC_PAGE(new_start, target_page_mask);
18973 if (new_start != 0) {
18974 DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)0, (uint64_t)new_start);
18975 /* get "target_copy_map" if needed and adjust it */
18976 vm_map_copy_adjust_get_target_copy_map(copy_map,
18977 &target_copy_map);
18978 copy_map = target_copy_map;
18979 vm_map_copy_trim(target_copy_map, target_page_shift,
18980 0, new_start);
18981 }
18982 *trimmed_start_p = new_start;
18983
18984 /* target_size starts with what's left after trimming */
18985 target_size = copy_map->size;
18986 assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
18987 "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
18988 (uint64_t)target_size, (uint64_t)src_copy_map_size,
18989 (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);
18990
18991 /* check for misalignments but don't adjust yet */
18992 misalignments = 0;
18993 overmap_start = 0;
18994 overmap_end = 0;
18995 if (copy_page_shift < target_page_shift) {
18996 /*
18997 * Remapping from 4K to 16K: check the VM object alignments
18998 * throughout the range.
18999 * If the start and end of the range are mis-aligned, we can
19000 * over-map to re-align, and adjust the "overmap" start/end
19001 * and "target_size" of the range accordingly.
19002 * If there is any mis-alignment within the range:
19003 * if "copy":
19004 * we can do immediate-copy instead of copy-on-write,
19005 * else:
19006 * no way to remap and share; fail.
19007 */
19008 for (entry = vm_map_copy_first_entry(copy_map);
19009 entry != vm_map_copy_to_entry(copy_map);
19010 entry = entry->vme_next) {
19011 vm_object_offset_t object_offset_start, object_offset_end;
19012
19013 object_offset_start = VME_OFFSET(entry);
19014 object_offset_end = object_offset_start;
19015 object_offset_end += entry->vme_end - entry->vme_start;
19016 if (object_offset_start & target_page_mask) {
19017 if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
19018 overmap_start++;
19019 } else {
19020 misalignments++;
19021 }
19022 }
19023 if (object_offset_end & target_page_mask) {
19024 if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
19025 overmap_end++;
19026 } else {
19027 misalignments++;
19028 }
19029 }
19030 }
19031 }
19032 entry = VM_MAP_ENTRY_NULL;
19033
19034 /* decide how to deal with misalignments */
19035 assert(overmap_start <= 1);
19036 assert(overmap_end <= 1);
19037 if (!overmap_start && !overmap_end && !misalignments) {
19038 /* copy_map is properly aligned for target_map ... */
19039 if (*trimmed_start_p) {
19040 /* ... but we trimmed it, so still need to adjust */
19041 } else {
19042 /* ... and we didn't trim anything: we're done */
19043 if (target_copy_map == VM_MAP_COPY_NULL) {
19044 target_copy_map = copy_map;
19045 }
19046 *target_copy_map_p = target_copy_map;
19047 *overmap_start_p = 0;
19048 *overmap_end_p = 0;
19049 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19050 return KERN_SUCCESS;
19051 }
19052 } else if (misalignments && !copy) {
19053 /* can't "share" if misaligned */
19054 DEBUG4K_ADJUST("unsupported sharing\n");
19055 #if MACH_ASSERT
19056 if (debug4k_panic_on_misaligned_sharing) {
19057 panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
19058 }
19059 #endif /* MACH_ASSERT */
19060 DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
19061 return KERN_NOT_SUPPORTED;
19062 } else {
19063 /* can't virtual-copy if misaligned (but can physical-copy) */
19064 DEBUG4K_ADJUST("mis-aligned copying\n");
19065 }
19066
19067 /* get a "target_copy_map" if needed and switch to it */
19068 vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
19069 copy_map = target_copy_map;
19070
19071 if (misalignments && copy) {
19072 vm_map_size_t target_copy_map_size;
19073
19074 /*
19075 * Can't do copy-on-write with misaligned mappings.
19076 * Replace the mappings with a physical copy of the original
19077 * mappings' contents.
19078 */
19079 target_copy_map_size = target_copy_map->size;
19080 kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
19081 if (kr != KERN_SUCCESS) {
19082 return kr;
19083 }
19084 *target_copy_map_p = target_copy_map;
19085 *overmap_start_p = 0;
19086 *overmap_end_p = target_copy_map->size - target_copy_map_size;
19087 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19088 return KERN_SUCCESS;
19089 }
19090
19091 /* apply the adjustments */
19092 misalignments = 0;
19093 overmap_start = 0;
19094 overmap_end = 0;
19095 /* remove copy_map->offset, so that everything starts at offset 0 */
19096 addr_adjustment = copy_map->offset;
19097 /* also remove whatever we trimmed from the start */
19098 addr_adjustment += *trimmed_start_p;
19099 for (target_entry = vm_map_copy_first_entry(target_copy_map);
19100 target_entry != vm_map_copy_to_entry(target_copy_map);
19101 target_entry = target_entry->vme_next) {
19102 vm_object_offset_t object_offset_start, object_offset_end;
19103
19104 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19105 object_offset_start = VME_OFFSET(target_entry);
19106 if (object_offset_start & target_page_mask) {
19107 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19108 if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
19109 /*
19110 * start of 1st entry is mis-aligned:
19111 * re-adjust by over-mapping.
19112 */
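/*
 * Illustrative example (hypothetical values): with 16K target pages
 * (target_page_mask == 0x3fff) and VME_OFFSET() == 0x5800,
 * trunc_page_mask_64() yields 0x4000, so overmap_start == 0x1800 and
 * the entry's offset is pulled back to the 16K boundary at 0x4000.
 */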
19113 overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
19114 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
19115 VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
19116 } else {
19117 misalignments++;
19118 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
19119 assert(copy);
19120 }
19121 }
19122
19123 if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
19124 target_size += overmap_start;
19125 } else {
19126 target_entry->vme_start += overmap_start;
19127 }
19128 target_entry->vme_end += overmap_start;
19129
19130 object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
19131 if (object_offset_end & target_page_mask) {
19132 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19133 if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
19134 /*
19135 * end of last entry is mis-aligned: re-adjust by over-mapping.
19136 */
19137 overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
19138 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
19139 target_entry->vme_end += overmap_end;
19140 target_size += overmap_end;
19141 } else {
19142 misalignments++;
19143 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
19144 assert(copy);
19145 }
19146 }
19147 target_entry->vme_start -= addr_adjustment;
19148 target_entry->vme_end -= addr_adjustment;
19149 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19150 }
19151
19152 target_copy_map->size = target_size;
19153 target_copy_map->offset += overmap_start;
19154 target_copy_map->offset -= addr_adjustment;
19155 target_copy_map->cpy_hdr.page_shift = target_page_shift;
19156
19157 // assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
19158 // assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
19159 assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
19160 assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));
19161
19162 *target_copy_map_p = target_copy_map;
19163 *overmap_start_p = overmap_start;
19164 *overmap_end_p = overmap_end;
19165
19166 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19167 return KERN_SUCCESS;
19168 }
19169
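/*
 * Routine: vm_map_range_physical_size
 *
 * Compute, in "*phys_size", how much address space the range
 * [start, start + size) of "map" covers once rounded to the kernel's
 * native page size.  This only differs from rounding with the map's
 * own page mask when "map" uses sub-PAGE_SIZE pages (e.g. a 4K map on
 * a 16K kernel).
 */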
19170 kern_return_t
19171 vm_map_range_physical_size(
19172 vm_map_t map,
19173 vm_map_address_t start,
19174 mach_vm_size_t size,
19175 mach_vm_size_t * phys_size)
19176 {
19177 kern_return_t kr;
19178 vm_map_copy_t copy_map, target_copy_map;
19179 vm_map_offset_t adjusted_start, adjusted_end;
19180 vm_map_size_t adjusted_size;
19181 vm_prot_t cur_prot, max_prot;
19182 vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
19183 vm_map_kernel_flags_t vmk_flags;
19184
19185 if (size == 0) {
19186 DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
19187 *phys_size = 0;
19188 return KERN_SUCCESS;
19189 }
19190
19191 adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
19192 adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
19193 if (__improbable(os_add_overflow(start, size, &end) ||
19194 adjusted_end <= adjusted_start)) {
19195 /* wraparound */
19196 printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
19197 *phys_size = 0;
19198 return KERN_INVALID_ARGUMENT;
19199 }
19200 if (__improbable(vm_map_range_overflows(map, start, size))) {
19201 *phys_size = 0;
19202 return KERN_INVALID_ADDRESS;
19203 }
19204 assert(adjusted_end > adjusted_start);
19205 adjusted_size = adjusted_end - adjusted_start;
19206 *phys_size = adjusted_size;
19207 if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
19208 return KERN_SUCCESS;
19209 }
19210 if (start == 0) {
19211 adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
19212 adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
19213 if (__improbable(adjusted_end <= adjusted_start)) {
19214 /* wraparound */
19215 printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
19216 *phys_size = 0;
19217 return KERN_INVALID_ARGUMENT;
19218 }
19219 assert(adjusted_end > adjusted_start);
19220 adjusted_size = adjusted_end - adjusted_start;
19221 *phys_size = adjusted_size;
19222 return KERN_SUCCESS;
19223 }
19224
19225 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
19226 vmk_flags.vmkf_copy_pageable = TRUE;
19227 vmk_flags.vmkf_copy_same_map = TRUE;
19228 assert(adjusted_size != 0);
19229 cur_prot = VM_PROT_NONE; /* legacy mode */
19230 max_prot = VM_PROT_NONE; /* legacy mode */
19231 vmk_flags.vmkf_remap_legacy_mode = true;
19232 kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
19233 FALSE /* copy */,
19234 &copy_map,
19235 &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
19236 vmk_flags);
19237 if (kr != KERN_SUCCESS) {
19238 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19239 //assert(0);
19240 *phys_size = 0;
19241 return kr;
19242 }
19243 assert(copy_map != VM_MAP_COPY_NULL);
19244 target_copy_map = copy_map;
19245 DEBUG4K_ADJUST("adjusting...\n");
19246 kr = vm_map_copy_adjust_to_target(
19247 copy_map,
19248 start - adjusted_start, /* offset */
19249 size, /* size */
19250 kernel_map,
19251 FALSE, /* copy */
19252 &target_copy_map,
19253 &overmap_start,
19254 &overmap_end,
19255 &trimmed_start);
19256 if (kr == KERN_SUCCESS) {
19257 if (target_copy_map->size != *phys_size) {
19258 DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
19259 }
19260 *phys_size = target_copy_map->size;
19261 } else {
19262 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19263 //assert(0);
19264 *phys_size = 0;
19265 }
19266 vm_map_copy_discard(copy_map);
19267 copy_map = VM_MAP_COPY_NULL;
19268
19269 return kr;
19270 }
19271
19272 static __attribute__((always_inline, warn_unused_result))
19273 kern_return_t
19274 vm_map_remap_sanitize(
19275 vm_map_t src_map,
19276 vm_map_t target_map,
19277 vm_map_address_ut address_u,
19278 vm_map_size_ut size_u,
19279 vm_map_offset_ut mask_u,
19280 vm_map_offset_ut memory_address_u,
19281 vm_prot_ut cur_protection_u,
19282 vm_prot_ut max_protection_u,
19283 vm_inherit_ut inheritance_u,
19284 vm_map_kernel_flags_t vmk_flags,
19285 vm_map_address_t *target_addr,
19286 vm_map_address_t *mask,
19287 vm_map_offset_t *memory_address,
19288 vm_map_offset_t *memory_end,
19289 vm_map_size_t *memory_size,
19290 vm_prot_t *cur_protection,
19291 vm_prot_t *max_protection,
19292 vm_inherit_t *inheritance)
19293 {
19294 kern_return_t result;
19295 vm_sanitize_flags_t vm_sanitize_flags;
19296
19297 result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_VM_MAP_REMAP,
19298 inheritance);
19299 if (__improbable(result != KERN_SUCCESS)) {
19300 return result;
19301 }
19302
19303 result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
19304 VM_SANITIZE_CALLER_VM_MAP_REMAP, target_map,
19305 cur_protection, max_protection);
19306 if (__improbable(result != KERN_SUCCESS)) {
19307 return result;
19308 }
19309
19310 result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_VM_MAP_REMAP, mask);
19311 if (__improbable(result != KERN_SUCCESS)) {
19312 return result;
19313 }
19314
19315 /*
19316 * If the user is requesting that we return the address of the
19317 * first byte of the data (rather than the base of the page),
19318 * then we use different rounding semantics: specifically,
19319 * we assume that (memory_address, size) describes a region
19320 * all of whose pages we must cover, rather than a base to be truncated
19321 * down and a size to be added to that base. So we figure out
19322 * the highest page that the requested region includes and make
19323 * sure that the size will cover it.
19324 *
19325 * The key example we're worried about is of the form:
19326 *
19327 * memory_address = 0x1ff0, size = 0x20
19328 *
19329 * With the old semantics, we round down the memory_address to 0x1000
19330 * and round up the size to 0x1000, resulting in our covering *only*
19331 * page 0x1000. With the new semantics, we'd realize that the region covers
19332 * 0x1ff0-0x2010, and compute a size of 0x2000. Thus, we cover both page
19333 * 0x1000 and page 0x2000 in the region we remap.
19334 *
19335 * VM_SANITIZE_FLAGS_REALIGN_START asks for the old (broken) semantics.
19336 */
19337 vm_sanitize_flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS;
19338 if (!vmk_flags.vmf_return_data_addr) {
19339 vm_sanitize_flags |= VM_SANITIZE_FLAGS_REALIGN_START;
19340 }
19341
19342 result = vm_sanitize_addr_size(memory_address_u, size_u,
19343 VM_SANITIZE_CALLER_VM_MAP_REMAP, src_map,
19344 vm_sanitize_flags, memory_address, memory_end,
19345 memory_size);
19346 if (__improbable(result != KERN_SUCCESS)) {
19347 return result;
19348 }
19349
19350 *target_addr = vm_sanitize_addr(target_map, address_u);
19351 return KERN_SUCCESS;
19352 }
19353
19354 /*
19355 * Routine: vm_remap
19356 *
19357 * Map a portion of a task's address space.
19358 * The mapped region must not overlap more than
19359 * one vm memory object. Protections and
19360 * inheritance attributes remain the same
19361 * as in the original task and are out parameters.
19362 * Source and target tasks can be identical.
19363 * Other attributes are the same as for vm_map().
19364 */
19365 kern_return_t
19366 vm_map_remap(
19367 vm_map_t target_map,
19368 vm_map_address_ut *address_u,
19369 vm_map_size_ut size_u,
19370 vm_map_offset_ut mask_u,
19371 vm_map_kernel_flags_t vmk_flags,
19372 vm_map_t src_map,
19373 vm_map_offset_ut memory_address_u,
19374 boolean_t copy,
19375 vm_prot_ut *cur_protection_u, /* IN/OUT */
19376 vm_prot_ut *max_protection_u, /* IN/OUT */
19377 vm_inherit_ut inheritance_u)
19378 {
19379 vm_map_address_t target_addr, mask;
19380 vm_map_size_t target_size;
19381 vm_map_offset_t memory_address, memory_end;
19382 vm_map_size_t memory_size;
19383 vm_prot_t cur_protection, max_protection;
19384 vm_inherit_t inheritance;
19385 kern_return_t result;
19386 vm_map_entry_t insp_entry = VM_MAP_ENTRY_NULL;
19387 vm_map_copy_t copy_map;
19388 vm_map_offset_t offset_in_mapping;
19389 vm_map_size_t src_page_mask, target_page_mask;
19390 vm_map_size_t initial_size;
19391 VM_MAP_ZAP_DECLARE(zap_list);
19392
19393 if (target_map == VM_MAP_NULL || src_map == VM_MAP_NULL) {
19394 return KERN_INVALID_ARGUMENT;
19395 }
19396 src_page_mask = VM_MAP_PAGE_MASK(src_map);
19397 target_page_mask = VM_MAP_PAGE_MASK(target_map);
19398
19399 if (src_page_mask != target_page_mask) {
19400 if (copy) {
19401 DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19402 } else {
19403 DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19404 }
19405 }
19406
19407 /*
19408 * Sanitize any input parameters that are addr/size/prot/inherit
19409 */
19410 result = vm_map_remap_sanitize(src_map,
19411 target_map,
19412 *address_u,
19413 size_u,
19414 mask_u,
19415 memory_address_u,
19416 *cur_protection_u,
19417 *max_protection_u,
19418 inheritance_u,
19419 vmk_flags,
19420 &target_addr,
19421 &mask,
19422 &memory_address,
19423 &memory_end,
19424 &memory_size,
19425 &cur_protection,
19426 &max_protection,
19427 &inheritance);
19428 if (__improbable(result != KERN_SUCCESS)) {
19429 return vm_sanitize_get_kr(result);
19430 }
19431
19432 if (vmk_flags.vmf_return_data_addr) {
19433 /*
19434 * This is safe to unwrap now that the quantities
19435 * have been validated and rounded up normally.
19436 */
19437 offset_in_mapping = vm_sanitize_offset_in_page(src_map,
19438 memory_address_u);
19439 initial_size = VM_SANITIZE_UNSAFE_UNWRAP(size_u);
19440 } else {
19441 /*
19442 * IMPORTANT:
19443 * This legacy code path is broken: for the range mentioned
19444 * above [ memory_address = 0x1ff0, size = 0x20 ], which spans
19445 * two 4k pages, it yields [ memory_address = 0x1000,
19446 * size = 0x1000 ], which covers only the first 4k page.
19447 * BUT some code unfortunately depends on this bug, so we
19448 * can't fix it without breaking something.
19449 * New code automatically gets opted into the new
19450 * behavior via the new VM_FLAGS_RETURN_DATA_ADDR flag.
19451 */
19452 offset_in_mapping = 0;
19453 initial_size = memory_size;
19454 }
19455
19456 if (vmk_flags.vmf_resilient_media) {
19457 /* must be copy-on-write to be "media resilient" */
19458 if (!copy) {
19459 return KERN_INVALID_ARGUMENT;
19460 }
19461 }
19462
19463 vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
19464 vmk_flags.vmkf_copy_same_map = (src_map == target_map);
19465
19466 assert(memory_size != 0);
19467 result = vm_map_copy_extract(src_map,
19468 memory_address,
19469 memory_size,
19470 copy, &copy_map,
19471 &cur_protection, /* IN/OUT */
19472 &max_protection, /* IN/OUT */
19473 inheritance,
19474 vmk_flags);
19475 if (result != KERN_SUCCESS) {
19476 return result;
19477 }
19478 assert(copy_map != VM_MAP_COPY_NULL);
19479
19480 /*
19481 * Handle the policy for vm map ranges
19482 *
19483 * If the maps differ, the target_map policy applies, as for vm_map().
19484 * For same-map remaps, we preserve the range.
19485 */
19486 if (vmk_flags.vmkf_copy_same_map) {
19487 vmk_flags.vmkf_range_id = copy_map->orig_range;
19488 } else {
19489 vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, memory_size);
19490 }
19491
19492 target_size = memory_size;
19493 if (src_page_mask != target_page_mask) {
19494 vm_map_copy_t target_copy_map;
19495 vm_map_offset_t overmap_start = 0;
19496 vm_map_offset_t overmap_end = 0;
19497 vm_map_offset_t trimmed_start = 0;
19498
19499 target_copy_map = copy_map; /* can modify "copy_map" itself */
19500 DEBUG4K_ADJUST("adjusting...\n");
19501 result = vm_map_copy_adjust_to_target(
19502 copy_map,
19503 offset_in_mapping, /* offset */
19504 initial_size,
19505 target_map,
19506 copy,
19507 &target_copy_map,
19508 &overmap_start,
19509 &overmap_end,
19510 &trimmed_start);
19511 if (result != KERN_SUCCESS) {
19512 DEBUG4K_COPY("failed to adjust 0x%x\n", result);
19513 vm_map_copy_discard(copy_map);
19514 return result;
19515 }
19516 if (trimmed_start == 0) {
19517 /* nothing trimmed: no adjustment needed */
19518 } else if (trimmed_start >= offset_in_mapping) {
19519 /* trimmed more than offset_in_mapping: nothing left */
19520 assert(overmap_start == 0);
19521 assert(overmap_end == 0);
19522 offset_in_mapping = 0;
19523 } else {
19524 /* trimmed some of offset_in_mapping: adjust */
19525 assert(overmap_start == 0);
19526 assert(overmap_end == 0);
19527 offset_in_mapping -= trimmed_start;
19528 }
19529 offset_in_mapping += overmap_start;
19530 target_size = target_copy_map->size;
19531 }
19532
19533 /*
19534 * Allocate/check a range of free virtual address
19535 * space for the target
19536 */
19537 target_size = vm_map_round_page(target_size, target_page_mask);
19538
19539 if (target_size == 0) {
19540 vm_map_copy_discard(copy_map);
19541 return KERN_INVALID_ARGUMENT;
19542 }
19543
19544 if (__improbable(!vm_map_is_map_size_valid(
19545 target_map, target_size, vmk_flags.vmkf_no_soft_limit))) {
19546 vm_map_copy_discard(copy_map);
19547 return KERN_NO_SPACE;
19548 }
19549
19550 vm_map_lock(target_map);
19551
19552 if (!vmk_flags.vmf_fixed) {
19553 result = vm_map_locate_space_anywhere(target_map, target_size,
19554 mask, vmk_flags, &target_addr, &insp_entry);
19555 } else {
19556 /*
19557 * vm_map_locate_space_fixed will reject overflowing
19558 * target_addr + target_size values
19559 */
19560 result = vm_map_locate_space_fixed(target_map, target_addr,
19561 target_size, mask, vmk_flags, &insp_entry, &zap_list);
19562
19563 if (result == KERN_MEMORY_PRESENT) {
19564 assert(!vmk_flags.vmkf_already);
19565 insp_entry = VM_MAP_ENTRY_NULL;
19566 result = KERN_NO_SPACE;
19567 }
19568 }
19569
19570 if (result == KERN_SUCCESS) {
19571 while (vm_map_copy_first_entry(copy_map) !=
19572 vm_map_copy_to_entry(copy_map)) {
19573 vm_map_entry_t entry = vm_map_copy_first_entry(copy_map);
19574
19575 vm_map_copy_entry_unlink(copy_map, entry);
19576
19577 if (vmk_flags.vmkf_remap_prot_copy) {
19578 /*
19579 * This vm_map_remap() is for a
19580 * vm_protect(VM_PROT_COPY), so the caller
19581 * expects to be allowed to add write access
19582 * to this new mapping. This is done by
19583 * adding VM_PROT_WRITE to each entry's
19584 * max_protection... unless some security
19585 * settings disallow it.
19586 */
19587 bool allow_write = false;
19588 if (entry->vme_permanent) {
19589 /* immutable mapping... */
19590 if ((entry->max_protection & VM_PROT_EXECUTE) &&
19591 developer_mode_state()) {
19592 /*
19593 * ... but executable and
19594 * possibly being debugged,
19595 * so let's allow it to become
19596 * writable, for breakpoints
19597 * and dtrace probes, for
19598 * example.
19599 */
19600 allow_write = true;
19601 } else {
19602 printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
19603 proc_selfpid(),
19604 (get_bsdtask_info(current_task())
19605 ? proc_name_address(get_bsdtask_info(current_task()))
19606 : "?"),
19607 (uint64_t)memory_address,
19608 (uint64_t)memory_size,
19609 entry->protection,
19610 entry->max_protection,
19611 developer_mode_state());
19612 DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
19613 vm_map_entry_t, entry,
19614 vm_map_offset_t, entry->vme_start,
19615 vm_map_offset_t, entry->vme_end,
19616 vm_prot_t, entry->protection,
19617 vm_prot_t, entry->max_protection,
19618 int, VME_ALIAS(entry));
19619 }
19620 } else {
19621 allow_write = true;
19622 }
19623
19624 /*
19625 * VM_PROT_COPY: allow this mapping to become
19626 * writable, unless it was "permanent".
19627 */
19628 if (allow_write) {
19629 entry->max_protection |= VM_PROT_WRITE;
19630 }
19631 }
19632 if (vmk_flags.vmf_resilient_codesign) {
19633 /* no codesigning -> read-only access */
19634 entry->max_protection = VM_PROT_READ;
19635 entry->protection = VM_PROT_READ;
19636 entry->vme_resilient_codesign = TRUE;
19637 }
19638 entry->vme_start += target_addr;
19639 entry->vme_end += target_addr;
19640 assert(!entry->map_aligned);
19641 if (vmk_flags.vmf_resilient_media &&
19642 !entry->is_sub_map &&
19643 (VME_OBJECT(entry) == VM_OBJECT_NULL ||
19644 VME_OBJECT(entry)->internal)) {
19645 entry->vme_resilient_media = TRUE;
19646 }
19647 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
19648 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
19649 assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
19650 vm_map_store_entry_link(target_map, insp_entry, entry,
19651 vmk_flags);
19652 insp_entry = entry;
19653 }
19654 }
19655
19656 if (vmk_flags.vmf_resilient_codesign) {
19657 cur_protection = VM_PROT_READ;
19658 max_protection = VM_PROT_READ;
19659 }
19660
19661 if (result == KERN_SUCCESS) {
19662 target_map->size += target_size;
19663 SAVE_HINT_MAP_WRITE(target_map, insp_entry);
19664 }
19665 vm_map_unlock(target_map);
19666
19667 vm_map_zap_dispose(&zap_list);
19668
19669 if (result == KERN_SUCCESS && target_map->wiring_required) {
19670 result = vm_map_wire_nested(target_map, target_addr,
19671 target_addr + target_size, cur_protection, VM_KERN_MEMORY_MLOCK,
19672 TRUE, PMAP_NULL, 0, NULL);
19673 }
19674
19675 if (result == KERN_SUCCESS) {
19676 #if KASAN
19677 if (target_map->pmap == kernel_pmap) {
19678 kasan_notify_address(target_addr, target_size);
19679 }
19680 #endif
19681 /*
19682 * If requested, return the address of the data pointed to by the
19683 * request, rather than the base of the resulting page.
19684 */
19685 if (vmk_flags.vmf_return_data_addr) {
19686 target_addr += offset_in_mapping;
19687 }
19688
19689 /*
19690 * Update OUT parameters.
19691 */
19692 *address_u = vm_sanitize_wrap_addr(target_addr);
19693
19694 *cur_protection_u = vm_sanitize_wrap_prot(cur_protection);
19695 *max_protection_u = vm_sanitize_wrap_prot(max_protection);
19696 }
19697
19698 if (src_page_mask != target_page_mask) {
19699 DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)target_size, copy, target_map, (uint64_t)target_addr, (uint64_t)offset_in_mapping, result);
19700 }
19701 vm_map_copy_discard(copy_map);
19702 copy_map = VM_MAP_COPY_NULL;
19703
19704 return result;
19705 }
19706
19707 /*
19708 * vm_map_switch_to:
19709 *
19710 * Set the address map for the current thread to the specified map.
19711 * Returns a struct containing info about the previous map, which should be
19712 * restored with `vm_map_switch_back`
19713 */
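/*
 * Typical usage, as in vm_map_write_user() / vm_map_read_user() below:
 *
 *	vm_map_switch_context_t switch_ctx;
 *
 *	vm_map_reference(map);
 *	switch_ctx = vm_map_switch_to(map);
 *	... copyin() / copyout() against "map" ...
 *	vm_map_switch_back(switch_ctx);
 *	vm_map_deallocate(map);
 */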
19714
19715 vm_map_switch_context_t
19716 vm_map_switch_to(vm_map_t map)
19717 {
19718 thread_t thread = current_thread();
19719 vm_map_t oldmap = thread->map;
19720
19721 /*
19722 * Deactivate the current map and activate the requested map
19723 */
19724 mp_disable_preemption();
19725 PMAP_SWITCH_USER(thread, map, cpu_number());
19726 mp_enable_preemption();
19727
19728 vm_map_lock(map);
19729 task_t task = map->owning_task;
19730 if (task) {
19731 task_reference(task);
19732 }
19733 vm_map_unlock(map);
19734
19735 return (vm_map_switch_context_t) { oldmap, task };
19736 }
19737
19738 void
19739 vm_map_switch_back(vm_map_switch_context_t ctx)
19740 {
19741 thread_t thread = current_thread();
19742 task_t task = ctx.task;
19743 vm_map_t map = ctx.map;
19744
19745 if (task) {
19746 task_deallocate(task);
19747 } else {
19748 /*
19749 * We want to make sure that vm_map_setup was not called while the
19750 * map was switched. This allows us to guarantee the property that
19751 * we always have a reference on current_map()->owning_task if it is
19752 * not NULL.
19753 */
19754 assert(!thread->map->owning_task);
19755 }
19756
19757 /*
19758 * Restore the original map from prior to vm_map_switch_to
19759 */
19760 mp_disable_preemption();
19761 PMAP_SWITCH_USER(thread, map, cpu_number());
19762 mp_enable_preemption();
19763 }
19764
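/*
 * Common sanitization for vm_map_write_user() and vm_map_read_user():
 * validate the untrusted user address/size pair against "map", letting
 * a zero size fall through and keeping the unaligned values for the
 * actual copyin()/copyout().
 */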
19765 static __attribute__((always_inline, warn_unused_result))
19766 kern_return_t
19767 vm_map_rw_user_sanitize(
19768 vm_map_t map,
19769 vm_map_address_ut addr_u,
19770 vm_size_ut size_u,
19771 vm_sanitize_caller_t vm_sanitize_caller,
19772 vm_map_address_t *addr,
19773 vm_map_address_t *end,
19774 vm_map_size_t *size)
19775 {
19776 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
19777 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES |
19778 VM_SANITIZE_FLAGS_CHECK_ADDR_RANGE;
19779
19780 return vm_sanitize_addr_size(addr_u, size_u,
19781 vm_sanitize_caller, map,
19782 flags,
19783 addr, end, size);
19784 }
19785
19786 /*
19787 * Routine: vm_map_write_user
19788 *
19789 * Description:
19790 * Copy out data from a kernel space into space in the
19791 * destination map. The space must already exist in the
19792 * destination map.
19793 * NOTE: This routine should only be called by threads
19794 * which can block on a page fault, i.e. kernel-mode user
19795 * threads.
19796 *
19797 */
19798 kern_return_t
19799 vm_map_write_user(
19800 vm_map_t map,
19801 void *src_p,
19802 vm_map_address_ut dst_addr_u,
19803 vm_size_ut size_u)
19804 {
19805 kern_return_t kr;
19806 vm_map_address_t dst_addr, dst_end;
19807 vm_map_size_t size;
19808
19809 /*
19810 * src_p isn't validated: [src_p, src_p + size_u)
19811 * is trusted kernel input.
19812 *
19813 * dst_addr_u and size_u are untrusted and need to be sanitized.
19814 */
19815 kr = vm_map_rw_user_sanitize(map,
19816 dst_addr_u,
19817 size_u,
19818 VM_SANITIZE_CALLER_VM_MAP_WRITE_USER,
19819 &dst_addr,
19820 &dst_end,
19821 &size);
19822 if (__improbable(kr != KERN_SUCCESS)) {
19823 return vm_sanitize_get_kr(kr);
19824 }
19825
19826 if (current_map() == map) {
19827 if (copyout(src_p, dst_addr, size)) {
19828 kr = KERN_INVALID_ADDRESS;
19829 }
19830 } else {
19831 vm_map_switch_context_t switch_ctx;
19832
19833 /* take on the identity of the target map while doing */
19834 /* the transfer */
19835
19836 vm_map_reference(map);
19837 switch_ctx = vm_map_switch_to(map);
19838 if (copyout(src_p, dst_addr, size)) {
19839 kr = KERN_INVALID_ADDRESS;
19840 }
19841 vm_map_switch_back(switch_ctx);
19842 vm_map_deallocate(map);
19843 }
19844 return kr;
19845 }
19846
19847 /*
19848 * Routine: vm_map_read_user
19849 *
19850 * Description:
19851 * Copy in data from a user space source map into the
19852 * kernel map. The space must already exist in the
19853 * kernel map.
19854 * NOTE: This routine should only be called by threads
19855 * which can block on a page fault, i.e. kernel-mode user
19856 * threads.
19857 *
19858 */
19859 kern_return_t
19860 vm_map_read_user(
19861 vm_map_t map,
19862 vm_map_address_ut src_addr_u,
19863 void *dst_p,
19864 vm_size_ut size_u)
19865 {
19866 kern_return_t kr;
19867 vm_map_address_t src_addr, src_end;
19868 vm_map_size_t size;
19869
19870 /*
19871 * dst_p isn't validated: [dst_p, dst_p + size_u)
19872 * is trusted kernel input.
19873 *
19874 * src_addr_u and size_u are untrusted and need to be sanitized.
19875 */
19876 kr = vm_map_rw_user_sanitize(map,
19877 src_addr_u,
19878 size_u,
19879 VM_SANITIZE_CALLER_VM_MAP_READ_USER,
19880 &src_addr,
19881 &src_end,
19882 &size);
19883 if (__improbable(kr != KERN_SUCCESS)) {
19884 return vm_sanitize_get_kr(kr);
19885 }
19886
19887 if (current_map() == map) {
19888 if (copyin(src_addr, dst_p, size)) {
19889 kr = KERN_INVALID_ADDRESS;
19890 }
19891 } else {
19892 vm_map_switch_context_t switch_ctx;
19893
19894 /* take on the identity of the target map while doing */
19895 /* the transfer */
19896
19897 vm_map_reference(map);
19898 switch_ctx = vm_map_switch_to(map);
19899 if (copyin(src_addr, dst_p, size)) {
19900 kr = KERN_INVALID_ADDRESS;
19901 }
19902 vm_map_switch_back(switch_ctx);
19903 vm_map_deallocate(map);
19904 }
19905 return kr;
19906 }
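/*
 * Illustrative sketch (not compiled): copying a kernel structure into a
 * (possibly non-current) task's map with vm_map_write_user(), then reading
 * it back with vm_map_read_user().  The destination address and size are
 * passed along in their unsafe/untrusted ("_ut") form and are sanitized
 * inside the routines; the structure and variable names are placeholders,
 * and usize_u is assumed to describe sizeof(struct example_blob).
 */
#if 0
static kern_return_t
example_user_rw(vm_map_t user_map, vm_map_address_ut uaddr_u,
    vm_size_ut usize_u)
{
	struct example_blob { uint64_t a, b; } blob = { 1, 2 }, check = { 0, 0 };
	kern_return_t kr;

	/* write the kernel copy out into the user map */
	kr = vm_map_write_user(user_map, &blob, uaddr_u, usize_u);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	/* read it back into a scratch buffer */
	return vm_map_read_user(user_map, uaddr_u, &check, usize_u);
}
#endif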
19907
19908
19909 static __attribute__((always_inline, warn_unused_result))
19910 kern_return_t
19911 vm_map_check_protection_sanitize(
19912 vm_map_t map,
19913 vm_map_offset_ut start_u,
19914 vm_map_offset_ut end_u,
19915 vm_prot_ut protection_u,
19916 vm_sanitize_caller_t vm_sanitize_caller,
19917 vm_map_offset_t *start,
19918 vm_map_offset_t *end,
19919 vm_prot_t *protection)
19920 {
19921 kern_return_t kr;
19922 vm_map_size_t size;
19923
19924 kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
19925 VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start, end,
19926 &size);
19927 if (__improbable(kr != KERN_SUCCESS)) {
19928 return kr;
19929 }
19930
19931 /*
19932 * Given that the protection is used only for comparisons below,
19933 * no sanitization is applied to it.
19934 */
19935 *protection = VM_SANITIZE_UNSAFE_UNWRAP(protection_u);
19936
19937 return KERN_SUCCESS;
19938 }
19939
19940 /*
19941 * vm_map_check_protection:
19942 *
19943 * Assert that the target map allows the specified
19944 * privilege on the entire address region given.
19945 * The entire region must be allocated.
19946 */
19947 boolean_t
19948 vm_map_check_protection(
19949 vm_map_t map,
19950 vm_map_offset_ut start_u,
19951 vm_map_offset_ut end_u,
19952 vm_prot_ut protection_u,
19953 vm_sanitize_caller_t vm_sanitize_caller)
19954 {
19955 vm_map_entry_t entry;
19956 vm_map_entry_t tmp_entry;
19957 vm_map_offset_t start;
19958 vm_map_offset_t end;
19959 vm_prot_t protection;
19960 kern_return_t kr;
19961
19962 kr = vm_map_check_protection_sanitize(map,
19963 start_u,
19964 end_u,
19965 protection_u,
19966 vm_sanitize_caller,
19967 &start,
19968 &end,
19969 &protection);
19970 if (__improbable(kr != KERN_SUCCESS)) {
19971 kr = vm_sanitize_get_kr(kr);
19972 if (kr == KERN_SUCCESS) {
19973 return true;
19974 }
19975 return false;
19976 }
19977
19978 vm_map_lock(map);
19979
19980 if (start < vm_map_min(map) || end > vm_map_max(map)) {
19981 vm_map_unlock(map);
19982 return false;
19983 }
19984
19985 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19986 vm_map_unlock(map);
19987 return false;
19988 }
19989
19990 entry = tmp_entry;
19991
19992 while (start < end) {
19993 if (entry == vm_map_to_entry(map)) {
19994 vm_map_unlock(map);
19995 return false;
19996 }
19997
19998 /*
19999 * No holes allowed!
20000 */
20001
20002 if (start < entry->vme_start) {
20003 vm_map_unlock(map);
20004 return false;
20005 }
20006
20007 /*
20008 * Check protection associated with entry.
20009 */
20010
20011 if ((entry->protection & protection) != protection) {
20012 vm_map_unlock(map);
20013 return false;
20014 }
20015
20016 /* go to next entry */
20017
20018 start = entry->vme_end;
20019 entry = entry->vme_next;
20020 }
20021 vm_map_unlock(map);
20022 return true;
20023 }
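/*
 * Illustrative sketch (not compiled): using vm_map_check_protection() to
 * verify that an entire range is currently mapped (no holes) with at least
 * the requested access before operating on it.  The start/end/protection
 * values are shown in their unsafe ("_ut") form as a caller would receive
 * them; VM_SANITIZE_CALLER_EXAMPLE is a placeholder for the caller's real
 * identity constant.
 */
#if 0
static boolean_t
example_range_allows_prot(vm_map_t map,
    vm_map_offset_ut start_u, vm_map_offset_ut end_u, vm_prot_ut prot_u)
{
	/* TRUE only if every entry in [start, end) grants the protection */
	return vm_map_check_protection(map, start_u, end_u, prot_u,
	           VM_SANITIZE_CALLER_EXAMPLE);
}
#endif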
20024
20025 kern_return_t
20026 vm_map_purgable_control(
20027 vm_map_t map,
20028 vm_map_offset_ut address_u,
20029 vm_purgable_t control,
20030 int *state)
20031 {
20032 vm_map_offset_t address;
20033 vm_map_entry_t entry;
20034 vm_object_t object;
20035 kern_return_t kr;
20036 boolean_t was_nonvolatile;
20037
20038 /*
20039 * Vet all the input parameters and current type and state of the
20040 * underlying object. Return with an error if anything is amiss.
20041 */
20042 if (map == VM_MAP_NULL) {
20043 return KERN_INVALID_ARGUMENT;
20044 }
20045
20046 if (control != VM_PURGABLE_SET_STATE &&
20047 control != VM_PURGABLE_GET_STATE &&
20048 control != VM_PURGABLE_PURGE_ALL &&
20049 control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
20050 return KERN_INVALID_ARGUMENT;
20051 }
20052
20053 if (control == VM_PURGABLE_PURGE_ALL) {
20054 vm_purgeable_object_purge_all();
20055 return KERN_SUCCESS;
20056 }
20057
20058 if ((control == VM_PURGABLE_SET_STATE ||
20059 control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
20060 (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
20061 ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
20062 return KERN_INVALID_ARGUMENT;
20063 }
20064
20065 address = vm_sanitize_addr(map, address_u);
20066
20067 vm_map_lock_read(map);
20068
20069 if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
20070 /*
20071 * Must pass a valid non-submap address.
20072 */
20073 vm_map_unlock_read(map);
20074 return KERN_INVALID_ADDRESS;
20075 }
20076
20077 if ((entry->protection & VM_PROT_WRITE) == 0 &&
20078 control != VM_PURGABLE_GET_STATE) {
20079 /*
20080 * Can't apply purgable controls to something you can't write.
20081 */
20082 vm_map_unlock_read(map);
20083 return KERN_PROTECTION_FAILURE;
20084 }
20085
20086 object = VME_OBJECT(entry);
20087 if (object == VM_OBJECT_NULL ||
20088 object->purgable == VM_PURGABLE_DENY) {
20089 /*
20090 * Object must already be present and be purgeable.
20091 */
20092 vm_map_unlock_read(map);
20093 return KERN_INVALID_ARGUMENT;
20094 }
20095
20096 vm_object_lock(object);
20097
20098 #if 00
20099 if (VME_OFFSET(entry) != 0 ||
20100 entry->vme_end - entry->vme_start != object->vo_size) {
20101 /*
20102 * Can only apply purgable controls to the whole (existing)
20103 * object at once.
20104 */
20105 vm_map_unlock_read(map);
20106 vm_object_unlock(object);
20107 return KERN_INVALID_ARGUMENT;
20108 }
20109 #endif
20110
20111 assert(!entry->is_sub_map);
20112 assert(!entry->use_pmap); /* purgeable has its own accounting */
20113
20114 vm_map_unlock_read(map);
20115
20116 was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);
20117
20118 kr = vm_object_purgable_control(object, control, state);
20119
20120 if (was_nonvolatile &&
20121 object->purgable != VM_PURGABLE_NONVOLATILE &&
20122 map->pmap == kernel_pmap) {
20123 #if DEBUG
20124 object->vo_purgeable_volatilizer = kernel_task;
20125 #endif /* DEBUG */
20126 }
20127
20128 vm_object_unlock(object);
20129
20130 return kr;
20131 }
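/*
 * Illustrative sketch (not compiled): marking a purgeable region volatile
 * and later checking whether its contents were purged.  "address_u" is
 * assumed to fall within a mapping backed by a purgeable VM object (e.g.
 * one created with VM_FLAGS_PURGABLE); otherwise vm_map_purgable_control()
 * returns KERN_INVALID_ARGUMENT.
 */
#if 0
static void
example_purgable(vm_map_t map, vm_map_offset_ut address_u)
{
	int state;

	/* allow the system to reclaim these pages under memory pressure */
	state = VM_PURGABLE_VOLATILE;
	(void) vm_map_purgable_control(map, address_u,
	    VM_PURGABLE_SET_STATE, &state);

	/* ... later: find out whether the contents survived ... */
	state = 0;
	if (vm_map_purgable_control(map, address_u,
	    VM_PURGABLE_GET_STATE, &state) == KERN_SUCCESS &&
	    state == VM_PURGABLE_EMPTY) {
		/* contents were purged and must be regenerated */
	}
}
#endif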
20132
20133 void
20134 vm_map_footprint_query_page_info(
20135 vm_map_t map,
20136 vm_map_entry_t map_entry,
20137 vm_map_offset_t curr_s_offset,
20138 int *disposition_p)
20139 {
20140 int pmap_disp;
20141 vm_object_t object = VM_OBJECT_NULL;
20142 int disposition;
20143 int effective_page_size;
20144
20145 vm_map_lock_assert_held(map);
20146 assert(!map->has_corpse_footprint);
20147 assert(curr_s_offset >= map_entry->vme_start);
20148 assert(curr_s_offset < map_entry->vme_end);
20149
20150 if (map_entry->is_sub_map) {
20151 if (!map_entry->use_pmap) {
20152 /* nested pmap: no footprint */
20153 *disposition_p = 0;
20154 return;
20155 }
20156 } else {
20157 object = VME_OBJECT(map_entry);
20158 if (object == VM_OBJECT_NULL) {
20159 /* nothing mapped here: no need to ask */
20160 *disposition_p = 0;
20161 return;
20162 }
20163 }
20164
20165 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
20166
20167 pmap_disp = 0;
20168
20169 /*
20170 * Query the pmap.
20171 */
20172 pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);
20173
20174 /*
20175 * Compute this page's disposition.
20176 */
20177 disposition = 0;
20178
20179 /* deal with "alternate accounting" first */
20180 if (!map_entry->is_sub_map &&
20181 object->vo_no_footprint) {
20182 /* does not count in footprint */
20183 // assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20184 } else if (!map_entry->is_sub_map &&
20185 !object->internal &&
20186 object->vo_ledger_tag &&
20187 VM_OBJECT_OWNER(object) != NULL &&
20188 VM_OBJECT_OWNER(object)->map == map) {
20189 /* owned external object: wired pages count in footprint */
20190 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20191 if ((((curr_s_offset
20192 - map_entry->vme_start
20193 + VME_OFFSET(map_entry))
20194 / effective_page_size) <
20195 object->wired_page_count)) {
20196 /*
20197 * External object owned by this task: report the first
20198 * "#wired" pages as "resident" (to show that they
20199 * contribute to the footprint) but not "dirty"
20200 * (to avoid double-counting with the fake "owned"
20201 * region we'll report at the end of the address space
20202 * to account for all (mapped or not) memory
20203 * owned by this task).
20204 */
20205 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20206 }
20207 } else if (!map_entry->is_sub_map &&
20208 object->internal &&
20209 (object->purgable == VM_PURGABLE_NONVOLATILE ||
20210 (object->purgable == VM_PURGABLE_DENY &&
20211 object->vo_ledger_tag)) &&
20212 VM_OBJECT_OWNER(object) != NULL &&
20213 VM_OBJECT_OWNER(object)->map == map) {
20214 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20215 if ((((curr_s_offset
20216 - map_entry->vme_start
20217 + VME_OFFSET(map_entry))
20218 / effective_page_size) <
20219 (object->resident_page_count +
20220 vm_compressor_pager_get_count(object->pager)))) {
20221 /*
20222 * Non-volatile purgeable object owned
20223 * by this task: report the first
20224 * "#resident + #compressed" pages as
20225 * "resident" (to show that they
20226 * contribute to the footprint) but not
20227 * "dirty" (to avoid double-counting
20228 * with the fake "non-volatile" region
20229 * we'll report at the end of the
20230 * address space to account for all
20231 * (mapped or not) non-volatile memory
20232 * owned by this task).
20233 */
20234 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20235 }
20236 } else if (!map_entry->is_sub_map &&
20237 object->internal &&
20238 (object->purgable == VM_PURGABLE_VOLATILE ||
20239 object->purgable == VM_PURGABLE_EMPTY) &&
20240 VM_OBJECT_OWNER(object) != NULL &&
20241 VM_OBJECT_OWNER(object)->map == map) {
20242 if (object->internal) {
20243 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20244 }
20245 if ((((curr_s_offset
20246 - map_entry->vme_start
20247 + VME_OFFSET(map_entry))
20248 / effective_page_size) <
20249 object->wired_page_count)) {
20250 /*
20251 * Volatile|empty purgeable object owned
20252 * by this task: report the first
20253 * "#wired" pages as "resident" (to
20254 * show that they contribute to the
20255 * footprint) but not "dirty" (to avoid
20256 * double-counting with the fake
20257 * "non-volatile" region we'll report
20258 * at the end of the address space to
20259 * account for all (mapped or not)
20260 * non-volatile memory owned by this
20261 * task).
20262 */
20263 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20264 }
20265 } else if (!map_entry->is_sub_map &&
20266 map_entry->iokit_acct &&
20267 object->internal &&
20268 object->purgable == VM_PURGABLE_DENY) {
20269 /*
20270 * Non-purgeable IOKit memory: phys_footprint
20271 * includes the entire virtual mapping.
20272 */
20273 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20274 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20275 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20276 } else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
20277 PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
20278 /* alternate accounting */
20279 #if __arm64__ && (DEVELOPMENT || DEBUG)
20280 if (map->pmap->footprint_was_suspended) {
20281 /*
20282 * The assertion below can fail if dyld
20283 * suspended footprint accounting
20284 * while doing some adjustments to
20285 * this page; the mapping would say
20286 * "use pmap accounting" but the page
20287 * would be marked "alternate
20288 * accounting".
20289 */
20290 } else
20291 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
20292 {
20293 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20294 }
20295 disposition = 0;
20296 } else {
20297 if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
20298 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20299 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20300 disposition |= VM_PAGE_QUERY_PAGE_REF;
20301 if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
20302 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20303 } else {
20304 disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20305 }
20306 if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
20307 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20308 }
20309 } else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
20310 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20311 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20312 }
20313 }
20314
20315 *disposition_p = disposition;
20316 }
20317
20318 kern_return_t
20319 vm_map_page_info(
20320 vm_map_t map,
20321 vm_map_offset_ut offset_u,
20322 vm_page_info_flavor_t flavor,
20323 vm_page_info_t info,
20324 mach_msg_type_number_t *count)
20325 {
20326 return vm_map_page_range_info_internal(map,
20327 offset_u, /* start of range */
20328 vm_sanitize_compute_ut_end(offset_u, 1), /* this will get rounded in the call to the page boundary */
20329 (int)-1, /* effective_page_shift: unspecified */
20330 flavor,
20331 info,
20332 count);
20333 }
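/*
 * Illustrative sketch (not compiled): querying the disposition of a single
 * page with vm_map_page_info() using the VM_PAGE_INFO_BASIC flavor, then
 * testing the returned disposition bits.  The wrapper name is a placeholder.
 */
#if 0
static bool
example_page_is_resident(vm_map_t map, vm_map_offset_ut offset_u)
{
	struct vm_page_info_basic info;
	mach_msg_type_number_t count = VM_PAGE_INFO_BASIC_COUNT;

	if (vm_map_page_info(map, offset_u, VM_PAGE_INFO_BASIC,
	    (vm_page_info_t)&info, &count) != KERN_SUCCESS) {
		return false;
	}
	return (info.disposition & VM_PAGE_QUERY_PAGE_PRESENT) != 0;
}
#endif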
20334
20335 static __attribute__((always_inline, warn_unused_result))
20336 kern_return_t
20337 vm_map_page_range_info_sanitize(
20338 vm_map_t map,
20339 vm_map_offset_ut start_offset_u,
20340 vm_map_offset_ut end_offset_u,
20341 vm_map_offset_t effective_page_mask,
20342 vm_map_offset_t *start,
20343 vm_map_offset_t *end,
20344 vm_map_offset_t *offset_in_page)
20345 {
20346 kern_return_t retval;
20347 vm_map_size_t size;
20348
20349 /*
20350 * Perform validation against map's mask but don't align start/end,
20351 * as we need those to be aligned wrt effective_page_mask
20352 */
20353 retval = vm_sanitize_addr_end(start_offset_u, end_offset_u,
20354 VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, map,
20355 VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
20356 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES, start,
20357 end, &size);
20358 if (retval != KERN_SUCCESS) {
20359 return retval;
20360 }
20361
20362 retval = vm_sanitize_addr_end(start_offset_u, end_offset_u,
20363 VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, effective_page_mask,
20364 VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start,
20365 end, &size);
20366 if (retval != KERN_SUCCESS) {
20367 return retval;
20368 }
20369
20370 *offset_in_page = vm_sanitize_offset_in_page(effective_page_mask,
20371 start_offset_u);
20372
20373 return KERN_SUCCESS;
20374 }
20375
20376 kern_return_t
20377 vm_map_page_range_info_internal(
20378 vm_map_t map,
20379 vm_map_offset_ut start_offset_u,
20380 vm_map_offset_ut end_offset_u,
20381 int effective_page_shift,
20382 vm_page_info_flavor_t flavor,
20383 vm_page_info_t info,
20384 mach_msg_type_number_t *count)
20385 {
20386 vm_map_entry_t map_entry = VM_MAP_ENTRY_NULL;
20387 vm_object_t object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
20388 vm_page_t m = VM_PAGE_NULL;
20389 kern_return_t retval = KERN_SUCCESS;
20390 int disposition = 0;
20391 int ref_count = 0;
20392 int depth = 0, info_idx = 0;
20393 vm_page_info_basic_t basic_info = 0;
20394 vm_map_offset_t offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
20395 vm_map_offset_t start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
20396 boolean_t do_region_footprint;
20397 ledger_amount_t ledger_resident, ledger_compressed;
20398 int effective_page_size;
20399 vm_map_offset_t effective_page_mask;
20400
20401 switch (flavor) {
20402 case VM_PAGE_INFO_BASIC:
20403 if (*count != VM_PAGE_INFO_BASIC_COUNT) {
20404 /*
20405 * The "vm_page_info_basic_data" structure was not
20406 * properly padded, so allow the size to be off by
20407 * one to maintain backwards binary compatibility...
20408 */
20409 if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
20410 return KERN_INVALID_ARGUMENT;
20411 }
20412 }
20413 break;
20414 default:
20415 return KERN_INVALID_ARGUMENT;
20416 }
20417
20418 if (effective_page_shift == -1) {
20419 effective_page_shift = vm_self_region_page_shift_safely(map);
20420 if (effective_page_shift == -1) {
20421 return KERN_INVALID_ARGUMENT;
20422 }
20423 }
20424 effective_page_size = (1 << effective_page_shift);
20425 effective_page_mask = effective_page_size - 1;
20426
20427
20428 retval = vm_map_page_range_info_sanitize(map,
20429 start_offset_u,
20430 end_offset_u,
20431 effective_page_mask,
20432 &start,
20433 &end,
20434 &offset_in_page);
20435 if (retval != KERN_SUCCESS) {
20436 return vm_sanitize_get_kr(retval);
20437 }
20438
20439 assert((end - start) <= MAX_PAGE_RANGE_QUERY);
20440
20441 do_region_footprint = task_self_region_footprint();
20442 disposition = 0;
20443 ref_count = 0;
20444 depth = 0;
20445 info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
20446
20447 vm_map_lock_read(map);
20448
20449
20450 task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
20451
20452 for (curr_s_offset = start; curr_s_offset < end;) {
20453 /*
20454 * New lookup needs reset of these variables.
20455 */
20456 curr_object = object = VM_OBJECT_NULL;
20457 offset_in_object = 0;
20458 ref_count = 0;
20459 depth = 0;
20460
20461 if (do_region_footprint &&
20462 curr_s_offset >= vm_map_last_entry(map)->vme_end) {
20463 /*
20464 * Request for "footprint" info about a page beyond
20465 * the end of address space: this must be for
20466 * the fake region vm_map_region_recurse_64()
20467 * reported to account for non-volatile purgeable
20468 * memory owned by this task.
20469 */
20470 disposition = 0;
20471
20472 if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
20473 (unsigned) ledger_compressed) {
20474 /*
20475 * We haven't reported all the "non-volatile
20476 * compressed" pages yet, so report this fake
20477 * page as "compressed".
20478 */
20479 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20480 } else {
20481 /*
20482 * We've reported all the non-volatile
20483 * compressed pages but not all the non-volatile
20484 * pages, so report this fake page as
20485 * "resident dirty".
20486 */
20487 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20488 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20489 disposition |= VM_PAGE_QUERY_PAGE_REF;
20490 }
20491 switch (flavor) {
20492 case VM_PAGE_INFO_BASIC:
20493 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20494 basic_info->disposition = disposition;
20495 basic_info->ref_count = 1;
20496 basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20497 basic_info->offset = 0;
20498 basic_info->depth = 0;
20499
20500 info_idx++;
20501 break;
20502 }
20503 curr_s_offset += effective_page_size;
20504 continue;
20505 }
20506
20507 /*
20508 * First, find the map entry covering "curr_s_offset", going down
20509 * submaps if necessary.
20510 */
20511 if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
20512 /* no entry -> no object -> no page */
20513
20514 if (curr_s_offset < vm_map_min(map)) {
20515 /*
20516 * Illegal address that falls below map min.
20517 */
20518 curr_e_offset = MIN(end, vm_map_min(map));
20519 } else if (curr_s_offset >= vm_map_max(map)) {
20520 /*
20521 * Illegal address that falls on/after map max.
20522 */
20523 curr_e_offset = end;
20524 } else if (map_entry == vm_map_to_entry(map)) {
20525 /*
20526 * Hit a hole.
20527 */
20528 if (map_entry->vme_next == vm_map_to_entry(map)) {
20529 /*
20530 * Empty map.
20531 */
20532 curr_e_offset = MIN(map->max_offset, end);
20533 } else {
20534 /*
20535 * Hole at start of the map.
20536 */
20537 curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20538 }
20539 } else {
20540 if (map_entry->vme_next == vm_map_to_entry(map)) {
20541 /*
20542 * Hole at the end of the map.
20543 */
20544 curr_e_offset = MIN(map->max_offset, end);
20545 } else {
20546 curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20547 }
20548 }
20549
20550 assert(curr_e_offset >= curr_s_offset);
20551
20552 uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20553
20554 void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20555
20556 bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20557
20558 curr_s_offset = curr_e_offset;
20559
20560 info_idx += num_pages;
20561
20562 continue;
20563 }
20564
20565 /* compute offset from this map entry's start */
20566 offset_in_object = curr_s_offset - map_entry->vme_start;
20567
20568 /* compute offset into this map entry's object (or submap) */
20569 offset_in_object += VME_OFFSET(map_entry);
20570
20571 if (map_entry->is_sub_map) {
20572 vm_map_t sub_map = VM_MAP_NULL;
20573 vm_page_info_t submap_info = 0;
20574 vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
20575
20576 range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
20577
20578 submap_s_offset = offset_in_object;
20579 submap_e_offset = submap_s_offset + range_len;
20580
20581 sub_map = VME_SUBMAP(map_entry);
20582
20583 vm_map_reference(sub_map);
20584 vm_map_unlock_read(map);
20585
20586 submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20587
20588 assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
20589 "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
20590
20591 retval = vm_map_page_range_info_internal(sub_map,
20592 submap_s_offset,
20593 submap_e_offset,
20594 effective_page_shift,
20595 VM_PAGE_INFO_BASIC,
20596 (vm_page_info_t) submap_info,
20597 count);
20598
20599 assert(retval == KERN_SUCCESS);
20600
20601 vm_map_deallocate(sub_map);
20602 sub_map = VM_MAP_NULL;
20603 vm_map_lock_read(map);
20604
20605 /* Move the "info" index by the number of pages we inspected.*/
20606 info_idx += range_len >> effective_page_shift;
20607
20608 /* Move our current offset by the size of the range we inspected.*/
20609 curr_s_offset += range_len;
20610
20611 continue;
20612 }
20613
20614 object = VME_OBJECT(map_entry);
20615
20616 if (object == VM_OBJECT_NULL) {
20617 /*
20618 * We don't have an object here and, hence,
20619 * no pages to inspect. We'll fill up the
20620 * info structure appropriately.
20621 */
20622
20623 curr_e_offset = MIN(map_entry->vme_end, end);
20624
20625 uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20626
20627 void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20628
20629 bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20630
20631 curr_s_offset = curr_e_offset;
20632
20633 info_idx += num_pages;
20634
20635 continue;
20636 }
20637
20638 if (do_region_footprint) {
20639 disposition = 0;
20640 if (map->has_corpse_footprint) {
20641 /*
20642 * Query the page info data we saved
20643 * while forking the corpse.
20644 */
20645 vm_map_corpse_footprint_query_page_info(
20646 map,
20647 curr_s_offset,
20648 &disposition);
20649 } else {
20650 /*
20651 * Query the live pmap for footprint info
20652 * about this page.
20653 */
20654 vm_map_footprint_query_page_info(
20655 map,
20656 map_entry,
20657 curr_s_offset,
20658 &disposition);
20659 }
20660 switch (flavor) {
20661 case VM_PAGE_INFO_BASIC:
20662 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20663 basic_info->disposition = disposition;
20664 basic_info->ref_count = 1;
20665 basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20666 basic_info->offset = 0;
20667 basic_info->depth = 0;
20668
20669 info_idx++;
20670 break;
20671 }
20672 curr_s_offset += effective_page_size;
20673 continue;
20674 }
20675
20676 vm_object_reference(object);
20677 /*
20678 * Shared mode -- so we can allow other readers
20679 * to grab the lock too.
20680 */
20681 vm_object_lock_shared(object);
20682
20683 curr_e_offset = MIN(map_entry->vme_end, end);
20684
20685 vm_map_unlock_read(map);
20686
20687 map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
20688
20689 curr_object = object;
20690
20691 for (; curr_s_offset < curr_e_offset;) {
20692 if (object == curr_object) {
20693 /* account for our object reference above. */
20694 ref_count = os_ref_get_count_raw(&curr_object->ref_count) - 1;
20695 } else {
20696 ref_count = os_ref_get_count_raw(&curr_object->ref_count);
20697 }
20698
20699 curr_offset_in_object = offset_in_object;
20700
20701 for (;;) {
20702 m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
20703
20704 if (m != VM_PAGE_NULL) {
20705 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20706 break;
20707 } else {
20708 if (curr_object->internal &&
20709 curr_object->alive &&
20710 !curr_object->terminating &&
20711 curr_object->pager_ready) {
20712 if (vm_object_compressor_pager_state_get(curr_object, vm_object_trunc_page(curr_offset_in_object))
20713 == VM_EXTERNAL_STATE_EXISTS) {
20714 /* the pager has that page */
20715 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20716 break;
20717 }
20718 }
20719
20720 /*
20721 * Go down the VM object shadow chain until we find the page
20722 * we're looking for.
20723 */
20724
20725 if (curr_object->shadow != VM_OBJECT_NULL) {
20726 vm_object_t shadow = VM_OBJECT_NULL;
20727
20728 curr_offset_in_object += curr_object->vo_shadow_offset;
20729 shadow = curr_object->shadow;
20730
20731 vm_object_lock_shared(shadow);
20732 vm_object_unlock(curr_object);
20733
20734 curr_object = shadow;
20735 depth++;
20736 continue;
20737 } else {
20738 break;
20739 }
20740 }
20741 }
20742
20743 /* The ref_count is not strictly accurate: it measures the number */
20744 /* of entities holding a ref on the object; they may not be mapping */
20745 /* the object, or may not be mapping the section holding the */
20746 /* target page, but it's still a ballpark number and, though an */
20747 /* over-count, it picks up the copy-on-write cases. */
20748
20749 /* We could also get a picture of page sharing from pmap_attributes */
20750 /* but this would under count as only faulted-in mappings would */
20751 /* show up. */
20752
20753 if ((curr_object == object) && curr_object->shadow) {
20754 disposition |= VM_PAGE_QUERY_PAGE_COPIED;
20755 }
20756
20757 if (!curr_object->internal) {
20758 disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20759 }
20760
20761 if (m != VM_PAGE_NULL) {
20762 if (vm_page_is_fictitious(m)) {
20763 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
20764 } else {
20765 if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
20766 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20767 }
20768
20769 if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
20770 disposition |= VM_PAGE_QUERY_PAGE_REF;
20771 }
20772
20773 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
20774 disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
20775 }
20776
20777 /*
20778 * XXX TODO4K:
20779 * when this routine deals with 4k
20780 * pages, check the appropriate CS bit
20781 * here.
20782 */
20783 if (m->vmp_cs_validated) {
20784 disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
20785 }
20786 if (m->vmp_cs_tainted) {
20787 disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
20788 }
20789 if (m->vmp_cs_nx) {
20790 disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
20791 }
20792 if (m->vmp_reusable || curr_object->all_reusable) {
20793 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20794 }
20795 }
20796 }
20797
20798 switch (flavor) {
20799 case VM_PAGE_INFO_BASIC:
20800 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20801 basic_info->disposition = disposition;
20802 basic_info->ref_count = ref_count;
20803 basic_info->object_id = (vm_object_id_t) (uintptr_t)
20804 VM_KERNEL_ADDRHASH(curr_object);
20805 basic_info->offset =
20806 (memory_object_offset_t) curr_offset_in_object + offset_in_page;
20807 basic_info->depth = depth;
20808
20809 info_idx++;
20810 break;
20811 }
20812
20813 disposition = 0;
20814 offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
20815
20816 /*
20817 * Move to next offset in the range and in our object.
20818 */
20819 curr_s_offset += effective_page_size;
20820 offset_in_object += effective_page_size;
20821 curr_offset_in_object = offset_in_object;
20822
20823 if (curr_object != object) {
20824 vm_object_unlock(curr_object);
20825
20826 curr_object = object;
20827
20828 vm_object_lock_shared(curr_object);
20829 } else {
20830 vm_object_lock_yield_shared(curr_object);
20831 }
20832 }
20833
20834 vm_object_unlock(curr_object);
20835 vm_object_deallocate(curr_object);
20836
20837 vm_map_lock_read(map);
20838 }
20839
20840 vm_map_unlock_read(map);
20841 return retval;
20842 }
20843
20844 static __attribute__((always_inline, warn_unused_result))
20845 kern_return_t
20846 vm_map_msync_sanitize(
20847 vm_map_t map,
20848 vm_map_address_ut address_u,
20849 vm_map_size_ut size_u,
20850 vm_object_offset_t *address,
20851 vm_map_size_t *size)
20852 {
20853 vm_object_offset_t end;
20854
20855 return vm_sanitize_addr_size(address_u, size_u,
20856 VM_SANITIZE_CALLER_VM_MAP_MSYNC,
20857 map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS,
20858 address, &end, size);
20859 }
20860
20861 /*
20862 * vm_map_msync
20863 *
20864 * Synchronises the specified memory range with its backing store
20865 * image by either flushing or cleaning the contents to the appropriate
20866 * memory manager, engaging in a memory object synchronize dialog with
20867 * the manager. The client doesn't return until the manager issues an
20868 * m_o_s_completed message. MIG magically converts the user task parameter
20869 * to the task's address map.
20870 *
20871 * interpretation of sync_flags
20872 * VM_SYNC_INVALIDATE - discard pages, only return precious
20873 * pages to manager.
20874 *
20875 * VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
20876 * - discard pages, write dirty or precious
20877 * pages back to memory manager.
20878 *
20879 * VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
20880 * - write dirty or precious pages back to
20881 * the memory manager.
20882 *
20883 * VM_SYNC_CONTIGUOUS - does everything normally, but if there
20884 * is a hole in the region, and we would
20885 * have returned KERN_SUCCESS, return
20886 * KERN_INVALID_ADDRESS instead.
20887 *
20888 * NOTE
20889 * The memory object attributes have not yet been implemented, this
20890 * function will have to deal with the invalidate attribute
20891 *
20892 * RETURNS
20893 * KERN_INVALID_TASK Bad task parameter
20894 * KERN_INVALID_ARGUMENT both sync and async were specified.
20895 * KERN_SUCCESS The usual.
20896 * KERN_INVALID_ADDRESS There was a hole in the region.
20897 */
20898
20899 kern_return_t
20900 vm_map_msync(
20901 vm_map_t map,
20902 vm_map_address_ut address_u,
20903 vm_map_size_ut size_u,
20904 vm_sync_t sync_flags)
20905 {
20906 vm_map_entry_t entry;
20907 vm_map_size_t size, amount_left;
20908 vm_object_offset_t address, offset;
20909 vm_object_offset_t start_offset, end_offset;
20910 boolean_t do_sync_req;
20911 boolean_t had_hole = FALSE;
20912 vm_map_offset_t pmap_offset;
20913 kern_return_t kr;
20914
20915 if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
20916 (sync_flags & VM_SYNC_SYNCHRONOUS)) {
20917 return KERN_INVALID_ARGUMENT;
20918 }
20919
20920 if (map == VM_MAP_NULL) {
20921 return KERN_INVALID_TASK;
20922 }
20923
20924 kr = vm_map_msync_sanitize(map,
20925 address_u,
20926 size_u,
20927 &address,
20928 &size);
20929 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20930 DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
20931 }
20932 if (__improbable(kr != KERN_SUCCESS)) {
20933 return vm_sanitize_get_kr(kr);
20934 }
20935
20936 amount_left = size;
20937
20938 while (amount_left > 0) {
20939 vm_object_size_t flush_size;
20940 vm_object_t object;
20941
20942 vm_map_lock(map);
20943 if (!vm_map_lookup_entry(map,
20944 address,
20945 &entry)) {
20946 vm_map_size_t skip;
20947
20948 /*
20949 * hole in the address map.
20950 */
20951 had_hole = TRUE;
20952
20953 if (sync_flags & VM_SYNC_KILLPAGES) {
20954 /*
20955 * For VM_SYNC_KILLPAGES, there should be
20956 * no holes in the range, since we couldn't
20957 * prevent someone else from allocating in
20958 * that hole and we wouldn't want to "kill"
20959 * their pages.
20960 */
20961 vm_map_unlock(map);
20962 break;
20963 }
20964
20965 /*
20966 * Check for empty map.
20967 */
20968 if (entry == vm_map_to_entry(map) &&
20969 entry->vme_next == entry) {
20970 vm_map_unlock(map);
20971 break;
20972 }
20973 /*
20974 * Check that we don't wrap and that
20975 * we have at least one real map entry.
20976 */
20977 if ((map->hdr.nentries == 0) ||
20978 (entry->vme_next->vme_start < address)) {
20979 vm_map_unlock(map);
20980 break;
20981 }
20982 /*
20983 * Move up to the next entry if needed
20984 */
20985 skip = (entry->vme_next->vme_start - address);
20986 if (skip >= amount_left) {
20987 amount_left = 0;
20988 } else {
20989 amount_left -= skip;
20990 }
20991 address = entry->vme_next->vme_start;
20992 vm_map_unlock(map);
20993 continue;
20994 }
20995
20996 offset = address - entry->vme_start;
20997 pmap_offset = address;
20998
20999 /*
21000 * do we have more to flush than is contained in this
21001 * entry ?
21002 */
21003 if (amount_left + entry->vme_start + offset > entry->vme_end) {
21004 flush_size = entry->vme_end -
21005 (entry->vme_start + offset);
21006 } else {
21007 flush_size = amount_left;
21008 }
21009 amount_left -= flush_size;
21010 address += flush_size;
21011
21012 if (entry->is_sub_map == TRUE) {
21013 vm_map_t local_map;
21014 vm_map_offset_t local_offset;
21015
21016 local_map = VME_SUBMAP(entry);
21017 local_offset = VME_OFFSET(entry);
21018 vm_map_reference(local_map);
21019 vm_map_unlock(map);
21020 if (vm_map_msync(
21021 local_map,
21022 local_offset,
21023 flush_size,
21024 sync_flags) == KERN_INVALID_ADDRESS) {
21025 had_hole = TRUE;
21026 }
21027 vm_map_deallocate(local_map);
21028 local_map = VM_MAP_NULL;
21029 continue;
21030 }
21031 object = VME_OBJECT(entry);
21032
21033 /*
21034 * We can't sync this object if the object has not been
21035 * created yet
21036 */
21037 if (object == VM_OBJECT_NULL) {
21038 vm_map_unlock(map);
21039 continue;
21040 }
21041 offset += VME_OFFSET(entry);
21042
21043 vm_object_lock(object);
21044
21045 if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
21046 int kill_pages = 0;
21047
21048 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
21049 /*
21050 * This is a destructive operation and so we
21051 * err on the side of limiting the range of
21052 * the operation.
21053 */
21054 start_offset = vm_object_round_page(offset);
21055 end_offset = vm_object_trunc_page(offset + flush_size);
21056
21057 if (end_offset <= start_offset) {
21058 vm_object_unlock(object);
21059 vm_map_unlock(map);
21060 continue;
21061 }
21062
21063 pmap_offset += start_offset - offset;
21064 } else {
21065 start_offset = offset;
21066 end_offset = offset + flush_size;
21067 }
21068
21069 if (sync_flags & VM_SYNC_KILLPAGES) {
21070 if (((os_ref_get_count_raw(&object->ref_count) == 1) ||
21071 ((object->copy_strategy !=
21072 MEMORY_OBJECT_COPY_SYMMETRIC) &&
21073 (object->vo_copy == VM_OBJECT_NULL))) &&
21074 (object->shadow == VM_OBJECT_NULL)) {
21075 if (os_ref_get_count_raw(&object->ref_count) != 1) {
21076 vm_page_stats_reusable.free_shared++;
21077 }
21078 kill_pages = 1;
21079 } else {
21080 kill_pages = -1;
21081 }
21082 }
21083 if (kill_pages != -1) {
21084 boolean_t kill_no_write = FALSE;
21085
21086 if ((entry->protection & VM_PROT_EXECUTE) ||
21087 entry->vme_xnu_user_debug) {
21088 /*
21089 * Executable or user debug pages might be write-protected by
21090 * hardware, so do not attempt to write to these pages.
21091 */
21092 kill_no_write = TRUE;
21093 }
21094 vm_object_deactivate_pages(
21095 object,
21096 start_offset,
21097 (vm_object_size_t) (end_offset - start_offset),
21098 kill_pages,
21099 FALSE, /* reusable_pages */
21100 kill_no_write,
21101 map->pmap,
21102 pmap_offset);
21103 }
21104 vm_object_unlock(object);
21105 vm_map_unlock(map);
21106 continue;
21107 }
21108 /*
21109 * We can't sync this object if there isn't a pager.
21110 * Don't bother to sync internal objects, since there can't
21111 * be any "permanent" storage for these objects anyway.
21112 */
21113 if ((object->pager == MEMORY_OBJECT_NULL) ||
21114 (object->internal) || (object->private)) {
21115 vm_object_unlock(object);
21116 vm_map_unlock(map);
21117 continue;
21118 }
21119 /*
21120 * keep reference on the object until syncing is done
21121 */
21122 vm_object_reference_locked(object);
21123 vm_object_unlock(object);
21124
21125 vm_map_unlock(map);
21126
21127 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
21128 start_offset = vm_object_trunc_page(offset);
21129 end_offset = vm_object_round_page(offset + flush_size);
21130 } else {
21131 start_offset = offset;
21132 end_offset = offset + flush_size;
21133 }
21134
21135 do_sync_req = vm_object_sync(object,
21136 start_offset,
21137 (end_offset - start_offset),
21138 sync_flags & VM_SYNC_INVALIDATE,
21139 ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
21140 (sync_flags & VM_SYNC_ASYNCHRONOUS)),
21141 sync_flags & VM_SYNC_SYNCHRONOUS);
21142
21143 if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
21144 /*
21145 * clear out the clustering and read-ahead hints
21146 */
21147 vm_object_lock(object);
21148
21149 object->pages_created = 0;
21150 object->pages_used = 0;
21151 object->sequential = 0;
21152 object->last_alloc = 0;
21153
21154 vm_object_unlock(object);
21155 }
21156 vm_object_deallocate(object);
21157 } /* while */
21158
21159 /* for proper msync() behaviour */
21160 if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
21161 return KERN_INVALID_ADDRESS;
21162 }
21163
21164 return KERN_SUCCESS;
21165 }/* vm_msync */
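/*
 * Illustrative sketch (not compiled): flushing a range back to its pager
 * with vm_map_msync(), the kernel-side analogue of msync(2).
 * VM_SYNC_SYNCHRONOUS waits for dirty pages to be written back;
 * VM_SYNC_CONTIGUOUS additionally turns a hole in the range into
 * KERN_INVALID_ADDRESS.  The address and size arrive in their unsafe
 * ("_ut") form and are sanitized inside vm_map_msync(); the wrapper name
 * is a placeholder.
 */
#if 0
static kern_return_t
example_flush_range(vm_map_t map, vm_map_address_ut addr_u,
    vm_map_size_ut size_u)
{
	return vm_map_msync(map, addr_u, size_u,
	           VM_SYNC_SYNCHRONOUS | VM_SYNC_CONTIGUOUS);
}
#endif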
21166
21167 void
21168 vm_named_entry_associate_vm_object(
21169 vm_named_entry_t named_entry,
21170 vm_object_t object,
21171 vm_object_offset_t offset,
21172 vm_object_size_t size,
21173 vm_prot_t prot)
21174 {
21175 vm_map_copy_t copy;
21176 vm_map_entry_t copy_entry;
21177
21178 assert(!named_entry->is_sub_map);
21179 assert(!named_entry->is_copy);
21180 assert(!named_entry->is_object);
21181 assert(!named_entry->internal);
21182 assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
21183
21184 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
21185 copy->offset = offset;
21186 copy->size = size;
21187 copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
21188
21189 copy_entry = vm_map_copy_entry_create(copy);
21190 copy_entry->protection = prot;
21191 copy_entry->max_protection = prot;
21192 copy_entry->use_pmap = TRUE;
21193 copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
21194 copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
21195 VME_OBJECT_SET(copy_entry, object, false, 0);
21196 VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
21197 vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
21198
21199 named_entry->backing.copy = copy;
21200 named_entry->is_object = TRUE;
21201 if (object->internal) {
21202 named_entry->internal = TRUE;
21203 }
21204
21205 DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
21206 named_entry, copy, object, offset, size, prot);
21207 }
21208
21209 vm_object_t
21210 vm_named_entry_to_vm_object(
21211 vm_named_entry_t named_entry)
21212 {
21213 vm_map_copy_t copy;
21214 vm_map_entry_t copy_entry;
21215 vm_object_t object;
21216
21217 assert(!named_entry->is_sub_map);
21218 assert(!named_entry->is_copy);
21219 assert(named_entry->is_object);
21220 copy = named_entry->backing.copy;
21221 assert(copy != VM_MAP_COPY_NULL);
21222 /*
21223 * Assert that the vm_map_copy is coming from the right
21224 * zone and hasn't been forged
21225 */
21226 vm_map_copy_require(copy);
21227 assert(copy->cpy_hdr.nentries == 1);
21228 copy_entry = vm_map_copy_first_entry(copy);
21229 object = VME_OBJECT(copy_entry);
21230
21231 DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
21232
21233 return object;
21234 }
21235
21236 /*
21237 * Routine: convert_port_entry_to_map
21238 * Purpose:
21239 * Convert from a port specifying an entry or a task
21240 * to a map. Doesn't consume the port ref; produces a map ref,
21241 * which may be null. Unlike convert_port_to_map, the
21242 * port may be a task port or a named-entry port.
21243 * Conditions:
21244 * Nothing locked.
21245 */
21246
21247 vm_map_t
21248 convert_port_entry_to_map(
21249 ipc_port_t port)
21250 {
21251 vm_map_t map = VM_MAP_NULL;
21252 vm_named_entry_t named_entry;
21253
21254 if (!IP_VALID(port)) {
21255 return VM_MAP_NULL;
21256 }
21257
21258 if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
21259 return convert_port_to_map(port);
21260 }
21261
21262 named_entry = mach_memory_entry_from_port(port);
21263
21264 if ((named_entry->is_sub_map) &&
21265 (named_entry->protection & VM_PROT_WRITE)) {
21266 map = named_entry->backing.map;
21267 if (map->pmap != PMAP_NULL) {
21268 if (map->pmap == kernel_pmap) {
21269 panic("userspace has access "
21270 "to a kernel map %p", map);
21271 }
21272 pmap_require(map->pmap);
21273 }
21274 vm_map_reference(map);
21275 }
21276
21277 return map;
21278 }
21279
21280 /*
21281 * Export routines to other components for the things we access locally through
21282 * macros.
21283 */
21284 #undef current_map
21285 vm_map_t
21286 current_map(void)
21287 {
21288 return current_map_fast();
21289 }
21290
21291 /*
21292 * vm_map_reference:
21293 *
21294 * Takes a reference on the specified map.
21295 */
21296 void
21297 vm_map_reference(
21298 vm_map_t map)
21299 {
21300 if (__probable(map != VM_MAP_NULL)) {
21301 vm_map_require(map);
21302 os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
21303 }
21304 }
21305
21306 /*
21307 * vm_map_deallocate:
21308 *
21309 * Removes a reference from the specified map,
21310 * destroying it if no references remain.
21311 * The map should not be locked.
21312 */
21313 void
21314 vm_map_deallocate(
21315 vm_map_t map)
21316 {
21317 if (__probable(map != VM_MAP_NULL)) {
21318 vm_map_require(map);
21319 if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
21320 vm_map_destroy(map);
21321 }
21322 }
21323 }
21324
21325 void
21326 vm_map_inspect_deallocate(
21327 vm_map_inspect_t map)
21328 {
21329 vm_map_deallocate((vm_map_t)map);
21330 }
21331
21332 void
21333 vm_map_read_deallocate(
21334 vm_map_read_t map)
21335 {
21336 vm_map_deallocate((vm_map_t)map);
21337 }
21338
21339
21340 void
21341 vm_map_disable_NX(vm_map_t map)
21342 {
21343 if (map == NULL) {
21344 return;
21345 }
21346 if (map->pmap == NULL) {
21347 return;
21348 }
21349
21350 pmap_disable_NX(map->pmap);
21351 }
21352
21353 void
21354 vm_map_disallow_data_exec(vm_map_t map)
21355 {
21356 if (map == NULL) {
21357 return;
21358 }
21359
21360 map->map_disallow_data_exec = TRUE;
21361 }
21362
21363 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
21364 * more descriptive.
21365 */
21366 void
21367 vm_map_set_32bit(vm_map_t map)
21368 {
21369 #if defined(__arm64__)
21370 map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
21371 #else
21372 map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
21373 #endif
21374 }
21375
21376
21377 void
21378 vm_map_set_64bit(vm_map_t map)
21379 {
21380 #if defined(__arm64__)
21381 map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
21382 #else
21383 map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
21384 #endif
21385 }
21386
21387 /*
21388 * Expand the maximum size of an existing map to 64GB.
21389 */
21390 void
21391 vm_map_set_jumbo(vm_map_t map)
21392 {
21393 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
21394 vm_map_set_max_addr(map, ~0, false);
21395 #else /* arm64 */
21396 (void) map;
21397 #endif
21398 }
21399
21400 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
21401 /*
21402 * Expand the maximum size of an existing map to the maximum supported.
21403 */
21404 void
21405 vm_map_set_extra_jumbo(vm_map_t map)
21406 {
21407 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
21408 vm_map_set_max_addr(map, ~0, true);
21409 #else /* arm64 */
21410 (void) map;
21411 #endif
21412 }
21413 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
21414
21415 /*
21416 * This map has a JIT entitlement
21417 */
21418 void
21419 vm_map_set_jit_entitled(vm_map_t map)
21420 {
21421 #if defined (__arm64__)
21422 pmap_set_jit_entitled(map->pmap);
21423 #else /* arm64 */
21424 (void) map;
21425 #endif
21426 }
21427
21428 /*
21429 * Get status of this maps TPRO flag
21430 */
21431 boolean_t
21432 vm_map_tpro(vm_map_t map)
21433 {
21434 #if defined (__arm64e__)
21435 return pmap_get_tpro(map->pmap);
21436 #else /* arm64e */
21437 (void) map;
21438 return FALSE;
21439 #endif
21440 }
21441
21442 /*
21443 * This map has TPRO enabled
21444 */
21445 void
21446 vm_map_set_tpro(vm_map_t map)
21447 {
21448 #if defined (__arm64e__)
21449 pmap_set_tpro(map->pmap);
21450 #else /* arm64e */
21451 (void) map;
21452 #endif
21453 }
21454
21455
21456
21457 /*
21458 * Does this map have TPRO enforcement enabled
21459 */
21460 boolean_t
21461 vm_map_tpro_enforcement(vm_map_t map)
21462 {
21463 return map->tpro_enforcement;
21464 }
21465
21466 /*
21467 * Set TPRO enforcement for this map
21468 */
21469 void
21470 vm_map_set_tpro_enforcement(vm_map_t map)
21471 {
21472 if (vm_map_tpro(map)) {
21473 vm_map_lock(map);
21474 map->tpro_enforcement = TRUE;
21475 vm_map_unlock(map);
21476 }
21477 }
21478
21479 /*
21480 * Enable TPRO on the requested region
21481 *
21482 * Note:
21483 * This routine is primarily intended to be called during/soon after map
21484 * creation before the associated task has been released to run. It is only
21485 * currently safe when we have no resident pages.
21486 */
21487 boolean_t
21488 vm_map_set_tpro_range(
21489 __unused vm_map_t map,
21490 __unused vm_map_address_t start,
21491 __unused vm_map_address_t end)
21492 {
21493 return TRUE;
21494 }
21495
21496 /*
21497 * Expand the maximum size of an existing map.
21498 */
21499 void
21500 vm_map_set_max_addr(
21501 vm_map_t map,
21502 vm_map_offset_t new_max_offset,
21503 __unused bool extra_jumbo)
21504 {
21505 #if defined(__arm64__)
21506 vm_map_offset_t max_supported_offset;
21507 vm_map_offset_t old_max_offset;
21508 unsigned int option = ARM_PMAP_MAX_OFFSET_JUMBO;
21509
21510 vm_map_lock(map);
21511
21512 old_max_offset = map->max_offset;
21513 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
21514 if (extra_jumbo) {
21515 option = ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO;
21516 }
21517 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
21518 max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), option);
21519
21520 new_max_offset = trunc_page(new_max_offset);
21521
21522 /* The address space cannot be shrunk using this routine. */
21523 if (old_max_offset >= new_max_offset) {
21524 vm_map_unlock(map);
21525 return;
21526 }
21527
21528 if (max_supported_offset < new_max_offset) {
21529 new_max_offset = max_supported_offset;
21530 }
21531
21532 map->max_offset = new_max_offset;
21533
21534 /*
21535 * Disable the following chunk of code that extends the "holes" list
21536 * to accommodate a larger VM map.
21537 * In `vm_map_create_options()`, we now set the end of the "holes" list to
21538 * max(map->max_offset, MACH_VM_MAX_ADDRESS) for all platforms.
21539 * MACH_VM_MAX_ADDRESS is the largest virtual address a userspace process
21540 * can map, so any `new_max_offset` value will be <= MACH_VM_MAX_ADDRESS.
21541 * The "holes" list does not need to be adjusted.
21542 */
21543 #if 0
21544 if (map->holelistenabled) {
21545 if (map->holes_list->prev->vme_end == old_max_offset) {
21546 /*
21547 * There is already a hole at the end of the map; simply make it bigger.
21548 */
21549 map->holes_list->prev->vme_end = map->max_offset;
21550 } else {
21551 /*
21552 * There is no hole at the end, so we need to create a new hole
21553 * for the new empty space we're creating.
21554 */
21555 struct vm_map_links *new_hole;
21556
21557 new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
21558 new_hole->start = old_max_offset;
21559 new_hole->end = map->max_offset;
21560 new_hole->prev = map->holes_list->prev;
21561 new_hole->next = (struct vm_map_entry *)map->holes_list;
21562 map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
21563 map->holes_list->prev = (struct vm_map_entry *)new_hole;
21564 }
21565 }
21566 #endif
21567
21568 vm_map_unlock(map);
21569 #else
21570 (void)map;
21571 (void)new_max_offset;
21572 #endif
21573 }
21574
21575 vm_map_offset_t
21576 vm_compute_max_offset(boolean_t is64)
21577 {
21578 #if defined(__arm64__)
21579 return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
21580 #else
21581 return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
21582 #endif
21583 }
21584
21585 void
21586 vm_map_get_max_aslr_slide_section(
21587 vm_map_t map __unused,
21588 int64_t *max_sections,
21589 int64_t *section_size)
21590 {
21591 #if defined(__arm64__)
21592 *max_sections = 3;
21593 *section_size = ARM_TT_TWIG_SIZE;
21594 #else
21595 *max_sections = 1;
21596 *section_size = 0;
21597 #endif
21598 }
21599
21600 uint64_t
21601 vm_map_get_max_aslr_slide_pages(vm_map_t map)
21602 {
21603 #if defined(__arm64__)
21604 /* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
21605 * limited embedded address space; this is also meant to minimize pmap
21606 * memory usage on 16KB page systems.
21607 */
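/*
 * Worked example (informational, assuming the stated page sizes): with
 * 16KB pages (VM_MAP_PAGE_SHIFT == 14) this returns 1 << (24 - 14) == 1024
 * pages, i.e. 1024 * 16KB == 16MB; with 4KB pages (shift 12) it returns
 * 1 << 12 == 4096 pages, i.e. 4096 * 4KB == 16MB as well, so the virtual
 * slide range is the same regardless of page size.
 */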
21608 return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
21609 #else
21610 return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21611 #endif
21612 }
21613
21614 uint64_t
21615 vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
21616 {
21617 #if defined(__arm64__)
21618 /* We limit the loader slide to 4MB, in order to ensure at least 8 bits
21619 * of independent entropy on 16KB page systems.
21620 */
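/*
 * Worked example (informational, assuming the stated page sizes): with
 * 16KB pages this returns 1 << (22 - 14) == 256 pages, i.e. 256 * 16KB ==
 * 4MB of slide, which is the 8 bits of independent entropy mentioned
 * above; with 4KB pages it returns 1 << 10 == 1024 pages, i.e.
 * 1024 * 4KB == 4MB as well.
 */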
21621 return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
21622 #else
21623 return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21624 #endif
21625 }
21626
21627 boolean_t
21628 vm_map_is_64bit(
21629 vm_map_t map)
21630 {
21631 return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
21632 }
21633
21634 boolean_t
21635 vm_map_has_hard_pagezero(
21636 vm_map_t map,
21637 vm_map_offset_t pagezero_size)
21638 {
21639 /*
21640 * XXX FBDP
21641 * We should lock the VM map (for read) here but we can get away
21642 * with it for now because there can't really be any race condition:
21643 * the VM map's min_offset is changed only when the VM map is created
21644 * and when the zero page is established (when the binary gets loaded),
21645 * and this routine gets called only when the task terminates and the
21646 * VM map is being torn down, and when a new map is created via
21647 * load_machfile()/execve().
21648 */
21649 return map->min_offset >= pagezero_size;
21650 }
21651
21652 /*
21653 * Raise a VM map's maximum offset.
21654 */
21655 kern_return_t
21656 vm_map_raise_max_offset(
21657 vm_map_t map,
21658 vm_map_offset_t new_max_offset)
21659 {
21660 kern_return_t ret;
21661
21662 vm_map_lock(map);
21663 ret = KERN_INVALID_ADDRESS;
21664
21665 if (new_max_offset >= map->max_offset) {
21666 if (!vm_map_is_64bit(map)) {
21667 if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
21668 map->max_offset = new_max_offset;
21669 ret = KERN_SUCCESS;
21670 }
21671 } else {
21672 if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
21673 map->max_offset = new_max_offset;
21674 ret = KERN_SUCCESS;
21675 }
21676 }
21677 }
21678
21679 vm_map_unlock(map);
21680 return ret;
21681 }
21682
21683
21684 /*
21685 * Raise a VM map's minimum offset.
21686 * To strictly enforce "page zero" reservation.
21687 */
21688 kern_return_t
21689 vm_map_raise_min_offset(
21690 vm_map_t map,
21691 vm_map_offset_t new_min_offset)
21692 {
21693 vm_map_entry_t first_entry;
21694
21695 new_min_offset = vm_map_round_page(new_min_offset,
21696 VM_MAP_PAGE_MASK(map));
21697
21698 vm_map_lock(map);
21699
21700 if (new_min_offset < map->min_offset) {
21701 /*
21702 * Can't move min_offset backwards, as that would expose
21703 * a part of the address space that was previously, and for
21704 * possibly good reasons, inaccessible.
21705 */
21706 vm_map_unlock(map);
21707 return KERN_INVALID_ADDRESS;
21708 }
21709 if (new_min_offset >= map->max_offset) {
21710 /* can't go beyond the end of the address space */
21711 vm_map_unlock(map);
21712 return KERN_INVALID_ADDRESS;
21713 }
21714
21715 first_entry = vm_map_first_entry(map);
21716 if (first_entry != vm_map_to_entry(map) &&
21717 first_entry->vme_start < new_min_offset) {
21718 /*
21719 * Some memory was already allocated below the new
21720 * minimum offset. It's too late to change it now...
21721 */
21722 vm_map_unlock(map);
21723 return KERN_NO_SPACE;
21724 }
21725
21726 map->min_offset = new_min_offset;
21727
21728 if (map->holelistenabled) {
21729 assert(map->holes_list);
21730 map->holes_list->start = new_min_offset;
21731 assert(new_min_offset < map->holes_list->end);
21732 }
21733
21734 vm_map_unlock(map);
21735
21736 return KERN_SUCCESS;
21737 }
21738
21739 /*
21740 * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
21741 * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit values maintained by the BSD
21742 * side of the kernel. The limits are checked on the Mach VM side, so we keep a copy so we don't
21743 * have to reach over to the BSD data structures.
21744 */
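/*
 * Illustrative usage sketch (hypothetical caller, not necessarily the exact
 * BSD call site): code propagating a new RLIMIT_AS value might do
 *
 *	uint64_t new_limit = ...;	// e.g. from setrlimit(RLIMIT_AS, ...)
 *	if (vm_map_set_size_limit(current_map(), new_limit) != KERN_SUCCESS) {
 *		// rejected: the new limit is below the map's current size
 *	}
 */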
21745
21746 uint64_t vm_map_set_size_limit_count = 0;
21747 kern_return_t
21748 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
21749 {
21750 kern_return_t kr;
21751
21752 vm_map_lock(map);
21753 if (new_size_limit < map->size) {
21754 /* new limit should not be lower than its current size */
21755 DTRACE_VM2(vm_map_set_size_limit_fail,
21756 vm_map_size_t, map->size,
21757 uint64_t, new_size_limit);
21758 kr = KERN_FAILURE;
21759 } else if (new_size_limit == map->size_limit) {
21760 /* no change */
21761 kr = KERN_SUCCESS;
21762 } else {
21763 /* set new limit */
21764 DTRACE_VM2(vm_map_set_size_limit,
21765 vm_map_size_t, map->size,
21766 uint64_t, new_size_limit);
21767 if (new_size_limit != RLIM_INFINITY) {
21768 vm_map_set_size_limit_count++;
21769 }
21770 map->size_limit = new_size_limit;
21771 kr = KERN_SUCCESS;
21772 }
21773 vm_map_unlock(map);
21774 return kr;
21775 }
21776
21777 uint64_t vm_map_set_data_limit_count = 0;
21778 kern_return_t
21779 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
21780 {
21781 kern_return_t kr;
21782
21783 vm_map_lock(map);
21784 if (new_data_limit < map->size) {
21785 /* new limit should not be lower than its current size */
21786 DTRACE_VM2(vm_map_set_data_limit_fail,
21787 vm_map_size_t, map->size,
21788 uint64_t, new_data_limit);
21789 kr = KERN_FAILURE;
21790 } else if (new_data_limit == map->data_limit) {
21791 /* no change */
21792 kr = KERN_SUCCESS;
21793 } else {
21794 /* set new limit */
21795 DTRACE_VM2(vm_map_set_data_limit,
21796 vm_map_size_t, map->size,
21797 uint64_t, new_data_limit);
21798 if (new_data_limit != RLIM_INFINITY) {
21799 vm_map_set_data_limit_count++;
21800 }
21801 map->data_limit = new_data_limit;
21802 kr = KERN_SUCCESS;
21803 }
21804 vm_map_unlock(map);
21805 return kr;
21806 }
21807
21808 void
21809 vm_map_set_user_wire_limit(vm_map_t map,
21810 vm_size_t limit)
21811 {
21812 vm_map_lock(map);
21813 map->user_wire_limit = limit;
21814 vm_map_unlock(map);
21815 }
21816
21817
21818 void
21819 vm_map_switch_protect(vm_map_t map,
21820 boolean_t val)
21821 {
21822 vm_map_lock(map);
21823 map->switch_protect = val;
21824 vm_map_unlock(map);
21825 }
21826
21827 extern int cs_process_enforcement_enable;
21828 boolean_t
21829 vm_map_cs_enforcement(
21830 vm_map_t map)
21831 {
21832 if (cs_process_enforcement_enable) {
21833 return TRUE;
21834 }
21835 return map->cs_enforcement;
21836 }
21837
21838 kern_return_t
21839 vm_map_cs_wx_enable(
21840 __unused vm_map_t map)
21841 {
21842 #if CODE_SIGNING_MONITOR
21843 kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
21844 if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
21845 return KERN_SUCCESS;
21846 }
21847 return ret;
21848 #else
21849 /* The VM manages WX memory entirely on its own */
21850 return KERN_SUCCESS;
21851 #endif
21852 }
21853
21854 kern_return_t
21855 vm_map_csm_allow_jit(
21856 __unused vm_map_t map)
21857 {
21858 #if CODE_SIGNING_MONITOR
21859 return csm_allow_jit_region(vm_map_pmap(map));
21860 #else
21861 /* No code signing monitor to enforce JIT policy */
21862 return KERN_SUCCESS;
21863 #endif
21864 }
21865
21866 void
21867 vm_map_cs_debugged_set(
21868 vm_map_t map,
21869 boolean_t val)
21870 {
21871 vm_map_lock(map);
21872 map->cs_debugged = val;
21873 vm_map_unlock(map);
21874 }
21875
21876 void
21877 vm_map_cs_enforcement_set(
21878 vm_map_t map,
21879 boolean_t val)
21880 {
21881 vm_map_lock(map);
21882 map->cs_enforcement = val;
21883 pmap_set_vm_map_cs_enforced(map->pmap, val);
21884 vm_map_unlock(map);
21885 }
21886
21887 /*
21888 * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
21889 * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
21890 * bump both counters.
21891 */
21892 void
21893 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
21894 {
21895 pmap_t pmap = vm_map_pmap(map);
21896
21897 ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21898 ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21899 }
21900
21901 void
21902 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
21903 {
21904 pmap_t pmap = vm_map_pmap(map);
21905
21906 ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21907 ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21908 }
21909
21910 /* Add (generate) code signature for memory range */
21911 #if CONFIG_DYNAMIC_CODE_SIGNING
21912 kern_return_t
21913 vm_map_sign(vm_map_t map,
21914 vm_map_offset_t start,
21915 vm_map_offset_t end)
21916 {
21917 vm_map_entry_t entry;
21918 vm_map_offset_t entry_start;
21919 vm_object_offset_t entry_offset;
21920 vm_page_t m;
21921 vm_object_t object;
21922
21923 /*
21924 * Vet all the input parameters and current type and state of the
21925 * underlying object. Return with an error if anything is amiss.
21926 */
21927 if (map == VM_MAP_NULL) {
21928 return KERN_INVALID_ARGUMENT;
21929 }
21930
21931 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
21932 return KERN_INVALID_ADDRESS;
21933 }
21934
21935 vm_map_lock_read(map);
21936
21937 if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
21938 /*
21939 * Must pass a valid non-submap address.
21940 */
21941 vm_map_unlock_read(map);
21942 return KERN_INVALID_ADDRESS;
21943 }
21944
21945 if ((entry->vme_start > start) || (entry->vme_end < end)) {
21946 /*
21947 * Map entry doesn't cover the requested range. Not handling
21948 * this situation currently.
21949 */
21950 vm_map_unlock_read(map);
21951 return KERN_INVALID_ARGUMENT;
21952 }
21953
21954 object = VME_OBJECT(entry);
21955 if (object == VM_OBJECT_NULL) {
21956 /*
21957 * Object must already be present or we can't sign.
21958 */
21959 vm_map_unlock_read(map);
21960 return KERN_INVALID_ARGUMENT;
21961 }
21962
21963 vm_object_lock(object);
21964
21965 entry_start = entry->vme_start;
21966 entry_offset = VME_OFFSET(entry);
21967 vm_map_unlock_read(map);
21968 entry = VM_MAP_ENTRY_NULL; /* no longer valid after unlocking map */
21969
21970 while (start < end) {
21971 uint32_t refmod;
21972
21973 m = vm_page_lookup(object,
21974 start - entry_start + entry_offset);
21975 if (m == VM_PAGE_NULL) {
21976 /* should we try to fault a page here? We can probably
21977 * demand it exists and is locked for this request */
21978 vm_object_unlock(object);
21979 return KERN_FAILURE;
21980 }
21981 /* deal with special page status */
21982 if (m->vmp_busy ||
21983 (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart ||
21984 vm_page_is_private(m) || m->vmp_absent))) {
21985 vm_object_unlock(object);
21986 return KERN_FAILURE;
21987 }
21988
21989 /* Page is OK... now "validate" it */
21990 /* This is the place where we'll call out to create a code
21991 * directory, later */
21992 /* XXX TODO4K: deal with 4k subpages individually? */
21993 m->vmp_cs_validated = VMP_CS_ALL_TRUE;
21994
21995 /* The page is now "clean" for codesigning purposes. That means
21996 * we don't consider it as modified (wpmapped) anymore. But
21997 * we'll disconnect the page so we note any future modification
21998 * attempts. */
21999 m->vmp_wpmapped = FALSE;
22000 refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
22001
22002 /* Pull the dirty status from the pmap, since we cleared the
22003 * wpmapped bit */
22004 if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
22005 SET_PAGE_DIRTY(m, FALSE);
22006 }
22007
22008 /* On to the next page */
22009 start += PAGE_SIZE;
22010 }
22011 vm_object_unlock(object);
22012
22013 return KERN_SUCCESS;
22014 }
22015 #endif
22016
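/*
 * vm_map_partial_reap:
 * Walk the map and delete every entry whose backing VM object is internal
 * and referenced only by this mapping, accumulating into the two
 * out-parameters the number of resident pages and compressor pages that
 * become reclaimable. The deleted ranges are gathered on a zap list and
 * disposed of after the map lock is dropped.
 */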
22017 kern_return_t
22018 vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
22019 {
22020 vm_map_entry_t entry = VM_MAP_ENTRY_NULL;
22021 vm_map_entry_t next_entry;
22022 kern_return_t kr = KERN_SUCCESS;
22023 VM_MAP_ZAP_DECLARE(zap_list);
22024
22025 vm_map_lock(map);
22026
22027 for (entry = vm_map_first_entry(map);
22028 entry != vm_map_to_entry(map);
22029 entry = next_entry) {
22030 next_entry = entry->vme_next;
22031
22032 if (!entry->is_sub_map &&
22033 VME_OBJECT(entry) &&
22034 (VME_OBJECT(entry)->internal == TRUE) &&
22035 (os_ref_get_count_raw(&VME_OBJECT(entry)->ref_count) == 1)) {
22036 *reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
22037 *reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);
22038
22039 (void)vm_map_delete(map, entry->vme_start,
22040 entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
22041 KMEM_GUARD_NONE, &zap_list);
22042 }
22043 }
22044
22045 vm_map_unlock(map);
22046
22047 vm_map_zap_dispose(&zap_list);
22048
22049 return kr;
22050 }
22051
22052
22053 #if DEVELOPMENT || DEBUG
22054
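/*
 * vm_map_disconnect_page_mappings:
 * Remove all pmap mappings for the map's entries (skipping entries with no
 * object or with a physically contiguous object), optionally unnesting any
 * nested sub-map pmaps first so that only this task's pmap is affected.
 * Returns the number of pages that were resident according to the phys_mem
 * ledger, i.e. roughly how many pages would have to be faulted back in.
 */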
22055 int
22056 vm_map_disconnect_page_mappings(
22057 vm_map_t map,
22058 boolean_t do_unnest)
22059 {
22060 vm_map_entry_t entry;
22061 ledger_amount_t byte_count = 0;
22062
22063 if (do_unnest == TRUE) {
22064 #ifndef NO_NESTED_PMAP
22065 vm_map_lock(map);
22066
22067 for (entry = vm_map_first_entry(map);
22068 entry != vm_map_to_entry(map);
22069 entry = entry->vme_next) {
22070 if (entry->is_sub_map && entry->use_pmap) {
22071 /*
22072 * Make sure the range between the start of this entry and
22073 * the end of this entry is no longer nested, so that
22074 * we will only remove mappings from the pmap in use by
22075 * this task.
22076 */
22077 vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
22078 }
22079 }
22080 vm_map_unlock(map);
22081 #endif
22082 }
22083 vm_map_lock_read(map);
22084
22085 ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);
22086
22087 for (entry = vm_map_first_entry(map);
22088 entry != vm_map_to_entry(map);
22089 entry = entry->vme_next) {
22090 if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
22091 (VME_OBJECT(entry)->phys_contiguous))) {
22092 continue;
22093 }
22094 if (entry->is_sub_map) {
22095 assert(!entry->use_pmap);
22096 }
22097
22098 pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
22099 }
22100 vm_map_unlock_read(map);
22101
22102 return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
22103 }
22104
22105 kern_return_t
22106 vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
22107 {
22108 vm_object_t object = NULL;
22109 vm_object_offset_t offset;
22110 vm_prot_t prot;
22111 boolean_t wired;
22112 vm_map_version_t version;
22113 vm_map_t real_map;
22114 int result = KERN_FAILURE;
22115
22116 vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
22117 vm_map_lock(map);
22118
22119 result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
22120 OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
22121 NULL, &real_map, NULL);
22122 if (object == NULL) {
22123 result = KERN_MEMORY_ERROR;
22124 } else if (object->pager) {
22125 result = vm_compressor_pager_inject_error(object->pager,
22126 offset);
22127 } else {
22128 result = KERN_MEMORY_PRESENT;
22129 }
22130
22131 if (object != NULL) {
22132 vm_object_unlock(object);
22133 }
22134
22135 if (real_map != map) {
22136 vm_map_unlock(real_map);
22137 }
22138 vm_map_unlock(map);
22139
22140 return result;
22141 }
22142
22143 /* Iterate over the map's entries. The first block argument is called once with the number of entries; the second block is called for every entry.
22144 * returns: KERN_SUCCESS if the iteration completed ok,
22145 * the callback's error code if a callback returned an error,
22146 * KERN_FAILURE if entries were added/removed during the iteration, so that the number of entries
22147 * iterated differs from the count reported in the first call
22148 */
22149 static kern_return_t
22150 vm_map_entries_foreach_locked(vm_map_t map, kern_return_t (^count_handler)(int nentries),
22151 kern_return_t (^entry_handler)(void* entry))
22152 {
22153 vm_map_lock_assert_held(map);
22154 int nentries = map->hdr.nentries;
22155 kern_return_t error = count_handler(nentries);
22156 if (error) {
22157 return error;
22158 }
22159
22160 /* iterate until we loop back to the map, see get_vmmap_entries() */
22161 vm_map_entry_t entry = vm_map_first_entry(map);
22162 int count = 0;
22163 while (entry != vm_map_to_entry(map)) {
22164 error = entry_handler(entry);
22165 if (error != KERN_SUCCESS) {
22166 return error;
22167 }
22168 entry = entry->vme_next;
22169 ++count;
22170 if (count > nentries) {
22171 /* nentries and entries iteration don't agree on how many entries there are, shouldn't really happen */
22172 return KERN_FAILURE;
22173 }
22174 }
22175 if (count < nentries) {
22176 return KERN_FAILURE;
22177 }
22178 return KERN_SUCCESS;
22179 }
22180
22181 kern_return_t
22182 vm_map_entries_foreach(vm_map_t map, kern_return_t (^count_handler)(int nentries),
22183 kern_return_t (^entry_handler)(void* entry))
22184 {
22185 vm_map_lock_read(map);
22186 kern_return_t error = vm_map_entries_foreach_locked(map, count_handler, entry_handler);
22187 vm_map_unlock_read(map);
22188 return error;
22189 }
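/*
 * Illustrative usage sketch (hypothetical caller) for the block-based
 * iterator above: count the sub-map entries of a map.
 *
 *	__block int nsubmaps = 0;
 *	kern_return_t kr = vm_map_entries_foreach(map,
 *	    ^(int nentries) {
 *		(void)nentries;		// could pre-size an output buffer here
 *		return KERN_SUCCESS;
 *	    },
 *	    ^(void *ventry) {
 *		vm_map_entry_t entry = (vm_map_entry_t)ventry;
 *		if (entry->is_sub_map) {
 *			nsubmaps++;
 *		}
 *		return KERN_SUCCESS;
 *	    });
 */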
22190
22191 /*
22192 * Dump info about the entry into the given buffer.
22193 * returns KERN_SUCCESS on success, or an error (e.g. KERN_NO_SPACE) if there was not enough space in the given buffer
22194 * argument size in: bytes free in the given buffer, out: bytes written
22195 */
22196 kern_return_t
22197 vm_map_dump_entry_and_compressor_pager(void* pentry, char *buf, size_t *size)
22198 {
22199 size_t insize = *size;
22200 kern_return_t kr;
22201 size_t offset = 0;
22202
22203 *size = 0;
22204 if (sizeof(struct vm_map_entry_info) > insize) {
22205 return KERN_NO_SPACE;
22206 }
22207
22208 vm_map_entry_t entry = (vm_map_entry_t)pentry;
22209 struct vm_map_entry_info *out_entry = (struct vm_map_entry_info*)buf;
22210 out_entry->vmei_start = entry->vme_start;
22211 out_entry->vmei_end = entry->vme_end;
22212 out_entry->vmei_alias = VME_ALIAS(entry);
22213 out_entry->vmei_offset = VME_OFFSET(entry);
22214 out_entry->vmei_is_sub_map = entry->is_sub_map;
22215 out_entry->vmei_protection = entry->protection;
22216 offset += sizeof(struct vm_map_entry_info);
22217
22218 out_entry->vmei_slot_mapping_count = 0;
22219 out_entry->vmei_is_compressor_pager = false;
22220 *size = offset;
22221 if (out_entry->vmei_is_sub_map) {
22222 return KERN_SUCCESS; // TODO: sub_map interrogation not supported yet
22223 }
22224 /* have a vm_object? */
22225 vm_object_t object = VME_OBJECT(entry);
22226 if (object == VM_OBJECT_NULL || !object->internal) {
22227 return KERN_SUCCESS;
22228 }
22229 /* does the object have a pager? */
22230 memory_object_t pager = object->pager;
22231 if (pager == MEMORY_OBJECT_NULL) {
22232 return KERN_SUCCESS;
22233 }
22234 bool is_compressor = false;
22235 unsigned int slot_mapping_count = 0;
22236 size_t pager_info_size = insize - offset;
22237 kr = vm_compressor_pager_dump(pager, buf + offset, &pager_info_size, &is_compressor, &slot_mapping_count);
22238 if (kr != KERN_SUCCESS) {
22239 /* didn't have enough space for everything we want to write, caller needs to retry */
22240 return kr;
22241 }
22242 offset += pager_info_size;
22243 /* if we got here, is_compressor should be true due to the object->internal check above, so this assignment
22244 * is just for sanity's sake */
22245 out_entry->vmei_is_compressor_pager = is_compressor;
22246 out_entry->vmei_slot_mapping_count = slot_mapping_count;
22247 *size = offset;
22248 return KERN_SUCCESS;
22249 }
22250
22251
22252 #endif
22253
22254
22255 #if CONFIG_FREEZE
22256
22257
22258 extern struct freezer_context freezer_context_global;
22259 AbsoluteTime c_freezer_last_yield_ts = 0;
22260
22261 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
22262 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
22263
22264 kern_return_t
22265 vm_map_freeze(
22266 task_t task,
22267 unsigned int *purgeable_count,
22268 unsigned int *wired_count,
22269 unsigned int *clean_count,
22270 unsigned int *dirty_count,
22271 unsigned int dirty_budget,
22272 unsigned int *shared_count,
22273 int *freezer_error_code,
22274 boolean_t eval_only)
22275 {
22276 vm_map_entry_t entry2 = VM_MAP_ENTRY_NULL;
22277 kern_return_t kr = KERN_SUCCESS;
22278 boolean_t evaluation_phase = TRUE;
22279 vm_object_t cur_shared_object = NULL;
22280 int cur_shared_obj_ref_cnt = 0;
22281 unsigned int dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;
22282
22283 *purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;
22284
22285 /*
22286 * We need the exclusive lock here so that we can
22287 * block any page faults or lookups while we are
22288 * in the middle of freezing this vm map.
22289 */
22290 vm_map_t map = task->map;
22291
22292 vm_map_lock(map);
22293
22294 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
22295
22296 if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
22297 if (vm_compressor_low_on_space()) {
22298 *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
22299 }
22300
22301 if (vm_swap_low_on_space()) {
22302 *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
22303 }
22304
22305 kr = KERN_NO_SPACE;
22306 goto done;
22307 }
22308
22309 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
22310 /*
22311 * In-memory compressor backing the freezer. No disk.
22312 * So no need to do the evaluation phase.
22313 */
22314 evaluation_phase = FALSE;
22315
22316 if (eval_only == TRUE) {
22317 /*
22318 * We don't support 'eval_only' mode
22319 * in this non-swap config.
22320 */
22321 *freezer_error_code = FREEZER_ERROR_GENERIC;
22322 kr = KERN_INVALID_ARGUMENT;
22323 goto done;
22324 }
22325
22326 freezer_context_global.freezer_ctx_uncompressed_pages = 0;
22327 clock_get_uptime(&c_freezer_last_yield_ts);
22328 }
22329 again:
22330
22331 for (entry2 = vm_map_first_entry(map);
22332 entry2 != vm_map_to_entry(map);
22333 entry2 = entry2->vme_next) {
22334 vm_object_t src_object;
22335
22336 if (entry2->is_sub_map) {
22337 continue;
22338 }
22339
22340 src_object = VME_OBJECT(entry2);
22341 if (!src_object ||
22342 src_object->phys_contiguous ||
22343 !src_object->internal) {
22344 continue;
22345 }
22346
22347 /* If eligible, scan the entry, moving eligible pages over to our parent object */
22348
22349 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
22350 /*
22351 * We skip purgeable objects during evaluation phase only.
22352 * If we decide to freeze this process, we'll explicitly
22353 * purge these objects before we go around again with
22354 * 'evaluation_phase' set to FALSE.
22355 */
22356
22357 if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
22358 /*
22359 * We want to purge objects that may not belong to this task but are mapped
22360 * in this task alone. Since we already purged this task's purgeable memory
22361 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
22362 * on this task's purgeable objects. Hence the check for only volatile objects.
22363 */
22364 if (evaluation_phase ||
22365 src_object->purgable != VM_PURGABLE_VOLATILE ||
22366 os_ref_get_count_raw(&src_object->ref_count) != 1) {
22367 continue;
22368 }
22369 vm_object_lock(src_object);
22370 if (src_object->purgable == VM_PURGABLE_VOLATILE &&
22371 os_ref_get_count_raw(&src_object->ref_count) == 1) {
22372 purgeable_q_t old_queue;
22373
22374 /* object should be on a purgeable queue */
22375 assert(src_object->objq.next != NULL &&
22376 src_object->objq.prev != NULL);
22377 /* move object from its volatile queue to the nonvolatile queue */
22378 old_queue = vm_purgeable_object_remove(src_object);
22379 assert(old_queue);
22380 if (src_object->purgeable_when_ripe) {
22381 /* remove a token from that volatile queue */
22382 vm_page_lock_queues();
22383 vm_purgeable_token_delete_first(old_queue);
22384 vm_page_unlock_queues();
22385 }
22386 /* purge the object */
22387 vm_object_purge(src_object, 0);
22388 }
22389 vm_object_unlock(src_object);
22390 continue;
22391 }
22392
22393 /*
22394 * Pages belonging to this object could be swapped to disk.
22395 * Make sure it's not a shared object because we could end
22396 * up just bringing it back in again.
22397 *
22398 * We try to optimize somewhat by checking for objects that are mapped
22399 * more than once within our own map. But we don't do full searches,
22400 * we just look at the entries following our current entry.
22401 */
22402
22403 if (os_ref_get_count_raw(&src_object->ref_count) > 1) {
22404 if (src_object != cur_shared_object) {
22405 obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
22406 dirty_shared_count += obj_pages_snapshot;
22407
22408 cur_shared_object = src_object;
22409 cur_shared_obj_ref_cnt = 1;
22410 continue;
22411 } else {
22412 cur_shared_obj_ref_cnt++;
22413 if (os_ref_get_count_raw(&src_object->ref_count) == cur_shared_obj_ref_cnt) {
22414 /*
22415 * Fall through to below and treat this object as private.
22416 * So deduct its pages from our shared total and add it to the
22417 * private total.
22418 */
22419
22420 dirty_shared_count -= obj_pages_snapshot;
22421 dirty_private_count += obj_pages_snapshot;
22422 } else {
22423 continue;
22424 }
22425 }
22426 }
22427
22428
22429 if (os_ref_get_count_raw(&src_object->ref_count) == 1) {
22430 dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
22431 }
22432
22433 if (evaluation_phase == TRUE) {
22434 continue;
22435 }
22436 }
22437
22438 uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
22439 *wired_count += src_object->wired_page_count;
22440
22441 if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
22442 if (vm_compressor_low_on_space()) {
22443 *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
22444 }
22445
22446 if (vm_swap_low_on_space()) {
22447 *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
22448 }
22449
22450 kr = KERN_NO_SPACE;
22451 break;
22452 }
22453 if (paged_out_count >= dirty_budget) {
22454 break;
22455 }
22456 dirty_budget -= paged_out_count;
22457 }
22458
22459 *shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
22460 if (evaluation_phase) {
22461 unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;
22462
22463 if (dirty_shared_count > shared_pages_threshold) {
22464 *freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
22465 kr = KERN_FAILURE;
22466 goto done;
22467 }
22468
22469 if (dirty_shared_count &&
22470 ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
22471 *freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
22472 kr = KERN_FAILURE;
22473 goto done;
22474 }
22475
22476 evaluation_phase = FALSE;
22477 dirty_shared_count = dirty_private_count = 0;
22478
22479 freezer_context_global.freezer_ctx_uncompressed_pages = 0;
22480 clock_get_uptime(&c_freezer_last_yield_ts);
22481
22482 if (eval_only) {
22483 kr = KERN_SUCCESS;
22484 goto done;
22485 }
22486
22487 vm_purgeable_purge_task_owned(task);
22488
22489 goto again;
22490 } else {
22491 kr = KERN_SUCCESS;
22492 }
22493
22494 done:
22495 vm_map_unlock(map);
22496
22497 if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
22498 vm_object_compressed_freezer_done();
22499 }
22500 return kr;
22501 }
22502
22503 #endif
22504
22505 /*
22506 * vm_map_entry_should_cow_for_true_share:
22507 *
22508 * Determines if the map entry should be clipped and setup for copy-on-write
22509 * to avoid applying "true_share" to a large VM object when only a subset is
22510 * targeted.
22511 *
22512 * For now, we target only the map entries created for the Objective C
22513 * Garbage Collector, which initially have the following properties:
22514 * - alias == VM_MEMORY_MALLOC
22515 * - wired_count == 0
22516 * - !needs_copy
22517 * and a VM object with:
22518 * - internal
22519 * - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
22520 * - !true_share
22521 * - vo_size == ANON_CHUNK_SIZE
22522 *
22523 * Only non-kernel map entries.
22524 */
22525 boolean_t
22526 vm_map_entry_should_cow_for_true_share(
22527 vm_map_entry_t entry)
22528 {
22529 vm_object_t object;
22530
22531 if (entry->is_sub_map) {
22532 /* entry does not point at a VM object */
22533 return FALSE;
22534 }
22535
22536 if (entry->needs_copy) {
22537 /* already set for copy_on_write: done! */
22538 return FALSE;
22539 }
22540
22541 if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
22542 VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
22543 /* not a malloc heap or Obj-C Garbage Collector heap */
22544 return FALSE;
22545 }
22546
22547 if (entry->wired_count) {
22548 /* wired: can't change the map entry... */
22549 vm_counters.should_cow_but_wired++;
22550 return FALSE;
22551 }
22552
22553 object = VME_OBJECT(entry);
22554
22555 if (object == VM_OBJECT_NULL) {
22556 /* no object yet... */
22557 return FALSE;
22558 }
22559
22560 if (!object->internal) {
22561 /* not an internal object */
22562 return FALSE;
22563 }
22564
22565 if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
22566 /* not the default copy strategy */
22567 return FALSE;
22568 }
22569
22570 if (object->true_share) {
22571 /* already true_share: too late to avoid it */
22572 return FALSE;
22573 }
22574
22575 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
22576 object->vo_size != ANON_CHUNK_SIZE) {
22577 /* ... not an object created for the ObjC Garbage Collector */
22578 return FALSE;
22579 }
22580
22581 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
22582 object->vo_size != 2048 * 4096) {
22583 /* ... not a "MALLOC_SMALL" heap */
22584 return FALSE;
22585 }
22586
22587 /*
22588 * All the criteria match: we have a large object being targeted for "true_share".
22589 * To limit the adverse side-effects linked with "true_share", tell the caller to
22590 * try and avoid setting up the entire object for "true_share" by clipping the
22591 * targeted range and setting it up for copy-on-write.
22592 */
22593 return TRUE;
22594 }
22595
22596 uint64_t vm_map_range_overflows_count = 0;
22597 TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE);
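/*
 * vm_map_range_overflows:
 * Returns true if [addr, addr + size) cannot be expressed as a valid,
 * page-rounded range in this map: either addr + size overflows, or the
 * rounded range collapses (end <= start). A size of 0 never overflows.
 *
 * Worked example (values chosen for illustration, 4KB page mask 0xFFF):
 * addr = 0xFFFFFFFFFFFFF000, size = 0x2000 -> addr + size wraps past 2^64,
 * so the function returns true.
 */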
22598 bool
22599 vm_map_range_overflows(
22600 vm_map_t map,
22601 vm_map_offset_t addr,
22602 vm_map_size_t size)
22603 {
22604 vm_map_offset_t start, end, sum;
22605 vm_map_offset_t pgmask;
22606
22607 if (size == 0) {
22608 /* empty range -> no overflow */
22609 return false;
22610 }
22611 pgmask = vm_map_page_mask(map);
22612 start = vm_map_trunc_page_mask(addr, pgmask);
22613 end = vm_map_round_page_mask(addr + size, pgmask);
22614 if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) {
22615 vm_map_range_overflows_count++;
22616 if (vm_map_range_overflows_log) {
22617 printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n",
22618 proc_selfpid(),
22619 proc_best_name(current_proc()),
22620 (uint64_t)addr,
22621 (uint64_t)size,
22622 (uint64_t)pgmask);
22623 }
22624 DTRACE_VM4(vm_map_range_overflows,
22625 vm_map_t, map,
22626 uint32_t, pgmask,
22627 uint64_t, (uint64_t)addr,
22628 uint64_t, (uint64_t)size);
22629 return true;
22630 }
22631 return false;
22632 }
22633
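/*
 * Worked example for the page rounding helpers below (16KB page mask 0x3FFF,
 * values chosen for illustration):
 *	vm_map_round_page_mask(0x1234, 0x3FFF) == 0x4000
 *	vm_map_trunc_page_mask(0x1234, 0x3FFF) == 0x0
 *	vm_map_page_aligned(0x4000, 0x3FFF) == TRUE
 */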
22634 vm_map_offset_t
22635 vm_map_round_page_mask(
22636 vm_map_offset_t offset,
22637 vm_map_offset_t mask)
22638 {
22639 return VM_MAP_ROUND_PAGE(offset, mask);
22640 }
22641
22642 vm_map_offset_t
22643 vm_map_trunc_page_mask(
22644 vm_map_offset_t offset,
22645 vm_map_offset_t mask)
22646 {
22647 return VM_MAP_TRUNC_PAGE(offset, mask);
22648 }
22649
22650 boolean_t
22651 vm_map_page_aligned(
22652 vm_map_offset_t offset,
22653 vm_map_offset_t mask)
22654 {
22655 return ((offset) & mask) == 0;
22656 }
22657
22658 int
22659 vm_map_page_shift(
22660 vm_map_t map)
22661 {
22662 return VM_MAP_PAGE_SHIFT(map);
22663 }
22664
22665 int
22666 vm_map_page_size(
22667 vm_map_t map)
22668 {
22669 return VM_MAP_PAGE_SIZE(map);
22670 }
22671
22672 vm_map_offset_t
22673 vm_map_page_mask(
22674 vm_map_t map)
22675 {
22676 return VM_MAP_PAGE_MASK(map);
22677 }
22678
22679 kern_return_t
22680 vm_map_set_page_shift(
22681 vm_map_t map,
22682 int pageshift)
22683 {
22684 if (map->hdr.nentries != 0) {
22685 /* too late to change page size */
22686 return KERN_FAILURE;
22687 }
22688
22689 map->hdr.page_shift = (uint16_t)pageshift;
22690
22691 return KERN_SUCCESS;
22692 }
22693
22694 kern_return_t
22695 vm_map_query_volatile(
22696 vm_map_t map,
22697 mach_vm_size_t *volatile_virtual_size_p,
22698 mach_vm_size_t *volatile_resident_size_p,
22699 mach_vm_size_t *volatile_compressed_size_p,
22700 mach_vm_size_t *volatile_pmap_size_p,
22701 mach_vm_size_t *volatile_compressed_pmap_size_p)
22702 {
22703 mach_vm_size_t volatile_virtual_size;
22704 mach_vm_size_t volatile_resident_count;
22705 mach_vm_size_t volatile_compressed_count;
22706 mach_vm_size_t volatile_pmap_count;
22707 mach_vm_size_t volatile_compressed_pmap_count;
22708 mach_vm_size_t resident_count;
22709 vm_map_entry_t entry;
22710 vm_object_t object;
22711
22712 /* map should be locked by caller */
22713
22714 volatile_virtual_size = 0;
22715 volatile_resident_count = 0;
22716 volatile_compressed_count = 0;
22717 volatile_pmap_count = 0;
22718 volatile_compressed_pmap_count = 0;
22719
22720 for (entry = vm_map_first_entry(map);
22721 entry != vm_map_to_entry(map);
22722 entry = entry->vme_next) {
22723 mach_vm_size_t pmap_resident_bytes, pmap_compressed_bytes;
22724
22725 if (entry->is_sub_map) {
22726 continue;
22727 }
22728 if (!(entry->protection & VM_PROT_WRITE)) {
22729 continue;
22730 }
22731 object = VME_OBJECT(entry);
22732 if (object == VM_OBJECT_NULL) {
22733 continue;
22734 }
22735 if (object->purgable != VM_PURGABLE_VOLATILE &&
22736 object->purgable != VM_PURGABLE_EMPTY) {
22737 continue;
22738 }
22739 if (VME_OFFSET(entry)) {
22740 /*
22741 * If the map entry has been split and the object now
22742 * appears several times in the VM map, we don't want
22743 * to count the object's resident_page_count more than
22744 * once. We count it only for the first one, starting
22745 * at offset 0 and ignore the other VM map entries.
22746 */
22747 continue;
22748 }
22749 resident_count = object->resident_page_count;
22750 if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
22751 resident_count = 0;
22752 } else {
22753 resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
22754 }
22755
22756 volatile_virtual_size += entry->vme_end - entry->vme_start;
22757 volatile_resident_count += resident_count;
22758 if (object->pager) {
22759 volatile_compressed_count +=
22760 vm_compressor_pager_get_count(object->pager);
22761 }
22762 pmap_compressed_bytes = 0;
22763 pmap_resident_bytes =
22764 pmap_query_resident(map->pmap,
22765 entry->vme_start,
22766 entry->vme_end,
22767 &pmap_compressed_bytes);
22768 volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
22769 volatile_compressed_pmap_count += (pmap_compressed_bytes
22770 / PAGE_SIZE);
22771 }
22772
22773 /* map is still locked on return */
22774
22775 *volatile_virtual_size_p = volatile_virtual_size;
22776 *volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
22777 *volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
22778 *volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
22779 *volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;
22780
22781 return KERN_SUCCESS;
22782 }
22783
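/*
 * vm_map_sizes:
 * Report the total virtual size of the map (*psize), the total unallocated
 * space between entries (*pfree), and the largest single gap
 * (*plargest_free). Gaps are measured from the end of one entry to the
 * start of the next, including the space before the first entry and after
 * the last one. A NULL map reports zeroes.
 */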
22784 void
22785 vm_map_sizes(vm_map_t map,
22786 vm_map_size_t * psize,
22787 vm_map_size_t * pfree,
22788 vm_map_size_t * plargest_free)
22789 {
22790 vm_map_entry_t entry;
22791 vm_map_offset_t prev;
22792 vm_map_size_t free, total_free, largest_free;
22793 boolean_t end;
22794
22795 if (!map) {
22796 *psize = *pfree = *plargest_free = 0;
22797 return;
22798 }
22799 total_free = largest_free = 0;
22800
22801 vm_map_lock_read(map);
22802 if (psize) {
22803 *psize = map->max_offset - map->min_offset;
22804 }
22805
22806 prev = map->min_offset;
22807 for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
22808 end = (entry == vm_map_to_entry(map));
22809
22810 if (end) {
22811 free = entry->vme_end - prev;
22812 } else {
22813 free = entry->vme_start - prev;
22814 }
22815
22816 total_free += free;
22817 if (free > largest_free) {
22818 largest_free = free;
22819 }
22820
22821 if (end) {
22822 break;
22823 }
22824 prev = entry->vme_end;
22825 }
22826 vm_map_unlock_read(map);
22827 if (pfree) {
22828 *pfree = total_free;
22829 }
22830 if (plargest_free) {
22831 *plargest_free = largest_free;
22832 }
22833 }
22834
22835 #if VM_SCAN_FOR_SHADOW_CHAIN
22836 int
22837 vm_map_shadow_max(
22838 vm_map_t map)
22839 {
22840 int shadows, shadows_max;
22841 vm_map_entry_t entry;
22842 vm_object_t object, next_object;
22843
22844 if (map == NULL) {
22845 return 0;
22846 }
22847
22848 shadows_max = 0;
22849
22850 vm_map_lock_read(map);
22851
22852 for (entry = vm_map_first_entry(map);
22853 entry != vm_map_to_entry(map);
22854 entry = entry->vme_next) {
22855 if (entry->is_sub_map) {
22856 continue;
22857 }
22858 object = VME_OBJECT(entry);
22859 if (object == NULL) {
22860 continue;
22861 }
22862 vm_object_lock_shared(object);
22863 for (shadows = 0;
22864 object->shadow != NULL;
22865 shadows++, object = next_object) {
22866 next_object = object->shadow;
22867 vm_object_lock_shared(next_object);
22868 vm_object_unlock(object);
22869 }
22870 vm_object_unlock(object);
22871 if (shadows > shadows_max) {
22872 shadows_max = shadows;
22873 }
22874 }
22875
22876 vm_map_unlock_read(map);
22877
22878 return shadows_max;
22879 }
22880 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
22881
22882 void
22883 vm_commit_pagezero_status(vm_map_t lmap)
22884 {
22885 pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
22886 }
22887
22888 #if __x86_64__
22889 void
22890 vm_map_set_high_start(
22891 vm_map_t map,
22892 vm_map_offset_t high_start)
22893 {
22894 map->vmmap_high_start = high_start;
22895 }
22896 #endif /* __x86_64__ */
22897
22898 #if CODE_SIGNING_MONITOR
22899
22900 kern_return_t
22901 vm_map_entry_cs_associate(
22902 vm_map_t map,
22903 vm_map_entry_t entry,
22904 vm_map_kernel_flags_t vmk_flags)
22905 {
22906 vm_object_t cs_object, cs_shadow, backing_object;
22907 vm_object_offset_t cs_offset, backing_offset;
22908 void *cs_blobs;
22909 struct vnode *cs_vnode;
22910 kern_return_t cs_ret;
22911
22912 if (map->pmap == NULL ||
22913 entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
22914 (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
22915 VME_OBJECT(entry) == VM_OBJECT_NULL) {
22916 return KERN_SUCCESS;
22917 }
22918
22919 if (!(entry->protection & VM_PROT_EXECUTE)) {
22920 /*
22921 * This memory region is not executable, so the code-signing
22922 * monitor would usually not care about it...
22923 */
22924 if (vmk_flags.vmkf_remap_prot_copy &&
22925 (entry->max_protection & VM_PROT_EXECUTE)) {
22926 /*
22927 * ... except if the memory region is being remapped
22928 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
22929 * which is what a debugger or dtrace would be doing
22930 * to prepare to modify an executable page to insert
22931 * a breakpoint or activate a probe.
22932 * In that case, fall through so that we can mark
22933 * this region as being "debugged" and no longer
22934 * strictly code-signed.
22935 */
22936 } else {
22937 /*
22938 * Really not executable, so no need to tell the
22939 * code-signing monitor.
22940 */
22941 return KERN_SUCCESS;
22942 }
22943 }
22944
22945 vm_map_lock_assert_exclusive(map);
22946
22947 /*
22948 * Check for a debug association mapping before we check for used_for_jit. This
22949 * allows non-RWX JIT on macOS systems to masquerade their mappings as USER_DEBUG
22950 * pages instead of USER_JIT. These non-RWX JIT pages cannot be marked as USER_JIT
22951 * since they are mapped with RW or RX permissions, which the page table monitor
22952 * denies on USER_JIT pages. Given that, if they're not mapped as USER_DEBUG,
22953 * they will be mapped as USER_EXEC, and that will cause another page table monitor
22954 * violation when those USER_EXEC pages are mapped as RW.
22955 *
22956 * Since these pages switch between RW and RX through mprotect, they mimic what
22957 * we expect a debugger to do. As the code signing monitor does not enforce mappings
22958 * on macOS systems, this works in our favor here and allows us to continue to
22959 * support these legacy-programmed applications without sacrificing security on
22960 * the page table or the code signing monitor. We don't need to explicitly check
22961 * for entry_for_jit here and the mapping permissions. If the initial mapping is
22962 * created with RX, then the application must map it as RW in order to first write
22963 * to the page (MAP_JIT mappings must be private and anonymous). The switch to
22964 * RX will cause vm_map_protect to mark the entry as vmkf_remap_prot_copy.
22965 * Similarly, if the mapping was created as RW, and then switched to RX,
22966 * vm_map_protect will again mark the entry as a copy, and both these cases
22967 * lead to this if-statement being entered.
22968 *
22969 * For more information: rdar://115313336.
22970 */
22971 if (vmk_flags.vmkf_remap_prot_copy) {
22972 cs_ret = csm_associate_debug_region(
22973 map->pmap,
22974 entry->vme_start,
22975 entry->vme_end - entry->vme_start);
22976
22977 /*
22978 * csm_associate_debug_region returns not supported when the code signing
22979 * monitor is disabled. This is intentional, since cs_ret is checked towards
22980 * the end of the function, and if it is not supported, then we still want the
22981 * VM to perform code-signing enforcement on this entry. That said, if we don't
22982 * mark this as a xnu_user_debug page when the code-signing monitor is disabled,
22983 * then it never gets retyped to XNU_USER_DEBUG frame type, which then causes
22984 * an issue with debugging (since it'll be mapped in as XNU_USER_EXEC in some
22985 * cases, which will cause a violation when attempted to be mapped as writable).
22986 */
22987 if ((cs_ret == KERN_SUCCESS) || (cs_ret == KERN_NOT_SUPPORTED)) {
22988 entry->vme_xnu_user_debug = TRUE;
22989 }
22990 #if DEVELOPMENT || DEBUG
22991 if (vm_log_xnu_user_debug) {
22992 printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug=%d cs_ret %d\n",
22993 proc_selfpid(),
22994 (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
22995 __FUNCTION__, __LINE__,
22996 map, entry,
22997 (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
22998 entry->vme_xnu_user_debug,
22999 cs_ret);
23000 }
23001 #endif /* DEVELOPMENT || DEBUG */
23002 goto done;
23003 }
23004
23005 if (entry->used_for_jit) {
23006 cs_ret = csm_associate_jit_region(
23007 map->pmap,
23008 entry->vme_start,
23009 entry->vme_end - entry->vme_start);
23010 goto done;
23011 }
23012
23013 cs_object = VME_OBJECT(entry);
23014 vm_object_lock_shared(cs_object);
23015 cs_offset = VME_OFFSET(entry);
23016
23017 /* find the VM object backed by the code-signed vnode */
23018 for (;;) {
23019 /* go to the bottom of cs_object's shadow chain */
23020 for (;
23021 cs_object->shadow != VM_OBJECT_NULL;
23022 cs_object = cs_shadow) {
23023 cs_shadow = cs_object->shadow;
23024 cs_offset += cs_object->vo_shadow_offset;
23025 vm_object_lock_shared(cs_shadow);
23026 vm_object_unlock(cs_object);
23027 }
23028 if (cs_object->internal ||
23029 cs_object->pager == MEMORY_OBJECT_NULL) {
23030 vm_object_unlock(cs_object);
23031 return KERN_SUCCESS;
23032 }
23033
23034 cs_offset += cs_object->paging_offset;
23035
23036 /*
23037 * cs_object could be backed by a:
23038 * vnode_pager
23039 * apple_protect_pager
23040 * shared_region_pager
23041 * fourk_pager (multiple backing objects -> fail?)
23042 * ask the pager if it has a backing VM object
23043 */
23044 if (!memory_object_backing_object(cs_object->pager,
23045 cs_offset,
23046 &backing_object,
23047 &backing_offset)) {
23048 /* no backing object: cs_object is it */
23049 break;
23050 }
23051
23052 /* look down the backing object's shadow chain */
23053 vm_object_lock_shared(backing_object);
23054 vm_object_unlock(cs_object);
23055 cs_object = backing_object;
23056 cs_offset = backing_offset;
23057 }
23058
23059 cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
23060 if (cs_vnode == NULL) {
23061 /* no vnode, no code signatures to associate */
23062 cs_ret = KERN_SUCCESS;
23063 } else {
23064 cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
23065 &cs_blobs);
23066 assert(cs_ret == KERN_SUCCESS);
23067 cs_ret = cs_associate_blob_with_mapping(map->pmap,
23068 entry->vme_start,
23069 (entry->vme_end - entry->vme_start),
23070 cs_offset,
23071 cs_blobs);
23072 }
23073 vm_object_unlock(cs_object);
23074 cs_object = VM_OBJECT_NULL;
23075
23076 done:
23077 if (cs_ret == KERN_SUCCESS) {
23078 DTRACE_VM2(vm_map_entry_cs_associate_success,
23079 vm_map_offset_t, entry->vme_start,
23080 vm_map_offset_t, entry->vme_end);
23081 if (vm_map_executable_immutable) {
23082 /*
23083 * Prevent this executable
23084 * mapping from being unmapped
23085 * or modified.
23086 */
23087 entry->vme_permanent = TRUE;
23088 }
23089 /*
23090 * pmap says it will validate the
23091 * code-signing validity of pages
23092 * faulted in via this mapping, so
23093 * this map entry should be marked so
23094 * that vm_fault() bypasses code-signing
23095 * validation for faults coming through
23096 * this mapping.
23097 */
23098 entry->csm_associated = TRUE;
23099 } else if (cs_ret == KERN_NOT_SUPPORTED) {
23100 /*
23101 * pmap won't check the code-signing
23102 * validity of pages faulted in via
23103 * this mapping, so VM should keep
23104 * doing it.
23105 */
23106 DTRACE_VM3(vm_map_entry_cs_associate_off,
23107 vm_map_offset_t, entry->vme_start,
23108 vm_map_offset_t, entry->vme_end,
23109 int, cs_ret);
23110 } else {
23111 /*
23112 * A real error: do not allow
23113 * execution in this mapping.
23114 */
23115 DTRACE_VM3(vm_map_entry_cs_associate_failure,
23116 vm_map_offset_t, entry->vme_start,
23117 vm_map_offset_t, entry->vme_end,
23118 int, cs_ret);
23119 if (vmk_flags.vmkf_overwrite_immutable) {
23120 /*
23121 * We can get here when we remap an apple_protect pager
23122 * on top of an already cs_associated executable mapping
23123 * with the same code signatures, so we don't want to
23124 * lose VM_PROT_EXECUTE in that case...
23125 */
23126 } else {
23127 entry->protection &= ~VM_PROT_ALLEXEC;
23128 entry->max_protection &= ~VM_PROT_ALLEXEC;
23129 }
23130 }
23131
23132 return cs_ret;
23133 }
23134
23135 #endif /* CODE_SIGNING_MONITOR */
23136
23137 inline bool
23138 vm_map_is_corpse_source(vm_map_t map)
23139 {
23140 bool status = false;
23141 if (map) {
23142 vm_map_lock_read(map);
23143 status = map->corpse_source;
23144 vm_map_unlock_read(map);
23145 }
23146 return status;
23147 }
23148
23149 inline void
23150 vm_map_set_corpse_source(vm_map_t map)
23151 {
23152 if (map) {
23153 vm_map_lock(map);
23154 map->corpse_source = true;
23155 vm_map_unlock(map);
23156 }
23157 }
23158
23159 inline void
23160 vm_map_unset_corpse_source(vm_map_t map)
23161 {
23162 if (map) {
23163 vm_map_lock(map);
23164 map->corpse_source = false;
23165 vm_map_unlock(map);
23166 }
23167 }
23168 /*
23169 * FORKED CORPSE FOOTPRINT
23170 *
23171 * A forked corpse gets a copy of the original VM map but its pmap is mostly
23172 * empty since it never ran and never got to fault in any pages.
23173 * Collecting footprint info (via "sysctl vm.self_region_footprint") for
23174 * a forked corpse would therefore return very little information.
23175 *
23176 * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
23177 * to vm_map_fork() to collect footprint information from the original VM map
23178 * and its pmap, and store it in the forked corpse's VM map. That information
23179 * is stored in place of the VM map's "hole list" since we'll never need to
23180 * look for holes in the corpse's map.
23181 *
23182 * The corpse's footprint info looks like this:
23183 *
23184 * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
23185 * as follows:
23186 * +---------------------------------------+
23187 * header-> | cf_size |
23188 * +-------------------+-------------------+
23189 * | cf_last_region | cf_last_zeroes |
23190 * +-------------------+-------------------+
23191 * region1-> | cfr_vaddr |
23192 * +-------------------+-------------------+
23193 * | cfr_num_pages | d0 | d1 | d2 | d3 |
23194 * +---------------------------------------+
23195 * | d4 | d5 | ... |
23196 * +---------------------------------------+
23197 * | ... |
23198 * +-------------------+-------------------+
23199 * | dy | dz | na | na | cfr_vaddr... | <-region2
23200 * +-------------------+-------------------+
23201 * | cfr_vaddr (ctd) | cfr_num_pages |
23202 * +---------------------------------------+
23203 * | d0 | d1 ... |
23204 * +---------------------------------------+
23205 * ...
23206 * +---------------------------------------+
23207 * last region-> | cfr_vaddr |
23208 * +---------------------------------------+
23209 * + cfr_num_pages | d0 | d1 | d2 | d3 |
23210 * +---------------------------------------+
23211 * ...
23212 * +---------------------------------------+
23213 * | dx | dy | dz | na | na | na | na | na |
23214 * +---------------------------------------+
23215 *
23216 * where:
23217 * cf_size: total size of the buffer (rounded to page size)
23218 * cf_last_region: offset in the buffer of the last "region" sub-header
23219 * cf_last_zeroes: number of trailing "zero" dispositions at the end
23220 * of last region
23221 * cfr_vaddr: virtual address of the start of the covered "region"
23222 * cfr_num_pages: number of pages in the covered "region"
23223 * d*: disposition of the page at that virtual address
23224 * Regions in the buffer are word-aligned.
23225 *
23226 * We estimate the size of the buffer based on the number of memory regions
23227 * and the virtual size of the address space. While copying each memory region
23228 * during vm_map_fork(), we also collect the footprint info for that region
23229 * and store it in the buffer, packing it as much as possible (coalescing
23230 * contiguous memory regions to avoid having too many region headers and
23231 * avoiding long streaks of "zero" page dispositions by splitting footprint
23232 * "regions"), so the number of regions in the footprint buffer might not match
23233 * the number of memory regions in the address space.
23234 *
23235 * We also have to copy the original task's "nonvolatile" ledgers since that's
23236 * part of the footprint and will need to be reported to any tool asking for
23237 * the footprint information of the forked corpse.
23238 */
23239
23240 uint64_t vm_map_corpse_footprint_count = 0;
23241 uint64_t vm_map_corpse_footprint_size_avg = 0;
23242 uint64_t vm_map_corpse_footprint_size_max = 0;
23243 uint64_t vm_map_corpse_footprint_full = 0;
23244 uint64_t vm_map_corpse_footprint_no_buf = 0;
23245
23246 struct vm_map_corpse_footprint_header {
23247 vm_size_t cf_size; /* allocated buffer size */
23248 uint32_t cf_last_region; /* offset of last region in buffer */
23249 union {
23250 uint32_t cfu_last_zeroes; /* during creation:
23251 * number of "zero" dispositions at
23252 * end of last region */
23253 uint32_t cfu_hint_region; /* during lookup:
23254 * offset of last looked up region */
23255 #define cf_last_zeroes cfu.cfu_last_zeroes
23256 #define cf_hint_region cfu.cfu_hint_region
23257 } cfu;
23258 };
23259 typedef uint8_t cf_disp_t;
23260 struct vm_map_corpse_footprint_region {
23261 vm_map_offset_t cfr_vaddr; /* region start virtual address */
23262 uint32_t cfr_num_pages; /* number of pages in this "region" */
23263 cf_disp_t cfr_disposition[0]; /* disposition of each page */
23264 } __attribute__((packed));
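/*
 * Illustrative sketch (assumed lookup logic, mirroring the layout described
 * above): given a region "cfr" and a virtual address "va" that falls inside
 * it, the stored disposition for that page would be recovered as
 *
 *	uint32_t idx = (uint32_t)((va - cfr->cfr_vaddr) / effective_page_size);
 *	if (idx < cfr->cfr_num_pages) {
 *		int disposition =
 *		    vm_page_cf_disp_to_disposition(cfr->cfr_disposition[idx]);
 *	}
 */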
23265
23266 static cf_disp_t
23267 vm_page_disposition_to_cf_disp(
23268 int disposition)
23269 {
23270 assert(sizeof(cf_disp_t) == 1);
23271 /* relocate bits that don't fit in a "uint8_t" */
23272 if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
23273 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
23274 }
23275 /* cast gets rid of extra bits */
23276 return (cf_disp_t) disposition;
23277 }
23278
23279 static int
23280 vm_page_cf_disp_to_disposition(
23281 cf_disp_t cf_disp)
23282 {
23283 int disposition;
23284
23285 assert(sizeof(cf_disp_t) == 1);
23286 disposition = (int) cf_disp;
23287 /* move relocated bits back in place */
23288 if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
23289 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
23290 disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
23291 }
23292 return disposition;
23293 }
23294
23295 static kmem_guard_t
23296 vm_map_corpse_footprint_guard(vm_map_t map)
23297 {
23298 return (kmem_guard_t){
23299 .kmg_atomic = true,
23300 .kmg_tag = VM_KERN_MEMORY_DIAG,
23301 .kmg_context = os_hash_kernel_pointer(&map->vmmap_corpse_footprint),
23302 };
23303 }
23304
23305 /*
23306 * vm_map_corpse_footprint_new_region:
23307 * closes the current footprint "region" and creates a new one
23308 *
23309 * Returns NULL if there's not enough space in the buffer for a new region.
23310 */
23311 static struct vm_map_corpse_footprint_region *
23312 vm_map_corpse_footprint_new_region(
23313 struct vm_map_corpse_footprint_header *footprint_header)
23314 {
23315 uintptr_t footprint_edge;
23316 uint32_t new_region_offset;
23317 struct vm_map_corpse_footprint_region *footprint_region;
23318 struct vm_map_corpse_footprint_region *new_footprint_region;
23319
23320 footprint_edge = ((uintptr_t)footprint_header +
23321 footprint_header->cf_size);
23322 footprint_region = ((struct vm_map_corpse_footprint_region *)
23323 ((char *)footprint_header +
23324 footprint_header->cf_last_region));
23325 assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
23326 footprint_edge);
23327
23328 /* get rid of trailing zeroes in the last region */
23329 assert(footprint_region->cfr_num_pages >=
23330 footprint_header->cf_last_zeroes);
23331 footprint_region->cfr_num_pages -=
23332 footprint_header->cf_last_zeroes;
23333 footprint_header->cf_last_zeroes = 0;
23334
23335 /* reuse this region if it's now empty */
23336 if (footprint_region->cfr_num_pages == 0) {
23337 return footprint_region;
23338 }
23339
23340 /* compute offset of new region */
23341 new_region_offset = footprint_header->cf_last_region;
23342 new_region_offset += sizeof(*footprint_region);
23343 new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
23344 new_region_offset = roundup(new_region_offset, sizeof(int));
23345
23346 /* check if we're going over the edge */
23347 if (((uintptr_t)footprint_header +
23348 new_region_offset +
23349 sizeof(*footprint_region)) >=
23350 footprint_edge) {
23351 /* over the edge: no new region */
23352 return NULL;
23353 }
23354
23355 /* adjust offset of last region in header */
23356 footprint_header->cf_last_region = new_region_offset;
23357
23358 new_footprint_region = (struct vm_map_corpse_footprint_region *)
23359 ((char *)footprint_header +
23360 footprint_header->cf_last_region);
23361 new_footprint_region->cfr_vaddr = 0;
23362 new_footprint_region->cfr_num_pages = 0;
23363 /* caller needs to initialize new region */
23364
23365 return new_footprint_region;
23366 }
23367
23368 /*
23369 * vm_map_corpse_footprint_collect:
23370 * collect footprint information for "old_entry" in "old_map" and
23371 * stores it in "new_map"'s vmmap_footprint_info.
23372 */
23373 kern_return_t
23374 vm_map_corpse_footprint_collect(
23375 vm_map_t old_map,
23376 vm_map_entry_t old_entry,
23377 vm_map_t new_map)
23378 {
23379 vm_map_offset_t va;
23380 kmem_return_t kmr;
23381 struct vm_map_corpse_footprint_header *footprint_header;
23382 struct vm_map_corpse_footprint_region *footprint_region;
23383 struct vm_map_corpse_footprint_region *new_footprint_region;
23384 cf_disp_t *next_disp_p;
23385 uintptr_t footprint_edge;
23386 uint32_t num_pages_tmp;
23387 int effective_page_size;
23388
23389 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
23390
23391 va = old_entry->vme_start;
23392
23393 vm_map_lock_assert_exclusive(old_map);
23394 vm_map_lock_assert_exclusive(new_map);
23395
23396 assert(new_map->has_corpse_footprint);
23397 assert(!old_map->has_corpse_footprint);
23398 if (!new_map->has_corpse_footprint ||
23399 old_map->has_corpse_footprint) {
23400 /*
23401 * This can only transfer footprint info from a
23402 * map with a live pmap to a map with a corpse footprint.
23403 */
23404 return KERN_NOT_SUPPORTED;
23405 }
23406
23407 if (new_map->vmmap_corpse_footprint == NULL) {
23408 vm_size_t buf_size;
23409
23410 buf_size = (sizeof(*footprint_header) +
23411 (old_map->hdr.nentries
23412 *
23413 (sizeof(*footprint_region) +
23414 +3)) /* potential alignment for each region */
23415 +
23416 ((old_map->size / effective_page_size)
23417 *
23418 sizeof(cf_disp_t))); /* disposition for each page */
23419 // printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
23420 buf_size = round_page(buf_size);
23421
23422 /* limit buffer to 1 page to validate overflow detection */
23423 // buf_size = PAGE_SIZE;
23424
23425 /* limit size to a somewhat sane amount */
23426 #if XNU_TARGET_OS_OSX
23427 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (8*1024*1024) /* 8MB */
23428 #else /* XNU_TARGET_OS_OSX */
23429 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (256*1024) /* 256KB */
23430 #endif /* XNU_TARGET_OS_OSX */
23431 if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
23432 buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
23433 }
23434 kmem_guard_t guard = vm_map_corpse_footprint_guard(new_map);
23435 kmr = kmem_alloc_guard(kernel_map, buf_size + PAGE_SIZE, 0,
23436 KMA_DATA | KMA_GUARD_LAST | KMA_KOBJECT | KMA_ZERO,
23437 guard);
23438 if (kmr.kmr_return != KERN_SUCCESS) {
23439 vm_map_corpse_footprint_no_buf++;
23440 return kmr.kmr_return;
23441 }
23442
23443 /* initialize header and 1st region */
23444 footprint_header = (struct vm_map_corpse_footprint_header *)kmr.kmr_ptr;
23445 assert3p(footprint_header, !=, NULL);
23446 new_map->vmmap_corpse_footprint = footprint_header;
23447
23448 footprint_header->cf_size = buf_size;
23449 footprint_header->cf_last_region =
23450 sizeof(*footprint_header);
23451 footprint_header->cf_last_zeroes = 0;
23452
23453 footprint_region = (struct vm_map_corpse_footprint_region *)
23454 ((char *)footprint_header +
23455 footprint_header->cf_last_region);
23456 footprint_region->cfr_vaddr = 0;
23457 footprint_region->cfr_num_pages = 0;
23458 } else {
23459 /* retrieve header and last region */
23460 footprint_header = (struct vm_map_corpse_footprint_header *)
23461 new_map->vmmap_corpse_footprint;
23462 footprint_region = (struct vm_map_corpse_footprint_region *)
23463 ((char *)footprint_header +
23464 footprint_header->cf_last_region);
23465 }
23466 footprint_edge = ((uintptr_t)footprint_header +
23467 footprint_header->cf_size);
23468
23469 if ((footprint_region->cfr_vaddr +
23470 (((vm_map_offset_t)footprint_region->cfr_num_pages) *
23471 effective_page_size))
23472 != old_entry->vme_start) {
23473 uint64_t num_pages_delta, num_pages_delta_size;
23474 uint32_t region_offset_delta_size;
23475
23476 /*
23477 * Not the next contiguous virtual address:
23478 * start a new region or store "zero" dispositions for
23479 * the missing pages?
23480 */
23481 /* size of gap in actual page dispositions */
23482 num_pages_delta = ((old_entry->vme_start -
23483 footprint_region->cfr_vaddr) / effective_page_size)
23484 - footprint_region->cfr_num_pages;
23485 num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
23486 /* size of gap as a new footprint region header */
23487 region_offset_delta_size =
23488 (sizeof(*footprint_region) +
23489 roundup(((footprint_region->cfr_num_pages -
23490 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
23491 sizeof(int)) -
23492 ((footprint_region->cfr_num_pages -
23493 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
23494 // printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
23495 if (region_offset_delta_size < num_pages_delta_size ||
23496 os_add3_overflow(footprint_region->cfr_num_pages,
23497 (uint32_t) num_pages_delta,
23498 1,
23499 &num_pages_tmp)) {
23500 /*
23501 * Storing data for this gap would take more space
23502 * than inserting a new footprint region header:
23503 * let's start a new region and save space. If it's a
23504 * tie, let's avoid using a new region, since that
23505 * would require more region hops to find the right
23506 * range during lookups.
23507 *
23508 * If the current region's cfr_num_pages would overflow
23509 * if we added "zero" page dispositions for the gap,
23510 * no choice but to start a new region.
23511 */
23512 // printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
23513 new_footprint_region =
23514 vm_map_corpse_footprint_new_region(footprint_header);
23515 /* check that we're not going over the edge */
23516 if (new_footprint_region == NULL) {
23517 goto over_the_edge;
23518 }
23519 footprint_region = new_footprint_region;
23520 /* initialize new region as empty */
23521 footprint_region->cfr_vaddr = old_entry->vme_start;
23522 footprint_region->cfr_num_pages = 0;
23523 } else {
23524 /*
23525 * Store "zero" page dispositions for the missing
23526 * pages.
23527 */
23528 // printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
23529 for (; num_pages_delta > 0; num_pages_delta--) {
23530 next_disp_p = (cf_disp_t *)
23531 ((uintptr_t) footprint_region +
23532 sizeof(*footprint_region));
23533 next_disp_p += footprint_region->cfr_num_pages;
23534 /* check that we're not going over the edge */
23535 if ((uintptr_t)next_disp_p >= footprint_edge) {
23536 goto over_the_edge;
23537 }
23538 /* store "zero" disposition for this gap page */
23539 footprint_region->cfr_num_pages++;
23540 *next_disp_p = (cf_disp_t) 0;
23541 footprint_header->cf_last_zeroes++;
23542 }
23543 }
23544 }
23545
23546 for (va = old_entry->vme_start;
23547 va < old_entry->vme_end;
23548 va += effective_page_size) {
23549 int disposition;
23550 cf_disp_t cf_disp;
23551
23552 vm_map_footprint_query_page_info(old_map,
23553 old_entry,
23554 va,
23555 &disposition);
23556 cf_disp = vm_page_disposition_to_cf_disp(disposition);
23557
23558 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
23559
23560 if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
23561 /*
23562 * Ignore "zero" dispositions at start of
23563 * region: just move start of region.
23564 */
23565 footprint_region->cfr_vaddr += effective_page_size;
23566 continue;
23567 }
23568
23569 /* would region's cfr_num_pages overflow? */
23570 if (os_add_overflow(footprint_region->cfr_num_pages, 1,
23571 &num_pages_tmp)) {
23572 /* overflow: create a new region */
23573 new_footprint_region =
23574 vm_map_corpse_footprint_new_region(
23575 footprint_header);
23576 if (new_footprint_region == NULL) {
23577 goto over_the_edge;
23578 }
23579 footprint_region = new_footprint_region;
23580 footprint_region->cfr_vaddr = va;
23581 footprint_region->cfr_num_pages = 0;
23582 }
23583
23584 next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
23585 sizeof(*footprint_region));
23586 next_disp_p += footprint_region->cfr_num_pages;
23587 /* check that we're not going over the edge */
23588 if ((uintptr_t)next_disp_p >= footprint_edge) {
23589 goto over_the_edge;
23590 }
23591 /* store this disposition */
23592 *next_disp_p = cf_disp;
23593 footprint_region->cfr_num_pages++;
23594
23595 if (cf_disp != 0) {
23596 /* non-zero disp: break the current zero streak */
23597 footprint_header->cf_last_zeroes = 0;
23598 /* done */
23599 continue;
23600 }
23601
23602 /* zero disp: add to the current streak of zeroes */
23603 footprint_header->cf_last_zeroes++;
23604 if ((footprint_header->cf_last_zeroes +
23605 roundup(((footprint_region->cfr_num_pages -
23606 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
23607 (sizeof(int) - 1),
23608 sizeof(int))) <
23609 (sizeof(*footprint_header))) {
23610 /*
23611 * There are not enough trailing "zero" dispositions
23612 * (+ the extra padding we would need for the previous
23613 * region); creating a new region would not save space
23614 * at this point, so let's keep this "zero" disposition
23615 * in this region and reconsider later.
23616 */
23617 continue;
23618 }
23619 /*
23620 * Create a new region to avoid having too many consecutive
23621 * "zero" dispositions.
23622 */
23623 new_footprint_region =
23624 vm_map_corpse_footprint_new_region(footprint_header);
23625 if (new_footprint_region == NULL) {
23626 goto over_the_edge;
23627 }
23628 footprint_region = new_footprint_region;
23629 /* initialize the new region as empty ... */
23630 footprint_region->cfr_num_pages = 0;
23631 /* ... and skip this "zero" disp */
23632 footprint_region->cfr_vaddr = va + effective_page_size;
23633 }
23634
23635 return KERN_SUCCESS;
23636
23637 over_the_edge:
23638 // printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
23639 vm_map_corpse_footprint_full++;
23640 return KERN_RESOURCE_SHORTAGE;
23641 }
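/*
 * Usage sketch (illustrative only, not part of the original source):
 * a corpse-forking caller such as vm_map_fork() is expected to call
 * vm_map_corpse_footprint_collect() once per entry of the live map and
 * then seal the buffer with vm_map_corpse_footprint_collect_done().
 * This hypothetical helper assumes both maps are locked exclusively, as
 * the assertions above require, and that the usual vm_map prototypes
 * are visible.
 */
static __unused void
vm_map_corpse_footprint_collect_all_sketch(
	vm_map_t old_map,
	vm_map_t new_map)
{
	vm_map_entry_t entry;

	for (entry = vm_map_first_entry(old_map);
	    entry != vm_map_to_entry(old_map);
	    entry = entry->vme_next) {
		if (vm_map_corpse_footprint_collect(old_map, entry,
		    new_map) != KERN_SUCCESS) {
			/* buffer full: keep whatever was collected so far */
			break;
		}
	}
	vm_map_corpse_footprint_collect_done(new_map);
}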
23642
23643 /*
23644 * vm_map_corpse_footprint_collect_done:
23645 * completes the footprint collection by getting rid of any remaining
23646 * trailing "zero" dispositions and trimming the unused part of the
23647 * kernel buffer
23648 */
23649 void
23650 vm_map_corpse_footprint_collect_done(
23651 vm_map_t new_map)
23652 {
23653 struct vm_map_corpse_footprint_header *footprint_header;
23654 struct vm_map_corpse_footprint_region *footprint_region;
23655 vm_size_t buf_size, actual_size;
23656
23657 assert(new_map->has_corpse_footprint);
23658 if (!new_map->has_corpse_footprint ||
23659 new_map->vmmap_corpse_footprint == NULL) {
23660 return;
23661 }
23662
23663 footprint_header = (struct vm_map_corpse_footprint_header *)
23664 new_map->vmmap_corpse_footprint;
23665 buf_size = footprint_header->cf_size;
23666
23667 footprint_region = (struct vm_map_corpse_footprint_region *)
23668 ((char *)footprint_header +
23669 footprint_header->cf_last_region);
23670
23671 /* get rid of trailing zeroes in last region */
23672 assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
23673 footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
23674 footprint_header->cf_last_zeroes = 0;
23675
23676 actual_size = (vm_size_t)(footprint_header->cf_last_region +
23677 sizeof(*footprint_region) +
23678 (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));
23679
23680 // printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
23681 vm_map_corpse_footprint_size_avg =
23682 (((vm_map_corpse_footprint_size_avg *
23683 vm_map_corpse_footprint_count) +
23684 actual_size) /
23685 (vm_map_corpse_footprint_count + 1));
23686 vm_map_corpse_footprint_count++;
23687 if (actual_size > vm_map_corpse_footprint_size_max) {
23688 vm_map_corpse_footprint_size_max = actual_size;
23689 }
23690
23691 actual_size = round_page(actual_size);
23692 assert3u(buf_size, >=, actual_size);
23693 if (buf_size > actual_size) {
23694 /*
23695 * Free unused space at the end of the buffer
23696 */
23697 kmem_guard_t guard = vm_map_corpse_footprint_guard(new_map);
23698 kmem_return_t kmr = kmem_realloc_guard(kernel_map,
23699 (vm_offset_t)footprint_header,
23700 /* Account for guard page */
23701 buf_size + PAGE_SIZE,
23702 actual_size + PAGE_SIZE,
23703 KMR_DATA | KMR_GUARD_LAST | KMR_FREEOLD | KMR_KOBJECT,
23704 guard);
23705 assertf(kmr.kmr_return == KERN_SUCCESS,
23706 "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
23707 footprint_header,
23708 (uint64_t) buf_size,
23709 (uint64_t) actual_size,
23710 kmr.kmr_return);
23711 footprint_header = (struct vm_map_corpse_footprint_header *)kmr.kmr_ptr;
23712 assert3p(footprint_header, !=, NULL);
23713 new_map->vmmap_corpse_footprint = footprint_header;
23714 footprint_region = NULL;
23715 }
23716
23717 footprint_header->cf_size = actual_size;
23718 }
23719
23720 /*
23721 * vm_map_corpse_footprint_query_page_info:
23722 * retrieves the disposition of the page at virtual address "va"
23723 * in the forked corpse's VM map
23724 *
23725 * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
23726 */
23727 kern_return_t
23728 vm_map_corpse_footprint_query_page_info(
23729 vm_map_t map,
23730 vm_map_offset_t va,
23731 int *disposition_p)
23732 {
23733 struct vm_map_corpse_footprint_header *footprint_header;
23734 struct vm_map_corpse_footprint_region *footprint_region;
23735 uint32_t footprint_region_offset;
23736 vm_map_offset_t region_start, region_end;
23737 int disp_idx;
23738 kern_return_t kr;
23739 int effective_page_size;
23740 cf_disp_t cf_disp;
23741
23742 if (!map->has_corpse_footprint) {
23743 *disposition_p = 0;
23744 kr = KERN_INVALID_ARGUMENT;
23745 goto done;
23746 }
23747
23748 footprint_header = map->vmmap_corpse_footprint;
23749 if (footprint_header == NULL) {
23750 *disposition_p = 0;
23751 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23752 kr = KERN_INVALID_ARGUMENT;
23753 goto done;
23754 }
23755
23756 /* start looking at the hint ("cf_hint_region") */
23757 footprint_region_offset = footprint_header->cf_hint_region;
23758
23759 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
23760
23761 lookup_again:
23762 if (footprint_region_offset < sizeof(*footprint_header)) {
23763 /* hint too low: start from 1st region */
23764 footprint_region_offset = sizeof(*footprint_header);
23765 }
23766 if (footprint_region_offset > footprint_header->cf_last_region) {
23767 /* hint too high: re-start from 1st region */
23768 footprint_region_offset = sizeof(*footprint_header);
23769 }
23770 footprint_region = (struct vm_map_corpse_footprint_region *)
23771 ((char *)footprint_header + footprint_region_offset);
23772 region_start = footprint_region->cfr_vaddr;
23773 region_end = (region_start +
23774 ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23775 effective_page_size));
23776 if (va < region_start &&
23777 footprint_region_offset != sizeof(*footprint_header)) {
23778 /* our range starts before the hint region */
23779
23780 /* reset the hint (in a racy way...) */
23781 footprint_header->cf_hint_region = sizeof(*footprint_header);
23782 /* lookup "va" again from 1st region */
23783 footprint_region_offset = sizeof(*footprint_header);
23784 goto lookup_again;
23785 }
23786
23787 while (va >= region_end) {
23788 if (footprint_region_offset >= footprint_header->cf_last_region) {
23789 break;
23790 }
23791 /* skip the region's header */
23792 footprint_region_offset += sizeof(*footprint_region);
23793 /* skip the region's page dispositions */
23794 footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
23795 /* align to next word boundary */
23796 footprint_region_offset =
23797 roundup(footprint_region_offset,
23798 sizeof(int));
23799 footprint_region = (struct vm_map_corpse_footprint_region *)
23800 ((char *)footprint_header + footprint_region_offset);
23801 region_start = footprint_region->cfr_vaddr;
23802 region_end = (region_start +
23803 ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23804 effective_page_size));
23805 }
23806 if (va < region_start || va >= region_end) {
23807 /* page not found */
23808 *disposition_p = 0;
23809 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23810 kr = KERN_SUCCESS;
23811 goto done;
23812 }
23813
23814 /* "va" found: set the lookup hint for next lookup (in a racy way...) */
23815 footprint_header->cf_hint_region = footprint_region_offset;
23816
23817 /* get page disposition for "va" in this region */
23818 disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
23819 cf_disp = footprint_region->cfr_disposition[disp_idx];
23820 *disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
23821 kr = KERN_SUCCESS;
23822 done:
23823 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23824 /* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
23825 DTRACE_VM4(footprint_query_page_info,
23826 vm_map_t, map,
23827 vm_map_offset_t, va,
23828 int, *disposition_p,
23829 kern_return_t, kr);
23830
23831 return kr;
23832 }
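/*
 * Usage sketch (illustrative only): looking up the recorded disposition
 * of one page in a corpse map.  A zero disposition is returned when no
 * footprint information is available for "va".
 */
static __unused int
vm_map_corpse_footprint_disposition_sketch(
	vm_map_t corpse_map,
	vm_map_offset_t va)
{
	int disposition = 0;

	if (vm_map_corpse_footprint_query_page_info(corpse_map, va,
	    &disposition) != KERN_SUCCESS) {
		return 0;
	}
	return disposition;
}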
23833
23834 void
23835 vm_map_corpse_footprint_destroy(
23836 vm_map_t map)
23837 {
23838 if (map->has_corpse_footprint &&
23839 map->vmmap_corpse_footprint != NULL) {
23840 struct vm_map_corpse_footprint_header *footprint_header;
23841 vm_size_t buf_size;
23842
23843 footprint_header = map->vmmap_corpse_footprint;
23844 buf_size = footprint_header->cf_size;
23845 kmem_guard_t guard = vm_map_corpse_footprint_guard(map);
23846 kmem_free_guard(kernel_map, (vm_offset_t)footprint_header,
23847 buf_size + PAGE_SIZE,
23848 KMF_GUARD_LAST, guard);
23849 map->vmmap_corpse_footprint = NULL;
23850 map->has_corpse_footprint = FALSE;
23851 }
23852 }
23853
23854 /*
23855 * vm_map_copy_footprint_ledgers:
23856 * copies any ledger that's relevant to the memory footprint of "old_task"
23857 * into the forked corpse's task ("new_task")
23858 */
23859 void
23860 vm_map_copy_footprint_ledgers(
23861 task_t old_task,
23862 task_t new_task)
23863 {
23864 vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
23865 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
23866 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
23867 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
23868 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
23869 vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
23870 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
23871 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
23872 vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
23873 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
23874 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
23875 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
23876 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
23877 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
23878 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
23879 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
23880 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
23881 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
23882 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
23883 vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
23884 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_nofootprint_total);
23885 }
23886
23887 /*
23888 * vm_map_copy_ledger:
23889 * copy a single ledger from "old_task" to "new_task"
23890 */
23891 void
23892 vm_map_copy_ledger(
23893 task_t old_task,
23894 task_t new_task,
23895 int ledger_entry)
23896 {
23897 ledger_amount_t old_balance, new_balance, delta;
23898
23899 assert(new_task->map->has_corpse_footprint);
23900 if (!new_task->map->has_corpse_footprint) {
23901 return;
23902 }
23903
23904 /* turn off sanity checks for the ledger we're about to mess with */
23905 ledger_disable_panic_on_negative(new_task->ledger,
23906 ledger_entry);
23907
23908 /* adjust "new_task" to match "old_task" */
23909 ledger_get_balance(old_task->ledger,
23910 ledger_entry,
23911 &old_balance);
23912 ledger_get_balance(new_task->ledger,
23913 ledger_entry,
23914 &new_balance);
23915 if (new_balance == old_balance) {
23916 /* new == old: done */
23917 } else if (new_balance > old_balance) {
23918 /* new > old ==> new -= new - old */
23919 delta = new_balance - old_balance;
23920 ledger_debit(new_task->ledger,
23921 ledger_entry,
23922 delta);
23923 } else {
23924 /* new < old ==> new += old - new */
23925 delta = old_balance - new_balance;
23926 ledger_credit(new_task->ledger,
23927 ledger_entry,
23928 delta);
23929 }
23930 }
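/*
 * Worked example (illustrative only): if "old_task" reports a
 * phys_footprint balance of 300 units and the freshly created corpse
 * task reports 20, the code above credits the corpse's ledger by 280 so
 * that both tasks report the same balance; had the corpse been higher
 * instead, it would have been debited by the difference.
 */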
23931
23932 /*
23933 * vm_map_get_pmap:
23934 * returns the pmap associated with the vm_map
23935 */
23936 pmap_t
23937 vm_map_get_pmap(vm_map_t map)
23938 {
23939 return vm_map_pmap(map);
23940 }
23941
23942 ppnum_t
23943 vm_map_get_phys_page(
23944 vm_map_t map,
23945 vm_offset_t addr)
23946 {
23947 vm_object_offset_t offset;
23948 vm_object_t object;
23949 vm_map_offset_t map_offset;
23950 vm_map_entry_t entry;
23951 ppnum_t phys_page = 0;
23952
23953 map_offset = vm_map_trunc_page(addr, PAGE_MASK);
23954
23955 vm_map_lock(map);
23956 while (vm_map_lookup_entry(map, map_offset, &entry)) {
23957 if (entry->is_sub_map) {
23958 vm_map_t old_map;
23959 vm_map_lock(VME_SUBMAP(entry));
23960 old_map = map;
23961 map = VME_SUBMAP(entry);
23962 map_offset = (VME_OFFSET(entry) +
23963 (map_offset - entry->vme_start));
23964 vm_map_unlock(old_map);
23965 continue;
23966 }
23967 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
23968 vm_map_unlock(map);
23969 return (ppnum_t) 0;
23970 }
23971 if (VME_OBJECT(entry)->phys_contiguous) {
23972 /* These are not standard pageable memory mappings */
23973 /* If they are not present in the object they will */
23974 /* have to be picked up from the pager through the */
23975 /* fault mechanism. */
23976 if (VME_OBJECT(entry)->vo_shadow_offset == 0) {
23977 /* need to call vm_fault */
23978 vm_map_unlock(map);
23979 vm_fault(map, map_offset, VM_PROT_NONE,
23980 FALSE /* change_wiring */, VM_KERN_MEMORY_NONE,
23981 THREAD_UNINT, NULL, 0);
23982 vm_map_lock(map);
23983 continue;
23984 }
23985 offset = (VME_OFFSET(entry) +
23986 (map_offset - entry->vme_start));
23987 phys_page = (ppnum_t)
23988 ((VME_OBJECT(entry)->vo_shadow_offset
23989 + offset) >> PAGE_SHIFT);
23990 break;
23991 }
23992 offset = (VME_OFFSET(entry) + (map_offset - entry->vme_start));
23993 object = VME_OBJECT(entry);
23994 vm_object_lock(object);
23995 while (TRUE) {
23996 vm_page_t dst_page = vm_page_lookup(object, offset);
23997 if (dst_page == VM_PAGE_NULL) {
23998 if (object->shadow) {
23999 vm_object_t old_object;
24000 vm_object_lock(object->shadow);
24001 old_object = object;
24002 offset = offset + object->vo_shadow_offset;
24003 object = object->shadow;
24004 vm_object_unlock(old_object);
24005 } else {
24006 vm_object_unlock(object);
24007 break;
24008 }
24009 } else {
24010 phys_page = (ppnum_t)(VM_PAGE_GET_PHYS_PAGE(dst_page));
24011 vm_object_unlock(object);
24012 break;
24013 }
24014 }
24015 break;
24016 }
24017
24018 vm_map_unlock(map);
24019 return phys_page;
24020 }
24021
24022 #if CONFIG_MAP_RANGES
24023 static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];
24024 static bitmap_t vm_map_user_range_large_file_map[BITMAP_LEN(VM_MEMORY_COUNT)];
24025
24026 static_assert((int)UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
24027 static_assert((int)UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
24028
24029 /*
24030 * vm_map_range_map_init:
24031 * initializes the VM range ID bitmaps used to look up the
24032 * user VM range for a given VM tag coming from userspace.
24033 */
24034 static void
24035 vm_map_range_map_init(void)
24036 {
24037 /*
24038 * VM_MEMORY_MALLOC{,_NANO} are skipped on purpose:
24039 * - the former is malloc metadata which should be kept separate
24040 * - the latter has its own ranges
24041 */
24042 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
24043 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
24044 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
24045 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
24046 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
24047 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
24048 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
24049 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_TCMALLOC);
24050 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LIBNETWORK);
24051 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOACCELERATOR);
24052 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOSURFACE);
24053 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IMAGEIO);
24054 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREGRAPHICS);
24055 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_CORESERVICES);
24056 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREDATA);
24057 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LAYERKIT);
24058 bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOACCELERATOR);
24059 bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOSURFACE);
24060 }
24061
24062 static struct mach_vm_range
24063 vm_map_range_random_uniform(
24064 vm_map_size_t req_size,
24065 vm_map_offset_t min_addr,
24066 vm_map_offset_t max_addr,
24067 vm_map_offset_t offmask)
24068 {
24069 vm_map_offset_t random_addr;
24070 struct mach_vm_range alloc;
24071
24072 req_size = (req_size + offmask) & ~offmask;
24073 min_addr = (min_addr + offmask) & ~offmask;
24074 max_addr = max_addr & ~offmask;
24075
24076 read_random(&random_addr, sizeof(random_addr));
24077 random_addr %= (max_addr - req_size - min_addr);
24078 random_addr &= ~offmask;
24079
24080 alloc.min_address = min_addr + random_addr;
24081 alloc.max_address = min_addr + random_addr + req_size;
24082 return alloc;
24083 }
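/*
 * Illustrative invariant (not part of the original source): a range
 * returned by vm_map_range_random_uniform() is offmask-aligned on both
 * ends and fully contained in [min_addr, max_addr).  A hypothetical
 * debug helper expressing that property:
 */
static __unused bool
vm_map_range_random_uniform_check_sketch(
	struct mach_vm_range alloc,
	vm_map_offset_t min_addr,
	vm_map_offset_t max_addr,
	vm_map_offset_t offmask)
{
	return (alloc.min_address & offmask) == 0 &&
	       (alloc.max_address & offmask) == 0 &&
	       alloc.min_address >= min_addr &&
	       alloc.max_address <= max_addr;
}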
24084
24085 static vm_map_offset_t
24086 vm_map_range_offmask(void)
24087 {
24088 uint32_t pte_depth;
24089
24090 /*
24091 * PTE optimizations
24092 *
24093 *
24094 * 16k pages systems
24095 * ~~~~~~~~~~~~~~~~~
24096 *
24097 * A single L1 (sub-)page covers the address space.
24098 * - L2 pages cover 64G,
24099 * - L3 pages cover 32M.
24100 *
24101 * On embedded, the dynamic VA range is 64G and uses a single L2 page.
24102 * As a result, we really only need to align the ranges to 32M to avoid
24103 * partial L3 pages.
24104 *
24105 * On macOS, the usage of L2 pages will increase, so we want to align
24106 * ranges to 64G in order to utilize them fully.
24107 *
24108 *
24109 * 4k pages systems
24110 * ~~~~~~~~~~~~~~~~
24111 *
24112 * A single L0 (sub-)page covers the address space.
24113 * - L1 pages cover 512G,
24114 * - L2 pages cover 1G,
24115 * - L3 pages cover 2M.
24116 *
24117 * The long tail of processes on a system will tend to have a VA usage
24118 * (ignoring the shared regions) in the 100s of MB order of magnitude.
24119 * This is achievable with a single L1 and a few L2s without
24120 * randomization.
24121 *
24122 * However once randomization is introduced, the system will immediately
24123 * need several L1s and many more L2s. As a result:
24124 *
24125 * - on embedded devices, the cost of these extra pages isn't
24126 * sustainable, and we just disable the feature entirely,
24127 *
24128 * - on macOS we align ranges to a 512G boundary so that the extra L1
24129 * pages can be used to their full potential.
24130 */
24131
24132 /*
24133 * note, this function assumes _non exotic mappings_
24134 * which is why it uses the native kernel's PAGE_SHIFT.
24135 */
24136 #if XNU_PLATFORM_MacOSX
24137 pte_depth = PAGE_SHIFT > 12 ? 2 : 3;
24138 #else /* !XNU_PLATFORM_MacOSX */
24139 pte_depth = PAGE_SHIFT > 12 ? 1 : 0;
24140 #endif /* !XNU_PLATFORM_MacOSX */
24141
24142 if (pte_depth == 0) {
24143 return 0;
24144 }
24145
24146 return (1ull << ((PAGE_SHIFT - 3) * pte_depth + PAGE_SHIFT)) - 1;
24147 }
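/*
 * Worked example (illustrative only) of the shift above, which computes
 * the number of bytes covered by one page-table page "pte_depth" levels
 * above the leaf: each table level resolves (PAGE_SHIFT - 3) bits because
 * a page holds PAGE_SIZE / 8 descriptors.
 *
 *   16k pages (PAGE_SHIFT == 14):
 *     pte_depth 1 -> (1 << (11 + 14)) - 1 = 32M  - 1  (one L3 page)
 *     pte_depth 2 -> (1 << (22 + 14)) - 1 = 64G  - 1  (one L2 page)
 *   4k pages (PAGE_SHIFT == 12):
 *     pte_depth 3 -> (1 << (27 + 12)) - 1 = 512G - 1  (one L1 page)
 */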
24148
24149 /*
24150 * vm_map_range_configure:
24151 * configures the user vm_map ranges by increasing the maximum VA range of
24152 * the map and carving out a range at the end of VA space (searching backwards
24153 * in the newly expanded map).
24154 */
24155 kern_return_t
24156 vm_map_range_configure(vm_map_t map, __unused bool needs_extra_jumbo_va)
24157 {
24158 const vm_map_offset_t offmask = vm_map_range_offmask();
24159 struct mach_vm_range data_range;
24160 vm_map_offset_t default_end;
24161 kern_return_t kr;
24162
24163 if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) {
24164 /*
24165 * No point doing vm ranges in a 32bit address space.
24166 */
24167 return KERN_NOT_SUPPORTED;
24168 }
24169
24170 /* Should not be applying ranges to kernel map or kernel map submaps */
24171 assert(vm_map_pmap(map) != kernel_pmap);
24172
24173 #if XNU_PLATFORM_MacOSX
24174
24175 /*
24176 * on macOS, the address space is a massive 47 bits (128T),
24177 * with several carve outs that processes can't use:
24178 * - the shared region
24179 * - the commpage region
24180 * - the GPU carve out (if applicable)
24181 *
24182 * and when nano-malloc is in use it desires memory at the 96T mark.
24183 *
24184 * However, their location is architecture dependent:
24185 * - On intel, the shared region and commpage are
24186 * at the very end of the usable address space (above +127T),
24187 * and there is no GPU carve out, and pthread wants to place
24188 * threads at the 112T mark (0x70T).
24189 *
24190 * - On arm64, these are in the same spot as on embedded devices:
24191 * o shared region: [ 6G, 10G) [ will likely grow over time ]
24192 * o commpage region: [63G, 64G)
24193 * o GPU carve out: [64G, 448G)
24194 *
24195 * This is convenient because the mappings at the end of the address
24196 * space (when they exist) are made by the kernel.
24197 *
24198 * The policy is to allocate a random 1T for the data heap
24199 * in the end of the address-space in the:
24200 * - [0x71, 0x7f) range on Intel (to leave space for pthread stacks)
24201 * - [0x61, 0x7f) range on Apple silicon (to leave space for Nano malloc).
24202 */
24203
24204 /* see NANOZONE_SIGNATURE in libmalloc */
24205 #if __x86_64__
24206 default_end = 0x71ull << 40;
24207 #else
24208 default_end = 0x61ull << 40;
24209 #endif
24210 data_range = vm_map_range_random_uniform(1ull << 40,
24211 default_end, 0x7full << 40, offmask);
24212
24213 #else /* !XNU_PLATFORM_MacOSX */
24214
24215 /*
24216 * Embedded devices:
24217 *
24218 * The default VA Size scales with the device physical memory.
24219 *
24220 * Out of that:
24221 * - the "zero" page typically uses 4G + some slide
24222 * - the shared region uses SHARED_REGION_SIZE bytes (4G)
24223 *
24224 * Without the use of jumbo or any adjustment to the address space,
24225 * a default VM map typically looks like this:
24226 *
24227 * 0G -->╒════════════╕
24228 * │ pagezero │
24229 * │ + slide │
24230 * ~4G -->╞════════════╡<-- vm_map_min(map)
24231 * │ │
24232 * 6G -->├────────────┤
24233 * │ shared │
24234 * │ region │
24235 * 10G -->├────────────┤
24236 * │ │
24237 * max_va -->├────────────┤<-- vm_map_max(map)
24238 * │ │
24239 * ╎ jumbo ╎
24240 * ╎ ╎
24241 * │ │
24242 * 63G -->╞════════════╡<-- MACH_VM_MAX_ADDRESS
24243 * │ commpage │
24244 * 64G -->├────────────┤<-- MACH_VM_MIN_GPU_CARVEOUT_ADDRESS
24245 * │ │
24246 * ╎ GPU ╎
24247 * ╎ carveout ╎
24248 * │ │
24249 * 448G -->├────────────┤<-- MACH_VM_MAX_GPU_CARVEOUT_ADDRESS
24250 * │ │
24251 * ╎ ╎
24252 * ╎ ╎
24253 * │ │
24254 * 512G -->╘════════════╛<-- (1ull << ARM_16K_TT_L1_SHIFT)
24255 *
24256 * When this drawing was made, "max_va" was smaller than
24257 * ARM64_MAX_OFFSET_DEVICE_LARGE (~15.5G), leaving shy of
24258 * 12G of address space for the zero-page, slide, files,
24259 * binaries, heap ...
24260 *
24261 * We will want to make a "heap/data" carve out inside
24262 * the jumbo range of half of that usable space, assuming
24263 * that this is less than a fourth of the jumbo range.
24264 *
24265 * The assert below intends to catch when max_va grows
24266 * too large for this heuristic.
24267 */
24268
24269 vm_map_lock_read(map);
24270 default_end = vm_map_max(map);
24271 vm_map_unlock_read(map);
24272
24273 /*
24274 * Check that we're not already jumbo'd,
24275 * or our address space was somehow modified.
24276 *
24277 * If so we cannot guarantee that we can set up the ranges
24278 * safely without interfering with the existing map.
24279 */
24280 if (default_end > vm_compute_max_offset(true)) {
24281 return KERN_NO_SPACE;
24282 }
24283
24284 if (pmap_max_offset(true, ARM_PMAP_MAX_OFFSET_DEFAULT)) {
24285 /*
24286 * an override boot-arg was set, disable user-ranges
24287 *
24288 * XXX: this is problematic because it means these boot-args
24289 * no longer test the behavior changing the value
24290 * of ARM64_MAX_OFFSET_DEVICE_* would have.
24291 */
24292 return KERN_NOT_SUPPORTED;
24293 }
24294
24295 /* expand the default VM space to 64GB */
24296 vm_map_set_jumbo(map);
24297
24298 assert3u(7 * GiB(10) / 2, <=, vm_map_max(map) - default_end);
24299 data_range = vm_map_range_random_uniform(GiB(10),
24300 default_end + PAGE_SIZE, vm_map_max(map), offmask);
24301
24302 #endif /* !XNU_PLATFORM_MacOSX */
24303
24304 /*
24305 * Poke holes so that ASAN or people listing regions
24306 * do not think this space is free.
24307 */
24308
24309 if (default_end != data_range.min_address) {
24310 kr = vm_map_enter(map, &default_end,
24311 data_range.min_address - default_end,
24312 0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
24313 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
24314 assert(kr == KERN_SUCCESS);
24315 }
24316
24317 if (data_range.max_address != vm_map_max(map)) {
24318 vm_map_entry_t entry;
24319 vm_size_t size;
24320
24321 /*
24322 * Extend the end of the hole to the next VM entry or the end of the map,
24323 * whichever comes first.
24324 */
24325 vm_map_lock_read(map);
24326 vm_map_lookup_entry_or_next(map, data_range.max_address, &entry);
24327 if (entry == vm_map_to_entry(map) || entry->vme_start > vm_map_max(map)) {
24328 size = vm_map_max(map) - data_range.max_address;
24329 } else {
24330 size = entry->vme_start - data_range.max_address;
24331 }
24332 vm_map_unlock_read(map);
24333
24334 kr = vm_map_enter(map, &data_range.max_address, size,
24335 0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
24336 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
24337 assert(kr == KERN_SUCCESS);
24338 }
24339
24340 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
24341 if (needs_extra_jumbo_va) {
24342 /* This will grow the address space to MACH_VM_MAX_ADDRESS */
24343 vm_map_set_extra_jumbo(map);
24344 }
24345 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
24346
24347 vm_map_lock(map);
24348 map->default_range.min_address = vm_map_min(map);
24349 map->default_range.max_address = default_end;
24350 map->data_range = data_range;
24351 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
24352 /* If process has "extra jumbo" entitlement, enable large file range */
24353 if (needs_extra_jumbo_va) {
24354 map->large_file_range = vm_map_range_random_uniform(TiB(1),
24355 MACH_VM_JUMBO_ADDRESS, MACH_VM_MAX_ADDRESS, offmask);
24356 }
24357 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
24358 map->uses_user_ranges = true;
24359 vm_map_unlock(map);
24360
24361 return KERN_SUCCESS;
24362 }
24363
24364 /*
24365 * vm_map_range_fork:
24366 * clones the array of ranges from old_map to new_map in support
24367 * of a VM map fork.
24368 */
24369 void
24370 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
24371 {
24372 if (!old_map->uses_user_ranges) {
24373 /* nothing to do */
24374 return;
24375 }
24376
24377 new_map->default_range = old_map->default_range;
24378 new_map->data_range = old_map->data_range;
24379
24380 if (old_map->extra_ranges_count) {
24381 vm_map_user_range_t otable, ntable;
24382 uint16_t count;
24383
24384 otable = old_map->extra_ranges;
24385 count = old_map->extra_ranges_count;
24386 ntable = kalloc_data(count * sizeof(struct vm_map_user_range),
24387 Z_WAITOK | Z_ZERO | Z_NOFAIL);
24388 memcpy(ntable, otable,
24389 count * sizeof(struct vm_map_user_range));
24390
24391 new_map->extra_ranges_count = count;
24392 new_map->extra_ranges = ntable;
24393 }
24394
24395 new_map->uses_user_ranges = true;
24396 }
24397
24398 /*
24399 * vm_map_get_user_range:
24400 * copy the VM user range for the given VM map and range ID.
24401 */
24402 kern_return_t
24403 vm_map_get_user_range(
24404 vm_map_t map,
24405 vm_map_range_id_t range_id,
24406 mach_vm_range_t range)
24407 {
24408 if (map == NULL || !map->uses_user_ranges || range == NULL) {
24409 return KERN_INVALID_ARGUMENT;
24410 }
24411
24412 switch (range_id) {
24413 case UMEM_RANGE_ID_DEFAULT:
24414 *range = map->default_range;
24415 return KERN_SUCCESS;
24416
24417 case UMEM_RANGE_ID_HEAP:
24418 *range = map->data_range;
24419 return KERN_SUCCESS;
24420
24421 case UMEM_RANGE_ID_LARGE_FILE:
24422 /*
24423 * Because this function tells a user-space process about the user
24424 * ranges in its VM map, this case communicates whether the large file
24425 * range is in use. Note that this is different from how the large file
24426 * range ID is handled in `vm_map_get_range()`: there, we "resolve" the
24427 * VA policy and return either the large file range or data range,
24428 * depending on whether the large file range is enabled.
24429 */
24430 if (map->large_file_range.min_address != map->large_file_range.max_address) {
24431 /* large file range is configured and should be used */
24432 *range = map->large_file_range;
24433 } else {
24434 return KERN_INVALID_ARGUMENT;
24435 }
24436 return KERN_SUCCESS;
24437
24438 default:
24439 return KERN_INVALID_ARGUMENT;
24440 }
24441 }
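/*
 * Usage sketch (illustrative only): an in-kernel caller retrieving the
 * heap ("data") range of a task's map; the call fails cleanly when the
 * map was never configured with user ranges.
 */
static __unused bool
vm_map_get_heap_range_sketch(
	vm_map_t map,
	mach_vm_range_t range_out)
{
	return vm_map_get_user_range(map, UMEM_RANGE_ID_HEAP,
	           range_out) == KERN_SUCCESS;
}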
24442
24443 static vm_map_range_id_t
24444 vm_map_user_range_resolve(
24445 vm_map_t map,
24446 mach_vm_address_t addr,
24447 mach_vm_size_t size,
24448 mach_vm_range_t range)
24449 {
24450 struct mach_vm_range tmp;
24451
24452 vm_map_lock_assert_held(map);
24453
24454 static_assert((int)UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
24455 static_assert((int)UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
24456
24457 if (mach_vm_range_contains(&map->default_range, addr, size)) {
24458 if (range) {
24459 *range = map->default_range;
24460 }
24461 return UMEM_RANGE_ID_DEFAULT;
24462 }
24463
24464 if (mach_vm_range_contains(&map->data_range, addr, size)) {
24465 if (range) {
24466 *range = map->data_range;
24467 }
24468 return UMEM_RANGE_ID_HEAP;
24469 }
24470
24471 if (mach_vm_range_contains(&map->large_file_range, addr, size)) {
24472 if (range) {
24473 *range = map->large_file_range;
24474 }
24475 return UMEM_RANGE_ID_LARGE_FILE;
24476 }
24477
24478 for (size_t i = 0; i < map->extra_ranges_count; i++) {
24479 vm_map_user_range_t r = &map->extra_ranges[i];
24480
24481 tmp.min_address = r->vmur_min_address;
24482 tmp.max_address = r->vmur_max_address;
24483
24484 if (mach_vm_range_contains(&tmp, addr, size)) {
24485 if (range) {
24486 *range = tmp;
24487 }
24488 return r->vmur_range_id;
24489 }
24490 }
24491
24492 if (range) {
24493 range->min_address = range->max_address = 0;
24494 }
24495 return UMEM_RANGE_ID_DEFAULT;
24496 }
24497 #endif /* CONFIG_MAP_RANGES */
24498
24499 void
24500 vm_map_kernel_flags_update_range_id(
24501 vm_map_kernel_flags_t *vmkf,
24502 vm_map_t map,
24503 __unused vm_map_size_t size)
24504 {
24505 if (map == kernel_map) {
24506 if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
24507 vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
24508 }
24509 #if CONFIG_MAP_RANGES
24510 } else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
24511 vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT) {
24512 if (bitmap_test(vm_map_user_range_large_file_map, vmkf->vm_tag)
24513 || size >= VM_LARGE_FILE_THRESHOLD) {
24514 /*
24515 * if the map doesn't have the large file range configured,
24516 * the range will get resolved to the heap range in `vm_map_get_range`
24517 */
24518 vmkf->vmkf_range_id = UMEM_RANGE_ID_LARGE_FILE;
24519 } else if (bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
24520 vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
24521 }
24522 #endif /* CONFIG_MAP_RANGES */
24523 }
24524 }
24525
24526 /*
24527 * vm_map_entry_has_device_pager:
24528 * Check if the vm map entry specified by the virtual address has a device pager.
24529 * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
24530 */
24531 boolean_t
24532 vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
24533 {
24534 vm_map_entry_t entry;
24535 vm_object_t object;
24536 boolean_t result;
24537
24538 if (map == NULL) {
24539 return FALSE;
24540 }
24541
24542 vm_map_lock(map);
24543 while (TRUE) {
24544 if (!vm_map_lookup_entry(map, vaddr, &entry)) {
24545 result = FALSE;
24546 break;
24547 }
24548 if (entry->is_sub_map) {
24549 // Check the submap
24550 vm_map_t submap = VME_SUBMAP(entry);
24551 assert(submap != NULL);
24552 vm_map_lock(submap);
24553 vm_map_unlock(map);
24554 map = submap;
24555 continue;
24556 }
24557 object = VME_OBJECT(entry);
24558 if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
24559 result = TRUE;
24560 break;
24561 }
24562 result = FALSE;
24563 break;
24564 }
24565
24566 vm_map_unlock(map);
24567 return result;
24568 }
24569
24570 #if MACH_ASSERT
24571
24572 extern int pmap_ledgers_panic;
24573 extern int pmap_ledgers_panic_leeway;
24574
24575 #define LEDGER_DRIFT(__LEDGER) \
24576 int __LEDGER##_over; \
24577 ledger_amount_t __LEDGER##_over_total; \
24578 ledger_amount_t __LEDGER##_over_max; \
24579 int __LEDGER##_under; \
24580 ledger_amount_t __LEDGER##_under_total; \
24581 ledger_amount_t __LEDGER##_under_max
24582
24583 struct {
24584 uint64_t num_pmaps_checked;
24585
24586 LEDGER_DRIFT(phys_footprint);
24587 LEDGER_DRIFT(internal);
24588 LEDGER_DRIFT(internal_compressed);
24589 LEDGER_DRIFT(external);
24590 LEDGER_DRIFT(reusable);
24591 LEDGER_DRIFT(iokit_mapped);
24592 LEDGER_DRIFT(alternate_accounting);
24593 LEDGER_DRIFT(alternate_accounting_compressed);
24594 LEDGER_DRIFT(page_table);
24595 LEDGER_DRIFT(purgeable_volatile);
24596 LEDGER_DRIFT(purgeable_nonvolatile);
24597 LEDGER_DRIFT(purgeable_volatile_compressed);
24598 LEDGER_DRIFT(purgeable_nonvolatile_compressed);
24599 LEDGER_DRIFT(tagged_nofootprint);
24600 LEDGER_DRIFT(tagged_footprint);
24601 LEDGER_DRIFT(tagged_nofootprint_compressed);
24602 LEDGER_DRIFT(tagged_footprint_compressed);
24603 LEDGER_DRIFT(network_volatile);
24604 LEDGER_DRIFT(network_nonvolatile);
24605 LEDGER_DRIFT(network_volatile_compressed);
24606 LEDGER_DRIFT(network_nonvolatile_compressed);
24607 LEDGER_DRIFT(media_nofootprint);
24608 LEDGER_DRIFT(media_footprint);
24609 LEDGER_DRIFT(media_nofootprint_compressed);
24610 LEDGER_DRIFT(media_footprint_compressed);
24611 LEDGER_DRIFT(graphics_nofootprint);
24612 LEDGER_DRIFT(graphics_footprint);
24613 LEDGER_DRIFT(graphics_nofootprint_compressed);
24614 LEDGER_DRIFT(graphics_footprint_compressed);
24615 LEDGER_DRIFT(neural_nofootprint);
24616 LEDGER_DRIFT(neural_footprint);
24617 LEDGER_DRIFT(neural_nofootprint_compressed);
24618 LEDGER_DRIFT(neural_footprint_compressed);
24619 LEDGER_DRIFT(neural_nofootprint_total);
24620 } pmap_ledgers_drift;
24621
24622 void
24623 vm_map_pmap_check_ledgers(
24624 pmap_t pmap,
24625 ledger_t ledger,
24626 int pid,
24627 char *procname)
24628 {
24629 ledger_amount_t bal;
24630 boolean_t do_panic;
24631
24632 do_panic = FALSE;
24633
24634 pmap_ledgers_drift.num_pmaps_checked++;
24635
24636 #define LEDGER_CHECK_BALANCE(__LEDGER) \
24637 MACRO_BEGIN \
24638 int panic_on_negative = TRUE; \
24639 ledger_get_balance(ledger, \
24640 task_ledgers.__LEDGER, \
24641 &bal); \
24642 ledger_get_panic_on_negative(ledger, \
24643 task_ledgers.__LEDGER, \
24644 &panic_on_negative); \
24645 if (bal != 0) { \
24646 if (panic_on_negative || \
24647 (pmap_ledgers_panic && \
24648 pmap_ledgers_panic_leeway > 0 && \
24649 (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) || \
24650 bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
24651 do_panic = TRUE; \
24652 } \
24653 printf("LEDGER BALANCE proc %d (%s) " \
24654 "\"%s\" = %lld\n", \
24655 pid, procname, #__LEDGER, bal); \
24656 if (bal > 0) { \
24657 pmap_ledgers_drift.__LEDGER##_over++; \
24658 pmap_ledgers_drift.__LEDGER##_over_total += bal; \
24659 if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
24660 pmap_ledgers_drift.__LEDGER##_over_max = bal; \
24661 } \
24662 } else if (bal < 0) { \
24663 pmap_ledgers_drift.__LEDGER##_under++; \
24664 pmap_ledgers_drift.__LEDGER##_under_total += bal; \
24665 if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
24666 pmap_ledgers_drift.__LEDGER##_under_max = bal; \
24667 } \
24668 } \
24669 } \
24670 MACRO_END
24671
24672 LEDGER_CHECK_BALANCE(phys_footprint);
24673 LEDGER_CHECK_BALANCE(internal);
24674 LEDGER_CHECK_BALANCE(internal_compressed);
24675 LEDGER_CHECK_BALANCE(external);
24676 LEDGER_CHECK_BALANCE(reusable);
24677 LEDGER_CHECK_BALANCE(iokit_mapped);
24678 LEDGER_CHECK_BALANCE(alternate_accounting);
24679 LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
24680 LEDGER_CHECK_BALANCE(page_table);
24681 LEDGER_CHECK_BALANCE(purgeable_volatile);
24682 LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
24683 LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
24684 LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
24685 LEDGER_CHECK_BALANCE(tagged_nofootprint);
24686 LEDGER_CHECK_BALANCE(tagged_footprint);
24687 LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
24688 LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
24689 LEDGER_CHECK_BALANCE(network_volatile);
24690 LEDGER_CHECK_BALANCE(network_nonvolatile);
24691 LEDGER_CHECK_BALANCE(network_volatile_compressed);
24692 LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
24693 LEDGER_CHECK_BALANCE(media_nofootprint);
24694 LEDGER_CHECK_BALANCE(media_footprint);
24695 LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
24696 LEDGER_CHECK_BALANCE(media_footprint_compressed);
24697 LEDGER_CHECK_BALANCE(graphics_nofootprint);
24698 LEDGER_CHECK_BALANCE(graphics_footprint);
24699 LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
24700 LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
24701 LEDGER_CHECK_BALANCE(neural_nofootprint);
24702 LEDGER_CHECK_BALANCE(neural_footprint);
24703 LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
24704 LEDGER_CHECK_BALANCE(neural_footprint_compressed);
24705 LEDGER_CHECK_BALANCE(neural_nofootprint_total);
24706
24707 if (do_panic) {
24708 if (pmap_ledgers_panic) {
24709 panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
24710 pmap, pid, procname);
24711 } else {
24712 printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
24713 pmap, pid, procname);
24714 }
24715 }
24716 }
24717
24718 void
24719 vm_map_pmap_set_process(
24720 vm_map_t map,
24721 int pid,
24722 char *procname)
24723 {
24724 pmap_set_process(vm_map_pmap(map), pid, procname);
24725 }
24726
24727 #endif /* MACH_ASSERT */
24728
24729 /**
24730 * Check if a given map operation size is valid for the given map, taking
24731 * into account whether or not the map operation has overridden the soft limit.
24732 *
24733 * This function is meant to be inlined wherever possible as it can, in some
24734 * modes, generate telemetry events which capture shallow backtraces. To
24735 * maximize the usefulness of this backtrace, we want to minimize the depth at
24736 * which the backtrace is taken.
24737 */
24738 __attribute__((always_inline))
24739 bool
24740 vm_map_is_map_size_valid(
24741 vm_map_t target_map,
24742 vm_size_t size,
24743 bool no_soft_limit)
24744 {
24745 #ifdef __x86_64__
24746 // Do not enforce any additional limits on x64
24747 (void)target_map;
24748 (void)size;
24749 (void)no_soft_limit;
24750 return true;
24751 #else
24752 if (__probable(target_map->pmap != kernel_pmap ||
24753 size < VM_KERNEL_SIMPLE_MAX_SIZE || no_soft_limit)) {
24754 // Allocation size matches policy
24755 return true;
24756 }
24757
24758 switch (vm_map_kernel_alloc_limit_mode) {
24759 default:
24760 case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_BYPASS:
24761 return true;
24762 case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_TRAP:
24763 trap_telemetry_report_kernel_soft_error(
24764 TRAP_TELEMETRY_KERNEL_SOFT_ERROR_VM_KERNEL_MAX_ALLOC_SIZE,
24765 /* report_once_per_site */ false);
24766 return true;
24767 case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_REJECT:
24768 return false;
24769 case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_PANIC:
24770 panic("1,000,000K ought to be enough for anybody "
24771 "(requested %lu bytes)", size);
24772 }
24773 #endif /* __x86_64__ */
24774 }
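/*
 * Usage sketch (illustrative only, with a hypothetical caller): a
 * kernel_map allocation path could gate an oversized request on this
 * check and let vm_map_kernel_alloc_limit_mode decide whether to bypass,
 * trap, reject or panic.
 */
static __unused bool
vm_map_size_check_sketch(
	vm_map_t target_map,
	vm_size_t size)
{
	return vm_map_is_map_size_valid(target_map, size,
	           /* no_soft_limit */ false);
}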
24775
24776 vm_map_serial_t
24777 vm_map_maybe_serial_id(vm_map_t maybe_vm_map)
24778 {
24779 return maybe_vm_map != NULL ? maybe_vm_map->serial_id : VM_MAP_SERIAL_NONE;
24780 }
24781