1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_map.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * Virtual memory mapping module.
64 */
65
66 #include <mach/vm_types.h>
67 #include <mach_assert.h>
68
69 #include <vm/vm_options.h>
70
71 #include <libkern/OSAtomic.h>
72
73 #include <mach/kern_return.h>
74 #include <mach/port.h>
75 #include <mach/vm_attributes.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_behavior.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/memory_object.h>
80 #include <mach/mach_vm_server.h>
81 #include <machine/cpu_capabilities.h>
82 #include <mach/sdt.h>
83
84 #include <kern/assert.h>
85 #include <kern/backtrace.h>
86 #include <kern/counter.h>
87 #include <kern/exc_guard.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90 #include <kern/telemetry.h>
91 #include <kern/trap_telemetry.h>
92
93 #include <vm/cpm_internal.h>
94 #include <vm/memory_types.h>
95 #include <vm/vm_compressor_xnu.h>
96 #include <vm/vm_compressor_pager_internal.h>
97 #include <vm/vm_init_xnu.h>
98 #include <vm/vm_fault_internal.h>
99 #include <vm/vm_map_internal.h>
100 #include <vm/vm_object_internal.h>
101 #include <vm/vm_page_internal.h>
102 #include <vm/vm_pageout.h>
103 #include <vm/pmap.h>
104 #include <vm/vm_kern_internal.h>
105 #include <ipc/ipc_port.h>
106 #include <kern/sched_prim.h>
107 #include <kern/misc_protos.h>
108
109 #include <mach/vm_map_server.h>
110 #include <mach/mach_host_server.h>
111 #include <vm/vm_memtag.h>
112 #include <vm/vm_protos_internal.h>
113 #include <vm/vm_purgeable_internal.h>
114
115 #include <vm/vm_iokit.h>
116 #include <vm/vm_shared_region_internal.h>
117 #include <vm/vm_map_store_internal.h>
118 #include <vm/vm_memory_entry_xnu.h>
119 #include <vm/memory_object_internal.h>
120 #include <vm/vm_memory_entry.h>
121 #include <vm/vm_sanitize_internal.h>
122 #include <vm/vm_reclaim_xnu.h>
123 #if DEVELOPMENT || DEBUG
124 #include <vm/vm_compressor_info.h>
125 #endif /* DEVELOPMENT || DEBUG */
126 #include <san/kasan.h>
127
128 #include <sys/resource.h>
129 #include <sys/random.h>
130 #include <sys/codesign.h>
131 #include <sys/code_signing.h>
132 #include <sys/mman.h>
133 #include <sys/reboot.h>
134 #include <sys/kdebug_triage.h>
135 #include <sys/reason.h>
136
137 #include <os/log.h>
138
139 #include <libkern/section_keywords.h>
140
141 #include <os/hash.h>
142
143 #if DEVELOPMENT || DEBUG
144 extern int proc_selfcsflags(void);
145 int vm_log_xnu_user_debug = 0;
146 int panic_on_unsigned_execute = 0;
147 int panic_on_mlock_failure = 0;
148 #endif /* DEVELOPMENT || DEBUG */
149
150 #if DEVELOPMENT || DEBUG
151 int debug4k_filter = 0;
152 char debug4k_proc_name[1024] = "";
153 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
154 int debug4k_panic_on_misaligned_sharing = 0;
155 const char *debug4k_category_name[] = {
156 "error", /* 0 */
157 "life", /* 1 */
158 "load", /* 2 */
159 "fault", /* 3 */
160 "copy", /* 4 */
161 "share", /* 5 */
162 "adjust", /* 6 */
163 "pmap", /* 7 */
164 "mementry", /* 8 */
165 "iokit", /* 9 */
166 "upl", /* 10 */
167 "exc", /* 11 */
168 "vfs" /* 12 */
169 };
170 #endif /* DEVELOPMENT || DEBUG */
171 int debug4k_no_cow_copyin = 0;
172
173
174 #if __arm64__
175 extern const int fourk_binary_compatibility_unsafe;
176 #endif /* __arm64__ */
177 extern int proc_selfpid(void);
178 extern char *proc_name_address(void *p);
179 extern const char *proc_best_name(struct proc *p);
180
181 #if VM_MAP_DEBUG_APPLE_PROTECT
182 int vm_map_debug_apple_protect = 0;
183 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
184 #if VM_MAP_DEBUG_FOURK
185 int vm_map_debug_fourk = 0;
186 #endif /* VM_MAP_DEBUG_FOURK */
187
188 #if DEBUG || DEVELOPMENT
189 static TUNABLE(bool, vm_map_executable_immutable,
190 "vm_map_executable_immutable", true);
191 #else
192 #define vm_map_executable_immutable true
193 #endif
194
195 /** Do not enforce the kernel allocation size limit */
196 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_BYPASS (0)
197 /** Enforce the kernel allocation limit by refusing too large requests */
198 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_REJECT (1)
199 /** Enforce the kernel allocation limit by panicking on any too large request */
200 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_PANIC (2)
201 /** Do not enforce the kernel allocation limit but generate a telemetry trap */
202 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_TRAP (3)
203
204 #if DEVELOPMENT || DEBUG
205 static TUNABLE(int, vm_map_kernel_alloc_limit_mode,
206 "vm_map_kernel_alloc_limit_mode", VM_MAP_KERNEL_ALLOC_LIMIT_MODE_TRAP);
207 #else
208 #define vm_map_kernel_alloc_limit_mode VM_MAP_KERNEL_ALLOC_LIMIT_MODE_BYPASS
209 #endif /* DEVELOPMENT || DEBUG */
210
211 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
212
213 extern u_int32_t random(void); /* from <libkern/libkern.h> */
214 /* Internal prototypes
215 */
216
217 typedef struct vm_map_zap {
218 vm_map_entry_t vmz_head;
219 vm_map_entry_t *vmz_tail;
220 } *vm_map_zap_t;
221
222 #define VM_MAP_ZAP_DECLARE(zap) \
223 struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
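
/*
 * A zap list (sketch of intent): map entries removed while the map lock is
 * held are chained through vmz_head/vmz_tail (see the "zap" argument of
 * vm_map_delete() below) so they can be torn down and freed once the map
 * lock has been dropped.
 */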
224
225 extern kern_return_t vm_map_wire_external(
226 vm_map_t map,
227 vm_map_offset_ut start_u,
228 vm_map_offset_ut end_u,
229 vm_prot_ut prot_u,
230 boolean_t user_wire) __exported;
231
232 #if XNU_PLATFORM_MacOSX
233 extern /* exported via Private.<arch>.MacOSX.exports on macOS */
234 #else
235 static
236 #endif
237 kern_return_t vm_map_copyin_common(
238 vm_map_t src_map,
239 vm_map_address_ut src_addr,
240 vm_map_size_ut len,
241 boolean_t src_destroy,
242 boolean_t src_volatile,
243 vm_map_copy_t *copy_result, /* OUT */
244 boolean_t use_maxprot);
245
246 static vm_map_entry_t vm_map_entry_insert(
247 vm_map_t map,
248 vm_map_entry_t insp_entry,
249 vm_map_offset_t start,
250 vm_map_offset_t end,
251 vm_object_t object,
252 vm_object_offset_t offset,
253 vm_map_kernel_flags_t vmk_flags,
254 boolean_t needs_copy,
255 vm_prot_t cur_protection,
256 vm_prot_t max_protection,
257 vm_inherit_t inheritance,
258 boolean_t clear_map_aligned);
259
260 static void vm_map_simplify_range(
261 vm_map_t map,
262 vm_map_offset_t start,
263 vm_map_offset_t end); /* forward */
264
265 static boolean_t vm_map_range_check(
266 vm_map_t map,
267 vm_map_offset_t start,
268 vm_map_offset_t end,
269 vm_map_entry_t *entry);
270
271 static void vm_map_submap_pmap_clean(
272 vm_map_t map,
273 vm_map_offset_t start,
274 vm_map_offset_t end,
275 vm_map_t sub_map,
276 vm_map_offset_t offset);
277
278 static void vm_map_pmap_enter(
279 vm_map_t map,
280 vm_map_offset_t addr,
281 vm_map_offset_t end_addr,
282 vm_object_t object,
283 vm_object_offset_t offset,
284 vm_prot_t protection);
285
286 static void _vm_map_clip_end(
287 struct vm_map_header *map_header,
288 vm_map_entry_t entry,
289 vm_map_offset_t end);
290
291 static void _vm_map_clip_start(
292 struct vm_map_header *map_header,
293 vm_map_entry_t entry,
294 vm_map_offset_t start);
295
296 static kmem_return_t vm_map_delete(
297 vm_map_t map,
298 vm_map_offset_t start,
299 vm_map_offset_t end,
300 vmr_flags_t flags,
301 kmem_guard_t guard,
302 vm_map_zap_t zap);
303
304 static void vm_map_copy_insert(
305 vm_map_t map,
306 vm_map_entry_t after_where,
307 vm_map_copy_t copy);
308
309 static kern_return_t vm_map_copy_overwrite_unaligned(
310 vm_map_t dst_map,
311 vm_map_entry_t entry,
312 vm_map_copy_t copy,
313 vm_map_address_t start,
314 boolean_t discard_on_success);
315
316 static kern_return_t vm_map_copy_overwrite_aligned(
317 vm_map_t dst_map,
318 vm_map_entry_t tmp_entry,
319 vm_map_copy_t copy,
320 vm_map_offset_t start,
321 pmap_t pmap);
322
323 static kern_return_t vm_map_copyin_kernel_buffer(
324 vm_map_t src_map,
325 vm_map_address_t src_addr,
326 vm_map_size_t len,
327 boolean_t src_destroy,
328 vm_map_copy_t *copy_result); /* OUT */
329
330 static kern_return_t vm_map_copyout_kernel_buffer(
331 vm_map_t map,
332 vm_map_address_t *addr, /* IN/OUT */
333 vm_map_copy_t copy,
334 vm_map_size_t copy_size,
335 boolean_t overwrite,
336 boolean_t consume_on_success);
337
338 static void vm_map_fork_share(
339 vm_map_t old_map,
340 vm_map_entry_t old_entry,
341 vm_map_t new_map);
342
343 static boolean_t vm_map_fork_copy(
344 vm_map_t old_map,
345 vm_map_entry_t *old_entry_p,
346 vm_map_t new_map,
347 int vm_map_copyin_flags);
348
349 static kern_return_t vm_map_wire_nested(
350 vm_map_t map,
351 vm_map_offset_t start,
352 vm_map_offset_t end,
353 vm_prot_t caller_prot,
354 vm_tag_t tag,
355 boolean_t user_wire,
356 pmap_t map_pmap,
357 vm_map_offset_t pmap_addr,
358 ppnum_t *physpage_p);
359
360 static kern_return_t vm_map_unwire_nested(
361 vm_map_t map,
362 vm_map_offset_t start,
363 vm_map_offset_t end,
364 boolean_t user_wire,
365 pmap_t map_pmap,
366 vm_map_offset_t pmap_addr);
367
368 static kern_return_t vm_map_overwrite_submap_recurse(
369 vm_map_t dst_map,
370 vm_map_offset_t dst_addr,
371 vm_map_size_t dst_size);
372
373 static kern_return_t vm_map_copy_overwrite_nested(
374 vm_map_t dst_map,
375 vm_map_offset_t dst_addr,
376 vm_map_copy_t copy,
377 boolean_t interruptible,
378 pmap_t pmap,
379 boolean_t discard_on_success);
380
381 static kern_return_t vm_map_remap_extract(
382 vm_map_t map,
383 vm_map_offset_t addr,
384 vm_map_size_t size,
385 boolean_t copy,
386 vm_map_copy_t map_copy,
387 vm_prot_t *cur_protection,
388 vm_prot_t *max_protection,
389 vm_inherit_t inheritance,
390 vm_map_kernel_flags_t vmk_flags);
391
392 static void vm_map_region_look_for_page(
393 vm_map_t map,
394 vm_map_offset_t va,
395 vm_object_t object,
396 vm_object_offset_t offset,
397 int max_refcnt,
398 unsigned short depth,
399 vm_region_extended_info_t extended,
400 mach_msg_type_number_t count);
401
402 static boolean_t vm_map_region_has_obj_ref(
403 vm_map_entry_t entry,
404 vm_object_t object);
405
406
407 static kern_return_t vm_map_willneed(
408 vm_map_t map,
409 vm_map_offset_t start,
410 vm_map_offset_t end);
411
412 static kern_return_t vm_map_reuse_pages(
413 vm_map_t map,
414 vm_map_offset_t start,
415 vm_map_offset_t end);
416
417 static kern_return_t vm_map_reusable_pages(
418 vm_map_t map,
419 vm_map_offset_t start,
420 vm_map_offset_t end);
421
422 static kern_return_t vm_map_can_reuse(
423 vm_map_t map,
424 vm_map_offset_t start,
425 vm_map_offset_t end);
426
427 static kern_return_t vm_map_zero(
428 vm_map_t map,
429 vm_map_offset_t start,
430 vm_map_offset_t end);
431
432 static kern_return_t vm_map_random_address_for_size(
433 vm_map_t map,
434 vm_map_offset_t *address,
435 vm_map_size_t size,
436 vm_map_kernel_flags_t vmk_flags);
437
438
439 #if CONFIG_MAP_RANGES
440
441 static vm_map_range_id_t vm_map_user_range_resolve(
442 vm_map_t map,
443 mach_vm_address_t addr,
444 mach_vm_address_t size,
445 mach_vm_range_t range);
446
447 #endif /* CONFIG_MAP_RANGES */
448 #if MACH_ASSERT
449 static kern_return_t vm_map_pageout(
450 vm_map_t map,
451 vm_map_offset_t start,
452 vm_map_offset_t end);
453 #endif /* MACH_ASSERT */
454
455 kern_return_t vm_map_corpse_footprint_collect(
456 vm_map_t old_map,
457 vm_map_entry_t old_entry,
458 vm_map_t new_map);
459 void vm_map_corpse_footprint_collect_done(
460 vm_map_t new_map);
461 void vm_map_corpse_footprint_destroy(
462 vm_map_t map);
463 kern_return_t vm_map_corpse_footprint_query_page_info(
464 vm_map_t map,
465 vm_map_offset_t va,
466 int *disposition_p);
467 void vm_map_footprint_query_page_info(
468 vm_map_t map,
469 vm_map_entry_t map_entry,
470 vm_map_offset_t curr_s_offset,
471 int *disposition_p);
472
473 #if CONFIG_MAP_RANGES
474 static void vm_map_range_map_init(void);
475 #endif /* CONFIG_MAP_RANGES */
476
477 pid_t find_largest_process_vm_map_entries(void);
478
479 __attribute__((always_inline))
480 int
vm_map_kernel_flags_vmflags(vm_map_kernel_flags_t vmk_flags)
482 {
483 int flags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;
484
485 /* in vmk flags the meaning of fixed/anywhere is inverted */
486 return flags ^ (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
487 }
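
/*
 * Example of the inversion noted above: a caller that passed
 * VM_FLAGS_ANYWHERE to vm_map_kernel_flags_set_vmflags() is stored with
 * vmf_fixed == false; the XOR here flips the bit back, so the caller reads
 * VM_FLAGS_ANYWHERE again, and VM_FLAGS_FIXED likewise round-trips through
 * vmf_fixed == true.
 */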
488
489 __attribute__((always_inline, overloadable))
490 void
vm_map_kernel_flags_set_vmflags(
492 vm_map_kernel_flags_t *vmk_flags,
493 int vm_flags,
494 vm_tag_t vm_tag)
495 {
496 vm_flags ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
497 vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
498 vmk_flags->__vm_flags |= (vm_flags & VM_FLAGS_ANY_MASK);
499 vmk_flags->vm_tag = vm_tag;
500 }
501
502 __attribute__((always_inline, overloadable))
503 void
vm_map_kernel_flags_set_vmflags(
505 vm_map_kernel_flags_t *vmk_flags,
506 int vm_flags_and_tag)
507 {
508 vm_flags_and_tag ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
509 vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
510 vmk_flags->__vm_flags |= (vm_flags_and_tag & VM_FLAGS_ANY_MASK);
511 VM_GET_FLAGS_ALIAS(vm_flags_and_tag, vmk_flags->vm_tag);
512 }
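
/*
 * Usage sketch (hypothetical caller, not taken from this file): the
 * single-argument overload is for a combined flags word that carries the
 * tag in its alias bits, e.g.
 *
 *	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
 *	vm_map_kernel_flags_set_vmflags(&vmk_flags,
 *	    VM_FLAGS_ANYWHERE | VM_MAKE_TAG(VM_MEMORY_MALLOC));
 *
 * VM_GET_FLAGS_ALIAS() then recovers the tag into vmk_flags->vm_tag.
 */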
513
514 __attribute__((always_inline))
515 void
vm_map_kernel_flags_and_vmflags(
517 vm_map_kernel_flags_t *vmk_flags,
518 int vm_flags_mask)
519 {
520 /* this function doesn't handle the inverted FIXED/ANYWHERE */
521 assert(vm_flags_mask & VM_FLAGS_ANYWHERE);
522 vmk_flags->__vm_flags &= vm_flags_mask;
523 }
524
525 __attribute__((always_inline))
526 bool
vm_map_kernel_flags_check_vm_and_kflags(
528 vm_map_kernel_flags_t vmk_flags,
529 int vm_flags_mask)
530 {
531 return (vmk_flags.__vm_flags & ~vm_flags_mask) == 0;
532 }
533
534 bool
vm_map_kernel_flags_check_vmflags(
536 vm_map_kernel_flags_t vmk_flags,
537 int vm_flags_mask)
538 {
539 int vmflags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;
540
541 /* Note: up to 16 still has good calling conventions */
542 static_assert(sizeof(vm_map_kernel_flags_t) == 16);
543
544 #if DEBUG || DEVELOPMENT
545 /*
546 * All of this compiles to nothing if all checks pass.
547 */
548 #define check(field, value) ({ \
549 vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE; \
550 fl.__vm_flags = (value); \
551 fl.field = 0; \
552 assert(fl.__vm_flags == 0); \
553 })
554
555 /* bits 0-7 */
556 check(vmf_fixed, VM_FLAGS_ANYWHERE); // kind of a lie this is inverted
557 check(vmf_purgeable, VM_FLAGS_PURGABLE);
558 check(vmf_4gb_chunk, VM_FLAGS_4GB_CHUNK);
559 check(vmf_random_addr, VM_FLAGS_RANDOM_ADDR);
560 check(vmf_no_cache, VM_FLAGS_NO_CACHE);
561 check(vmf_resilient_codesign, VM_FLAGS_RESILIENT_CODESIGN);
562 check(vmf_resilient_media, VM_FLAGS_RESILIENT_MEDIA);
563 check(vmf_permanent, VM_FLAGS_PERMANENT);
564
565 /* bits 8-15 */
566 check(vmf_tpro, VM_FLAGS_TPRO);
567 check(vmf_overwrite, VM_FLAGS_OVERWRITE);
568
569 /* bits 16-23 */
570 check(vmf_superpage_size, VM_FLAGS_SUPERPAGE_MASK);
571 check(vmf_return_data_addr, VM_FLAGS_RETURN_DATA_ADDR);
572 check(vmf_return_4k_data_addr, VM_FLAGS_RETURN_4K_DATA_ADDR);
573
574 {
575 vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE;
576
577 /* check user tags will never clip */
578 fl.vm_tag = VM_MEMORY_COUNT - 1;
579 assert(fl.vm_tag == VM_MEMORY_COUNT - 1);
580
581 /* check kernel tags will never clip */
582 fl.vm_tag = VM_MAX_TAG_VALUE - 1;
583 assert(fl.vm_tag == VM_MAX_TAG_VALUE - 1);
584 }
585
586
587 #undef check
588 #endif /* DEBUG || DEVELOPMENT */
589
590 return (vmflags & ~vm_flags_mask) == 0;
591 }
592
593 /*
594 * Macros to copy a vm_map_entry. We must be careful to correctly
595 * manage the wired page count. vm_map_entry_copy() creates a new
596 * map entry to the same memory - the wired count in the new entry
597 * must be set to zero. vm_map_entry_copy_full() creates a new
598 * entry that is identical to the old entry. This preserves the
599 * wire count; it's used for map splitting and zone changing in
600 * vm_map_copyout.
601 */
602
603 static inline void
vm_map_entry_copy_csm_assoc(
605 vm_map_t map __unused,
606 vm_map_entry_t new __unused,
607 vm_map_entry_t old __unused)
608 {
609 #if CODE_SIGNING_MONITOR
610 /* when code signing monitor is enabled, we want to reset on copy */
611 new->csm_associated = FALSE;
612 #else
613 /* when code signing monitor is not enabled, assert as a sanity check */
614 assert(new->csm_associated == FALSE);
615 #endif
616 #if DEVELOPMENT || DEBUG
617 if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
618 printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug\n",
619 proc_selfpid(),
620 (get_bsdtask_info(current_task())
621 ? proc_name_address(get_bsdtask_info(current_task()))
622 : "?"),
623 __FUNCTION__, __LINE__,
624 map, new, new->vme_start, new->vme_end);
625 }
626 #endif /* DEVELOPMENT || DEBUG */
627 #if XNU_TARGET_OS_OSX
628 /*
629 * On macOS, entries with "vme_xnu_user_debug" can be copied during fork()
630 * and we want the child's entry to keep its "vme_xnu_user_debug" to avoid
 * triggering CSM assertions when the child accesses its mapping.
632 */
633 #else /* XNU_TARGET_OS_OSX */
634 new->vme_xnu_user_debug = FALSE;
635 #endif /* XNU_TARGET_OS_OSX */
636 }
637
638 /*
639 * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
640 * But for security reasons on some platforms, we don't want the
641 * new mapping to be "used for jit", so we reset the flag here.
642 */
643 static inline void
vm_map_entry_copy_code_signing(
645 vm_map_t map,
646 vm_map_entry_t new,
647 vm_map_entry_t old __unused)
648 {
649 if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
650 assert(new->used_for_jit == old->used_for_jit);
651 } else {
652 if (old->used_for_jit) {
653 DTRACE_VM3(cs_wx,
654 uint64_t, new->vme_start,
655 uint64_t, new->vme_end,
656 vm_prot_t, new->protection);
657 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
658 proc_selfpid(),
659 (get_bsdtask_info(current_task())
660 ? proc_name_address(get_bsdtask_info(current_task()))
661 : "?"),
662 __FUNCTION__,
663 "removing execute access");
664 new->protection &= ~VM_PROT_EXECUTE;
665 new->max_protection &= ~VM_PROT_EXECUTE;
666 }
667 new->used_for_jit = FALSE;
668 }
669 }
670
671 static inline void
vm_map_entry_copy_full(
673 vm_map_entry_t new,
674 vm_map_entry_t old)
675 {
676 #if MAP_ENTRY_CREATION_DEBUG
677 btref_put(new->vme_creation_bt);
678 btref_retain(old->vme_creation_bt);
679 #endif
680 #if MAP_ENTRY_INSERTION_DEBUG
681 btref_put(new->vme_insertion_bt);
682 btref_retain(old->vme_insertion_bt);
683 #endif
684 #if VM_BTLOG_TAGS
685 /* Discard the btref that might be in the new entry */
686 if (new->vme_kernel_object) {
687 btref_put(new->vme_tag_btref);
688 }
689 /* Retain the btref in the old entry to account for its copy */
690 if (old->vme_kernel_object) {
691 btref_retain(old->vme_tag_btref);
692 }
693 #endif /* VM_BTLOG_TAGS */
694 *new = *old;
695 }
696
697 static inline void
vm_map_entry_copy(
699 vm_map_t map,
700 vm_map_entry_t new,
701 vm_map_entry_t old)
702 {
703 vm_map_entry_copy_full(new, old);
704
705 new->is_shared = FALSE;
706 new->needs_wakeup = FALSE;
707 new->in_transition = FALSE;
708 new->wired_count = 0;
709 new->user_wired_count = 0;
710 new->vme_permanent = FALSE;
711 vm_map_entry_copy_code_signing(map, new, old);
712 vm_map_entry_copy_csm_assoc(map, new, old);
713 if (new->iokit_acct) {
714 assertf(!new->use_pmap, "old %p new %p\n", old, new);
715 new->iokit_acct = FALSE;
716 new->use_pmap = TRUE;
717 }
718 new->vme_resilient_codesign = FALSE;
719 new->vme_resilient_media = FALSE;
720 new->vme_atomic = FALSE;
721 new->vme_no_copy_on_read = FALSE;
722 }
723
724 /*
725 * Normal lock_read_to_write() returns FALSE/0 on failure.
726 * These functions evaluate to zero on success and non-zero value on failure.
727 */
728 __attribute__((always_inline))
729 int
vm_map_lock_read_to_write(vm_map_t map)
731 {
732 if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
733 DTRACE_VM(vm_map_lock_upgrade);
734 return 0;
735 }
736 return 1;
737 }
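
/*
 * Typical retry pattern (sketch): when the upgrade fails, the shared hold
 * on the map lock has already been dropped, so the caller must re-acquire
 * and re-validate, e.g.
 *
 *	if (vm_map_lock_read_to_write(map)) {
 *		vm_map_lock_read(map);
 *		... state may have changed: re-lookup before retrying ...
 *	}
 */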
738
739 __attribute__((always_inline))
740 boolean_t
vm_map_try_lock(vm_map_t map)
742 {
743 if (lck_rw_try_lock_exclusive(&(map)->lock)) {
744 DTRACE_VM(vm_map_lock_w);
745 return TRUE;
746 }
747 return FALSE;
748 }
749
750 __attribute__((always_inline))
751 boolean_t
vm_map_try_lock_read(vm_map_t map)
753 {
754 if (lck_rw_try_lock_shared(&(map)->lock)) {
755 DTRACE_VM(vm_map_lock_r);
756 return TRUE;
757 }
758 return FALSE;
759 }
760
761 /*!
762 * @function kdp_vm_map_is_acquired_exclusive
763 *
764 * @abstract
765 * Checks if vm map is acquired exclusive.
766 *
767 * @discussion
768 * NOT SAFE: To be used only by kernel debugger.
769 *
770 * @param map map to check
771 *
772 * @returns TRUE if the map is acquired exclusively.
773 */
774 boolean_t
kdp_vm_map_is_acquired_exclusive(vm_map_t map)
776 {
777 return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
778 }
779
780 /*
781 * Routines to get the page size the caller should
782 * use while inspecting the target address space.
783 * Use the "_safely" variant if the caller is dealing with a user-provided
784 * array whose size depends on the page size, to avoid any overflow or
785 * underflow of a user-allocated buffer.
786 */
787 int
vm_self_region_page_shift_safely(
789 vm_map_t target_map)
790 {
791 int effective_page_shift = 0;
792
793 if (PAGE_SIZE == (4096)) {
794 /* x86_64 and 4k watches: always use 4k */
795 return PAGE_SHIFT;
796 }
797 /* did caller provide an explicit page size for this thread to use? */
798 effective_page_shift = thread_self_region_page_shift();
799 if (effective_page_shift) {
800 /* use the explicitly-provided page size */
801 return effective_page_shift;
802 }
803 /* no explicit page size: use the caller's page size... */
804 effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
805 if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
806 /* page size match: safe to use */
807 return effective_page_shift;
808 }
809 /* page size mismatch */
810 return -1;
811 }
812 int
vm_self_region_page_shift(
814 vm_map_t target_map)
815 {
816 int effective_page_shift;
817
818 effective_page_shift = vm_self_region_page_shift_safely(target_map);
819 if (effective_page_shift == -1) {
820 /* no safe value but OK to guess for caller */
821 effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
822 VM_MAP_PAGE_SHIFT(target_map));
823 }
824 return effective_page_shift;
825 }
826
827
828 /*
829 * Decide if we want to allow processes to execute from their data or stack areas.
830 * override_nx() returns true if we do. Data/stack execution can be enabled independently
831 * for 32 and 64 bit processes. Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
832 * or allow_stack_exec to enable data execution for that type of data area for that particular
833 * ABI (or both by or'ing the flags together). These are initialized in the architecture
834 * specific pmap files since the default behavior varies according to architecture. The
835 * main reason it varies is because of the need to provide binary compatibility with old
836 * applications that were written before these restrictions came into being. In the old
837 * days, an app could execute anything it could read, but this has slowly been tightened
838 * up over time. The default behavior is:
839 *
840 * 32-bit PPC apps may execute from both stack and data areas
 * 32-bit Intel apps may execute from data areas but not stack
842 * 64-bit PPC/Intel apps may not execute from either data or stack
843 *
844 * An application on any architecture may override these defaults by explicitly
845 * adding PROT_EXEC permission to the page in question with the mprotect(2)
846 * system call. This code here just determines what happens when an app tries to
847 * execute from a page that lacks execute permission.
848 *
849 * Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
850 * default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
851 * a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
852 * execution from data areas for a particular binary even if the arch normally permits it. As
853 * a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
854 * to support some complicated use cases, notably browsers with out-of-process plugins that
855 * are not all NX-safe.
856 */
857
858 extern int allow_data_exec, allow_stack_exec;
859
860 int
override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
862 {
863 int current_abi;
864
865 if (map->pmap == kernel_pmap) {
866 return FALSE;
867 }
868
869 /*
870 * Determine if the app is running in 32 or 64 bit mode.
871 */
872
873 if (vm_map_is_64bit(map)) {
874 current_abi = VM_ABI_64;
875 } else {
876 current_abi = VM_ABI_32;
877 }
878
879 /*
880 * Determine if we should allow the execution based on whether it's a
881 * stack or data area and the current architecture.
882 */
883
884 if (user_tag == VM_MEMORY_STACK) {
885 return allow_stack_exec & current_abi;
886 }
887
888 return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
889 }
890
891
892 /*
893 * Virtual memory maps provide for the mapping, protection,
894 * and sharing of virtual memory objects. In addition,
895 * this module provides for an efficient virtual copy of
896 * memory from one map to another.
897 *
898 * Synchronization is required prior to most operations.
899 *
900 * Maps consist of an ordered doubly-linked list of simple
901 * entries; a single hint is used to speed up lookups.
902 *
903 * Sharing maps have been deleted from this version of Mach.
904 * All shared objects are now mapped directly into the respective
905 * maps. This requires a change in the copy on write strategy;
906 * the asymmetric (delayed) strategy is used for shared temporary
907 * objects instead of the symmetric (shadow) strategy. All maps
908 * are now "top level" maps (either task map, kernel map or submap
909 * of the kernel map).
910 *
 * Since portions of maps are specified by start/end addresses,
912 * which may not align with existing map entries, all
913 * routines merely "clip" entries to these start/end values.
914 * [That is, an entry is split into two, bordering at a
915 * start or end value.] Note that these clippings may not
916 * always be necessary (as the two resulting entries are then
917 * not changed); however, the clipping is done for convenience.
918 * No attempt is currently made to "glue back together" two
919 * abutting entries.
920 *
921 * The symmetric (shadow) copy strategy implements virtual copy
922 * by copying VM object references from one map to
923 * another, and then marking both regions as copy-on-write.
924 * It is important to note that only one writeable reference
925 * to a VM object region exists in any map when this strategy
926 * is used -- this means that shadow object creation can be
 * delayed until a write operation occurs. The asymmetric (delayed)
928 * strategy allows multiple maps to have writeable references to
929 * the same region of a vm object, and hence cannot delay creating
930 * its copy objects. See vm_object_copy_quickly() in vm_object.c.
931 * Copying of permanent objects is completely different; see
932 * vm_object_copy_strategically() in vm_object.c.
933 */
934
935 ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);
936
937 #define VM_MAP_ZONE_NAME "maps"
938 #define VM_MAP_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
939
940 #define VM_MAP_ENTRY_ZONE_NAME "VM map entries"
941 #define VM_MAP_ENTRY_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
942
943 #define VM_MAP_HOLES_ZONE_NAME "VM map holes"
944 #define VM_MAP_HOLES_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
945
946 /*
947 * Asserts that a vm_map_copy object is coming from the
948 * vm_map_copy_zone to ensure that it isn't a fake constructed
949 * anywhere else.
950 */
951 void
vm_map_copy_require(struct vm_map_copy *copy)
953 {
954 zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
955 }
956
957 /*
958 * vm_map_require:
959 *
960 * Ensures that the argument is memory allocated from the genuine
961 * vm map zone. (See zone_id_require_allow_foreign).
962 */
963 void
vm_map_require(vm_map_t map)
965 {
966 zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
967 }
968
969 #define VM_MAP_EARLY_COUNT_MAX 16
970 static __startup_data vm_offset_t map_data;
971 static __startup_data vm_size_t map_data_size;
972 static __startup_data vm_offset_t kentry_data;
973 static __startup_data vm_size_t kentry_data_size;
974 static __startup_data vm_offset_t map_holes_data;
975 static __startup_data vm_size_t map_holes_data_size;
976 static __startup_data vm_map_t *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
977 static __startup_data uint32_t early_map_count;
978
979 #if XNU_TARGET_OS_OSX
980 #define NO_COALESCE_LIMIT ((1024 * 128) - 1)
981 #else /* XNU_TARGET_OS_OSX */
982 #define NO_COALESCE_LIMIT 0
983 #endif /* XNU_TARGET_OS_OSX */
984
985 /* Skip acquiring locks if we're in the midst of a kernel core dump */
986 unsigned int not_in_kdp = 1;
987
988 unsigned int vm_map_set_cache_attr_count = 0;
989
990 kern_return_t
vm_map_set_cache_attr(
992 vm_map_t map,
993 vm_map_offset_t va)
994 {
995 vm_map_entry_t map_entry;
996 vm_object_t object;
997 kern_return_t kr = KERN_SUCCESS;
998
999 vm_map_lock_read(map);
1000
1001 if (!vm_map_lookup_entry(map, va, &map_entry) ||
1002 map_entry->is_sub_map) {
1003 /*
1004 * that memory is not properly mapped
1005 */
1006 kr = KERN_INVALID_ARGUMENT;
1007 goto done;
1008 }
1009 object = VME_OBJECT(map_entry);
1010
1011 if (object == VM_OBJECT_NULL) {
1012 /*
1013 * there should be a VM object here at this point
1014 */
1015 kr = KERN_INVALID_ARGUMENT;
1016 goto done;
1017 }
1018 vm_object_lock(object);
1019 object->set_cache_attr = TRUE;
1020 vm_object_unlock(object);
1021
1022 vm_map_set_cache_attr_count++;
1023 done:
1024 vm_map_unlock_read(map);
1025
1026 return kr;
1027 }
1028
1029
1030 #if CONFIG_CODE_DECRYPTION
1031 /*
1032 * vm_map_apple_protected:
1033 * This remaps the requested part of the object with an object backed by
1034 * the decrypting pager.
1035 * crypt_info contains entry points and session data for the crypt module.
1036 * The crypt_info block will be copied by vm_map_apple_protected. The data structures
1037 * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
1038 */
1039 kern_return_t
vm_map_apple_protected(
1041 vm_map_t map,
1042 vm_map_offset_t start,
1043 vm_map_offset_t end,
1044 vm_object_offset_t crypto_backing_offset,
1045 struct pager_crypt_info *crypt_info,
1046 uint32_t cryptid)
1047 {
1048 boolean_t map_locked;
1049 kern_return_t kr;
1050 vm_map_entry_t map_entry;
1051 struct vm_map_entry tmp_entry;
1052 memory_object_t unprotected_mem_obj;
1053 vm_object_t protected_object;
1054 vm_map_offset_t map_addr;
1055 vm_map_offset_t start_aligned, end_aligned;
1056 vm_object_offset_t crypto_start, crypto_end;
1057 boolean_t cache_pager;
1058
1059 map_locked = FALSE;
1060 unprotected_mem_obj = MEMORY_OBJECT_NULL;
1061
1062 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
1063 return KERN_INVALID_ADDRESS;
1064 }
1065 start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
1066 end_aligned = vm_map_round_page(end, PAGE_MASK_64);
1067 start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
1068 end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));
1069
1070 #if __arm64__
1071 /*
1072 * "start" and "end" might be 4K-aligned but not 16K-aligned,
1073 * so we might have to loop and establish up to 3 mappings:
1074 *
1075 * + the first 16K-page, which might overlap with the previous
1076 * 4K-aligned mapping,
1077 * + the center,
1078 * + the last 16K-page, which might overlap with the next
1079 * 4K-aligned mapping.
1080 * Each of these mapping might be backed by a vnode pager (if
1081 * properly page-aligned) or a "fourk_pager", itself backed by a
1082 * vnode pager (if 4K-aligned but not page-aligned).
1083 */
1084 #endif /* __arm64__ */
1085
1086 map_addr = start_aligned;
1087 for (map_addr = start_aligned;
1088 map_addr < end;
1089 map_addr = tmp_entry.vme_end) {
1090 vm_map_lock(map);
1091 map_locked = TRUE;
1092
1093 /* lookup the protected VM object */
1094 if (!vm_map_lookup_entry(map,
1095 map_addr,
1096 &map_entry) ||
1097 map_entry->is_sub_map ||
1098 VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
1099 /* that memory is not properly mapped */
1100 kr = KERN_INVALID_ARGUMENT;
1101 goto done;
1102 }
1103
/* ensure mapped memory is mapped as executable,
 * except for the model decryption flow */
1106 if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
1107 !(map_entry->protection & VM_PROT_EXECUTE)) {
1108 kr = KERN_INVALID_ARGUMENT;
1109 goto done;
1110 }
1111
1112 /* get the protected object to be decrypted */
1113 protected_object = VME_OBJECT(map_entry);
1114 if (protected_object == VM_OBJECT_NULL) {
1115 /* there should be a VM object here at this point */
1116 kr = KERN_INVALID_ARGUMENT;
1117 goto done;
1118 }
1119 /* ensure protected object stays alive while map is unlocked */
1120 vm_object_reference(protected_object);
1121
1122 /* limit the map entry to the area we want to cover */
1123 vm_map_clip_start(map, map_entry, start_aligned);
1124 vm_map_clip_end(map, map_entry, end_aligned);
1125
1126 tmp_entry = *map_entry;
1127 map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
1128 vm_map_unlock(map);
1129 map_locked = FALSE;
1130
1131 /*
1132 * This map entry might be only partially encrypted
1133 * (if not fully "page-aligned").
1134 */
1135 crypto_start = 0;
1136 crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
1137 if (tmp_entry.vme_start < start) {
1138 if (tmp_entry.vme_start != start_aligned) {
1139 kr = KERN_INVALID_ADDRESS;
1140 vm_object_deallocate(protected_object);
1141 goto done;
1142 }
1143 crypto_start += (start - tmp_entry.vme_start);
1144 }
1145 if (tmp_entry.vme_end > end) {
1146 if (tmp_entry.vme_end != end_aligned) {
1147 kr = KERN_INVALID_ADDRESS;
1148 vm_object_deallocate(protected_object);
1149 goto done;
1150 }
1151 crypto_end -= (tmp_entry.vme_end - end);
1152 }
1153
1154 /*
1155 * This "extra backing offset" is needed to get the decryption
1156 * routine to use the right key. It adjusts for the possibly
1157 * relative offset of an interposed "4K" pager...
1158 */
1159 if (crypto_backing_offset == (vm_object_offset_t) -1) {
1160 crypto_backing_offset = VME_OFFSET(&tmp_entry);
1161 }
1162
1163 cache_pager = TRUE;
1164 #if XNU_TARGET_OS_OSX
1165 if (vm_map_is_alien(map)) {
1166 cache_pager = FALSE;
1167 }
1168 #endif /* XNU_TARGET_OS_OSX */
1169
1170 /*
1171 * Lookup (and create if necessary) the protected memory object
1172 * matching that VM object.
1173 * If successful, this also grabs a reference on the memory object,
1174 * to guarantee that it doesn't go away before we get a chance to map
1175 * it.
1176 */
1177 unprotected_mem_obj = apple_protect_pager_setup(
1178 protected_object,
1179 VME_OFFSET(&tmp_entry),
1180 crypto_backing_offset,
1181 crypt_info,
1182 crypto_start,
1183 crypto_end,
1184 cache_pager);
1185
1186 /* release extra ref on protected object */
1187 vm_object_deallocate(protected_object);
1188
1189 if (unprotected_mem_obj == NULL) {
1190 kr = KERN_FAILURE;
1191 goto done;
1192 }
1193
1194 /* can overwrite an immutable mapping */
1195 vm_map_kernel_flags_t vmk_flags = {
1196 .vmf_fixed = true,
1197 .vmf_overwrite = true,
1198 .vmkf_overwrite_immutable = true,
1199 };
1200 /* make the new mapping as "permanent" as the one it replaces */
1201 vmk_flags.vmf_permanent = tmp_entry.vme_permanent;
1202
1203 /* map this memory object in place of the current one */
1204 map_addr = tmp_entry.vme_start;
1205 kr = mach_vm_map_kernel(map,
1206 vm_sanitize_wrap_addr_ref(&map_addr),
1207 (tmp_entry.vme_end -
1208 tmp_entry.vme_start),
1209 (mach_vm_offset_t) 0,
1210 vmk_flags,
1211 (ipc_port_t)(uintptr_t) unprotected_mem_obj,
1212 0,
1213 TRUE,
1214 tmp_entry.protection,
1215 tmp_entry.max_protection,
1216 tmp_entry.inheritance);
1217 assertf(kr == KERN_SUCCESS,
1218 "kr = 0x%x\n", kr);
1219 assertf(map_addr == tmp_entry.vme_start,
1220 "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
1221 (uint64_t)map_addr,
1222 (uint64_t) tmp_entry.vme_start,
1223 &tmp_entry);
1224
1225 #if VM_MAP_DEBUG_APPLE_PROTECT
1226 if (vm_map_debug_apple_protect) {
1227 printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
1228 " backing:[object:%p,offset:0x%llx,"
1229 "crypto_backing_offset:0x%llx,"
1230 "crypto_start:0x%llx,crypto_end:0x%llx]\n",
1231 map,
1232 (uint64_t) map_addr,
1233 (uint64_t) (map_addr + (tmp_entry.vme_end -
1234 tmp_entry.vme_start)),
1235 unprotected_mem_obj,
1236 protected_object,
1237 VME_OFFSET(&tmp_entry),
1238 crypto_backing_offset,
1239 crypto_start,
1240 crypto_end);
1241 }
1242 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1243
1244 /*
1245 * Release the reference obtained by
1246 * apple_protect_pager_setup().
1247 * The mapping (if it succeeded) is now holding a reference on
1248 * the memory object.
1249 */
1250 memory_object_deallocate(unprotected_mem_obj);
1251 unprotected_mem_obj = MEMORY_OBJECT_NULL;
1252
1253 /* continue with next map entry */
1254 crypto_backing_offset += (tmp_entry.vme_end -
1255 tmp_entry.vme_start);
1256 crypto_backing_offset -= crypto_start;
1257 }
1258 kr = KERN_SUCCESS;
1259
1260 done:
1261 if (map_locked) {
1262 vm_map_unlock(map);
1263 }
1264 return kr;
1265 }
1266 #endif /* CONFIG_CODE_DECRYPTION */
1267
1268
1269 LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
1270 LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
1271 LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);
1272
1273 #if XNU_TARGET_OS_OSX
1274 #define MALLOC_NO_COW_DEFAULT 1
1275 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 1
1276 #else /* XNU_TARGET_OS_OSX */
1277 #define MALLOC_NO_COW_DEFAULT 1
1278 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 0
1279 #endif /* XNU_TARGET_OS_OSX */
1280 TUNABLE(int, malloc_no_cow, "malloc_no_cow", MALLOC_NO_COW_DEFAULT);
1281 TUNABLE(int, malloc_no_cow_except_fork, "malloc_no_cow_except_fork", MALLOC_NO_COW_EXCEPT_FORK_DEFAULT);
1282 uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
1283 #if DEBUG
1284 int vm_check_map_sanity = 0;
1285 #endif
1286
1287 /*
1288 * vm_map_init:
1289 *
1290 * Initialize the vm_map module. Must be called before
1291 * any other vm_map routines.
1292 *
1293 * Map and entry structures are allocated from zones -- we must
1294 * initialize those zones.
1295 *
1296 * There are three zones of interest:
1297 *
1298 * vm_map_zone: used to allocate maps.
1299 * vm_map_entry_zone: used to allocate map entries.
1300 *
1301 * LP32:
1302 * vm_map_entry_reserved_zone: fallback zone for kernel map entries
1303 *
1304 * The kernel allocates map entries from a special zone that is initially
1305 * "crammed" with memory. It would be difficult (perhaps impossible) for
1306 * the kernel to allocate more memory to a entry zone when it became
1307 * empty since the very act of allocating memory implies the creation
1308 * of a new entry.
1309 */
1310 __startup_func
1311 void
vm_map_init(void)
1313 {
1314
1315 #if MACH_ASSERT
1316 PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1317 sizeof(debug4k_filter));
1318 #endif /* MACH_ASSERT */
1319
1320 zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1321 VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1322
1323 /*
1324 * Don't quarantine because we always need elements available
1325 * Disallow GC on this zone... to aid the GC.
1326 */
1327 zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1328 sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1329 ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1330 z->z_elems_rsv = (uint16_t)(32 *
1331 (ml_early_cpu_max_number() + 1));
1332 });
1333
1334 zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1335 sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1336 ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1337 z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
1338 });
1339
1340 zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1341 ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);
1342
1343 /*
1344 * Add the stolen memory to zones, adjust zone size and stolen counts.
1345 */
1346 zone_cram_early(vm_map_zone, map_data, map_data_size);
1347 zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1348 zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
printf("VM bootstrap: %d maps, %d entries and %d holes available\n",
1350 zone_count_free(vm_map_zone),
1351 zone_count_free(vm_map_entry_zone),
1352 zone_count_free(vm_map_holes_zone));
1353
1354 /*
1355 * Since these are covered by zones, remove them from stolen page accounting.
1356 */
1357 VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1358
1359 #if VM_MAP_DEBUG_APPLE_PROTECT
1360 PE_parse_boot_argn("vm_map_debug_apple_protect",
1361 &vm_map_debug_apple_protect,
1362 sizeof(vm_map_debug_apple_protect));
1363 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
#if VM_MAP_DEBUG_FOURK
1365 PE_parse_boot_argn("vm_map_debug_fourk",
1366 &vm_map_debug_fourk,
1367 sizeof(vm_map_debug_fourk));
1368 #endif /* VM_MAP_DEBUG_FOURK */
1369
1370 if (malloc_no_cow) {
1371 vm_memory_malloc_no_cow_mask = 0ULL;
1372 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1373 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1374 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1375 #if XNU_TARGET_OS_OSX
1376 /*
1377 * On macOS, keep copy-on-write for MALLOC_LARGE because
1378 * realloc() may use vm_copy() to transfer the old contents
1379 * to the new location.
1380 */
1381 #else /* XNU_TARGET_OS_OSX */
1382 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1383 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1384 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1385 #endif /* XNU_TARGET_OS_OSX */
1386 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1387 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1388 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1389 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1390 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1391 PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1392 &vm_memory_malloc_no_cow_mask,
1393 sizeof(vm_memory_malloc_no_cow_mask));
1394 }
1395
1396 #if CONFIG_MAP_RANGES
1397 vm_map_range_map_init();
1398 #endif /* CONFIG_MAP_RANGES */
1399
1400 #if DEBUG
1401 PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1402 if (vm_check_map_sanity) {
1403 kprintf("VM sanity checking enabled\n");
1404 } else {
1405 kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1406 }
1407 #endif /* DEBUG */
1408
1409 #if DEVELOPMENT || DEBUG
1410 PE_parse_boot_argn("panic_on_unsigned_execute",
1411 &panic_on_unsigned_execute,
1412 sizeof(panic_on_unsigned_execute));
1413 PE_parse_boot_argn("panic_on_mlock_failure",
1414 &panic_on_mlock_failure,
1415 sizeof(panic_on_mlock_failure));
1416 #endif /* DEVELOPMENT || DEBUG */
1417 }
1418
1419 __startup_func
1420 static void
vm_map_steal_memory(void)
1422 {
1423
1424 /*
 * We need to reserve enough memory to support bootstrapping VM maps
1426 * and the zone subsystem.
1427 *
1428 * The VM Maps that need to function before zones can support them
1429 * are the ones registered with vm_map_will_allocate_early_map(),
1430 * which are:
1431 * - the kernel map
1432 * - the various submaps used by zones (pgz, meta, ...)
1433 *
1434 * We also need enough entries and holes to support them
1435 * until zone_metadata_init() is called, which is when
1436 * the zone allocator becomes capable of expanding dynamically.
1437 *
1438 * We need:
1439 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
1440 * - To allow for 3-4 entries per map, but the kernel map
1441 * needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
1442 * to describe the submaps, so double it (and make it 8x too)
1443 * - To allow for holes between entries,
1444 * hence needs the same budget as entries
1445 */
1446 map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
1447 sizeof(struct _vm_map), VM_MAP_ZFLAGS,
1448 VM_MAP_EARLY_COUNT_MAX);
1449
1450 kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
1451 sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1452 8 * VM_MAP_EARLY_COUNT_MAX);
1453
1454 map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
1455 sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1456 8 * VM_MAP_EARLY_COUNT_MAX);
1457
1458 /*
1459 * Steal a contiguous range of memory so that a simple range check
1460 * can validate early addresses being freed/crammed to these
1461 * zones
1462 */
1463 map_data = zone_early_mem_init(map_data_size + kentry_data_size +
1464 map_holes_data_size);
1465 kentry_data = map_data + map_data_size;
1466 map_holes_data = kentry_data + kentry_data_size;
1467 }
1468 STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1469
1470 __startup_func
1471 static void
vm_kernel_boostraped(void)
1473 {
1474 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
1475 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
1476 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);
1477
1478 printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1479 zone_count_free(vm_map_zone),
1480 zone_count_free(vm_map_entry_zone),
1481 zone_count_free(vm_map_holes_zone));
1482 }
1483 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1484
1485 void
vm_map_disable_hole_optimization(vm_map_t map)
1487 {
1488 vm_map_entry_t head_entry, hole_entry, next_hole_entry;
1489
1490 if (map->holelistenabled) {
1491 head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1492
1493 while (hole_entry != NULL) {
1494 next_hole_entry = hole_entry->vme_next;
1495
1496 hole_entry->vme_next = NULL;
1497 hole_entry->vme_prev = NULL;
1498 zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1499
1500 if (next_hole_entry == head_entry) {
1501 hole_entry = NULL;
1502 } else {
1503 hole_entry = next_hole_entry;
1504 }
1505 }
1506
1507 map->holes_list = NULL;
1508 map->holelistenabled = FALSE;
1509
1510 map->first_free = vm_map_first_entry(map);
1511 SAVE_HINT_HOLE_WRITE(map, NULL);
1512 }
1513 }
1514
1515 boolean_t
vm_kernel_map_is_kernel(vm_map_t map)
1517 {
1518 return map->pmap == kernel_pmap;
1519 }
1520
1521 /*
1522 * vm_map_create:
1523 *
1524 * Creates and returns a new empty VM map with
1525 * the given physical map structure, and having
1526 * the given lower and upper address bounds.
1527 */
1528
1529 extern vm_map_t vm_map_create_external(
1530 pmap_t pmap,
1531 vm_map_offset_t min_off,
1532 vm_map_offset_t max_off,
1533 boolean_t pageable);
1534
1535 vm_map_t
vm_map_create_external(
1537 pmap_t pmap,
1538 vm_map_offset_t min,
1539 vm_map_offset_t max,
1540 boolean_t pageable)
1541 {
1542 vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1543
1544 if (pageable) {
1545 options |= VM_MAP_CREATE_PAGEABLE;
1546 }
1547 return vm_map_create_options(pmap, min, max, options);
1548 }
1549
1550 __startup_func
1551 void
vm_map_will_allocate_early_map(vm_map_t *owner)
1553 {
1554 if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1555 panic("VM_MAP_EARLY_COUNT_MAX is too low");
1556 }
1557
1558 early_map_owners[early_map_count++] = owner;
1559 }
1560
1561 __startup_func
1562 void
vm_map_relocate_early_maps(vm_offset_t delta)
1564 {
1565 for (uint32_t i = 0; i < early_map_count; i++) {
1566 vm_address_t addr = (vm_address_t)*early_map_owners[i];
1567
1568 *early_map_owners[i] = (vm_map_t)(addr + delta);
1569 }
1570
1571 early_map_count = ~0u;
1572 }
1573
1574 /*
1575 * Routine: vm_map_relocate_early_elem
1576 *
1577 * Purpose:
1578 * Early zone elements are allocated in a temporary part
1579 * of the address space.
1580 *
1581 * Once the zones live in their final place, the early
1582 * VM maps, map entries and map holes need to be relocated.
1583 *
1584 * It involves rewriting any vm_map_t, vm_map_entry_t or
1585 * pointers to vm_map_links. Other pointers to other types
1586 * are fine.
1587 *
1588 * Fortunately, pointers to those types are self-contained
1589 * in those zones, _except_ for pointers to VM maps,
1590 * which are tracked during early boot and fixed with
1591 * vm_map_relocate_early_maps().
1592 */
1593 __startup_func
1594 void
vm_map_relocate_early_elem(
1596 uint32_t zone_id,
1597 vm_offset_t new_addr,
1598 vm_offset_t delta)
1599 {
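/*
 * relocate(): rebase one pointer-typed field of the element now living at
 * "new_addr" by "delta", leaving NULL fields untouched.
 */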
1600 #define relocate(type_t, field) ({ \
1601 typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field; \
1602 if (*__field) { \
1603 *__field = (typeof(*__field))((vm_offset_t)*__field + delta); \
1604 } \
1605 })
1606
1607 switch (zone_id) {
1608 case ZONE_ID_VM_MAP:
1609 case ZONE_ID_VM_MAP_ENTRY:
1610 case ZONE_ID_VM_MAP_HOLES:
1611 break;
1612
1613 default:
1614 panic("Unexpected zone ID %d", zone_id);
1615 }
1616
1617 if (zone_id == ZONE_ID_VM_MAP) {
1618 relocate(vm_map_t, hdr.links.prev);
1619 relocate(vm_map_t, hdr.links.next);
1620 ((vm_map_t)new_addr)->pmap = kernel_pmap;
1621 #ifdef VM_MAP_STORE_USE_RB
1622 relocate(vm_map_t, hdr.rb_head_store.rbh_root);
1623 #endif /* VM_MAP_STORE_USE_RB */
1624 relocate(vm_map_t, hint);
1625 relocate(vm_map_t, hole_hint);
1626 relocate(vm_map_t, first_free);
1627 return;
1628 }
1629
1630 relocate(struct vm_map_links *, prev);
1631 relocate(struct vm_map_links *, next);
1632
1633 if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
1634 #ifdef VM_MAP_STORE_USE_RB
1635 relocate(vm_map_entry_t, store.entry.rbe_left);
1636 relocate(vm_map_entry_t, store.entry.rbe_right);
1637 relocate(vm_map_entry_t, store.entry.rbe_parent);
1638 #endif /* VM_MAP_STORE_USE_RB */
1639 if (((vm_map_entry_t)new_addr)->is_sub_map) {
1640 /* no object to relocate because we haven't made any */
1641 ((vm_map_entry_t)new_addr)->vme_submap +=
1642 delta >> VME_SUBMAP_SHIFT;
1643 }
1644 #if MAP_ENTRY_CREATION_DEBUG
1645 relocate(vm_map_entry_t, vme_creation_maphdr);
1646 #endif /* MAP_ENTRY_CREATION_DEBUG */
1647 }
1648
1649 #undef relocate
1650 }
1651
1652 vm_map_t
vm_map_create_options(
1654 pmap_t pmap,
1655 vm_map_offset_t min,
1656 vm_map_offset_t max,
1657 vm_map_create_options_t options)
1658 {
1659 vm_map_t result;
1660
1661 #if DEBUG || DEVELOPMENT
1662 if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
1663 if (early_map_count != ~0u && early_map_count !=
1664 zone_count_allocated(vm_map_zone) + 1) {
1665 panic("allocating %dth early map, owner not known",
1666 zone_count_allocated(vm_map_zone) + 1);
1667 }
1668 if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
1669 panic("allocating %dth early map for non kernel pmap",
1670 early_map_count);
1671 }
1672 }
1673 #endif /* DEBUG || DEVELOPMENT */
1674
1675 result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);
1676
1677 vm_map_store_init(&result->hdr);
1678 result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
1679 vm_map_set_page_shift(result, PAGE_SHIFT);
1680
1681 result->size_limit = RLIM_INFINITY; /* default unlimited */
1682 result->data_limit = RLIM_INFINITY; /* default unlimited */
1683 result->user_wire_limit = MACH_VM_MAX_ADDRESS; /* default limit is unlimited */
1684 os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
1685 result->pmap = pmap;
1686 result->min_offset = min;
1687 result->max_offset = max;
1688 result->first_free = vm_map_to_entry(result);
1689 result->hint = vm_map_to_entry(result);
1690
1691 if (options & VM_MAP_CREATE_NEVER_FAULTS) {
1692 assert(pmap == kernel_pmap);
1693 result->never_faults = true;
1694 }
1695
1696 /* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
1697 if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
1698 result->has_corpse_footprint = true;
1699 } else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
1700 struct vm_map_links *hole_entry;
1701
1702 hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
1703 hole_entry->start = min;
1704 /*
1705 * Holes can be used to track ranges all the way up to
1706 * MACH_VM_MAX_ADDRESS or more (e.g. kernel map).
1707 */
1708 hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1709 result->holes_list = result->hole_hint = hole_entry;
1710 hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
1711 result->holelistenabled = true;
1712 }
1713
1714 vm_map_lock_init(result);
1715
1716 return result;
1717 }
1718
1719 /*
1720 * Adjusts a submap that was made by kmem_suballoc()
1721 * before it knew where it would be mapped,
1722 * so that it has the right min/max offsets.
1723 *
1724 * We do not need to hold any locks:
1725 * only the caller knows about this map,
1726 * and it is not published on any entry yet.
1727 */
1728 static void
1729 vm_map_adjust_offsets(
1730 vm_map_t map,
1731 vm_map_offset_t min_off,
1732 vm_map_offset_t max_off)
1733 {
1734 assert(map->min_offset == 0);
1735 assert(map->max_offset == max_off - min_off);
1736 assert(map->hdr.nentries == 0);
1737 assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1738
1739 map->min_offset = min_off;
1740 map->max_offset = max_off;
1741
1742 if (map->holelistenabled) {
1743 struct vm_map_links *hole = map->holes_list;
1744
1745 hole->start = min_off;
1746 #if defined(__arm64__)
1747 hole->end = max_off;
1748 #else
1749 hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1750 #endif
1751 }
1752 }
1753
1754
1755 vm_map_size_t
1756 vm_map_adjusted_size(vm_map_t map)
1757 {
1758 const struct vm_reserved_region *regions = NULL;
1759 size_t num_regions = 0;
1760 mach_vm_size_t reserved_size = 0, map_size = 0;
1761
1762 if (map == NULL || (map->size == 0)) {
1763 return 0;
1764 }
1765
1766 map_size = map->size;
1767
1768 if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1769 /*
1770 * No special reserved regions or not an exotic map or the task
1771 * is terminating and these special regions might have already
1772 * been deallocated.
1773 */
1774 return map_size;
1775 }
1776
1777 num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1778 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1779
1780 while (num_regions) {
1781 reserved_size += regions[--num_regions].vmrr_size;
1782 }
1783
1784 /*
1785 * There are a few places where the map is being switched out due to
1786 * 'termination' without that bit being set (e.g. exec and corpse purging).
1787 * In those cases, we could have the map's regions being deallocated on
1788 * a core while some accounting process is trying to get the map's size.
1789 * So this assert can't be enabled till all those places are uniform in
1790 * their use of the 'map->terminated' bit.
1791 *
1792 * assert(map_size >= reserved_size);
1793 */
1794
1795 return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1796 }
1797
1798 /*
1799 * vm_map_entry_create: [ internal use only ]
1800 *
1801 * Allocates a VM map entry for insertion in the
1802 * given map (or map copy). No fields are filled.
1803 *
1804 * The VM entry will be zero initialized, except for:
1805 * - behavior set to VM_BEHAVIOR_DEFAULT
1806 * - inheritance set to VM_INHERIT_DEFAULT
1807 */
1808 #define vm_map_entry_create(map) _vm_map_entry_create(&(map)->hdr)
1809
1810 #define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)
1811
1812 static vm_map_entry_t
1813 _vm_map_entry_create(
1814 struct vm_map_header *map_header __unused)
1815 {
1816 vm_map_entry_t entry = NULL;
1817
1818 entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);
1819
1820 /*
1821 * Help the compiler with what we know to be true,
1822 * so that the subsequent bitfield initializations get good codegen.
1823 *
1824 * See rdar://87041299
1825 */
1826 __builtin_assume(entry->vme_object_value == 0);
1827 __builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
1828 __builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);
1829
1830 static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
1831 "VME_ALIAS_MASK covers tags");
1832
1833 static_assert(VM_BEHAVIOR_DEFAULT == 0,
1834 "can skip zeroing of the behavior field");
1835 entry->inheritance = VM_INHERIT_DEFAULT;
1836
1837 #if MAP_ENTRY_CREATION_DEBUG
1838 entry->vme_creation_maphdr = map_header;
1839 entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
1840 BTREF_GET_NOWAIT);
1841 #endif
1842 return entry;
1843 }
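
/*
 * Illustrative sketch (assumed caller, not from this file): entries are
 * allocated through the wrapper macros above and handed back with
 * vm_map_entry_dispose() below if they end up unused:
 *
 *	vm_map_entry_t new_entry = vm_map_entry_create(map);
 *	...
 *	if (failed)			... hypothetical failure path ...
 *		vm_map_entry_dispose(new_entry);
 */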
1844
1845 /*
1846 * vm_map_entry_dispose: [ internal use only ]
1847 *
1848 * Inverse of vm_map_entry_create.
1849 *
1850 * The map write lock is held, so there is no need to
1851 * do anything special to ensure the correctness
1852 * of the stores.
1853 */
1854 static void
1855 vm_map_entry_dispose(
1856 vm_map_entry_t entry)
1857 {
1858 #if VM_BTLOG_TAGS
1859 if (entry->vme_kernel_object) {
1860 btref_put(entry->vme_tag_btref);
1861 }
1862 #endif /* VM_BTLOG_TAGS */
1863 #if MAP_ENTRY_CREATION_DEBUG
1864 btref_put(entry->vme_creation_bt);
1865 #endif
1866 #if MAP_ENTRY_INSERTION_DEBUG
1867 btref_put(entry->vme_insertion_bt);
1868 #endif
1869 zfree(vm_map_entry_zone, entry);
1870 }
1871
1872 #define vm_map_copy_entry_dispose(copy_entry) \
1873 vm_map_entry_dispose(copy_entry)
1874
1875 static vm_map_entry_t
1876 vm_map_zap_first_entry(
1877 vm_map_zap_t list)
1878 {
1879 return list->vmz_head;
1880 }
1881
1882 static vm_map_entry_t
1883 vm_map_zap_last_entry(
1884 vm_map_zap_t list)
1885 {
1886 assert(vm_map_zap_first_entry(list));
1887 return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
1888 }
1889
1890 static void
1891 vm_map_zap_append(
1892 vm_map_zap_t list,
1893 vm_map_entry_t entry)
1894 {
1895 entry->vme_next = VM_MAP_ENTRY_NULL;
1896 *list->vmz_tail = entry;
1897 list->vmz_tail = &entry->vme_next;
1898 }
1899
1900 static vm_map_entry_t
1901 vm_map_zap_pop(
1902 vm_map_zap_t list)
1903 {
1904 vm_map_entry_t head = list->vmz_head;
1905
1906 if (head != VM_MAP_ENTRY_NULL &&
1907 (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1908 list->vmz_tail = &list->vmz_head;
1909 }
1910
1911 return head;
1912 }
1913
1914 static void
1915 vm_map_zap_dispose(
1916 vm_map_zap_t list)
1917 {
1918 vm_map_entry_t entry;
1919
1920 while ((entry = vm_map_zap_pop(list))) {
1921 if (entry->is_sub_map) {
1922 vm_map_deallocate(VME_SUBMAP(entry));
1923 } else {
1924 vm_object_deallocate(VME_OBJECT(entry));
1925 }
1926
1927 vm_map_entry_dispose(entry);
1928 }
1929 }
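
/*
 * Illustrative sketch of the zap-list pattern used throughout this file
 * (vm_map_destroy() below is a real example): entries are unlinked and
 * collected under the map lock, and their objects or submaps are only
 * released once the lock has been dropped. "start"/"end" stand for the
 * range being removed.
 *
 *	VM_MAP_ZAP_DECLARE(zap);
 *
 *	vm_map_lock(map);
 *	(void)vm_map_delete(map, start, end, VM_MAP_REMOVE_NO_FLAGS,
 *	    KMEM_GUARD_NONE, &zap);
 *	vm_map_unlock(map);
 *	vm_map_zap_dispose(&zap);
 */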
1930
1931 #if MACH_ASSERT
1932 static boolean_t first_free_check = FALSE;
1933 boolean_t
1934 first_free_is_valid(
1935 vm_map_t map)
1936 {
1937 if (!first_free_check) {
1938 return TRUE;
1939 }
1940
1941 return first_free_is_valid_store( map );
1942 }
1943 #endif /* MACH_ASSERT */
1944
1945
1946 #define vm_map_copy_entry_link(copy, after_where, entry) \
1947 _vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))
1948
1949 #define vm_map_copy_entry_unlink(copy, entry) \
1950 _vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1951
1952 /*
1953 * vm_map_destroy:
1954 *
1955 * Actually destroy a map.
1956 */
1957 void
1958 vm_map_destroy(
1959 vm_map_t map)
1960 {
1961 /* final cleanup: this is not allowed to fail */
1962 vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;
1963
1964 VM_MAP_ZAP_DECLARE(zap);
1965
1966 vm_map_lock(map);
1967
1968 map->terminated = true;
1969 /* clean up regular map entries */
1970 (void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
1971 KMEM_GUARD_NONE, &zap);
1972 /* clean up leftover special mappings (commpage, GPU carveout, etc...) */
1973 (void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
1974 KMEM_GUARD_NONE, &zap);
1975
1976 vm_map_disable_hole_optimization(map);
1977 vm_map_corpse_footprint_destroy(map);
1978
1979 vm_map_unlock(map);
1980
1981 vm_map_zap_dispose(&zap);
1982
1983 assert(map->hdr.nentries == 0);
1984
1985 if (map->pmap) {
1986 pmap_destroy(map->pmap);
1987 }
1988
1989 lck_rw_destroy(&map->lock, &vm_map_lck_grp);
1990
1991 #if CONFIG_MAP_RANGES
1992 kfree_data(map->extra_ranges,
1993 map->extra_ranges_count * sizeof(struct vm_map_user_range));
1994 #endif
1995
1996 zfree_id(ZONE_ID_VM_MAP, map);
1997 }
1998
1999 /*
2000 * Returns pid of the task with the largest number of VM map entries.
2001 * Used in the zone-map-exhaustion jetsam path.
2002 */
2003 pid_t
2004 find_largest_process_vm_map_entries(void)
2005 {
2006 pid_t victim_pid = -1;
2007 int max_vm_map_entries = 0;
2008 task_t task = TASK_NULL;
2009 queue_head_t *task_list = &tasks;
2010
2011 lck_mtx_lock(&tasks_threads_lock);
2012 queue_iterate(task_list, task, task_t, tasks) {
2013 if (task == kernel_task || !task->active) {
2014 continue;
2015 }
2016
2017 vm_map_t task_map = task->map;
2018 if (task_map != VM_MAP_NULL) {
2019 int task_vm_map_entries = task_map->hdr.nentries;
2020 if (task_vm_map_entries > max_vm_map_entries) {
2021 max_vm_map_entries = task_vm_map_entries;
2022 victim_pid = pid_from_task(task);
2023 }
2024 }
2025 }
2026 lck_mtx_unlock(&tasks_threads_lock);
2027
2028 printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
2029 return victim_pid;
2030 }
2031
2032
2033 /*
2034 * vm_map_lookup_entry: [ internal use only ]
2035 *
2036 * Calls into the vm map store layer to find the map
2037 * entry containing (or immediately preceding) the
2038 * specified address in the given map; the entry is returned
2039 * in the "entry" parameter. The boolean
2040 * result indicates whether the address is
2041 * actually contained in the map.
2042 */
2043 boolean_t
2044 vm_map_lookup_entry(
2045 vm_map_t map,
2046 vm_map_offset_t address,
2047 vm_map_entry_t *entry) /* OUT */
2048 {
2049 bool result = false;
2050
2051 #if CONFIG_KERNEL_TAGGING
2052 if (VM_KERNEL_ADDRESS(address)) {
2053 address = vm_memtag_canonicalize_kernel(address);
2054 }
2055 #endif /* CONFIG_KERNEL_TAGGING */
2056
2057 #if CONFIG_PROB_GZALLOC
2058 if (map->pmap == kernel_pmap) {
2059 assertf(!pgz_owned(address),
2060 "it is the responsibility of callers to unguard PGZ addresses");
2061 }
2062 #endif /* CONFIG_PROB_GZALLOC */
2063 result = vm_map_store_lookup_entry( map, address, entry );
2064
2065 return result;
2066 }
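
/*
 * Illustrative sketch (map lock held by the caller; "addr" is just a
 * placeholder): when the lookup misses, "entry" still points at the
 * entry preceding the address, which is what insertion paths use as
 * their hint.
 *
 *	vm_map_entry_t entry;
 *
 *	if (vm_map_lookup_entry(map, addr, &entry)) {
 *		... "addr" falls inside [entry->vme_start, entry->vme_end) ...
 *	} else {
 *		... "entry" precedes "addr"; entry->vme_next is the next mapping ...
 *	}
 */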
2067
2068 boolean_t
2069 vm_map_lookup_entry_or_next(
2070 vm_map_t map,
2071 vm_map_offset_t address,
2072 vm_map_entry_t *entry) /* OUT */
2073 {
2074 if (vm_map_lookup_entry(map, address, entry)) {
2075 return true;
2076 }
2077
2078 *entry = (*entry)->vme_next;
2079 return false;
2080 }
2081
2082 #if CONFIG_PROB_GZALLOC
2083 boolean_t
2084 vm_map_lookup_entry_allow_pgz(
2085 vm_map_t map,
2086 vm_map_offset_t address,
2087 vm_map_entry_t *entry) /* OUT */
2088 {
2089 #if CONFIG_KERNEL_TAGGING
2090 if (VM_KERNEL_ADDRESS(address)) {
2091 address = vm_memtag_canonicalize_kernel(address);
2092 }
2093 #endif /* CONFIG_KERNEL_TAGGING */
2094
2095 return vm_map_store_lookup_entry( map, address, entry );
2096 }
2097 #endif /* CONFIG_PROB_GZALLOC */
2098
2099 /*
2100 * Routine: vm_map_range_invalid_panic
2101 * Purpose:
2102 * Panic on detection of an invalid range id.
2103 */
2104 __abortlike
2105 static void
2106 vm_map_range_invalid_panic(
2107 vm_map_t map,
2108 vm_map_range_id_t range_id)
2109 {
2110 panic("invalid range ID (%u) for map %p", range_id, map);
2111 }
2112
2113 /*
2114 * Routine: vm_map_get_range
2115 * Purpose:
2116 * Adjust bounds based on security policy.
2117 */
2118 static struct mach_vm_range
2119 vm_map_get_range(
2120 vm_map_t map,
2121 vm_map_address_t *address,
2122 vm_map_kernel_flags_t *vmk_flags,
2123 vm_map_size_t size,
2124 bool *is_ptr)
2125 {
2126 struct mach_vm_range effective_range = {};
2127 vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;
2128
2129 if (map == kernel_map) {
2130 effective_range = kmem_ranges[range_id];
2131
2132 if (startup_phase >= STARTUP_SUB_KMEM) {
2133 /*
2134 * Hint provided by caller is zeroed as the range is restricted to a
2135 * subset of the entire kernel_map VA, which could put the hint outside
2136 * the range, causing vm_map_store_find_space to fail.
2137 */
2138 *address = 0ull;
2139 /*
2140 * Ensure that range_id passed in by the caller is within meaningful
2141 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
2142 * to fail as the corresponding range is invalid. Range id larger than
2143 * KMEM_RANGE_ID_MAX will lead to an OOB access.
2144 */
2145 if ((range_id == KMEM_RANGE_ID_NONE) ||
2146 (range_id > KMEM_RANGE_ID_MAX)) {
2147 vm_map_range_invalid_panic(map, range_id);
2148 }
2149
2150 /*
2151 * Pointer ranges use kmem_locate_space to do allocations.
2152 *
2153 * Non-pointer fronts look like [ Small | Large | Permanent ]
2154 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
2155 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
2156 * use the entire range.
2157 */
2158 if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
2159 *is_ptr = true;
2160 } else if (size >= KMEM_SMALLMAP_THRESHOLD) {
2161 effective_range = kmem_large_ranges[range_id];
2162 }
2163 }
2164 #if CONFIG_MAP_RANGES
2165 } else if (map->uses_user_ranges) {
2166 switch (range_id) {
2167 case UMEM_RANGE_ID_DEFAULT:
2168 effective_range = map->default_range;
2169 break;
2170 case UMEM_RANGE_ID_HEAP:
2171 effective_range = map->data_range;
2172 break;
2173 case UMEM_RANGE_ID_LARGE_FILE:
2174 if (map->large_file_range.min_address != map->large_file_range.max_address) {
2175 /* large file range is configured and should be used */
2176 effective_range = map->large_file_range;
2177 } else {
2178 /*
2179 * the user asking for this user range might not have the
2180 * permissions to use the large file range (i.e., it doesn't
2181 * hold the correct entitlement), so we give it the data range
2182 * instead
2183 */
2184 effective_range = map->data_range;
2185 }
2186 break;
2187 case UMEM_RANGE_ID_FIXED:
2188 /*
2189 * anywhere allocations with an address in "FIXED"
2190 * make no sense, so leave the range empty
2191 */
2192 break;
2193
2194 default:
2195 vm_map_range_invalid_panic(map, range_id);
2196 }
2197 #endif /* CONFIG_MAP_RANGES */
2198 } else {
2199 /*
2200 * If minimum is 0, bump it up by PAGE_SIZE. We want to limit
2201 * allocations of PAGEZERO to explicit requests since its
2202 * normal use is to catch dereferences of NULL and many
2203 * applications also treat pointers with a value of 0 as
2204 * special and suddenly having address 0 contain useable
2205 * memory would tend to confuse those applications.
2206 */
2207 effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
2208 effective_range.max_address = map->max_offset;
2209 }
2210
2211 return effective_range;
2212 }
2213
2214 kern_return_t
2215 vm_map_locate_space_anywhere(
2216 vm_map_t map,
2217 vm_map_size_t size,
2218 vm_map_offset_t mask,
2219 vm_map_kernel_flags_t vmk_flags,
2220 vm_map_offset_t *start_inout,
2221 vm_map_entry_t *entry_out)
2222 {
2223 struct mach_vm_range effective_range = {};
2224 vm_map_size_t guard_offset;
2225 vm_map_offset_t hint, limit;
2226 vm_map_entry_t entry;
2227 bool is_kmem_ptr_range = false;
2228
2229 /*
2230 * Only supported by vm_map_enter() with a fixed address.
2231 */
2232 assert(!vmk_flags.vmf_fixed);
2233 assert(!vmk_flags.vmkf_beyond_max);
2234
2235 if (__improbable(map->wait_for_space)) {
2236 /*
2237 * support for "wait_for_space" is minimal,
2238 * its only consumer is the ipc_kernel_copy_map.
2239 */
2240 assert(!map->holelistenabled &&
2241 !vmk_flags.vmkf_last_free &&
2242 !vmk_flags.vmkf_keep_map_locked &&
2243 !vmk_flags.vmkf_map_jit &&
2244 !vmk_flags.vmf_random_addr &&
2245 *start_inout <= map->min_offset);
2246 } else if (vmk_flags.vmkf_last_free) {
2247 assert(!vmk_flags.vmkf_map_jit &&
2248 !vmk_flags.vmf_random_addr);
2249 }
2250
2251 if (vmk_flags.vmkf_guard_before) {
2252 guard_offset = VM_MAP_PAGE_SIZE(map);
2253 assert(size > guard_offset);
2254 size -= guard_offset;
2255 } else {
2256 assert(size != 0);
2257 guard_offset = 0;
2258 }
2259
2260 if (__improbable(!vm_map_is_map_size_valid(
2261 map, size, vmk_flags.vmkf_no_soft_limit))) {
2262 return KERN_NO_SPACE;
2263 }
2264
2265 /*
2266 * Validate range_id from flags and get associated range
2267 */
2268 effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
2269 &is_kmem_ptr_range);
2270
2271 if (is_kmem_ptr_range) {
2272 return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
2273 vmk_flags.vmkf_last_free, start_inout, entry_out);
2274 }
2275
2276 #if XNU_TARGET_OS_OSX
2277 if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2278 assert(map != kernel_map);
2279 effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2280 }
2281 #endif /* XNU_TARGET_OS_OSX */
2282
2283 again:
2284 if (vmk_flags.vmkf_last_free) {
2285 hint = *start_inout;
2286
2287 if (hint == 0 || hint > effective_range.max_address) {
2288 hint = effective_range.max_address;
2289 }
2290 if (hint <= effective_range.min_address) {
2291 return KERN_NO_SPACE;
2292 }
2293 limit = effective_range.min_address;
2294 } else {
2295 hint = *start_inout;
2296
2297 if (vmk_flags.vmkf_map_jit) {
2298 if (map->jit_entry_exists &&
2299 !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
2300 return KERN_INVALID_ARGUMENT;
2301 }
2302 if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
2303 vmk_flags.vmf_random_addr = true;
2304 }
2305 }
2306
2307 if (vmk_flags.vmf_random_addr) {
2308 kern_return_t kr;
2309
2310 kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
2311 if (kr != KERN_SUCCESS) {
2312 return kr;
2313 }
2314 }
2315 #if __x86_64__
2316 else if ((hint == 0 || hint == vm_map_min(map)) &&
2317 !map->disable_vmentry_reuse &&
2318 map->vmmap_high_start != 0) {
2319 hint = map->vmmap_high_start;
2320 }
2321 #endif /* __x86_64__ */
2322
2323 if (hint < effective_range.min_address) {
2324 hint = effective_range.min_address;
2325 }
2326 if (effective_range.max_address <= hint) {
2327 return KERN_NO_SPACE;
2328 }
2329
2330 limit = effective_range.max_address;
2331 }
2332 entry = vm_map_store_find_space(map,
2333 hint, limit, vmk_flags.vmkf_last_free,
2334 guard_offset, size, mask,
2335 start_inout);
2336
2337 if (__improbable(entry == NULL)) {
2338 if (map->wait_for_space &&
2339 guard_offset + size <=
2340 effective_range.max_address - effective_range.min_address) {
2341 assert_wait((event_t)map, THREAD_ABORTSAFE);
2342 vm_map_unlock(map);
2343 thread_block(THREAD_CONTINUE_NULL);
2344 vm_map_lock(map);
2345 goto again;
2346 }
2347 return KERN_NO_SPACE;
2348 }
2349
2350 if (entry_out) {
2351 *entry_out = entry;
2352 }
2353 return KERN_SUCCESS;
2354 }
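
/*
 * Illustrative sketch: vm_map_find_space() below is the canonical caller.
 * With the map locked, it requests an "anywhere" placement and, on
 * success, links a new entry right after the returned predecessor:
 *
 *	kr = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
 *	    &addr, &entry);
 *	if (kr == KERN_SUCCESS) {
 *		... [addr, addr + size) is free; "entry" precedes the hole ...
 *	}
 */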
2355
2356 /*!
2357 * @function vm_map_locate_space_fixed()
2358 *
2359 * @brief
2360 * Locate (no reservation) a range in the specified VM map at a fixed address.
2361 *
2362 * @param map the map to scan for memory, must be locked.
2363 * @param start the fixed address trying to be reserved
2364 * @param size the size of the allocation to make.
2365 * @param mask an alignment mask the allocation must respect,
2366 * @param vmk_flags the vm map kernel flags to influence this call.
2367 * vmk_flags.vmf_anywhere must not be set.
2368 * @param entry_out the entry right before the hole.
2369 * @param zap_list a zap list of entries to clean up after the call.
2370 *
2371 * @returns
2372 * - KERN_SUCCESS in case of success and no conflicting entry is found,
2373 * in which case entry_out is set to the entry before the hole.
2374 *
2375 * - KERN_MEMORY_PRESENT if a conflicting entry is found,
2376 * in which case entry_out is set to the conflicting entry;
2377 * callers MUST handle this error explicitly.
2378 *
2379 * - KERN_INVALID_ADDRESS if the specified @c start or @c size
2380 * would result in a mapping outside of the map.
2381 *
2382 * - KERN_NO_SPACE for various cases of unrecoverable failures.
2383 */
2384 static kern_return_t
2385 vm_map_locate_space_fixed(
2386 vm_map_t map,
2387 vm_map_offset_t start,
2388 vm_map_size_t size,
2389 vm_map_offset_t mask,
2390 vm_map_kernel_flags_t vmk_flags,
2391 vm_map_entry_t *entry_out,
2392 vm_map_zap_t zap_list)
2393 {
2394 vm_map_offset_t effective_min_offset, effective_max_offset;
2395 vm_map_entry_t entry;
2396 vm_map_offset_t end;
2397
2398 assert(vmk_flags.vmf_fixed);
2399
2400 effective_min_offset = map->min_offset;
2401 effective_max_offset = map->max_offset;
2402
2403 if (vmk_flags.vmkf_beyond_max) {
2404 /*
2405 * Allow an insertion beyond the map's max offset.
2406 */
2407 effective_max_offset = 0x00000000FFFFF000ULL;
2408 if (vm_map_is_64bit(map)) {
2409 effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2410 }
2411 #if XNU_TARGET_OS_OSX
2412 } else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2413 effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2414 #endif /* XNU_TARGET_OS_OSX */
2415 }
2416
2417 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2418 !vmk_flags.vmf_overwrite &&
2419 map->pmap == kernel_pmap &&
2420 vmk_flags.vm_tag == VM_MEMORY_REALLOC) {
2421 /*
2422 * Force realloc() to switch to a new allocation,
2423 * to prevent 4k-fragmented virtual ranges.
2424 */
2425 // DEBUG4K_ERROR("no realloc in place");
2426 return KERN_NO_SPACE;
2427 }
2428
2429 /*
2430 * Verify that:
2431 * the address doesn't itself violate
2432 * the mask requirement.
2433 */
2434
2435 if ((start & mask) != 0) {
2436 return KERN_NO_SPACE;
2437 }
2438
2439 if (__improbable(!vm_map_is_map_size_valid(
2440 map, size, vmk_flags.vmkf_no_soft_limit))) {
2441 return KERN_NO_SPACE;
2442 }
2443
2444 #if CONFIG_MAP_RANGES
2445 if (map->uses_user_ranges) {
2446 struct mach_vm_range r;
2447
2448 vm_map_user_range_resolve(map, start, 1, &r);
2449 if (r.max_address == 0) {
2450 return KERN_INVALID_ADDRESS;
2451 }
2452 effective_min_offset = r.min_address;
2453 effective_max_offset = r.max_address;
2454 }
2455 #endif /* CONFIG_MAP_RANGES */
2456
2457 if ((startup_phase >= STARTUP_SUB_KMEM) && !vmk_flags.vmkf_submap &&
2458 (map == kernel_map)) {
2459 mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
2460 effective_min_offset = r->min_address;
2461 effective_max_offset = r->max_address;
2462 }
2463
2464 /*
2465 * ... the address is within bounds
2466 */
2467
2468 end = start + size;
2469
2470 if ((start < effective_min_offset) ||
2471 (end > effective_max_offset) ||
2472 (start >= end)) {
2473 return KERN_INVALID_ADDRESS;
2474 }
2475
2476 if (vmk_flags.vmf_overwrite) {
2477 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE;
2478 kern_return_t remove_kr;
2479
2480 /*
2481 * Fixed mapping and "overwrite" flag: attempt to
2482 * remove all existing mappings in the specified
2483 * address range, saving them in our "zap_list".
2484 *
2485 * This avoids releasing the VM map lock in
2486 * vm_map_entry_delete() and allows atomicity
2487 * when we want to replace some mappings with a new one.
2488 * It also allows us to restore the old VM mappings if the
2489 * new mapping fails.
2490 */
2491 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2492
2493 if (vmk_flags.vmkf_overwrite_immutable) {
2494 /* we can overwrite immutable mappings */
2495 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2496 }
2497 if (vmk_flags.vmkf_remap_prot_copy) {
2498 remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2499 }
2500 remove_kr = vm_map_delete(map, start, end, remove_flags,
2501 KMEM_GUARD_NONE, zap_list).kmr_return;
2502 if (remove_kr) {
2503 /* XXX FBDP restore zap_list? */
2504 return remove_kr;
2505 }
2506 }
2507
2508 /*
2509 * ... the starting address isn't allocated
2510 */
2511
2512 if (vm_map_lookup_entry(map, start, &entry)) {
2513 *entry_out = entry;
2514 return KERN_MEMORY_PRESENT;
2515 }
2516
2517 /*
2518 * ... the next region doesn't overlap the
2519 * end point.
2520 */
2521
2522 if ((entry->vme_next != vm_map_to_entry(map)) &&
2523 (entry->vme_next->vme_start < end)) {
2524 return KERN_NO_SPACE;
2525 }
2526
2527 *entry_out = entry;
2528 return KERN_SUCCESS;
2529 }
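
/*
 * Illustrative sketch of the KERN_MEMORY_PRESENT contract documented
 * above: only callers that asked for VM_FLAGS_ALREADY semantics (see
 * vm_map_enter() below) may treat the conflict as potential success,
 * and only after comparing the conflicting entry with what they wanted
 * to map.
 *
 *	kr = vm_map_locate_space_fixed(map, start, size, mask, vmk_flags,
 *	    &entry, &zap_list);
 *	if (kr == KERN_MEMORY_PRESENT) {
 *		... "entry" is the conflicting mapping ...
 *	}
 */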
2530
2531 /*
2532 * Routine: vm_map_find_space
2533 * Purpose:
2534 * Allocate a range in the specified virtual address map,
2535 * returning the entry allocated for that range.
2536 * Used by kmem_alloc, etc.
2537 *
2538 * The map must NOT be locked. It will be returned locked
2539 * on KERN_SUCCESS, unlocked on failure.
2540 *
2541 * If an entry is allocated, the object/offset fields
2542 * are initialized to zero.
2543 */
2544 kern_return_t
2545 vm_map_find_space(
2546 vm_map_t map,
2547 vm_map_offset_t hint_address,
2548 vm_map_size_t size,
2549 vm_map_offset_t mask,
2550 vm_map_kernel_flags_t vmk_flags,
2551 vm_map_entry_t *o_entry) /* OUT */
2552 {
2553 vm_map_entry_t new_entry, entry;
2554 kern_return_t kr;
2555
2556 if (size == 0) {
2557 return KERN_INVALID_ARGUMENT;
2558 }
2559
2560 new_entry = vm_map_entry_create(map);
2561 new_entry->use_pmap = true;
2562 new_entry->protection = VM_PROT_DEFAULT;
2563 new_entry->max_protection = VM_PROT_ALL;
2564
2565 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
2566 new_entry->map_aligned = true;
2567 }
2568 if (vmk_flags.vmf_permanent) {
2569 new_entry->vme_permanent = true;
2570 }
2571
2572 vm_map_lock(map);
2573
2574 kr = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
2575 &hint_address, &entry);
2576 if (kr != KERN_SUCCESS) {
2577 vm_map_unlock(map);
2578 vm_map_entry_dispose(new_entry);
2579 return kr;
2580 }
2581 new_entry->vme_start = hint_address;
2582 new_entry->vme_end = hint_address + size;
2583
2584 /*
2585 * At this point,
2586 *
2587 * - new_entry's "vme_start" and "vme_end" should define
2588 * the endpoints of the available new range,
2589 *
2590 * - and "entry" should refer to the region before
2591 * the new range,
2592 *
2593 * - and the map should still be locked.
2594 */
2595
2596 assert(page_aligned(new_entry->vme_start));
2597 assert(page_aligned(new_entry->vme_end));
2598 assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
2599 assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));
2600
2601
2602 /*
2603 * Insert the new entry into the list
2604 */
2605
2606 vm_map_store_entry_link(map, entry, new_entry,
2607 VM_MAP_KERNEL_FLAGS_NONE);
2608 map->size += size;
2609
2610 /*
2611 * Update the lookup hint
2612 */
2613 SAVE_HINT_MAP_WRITE(map, new_entry);
2614
2615 *o_entry = new_entry;
2616 return KERN_SUCCESS;
2617 }
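
/*
 * Illustrative sketch (kmem_alloc-style caller; "object" and "size" are
 * placeholders): the map is passed in unlocked and comes back locked on
 * success, so the caller installs its object and then unlocks.
 *
 *	vm_map_entry_t entry;
 *
 *	kr = vm_map_find_space(kernel_map, 0, size, 0, vmk_flags, &entry);
 *	if (kr == KERN_SUCCESS) {
 *		VME_OBJECT_SET(entry, object, false, 0);
 *		VME_OFFSET_SET(entry, 0);
 *		vm_map_unlock(kernel_map);
 *	}
 */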
2618
2619 int vm_map_pmap_enter_print = FALSE;
2620 int vm_map_pmap_enter_enable = FALSE;
2621
2622 /*
2623 * Routine: vm_map_pmap_enter [internal only]
2624 *
2625 * Description:
2626 * Force pages from the specified object to be entered into
2627 * the pmap at the specified address if they are present.
2628 * As soon as a page is not found in the object, the scan ends.
2629 *
2630 * Returns:
2631 * Nothing.
2632 *
2633 * In/out conditions:
2634 * The source map should not be locked on entry.
2635 */
2636 __unused static void
2637 vm_map_pmap_enter(
2638 vm_map_t map,
2639 vm_map_offset_t addr,
2640 vm_map_offset_t end_addr,
2641 vm_object_t object,
2642 vm_object_offset_t offset,
2643 vm_prot_t protection)
2644 {
2645 int type_of_fault;
2646 kern_return_t kr;
2647 uint8_t object_lock_type = 0;
2648 struct vm_object_fault_info fault_info = {
2649 .interruptible = THREAD_UNINT,
2650 };
2651
2652 if (map->pmap == 0) {
2653 return;
2654 }
2655
2656 assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);
2657
2658 while (addr < end_addr) {
2659 vm_page_t m;
2660
2661
2662 /*
2663 * TODO:
2664 * From vm_map_enter(), we come into this function without the map
2665 * lock held or the object lock held.
2666 * We haven't taken a reference on the object either.
2667 * We should do a proper lookup on the map to make sure
2668 * that things are sane before we go locking objects that
2669 * could have been deallocated from under us.
2670 */
2671
2672 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2673 vm_object_lock(object);
2674
2675 m = vm_page_lookup(object, offset);
2676
2677 if (m == VM_PAGE_NULL || m->vmp_busy || vm_page_is_fictitious(m) ||
2678 (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
2679 vm_object_unlock(object);
2680 return;
2681 }
2682
2683 if (vm_map_pmap_enter_print) {
2684 printf("vm_map_pmap_enter:");
2685 printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
2686 map, (unsigned long long)addr, object, (unsigned long long)offset);
2687 }
2688 type_of_fault = DBG_CACHE_HIT_FAULT;
2689 kr = vm_fault_enter(m, map->pmap,
2690 addr,
2691 PAGE_SIZE, 0,
2692 protection, protection,
2693 VM_PAGE_WIRED(m),
2694 VM_KERN_MEMORY_NONE, /* tag - not wiring */
2695 &fault_info,
2696 NULL, /* need_retry */
2697 &type_of_fault,
2698 &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/
2699
2700 vm_object_unlock(object);
2701
2702 offset += PAGE_SIZE_64;
2703 addr += PAGE_SIZE;
2704 }
2705 }
2706
2707 #define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
2708 static kern_return_t
2709 vm_map_random_address_for_size(
2710 vm_map_t map,
2711 vm_map_offset_t *address,
2712 vm_map_size_t size,
2713 vm_map_kernel_flags_t vmk_flags)
2714 {
2715 kern_return_t kr = KERN_SUCCESS;
2716 int tries = 0;
2717 vm_map_offset_t random_addr = 0;
2718 vm_map_offset_t hole_end;
2719
2720 vm_map_entry_t next_entry = VM_MAP_ENTRY_NULL;
2721 vm_map_entry_t prev_entry = VM_MAP_ENTRY_NULL;
2722 vm_map_size_t vm_hole_size = 0;
2723 vm_map_size_t addr_space_size;
2724 bool is_kmem_ptr;
2725 struct mach_vm_range effective_range;
2726
2727 effective_range = vm_map_get_range(map, address, &vmk_flags, size,
2728 &is_kmem_ptr);
2729
2730 addr_space_size = effective_range.max_address - effective_range.min_address;
2731 if (size >= addr_space_size) {
2732 return KERN_NO_SPACE;
2733 }
2734 addr_space_size -= size;
2735
2736 assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));
2737
2738 while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2739 if (startup_phase < STARTUP_SUB_ZALLOC) {
2740 random_addr = (vm_map_offset_t)early_random();
2741 } else {
2742 random_addr = (vm_map_offset_t)random();
2743 }
2744 random_addr <<= VM_MAP_PAGE_SHIFT(map);
2745 random_addr = vm_map_trunc_page(
2746 effective_range.min_address + (random_addr % addr_space_size),
2747 VM_MAP_PAGE_MASK(map));
2748
2749 #if CONFIG_PROB_GZALLOC
2750 if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
2751 continue;
2752 }
2753 #endif /* CONFIG_PROB_GZALLOC */
2754
2755 if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
2756 if (prev_entry == vm_map_to_entry(map)) {
2757 next_entry = vm_map_first_entry(map);
2758 } else {
2759 next_entry = prev_entry->vme_next;
2760 }
2761 if (next_entry == vm_map_to_entry(map)) {
2762 hole_end = vm_map_max(map);
2763 } else {
2764 hole_end = next_entry->vme_start;
2765 }
2766 vm_hole_size = hole_end - random_addr;
2767 if (vm_hole_size >= size) {
2768 *address = random_addr;
2769 break;
2770 }
2771 }
2772 tries++;
2773 }
2774
2775 if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2776 kr = KERN_NO_SPACE;
2777 }
2778 return kr;
2779 }
2780
2781 static boolean_t
2782 vm_memory_malloc_no_cow(
2783 int alias)
2784 {
2785 uint64_t alias_mask;
2786
2787 if (!malloc_no_cow) {
2788 return FALSE;
2789 }
2790 if (alias > 63) {
2791 return FALSE;
2792 }
2793 alias_mask = 1ULL << alias;
2794 if (alias_mask & vm_memory_malloc_no_cow_mask) {
2795 return TRUE;
2796 }
2797 return FALSE;
2798 }
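
/*
 * Illustrative sketch: vm_memory_malloc_no_cow_mask is a 64-bit bitmap
 * indexed by VM alias. A configuration that opts the plain and large
 * malloc aliases out of copy-on-write would conceptually look like the
 * following (the real mask is populated elsewhere, e.g. from policy or
 * boot-arg setup, not here):
 *
 *	vm_memory_malloc_no_cow_mask = (1ULL << VM_MEMORY_MALLOC) |
 *	    (1ULL << VM_MEMORY_MALLOC_LARGE);
 */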
2799
2800 uint64_t vm_map_enter_RLIMIT_AS_count = 0;
2801 uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2802 /*
2803 * Routine: vm_map_enter
2804 *
2805 * Description:
2806 * Allocate a range in the specified virtual address map.
2807 * The resulting range will refer to memory defined by
2808 * the given memory object and offset into that object.
2809 *
2810 * Arguments are as defined in the vm_map call.
2811 */
2812 static unsigned int vm_map_enter_restore_successes = 0;
2813 static unsigned int vm_map_enter_restore_failures = 0;
2814 kern_return_t
2815 vm_map_enter(
2816 vm_map_t map,
2817 vm_map_offset_t *address, /* IN/OUT */
2818 vm_map_size_t size,
2819 vm_map_offset_t mask,
2820 vm_map_kernel_flags_t vmk_flags,
2821 vm_object_t object,
2822 vm_object_offset_t offset,
2823 boolean_t needs_copy,
2824 vm_prot_t cur_protection,
2825 vm_prot_t max_protection,
2826 vm_inherit_t inheritance)
2827 {
2828 vm_map_entry_t entry, new_entry;
2829 vm_map_offset_t start, tmp_start, tmp_offset;
2830 vm_map_offset_t end, tmp_end;
2831 vm_map_offset_t tmp2_start, tmp2_end;
2832 vm_map_offset_t step;
2833 kern_return_t result = KERN_SUCCESS;
2834 bool map_locked = FALSE;
2835 bool pmap_empty = TRUE;
2836 bool new_mapping_established = FALSE;
2837 const bool keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2838 const bool anywhere = !vmk_flags.vmf_fixed;
2839 const bool purgable = vmk_flags.vmf_purgeable;
2840 const bool no_cache = vmk_flags.vmf_no_cache;
2841 const bool is_submap = vmk_flags.vmkf_submap;
2842 const bool permanent = vmk_flags.vmf_permanent;
2843 const bool no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2844 const bool entry_for_jit = vmk_flags.vmkf_map_jit;
2845 const bool iokit_acct = vmk_flags.vmkf_iokit_acct;
2846 const bool resilient_codesign = vmk_flags.vmf_resilient_codesign;
2847 const bool resilient_media = vmk_flags.vmf_resilient_media;
2848 const bool entry_for_tpro = vmk_flags.vmf_tpro;
2849 const unsigned int superpage_size = vmk_flags.vmf_superpage_size;
2850 const vm_tag_t alias = vmk_flags.vm_tag;
2851 vm_tag_t user_alias;
2852 kern_return_t kr;
2853 bool clear_map_aligned = FALSE;
2854 vm_map_size_t chunk_size = 0;
2855 vm_object_t caller_object;
2856 VM_MAP_ZAP_DECLARE(zap_old_list);
2857 VM_MAP_ZAP_DECLARE(zap_new_list);
2858
2859 caller_object = object;
2860
2861 assertf(vmk_flags.__vmkf_unused2 == 0, "vmk_flags unused2=0x%llx\n", vmk_flags.__vmkf_unused2);
2862
2863 if (vmk_flags.vmf_4gb_chunk) {
2864 #if defined(__LP64__)
2865 chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2866 #else /* __LP64__ */
2867 chunk_size = ANON_CHUNK_SIZE;
2868 #endif /* __LP64__ */
2869 } else {
2870 chunk_size = ANON_CHUNK_SIZE;
2871 }
2872
2873
2874
2875 if (superpage_size) {
2876 if (object != VM_OBJECT_NULL) {
2877 /* caller can't provide their own VM object */
2878 return KERN_INVALID_ARGUMENT;
2879 }
2880 switch (superpage_size) {
2881 /*
2882 * Note that the current implementation only supports
2883 * a single size for superpages, SUPERPAGE_SIZE, per
2884 * architecture. As soon as more sizes are supposed
2885 * to be supported, SUPERPAGE_SIZE has to be replaced
2886 * with a lookup of the size depending on superpage_size.
2887 */
2888 #ifdef __x86_64__
2889 case SUPERPAGE_SIZE_ANY:
2890 /* handle it like 2 MB and round up to page size */
2891 size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2892 OS_FALLTHROUGH;
2893 case SUPERPAGE_SIZE_2MB:
2894 break;
2895 #endif
2896 default:
2897 return KERN_INVALID_ARGUMENT;
2898 }
2899 mask = SUPERPAGE_SIZE - 1;
2900 if (size & (SUPERPAGE_SIZE - 1)) {
2901 return KERN_INVALID_ARGUMENT;
2902 }
2903 inheritance = VM_INHERIT_NONE; /* fork() children won't inherit superpages */
2904 }
2905
2906
2907 if ((cur_protection & VM_PROT_WRITE) &&
2908 (cur_protection & VM_PROT_EXECUTE) &&
2909 #if XNU_TARGET_OS_OSX
2910 map->pmap != kernel_pmap &&
2911 (cs_process_global_enforcement() ||
2912 (vmk_flags.vmkf_cs_enforcement_override
2913 ? vmk_flags.vmkf_cs_enforcement
2914 : (vm_map_cs_enforcement(map)
2915 #if __arm64__
2916 || !VM_MAP_IS_EXOTIC(map)
2917 #endif /* __arm64__ */
2918 ))) &&
2919 #endif /* XNU_TARGET_OS_OSX */
2920 #if CODE_SIGNING_MONITOR
2921 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2922 #endif
2923 (VM_MAP_POLICY_WX_FAIL(map) ||
2924 VM_MAP_POLICY_WX_STRIP_X(map)) &&
2925 !entry_for_jit) {
2926 boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2927
2928 DTRACE_VM3(cs_wx,
2929 uint64_t, 0,
2930 uint64_t, 0,
2931 vm_prot_t, cur_protection);
2932 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2933 proc_selfpid(),
2934 (get_bsdtask_info(current_task())
2935 ? proc_name_address(get_bsdtask_info(current_task()))
2936 : "?"),
2937 __FUNCTION__,
2938 (vm_protect_wx_fail ? "failing" : "turning off execute"));
2939 cur_protection &= ~VM_PROT_EXECUTE;
2940 if (vm_protect_wx_fail) {
2941 return KERN_PROTECTION_FAILURE;
2942 }
2943 }
2944
2945 if (entry_for_jit
2946 && cur_protection != VM_PROT_ALL) {
2947 /*
2948 * Native macOS processes and all non-macOS processes are
2949 * expected to create JIT regions via mmap(MAP_JIT, RWX) but
2950 * the RWX requirement was not enforced, and thus, we must live
2951 * with our sins. We are now dealing with a JIT mapping without
2952 * RWX.
2953 *
2954 * We deal with these by letting the MAP_JIT stick in order
2955 * to avoid CS violations when these pages are mapped executable
2956 * down the line. In order to appease the page table monitor (you
2957 * know what I'm talking about), these pages will end up being
2958 * marked as XNU_USER_DEBUG, which will be allowed because we
2959 * don't enforce the code signing monitor on macOS systems. If
2960 * the user-space application ever changes permissions to RWX,
2961 * which they are allowed to since the mapping was originally
2962 * created with MAP_JIT, then they'll switch over to using the
2963 * XNU_USER_JIT type, and won't be allowed to downgrade any
2964 * more after that.
2965 *
2966 * When not on macOS, a MAP_JIT mapping without VM_PROT_ALL is
2967 * strictly disallowed.
2968 */
2969
2970 #if XNU_TARGET_OS_OSX
2971 /*
2972 * Continue to allow non-RWX JIT
2973 */
2974 #else
2975 /* non-macOS: reject JIT regions without RWX */
2976 DTRACE_VM3(cs_wx,
2977 uint64_t, 0,
2978 uint64_t, 0,
2979 vm_prot_t, cur_protection);
2980 printf("CODE SIGNING: %d[%s] %s(%d): JIT requires RWX: failing. \n",
2981 proc_selfpid(),
2982 (get_bsdtask_info(current_task())
2983 ? proc_name_address(get_bsdtask_info(current_task()))
2984 : "?"),
2985 __FUNCTION__,
2986 cur_protection);
2987 return KERN_PROTECTION_FAILURE;
2988 #endif
2989 }
2990
2991 /*
2992 * If the task has requested executable lockdown,
2993 * deny any new executable mapping.
2994 */
2995 if (map->map_disallow_new_exec == TRUE) {
2996 if (cur_protection & VM_PROT_EXECUTE) {
2997 return KERN_PROTECTION_FAILURE;
2998 }
2999 }
3000
3001 if (resilient_codesign) {
3002 assert(!is_submap);
3003 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3004 if ((cur_protection | max_protection) & reject_prot) {
3005 return KERN_PROTECTION_FAILURE;
3006 }
3007 }
3008
3009 if (resilient_media) {
3010 assert(!is_submap);
3011 // assert(!needs_copy);
3012 if (object != VM_OBJECT_NULL &&
3013 !object->internal) {
3014 /*
3015 * This mapping is directly backed by an external
3016 * memory manager (e.g. a vnode pager for a file):
3017 * we would not have any safe place to inject
3018 * a zero-filled page if an actual page is not
3019 * available, without possibly impacting the actual
3020 * contents of the mapped object (e.g. the file),
3021 * so we can't provide any media resiliency here.
3022 */
3023 return KERN_INVALID_ARGUMENT;
3024 }
3025 }
3026
3027 if (entry_for_tpro) {
3028 /*
3029 * TPRO overrides the effective permissions of the region
3030 * and explicitly maps as RW. Ensure we have been passed
3031 * the expected permissions. We accept `cur_protections`
3032 * RO as that will be handled on fault.
3033 */
3034 if (!(max_protection & VM_PROT_READ) ||
3035 !(max_protection & VM_PROT_WRITE) ||
3036 !(cur_protection & VM_PROT_READ)) {
3037 return KERN_PROTECTION_FAILURE;
3038 }
3039
3040 /*
3041 * We can now downgrade the cur_protection to RO. This is a mild lie
3042 * to the VM layer. But TPRO will be responsible for toggling the
3043 * protections between RO/RW
3044 */
3045 cur_protection = VM_PROT_READ;
3046 }
3047
3048 if (is_submap) {
3049 vm_map_t submap;
3050 if (purgable) {
3051 /* submaps can not be purgeable */
3052 return KERN_INVALID_ARGUMENT;
3053 }
3054 if (object == VM_OBJECT_NULL) {
3055 /* submaps can not be created lazily */
3056 return KERN_INVALID_ARGUMENT;
3057 }
3058 submap = (vm_map_t) object;
3059 if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
3060 /* page size mismatch */
3061 return KERN_INVALID_ARGUMENT;
3062 }
3063 }
3064 if (vmk_flags.vmkf_already) {
3065 /*
3066 * VM_FLAGS_ALREADY says that it's OK if the same mapping
3067 * is already present. For it to be meaningful, the requested
3068 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
3069 * we shouldn't try and remove what was mapped there first
3070 * (!VM_FLAGS_OVERWRITE).
3071 */
3072 if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
3073 return KERN_INVALID_ARGUMENT;
3074 }
3075 }
3076
3077 if (size == 0 ||
3078 (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
3079 *address = 0;
3080 return KERN_INVALID_ARGUMENT;
3081 }
3082
3083 if (map->pmap == kernel_pmap) {
3084 user_alias = VM_KERN_MEMORY_NONE;
3085 } else {
3086 user_alias = alias;
3087 }
3088
3089 if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
3090 chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
3091 }
3092
3093 #define RETURN(value) { result = value; goto BailOut; }
3094
3095 assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
3096 assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
3097 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
3098 assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
3099 assertf(page_aligned(size), "0x%llx", (uint64_t)size);
3100 }
3101
3102 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3103 !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
3104 /*
3105 * In most cases, the caller rounds the size up to the
3106 * map's page size.
3107 * If we get a size that is explicitly not map-aligned here,
3108 * we'll have to respect the caller's wish and mark the
3109 * mapping as "not map-aligned" to avoid tripping the
3110 * map alignment checks later.
3111 */
3112 clear_map_aligned = TRUE;
3113 }
3114 if (!anywhere &&
3115 VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3116 !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
3117 /*
3118 * We've been asked to map at a fixed address and that
3119 * address is not aligned to the map's specific alignment.
3120 * The caller should know what it's doing (i.e. most likely
3121 * mapping some fragmented copy map, transferring memory from
3122 * a VM map with a different alignment), so clear map_aligned
3123 * for this new VM map entry and proceed.
3124 */
3125 clear_map_aligned = TRUE;
3126 }
3127
3128 /*
3129 * Only zero-fill objects are allowed to be purgable.
3130 * LP64todo - limit purgable objects to 32-bits for now
3131 */
3132 if (purgable &&
3133 (offset != 0 ||
3134 (object != VM_OBJECT_NULL &&
3135 (object->vo_size != size ||
3136 object->purgable == VM_PURGABLE_DENY))
3137 #if __LP64__
3138 || size > ANON_MAX_SIZE
3139 #endif
3140 )) {
3141 return KERN_INVALID_ARGUMENT;
3142 }
3143
3144 if (__improbable(!vm_map_is_map_size_valid(
3145 map, size, vmk_flags.vmkf_no_soft_limit))) {
3146 return KERN_NO_SPACE;
3147 }
3148
3149 vm_map_lock(map);
3150 map_locked = TRUE;
3151
3152 if (anywhere) {
3153 result = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
3154 address, &entry);
3155 start = *address;
3156 } else {
3157 start = *address;
3158 result = vm_map_locate_space_fixed(map, start, size, mask,
3159 vmk_flags, &entry, &zap_old_list);
3160 }
3161
3162 end = start + size;
3163
3164 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
3165
3166 /*
3167 * Check if what's already there is what we want.
3168 */
3169 if (result == KERN_MEMORY_PRESENT) {
3170 assert(!anywhere);
3171 if (!(vmk_flags.vmkf_already)) {
3172 RETURN(KERN_NO_SPACE);
3173 }
3174 tmp_start = start;
3175 tmp_offset = offset;
3176 if (entry->vme_start < start) {
3177 tmp_start -= start - entry->vme_start;
3178 tmp_offset -= start - entry->vme_start;
3179 }
3180 for (; entry->vme_start < end;
3181 entry = entry->vme_next) {
3182 /*
3183 * Check if the mapping's attributes
3184 * match the existing map entry.
3185 */
3186 if (entry == vm_map_to_entry(map) ||
3187 entry->vme_start != tmp_start ||
3188 entry->is_sub_map != is_submap ||
3189 VME_OFFSET(entry) != tmp_offset ||
3190 entry->needs_copy != needs_copy ||
3191 entry->protection != cur_protection ||
3192 entry->max_protection != max_protection ||
3193 entry->inheritance != inheritance ||
3194 entry->iokit_acct != iokit_acct ||
3195 VME_ALIAS(entry) != alias) {
3196 /* not the same mapping ! */
3197 RETURN(KERN_NO_SPACE);
3198 }
3199 /*
3200 * Check if the same object is being mapped.
3201 */
3202 if (is_submap) {
3203 if (VME_SUBMAP(entry) !=
3204 (vm_map_t) object) {
3205 /* not the same submap */
3206 RETURN(KERN_NO_SPACE);
3207 }
3208 } else {
3209 if (VME_OBJECT(entry) != object) {
3210 /* not the same VM object... */
3211 vm_object_t obj2;
3212
3213 obj2 = VME_OBJECT(entry);
3214 if ((obj2 == VM_OBJECT_NULL || obj2->internal) &&
3215 (object == VM_OBJECT_NULL || object->internal)) {
3216 /*
3217 * ... but both are
3218 * anonymous memory,
3219 * so equivalent.
3220 */
3221 } else {
3222 RETURN(KERN_NO_SPACE);
3223 }
3224 }
3225 }
3226
3227 tmp_offset += entry->vme_end - entry->vme_start;
3228 tmp_start += entry->vme_end - entry->vme_start;
3229 if (entry->vme_end >= end) {
3230 /* reached the end of our mapping */
3231 break;
3232 }
3233 }
3234 /* it all matches: let's use what's already there ! */
3235 RETURN(KERN_MEMORY_PRESENT);
3236 }
3237
3238 if (result != KERN_SUCCESS) {
3239 goto BailOut;
3240 }
3241
3242
3243 /*
3244 * At this point,
3245 * "start" and "end" should define the endpoints of the
3246 * available new range, and
3247 * "entry" should refer to the region before the new
3248 * range, and
3249 *
3250 * the map should be locked.
3251 */
3252
3253 /*
3254 * See whether we can avoid creating a new entry (and object) by
3255 * extending one of our neighbors. [So far, we only attempt to
3256 * extend from below.] Note that we can never extend/join
3257 * purgable objects because they need to remain distinct
3258 * entities in order to implement their "volatile object"
3259 * semantics.
3260 */
3261
3262 if (purgable ||
3263 entry_for_jit ||
3264 entry_for_tpro ||
3265 vm_memory_malloc_no_cow(user_alias)) {
3266 if (superpage_size) {
3267 /*
3268 * For "super page" allocations, we will allocate
3269 * special physically-contiguous VM objects later on,
3270 * so we should not have flags instructing us to create
3271 * a differently special VM object here.
3272 */
3273 RETURN(KERN_INVALID_ARGUMENT);
3274 }
3275
3276 if (object == VM_OBJECT_NULL) {
3277 assert(!superpage_size);
3278 object = vm_object_allocate(size);
3279 vm_object_lock(object);
3280 object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3281 VM_OBJECT_SET_TRUE_SHARE(object, FALSE);
3282 if (malloc_no_cow_except_fork &&
3283 !purgable &&
3284 !entry_for_jit &&
3285 !entry_for_tpro &&
3286 vm_memory_malloc_no_cow(user_alias)) {
3287 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY_FORK;
3288 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
3289 }
3290 if (entry_for_jit) {
3291 object->vo_inherit_copy_none = true;
3292 }
3293 if (purgable) {
3294 task_t owner;
3295 VM_OBJECT_SET_PURGABLE(object, VM_PURGABLE_NONVOLATILE);
3296 if (map->pmap == kernel_pmap) {
3297 /*
3298 * Purgeable mappings made in a kernel
3299 * map are "owned" by the kernel itself
3300 * rather than the current user task
3301 * because they're likely to be used by
3302 * more than this user task (see
3303 * execargs_purgeable_allocate(), for
3304 * example).
3305 */
3306 owner = kernel_task;
3307 } else {
3308 owner = current_task();
3309 }
3310 assert(object->vo_owner == NULL);
3311 assert(object->resident_page_count == 0);
3312 assert(object->wired_page_count == 0);
3313 vm_purgeable_nonvolatile_enqueue(object, owner);
3314 }
3315 vm_object_unlock(object);
3316 offset = (vm_object_offset_t)0;
3317 }
3318 } else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
3319 /* no coalescing if address space uses sub-pages */
3320 } else if ((is_submap == FALSE) &&
3321 (object == VM_OBJECT_NULL) &&
3322 (entry != vm_map_to_entry(map)) &&
3323 (entry->vme_end == start) &&
3324 (!entry->is_shared) &&
3325 (!entry->is_sub_map) &&
3326 (!entry->in_transition) &&
3327 (!entry->needs_wakeup) &&
3328 (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
3329 (entry->protection == cur_protection) &&
3330 (entry->max_protection == max_protection) &&
3331 (entry->inheritance == inheritance) &&
3332 ((user_alias == VM_MEMORY_REALLOC) ||
3333 (VME_ALIAS(entry) == alias)) &&
3334 (entry->no_cache == no_cache) &&
3335 (entry->vme_permanent == permanent) &&
3336 /* no coalescing for immutable executable mappings */
3337 !((entry->protection & VM_PROT_EXECUTE) &&
3338 entry->vme_permanent) &&
3339 (!entry->superpage_size && !superpage_size) &&
3340 /*
3341 * No coalescing if not map-aligned, to avoid propagating
3342 * that condition any further than needed:
3343 */
3344 (!entry->map_aligned || !clear_map_aligned) &&
3345 (!entry->zero_wired_pages) &&
3346 (!entry->used_for_jit && !entry_for_jit) &&
3347 #if __arm64e__
3348 (!entry->used_for_tpro && !entry_for_tpro) &&
3349 #endif
3350 (!entry->csm_associated) &&
3351 (entry->iokit_acct == iokit_acct) &&
3352 (!entry->vme_resilient_codesign) &&
3353 (!entry->vme_resilient_media) &&
3354 (!entry->vme_atomic) &&
3355 (entry->vme_no_copy_on_read == no_copy_on_read) &&
3356
3357 ((entry->vme_end - entry->vme_start) + size <=
3358 (user_alias == VM_MEMORY_REALLOC ?
3359 ANON_CHUNK_SIZE :
3360 NO_COALESCE_LIMIT)) &&
3361
3362 (entry->wired_count == 0)) { /* implies user_wired_count == 0 */
3363 if (vm_object_coalesce(VME_OBJECT(entry),
3364 VM_OBJECT_NULL,
3365 VME_OFFSET(entry),
3366 (vm_object_offset_t) 0,
3367 (vm_map_size_t)(entry->vme_end - entry->vme_start),
3368 (vm_map_size_t)(end - entry->vme_end))) {
3369 /*
3370 * Coalesced the two objects - can extend
3371 * the previous map entry to include the
3372 * new range.
3373 */
3374 map->size += (end - entry->vme_end);
3375 assert(entry->vme_start < end);
3376 assert(VM_MAP_PAGE_ALIGNED(end,
3377 VM_MAP_PAGE_MASK(map)));
3378 if (__improbable(vm_debug_events)) {
3379 DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
3380 }
3381 entry->vme_end = end;
3382 if (map->holelistenabled) {
3383 vm_map_store_update_first_free(map, entry, TRUE);
3384 } else {
3385 vm_map_store_update_first_free(map, map->first_free, TRUE);
3386 }
3387 new_mapping_established = TRUE;
3388 RETURN(KERN_SUCCESS);
3389 }
3390 }
3391
3392 step = superpage_size ? SUPERPAGE_SIZE : (end - start);
3393 new_entry = NULL;
3394
3395 if (vmk_flags.vmkf_submap_adjust) {
3396 vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
3397 offset = start;
3398 }
3399
3400 for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3401 tmp2_end = tmp2_start + step;
3402 /*
3403 * Create a new entry
3404 *
3405 * XXX FBDP
3406 * The reserved "page zero" in each process's address space can
3407 * be arbitrarily large. Splitting it into separate objects and
3408 * therefore different VM map entries serves no purpose and just
3409 * slows down operations on the VM map, so let's not split the
3410 * allocation into chunks if the max protection is NONE. That
3411 * memory should never be accessible, so it will never get to the
3412 * default pager.
3413 */
3414 tmp_start = tmp2_start;
3415 if (!is_submap &&
3416 object == VM_OBJECT_NULL &&
3417 size > chunk_size &&
3418 max_protection != VM_PROT_NONE &&
3419 superpage_size == 0) {
3420 tmp_end = tmp_start + chunk_size;
3421 } else {
3422 tmp_end = tmp2_end;
3423 }
3424 do {
3425 if (!is_submap &&
3426 object != VM_OBJECT_NULL &&
3427 object->internal &&
3428 offset + (tmp_end - tmp_start) > object->vo_size) {
3429 // printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3430 DTRACE_VM5(vm_map_enter_overmap,
3431 vm_map_t, map,
3432 vm_map_address_t, tmp_start,
3433 vm_map_address_t, tmp_end,
3434 vm_object_offset_t, offset,
3435 vm_object_size_t, object->vo_size);
3436 }
3437 new_entry = vm_map_entry_insert(map,
3438 entry, tmp_start, tmp_end,
3439 object, offset, vmk_flags,
3440 needs_copy,
3441 cur_protection, max_protection,
3442 (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3443 VM_INHERIT_NONE : inheritance),
3444 clear_map_aligned);
3445
3446 assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias));
3447
3448 if (resilient_codesign) {
3449 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3450 if (!((cur_protection | max_protection) & reject_prot)) {
3451 new_entry->vme_resilient_codesign = TRUE;
3452 }
3453 }
3454
3455 if (resilient_media &&
3456 (object == VM_OBJECT_NULL ||
3457 object->internal)) {
3458 new_entry->vme_resilient_media = TRUE;
3459 }
3460
3461 assert(!new_entry->iokit_acct);
3462 if (!is_submap &&
3463 object != VM_OBJECT_NULL &&
3464 object->internal &&
3465 (object->purgable != VM_PURGABLE_DENY ||
3466 object->vo_ledger_tag)) {
3467 assert(new_entry->use_pmap);
3468 assert(!new_entry->iokit_acct);
3469 /*
3470 * Turn off pmap accounting since
3471 * purgeable (or tagged) objects have their
3472 * own ledgers.
3473 */
3474 new_entry->use_pmap = FALSE;
3475 } else if (!is_submap &&
3476 iokit_acct &&
3477 object != VM_OBJECT_NULL &&
3478 object->internal) {
3479 /* alternate accounting */
3480 assert(!new_entry->iokit_acct);
3481 assert(new_entry->use_pmap);
3482 new_entry->iokit_acct = TRUE;
3483 new_entry->use_pmap = FALSE;
3484 DTRACE_VM4(
3485 vm_map_iokit_mapped_region,
3486 vm_map_t, map,
3487 vm_map_offset_t, new_entry->vme_start,
3488 vm_map_offset_t, new_entry->vme_end,
3489 int, VME_ALIAS(new_entry));
3490 vm_map_iokit_mapped_region(
3491 map,
3492 (new_entry->vme_end -
3493 new_entry->vme_start));
3494 } else if (!is_submap) {
3495 assert(!new_entry->iokit_acct);
3496 assert(new_entry->use_pmap);
3497 }
3498
3499 if (is_submap) {
3500 vm_map_t submap;
3501 boolean_t submap_is_64bit;
3502 boolean_t use_pmap;
3503
3504 assert(new_entry->is_sub_map);
3505 assert(!new_entry->use_pmap);
3506 assert(!new_entry->iokit_acct);
3507 submap = (vm_map_t) object;
3508 submap_is_64bit = vm_map_is_64bit(submap);
3509 use_pmap = vmk_flags.vmkf_nested_pmap;
3510 #ifndef NO_NESTED_PMAP
3511 if (use_pmap && submap->pmap == NULL) {
3512 ledger_t ledger = map->pmap->ledger;
3513 /* we need a sub pmap to nest... */
3514 submap->pmap = pmap_create_options(ledger, 0,
3515 submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3516 if (submap->pmap == NULL) {
3517 /* let's proceed without nesting... */
3518 }
3519 #if defined(__arm64__)
3520 else {
3521 pmap_set_nested(submap->pmap);
3522 }
3523 #endif
3524 }
3525 if (use_pmap && submap->pmap != NULL) {
3526 if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3527 DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3528 kr = KERN_FAILURE;
3529 } else {
3530 kr = pmap_nest(map->pmap,
3531 submap->pmap,
3532 tmp_start,
3533 tmp_end - tmp_start);
3534 }
3535 if (kr != KERN_SUCCESS) {
3536 printf("vm_map_enter: "
3537 "pmap_nest(0x%llx,0x%llx) "
3538 "error 0x%x\n",
3539 (long long)tmp_start,
3540 (long long)tmp_end,
3541 kr);
3542 } else {
3543 /* we're now nested ! */
3544 new_entry->use_pmap = TRUE;
3545 pmap_empty = FALSE;
3546 }
3547 }
3548 #endif /* NO_NESTED_PMAP */
3549 }
3550 entry = new_entry;
3551
3552 if (superpage_size) {
3553 vm_page_t pages, m;
3554 vm_object_t sp_object;
3555 vm_object_offset_t sp_offset;
3556
3557 assert(object == VM_OBJECT_NULL);
3558 VME_OFFSET_SET(entry, 0);
3559
3560 /* allocate one superpage */
3561 kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3562 if (kr != KERN_SUCCESS) {
3563 /* deallocate whole range... */
3564 new_mapping_established = TRUE;
3565 /* ... but only up to "tmp_end" */
3566 size -= end - tmp_end;
3567 RETURN(kr);
3568 }
3569
3570 /* create one vm_object per superpage */
3571 sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3572 vm_object_lock(sp_object);
3573 sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3574 VM_OBJECT_SET_PHYS_CONTIGUOUS(sp_object, TRUE);
3575 sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3576 VME_OBJECT_SET(entry, sp_object, false, 0);
3577 assert(entry->use_pmap);
3578
3579 /* enter the base pages into the object */
3580 for (sp_offset = 0;
3581 sp_offset < SUPERPAGE_SIZE;
3582 sp_offset += PAGE_SIZE) {
3583 m = pages;
3584 pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3585 pages = NEXT_PAGE(m);
3586 *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3587 vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3588 }
3589 vm_object_unlock(sp_object);
3590 }
3591 } while (tmp_end != tmp2_end &&
3592 (tmp_start = tmp_end) &&
3593 (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3594 tmp_end + chunk_size : tmp2_end));
3595 }
3596
3597 new_mapping_established = TRUE;
3598
3599
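/*
 * Common exit path: enforce the RLIMIT_AS/RLIMIT_DATA limits, notify the
 * pager that the object is being mapped, optionally wire the new range,
 * and, on failure, tear down any mappings we established and try to
 * restore the ones that were zapped earlier.
 */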
3600 BailOut:
3601 assert(map_locked == TRUE);
3602
3603 /*
3604 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3605 * If we have identified and possibly established the new mapping(s),
3606 * make sure we did not go beyond the address space limit.
3607 */
3608 if (result == KERN_SUCCESS) {
3609 if (map->size_limit != RLIM_INFINITY &&
3610 map->size > map->size_limit) {
3611 /*
3612 * Establishing the requested mappings would exceed
3613 * the process's RLIMIT_AS limit: fail with
3614 * KERN_NO_SPACE.
3615 */
3616 result = KERN_NO_SPACE;
3617 printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3618 proc_selfpid(),
3619 (get_bsdtask_info(current_task())
3620 ? proc_name_address(get_bsdtask_info(current_task()))
3621 : "?"),
3622 __FUNCTION__,
3623 (uint64_t) map->size,
3624 (uint64_t) map->size_limit);
3625 DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3626 vm_map_size_t, map->size,
3627 uint64_t, map->size_limit);
3628 vm_map_enter_RLIMIT_AS_count++;
3629 } else if (map->data_limit != RLIM_INFINITY &&
3630 map->size > map->data_limit) {
3631 /*
3632 * Establishing the requested mappings would exceed
3633 * the process's RLIMIT_DATA limit: fail with
3634 * KERN_NO_SPACE.
3635 */
3636 result = KERN_NO_SPACE;
3637 printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3638 proc_selfpid(),
3639 (get_bsdtask_info(current_task())
3640 ? proc_name_address(get_bsdtask_info(current_task()))
3641 : "?"),
3642 __FUNCTION__,
3643 (uint64_t) map->size,
3644 (uint64_t) map->data_limit);
3645 DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3646 vm_map_size_t, map->size,
3647 uint64_t, map->data_limit);
3648 vm_map_enter_RLIMIT_DATA_count++;
3649 }
3650 }
3651
3652 if (result == KERN_SUCCESS) {
3653 vm_prot_t pager_prot;
3654 memory_object_t pager;
3655
3656 #if DEBUG
3657 if (pmap_empty &&
3658 !(vmk_flags.vmkf_no_pmap_check)) {
3659 assert(pmap_is_empty(map->pmap,
3660 *address,
3661 *address + size));
3662 }
3663 #endif /* DEBUG */
3664
3665 /*
3666 * For "named" VM objects, let the pager know that the
3667 * memory object is being mapped. Some pagers need to keep
3668 * track of this, to know when they can reclaim the memory
3669 * object, for example.
3670 * VM calls memory_object_map() for each mapping (specifying
3671 * the protection of each mapping) and calls
3672 * memory_object_last_unmap() when all the mappings are gone.
3673 */
3674 pager_prot = max_protection;
3675 if (needs_copy) {
3676 /*
3677 * Copy-On-Write mapping: won't modify
3678 * the memory object.
3679 */
3680 pager_prot &= ~VM_PROT_WRITE;
3681 }
3682 if (!is_submap &&
3683 object != VM_OBJECT_NULL &&
3684 object->named &&
3685 object->pager != MEMORY_OBJECT_NULL) {
3686 vm_object_lock(object);
3687 pager = object->pager;
3688 if (object->named &&
3689 pager != MEMORY_OBJECT_NULL) {
3690 assert(object->pager_ready);
3691 vm_object_mapping_wait(object, THREAD_UNINT);
3692 /* object might have lost its pager while waiting */
3693 pager = object->pager;
3694 if (object->named && pager != MEMORY_OBJECT_NULL) {
3695 vm_object_mapping_begin(object);
3696 vm_object_unlock(object);
3697
3698 kr = memory_object_map(pager, pager_prot);
3699 assert(kr == KERN_SUCCESS);
3700
3701 vm_object_lock(object);
3702 vm_object_mapping_end(object);
3703 }
3704 }
3705 vm_object_unlock(object);
3706 }
3707 }
3708
3709 assert(map_locked == TRUE);
3710
3711 if (new_mapping_established) {
3712 /*
3713 * If we release the map lock for any reason below,
3714 * another thread could deallocate our new mapping,
3715 * releasing the caller's reference on "caller_object",
3716 * which was transferred to the mapping.
3717 * If this was the only reference, the object could be
3718 * destroyed.
3719 *
3720 * We need to take an extra reference on "caller_object"
3721 * to keep it alive if we need to return the caller's
3722 * reference to the caller in case of failure.
3723 */
3724 if (is_submap) {
3725 vm_map_reference((vm_map_t)caller_object);
3726 } else {
3727 vm_object_reference(caller_object);
3728 }
3729 }
3730
3731 if (!keep_map_locked) {
3732 vm_map_unlock(map);
3733 map_locked = FALSE;
3734 entry = VM_MAP_ENTRY_NULL;
3735 new_entry = VM_MAP_ENTRY_NULL;
3736 }
3737
3738 /*
3739 * We can't hold the map lock if we enter this block.
3740 */
3741
3742 if (result == KERN_SUCCESS) {
3743 /* Wire down the new entry if the user
3744 * requested all new map entries be wired.
3745 */
3746 if ((map->wiring_required) || (superpage_size)) {
3747 assert(!keep_map_locked);
3748 pmap_empty = FALSE; /* pmap won't be empty */
3749 kr = vm_map_wire_nested(map, start, end,
3750 cur_protection, VM_KERN_MEMORY_MLOCK,
3751 TRUE, PMAP_NULL, 0, NULL);
3752 result = kr;
3753 }
3754
3755 }
3756
3757 if (result != KERN_SUCCESS) {
3758 if (new_mapping_established) {
3759 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3760
3761 /*
3762 * We have to get rid of the new mappings since we
3763 * won't make them available to the user.
3764 * Try to do that atomically, to minimize the risk
3765 * that someone else creates new mappings in that range.
3766 */
3767 if (!map_locked) {
3768 vm_map_lock(map);
3769 map_locked = TRUE;
3770 }
3771 remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3772 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3773 if (permanent) {
3774 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3775 }
3776 (void) vm_map_delete(map,
3777 *address, *address + size,
3778 remove_flags,
3779 KMEM_GUARD_NONE, &zap_new_list);
3780 }
3781
3782 if (vm_map_zap_first_entry(&zap_old_list)) {
3783 vm_map_entry_t entry1, entry2;
3784
3785 /*
3786 * The new mapping failed. Attempt to restore
3787 * the old mappings, saved in the "zap_old_map".
3788 */
3789 if (!map_locked) {
3790 vm_map_lock(map);
3791 map_locked = TRUE;
3792 }
3793
3794 /* first check if the coast is still clear */
3795 start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3796 end = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3797
3798 if (vm_map_lookup_entry(map, start, &entry1) ||
3799 vm_map_lookup_entry(map, end, &entry2) ||
3800 entry1 != entry2) {
3801 /*
3802 * Part of that range has already been
3803 * re-mapped: we can't restore the old
3804 * mappings...
3805 */
3806 vm_map_enter_restore_failures++;
3807 } else {
3808 /*
3809 * Transfer the saved map entries from
3810 * "zap_old_map" to the original "map",
3811 * inserting them all after "entry1".
3812 */
3813 while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3814 vm_map_size_t entry_size;
3815
3816 entry_size = (entry2->vme_end -
3817 entry2->vme_start);
3818 vm_map_store_entry_link(map, entry1, entry2,
3819 VM_MAP_KERNEL_FLAGS_NONE);
3820 map->size += entry_size;
3821 entry1 = entry2;
3822 }
3823 if (map->wiring_required) {
3824 /*
3825 * XXX TODO: we should rewire the
3826 * old pages here...
3827 */
3828 }
3829 vm_map_enter_restore_successes++;
3830 }
3831 }
3832 }
3833
3834 /*
3835 * The caller is responsible for releasing the lock if it requested to
3836 * keep the map locked.
3837 */
3838 if (map_locked && !keep_map_locked) {
3839 vm_map_unlock(map);
3840 }
3841
3842 vm_map_zap_dispose(&zap_old_list);
3843 vm_map_zap_dispose(&zap_new_list);
3844
3845 if (new_mapping_established) {
3846 /*
3847 * The caller had a reference on "caller_object" and we
3848 * transferred that reference to the mapping.
3849 * We also took an extra reference on "caller_object" to keep
3850 * it alive while the map was unlocked.
3851 */
3852 if (result == KERN_SUCCESS) {
3853 /*
3854 * On success, the caller's reference on the object gets
3855 * transferred to the mapping.
3856 * Release our extra reference.
3857 */
3858 if (is_submap) {
3859 vm_map_deallocate((vm_map_t)caller_object);
3860 } else {
3861 vm_object_deallocate(caller_object);
3862 }
3863 } else {
3864 /*
3865 * On error, the caller expects to still have a
3866 * reference on the object it gave us.
3867 * Let's use our extra reference for that.
3868 */
3869 }
3870 }
3871
3872 return result;
3873
3874 #undef RETURN
3875 }
3876
3877 /*
3878 * Counters for the prefault optimization.
3879 */
3880 int64_t vm_prefault_nb_pages = 0;
3881 int64_t vm_prefault_nb_bailout = 0;
3882
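/*
 * vm_map_enter_adjust_offset:
 *
 * Shift an [*obj_offs, *obj_end) range forward by "quantity", failing with
 * KERN_INVALID_ARGUMENT if either addition overflows or if the adjusted end
 * rounds up to 0 when page-aligned (i.e. it wraps past the top of the
 * offset space).
 *
 * With illustrative values only: *obj_offs == 0x1000, *obj_end == 0x3000
 * and quantity == 0x2000 yield the range [0x3000, 0x5000); an adjusted end
 * that lands in the last page of the offset space rounds to 0 and fails.
 */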
3883 static kern_return_t
3884 vm_map_enter_adjust_offset(
3885 vm_object_offset_t *obj_offs,
3886 vm_object_offset_t *obj_end,
3887 vm_object_offset_t quantity)
3888 {
3889 if (os_add_overflow(*obj_offs, quantity, obj_offs) ||
3890 os_add_overflow(*obj_end, quantity, obj_end) ||
3891 vm_map_round_page_mask(*obj_end, PAGE_MASK) == 0) {
3892 return KERN_INVALID_ARGUMENT;
3893 }
3894
3895 return KERN_SUCCESS;
3896 }
3897
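/*
 * vm_map_enter_mem_object_sanitize:
 *
 * Validate and canonicalize the caller-provided ("unsafe") address, size,
 * mask, offset, protections and inheritance before vm_map_enter_mem_object()
 * uses them.  On success, the sanitized values are returned through the
 * map_addr/map_size/mask, obj_offs/obj_end/obj_size and
 * protection/inheritance out-parameters.
 */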
3898 static __attribute__((always_inline, warn_unused_result))
3899 kern_return_t
3900 vm_map_enter_mem_object_sanitize(
3901 vm_map_t target_map,
3902 vm_map_offset_ut address_u,
3903 vm_map_size_ut initial_size_u,
3904 vm_map_offset_ut mask_u,
3905 vm_object_offset_ut offset_u,
3906 vm_prot_ut cur_protection_u,
3907 vm_prot_ut max_protection_u,
3908 vm_inherit_ut inheritance_u,
3909 vm_map_kernel_flags_t vmk_flags,
3910 ipc_port_t port,
3911 vm_map_address_t *map_addr,
3912 vm_map_size_t *map_size,
3913 vm_map_offset_t *mask,
3914 vm_object_offset_t *obj_offs,
3915 vm_object_offset_t *obj_end,
3916 vm_object_size_t *obj_size,
3917 vm_prot_t *cur_protection,
3918 vm_prot_t *max_protection,
3919 vm_inherit_t *inheritance)
3920 {
3921 kern_return_t result;
3922
3923 result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
3924 VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
3925 VM_PROT_IS_MASK, cur_protection,
3926 max_protection);
3927 if (__improbable(result != KERN_SUCCESS)) {
3928 return result;
3929 }
3930
3931 result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3932 inheritance);
3933 if (__improbable(result != KERN_SUCCESS)) {
3934 return result;
3935 }
3936
3937 result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ, mask);
3938 if (__improbable(result != KERN_SUCCESS)) {
3939 return result;
3940 }
3941
3942 if (vmk_flags.vmf_fixed) {
3943 vm_map_address_t map_end;
3944
3945 result = vm_sanitize_addr_size(address_u, initial_size_u,
3946 VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3947 target_map,
3948 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS | VM_SANITIZE_FLAGS_REALIGN_START,
3949 map_addr, &map_end, map_size);
3950 if (__improbable(result != KERN_SUCCESS)) {
3951 return result;
3952 }
3953 } else {
3954 *map_addr = vm_sanitize_addr(target_map, address_u);
3955 result = vm_sanitize_size(0, initial_size_u,
3956 VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
3957 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
3958 if (__improbable(result != KERN_SUCCESS)) {
3959 return result;
3960 }
3961 }
3962
3963 *obj_size = vm_object_round_page(*map_size);
3964 if (__improbable(*obj_size == 0)) {
3965 return KERN_INVALID_ARGUMENT;
3966 }
3967
3968 if (IP_VALID(port)) {
3969 result = vm_sanitize_addr_size(offset_u, *obj_size,
3970 VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3971 PAGE_MASK,
3972 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
3973 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
3974 obj_offs, obj_end, obj_size);
3975 if (__improbable(result != KERN_SUCCESS)) {
3976 return result;
3977 }
3978 } else {
3979 *obj_offs = 0;
3980 *obj_end = *obj_size;
3981 }
3982
3983 return KERN_SUCCESS;
3984 }
3985
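/*
 * vm_map_enter_mem_object:
 *
 * Map the memory denoted by "port" into "target_map".  The port can be:
 *  - a named entry (IKOT_NAMED_ENTRY) backed by a VM object, a submap,
 *    or a vm_map_copy,
 *  - a raw memory object (IKOT_MEMORY_OBJECT), or
 *  - an invalid/null port, which yields an anonymous mapping with no
 *    backing object.
 * If "page_list" is provided, the pages it describes may be prefaulted
 * into the pmap right after the mapping is established.
 */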
3986 kern_return_t
3987 vm_map_enter_mem_object(
3988 vm_map_t target_map,
3989 vm_map_offset_ut *address_u,
3990 vm_map_size_ut initial_size_u,
3991 vm_map_offset_ut mask_u,
3992 vm_map_kernel_flags_t vmk_flags,
3993 ipc_port_t port,
3994 vm_object_offset_ut offset_u,
3995 boolean_t copy,
3996 vm_prot_ut cur_protection_u,
3997 vm_prot_ut max_protection_u,
3998 vm_inherit_ut inheritance_u,
3999 upl_page_list_ptr_t page_list,
4000 unsigned int page_list_count)
4001 {
4002 vm_map_offset_t mask;
4003 vm_prot_t cur_protection;
4004 vm_prot_t max_protection;
4005 vm_inherit_t inheritance;
4006 vm_map_address_t map_addr, map_mask;
4007 vm_map_size_t map_size;
4008 vm_object_t object = VM_OBJECT_NULL;
4009 vm_object_offset_t obj_offs, obj_end;
4010 vm_object_size_t obj_size;
4011 kern_return_t result;
4012 boolean_t mask_cur_protection, mask_max_protection;
4013 boolean_t kernel_prefault, try_prefault = (page_list_count != 0);
4014 vm_map_offset_t offset_in_mapping = 0;
4015
4016 if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4017 /* XXX TODO4K prefaulting depends on page size... */
4018 try_prefault = FALSE;
4019 }
4020
4021 /*
4022 * Check arguments for validity
4023 */
4024 if ((target_map == VM_MAP_NULL) ||
4025 (try_prefault && (copy || !page_list))) {
4026 return KERN_INVALID_ARGUMENT;
4027 }
4028
4029 map_mask = vm_map_page_mask(target_map);
4030
4031 /*
4032 * Sanitize any input parameters that are addr/size/prot/inherit
4033 */
4034 result = vm_map_enter_mem_object_sanitize(
4035 target_map,
4036 *address_u,
4037 initial_size_u,
4038 mask_u,
4039 offset_u,
4040 cur_protection_u,
4041 max_protection_u,
4042 inheritance_u,
4043 vmk_flags,
4044 port,
4045 &map_addr,
4046 &map_size,
4047 &mask,
4048 &obj_offs,
4049 &obj_end,
4050 &obj_size,
4051 &cur_protection,
4052 &max_protection,
4053 &inheritance);
4054 if (__improbable(result != KERN_SUCCESS)) {
4055 return vm_sanitize_get_kr(result);
4056 }
4057
4058 assertf(vmk_flags.__vmkf_unused2 == 0, "vmk_flags unused2=0x%llx\n", vmk_flags.__vmkf_unused2);
4059 vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, map_size);
4060
4061 mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4062 mask_max_protection = max_protection & VM_PROT_IS_MASK;
4063 cur_protection &= ~VM_PROT_IS_MASK;
4064 max_protection &= ~VM_PROT_IS_MASK;
4065
4066 #if __arm64__
4067 if (cur_protection & VM_PROT_EXECUTE) {
4068 cur_protection |= VM_PROT_READ;
4069 }
4070 #endif /* __arm64__ */
4071
4072 /*
4073 * Find the vm object (if any) corresponding to this port.
4074 */
4075 if (!IP_VALID(port)) {
4076 object = VM_OBJECT_NULL;
4077 copy = FALSE;
4078 } else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4079 vm_named_entry_t named_entry;
4080 vm_object_size_t initial_size;
4081
4082 named_entry = mach_memory_entry_from_port(port);
4083
4084 if (vmk_flags.vmf_return_data_addr ||
4085 vmk_flags.vmf_return_4k_data_addr) {
4086 result = vm_map_enter_adjust_offset(&obj_offs,
4087 &obj_end, named_entry->data_offset);
4088 if (__improbable(result)) {
4089 return result;
4090 }
4091 }
4092
4093 /* a few checks to make sure user is obeying rules */
4094 if (mask_max_protection) {
4095 max_protection &= named_entry->protection;
4096 }
4097 if (mask_cur_protection) {
4098 cur_protection &= named_entry->protection;
4099 }
4100 if ((named_entry->protection & max_protection) !=
4101 max_protection) {
4102 return KERN_INVALID_RIGHT;
4103 }
4104 if ((named_entry->protection & cur_protection) !=
4105 cur_protection) {
4106 return KERN_INVALID_RIGHT;
4107 }
4108
4109 /*
4110 * unwrapping is safe: "obj_size", the rounded-up form of this size,
4111 * was validated above and does not overflow
4112 */
4113 initial_size = VM_SANITIZE_UNSAFE_UNWRAP(initial_size_u);
4114 if (named_entry->size < obj_offs + initial_size) {
4115 return KERN_INVALID_ARGUMENT;
4116 }
4117
4118 /* for a vm_map_copy, we can only map it whole */
4119 if (named_entry->is_copy &&
4120 (obj_size != named_entry->size) &&
4121 (vm_map_round_page(obj_size, map_mask) == named_entry->size)) {
4122 /* XXX FBDP use the rounded size... */
4123 obj_end += named_entry->size - obj_size;
4124 obj_size = named_entry->size;
4125 }
4126
4127 if (named_entry->offset) {
4128 /*
4129 * the caller's "offset" parameter is relative to the start of the
4130 * named entry, which itself starts at "named_entry->offset" in the object
4131 *
4132 * Because we checked above that
4133 * obj_offs + obj_size < named_entry_size
4134 * these overflow checks should be redundant...
4135 */
4136 result = vm_map_enter_adjust_offset(&obj_offs,
4137 &obj_end, named_entry->offset);
4138 if (__improbable(result)) {
4139 return result;
4140 }
4141 }
4142
4143 if (!VM_MAP_PAGE_ALIGNED(obj_size, map_mask)) {
4144 /*
4145 * Let's not map more than requested;
4146 * vm_map_enter() will handle this "not map-aligned"
4147 * case.
4148 */
4149 map_size = obj_size;
4150 }
4151
4152 named_entry_lock(named_entry);
4153
4154 // rdar://130307561 (Combine copy, object, and submap fields of vm_named_entry into an enum)
4155 assert(named_entry->is_copy || named_entry->is_object || named_entry->is_sub_map);
4156
4157 if (named_entry->is_sub_map) {
4158 vm_map_t submap;
4159
4160 assert(!named_entry->is_copy);
4161 assert(!named_entry->is_object);
4162
4163 if (vmk_flags.vmf_return_data_addr ||
4164 vmk_flags.vmf_return_4k_data_addr) {
4165 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4166 }
4167
4168 submap = named_entry->backing.map;
4169 vm_map_reference(submap);
4170 named_entry_unlock(named_entry);
4171
4172 vmk_flags.vmkf_submap = TRUE;
4173 result = vm_map_enter(target_map,
4174 &map_addr,
4175 map_size,
4176 mask,
4177 vmk_flags,
4178 (vm_object_t)(uintptr_t) submap,
4179 obj_offs,
4180 copy,
4181 cur_protection,
4182 max_protection,
4183 inheritance);
4184 if (result != KERN_SUCCESS) {
4185 vm_map_deallocate(submap);
4186 return result;
4187 }
4188 /*
4189 * No need to lock "submap" just to check its
4190 * "mapped" flag: that flag is never reset
4191 * once it's been set and if we race, we'll
4192 * just end up setting it twice, which is OK.
4193 */
4194 if (submap->mapped_in_other_pmaps == FALSE &&
4195 vm_map_pmap(submap) != PMAP_NULL &&
4196 vm_map_pmap(submap) !=
4197 vm_map_pmap(target_map)) {
4198 /*
4199 * This submap is being mapped in a map
4200 * that uses a different pmap.
4201 * Set its "mapped_in_other_pmaps" flag
4202 * to indicate that we now need to
4203 * remove mappings from all pmaps rather
4204 * than just the submap's pmap.
4205 */
4206 vm_map_lock(submap);
4207 submap->mapped_in_other_pmaps = TRUE;
4208 vm_map_unlock(submap);
4209 }
4210 goto out;
4211 }
4212
4213 if (named_entry->is_copy) {
4214 kern_return_t kr;
4215 vm_map_copy_t copy_map;
4216 vm_map_entry_t copy_entry;
4217 vm_map_offset_t copy_addr;
4218 vm_map_copy_t target_copy_map;
4219 vm_map_offset_t overmap_start, overmap_end;
4220 vm_map_offset_t trimmed_start;
4221 vm_map_size_t target_size;
4222
4223 assert(!named_entry->is_object);
4224 assert(!named_entry->is_sub_map);
4225
4226 int allowed_flags = VM_FLAGS_FIXED |
4227 VM_FLAGS_ANYWHERE |
4228 VM_FLAGS_OVERWRITE |
4229 VM_FLAGS_RETURN_4K_DATA_ADDR |
4230 VM_FLAGS_RETURN_DATA_ADDR;
4231
4232 if (!vm_map_kernel_flags_check_vmflags(vmk_flags, allowed_flags)) {
4233 named_entry_unlock(named_entry);
4234 return KERN_INVALID_ARGUMENT;
4235 }
4236
4237 copy_map = named_entry->backing.copy;
4238 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4239 if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4240 /* unsupported type; should not happen */
4241 printf("vm_map_enter_mem_object: "
4242 "memory_entry->backing.copy "
4243 "unsupported type 0x%x\n",
4244 copy_map->type);
4245 named_entry_unlock(named_entry);
4246 return KERN_INVALID_ARGUMENT;
4247 }
4248
4249 if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4250 DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, obj_offs, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4251 }
4252
4253 if (vmk_flags.vmf_return_data_addr ||
4254 vmk_flags.vmf_return_4k_data_addr) {
4255 offset_in_mapping = obj_offs & map_mask;
4256 if (vmk_flags.vmf_return_4k_data_addr) {
4257 offset_in_mapping &= ~((signed)(0xFFF));
4258 }
4259 }
4260
4261 target_copy_map = VM_MAP_COPY_NULL;
4262 target_size = copy_map->size;
4263 overmap_start = 0;
4264 overmap_end = 0;
4265 trimmed_start = 0;
4266 if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4267 DEBUG4K_ADJUST("adjusting...\n");
4268 kr = vm_map_copy_adjust_to_target(
4269 copy_map,
4270 obj_offs,
4271 initial_size,
4272 target_map,
4273 copy,
4274 &target_copy_map,
4275 &overmap_start,
4276 &overmap_end,
4277 &trimmed_start);
4278 if (kr != KERN_SUCCESS) {
4279 named_entry_unlock(named_entry);
4280 return kr;
4281 }
4282 target_size = target_copy_map->size;
4283 } else {
4284 /*
4285 * Assert that the vm_map_copy is coming from the right
4286 * zone and hasn't been forged
4287 */
4288 vm_map_copy_require(copy_map);
4289 target_copy_map = copy_map;
4290 }
4291
4292 vm_map_kernel_flags_t rsv_flags = vmk_flags;
4293
4294 vm_map_kernel_flags_and_vmflags(&rsv_flags,
4295 (VM_FLAGS_FIXED |
4296 VM_FLAGS_ANYWHERE |
4297 VM_FLAGS_OVERWRITE |
4298 VM_FLAGS_RETURN_4K_DATA_ADDR |
4299 VM_FLAGS_RETURN_DATA_ADDR));
4300
4301 /* reserve a contiguous range */
4302 kr = vm_map_enter(target_map,
4303 &map_addr,
4304 vm_map_round_page(target_size, map_mask),
4305 mask,
4306 rsv_flags,
4307 VM_OBJECT_NULL,
4308 0,
4309 FALSE, /* copy */
4310 cur_protection,
4311 max_protection,
4312 inheritance);
4313 if (kr != KERN_SUCCESS) {
4314 DEBUG4K_ERROR("kr 0x%x\n", kr);
4315 if (target_copy_map != copy_map) {
4316 vm_map_copy_discard(target_copy_map);
4317 target_copy_map = VM_MAP_COPY_NULL;
4318 }
4319 named_entry_unlock(named_entry);
4320 return kr;
4321 }
4322
4323 copy_addr = map_addr;
4324
4325 for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4326 copy_entry != vm_map_copy_to_entry(target_copy_map);
4327 copy_entry = copy_entry->vme_next) {
4328 vm_map_t copy_submap = VM_MAP_NULL;
4329 vm_object_t copy_object = VM_OBJECT_NULL;
4330 vm_map_size_t copy_size;
4331 vm_object_offset_t copy_offset;
4332 boolean_t do_copy = false;
4333
4334 if (copy_entry->is_sub_map) {
4335 copy_submap = VME_SUBMAP(copy_entry);
4336 copy_object = (vm_object_t)copy_submap;
4337 } else {
4338 copy_object = VME_OBJECT(copy_entry);
4339 }
4340 copy_offset = VME_OFFSET(copy_entry);
4341 copy_size = (copy_entry->vme_end -
4342 copy_entry->vme_start);
4343
4344 /* sanity check */
4345 if ((copy_addr + copy_size) >
4346 (map_addr +
4347 overmap_start + overmap_end +
4348 named_entry->size /* XXX full size */)) {
4349 /* over-mapping too much !? */
4350 kr = KERN_INVALID_ARGUMENT;
4351 DEBUG4K_ERROR("kr 0x%x\n", kr);
4352 /* abort */
4353 break;
4354 }
4355
4356 /* take a reference on the object */
4357 if (copy_entry->is_sub_map) {
4358 vm_map_reference(copy_submap);
4359 } else {
4360 if (!copy &&
4361 copy_object != VM_OBJECT_NULL &&
4362 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4363 bool is_writable;
4364
4365 /*
4366 * We need to resolve our side of this
4367 * "symmetric" copy-on-write now; we
4368 * need a new object to map and share,
4369 * instead of the current one which
4370 * might still be shared with the
4371 * original mapping.
4372 *
4373 * Note: A "vm_map_copy_t" does not
4374 * have a lock but we're protected by
4375 * the named entry's lock here.
4376 */
4377 // assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4378 VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4379 assert(copy_object != VME_OBJECT(copy_entry));
4380 is_writable = false;
4381 if (copy_entry->protection & VM_PROT_WRITE) {
4382 is_writable = true;
4383 #if __arm64e__
4384 } else if (copy_entry->used_for_tpro) {
4385 is_writable = true;
4386 #endif /* __arm64e__ */
4387 }
4388 if (!copy_entry->needs_copy && is_writable) {
4389 vm_prot_t prot;
4390
4391 prot = copy_entry->protection & ~VM_PROT_WRITE;
4392 vm_object_pmap_protect(copy_object,
4393 copy_offset,
4394 copy_size,
4395 PMAP_NULL,
4396 PAGE_SIZE,
4397 0,
4398 prot);
4399 }
4400 copy_entry->needs_copy = FALSE;
4401 copy_entry->is_shared = TRUE;
4402 copy_object = VME_OBJECT(copy_entry);
4403 copy_offset = VME_OFFSET(copy_entry);
4404 vm_object_lock(copy_object);
4405 /* we're about to make a shared mapping of this object */
4406 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4407 VM_OBJECT_SET_TRUE_SHARE(copy_object, TRUE);
4408 vm_object_unlock(copy_object);
4409 }
4410
4411 if (copy_object != VM_OBJECT_NULL &&
4412 copy_object->named &&
4413 copy_object->pager != MEMORY_OBJECT_NULL &&
4414 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4415 memory_object_t pager;
4416 vm_prot_t pager_prot;
4417
4418 /*
4419 * For "named" VM objects, let the pager know that the
4420 * memory object is being mapped. Some pagers need to keep
4421 * track of this, to know when they can reclaim the memory
4422 * object, for example.
4423 * VM calls memory_object_map() for each mapping (specifying
4424 * the protection of each mapping) and calls
4425 * memory_object_last_unmap() when all the mappings are gone.
4426 */
4427 pager_prot = max_protection;
4428 if (copy) {
4429 /*
4430 * Copy-On-Write mapping: won't modify the
4431 * memory object.
4432 */
4433 pager_prot &= ~VM_PROT_WRITE;
4434 }
4435 vm_object_lock(copy_object);
4436 pager = copy_object->pager;
4437 if (copy_object->named &&
4438 pager != MEMORY_OBJECT_NULL &&
4439 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4440 assert(copy_object->pager_ready);
4441 vm_object_mapping_wait(copy_object, THREAD_UNINT);
4442 /*
4443 * Object might have lost its pager
4444 * while waiting.
4445 */
4446 pager = copy_object->pager;
4447 if (copy_object->named &&
4448 pager != MEMORY_OBJECT_NULL) {
4449 vm_object_mapping_begin(copy_object);
4450 vm_object_unlock(copy_object);
4451
4452 kr = memory_object_map(pager, pager_prot);
4453 assert(kr == KERN_SUCCESS);
4454
4455 vm_object_lock(copy_object);
4456 vm_object_mapping_end(copy_object);
4457 }
4458 }
4459 vm_object_unlock(copy_object);
4460 }
4461
4462 /*
4463 * Perform the copy if requested
4464 */
4465
4466 if (copy && copy_object != VM_OBJECT_NULL) {
4467 vm_object_t new_object;
4468 vm_object_offset_t new_offset;
4469
4470 result = vm_object_copy_strategically(copy_object, copy_offset,
4471 copy_size,
4472 false, /* forking */
4473 &new_object, &new_offset,
4474 &do_copy);
4475
4476
4477 if (result == KERN_MEMORY_RESTART_COPY) {
4478 boolean_t success;
4479 boolean_t src_needs_copy;
4480
4481 /*
4482 * XXX
4483 * We currently ignore src_needs_copy.
4484 * This really is the issue of how to make
4485 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4486 * non-kernel users to use. Solution forthcoming.
4487 * In the meantime, since we don't allow non-kernel
4488 * memory managers to specify symmetric copy,
4489 * we won't run into problems here.
4490 */
4491 new_object = copy_object;
4492 new_offset = copy_offset;
4493 success = vm_object_copy_quickly(new_object,
4494 new_offset,
4495 copy_size,
4496 &src_needs_copy,
4497 &do_copy);
4498 assert(success);
4499 result = KERN_SUCCESS;
4500 }
4501 if (result != KERN_SUCCESS) {
4502 kr = result;
4503 break;
4504 }
4505
4506 copy_object = new_object;
4507 copy_offset = new_offset;
4508 /*
4509 * No extra object reference for the mapping:
4510 * the mapping should be the only thing keeping
4511 * this new object alive.
4512 */
4513 } else {
4514 /*
4515 * We already have the right object
4516 * to map.
4517 */
4518 copy_object = VME_OBJECT(copy_entry);
4519 /* take an extra ref for the mapping below */
4520 vm_object_reference(copy_object);
4521 }
4522 }
4523
4524 /*
4525 * If the caller does not want a specific
4526 * tag for this new mapping: use
4527 * the tag of the original mapping.
4528 */
4529 vm_map_kernel_flags_t vmk_remap_flags = {
4530 .vmkf_submap = copy_entry->is_sub_map,
4531 };
4532
4533 vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
4534 vm_map_kernel_flags_vmflags(vmk_flags),
4535 vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4536
4537 /* over-map the object into destination */
4538 vmk_remap_flags.vmf_fixed = true;
4539 vmk_remap_flags.vmf_overwrite = true;
4540
4541 if (!copy && !copy_entry->is_sub_map) {
4542 /*
4543 * copy-on-write should have been
4544 * resolved at this point, or we would
4545 * end up sharing instead of copying.
4546 */
4547 assert(!copy_entry->needs_copy);
4548 }
4549 #if XNU_TARGET_OS_OSX
4550 if (copy_entry->used_for_jit) {
4551 vmk_remap_flags.vmkf_map_jit = TRUE;
4552 }
4553 #endif /* XNU_TARGET_OS_OSX */
4554
4555 kr = vm_map_enter(target_map,
4556 &copy_addr,
4557 copy_size,
4558 (vm_map_offset_t) 0,
4559 vmk_remap_flags,
4560 copy_object,
4561 copy_offset,
4562 ((copy_object == NULL)
4563 ? FALSE
4564 : (copy || copy_entry->needs_copy)),
4565 cur_protection,
4566 max_protection,
4567 inheritance);
4568 if (kr != KERN_SUCCESS) {
4569 DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4570 if (copy_entry->is_sub_map) {
4571 vm_map_deallocate(copy_submap);
4572 } else {
4573 vm_object_deallocate(copy_object);
4574 }
4575 /* abort */
4576 break;
4577 }
4578
4579 /* next mapping */
4580 copy_addr += copy_size;
4581 }
4582
4583 named_entry_unlock(named_entry);
4584 if (target_copy_map != copy_map) {
4585 vm_map_copy_discard(target_copy_map);
4586 target_copy_map = VM_MAP_COPY_NULL;
4587 }
4588
4589 if (kr == KERN_SUCCESS) {
4590 if (overmap_start) {
4591 DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t)offset_in_mapping, (uint64_t)overmap_start, (uint64_t)(map_addr + offset_in_mapping + overmap_start));
4592 }
4593 offset_in_mapping += overmap_start;
4594 } else if (!vmk_flags.vmf_overwrite) {
4595 /* deallocate the contiguous range */
4596 vm_map_remove(target_map, map_addr,
4597 map_addr + map_size);
4598 }
4599 result = kr;
4600 goto out;
4601 }
4602
4603 if (named_entry->is_object) {
4604 unsigned int access;
4605 unsigned int wimg_mode;
4606
4607 assert(!named_entry->is_copy);
4608 assert(!named_entry->is_sub_map);
4609
4610 /* we are mapping a VM object */
4611
4612 access = named_entry->access;
4613
4614 if (vmk_flags.vmf_return_data_addr ||
4615 vmk_flags.vmf_return_4k_data_addr) {
4616 offset_in_mapping = obj_offs & map_mask;
4617 if (vmk_flags.vmf_return_4k_data_addr) {
4618 offset_in_mapping &= ~((signed)(0xFFF));
4619 }
4620 obj_offs -= offset_in_mapping;
4621 map_size = vm_map_round_page(initial_size +
4622 offset_in_mapping, map_mask);
4623 }
4624
4625 object = vm_named_entry_to_vm_object(named_entry);
4626 assert(object != VM_OBJECT_NULL);
4627 vm_object_lock(object);
4628 named_entry_unlock(named_entry);
4629
4630 wimg_mode = object->wimg_bits;
4631 vm_prot_to_wimg(access, &wimg_mode);
4632 if (object->wimg_bits != wimg_mode) {
4633 vm_object_change_wimg_mode(object, wimg_mode);
4634 }
4635
4636 vm_object_reference_locked(object);
4637 vm_object_unlock(object);
4638 } else {
4639 panic("invalid VM named entry %p", named_entry);
4640 }
4641 } else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4642 /*
4643 * JMM - This is temporary until we unify named entries
4644 * and raw memory objects.
4645 *
4646 * Detected fake ip_kotype for a memory object. In
4647 * this case, the port isn't really a port at all, but
4648 * instead is just a raw memory object.
4649 */
4650 if (vmk_flags.vmf_return_data_addr ||
4651 vmk_flags.vmf_return_4k_data_addr) {
4652 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4653 }
4654
4655 object = memory_object_to_vm_object((memory_object_t)port);
4656 if (object == VM_OBJECT_NULL) {
4657 return KERN_INVALID_OBJECT;
4658 }
4659 vm_object_reference(object);
4660
4661 /* wait for object (if any) to be ready */
4662 if (object != VM_OBJECT_NULL) {
4663 if (is_kernel_object(object)) {
4664 printf("Warning: Attempt to map kernel object"
4665 " by a non-private kernel entity\n");
4666 return KERN_INVALID_OBJECT;
4667 }
4668 if (!object->pager_ready) {
4669 vm_object_lock(object);
4670
4671 while (!object->pager_ready) {
4672 vm_object_sleep(object,
4673 VM_OBJECT_EVENT_PAGER_READY,
4674 THREAD_UNINT,
4675 LCK_SLEEP_EXCLUSIVE);
4676 }
4677 vm_object_unlock(object);
4678 }
4679 }
4680 } else {
4681 return KERN_INVALID_OBJECT;
4682 }
4683
4684 if (object != VM_OBJECT_NULL &&
4685 object->named &&
4686 object->pager != MEMORY_OBJECT_NULL &&
4687 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4688 memory_object_t pager;
4689 vm_prot_t pager_prot;
4690 kern_return_t kr;
4691
4692 /*
4693 * For "named" VM objects, let the pager know that the
4694 * memory object is being mapped. Some pagers need to keep
4695 * track of this, to know when they can reclaim the memory
4696 * object, for example.
4697 * VM calls memory_object_map() for each mapping (specifying
4698 * the protection of each mapping) and calls
4699 * memory_object_last_unmap() when all the mappings are gone.
4700 */
4701 pager_prot = max_protection;
4702 if (copy) {
4703 /*
4704 * Copy-On-Write mapping: won't modify the
4705 * memory object.
4706 */
4707 pager_prot &= ~VM_PROT_WRITE;
4708 }
4709 vm_object_lock(object);
4710 pager = object->pager;
4711 if (object->named &&
4712 pager != MEMORY_OBJECT_NULL &&
4713 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4714 assert(object->pager_ready);
4715 vm_object_mapping_wait(object, THREAD_UNINT);
4716 /* object might have lost its pager while waiting */
4717 pager = object->pager;
4718 if (object->named && pager != MEMORY_OBJECT_NULL) {
4719 vm_object_mapping_begin(object);
4720 vm_object_unlock(object);
4721
4722 kr = memory_object_map(pager, pager_prot);
4723 assert(kr == KERN_SUCCESS);
4724
4725 vm_object_lock(object);
4726 vm_object_mapping_end(object);
4727 }
4728 }
4729 vm_object_unlock(object);
4730 }
4731
4732 /*
4733 * Perform the copy if requested
4734 */
4735
4736 if (copy) {
4737 vm_object_t new_object;
4738 vm_object_offset_t new_offset;
4739
4740 result = vm_object_copy_strategically(object,
4741 obj_offs,
4742 map_size,
4743 false, /* forking */
4744 &new_object, &new_offset,
4745 &copy);
4746
4747
4748 if (result == KERN_MEMORY_RESTART_COPY) {
4749 boolean_t success;
4750 boolean_t src_needs_copy;
4751
4752 /*
4753 * XXX
4754 * We currently ignore src_needs_copy.
4755 * This really is the issue of how to make
4756 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4757 * non-kernel users to use. Solution forthcoming.
4758 * In the meantime, since we don't allow non-kernel
4759 * memory managers to specify symmetric copy,
4760 * we won't run into problems here.
4761 */
4762 new_object = object;
4763 new_offset = obj_offs;
4764 success = vm_object_copy_quickly(new_object,
4765 new_offset,
4766 map_size,
4767 &src_needs_copy,
4768 &copy);
4769 assert(success);
4770 result = KERN_SUCCESS;
4771 }
4772 /*
4773 * Throw away the reference to the
4774 * original object, as it won't be mapped.
4775 */
4776
4777 vm_object_deallocate(object);
4778
4779 if (result != KERN_SUCCESS) {
4780 return result;
4781 }
4782
4783 object = new_object;
4784 obj_offs = new_offset;
4785 }
4786
4787 /*
4788 * If non-kernel users want to try to prefault pages, the mapping and prefault
4789 * need to be atomic.
4790 */
4791 kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4792 vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4793
4794 result = vm_map_enter(target_map,
4795 &map_addr, map_size,
4796 (vm_map_offset_t)mask,
4797 vmk_flags,
4798 object, obj_offs,
4799 copy,
4800 cur_protection, max_protection,
4801 inheritance);
4802 if (result != KERN_SUCCESS) {
4803 vm_object_deallocate(object);
4804 }
4805
4806 /*
4807 * Try to prefault, and do not forget to release the vm map lock.
4808 */
4809 if (result == KERN_SUCCESS && try_prefault) {
4810 mach_vm_address_t va = map_addr;
4811 kern_return_t kr = KERN_SUCCESS;
4812 unsigned int i = 0;
4813 int pmap_options;
4814
4815 pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4816
4817 for (i = 0; i < page_list_count; ++i) {
4818 if (!UPL_VALID_PAGE(page_list, i)) {
4819 if (kernel_prefault) {
4820 assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4821 result = KERN_MEMORY_ERROR;
4822 break;
4823 }
4824 } else {
4825 /*
4826 * If this call fails, stop trying to optimize:
4827 * the remaining calls are likely going to fail
4828 * too.
4829 *
4830 * We do not report an error for such a failure,
4831 * though: prefaulting is an optimization, not
4832 * something critical.
4833 */
4834 kr = pmap_enter_object_options_check(target_map->pmap,
4835 va, 0, object, UPL_PHYS_PAGE(page_list, i),
4836 cur_protection, VM_PROT_NONE,
4837 TRUE, pmap_options);
4838 if (kr != KERN_SUCCESS) {
4839 OSIncrementAtomic64(&vm_prefault_nb_bailout);
4840 if (kernel_prefault) {
4841 result = kr;
4842 }
4843 break;
4844 }
4845 OSIncrementAtomic64(&vm_prefault_nb_pages);
4846 }
4847
4848 /* Next virtual address */
4849 va += PAGE_SIZE;
4850 }
4851 if (vmk_flags.vmkf_keep_map_locked) {
4852 vm_map_unlock(target_map);
4853 }
4854 }
4855
4856 out:
4857 if (result == KERN_SUCCESS) {
4858 #if KASAN
4859 if (target_map->pmap == kernel_pmap) {
4860 kasan_notify_address(map_addr, map_size);
4861 }
4862 #endif
4863 *address_u = vm_sanitize_wrap_addr(map_addr + offset_in_mapping);
4864 }
4865 return result;
4866 }
4867
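/*
 * vm_map_enter_mem_object_prefault:
 *
 * Convenience wrapper around vm_map_enter_mem_object() for callers that
 * want the mapping established and the supplied page list prefaulted in a
 * single call: "copy" is forced to FALSE and inheritance to
 * VM_INHERIT_DEFAULT.
 */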
4868 kern_return_t
4869 vm_map_enter_mem_object_prefault(
4870 vm_map_t target_map,
4871 vm_map_offset_ut *address,
4872 vm_map_size_ut initial_size,
4873 vm_map_offset_ut mask,
4874 vm_map_kernel_flags_t vmk_flags,
4875 ipc_port_t port,
4876 vm_object_offset_ut offset,
4877 vm_prot_ut cur_protection,
4878 vm_prot_ut max_protection,
4879 upl_page_list_ptr_t page_list,
4880 unsigned int page_list_count)
4881 {
4882 /* range_id is set by vm_map_enter_mem_object */
4883 return vm_map_enter_mem_object(target_map,
4884 address,
4885 initial_size,
4886 mask,
4887 vmk_flags,
4888 port,
4889 offset,
4890 FALSE,
4891 cur_protection,
4892 max_protection,
4893 VM_INHERIT_DEFAULT,
4894 page_list,
4895 page_list_count);
4896 }
4897
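/*
 * vm_map_enter_mem_object_control_sanitize:
 *
 * Same role as vm_map_enter_mem_object_sanitize(), but for mappings backed
 * by a memory object control: validate protections, inheritance, mask,
 * offset and size, and check that a fixed mapping's end address does not
 * overflow the target map's address space.
 */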
4898 static __attribute__((always_inline, warn_unused_result))
4899 kern_return_t
4900 vm_map_enter_mem_object_control_sanitize(
4901 vm_map_t target_map,
4902 vm_map_offset_ut address_u,
4903 vm_map_size_ut initial_size_u,
4904 vm_map_offset_ut mask_u,
4905 vm_object_offset_ut offset_u,
4906 vm_prot_ut cur_protection_u,
4907 vm_prot_ut max_protection_u,
4908 vm_inherit_ut inheritance_u,
4909 vm_map_kernel_flags_t vmk_flags,
4910 vm_map_address_t *map_addr,
4911 vm_map_size_t *map_size,
4912 vm_map_offset_t *mask,
4913 vm_object_offset_t *obj_offs,
4914 vm_object_offset_t *obj_end,
4915 vm_object_size_t *obj_size,
4916 vm_prot_t *cur_protection,
4917 vm_prot_t *max_protection,
4918 vm_inherit_t *inheritance)
4919 {
4920 kern_return_t kr;
4921
4922 kr = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
4923 VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
4924 cur_protection, max_protection);
4925 if (__improbable(kr != KERN_SUCCESS)) {
4926 return kr;
4927 }
4928
4929 kr = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL,
4930 inheritance);
4931 if (__improbable(kr != KERN_SUCCESS)) {
4932 return kr;
4933 }
4934
4935 kr = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, mask);
4936 if (__improbable(kr != KERN_SUCCESS)) {
4937 return kr;
4938 }
4939 /*
4940 * Ensure arithmetic doesn't overflow in vm_object space (kernel
4941 * pages).
4942 * We keep unaligned values for now. The call we eventually make to
4943 * vm_map_enter does guarantee that offset_u is page aligned for EITHER
4944 * target_map pages or kernel pages. But this isn't enough to guarantee
4945 * kernel space alignment.
4946 */
4947 kr = vm_sanitize_addr_size(offset_u, initial_size_u,
4948 VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, PAGE_MASK,
4949 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
4950 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
4951 obj_offs, obj_end, obj_size);
4952 if (__improbable(kr != KERN_SUCCESS)) {
4953 return kr;
4954 }
4955
4956 /*
4957 * There is no vm_sanitize_addr_size variant that also adjusts for
4958 * a separate offset. Rather than create one for this one-off issue,
4959 * we sanitize map_addr and map_size individually, relying on
4960 * vm_sanitize_size to incorporate the offset. Then, we perform the
4961 * overflow check manually below.
4962 */
4963 *map_addr = vm_sanitize_addr(target_map, address_u);
4964 kr = vm_sanitize_size(offset_u, initial_size_u,
4965 VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
4966 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
4967 if (__improbable(kr != KERN_SUCCESS)) {
4968 return kr;
4969 }
4970
4971 /*
4972 * Ensure arithmetic doesn't overflow in target_map space.
4973 * The computation of map_size above accounts for the possibility that
4974 * offset_u might be unaligned in target_map space.
4975 */
4976 if (vmk_flags.vmf_fixed) {
4977 vm_map_address_t map_end;
4978
4979 if (__improbable(os_add_overflow(*map_addr, *map_size, &map_end))) {
4980 return KERN_INVALID_ARGUMENT;
4981 }
4982 }
4983
4984 return KERN_SUCCESS;
4985 }
4986
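/*
 * vm_map_enter_mem_object_control:
 *
 * Map the VM object associated with "control" into "target_map".
 * Unlike vm_map_enter_mem_object(), the object is designated directly by a
 * memory object control rather than by a port, and the returned address
 * always reflects the offset within the first page (vmf_return_data_addr
 * behavior is forced on below).
 */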
4987 kern_return_t
4988 vm_map_enter_mem_object_control(
4989 vm_map_t target_map,
4990 vm_map_offset_ut *address_u,
4991 vm_map_size_ut initial_size_u,
4992 vm_map_offset_ut mask_u,
4993 vm_map_kernel_flags_t vmk_flags,
4994 memory_object_control_t control,
4995 vm_object_offset_ut offset_u,
4996 boolean_t needs_copy,
4997 vm_prot_ut cur_protection_u,
4998 vm_prot_ut max_protection_u,
4999 vm_inherit_ut inheritance_u)
5000 {
5001 vm_map_offset_t mask;
5002 vm_prot_t cur_protection;
5003 vm_prot_t max_protection;
5004 vm_inherit_t inheritance;
5005 vm_map_address_t map_addr;
5006 vm_map_size_t map_size;
5007 vm_object_t object;
5008 vm_object_offset_t obj_offs, obj_end;
5009 vm_object_size_t obj_size;
5010 kern_return_t result;
5011 memory_object_t pager;
5012 vm_prot_t pager_prot;
5013 kern_return_t kr;
5014
5015 /*
5016 * Check arguments for validity
5017 */
5018 if (target_map == VM_MAP_NULL) {
5019 return KERN_INVALID_ARGUMENT;
5020 }
5021
5022 /*
5023 * We only support vmf_return_data_addr-like behavior.
5024 */
5025 vmk_flags.vmf_return_data_addr = true;
5026
5027 /*
5028 * Sanitize any input parameters that are addr/size/prot/inherit
5029 */
5030 kr = vm_map_enter_mem_object_control_sanitize(target_map,
5031 *address_u,
5032 initial_size_u,
5033 mask_u,
5034 offset_u,
5035 cur_protection_u,
5036 max_protection_u,
5037 inheritance_u,
5038 vmk_flags,
5039 &map_addr,
5040 &map_size,
5041 &mask,
5042 &obj_offs,
5043 &obj_end,
5044 &obj_size,
5045 &cur_protection,
5046 &max_protection,
5047 &inheritance);
5048 if (__improbable(kr != KERN_SUCCESS)) {
5049 return vm_sanitize_get_kr(kr);
5050 }
5051
5052 object = memory_object_control_to_vm_object(control);
5053
5054 if (object == VM_OBJECT_NULL) {
5055 return KERN_INVALID_OBJECT;
5056 }
5057
5058 if (is_kernel_object(object)) {
5059 printf("Warning: Attempt to map kernel object"
5060 " by a non-private kernel entity\n");
5061 return KERN_INVALID_OBJECT;
5062 }
5063
5064 vm_object_lock(object);
5065 os_ref_retain_locked_raw(&object->ref_count, &vm_object_refgrp);
5066
5067
5068 /*
5069 * For "named" VM objects, let the pager know that the
5070 * memory object is being mapped. Some pagers need to keep
5071 * track of this, to know when they can reclaim the memory
5072 * object, for example.
5073 * VM calls memory_object_map() for each mapping (specifying
5074 * the protection of each mapping) and calls
5075 * memory_object_last_unmap() when all the mappings are gone.
5076 */
5077 pager_prot = max_protection;
5078 if (needs_copy) {
5079 pager_prot &= ~VM_PROT_WRITE;
5080 }
5081 pager = object->pager;
5082 if (object->named &&
5083 pager != MEMORY_OBJECT_NULL &&
5084 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
5085 assert(object->pager_ready);
5086 vm_object_mapping_wait(object, THREAD_UNINT);
5087 /* object might have lost its pager while waiting */
5088 pager = object->pager;
5089 if (object->named && pager != MEMORY_OBJECT_NULL) {
5090 vm_object_mapping_begin(object);
5091 vm_object_unlock(object);
5092
5093 kr = memory_object_map(pager, pager_prot);
5094 assert(kr == KERN_SUCCESS);
5095
5096 vm_object_lock(object);
5097 vm_object_mapping_end(object);
5098 }
5099 }
5100 vm_object_unlock(object);
5101
5102 /*
5103 * Perform the copy if requested
5104 */
5105
5106 if (needs_copy) {
5107 vm_object_t new_object;
5108 vm_object_offset_t new_offset;
5109
5110 result = vm_object_copy_strategically(object, obj_offs, obj_size,
5111 false, /* forking */
5112 &new_object, &new_offset,
5113 &needs_copy);
5114
5115
5116 if (result == KERN_MEMORY_RESTART_COPY) {
5117 boolean_t success;
5118 boolean_t src_needs_copy;
5119
5120 /*
5121 * XXX
5122 * We currently ignore src_needs_copy.
5123 * This really is the issue of how to make
5124 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
5125 * non-kernel users to use. Solution forthcoming.
5126 * In the meantime, since we don't allow non-kernel
5127 * memory managers to specify symmetric copy,
5128 * we won't run into problems here.
5129 */
5130 new_object = object;
5131 new_offset = obj_offs;
5132 success = vm_object_copy_quickly(new_object,
5133 new_offset, obj_size,
5134 &src_needs_copy,
5135 &needs_copy);
5136 assert(success);
5137 result = KERN_SUCCESS;
5138 }
5139 /*
5140 * Throw away the reference to the
5141 * original object, as it won't be mapped.
5142 */
5143
5144 vm_object_deallocate(object);
5145
5146 if (result != KERN_SUCCESS) {
5147 return result;
5148 }
5149
5150 object = new_object;
5151 obj_offs = new_offset;
5152 }
5153
5154 result = vm_map_enter(target_map,
5155 &map_addr, map_size,
5156 (vm_map_offset_t)mask,
5157 vmk_flags,
5158 object,
5159 obj_offs,
5160 needs_copy,
5161 cur_protection, max_protection,
5162 inheritance);
5163
5164 if (result == KERN_SUCCESS) {
5165 *address_u = vm_sanitize_wrap_addr(
5166 map_addr + (obj_offs & vm_map_page_mask(target_map)));
5167 } else {
5168 vm_object_deallocate(object);
5169 }
5170
5171 return result;
5172 }
5173
5174
5175 /* Not used without nested pmaps */
5176 #ifndef NO_NESTED_PMAP
5177 /*
5178 * Clip and unnest a portion of a nested submap mapping.
5179 */
5180
5181
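/*
 * Unnesting means the [start_unnest, end_unnest) portion of the entry stops
 * sharing the submap's nested pmap: the entry is clipped to that range,
 * pmap_unnest() removes the shared translations from this map's pmap, and
 * the entry's "use_pmap" flag is cleared so the range is no longer nested.
 */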
5182 static void
5183 vm_map_clip_unnest(
5184 vm_map_t map,
5185 vm_map_entry_t entry,
5186 vm_map_offset_t start_unnest,
5187 vm_map_offset_t end_unnest)
5188 {
5189 vm_map_offset_t old_start_unnest = start_unnest;
5190 vm_map_offset_t old_end_unnest = end_unnest;
5191
5192 assert(entry->is_sub_map);
5193 assert(VME_SUBMAP(entry) != NULL);
5194 assert(entry->use_pmap);
5195
5196 /*
5197 * Query the platform for the optimal unnest range.
5198 * DRK: There's some duplication of effort here, since
5199 * callers may have adjusted the range to some extent. This
5200 * routine was introduced to support 1GiB subtree nesting
5201 * for x86 platforms, which can also nest on 2MiB boundaries
5202 * depending on size/alignment.
5203 */
5204 if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
5205 assert(VME_SUBMAP(entry)->is_nested_map);
5206 assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
5207 log_unnest_badness(map,
5208 old_start_unnest,
5209 old_end_unnest,
5210 VME_SUBMAP(entry)->is_nested_map,
5211 (entry->vme_start +
5212 VME_SUBMAP(entry)->lowest_unnestable_start -
5213 VME_OFFSET(entry)));
5214 }
5215
5216 if (entry->vme_start > start_unnest ||
5217 entry->vme_end < end_unnest) {
5218 panic("vm_map_clip_unnest(0x%llx,0x%llx): "
5219 "bad nested entry: start=0x%llx end=0x%llx\n",
5220 (long long)start_unnest, (long long)end_unnest,
5221 (long long)entry->vme_start, (long long)entry->vme_end);
5222 }
5223
5224 if (start_unnest > entry->vme_start) {
5225 _vm_map_clip_start(&map->hdr,
5226 entry,
5227 start_unnest);
5228 if (map->holelistenabled) {
5229 vm_map_store_update_first_free(map, NULL, FALSE);
5230 } else {
5231 vm_map_store_update_first_free(map, map->first_free, FALSE);
5232 }
5233 }
5234 if (entry->vme_end > end_unnest) {
5235 _vm_map_clip_end(&map->hdr,
5236 entry,
5237 end_unnest);
5238 if (map->holelistenabled) {
5239 vm_map_store_update_first_free(map, NULL, FALSE);
5240 } else {
5241 vm_map_store_update_first_free(map, map->first_free, FALSE);
5242 }
5243 }
5244
5245 pmap_unnest(map->pmap,
5246 entry->vme_start,
5247 entry->vme_end - entry->vme_start);
5248 if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
5249 /* clean up parent map/maps */
5250 vm_map_submap_pmap_clean(
5251 map, entry->vme_start,
5252 entry->vme_end,
5253 VME_SUBMAP(entry),
5254 VME_OFFSET(entry));
5255 }
5256 entry->use_pmap = FALSE;
5257 if ((map->pmap != kernel_pmap) &&
5258 (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
5259 VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
5260 }
5261 }
5262 #endif /* NO_NESTED_PMAP */
5263
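/*
 * __vm_map_clip_atomic_entry_panic:
 *
 * Clipping would split a VM map entry marked atomic (vme_atomic); that is
 * never legal, so panic with the entry's range and the requested clip
 * address.
 */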
5264 __abortlike
5265 static void
5266 __vm_map_clip_atomic_entry_panic(
5267 vm_map_t map,
5268 vm_map_entry_t entry,
5269 vm_map_offset_t where)
5270 {
5271 panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
5272 "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
5273 (uint64_t)entry->vme_start,
5274 (uint64_t)entry->vme_end,
5275 (uint64_t)where);
5276 }
5277
5278 /*
5279 * vm_map_clip_start: [ internal use only ]
5280 *
5281 * Ensures that the given entry begins at or after
5282 * the specified address; if necessary,
5283 * it splits the entry into two.
5284 */
5285 void
5286 vm_map_clip_start(
5287 vm_map_t map,
5288 vm_map_entry_t entry,
5289 vm_map_offset_t startaddr)
5290 {
5291 #ifndef NO_NESTED_PMAP
5292 if (entry->is_sub_map &&
5293 entry->use_pmap &&
5294 startaddr >= entry->vme_start) {
5295 vm_map_offset_t start_unnest, end_unnest;
5296
5297 /*
5298 * Make sure "startaddr" is no longer in a nested range
5299 * before we clip. Unnest only the minimum range the platform
5300 * can handle.
5301 * vm_map_clip_unnest may perform additional adjustments to
5302 * the unnest range.
5303 */
5304 start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
5305 end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
5306 vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5307 }
5308 #endif /* NO_NESTED_PMAP */
5309 if (startaddr > entry->vme_start) {
5310 if (!entry->is_sub_map &&
5311 VME_OBJECT(entry) &&
5312 VME_OBJECT(entry)->phys_contiguous) {
5313 pmap_remove(map->pmap,
5314 (addr64_t)(entry->vme_start),
5315 (addr64_t)(entry->vme_end));
5316 }
5317 if (entry->vme_atomic) {
5318 __vm_map_clip_atomic_entry_panic(map, entry, startaddr);
5319 }
5320
5321 DTRACE_VM5(
5322 vm_map_clip_start,
5323 vm_map_t, map,
5324 vm_map_offset_t, entry->vme_start,
5325 vm_map_offset_t, entry->vme_end,
5326 vm_map_offset_t, startaddr,
5327 int, VME_ALIAS(entry));
5328
5329 _vm_map_clip_start(&map->hdr, entry, startaddr);
5330 if (map->holelistenabled) {
5331 vm_map_store_update_first_free(map, NULL, FALSE);
5332 } else {
5333 vm_map_store_update_first_free(map, map->first_free, FALSE);
5334 }
5335 }
5336 }
5337
5338
5339 #define vm_map_copy_clip_start(copy, entry, startaddr) \
5340 MACRO_BEGIN \
5341 if ((startaddr) > (entry)->vme_start) \
5342 _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
5343 MACRO_END
5344
5345 /*
5346 * This routine is called only when it is known that
5347 * the entry must be split.
5348 */
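/*
 * Sketch of the split performed below (addresses grow to the right):
 *
 *	before:  [ entry: vme_start ......................... vme_end )
 *	after:   [ new_entry: vme_start .. start )[ entry: start .. vme_end )
 *
 * "new_entry" keeps the original offset; "entry" has its offset advanced
 * by (start - vme_start), so both halves still refer to the same backing
 * pages.
 */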
5349 static void
5350 _vm_map_clip_start(
5351 struct vm_map_header *map_header,
5352 vm_map_entry_t entry,
5353 vm_map_offset_t start)
5354 {
5355 vm_map_entry_t new_entry;
5356
5357 /*
5358 * Split off the front portion --
5359 * note that we must insert the new
5360 * entry BEFORE this one, so that
5361 * this entry has the specified starting
5362 * address.
5363 */
5364
5365 if (entry->map_aligned) {
5366 assert(VM_MAP_PAGE_ALIGNED(start,
5367 VM_MAP_HDR_PAGE_MASK(map_header)));
5368 }
5369
5370 new_entry = _vm_map_entry_create(map_header);
5371 vm_map_entry_copy_full(new_entry, entry);
5372
5373 new_entry->vme_end = start;
5374 assert(new_entry->vme_start < new_entry->vme_end);
5375 VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
5376 if (__improbable(start >= entry->vme_end)) {
5377 panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start);
5378 }
5379 assert(start < entry->vme_end);
5380 entry->vme_start = start;
5381
5382 #if VM_BTLOG_TAGS
5383 if (new_entry->vme_kernel_object) {
5384 btref_retain(new_entry->vme_tag_btref);
5385 }
5386 #endif /* VM_BTLOG_TAGS */
5387
5388 _vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);
5389
5390 if (entry->is_sub_map) {
5391 vm_map_reference(VME_SUBMAP(new_entry));
5392 } else {
5393 vm_object_reference(VME_OBJECT(new_entry));
5394 }
5395 }
5396
5397
5398 /*
5399 * vm_map_clip_end: [ internal use only ]
5400 *
5401 * Ensures that the given entry ends at or before
5402 * the specified address; if necessary,
5403 * it splits the entry into two.
5404 */
5405 void
5406 vm_map_clip_end(
5407 vm_map_t map,
5408 vm_map_entry_t entry,
5409 vm_map_offset_t endaddr)
5410 {
5411 if (endaddr > entry->vme_end) {
5412 /*
5413 * Within the scope of this clipping, limit "endaddr" to
5414 * the end of this map entry...
5415 */
5416 endaddr = entry->vme_end;
5417 }
5418 #ifndef NO_NESTED_PMAP
5419 if (entry->is_sub_map && entry->use_pmap) {
5420 vm_map_offset_t start_unnest, end_unnest;
5421
5422 /*
5423 * Make sure the range between the start of this entry and
5424 * the new "endaddr" is no longer nested before we clip.
5425 * Unnest only the minimum range the platform can handle.
5426 * vm_map_clip_unnest may perform additional adjustments to
5427 * the unnest range.
5428 */
5429 start_unnest = entry->vme_start;
5430 end_unnest =
5431 (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
5432 ~(pmap_shared_region_size_min(map->pmap) - 1);
5433 vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5434 }
5435 #endif /* NO_NESTED_PMAP */
5436 if (endaddr < entry->vme_end) {
5437 if (!entry->is_sub_map &&
5438 VME_OBJECT(entry) &&
5439 VME_OBJECT(entry)->phys_contiguous) {
5440 pmap_remove(map->pmap,
5441 (addr64_t)(entry->vme_start),
5442 (addr64_t)(entry->vme_end));
5443 }
5444 if (entry->vme_atomic) {
5445 __vm_map_clip_atomic_entry_panic(map, entry, endaddr);
5446 }
5447 DTRACE_VM5(
5448 vm_map_clip_end,
5449 vm_map_t, map,
5450 vm_map_offset_t, entry->vme_start,
5451 vm_map_offset_t, entry->vme_end,
5452 vm_map_offset_t, endaddr,
5453 int, VME_ALIAS(entry));
5454
5455 _vm_map_clip_end(&map->hdr, entry, endaddr);
5456 if (map->holelistenabled) {
5457 vm_map_store_update_first_free(map, NULL, FALSE);
5458 } else {
5459 vm_map_store_update_first_free(map, map->first_free, FALSE);
5460 }
5461 }
5462 }
5463
5464
5465 #define vm_map_copy_clip_end(copy, entry, endaddr) \
5466 MACRO_BEGIN \
5467 if ((endaddr) < (entry)->vme_end) \
5468 _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
5469 MACRO_END
5470
5471 /*
5472 * This routine is called only when it is known that
5473 * the entry must be split.
5474 */
5475 static void
5476 _vm_map_clip_end(
5477 struct vm_map_header *map_header,
5478 vm_map_entry_t entry,
5479 vm_map_offset_t end)
5480 {
5481 vm_map_entry_t new_entry;
5482
5483 /*
5484 * Create a new entry and insert it
5485 * AFTER the specified entry
5486 */
5487
5488 if (entry->map_aligned) {
5489 assert(VM_MAP_PAGE_ALIGNED(end,
5490 VM_MAP_HDR_PAGE_MASK(map_header)));
5491 }
5492
5493 new_entry = _vm_map_entry_create(map_header);
5494 vm_map_entry_copy_full(new_entry, entry);
5495
5496 if (__improbable(end <= entry->vme_start)) {
5497 panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end);
5498 }
5499 assert(entry->vme_start < end);
5500 new_entry->vme_start = entry->vme_end = end;
5501 VME_OFFSET_SET(new_entry,
5502 VME_OFFSET(new_entry) + (end - entry->vme_start));
5503 assert(new_entry->vme_start < new_entry->vme_end);
5504
5505 #if VM_BTLOG_TAGS
5506 if (new_entry->vme_kernel_object) {
5507 btref_retain(new_entry->vme_tag_btref);
5508 }
5509 #endif /* VM_BTLOG_TAGS */
5510
5511 _vm_map_store_entry_link(map_header, entry, new_entry);
5512
5513 if (entry->is_sub_map) {
5514 vm_map_reference(VME_SUBMAP(new_entry));
5515 } else {
5516 vm_object_reference(VME_OBJECT(new_entry));
5517 }
5518 }
5519
5520
5521 /*
5522 * VM_MAP_RANGE_CHECK: [ internal use only ]
5523 *
5524 * Asserts that the starting and ending region
5525 * addresses fall within the valid range of the map.
5526 */
5527 #define VM_MAP_RANGE_CHECK(map, start, end) \
5528 MACRO_BEGIN \
5529 if (start < vm_map_min(map)) \
5530 start = vm_map_min(map); \
5531 if (end > vm_map_max(map)) \
5532 end = vm_map_max(map); \
5533 if (start > end) \
5534 start = end; \
5535 MACRO_END
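/*
 * Worked example (illustrative numbers): with vm_map_min(map) == 0x1000
 * and vm_map_max(map) == 0x8000, VM_MAP_RANGE_CHECK clamps
 *
 *	start = 0x0500, end = 0x9000   ->   start = 0x1000, end = 0x8000
 *	start = 0x9000, end = 0x9100   ->   start = end = 0x8000
 *
 * i.e. an out-of-bounds range is silently trimmed to the map's bounds
 * rather than rejected; vm_map_range_check() below is the variant that
 * fails instead of clamping.
 */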
5536
5537 /*
5538 * vm_map_range_check: [ internal use only ]
5539 *
5540 * Check that the region defined by the specified start and
5541 * end addresses is wholly contained within a single map
5542 * entry or set of adjacent map entries of the specified map,
5543 * i.e. the specified region contains no unmapped space.
5544 * If any or all of the region is unmapped, FALSE is returned.
5545 * Otherwise, TRUE is returned and if the output argument 'entry'
5546 * is not NULL it points to the map entry containing the start
5547 * of the region.
5548 *
5549 * The map is locked for reading on entry and is left locked.
5550 */
5551 static boolean_t
5552 vm_map_range_check(
5553 vm_map_t map,
5554 vm_map_offset_t start,
5555 vm_map_offset_t end,
5556 vm_map_entry_t *entry)
5557 {
5558 vm_map_entry_t cur;
5559 vm_map_offset_t prev;
5560
5561 /*
5562 * Basic sanity checks first
5563 */
5564 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5565 return FALSE;
5566 }
5567
5568 /*
5569 * Check first if the region starts within a valid
5570 * mapping for the map.
5571 */
5572 if (!vm_map_lookup_entry(map, start, &cur)) {
5573 return FALSE;
5574 }
5575
5576 /*
5577 * Optimize for the case that the region is contained
5578 * in a single map entry.
5579 */
5580 if (entry != (vm_map_entry_t *) NULL) {
5581 *entry = cur;
5582 }
5583 if (end <= cur->vme_end) {
5584 return TRUE;
5585 }
5586
5587 /*
5588 * If the region is not wholly contained within a
5589 * single entry, walk the entries looking for holes.
5590 */
5591 prev = cur->vme_end;
5592 cur = cur->vme_next;
5593 while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5594 if (end <= cur->vme_end) {
5595 return TRUE;
5596 }
5597 prev = cur->vme_end;
5598 cur = cur->vme_next;
5599 }
5600 return FALSE;
5601 }
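/*
 * Sketch of the expected calling pattern (hypothetical caller, for
 * illustration only): the map must be locked before the call and the
 * returned entry is only valid while that lock is held.
 *
 *	vm_map_entry_t first;
 *
 *	vm_map_lock_read(map);
 *	if (!vm_map_range_check(map, start, end, &first)) {
 *		vm_map_unlock_read(map);
 *		return KERN_INVALID_ADDRESS;
 *	}
 *	... use "first" under the map lock ...
 *	vm_map_unlock_read(map);
 */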
5602
5603 static __attribute__((always_inline, warn_unused_result))
5604 kern_return_t
5605 vm_map_protect_sanitize(
5606 vm_map_t map,
5607 vm_map_offset_ut start_u,
5608 vm_map_offset_ut end_u,
5609 vm_prot_ut new_prot_u,
5610 vm_map_offset_t *start,
5611 vm_map_offset_t *end,
5612 vm_prot_t *new_prot)
5613 {
5614 kern_return_t kr;
5615 vm_map_size_t size;
5616
5617 kr = vm_sanitize_prot(new_prot_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT,
5618 map, VM_PROT_COPY, new_prot);
5619 if (__improbable(kr != KERN_SUCCESS)) {
5620 return kr;
5621 }
5622
5623 kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT,
5624 map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, &size);
5625 if (__improbable(kr != KERN_SUCCESS)) {
5626 return kr;
5627 }
5628
5629 return KERN_SUCCESS;
5630 }
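/*
 * Note on the *_sanitize helpers in this file (summary, not new policy):
 * externally supplied addresses, sizes and protections arrive as
 * unsafe-typed values (the "_u"/"_ut" suffix) and are only converted to
 * plain vm_map_offset_t / vm_prot_t by vm_sanitize_addr_end() and
 * vm_sanitize_prot(), which also reject malformed ranges up front.  A
 * hypothetical caller-side sketch:
 *
 *	vm_map_offset_t start, end;
 *	vm_prot_t       prot;
 *
 *	kr = vm_map_protect_sanitize(map, start_u, end_u, prot_u,
 *	    &start, &end, &prot);
 *	if (__improbable(kr != KERN_SUCCESS)) {
 *		return vm_sanitize_get_kr(kr);
 *	}
 *	... only the sanitized values are used from here on ...
 */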
5631
5632 /*
5633 * vm_map_protect:
5634 *
5635 * Sets the protection of the specified address
5636 * region in the target map. If "set_max" is
5637 * specified, the maximum protection is to be set;
5638 * otherwise, only the current protection is affected.
5639 */
5640 kern_return_t
5641 vm_map_protect(
5642 vm_map_t map,
5643 vm_map_offset_ut start_u,
5644 vm_map_offset_ut end_u,
5645 boolean_t set_max,
5646 vm_prot_ut new_prot_u)
5647 {
5648 vm_map_entry_t current;
5649 vm_map_offset_t prev;
5650 vm_map_entry_t entry;
5651 vm_prot_t new_prot;
5652 vm_prot_t new_max;
5653 int pmap_options = 0;
5654 kern_return_t kr;
5655 vm_map_offset_t start, original_start;
5656 vm_map_offset_t end;
5657
5658 kr = vm_map_protect_sanitize(map,
5659 start_u,
5660 end_u,
5661 new_prot_u,
5662 &start,
5663 &end,
5664 &new_prot);
5665 if (__improbable(kr != KERN_SUCCESS)) {
5666 return vm_sanitize_get_kr(kr);
5667 }
5668 original_start = start;
5669
5670 if (new_prot & VM_PROT_COPY) {
5671 vm_map_offset_t new_start;
5672 vm_prot_t cur_prot, max_prot;
5673 vm_map_kernel_flags_t kflags;
5674
5675 /* LP64todo - see below */
5676 if (start >= map->max_offset) {
5677 return KERN_INVALID_ADDRESS;
5678 }
5679
5680 if ((new_prot & VM_PROT_ALLEXEC) &&
5681 map->pmap != kernel_pmap &&
5682 (vm_map_cs_enforcement(map)
5683 #if XNU_TARGET_OS_OSX && __arm64__
5684 || !VM_MAP_IS_EXOTIC(map)
5685 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
5686 ) &&
5687 VM_MAP_POLICY_WX_FAIL(map)) {
5688 DTRACE_VM3(cs_wx,
5689 uint64_t, (uint64_t) start,
5690 uint64_t, (uint64_t) end,
5691 vm_prot_t, new_prot);
5692 printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5693 proc_selfpid(),
5694 (get_bsdtask_info(current_task())
5695 ? proc_name_address(get_bsdtask_info(current_task()))
5696 : "?"),
5697 __FUNCTION__, __LINE__,
5698 #if DEVELOPMENT || DEBUG
5699 (uint64_t)start,
5700 (uint64_t)end,
5701 #else /* DEVELOPMENT || DEBUG */
5702 (uint64_t)0,
5703 (uint64_t)0,
5704 #endif /* DEVELOPMENT || DEBUG */
5705 new_prot);
5706 return KERN_PROTECTION_FAILURE;
5707 }
5708
5709 /*
5710 * Let vm_map_remap_extract() know that it will need to:
5711 * + make a copy of the mapping
5712 * + add VM_PROT_WRITE to the max protections
5713 * + remove any protections that are no longer allowed from the
5714 * max protections (to avoid any WRITE/EXECUTE conflict, for
5715 * example).
5716 * Note that "max_prot" is an IN/OUT parameter only for this
5717 * specific (VM_PROT_COPY) case. It's usually an OUT parameter
5718 * only.
5719 */
5720 max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
5721 cur_prot = VM_PROT_NONE;
5722 kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
5723 kflags.vmkf_remap_prot_copy = true;
5724 kflags.vmkf_tpro_enforcement_override = !vm_map_tpro_enforcement(map);
5725 new_start = start;
5726 kr = vm_map_remap(map,
5727 vm_sanitize_wrap_addr_ref(&new_start),
5728 end - start,
5729 0, /* mask */
5730 kflags,
5731 map,
5732 start,
5733 TRUE, /* copy-on-write remapping! */
5734 vm_sanitize_wrap_prot_ref(&cur_prot), /* IN/OUT */
5735 vm_sanitize_wrap_prot_ref(&max_prot), /* IN/OUT */
5736 VM_INHERIT_DEFAULT);
5737 if (kr != KERN_SUCCESS) {
5738 return kr;
5739 }
5740 new_prot &= ~VM_PROT_COPY;
5741 }
5742
5743 vm_map_lock(map);
5744 restart_after_unlock:
5745
5746 /* LP64todo - remove this check when vm_map_commpage64()
5747 * no longer has to stuff in a map_entry for the commpage
5748 * above the map's max_offset.
5749 */
5750 if (start >= map->max_offset) {
5751 vm_map_unlock(map);
5752 return KERN_INVALID_ADDRESS;
5753 }
5754
5755 while (1) {
5756 /*
5757 * Lookup the entry. If it doesn't start in a valid
5758 * entry, return an error.
5759 */
5760 if (!vm_map_lookup_entry(map, start, &entry)) {
5761 vm_map_unlock(map);
5762 return KERN_INVALID_ADDRESS;
5763 }
5764
5765 if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
5766 start = SUPERPAGE_ROUND_DOWN(start);
5767 continue;
5768 }
5769 break;
5770 }
5771 if (entry->superpage_size) {
5772 end = SUPERPAGE_ROUND_UP(end);
5773 }
5774
5775 /*
5776 * Make a first pass to check for protection and address
5777 * violations.
5778 */
5779
5780 current = entry;
5781 prev = current->vme_start;
5782 while ((current != vm_map_to_entry(map)) &&
5783 (current->vme_start < end)) {
5784 /*
5785 * If there is a hole, return an error.
5786 */
5787 if (current->vme_start != prev) {
5788 vm_map_unlock(map);
5789 return KERN_INVALID_ADDRESS;
5790 }
5791
5792 new_max = current->max_protection;
5793
5794 #if defined(__x86_64__)
5795 /* Allow max mask to include execute prot bits if this map doesn't enforce CS */
5796 if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
5797 new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
5798 }
5799 #elif CODE_SIGNING_MONITOR
5800 if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
5801 new_max |= VM_PROT_EXECUTE;
5802 }
5803 #endif
5804 if ((new_prot & new_max) != new_prot) {
5805 vm_map_unlock(map);
5806 return KERN_PROTECTION_FAILURE;
5807 }
5808
5809 if (current->used_for_jit &&
5810 pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
5811 vm_map_unlock(map);
5812 return KERN_PROTECTION_FAILURE;
5813 }
5814
5815 #if __arm64e__
5816 /* Disallow protecting hw assisted TPRO mappings */
5817 if (current->used_for_tpro) {
5818 vm_map_unlock(map);
5819 return KERN_PROTECTION_FAILURE;
5820 }
5821 #endif /* __arm64e__ */
5822
5823
5824 if ((new_prot & VM_PROT_WRITE) &&
5825 (new_prot & VM_PROT_ALLEXEC) &&
5826 #if XNU_TARGET_OS_OSX
5827 map->pmap != kernel_pmap &&
5828 (vm_map_cs_enforcement(map)
5829 #if __arm64__
5830 || !VM_MAP_IS_EXOTIC(map)
5831 #endif /* __arm64__ */
5832 ) &&
5833 #endif /* XNU_TARGET_OS_OSX */
5834 #if CODE_SIGNING_MONITOR
5835 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
5836 #endif
5837 !(current->used_for_jit)) {
5838 DTRACE_VM3(cs_wx,
5839 uint64_t, (uint64_t) current->vme_start,
5840 uint64_t, (uint64_t) current->vme_end,
5841 vm_prot_t, new_prot);
5842 printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5843 proc_selfpid(),
5844 (get_bsdtask_info(current_task())
5845 ? proc_name_address(get_bsdtask_info(current_task()))
5846 : "?"),
5847 __FUNCTION__, __LINE__,
5848 #if DEVELOPMENT || DEBUG
5849 (uint64_t)current->vme_start,
5850 (uint64_t)current->vme_end,
5851 #else /* DEVELOPMENT || DEBUG */
5852 (uint64_t)0,
5853 (uint64_t)0,
5854 #endif /* DEVELOPMENT || DEBUG */
5855 new_prot);
5856 new_prot &= ~VM_PROT_ALLEXEC;
5857 if (VM_MAP_POLICY_WX_FAIL(map)) {
5858 vm_map_unlock(map);
5859 return KERN_PROTECTION_FAILURE;
5860 }
5861 }
5862
5863 /*
5864 * If the task has requested executable lockdown,
5865 * deny both:
5866 * - adding executable protections OR
5867 * - adding write protections to an existing executable mapping.
5868 */
5869 if (map->map_disallow_new_exec == TRUE) {
5870 if ((new_prot & VM_PROT_ALLEXEC) ||
5871 ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
5872 vm_map_unlock(map);
5873 return KERN_PROTECTION_FAILURE;
5874 }
5875 }
5876
5877 prev = current->vme_end;
5878 current = current->vme_next;
5879 }
5880
5881 #if __arm64__
5882 if (end > prev &&
5883 end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
5884 vm_map_entry_t prev_entry;
5885
5886 prev_entry = current->vme_prev;
5887 if (prev_entry != vm_map_to_entry(map) &&
5888 !prev_entry->map_aligned &&
5889 (vm_map_round_page(prev_entry->vme_end,
5890 VM_MAP_PAGE_MASK(map))
5891 == end)) {
5892 /*
5893 * The last entry in our range is not "map-aligned"
5894 * but it would have reached all the way to "end"
5895 * if it had been map-aligned, so this is not really
5896 * a hole in the range and we can proceed.
5897 */
5898 prev = end;
5899 }
5900 }
5901 #endif /* __arm64__ */
5902
5903 if (end > prev) {
5904 vm_map_unlock(map);
5905 return KERN_INVALID_ADDRESS;
5906 }
5907
5908 /*
5909 * Go back and fix up protections.
5910 * Clip to start here if the range starts within
5911 * the entry.
5912 */
5913
5914 current = entry;
5915 if (current != vm_map_to_entry(map)) {
5916 /* clip and unnest if necessary */
5917 vm_map_clip_start(map, current, start);
5918 }
5919
5920 while ((current != vm_map_to_entry(map)) &&
5921 (current->vme_start < end)) {
5922 vm_prot_t old_prot;
5923
5924 if (current->in_transition) {
5925 wait_result_t wait_result;
5926 vm_map_offset_t current_start;
5927
5928 /*
5929 * Another thread is wiring/unwiring this entry.
5930 * Let the other thread know we are waiting.
5931 */
5932 current_start = current->vme_start;
5933 current->needs_wakeup = true;
5934 /* wait for the other thread to be done */
5935 wait_result = vm_map_entry_wait(map, TH_UNINT);
5936 /*
5937 * We unlocked the map, so anything could have changed in the
5938 * range and we need to re-check from "current_start" to "end".
5939 * Our entries might no longer be valid.
5940 */
5941 current = NULL;
5942 entry = NULL;
5943 /*
5944 * Re-lookup and re-clip "current_start".
5945 * If it's no longer mapped,
5946 */
5947 vm_map_lookup_entry_or_next(map, current_start, &current);
5948 if (current != vm_map_to_entry(map)) {
5949 vm_map_clip_start(map, current, current_start);
5950 }
5951 /* restart from this point */
5952 start = current_start;
5953 goto restart_after_unlock;
5954 }
5955
5956 vm_map_clip_end(map, current, end);
5957
5958 #if DEVELOPMENT || DEBUG
5959 if (current->csm_associated && vm_log_xnu_user_debug) {
5960 printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
5961 proc_selfpid(),
5962 (get_bsdtask_info(current_task())
5963 ? proc_name_address(get_bsdtask_info(current_task()))
5964 : "?"),
5965 __FUNCTION__,
5966 (uint64_t)start,
5967 (uint64_t)end,
5968 new_prot,
5969 map, current,
5970 current->vme_start,
5971 current->vme_end,
5972 current->protection,
5973 current->max_protection);
5974 }
5975 #endif /* DEVELOPMENT || DEBUG */
5976
5977 if (current->is_sub_map) {
5978 /* clipping did unnest if needed */
5979 assert(!current->use_pmap);
5980 }
5981
5982 old_prot = current->protection;
5983
5984 if (set_max) {
5985 current->max_protection = new_prot;
5986 /* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
5987 current->protection = (new_prot & old_prot);
5988 } else {
5989 current->protection = new_prot;
5990 }
5991
5992 #if CODE_SIGNING_MONITOR
5993 if (/* a !csm_associated mapping becoming executable */
5994 ((!current->csm_associated &&
5995 !(old_prot & VM_PROT_EXECUTE) &&
5996 (current->protection & VM_PROT_EXECUTE))
5997 ||
5998 /* a csm_associated mapping becoming writable */
5999 (current->csm_associated &&
6000 !(old_prot & VM_PROT_WRITE) &&
6001 (current->protection & VM_PROT_WRITE)))) {
6002 /*
6003 * This mapping has not already been marked as
6004 * "user_debug" and it is either:
6005 * 1. not code-signing-monitored and becoming executable
6006 * 2. code-signing-monitored and becoming writable,
6007 * so inform the CodeSigningMonitor and mark the
6008 * mapping as "user_debug" if appropriate.
6009 */
6010 vm_map_kernel_flags_t vmk_flags;
6011 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
6012 /* pretend it's a vm_protect(VM_PROT_COPY)... */
6013 vmk_flags.vmkf_remap_prot_copy = true;
6014 kr = vm_map_entry_cs_associate(map, current, vmk_flags);
6015 #if DEVELOPMENT || DEBUG
6016 if (vm_log_xnu_user_debug) {
6017 printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
6018 proc_selfpid(),
6019 (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
6020 __FUNCTION__, __LINE__,
6021 map, current,
6022 current->vme_start, current->vme_end,
6023 old_prot, current->protection,
6024 kr, current->vme_xnu_user_debug);
6025 }
6026 #endif /* DEVELOPMENT || DEBUG */
6027 }
6028 #endif /* CODE_SIGNING_MONITOR */
6029
6030 /*
6031 * Update physical map if necessary.
6032 * If the request is to turn off write protection,
6033 * we won't do it for real (in pmap). This is because
6034 * it would cause copy-on-write to fail. We've already
6035 * set the new protection in the map, so if a
6036 * write-protect fault occurred, it will be fixed up
6037 * properly, COW or not.
6038 */
6039 if (current->protection != old_prot) {
6040 /* Look one level in: we support nested pmaps */
6041 /* from mapped submaps which are direct entries */
6042 /* in our map */
6043
6044 vm_prot_t prot;
6045
6046 prot = current->protection;
6047 if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
6048 prot &= ~VM_PROT_WRITE;
6049 } else {
6050 assert(!VME_OBJECT(current)->code_signed);
6051 assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6052 if (prot & VM_PROT_WRITE) {
6053 /*
6054 * For write requests on the
6055 * compressor, we will ask the
6056 * pmap layer to prevent us from
6057 * taking a write fault when we
6058 * attempt to access the mapping
6059 * next.
6060 */
6061 pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
6062 }
6063 }
6064
6065 if (override_nx(map, VME_ALIAS(current)) && prot) {
6066 prot |= VM_PROT_EXECUTE;
6067 }
6068
6069 #if DEVELOPMENT || DEBUG
6070 if (!(old_prot & VM_PROT_EXECUTE) &&
6071 (prot & VM_PROT_EXECUTE) &&
6072 panic_on_unsigned_execute &&
6073 (proc_selfcsflags() & CS_KILL)) {
6074 panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
6075 }
6076 #endif /* DEVELOPMENT || DEBUG */
6077
6078 if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
6079 if (current->wired_count) {
6080 panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
6081 map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
6082 }
6083
6084 /* If the pmap layer cares about this
6085 * protection type, force a fault for
6086 * each page so that vm_fault will
6087 * repopulate the page with the full
6088 * set of protections.
6089 */
6090 /*
6091 * TODO: We don't seem to need this,
6092 * but this is due to an internal
6093 * implementation detail of
6094 * pmap_protect. Do we want to rely
6095 * on this?
6096 */
6097 prot = VM_PROT_NONE;
6098 }
6099
6100 if (current->is_sub_map && current->use_pmap) {
6101 pmap_protect(VME_SUBMAP(current)->pmap,
6102 current->vme_start,
6103 current->vme_end,
6104 prot);
6105 } else {
6106 pmap_protect_options(map->pmap,
6107 current->vme_start,
6108 current->vme_end,
6109 prot,
6110 pmap_options,
6111 NULL);
6112 }
6113 }
6114 current = current->vme_next;
6115 }
6116
6117 if (entry == VM_MAP_ENTRY_NULL) {
6118 /*
6119 * Re-lookup the original start of our range.
6120 * If it's no longer mapped, start with the next mapping.
6121 */
6122 vm_map_lookup_entry_or_next(map, original_start, &entry);
6123 }
6124 current = entry;
6125 while ((current != vm_map_to_entry(map)) &&
6126 (current->vme_start <= end)) {
6127 vm_map_simplify_entry(map, current);
6128 current = current->vme_next;
6129 }
6130
6131 vm_map_unlock(map);
6132 return KERN_SUCCESS;
6133 }
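/*
 * Userspace-visible behavior (illustrative example only): this routine
 * backs mach_vm_protect()/vm_protect().  For instance, making a region
 * read-only in the current task:
 *
 *	#include <mach/mach.h>
 *
 *	kern_return_t kr = mach_vm_protect(mach_task_self(),
 *	    (mach_vm_address_t)addr, (mach_vm_size_t)size,
 *	    FALSE,            /* set_maximum */
 *	    VM_PROT_READ);
 *
 * Requesting VM_PROT_READ | VM_PROT_WRITE | VM_PROT_COPY instead takes
 * the VM_PROT_COPY path above, which remaps the range copy-on-write
 * before adjusting protections.
 */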
6134
6135 static __attribute__((always_inline, warn_unused_result))
6136 kern_return_t
6137 vm_map_inherit_sanitize(
6138 vm_map_t map,
6139 vm_map_offset_ut start_u,
6140 vm_map_offset_ut end_u,
6141 vm_inherit_ut new_inheritance_u,
6142 vm_map_offset_t *start,
6143 vm_map_offset_t *end,
6144 vm_inherit_t *new_inheritance)
6145 {
6146 kern_return_t kr;
6147 vm_map_size_t size;
6148
6149 kr = vm_sanitize_inherit(new_inheritance_u,
6150 VM_SANITIZE_CALLER_VM_MAP_INHERIT, new_inheritance);
6151 if (__improbable(kr != KERN_SUCCESS)) {
6152 return kr;
6153 }
6154
6155 kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_INHERIT,
6156 map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, &size);
6157 if (__improbable(kr != KERN_SUCCESS)) {
6158 return kr;
6159 }
6160
6161 return KERN_SUCCESS;
6162 }
6163
6164 /*
6165 * vm_map_inherit:
6166 *
6167 * Sets the inheritance of the specified address
6168 * range in the target map. Inheritance
6169 * affects how the map will be shared with
6170 * child maps at the time of vm_map_fork.
6171 */
6172 kern_return_t
6173 vm_map_inherit(
6174 vm_map_t map,
6175 vm_map_offset_ut start_u,
6176 vm_map_offset_ut end_u,
6177 vm_inherit_ut new_inheritance_u)
6178 {
6179 vm_map_entry_t entry;
6180 vm_map_entry_t temp_entry;
6181 kern_return_t kr;
6182 vm_map_offset_t start;
6183 vm_map_offset_t end;
6184 vm_inherit_t new_inheritance;
6185
6186 kr = vm_map_inherit_sanitize(map,
6187 start_u,
6188 end_u,
6189 new_inheritance_u,
6190 &start,
6191 &end,
6192 &new_inheritance);
6193 if (__improbable(kr != KERN_SUCCESS)) {
6194 return vm_sanitize_get_kr(kr);
6195 }
6196
6197 vm_map_lock(map);
6198
6199 VM_MAP_RANGE_CHECK(map, start, end);
6200
6201 if (vm_map_lookup_entry(map, start, &temp_entry)) {
6202 entry = temp_entry;
6203 } else {
6204 temp_entry = temp_entry->vme_next;
6205 entry = temp_entry;
6206 }
6207
6208 /* first check entire range for entries which can't support the */
6209 /* given inheritance. */
6210 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6211 if (entry->is_sub_map) {
6212 if (new_inheritance == VM_INHERIT_COPY) {
6213 vm_map_unlock(map);
6214 return KERN_INVALID_ARGUMENT;
6215 }
6216 }
6217
6218 entry = entry->vme_next;
6219 }
6220
6221 entry = temp_entry;
6222 if (entry != vm_map_to_entry(map)) {
6223 /* clip and unnest if necessary */
6224 vm_map_clip_start(map, entry, start);
6225 }
6226
6227 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6228 vm_map_clip_end(map, entry, end);
6229 if (entry->is_sub_map) {
6230 /* clip did unnest if needed */
6231 assert(!entry->use_pmap);
6232 }
6233
6234 entry->inheritance = new_inheritance;
6235
6236 entry = entry->vme_next;
6237 }
6238
6239 vm_map_unlock(map);
6240 return KERN_SUCCESS;
6241 }
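/*
 * Illustrative example only: from userspace this is reached via
 * mach_vm_inherit()/vm_inherit(), e.g. to keep a buffer out of any child
 * created by fork():
 *
 *	kr = mach_vm_inherit(mach_task_self(),
 *	    (mach_vm_address_t)buf, (mach_vm_size_t)len,
 *	    VM_INHERIT_NONE);
 *
 * At vm_map_fork() time VM_INHERIT_SHARE, VM_INHERIT_COPY and
 * VM_INHERIT_NONE select whether the child shares, copies or omits the
 * range; as checked above, VM_INHERIT_COPY is rejected for submap
 * entries.
 */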
6242
6243 /*
6244 * Update the accounting for the amount of wired memory in this map. If the user has
6245 * exceeded the defined limits, then we fail. Wiring on behalf of the kernel never fails.
6246 */
6247
6248 static kern_return_t
6249 add_wire_counts(
6250 vm_map_t map,
6251 vm_map_entry_t entry,
6252 boolean_t user_wire)
6253 {
6254 vm_map_size_t size;
6255
6256 bool first_wire = entry->wired_count == 0 && entry->user_wired_count == 0;
6257
6258 if (user_wire) {
6259 unsigned int total_wire_count = vm_page_wire_count + vm_lopage_free_count;
6260
6261 /*
6262 * We're wiring memory at the request of the user. Check if this is the first time the user is wiring
6263 * this map entry.
6264 */
6265
6266 if (entry->user_wired_count == 0) {
6267 size = entry->vme_end - entry->vme_start;
6268
6269 /*
6270 * Since this is the first time the user is wiring this map entry, check to see if we're
6271 * exceeding the user wire limits. There is a per map limit which is the smaller of either
6272 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value. There is also
6273 * a system-wide limit on the amount of memory all users can wire. If the user is over either
6274 * limit, then we fail.
6275 */
6276
6277 if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6278 size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6279 if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6280 #if DEVELOPMENT || DEBUG
6281 if (panic_on_mlock_failure) {
6282 panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6283 }
6284 #endif /* DEVELOPMENT || DEBUG */
6285 os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6286 } else {
6287 os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6288 #if DEVELOPMENT || DEBUG
6289 if (panic_on_mlock_failure) {
6290 panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6291 }
6292 #endif /* DEVELOPMENT || DEBUG */
6293 }
6294 return KERN_RESOURCE_SHORTAGE;
6295 }
6296
6297 /*
6298 * The first time the user wires an entry, we also increment the wired_count and add this to
6299 * the total that has been wired in the map.
6300 */
6301
6302 if (entry->wired_count >= MAX_WIRE_COUNT) {
6303 return KERN_FAILURE;
6304 }
6305
6306 entry->wired_count++;
6307 map->user_wire_size += size;
6308 }
6309
6310 if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6311 return KERN_FAILURE;
6312 }
6313
6314 entry->user_wired_count++;
6315 } else {
6316 /*
6317 * The kernel's wiring the memory. Just bump the count and continue.
6318 */
6319
6320 if (entry->wired_count >= MAX_WIRE_COUNT) {
6321 panic("vm_map_wire: too many wirings");
6322 }
6323
6324 entry->wired_count++;
6325 }
6326
6327 if (first_wire) {
6328 vme_btref_consider_and_set(entry, __builtin_frame_address(0));
6329 }
6330
6331 return KERN_SUCCESS;
6332 }
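/*
 * Worked example of the user-wire limit check above (illustrative
 * numbers): with map->user_wire_size = 48 MB, an effective per-task limit
 * MIN(map->user_wire_limit, vm_per_task_user_wire_limit) = 64 MB, and an
 * entry of size = 32 MB, the per-task test fails (32 + 48 > 64).
 * Assuming the system-wide vm_global_user_wire_limit is not also
 * exceeded, vm_add_wire_count_over_user_limit is bumped and the caller
 * gets KERN_RESOURCE_SHORTAGE; no counts are updated and nothing is
 * wired.
 */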
6333
6334 /*
6335 * Update the memory wiring accounting now that the given map entry is being unwired.
6336 */
6337
6338 static void
6339 subtract_wire_counts(
6340 vm_map_t map,
6341 vm_map_entry_t entry,
6342 boolean_t user_wire)
6343 {
6344 if (user_wire) {
6345 /*
6346 * We're unwiring memory at the request of the user. See if we're removing the last user wire reference.
6347 */
6348
6349 if (entry->user_wired_count == 1) {
6350 /*
6351 * We're removing the last user wire reference. Decrement the wired_count and the total
6352 * user wired memory for this map.
6353 */
6354
6355 assert(entry->wired_count >= 1);
6356 entry->wired_count--;
6357 map->user_wire_size -= entry->vme_end - entry->vme_start;
6358 }
6359
6360 assert(entry->user_wired_count >= 1);
6361 entry->user_wired_count--;
6362 } else {
6363 /*
6364 * The kernel is unwiring the memory. Just update the count.
6365 */
6366
6367 assert(entry->wired_count >= 1);
6368 entry->wired_count--;
6369 }
6370
6371 vme_btref_consider_and_put(entry);
6372 }
6373
6374 int cs_executable_wire = 0;
6375
6376 static kern_return_t
6377 vm_map_wire_nested(
6378 vm_map_t map,
6379 vm_map_offset_t start,
6380 vm_map_offset_t end,
6381 vm_prot_t caller_prot,
6382 vm_tag_t tag,
6383 boolean_t user_wire,
6384 pmap_t map_pmap,
6385 vm_map_offset_t pmap_addr,
6386 ppnum_t *physpage_p)
6387 {
6388 vm_map_entry_t entry;
6389 vm_prot_t access_type;
6390 struct vm_map_entry *first_entry, tmp_entry;
6391 vm_map_t real_map;
6392 vm_map_offset_t s, e;
6393 kern_return_t rc;
6394 boolean_t need_wakeup;
6395 boolean_t main_map = FALSE;
6396 wait_interrupt_t interruptible_state;
6397 thread_t cur_thread;
6398 unsigned int last_timestamp;
6399 vm_map_size_t size;
6400 boolean_t wire_and_extract;
6401 vm_prot_t extra_prots;
6402
6403 extra_prots = VM_PROT_COPY;
6404 extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6405 #if XNU_TARGET_OS_OSX
6406 if (map->pmap == kernel_pmap ||
6407 !vm_map_cs_enforcement(map)) {
6408 extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6409 }
6410 #endif /* XNU_TARGET_OS_OSX */
6411 #if CODE_SIGNING_MONITOR
6412 if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
6413 extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6414 }
6415 #endif /* CODE_SIGNING_MONITOR */
6416
6417 access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6418
6419 wire_and_extract = FALSE;
6420 if (physpage_p != NULL) {
6421 /*
6422 * The caller wants the physical page number of the
6423 * wired page. We return only one physical page number
6424 * so this works for only one page at a time.
6425 *
6426 * The only caller (vm_map_wire_and_extract)
6427 * guarantees it.
6428 */
6429 assert(end - start == VM_MAP_PAGE_SIZE(map));
6430 wire_and_extract = TRUE;
6431 *physpage_p = 0;
6432 }
6433
6434 VM_MAP_RANGE_CHECK(map, start, end);
6435 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6436 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6437 if (start == end) {
6438 /* We wired what the caller asked for, zero pages */
6439 return KERN_SUCCESS;
6440 }
6441
6442 vm_map_lock(map);
6443 if (map_pmap == NULL) {
6444 main_map = TRUE;
6445 }
6446 last_timestamp = map->timestamp;
6447
6448 need_wakeup = FALSE;
6449 cur_thread = current_thread();
6450
6451 s = start;
6452 rc = KERN_SUCCESS;
6453
6454 if (vm_map_lookup_entry(map, s, &first_entry)) {
6455 entry = first_entry;
6456 /*
6457 * vm_map_clip_start will be done later.
6458 * We don't want to unnest any nested submaps here !
6459 */
6460 } else {
6461 /* Start address is not in map */
6462 rc = KERN_INVALID_ADDRESS;
6463 goto done;
6464 }
6465
6466 while ((entry != vm_map_to_entry(map)) && (s < end)) {
6467 /*
6468 * At this point, we have wired from "start" to "s".
6469 * We still need to wire from "s" to "end".
6470 *
6471 * "entry" hasn't been clipped, so it could start before "s"
6472 * and/or end after "end".
6473 */
6474
6475 /* "e" is how far we want to wire in this entry */
6476 e = entry->vme_end;
6477 if (e > end) {
6478 e = end;
6479 }
6480
6481 /*
6482 * If another thread is wiring/unwiring this entry then
6483 * block after informing the other thread to wake us up.
6484 */
6485 if (entry->in_transition) {
6486 wait_result_t wait_result;
6487
6488 /*
6489 * We have not clipped the entry. Make sure that
6490 * the start address is in range so that the lookup
6491 * below will succeed.
6492 * "s" is the current starting point: we've already
6493 * wired from "start" to "s" and we still have
6494 * to wire from "s" to "end".
6495 */
6496
6497 entry->needs_wakeup = TRUE;
6498
6499 /*
6500 * wake up anybody waiting on entries that we have
6501 * already wired.
6502 */
6503 if (need_wakeup) {
6504 vm_map_entry_wakeup(map);
6505 need_wakeup = FALSE;
6506 }
6507 /*
6508 * User wiring is interruptible
6509 */
6510 wait_result = vm_map_entry_wait(map,
6511 (user_wire) ? THREAD_ABORTSAFE :
6512 THREAD_UNINT);
6513 if (user_wire && wait_result == THREAD_INTERRUPTED) {
6514 /*
6515 * undo the wirings we have done so far
6516 * We do not clear the needs_wakeup flag,
6517 * because we cannot tell if we were the
6518 * only one waiting.
6519 */
6520 rc = KERN_FAILURE;
6521 goto done;
6522 }
6523
6524 /*
6525 * Cannot avoid a lookup here. Reset the timestamp.
6526 */
6527 last_timestamp = map->timestamp;
6528
6529 /*
6530 * The entry could have been clipped, look it up again.
6531 * The worst that can happen is that it may not exist anymore.
6532 */
6533 if (!vm_map_lookup_entry(map, s, &first_entry)) {
6534 /*
6535 * User: undo everything up to the previous
6536 * entry. Let vm_map_unwire worry about
6537 * checking the validity of the range.
6538 */
6539 rc = KERN_FAILURE;
6540 goto done;
6541 }
6542 entry = first_entry;
6543 continue;
6544 }
6545
6546 if (entry->is_sub_map) {
6547 vm_map_offset_t sub_start;
6548 vm_map_offset_t sub_end;
6549 vm_map_offset_t local_start;
6550 vm_map_offset_t local_end;
6551 pmap_t pmap;
6552 vm_map_t sub_map = VM_MAP_NULL;
6553
6554 if (wire_and_extract) {
6555 /*
6556 * Wiring would result in copy-on-write
6557 * which would not be compatible with
6558 * the sharing we have with the original
6559 * provider of this memory.
6560 */
6561 rc = KERN_INVALID_ARGUMENT;
6562 goto done;
6563 }
6564
6565 vm_map_clip_start(map, entry, s);
6566 vm_map_clip_end(map, entry, end);
6567
6568 sub_start = VME_OFFSET(entry);
6569 sub_end = entry->vme_end;
6570 sub_end += VME_OFFSET(entry) - entry->vme_start;
6571
6572 local_end = entry->vme_end;
6573 if (map_pmap == NULL) {
6574 vm_object_t object;
6575 vm_object_offset_t offset;
6576 vm_prot_t prot;
6577 boolean_t wired;
6578 vm_map_entry_t local_entry;
6579 vm_map_version_t version;
6580 vm_map_t lookup_map;
6581
6582 if (entry->use_pmap) {
6583 pmap = VME_SUBMAP(entry)->pmap;
6584 /* ppc implementation requires that */
6585 /* submaps pmap address ranges line */
6586 /* up with parent map */
6587 #ifdef notdef
6588 pmap_addr = sub_start;
6589 #endif
6590 pmap_addr = s;
6591 } else {
6592 pmap = map->pmap;
6593 pmap_addr = s;
6594 }
6595
6596 if (entry->wired_count) {
6597 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6598 goto done;
6599 }
6600
6601 /*
6602 * The map was not unlocked:
6603 * no need to goto re-lookup.
6604 * Just go directly to next entry.
6605 */
6606 entry = entry->vme_next;
6607 s = entry->vme_start;
6608 continue;
6609 }
6610
6611 /* call vm_map_lookup_and_lock_object to */
6612 /* cause any needs copy to be */
6613 /* evaluated */
6614 local_start = entry->vme_start;
6615 lookup_map = map;
6616 vm_map_lock_write_to_read(map);
6617 rc = vm_map_lookup_and_lock_object(
6618 &lookup_map, local_start,
6619 (access_type | extra_prots),
6620 OBJECT_LOCK_EXCLUSIVE,
6621 &version, &object,
6622 &offset, &prot, &wired,
6623 NULL,
6624 &real_map, NULL);
6625 if (rc != KERN_SUCCESS) {
6626 vm_map_unlock_read(lookup_map);
6627 assert(map_pmap == NULL);
6628 vm_map_unwire_nested(map, start,
6629 s, user_wire, PMAP_NULL, 0);
6630 return rc;
6631 }
6632 vm_object_unlock(object);
6633 if (real_map != lookup_map) {
6634 vm_map_unlock(real_map);
6635 }
6636 vm_map_unlock_read(lookup_map);
6637 vm_map_lock(map);
6638
6639 /* we unlocked, so must re-lookup */
6640 if (!vm_map_lookup_entry(map,
6641 local_start,
6642 &local_entry)) {
6643 rc = KERN_FAILURE;
6644 goto done;
6645 }
6646
6647 /*
6648 * entry could have been "simplified",
6649 * so re-clip
6650 */
6651 entry = local_entry;
6652 assert(s == local_start);
6653 vm_map_clip_start(map, entry, s);
6654 vm_map_clip_end(map, entry, end);
6655 /* re-compute "e" */
6656 e = entry->vme_end;
6657 if (e > end) {
6658 e = end;
6659 }
6660
6661 /* did we have a change of type? */
6662 if (!entry->is_sub_map) {
6663 last_timestamp = map->timestamp;
6664 continue;
6665 }
6666 } else {
6667 local_start = entry->vme_start;
6668 pmap = map_pmap;
6669 }
6670
6671 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6672 goto done;
6673 }
6674
6675 entry->in_transition = TRUE;
6676
6677 sub_map = VME_SUBMAP(entry);
6678 vm_map_reference(sub_map);
6679 vm_map_unlock(map);
6680 rc = vm_map_wire_nested(sub_map,
6681 sub_start, sub_end,
6682 caller_prot, tag,
6683 user_wire, pmap, pmap_addr,
6684 NULL);
6685 vm_map_deallocate(sub_map);
6686 sub_map = VM_MAP_NULL;
6687 vm_map_lock(map);
6688
6689 /*
6690 * Find the entry again. It could have been clipped
6691 * after we unlocked the map.
6692 */
6693 if (!vm_map_lookup_entry(map, local_start,
6694 &first_entry)) {
6695 panic("vm_map_wire: re-lookup failed");
6696 }
6697 entry = first_entry;
6698
6699 assert(local_start == s);
6700 /* re-compute "e" */
6701 e = entry->vme_end;
6702 if (e > end) {
6703 e = end;
6704 }
6705
6706 last_timestamp = map->timestamp;
6707 while ((entry != vm_map_to_entry(map)) &&
6708 (entry->vme_start < e)) {
6709 assert(entry->in_transition);
6710 entry->in_transition = FALSE;
6711 if (entry->needs_wakeup) {
6712 entry->needs_wakeup = FALSE;
6713 need_wakeup = TRUE;
6714 }
6715 if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6716 subtract_wire_counts(map, entry, user_wire);
6717 }
6718 entry = entry->vme_next;
6719 }
6720 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
6721 goto done;
6722 }
6723
6724 /* no need to relookup again */
6725 s = entry->vme_start;
6726 continue;
6727 }
6728
6729 /*
6730 * If this entry is already wired then increment
6731 * the appropriate wire reference count.
6732 */
6733 if (entry->wired_count) {
6734 if ((entry->protection & access_type) != access_type) {
6735 /* found a protection problem */
6736
6737 /*
6738 * XXX FBDP
6739 * We should always return an error
6740 * in this case but since we didn't
6741 * enforce it before, let's do
6742 * it only for the new "wire_and_extract"
6743 * code path for now...
6744 */
6745 if (wire_and_extract) {
6746 rc = KERN_PROTECTION_FAILURE;
6747 goto done;
6748 }
6749 }
6750
6751 /*
6752 * entry is already wired down, get our reference
6753 * after clipping to our range.
6754 */
6755 vm_map_clip_start(map, entry, s);
6756 vm_map_clip_end(map, entry, end);
6757
6758 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6759 goto done;
6760 }
6761
6762 if (wire_and_extract) {
6763 vm_object_t object;
6764 vm_object_offset_t offset;
6765 vm_page_t m;
6766
6767 /*
6768 * We don't have to "wire" the page again
6769 * but we still have to "extract" its
6770 * physical page number, after some sanity
6771 * checks.
6772 */
6773 assert((entry->vme_end - entry->vme_start)
6774 == PAGE_SIZE);
6775 assert(!entry->needs_copy);
6776 assert(!entry->is_sub_map);
6777 assert(VME_OBJECT(entry));
6778 if (((entry->vme_end - entry->vme_start)
6779 != PAGE_SIZE) ||
6780 entry->needs_copy ||
6781 entry->is_sub_map ||
6782 VME_OBJECT(entry) == VM_OBJECT_NULL) {
6783 rc = KERN_INVALID_ARGUMENT;
6784 goto done;
6785 }
6786
6787 object = VME_OBJECT(entry);
6788 offset = VME_OFFSET(entry);
6789 /* need exclusive lock to update m->dirty */
6790 if (entry->protection & VM_PROT_WRITE) {
6791 vm_object_lock(object);
6792 } else {
6793 vm_object_lock_shared(object);
6794 }
6795 m = vm_page_lookup(object, offset);
6796 assert(m != VM_PAGE_NULL);
6797 assert(VM_PAGE_WIRED(m));
6798 if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
6799 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6800 if (entry->protection & VM_PROT_WRITE) {
6801 vm_object_lock_assert_exclusive(
6802 object);
6803 m->vmp_dirty = TRUE;
6804 }
6805 } else {
6806 /* not already wired !? */
6807 *physpage_p = 0;
6808 }
6809 vm_object_unlock(object);
6810 }
6811
6812 /* map was not unlocked: no need to relookup */
6813 entry = entry->vme_next;
6814 s = entry->vme_start;
6815 continue;
6816 }
6817
6818 /*
6819 * Unwired entry or wire request transmitted via submap
6820 */
6821
6822 /*
6823 * Wiring would copy the pages to the shadow object.
6824 * The shadow object would not be code-signed so
6825 * attempting to execute code from these copied pages
6826 * would trigger a code-signing violation.
6827 */
6828
6829 if ((entry->protection & VM_PROT_EXECUTE)
6830 #if XNU_TARGET_OS_OSX
6831 &&
6832 map->pmap != kernel_pmap &&
6833 (vm_map_cs_enforcement(map)
6834 #if __arm64__
6835 || !VM_MAP_IS_EXOTIC(map)
6836 #endif /* __arm64__ */
6837 )
6838 #endif /* XNU_TARGET_OS_OSX */
6839 #if CODE_SIGNING_MONITOR
6840 &&
6841 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
6842 #endif
6843 ) {
6844 #if MACH_ASSERT
6845 printf("pid %d[%s] wiring executable range from "
6846 "0x%llx to 0x%llx: rejected to preserve "
6847 "code-signing\n",
6848 proc_selfpid(),
6849 (get_bsdtask_info(current_task())
6850 ? proc_name_address(get_bsdtask_info(current_task()))
6851 : "?"),
6852 (uint64_t) entry->vme_start,
6853 (uint64_t) entry->vme_end);
6854 #endif /* MACH_ASSERT */
6855 DTRACE_VM2(cs_executable_wire,
6856 uint64_t, (uint64_t)entry->vme_start,
6857 uint64_t, (uint64_t)entry->vme_end);
6858 cs_executable_wire++;
6859 rc = KERN_PROTECTION_FAILURE;
6860 goto done;
6861 }
6862
6863 /*
6864 * Perform actions of vm_map_lookup that need the write
6865 * lock on the map: create a shadow object for a
6866 * copy-on-write region, or an object for a zero-fill
6867 * region.
6868 */
6869 size = entry->vme_end - entry->vme_start;
6870 /*
6871 * If wiring a copy-on-write page, we need to copy it now
6872 * even if we're only (currently) requesting read access.
6873 * This is aggressive, but once it's wired we can't move it.
6874 */
6875 if (entry->needs_copy) {
6876 if (wire_and_extract) {
6877 /*
6878 * We're supposed to share with the original
6879 * provider so should not be "needs_copy"
6880 */
6881 rc = KERN_INVALID_ARGUMENT;
6882 goto done;
6883 }
6884
6885 VME_OBJECT_SHADOW(entry, size,
6886 vm_map_always_shadow(map));
6887 entry->needs_copy = FALSE;
6888 } else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6889 if (wire_and_extract) {
6890 /*
6891 * We're supposed to share with the original
6892 * provider so should already have an object.
6893 */
6894 rc = KERN_INVALID_ARGUMENT;
6895 goto done;
6896 }
6897 VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
6898 VME_OFFSET_SET(entry, (vm_object_offset_t)0);
6899 assert(entry->use_pmap);
6900 } else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6901 if (wire_and_extract) {
6902 /*
6903 * We're supposed to share with the original
6904 * provider so should not be COPY_SYMMETRIC.
6905 */
6906 rc = KERN_INVALID_ARGUMENT;
6907 goto done;
6908 }
6909 /*
6910 * Force an unrequested "copy-on-write" but only for
6911 * the range we're wiring.
6912 */
6913 // printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
6914 vm_map_clip_start(map, entry, s);
6915 vm_map_clip_end(map, entry, end);
6916 /* recompute "size" */
6917 size = entry->vme_end - entry->vme_start;
6918 /* make a shadow object */
6919 vm_object_t orig_object;
6920 vm_object_offset_t orig_offset;
6921 orig_object = VME_OBJECT(entry);
6922 orig_offset = VME_OFFSET(entry);
6923 VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
6924 if (VME_OBJECT(entry) != orig_object) {
6925 /*
6926 * This mapping has not been shared (or it would be
6927 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
6928 * not been copied-on-write (or it would be marked
6929 * as "needs_copy" and would have been handled above
6930 * and also already write-protected).
6931 * We still need to write-protect here to prevent
6932 * other threads from modifying these pages while
6933 * we're in the process of copying and wiring
6934 * the copied pages.
6935 * Since the mapping is neither shared nor COWed,
6936 * we only need to write-protect the PTEs for this
6937 * mapping.
6938 */
6939 vm_object_pmap_protect(orig_object,
6940 orig_offset,
6941 size,
6942 map->pmap,
6943 VM_MAP_PAGE_SIZE(map),
6944 entry->vme_start,
6945 entry->protection & ~VM_PROT_WRITE);
6946 }
6947 }
6948 if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6949 /*
6950 * Make the object COPY_DELAY to get a stable object
6951 * to wire.
6952 * That should avoid creating long shadow chains while
6953 * wiring/unwiring the same range repeatedly.
6954 * That also prevents part of the object from being
6955 * wired while another part is "needs_copy", which
6956 * could result in conflicting rules wrt copy-on-write.
6957 */
6958 vm_object_t object;
6959
6960 object = VME_OBJECT(entry);
6961 vm_object_lock(object);
6962 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6963 assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
6964 "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
6965 object, (uint64_t)object->vo_size,
6966 entry,
6967 (uint64_t)entry->vme_start,
6968 (uint64_t)entry->vme_end,
6969 (uint64_t)VME_OFFSET(entry),
6970 (uint64_t)size);
6971 assertf(os_ref_get_count_raw(&object->ref_count) == 1,
6972 "object %p ref_count %d\n",
6973 object, os_ref_get_count_raw(&object->ref_count));
6974 assertf(!entry->needs_copy,
6975 "entry %p\n", entry);
6976 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
6977 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
6978 }
6979 vm_object_unlock(object);
6980 }
6981
6982 vm_map_clip_start(map, entry, s);
6983 vm_map_clip_end(map, entry, end);
6984
6985 /* re-compute "e" */
6986 e = entry->vme_end;
6987 if (e > end) {
6988 e = end;
6989 }
6990
6991 /*
6992 * Check for holes and protection mismatch.
6993 * Holes: Next entry should be contiguous unless this
6994 * is the end of the region.
6995 * Protection: Access requested must be allowed, unless
6996 * wiring is by protection class
6997 */
6998 if ((entry->vme_end < end) &&
6999 ((entry->vme_next == vm_map_to_entry(map)) ||
7000 (entry->vme_next->vme_start > entry->vme_end))) {
7001 /* found a hole */
7002 rc = KERN_INVALID_ADDRESS;
7003 goto done;
7004 }
7005 if ((entry->protection & access_type) != access_type) {
7006 /* found a protection problem */
7007 rc = KERN_PROTECTION_FAILURE;
7008 goto done;
7009 }
7010
7011 assert(entry->wired_count == 0 && entry->user_wired_count == 0);
7012
7013 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
7014 goto done;
7015 }
7016
7017 entry->in_transition = TRUE;
7018
7019 /*
7020 * This entry might get split once we unlock the map.
7021 * In vm_fault_wire(), we need the current range as
7022 * defined by this entry. In order for this to work
7023 * along with a simultaneous clip operation, we make a
7024 * temporary copy of this entry and use that for the
7025 * wiring. Note that the underlying objects do not
7026 * change during a clip.
7027 */
7028 tmp_entry = *entry;
7029
7030 /*
7031 * The in_transition state guarantees that the entry
7032 * (or entries for this range, if a split occurred) will be
7033 * there when the map lock is acquired for the second time.
7034 */
7035 vm_map_unlock(map);
7036
7037 if (!user_wire && cur_thread != THREAD_NULL) {
7038 interruptible_state = thread_interrupt_level(THREAD_UNINT);
7039 } else {
7040 interruptible_state = THREAD_UNINT;
7041 }
7042
7043 if (map_pmap) {
7044 rc = vm_fault_wire(map,
7045 &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
7046 physpage_p);
7047 } else {
7048 rc = vm_fault_wire(map,
7049 &tmp_entry, caller_prot, tag, map->pmap,
7050 tmp_entry.vme_start,
7051 physpage_p);
7052 }
7053
7054 if (!user_wire && cur_thread != THREAD_NULL) {
7055 thread_interrupt_level(interruptible_state);
7056 }
7057
7058 vm_map_lock(map);
7059
7060 if (last_timestamp + 1 != map->timestamp) {
7061 /*
7062 * Find the entry again. It could have been clipped
7063 * after we unlocked the map.
7064 */
7065 if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7066 &first_entry)) {
7067 panic("vm_map_wire: re-lookup failed");
7068 }
7069
7070 entry = first_entry;
7071 }
7072
7073 last_timestamp = map->timestamp;
7074
7075 while ((entry != vm_map_to_entry(map)) &&
7076 (entry->vme_start < tmp_entry.vme_end)) {
7077 assert(entry->in_transition);
7078 entry->in_transition = FALSE;
7079 if (entry->needs_wakeup) {
7080 entry->needs_wakeup = FALSE;
7081 need_wakeup = TRUE;
7082 }
7083 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
7084 subtract_wire_counts(map, entry, user_wire);
7085 }
7086 entry = entry->vme_next;
7087 }
7088
7089 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
7090 goto done;
7091 }
7092
7093 if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
7094 (tmp_entry.vme_end != end) && /* AND, we are not at the end of the requested range */
7095 (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
7096 /* found a "new" hole */
7097 s = tmp_entry.vme_end;
7098 rc = KERN_INVALID_ADDRESS;
7099 goto done;
7100 }
7101
7102 s = entry->vme_start;
7103 } /* end while loop through map entries */
7104
7105 done:
7106 if (rc == KERN_SUCCESS) {
7107 /* repair any damage we may have made to the VM map */
7108 vm_map_simplify_range(map, start, end);
7109 }
7110
7111 vm_map_unlock(map);
7112
7113 /*
7114 * wake up anybody waiting on entries we wired.
7115 */
7116 if (need_wakeup) {
7117 vm_map_entry_wakeup(map);
7118 }
7119
7120 if (rc != KERN_SUCCESS) {
7121 /* undo what has been wired so far */
7122 vm_map_unwire_nested(map, start, s, user_wire,
7123 map_pmap, pmap_addr);
7124 if (physpage_p) {
7125 *physpage_p = 0;
7126 }
7127 }
7128
7129 return rc;
7130 }
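/*
 * Summary of the per-entry bookkeeping used above (descriptive only,
 * derived from this file):
 *
 *	wired_count       - total wire references; bumped for kernel wires
 *	                    and for the first user wire of an entry
 *	user_wired_count  - number of user (mlock-style) wire references
 *	in_transition     - set while the map lock is dropped around
 *	                    vm_fault_wire(), so a concurrent wire/unwire of
 *	                    the same range waits instead of observing a
 *	                    half-wired entry
 */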
7131
7132 static __attribute__((always_inline, warn_unused_result))
7133 kern_return_t
7134 vm_map_wire_sanitize(
7135 vm_map_t map,
7136 vm_map_offset_ut start_u,
7137 vm_map_offset_ut end_u,
7138 vm_prot_ut prot_u,
7139 vm_sanitize_caller_t vm_sanitize_caller,
7140 vm_map_offset_t *start,
7141 vm_map_offset_t *end,
7142 vm_map_size_t *size,
7143 vm_prot_t *prot)
7144 {
7145 kern_return_t kr;
7146
7147 kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7148 VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
7149 size);
7150 if (__improbable(kr != KERN_SUCCESS)) {
7151 return kr;
7152 }
7153
7154 kr = vm_sanitize_prot(prot_u, vm_sanitize_caller, map, prot);
7155 if (__improbable(kr != KERN_SUCCESS)) {
7156 return kr;
7157 }
7158
7159 return KERN_SUCCESS;
7160 }
7161
7162 /*
7163 * Validation function for vm_map_wire_nested().
7164 */
7165 kern_return_t
7166 vm_map_wire_impl(
7167 vm_map_t map,
7168 vm_map_offset_ut start_u,
7169 vm_map_offset_ut end_u,
7170 vm_prot_ut prot_u,
7171 vm_tag_t tag,
7172 boolean_t user_wire,
7173 ppnum_t *physpage_p,
7174 vm_sanitize_caller_t vm_sanitize_caller)
7175 {
7176 vm_map_offset_t start, end;
7177 vm_map_size_t size;
7178 vm_prot_t prot;
7179 kern_return_t kr;
7180
7181 /*
7182 * Sanitize any input parameters that are addr/size/prot/inherit
7183 */
7184 kr = vm_map_wire_sanitize(map,
7185 start_u,
7186 end_u,
7187 prot_u,
7188 vm_sanitize_caller,
7189 &start,
7190 &end,
7191 &size,
7192 &prot);
7193 if (__improbable(kr != KERN_SUCCESS)) {
7194 if (physpage_p) {
7195 *physpage_p = 0;
7196 }
7197 return vm_sanitize_get_kr(kr);
7198 }
7199
7200 return vm_map_wire_nested(map, start, end, prot, tag, user_wire,
7201 PMAP_NULL, 0, physpage_p);
7202 }
7203
7204 kern_return_t
7205 vm_map_wire_external(
7206 vm_map_t map,
7207 vm_map_offset_ut start_u,
7208 vm_map_offset_ut end_u,
7209 vm_prot_ut prot_u,
7210 boolean_t user_wire)
7211 {
7212 vm_tag_t tag = vm_tag_bt();
7213
7214 return vm_map_wire_kernel(map, start_u, end_u, prot_u, tag, user_wire);
7215 }
7216
7217 kern_return_t
7218 vm_map_wire_kernel(
7219 vm_map_t map,
7220 vm_map_offset_ut start_u,
7221 vm_map_offset_ut end_u,
7222 vm_prot_ut prot_u,
7223 vm_tag_t tag,
7224 boolean_t user_wire)
7225 {
7226 return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7227 user_wire, NULL, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7228 }
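/*
 * Illustrative call shape only (argument construction is simplified and
 * the tag is just an example): user mlock(2)-style requests and kernel
 * wiring both funnel through here, differing mainly in user_wire and in
 * the vm_tag_t used for accounting:
 *
 *	kr = vm_map_wire_kernel(map,
 *	    start_u, end_u,         /* unsafe-typed, page-aligned range */
 *	    prot_u,                 /* VM_PROT_NONE skips the access check */
 *	    VM_KERN_MEMORY_MLOCK,   /* example accounting tag */
 *	    TRUE);                  /* user_wire: charge user wire limits */
 */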
7229
7230 #if XNU_PLATFORM_MacOSX
7231
7232 kern_return_t
7233 vm_map_wire_and_extract(
7234 vm_map_t map,
7235 vm_map_offset_ut start_u,
7236 vm_prot_ut prot_u,
7237 boolean_t user_wire,
7238 ppnum_t *physpage_p)
7239 {
7240 vm_tag_t tag = vm_tag_bt();
7241 vm_map_size_ut size_u = vm_sanitize_wrap_size(VM_MAP_PAGE_SIZE(map));
7242 vm_map_offset_ut end_u = vm_sanitize_compute_ut_end(start_u, size_u);
7243
7244 return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7245 user_wire, physpage_p, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7246 }
7247
7248 #endif /* XNU_PLATFORM_MacOSX */
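/*
 * Illustrative only: vm_map_wire_and_extract() wires exactly one VM-map
 * page (end is computed as start + VM_MAP_PAGE_SIZE(map) above) and
 * reports the physical page number backing it:
 *
 *	ppnum_t pn = 0;
 *
 *	kr = vm_map_wire_and_extract(map, addr_u, prot_u, TRUE, &pn);
 *	if (kr == KERN_SUCCESS && pn != 0) {
 *		... pn is the wired page's physical page number ...
 *	}
 *
 * On any failure the routine guarantees *physpage_p is reset to 0.
 */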
7249
7250 static kern_return_t
7251 vm_map_unwire_nested(
7252 vm_map_t map,
7253 vm_map_offset_t start,
7254 vm_map_offset_t end,
7255 boolean_t user_wire,
7256 pmap_t map_pmap,
7257 vm_map_offset_t pmap_addr)
7258 {
7259 vm_map_entry_t entry;
7260 struct vm_map_entry *first_entry, tmp_entry;
7261 boolean_t need_wakeup;
7262 boolean_t main_map = FALSE;
7263 unsigned int last_timestamp;
7264
7265 VM_MAP_RANGE_CHECK(map, start, end);
7266 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
7267 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
7268
7269 if (start == end) {
7270 /* We unwired what the caller asked for: zero pages */
7271 return KERN_SUCCESS;
7272 }
7273
7274 vm_map_lock(map);
7275 if (map_pmap == NULL) {
7276 main_map = TRUE;
7277 }
7278 last_timestamp = map->timestamp;
7279
7280 if (vm_map_lookup_entry(map, start, &first_entry)) {
7281 entry = first_entry;
7282 /*
7283 * vm_map_clip_start will be done later.
7284 * We don't want to unnest any nested sub maps here !
7285 */
7286 } else {
7287 if (!user_wire) {
7288 panic("vm_map_unwire: start not found");
7289 }
7290 /* Start address is not in map. */
7291 vm_map_unlock(map);
7292 return KERN_INVALID_ADDRESS;
7293 }
7294
7295 if (entry->superpage_size) {
7296 /* superpages are always wired */
7297 vm_map_unlock(map);
7298 return KERN_INVALID_ADDRESS;
7299 }
7300
7301 need_wakeup = FALSE;
7302 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
7303 if (entry->in_transition) {
7304 /*
7305 * 1)
7306 * Another thread is wiring down this entry. Note
7307 * that if it is not for the other thread we would
7308 * be unwiring an unwired entry. This is not
7309 * permitted. If we wait, we will be unwiring memory
7310 * we did not wire.
7311 *
7312 * 2)
7313 * Another thread is unwiring this entry. We did not
7314 * have a reference to it, because if we did, this
7315 * entry will not be getting unwired now.
7316 */
7317 if (!user_wire) {
7318 /*
7319 * XXX FBDP
7320 * This could happen: there could be some
7321 * overlapping vslock/vsunlock operations
7322 * going on.
7323 * We should probably just wait and retry,
7324 * but then we have to be careful that this
7325 * entry could get "simplified" after
7326 * "in_transition" gets unset and before
7327 * we re-lookup the entry, so we would
7328 * have to re-clip the entry to avoid
7329 * re-unwiring what we have already unwired...
7330 * See vm_map_wire_nested().
7331 *
7332 * Or we could just ignore "in_transition"
7333 * here and proceed to decrement the wired
7334 * count(s) on this entry. That should be fine
7335 * as long as "wired_count" doesn't drop all
7336 * the way to 0 (and we should panic if THAT
7337 * happens).
7338 */
7339 panic("vm_map_unwire: in_transition entry");
7340 }
7341
7342 entry = entry->vme_next;
7343 continue;
7344 }
7345
7346 if (entry->is_sub_map) {
7347 vm_map_offset_t sub_start;
7348 vm_map_offset_t sub_end;
7349 vm_map_offset_t local_end;
7350 pmap_t pmap;
7351 vm_map_t sub_map = VM_MAP_NULL;
7352
7353 vm_map_clip_start(map, entry, start);
7354 vm_map_clip_end(map, entry, end);
7355
7356 sub_start = VME_OFFSET(entry);
7357 sub_end = entry->vme_end - entry->vme_start;
7358 sub_end += VME_OFFSET(entry);
7359 local_end = entry->vme_end;
7360 if (map_pmap == NULL) {
7361 if (entry->use_pmap) {
7362 pmap = VME_SUBMAP(entry)->pmap;
7363 pmap_addr = sub_start;
7364 } else {
7365 pmap = map->pmap;
7366 pmap_addr = start;
7367 }
7368 if (entry->wired_count == 0 ||
7369 (user_wire && entry->user_wired_count == 0)) {
7370 if (!user_wire) {
7371 panic("vm_map_unwire: entry is unwired");
7372 }
7373 entry = entry->vme_next;
7374 continue;
7375 }
7376
7377 /*
7378 * Check for holes
7379 * Holes: Next entry should be contiguous unless
7380 * this is the end of the region.
7381 */
7382 if (((entry->vme_end < end) &&
7383 ((entry->vme_next == vm_map_to_entry(map)) ||
7384 (entry->vme_next->vme_start
7385 > entry->vme_end)))) {
7386 if (!user_wire) {
7387 panic("vm_map_unwire: non-contiguous region");
7388 }
7389 /*
7390 * entry = entry->vme_next;
7391 * continue;
7392 */
7393 }
7394
7395 subtract_wire_counts(map, entry, user_wire);
7396
7397 if (entry->wired_count != 0) {
7398 entry = entry->vme_next;
7399 continue;
7400 }
7401
7402 entry->in_transition = TRUE;
7403 tmp_entry = *entry;/* see comment in vm_map_wire() */
7404
7405 /*
7406 * We can unlock the map now. The in_transition state
7407 * guarantees existence of the entry.
7408 */
7409 sub_map = VME_SUBMAP(entry);
7410 vm_map_reference(sub_map);
7411 vm_map_unlock(map);
7412 vm_map_unwire_nested(sub_map,
7413 sub_start, sub_end, user_wire, pmap, pmap_addr);
7414 vm_map_deallocate(sub_map);
7415 sub_map = VM_MAP_NULL;
7416 vm_map_lock(map);
7417
7418 if (last_timestamp + 1 != map->timestamp) {
7419 /*
7420 * Find the entry again. It could have been
7421 * clipped or deleted after we unlocked the map.
7422 */
7423 if (!vm_map_lookup_entry(map,
7424 tmp_entry.vme_start,
7425 &first_entry)) {
7426 if (!user_wire) {
7427 panic("vm_map_unwire: re-lookup failed");
7428 }
7429 entry = first_entry->vme_next;
7430 } else {
7431 entry = first_entry;
7432 }
7433 }
7434 last_timestamp = map->timestamp;
7435
7436 /*
7437 * clear transition bit for all constituent entries
7438 * that were in the original entry (saved in
7439 * tmp_entry). Also check for waiters.
7440 */
7441 while ((entry != vm_map_to_entry(map)) &&
7442 (entry->vme_start < tmp_entry.vme_end)) {
7443 assert(entry->in_transition);
7444 entry->in_transition = FALSE;
7445 if (entry->needs_wakeup) {
7446 entry->needs_wakeup = FALSE;
7447 need_wakeup = TRUE;
7448 }
7449 entry = entry->vme_next;
7450 }
7451 continue;
7452 } else {
7453 tmp_entry = *entry;
7454 sub_map = VME_SUBMAP(entry);
7455 vm_map_reference(sub_map);
7456 vm_map_unlock(map);
7457 vm_map_unwire_nested(sub_map,
7458 sub_start, sub_end, user_wire, map_pmap,
7459 pmap_addr);
7460 vm_map_deallocate(sub_map);
7461 sub_map = VM_MAP_NULL;
7462 vm_map_lock(map);
7463
7464 if (last_timestamp + 1 != map->timestamp) {
7465 /*
7466 * Find the entry again. It could have been
7467 * clipped or deleted after we unlocked the map.
7468 */
7469 if (!vm_map_lookup_entry(map,
7470 tmp_entry.vme_start,
7471 &first_entry)) {
7472 if (!user_wire) {
7473 panic("vm_map_unwire: re-lookup failed");
7474 }
7475 entry = first_entry->vme_next;
7476 } else {
7477 entry = first_entry;
7478 }
7479 }
7480 last_timestamp = map->timestamp;
7481 }
7482 }
7483
7484
7485 if ((entry->wired_count == 0) ||
7486 (user_wire && entry->user_wired_count == 0)) {
7487 if (!user_wire) {
7488 panic("vm_map_unwire: entry is unwired");
7489 }
7490
7491 entry = entry->vme_next;
7492 continue;
7493 }
7494
7495 assert(entry->wired_count > 0 &&
7496 (!user_wire || entry->user_wired_count > 0));
7497
7498 vm_map_clip_start(map, entry, start);
7499 vm_map_clip_end(map, entry, end);
7500
7501 /*
7502 * Check for holes
7503 * Holes: Next entry should be contiguous unless
7504 * this is the end of the region.
7505 */
7506 if (((entry->vme_end < end) &&
7507 ((entry->vme_next == vm_map_to_entry(map)) ||
7508 (entry->vme_next->vme_start > entry->vme_end)))) {
7509 if (!user_wire) {
7510 panic("vm_map_unwire: non-contiguous region");
7511 }
7512 /*
7513 * entry = entry->vme_next;
7514 * continue;
7515 */
7516 }
7517
7518 subtract_wire_counts(map, entry, user_wire);
7519
7520 if (entry->wired_count != 0) {
7521 entry = entry->vme_next;
7522 continue;
7523 }
7524
7525 if (entry->zero_wired_pages) {
7526 entry->zero_wired_pages = FALSE;
7527 }
7528
7529 entry->in_transition = TRUE;
7530 tmp_entry = *entry; /* see comment in vm_map_wire() */
7531
7532 /*
7533 * We can unlock the map now. The in_transition state
7534 * guarantees existence of the entry.
7535 */
7536 vm_map_unlock(map);
7537 if (map_pmap) {
7538 vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
7539 pmap_addr, tmp_entry.vme_end);
7540 } else {
7541 vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
7542 tmp_entry.vme_start, tmp_entry.vme_end);
7543 }
7544 vm_map_lock(map);
7545
7546 if (last_timestamp + 1 != map->timestamp) {
7547 /*
7548 * Find the entry again. It could have been clipped
7549 * or deleted after we unlocked the map.
7550 */
7551 if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7552 &first_entry)) {
7553 if (!user_wire) {
7554 panic("vm_map_unwire: re-lookup failed");
7555 }
7556 entry = first_entry->vme_next;
7557 } else {
7558 entry = first_entry;
7559 }
7560 }
7561 last_timestamp = map->timestamp;
7562
7563 /*
7564 * clear transition bit for all constituent entries that
7565 * were in the original entry (saved in tmp_entry). Also
7566 * check for waiters.
7567 */
7568 while ((entry != vm_map_to_entry(map)) &&
7569 (entry->vme_start < tmp_entry.vme_end)) {
7570 assert(entry->in_transition);
7571 entry->in_transition = FALSE;
7572 if (entry->needs_wakeup) {
7573 entry->needs_wakeup = FALSE;
7574 need_wakeup = TRUE;
7575 }
7576 entry = entry->vme_next;
7577 }
7578 }
7579
7580 /*
7581 * We might have fragmented the address space when we wired this
7582 * range of addresses. Attempt to re-coalesce these VM map entries
7583 * with their neighbors now that they're no longer wired.
7584 * Under some circumstances, address space fragmentation can
7585 * prevent VM object shadow chain collapsing, which can cause
7586 * swap space leaks.
7587 */
7588 vm_map_simplify_range(map, start, end);
7589
7590 vm_map_unlock(map);
7591 /*
7592 * wake up anybody waiting on entries that we have unwired.
7593 */
7594 if (need_wakeup) {
7595 vm_map_entry_wakeup(map);
7596 }
7597 return KERN_SUCCESS;
7598 }
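
/*
 * A condensed sketch (not compiled into the build) of the lock-drop
 * idiom used above: mark the entry "in_transition" and copy it locally
 * before dropping the map lock, then compare timestamps on
 * re-acquisition and re-lookup if the map changed, since the entry may
 * have been clipped or simplified in the meantime.  Names are
 * illustrative; the real code also enforces the user/kernel wiring rules.
 */
#if 0
static void
example_in_transition_window(vm_map_t map, vm_map_entry_t entry)
{
	struct vm_map_entry tmp_entry;
	vm_map_entry_t      first_entry;
	unsigned int        last_timestamp;
	boolean_t           need_wakeup = FALSE;

	/* map is locked on entry */
	entry->in_transition = TRUE;
	tmp_entry = *entry;             /* stable copy of the range */
	last_timestamp = map->timestamp;
	vm_map_unlock(map);

	/* ... long-running work on [tmp_entry.vme_start, vme_end) ... */

	vm_map_lock(map);
	if (last_timestamp + 1 != map->timestamp) {
		/* someone else changed the map: find the entry again */
		if (vm_map_lookup_entry(map, tmp_entry.vme_start, &first_entry)) {
			entry = first_entry;
		} else {
			entry = first_entry->vme_next;
		}
	}
	/* clear in_transition on every constituent entry of the range */
	while (entry != vm_map_to_entry(map) &&
	    entry->vme_start < tmp_entry.vme_end) {
		entry->in_transition = FALSE;
		if (entry->needs_wakeup) {
			entry->needs_wakeup = FALSE;
			need_wakeup = TRUE;
		}
		entry = entry->vme_next;
	}
	vm_map_unlock(map);
	if (need_wakeup) {
		vm_map_entry_wakeup(map);
	}
}
#endif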
7599
7600 kern_return_t
7601 vm_map_unwire(
7602 vm_map_t map,
7603 vm_map_offset_ut start_u,
7604 vm_map_offset_ut end_u,
7605 boolean_t user_wire)
7606 {
7607 return vm_map_unwire_impl(map, start_u, end_u, user_wire,
7608 VM_SANITIZE_CALLER_VM_MAP_UNWIRE);
7609 }
7610
7611 static __attribute__((always_inline, warn_unused_result))
7612 kern_return_t
7613 vm_map_unwire_sanitize(
7614 vm_map_t map,
7615 vm_map_offset_ut start_u,
7616 vm_map_offset_ut end_u,
7617 vm_sanitize_caller_t vm_sanitize_caller,
7618 vm_map_offset_t *start,
7619 vm_map_offset_t *end,
7620 vm_map_size_t *size)
7621 {
7622 return vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7623 VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
7624 size);
7625 }
7626
7627 kern_return_t
7628 vm_map_unwire_impl(
7629 vm_map_t map,
7630 vm_map_offset_ut start_u,
7631 vm_map_offset_ut end_u,
7632 boolean_t user_wire,
7633 vm_sanitize_caller_t vm_sanitize_caller)
7634 {
7635 vm_map_offset_t start, end;
7636 vm_map_size_t size;
7637 kern_return_t kr;
7638
7639 /*
7640 * Sanitize any input parameters that are addr/size/prot/inherit
7641 */
7642 kr = vm_map_unwire_sanitize(
7643 map,
7644 start_u,
7645 end_u,
7646 vm_sanitize_caller,
7647 &start,
7648 &end,
7649 &size);
7650 if (__improbable(kr != KERN_SUCCESS)) {
7651 return vm_sanitize_get_kr(kr);
7652 }
7653
7654 return vm_map_unwire_nested(map, start, end,
7655 user_wire, (pmap_t)NULL, 0);
7656 }
7657
7658
7659 /*
7660 * vm_map_entry_zap: [ internal use only ]
7661 *
7662 * Remove the entry from the target map
7663 * and put it on a zap list.
7664 */
7665 static void
7666 vm_map_entry_zap(
7667 vm_map_t map,
7668 vm_map_entry_t entry,
7669 vm_map_zap_t zap)
7670 {
7671 vm_map_offset_t s, e;
7672
7673 s = entry->vme_start;
7674 e = entry->vme_end;
7675 assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7676 assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7677 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7678 assert(page_aligned(s));
7679 assert(page_aligned(e));
7680 }
7681 if (entry->map_aligned == TRUE) {
7682 assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7683 assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7684 }
7685 assert(entry->wired_count == 0);
7686 assert(entry->user_wired_count == 0);
7687 assert(!entry->vme_permanent);
7688
7689 vm_map_store_entry_unlink(map, entry, false);
7690 map->size -= e - s;
7691
7692 vm_map_zap_append(zap, entry);
7693 }
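
/*
 * A sketch (not compiled into the build) of the zap-list workflow that
 * vm_map_entry_zap() feeds, mirroring vm_map_remove_and_unlock() further
 * below: entries are only unlinked and queued while the map lock is
 * held; the actual teardown happens in vm_map_zap_dispose() once the
 * lock has been dropped, so no freeing work runs under the lock.
 */
#if 0
static void
example_zap_workflow(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	VM_MAP_ZAP_DECLARE(zap);

	vm_map_lock(map);
	(void)vm_map_delete(map, start, end, VM_MAP_REMOVE_NO_FLAGS,
	    KMEM_GUARD_NONE, &zap);
	vm_map_unlock(map);

	/* free the zapped entries now that the lock is no longer held */
	vm_map_zap_dispose(&zap);
}
#endif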
7694
7695 static void
7696 vm_map_submap_pmap_clean(
7697 vm_map_t map,
7698 vm_map_offset_t start,
7699 vm_map_offset_t end,
7700 vm_map_t sub_map,
7701 vm_map_offset_t offset)
7702 {
7703 vm_map_offset_t submap_start;
7704 vm_map_offset_t submap_end;
7705 vm_map_size_t remove_size;
7706 vm_map_entry_t entry;
7707
7708 submap_end = offset + (end - start);
7709 submap_start = offset;
7710
7711 vm_map_lock_read(sub_map);
7712 if (vm_map_lookup_entry(sub_map, offset, &entry)) {
7713 remove_size = (entry->vme_end - entry->vme_start);
7714 if (offset > entry->vme_start) {
7715 remove_size -= offset - entry->vme_start;
7716 }
7717
7718
7719 if (submap_end < entry->vme_end) {
7720 remove_size -=
7721 entry->vme_end - submap_end;
7722 }
7723 if (entry->is_sub_map) {
7724 vm_map_submap_pmap_clean(
7725 sub_map,
7726 start,
7727 start + remove_size,
7728 VME_SUBMAP(entry),
7729 VME_OFFSET(entry));
7730 } else {
7731 if (map->mapped_in_other_pmaps &&
7732 os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7733 VME_OBJECT(entry) != NULL) {
7734 vm_object_pmap_protect_options(
7735 VME_OBJECT(entry),
7736 (VME_OFFSET(entry) +
7737 offset -
7738 entry->vme_start),
7739 remove_size,
7740 PMAP_NULL,
7741 PAGE_SIZE,
7742 entry->vme_start,
7743 VM_PROT_NONE,
7744 PMAP_OPTIONS_REMOVE);
7745 } else {
7746 pmap_remove(map->pmap,
7747 (addr64_t)start,
7748 (addr64_t)(start + remove_size));
7749 }
7750 }
7751 }
7752
7753 entry = entry->vme_next;
7754
7755 while ((entry != vm_map_to_entry(sub_map))
7756 && (entry->vme_start < submap_end)) {
7757 remove_size = (entry->vme_end - entry->vme_start);
7758 if (submap_end < entry->vme_end) {
7759 remove_size -= entry->vme_end - submap_end;
7760 }
7761 if (entry->is_sub_map) {
7762 vm_map_submap_pmap_clean(
7763 sub_map,
7764 (start + entry->vme_start) - offset,
7765 ((start + entry->vme_start) - offset) + remove_size,
7766 VME_SUBMAP(entry),
7767 VME_OFFSET(entry));
7768 } else {
7769 if (map->mapped_in_other_pmaps &&
7770 os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7771 VME_OBJECT(entry) != NULL) {
7772 vm_object_pmap_protect_options(
7773 VME_OBJECT(entry),
7774 VME_OFFSET(entry),
7775 remove_size,
7776 PMAP_NULL,
7777 PAGE_SIZE,
7778 entry->vme_start,
7779 VM_PROT_NONE,
7780 PMAP_OPTIONS_REMOVE);
7781 } else {
7782 pmap_remove(map->pmap,
7783 (addr64_t)((start + entry->vme_start)
7784 - offset),
7785 (addr64_t)(((start + entry->vme_start)
7786 - offset) + remove_size));
7787 }
7788 }
7789 entry = entry->vme_next;
7790 }
7791 vm_map_unlock_read(sub_map);
7792 return;
7793 }
7794
7795 /*
7796 * virt_memory_guard_ast:
7797 *
7798 * Handle the AST callout for a virtual memory guard.
7799 * Raise an EXC_GUARD exception and terminate the task
7800 * if configured to do so.
7801 */
7802 void
7803 virt_memory_guard_ast(
7804 thread_t thread,
7805 mach_exception_data_type_t code,
7806 mach_exception_data_type_t subcode)
7807 {
7808 task_t task = get_threadtask(thread);
7809 assert(task != kernel_task);
7810 assert(task == current_task());
7811 kern_return_t sync_exception_result;
7812 uint32_t behavior;
7813
7814 behavior = task->task_exc_guard;
7815
7816
7817 /* Is delivery enabled */
7818 if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7819 return;
7820 }
7821
7822 /* If only once, make sure we're that once */
7823 while (behavior & TASK_EXC_GUARD_VM_ONCE) {
7824 uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;
7825
7826 if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
7827 break;
7828 }
7829 behavior = task->task_exc_guard;
7830 if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7831 return;
7832 }
7833 }
7834
7835 const bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
7836 /* Raise exception synchronously and see if handler claimed it */
7837 sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode, fatal);
7838
7839 if (fatal) {
7840 /*
7841 * If Synchronous EXC_GUARD delivery was successful then
7842 * kill the process and return, else kill the process
7843 * and deliver the exception via EXC_CORPSE_NOTIFY.
7844 */
7845
7846
7847 int flags = PX_DEBUG_NO_HONOR;
7848 exception_info_t info = {
7849 .os_reason = OS_REASON_GUARD,
7850 .exception_type = EXC_GUARD,
7851 .mx_code = code,
7852 .mx_subcode = subcode
7853 };
7854
7855 if (sync_exception_result == KERN_SUCCESS) {
7856 flags |= PX_PSIGNAL;
7857 }
7858 exit_with_mach_exception(current_proc(), info, flags);
7859 } else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
7860 /*
7861 * If the synchronous EXC_GUARD delivery was not successful,
7862 * raise a simulated crash.
7863 */
7864 if (sync_exception_result != KERN_SUCCESS) {
7865 task_violated_guard(code, subcode, NULL, FALSE);
7866 }
7867 }
7868 }
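
/*
 * The TASK_EXC_GUARD_VM_ONCE handling above is an instance of a common
 * compare-and-swap pattern: atomically clear the "deliver" bit so that
 * exactly one thread gets to report the violation.  A stand-alone
 * sketch (not compiled into the build; the helper name is illustrative):
 */
#if 0
static bool
example_claim_single_delivery(uint32_t *flags_p)
{
	uint32_t flags = *flags_p;

	while (flags & TASK_EXC_GUARD_VM_ONCE) {
		uint32_t new_flags = flags & ~TASK_EXC_GUARD_VM_DELIVER;

		if (OSCompareAndSwap(flags, new_flags, flags_p)) {
			/* we cleared the bit: we own the single delivery */
			return true;
		}
		/* lost the race: re-read and re-check */
		flags = *flags_p;
		if ((flags & TASK_EXC_GUARD_VM_DELIVER) == 0) {
			return false;
		}
	}
	/* ONCE not requested: deliver whenever the bit is still set */
	return (flags & TASK_EXC_GUARD_VM_DELIVER) != 0;
}
#endif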
7869
7870 /*
7871 * Validate policy for VM guard exceptions and encode the correct Mach exception
7872 * code and subcode if the policy allows delivering a guard exception here.
7873 */
7874 static bool
7875 vm_map_guard_exception_internal(
7876 vm_map_offset_t address,
7877 unsigned reason,
7878 mach_exception_code_t *code,
7879 mach_exception_data_type_t *subcode)
7880 {
7881 unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7882 unsigned int target = 0; /* should we pass in pid associated with map? */
7883
7884 task_t task = current_task_early();
7885
7886 /* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7887 if (task == NULL || task == kernel_task) {
7888 return false;
7889 }
7890
7891
7892 *code = 0;
7893 EXC_GUARD_ENCODE_TYPE(*code, guard_type);
7894 EXC_GUARD_ENCODE_FLAVOR(*code, reason);
7895 EXC_GUARD_ENCODE_TARGET(*code, target);
7896 *subcode = (uint64_t)address;
7897
7898 return true;
7899 }
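
/*
 * Worked example (not compiled into the build): what the encoding above
 * produces for the one flavor raised from this file.  A deallocate-gap
 * report at address "addr" yields:
 *
 *     code    - an EXC_GUARD code carrying type GUARD_TYPE_VIRT_MEMORY,
 *               flavor kGUARD_EXC_DEALLOC_GAP and target 0
 *     subcode - (uint64_t)addr
 */
#if 0
static void
example_encode_dealloc_gap(
	vm_map_offset_t             addr,
	mach_exception_code_t      *code,
	mach_exception_data_type_t *subcode)
{
	*code = 0;
	EXC_GUARD_ENCODE_TYPE(*code, GUARD_TYPE_VIRT_MEMORY);
	EXC_GUARD_ENCODE_FLAVOR(*code, kGUARD_EXC_DEALLOC_GAP);
	EXC_GUARD_ENCODE_TARGET(*code, 0);
	*subcode = (uint64_t)addr;
}
#endif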
7900
7901 /*
7902 * vm_map_guard_exception:
7903 *
7904 * Generate a GUARD_TYPE_VIRT_MEMORY EXC_GUARD exception.
7905 *
7906 * `reason` is kGUARD_EXC_DEALLOC_GAP when we find nothing mapped,
7907 * or if there is a gap in the mapping when a user address space
7908 * was requested. We report the address of the first gap found.
7909 */
7910
7911 void
7912 vm_map_guard_exception(
7913 vm_map_offset_t address,
7914 unsigned reason)
7915 {
7916 mach_exception_code_t code;
7917 mach_exception_data_type_t subcode;
7918 if (vm_map_guard_exception_internal(address, reason, &code, &subcode)) {
7919 task_t task = current_task();
7920 bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
7921
7922 thread_guard_violation(current_thread(), code, subcode, fatal);
7923 }
7924 }
7925
7926
7927 static kern_return_t
7928 vm_map_delete_submap_recurse(
7929 vm_map_t submap,
7930 vm_map_offset_t submap_start,
7931 vm_map_offset_t submap_end)
7932 {
7933 vm_map_entry_t submap_entry;
7934
7935 /*
7936 * Verify that the submap does not contain any "permanent" entries
7937 * within the specified range. We permit TPRO ranges to be overwritten
7938 * as we only reach this path if TPRO const protection is disabled for a
7939 * given map.
7940 *
7941 * We do not care about gaps.
7942 */
7943
7944 vm_map_lock(submap);
7945
7946 if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
7947 submap_entry = submap_entry->vme_next;
7948 }
7949
7950 for (;
7951 submap_entry != vm_map_to_entry(submap) &&
7952 submap_entry->vme_start < submap_end;
7953 submap_entry = submap_entry->vme_next) {
7954 if (submap_entry->vme_permanent
7955 #ifdef __arm64e__
7956 /* allow TPRO submap entries to be overwritten */
7957 && !submap_entry->used_for_tpro
7958 #endif
7959 ) {
7960 /* "permanent" entry -> fail */
7961 vm_map_unlock(submap);
7962 return KERN_PROTECTION_FAILURE;
7963 }
7964 }
7965 /* no "permanent" entries in the range -> success */
7966 vm_map_unlock(submap);
7967 return KERN_SUCCESS;
7968 }
7969
7970 __abortlike
7971 static void
7972 __vm_map_delete_misaligned_panic(
7973 vm_map_t map,
7974 vm_map_offset_t start,
7975 vm_map_offset_t end)
7976 {
7977 panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
7978 map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
7979 }
7980
7981 __abortlike
7982 static void
7983 __vm_map_delete_failed_panic(
7984 vm_map_t map,
7985 vm_map_offset_t start,
7986 vm_map_offset_t end,
7987 kern_return_t kr)
7988 {
7989 panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
7990 map, (uint64_t)start, (uint64_t)end, kr);
7991 }
7992
7993 __abortlike
7994 static void
7995 __vm_map_delete_gap_panic(
7996 vm_map_t map,
7997 vm_map_offset_t where,
7998 vm_map_offset_t start,
7999 vm_map_offset_t end)
8000 {
8001 panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
8002 map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
8003 }
8004
8005 __abortlike
8006 static void
8007 __vm_map_delete_permanent_panic(
8008 vm_map_t map,
8009 vm_map_offset_t start,
8010 vm_map_offset_t end,
8011 vm_map_entry_t entry)
8012 {
8013 panic("vm_map_delete(%p,0x%llx,0x%llx): "
8014 "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
8015 map, (uint64_t)start, (uint64_t)end, entry,
8016 (uint64_t)entry->vme_start,
8017 (uint64_t)entry->vme_end);
8018 }
8019
8020 __options_decl(vm_map_delete_state_t, uint32_t, {
8021 VMDS_NONE = 0x0000,
8022
8023 VMDS_FOUND_GAP = 0x0001,
8024 VMDS_GAPS_OK = 0x0002,
8025
8026 VMDS_KERNEL_PMAP = 0x0004,
8027 VMDS_NEEDS_LOOKUP = 0x0008,
8028 VMDS_NEEDS_WAKEUP = 0x0010,
8029 VMDS_KERNEL_KMEMPTR = 0x0020
8030 });
8031
8032 /*
8033 * vm_map_clamp_to_pmap(map, start, end)
8034 *
8035 * Modify *start and *end so they fall within the bounds of map->pmap.
8036 */
8037 #if MACH_ASSERT
8038 static void
8039 vm_map_clamp_to_pmap(vm_map_t map, vm_map_address_t *start, vm_map_address_t *end)
8040 {
8041 vm_map_address_t min;
8042 vm_map_address_t max;
8043
8044 #if __x86_64__
8045 /* x86_64 struct pmap does not have min and max fields */
8046 if (map->pmap == kernel_pmap) {
8047 min = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
8048 max = VM_MAX_KERNEL_ADDRESS;
8049 } else {
8050 min = VM_MAP_MIN_ADDRESS;
8051 max = VM_MAP_MAX_ADDRESS;
8052 }
8053 #else
8054 min = map->pmap->min;
8055 max = map->pmap->max;
8056 #endif
8057
8058 if (*start < min) {
8059 *start = min;
8060 } else if (*start > max) {
8061 *start = max;
8062 }
8063 if (*end < min) {
8064 *end = min;
8065 } else if (*end > max) {
8066 *end = max;
8067 }
8068 }
8069 #endif
8070
8071 int vm_log_map_delete_permanent_prot_none = 0;
8072 /*
8073 * vm_map_delete: [ internal use only ]
8074 *
8075 * Deallocates the given address range from the target map.
8076 * Removes all user wirings. Unwires one kernel wiring if
8077 * VM_MAP_REMOVE_KUNWIRE is set. Waits for kernel wirings to go
8078 * away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set. Sleeps
8079 * interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
8080 *
8081 *
8082 * When the map is a kernel map, then any error in removing mappings
8083 * will lead to a panic so that clients do not have to repeat the panic
8084 * code at each call site. If VM_MAP_REMOVE_INTERRUPTIBLE
8085 * is also passed, then KERN_ABORTED will not lead to a panic.
8086 *
8087 * This routine is called with map locked and leaves map locked.
8088 */
8089 static kmem_return_t
8090 vm_map_delete(
8091 vm_map_t map,
8092 vm_map_offset_t start,
8093 vm_map_offset_t end,
8094 vmr_flags_t flags,
8095 kmem_guard_t guard,
8096 vm_map_zap_t zap_list)
8097 {
8098 vm_map_entry_t entry, next;
8099 int interruptible;
8100 vm_map_offset_t gap_start = 0;
8101 vm_map_offset_t clear_in_transition_end = 0;
8102 __unused vm_map_offset_t save_start = start;
8103 __unused vm_map_offset_t save_end = end;
8104 vm_map_delete_state_t state = VMDS_NONE;
8105 kmem_return_t ret = { };
8106 vm_map_range_id_t range_id = 0;
8107 struct kmem_page_meta *meta = NULL;
8108 uint32_t size_idx, slot_idx;
8109 struct mach_vm_range slot;
8110
8111 if (vm_map_pmap(map) == kernel_pmap) {
8112 state |= VMDS_KERNEL_PMAP;
8113 range_id = kmem_addr_get_range(start, end - start);
8114 if (kmem_is_ptr_range(range_id)) {
8115 state |= VMDS_KERNEL_KMEMPTR;
8116 slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
8117 &size_idx, &slot);
8118 }
8119 }
8120
8121 if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
8122 state |= VMDS_GAPS_OK;
8123 }
8124
8125 if (map->corpse_source &&
8126 !(flags & VM_MAP_REMOVE_TO_OVERWRITE) &&
8127 !map->terminated) {
8128 /*
8129 * The map is being used for corpses related diagnostics.
8130 * So skip any entry removal to avoid perturbing the map state.
8131 * The cleanup will happen in task_terminate_internal after the
8132 * call to task_port_no_senders.
8133 */
8134 goto out;
8135 }
8136
8137 interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
8138 THREAD_ABORTSAFE : THREAD_UNINT;
8139
8140 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
8141 (start & VM_MAP_PAGE_MASK(map))) {
8142 __vm_map_delete_misaligned_panic(map, start, end);
8143 }
8144
8145 if ((state & VMDS_GAPS_OK) == 0) {
8146 /*
8147 * If the map isn't terminated then all deletions must have
8148 * no gaps, and be within the [min, max) of the map.
8149 *
8150 * We got here without VM_MAP_RANGE_CHECK() being called,
8151 * and hence must validate bounds manually.
8152 *
8153 * It is worth noting that because vm_deallocate() will
8154 * round_page() the deallocation size, it's possible for "end"
8155 * to be 0 here due to overflow. We hence must treat it as being
8156 * beyond vm_map_max(map).
8157 *
8158 * Similarly, end < start means some wrap around happened,
8159 * which should cause an error or panic.
8160 */
8161 if (end == 0 || end > vm_map_max(map)) {
8162 state |= VMDS_FOUND_GAP;
8163 gap_start = vm_map_max(map);
8164 if (state & VMDS_KERNEL_PMAP) {
8165 __vm_map_delete_gap_panic(map,
8166 gap_start, start, end);
8167 }
8168 goto out;
8169 }
8170
8171 if (end < start) {
8172 if (state & VMDS_KERNEL_PMAP) {
8173 __vm_map_delete_gap_panic(map,
8174 vm_map_max(map), start, end);
8175 }
8176 ret.kmr_return = KERN_INVALID_ARGUMENT;
8177 goto out;
8178 }
8179
8180 if (start < vm_map_min(map)) {
8181 state |= VMDS_FOUND_GAP;
8182 gap_start = start;
8183 if (state & VMDS_KERNEL_PMAP) {
8184 __vm_map_delete_gap_panic(map,
8185 gap_start, start, end);
8186 }
8187 goto out;
8188 }
8189 } else {
8190 /*
8191 * If the map is terminated, we must accept start/end
8192 * being beyond the boundaries of the map as this is
8193 * how some of the mappings like commpage mappings
8194 * can be destroyed (they're outside of those bounds).
8195 *
8196 * end < start is still something we can't cope with,
8197 * so just bail.
8198 */
8199 if (end < start) {
8200 goto out;
8201 }
8202 }
8203
8204
8205 /*
8206 * Find the start of the region.
8207 *
8208 * If in a superpage, extend the range
8209 * to include the start of the mapping.
8210 */
8211 while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8212 if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8213 start = SUPERPAGE_ROUND_DOWN(start);
8214 } else {
8215 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8216 break;
8217 }
8218 }
8219
8220 if (entry->superpage_size) {
8221 end = SUPERPAGE_ROUND_UP(end);
8222 }
8223
8224 /*
8225 * Step through all entries in this region
8226 */
8227 for (vm_map_offset_t s = start; s < end;) {
8228 /*
8229 * At this point, we have deleted all the memory entries
8230 * in [start, s) and are proceeding with the [s, end) range.
8231 *
8232 * This loop might drop the map lock, and it is possible that
8233 * some memory was already reallocated within [start, s)
8234 * and we don't want to mess with those entries.
8235 *
8236 * Some of those entries could even have been re-assembled
8237 * with an entry after "s" (in vm_map_simplify_entry()), so
8238 * we may have to vm_map_clip_start() again.
8239 *
8240 * When clear_in_transition_end is set, it means we had marked
8241 * [start, clear_in_transition_end) as "in_transition"
8242 * during a previous iteration and we need to clear it.
8243 */
8244
8245 /*
8246 * Step 1: If needed (because we dropped locks),
8247 * lookup the entry again.
8248 *
8249 * If we're coming back from unwiring (Step 5),
8250 * we also need to mark the entries as no longer
8251 * in transition after that.
8252 */
8253
8254 if (state & VMDS_NEEDS_LOOKUP) {
8255 state &= ~VMDS_NEEDS_LOOKUP;
8256
8257 if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8258 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8259 }
8260
8261 if (state & VMDS_KERNEL_KMEMPTR) {
8262 kmem_validate_slot(s, meta, size_idx, slot_idx);
8263 }
8264 }
8265
8266 if (clear_in_transition_end) {
8267 for (vm_map_entry_t it = entry;
8268 it != vm_map_to_entry(map) &&
8269 it->vme_start < clear_in_transition_end;
8270 it = it->vme_next) {
8271 assert(it->in_transition);
8272 it->in_transition = FALSE;
8273 if (it->needs_wakeup) {
8274 it->needs_wakeup = FALSE;
8275 state |= VMDS_NEEDS_WAKEUP;
8276 }
8277 }
8278
8279 clear_in_transition_end = 0;
8280 }
8281
8282
8283 /*
8284 * Step 2: Perform various policy checks
8285 * before we do _anything_ to this entry.
8286 */
8287
8288 if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8289 if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8290 /*
8291 * Either we found a gap already,
8292 * or we are tearing down a map,
8293 * keep going.
8294 */
8295 } else if (state & VMDS_KERNEL_PMAP) {
8296 __vm_map_delete_gap_panic(map, s, start, end);
8297 } else if (s < end) {
8298 state |= VMDS_FOUND_GAP;
8299 gap_start = s;
8300 }
8301
8302 if (entry == vm_map_to_entry(map) ||
8303 end <= entry->vme_start) {
8304 break;
8305 }
8306
8307 s = entry->vme_start;
8308 }
8309
8310 if (state & VMDS_KERNEL_PMAP) {
8311 /*
8312 * In the kernel map and its submaps,
8313 * permanent entries never die, even
8314 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8315 */
8316 if (entry->vme_permanent) {
8317 __vm_map_delete_permanent_panic(map, start, end, entry);
8318 }
8319
8320 if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8321 end = entry->vme_end;
8322 flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8323 }
8324
8325 /*
8326 * In the kernel map and its submaps,
8327 * the removal of an atomic/guarded entry is strict.
8328 *
8329 * An atomic entry is processed only if it was
8330 * specifically targeted.
8331 *
8332 * We might have deleted non-atomic entries before
8333 * we reach this point, however...
8334 */
8335 kmem_entry_validate_guard(map, entry,
8336 start, end - start, guard);
8337 }
8338
8339 /*
8340 * Step 2.1: handle "permanent" and "submap" entries
8341 * *before* clipping to avoid triggering some unnecessary
8342 * un-nesting of the shared region.
8343 */
8344 if (entry->vme_permanent && entry->is_sub_map) {
8345 // printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8346 /*
8347 * Un-mapping a "permanent" mapping of a user-space
8348 * submap is not allowed unless...
8349 */
8350 if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8351 /*
8352 * a. explicitly requested by the kernel caller.
8353 */
8354 // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8355 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8356 developer_mode_state()) {
8357 /*
8358 * b. we're in "developer" mode (for
8359 * breakpoints, dtrace probes, ...).
8360 */
8361 // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8362 } else if (map->terminated) {
8363 /*
8364 * c. this is the final address space cleanup.
8365 */
8366 // printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8367 } else {
8368 vm_map_offset_t submap_start, submap_end;
8369 kern_return_t submap_kr;
8370
8371 /*
8372 * Check if there are any "permanent" mappings
8373 * in this range in the submap.
8374 */
8375 if (entry->in_transition) {
8376 /* can that even happen ? */
8377 goto in_transition;
8378 }
8379 /* compute the clipped range in the submap */
8380 submap_start = s - entry->vme_start;
8381 submap_start += VME_OFFSET(entry);
8382 submap_end = end - entry->vme_start;
8383 submap_end += VME_OFFSET(entry);
8384 submap_kr = vm_map_delete_submap_recurse(
8385 VME_SUBMAP(entry),
8386 submap_start,
8387 submap_end);
8388 if (submap_kr != KERN_SUCCESS) {
8389 /*
8390 * There are some "permanent" mappings
8391 * in the submap: we are not allowed
8392 * to remove this range.
8393 */
8394 printf("%d[%s] removing permanent submap entry "
8395 "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8396 proc_selfpid(),
8397 (get_bsdtask_info(current_task())
8398 ? proc_name_address(get_bsdtask_info(current_task()))
8399 : "?"), entry,
8400 (uint64_t)entry->vme_start,
8401 (uint64_t)entry->vme_end,
8402 entry->protection,
8403 entry->max_protection);
8404 DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8405 vm_map_entry_t, entry,
8406 vm_map_offset_t, entry->vme_start,
8407 vm_map_offset_t, entry->vme_end,
8408 vm_prot_t, entry->protection,
8409 vm_prot_t, entry->max_protection,
8410 int, VME_ALIAS(entry));
8411 ret.kmr_return = KERN_PROTECTION_FAILURE;
8412 goto out;
8413 }
8414 /* no permanent mappings: proceed */
8415 }
8416 }
8417
8418 /*
8419 * Step 3: Perform any clipping needed.
8420 *
8421 * After this, "entry" starts at "s", ends before "end"
8422 */
8423
8424 if (entry->vme_start < s) {
8425 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8426 entry->map_aligned &&
8427 !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8428 /*
8429 * The entry will no longer be map-aligned
8430 * after clipping and the caller said it's OK.
8431 */
8432 entry->map_aligned = FALSE;
8433 }
8434 vm_map_clip_start(map, entry, s);
8435 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8436 }
8437
8438 if (end < entry->vme_end) {
8439 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8440 entry->map_aligned &&
8441 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8442 /*
8443 * The entry will no longer be map-aligned
8444 * after clipping and the caller said it's OK.
8445 */
8446 entry->map_aligned = FALSE;
8447 }
8448 vm_map_clip_end(map, entry, end);
8449 }
8450
8451 if (entry->vme_permanent && entry->is_sub_map) {
8452 /*
8453 * We already went through step 2.1 which did not deny
8454 * the removal of this "permanent" and "is_sub_map"
8455 * entry.
8456 * Now that we've clipped what we actually want to
8457 * delete, undo the "permanent" part to allow the
8458 * removal to proceed.
8459 */
8460 DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8461 vm_map_entry_t, entry,
8462 vm_map_offset_t, entry->vme_start,
8463 vm_map_offset_t, entry->vme_end,
8464 vm_prot_t, entry->protection,
8465 vm_prot_t, entry->max_protection,
8466 int, VME_ALIAS(entry));
8467 entry->vme_permanent = false;
8468 }
8469
8470 assert(s == entry->vme_start);
8471 assert(entry->vme_end <= end);
8472
8473
8474 /*
8475 * Step 4: If the entry is in flux, wait for this to resolve.
8476 */
8477
8478 if (entry->in_transition) {
8479 wait_result_t wait_result;
8480
8481 in_transition:
8482 /*
8483 * Another thread is wiring/unwiring this entry.
8484 * Let the other thread know we are waiting.
8485 */
8486
8487 entry->needs_wakeup = TRUE;
8488
8489 /*
8490 * wake up anybody waiting on entries that we have
8491 * already unwired/deleted.
8492 */
8493 if (state & VMDS_NEEDS_WAKEUP) {
8494 vm_map_entry_wakeup(map);
8495 state &= ~VMDS_NEEDS_WAKEUP;
8496 }
8497
8498 wait_result = vm_map_entry_wait(map, interruptible);
8499
8500 if (interruptible &&
8501 wait_result == THREAD_INTERRUPTED) {
8502 /*
8503 * We do not clear the needs_wakeup flag,
8504 * since we cannot tell if we were the only one.
8505 */
8506 ret.kmr_return = KERN_ABORTED;
8507 return ret;
8508 }
8509
8510 /*
8511 * The entry could have been clipped or it
8512 * may not exist anymore. Look it up again.
8513 */
8514 state |= VMDS_NEEDS_LOOKUP;
8515 continue;
8516 }
8517
8518
8519 /*
8520 * Step 5: Handle wiring
8521 */
8522
8523 if (entry->wired_count) {
8524 struct vm_map_entry tmp_entry;
8525 boolean_t user_wire;
8526 unsigned int last_timestamp;
8527
8528 user_wire = entry->user_wired_count > 0;
8529
8530 /*
8531 * Remove a kernel wiring if requested
8532 */
8533 if (flags & VM_MAP_REMOVE_KUNWIRE) {
8534 entry->wired_count--;
8535 vme_btref_consider_and_put(entry);
8536 }
8537
8538 /*
8539 * Remove all user wirings for proper accounting
8540 */
8541 while (entry->user_wired_count) {
8542 subtract_wire_counts(map, entry, user_wire);
8543 }
8544
8545 /*
8546 * All our DMA I/O operations in IOKit are currently
8547 * done by wiring through the map entries of the task
8548 * requesting the I/O.
8549 *
8550 * Because of this, we must always wait for kernel wirings
8551 * to go away on the entries before deleting them.
8552 *
8553 * Any caller who wants to actually remove a kernel wiring
8554 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8555 * properly remove one wiring instead of blasting through
8556 * them all.
8557 */
8558 if (entry->wired_count != 0) {
8559 assert(map != kernel_map);
8560 /*
8561 * Cannot continue. Typical case is when
8562 * a user thread has physical io pending on
8563 * this page. Either wait for the
8564 * kernel wiring to go away or return an
8565 * error.
8566 */
8567 wait_result_t wait_result;
8568
8569 entry->needs_wakeup = TRUE;
8570 wait_result = vm_map_entry_wait(map,
8571 interruptible);
8572
8573 if (interruptible &&
8574 wait_result == THREAD_INTERRUPTED) {
8575 /*
8576 * We do not clear the
8577 * needs_wakeup flag, since we
8578 * cannot tell if we were the
8579 * only one.
8580 */
8581 ret.kmr_return = KERN_ABORTED;
8582 return ret;
8583 }
8584
8585
8586 /*
8587 * The entry could have been clipped or
8588 * it may not exist anymore. Look it
8589 * up again.
8590 */
8591 state |= VMDS_NEEDS_LOOKUP;
8592 continue;
8593 }
8594
8595 /*
8596 * We can unlock the map now.
8597 *
8598 * The entry might be split once we unlock the map,
8599 * but we need the range as defined by this entry
8600 * to be stable. So we must make a local copy.
8601 *
8602 * The underlying objects do not change during clips,
8603 * and the in_transition state guarantees existence
8604 * of the entry.
8605 */
8606 last_timestamp = map->timestamp;
8607 entry->in_transition = TRUE;
8608 tmp_entry = *entry;
8609 vm_map_unlock(map);
8610
8611 if (tmp_entry.is_sub_map) {
8612 vm_map_t sub_map;
8613 vm_map_offset_t sub_start, sub_end;
8614 pmap_t pmap;
8615 vm_map_offset_t pmap_addr;
8616
8617
8618 sub_map = VME_SUBMAP(&tmp_entry);
8619 sub_start = VME_OFFSET(&tmp_entry);
8620 sub_end = sub_start + (tmp_entry.vme_end -
8621 tmp_entry.vme_start);
8622 if (tmp_entry.use_pmap) {
8623 pmap = sub_map->pmap;
8624 pmap_addr = tmp_entry.vme_start;
8625 } else {
8626 pmap = map->pmap;
8627 pmap_addr = tmp_entry.vme_start;
8628 }
8629 (void) vm_map_unwire_nested(sub_map,
8630 sub_start, sub_end,
8631 user_wire,
8632 pmap, pmap_addr);
8633 } else {
8634 vm_map_offset_t entry_end = tmp_entry.vme_end;
8635 vm_map_offset_t max_end;
8636
8637 if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8638 max_end = end - VM_MAP_PAGE_SIZE(map);
8639 if (entry_end > max_end) {
8640 entry_end = max_end;
8641 }
8642 }
8643
8644 if (tmp_entry.vme_kernel_object) {
8645 pmap_protect_options(
8646 map->pmap,
8647 tmp_entry.vme_start,
8648 entry_end,
8649 VM_PROT_NONE,
8650 PMAP_OPTIONS_REMOVE,
8651 NULL);
8652 }
8653 vm_fault_unwire(map, &tmp_entry,
8654 tmp_entry.vme_kernel_object, map->pmap,
8655 tmp_entry.vme_start, entry_end);
8656 }
8657
8658 vm_map_lock(map);
8659
8660 /*
8661 * Unwiring happened, we can now go back to deleting
8662 * them (after we clear the in_transition bit for the range).
8663 */
8664 if (last_timestamp + 1 != map->timestamp) {
8665 state |= VMDS_NEEDS_LOOKUP;
8666 }
8667 clear_in_transition_end = tmp_entry.vme_end;
8668 continue;
8669 }
8670
8671 assert(entry->wired_count == 0);
8672 assert(entry->user_wired_count == 0);
8673
8674
8675 /*
8676 * Step 6: Entry is unwired and ready for us to delete !
8677 */
8678
8679 if (!entry->vme_permanent) {
8680 /*
8681 * Typical case: the entry really shouldn't be permanent
8682 */
8683 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8684 (entry->protection & VM_PROT_EXECUTE) &&
8685 developer_mode_state()) {
8686 /*
8687 * Allow debuggers to undo executable mappings
8688 * when developer mode is on.
8689 */
8690 #if 0
8691 printf("FBDP %d[%s] removing permanent executable entry "
8692 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8693 proc_selfpid(),
8694 (current_task()->bsd_info
8695 ? proc_name_address(current_task()->bsd_info)
8696 : "?"), entry,
8697 (uint64_t)entry->vme_start,
8698 (uint64_t)entry->vme_end,
8699 entry->protection,
8700 entry->max_protection);
8701 #endif
8702 entry->vme_permanent = FALSE;
8703 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8704 #if 0
8705 printf("FBDP %d[%s] removing permanent entry "
8706 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8707 proc_selfpid(),
8708 (current_task()->bsd_info
8709 ? proc_name_address(current_task()->bsd_info)
8710 : "?"), entry,
8711 (uint64_t)entry->vme_start,
8712 (uint64_t)entry->vme_end,
8713 entry->protection,
8714 entry->max_protection);
8715 #endif
8716 entry->vme_permanent = FALSE;
8717 #if CODE_SIGNING_MONITOR
8718 } else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8719 entry->vme_permanent = FALSE;
8720
8721 printf("%d[%s] %s(0x%llx,0x%llx): "
8722 "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8723 "prot 0x%x/0x%x\n",
8724 proc_selfpid(),
8725 (get_bsdtask_info(current_task())
8726 ? proc_name_address(get_bsdtask_info(current_task()))
8727 : "?"),
8728 __FUNCTION__,
8729 (uint64_t)start,
8730 (uint64_t)end,
8731 (uint64_t)entry->vme_start,
8732 (uint64_t)entry->vme_end,
8733 entry->protection,
8734 entry->max_protection);
8735 #endif
8736 } else {
8737 DTRACE_VM6(vm_map_delete_permanent,
8738 vm_map_entry_t, entry,
8739 vm_map_offset_t, entry->vme_start,
8740 vm_map_offset_t, entry->vme_end,
8741 vm_prot_t, entry->protection,
8742 vm_prot_t, entry->max_protection,
8743 int, VME_ALIAS(entry));
8744 }
8745
8746 if (entry->is_sub_map) {
8747 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8748 "map %p (%d) entry %p submap %p (%d)\n",
8749 map, VM_MAP_PAGE_SHIFT(map), entry,
8750 VME_SUBMAP(entry),
8751 VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8752 if (entry->use_pmap) {
8753 #ifndef NO_NESTED_PMAP
8754 int pmap_flags;
8755
8756 if (map->terminated) {
8757 /*
8758 * This is the final cleanup of the
8759 * address space being terminated.
8760 * No new mappings are expected and
8761 * we don't really need to unnest the
8762 * shared region (and lose the "global"
8763 * pmap mappings, if applicable).
8764 *
8765 * Tell the pmap layer that we're
8766 * "clean" wrt nesting.
8767 */
8768 pmap_flags = PMAP_UNNEST_CLEAN;
8769 } else {
8770 /*
8771 * We're unmapping part of the nested
8772 * shared region, so we can't keep the
8773 * nested pmap.
8774 */
8775 pmap_flags = 0;
8776 }
8777 pmap_unnest_options(
8778 map->pmap,
8779 (addr64_t)entry->vme_start,
8780 entry->vme_end - entry->vme_start,
8781 pmap_flags);
8782 #endif /* NO_NESTED_PMAP */
8783 if (map->mapped_in_other_pmaps &&
8784 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8785 /* clean up parent map/maps */
8786 vm_map_submap_pmap_clean(
8787 map, entry->vme_start,
8788 entry->vme_end,
8789 VME_SUBMAP(entry),
8790 VME_OFFSET(entry));
8791 }
8792 } else {
8793 vm_map_submap_pmap_clean(
8794 map, entry->vme_start, entry->vme_end,
8795 VME_SUBMAP(entry),
8796 VME_OFFSET(entry));
8797 }
8798 } else if (entry->vme_kernel_object ||
8799 VME_OBJECT(entry) == compressor_object) {
8800 /*
8801 * nothing to do
8802 */
8803 } else if (map->mapped_in_other_pmaps &&
8804 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8805 vm_object_pmap_protect_options(
8806 VME_OBJECT(entry), VME_OFFSET(entry),
8807 entry->vme_end - entry->vme_start,
8808 PMAP_NULL,
8809 PAGE_SIZE,
8810 entry->vme_start,
8811 VM_PROT_NONE,
8812 PMAP_OPTIONS_REMOVE);
8813 } else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8814 (state & VMDS_KERNEL_PMAP)) {
8815 /* Remove translations associated
8816 * with this range unless the entry
8817 * does not have an object, or
8818 * it's the kernel map or a descendant
8819 * since the platform could potentially
8820 * create "backdoor" mappings invisible
8821 * to the VM. It is expected that
8822 * objectless, non-kernel ranges
8823 * do not have such VM invisible
8824 * translations.
8825 */
8826 vm_map_address_t remove_start = entry->vme_start;
8827 vm_map_address_t remove_end = entry->vme_end;
8828 #if MACH_ASSERT
8829 /*
8830 * Prevent panics in pmap_remove() from some vm test code
8831 * which uses virtual address ranges that pmap disallows.
8832 */
8833 if (thread_get_test_option(test_option_vm_map_clamp_pmap_remove)) {
8834 vm_map_clamp_to_pmap(map, &remove_start, &remove_end);
8835 }
8836 #endif /* MACH_ASSERT */
8837 pmap_remove(map->pmap, remove_start, remove_end);
8838 }
8839
8840 #if DEBUG
8841 /*
8842 * All pmap mappings for this map entry must have been
8843 * cleared by now.
8844 */
8845 assert(pmap_is_empty(map->pmap,
8846 entry->vme_start,
8847 entry->vme_end));
8848 #endif /* DEBUG */
8849
8850 if (entry->iokit_acct) {
8851 /* alternate accounting */
8852 DTRACE_VM4(vm_map_iokit_unmapped_region,
8853 vm_map_t, map,
8854 vm_map_offset_t, entry->vme_start,
8855 vm_map_offset_t, entry->vme_end,
8856 int, VME_ALIAS(entry));
8857 vm_map_iokit_unmapped_region(map,
8858 (entry->vme_end -
8859 entry->vme_start));
8860 entry->iokit_acct = FALSE;
8861 entry->use_pmap = FALSE;
8862 }
8863
8864 /* move "s" forward */
8865 s = entry->vme_end;
8866 next = entry->vme_next;
8867 if (!entry->map_aligned) {
8868 vm_map_offset_t rounded_s;
8869
8870 /*
8871 * Skip artificial gap due to mis-aligned entry
8872 * on devices with a page size smaller than the
8873 * map's page size (i.e. 16k task on a 4k device).
8874 */
8875 rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8876 if (next == vm_map_to_entry(map)) {
8877 s = rounded_s;
8878 } else if (s < rounded_s) {
8879 s = MIN(rounded_s, next->vme_start);
8880 }
8881 }
8882 ret.kmr_size += s - entry->vme_start;
8883
8884 if (entry->vme_permanent) {
8885 /*
8886 * A permanent entry can not be removed, so leave it
8887 * in place but remove all access permissions.
8888 */
8889 if (__improbable(vm_log_map_delete_permanent_prot_none)) {
8890 printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8891 __FUNCTION__, __LINE__,
8892 proc_selfpid(),
8893 (get_bsdtask_info(current_task())
8894 ? proc_name_address(get_bsdtask_info(current_task()))
8895 : "?"),
8896 map,
8897 entry,
8898 (uint64_t)entry->vme_start,
8899 (uint64_t)entry->vme_end,
8900 entry->is_sub_map,
8901 entry->protection,
8902 entry->max_protection);
8903 }
8904 DTRACE_VM6(vm_map_delete_permanent_prot_none,
8905 vm_map_entry_t, entry,
8906 vm_map_offset_t, entry->vme_start,
8907 vm_map_offset_t, entry->vme_end,
8908 vm_prot_t, entry->protection,
8909 vm_prot_t, entry->max_protection,
8910 int, VME_ALIAS(entry));
8911 entry->protection = VM_PROT_NONE;
8912 entry->max_protection = VM_PROT_NONE;
8913 #ifdef __arm64e__
8914 entry->used_for_tpro = FALSE;
8915 #endif
8916 } else {
8917 vm_map_entry_zap(map, entry, zap_list);
8918 }
8919
8920 entry = next;
8921 next = VM_MAP_ENTRY_NULL;
8922
8923 if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8924 unsigned int last_timestamp = map->timestamp++;
8925
8926 if (lck_rw_lock_yield_exclusive(&map->lock,
8927 LCK_RW_YIELD_ANY_WAITER)) {
8928 if (last_timestamp != map->timestamp + 1) {
8929 state |= VMDS_NEEDS_LOOKUP;
8930 }
8931 } else {
8932 /* we didn't yield, undo our change */
8933 map->timestamp--;
8934 }
8935 }
8936 }
8937
8938 if (map->wait_for_space) {
8939 thread_wakeup((event_t) map);
8940 }
8941
8942 if (state & VMDS_NEEDS_WAKEUP) {
8943 vm_map_entry_wakeup(map);
8944 }
8945
8946 out:
8947 if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
8948 __vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8949 }
8950
8951 if (state & VMDS_KERNEL_KMEMPTR) {
8952 kmem_free_space(start, end, range_id, &slot);
8953 }
8954
8955 if (state & VMDS_FOUND_GAP) {
8956 DTRACE_VM3(kern_vm_deallocate_gap,
8957 vm_map_offset_t, gap_start,
8958 vm_map_offset_t, save_start,
8959 vm_map_offset_t, save_end);
8960 if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
8961 ret.kmr_return = KERN_INVALID_VALUE;
8962 } else {
8963 vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
8964 }
8965 }
8966
8967 return ret;
8968 }
8969
8970 kmem_return_t
8971 vm_map_remove_and_unlock(
8972 vm_map_t map,
8973 vm_map_offset_t start,
8974 vm_map_offset_t end,
8975 vmr_flags_t flags,
8976 kmem_guard_t guard)
8977 {
8978 kmem_return_t ret;
8979 VM_MAP_ZAP_DECLARE(zap);
8980
8981 ret = vm_map_delete(map, start, end, flags, guard, &zap);
8982 vm_map_unlock(map);
8983
8984 vm_map_zap_dispose(&zap);
8985
8986 return ret;
8987 }
8988
8989 /*
8990 * vm_map_remove_guard:
8991 *
8992 * Remove the given address range from the target map.
8993 * This is the exported form of vm_map_delete.
8994 */
8995 kmem_return_t
8996 vm_map_remove_guard(
8997 vm_map_t map,
8998 vm_map_offset_t start,
8999 vm_map_offset_t end,
9000 vmr_flags_t flags,
9001 kmem_guard_t guard)
9002 {
9003 vm_map_lock(map);
9004 return vm_map_remove_and_unlock(map, start, end, flags, guard);
9005 }
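
/*
 * A minimal sketch (not compiled into the build) of the common,
 * unguarded call: remove a range with no special flags and no kmem
 * guard, letting vm_map_remove_and_unlock() drive vm_map_delete() and
 * dispose of the zap list after the map lock is dropped.
 */
#if 0
static kmem_return_t
example_remove_plain_range(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	return vm_map_remove_guard(map, start, end,
	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
}
#endif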
9006
9007
9008 /*
9009 * vm_map_setup:
9010 *
9011 * Perform any required setup on a new task's map. Must be called before the task
9012 * is enabled for IPC access, since after this point other threads may be able
9013 * to look up the task port and make VM API calls.
9014 */
9015 void
9016 vm_map_setup(vm_map_t map, task_t task)
9017 {
9018 /*
9019 * map does NOT take a reference on owning_task. If the map has terminated,
9020 * it is possible that the pointer is NULL, so reads of owning_task must
9021 * happen under the map lock and explicitly check for NULL.
9022 */
9023 vm_map_lock(map);
9024 assert(!map->owning_task);
9025 map->owning_task = task;
9026 vm_map_unlock(map);
9027 #if CONFIG_DEFERRED_RECLAIM
9028 vm_deferred_reclamation_metadata_t vdrm = task->deferred_reclamation_metadata;
9029 if (vdrm) {
9030 vm_deferred_reclamation_task_fork_register(vdrm);
9031 }
9032 #endif /* CONFIG_DEFERRED_RECLAIM */
9033 }
9034
9035 /*
9036 * vm_map_terminate:
9037 *
9038 * Clean out a task's map.
9039 */
9040 kern_return_t
9041 vm_map_terminate(
9042 vm_map_t map)
9043 {
9044 vm_map_lock(map);
9045 map->terminated = TRUE;
9046 map->owning_task = NULL;
9047 vm_map_disable_hole_optimization(map);
9048 (void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
9049 VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
9050 return KERN_SUCCESS;
9051 }
9052
9053 /*
9054 * Routine: vm_map_copy_allocate
9055 *
9056 * Description:
9057 * Allocates and initializes a map copy object.
9058 */
9059 static vm_map_copy_t
9060 vm_map_copy_allocate(uint16_t type)
9061 {
9062 vm_map_copy_t new_copy;
9063
9064 new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
9065 new_copy->type = type;
9066 if (type == VM_MAP_COPY_ENTRY_LIST) {
9067 new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
9068 vm_map_store_init(&new_copy->cpy_hdr);
9069 }
9070 return new_copy;
9071 }
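
/*
 * A sketch (not compiled into the build) of the copy-object lifetime
 * implied here: an entry-list copy starts out empty, is populated by
 * the copyin path, and is either consumed (e.g. by vm_map_copyout) or
 * released whole with vm_map_copy_discard() below.
 */
#if 0
static void
example_copy_object_lifetime(void)
{
	vm_map_copy_t copy;

	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);

	/* ... link copied entries into copy ... */

	/* on an error path, drop the entries and the copy object itself */
	vm_map_copy_discard(copy);
}
#endif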
9072
9073 /*
9074 * Routine: vm_map_copy_discard
9075 *
9076 * Description:
9077 * Dispose of a map copy object (returned by
9078 * vm_map_copyin).
9079 */
9080 void
9081 vm_map_copy_discard(
9082 vm_map_copy_t copy)
9083 {
9084 if (copy == VM_MAP_COPY_NULL) {
9085 return;
9086 }
9087
9088 /*
9089 * Assert that the vm_map_copy is coming from the right
9090 * zone and hasn't been forged
9091 */
9092 vm_map_copy_require(copy);
9093
9094 switch (copy->type) {
9095 case VM_MAP_COPY_ENTRY_LIST:
9096 while (vm_map_copy_first_entry(copy) !=
9097 vm_map_copy_to_entry(copy)) {
9098 vm_map_entry_t entry = vm_map_copy_first_entry(copy);
9099
9100 vm_map_copy_entry_unlink(copy, entry);
9101 if (entry->is_sub_map) {
9102 vm_map_deallocate(VME_SUBMAP(entry));
9103 } else {
9104 vm_object_deallocate(VME_OBJECT(entry));
9105 }
9106 vm_map_copy_entry_dispose(entry);
9107 }
9108 break;
9109 case VM_MAP_COPY_KERNEL_BUFFER:
9110
9111 /*
9112 * The vm_map_copy_t and possibly the data buffer were
9113 * allocated by a single call to kalloc_data(), i.e. the
9114 * vm_map_copy_t was not allocated out of the zone.
9115 */
9116 if (copy->size > msg_ool_size_small || copy->offset) {
9117 panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
9118 (long long)copy->size, (long long)copy->offset);
9119 }
9120 kfree_data(copy->cpy_kdata, copy->size);
9121 }
9122 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
9123 }
9124
9125 #if XNU_PLATFORM_MacOSX
9126
9127 __exported
9128 extern vm_map_copy_t vm_map_copy_copy(vm_map_copy_t copy);
9129
9130 /*
9131 * Routine: vm_map_copy_copy
9132 *
9133 * Description:
9134 * Move the information in a map copy object to
9135 * a new map copy object, leaving the old one
9136 * empty.
9137 *
9138 * This is used by kernel routines that need
9139 * to look at out-of-line data (in copyin form)
9140 * before deciding whether to return SUCCESS.
9141 * If the routine returns FAILURE, the original
9142 * copy object will be deallocated; therefore,
9143 * these routines must make a copy of the copy
9144 * object and leave the original empty so that
9145 * deallocation will not fail.
9146 */
9147 vm_map_copy_t
9148 vm_map_copy_copy(
9149 vm_map_copy_t copy)
9150 {
9151 vm_map_copy_t new_copy;
9152
9153 if (copy == VM_MAP_COPY_NULL) {
9154 return VM_MAP_COPY_NULL;
9155 }
9156
9157 /*
9158 * Assert that the vm_map_copy is coming from the right
9159 * zone and hasn't been forged
9160 */
9161 vm_map_copy_require(copy);
9162
9163 /*
9164 * Allocate a new copy object, and copy the information
9165 * from the old one into it.
9166 */
9167
9168 new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9169 memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
9170 #if __has_feature(ptrauth_calls)
9171 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9172 new_copy->cpy_kdata = copy->cpy_kdata;
9173 }
9174 #endif
9175
9176 if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
9177 /*
9178 * The links in the entry chain must be
9179 * changed to point to the new copy object.
9180 */
9181 vm_map_copy_first_entry(copy)->vme_prev
9182 = vm_map_copy_to_entry(new_copy);
9183 vm_map_copy_last_entry(copy)->vme_next
9184 = vm_map_copy_to_entry(new_copy);
9185 }
9186
9187 /*
9188 * Change the old copy object into one that contains
9189 * nothing to be deallocated.
9190 */
9191 bzero(copy, sizeof(struct vm_map_copy));
9192 copy->type = VM_MAP_COPY_KERNEL_BUFFER;
9193
9194 /*
9195 * Return the new object.
9196 */
9197 return new_copy;
9198 }
9199
9200 #endif /* XNU_PLATFORM_MacOSX */
9201
9202 static boolean_t
9203 vm_map_entry_is_overwritable(
9204 vm_map_t dst_map __unused,
9205 vm_map_entry_t entry)
9206 {
9207 if (!(entry->protection & VM_PROT_WRITE)) {
9208 /* can't overwrite if not writable */
9209 return FALSE;
9210 }
9211 #if !__x86_64__
9212 if (entry->used_for_jit &&
9213 vm_map_cs_enforcement(dst_map) &&
9214 !dst_map->cs_debugged) {
9215 /*
9216 * Can't overwrite a JIT region while cs_enforced
9217 * and not cs_debugged.
9218 */
9219 return FALSE;
9220 }
9221
9222 #if __arm64e__
9223 /* Do not allow overwrite HW assisted TPRO entries */
9224 if (entry->used_for_tpro) {
9225 return FALSE;
9226 }
9227 #endif /* __arm64e__ */
9228
9229 if (entry->vme_permanent) {
9230 if (entry->is_sub_map) {
9231 /*
9232 * We can't tell if the submap contains "permanent"
9233 * entries within the range targeted by the caller.
9234 * The caller will have to check for that with
9235 * vm_map_overwrite_submap_recurse() for example.
9236 */
9237 } else {
9238 /*
9239 * Do not allow overwriting of a "permanent"
9240 * entry.
9241 */
9242 DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
9243 vm_map_entry_t, entry,
9244 vm_map_offset_t, entry->vme_start,
9245 vm_map_offset_t, entry->vme_end,
9246 vm_prot_t, entry->protection,
9247 vm_prot_t, entry->max_protection,
9248 int, VME_ALIAS(entry));
9249 return FALSE;
9250 }
9251 }
9252 #endif /* !__x86_64__ */
9253
9254 if (entry->is_sub_map) {
9255 /* remember not to assume every entry has a VM object... */
9256 }
9257
9258
9259 return TRUE;
9260 }
9261
9262 static kern_return_t
9263 vm_map_overwrite_submap_recurse(
9264 vm_map_t dst_map,
9265 vm_map_offset_t dst_addr,
9266 vm_map_size_t dst_size)
9267 {
9268 vm_map_offset_t dst_end;
9269 vm_map_entry_t tmp_entry;
9270 vm_map_entry_t entry;
9271 kern_return_t result;
9272 boolean_t encountered_sub_map = FALSE;
9273
9274
9275
9276 /*
9277 * Verify that the destination is all writeable
9278 * initially. We have to trunc the destination
9279 * address and round the copy size or we'll end up
9280 * splitting entries in strange ways.
9281 */
9282
9283 dst_end = vm_map_round_page(dst_addr + dst_size,
9284 VM_MAP_PAGE_MASK(dst_map));
9285 vm_map_lock(dst_map);
9286
9287 start_pass_1:
9288 if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9289 vm_map_unlock(dst_map);
9290 return KERN_INVALID_ADDRESS;
9291 }
9292
9293 vm_map_clip_start(dst_map,
9294 tmp_entry,
9295 vm_map_trunc_page(dst_addr,
9296 VM_MAP_PAGE_MASK(dst_map)));
9297 if (tmp_entry->is_sub_map) {
9298 /* clipping did unnest if needed */
9299 assert(!tmp_entry->use_pmap);
9300 }
9301
9302 for (entry = tmp_entry;;) {
9303 vm_map_entry_t next;
9304
9305 next = entry->vme_next;
9306 while (entry->is_sub_map) {
9307 vm_map_offset_t sub_start;
9308 vm_map_offset_t sub_end;
9309 vm_map_offset_t local_end;
9310 vm_map_t sub_map;
9311
9312 if (entry->in_transition) {
9313 /*
9314 * Say that we are waiting, and wait for entry.
9315 */
9316 entry->needs_wakeup = TRUE;
9317 vm_map_entry_wait(dst_map, THREAD_UNINT);
9318
9319 goto start_pass_1;
9320 }
9321
9322 encountered_sub_map = TRUE;
9323 sub_start = VME_OFFSET(entry);
9324
9325 if (entry->vme_end < dst_end) {
9326 sub_end = entry->vme_end;
9327 } else {
9328 sub_end = dst_end;
9329 }
9330 sub_end -= entry->vme_start;
9331 sub_end += VME_OFFSET(entry);
9332 local_end = entry->vme_end;
9333 sub_map = VME_SUBMAP(entry);
9334 vm_map_reference(sub_map);
9335 vm_map_unlock(dst_map);
9336
9337 result = vm_map_overwrite_submap_recurse(
9338 sub_map,
9339 sub_start,
9340 sub_end - sub_start);
9341
9342 vm_map_deallocate(sub_map);
9343 sub_map = VM_MAP_NULL;
9344
9345 if (result != KERN_SUCCESS) {
9346 return result;
9347 }
9348 if (dst_end <= entry->vme_end) {
9349 return KERN_SUCCESS;
9350 }
9351 vm_map_lock(dst_map);
9352 if (!vm_map_lookup_entry(dst_map, local_end,
9353 &tmp_entry)) {
9354 vm_map_unlock(dst_map);
9355 return KERN_INVALID_ADDRESS;
9356 }
9357 entry = tmp_entry;
9358 next = entry->vme_next;
9359 }
9360 assert(!entry->is_sub_map);
9361
9362 if (!(entry->protection & VM_PROT_WRITE)) {
9363 vm_map_unlock(dst_map);
9364 return KERN_PROTECTION_FAILURE;
9365 }
9366
9367 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9368 vm_map_unlock(dst_map);
9369 return KERN_PROTECTION_FAILURE;
9370 }
9371
9372 /*
9373 * If the entry is in transition, we must wait
9374 * for it to exit that state. Anything could happen
9375 * when we unlock the map, so start over.
9376 */
9377 if (entry->in_transition) {
9378 /*
9379 * Say that we are waiting, and wait for entry.
9380 */
9381 entry->needs_wakeup = TRUE;
9382 vm_map_entry_wait(dst_map, THREAD_UNINT);
9383
9384 goto start_pass_1;
9385 }
9386
9387 /*
9388 * our range is contained completely within this map entry
9389 */
9390 if (dst_end <= entry->vme_end) {
9391 vm_map_unlock(dst_map);
9392 return KERN_SUCCESS;
9393 }
9394 /*
9395 * check that range specified is contiguous region
9396 */
9397 if ((next == vm_map_to_entry(dst_map)) ||
9398 (next->vme_start != entry->vme_end)) {
9399 vm_map_unlock(dst_map);
9400 return KERN_INVALID_ADDRESS;
9401 }
9402
9403 /*
9404 * Check for permanent objects in the destination.
9405 */
9406 assert(!entry->is_sub_map);
9407 if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9408 ((!VME_OBJECT(entry)->internal) ||
9409 (VME_OBJECT(entry)->true_share))) {
9410 if (encountered_sub_map) {
9411 vm_map_unlock(dst_map);
9412 return KERN_FAILURE;
9413 }
9414 }
9415
9416
9417 entry = next;
9418 }/* for */
9419 vm_map_unlock(dst_map);
9420 return KERN_SUCCESS;
9421 }
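
/*
 * Illustrative sketch (not compiled): the range translation used above when
 * recursing into a submap.  The destination range is clamped to the current
 * entry and rebased into submap coordinates using the entry's offset, so the
 * recursion sees [sub_start, sub_end) in the submap's own address space.
 * The helper name below is hypothetical; it only mirrors the inline
 * arithmetic in vm_map_overwrite_submap_recurse().
 */
#if 0
static inline void
vm_map_overwrite_submap_range_sketch(
	vm_map_entry_t          entry,
	vm_map_offset_t         dst_end,
	vm_map_offset_t         *sub_start,
	vm_map_offset_t         *sub_end)
{
	vm_map_offset_t end;

	/* clamp the destination end to this entry */
	end = (entry->vme_end < dst_end) ? entry->vme_end : dst_end;

	/* rebase from parent-map coordinates into submap coordinates */
	*sub_start = VME_OFFSET(entry);
	*sub_end = (end - entry->vme_start) + VME_OFFSET(entry);
}
#endif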
9422
9423 /*
9424 * Routine: vm_map_copy_overwrite
9425 *
9426 * Description:
9427 * Copy the memory described by the map copy
9428 * object (copy; returned by vm_map_copyin) onto
9429 * the specified destination region (dst_map, dst_addr).
9430 * The destination must be writeable.
9431 *
9432 * Unlike vm_map_copyout, this routine actually
9433 * writes over previously-mapped memory. If the
9434 * previous mapping was to a permanent (user-supplied)
9435 * memory object, it is preserved.
9436 *
9437 * The attributes (protection and inheritance) of the
9438 * destination region are preserved.
9439 *
9440 * If successful, consumes the copy object.
9441 * Otherwise, the caller is responsible for it.
9442 *
9443 * Implementation notes:
9444 * To overwrite aligned temporary virtual memory, it is
9445 * sufficient to remove the previous mapping and insert
9446 * the new copy. This replacement is done either on
9447 * the whole region (if no permanent virtual memory
9448 * objects are embedded in the destination region) or
9449 * in individual map entries.
9450 *
9451 * To overwrite permanent virtual memory, it is necessary
9452 * to copy each page, as the external memory management
9453 * interface currently does not provide any optimizations.
9454 *
9455 * Unaligned memory also has to be copied. It is possible
9456 * to use 'vm_trickery' to copy the aligned data. This is
9457 * not done but not hard to implement.
9458 *
9459 * Once a page of permanent memory has been overwritten,
9460 * it is impossible to interrupt this function; otherwise,
9461 * the call would be neither atomic nor location-independent.
9462 * The kernel-state portion of a user thread must be
9463 * interruptible.
9464 *
9465 * It may be expensive to forward all requests that might
9466 * overwrite permanent memory (vm_write, vm_copy) to
9467 * uninterruptible kernel threads. This routine may be
9468 * called by interruptible threads; however, success is
9469 * not guaranteed -- if the request cannot be performed
9470 * atomically and interruptibly, an error indication is
9471 * returned.
9472 *
9473 * Callers of this function must call vm_map_copy_require on
9474 * previously created vm_map_copy_t or pass a newly created
9475 * one to ensure that it hasn't been forged.
9476 */
9477 static kern_return_t
9478 vm_map_copy_overwrite_nested(
9479 vm_map_t dst_map,
9480 vm_map_address_t dst_addr,
9481 vm_map_copy_t copy,
9482 boolean_t interruptible,
9483 pmap_t pmap,
9484 boolean_t discard_on_success)
9485 {
9486 vm_map_offset_t dst_end;
9487 vm_map_entry_t tmp_entry;
9488 vm_map_entry_t entry;
9489 kern_return_t kr;
9490 boolean_t aligned = TRUE;
9491 boolean_t contains_permanent_objects = FALSE;
9492 boolean_t encountered_sub_map = FALSE;
9493 vm_map_offset_t base_addr;
9494 vm_map_size_t copy_size;
9495 vm_map_size_t total_size;
9496 uint16_t copy_page_shift;
9497
9498 /*
9499 * Check for special kernel buffer allocated
9500 * by new_ipc_kmsg_copyin.
9501 */
9502
9503 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9504 kr = vm_map_copyout_kernel_buffer(
9505 dst_map, &dst_addr,
9506 copy, copy->size, TRUE, discard_on_success);
9507 return kr;
9508 }
9509
9510 /*
9511 * Only works for entry lists at the moment. Will
9512 * support page lists later.
9513 */
9514
9515 assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9516
9517 if (copy->size == 0) {
9518 if (discard_on_success) {
9519 vm_map_copy_discard(copy);
9520 }
9521 return KERN_SUCCESS;
9522 }
9523
9524 copy_page_shift = copy->cpy_hdr.page_shift;
9525
9526 /*
9527 * Verify that the destination is all writeable
9528 * initially. We have to trunc the destination
9529 * address and round the copy size or we'll end up
9530 * splitting entries in strange ways.
9531 */
9532
9533 if (!VM_MAP_PAGE_ALIGNED(copy->size,
9534 VM_MAP_PAGE_MASK(dst_map)) ||
9535 !VM_MAP_PAGE_ALIGNED(copy->offset,
9536 VM_MAP_PAGE_MASK(dst_map)) ||
9537 !VM_MAP_PAGE_ALIGNED(dst_addr,
9538 VM_MAP_PAGE_MASK(dst_map)) ||
9539 copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9540 aligned = FALSE;
9541 dst_end = vm_map_round_page(dst_addr + copy->size,
9542 VM_MAP_PAGE_MASK(dst_map));
9543 } else {
9544 dst_end = dst_addr + copy->size;
9545 }
9546
9547 vm_map_lock(dst_map);
9548
9549 /* LP64todo - remove this check when vm_map_commpage64()
9550 * no longer has to stuff in a map_entry for the commpage
9551 * above the map's max_offset.
9552 */
9553 if (dst_addr >= dst_map->max_offset) {
9554 vm_map_unlock(dst_map);
9555 return KERN_INVALID_ADDRESS;
9556 }
9557
9558 start_pass_1:
9559 if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9560 vm_map_unlock(dst_map);
9561 return KERN_INVALID_ADDRESS;
9562 }
9563 vm_map_clip_start(dst_map,
9564 tmp_entry,
9565 vm_map_trunc_page(dst_addr,
9566 VM_MAP_PAGE_MASK(dst_map)));
9567 for (entry = tmp_entry;;) {
9568 vm_map_entry_t next = entry->vme_next;
9569
9570 while (entry->is_sub_map) {
9571 vm_map_offset_t sub_start;
9572 vm_map_offset_t sub_end;
9573 vm_map_offset_t local_end;
9574
9575 if (entry->in_transition) {
9576 /*
9577 * Say that we are waiting, and wait for entry.
9578 */
9579 entry->needs_wakeup = TRUE;
9580 vm_map_entry_wait(dst_map, THREAD_UNINT);
9581
9582 goto start_pass_1;
9583 }
9584
9585 local_end = entry->vme_end;
9586 if (!(entry->needs_copy)) {
9587 vm_map_t sub_map = VM_MAP_NULL;
9588
9589 /* if needs_copy we are a COW submap */
9590 /* in such a case we just replace so */
9591 /* there is no need for the follow- */
9592 /* ing check. */
9593 encountered_sub_map = TRUE;
9594 sub_start = VME_OFFSET(entry);
9595
9596 if (entry->vme_end < dst_end) {
9597 sub_end = entry->vme_end;
9598 } else {
9599 sub_end = dst_end;
9600 }
9601 sub_end -= entry->vme_start;
9602 sub_end += VME_OFFSET(entry);
9603 sub_map = VME_SUBMAP(entry);
9604 vm_map_reference(sub_map);
9605 vm_map_unlock(dst_map);
9606
9607 kr = vm_map_overwrite_submap_recurse(
9608 sub_map,
9609 sub_start,
9610 sub_end - sub_start);
9611
9612 vm_map_deallocate(sub_map);
9613 sub_map = VM_MAP_NULL;
9614 if (kr != KERN_SUCCESS) {
9615 return kr;
9616 }
9617 vm_map_lock(dst_map);
9618 }
9619
9620 if (dst_end <= entry->vme_end) {
9621 goto start_overwrite;
9622 }
9623 if (!vm_map_lookup_entry(dst_map, local_end,
9624 &entry)) {
9625 vm_map_unlock(dst_map);
9626 return KERN_INVALID_ADDRESS;
9627 }
9628 next = entry->vme_next;
9629 }
9630 assert(!entry->is_sub_map);
9631
9632 if (!(entry->protection & VM_PROT_WRITE)) {
9633 vm_map_unlock(dst_map);
9634 return KERN_PROTECTION_FAILURE;
9635 }
9636
9637 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9638 vm_map_unlock(dst_map);
9639 return KERN_PROTECTION_FAILURE;
9640 }
9641
9642 /*
9643 * If the entry is in transition, we must wait
9644 * for it to exit that state. Anything could happen
9645 * when we unlock the map, so start over.
9646 */
9647 if (entry->in_transition) {
9648 /*
9649 * Say that we are waiting, and wait for entry.
9650 */
9651 entry->needs_wakeup = TRUE;
9652 vm_map_entry_wait(dst_map, THREAD_UNINT);
9653
9654 goto start_pass_1;
9655 }
9656
9657 /*
9658 * our range is contained completely within this map entry
9659 */
9660 if (dst_end <= entry->vme_end) {
9661 break;
9662 }
9663 /*
9664 * check that range specified is contiguous region
9665 */
9666 if ((next == vm_map_to_entry(dst_map)) ||
9667 (next->vme_start != entry->vme_end)) {
9668 vm_map_unlock(dst_map);
9669 return KERN_INVALID_ADDRESS;
9670 }
9671
9672
9673 /*
9674 * Check for permanent objects in the destination.
9675 */
9676 assert(!entry->is_sub_map);
9677 if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9678 ((!VME_OBJECT(entry)->internal) ||
9679 (VME_OBJECT(entry)->true_share))) {
9680 contains_permanent_objects = TRUE;
9681 }
9682
9683 entry = next;
9684 }/* for */
9685
9686 start_overwrite:
9687 /*
9688 * If there are permanent objects in the destination, then
9689 * the copy cannot be interrupted.
9690 */
9691
9692 if (interruptible && contains_permanent_objects) {
9693 vm_map_unlock(dst_map);
9694 return KERN_FAILURE; /* XXX */
9695 }
9696
9697 /*
9698 *
9699 * Make a second pass, overwriting the data.
9700 * At the beginning of each loop iteration,
9701 * the next entry to be overwritten is "tmp_entry"
9702 * (initially, the value returned from the lookup above),
9703 * and the starting address expected in that entry
9704 * is "start".
9705 */
9706
9707 total_size = copy->size;
9708 if (encountered_sub_map) {
9709 copy_size = 0;
9710 /* re-calculate tmp_entry since we've had the map */
9711 /* unlocked */
9712 if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9713 vm_map_unlock(dst_map);
9714 return KERN_INVALID_ADDRESS;
9715 }
9716 } else {
9717 copy_size = copy->size;
9718 }
9719
9720 base_addr = dst_addr;
9721 while (TRUE) {
9722 /* deconstruct the copy object and do in parts */
9723 /* only in sub_map, interruptible case */
9724 vm_map_entry_t copy_entry;
9725 vm_map_entry_t previous_prev = VM_MAP_ENTRY_NULL;
9726 vm_map_entry_t next_copy = VM_MAP_ENTRY_NULL;
9727 int nentries;
9728 int remaining_entries = 0;
9729 vm_map_offset_t new_offset = 0;
9730
9731 for (entry = tmp_entry; copy_size == 0;) {
9732 vm_map_entry_t next;
9733
9734 next = entry->vme_next;
9735
9736 /* tmp_entry and base address are moved along */
9737 /* each time we encounter a sub-map. Otherwise */
9738 /* entry can outpace tmp_entry, and the copy_size */
9739 /* may reflect the distance between them. */
9740 /* if the current entry is found to be in transition */
9741 /* we will start over at the beginning or at the last */
9742 /* encounter of a submap, as dictated by base_addr, */
9743 /* and we will zero copy_size accordingly. */
9744 if (entry->in_transition) {
9745 /*
9746 * Say that we are waiting, and wait for entry.
9747 */
9748 entry->needs_wakeup = TRUE;
9749 vm_map_entry_wait(dst_map, THREAD_UNINT);
9750
9751 if (!vm_map_lookup_entry(dst_map, base_addr,
9752 &tmp_entry)) {
9753 vm_map_unlock(dst_map);
9754 return KERN_INVALID_ADDRESS;
9755 }
9756 copy_size = 0;
9757 entry = tmp_entry;
9758 continue;
9759 }
9760 if (entry->is_sub_map) {
9761 vm_map_offset_t sub_start;
9762 vm_map_offset_t sub_end;
9763 vm_map_offset_t local_end;
9764 vm_map_t sub_map = VM_MAP_NULL;
9765 bool use_pmap;
9766
9767 if (entry->needs_copy) {
9768 /* if this is a COW submap */
9769 /* just back the range with an */
9770 /* anonymous entry */
9771 assert(!entry->vme_permanent);
9772 if (entry->vme_end < dst_end) {
9773 sub_end = entry->vme_end;
9774 } else {
9775 sub_end = dst_end;
9776 }
9777 if (entry->vme_start < base_addr) {
9778 sub_start = base_addr;
9779 } else {
9780 sub_start = entry->vme_start;
9781 }
9782 vm_map_clip_end(
9783 dst_map, entry, sub_end);
9784 vm_map_clip_start(
9785 dst_map, entry, sub_start);
9786 assert(!entry->use_pmap);
9787 assert(!entry->iokit_acct);
9788 entry->use_pmap = TRUE;
9789 vm_map_deallocate(VME_SUBMAP(entry));
9790 assert(!entry->vme_permanent);
9791 VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9792 VME_OFFSET_SET(entry, 0);
9793 entry->is_shared = FALSE;
9794 entry->needs_copy = FALSE;
9795 entry->protection = VM_PROT_DEFAULT;
9796 entry->max_protection = VM_PROT_ALL;
9797 entry->wired_count = 0;
9798 entry->user_wired_count = 0;
9799 if (entry->inheritance
9800 == VM_INHERIT_SHARE) {
9801 entry->inheritance = VM_INHERIT_COPY;
9802 }
9803 continue;
9804 }
9805 /* first take care of any non-sub_map */
9806 /* entries to send */
9807 if (base_addr < entry->vme_start) {
9808 /* stuff to send */
9809 copy_size =
9810 entry->vme_start - base_addr;
9811 break;
9812 }
9813 sub_start = VME_OFFSET(entry);
9814
9815 if (entry->vme_end < dst_end) {
9816 sub_end = entry->vme_end;
9817 } else {
9818 sub_end = dst_end;
9819 }
9820 sub_end -= entry->vme_start;
9821 sub_end += VME_OFFSET(entry);
9822 local_end = entry->vme_end;
9823 use_pmap = entry->use_pmap;
9824 sub_map = VME_SUBMAP(entry);
9825 vm_map_reference(sub_map);
9826 vm_map_unlock(dst_map);
9827 copy_size = sub_end - sub_start;
9828
9829 /* adjust the copy object */
9830 if (total_size > copy_size) {
9831 vm_map_size_t local_size = 0;
9832 vm_map_size_t entry_size;
9833
9834 nentries = 1;
9835 new_offset = copy->offset;
9836 copy_entry = vm_map_copy_first_entry(copy);
9837 while (copy_entry !=
9838 vm_map_copy_to_entry(copy)) {
9839 entry_size = copy_entry->vme_end -
9840 copy_entry->vme_start;
9841 if ((local_size < copy_size) &&
9842 ((local_size + entry_size)
9843 >= copy_size)) {
9844 vm_map_copy_clip_end(copy,
9845 copy_entry,
9846 copy_entry->vme_start +
9847 (copy_size - local_size));
9848 entry_size = copy_entry->vme_end -
9849 copy_entry->vme_start;
9850 local_size += entry_size;
9851 new_offset += entry_size;
9852 }
9853 if (local_size >= copy_size) {
9854 next_copy = copy_entry->vme_next;
9855 copy_entry->vme_next =
9856 vm_map_copy_to_entry(copy);
9857 previous_prev =
9858 copy->cpy_hdr.links.prev;
9859 copy->cpy_hdr.links.prev = copy_entry;
9860 copy->size = copy_size;
9861 remaining_entries =
9862 copy->cpy_hdr.nentries;
9863 remaining_entries -= nentries;
9864 copy->cpy_hdr.nentries = nentries;
9865 break;
9866 } else {
9867 local_size += entry_size;
9868 new_offset += entry_size;
9869 nentries++;
9870 }
9871 copy_entry = copy_entry->vme_next;
9872 }
9873 }
9874
9875 if ((use_pmap) && (pmap == NULL)) {
9876 kr = vm_map_copy_overwrite_nested(
9877 sub_map,
9878 sub_start,
9879 copy,
9880 interruptible,
9881 sub_map->pmap,
9882 TRUE);
9883 } else if (pmap != NULL) {
9884 kr = vm_map_copy_overwrite_nested(
9885 sub_map,
9886 sub_start,
9887 copy,
9888 interruptible, pmap,
9889 TRUE);
9890 } else {
9891 kr = vm_map_copy_overwrite_nested(
9892 sub_map,
9893 sub_start,
9894 copy,
9895 interruptible,
9896 dst_map->pmap,
9897 TRUE);
9898 }
9899
9900 vm_map_deallocate(sub_map);
9901 sub_map = VM_MAP_NULL;
9902
9903 if (kr != KERN_SUCCESS) {
9904 if (next_copy != NULL) {
9905 copy->cpy_hdr.nentries +=
9906 remaining_entries;
9907 copy->cpy_hdr.links.prev->vme_next =
9908 next_copy;
9909 copy->cpy_hdr.links.prev
9910 = previous_prev;
9911 copy->size = total_size;
9912 }
9913 return kr;
9914 }
9915 if (dst_end <= local_end) {
9916 return KERN_SUCCESS;
9917 }
9918 /* otherwise copy no longer exists, it was */
9919 /* destroyed after successful copy_overwrite */
9920 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9921 copy->offset = new_offset;
9922 copy->cpy_hdr.page_shift = copy_page_shift;
9923
9924 total_size -= copy_size;
9925 copy_size = 0;
9926 /* put back remainder of copy in container */
9927 if (next_copy != NULL) {
9928 copy->cpy_hdr.nentries = remaining_entries;
9929 copy->cpy_hdr.links.next = next_copy;
9930 copy->cpy_hdr.links.prev = previous_prev;
9931 copy->size = total_size;
9932 next_copy->vme_prev =
9933 vm_map_copy_to_entry(copy);
9934 next_copy = NULL;
9935 }
9936 base_addr = local_end;
9937 vm_map_lock(dst_map);
9938 if (!vm_map_lookup_entry(dst_map,
9939 local_end, &tmp_entry)) {
9940 vm_map_unlock(dst_map);
9941 return KERN_INVALID_ADDRESS;
9942 }
9943 entry = tmp_entry;
9944 continue;
9945 }
9946 assert(!entry->is_sub_map);
9947
9948 if (dst_end <= entry->vme_end) {
9949 copy_size = dst_end - base_addr;
9950 break;
9951 }
9952
9953 if ((next == vm_map_to_entry(dst_map)) ||
9954 (next->vme_start != entry->vme_end)) {
9955 vm_map_unlock(dst_map);
9956 return KERN_INVALID_ADDRESS;
9957 }
9958
9959 entry = next;
9960 }/* for */
9961
9962 next_copy = NULL;
9963 nentries = 1;
9964
9965 /* adjust the copy object */
9966 if (total_size > copy_size) {
9967 vm_map_size_t local_size = 0;
9968 vm_map_size_t entry_size;
9969
9970 new_offset = copy->offset;
9971 copy_entry = vm_map_copy_first_entry(copy);
9972 while (copy_entry != vm_map_copy_to_entry(copy)) {
9973 entry_size = copy_entry->vme_end -
9974 copy_entry->vme_start;
9975 if ((local_size < copy_size) &&
9976 ((local_size + entry_size)
9977 >= copy_size)) {
9978 vm_map_copy_clip_end(copy, copy_entry,
9979 copy_entry->vme_start +
9980 (copy_size - local_size));
9981 entry_size = copy_entry->vme_end -
9982 copy_entry->vme_start;
9983 local_size += entry_size;
9984 new_offset += entry_size;
9985 }
9986 if (local_size >= copy_size) {
9987 next_copy = copy_entry->vme_next;
9988 copy_entry->vme_next =
9989 vm_map_copy_to_entry(copy);
9990 previous_prev =
9991 copy->cpy_hdr.links.prev;
9992 copy->cpy_hdr.links.prev = copy_entry;
9993 copy->size = copy_size;
9994 remaining_entries =
9995 copy->cpy_hdr.nentries;
9996 remaining_entries -= nentries;
9997 copy->cpy_hdr.nentries = nentries;
9998 break;
9999 } else {
10000 local_size += entry_size;
10001 new_offset += entry_size;
10002 nentries++;
10003 }
10004 copy_entry = copy_entry->vme_next;
10005 }
10006 }
10007
10008 if (aligned) {
10009 pmap_t local_pmap;
10010
10011 if (pmap) {
10012 local_pmap = pmap;
10013 } else {
10014 local_pmap = dst_map->pmap;
10015 }
10016
10017 if ((kr = vm_map_copy_overwrite_aligned(
10018 dst_map, tmp_entry, copy,
10019 base_addr, local_pmap)) != KERN_SUCCESS) {
10020 if (next_copy != NULL) {
10021 copy->cpy_hdr.nentries +=
10022 remaining_entries;
10023 copy->cpy_hdr.links.prev->vme_next =
10024 next_copy;
10025 copy->cpy_hdr.links.prev =
10026 previous_prev;
10027 copy->size += copy_size;
10028 }
10029 return kr;
10030 }
10031 vm_map_unlock(dst_map);
10032 } else {
10033 /*
10034 * Performance gain:
10035 *
10036 * if the copy and dst address are misaligned but the same
10037 * offset within the page we can copy_not_aligned the
10038 * misaligned parts and copy aligned the rest. If they are
10039 * aligned but len is unaligned we simply need to copy
10040 * the end bit unaligned. We'll need to split the misaligned
10041 * bits of the region in this case!
10042 */
10043 /* ALWAYS UNLOCKS THE dst_map MAP */
10044 kr = vm_map_copy_overwrite_unaligned(
10045 dst_map,
10046 tmp_entry,
10047 copy,
10048 base_addr,
10049 discard_on_success);
10050 if (kr != KERN_SUCCESS) {
10051 if (next_copy != NULL) {
10052 copy->cpy_hdr.nentries +=
10053 remaining_entries;
10054 copy->cpy_hdr.links.prev->vme_next =
10055 next_copy;
10056 copy->cpy_hdr.links.prev =
10057 previous_prev;
10058 copy->size += copy_size;
10059 }
10060 return kr;
10061 }
10062 }
10063 total_size -= copy_size;
10064 if (total_size == 0) {
10065 break;
10066 }
10067 base_addr += copy_size;
10068 copy_size = 0;
10069 copy->offset = new_offset;
10070 if (next_copy != NULL) {
10071 copy->cpy_hdr.nentries = remaining_entries;
10072 copy->cpy_hdr.links.next = next_copy;
10073 copy->cpy_hdr.links.prev = previous_prev;
10074 next_copy->vme_prev = vm_map_copy_to_entry(copy);
10075 copy->size = total_size;
10076 }
10077 vm_map_lock(dst_map);
10078 while (TRUE) {
10079 if (!vm_map_lookup_entry(dst_map,
10080 base_addr, &tmp_entry)) {
10081 vm_map_unlock(dst_map);
10082 return KERN_INVALID_ADDRESS;
10083 }
10084 if (tmp_entry->in_transition) {
10085 entry->needs_wakeup = TRUE;
10086 vm_map_entry_wait(dst_map, THREAD_UNINT);
10087 } else {
10088 break;
10089 }
10090 }
10091 vm_map_clip_start(dst_map,
10092 tmp_entry,
10093 vm_map_trunc_page(base_addr,
10094 VM_MAP_PAGE_MASK(dst_map)));
10095
10096 entry = tmp_entry;
10097 } /* while */
10098
10099 /*
10100 * Throw away the vm_map_copy object
10101 */
10102 if (discard_on_success) {
10103 vm_map_copy_discard(copy);
10104 }
10105
10106 return KERN_SUCCESS;
10107 }/* vm_map_copy_overwrite */
10108
10109 static __attribute__((always_inline, warn_unused_result))
10110 kern_return_t
10111 vm_map_copy_addr_size_sanitize(
10112 vm_map_t map,
10113 vm_map_offset_ut addr_u,
10114 vm_map_size_ut size_u,
10115 vm_sanitize_caller_t vm_sanitize_caller,
10116 vm_map_offset_t *addr,
10117 vm_map_offset_t *end,
10118 vm_map_size_t *size)
10119 {
10120 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
10121 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES |
10122 VM_SANITIZE_FLAGS_CHECK_ADDR_RANGE;
10123
10124 return vm_sanitize_addr_size(addr_u, size_u,
10125 vm_sanitize_caller, map,
10126 flags,
10127 addr, end, size);
10128 }
10129
10130 kern_return_t
10131 vm_map_copy_overwrite(
10132 vm_map_t dst_map,
10133 vm_map_offset_ut dst_addr_u,
10134 vm_map_copy_t copy,
10135 vm_map_size_ut copy_size_u,
10136 boolean_t interruptible)
10137 {
10138 vm_map_offset_t dst_addr, dst_end;
10139 vm_map_size_t copy_size;
10140 vm_map_size_t head_size, tail_size;
10141 vm_map_copy_t head_copy, tail_copy;
10142 vm_map_offset_t head_addr, tail_addr;
10143 vm_map_entry_t entry;
10144 kern_return_t kr;
10145 vm_map_offset_t effective_page_mask, effective_page_size;
10146 uint16_t copy_page_shift;
10147
10148 head_size = 0;
10149 tail_size = 0;
10150 head_copy = NULL;
10151 tail_copy = NULL;
10152 head_addr = 0;
10153 tail_addr = 0;
10154
10155 /*
10156 * Check for null copy object.
10157 */
10158 if (copy == VM_MAP_COPY_NULL) {
10159 return KERN_SUCCESS;
10160 }
10161
10162 /*
10163 * Sanitize any input parameters that are addr/size/prot/inherit
10164 */
10165 kr = vm_map_copy_addr_size_sanitize(
10166 dst_map,
10167 dst_addr_u,
10168 copy_size_u,
10169 VM_SANITIZE_CALLER_VM_MAP_COPY_OVERWRITE,
10170 &dst_addr,
10171 &dst_end,
10172 &copy_size);
10173 if (__improbable(kr != KERN_SUCCESS)) {
10174 return vm_sanitize_get_kr(kr);
10175 }
10176
10177 /*
10178 * Assert that the vm_map_copy is coming from the right
10179 * zone and hasn't been forged
10180 */
10181 vm_map_copy_require(copy);
10182
10183 if (interruptible ||
10184 copy->type != VM_MAP_COPY_ENTRY_LIST) {
10185 /*
10186 * We can't split the "copy" map if we're interruptible
10187 * or if we don't have a "copy" map...
10188 */
10189 blunt_copy:
10190 kr = vm_map_copy_overwrite_nested(dst_map,
10191 dst_addr,
10192 copy,
10193 interruptible,
10194 (pmap_t) NULL,
10195 TRUE);
10196 if (kr) {
10197 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), kr /* arg */);
10198 }
10199 return kr;
10200 }
10201
10202 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
10203 if (copy_page_shift < PAGE_SHIFT ||
10204 VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10205 goto blunt_copy;
10206 }
10207
10208 if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10209 effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
10210 } else {
10211 effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
10212 effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
10213 effective_page_mask);
10214 }
10215 effective_page_size = effective_page_mask + 1;
10216
10217 if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
10218 /*
10219 * Too small to bother with optimizing...
10220 */
10221 goto blunt_copy;
10222 }
10223
10224 if ((dst_addr & effective_page_mask) !=
10225 (copy->offset & effective_page_mask)) {
10226 /*
10227 * Incompatible mis-alignment of source and destination...
10228 */
10229 goto blunt_copy;
10230 }
10231
10232 /*
10233 * Proper alignment or identical mis-alignment at the beginning.
10234 * Let's try and do a small unaligned copy first (if needed)
10235 * and then an aligned copy for the rest.
10236 */
10237 if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
10238 head_addr = dst_addr;
10239 head_size = (effective_page_size -
10240 (copy->offset & effective_page_mask));
10241 head_size = MIN(head_size, copy_size);
10242 }
10243 if (!vm_map_page_aligned(copy->offset + copy_size,
10244 effective_page_mask)) {
10245 /*
10246 * Mis-alignment at the end.
10247 * Do an aligned copy up to the last page and
10248 * then an unaligned copy for the remaining bytes.
10249 */
10250 tail_size = ((copy->offset + copy_size) &
10251 effective_page_mask);
10252 tail_size = MIN(tail_size, copy_size);
10253 tail_addr = dst_addr + copy_size - tail_size;
10254 assert(tail_addr >= head_addr + head_size);
10255 }
10256 assert(head_size + tail_size <= copy_size);
10257
10258 if (head_size + tail_size == copy_size) {
10259 /*
10260 * It's all unaligned, no optimization possible...
10261 */
10262 goto blunt_copy;
10263 }
10264
10265 /*
10266 * Can't optimize if there are any submaps in the
10267 * destination due to the way we free the "copy" map
10268 * progressively in vm_map_copy_overwrite_nested()
10269 * in that case.
10270 */
10271 vm_map_lock_read(dst_map);
10272 if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
10273 vm_map_unlock_read(dst_map);
10274 goto blunt_copy;
10275 }
10276 for (;
10277 (entry != vm_map_to_entry(dst_map) &&
10278 entry->vme_start < dst_addr + copy_size);
10279 entry = entry->vme_next) {
10280 if (entry->is_sub_map) {
10281 vm_map_unlock_read(dst_map);
10282 goto blunt_copy;
10283 }
10284 }
10285 vm_map_unlock_read(dst_map);
10286
10287 if (head_size) {
10288 /*
10289 * Unaligned copy of the first "head_size" bytes, to reach
10290 * a page boundary.
10291 */
10292
10293 /*
10294 * Extract "head_copy" out of "copy".
10295 */
10296 head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10297 head_copy->cpy_hdr.entries_pageable =
10298 copy->cpy_hdr.entries_pageable;
10299 head_copy->cpy_hdr.page_shift = copy_page_shift;
10300
10301 entry = vm_map_copy_first_entry(copy);
10302 if (entry->vme_end < copy->offset + head_size) {
10303 head_size = entry->vme_end - copy->offset;
10304 }
10305
10306 head_copy->offset = copy->offset;
10307 head_copy->size = head_size;
10308 copy->offset += head_size;
10309 copy->size -= head_size;
10310 copy_size -= head_size;
10311 assert(copy_size > 0);
10312
10313 vm_map_copy_clip_end(copy, entry, copy->offset);
10314 vm_map_copy_entry_unlink(copy, entry);
10315 vm_map_copy_entry_link(head_copy,
10316 vm_map_copy_to_entry(head_copy),
10317 entry);
10318
10319 /*
10320 * Do the unaligned copy.
10321 */
10322 kr = vm_map_copy_overwrite_nested(dst_map,
10323 head_addr,
10324 head_copy,
10325 interruptible,
10326 (pmap_t) NULL,
10327 FALSE);
10328 if (kr != KERN_SUCCESS) {
10329 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_HEAD_NESTED_ERROR), kr /* arg */);
10330 goto done;
10331 }
10332 }
10333
10334 if (tail_size) {
10335 /*
10336 * Extract "tail_copy" out of "copy".
10337 */
10338 tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10339 tail_copy->cpy_hdr.entries_pageable =
10340 copy->cpy_hdr.entries_pageable;
10341 tail_copy->cpy_hdr.page_shift = copy_page_shift;
10342
10343 tail_copy->offset = copy->offset + copy_size - tail_size;
10344 tail_copy->size = tail_size;
10345
10346 copy->size -= tail_size;
10347 copy_size -= tail_size;
10348 assert(copy_size > 0);
10349
10350 entry = vm_map_copy_last_entry(copy);
10351 vm_map_copy_clip_start(copy, entry, tail_copy->offset);
10352 entry = vm_map_copy_last_entry(copy);
10353 vm_map_copy_entry_unlink(copy, entry);
10354 vm_map_copy_entry_link(tail_copy,
10355 vm_map_copy_last_entry(tail_copy),
10356 entry);
10357 }
10358
10359 /*
10360 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
10361 * we want to avoid TOCTOU issues w.r.t copy->size but
10362 * we don't need to change vm_map_copy_overwrite_nested()
10363 * and all other vm_map_copy_overwrite variants.
10364 *
10365 * So we assign the original copy_size that was passed into
10366 * this routine back to copy.
10367 *
10368 * This use of local 'copy_size' passed into this routine is
10369 * to try and protect against TOCTOU attacks where the kernel
10370 * has been exploited. We don't expect this to be an issue
10371 * during normal system operation.
10372 */
10373 assertf(copy->size == copy_size,
10374 "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
10375 copy->size = copy_size;
10376
10377 /*
10378 * Copy most (or possibly all) of the data.
10379 */
10380 kr = vm_map_copy_overwrite_nested(dst_map,
10381 dst_addr + head_size,
10382 copy,
10383 interruptible,
10384 (pmap_t) NULL,
10385 FALSE);
10386 if (kr != KERN_SUCCESS) {
10387 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_NESTED_ERROR), kr /* arg */);
10388 goto done;
10389 }
10390
10391 if (tail_size) {
10392 kr = vm_map_copy_overwrite_nested(dst_map,
10393 tail_addr,
10394 tail_copy,
10395 interruptible,
10396 (pmap_t) NULL,
10397 FALSE);
10398 if (kr) {
10399 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_TAIL_NESTED_ERROR), kr /* arg */);
10400 }
10401 }
10402
10403 done:
10404 assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
10405 if (kr == KERN_SUCCESS) {
10406 /*
10407 * Discard all the copy maps.
10408 */
10409 if (head_copy) {
10410 vm_map_copy_discard(head_copy);
10411 head_copy = NULL;
10412 }
10413 vm_map_copy_discard(copy);
10414 if (tail_copy) {
10415 vm_map_copy_discard(tail_copy);
10416 tail_copy = NULL;
10417 }
10418 } else {
10419 /*
10420 * Re-assemble the original copy map.
10421 */
10422 if (head_copy) {
10423 entry = vm_map_copy_first_entry(head_copy);
10424 vm_map_copy_entry_unlink(head_copy, entry);
10425 vm_map_copy_entry_link(copy,
10426 vm_map_copy_to_entry(copy),
10427 entry);
10428 copy->offset -= head_size;
10429 copy->size += head_size;
10430 vm_map_copy_discard(head_copy);
10431 head_copy = NULL;
10432 }
10433 if (tail_copy) {
10434 entry = vm_map_copy_last_entry(tail_copy);
10435 vm_map_copy_entry_unlink(tail_copy, entry);
10436 vm_map_copy_entry_link(copy,
10437 vm_map_copy_last_entry(copy),
10438 entry);
10439 copy->size += tail_size;
10440 vm_map_copy_discard(tail_copy);
10441 tail_copy = NULL;
10442 }
10443 }
10444 return kr;
10445 }
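
/*
 * Illustrative sketch (not compiled): how the head/tail split above carves a
 * misaligned overwrite into an unaligned head, an aligned middle and an
 * unaligned tail.  Names are hypothetical; the real logic lives inline in
 * vm_map_copy_overwrite() and additionally caps head_size and tail_size at
 * copy_size and falls back to the blunt copy when nothing aligned remains.
 */
#if 0
static inline void
vm_map_copy_overwrite_split_sketch(
	vm_map_offset_t         dst_addr,
	vm_object_offset_t      copy_offset,
	vm_map_size_t           copy_size,
	vm_map_offset_t         effective_page_mask,
	vm_map_size_t           *head_size,
	vm_map_size_t           *tail_size)
{
	vm_map_offset_t effective_page_size = effective_page_mask + 1;

	*head_size = 0;
	*tail_size = 0;

	/* unaligned start: copy up to the next page boundary separately */
	if (dst_addr & effective_page_mask) {
		*head_size = effective_page_size -
		    (copy_offset & effective_page_mask);
	}
	/* unaligned end: copy the final partial page separately */
	if ((copy_offset + copy_size) & effective_page_mask) {
		*tail_size = (copy_offset + copy_size) & effective_page_mask;
	}
	/* everything in between is page-aligned and can take the fast path */
}
#endif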
10446
10447
10448 /*
10449 * Routine: vm_map_copy_overwrite_unaligned [internal use only]
10450 *
10451 * Description:
10452 * Physically copy unaligned data
10453 *
10454 * Implementation:
10455 * Unaligned parts of pages have to be physically copied. We use
10456 * a modified form of vm_fault_copy (which understands non-aligned
10457 * page offsets and sizes) to do the copy. We attempt to copy as
10458 * much memory in one go as possible; however, vm_fault_copy copies
10459 * within one memory object, so we have to find the smallest of "amount left",
10460 * "source object data size" and "target object data size". With
10461 * unaligned data we don't need to split regions, therefore the source
10462 * (copy) object should be one map entry, the target range may be split
10463 * over multiple map entries however. In any event we are pessimistic
10464 * about these assumptions.
10465 *
10466 * Callers of this function must call vm_map_copy_require on
10467 * previously created vm_map_copy_t or pass a newly created
10468 * one to ensure that it hasn't been forged.
10469 *
10470 * Assumptions:
10471 * dst_map is locked on entry and is return locked on success,
10472 * unlocked on error.
10473 */
10474
10475 static kern_return_t
10476 vm_map_copy_overwrite_unaligned(
10477 vm_map_t dst_map,
10478 vm_map_entry_t entry,
10479 vm_map_copy_t copy,
10480 vm_map_offset_t start,
10481 boolean_t discard_on_success)
10482 {
10483 vm_map_entry_t copy_entry;
10484 vm_map_entry_t copy_entry_next;
10485 vm_map_version_t version;
10486 vm_object_t dst_object;
10487 vm_object_offset_t dst_offset;
10488 vm_object_offset_t src_offset;
10489 vm_object_offset_t entry_offset;
10490 vm_map_offset_t entry_end;
10491 vm_map_size_t src_size,
10492 dst_size,
10493 copy_size,
10494 amount_left;
10495 kern_return_t kr = KERN_SUCCESS;
10496
10497
10498 copy_entry = vm_map_copy_first_entry(copy);
10499
10500 vm_map_lock_write_to_read(dst_map);
10501
10502 src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
10503 amount_left = copy->size;
10504 /*
10505 * unaligned, so we never clipped this entry; we need the offset into
10506 * the vm_object, not just the data.
10507 */
10508 while (amount_left > 0) {
10509 if (entry == vm_map_to_entry(dst_map)) {
10510 vm_map_unlock_read(dst_map);
10511 return KERN_INVALID_ADDRESS;
10512 }
10513
10514 /* "start" must be within the current map entry */
10515 assert((start >= entry->vme_start) && (start < entry->vme_end));
10516
10517 /*
10518 * Check protection again
10519 */
10520 if (!(entry->protection & VM_PROT_WRITE)) {
10521 vm_map_unlock_read(dst_map);
10522 return KERN_PROTECTION_FAILURE;
10523 }
10524 if (entry->is_sub_map) {
10525 /* not implemented... */
10526 vm_map_unlock_read(dst_map);
10527 return KERN_INVALID_ARGUMENT;
10528 }
10529 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10530 vm_map_unlock_read(dst_map);
10531 return KERN_PROTECTION_FAILURE;
10532 }
10533 /*
10534 * If the entry is in transition, we must wait
10535 * for it to exit that state. Anything could happen
10536 * when we unlock the map, so start over.
10537 */
10538 if (entry->in_transition) {
10539 /*
10540 * Say that we are waiting, and wait for entry.
10541 */
10542 entry->needs_wakeup = TRUE;
10543 vm_map_entry_wait(dst_map, THREAD_UNINT);
10544
10545 goto RetryLookup;
10546 }
10547
10548 dst_offset = start - entry->vme_start;
10549
10550 dst_size = entry->vme_end - start;
10551
10552 src_size = copy_entry->vme_end -
10553 (copy_entry->vme_start + src_offset);
10554
10555 if (dst_size < src_size) {
10556 /*
10557 * we can only copy dst_size bytes before
10558 * we have to get the next destination entry
10559 */
10560 copy_size = dst_size;
10561 } else {
10562 /*
10563 * we can only copy src_size bytes before
10564 * we have to get the next source copy entry
10565 */
10566 copy_size = src_size;
10567 }
10568
10569 if (copy_size > amount_left) {
10570 copy_size = amount_left;
10571 }
10572 /*
10573 * Entry needs copy: create a shadow object for the
10574 * copy-on-write region.
10575 */
10576 assert(!entry->is_sub_map);
10577 if (entry->needs_copy) {
10578 if (vm_map_lock_read_to_write(dst_map)) {
10579 vm_map_lock_read(dst_map);
10580 goto RetryLookup;
10581 }
10582 VME_OBJECT_SHADOW(entry,
10583 (vm_map_size_t)(entry->vme_end
10584 - entry->vme_start),
10585 vm_map_always_shadow(dst_map));
10586 entry->needs_copy = FALSE;
10587 vm_map_lock_write_to_read(dst_map);
10588 }
10589 dst_object = VME_OBJECT(entry);
10590 /*
10591 * unlike with the virtual (aligned) copy, we're going
10592 * to fault on it, therefore we need a target object.
10593 */
10594 if (dst_object == VM_OBJECT_NULL) {
10595 if (vm_map_lock_read_to_write(dst_map)) {
10596 vm_map_lock_read(dst_map);
10597 goto RetryLookup;
10598 }
10599 dst_object = vm_object_allocate((vm_map_size_t)
10600 entry->vme_end - entry->vme_start);
10601 VME_OBJECT_SET(entry, dst_object, false, 0);
10602 VME_OFFSET_SET(entry, 0);
10603 assert(entry->use_pmap);
10604 vm_map_lock_write_to_read(dst_map);
10605 }
10606 /*
10607 * Take an object reference and unlock map. The "entry" may
10608 * disappear or change when the map is unlocked.
10609 */
10610 vm_object_reference(dst_object);
10611 version.main_timestamp = dst_map->timestamp;
10612 entry_offset = VME_OFFSET(entry);
10613 entry_end = entry->vme_end;
10614 vm_map_unlock_read(dst_map);
10615 /*
10616 * Copy as much as possible in one pass
10617 */
10618 kr = vm_fault_copy(
10619 VME_OBJECT(copy_entry),
10620 VME_OFFSET(copy_entry) + src_offset,
10621 &copy_size,
10622 dst_object,
10623 entry_offset + dst_offset,
10624 dst_map,
10625 &version,
10626 THREAD_UNINT );
10627
10628 start += copy_size;
10629 src_offset += copy_size;
10630 amount_left -= copy_size;
10631 /*
10632 * Release the object reference
10633 */
10634 vm_object_deallocate(dst_object);
10635 /*
10636 * If a hard error occurred, return it now
10637 */
10638 if (kr != KERN_SUCCESS) {
10639 return kr;
10640 }
10641
10642 if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
10643 || amount_left == 0) {
10644 /*
10645 * all done with this copy entry, dispose.
10646 */
10647 copy_entry_next = copy_entry->vme_next;
10648
10649 if (discard_on_success) {
10650 vm_map_copy_entry_unlink(copy, copy_entry);
10651 assert(!copy_entry->is_sub_map);
10652 vm_object_deallocate(VME_OBJECT(copy_entry));
10653 vm_map_copy_entry_dispose(copy_entry);
10654 }
10655
10656 if (copy_entry_next == vm_map_copy_to_entry(copy) &&
10657 amount_left) {
10658 /*
10659 * not finished copying but run out of source
10660 */
10661 return KERN_INVALID_ADDRESS;
10662 }
10663
10664 copy_entry = copy_entry_next;
10665
10666 src_offset = 0;
10667 }
10668
10669 if (amount_left == 0) {
10670 return KERN_SUCCESS;
10671 }
10672
10673 vm_map_lock_read(dst_map);
10674 if (version.main_timestamp == dst_map->timestamp) {
10675 if (start == entry_end) {
10676 /*
10677 * destination region is split. Use the version
10678 * information to avoid a lookup in the normal
10679 * case.
10680 */
10681 entry = entry->vme_next;
10682 /*
10683 * should be contiguous. Fail if we encounter
10684 * a hole in the destination.
10685 */
10686 if (start != entry->vme_start) {
10687 vm_map_unlock_read(dst_map);
10688 return KERN_INVALID_ADDRESS;
10689 }
10690 }
10691 } else {
10692 /*
10693 * Map version check failed.
10694 * we must lookup the entry because somebody
10695 * might have changed the map behind our backs.
10696 */
10697 RetryLookup:
10698 if (!vm_map_lookup_entry(dst_map, start, &entry)) {
10699 vm_map_unlock_read(dst_map);
10700 return KERN_INVALID_ADDRESS;
10701 }
10702 }
10703 }/* while */
10704
10705 return KERN_SUCCESS;
10706 }/* vm_map_copy_overwrite_unaligned */
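
/*
 * Illustrative sketch (not compiled): how each vm_fault_copy() pass above is
 * sized.  A pass can never cross the end of the current destination entry,
 * the end of the current source copy entry, or the amount still left to
 * copy, so it is simply the minimum of the three.  The helper name is
 * hypothetical; the real code computes this inline.
 */
#if 0
static inline vm_map_size_t
vm_map_copy_overwrite_unaligned_pass_size_sketch(
	vm_map_size_t           dst_size,       /* entry->vme_end - start */
	vm_map_size_t           src_size,       /* rest of the source copy entry */
	vm_map_size_t           amount_left)    /* bytes still to copy */
{
	vm_map_size_t copy_size;

	copy_size = (dst_size < src_size) ? dst_size : src_size;
	if (copy_size > amount_left) {
		copy_size = amount_left;
	}
	return copy_size;
}
#endif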
10707
10708 /*
10709 * Routine: vm_map_copy_overwrite_aligned [internal use only]
10710 *
10711 * Description:
10712 * Does all the vm_trickery possible for whole pages.
10713 *
10714 * Implementation:
10715 *
10716 * If there are no permanent objects in the destination,
10717 * and the source and destination map entry zones match,
10718 * and the destination map entry is not shared,
10719 * then the map entries can be deleted and replaced
10720 * with those from the copy. The following code is the
10721 * basic idea of what to do, but there are lots of annoying
10722 * little details about getting protection and inheritance
10723 * right. Should add protection, inheritance, and sharing checks
10724 * to the above pass and make sure that no wiring is involved.
10725 *
10726 * Callers of this function must call vm_map_copy_require on
10727 * previously created vm_map_copy_t or pass a newly created
10728 * one to ensure that it hasn't been forged.
10729 */
10730
10731 int vm_map_copy_overwrite_aligned_src_not_internal = 0;
10732 int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
10733 int vm_map_copy_overwrite_aligned_src_large = 0;
10734
10735 static kern_return_t
10736 vm_map_copy_overwrite_aligned(
10737 vm_map_t dst_map,
10738 vm_map_entry_t tmp_entry,
10739 vm_map_copy_t copy,
10740 vm_map_offset_t start,
10741 __unused pmap_t pmap)
10742 {
10743 vm_object_t object;
10744 vm_map_entry_t copy_entry;
10745 vm_map_size_t copy_size;
10746 vm_map_size_t size;
10747 vm_map_entry_t entry;
10748
10749 while ((copy_entry = vm_map_copy_first_entry(copy))
10750 != vm_map_copy_to_entry(copy)) {
10751 copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10752
10753 entry = tmp_entry;
10754
10755 if (entry->is_sub_map) {
10756 /* unnested when clipped earlier */
10757 assert(!entry->use_pmap);
10758 }
10759 if (entry == vm_map_to_entry(dst_map)) {
10760 vm_map_unlock(dst_map);
10761 return KERN_INVALID_ADDRESS;
10762 }
10763 size = (entry->vme_end - entry->vme_start);
10764 /*
10765 * Make sure that no holes popped up in the
10766 * address map, and that the protection is
10767 * still valid, in case the map was unlocked
10768 * earlier.
10769 */
10770
10771 if ((entry->vme_start != start) || ((entry->is_sub_map)
10772 && !entry->needs_copy)) {
10773 vm_map_unlock(dst_map);
10774 return KERN_INVALID_ADDRESS;
10775 }
10776 assert(entry != vm_map_to_entry(dst_map));
10777
10778 /*
10779 * Check protection again
10780 */
10781
10782 if (!(entry->protection & VM_PROT_WRITE)) {
10783 vm_map_unlock(dst_map);
10784 return KERN_PROTECTION_FAILURE;
10785 }
10786
10787 if (entry->is_sub_map) {
10788 /* not properly implemented */
10789 vm_map_unlock(dst_map);
10790 return KERN_PROTECTION_FAILURE;
10791 }
10792
10793 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10794 vm_map_unlock(dst_map);
10795 return KERN_PROTECTION_FAILURE;
10796 }
10797
10798 /*
10799 * If the entry is in transition, we must wait
10800 * for it to exit that state. Anything could happen
10801 * when we unlock the map, so start over.
10802 */
10803 if (entry->in_transition) {
10804 /*
10805 * Say that we are waiting, and wait for entry.
10806 */
10807 entry->needs_wakeup = TRUE;
10808 vm_map_entry_wait(dst_map, THREAD_UNINT);
10809
10810 goto RetryLookup;
10811 }
10812
10813 /*
10814 * Adjust to source size first
10815 */
10816
10817 if (copy_size < size) {
10818 if (entry->map_aligned &&
10819 !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10820 VM_MAP_PAGE_MASK(dst_map))) {
10821 /* no longer map-aligned */
10822 entry->map_aligned = FALSE;
10823 }
10824 vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10825 size = copy_size;
10826 }
10827
10828 /*
10829 * Adjust to destination size
10830 */
10831
10832 if (size < copy_size) {
10833 vm_map_copy_clip_end(copy, copy_entry,
10834 copy_entry->vme_start + size);
10835 copy_size = size;
10836 }
10837
10838 assert((entry->vme_end - entry->vme_start) == size);
10839 assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10840 assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10841
10842 /*
10843 * If the destination contains temporary unshared memory,
10844 * we can perform the copy by throwing it away and
10845 * installing the source data.
10846 *
10847 * Exceptions for mappings with special semantics:
10848 * + "permanent" entries,
10849 * + JIT regions,
10850 * + TPRO regions,
10851 * + pmap-specific protection policies,
10852 * + VM objects with COPY_NONE copy strategy.
10853 */
10854
10855 object = VME_OBJECT(entry);
10856 if ((!entry->is_shared &&
10857 !entry->vme_permanent &&
10858 !entry->used_for_jit &&
10859 #if __arm64e__
10860 !entry->used_for_tpro &&
10861 #endif /* __arm64e__ */
10862 !(entry->protection & VM_PROT_EXECUTE) &&
10863 !pmap_has_prot_policy(dst_map->pmap, entry->translated_allow_execute, entry->protection) &&
10864 ((object == VM_OBJECT_NULL) ||
10865 (object->internal &&
10866 !object->true_share &&
10867 object->copy_strategy != MEMORY_OBJECT_COPY_NONE))) ||
10868 entry->needs_copy) {
10869 vm_object_t old_object = VME_OBJECT(entry);
10870 vm_object_offset_t old_offset = VME_OFFSET(entry);
10871 vm_object_offset_t offset;
10872
10873 assert(!entry->is_sub_map);
10874 /*
10875 * Ensure that the source and destination aren't
10876 * identical
10877 */
10878 if (old_object == VME_OBJECT(copy_entry) &&
10879 old_offset == VME_OFFSET(copy_entry)) {
10880 vm_map_copy_entry_unlink(copy, copy_entry);
10881 vm_map_copy_entry_dispose(copy_entry);
10882
10883 if (old_object != VM_OBJECT_NULL) {
10884 vm_object_deallocate(old_object);
10885 }
10886
10887 start = tmp_entry->vme_end;
10888 tmp_entry = tmp_entry->vme_next;
10889 continue;
10890 }
10891
10892 #if XNU_TARGET_OS_OSX
10893 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10894 #define __TRADEOFF1_COPY_SIZE (128 * 1024) /* 128 KB */
10895 if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10896 VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10897 copy_size <= __TRADEOFF1_COPY_SIZE) {
10898 /*
10899 * Virtual vs. Physical copy tradeoff #1.
10900 *
10901 * Copying only a few pages out of a large
10902 * object: do a physical copy instead of
10903 * a virtual copy, to avoid possibly keeping
10904 * the entire large object alive because of
10905 * those few copy-on-write pages.
10906 */
10907 vm_map_copy_overwrite_aligned_src_large++;
10908 goto slow_copy;
10909 }
10910 #endif /* XNU_TARGET_OS_OSX */
10911
10912 if ((dst_map->pmap != kernel_pmap) &&
10913 (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10914 (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10915 vm_object_t new_object, new_shadow;
10916
10917 /*
10918 * We're about to map something over a mapping
10919 * established by malloc()...
10920 */
10921 new_object = VME_OBJECT(copy_entry);
10922 if (new_object != VM_OBJECT_NULL) {
10923 vm_object_lock_shared(new_object);
10924 }
10925 while (new_object != VM_OBJECT_NULL &&
10926 #if XNU_TARGET_OS_OSX
10927 !new_object->true_share &&
10928 new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10929 #endif /* XNU_TARGET_OS_OSX */
10930 new_object->internal) {
10931 new_shadow = new_object->shadow;
10932 if (new_shadow == VM_OBJECT_NULL) {
10933 break;
10934 }
10935 vm_object_lock_shared(new_shadow);
10936 vm_object_unlock(new_object);
10937 new_object = new_shadow;
10938 }
10939 if (new_object != VM_OBJECT_NULL) {
10940 if (!new_object->internal) {
10941 /*
10942 * The new mapping is backed
10943 * by an external object. We
10944 * don't want malloc'ed memory
10945 * to be replaced with such a
10946 * non-anonymous mapping, so
10947 * let's go off the optimized
10948 * path...
10949 */
10950 vm_map_copy_overwrite_aligned_src_not_internal++;
10951 vm_object_unlock(new_object);
10952 goto slow_copy;
10953 }
10954 #if XNU_TARGET_OS_OSX
10955 if (new_object->true_share ||
10956 new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10957 /*
10958 * Same if there's a "true_share"
10959 * object in the shadow chain, or
10960 * an object with a non-default
10961 * (SYMMETRIC) copy strategy.
10962 */
10963 vm_map_copy_overwrite_aligned_src_not_symmetric++;
10964 vm_object_unlock(new_object);
10965 goto slow_copy;
10966 }
10967 #endif /* XNU_TARGET_OS_OSX */
10968 vm_object_unlock(new_object);
10969 }
10970 /*
10971 * The new mapping is still backed by
10972 * anonymous (internal) memory, so it's
10973 * OK to substitute it for the original
10974 * malloc() mapping.
10975 */
10976 }
10977
10978 if (old_object != VM_OBJECT_NULL) {
10979 assert(!entry->vme_permanent);
10980 if (entry->is_sub_map) {
10981 if (entry->use_pmap) {
10982 #ifndef NO_NESTED_PMAP
10983 pmap_unnest(dst_map->pmap,
10984 (addr64_t)entry->vme_start,
10985 entry->vme_end - entry->vme_start);
10986 #endif /* NO_NESTED_PMAP */
10987 if (dst_map->mapped_in_other_pmaps) {
10988 /* clean up parent */
10989 /* map/maps */
10990 vm_map_submap_pmap_clean(
10991 dst_map, entry->vme_start,
10992 entry->vme_end,
10993 VME_SUBMAP(entry),
10994 VME_OFFSET(entry));
10995 }
10996 } else {
10997 vm_map_submap_pmap_clean(
10998 dst_map, entry->vme_start,
10999 entry->vme_end,
11000 VME_SUBMAP(entry),
11001 VME_OFFSET(entry));
11002 }
11003 vm_map_deallocate(VME_SUBMAP(entry));
11004 } else {
11005 if (dst_map->mapped_in_other_pmaps) {
11006 vm_object_pmap_protect_options(
11007 VME_OBJECT(entry),
11008 VME_OFFSET(entry),
11009 entry->vme_end
11010 - entry->vme_start,
11011 PMAP_NULL,
11012 PAGE_SIZE,
11013 entry->vme_start,
11014 VM_PROT_NONE,
11015 PMAP_OPTIONS_REMOVE);
11016 } else {
11017 pmap_remove_options(
11018 dst_map->pmap,
11019 (addr64_t)(entry->vme_start),
11020 (addr64_t)(entry->vme_end),
11021 PMAP_OPTIONS_REMOVE);
11022 }
11023 vm_object_deallocate(old_object);
11024 }
11025 }
11026
11027 if (entry->iokit_acct) {
11028 /* keep using iokit accounting */
11029 entry->use_pmap = FALSE;
11030 } else {
11031 /* use pmap accounting */
11032 entry->use_pmap = TRUE;
11033 }
11034 assert(!entry->vme_permanent);
11035 VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
11036 object = VME_OBJECT(entry);
11037 entry->needs_copy = copy_entry->needs_copy;
11038 entry->wired_count = 0;
11039 entry->user_wired_count = 0;
11040 offset = VME_OFFSET(copy_entry);
11041 VME_OFFSET_SET(entry, offset);
11042
11043 vm_map_copy_entry_unlink(copy, copy_entry);
11044 vm_map_copy_entry_dispose(copy_entry);
11045
11046 /*
11047 * we could try to push pages into the pmap at this point, BUT
11048 * this optimization only saved on average 2 us per page if ALL
11049 * the pages in the source were currently mapped
11050 * and ALL the pages in the dest were touched; if fewer
11051 * than 2/3 of the pages were touched, this optimization actually cost more cycles.
11052 * It also puts a lot of pressure on the pmap layer w.r.t. mapping structures.
11053 */
11054
11055 /*
11056 * Set up for the next iteration. The map
11057 * has not been unlocked, so the next
11058 * address should be at the end of this
11059 * entry, and the next map entry should be
11060 * the one following it.
11061 */
11062
11063 start = tmp_entry->vme_end;
11064 tmp_entry = tmp_entry->vme_next;
11065 } else {
11066 vm_map_version_t version;
11067 vm_object_t dst_object;
11068 vm_object_offset_t dst_offset;
11069 kern_return_t r;
11070
11071 slow_copy:
11072 if (entry->needs_copy) {
11073 VME_OBJECT_SHADOW(entry,
11074 (entry->vme_end -
11075 entry->vme_start),
11076 vm_map_always_shadow(dst_map));
11077 entry->needs_copy = FALSE;
11078 }
11079
11080 dst_object = VME_OBJECT(entry);
11081 dst_offset = VME_OFFSET(entry);
11082
11083 /*
11084 * Take an object reference, and record
11085 * the map version information so that the
11086 * map can be safely unlocked.
11087 */
11088
11089 if (dst_object == VM_OBJECT_NULL) {
11090 /*
11091 * We would usually have just taken the
11092 * optimized path above if the destination
11093 * object has not been allocated yet. But we
11094 * now disable that optimization if the copy
11095 * entry's object is not backed by anonymous
11096 * memory to avoid replacing malloc'ed
11097 * (i.e. re-usable) anonymous memory with a
11098 * not-so-anonymous mapping.
11099 * So we have to handle this case here and
11100 * allocate a new VM object for this map entry.
11101 */
11102 dst_object = vm_object_allocate(
11103 entry->vme_end - entry->vme_start);
11104 dst_offset = 0;
11105 VME_OBJECT_SET(entry, dst_object, false, 0);
11106 VME_OFFSET_SET(entry, dst_offset);
11107 assert(entry->use_pmap);
11108 }
11109
11110 vm_object_reference(dst_object);
11111
11112 /* account for unlock bumping up timestamp */
11113 version.main_timestamp = dst_map->timestamp + 1;
11114
11115 vm_map_unlock(dst_map);
11116
11117 /*
11118 * Copy as much as possible in one pass
11119 */
11120
11121 copy_size = size;
11122 r = vm_fault_copy(
11123 VME_OBJECT(copy_entry),
11124 VME_OFFSET(copy_entry),
11125 &copy_size,
11126 dst_object,
11127 dst_offset,
11128 dst_map,
11129 &version,
11130 THREAD_UNINT );
11131
11132 /*
11133 * Release the object reference
11134 */
11135
11136 vm_object_deallocate(dst_object);
11137
11138 /*
11139 * If a hard error occurred, return it now
11140 */
11141
11142 if (r != KERN_SUCCESS) {
11143 return r;
11144 }
11145
11146 if (copy_size != 0) {
11147 /*
11148 * Dispose of the copied region
11149 */
11150
11151 vm_map_copy_clip_end(copy, copy_entry,
11152 copy_entry->vme_start + copy_size);
11153 vm_map_copy_entry_unlink(copy, copy_entry);
11154 vm_object_deallocate(VME_OBJECT(copy_entry));
11155 vm_map_copy_entry_dispose(copy_entry);
11156 }
11157
11158 /*
11159 * Pick up in the destination map where we left off.
11160 *
11161 * Use the version information to avoid a lookup
11162 * in the normal case.
11163 */
11164
11165 start += copy_size;
11166 vm_map_lock(dst_map);
11167 if (version.main_timestamp == dst_map->timestamp &&
11168 copy_size != 0) {
11169 /* We can safely use saved tmp_entry value */
11170
11171 if (tmp_entry->map_aligned &&
11172 !VM_MAP_PAGE_ALIGNED(
11173 start,
11174 VM_MAP_PAGE_MASK(dst_map))) {
11175 /* no longer map-aligned */
11176 tmp_entry->map_aligned = FALSE;
11177 }
11178 vm_map_clip_end(dst_map, tmp_entry, start);
11179 tmp_entry = tmp_entry->vme_next;
11180 } else {
11181 /* Must do lookup of tmp_entry */
11182
11183 RetryLookup:
11184 if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
11185 vm_map_unlock(dst_map);
11186 return KERN_INVALID_ADDRESS;
11187 }
11188 if (tmp_entry->map_aligned &&
11189 !VM_MAP_PAGE_ALIGNED(
11190 start,
11191 VM_MAP_PAGE_MASK(dst_map))) {
11192 /* no longer map-aligned */
11193 tmp_entry->map_aligned = FALSE;
11194 }
11195 vm_map_clip_start(dst_map, tmp_entry, start);
11196 }
11197 }
11198 }/* while */
11199
11200 return KERN_SUCCESS;
11201 }/* vm_map_copy_overwrite_aligned */
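
/*
 * Illustrative sketch (not compiled): the "throw away and substitute" fast
 * path above is only taken when the destination entry is plain, unshared,
 * temporary anonymous memory (or still needs_copy).  This hypothetical
 * predicate restates the inline condition; the real check also consults
 * pmap_has_prot_policy() and, on arm64e, the TPRO bit.
 */
#if 0
static inline bool
vm_map_copy_overwrite_can_substitute_sketch(
	vm_map_entry_t          entry,
	vm_object_t             object)
{
	if (entry->needs_copy) {
		return true;
	}
	return !entry->is_shared &&
	       !entry->vme_permanent &&
	       !entry->used_for_jit &&
	       !(entry->protection & VM_PROT_EXECUTE) &&
	       (object == VM_OBJECT_NULL ||
	       (object->internal &&
	       !object->true_share &&
	       object->copy_strategy != MEMORY_OBJECT_COPY_NONE));
}
#endif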
11202
11203 /*
11204 * Routine: vm_map_copyin_kernel_buffer [internal use only]
11205 *
11206 * Description:
11207 * Copy in data to a kernel buffer from space in the
11208 * source map. The original space may be optionally
11209 * deallocated.
11210 *
11211 * If successful, returns a new copy object.
11212 */
11213 static kern_return_t
11214 vm_map_copyin_kernel_buffer(
11215 vm_map_t src_map,
11216 vm_map_offset_t src_addr,
11217 vm_map_size_t len,
11218 boolean_t src_destroy,
11219 vm_map_copy_t *copy_result)
11220 {
11221 kern_return_t kr;
11222 vm_map_copy_t copy;
11223 void *kdata;
11224
11225 if (len > msg_ool_size_small) {
11226 return KERN_INVALID_ARGUMENT;
11227 }
11228
11229 kdata = kalloc_data(len, Z_WAITOK);
11230 if (kdata == NULL) {
11231 return KERN_RESOURCE_SHORTAGE;
11232 }
11233 kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
11234 if (kr != KERN_SUCCESS) {
11235 kfree_data(kdata, len);
11236 return kr;
11237 }
11238
11239 copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
11240 copy->cpy_kdata = kdata;
11241 copy->size = len;
11242 copy->offset = 0;
11243
11244 if (src_destroy) {
11245 vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
11246
11247 if (src_map == kernel_map) {
11248 flags |= VM_MAP_REMOVE_KUNWIRE;
11249 }
11250
11251 (void)vm_map_remove_guard(src_map,
11252 vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11253 vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
11254 flags, KMEM_GUARD_NONE);
11255 }
11256
11257 *copy_result = copy;
11258 return KERN_SUCCESS;
11259 }
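/*
 * Illustrative sketch (not compiled): one way the small-copy helper above
 * might be exercised.  It copies at most "msg_ool_size_small" bytes from the
 * source map into a private kernel buffer wrapped in a
 * VM_MAP_COPY_KERNEL_BUFFER copy object.  "task_map", "uaddr" and "ulen" are
 * hypothetical placeholders, not symbols defined elsewhere in this file.
 */
#if 0
static kern_return_t
example_copyin_small(vm_map_t task_map, vm_map_offset_t uaddr, vm_map_size_t ulen)
{
	vm_map_copy_t   copy;
	kern_return_t   kr;

	/* only legal for small regions; larger ones take the entry-list path */
	if (ulen > msg_ool_size_small) {
		return KERN_INVALID_ARGUMENT;
	}

	/* src_destroy == FALSE: leave the source mapping in place */
	kr = vm_map_copyin_kernel_buffer(task_map, uaddr, ulen, FALSE, &copy);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	/* copy->cpy_kdata now holds a kernel copy of the data ... */

	/* ... discard the copy object once it is no longer needed */
	vm_map_copy_discard(copy);
	return KERN_SUCCESS;
}
#endif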
11260
11261 /*
11262 * Routine: vm_map_copyout_kernel_buffer [internal use only]
11263 *
11264 * Description:
11265 * Copy out data from a kernel buffer into space in the
11266  *	destination map.  The space may be optionally dynamically
11267 * allocated.
11268 *
11269 * If successful, consumes the copy object.
11270 * Otherwise, the caller is responsible for it.
11271 *
11272 * Callers of this function must call vm_map_copy_require on
11273 * previously created vm_map_copy_t or pass a newly created
11274 * one to ensure that it hasn't been forged.
11275 */
11276 static int vm_map_copyout_kernel_buffer_failures = 0;
11277 static kern_return_t
11278 vm_map_copyout_kernel_buffer(
11279 vm_map_t map,
11280 vm_map_address_t *addr, /* IN/OUT */
11281 vm_map_copy_t copy,
11282 vm_map_size_t copy_size,
11283 boolean_t overwrite,
11284 boolean_t consume_on_success)
11285 {
11286 kern_return_t kr = KERN_SUCCESS;
11287 thread_t thread = current_thread();
11288
11289 assert(copy->size == copy_size);
11290
11291 /*
11292 * check for corrupted vm_map_copy structure
11293 */
11294 if (copy_size > msg_ool_size_small || copy->offset) {
11295 panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
11296 (long long)copy->size, (long long)copy->offset);
11297 }
11298
11299 if (!overwrite) {
11300 /*
11301 * Allocate space in the target map for the data
11302 */
11303 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11304
11305 if (map == kernel_map) {
11306 vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
11307 }
11308
11309 *addr = 0;
11310 kr = vm_map_enter(map,
11311 addr,
11312 vm_map_round_page(copy_size,
11313 VM_MAP_PAGE_MASK(map)),
11314 (vm_map_offset_t) 0,
11315 vmk_flags,
11316 VM_OBJECT_NULL,
11317 (vm_object_offset_t) 0,
11318 FALSE,
11319 VM_PROT_DEFAULT,
11320 VM_PROT_ALL,
11321 VM_INHERIT_DEFAULT);
11322 if (kr != KERN_SUCCESS) {
11323 return kr;
11324 }
11325 #if KASAN
11326 if (map->pmap == kernel_pmap) {
11327 kasan_notify_address(*addr, copy->size);
11328 }
11329 #endif
11330 }
11331
11332 /*
11333 * Copyout the data from the kernel buffer to the target map.
11334 */
11335 if (thread->map == map) {
11336 /*
11337 * If the target map is the current map, just do
11338 * the copy.
11339 */
11340 assert((vm_size_t)copy_size == copy_size);
11341 if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11342 kr = KERN_INVALID_ADDRESS;
11343 }
11344 } else {
11345 vm_map_switch_context_t switch_ctx;
11346
11347 /*
11348 * If the target map is another map, assume the
11349 * target's address space identity for the duration
11350 * of the copy.
11351 */
11352 vm_map_reference(map);
11353 switch_ctx = vm_map_switch_to(map);
11354
11355 assert((vm_size_t)copy_size == copy_size);
11356 if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11357 vm_map_copyout_kernel_buffer_failures++;
11358 kr = KERN_INVALID_ADDRESS;
11359 }
11360
11361 vm_map_switch_back(switch_ctx);
11362 vm_map_deallocate(map);
11363 }
11364
11365 if (kr != KERN_SUCCESS) {
11366 /* the copy failed, clean up */
11367 if (!overwrite) {
11368 /*
11369 * Deallocate the space we allocated in the target map.
11370 */
11371 (void) vm_map_remove(map,
11372 vm_map_trunc_page(*addr,
11373 VM_MAP_PAGE_MASK(map)),
11374 vm_map_round_page((*addr +
11375 vm_map_round_page(copy_size,
11376 VM_MAP_PAGE_MASK(map))),
11377 VM_MAP_PAGE_MASK(map)));
11378 *addr = 0;
11379 }
11380 } else {
11381 		/* copy was successful, discard the copy structure */
11382 if (consume_on_success) {
11383 kfree_data(copy->cpy_kdata, copy_size);
11384 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11385 }
11386 }
11387
11388 return kr;
11389 }
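/*
 * Illustrative sketch (not compiled): the address-space switch pattern used
 * above when the destination map is not the current thread's map.  The
 * kernel takes a reference on the target map, temporarily assumes its
 * identity so that copyout() resolves user addresses in that map, then
 * switches back.  "dst_map", "uaddr", "kbuf" and "nbytes" are hypothetical
 * placeholders.
 */
#if 0
static kern_return_t
example_copyout_to_foreign_map(vm_map_t dst_map, vm_map_address_t uaddr,
    const void *kbuf, vm_size_t nbytes)
{
	vm_map_switch_context_t switch_ctx;
	kern_return_t           kr = KERN_SUCCESS;

	vm_map_reference(dst_map);              /* keep the map alive */
	switch_ctx = vm_map_switch_to(dst_map); /* adopt its address space */

	if (copyout(kbuf, uaddr, nbytes)) {
		kr = KERN_INVALID_ADDRESS;
	}

	vm_map_switch_back(switch_ctx);         /* restore the original map */
	vm_map_deallocate(dst_map);             /* drop the extra reference */
	return kr;
}
#endif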
11390
11391 /*
11392 * Routine: vm_map_copy_insert [internal use only]
11393 *
11394 * Description:
11395 * Link a copy chain ("copy") into a map at the
11396 * specified location (after "where").
11397 *
11398 * Callers of this function must call vm_map_copy_require on
11399 * previously created vm_map_copy_t or pass a newly created
11400 * one to ensure that it hasn't been forged.
11401 * Side effects:
11402 * The copy chain is destroyed.
11403 */
11404 static void
11405 vm_map_copy_insert(
11406 vm_map_t map,
11407 vm_map_entry_t after_where,
11408 vm_map_copy_t copy)
11409 {
11410 vm_map_entry_t entry;
11411
11412 while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11413 entry = vm_map_copy_first_entry(copy);
11414 vm_map_copy_entry_unlink(copy, entry);
11415 vm_map_store_entry_link(map, after_where, entry,
11416 VM_MAP_KERNEL_FLAGS_NONE);
11417 after_where = entry;
11418 }
11419 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11420 }
11421
11422 /*
11423 * Callers of this function must call vm_map_copy_require on
11424 * previously created vm_map_copy_t or pass a newly created
11425 * one to ensure that it hasn't been forged.
11426 */
11427 void
11428 vm_map_copy_remap(
11429 vm_map_t map,
11430 vm_map_entry_t where,
11431 vm_map_copy_t copy,
11432 vm_map_offset_t adjustment,
11433 vm_prot_t cur_prot,
11434 vm_prot_t max_prot,
11435 vm_inherit_t inheritance)
11436 {
11437 vm_map_entry_t copy_entry, new_entry;
11438
11439 for (copy_entry = vm_map_copy_first_entry(copy);
11440 copy_entry != vm_map_copy_to_entry(copy);
11441 copy_entry = copy_entry->vme_next) {
11442 /* get a new VM map entry for the map */
11443 new_entry = vm_map_entry_create(map);
11444 /* copy the "copy entry" to the new entry */
11445 vm_map_entry_copy(map, new_entry, copy_entry);
11446 /* adjust "start" and "end" */
11447 new_entry->vme_start += adjustment;
11448 new_entry->vme_end += adjustment;
11449 /* clear some attributes */
11450 new_entry->inheritance = inheritance;
11451 new_entry->protection = cur_prot;
11452 new_entry->max_protection = max_prot;
11453 new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11454 /* take an extra reference on the entry's "object" */
11455 if (new_entry->is_sub_map) {
11456 assert(!new_entry->use_pmap); /* not nested */
11457 vm_map_reference(VME_SUBMAP(new_entry));
11458 } else {
11459 vm_object_reference(VME_OBJECT(new_entry));
11460 }
11461 /* insert the new entry in the map */
11462 vm_map_store_entry_link(map, where, new_entry,
11463 VM_MAP_KERNEL_FLAGS_NONE);
11464 /* continue inserting the "copy entries" after the new entry */
11465 where = new_entry;
11466 }
11467 }
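/*
 * Illustrative sketch (not compiled): the difference between the two linking
 * helpers above.  vm_map_copy_insert() moves the entries out of "copy" and
 * frees the copy header, while vm_map_copy_remap() clones each entry (taking
 * extra object/submap references) and leaves "copy" untouched.  Both expect
 * the destination map locked and "where" to be the entry to link after;
 * "map", "where", "copy", "adjust" and "consume" are hypothetical placeholders.
 */
#if 0
static void
example_link_copy(vm_map_t map, vm_map_entry_t where, vm_map_copy_t copy,
    vm_map_offset_t adjust, boolean_t consume)
{
	vm_map_lock(map);
	if (consume) {
		/* entries are moved into "map"; the copy header is freed */
		vm_map_copy_insert(map, where, copy);
	} else {
		/* entries are cloned; "copy" remains owned by the caller */
		vm_map_copy_remap(map, where, copy, adjust,
		    VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
	}
	vm_map_unlock(map);
}
#endif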
11468
11469
11470 /*
11471 * Returns true if *size matches (or is in the range of) copy->size.
11472 * Upon returning true, the *size field is updated with the actual size of the
11473 * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11474 */
11475 boolean_t
11476 vm_map_copy_validate_size(
11477 vm_map_t dst_map,
11478 vm_map_copy_t copy,
11479 vm_map_size_t *size)
11480 {
11481 if (copy == VM_MAP_COPY_NULL) {
11482 return FALSE;
11483 }
11484
11485 /*
11486 * Assert that the vm_map_copy is coming from the right
11487 * zone and hasn't been forged
11488 */
11489 vm_map_copy_require(copy);
11490
11491 vm_map_size_t copy_sz = copy->size;
11492 vm_map_size_t sz = *size;
11493 switch (copy->type) {
11494 case VM_MAP_COPY_KERNEL_BUFFER:
11495 if (sz == copy_sz) {
11496 return TRUE;
11497 }
11498 break;
11499 case VM_MAP_COPY_ENTRY_LIST:
11500 /*
11501 * potential page-size rounding prevents us from exactly
11502 * validating this flavor of vm_map_copy, but we can at least
11503 * assert that it's within a range.
11504 */
11505 if (copy_sz >= sz &&
11506 copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11507 *size = copy_sz;
11508 return TRUE;
11509 }
11510 break;
11511 default:
11512 break;
11513 }
11514 return FALSE;
11515 }
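/*
 * Illustrative sketch (not compiled): validating a caller-supplied size
 * against a copy object before mapping it, in the spirit of an IPC receiver.
 * For entry-list copies, "expected" may be updated to the page-rounded
 * actual size.  "dst_map", "copy" and "claimed_size" are hypothetical
 * placeholders.
 */
#if 0
static kern_return_t
example_check_copy_size(vm_map_t dst_map, vm_map_copy_t copy,
    vm_map_size_t claimed_size)
{
	vm_map_size_t expected = claimed_size;   /* size the sender claimed */

	if (!vm_map_copy_validate_size(dst_map, copy, &expected)) {
		/* size mismatch: refuse to map the copy */
		return KERN_INVALID_ARGUMENT;
	}
	/* "expected" now holds the actual size of the copy object */
	return KERN_SUCCESS;
}
#endif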
11516
11517 static kern_return_t
11518 vm_map_copyout_internal(
11519 vm_map_t dst_map,
11520 vm_map_address_t *dst_addr, /* OUT */
11521 vm_map_copy_t copy,
11522 vm_map_size_ut copy_size_u,
11523 boolean_t consume_on_success,
11524 vm_prot_t cur_protection,
11525 vm_prot_t max_protection,
11526 vm_inherit_t inheritance)
11527 {
11528 vm_map_size_t size, copy_size;
11529 vm_map_size_t adjustment;
11530 vm_map_offset_t start;
11531 vm_object_offset_t vm_copy_start;
11532 vm_map_entry_t last;
11533 vm_map_entry_t entry;
11534 vm_map_copy_t original_copy;
11535 kern_return_t kr;
11536 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11537
11538 /*
11539 * Check for null copy object.
11540 */
11541
11542 if (copy == VM_MAP_COPY_NULL) {
11543 *dst_addr = 0;
11544 return KERN_SUCCESS;
11545 }
11546
11547 /*
11548 * Assert that the vm_map_copy is coming from the right
11549 * zone and hasn't been forged
11550 */
11551 vm_map_copy_require(copy);
11552
11553 if (!VM_SANITIZE_UNSAFE_IS_EQUAL(copy_size_u, copy->size)) {
11554 *dst_addr = 0;
11555 ktriage_record(thread_tid(current_thread()),
11556 KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
11557 KDBG_TRIAGE_RESERVED,
11558 KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR),
11559 KERN_FAILURE /* arg */);
11560 return KERN_FAILURE;
11561 }
11562 copy_size = copy->size;
11563
11564 /*
11565 * Check for special kernel buffer allocated
11566 * by new_ipc_kmsg_copyin.
11567 */
11568
11569 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
11570 kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr,
11571 copy, copy_size, FALSE,
11572 consume_on_success);
11573 if (kr) {
11574 ktriage_record(thread_tid(current_thread()),
11575 KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
11576 KDBG_TRIAGE_RESERVED,
11577 KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */);
11578 }
11579 return kr;
11580 }
11581
11582
11583 original_copy = copy;
11584 if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
11585 vm_map_copy_t target_copy;
11586 vm_map_offset_t overmap_start, overmap_end, trimmed_start;
11587
11588 target_copy = VM_MAP_COPY_NULL;
11589 DEBUG4K_ADJUST("adjusting...\n");
11590 kr = vm_map_copy_adjust_to_target(
11591 copy,
11592 0, /* offset */
11593 copy->size, /* size */
11594 dst_map,
11595 TRUE, /* copy */
11596 &target_copy,
11597 &overmap_start,
11598 &overmap_end,
11599 &trimmed_start);
11600 if (kr != KERN_SUCCESS) {
11601 DEBUG4K_COPY("adjust failed 0x%x\n", kr);
11602 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */);
11603 return kr;
11604 }
11605 DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
11606 if (target_copy != copy) {
11607 copy = target_copy;
11608 }
11609 copy_size = copy->size;
11610 }
11611
11612 /*
11613 * Find space for the data
11614 */
11615
11616 vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
11617 VM_MAP_COPY_PAGE_MASK(copy));
11618 size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
11619 VM_MAP_COPY_PAGE_MASK(copy))
11620 - vm_copy_start;
11621
11622 vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map, size);
11623
11624 vm_map_lock(dst_map);
11625 kr = vm_map_locate_space_anywhere(dst_map, size, 0, vmk_flags,
11626 &start, &last);
11627 if (kr != KERN_SUCCESS) {
11628 vm_map_unlock(dst_map);
11629 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */);
11630 return kr;
11631 }
11632
11633 adjustment = start - vm_copy_start;
11634 if (!consume_on_success) {
11635 /*
11636 * We're not allowed to consume "copy", so we'll have to
11637 * copy its map entries into the destination map below.
11638 * No need to re-allocate map entries from the correct
11639 * (pageable or not) zone, since we'll get new map entries
11640 * during the transfer.
11641 		 * We'll also adjust the map entries' "start" and "end"
11642 * during the transfer, to keep "copy"'s entries consistent
11643 * with its "offset".
11644 */
11645 goto after_adjustments;
11646 }
11647
11648 /*
11649 * Since we're going to just drop the map
11650 * entries from the copy into the destination
11651 * map, they must come from the same pool.
11652 */
11653
11654 if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
11655 /*
11656 * Mismatches occur when dealing with the default
11657 * pager.
11658 */
11659 vm_map_entry_t next, new;
11660
11661 /*
11662 * Find the zone that the copies were allocated from
11663 */
11664
11665 entry = vm_map_copy_first_entry(copy);
11666
11667 /*
11668 * Reinitialize the copy so that vm_map_copy_entry_link
11669 * will work.
11670 */
11671 vm_map_store_copy_reset(copy, entry);
11672 copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;
11673
11674 /*
11675 * Copy each entry.
11676 */
11677 while (entry != vm_map_copy_to_entry(copy)) {
11678 new = vm_map_copy_entry_create(copy);
11679 vm_map_entry_copy_full(new, entry);
11680 new->vme_no_copy_on_read = FALSE;
11681 assert(!new->iokit_acct);
11682 if (new->is_sub_map) {
11683 /* clr address space specifics */
11684 new->use_pmap = FALSE;
11685 }
11686 vm_map_copy_entry_link(copy,
11687 vm_map_copy_last_entry(copy),
11688 new);
11689 next = entry->vme_next;
11690 vm_map_entry_dispose(entry);
11691 entry = next;
11692 }
11693 }
11694
11695 /*
11696 * Adjust the addresses in the copy chain, and
11697 * reset the region attributes.
11698 */
11699
11700 for (entry = vm_map_copy_first_entry(copy);
11701 entry != vm_map_copy_to_entry(copy);
11702 entry = entry->vme_next) {
11703 if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
11704 /*
11705 * We're injecting this copy entry into a map that
11706 * has the standard page alignment, so clear
11707 * "map_aligned" (which might have been inherited
11708 * from the original map entry).
11709 */
11710 entry->map_aligned = FALSE;
11711 }
11712
11713 entry->vme_start += adjustment;
11714 entry->vme_end += adjustment;
11715
11716 if (entry->map_aligned) {
11717 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
11718 VM_MAP_PAGE_MASK(dst_map)));
11719 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
11720 VM_MAP_PAGE_MASK(dst_map)));
11721 }
11722
11723 entry->inheritance = VM_INHERIT_DEFAULT;
11724 entry->protection = VM_PROT_DEFAULT;
11725 entry->max_protection = VM_PROT_ALL;
11726 entry->behavior = VM_BEHAVIOR_DEFAULT;
11727
11728 /*
11729 * If the entry is now wired,
11730 * map the pages into the destination map.
11731 */
11732 if (entry->wired_count != 0) {
11733 vm_map_offset_t va;
11734 vm_object_offset_t offset;
11735 vm_object_t object;
11736 vm_prot_t prot;
11737 int type_of_fault;
11738 uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;
11739
11740 /* TODO4K would need to use actual page size */
11741 assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);
11742
11743 object = VME_OBJECT(entry);
11744 offset = VME_OFFSET(entry);
11745 va = entry->vme_start;
11746
11747 pmap_pageable(dst_map->pmap,
11748 entry->vme_start,
11749 entry->vme_end,
11750 TRUE);
11751
11752 while (va < entry->vme_end) {
11753 vm_page_t m;
11754 struct vm_object_fault_info fault_info = {
11755 .interruptible = THREAD_UNINT,
11756 };
11757
11758 /*
11759 * Look up the page in the object.
11760 * Assert that the page will be found in the
11761 * top object:
11762 * either
11763 * the object was newly created by
11764 * vm_object_copy_slowly, and has
11765 * copies of all of the pages from
11766 * the source object
11767 * or
11768 * the object was moved from the old
11769 * map entry; because the old map
11770 * entry was wired, all of the pages
11771 * were in the top-level object.
11772 * (XXX not true if we wire pages for
11773 * reading)
11774 */
11775 vm_object_lock(object);
11776
11777 m = vm_page_lookup(object, offset);
11778 if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
11779 m->vmp_absent) {
11780 panic("vm_map_copyout: wiring %p", m);
11781 }
11782
11783 prot = entry->protection;
11784
11785 if (override_nx(dst_map, VME_ALIAS(entry)) &&
11786 prot) {
11787 prot |= VM_PROT_EXECUTE;
11788 }
11789
11790 type_of_fault = DBG_CACHE_HIT_FAULT;
11791
11792 fault_info.user_tag = VME_ALIAS(entry);
11793 fault_info.pmap_options = 0;
11794 if (entry->iokit_acct ||
11795 (!entry->is_sub_map && !entry->use_pmap)) {
11796 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
11797 }
11798 if (entry->vme_xnu_user_debug &&
11799 !VM_PAGE_OBJECT(m)->code_signed) {
11800 /*
11801 * Modified code-signed executable
11802 * region: this page does not belong
11803 * to a code-signed VM object, so it
11804 * must have been copied and should
11805 * therefore be typed XNU_USER_DEBUG
11806 * rather than XNU_USER_EXEC.
11807 */
11808 fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
11809 }
11810
11811 vm_fault_enter(m,
11812 dst_map->pmap,
11813 va,
11814 PAGE_SIZE, 0,
11815 prot,
11816 prot,
11817 VM_PAGE_WIRED(m),
11818 VM_KERN_MEMORY_NONE, /* tag - not wiring */
11819 &fault_info,
11820 NULL, /* need_retry */
11821 &type_of_fault,
11822 &object_lock_type); /*Exclusive mode lock. Will remain unchanged.*/
11823
11824 vm_object_unlock(object);
11825
11826 offset += PAGE_SIZE_64;
11827 va += PAGE_SIZE;
11828 }
11829 }
11830 }
11831
11832 after_adjustments:
11833
11834 /*
11835 * Correct the page alignment for the result
11836 */
11837
11838 *dst_addr = start + (copy->offset - vm_copy_start);
11839
11840 #if KASAN
11841 kasan_notify_address(*dst_addr, size);
11842 #endif
11843
11844 /*
11845 * Update the hints and the map size
11846 */
11847
11848 if (consume_on_success) {
11849 SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
11850 } else {
11851 SAVE_HINT_MAP_WRITE(dst_map, last);
11852 }
11853
11854 dst_map->size += size;
11855
11856 /*
11857 * Link in the copy
11858 */
11859
11860 if (consume_on_success) {
11861 vm_map_copy_insert(dst_map, last, copy);
11862 if (copy != original_copy) {
11863 vm_map_copy_discard(original_copy);
11864 original_copy = VM_MAP_COPY_NULL;
11865 }
11866 } else {
11867 vm_map_copy_remap(dst_map, last, copy, adjustment,
11868 cur_protection, max_protection,
11869 inheritance);
11870 if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
11871 vm_map_copy_discard(copy);
11872 copy = original_copy;
11873 }
11874 }
11875
11876
11877 vm_map_unlock(dst_map);
11878
11879 /*
11880 * XXX If wiring_required, call vm_map_pageable
11881 */
11882
11883 return KERN_SUCCESS;
11884 }
11885
11886 /*
11887 * Routine: vm_map_copyout_size
11888 *
11889 * Description:
11890 * Copy out a copy chain ("copy") into newly-allocated
11891 * space in the destination map. Uses a prevalidated
11892 * size for the copy object (vm_map_copy_validate_size).
11893 *
11894 * If successful, consumes the copy object.
11895 * Otherwise, the caller is responsible for it.
11896 */
11897 kern_return_t
11898 vm_map_copyout_size(
11899 vm_map_t dst_map,
11900 vm_map_address_t *dst_addr, /* OUT */
11901 vm_map_copy_t copy,
11902 vm_map_size_ut copy_size)
11903 {
11904 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11905 TRUE, /* consume_on_success */
11906 VM_PROT_DEFAULT,
11907 VM_PROT_ALL,
11908 VM_INHERIT_DEFAULT);
11909 }
11910
11911 /*
11912 * Routine: vm_map_copyout
11913 *
11914 * Description:
11915 * Copy out a copy chain ("copy") into newly-allocated
11916 * space in the destination map.
11917 *
11918 * If successful, consumes the copy object.
11919 * Otherwise, the caller is responsible for it.
11920 */
11921 kern_return_t
11922 vm_map_copyout(
11923 vm_map_t dst_map,
11924 vm_map_address_t *dst_addr, /* OUT */
11925 vm_map_copy_t copy)
11926 {
11927 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11928 TRUE, /* consume_on_success */
11929 VM_PROT_DEFAULT,
11930 VM_PROT_ALL,
11931 VM_INHERIT_DEFAULT);
11932 }
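/*
 * Illustrative sketch (not compiled): copying a copy chain out into a
 * destination map and honoring the consumption rule documented above:
 * on success the copy object is consumed, on failure the caller still owns
 * it and must discard it.  "dst_map", "copy" and "out_addr" are hypothetical
 * placeholders.
 */
#if 0
static kern_return_t
example_copyout_chain(vm_map_t dst_map, vm_map_copy_t copy,
    vm_map_address_t *out_addr)
{
	kern_return_t kr;

	kr = vm_map_copyout(dst_map, out_addr, copy);
	if (kr != KERN_SUCCESS) {
		/* on failure the copy is NOT consumed; drop it ourselves */
		vm_map_copy_discard(copy);
		return kr;
	}
	/* on success the copy has been consumed and must not be reused */
	return KERN_SUCCESS;
}
#endif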
11933
11934 /*
11935 * Routine: vm_map_copyin
11936 *
11937 * Description:
11938 * see vm_map_copyin_common. Exported via Unsupported.exports.
11939 *
11940 */
11941 kern_return_t
11942 vm_map_copyin(
11943 vm_map_t src_map,
11944 vm_map_address_ut src_addr,
11945 vm_map_size_ut len,
11946 boolean_t src_destroy,
11947 vm_map_copy_t *copy_result) /* OUT */
11948 {
11949 return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11950 FALSE, copy_result, FALSE);
11951 }
11952
11953 /*
11954 * Routine: vm_map_copyin_common
11955 *
11956 * Description:
11957 * Copy the specified region (src_addr, len) from the
11958 * source address space (src_map), possibly removing
11959 * the region from the source address space (src_destroy).
11960 *
11961 * Returns:
11962 * A vm_map_copy_t object (copy_result), suitable for
11963 * insertion into another address space (using vm_map_copyout),
11964 * copying over another address space region (using
11965 * vm_map_copy_overwrite). If the copy is unused, it
11966 * should be destroyed (using vm_map_copy_discard).
11967 *
11968 * In/out conditions:
11969 * The source map should not be locked on entry.
11970 */
11971
11972 typedef struct submap_map {
11973 vm_map_t parent_map;
11974 vm_map_offset_t base_start;
11975 vm_map_offset_t base_end;
11976 vm_map_size_t base_len;
11977 struct submap_map *next;
11978 } submap_map_t;
11979
11980 kern_return_t
11981 vm_map_copyin_common(
11982 vm_map_t src_map,
11983 vm_map_address_ut src_addr,
11984 vm_map_size_ut len,
11985 boolean_t src_destroy,
11986 __unused boolean_t src_volatile,
11987 vm_map_copy_t *copy_result, /* OUT */
11988 boolean_t use_maxprot)
11989 {
11990 int flags;
11991
11992 flags = 0;
11993 if (src_destroy) {
11994 flags |= VM_MAP_COPYIN_SRC_DESTROY;
11995 }
11996 if (use_maxprot) {
11997 flags |= VM_MAP_COPYIN_USE_MAXPROT;
11998 }
11999 return vm_map_copyin_internal(src_map,
12000 src_addr,
12001 len,
12002 flags,
12003 copy_result);
12004 }
12005
12006 static __attribute__((always_inline, warn_unused_result))
12007 kern_return_t
12008 vm_map_copyin_sanitize(
12009 vm_map_t src_map,
12010 vm_map_address_ut src_addr_u,
12011 vm_map_size_ut len_u,
12012 vm_map_offset_t *src_start,
12013 vm_map_offset_t *src_end,
12014 vm_map_size_t *len,
12015 vm_map_offset_t *src_addr_unaligned)
12016 {
12017 kern_return_t kr;
12018 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS |
12019 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES |
12020 VM_SANITIZE_FLAGS_CHECK_ADDR_RANGE;
12021
12022 #if KASAN_TBI
12023 if (vm_kernel_map_is_kernel(src_map)) {
12024 flags |= VM_SANITIZE_FLAGS_CANONICALIZE;
12025 }
12026 #endif /* KASAN_TBI */
12027
12028 kr = vm_sanitize_addr_size(src_addr_u, len_u,
12029 VM_SANITIZE_CALLER_VM_MAP_COPYIN,
12030 src_map,
12031 flags,
12032 src_start, src_end, len);
12033 if (__improbable(kr != KERN_SUCCESS)) {
12034 return kr;
12035 }
12036
12037 /*
12038 * Compute (page aligned) start and end of region
12039 */
12040 *src_addr_unaligned = *src_start; /* remember unaligned value */
12041 *src_start = vm_map_trunc_page(*src_addr_unaligned,
12042 VM_MAP_PAGE_MASK(src_map));
12043 *src_end = vm_map_round_page(*src_end, VM_MAP_PAGE_MASK(src_map));
12044 return KERN_SUCCESS;
12045 }
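/*
 * Illustrative sketch (not compiled): a worked instance of the page rounding
 * performed above.  Assuming a 16K source map (VM_MAP_PAGE_MASK == 0x3FFF),
 * a request for [0x5123, 0x5123 + 0x100) is widened to the map-aligned range
 * [0x4000, 0x8000), while "src_addr_unaligned" remembers the original 0x5123.
 * The literal addresses are hypothetical.
 */
#if 0
	vm_map_offset_t src_addr_unaligned = 0x5123;
	vm_map_offset_t src_start = vm_map_trunc_page(0x5123, 0x3FFF); /* 0x4000 */
	vm_map_offset_t src_end   = vm_map_round_page(0x5223, 0x3FFF); /* 0x8000 */
#endif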
12046
12047 kern_return_t
12048 vm_map_copyin_internal(
12049 vm_map_t src_map,
12050 vm_map_address_ut src_addr_u,
12051 vm_map_size_ut len_u,
12052 int flags,
12053 vm_map_copy_t *copy_result) /* OUT */
12054 {
12055 vm_map_entry_t tmp_entry; /* Result of last map lookup --
12056 * in multi-level lookup, this
12057 * entry contains the actual
12058 * vm_object/offset.
12059 */
12060 vm_map_entry_t new_entry = VM_MAP_ENTRY_NULL; /* Map entry for copy */
12061
12062 vm_map_offset_t src_start; /* Start of current entry --
12063 * where copy is taking place now
12064 */
12065 vm_map_offset_t src_end; /* End of entire region to be
12066 * copied */
12067 vm_map_offset_t src_addr_unaligned;
12068 vm_map_offset_t src_base;
12069 vm_map_size_t len;
12070 vm_map_t base_map = src_map;
12071 boolean_t map_share = FALSE;
12072 submap_map_t *parent_maps = NULL;
12073
12074 vm_map_copy_t copy; /* Resulting copy */
12075 vm_map_address_t copy_addr;
12076 vm_map_size_t copy_size;
12077 boolean_t src_destroy;
12078 boolean_t use_maxprot;
12079 boolean_t preserve_purgeable;
12080 boolean_t entry_was_shared;
12081 vm_map_entry_t saved_src_entry;
12082 kern_return_t kr;
12083
12084 if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
12085 return KERN_INVALID_ARGUMENT;
12086 }
12087
12088 /*
12089 * Check for copies of zero bytes.
12090 */
12091 if (VM_SANITIZE_UNSAFE_IS_ZERO(len_u)) {
12092 *copy_result = VM_MAP_COPY_NULL;
12093 return KERN_SUCCESS;
12094 }
12095
12096 /*
12097 * Sanitize any input parameters that are addr/size/prot/inherit
12098 */
12099 kr = vm_map_copyin_sanitize(
12100 src_map,
12101 src_addr_u,
12102 len_u,
12103 &src_start,
12104 &src_end,
12105 &len,
12106 &src_addr_unaligned);
12107 if (__improbable(kr != KERN_SUCCESS)) {
12108 return vm_sanitize_get_kr(kr);
12109 }
12110
12111
12112 src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
12113 use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
12114 preserve_purgeable =
12115 (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
12116
12117 /*
12118 * If the copy is sufficiently small, use a kernel buffer instead
12119 * of making a virtual copy. The theory being that the cost of
12120 * setting up VM (and taking C-O-W faults) dominates the copy costs
12121 * for small regions.
12122 */
12123 if ((len <= msg_ool_size_small) &&
12124 !use_maxprot &&
12125 !preserve_purgeable &&
12126 !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
12127 /*
12128 * Since the "msg_ool_size_small" threshold was increased and
12129 * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
12130 * address space limits, we revert to doing a virtual copy if the
12131 * copied range goes beyond those limits. Otherwise, mach_vm_read()
12132 * of the commpage would now fail when it used to work.
12133 */
12134 (src_start >= vm_map_min(src_map) &&
12135 src_start < vm_map_max(src_map) &&
12136 src_end >= vm_map_min(src_map) &&
12137 src_end < vm_map_max(src_map))) {
12138 return vm_map_copyin_kernel_buffer(src_map, src_addr_unaligned, len,
12139 src_destroy, copy_result);
12140 }
12141
12142 /*
12143 * Allocate a header element for the list.
12144 *
12145 * Use the start and end in the header to
12146 * remember the endpoints prior to rounding.
12147 */
12148
12149 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12150 copy->cpy_hdr.entries_pageable = TRUE;
12151 copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
12152 copy->offset = src_addr_unaligned;
12153 copy->size = len;
12154
12155 new_entry = vm_map_copy_entry_create(copy);
12156
12157 #define RETURN(x) \
12158 MACRO_BEGIN \
12159 vm_map_unlock(src_map); \
12160 if(src_map != base_map) \
12161 vm_map_deallocate(src_map); \
12162 if (new_entry != VM_MAP_ENTRY_NULL) \
12163 vm_map_copy_entry_dispose(new_entry); \
12164 vm_map_copy_discard(copy); \
12165 { \
12166 submap_map_t *_ptr; \
12167 \
12168 for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
12169 parent_maps=parent_maps->next; \
12170 if (_ptr->parent_map != base_map) \
12171 vm_map_deallocate(_ptr->parent_map); \
12172 kfree_type(submap_map_t, _ptr); \
12173 } \
12174 } \
12175 MACRO_RETURN(x); \
12176 MACRO_END
12177
12178 /*
12179 * Find the beginning of the region.
12180 */
12181
12182 vm_map_lock(src_map);
12183
12184 /*
12185 * Lookup the original "src_addr_unaligned" rather than the truncated
12186 * "src_start", in case "src_start" falls in a non-map-aligned
12187 * map entry *before* the map entry that contains "src_addr_unaligned"...
12188 */
12189 if (!vm_map_lookup_entry(src_map, src_addr_unaligned, &tmp_entry)) {
12190 RETURN(KERN_INVALID_ADDRESS);
12191 }
12192 if (!tmp_entry->is_sub_map) {
12193 /*
12194 * ... but clip to the map-rounded "src_start" rather than
12195 * "src_addr_unaligned" to preserve map-alignment. We'll adjust the
12196 * first copy entry at the end, if needed.
12197 */
12198 vm_map_clip_start(src_map, tmp_entry, src_start);
12199 }
12200 if (src_start < tmp_entry->vme_start) {
12201 /*
12202 * Move "src_start" up to the start of the
12203 * first map entry to copy.
12204 */
12205 src_start = tmp_entry->vme_start;
12206 }
12207 /* set for later submap fix-up */
12208 copy_addr = src_start;
12209
12210 /*
12211 * Go through entries until we get to the end.
12212 */
12213
12214 while (TRUE) {
12215 vm_map_entry_t src_entry = tmp_entry; /* Top-level entry */
12216 vm_map_size_t src_size; /* Size of source
12217 * map entry (in both
12218 * maps)
12219 */
12220
12221 vm_object_t src_object; /* Object to copy */
12222 vm_object_offset_t src_offset;
12223
12224 vm_object_t new_copy_object;/* vm_object_copy_* result */
12225
12226 boolean_t src_needs_copy; /* Should source map
12227 * be made read-only
12228 * for copy-on-write?
12229 */
12230
12231 boolean_t new_entry_needs_copy; /* Will new entry be COW? */
12232
12233 boolean_t was_wired; /* Was source wired? */
12234 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
12235 vm_map_version_t version; /* Version before locks
12236 * dropped to make copy
12237 */
12238 kern_return_t result; /* Return value from
12239 * copy_strategically.
12240 */
12241 while (tmp_entry->is_sub_map) {
12242 vm_map_size_t submap_len;
12243 submap_map_t *ptr;
12244
12245 ptr = kalloc_type(submap_map_t, Z_WAITOK);
12246 ptr->next = parent_maps;
12247 parent_maps = ptr;
12248 ptr->parent_map = src_map;
12249 ptr->base_start = src_start;
12250 ptr->base_end = src_end;
12251 submap_len = tmp_entry->vme_end - src_start;
12252 if (submap_len > (src_end - src_start)) {
12253 submap_len = src_end - src_start;
12254 }
12255 ptr->base_len = submap_len;
12256
12257 src_start -= tmp_entry->vme_start;
12258 src_start += VME_OFFSET(tmp_entry);
12259 src_end = src_start + submap_len;
12260 src_map = VME_SUBMAP(tmp_entry);
12261 vm_map_lock(src_map);
12262 /* keep an outstanding reference for all maps in */
12263 /* the parents tree except the base map */
12264 vm_map_reference(src_map);
12265 vm_map_unlock(ptr->parent_map);
12266 if (!vm_map_lookup_entry(
12267 src_map, src_start, &tmp_entry)) {
12268 RETURN(KERN_INVALID_ADDRESS);
12269 }
12270 map_share = TRUE;
12271 if (!tmp_entry->is_sub_map) {
12272 vm_map_clip_start(src_map, tmp_entry, src_start);
12273 }
12274 src_entry = tmp_entry;
12275 }
12276 /* we are now in the lowest level submap... */
12277
12278 if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
12279 (VME_OBJECT(tmp_entry)->phys_contiguous)) {
12280 		/* This is not supported for now. In future */
12281 /* we will need to detect the phys_contig */
12282 /* condition and then upgrade copy_slowly */
12283 /* to do physical copy from the device mem */
12284 /* based object. We can piggy-back off of */
12285 /* the was wired boolean to set-up the */
12286 /* proper handling */
12287 RETURN(KERN_PROTECTION_FAILURE);
12288 }
12289 /*
12290 * Create a new address map entry to hold the result.
12291 * Fill in the fields from the appropriate source entries.
12292 * We must unlock the source map to do this if we need
12293 * to allocate a map entry.
12294 */
12295 if (new_entry == VM_MAP_ENTRY_NULL) {
12296 version.main_timestamp = src_map->timestamp;
12297 vm_map_unlock(src_map);
12298
12299 new_entry = vm_map_copy_entry_create(copy);
12300
12301 vm_map_lock(src_map);
12302 if ((version.main_timestamp + 1) != src_map->timestamp) {
12303 if (!vm_map_lookup_entry(src_map, src_start,
12304 &tmp_entry)) {
12305 RETURN(KERN_INVALID_ADDRESS);
12306 }
12307 if (!tmp_entry->is_sub_map) {
12308 vm_map_clip_start(src_map, tmp_entry, src_start);
12309 }
12310 continue; /* restart w/ new tmp_entry */
12311 }
12312 }
12313
12314 /*
12315 * Verify that the region can be read.
12316 */
12317 if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
12318 !use_maxprot) ||
12319 (src_entry->max_protection & VM_PROT_READ) == 0) {
12320 RETURN(KERN_PROTECTION_FAILURE);
12321 }
12322
12323 src_object = VME_OBJECT(src_entry);
12324
12325
12326 /*
12327 * Clip against the endpoints of the entire region.
12328 */
12329
12330 vm_map_clip_end(src_map, src_entry, src_end);
12331
12332 src_size = src_entry->vme_end - src_start;
12333 src_offset = VME_OFFSET(src_entry);
12334 was_wired = (src_entry->wired_count != 0);
12335
12336 vm_map_entry_copy(src_map, new_entry, src_entry);
12337 if (new_entry->is_sub_map) {
12338 /* clr address space specifics */
12339 new_entry->use_pmap = FALSE;
12340 } else {
12341 /*
12342 * We're dealing with a copy-on-write operation,
12343 * so the resulting mapping should not inherit the
12344 * original mapping's accounting settings.
12345 * "iokit_acct" should have been cleared in
12346 * vm_map_entry_copy().
12347 * "use_pmap" should be reset to its default (TRUE)
12348 * so that the new mapping gets accounted for in
12349 * the task's memory footprint.
12350 */
12351 assert(!new_entry->iokit_acct);
12352 new_entry->use_pmap = TRUE;
12353 }
12354
12355 /*
12356 * Attempt non-blocking copy-on-write optimizations.
12357 */
12358
12359 /*
12360 * If we are destroying the source, and the object
12361 * is internal, we could move the object reference
12362 * from the source to the copy. The copy is
12363 * copy-on-write only if the source is.
12364 * We make another reference to the object, because
12365 * destroying the source entry will deallocate it.
12366 *
12367 * This memory transfer has to be atomic, (to prevent
12368 * the VM object from being shared or copied while
12369 * it's being moved here), so we could only do this
12370 * if we won't have to unlock the VM map until the
12371 * original mapping has been fully removed.
12372 */
12373
12374 RestartCopy:
12375 if ((src_object == VM_OBJECT_NULL ||
12376 (!was_wired && !map_share && !tmp_entry->is_shared
12377 && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
12378 vm_object_copy_quickly(
12379 VME_OBJECT(new_entry),
12380 src_offset,
12381 src_size,
12382 &src_needs_copy,
12383 &new_entry_needs_copy)) {
12384 new_entry->needs_copy = new_entry_needs_copy;
12385
12386 /*
12387 * Handle copy-on-write obligations
12388 */
12389
12390 if (src_needs_copy && !tmp_entry->needs_copy) {
12391 vm_prot_t prot;
12392
12393 prot = src_entry->protection & ~VM_PROT_WRITE;
12394
12395 if (override_nx(src_map, VME_ALIAS(src_entry))
12396 && prot) {
12397 prot |= VM_PROT_EXECUTE;
12398 }
12399
12400 vm_object_pmap_protect(
12401 src_object,
12402 src_offset,
12403 src_size,
12404 (src_entry->is_shared ?
12405 PMAP_NULL
12406 : src_map->pmap),
12407 VM_MAP_PAGE_SIZE(src_map),
12408 src_entry->vme_start,
12409 prot);
12410
12411 assert(tmp_entry->wired_count == 0);
12412 tmp_entry->needs_copy = TRUE;
12413 }
12414
12415 /*
12416 * The map has never been unlocked, so it's safe
12417 * to move to the next entry rather than doing
12418 * another lookup.
12419 */
12420
12421 goto CopySuccessful;
12422 }
12423
12424 entry_was_shared = tmp_entry->is_shared;
12425
12426 /*
12427 * Take an object reference, so that we may
12428 * release the map lock(s).
12429 */
12430
12431 assert(src_object != VM_OBJECT_NULL);
12432 vm_object_reference(src_object);
12433
12434 /*
12435 * Record the timestamp for later verification.
12436 * Unlock the map.
12437 */
12438
12439 version.main_timestamp = src_map->timestamp;
12440 vm_map_unlock(src_map); /* Increments timestamp once! */
12441 saved_src_entry = src_entry;
12442 tmp_entry = VM_MAP_ENTRY_NULL;
12443 src_entry = VM_MAP_ENTRY_NULL;
12444
12445 /*
12446 * Perform the copy
12447 */
12448
12449 if (was_wired ||
12450 (src_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK &&
12451 !(flags & VM_MAP_COPYIN_FORK)) ||
12452 (debug4k_no_cow_copyin &&
12453 VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
12454 CopySlowly:
12455 vm_object_lock(src_object);
12456 result = vm_object_copy_slowly(
12457 src_object,
12458 src_offset,
12459 src_size,
12460 THREAD_UNINT,
12461 &new_copy_object);
12462 /* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
12463 saved_used_for_jit = new_entry->used_for_jit;
12464 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12465 new_entry->used_for_jit = saved_used_for_jit;
12466 VME_OFFSET_SET(new_entry,
12467 src_offset - vm_object_trunc_page(src_offset));
12468 new_entry->needs_copy = FALSE;
12469 } else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
12470 (entry_was_shared || map_share)) {
12471 vm_object_t new_object;
12472
12473 vm_object_lock_shared(src_object);
12474 new_object = vm_object_copy_delayed(
12475 src_object,
12476 src_offset,
12477 src_size,
12478 TRUE);
12479 if (new_object == VM_OBJECT_NULL) {
12480 goto CopySlowly;
12481 }
12482
12483 VME_OBJECT_SET(new_entry, new_object, false, 0);
12484 assert(new_entry->wired_count == 0);
12485 new_entry->needs_copy = TRUE;
12486 assert(!new_entry->iokit_acct);
12487 assert(new_object->purgable == VM_PURGABLE_DENY);
12488 assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
12489 result = KERN_SUCCESS;
12490 } else {
12491 vm_object_offset_t new_offset;
12492 new_offset = VME_OFFSET(new_entry);
12493 result = vm_object_copy_strategically(src_object,
12494 src_offset,
12495 src_size,
12496 (flags & VM_MAP_COPYIN_FORK),
12497 &new_copy_object,
12498 &new_offset,
12499 &new_entry_needs_copy);
12500 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
12501 saved_used_for_jit = new_entry->used_for_jit;
12502 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12503 new_entry->used_for_jit = saved_used_for_jit;
12504 if (new_offset != VME_OFFSET(new_entry)) {
12505 VME_OFFSET_SET(new_entry, new_offset);
12506 }
12507
12508 new_entry->needs_copy = new_entry_needs_copy;
12509 }
12510
12511 if (result == KERN_SUCCESS &&
12512 ((preserve_purgeable &&
12513 src_object->purgable != VM_PURGABLE_DENY) ||
12514 new_entry->used_for_jit)) {
12515 /*
12516 * Purgeable objects should be COPY_NONE, true share;
12517 			 * this should be propagated to the copy.
12518 *
12519 * Also force mappings the pmap specially protects to
12520 * be COPY_NONE; trying to COW these mappings would
12521 * change the effective protections, which could have
12522 * side effects if the pmap layer relies on the
12523 * specified protections.
12524 */
12525
12526 vm_object_t new_object;
12527
12528 new_object = VME_OBJECT(new_entry);
12529 assert(new_object != src_object);
12530 vm_object_lock(new_object);
12531 assert(os_ref_get_count_raw(&new_object->ref_count) == 1);
12532 assert(new_object->shadow == VM_OBJECT_NULL);
12533 assert(new_object->vo_copy == VM_OBJECT_NULL);
12534 assert(new_object->vo_owner == NULL);
12535
12536 new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
12537
12538 if (preserve_purgeable &&
12539 src_object->purgable != VM_PURGABLE_DENY) {
12540 VM_OBJECT_SET_TRUE_SHARE(new_object, TRUE);
12541
12542 /* start as non-volatile with no owner... */
12543 VM_OBJECT_SET_PURGABLE(new_object, VM_PURGABLE_NONVOLATILE);
12544 vm_purgeable_nonvolatile_enqueue(new_object, NULL);
12545 /* ... and move to src_object's purgeable state */
12546 if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
12547 int state;
12548 state = src_object->purgable;
12549 vm_object_purgable_control(
12550 new_object,
12551 VM_PURGABLE_SET_STATE_FROM_KERNEL,
12552 &state);
12553 }
12554 /* no pmap accounting for purgeable objects */
12555 new_entry->use_pmap = FALSE;
12556 }
12557
12558 vm_object_unlock(new_object);
12559 new_object = VM_OBJECT_NULL;
12560 }
12561
12562 /*
12563 * Throw away the extra reference
12564 */
12565
12566 vm_object_deallocate(src_object);
12567
12568 if (result != KERN_SUCCESS &&
12569 result != KERN_MEMORY_RESTART_COPY) {
12570 vm_map_lock(src_map);
12571 RETURN(result);
12572 }
12573
12574 /*
12575 * Verify that the map has not substantially
12576 * changed while the copy was being made.
12577 */
12578
12579 vm_map_lock(src_map);
12580
12581 if ((version.main_timestamp + 1) == src_map->timestamp) {
12582 /* src_map hasn't changed: src_entry is still valid */
12583 src_entry = saved_src_entry;
12584 goto VerificationSuccessful;
12585 }
12586
12587 /*
12588 * Simple version comparison failed.
12589 *
12590 * Retry the lookup and verify that the
12591 * same object/offset are still present.
12592 *
12593 * [Note: a memory manager that colludes with
12594 * the calling task can detect that we have
12595 * cheated. While the map was unlocked, the
12596 * mapping could have been changed and restored.]
12597 */
12598
12599 if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
12600 if (result != KERN_MEMORY_RESTART_COPY) {
12601 vm_object_deallocate(VME_OBJECT(new_entry));
12602 VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
12603 /* reset accounting state */
12604 new_entry->iokit_acct = FALSE;
12605 new_entry->use_pmap = TRUE;
12606 }
12607 RETURN(KERN_INVALID_ADDRESS);
12608 }
12609
12610 src_entry = tmp_entry;
12611 vm_map_clip_start(src_map, src_entry, src_start);
12612
12613 if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
12614 !use_maxprot) ||
12615 ((src_entry->max_protection & VM_PROT_READ) == 0)) {
12616 goto VerificationFailed;
12617 }
12618
12619 if (src_entry->vme_end < new_entry->vme_end) {
12620 /*
12621 * This entry might have been shortened
12622 * (vm_map_clip_end) or been replaced with
12623 * an entry that ends closer to "src_start"
12624 * than before.
12625 * Adjust "new_entry" accordingly; copying
12626 * less memory would be correct but we also
12627 * redo the copy (see below) if the new entry
12628 * no longer points at the same object/offset.
12629 */
12630 assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
12631 VM_MAP_COPY_PAGE_MASK(copy)));
12632 new_entry->vme_end = src_entry->vme_end;
12633 src_size = new_entry->vme_end - src_start;
12634 } else if (src_entry->vme_end > new_entry->vme_end) {
12635 /*
12636 * This entry might have been extended
12637 * (vm_map_entry_simplify() or coalesce)
12638 * or been replaced with an entry that ends farther
12639 * from "src_start" than before.
12640 *
12641 * We've called vm_object_copy_*() only on
12642 * the previous <start:end> range, so we can't
12643 * just extend new_entry. We have to re-do
12644 * the copy based on the new entry as if it was
12645 * pointing at a different object/offset (see
12646 * "Verification failed" below).
12647 */
12648 }
12649
12650 if ((VME_OBJECT(src_entry) != src_object) ||
12651 (VME_OFFSET(src_entry) != src_offset) ||
12652 (src_entry->vme_end > new_entry->vme_end)) {
12653 /*
12654 * Verification failed.
12655 *
12656 * Start over with this top-level entry.
12657 */
12658
12659 VerificationFailed: ;
12660
12661 vm_object_deallocate(VME_OBJECT(new_entry));
12662 tmp_entry = src_entry;
12663 continue;
12664 }
12665
12666 /*
12667 * Verification succeeded.
12668 */
12669
12670 VerificationSuccessful:;
12671
12672 if (result == KERN_MEMORY_RESTART_COPY) {
12673 goto RestartCopy;
12674 }
12675
12676 /*
12677 * Copy succeeded.
12678 */
12679
12680 CopySuccessful: ;
12681
12682 /*
12683 * Link in the new copy entry.
12684 */
12685
12686 vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
12687 new_entry);
12688
12689 /*
12690 * Determine whether the entire region
12691 * has been copied.
12692 */
12693 src_base = src_start;
12694 src_start = new_entry->vme_end;
12695 new_entry = VM_MAP_ENTRY_NULL;
12696 while ((src_start >= src_end) && (src_end != 0)) {
12697 submap_map_t *ptr;
12698
12699 if (src_map == base_map) {
12700 /* back to the top */
12701 break;
12702 }
12703
12704 ptr = parent_maps;
12705 assert(ptr != NULL);
12706 parent_maps = parent_maps->next;
12707
12708 /* fix up the damage we did in that submap */
12709 vm_map_simplify_range(src_map,
12710 src_base,
12711 src_end);
12712
12713 vm_map_unlock(src_map);
12714 vm_map_deallocate(src_map);
12715 vm_map_lock(ptr->parent_map);
12716 src_map = ptr->parent_map;
12717 src_base = ptr->base_start;
12718 src_start = ptr->base_start + ptr->base_len;
12719 src_end = ptr->base_end;
12720 if (!vm_map_lookup_entry(src_map,
12721 src_start,
12722 &tmp_entry) &&
12723 (src_end > src_start)) {
12724 RETURN(KERN_INVALID_ADDRESS);
12725 }
12726 kfree_type(submap_map_t, ptr);
12727 if (parent_maps == NULL) {
12728 map_share = FALSE;
12729 }
12730 src_entry = tmp_entry->vme_prev;
12731 }
12732
12733 if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
12734 (src_start >= src_addr_unaligned + len) &&
12735 (src_addr_unaligned + len != 0)) {
12736 /*
12737 * Stop copying now, even though we haven't reached
12738 * "src_end". We'll adjust the end of the last copy
12739 * entry at the end, if needed.
12740 *
12741 			 * If src_map's alignment is different from the
12742 * system's page-alignment, there could be
12743 * extra non-map-aligned map entries between
12744 * the original (non-rounded) "src_addr_unaligned + len"
12745 * and the rounded "src_end".
12746 * We do not want to copy those map entries since
12747 * they're not part of the copied range.
12748 */
12749 break;
12750 }
12751
12752 if ((src_start >= src_end) && (src_end != 0)) {
12753 break;
12754 }
12755
12756 /*
12757 * Verify that there are no gaps in the region
12758 */
12759
12760 tmp_entry = src_entry->vme_next;
12761 if ((tmp_entry->vme_start != src_start) ||
12762 (tmp_entry == vm_map_to_entry(src_map))) {
12763 RETURN(KERN_INVALID_ADDRESS);
12764 }
12765 }
12766
12767 /*
12768 * If the source should be destroyed, do it now, since the
12769 * copy was successful.
12770 */
12771 if (src_destroy) {
12772 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
12773
12774 if (src_map == kernel_map) {
12775 remove_flags |= VM_MAP_REMOVE_KUNWIRE;
12776 }
12777 (void)vm_map_remove_and_unlock(src_map,
12778 vm_map_trunc_page(src_addr_unaligned, VM_MAP_PAGE_MASK(src_map)),
12779 src_end,
12780 remove_flags,
12781 KMEM_GUARD_NONE);
12782 } else {
12783 /* fix up the damage we did in the base map */
12784 vm_map_simplify_range(
12785 src_map,
12786 vm_map_trunc_page(src_addr_unaligned,
12787 VM_MAP_PAGE_MASK(src_map)),
12788 vm_map_round_page(src_end,
12789 VM_MAP_PAGE_MASK(src_map)));
12790 vm_map_unlock(src_map);
12791 }
12792
12793 tmp_entry = VM_MAP_ENTRY_NULL;
12794
12795 if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
12796 VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12797 vm_map_offset_t original_start, original_offset, original_end;
12798
12799 assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12800
12801 /* adjust alignment of first copy_entry's "vme_start" */
12802 tmp_entry = vm_map_copy_first_entry(copy);
12803 if (tmp_entry != vm_map_copy_to_entry(copy)) {
12804 vm_map_offset_t adjustment;
12805
12806 original_start = tmp_entry->vme_start;
12807 original_offset = VME_OFFSET(tmp_entry);
12808
12809 /* map-align the start of the first copy entry... */
12810 adjustment = (tmp_entry->vme_start -
12811 vm_map_trunc_page(
12812 tmp_entry->vme_start,
12813 VM_MAP_PAGE_MASK(src_map)));
12814 tmp_entry->vme_start -= adjustment;
12815 VME_OFFSET_SET(tmp_entry,
12816 VME_OFFSET(tmp_entry) - adjustment);
12817 copy_addr -= adjustment;
12818 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12819 /* ... adjust for mis-aligned start of copy range */
12820 adjustment =
12821 (vm_map_trunc_page(copy->offset,
12822 PAGE_MASK) -
12823 vm_map_trunc_page(copy->offset,
12824 VM_MAP_PAGE_MASK(src_map)));
12825 if (adjustment) {
12826 assert(page_aligned(adjustment));
12827 assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12828 tmp_entry->vme_start += adjustment;
12829 VME_OFFSET_SET(tmp_entry,
12830 (VME_OFFSET(tmp_entry) +
12831 adjustment));
12832 copy_addr += adjustment;
12833 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12834 }
12835
12836 /*
12837 * Assert that the adjustments haven't exposed
12838 * more than was originally copied...
12839 */
12840 assert(tmp_entry->vme_start >= original_start);
12841 assert(VME_OFFSET(tmp_entry) >= original_offset);
12842 /*
12843 			 * ... and that it did not adjust outside of
12844 * a single 16K page.
12845 */
12846 assert(vm_map_trunc_page(tmp_entry->vme_start,
12847 VM_MAP_PAGE_MASK(src_map)) ==
12848 vm_map_trunc_page(original_start,
12849 VM_MAP_PAGE_MASK(src_map)));
12850 }
12851
12852 /* adjust alignment of last copy_entry's "vme_end" */
12853 tmp_entry = vm_map_copy_last_entry(copy);
12854 if (tmp_entry != vm_map_copy_to_entry(copy)) {
12855 vm_map_offset_t adjustment;
12856
12857 original_end = tmp_entry->vme_end;
12858
12859 /* map-align the end of the last copy entry... */
12860 tmp_entry->vme_end =
12861 vm_map_round_page(tmp_entry->vme_end,
12862 VM_MAP_PAGE_MASK(src_map));
12863 /* ... adjust for mis-aligned end of copy range */
12864 adjustment =
12865 (vm_map_round_page((copy->offset +
12866 copy->size),
12867 VM_MAP_PAGE_MASK(src_map)) -
12868 vm_map_round_page((copy->offset +
12869 copy->size),
12870 PAGE_MASK));
12871 if (adjustment) {
12872 assert(page_aligned(adjustment));
12873 assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12874 tmp_entry->vme_end -= adjustment;
12875 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12876 }
12877
12878 /*
12879 * Assert that the adjustments haven't exposed
12880 * more than was originally copied...
12881 */
12882 assert(tmp_entry->vme_end <= original_end);
12883 /*
12884 			 * ... and that it did not adjust outside of
12885 * a single 16K page.
12886 */
12887 assert(vm_map_round_page(tmp_entry->vme_end,
12888 VM_MAP_PAGE_MASK(src_map)) ==
12889 vm_map_round_page(original_end,
12890 VM_MAP_PAGE_MASK(src_map)));
12891 }
12892 }
12893
12894 /* Fix-up start and end points in copy. This is necessary */
12895 /* when the various entries in the copy object were picked */
12896 /* up from different sub-maps */
12897
12898 tmp_entry = vm_map_copy_first_entry(copy);
12899 copy_size = 0; /* compute actual size */
12900 while (tmp_entry != vm_map_copy_to_entry(copy)) {
12901 assert(VM_MAP_PAGE_ALIGNED(
12902 copy_addr + (tmp_entry->vme_end -
12903 tmp_entry->vme_start),
12904 MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12905 assert(VM_MAP_PAGE_ALIGNED(
12906 copy_addr,
12907 MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12908
12909 /*
12910 * The copy_entries will be injected directly into the
12911 * destination map and might not be "map aligned" there...
12912 */
12913 tmp_entry->map_aligned = FALSE;
12914
12915 tmp_entry->vme_end = copy_addr +
12916 (tmp_entry->vme_end - tmp_entry->vme_start);
12917 tmp_entry->vme_start = copy_addr;
12918 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12919 copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12920 copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12921 tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12922 }
12923
12924 if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12925 copy_size < copy->size) {
12926 /*
12927 * The actual size of the VM map copy is smaller than what
12928 * was requested by the caller. This must be because some
12929 * PAGE_SIZE-sized pages are missing at the end of the last
12930 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12931 * The caller might not have been aware of those missing
12932 * pages and might not want to be aware of it, which is
12933 * fine as long as they don't try to access (and crash on)
12934 * those missing pages.
12935 * Let's adjust the size of the "copy", to avoid failing
12936 * in vm_map_copyout() or vm_map_copy_overwrite().
12937 */
12938 assert(vm_map_round_page(copy_size,
12939 VM_MAP_PAGE_MASK(src_map)) ==
12940 vm_map_round_page(copy->size,
12941 VM_MAP_PAGE_MASK(src_map)));
12942 copy->size = copy_size;
12943 }
12944
12945 *copy_result = copy;
12946 return KERN_SUCCESS;
12947
12948 #undef RETURN
12949 }
12950
12951 kern_return_t
12952 vm_map_copy_extract(
12953 vm_map_t src_map,
12954 vm_map_address_t src_addr,
12955 vm_map_size_t len,
12956 boolean_t do_copy,
12957 vm_map_copy_t *copy_result, /* OUT */
12958 vm_prot_t *cur_prot, /* IN/OUT */
12959 vm_prot_t *max_prot, /* IN/OUT */
12960 vm_inherit_t inheritance,
12961 vm_map_kernel_flags_t vmk_flags)
12962 {
12963 vm_map_copy_t copy;
12964 kern_return_t kr;
12965 vm_prot_t required_cur_prot, required_max_prot;
12966
12967 /*
12968 * Check for copies of zero bytes.
12969 */
12970
12971 if (len == 0) {
12972 *copy_result = VM_MAP_COPY_NULL;
12973 return KERN_SUCCESS;
12974 }
12975
12976 /*
12977 * Check that the end address doesn't overflow
12978 */
12979 if (src_addr + len < src_addr) {
12980 return KERN_INVALID_ADDRESS;
12981 }
12982 if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
12983 return KERN_INVALID_ADDRESS;
12984 }
12985
12986 if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12987 DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12988 }
12989
12990 required_cur_prot = *cur_prot;
12991 required_max_prot = *max_prot;
12992
12993 /*
12994 * Allocate a header element for the list.
12995 *
12996 * Use the start and end in the header to
12997 * remember the endpoints prior to rounding.
12998 */
12999
13000 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
13001 copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
13002 copy->offset = 0;
13003 copy->size = len;
13004
13005 kr = vm_map_remap_extract(src_map,
13006 src_addr,
13007 len,
13008 do_copy, /* copy */
13009 copy,
13010 cur_prot, /* IN/OUT */
13011 max_prot, /* IN/OUT */
13012 inheritance,
13013 vmk_flags);
13014 if (kr != KERN_SUCCESS) {
13015 vm_map_copy_discard(copy);
13016 if ((kr == KERN_INVALID_ADDRESS ||
13017 kr == KERN_INVALID_ARGUMENT) &&
13018 src_map->terminated) {
13019 /* tell the caller that this address space is gone */
13020 kr = KERN_TERMINATED;
13021 }
13022 return kr;
13023 }
13024 if (required_cur_prot != VM_PROT_NONE) {
13025 assert((*cur_prot & required_cur_prot) == required_cur_prot);
13026 assert((*max_prot & required_max_prot) == required_max_prot);
13027 }
13028
13029 *copy_result = copy;
13030 return KERN_SUCCESS;
13031 }
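/*
 * Illustrative sketch (not compiled): extracting a range of "src_map" so it
 * can be remapped elsewhere.  With do_copy == FALSE the extracted entries
 * share the source's backing objects; the required protections are passed IN
 * via cur/max and the effective ones are returned in their place.  The use
 * of VM_MAP_KERNEL_FLAGS_NONE and the placeholder names "src_map", "addr"
 * and "size" are assumptions for illustration only.
 */
#if 0
static kern_return_t
example_extract_for_remap(vm_map_t src_map, vm_map_address_t addr,
    vm_map_size_t size, vm_map_copy_t *copy_out)
{
	vm_prot_t cur = VM_PROT_READ;   /* require at least read access */
	vm_prot_t max = VM_PROT_READ;

	return vm_map_copy_extract(src_map, addr, size,
	           FALSE,               /* share rather than copy */
	           copy_out,
	           &cur, &max,          /* IN: required, OUT: granted */
	           VM_INHERIT_DEFAULT,
	           VM_MAP_KERNEL_FLAGS_NONE);
}
#endif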
13032
13033 static void
13034 vm_map_fork_share(
13035 vm_map_t old_map,
13036 vm_map_entry_t old_entry,
13037 vm_map_t new_map)
13038 {
13039 vm_object_t object;
13040 vm_map_entry_t new_entry;
13041
13042 /*
13043 * New sharing code. New map entry
13044 * references original object. Internal
13045 * objects use asynchronous copy algorithm for
13046 * future copies. First make sure we have
13047 * the right object. If we need a shadow,
13048 * or someone else already has one, then
13049 * make a new shadow and share it.
13050 */
13051
13052 if (!old_entry->is_sub_map) {
13053 object = VME_OBJECT(old_entry);
13054 }
13055
13056 if (old_entry->is_sub_map) {
13057 assert(old_entry->wired_count == 0);
13058 #ifndef NO_NESTED_PMAP
13059 #if !PMAP_FORK_NEST
13060 if (old_entry->use_pmap) {
13061 kern_return_t result;
13062
13063 result = pmap_nest(new_map->pmap,
13064 (VME_SUBMAP(old_entry))->pmap,
13065 (addr64_t)old_entry->vme_start,
13066 (uint64_t)(old_entry->vme_end - old_entry->vme_start));
13067 if (result) {
13068 panic("vm_map_fork_share: pmap_nest failed!");
13069 }
13070 }
13071 #endif /* !PMAP_FORK_NEST */
13072 #endif /* NO_NESTED_PMAP */
13073 } else if (object == VM_OBJECT_NULL) {
13074 object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
13075 old_entry->vme_start));
13076 VME_OFFSET_SET(old_entry, 0);
13077 VME_OBJECT_SET(old_entry, object, false, 0);
13078 old_entry->use_pmap = TRUE;
13079 // assert(!old_entry->needs_copy);
13080 } else if (object->copy_strategy !=
13081 MEMORY_OBJECT_COPY_SYMMETRIC) {
13082 /*
13083 * We are already using an asymmetric
13084 * copy, and therefore we already have
13085 * the right object.
13086 */
13087
13088 assert(!old_entry->needs_copy);
13089 } else if (old_entry->needs_copy || /* case 1 */
13090 object->shadowed || /* case 2 */
13091 (!object->true_share && /* case 3 */
13092 !old_entry->is_shared &&
13093 (object->vo_size >
13094 (vm_map_size_t)(old_entry->vme_end -
13095 old_entry->vme_start)))) {
13096 bool is_writable;
13097
13098 /*
13099 * We need to create a shadow.
13100 * There are three cases here.
13101 * In the first case, we need to
13102 * complete a deferred symmetrical
13103 * copy that we participated in.
13104 * In the second and third cases,
13105 * we need to create the shadow so
13106 * that changes that we make to the
13107 * object do not interfere with
13108 * any symmetrical copies which
13109 * have occurred (case 2) or which
13110 * might occur (case 3).
13111 *
13112 * The first case is when we had
13113 * deferred shadow object creation
13114 * via the entry->needs_copy mechanism.
13115 * This mechanism only works when
13116 * only one entry points to the source
13117 * object, and we are about to create
13118 * a second entry pointing to the
13119 * same object. The problem is that
13120 * there is no way of mapping from
13121 * an object to the entries pointing
13122 * to it. (Deferred shadow creation
13123 * works with one entry because it occurs
13124 * at fault time, and we walk from the
13125 * entry to the object when handling
13126 * the fault.)
13127 *
13128 * The second case is when the object
13129 * to be shared has already been copied
13130 * with a symmetric copy, but we point
13131 * directly to the object without
13132 * needs_copy set in our entry. (This
13133 * can happen because different ranges
13134 * of an object can be pointed to by
13135 * different entries. In particular,
13136 * a single entry pointing to an object
13137 * can be split by a call to vm_inherit,
13138 * which, combined with task_create, can
13139 * result in the different entries
13140 * having different needs_copy values.)
13141 * The shadowed flag in the object allows
13142 * us to detect this case. The problem
13143 * with this case is that if this object
13144 * has or will have shadows, then we
13145 * must not perform an asymmetric copy
13146 * of this object, since such a copy
13147 * allows the object to be changed, which
13148 * will break the previous symmetrical
13149 * copies (which rely upon the object
13150 * not changing). In a sense, the shadowed
13151 * flag says "don't change this object".
13152 * We fix this by creating a shadow
13153 * object for this object, and sharing
13154 * that. This works because we are free
13155 * to change the shadow object (and thus
13156 * to use an asymmetric copy strategy);
13157 * this is also semantically correct,
13158 * since this object is temporary, and
13159 * therefore a copy of the object is
13160 * as good as the object itself. (This
13161 * is not true for permanent objects,
13162 * since the pager needs to see changes,
13163 * which won't happen if the changes
13164 * are made to a copy.)
13165 *
13166 * The third case is when the object
13167 * to be shared has parts sticking
13168 * outside of the entry we're working
13169 * with, and thus may in the future
13170 * be subject to a symmetrical copy.
13171 * (This is a preemptive version of
13172 * case 2.)
13173 */
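 /*
  * Editorial example (hypothetical sizes): if a 128KB anonymous object is
  * mapped by an entry that covers only its first 64KB, the object is larger
  * than the entry (case 3), so some other range of it could still become
  * the target of a symmetric copy later; shadowing now keeps that future
  * copy from observing modifications made through this mapping.
  */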
13174 VME_OBJECT_SHADOW(old_entry,
13175 (vm_map_size_t) (old_entry->vme_end -
13176 old_entry->vme_start),
13177 vm_map_always_shadow(old_map));
13178
13179 /*
13180 * If we're making a shadow for other than
13181 * copy on write reasons, then we have
13182 * to remove write permission.
13183 */
13184
13185 is_writable = false;
13186 if (old_entry->protection & VM_PROT_WRITE) {
13187 is_writable = true;
13188 #if __arm64e__
13189 } else if (old_entry->used_for_tpro) {
13190 is_writable = true;
13191 #endif /* __arm64e__ */
13192 }
13193 if (!old_entry->needs_copy && is_writable) {
13194 vm_prot_t prot;
13195
13196 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13197 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13198 __FUNCTION__, old_map, old_map->pmap,
13199 old_entry,
13200 (uint64_t)old_entry->vme_start,
13201 (uint64_t)old_entry->vme_end,
13202 old_entry->protection);
13203 }
13204
13205 prot = old_entry->protection & ~VM_PROT_WRITE;
13206
13207 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13208 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13209 __FUNCTION__, old_map, old_map->pmap,
13210 old_entry,
13211 (uint64_t)old_entry->vme_start,
13212 (uint64_t)old_entry->vme_end,
13213 prot);
13214 }
13215
13216 if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
13217 prot |= VM_PROT_EXECUTE;
13218 }
13219
13220
13221 if (old_map->mapped_in_other_pmaps) {
13222 vm_object_pmap_protect(
13223 VME_OBJECT(old_entry),
13224 VME_OFFSET(old_entry),
13225 (old_entry->vme_end -
13226 old_entry->vme_start),
13227 PMAP_NULL,
13228 PAGE_SIZE,
13229 old_entry->vme_start,
13230 prot);
13231 } else {
13232 pmap_protect(old_map->pmap,
13233 old_entry->vme_start,
13234 old_entry->vme_end,
13235 prot);
13236 }
13237 }
13238
13239 old_entry->needs_copy = FALSE;
13240 object = VME_OBJECT(old_entry);
13241 }
13242
13243
13244 /*
13245 * If object was using a symmetric copy strategy,
13246 * change its copy strategy to the default
13247 * asymmetric copy strategy, which is copy_delay
13248 * in the non-norma case and copy_call in the
13249 * norma case. Bump the reference count for the
13250 * new entry.
13251 */
13252
13253 if (old_entry->is_sub_map) {
13254 vm_map_reference(VME_SUBMAP(old_entry));
13255 } else {
13256 vm_object_lock(object);
13257 vm_object_reference_locked(object);
13258 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
13259 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
13260 }
13261 vm_object_unlock(object);
13262 }
13263
13264 /*
13265 * Clone the entry, using object ref from above.
13266 * Mark both entries as shared.
13267 */
13268
13269 new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
13270 vm_map_entry_copy(old_map, new_entry, old_entry);
13271 old_entry->is_shared = TRUE;
13272 new_entry->is_shared = TRUE;
13273
13274 /*
13275 * We're dealing with a shared mapping, so the resulting mapping
13276 * should inherit some of the original mapping's accounting settings.
13277 * "iokit_acct" should have been cleared in vm_map_entry_copy().
13278 * "use_pmap" should stay the same as before (if it hasn't been reset
13279 * to TRUE when we cleared "iokit_acct").
13280 */
13281 assert(!new_entry->iokit_acct);
13282
13283 /*
13284 * If the old entry's inheritance is VM_INHERIT_NONE,
13285 * the new entry is being created for a corpse fork:
13286 * remove the write permission from the new entry.
13287 */
13288 if (old_entry->inheritance == VM_INHERIT_NONE) {
13289 new_entry->protection &= ~VM_PROT_WRITE;
13290 new_entry->max_protection &= ~VM_PROT_WRITE;
13291 }
13292
13293 /*
13294 * Insert the entry into the new map -- we
13295 * know we're inserting at the end of the new
13296 * map.
13297 */
13298
13299 vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
13300 VM_MAP_KERNEL_FLAGS_NONE);
13301
13302 /*
13303 * Update the physical map
13304 */
13305
13306 if (old_entry->is_sub_map) {
13307 /* Bill Angell pmap support goes here */
13308 } else {
13309 pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
13310 old_entry->vme_end - old_entry->vme_start,
13311 old_entry->vme_start);
13312 }
13313 }
13314
13315 static boolean_t
13316 vm_map_fork_copy(
13317 vm_map_t old_map,
13318 vm_map_entry_t *old_entry_p,
13319 vm_map_t new_map,
13320 int vm_map_copyin_flags)
13321 {
13322 vm_map_entry_t old_entry = *old_entry_p;
13323 vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
13324 vm_map_offset_t start = old_entry->vme_start;
13325 vm_map_copy_t copy;
13326 vm_map_entry_t last = vm_map_last_entry(new_map);
13327
13328 vm_map_unlock(old_map);
13329 /*
13330 * Use maxprot version of copyin because we
13331 * care about whether this memory can ever
13332 * be accessed, not just whether it's accessible
13333 * right now.
13334 */
13335 vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
13336 if (vm_map_copyin_internal(old_map, start, entry_size,
13337 vm_map_copyin_flags, &copy)
13338 != KERN_SUCCESS) {
13339 /*
13340 * The map might have changed while it
13341 * was unlocked, check it again. Skip
13342 * any blank space or permanently
13343 * unreadable region.
13344 */
13345 vm_map_lock(old_map);
13346 if (!vm_map_lookup_entry(old_map, start, &last) ||
13347 (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
13348 last = last->vme_next;
13349 }
13350 *old_entry_p = last;
13351
13352 /*
13353 * XXX For some error returns, want to
13354 * XXX skip to the next element. Note
13355 * that INVALID_ADDRESS and
13356 * PROTECTION_FAILURE are handled above.
13357 */
13358
13359 return FALSE;
13360 }
13361
13362 /*
13363 * Assert that the vm_map_copy is coming from the right
13364 * zone and hasn't been forged
13365 */
13366 vm_map_copy_require(copy);
13367
13368 /*
13369 * Insert the copy into the new map
13370 */
13371 vm_map_copy_insert(new_map, last, copy);
13372
13373 /*
13374 * Pick up the traversal at the end of
13375 * the copied region.
13376 */
13377
13378 vm_map_lock(old_map);
13379 start += entry_size;
13380 if (!vm_map_lookup_entry(old_map, start, &last)) {
13381 last = last->vme_next;
13382 } else {
13383 if (last->vme_start == start) {
13384 /*
13385 * No need to clip here and we don't
13386 * want to cause any unnecessary
13387 * unnesting...
13388 */
13389 } else {
13390 vm_map_clip_start(old_map, last, start);
13391 }
13392 }
13393 *old_entry_p = last;
13394
13395 return TRUE;
13396 }
13397
13398 #if PMAP_FORK_NEST
13399 #define PMAP_FORK_NEST_DEBUG 0
13400 static inline void
13401 vm_map_fork_unnest(
13402 pmap_t new_pmap,
13403 vm_map_offset_t pre_nested_start,
13404 vm_map_offset_t pre_nested_end,
13405 vm_map_offset_t start,
13406 vm_map_offset_t end)
13407 {
13408 kern_return_t kr;
13409 vm_map_offset_t nesting_mask, start_unnest, end_unnest;
13410
13411 assertf(pre_nested_start <= pre_nested_end,
13412 "pre_nested start 0x%llx end 0x%llx",
13413 (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13414 assertf(start <= end,
13415 "start 0x%llx end 0x%llx",
13416 (uint64_t) start, (uint64_t)end);
13417
13418 if (pre_nested_start == pre_nested_end) {
13419 /* nothing was pre-nested: done */
13420 return;
13421 }
13422 if (end <= pre_nested_start) {
13423 /* fully before pre-nested range: done */
13424 return;
13425 }
13426 if (start >= pre_nested_end) {
13427 /* fully after pre-nested range: done */
13428 return;
13429 }
13430 /* ignore parts of range outside of pre_nested range */
13431 if (start < pre_nested_start) {
13432 start = pre_nested_start;
13433 }
13434 if (end > pre_nested_end) {
13435 end = pre_nested_end;
13436 }
13437 nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
13438 start_unnest = start & ~nesting_mask;
13439 end_unnest = (end + nesting_mask) & ~nesting_mask;
13440 kr = pmap_unnest(new_pmap,
13441 (addr64_t)start_unnest,
13442 (uint64_t)(end_unnest - start_unnest));
13443 #if PMAP_FORK_NEST_DEBUG
13444 printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
13445 #endif /* PMAP_FORK_NEST_DEBUG */
13446 assertf(kr == KERN_SUCCESS,
13447 "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
13448 (uint64_t)start, (uint64_t)end, new_pmap,
13449 (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
13450 kr);
13451 }
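/*
 * Editorial example (hypothetical values): with a 32MB shared-region nesting
 * granule, nesting_mask is 0x1FFFFFF. A clamped range of
 * [0x181234000, 0x183456000) is widened to [0x180000000, 0x184000000):
 * the start is rounded down and the end rounded up to the granule, so
 * pmap_unnest() always operates on whole nested chunks.
 */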
13452 #endif /* PMAP_FORK_NEST */
13453
13454 void
13455 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13456 {
13457 new_map->size_limit = old_map->size_limit;
13458 new_map->data_limit = old_map->data_limit;
13459 new_map->user_wire_limit = old_map->user_wire_limit;
13460 new_map->reserved_regions = old_map->reserved_regions;
13461 }
13462
13463 /*
13464 * vm_map_fork:
13465 *
13466 * Create and return a new map based on the old
13467 * map, according to the inheritance values on the
13468 * regions in that map and the options.
13469 *
13470 * The source map must not be locked.
13471 */
13472 vm_map_t
13473 vm_map_fork(
13474 ledger_t ledger,
13475 vm_map_t old_map,
13476 int options)
13477 {
13478 pmap_t new_pmap;
13479 vm_map_t new_map;
13480 vm_map_entry_t old_entry;
13481 vm_map_size_t new_size = 0, entry_size;
13482 vm_map_entry_t new_entry;
13483 boolean_t src_needs_copy;
13484 boolean_t new_entry_needs_copy;
13485 boolean_t pmap_is64bit;
13486 int vm_map_copyin_flags;
13487 vm_inherit_t old_entry_inheritance;
13488 int map_create_options;
13489 kern_return_t footprint_collect_kr;
13490
13491 if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
13492 VM_MAP_FORK_PRESERVE_PURGEABLE |
13493 VM_MAP_FORK_CORPSE_FOOTPRINT |
13494 VM_MAP_FORK_SHARE_IF_OWNED)) {
13495 /* unsupported option */
13496 return VM_MAP_NULL;
13497 }
13498
13499 pmap_is64bit =
13500 #if defined(__i386__) || defined(__x86_64__)
13501 old_map->pmap->pm_task_map != TASK_MAP_32BIT;
13502 #elif defined(__arm64__)
13503 old_map->pmap->is_64bit;
13504 #else
13505 #error Unknown architecture.
13506 #endif
13507
13508 unsigned int pmap_flags = 0;
13509 pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
13510 #if defined(HAS_APPLE_PAC)
13511 pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
13512 #endif
13513 #if CONFIG_ROSETTA
13514 pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
13515 #endif
13516 #if PMAP_CREATE_FORCE_4K_PAGES
13517 if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
13518 PAGE_SIZE != FOURK_PAGE_SIZE) {
13519 pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
13520 }
13521 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
13522 new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
13523 if (new_pmap == NULL) {
13524 return VM_MAP_NULL;
13525 }
13526
13527 vm_map_reference(old_map);
13528 vm_map_lock(old_map);
13529
13530 map_create_options = 0;
13531 if (old_map->hdr.entries_pageable) {
13532 map_create_options |= VM_MAP_CREATE_PAGEABLE;
13533 }
13534 if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13535 map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
13536 footprint_collect_kr = KERN_SUCCESS;
13537 }
13538 new_map = vm_map_create_options(new_pmap,
13539 old_map->min_offset,
13540 old_map->max_offset,
13541 map_create_options);
13542
13543 /* inherit cs_enforcement */
13544 vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
13545
13546 vm_map_lock(new_map);
13547 vm_commit_pagezero_status(new_map);
13548 /* inherit the parent map's page size */
13549 vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));
13550
13551 /* inherit the parent rlimits */
13552 vm_map_inherit_limits(new_map, old_map);
13553
13554 #if CONFIG_MAP_RANGES
13555 /* inherit the parent map's VM ranges */
13556 vm_map_range_fork(new_map, old_map);
13557 #endif
13558
13559 #if CODE_SIGNING_MONITOR
13560 /* Prepare the monitor for the fork */
13561 csm_fork_prepare(old_map->pmap, new_pmap);
13562 #endif
13563
13564 #if PMAP_FORK_NEST
13565 /*
13566 * Pre-nest the shared region's pmap.
13567 */
13568 vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
13569 pmap_fork_nest(old_map->pmap, new_pmap,
13570 &pre_nested_start, &pre_nested_end);
13571 #if PMAP_FORK_NEST_DEBUG
13572 printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13573 #endif /* PMAP_FORK_NEST_DEBUG */
13574 #endif /* PMAP_FORK_NEST */
13575
13576 for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
13577 /*
13578 * Abort any corpse collection if the system is shutting down.
13579 */
13580 if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13581 get_system_inshutdown()) {
13582 #if PMAP_FORK_NEST
13583 new_entry = vm_map_last_entry(new_map);
13584 if (new_entry == vm_map_to_entry(new_map)) {
13585 /* unnest all that was pre-nested */
13586 vm_map_fork_unnest(new_pmap,
13587 pre_nested_start, pre_nested_end,
13588 vm_map_min(new_map), vm_map_max(new_map));
13589 } else if (new_entry->vme_end < vm_map_max(new_map)) {
13590 /* unnest hole at the end, if pre-nested */
13591 vm_map_fork_unnest(new_pmap,
13592 pre_nested_start, pre_nested_end,
13593 new_entry->vme_end, vm_map_max(new_map));
13594 }
13595 #endif /* PMAP_FORK_NEST */
13596 vm_map_corpse_footprint_collect_done(new_map);
13597 vm_map_unlock(new_map);
13598 vm_map_unlock(old_map);
13599 vm_map_deallocate(new_map);
13600 vm_map_deallocate(old_map);
13601 printf("Aborting corpse map due to system shutdown\n");
13602 return VM_MAP_NULL;
13603 }
13604
13605 entry_size = old_entry->vme_end - old_entry->vme_start;
13606
13607 #if PMAP_FORK_NEST
13608 /*
13609 * Undo any unnecessary pre-nesting.
13610 */
13611 vm_map_offset_t prev_end;
13612 if (old_entry == vm_map_first_entry(old_map)) {
13613 prev_end = vm_map_min(old_map);
13614 } else {
13615 prev_end = old_entry->vme_prev->vme_end;
13616 }
13617 if (prev_end < old_entry->vme_start) {
13618 /* unnest hole before this entry, if pre-nested */
13619 vm_map_fork_unnest(new_pmap,
13620 pre_nested_start, pre_nested_end,
13621 prev_end, old_entry->vme_start);
13622 }
13623 if (old_entry->is_sub_map && old_entry->use_pmap) {
13624 /* keep this entry nested in the child */
13625 #if PMAP_FORK_NEST_DEBUG
13626 printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
13627 #endif /* PMAP_FORK_NEST_DEBUG */
13628 } else {
13629 /* undo nesting for this entry, if pre-nested */
13630 vm_map_fork_unnest(new_pmap,
13631 pre_nested_start, pre_nested_end,
13632 old_entry->vme_start, old_entry->vme_end);
13633 }
13634 #endif /* PMAP_FORK_NEST */
13635
13636 old_entry_inheritance = old_entry->inheritance;
13637
13638 /*
13639 * If the caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option,
13640 * share VM_INHERIT_NONE entries that are not backed by a
13641 * device pager.
13642 */
13643 if (old_entry_inheritance == VM_INHERIT_NONE &&
13644 (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
13645 (old_entry->protection & VM_PROT_READ) &&
13646 !(!old_entry->is_sub_map &&
13647 VME_OBJECT(old_entry) != NULL &&
13648 VME_OBJECT(old_entry)->pager != NULL &&
13649 is_device_pager_ops(
13650 VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
13651 old_entry_inheritance = VM_INHERIT_SHARE;
13652 }
13653 if (old_entry_inheritance == VM_INHERIT_COPY &&
13654 (options & VM_MAP_FORK_SHARE_IF_OWNED) &&
13655 !old_entry->is_sub_map &&
13656 VME_OBJECT(old_entry) != VM_OBJECT_NULL) {
13657 vm_object_t object;
13658 task_t owner;
13659 object = VME_OBJECT(old_entry);
13660 owner = VM_OBJECT_OWNER(object);
13661 if (owner != TASK_NULL &&
13662 owner->map == old_map) {
13663 /*
13664 * This mapping points at a VM object owned
13665 * by the task being forked.
13666 * Some tools reporting memory accounting
13667 * info rely on the object ID, so share this
13668 * mapping instead of copying, to make the
13669 * corpse look exactly like the original
13670 * task in that respect.
13671 */
13672 assert(object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC);
13673 old_entry_inheritance = VM_INHERIT_SHARE;
13674 }
13675 }
13676
13677 if (old_entry_inheritance != VM_INHERIT_NONE &&
13678 (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13679 footprint_collect_kr == KERN_SUCCESS) {
13680 /*
13681 * The corpse won't have old_map->pmap to query
13682 * footprint information, so collect that data now
13683 * and store it in new_map->vmmap_corpse_footprint
13684 * for later autopsy.
13685 */
13686 footprint_collect_kr =
13687 vm_map_corpse_footprint_collect(old_map,
13688 old_entry,
13689 new_map);
13690 }
13691
13692 switch (old_entry_inheritance) {
13693 case VM_INHERIT_NONE:
13694 break;
13695
13696 case VM_INHERIT_SHARE:
13697 vm_map_fork_share(old_map, old_entry, new_map);
13698 new_size += entry_size;
13699 break;
13700
13701 case VM_INHERIT_COPY:
13702
13703 /*
13704 * Inline the copy_quickly case;
13705 * upon failure, fall back on call
13706 * to vm_map_fork_copy.
13707 */
13708
13709 if (old_entry->is_sub_map) {
13710 break;
13711 }
13712 if ((old_entry->wired_count != 0) ||
13713 ((VME_OBJECT(old_entry) != NULL) &&
13714 (VME_OBJECT(old_entry)->true_share))) {
13715 goto slow_vm_map_fork_copy;
13716 }
13717
13718 new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
13719 vm_map_entry_copy(old_map, new_entry, old_entry);
13720 if (old_entry->vme_permanent) {
13721 /* inherit "permanent" on fork() */
13722 new_entry->vme_permanent = TRUE;
13723 }
13724
13725 if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
13726 new_map->jit_entry_exists = TRUE;
13727 }
13728
13729 if (new_entry->is_sub_map) {
13730 /* clear address space specifics */
13731 new_entry->use_pmap = FALSE;
13732 } else {
13733 /*
13734 * We're dealing with a copy-on-write operation,
13735 * so the resulting mapping should not inherit
13736 * the original mapping's accounting settings.
13737 * "iokit_acct" should have been cleared in
13738 * vm_map_entry_copy().
13739 * "use_pmap" should be reset to its default
13740 * (TRUE) so that the new mapping gets
13741 * accounted for in the task's memory footprint.
13742 */
13743 assert(!new_entry->iokit_acct);
13744 new_entry->use_pmap = TRUE;
13745 }
13746
13747 if (!vm_object_copy_quickly(
13748 VME_OBJECT(new_entry),
13749 VME_OFFSET(old_entry),
13750 (old_entry->vme_end -
13751 old_entry->vme_start),
13752 &src_needs_copy,
13753 &new_entry_needs_copy)) {
13754 vm_map_entry_dispose(new_entry);
13755 goto slow_vm_map_fork_copy;
13756 }
13757
13758 /*
13759 * Handle copy-on-write obligations
13760 */
13761
13762 if (src_needs_copy && !old_entry->needs_copy) {
13763 vm_prot_t prot;
13764
13765 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13766 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13767 __FUNCTION__,
13768 old_map, old_map->pmap, old_entry,
13769 (uint64_t)old_entry->vme_start,
13770 (uint64_t)old_entry->vme_end,
13771 old_entry->protection);
13772 }
13773
13774 prot = old_entry->protection & ~VM_PROT_WRITE;
13775
13776 if (override_nx(old_map, VME_ALIAS(old_entry))
13777 && prot) {
13778 prot |= VM_PROT_EXECUTE;
13779 }
13780
13781 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13782 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13783 __FUNCTION__,
13784 old_map, old_map->pmap, old_entry,
13785 (uint64_t)old_entry->vme_start,
13786 (uint64_t)old_entry->vme_end,
13787 prot);
13788 }
13789
13790 vm_object_pmap_protect(
13791 VME_OBJECT(old_entry),
13792 VME_OFFSET(old_entry),
13793 (old_entry->vme_end -
13794 old_entry->vme_start),
13795 ((old_entry->is_shared
13796 || old_map->mapped_in_other_pmaps)
13797 ? PMAP_NULL :
13798 old_map->pmap),
13799 VM_MAP_PAGE_SIZE(old_map),
13800 old_entry->vme_start,
13801 prot);
13802
13803 assert(old_entry->wired_count == 0);
13804 old_entry->needs_copy = TRUE;
13805 }
13806 new_entry->needs_copy = new_entry_needs_copy;
13807
13808 /*
13809 * Insert the entry at the end
13810 * of the map.
13811 */
13812
13813 vm_map_store_entry_link(new_map,
13814 vm_map_last_entry(new_map),
13815 new_entry,
13816 VM_MAP_KERNEL_FLAGS_NONE);
13817 new_size += entry_size;
13818 break;
13819
13820 slow_vm_map_fork_copy:
13821 vm_map_copyin_flags = VM_MAP_COPYIN_FORK;
13822 if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
13823 vm_map_copyin_flags |=
13824 VM_MAP_COPYIN_PRESERVE_PURGEABLE;
13825 }
13826 if (vm_map_fork_copy(old_map,
13827 &old_entry,
13828 new_map,
13829 vm_map_copyin_flags)) {
13830 new_size += entry_size;
13831 }
13832 continue;
13833 }
13834 old_entry = old_entry->vme_next;
13835 }
13836
13837 #if PMAP_FORK_NEST
13838 new_entry = vm_map_last_entry(new_map);
13839 if (new_entry == vm_map_to_entry(new_map)) {
13840 /* unnest all that was pre-nested */
13841 vm_map_fork_unnest(new_pmap,
13842 pre_nested_start, pre_nested_end,
13843 vm_map_min(new_map), vm_map_max(new_map));
13844 } else if (new_entry->vme_end < vm_map_max(new_map)) {
13845 /* unnest hole at the end, if pre-nested */
13846 vm_map_fork_unnest(new_pmap,
13847 pre_nested_start, pre_nested_end,
13848 new_entry->vme_end, vm_map_max(new_map));
13849 }
13850 #endif /* PMAP_FORK_NEST */
13851
13852 #if defined(__arm64__)
13853 pmap_insert_commpage(new_map->pmap);
13854 #endif /* __arm64__ */
13855
13856 new_map->size = new_size;
13857
13858 if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13859 vm_map_corpse_footprint_collect_done(new_map);
13860 }
13861
13862 /* Propagate JIT entitlement for the pmap layer. */
13863 if (pmap_get_jit_entitled(old_map->pmap)) {
13864 /* Tell the pmap that it supports JIT. */
13865 pmap_set_jit_entitled(new_map->pmap);
13866 }
13867
13868 /* Propagate TPRO settings for the pmap layer */
13869 if (pmap_get_tpro(old_map->pmap)) {
13870 /* Tell the pmap that it supports TPRO */
13871 pmap_set_tpro(new_map->pmap);
13872 }
13873
13874
13875 vm_map_unlock(new_map);
13876 vm_map_unlock(old_map);
13877 vm_map_deallocate(old_map);
13878
13879 return new_map;
13880 }
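/*
 * Editorial sketch (not compiled): how the fork path conceptually uses
 * vm_map_fork(); the ledger and task plumbing below is hypothetical and
 * simplified. Per entry, inheritance decides the outcome: VM_INHERIT_NONE
 * is skipped, VM_INHERIT_SHARE goes through vm_map_fork_share(), and
 * VM_INHERIT_COPY takes the copy-on-write fast path or vm_map_fork_copy().
 */
#if 0
 vm_map_t child_map;

 child_map = vm_map_fork(child_ledger, /* hypothetical ledger */
     parent_task->map, /* hypothetical parent task */
     0); /* no special options */
 if (child_map == VM_MAP_NULL) {
     /* pmap creation failed or an unsupported option was passed */
 }
#endif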
13881
13882 /*
13883 * vm_map_exec:
13884 *
13885 * Setup the "new_map" with the proper execution environment according
13886 * to the type of executable (platform, 64bit, chroot environment).
13887 * Map the comm page and shared region, etc...
13888 */
13889 kern_return_t
13890 vm_map_exec(
13891 vm_map_t new_map,
13892 task_t task,
13893 boolean_t is64bit,
13894 void *fsroot,
13895 cpu_type_t cpu,
13896 cpu_subtype_t cpu_subtype,
13897 boolean_t reslide,
13898 boolean_t is_driverkit,
13899 uint32_t rsr_version)
13900 {
13901 SHARED_REGION_TRACE_DEBUG(
13902 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13903 (void *)VM_KERNEL_ADDRPERM(current_task()),
13904 (void *)VM_KERNEL_ADDRPERM(new_map),
13905 (void *)VM_KERNEL_ADDRPERM(task),
13906 (void *)VM_KERNEL_ADDRPERM(fsroot),
13907 cpu,
13908 cpu_subtype));
13909 (void) vm_commpage_enter(new_map, task, is64bit);
13910
13911 (void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13912
13913 SHARED_REGION_TRACE_DEBUG(
13914 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13915 (void *)VM_KERNEL_ADDRPERM(current_task()),
13916 (void *)VM_KERNEL_ADDRPERM(new_map),
13917 (void *)VM_KERNEL_ADDRPERM(task),
13918 (void *)VM_KERNEL_ADDRPERM(fsroot),
13919 cpu,
13920 cpu_subtype));
13921
13922 /*
13923 * Some devices have region(s) of memory that shouldn't get allocated by
13924 * user processes. The following code creates dummy vm_map_entry_t's for each
13925 * of the regions that need to be reserved to prevent any allocations in
13926 * those regions.
13927 */
13928 kern_return_t kr = KERN_FAILURE;
13929 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
13930 vmk_flags.vmkf_beyond_max = true;
13931
13932 const struct vm_reserved_region *regions = NULL;
13933 size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
13934 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13935
13936 for (size_t i = 0; i < num_regions; ++i) {
13937 vm_map_offset_t address = regions[i].vmrr_addr;
13938
13939 kr = vm_map_enter(
13940 new_map,
13941 &address,
13942 regions[i].vmrr_size,
13943 (vm_map_offset_t)0,
13944 vmk_flags,
13945 VM_OBJECT_NULL,
13946 (vm_object_offset_t)0,
13947 FALSE,
13948 VM_PROT_NONE,
13949 VM_PROT_NONE,
13950 VM_INHERIT_COPY);
13951
13952 if (kr != KERN_SUCCESS) {
13953 os_log_error(OS_LOG_DEFAULT, "Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
13954 return KERN_FAILURE;
13955 }
13956 }
13957
13958 new_map->reserved_regions = (num_regions ? TRUE : FALSE);
13959
13960 return KERN_SUCCESS;
13961 }
13962
13963 uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
13964 uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
13965 uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
13966 uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
13967 uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
13968 uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
13969 uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
13970 uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
13971 uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
13972 uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
13973 uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
13974 uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
13975 uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13976 /*
13977 * vm_map_lookup_and_lock_object:
13978 *
13979 * Finds the VM object, offset, and
13980 * protection for a given virtual address in the
13981 * specified map, assuming a page fault of the
13982 * type specified.
13983 *
13984 * Returns the (object, offset, protection) for
13985 * this address, whether it is wired down, and whether
13986 * this map has the only reference to the data in question.
13987 * In order to later verify this lookup, a "version"
13988 * is returned.
13989 * If contended != NULL, *contended will be set to
13990 * true iff the thread had to spin or block to acquire
13991 * an exclusive lock.
13992 *
13993 * The map MUST be locked by the caller and WILL be
13994 * locked on exit. In order to guarantee the
13995 * existence of the returned object, it is returned
13996 * locked.
13997 *
13998 * If a lookup is requested with "write protection"
13999 * specified, the map may be changed to perform virtual
14000 * copying operations, although the data referenced will
14001 * remain the same.
14002 *
14003 * If fault_info is provided, then the information is
14004 * initialized according to the properties of the map entry
14005 * NB: only properties of the entry are initialized,
14006 * namely:
14007 * - user_tag
14008 * - pmap_options
14009 * - iokit_acct
14010 * - behavior
14011 * - lo_offset
14012 * - hi_offset
14013 * - no_cache
14014 * - cs_bypass
14015 * - csm_associated
14016 * - resilient_media
14017 * - vme_xnu_user_debug
14018 * - vme_no_copy_on_read
14019 * - used_for_tpro
14020 */
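/*
 * Editorial sketch (not compiled): the typical fault-path calling pattern,
 * heavily simplified. "map" and "vaddr" are hypothetical locals, and the
 * object lock type constant is assumed from the fault path. The caller
 * enters with the map read-locked; on success the object comes back locked,
 * the map is still locked, and "version" can later be used to detect
 * whether the map changed while the fault was being serviced.
 */
#if 0
 vm_map_version_t version;
 vm_object_t obj;
 vm_object_offset_t off;
 vm_prot_t prot;
 boolean_t wired;
 vm_map_t real_map;
 kern_return_t kr;

 vm_map_lock_read(map);
 kr = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
     OBJECT_LOCK_EXCLUSIVE, &version, &obj, &off, &prot, &wired,
     NULL /* fault_info */, &real_map, NULL /* contended */);
 if (kr == KERN_SUCCESS) {
     /* fault handling proceeds with "obj" locked */
 }
#endif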
14021 kern_return_t
14022 vm_map_lookup_and_lock_object(
14023 vm_map_t *var_map, /* IN/OUT */
14024 vm_map_offset_t vaddr,
14025 vm_prot_t fault_type,
14026 int object_lock_type,
14027 vm_map_version_t *out_version, /* OUT */
14028 vm_object_t *object, /* OUT */
14029 vm_object_offset_t *offset, /* OUT */
14030 vm_prot_t *out_prot, /* OUT */
14031 boolean_t *wired, /* OUT */
14032 vm_object_fault_info_t fault_info, /* OUT */
14033 vm_map_t *real_map, /* OUT */
14034 bool *contended) /* OUT */
14035 {
14036 vm_map_entry_t entry;
14037 vm_map_t map = *var_map;
14038 vm_map_t old_map = *var_map;
14039 vm_map_t cow_sub_map_parent = VM_MAP_NULL;
14040 vm_map_offset_t cow_parent_vaddr = 0;
14041 vm_map_offset_t old_start = 0;
14042 vm_map_offset_t old_end = 0;
14043 vm_prot_t prot;
14044 boolean_t mask_protections;
14045 boolean_t force_copy;
14046 boolean_t no_force_copy_if_executable;
14047 boolean_t submap_needed_copy;
14048 vm_prot_t original_fault_type;
14049 vm_map_size_t fault_page_mask;
14050
14051 /*
14052 * VM_PROT_IS_MASK means that the caller wants us to use "fault_type"
14053 * as a mask against the mapping's actual protections, not as an
14054 * absolute value.
14055 */
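 /*
  * Editorial example: with VM_PROT_IS_MASK set, a request for
  * (VM_PROT_READ | VM_PROT_WRITE) against a read-only mapping is masked
  * down below to a plain read fault instead of failing;
  * KERN_PROTECTION_FAILURE is only returned if nothing at all is permitted.
  */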
14056 mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
14057 force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
14058 no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
14059 fault_type &= VM_PROT_ALL;
14060 original_fault_type = fault_type;
14061 if (contended) {
14062 *contended = false;
14063 }
14064
14065 *real_map = map;
14066
14067 fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
14068 vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
14069
14070 RetryLookup:
14071 fault_type = original_fault_type;
14072
14073 /*
14074 * If the map has an interesting hint, try it before calling
14075 * full blown lookup routine.
14076 */
14077 entry = map->hint;
14078
14079 if ((entry == vm_map_to_entry(map)) ||
14080 (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
14081 vm_map_entry_t tmp_entry;
14082
14083 /*
14084 * Entry was either not a valid hint, or the vaddr
14085 * was not contained in the entry, so do a full lookup.
14086 */
14087 if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
14088 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14089 vm_map_unlock(cow_sub_map_parent);
14090 }
14091 if ((*real_map != map)
14092 && (*real_map != cow_sub_map_parent)) {
14093 vm_map_unlock(*real_map);
14094 }
14095 return KERN_INVALID_ADDRESS;
14096 }
14097
14098 entry = tmp_entry;
14099 }
14100 if (map == old_map) {
14101 old_start = entry->vme_start;
14102 old_end = entry->vme_end;
14103 }
14104
14105 /*
14106 * Handle submaps. Drop lock on upper map, submap is
14107 * returned locked.
14108 */
14109
14110 submap_needed_copy = FALSE;
14111 submap_recurse:
14112 if (entry->is_sub_map) {
14113 vm_map_offset_t local_vaddr;
14114 vm_map_offset_t end_delta;
14115 vm_map_offset_t start_delta;
14116 vm_map_offset_t top_entry_saved_start;
14117 vm_object_offset_t top_entry_saved_offset;
14118 vm_map_entry_t submap_entry, saved_submap_entry;
14119 vm_object_offset_t submap_entry_offset;
14120 vm_object_size_t submap_entry_size;
14121 vm_prot_t subentry_protection;
14122 vm_prot_t subentry_max_protection;
14123 boolean_t subentry_no_copy_on_read;
14124 boolean_t subentry_permanent;
14125 boolean_t subentry_csm_associated;
14126 #if __arm64e__
14127 boolean_t subentry_used_for_tpro;
14128 #endif /* __arm64e__ */
14129 boolean_t mapped_needs_copy = FALSE;
14130 vm_map_version_t version;
14131
14132 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
14133 "map %p (%d) entry %p submap %p (%d)\n",
14134 map, VM_MAP_PAGE_SHIFT(map), entry,
14135 VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
14136
14137 local_vaddr = vaddr;
14138 top_entry_saved_start = entry->vme_start;
14139 top_entry_saved_offset = VME_OFFSET(entry);
14140
14141 if ((entry->use_pmap &&
14142 !((fault_type & VM_PROT_WRITE) ||
14143 force_copy))) {
14144 /* if real_map equals map we unlock below */
14145 if ((*real_map != map) &&
14146 (*real_map != cow_sub_map_parent)) {
14147 vm_map_unlock(*real_map);
14148 }
14149 *real_map = VME_SUBMAP(entry);
14150 }
14151
14152 if (entry->needs_copy &&
14153 ((fault_type & VM_PROT_WRITE) ||
14154 force_copy)) {
14155 if (!mapped_needs_copy) {
14156 if (vm_map_lock_read_to_write(map)) {
14157 vm_map_lock_read(map);
14158 *real_map = map;
14159 goto RetryLookup;
14160 }
14161 vm_map_lock_read(VME_SUBMAP(entry));
14162 *var_map = VME_SUBMAP(entry);
14163 cow_sub_map_parent = map;
14164 /* reset base to map before cow object */
14165 /* this is the map which will accept */
14166 /* the new cow object */
14167 old_start = entry->vme_start;
14168 old_end = entry->vme_end;
14169 cow_parent_vaddr = vaddr;
14170 mapped_needs_copy = TRUE;
14171 } else {
14172 vm_map_lock_read(VME_SUBMAP(entry));
14173 *var_map = VME_SUBMAP(entry);
14174 if ((cow_sub_map_parent != map) &&
14175 (*real_map != map)) {
14176 vm_map_unlock(map);
14177 }
14178 }
14179 } else {
14180 if (entry->needs_copy) {
14181 submap_needed_copy = TRUE;
14182 }
14183 vm_map_lock_read(VME_SUBMAP(entry));
14184 *var_map = VME_SUBMAP(entry);
14185 /* leave map locked if it is a target */
14186 /* cow sub_map above; otherwise, just */
14187 /* follow the maps down to the object. */
14188 /* Here we unlock knowing we are not */
14189 /* revisiting the map. */
14190 if ((*real_map != map) && (map != cow_sub_map_parent)) {
14191 vm_map_unlock_read(map);
14192 }
14193 }
14194
14195 entry = NULL;
14196 map = *var_map;
14197
14198 /* calculate the offset in the submap for vaddr */
14199 local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
14200 assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
14201 "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
14202 (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);
14203
14204 RetrySubMap:
14205 if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
14206 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14207 vm_map_unlock(cow_sub_map_parent);
14208 }
14209 if ((*real_map != map)
14210 && (*real_map != cow_sub_map_parent)) {
14211 vm_map_unlock(*real_map);
14212 }
14213 *real_map = map;
14214 return KERN_INVALID_ADDRESS;
14215 }
14216
14217 /* find the attenuated shadow of the underlying object */
14218 /* on our target map */
14219
14220 /* In plain English: the submap object may extend beyond the */
14221 /* region mapped by the entry, or may only fill a portion */
14222 /* of it. For our purposes, we only care if the object */
14223 /* doesn't fill. In this case the area which will */
14224 /* ultimately be clipped in the top map will only need */
14225 /* to be as big as the portion of the underlying entry */
14226 /* which is mapped */
14227 start_delta = submap_entry->vme_start > top_entry_saved_offset ?
14228 submap_entry->vme_start - top_entry_saved_offset : 0;
14229
14230 end_delta =
14231 (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
14232 submap_entry->vme_end ?
14233 0 : (top_entry_saved_offset +
14234 (old_end - old_start))
14235 - submap_entry->vme_end;
14236
14237 old_start += start_delta;
14238 old_end -= end_delta;
14239
14240 if (submap_entry->is_sub_map) {
14241 entry = submap_entry;
14242 vaddr = local_vaddr;
14243 goto submap_recurse;
14244 }
14245
14246 if (((fault_type & VM_PROT_WRITE) ||
14247 force_copy)
14248 && cow_sub_map_parent) {
14249 vm_object_t sub_object, copy_object;
14250 vm_object_offset_t copy_offset;
14251 vm_map_offset_t local_start;
14252 vm_map_offset_t local_end;
14253 boolean_t object_copied = FALSE;
14254 vm_object_offset_t object_copied_offset = 0;
14255 boolean_t object_copied_needs_copy = FALSE;
14256 kern_return_t kr = KERN_SUCCESS;
14257
14258 if (vm_map_lock_read_to_write(map)) {
14259 vm_map_lock_read(map);
14260 old_start -= start_delta;
14261 old_end += end_delta;
14262 goto RetrySubMap;
14263 }
14264
14265
14266 sub_object = VME_OBJECT(submap_entry);
14267 if (sub_object == VM_OBJECT_NULL) {
14268 sub_object =
14269 vm_object_allocate(
14270 (vm_map_size_t)
14271 (submap_entry->vme_end -
14272 submap_entry->vme_start));
14273 VME_OBJECT_SET(submap_entry, sub_object, false, 0);
14274 VME_OFFSET_SET(submap_entry, 0);
14275 assert(!submap_entry->is_sub_map);
14276 assert(submap_entry->use_pmap);
14277 }
14278 local_start = local_vaddr -
14279 (cow_parent_vaddr - old_start);
14280 local_end = local_vaddr +
14281 (old_end - cow_parent_vaddr);
14282 vm_map_clip_start(map, submap_entry, local_start);
14283 vm_map_clip_end(map, submap_entry, local_end);
14284 if (submap_entry->is_sub_map) {
14285 /* unnesting was done when clipping */
14286 assert(!submap_entry->use_pmap);
14287 }
14288
14289 /* This is the COW case, let's connect */
14290 /* an entry in our space to the underlying */
14291 /* object in the submap, bypassing the */
14292 /* submap. */
14293 submap_entry_offset = VME_OFFSET(submap_entry);
14294 submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
14295
14296 if ((submap_entry->wired_count != 0 ||
14297 sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
14298 (submap_entry->protection & VM_PROT_EXECUTE) &&
14299 no_force_copy_if_executable) {
14300 // printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
14301 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14302 vm_map_unlock(cow_sub_map_parent);
14303 }
14304 if ((*real_map != map)
14305 && (*real_map != cow_sub_map_parent)) {
14306 vm_map_unlock(*real_map);
14307 }
14308 *real_map = map;
14309 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
14310 vm_map_lock_write_to_read(map);
14311 kr = KERN_PROTECTION_FAILURE;
14312 DTRACE_VM4(submap_no_copy_executable,
14313 vm_map_t, map,
14314 vm_object_offset_t, submap_entry_offset,
14315 vm_object_size_t, submap_entry_size,
14316 int, kr);
14317 return kr;
14318 }
14319
14320 if (submap_entry->wired_count != 0) {
14321 vm_object_reference(sub_object);
14322
14323 assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
14324 "submap_entry %p offset 0x%llx\n",
14325 submap_entry, VME_OFFSET(submap_entry));
14326
14327 DTRACE_VM6(submap_copy_slowly,
14328 vm_map_t, cow_sub_map_parent,
14329 vm_map_offset_t, vaddr,
14330 vm_map_t, map,
14331 vm_object_size_t, submap_entry_size,
14332 int, submap_entry->wired_count,
14333 int, sub_object->copy_strategy);
14334
14335 saved_submap_entry = submap_entry;
14336 version.main_timestamp = map->timestamp;
14337 vm_map_unlock(map); /* Increments timestamp by 1 */
14338 submap_entry = VM_MAP_ENTRY_NULL;
14339
14340 vm_object_lock(sub_object);
14341 kr = vm_object_copy_slowly(sub_object,
14342 submap_entry_offset,
14343 submap_entry_size,
14344 FALSE, /* interruptible */
14345 &copy_object);
14346 object_copied = TRUE;
14347 object_copied_offset = 0;
14348 /* 4k: account for extra offset in physical page */
14349 object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
14350 object_copied_needs_copy = FALSE;
14351 vm_object_deallocate(sub_object);
14352
14353 vm_map_lock(map);
14354
14355 if (kr != KERN_SUCCESS &&
14356 kr != KERN_MEMORY_RESTART_COPY) {
14357 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14358 vm_map_unlock(cow_sub_map_parent);
14359 }
14360 if ((*real_map != map)
14361 && (*real_map != cow_sub_map_parent)) {
14362 vm_map_unlock(*real_map);
14363 }
14364 *real_map = map;
14365 vm_object_deallocate(copy_object);
14366 copy_object = VM_OBJECT_NULL;
14367 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
14368 vm_map_lock_write_to_read(map);
14369 DTRACE_VM4(submap_copy_error_slowly,
14370 vm_object_t, sub_object,
14371 vm_object_offset_t, submap_entry_offset,
14372 vm_object_size_t, submap_entry_size,
14373 int, kr);
14374 vm_map_lookup_and_lock_object_copy_slowly_error++;
14375 return kr;
14376 }
14377
14378 if ((kr == KERN_SUCCESS) &&
14379 (version.main_timestamp + 1) == map->timestamp) {
14380 submap_entry = saved_submap_entry;
14381 } else {
14382 saved_submap_entry = NULL;
14383 old_start -= start_delta;
14384 old_end += end_delta;
14385 vm_object_deallocate(copy_object);
14386 copy_object = VM_OBJECT_NULL;
14387 vm_map_lock_write_to_read(map);
14388 vm_map_lookup_and_lock_object_copy_slowly_restart++;
14389 goto RetrySubMap;
14390 }
14391 vm_map_lookup_and_lock_object_copy_slowly_count++;
14392 vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
14393 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
14394 vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
14395 }
14396 } else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
14397 submap_entry_offset = VME_OFFSET(submap_entry);
14398 copy_object = VM_OBJECT_NULL;
14399 object_copied_offset = submap_entry_offset;
14400 object_copied_needs_copy = FALSE;
14401 DTRACE_VM6(submap_copy_strategically,
14402 vm_map_t, cow_sub_map_parent,
14403 vm_map_offset_t, vaddr,
14404 vm_map_t, map,
14405 vm_object_size_t, submap_entry_size,
14406 int, submap_entry->wired_count,
14407 int, sub_object->copy_strategy);
14408 kr = vm_object_copy_strategically(
14409 sub_object,
14410 submap_entry_offset,
14411 submap_entry->vme_end - submap_entry->vme_start,
14412 false, /* forking */
14413 &copy_object,
14414 &object_copied_offset,
14415 &object_copied_needs_copy);
14416 if (kr == KERN_MEMORY_RESTART_COPY) {
14417 old_start -= start_delta;
14418 old_end += end_delta;
14419 vm_object_deallocate(copy_object);
14420 copy_object = VM_OBJECT_NULL;
14421 vm_map_lock_write_to_read(map);
14422 vm_map_lookup_and_lock_object_copy_strategically_restart++;
14423 goto RetrySubMap;
14424 }
14425 if (kr != KERN_SUCCESS) {
14426 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14427 vm_map_unlock(cow_sub_map_parent);
14428 }
14429 if ((*real_map != map)
14430 && (*real_map != cow_sub_map_parent)) {
14431 vm_map_unlock(*real_map);
14432 }
14433 *real_map = map;
14434 vm_object_deallocate(copy_object);
14435 copy_object = VM_OBJECT_NULL;
14436 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
14437 vm_map_lock_write_to_read(map);
14438 DTRACE_VM4(submap_copy_error_strategically,
14439 vm_object_t, sub_object,
14440 vm_object_offset_t, submap_entry_offset,
14441 vm_object_size_t, submap_entry_size,
14442 int, kr);
14443 vm_map_lookup_and_lock_object_copy_strategically_error++;
14444 return kr;
14445 }
14446 assert(copy_object != VM_OBJECT_NULL);
14447 assert(copy_object != sub_object);
14448 object_copied = TRUE;
14449 vm_map_lookup_and_lock_object_copy_strategically_count++;
14450 vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
14451 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
14452 vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
14453 }
14454 } else {
14455 /* set up shadow object */
14456 object_copied = FALSE;
14457 copy_object = sub_object;
14458 vm_object_lock(sub_object);
14459 vm_object_reference_locked(sub_object);
14460 VM_OBJECT_SET_SHADOWED(sub_object, TRUE);
14461 vm_object_unlock(sub_object);
14462
14463 assert(submap_entry->wired_count == 0);
14464 submap_entry->needs_copy = TRUE;
14465
14466 prot = submap_entry->protection;
14467 if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14468 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14469 __FUNCTION__,
14470 map, map->pmap, submap_entry,
14471 (uint64_t)submap_entry->vme_start,
14472 (uint64_t)submap_entry->vme_end,
14473 prot);
14474 }
14475 prot = prot & ~VM_PROT_WRITE;
14476 if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14477 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14478 __FUNCTION__,
14479 map, map->pmap, submap_entry,
14480 (uint64_t)submap_entry->vme_start,
14481 (uint64_t)submap_entry->vme_end,
14482 prot);
14483 }
14484
14485 if (override_nx(old_map,
14486 VME_ALIAS(submap_entry))
14487 && prot) {
14488 prot |= VM_PROT_EXECUTE;
14489 }
14490
14491 vm_object_pmap_protect(
14492 sub_object,
14493 VME_OFFSET(submap_entry),
14494 submap_entry->vme_end -
14495 submap_entry->vme_start,
14496 (submap_entry->is_shared
14497 || map->mapped_in_other_pmaps) ?
14498 PMAP_NULL : map->pmap,
14499 VM_MAP_PAGE_SIZE(map),
14500 submap_entry->vme_start,
14501 prot);
14502 vm_map_lookup_and_lock_object_copy_shadow_count++;
14503 vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
14504 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
14505 vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
14506 }
14507 }
14508
14509 /*
14510 * Adjust the fault offset to the submap entry.
14511 */
14512 copy_offset = (local_vaddr -
14513 submap_entry->vme_start +
14514 VME_OFFSET(submap_entry));
14515
14516 /* This works differently from the */
14517 /* normal submap case. We go back */
14518 /* to the parent of the cow map and */
14519 /* clip out the target portion of */
14520 /* the sub_map, substituting the */
14521 /* new copy object. */
14522
14523 subentry_protection = submap_entry->protection;
14524 subentry_max_protection = submap_entry->max_protection;
14525 subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
14526 subentry_permanent = submap_entry->vme_permanent;
14527 subentry_csm_associated = submap_entry->csm_associated;
14528 #if __arm64e__
14529 subentry_used_for_tpro = submap_entry->used_for_tpro;
14530 #endif // __arm64e__
14531 vm_map_unlock(map);
14532 submap_entry = NULL; /* not valid after map unlock */
14533
14534 local_start = old_start;
14535 local_end = old_end;
14536 map = cow_sub_map_parent;
14537 *var_map = cow_sub_map_parent;
14538 vaddr = cow_parent_vaddr;
14539 cow_sub_map_parent = NULL;
14540
14541 if (!vm_map_lookup_entry(map,
14542 vaddr, &entry)) {
14543 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14544 vm_map_unlock(cow_sub_map_parent);
14545 }
14546 if ((*real_map != map)
14547 && (*real_map != cow_sub_map_parent)) {
14548 vm_map_unlock(*real_map);
14549 }
14550 *real_map = map;
14551 vm_object_deallocate(
14552 copy_object);
14553 copy_object = VM_OBJECT_NULL;
14554 vm_map_lock_write_to_read(map);
14555 DTRACE_VM4(submap_lookup_post_unlock,
14556 uint64_t, (uint64_t)entry->vme_start,
14557 uint64_t, (uint64_t)entry->vme_end,
14558 vm_map_offset_t, vaddr,
14559 int, object_copied);
14560 return KERN_INVALID_ADDRESS;
14561 }
14562
14563 /* clip out the portion of space */
14564 /* mapped by the sub map which */
14565 /* corresponds to the underlying */
14566 /* object */
14567
14568 /*
14569 * Clip (and unnest) the smallest nested chunk
14570 * possible around the faulting address...
14571 */
14572 local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14573 local_end = local_start + pmap_shared_region_size_min(map->pmap);
14574 /*
14575 * ... but don't go beyond the "old_start" to "old_end"
14576 * range, to avoid spanning over another VM region
14577 * with a possibly different VM object and/or offset.
14578 */
14579 if (local_start < old_start) {
14580 local_start = old_start;
14581 }
14582 if (local_end > old_end) {
14583 local_end = old_end;
14584 }
14585 /*
14586 * Adjust copy_offset to the start of the range.
14587 */
14588 copy_offset -= (vaddr - local_start);
14589
14590 vm_map_clip_start(map, entry, local_start);
14591 vm_map_clip_end(map, entry, local_end);
14592 if (entry->is_sub_map) {
14593 /* unnesting was done when clipping */
14594 assert(!entry->use_pmap);
14595 }
14596
14597 /* substitute copy object for */
14598 /* shared map entry */
14599 vm_map_deallocate(VME_SUBMAP(entry));
14600 assert(!entry->iokit_acct);
14601 entry->use_pmap = TRUE;
14602 VME_OBJECT_SET(entry, copy_object, false, 0);
14603
14604 /* propagate the submap entry's protections */
14605 if (entry->protection != VM_PROT_READ) {
14606 /*
14607 * Someone has already altered the top entry's
14608 * protections via vm_protect(VM_PROT_COPY).
14609 * Respect these new values and ignore the
14610 * submap entry's protections.
14611 */
14612 } else {
14613 /*
14614 * Regular copy-on-write: propagate the submap
14615 * entry's protections to the top map entry.
14616 */
14617 entry->protection |= subentry_protection;
14618 }
14619 entry->max_protection |= subentry_max_protection;
14620 /* propagate some attributes from subentry */
14621 entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14622 entry->vme_permanent = subentry_permanent;
14623 entry->csm_associated = subentry_csm_associated;
14624 #if __arm64e__
14625 /* propagate TPRO iff the destination map has TPRO enabled */
14626 if (subentry_used_for_tpro) {
14627 if (vm_map_tpro(map)) {
14628 entry->used_for_tpro = subentry_used_for_tpro;
14629 } else {
14630 /* "permanent" came from being TPRO */
14631 entry->vme_permanent = FALSE;
14632 }
14633 }
14634 #endif /* __arm64e__ */
14635 if ((entry->protection & VM_PROT_WRITE) &&
14636 (entry->protection & VM_PROT_EXECUTE) &&
14637 #if XNU_TARGET_OS_OSX
14638 map->pmap != kernel_pmap &&
14639 (vm_map_cs_enforcement(map)
14640 #if __arm64__
14641 || !VM_MAP_IS_EXOTIC(map)
14642 #endif /* __arm64__ */
14643 ) &&
14644 #endif /* XNU_TARGET_OS_OSX */
14645 #if CODE_SIGNING_MONITOR
14646 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
14647 #endif
14648 !(entry->used_for_jit) &&
14649 VM_MAP_POLICY_WX_STRIP_X(map)) {
14650 DTRACE_VM3(cs_wx,
14651 uint64_t, (uint64_t)entry->vme_start,
14652 uint64_t, (uint64_t)entry->vme_end,
14653 vm_prot_t, entry->protection);
14654 printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
14655 proc_selfpid(),
14656 (get_bsdtask_info(current_task())
14657 ? proc_name_address(get_bsdtask_info(current_task()))
14658 : "?"),
14659 __FUNCTION__, __LINE__,
14660 #if DEVELOPMENT || DEBUG
14661 (uint64_t)entry->vme_start,
14662 (uint64_t)entry->vme_end,
14663 #else /* DEVELOPMENT || DEBUG */
14664 (uint64_t)0,
14665 (uint64_t)0,
14666 #endif /* DEVELOPMENT || DEBUG */
14667 entry->protection);
14668 entry->protection &= ~VM_PROT_EXECUTE;
14669 }
14670
14671 if (object_copied) {
14672 VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14673 entry->needs_copy = object_copied_needs_copy;
14674 entry->is_shared = FALSE;
14675 } else {
14676 assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14677 assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14678 assert(entry->wired_count == 0);
14679 VME_OFFSET_SET(entry, copy_offset);
14680 entry->needs_copy = TRUE;
14681 if (map != old_map) {
14682 entry->is_shared = TRUE;
14683 }
14684 }
14685 if (entry->inheritance == VM_INHERIT_SHARE) {
14686 entry->inheritance = VM_INHERIT_COPY;
14687 }
14688
14689 vm_map_lock_write_to_read(map);
14690 } else {
14691 if ((cow_sub_map_parent)
14692 && (cow_sub_map_parent != *real_map)
14693 && (cow_sub_map_parent != map)) {
14694 vm_map_unlock(cow_sub_map_parent);
14695 }
14696 entry = submap_entry;
14697 vaddr = local_vaddr;
14698 }
14699 }
14700
14701 /*
14702 * Check whether this task is allowed to have
14703 * this page.
14704 */
14705
14706 prot = entry->protection;
14707
14708 if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14709 /*
14710 * HACK -- if not a stack, then allow execution
14711 */
14712 prot |= VM_PROT_EXECUTE;
14713 }
14714
14715 #if __arm64e__
14716 /*
14717 * If the entry we're dealing with is TPRO and we have a write
14718 * fault, inject VM_PROT_WRITE into protections. This allows us
14719 * to maintain RO permissions when not marked as TPRO.
14720 */
14721 if (entry->used_for_tpro && (fault_type & VM_PROT_WRITE)) {
14722 prot |= VM_PROT_WRITE;
14723 }
14724 #endif /* __arm64e__ */
14725 if (mask_protections) {
14726 fault_type &= prot;
14727 if (fault_type == VM_PROT_NONE) {
14728 goto protection_failure;
14729 }
14730 }
14731 if (((fault_type & prot) != fault_type)
14732 #if __arm64__
14733 /* prefetch abort in execute-only page */
14734 && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14735 #elif defined(__x86_64__)
14736 /* Consider the UEXEC bit when handling an EXECUTE fault */
14737 && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14738 #endif
14739 ) {
14740 protection_failure:
14741 if (*real_map != map) {
14742 vm_map_unlock(*real_map);
14743 }
14744 *real_map = map;
14745
14746 if ((fault_type & VM_PROT_EXECUTE) && prot) {
14747 log_stack_execution_failure((addr64_t)vaddr, prot);
14748 }
14749
14750 DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14751 DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14752 /*
14753 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14754 *
14755 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14756 */
14757 return KERN_PROTECTION_FAILURE;
14758 }
14759
14760 /*
14761 * If this page is not pageable, we have to get
14762 * it for all possible accesses.
14763 */
14764
14765 *wired = (entry->wired_count != 0);
14766 if (*wired) {
14767 fault_type = prot;
14768 }
14769
14770 /*
14771 * If the entry was copy-on-write, we either make a private copy now or demote the allowed access, depending on the fault.
14772 */
14773
14774 if (entry->needs_copy) {
14775 /*
14776 * If we want to write the page, we may as well
14777 * handle that now since we've got the map locked.
14778 *
14779 * If we don't need to write the page, we just
14780 * demote the permissions allowed.
14781 */
14782
14783 if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14784 /*
14785 * Make a new object, and place it in the
14786 * object chain. Note that no new references
14787 * have appeared -- one just moved from the
14788 * map to the new object.
14789 */
14790
14791 if (vm_map_lock_read_to_write(map)) {
14792 vm_map_lock_read(map);
14793 goto RetryLookup;
14794 }
14795
14796 if (VME_OBJECT(entry)->shadowed == FALSE) {
14797 vm_object_lock(VME_OBJECT(entry));
14798 VM_OBJECT_SET_SHADOWED(VME_OBJECT(entry), TRUE);
14799 vm_object_unlock(VME_OBJECT(entry));
14800 }
14801 VME_OBJECT_SHADOW(entry,
14802 (vm_map_size_t) (entry->vme_end -
14803 entry->vme_start),
14804 vm_map_always_shadow(map));
14805 entry->needs_copy = FALSE;
14806
14807 vm_map_lock_write_to_read(map);
14808 }
14809 if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14810 /*
14811 * We're attempting to read a copy-on-write
14812 * page -- don't allow writes.
14813 */
14814
14815 prot &= (~VM_PROT_WRITE);
14816 }
14817 }
14818
14819 if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14820 /*
14821 * We went through a "needs_copy" submap without triggering
14822 * a copy, so granting write access to the page would bypass
14823 * that submap's "needs_copy".
14824 */
14825 assert(!(fault_type & VM_PROT_WRITE));
14826 assert(!*wired);
14827 assert(!force_copy);
14828 // printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14829 prot &= ~VM_PROT_WRITE;
14830 }
14831
14832 /*
14833 * Create an object if necessary.
14834 */
14835 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14836 if (vm_map_lock_read_to_write(map)) {
14837 vm_map_lock_read(map);
14838 goto RetryLookup;
14839 }
14840
14841 VME_OBJECT_SET(entry,
14842 vm_object_allocate(
14843 (vm_map_size_t)(entry->vme_end -
14844 entry->vme_start)), false, 0);
14845 VME_OFFSET_SET(entry, 0);
14846 assert(entry->use_pmap);
14847 vm_map_lock_write_to_read(map);
14848 }
14849
14850 /*
14851 * Return the object/offset from this entry. If the entry
14852 * was copy-on-write or empty, it has been fixed up. Also
14853 * return the protection.
14854 */
14855
14856 *offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14857 *object = VME_OBJECT(entry);
14858 *out_prot = prot;
14859 KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14860
14861 if (fault_info) {
14862 /*
14863 * Initialize fault information according to the entry being faulted
14864 * from.
14865 */
14866 fault_info->user_tag = VME_ALIAS(entry);
14867 fault_info->pmap_options = 0;
14868 if (entry->iokit_acct ||
14869 (!entry->is_sub_map && !entry->use_pmap)) {
14870 fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14871 }
14872 if (fault_info->behavior == VM_BEHAVIOR_DEFAULT) {
14873 fault_info->behavior = entry->behavior;
14874 }
14875 fault_info->lo_offset = VME_OFFSET(entry);
14876 fault_info->hi_offset =
14877 (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14878 fault_info->no_cache = entry->no_cache;
14879 fault_info->io_sync = FALSE;
14880 fault_info->cs_bypass = (entry->used_for_jit ||
14881 #if CODE_SIGNING_MONITOR
14882 (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
14883 #endif
14884 entry->vme_resilient_codesign);
14885 fault_info->mark_zf_absent = FALSE;
14886 fault_info->batch_pmap_op = FALSE;
14887 /*
14888 * The pmap layer will validate this page
14889 * before allowing it to be executed from.
14890 */
14891 #if CODE_SIGNING_MONITOR
14892 fault_info->csm_associated = entry->csm_associated;
14893 #else
14894 fault_info->csm_associated = FALSE;
14895 #endif
14896
14897 fault_info->resilient_media = entry->vme_resilient_media;
14898 fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
14899 fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14900 #if __arm64e__
14901 fault_info->fi_used_for_tpro = entry->used_for_tpro;
14902 #else /* __arm64e__ */
14903 fault_info->fi_used_for_tpro = FALSE;
14904 #endif
14905 if (entry->translated_allow_execute) {
14906 fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14907 }
14908 }
14909
14910 /*
14911 * Lock the object to prevent it from disappearing
14912 */
14913 if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14914 if (contended == NULL) {
14915 vm_object_lock(*object);
14916 } else {
14917 *contended = vm_object_lock_check_contended(*object);
14918 }
14919 } else {
14920 vm_object_lock_shared(*object);
14921 }
14922
14923 /*
14924 * Save the version number
14925 */
14926
14927 out_version->main_timestamp = map->timestamp;
14928
14929 return KERN_SUCCESS;
14930 }
14931
14932
14933 /*
14934 * vm_map_verify:
14935 *
14936 * Verifies that the map in question has not changed
14937 * since the given version. The map has to be locked
14938 * ("shared" mode is fine) before calling this function
14939 * and it will be returned locked too.
14940 */
14941 boolean_t
14942 vm_map_verify(
14943 vm_map_t map,
14944 vm_map_version_t *version) /* REF */
14945 {
14946 boolean_t result;
14947
14948 vm_map_lock_assert_held(map);
14949 result = (map->timestamp == version->main_timestamp);
14950
14951 return result;
14952 }
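/*
 * Hedged usage sketch (an illustration of the protocol, not a verbatim
 * copy of any caller in this tree): the lookup routine above records the
 * map timestamp in a vm_map_version_t; callers that drop the map lock
 * while resolving a fault use vm_map_verify() to detect concurrent map
 * changes and retry:
 *
 *	vm_map_version_t version;
 *
 *	// the lookup fills "version", "object", "offset", "prot", ... and
 *	// leaves the map read-locked
 *	vm_map_unlock_read(map);        // drop the lock while faulting
 *	// ... resolve the fault against object and offset ...
 *	vm_map_lock_read(map);
 *	if (!vm_map_verify(map, &version)) {
 *		// the map changed while unlocked: redo the lookup
 *	}
 */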
14953
14954
14955 /*
14956 * TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
14957 * Goes away after regular vm_region_recurse function migrates to
14958 * 64 bits
14959 * vm_region_recurse: A form of vm_region which follows the
14960 * submaps in a target map
14961 *
14962 */
14963
14964 kern_return_t
14965 vm_map_region_recurse_64(
14966 vm_map_t map,
14967 vm_map_offset_ut *address_u, /* IN/OUT */
14968 vm_map_size_ut *size_u, /* OUT */
14969 natural_t *nesting_depth, /* IN/OUT */
14970 vm_region_submap_info_64_t submap_info, /* IN/OUT */
14971 mach_msg_type_number_t *count) /* IN/OUT */
14972 {
14973 mach_msg_type_number_t original_count;
14974 vm_region_extended_info_data_t extended;
14975 vm_map_entry_t tmp_entry;
14976 vm_map_offset_t user_address;
14977 unsigned int user_max_depth;
14978
14979 /*
14980 * "curr_entry" is the VM map entry preceding or including the
14981 * address we're looking for.
14982 * "curr_map" is the map or sub-map containing "curr_entry".
14983 * "curr_address" is the equivalent of the top map's "user_address"
14984 * in the current map.
14985 * "curr_offset" is the cumulated offset of "curr_map" in the
14986 * target task's address space.
14987 * "curr_depth" is the depth of "curr_map" in the chain of
14988 * sub-maps.
14989 *
14990 * "curr_max_below" and "curr_max_above" limit the range (around
14991 * "curr_address") we should take into account in the current (sub)map.
14992 * They limit the range to what's visible through the map entries
14993 * we've traversed from the top map to the current map.
14994 *
14995 */
14996 vm_map_entry_t curr_entry;
14997 vm_map_t curr_entry_submap;
14998 vm_map_address_t curr_entry_start;
14999 vm_object_offset_t curr_entry_offset;
15000 vm_map_address_t curr_address;
15001 vm_map_offset_t curr_offset;
15002 vm_map_t curr_map;
15003 unsigned int curr_depth;
15004 vm_map_offset_t curr_max_below, curr_max_above;
15005 vm_map_offset_t curr_skip;
15006
15007 /*
15008 * "next_" is the same as "curr_" but for the VM region immediately
15009 * after the address we're looking for. We need to keep track of this
15010 * too because we want to return info about that region if the
15011 * address we're looking for is not mapped.
15012 */
15013 vm_map_entry_t next_entry;
15014 vm_map_offset_t next_offset;
15015 vm_map_offset_t next_address;
15016 vm_map_t next_map;
15017 unsigned int next_depth;
15018 vm_map_offset_t next_max_below, next_max_above;
15019 vm_map_offset_t next_skip;
15020
15021 boolean_t look_for_pages;
15022 vm_region_submap_short_info_64_t short_info;
15023 boolean_t do_region_footprint;
15024 int effective_page_size, effective_page_shift;
15025 boolean_t submap_needed_copy;
15026
15027 if (map == VM_MAP_NULL) {
15028 /* no address space to work on */
15029 return KERN_INVALID_ARGUMENT;
15030 }
15031
15032 user_address = vm_sanitize_addr(map, *address_u);
15033
15034 effective_page_shift = vm_self_region_page_shift(map);
15035 effective_page_size = (1 << effective_page_shift);
15036
15037 if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
15038 /*
15039 * "info" structure is not big enough and
15040 * would overflow
15041 */
15042 return KERN_INVALID_ARGUMENT;
15043 }
15044
15045 do_region_footprint = task_self_region_footprint();
15046 original_count = *count;
15047
15048 if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
15049 *count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
15050 look_for_pages = FALSE;
15051 short_info = (vm_region_submap_short_info_64_t) submap_info;
15052 submap_info = NULL;
15053 } else {
15054 look_for_pages = TRUE;
15055 *count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
15056 short_info = NULL;
15057
15058 if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15059 *count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
15060 }
15061 if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15062 *count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
15063 }
15064 }
15065
15066 user_max_depth = *nesting_depth;
15067 submap_needed_copy = FALSE;
15068
15069 if (not_in_kdp) {
15070 vm_map_lock_read(map);
15071 }
15072
15073 recurse_again:
15074 curr_entry = NULL;
15075 curr_map = map;
15076 curr_address = user_address;
15077 curr_offset = 0;
15078 curr_skip = 0;
15079 curr_depth = 0;
15080 curr_max_above = ((vm_map_offset_t) -1) - curr_address;
15081 curr_max_below = curr_address;
15082
15083 next_entry = NULL;
15084 next_map = NULL;
15085 next_address = 0;
15086 next_offset = 0;
15087 next_skip = 0;
15088 next_depth = 0;
15089 next_max_above = (vm_map_offset_t) -1;
15090 next_max_below = (vm_map_offset_t) -1;
15091
15092 for (;;) {
15093 if (vm_map_lookup_entry(curr_map,
15094 curr_address,
15095 &tmp_entry)) {
15096 /* tmp_entry contains the address we're looking for */
15097 curr_entry = tmp_entry;
15098 } else {
15099 vm_map_offset_t skip;
15100 /*
15101 * The address is not mapped. "tmp_entry" is the
15102 * map entry preceding the address. We want the next
15103 * one, if it exists.
15104 */
15105 curr_entry = tmp_entry->vme_next;
15106
15107 if (curr_entry == vm_map_to_entry(curr_map) ||
15108 (curr_entry->vme_start >=
15109 curr_address + curr_max_above)) {
15110 /* no next entry at this level: stop looking */
15111 if (not_in_kdp) {
15112 vm_map_unlock_read(curr_map);
15113 }
15114 curr_entry = NULL;
15115 curr_map = NULL;
15116 curr_skip = 0;
15117 curr_offset = 0;
15118 curr_depth = 0;
15119 curr_max_above = 0;
15120 curr_max_below = 0;
15121 break;
15122 }
15123
15124 /* adjust current address and offset */
15125 skip = curr_entry->vme_start - curr_address;
15126 curr_address = curr_entry->vme_start;
15127 curr_skip += skip;
15128 curr_offset += skip;
15129 curr_max_above -= skip;
15130 curr_max_below = 0;
15131 }
15132
15133 /*
15134 * Is the next entry at this level closer to the address (or
15135 * deeper in the submap chain) than the one we had
15136 * so far ?
15137 */
15138 tmp_entry = curr_entry->vme_next;
15139 if (tmp_entry == vm_map_to_entry(curr_map)) {
15140 /* no next entry at this level */
15141 } else if (tmp_entry->vme_start >=
15142 curr_address + curr_max_above) {
15143 /*
15144 * tmp_entry is beyond the scope of what we mapped of
15145 * this submap in the upper level: ignore it.
15146 */
15147 } else if ((next_entry == NULL) ||
15148 (tmp_entry->vme_start + curr_offset <=
15149 next_entry->vme_start + next_offset)) {
15150 /*
15151 * We didn't have a "next_entry" or this one is
15152 * closer to the address we're looking for:
15153 * use this "tmp_entry" as the new "next_entry".
15154 */
15155 if (next_entry != NULL) {
15156 /* unlock the last "next_map" */
15157 if (next_map != curr_map && not_in_kdp) {
15158 vm_map_unlock_read(next_map);
15159 }
15160 }
15161 next_entry = tmp_entry;
15162 next_map = curr_map;
15163 next_depth = curr_depth;
15164 next_address = next_entry->vme_start;
15165 next_skip = curr_skip;
15166 next_skip += (next_address - curr_address);
15167 next_offset = curr_offset;
15168 next_offset += (next_address - curr_address);
15169 next_max_above = MIN(next_max_above, curr_max_above);
15170 next_max_above = MIN(next_max_above,
15171 next_entry->vme_end - next_address);
15172 next_max_below = MIN(next_max_below, curr_max_below);
15173 next_max_below = MIN(next_max_below,
15174 next_address - next_entry->vme_start);
15175 }
15176
15177 /*
15178 * "curr_max_{above,below}" allow us to keep track of the
15179 * portion of the submap that is actually mapped at this level:
15180 * the rest of that submap is irrelevant to us, since it's not
15181 * mapped here.
15182 * The relevant portion of the map starts at
15183 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
15184 */
15185 curr_max_above = MIN(curr_max_above,
15186 curr_entry->vme_end - curr_address);
15187 curr_max_below = MIN(curr_max_below,
15188 curr_address - curr_entry->vme_start);
15189
15190 if (!curr_entry->is_sub_map ||
15191 curr_depth >= user_max_depth) {
15192 /*
15193 * We hit a leaf map or we reached the maximum depth
15194 * we could, so stop looking. Keep the current map
15195 * locked.
15196 */
15197 break;
15198 }
15199
15200 /*
15201 * Get down to the next submap level.
15202 */
15203
15204 if (curr_entry->needs_copy) {
15205 /* everything below this is effectively copy-on-write */
15206 submap_needed_copy = TRUE;
15207 }
15208
15209 /*
15210 * Lock the next level and unlock the current level,
15211 * unless we need to keep it locked to access the "next_entry"
15212 * later.
15213 */
15214 curr_entry_submap = VME_SUBMAP(curr_entry);
15215 curr_entry_start = curr_entry->vme_start;
15216 curr_entry_offset = VME_OFFSET(curr_entry);
15217 curr_entry = VM_MAP_ENTRY_NULL; /* no longer valid after unlocking the map */
15218 if (not_in_kdp) {
15219 vm_map_lock_read(curr_entry_submap);
15220 }
15221 if (curr_map == next_map) {
15222 /* keep "next_map" locked in case we need it */
15223 } else {
15224 /* release this map */
15225 if (not_in_kdp) {
15226 vm_map_unlock_read(curr_map);
15227 }
15228 }
15229
15230 /*
15231 * Adjust the offset. "curr_entry" mapped the submap
15232 * at relative address "curr_entry_start" in the
15233 * curr_map but skips the first "curr_entry_offset"
15234 * bytes of the submap.
15235 * "curr_offset" always represents the offset of a virtual
15236 * address in the curr_map relative to the absolute address
15237 * space (i.e. the top-level VM map).
15238 */
15239 curr_offset += curr_entry_offset - curr_entry_start;
15240 curr_address = user_address + curr_offset;
15241 /* switch to the submap */
15242 curr_map = curr_entry_submap;
15243 curr_depth++;
15244 }
15245
15246 // LP64todo: all the current tools are 32bit, obviously never worked for 64b
15247 // so probably should be a real 32b ID vs. ptr.
15248 // Current users just check for equality
15249
15250 if (curr_entry == NULL) {
15251 /* no VM region contains the address... */
15252
15253 if (do_region_footprint && /* we want footprint numbers */
15254 next_entry == NULL && /* & there are no more regions */
15255 /* & we haven't already provided our fake region: */
15256 user_address <= vm_map_last_entry(map)->vme_end) {
15257 ledger_amount_t ledger_resident, ledger_compressed;
15258
15259 /*
15260 * Add a fake memory region to account for
15261 * purgeable and/or ledger-tagged memory that
15262 * counts towards this task's memory footprint,
15263 * i.e. the resident/compressed pages of non-volatile
15264 * objects owned by that task.
15265 */
15266 task_ledgers_footprint(map->pmap->ledger,
15267 &ledger_resident,
15268 &ledger_compressed);
15269 if (ledger_resident + ledger_compressed == 0) {
15270 /* no purgeable memory usage to report */
15271 return KERN_INVALID_ADDRESS;
15272 }
15273 /* fake region to show nonvolatile footprint */
15274 if (look_for_pages) {
15275 submap_info->protection = VM_PROT_DEFAULT;
15276 submap_info->max_protection = VM_PROT_DEFAULT;
15277 submap_info->inheritance = VM_INHERIT_DEFAULT;
15278 submap_info->offset = 0;
15279 submap_info->user_tag = -1;
15280 submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
15281 submap_info->pages_shared_now_private = 0;
15282 submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
15283 submap_info->pages_dirtied = submap_info->pages_resident;
15284 submap_info->ref_count = 1;
15285 submap_info->shadow_depth = 0;
15286 submap_info->external_pager = 0;
15287 submap_info->share_mode = SM_PRIVATE;
15288 if (submap_needed_copy) {
15289 submap_info->share_mode = SM_COW;
15290 }
15291 submap_info->is_submap = 0;
15292 submap_info->behavior = VM_BEHAVIOR_DEFAULT;
15293 submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15294 submap_info->user_wired_count = 0;
15295 submap_info->pages_reusable = 0;
15296 } else {
15297 short_info->user_tag = -1;
15298 short_info->offset = 0;
15299 short_info->protection = VM_PROT_DEFAULT;
15300 short_info->inheritance = VM_INHERIT_DEFAULT;
15301 short_info->max_protection = VM_PROT_DEFAULT;
15302 short_info->behavior = VM_BEHAVIOR_DEFAULT;
15303 short_info->user_wired_count = 0;
15304 short_info->is_submap = 0;
15305 short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15306 short_info->external_pager = 0;
15307 short_info->shadow_depth = 0;
15308 short_info->share_mode = SM_PRIVATE;
15309 if (submap_needed_copy) {
15310 short_info->share_mode = SM_COW;
15311 }
15312 short_info->ref_count = 1;
15313 }
15314 *nesting_depth = 0;
15315 *address_u = vm_sanitize_wrap_addr(vm_map_last_entry(map)->vme_end);
15316 *size_u = vm_sanitize_wrap_size(ledger_resident + ledger_compressed);
15317 return KERN_SUCCESS;
15318 }
15319
15320 if (next_entry == NULL) {
15321 /* ... and no VM region follows it either */
15322 return KERN_INVALID_ADDRESS;
15323 }
15324 /* ... gather info about the next VM region */
15325 curr_entry = next_entry;
15326 curr_map = next_map; /* still locked ... */
15327 curr_address = next_address;
15328 curr_skip = next_skip;
15329 curr_offset = next_offset;
15330 curr_depth = next_depth;
15331 curr_max_above = next_max_above;
15332 curr_max_below = next_max_below;
15333 } else {
15334 /* we won't need "next_entry" after all */
15335 if (next_entry != NULL) {
15336 /* release "next_map" */
15337 if (next_map != curr_map && not_in_kdp) {
15338 vm_map_unlock_read(next_map);
15339 }
15340 }
15341 }
15342 next_entry = NULL;
15343 next_map = NULL;
15344 next_offset = 0;
15345 next_skip = 0;
15346 next_depth = 0;
15347 next_max_below = -1;
15348 next_max_above = -1;
15349
15350 if (curr_entry->is_sub_map &&
15351 curr_depth < user_max_depth) {
15352 /*
15353 * We're not as deep as we could be: we must have
15354 * gone back up after not finding anything mapped
15355 * below the original top-level map entry's range.
15356 * Let's move "curr_address" forward and recurse again.
15357 */
15358 user_address = curr_address;
15359 goto recurse_again;
15360 }
15361
15362 *nesting_depth = curr_depth;
15363 *address_u = vm_sanitize_wrap_addr(
15364 user_address + curr_skip - curr_max_below);
15365 *size_u = vm_sanitize_wrap_size(curr_max_above + curr_max_below);
15366
15367 if (look_for_pages) {
15368 submap_info->user_tag = VME_ALIAS(curr_entry);
15369 submap_info->offset = VME_OFFSET(curr_entry);
15370 submap_info->protection = curr_entry->protection;
15371 submap_info->inheritance = curr_entry->inheritance;
15372 submap_info->max_protection = curr_entry->max_protection;
15373 submap_info->behavior = curr_entry->behavior;
15374 submap_info->user_wired_count = curr_entry->user_wired_count;
15375 submap_info->is_submap = curr_entry->is_sub_map;
15376 if (curr_entry->is_sub_map) {
15377 submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15378 } else {
15379 submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15380 }
15381 } else {
15382 short_info->user_tag = VME_ALIAS(curr_entry);
15383 short_info->offset = VME_OFFSET(curr_entry);
15384 short_info->protection = curr_entry->protection;
15385 short_info->inheritance = curr_entry->inheritance;
15386 short_info->max_protection = curr_entry->max_protection;
15387 short_info->behavior = curr_entry->behavior;
15388 short_info->user_wired_count = curr_entry->user_wired_count;
15389 short_info->is_submap = curr_entry->is_sub_map;
15390 if (curr_entry->is_sub_map) {
15391 short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15392 } else {
15393 short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15394 }
15395 }
15396
15397 extended.pages_resident = 0;
15398 extended.pages_swapped_out = 0;
15399 extended.pages_shared_now_private = 0;
15400 extended.pages_dirtied = 0;
15401 extended.pages_reusable = 0;
15402 extended.external_pager = 0;
15403 extended.shadow_depth = 0;
15404 extended.share_mode = SM_EMPTY;
15405 extended.ref_count = 0;
15406
15407 if (not_in_kdp) {
15408 if (!curr_entry->is_sub_map) {
15409 vm_map_offset_t range_start, range_end;
15410 range_start = MAX((curr_address - curr_max_below),
15411 curr_entry->vme_start);
15412 range_end = MIN((curr_address + curr_max_above),
15413 curr_entry->vme_end);
15414 vm_map_region_walk(curr_map,
15415 range_start,
15416 curr_entry,
15417 (VME_OFFSET(curr_entry) +
15418 (range_start -
15419 curr_entry->vme_start)),
15420 range_end - range_start,
15421 &extended,
15422 look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
15423 if (submap_needed_copy) {
15424 extended.share_mode = SM_COW;
15425 }
15426 } else {
15427 if (curr_entry->use_pmap) {
15428 extended.share_mode = SM_TRUESHARED;
15429 } else {
15430 extended.share_mode = SM_PRIVATE;
15431 }
15432 extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
15433 }
15434 }
15435
15436 if (look_for_pages) {
15437 submap_info->pages_resident = extended.pages_resident;
15438 submap_info->pages_swapped_out = extended.pages_swapped_out;
15439 submap_info->pages_shared_now_private =
15440 extended.pages_shared_now_private;
15441 submap_info->pages_dirtied = extended.pages_dirtied;
15442 submap_info->external_pager = extended.external_pager;
15443 submap_info->shadow_depth = extended.shadow_depth;
15444 submap_info->share_mode = extended.share_mode;
15445 submap_info->ref_count = extended.ref_count;
15446
15447 if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15448 submap_info->pages_reusable = extended.pages_reusable;
15449 }
15450 if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15451 if (curr_entry->is_sub_map) {
15452 submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_SUBMAP(curr_entry));
15453 } else if (VME_OBJECT(curr_entry)) {
15454 submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_OBJECT(curr_entry));
15455 } else {
15456 submap_info->object_id_full = 0ull;
15457 }
15458 }
15459 } else {
15460 short_info->external_pager = extended.external_pager;
15461 short_info->shadow_depth = extended.shadow_depth;
15462 short_info->share_mode = extended.share_mode;
15463 short_info->ref_count = extended.ref_count;
15464 }
15465
15466 if (not_in_kdp) {
15467 vm_map_unlock_read(curr_map);
15468 }
15469
15470 return KERN_SUCCESS;
15471 }
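/*
 * Hedged user-space sketch (not part of this file): the MIG entry point
 * mach_vm_region_recurse() is the usual way to reach the routine above.
 * It walks the calling task's address space and prints each region along
 * with the submap nesting depth at which it was found.
 *
 *	#include <mach/mach.h>
 *	#include <mach/mach_vm.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		mach_vm_address_t addr = 0;
 *		kern_return_t kr;
 *
 *		for (;;) {
 *			mach_vm_size_t size = 0;
 *			natural_t depth = 64;   // maximum submap depth to descend
 *			vm_region_submap_info_data_64_t info;
 *			mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64;
 *
 *			kr = mach_vm_region_recurse(mach_task_self(), &addr, &size,
 *			    &depth, (vm_region_recurse_info_t)&info, &count);
 *			if (kr != KERN_SUCCESS) {
 *				break;          // KERN_INVALID_ADDRESS: no more regions
 *			}
 *			printf("0x%llx-0x%llx depth %u prot 0x%x tag %u\n",
 *			    (unsigned long long)addr,
 *			    (unsigned long long)(addr + size),
 *			    depth, info.protection, info.user_tag);
 *			addr += size;           // continue after this region
 *		}
 *		return 0;
 *	}
 */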
15472
15473 /*
15474 * vm_region:
15475 *
15476 * User call to obtain information about a region in
15477 * a task's address map. Currently, only one flavor is
15478 * supported.
15479 *
15480 * XXX The reserved and behavior fields cannot be filled
15481 * in until the vm merge from the IK is completed, and
15482 * vm_reserve is implemented.
15483 */
15484
15485 kern_return_t
15486 vm_map_region(
15487 vm_map_t map,
15488 vm_map_offset_ut *address_u, /* IN/OUT */
15489 vm_map_size_ut *size_u, /* OUT */
15490 vm_region_flavor_t flavor, /* IN */
15491 vm_region_info_t info, /* OUT */
15492 mach_msg_type_number_t *count, /* IN/OUT */
15493 mach_port_t *object_name) /* OUT */
15494 {
15495 vm_map_entry_t tmp_entry;
15496 vm_map_entry_t entry;
15497 vm_map_offset_t start;
15498
15499 if (map == VM_MAP_NULL) {
15500 return KERN_INVALID_ARGUMENT;
15501 }
15502
15503 start = vm_sanitize_addr(map, *address_u);
15504
15505 switch (flavor) {
15506 case VM_REGION_BASIC_INFO:
15507 /* legacy for old 32-bit objects info */
15508 {
15509 vm_region_basic_info_t basic;
15510
15511 if (*count < VM_REGION_BASIC_INFO_COUNT) {
15512 return KERN_INVALID_ARGUMENT;
15513 }
15514
15515 basic = (vm_region_basic_info_t) info;
15516 *count = VM_REGION_BASIC_INFO_COUNT;
15517
15518 vm_map_lock_read(map);
15519
15520 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15521 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15522 vm_map_unlock_read(map);
15523 return KERN_INVALID_ADDRESS;
15524 }
15525 } else {
15526 entry = tmp_entry;
15527 }
15528
15529 start = entry->vme_start;
15530
15531 basic->offset = (uint32_t)VME_OFFSET(entry);
15532 basic->protection = entry->protection;
15533 basic->inheritance = entry->inheritance;
15534 basic->max_protection = entry->max_protection;
15535 basic->behavior = entry->behavior;
15536 basic->user_wired_count = entry->user_wired_count;
15537 basic->reserved = entry->is_sub_map;
15538
15539 *address_u = vm_sanitize_wrap_addr(start);
15540 *size_u = vm_sanitize_wrap_size(entry->vme_end - start);
15541
15542 if (object_name) {
15543 *object_name = IP_NULL;
15544 }
15545 if (entry->is_sub_map) {
15546 basic->shared = FALSE;
15547 } else {
15548 basic->shared = entry->is_shared;
15549 }
15550
15551 vm_map_unlock_read(map);
15552 return KERN_SUCCESS;
15553 }
15554
15555 case VM_REGION_BASIC_INFO_64:
15556 {
15557 vm_region_basic_info_64_t basic;
15558
15559 if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
15560 return KERN_INVALID_ARGUMENT;
15561 }
15562
15563 basic = (vm_region_basic_info_64_t) info;
15564 *count = VM_REGION_BASIC_INFO_COUNT_64;
15565
15566 vm_map_lock_read(map);
15567
15568 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15569 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15570 vm_map_unlock_read(map);
15571 return KERN_INVALID_ADDRESS;
15572 }
15573 } else {
15574 entry = tmp_entry;
15575 }
15576
15577 start = entry->vme_start;
15578
15579 basic->offset = VME_OFFSET(entry);
15580 basic->protection = entry->protection;
15581 basic->inheritance = entry->inheritance;
15582 basic->max_protection = entry->max_protection;
15583 basic->behavior = entry->behavior;
15584 basic->user_wired_count = entry->user_wired_count;
15585 basic->reserved = entry->is_sub_map;
15586
15587 *address_u = vm_sanitize_wrap_addr(start);
15588 *size_u = vm_sanitize_wrap_size(entry->vme_end - start);
15589
15590 if (object_name) {
15591 *object_name = IP_NULL;
15592 }
15593 if (entry->is_sub_map) {
15594 basic->shared = FALSE;
15595 } else {
15596 basic->shared = entry->is_shared;
15597 }
15598
15599 vm_map_unlock_read(map);
15600 return KERN_SUCCESS;
15601 }
15602 case VM_REGION_EXTENDED_INFO:
15603 if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
15604 return KERN_INVALID_ARGUMENT;
15605 }
15606 OS_FALLTHROUGH;
15607 case VM_REGION_EXTENDED_INFO__legacy:
15608 {
15609 vm_region_extended_info_t extended;
15610 mach_msg_type_number_t original_count;
15611 int effective_page_size, effective_page_shift;
15612
15613 if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
15614 return KERN_INVALID_ARGUMENT;
15615 }
15616
15617 extended = (vm_region_extended_info_t) info;
15618
15619 effective_page_shift = vm_self_region_page_shift(map);
15620 effective_page_size = (1 << effective_page_shift);
15621
15622 vm_map_lock_read(map);
15623
15624 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15625 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15626 vm_map_unlock_read(map);
15627 return KERN_INVALID_ADDRESS;
15628 }
15629 } else {
15630 entry = tmp_entry;
15631 }
15632 start = entry->vme_start;
15633
15634 extended->protection = entry->protection;
15635 extended->user_tag = VME_ALIAS(entry);
15636 extended->pages_resident = 0;
15637 extended->pages_swapped_out = 0;
15638 extended->pages_shared_now_private = 0;
15639 extended->pages_dirtied = 0;
15640 extended->external_pager = 0;
15641 extended->shadow_depth = 0;
15642
15643 original_count = *count;
15644 if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
15645 *count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
15646 } else {
15647 extended->pages_reusable = 0;
15648 *count = VM_REGION_EXTENDED_INFO_COUNT;
15649 }
15650
15651 vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
15652
15653 if (object_name) {
15654 *object_name = IP_NULL;
15655 }
15656
15657 *address_u = vm_sanitize_wrap_addr(start);
15658 *size_u = vm_sanitize_wrap_size(entry->vme_end - start);
15659
15660 vm_map_unlock_read(map);
15661 return KERN_SUCCESS;
15662 }
15663 case VM_REGION_TOP_INFO:
15664 {
15665 vm_region_top_info_t top;
15666
15667 if (*count < VM_REGION_TOP_INFO_COUNT) {
15668 return KERN_INVALID_ARGUMENT;
15669 }
15670
15671 top = (vm_region_top_info_t) info;
15672 *count = VM_REGION_TOP_INFO_COUNT;
15673
15674 vm_map_lock_read(map);
15675
15676 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15677 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15678 vm_map_unlock_read(map);
15679 return KERN_INVALID_ADDRESS;
15680 }
15681 } else {
15682 entry = tmp_entry;
15683 }
15684 start = entry->vme_start;
15685
15686 top->private_pages_resident = 0;
15687 top->shared_pages_resident = 0;
15688
15689 vm_map_region_top_walk(entry, top);
15690
15691 if (object_name) {
15692 *object_name = IP_NULL;
15693 }
15694
15695 *address_u = vm_sanitize_wrap_addr(start);
15696 *size_u = vm_sanitize_wrap_size(entry->vme_end - start);
15697
15698 vm_map_unlock_read(map);
15699 return KERN_SUCCESS;
15700 }
15701 default:
15702 return KERN_INVALID_ARGUMENT;
15703 }
15704 }
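/*
 * Hedged user-space sketch (not part of this file): querying a single
 * region with the VM_REGION_BASIC_INFO_64 flavor handled above.
 *
 *	#include <mach/mach.h>
 *	#include <mach/mach_vm.h>
 *	#include <stdio.h>
 *	#include <stdint.h>
 *
 *	int
 *	main(void)
 *	{
 *		mach_vm_address_t addr = (mach_vm_address_t)(uintptr_t)&main;
 *		mach_vm_size_t size = 0;
 *		vm_region_basic_info_data_64_t info;
 *		mach_msg_type_number_t count = VM_REGION_BASIC_INFO_COUNT_64;
 *		mach_port_t object_name = MACH_PORT_NULL;  // set to IP_NULL by the kernel
 *		kern_return_t kr;
 *
 *		kr = mach_vm_region(mach_task_self(), &addr, &size,
 *		    VM_REGION_BASIC_INFO_64, (vm_region_info_t)&info,
 *		    &count, &object_name);
 *		if (kr == KERN_SUCCESS) {
 *			// "addr" is moved back to the start of the containing region
 *			printf("0x%llx-0x%llx prot 0x%x max 0x%x shared %d\n",
 *			    (unsigned long long)addr,
 *			    (unsigned long long)(addr + size),
 *			    info.protection, info.max_protection, info.shared);
 *		}
 *		return kr == KERN_SUCCESS ? 0 : 1;
 *	}
 */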
15705
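/*
 * Resident pages attributable to "obj" for an entry spanning "entry_size"
 * pages: an all-reusable object only counts its wired pages, otherwise we
 * count resident pages minus the ones marked reusable, capped at the size
 * of the entry.
 */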
15706 #define OBJ_RESIDENT_COUNT(obj, entry_size) \
15707 MIN((entry_size), \
15708 ((obj)->all_reusable ? \
15709 (obj)->wired_page_count : \
15710 (obj)->resident_page_count - (obj)->reusable_page_count))
15711
15712 void
15713 vm_map_region_top_walk(
15714 vm_map_entry_t entry,
15715 vm_region_top_info_t top)
15716 {
15717 if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
15718 top->share_mode = SM_EMPTY;
15719 top->ref_count = 0;
15720 top->obj_id = 0;
15721 return;
15722 }
15723
15724 {
15725 struct vm_object *obj, *tmp_obj;
15726 int ref_count;
15727 uint32_t entry_size;
15728
15729 entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);
15730
15731 obj = VME_OBJECT(entry);
15732
15733 vm_object_lock(obj);
15734
15735 if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15736 obj->paging_in_progress) {
15737 ref_count--;
15738 }
15739
15740 assert(obj->reusable_page_count <= obj->resident_page_count);
15741 if (obj->shadow) {
15742 if (ref_count == 1) {
15743 top->private_pages_resident =
15744 OBJ_RESIDENT_COUNT(obj, entry_size);
15745 } else {
15746 top->shared_pages_resident =
15747 OBJ_RESIDENT_COUNT(obj, entry_size);
15748 }
15749 top->ref_count = ref_count;
15750 top->share_mode = SM_COW;
15751
15752 while ((tmp_obj = obj->shadow)) {
15753 vm_object_lock(tmp_obj);
15754 vm_object_unlock(obj);
15755 obj = tmp_obj;
15756
15757 if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15758 obj->paging_in_progress) {
15759 ref_count--;
15760 }
15761
15762 assert(obj->reusable_page_count <= obj->resident_page_count);
15763 top->shared_pages_resident +=
15764 OBJ_RESIDENT_COUNT(obj, entry_size);
15765 top->ref_count += ref_count - 1;
15766 }
15767 } else {
15768 if (entry->superpage_size) {
15769 top->share_mode = SM_LARGE_PAGE;
15770 top->shared_pages_resident = 0;
15771 top->private_pages_resident = entry_size;
15772 } else if (entry->needs_copy) {
15773 top->share_mode = SM_COW;
15774 top->shared_pages_resident =
15775 OBJ_RESIDENT_COUNT(obj, entry_size);
15776 } else {
15777 if (ref_count == 1 ||
15778 (ref_count == 2 && obj->named)) {
15779 top->share_mode = SM_PRIVATE;
15780 top->private_pages_resident =
15781 OBJ_RESIDENT_COUNT(obj,
15782 entry_size);
15783 } else {
15784 top->share_mode = SM_SHARED;
15785 top->shared_pages_resident =
15786 OBJ_RESIDENT_COUNT(obj,
15787 entry_size);
15788 }
15789 }
15790 top->ref_count = ref_count;
15791 }
15792
15793 vm_object_unlock(obj);
15794
15795 /* XXX K64: obj_id will be truncated */
15796 top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRHASH(obj);
15797 }
15798 }
15799
15800 void
15801 vm_map_region_walk(
15802 vm_map_t map,
15803 vm_map_offset_t va,
15804 vm_map_entry_t entry,
15805 vm_object_offset_t offset,
15806 vm_object_size_t range,
15807 vm_region_extended_info_t extended,
15808 boolean_t look_for_pages,
15809 mach_msg_type_number_t count)
15810 {
15811 struct vm_object *obj, *tmp_obj;
15812 vm_map_offset_t last_offset;
15813 int i;
15814 int ref_count;
15815 struct vm_object *shadow_object;
15816 unsigned short shadow_depth;
15817 boolean_t do_region_footprint;
15818 int effective_page_size, effective_page_shift;
15819 vm_map_offset_t effective_page_mask;
15820
15821 do_region_footprint = task_self_region_footprint();
15822
15823 if ((entry->is_sub_map) ||
15824 (VME_OBJECT(entry) == 0) ||
15825 (VME_OBJECT(entry)->phys_contiguous &&
15826 !entry->superpage_size)) {
15827 extended->share_mode = SM_EMPTY;
15828 extended->ref_count = 0;
15829 return;
15830 }
15831
15832 if (entry->superpage_size) {
15833 extended->shadow_depth = 0;
15834 extended->share_mode = SM_LARGE_PAGE;
15835 extended->ref_count = 1;
15836 extended->external_pager = 0;
15837
15838 /* TODO4K: Superpage in 4k mode? */
15839 extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
15840 extended->shadow_depth = 0;
15841 return;
15842 }
15843
15844 effective_page_shift = vm_self_region_page_shift(map);
15845 effective_page_size = (1 << effective_page_shift);
15846 effective_page_mask = effective_page_size - 1;
15847
15848 offset = vm_map_trunc_page(offset, effective_page_mask);
15849
15850 obj = VME_OBJECT(entry);
15851
15852 vm_object_lock(obj);
15853
15854 if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15855 obj->paging_in_progress) {
15856 ref_count--;
15857 }
15858
15859 if (look_for_pages) {
15860 for (last_offset = offset + range;
15861 offset < last_offset;
15862 offset += effective_page_size, va += effective_page_size) {
15863 if (do_region_footprint) {
15864 int disp;
15865
15866 disp = 0;
15867 if (map->has_corpse_footprint) {
15868 /*
15869 * Query the page info data we saved
15870 * while forking the corpse.
15871 */
15872 vm_map_corpse_footprint_query_page_info(
15873 map,
15874 va,
15875 &disp);
15876 } else {
15877 /*
15878 * Query the pmap.
15879 */
15880 vm_map_footprint_query_page_info(
15881 map,
15882 entry,
15883 va,
15884 &disp);
15885 }
15886 if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
15887 extended->pages_resident++;
15888 }
15889 if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
15890 extended->pages_reusable++;
15891 }
15892 if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
15893 extended->pages_dirtied++;
15894 }
15895 if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
15896 extended->pages_swapped_out++;
15897 }
15898 continue;
15899 }
15900
15901 vm_map_region_look_for_page(map, va, obj,
15902 vm_object_trunc_page(offset), ref_count,
15903 0, extended, count);
15904 }
15905
15906 if (do_region_footprint) {
15907 goto collect_object_info;
15908 }
15909 } else {
15910 collect_object_info:
15911 shadow_object = obj->shadow;
15912 shadow_depth = 0;
15913
15914 if (!(obj->internal)) {
15915 extended->external_pager = 1;
15916 }
15917
15918 if (shadow_object != VM_OBJECT_NULL) {
15919 vm_object_lock(shadow_object);
15920 for (;
15921 shadow_object != VM_OBJECT_NULL;
15922 shadow_depth++) {
15923 vm_object_t next_shadow;
15924
15925 if (!(shadow_object->internal)) {
15926 extended->external_pager = 1;
15927 }
15928
15929 next_shadow = shadow_object->shadow;
15930 if (next_shadow) {
15931 vm_object_lock(next_shadow);
15932 }
15933 vm_object_unlock(shadow_object);
15934 shadow_object = next_shadow;
15935 }
15936 }
15937 extended->shadow_depth = shadow_depth;
15938 }
15939
15940 if (extended->shadow_depth || entry->needs_copy) {
15941 extended->share_mode = SM_COW;
15942 } else {
15943 if (ref_count == 1) {
15944 extended->share_mode = SM_PRIVATE;
15945 } else {
15946 if (obj->true_share) {
15947 extended->share_mode = SM_TRUESHARED;
15948 } else {
15949 extended->share_mode = SM_SHARED;
15950 }
15951 }
15952 }
15953 extended->ref_count = ref_count - extended->shadow_depth;
15954
15955 for (i = 0; i < extended->shadow_depth; i++) {
15956 if ((tmp_obj = obj->shadow) == 0) {
15957 break;
15958 }
15959 vm_object_lock(tmp_obj);
15960 vm_object_unlock(obj);
15961
15962 if ((ref_count = os_ref_get_count_raw(&tmp_obj->ref_count)) > 1 &&
15963 tmp_obj->paging_in_progress) {
15964 ref_count--;
15965 }
15966
15967 extended->ref_count += ref_count;
15968 obj = tmp_obj;
15969 }
15970 vm_object_unlock(obj);
15971
15972 if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
15973 extended->share_mode = SM_PRIVATE;
15974 } else if (extended->share_mode == SM_SHARED && !(task_self_region_info_flags() & VM_REGION_INFO_FLAGS_NO_ALIASED)) {
15975 vm_map_entry_t cur;
15976 vm_map_entry_t last;
15977 int my_refs;
15978
15979 obj = VME_OBJECT(entry);
15980 last = vm_map_to_entry(map);
15981 my_refs = 0;
15982
15983 if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15984 obj->paging_in_progress) {
15985 ref_count--;
15986 }
15987 for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
15988 if (vm_map_region_has_obj_ref(cur, obj)) {
15989 my_refs++;
15990 }
15991 }
15992
15993 if (my_refs == ref_count) {
15994 extended->share_mode = SM_PRIVATE_ALIASED;
15995 } else if (my_refs > 1) {
15996 extended->share_mode = SM_SHARED_ALIASED;
15997 }
15998 }
15999 }
16000
16001
16002 /* object is locked on entry and locked on return */
16003
16004
16005 static void
16006 vm_map_region_look_for_page(
16007 __unused vm_map_t map,
16008 __unused vm_map_offset_t va,
16009 vm_object_t object,
16010 vm_object_offset_t offset,
16011 int max_refcnt,
16012 unsigned short depth,
16013 vm_region_extended_info_t extended,
16014 mach_msg_type_number_t count)
16015 {
16016 vm_page_t p;
16017 vm_object_t shadow;
16018 int ref_count;
16019 vm_object_t caller_object;
16020
16021 shadow = object->shadow;
16022 caller_object = object;
16023
16024
16025 while (TRUE) {
16026 if (!(object->internal)) {
16027 extended->external_pager = 1;
16028 }
16029
16030 if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
16031 if (shadow && (max_refcnt == 1)) {
16032 extended->pages_shared_now_private++;
16033 }
16034
16035 if (!vm_page_is_fictitious(p) &&
16036 (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
16037 extended->pages_dirtied++;
16038 } else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
16039 if (p->vmp_reusable || object->all_reusable) {
16040 extended->pages_reusable++;
16041 }
16042 }
16043
16044 extended->pages_resident++;
16045
16046 if (object != caller_object) {
16047 vm_object_unlock(object);
16048 }
16049
16050 return;
16051 }
16052 if (object->internal &&
16053 object->alive &&
16054 !object->terminating &&
16055 object->pager_ready) {
16056 if (vm_object_compressor_pager_state_get(object, offset)
16057 == VM_EXTERNAL_STATE_EXISTS) {
16058 /* the pager has that page */
16059 extended->pages_swapped_out++;
16060 if (object != caller_object) {
16061 vm_object_unlock(object);
16062 }
16063 return;
16064 }
16065 }
16066
16067 if (shadow) {
16068 vm_object_lock(shadow);
16069 if ((ref_count = os_ref_get_count_raw(&shadow->ref_count)) > 1 &&
16070 shadow->paging_in_progress) {
16071 ref_count--;
16072 }
16073
16074 if (++depth > extended->shadow_depth) {
16075 extended->shadow_depth = depth;
16076 }
16077
16078 if (ref_count > max_refcnt) {
16079 max_refcnt = ref_count;
16080 }
16081
16082 if (object != caller_object) {
16083 vm_object_unlock(object);
16084 }
16085
16086 offset = offset + object->vo_shadow_offset;
16087 object = shadow;
16088 shadow = object->shadow;
16089 continue;
16090 }
16091 if (object != caller_object) {
16092 vm_object_unlock(object);
16093 }
16094 break;
16095 }
16096 }
16097
16098 static inline boolean_t
16099 vm_map_region_has_obj_ref(
16100 vm_map_entry_t entry,
16101 vm_object_t object)
16102 {
16103 vm_object_t cur_obj;
16104 vm_object_t shadow_obj;
16105
16106 if (entry->is_sub_map) {
16107 return FALSE;
16108 }
16109
16110 cur_obj = VME_OBJECT(entry);
16111 if (cur_obj == VM_OBJECT_NULL) {
16112 return FALSE;
16113 } else if (cur_obj == object) {
16114 return TRUE;
16115 }
16116
16117 /*
16118 * Avoid locks for first shadow check, otherwise diagnostic tools will
16119 * spend most of their time obtaining locks in this function when analyzing
16120 * processes with many VM entries which may commonly have no shadow chain.
16121 *
16122 * This is acceptable because:
16123 * - Shadow's fields are not accessed outside of its lock
16124 * - Objects are unlikely to be modified due to:
16125 * - Many diagnostic tools suspend the task
16126 * - VM map is locked
16127 * - The rare incorrect return from this function turns a guess into a
16128 * slightly worse guess
16129 * - Entire shadow chain is not locked as a whole, so can still change
16130 * while traversing, resulting in incorrect guess even with locking
16131 */
16132 shadow_obj = cur_obj->shadow;
16133 if (shadow_obj == VM_OBJECT_NULL) {
16134 return FALSE;
16135 } else if (shadow_obj == object) {
16136 return TRUE;
16137 }
16138
16139 vm_object_lock(cur_obj);
16140
16141 while ((shadow_obj = cur_obj->shadow)) {
16142 /* check if object was found before grabbing a lock */
16143 if (shadow_obj == object) {
16144 vm_object_unlock(cur_obj);
16145 return TRUE;
16146 }
16147
16148 vm_object_lock(shadow_obj);
16149 vm_object_unlock(cur_obj);
16150 cur_obj = shadow_obj;
16151 }
16152
16153 /* exhausted the shadow chain */
16154 vm_object_unlock(cur_obj);
16155 return FALSE;
16156 }
16157
16158
16159 /*
16160 * Routine: vm_map_simplify
16161 *
16162 * Description:
16163 * Attempt to simplify the map representation in
16164 * the vicinity of the given starting address.
16165 * Note:
16166 * This routine is intended primarily to keep the
16167 * kernel maps more compact -- they generally don't
16168 * benefit from the "expand a map entry" technology
16169 * at allocation time because the adjacent entry
16170 * is often wired down.
16171 */
16172 void
16173 vm_map_simplify_entry(
16174 vm_map_t map,
16175 vm_map_entry_t this_entry)
16176 {
16177 vm_map_entry_t prev_entry;
16178
16179 prev_entry = this_entry->vme_prev;
16180
16181 if ((this_entry != vm_map_to_entry(map)) &&
16182 (prev_entry != vm_map_to_entry(map)) &&
16183
16184 (prev_entry->vme_end == this_entry->vme_start) &&
16185
16186 (prev_entry->is_sub_map == this_entry->is_sub_map) &&
16187 (prev_entry->vme_object_value == this_entry->vme_object_value) &&
16188 (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
16189 ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
16190 prev_entry->vme_start))
16191 == VME_OFFSET(this_entry)) &&
16192
16193 (prev_entry->behavior == this_entry->behavior) &&
16194 (prev_entry->needs_copy == this_entry->needs_copy) &&
16195 (prev_entry->protection == this_entry->protection) &&
16196 (prev_entry->max_protection == this_entry->max_protection) &&
16197 (prev_entry->inheritance == this_entry->inheritance) &&
16198 (prev_entry->use_pmap == this_entry->use_pmap) &&
16199 (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
16200 (prev_entry->no_cache == this_entry->no_cache) &&
16201 (prev_entry->vme_permanent == this_entry->vme_permanent) &&
16202 (prev_entry->map_aligned == this_entry->map_aligned) &&
16203 (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
16204 (prev_entry->used_for_jit == this_entry->used_for_jit) &&
16205 #if __arm64e__
16206 (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
16207 #endif
16208 (prev_entry->csm_associated == this_entry->csm_associated) &&
16209 (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
16210 (prev_entry->iokit_acct == this_entry->iokit_acct) &&
16211 (prev_entry->vme_resilient_codesign ==
16212 this_entry->vme_resilient_codesign) &&
16213 (prev_entry->vme_resilient_media ==
16214 this_entry->vme_resilient_media) &&
16215 (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
16216 (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&
16217
16218 (prev_entry->wired_count == this_entry->wired_count) &&
16219 (prev_entry->user_wired_count == this_entry->user_wired_count) &&
16220
16221 ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
16222 (prev_entry->in_transition == FALSE) &&
16223 (this_entry->in_transition == FALSE) &&
16224 (prev_entry->needs_wakeup == FALSE) &&
16225 (this_entry->needs_wakeup == FALSE) &&
16226 (prev_entry->is_shared == this_entry->is_shared) &&
16227 (prev_entry->superpage_size == FALSE) &&
16228 (this_entry->superpage_size == FALSE)
16229 ) {
16230 if (prev_entry->vme_permanent) {
16231 assert(this_entry->vme_permanent);
16232 prev_entry->vme_permanent = false;
16233 }
16234 vm_map_store_entry_unlink(map, prev_entry, true);
16235 assert(prev_entry->vme_start < this_entry->vme_end);
16236 if (prev_entry->map_aligned) {
16237 assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
16238 VM_MAP_PAGE_MASK(map)));
16239 }
16240 this_entry->vme_start = prev_entry->vme_start;
16241 VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));
16242
16243 if (map->holelistenabled) {
16244 vm_map_store_update_first_free(map, this_entry, TRUE);
16245 }
16246
16247 if (prev_entry->is_sub_map) {
16248 vm_map_deallocate(VME_SUBMAP(prev_entry));
16249 } else {
16250 vm_object_deallocate(VME_OBJECT(prev_entry));
16251 }
16252 vm_map_entry_dispose(prev_entry);
16253 SAVE_HINT_MAP_WRITE(map, this_entry);
16254 }
16255 }
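/*
 * Illustrative sketch (not from this file): what a successful coalescing
 * looks like.  Both entries map the same object contiguously and agree on
 * every attribute compared above, so the pair collapses into one entry:
 *
 *	before:  [0x1000-0x2000 obj A off 0x0]  [0x2000-0x3000 obj A off 0x1000]
 *	after:   [0x1000-0x3000 obj A off 0x0]
 */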
16256
16257 void
16258 vm_map_simplify(
16259 vm_map_t map,
16260 vm_map_offset_t start)
16261 {
16262 vm_map_entry_t this_entry;
16263
16264 vm_map_lock(map);
16265 if (vm_map_lookup_entry(map, start, &this_entry)) {
16266 vm_map_simplify_entry(map, this_entry);
16267 vm_map_simplify_entry(map, this_entry->vme_next);
16268 }
16269 vm_map_unlock(map);
16270 }
16271
16272 static void
16273 vm_map_simplify_range(
16274 vm_map_t map,
16275 vm_map_offset_t start,
16276 vm_map_offset_t end)
16277 {
16278 vm_map_entry_t entry;
16279
16280 /*
16281 * The map should be locked (for "write") by the caller.
16282 */
16283
16284 if (start >= end) {
16285 /* invalid address range */
16286 return;
16287 }
16288
16289 start = vm_map_trunc_page(start,
16290 VM_MAP_PAGE_MASK(map));
16291 end = vm_map_round_page(end,
16292 VM_MAP_PAGE_MASK(map));
16293
16294 if (!vm_map_lookup_entry(map, start, &entry)) {
16295 /* "start" is not mapped and "entry" ends before "start" */
16296 if (entry == vm_map_to_entry(map)) {
16297 /* start with first entry in the map */
16298 entry = vm_map_first_entry(map);
16299 } else {
16300 /* start with next entry */
16301 entry = entry->vme_next;
16302 }
16303 }
16304
16305 while (entry != vm_map_to_entry(map) &&
16306 entry->vme_start <= end) {
16307 /* try and coalesce "entry" with its previous entry */
16308 vm_map_simplify_entry(map, entry);
16309 entry = entry->vme_next;
16310 }
16311 }
16312
16313 static __attribute__((always_inline, warn_unused_result))
16314 kern_return_t
16315 vm_map_machine_attribute_sanitize(
16316 vm_map_t map,
16317 vm_map_offset_ut start_u,
16318 vm_map_offset_ut end_u,
16319 mach_vm_offset_t *start,
16320 mach_vm_offset_t *end,
16321 vm_map_size_t *size)
16322 {
16323 return vm_sanitize_addr_end(start_u, end_u,
16324 VM_SANITIZE_CALLER_VM_MAP_MACHINE_ATTRIBUTE, map,
16325 VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
16326 size);
16327 }
16328
16329
16330 /*
16331 * Routine: vm_map_machine_attribute
16332 * Purpose:
16333 * Provide machine-specific attributes to mappings,
16334 * such as cachability etc. for machines that provide
16335 * them. NUMA architectures and machines with big/strange
16336 * caches will use this.
16337 * Note:
16338 * Responsibilities for locking and checking are handled here,
16339 * everything else in the pmap module. If any non-volatile
16340 * information must be kept, the pmap module should handle
16341 * it itself. [This assumes that attributes do not
16342 * need to be inherited, which seems ok to me]
16343 */
16344 kern_return_t
16345 vm_map_machine_attribute(
16346 vm_map_t map,
16347 vm_map_offset_ut start_u,
16348 vm_map_offset_ut end_u,
16349 vm_machine_attribute_t attribute,
16350 vm_machine_attribute_val_t *value) /* IN/OUT */
16351 {
16352 mach_vm_offset_t start, end;
16353 vm_map_size_t sync_size;
16354 kern_return_t ret;
16355 vm_map_entry_t entry;
16356
16357 ret = vm_map_machine_attribute_sanitize(map,
16358 start_u,
16359 end_u,
16360 &start,
16361 &end,
16362 &sync_size);
16363 if (__improbable(ret != KERN_SUCCESS)) {
16364 return vm_sanitize_get_kr(ret);
16365 }
16366
16367 if (start < vm_map_min(map) || end > vm_map_max(map)) {
16368 return KERN_INVALID_ADDRESS;
16369 }
16370
16371 vm_map_lock(map);
16372
16373 if (attribute != MATTR_CACHE) {
16374 /* If we don't have to find physical addresses, we */
16375 /* don't have to do an explicit traversal here. */
16376 ret = pmap_attribute(map->pmap, start, end - start,
16377 attribute, value);
16378 vm_map_unlock(map);
16379 return ret;
16380 }
16381
16382 ret = KERN_SUCCESS; /* Assume it all worked */
16383
16384 while (sync_size) {
16385 if (vm_map_lookup_entry(map, start, &entry)) {
16386 vm_map_size_t sub_size;
16387 if ((entry->vme_end - start) > sync_size) {
16388 sub_size = sync_size;
16389 sync_size = 0;
16390 } else {
16391 sub_size = entry->vme_end - start;
16392 sync_size -= sub_size;
16393 }
16394 if (entry->is_sub_map) {
16395 vm_map_offset_t sub_start;
16396 vm_map_offset_t sub_end;
16397
16398 sub_start = (start - entry->vme_start)
16399 + VME_OFFSET(entry);
16400 sub_end = sub_start + sub_size;
16401 vm_map_machine_attribute(
16402 VME_SUBMAP(entry),
16403 sub_start,
16404 sub_end,
16405 attribute, value);
16406 } else if (VME_OBJECT(entry)) {
16407 vm_page_t m;
16408 vm_object_t object;
16409 vm_object_t base_object;
16410 vm_object_t last_object;
16411 vm_object_offset_t offset;
16412 vm_object_offset_t base_offset;
16413 vm_map_size_t range;
16414 range = sub_size;
16415 offset = (start - entry->vme_start)
16416 + VME_OFFSET(entry);
16417 offset = vm_object_trunc_page(offset);
16418 base_offset = offset;
16419 object = VME_OBJECT(entry);
16420 base_object = object;
16421 last_object = NULL;
16422
16423 vm_object_lock(object);
16424
16425 while (range) {
16426 m = vm_page_lookup(
16427 object, offset);
16428
16429 if (m && !vm_page_is_fictitious(m)) {
16430 ret =
16431 pmap_attribute_cache_sync(
16432 VM_PAGE_GET_PHYS_PAGE(m),
16433 PAGE_SIZE,
16434 attribute, value);
16435 } else if (object->shadow) {
16436 offset = offset + object->vo_shadow_offset;
16437 last_object = object;
16438 object = object->shadow;
16439 vm_object_lock(last_object->shadow);
16440 vm_object_unlock(last_object);
16441 continue;
16442 }
16443 if (range < PAGE_SIZE) {
16444 range = 0;
16445 } else {
16446 range -= PAGE_SIZE;
16447 }
16448
16449 if (base_object != object) {
16450 vm_object_unlock(object);
16451 vm_object_lock(base_object);
16452 object = base_object;
16453 }
16454 /* Bump to the next page */
16455 base_offset += PAGE_SIZE;
16456 offset = base_offset;
16457 }
16458 vm_object_unlock(object);
16459 }
16460 start += sub_size;
16461 } else {
16462 vm_map_unlock(map);
16463 return KERN_FAILURE;
16464 }
16465 }
16466
16467 vm_map_unlock(map);
16468
16469 return ret;
16470 }
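/*
 * Hedged user-space sketch (not part of this file): the MIG routine
 * vm_machine_attribute() is the usual way to reach the code above, e.g.
 * to request a cache flush over a range of the caller's address space.
 *
 *	#include <mach/mach.h>
 *	#include <mach/vm_attributes.h>
 *
 *	kern_return_t
 *	flush_range(vm_address_t addr, vm_size_t size)
 *	{
 *		vm_machine_attribute_val_t value = MATTR_VAL_CACHE_FLUSH;
 *
 *		return vm_machine_attribute(mach_task_self(), addr, size,
 *		    MATTR_CACHE, &value);
 *	}
 */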
16471
16472 /*
16473 * vm_map_behavior_set:
16474 *
16475 * Sets the paging reference behavior of the specified address
16476 * range in the target map. Paging reference behavior affects
16477 * how pagein operations resulting from faults on the map will be
16478 * clustered.
16479 */
16480 kern_return_t
16481 vm_map_behavior_set(
16482 vm_map_t map,
16483 vm_map_offset_t start,
16484 vm_map_offset_t end,
16485 vm_behavior_t new_behavior)
16486 {
16487 vm_map_entry_t entry;
16488 vm_map_entry_t temp_entry;
16489
16490 if (start > end ||
16491 start < vm_map_min(map) ||
16492 end > vm_map_max(map)) {
16493 return KERN_NO_SPACE;
16494 }
16495 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16496 return KERN_INVALID_ADDRESS;
16497 }
16498
16499 switch (new_behavior) {
16500 /*
16501 * This first block of behaviors all set a persistent state on the specified
16502 * memory range. All we have to do here is to record the desired behavior
16503 * in the vm_map_entry_t's.
16504 */
16505
16506 case VM_BEHAVIOR_DEFAULT:
16507 case VM_BEHAVIOR_RANDOM:
16508 case VM_BEHAVIOR_SEQUENTIAL:
16509 case VM_BEHAVIOR_RSEQNTL:
16510 case VM_BEHAVIOR_ZERO_WIRED_PAGES:
16511 vm_map_lock(map);
16512
16513 /*
16514 * The entire address range must be valid for the map.
16515 * Note that vm_map_range_check() does a
16516 * vm_map_lookup_entry() internally and returns the
16517 * entry containing the start of the address range if
16518 * the entire range is valid.
16519 */
16520 if (vm_map_range_check(map, start, end, &temp_entry)) {
16521 entry = temp_entry;
16522 vm_map_clip_start(map, entry, start);
16523 } else {
16524 vm_map_unlock(map);
16525 return KERN_INVALID_ADDRESS;
16526 }
16527
16528 if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16529 /* zeroing requires write access */
16530 temp_entry = entry;
16531 for (;
16532 entry != vm_map_to_entry(map) && (entry->vme_start < end);
16533 entry = entry->vme_next) {
16534 if (!(entry->protection & VM_PROT_WRITE) ||
16535 #if __arm64e__
16536 entry->used_for_tpro ||
16537 #endif /* __arm64e__ */
16538 entry->used_for_jit) {
16539 vm_map_unlock(map);
16540 return KERN_PROTECTION_FAILURE;
16541 }
16542 }
16543 entry = temp_entry;
16544 }
16545
16546 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
16547 vm_map_clip_end(map, entry, end);
16548 if (entry->is_sub_map) {
16549 assert(!entry->use_pmap);
16550 }
16551
16552 if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16553 assert(entry->protection & VM_PROT_WRITE);
16554 #if __arm64e__
16555 assert(!entry->used_for_tpro);
16556 #endif /* __arm64e__ */
16557 assert(!entry->used_for_jit);
16558 entry->zero_wired_pages = TRUE;
16559 } else {
16560 entry->behavior = new_behavior;
16561 }
16562 entry = entry->vme_next;
16563 }
16564
16565 vm_map_unlock(map);
16566 break;
16567
16568 /*
16569 * The rest of these are different from the above in that they cause
16570 * an immediate action to take place as opposed to setting a behavior that
16571 * affects future actions.
16572 */
16573
16574 case VM_BEHAVIOR_WILLNEED:
16575 return vm_map_willneed(map, start, end);
16576
16577 case VM_BEHAVIOR_DONTNEED:
16578 return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
16579
16580 case VM_BEHAVIOR_FREE:
16581 return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
16582
16583 case VM_BEHAVIOR_REUSABLE:
16584 return vm_map_reusable_pages(map, start, end);
16585
16586 case VM_BEHAVIOR_REUSE:
16587 return vm_map_reuse_pages(map, start, end);
16588
16589 case VM_BEHAVIOR_CAN_REUSE:
16590 return vm_map_can_reuse(map, start, end);
16591
16592 #if MACH_ASSERT
16593 case VM_BEHAVIOR_PAGEOUT:
16594 return vm_map_pageout(map, start, end);
16595 #endif /* MACH_ASSERT */
16596
16597 case VM_BEHAVIOR_ZERO:
16598 return vm_map_zero(map, start, end);
16599
16600 default:
16601 return KERN_INVALID_ARGUMENT;
16602 }
16603
16604 return KERN_SUCCESS;
16605 }
16606
16607
16608 /*
16609 * Internals for madvise(MADV_WILLNEED) system call.
16610 *
16611 * The implementation is to:
16612 * a) read ahead if the mapping corresponds to a mapped regular file, or
16613 * b) fault in the pages (zero-fill, decompress, etc.) if it's an anonymous mapping.
16614 */
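/*
 * Rough usage sketch (assumption: this is reached from
 * VM_BEHAVIOR_WILLNEED above, i.e. madvise(addr, len, MADV_WILLNEED)):
 * the loop below pre-faults the range one "effective" page at a
 * time and passes the remaining length as a clustering hint so the
 * pager can issue larger read-aheads.
 */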
16615 static kern_return_t
16616 vm_map_willneed(
16617 vm_map_t map,
16618 vm_map_offset_t start,
16619 vm_map_offset_t end
16620 )
16621 {
16622 vm_map_entry_t entry;
16623 kern_return_t kr;
16624 vm_object_size_t len;
16625 vm_size_t region_size;
16626
16627 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_START,
16628 start, end);
16629 struct vm_object_fault_info fault_info = {
16630 .interruptible = THREAD_UNINT,
16631 .behavior = VM_BEHAVIOR_SEQUENTIAL,
16632 /* Do not activate pages after faulting */
16633 .stealth = true,
16634 /* Don't wait for busy pages */
16635 .fi_no_sleep = true,
16636 };
16637
16638 /*
16639 * The MADV_WILLNEED operation doesn't require any changes to the
16640 * vm_map_entry_t's, so the read lock is sufficient.
16641 */
16642
16643 vm_map_lock_read(map);
16644
16645 /*
16646 * The madvise semantics require that the address range be fully
16647 * allocated with no holes. Otherwise, we're required to return
16648 * an error.
16649 */
16650
16651 if (!vm_map_range_check(map, start, end, &entry)) {
16652 vm_map_unlock_read(map);
16653 kr = KERN_INVALID_ADDRESS;
16654 goto done;
16655 }
16656
16657 /*
16658 * Examine each vm_map_entry_t in the range.
16659 */
16660 while (start < end) {
16661 /*
16662 * Set the length so we don't go beyond the end of the
16663 * map_entry or beyond the end of the range we were given.
16664 * This range could also span multiple map entries, all of which
16665 * map different files, so make sure we only do the right amount
16666 * of I/O for each object. Note that it's possible for there
16667 * to be multiple map entries all referring to the same object
16668 * but with different page permissions, but it's not worth
16669 * trying to optimize that case.
16670 */
16671 len = MIN(entry->vme_end - start, end - start);
16672
16673 vm_map_offset_t addr = start;
16674
16675 vm_size_t effective_page_mask = MIN(vm_map_page_mask(map), PAGE_MASK);
16676 vm_map_offset_t effective_page_size = effective_page_mask + 1;
16677
16678 /*
16679 * Write-fault if the entry supports it to preclude subsequent soft-faults
16680 */
16681 vm_prot_t fault_prot = entry->protection & VM_PROT_WRITE ?
16682 VM_PROT_WRITE : VM_PROT_READ;
16683
16684 vm_map_unlock_read(map);
16685
16686 region_size = len;
16687 while (region_size) {
16688 /*
16689 * Provide a hint for how much clustering we would like. Note that
16690 * each individual fault will limit the size of each request to
16691 * MAX_UPL_TRANSFER_BYTES.
16692 */
16693 fault_info.cluster_size = region_size;
16694 kr = vm_pre_fault_with_info(
16695 map,
16696 vm_map_trunc_page(addr, effective_page_mask),
16697 fault_prot,
16698 &fault_info);
16699 if (kr == KERN_ALREADY_WAITING) {
16700 /*
16701 * The page is busy being faulted/paged by another thread.
16702 */
16703 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_NONE,
16704 task_pid(current_task()), addr, kr);
16705 kr = KERN_SUCCESS;
16706 } else if (kr != KERN_SUCCESS) {
16707 goto done;
16708 }
16709 region_size -= effective_page_size;
16710 addr += effective_page_size;
16711 }
16712
16713 start += len;
16714 if (start >= end) {
16715 kr = KERN_SUCCESS;
16716 goto done;
16717 }
16718
16719 if (thread_should_abort(current_thread())) {
16720 kr = KERN_ABORTED;
16721 goto done;
16722 }
16723
16724 /* look up next entry */
16725 vm_map_lock_read(map);
16726 if (!vm_map_lookup_entry(map, start, &entry)) {
16727 /*
16728 * There's a new hole in the address range.
16729 */
16730 vm_map_unlock_read(map);
16731 kr = KERN_INVALID_ADDRESS;
16732 goto done;
16733 }
16734 }
16735
16736 vm_map_unlock_read(map);
16737 done:
16738 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16739 start, kr);
16740 return kr;
16741 }
16742
16743 static boolean_t
16744 vm_map_entry_is_reusable(
16745 vm_map_entry_t entry)
16746 {
16747 /* Only user map entries */
16748
16749 vm_object_t object;
16750
16751 if (entry->is_sub_map) {
16752 return FALSE;
16753 }
16754
16755 switch (VME_ALIAS(entry)) {
16756 case VM_MEMORY_MALLOC:
16757 case VM_MEMORY_MALLOC_SMALL:
16758 case VM_MEMORY_MALLOC_LARGE:
16759 case VM_MEMORY_REALLOC:
16760 case VM_MEMORY_MALLOC_TINY:
16761 case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16762 case VM_MEMORY_MALLOC_LARGE_REUSED:
16763 /*
16764 * This is a malloc() memory region: check if it's still
16765 * in its original state and can be re-used for more
16766 * malloc() allocations.
16767 */
16768 break;
16769 default:
16770 /*
16771 * Not a malloc() memory region: let the caller decide if
16772 * it's re-usable.
16773 */
16774 return TRUE;
16775 }
16776
16777 if (/*entry->is_shared ||*/
16778 entry->is_sub_map ||
16779 entry->in_transition ||
16780 entry->protection != VM_PROT_DEFAULT ||
16781 entry->max_protection != VM_PROT_ALL ||
16782 entry->inheritance != VM_INHERIT_DEFAULT ||
16783 entry->no_cache ||
16784 entry->vme_permanent ||
16785 entry->superpage_size != FALSE ||
16786 entry->zero_wired_pages ||
16787 entry->wired_count != 0 ||
16788 entry->user_wired_count != 0) {
16789 return FALSE;
16790 }
16791
16792 object = VME_OBJECT(entry);
16793 if (object == VM_OBJECT_NULL) {
16794 return TRUE;
16795 }
16796 if (
16797 #if 0
16798 /*
16799 * Let's proceed even if the VM object is potentially
16800 * shared.
16801 * We check for this later when processing the actual
16802 * VM pages, so the contents will be safe if shared.
16803 *
16804 * But we can still mark this memory region as "reusable" to
16805 * acknowledge that the caller did let us know that the memory
16806 * could be re-used and should not be penalized for holding
16807 * on to it. This allows its "resident size" to not include
16808 * the reusable range.
16809 */
16810 object->ref_count == 1 &&
16811 #endif
16812 object->vo_copy == VM_OBJECT_NULL &&
16813 object->shadow == VM_OBJECT_NULL &&
16814 object->internal &&
16815 object->purgable == VM_PURGABLE_DENY &&
16816 HAS_DEFAULT_CACHEABILITY(object->wimg_bits & VM_WIMG_MASK) &&
16817 !object->code_signed) {
16818 return TRUE;
16819 }
16820 return FALSE;
16821 }
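/*
 * Illustrative reading of the checks above (not exhaustive): a plain,
 * unwired malloc region with default protections backed by an
 * anonymous, non-shadowed, non-purgeable object is considered
 * reusable; a submap, a wired or "permanent" entry, or a code-signed
 * object is not.
 */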
16822
16823 static kern_return_t
16824 vm_map_reuse_pages(
16825 vm_map_t map,
16826 vm_map_offset_t start,
16827 vm_map_offset_t end)
16828 {
16829 vm_map_entry_t entry;
16830 vm_object_t object;
16831 vm_object_offset_t start_offset, end_offset;
16832
16833 /*
16834 * The MADV_REUSE operation doesn't require any changes to the
16835 * vm_map_entry_t's, so the read lock is sufficient.
16836 */
16837
16838 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16839 /*
16840 * XXX TODO4K
16841 * need to figure out what reusable means for a
16842 * portion of a native page.
16843 */
16844 return KERN_SUCCESS;
16845 }
16846
16847 vm_map_lock_read(map);
16848 assert(map->pmap != kernel_pmap); /* protect alias access */
16849
16850 /*
16851 * The madvise semantics require that the address range be fully
16852 * allocated with no holes. Otherwise, we're required to return
16853 * an error.
16854 */
16855
16856 if (!vm_map_range_check(map, start, end, &entry)) {
16857 vm_map_unlock_read(map);
16858 vm_page_stats_reusable.reuse_pages_failure++;
16859 return KERN_INVALID_ADDRESS;
16860 }
16861
16862 /*
16863 * Examine each vm_map_entry_t in the range.
16864 */
16865 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16866 entry = entry->vme_next) {
16867 /*
16868 * Sanity check on the VM map entry.
16869 */
16870 if (!vm_map_entry_is_reusable(entry)) {
16871 vm_map_unlock_read(map);
16872 vm_page_stats_reusable.reuse_pages_failure++;
16873 return KERN_INVALID_ADDRESS;
16874 }
16875
16876 /*
16877 * The first time through, the start address could be anywhere
16878 * within the vm_map_entry we found. So adjust the offset to
16879 * correspond.
16880 */
16881 if (entry->vme_start < start) {
16882 start_offset = start - entry->vme_start;
16883 } else {
16884 start_offset = 0;
16885 }
16886 end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16887 start_offset += VME_OFFSET(entry);
16888 end_offset += VME_OFFSET(entry);
16889
16890 object = VME_OBJECT(entry);
16891 if (object != VM_OBJECT_NULL) {
16892 vm_object_lock(object);
16893 vm_object_reuse_pages(object, start_offset, end_offset,
16894 TRUE);
16895 vm_object_unlock(object);
16896 }
16897
16898 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
16899 /*
16900 * XXX
16901 * We do not hold the VM map exclusively here.
16902 * The "alias" field is not that critical, so it's
16903 * safe to update it here, as long as it is the only
16904 * one that can be modified while holding the VM map
16905 * "shared".
16906 */
16907 VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
16908 }
16909 }
16910
16911 vm_map_unlock_read(map);
16912 vm_page_stats_reusable.reuse_pages_success++;
16913 return KERN_SUCCESS;
16914 }
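/*
 * Hedged usage note: this is the counterpart of vm_map_reusable_pages()
 * below (MADV_FREE_REUSE vs. MADV_FREE_REUSABLE in the BSD interface,
 * assuming the usual madvise mapping). A malloc implementation would
 * typically mark a freed region reusable and call "reuse" before
 * handing it out again, which is why the MALLOC_LARGE_REUSABLE alias
 * is flipped back to MALLOC_LARGE_REUSED here.
 */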
16915
16916
16917 static kern_return_t
16918 vm_map_reusable_pages(
16919 vm_map_t map,
16920 vm_map_offset_t start,
16921 vm_map_offset_t end)
16922 {
16923 vm_map_entry_t entry;
16924 vm_object_t object;
16925 vm_object_offset_t start_offset, end_offset;
16926 vm_map_offset_t pmap_offset;
16927
16928 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16929 /*
16930 * XXX TODO4K
16931 * need to figure out what reusable means for a portion
16932 * of a native page.
16933 */
16934 return KERN_SUCCESS;
16935 }
16936
16937 /*
16938 * The MADV_REUSABLE operation doesn't require any changes to the
16939 * vm_map_entry_t's, so the read lock is sufficient.
16940 */
16941
16942 vm_map_lock_read(map);
16943 assert(map->pmap != kernel_pmap); /* protect alias access */
16944
16945 /*
16946 * The madvise semantics require that the address range be fully
16947 * allocated with no holes. Otherwise, we're required to return
16948 * an error.
16949 */
16950
16951 if (!vm_map_range_check(map, start, end, &entry)) {
16952 vm_map_unlock_read(map);
16953 vm_page_stats_reusable.reusable_pages_failure++;
16954 return KERN_INVALID_ADDRESS;
16955 }
16956
16957 /*
16958 * Examine each vm_map_entry_t in the range.
16959 */
16960 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16961 entry = entry->vme_next) {
16962 int kill_pages = 0;
16963 boolean_t kill_no_write = FALSE;
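/*
 * kill_pages is set to 1 below when the object's contents can be
 * discarded safely (single reference, or an asymmetric copy strategy
 * with no copy object) and to -1 when the object might be shared,
 * in which case only the "shared" statistics are updated and the
 * pages are left untouched.
 */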
16964
16965 /*
16966 * Sanity check on the VM map entry.
16967 */
16968 if (!vm_map_entry_is_reusable(entry)) {
16969 vm_map_unlock_read(map);
16970 vm_page_stats_reusable.reusable_pages_failure++;
16971 return KERN_INVALID_ADDRESS;
16972 }
16973
16974 if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
16975 #if __arm64e__
16976 && !entry->used_for_tpro
16977 #endif
16978 ) {
16979 /* not writable: can't discard contents */
16980 vm_map_unlock_read(map);
16981 vm_page_stats_reusable.reusable_nonwritable++;
16982 vm_page_stats_reusable.reusable_pages_failure++;
16983 return KERN_PROTECTION_FAILURE;
16984 }
16985
16986 /*
16987 * The first time through, the start address could be anywhere
16988 * within the vm_map_entry we found. So adjust the offset to
16989 * correspond.
16990 */
16991 if (entry->vme_start < start) {
16992 start_offset = start - entry->vme_start;
16993 pmap_offset = start;
16994 } else {
16995 start_offset = 0;
16996 pmap_offset = entry->vme_start;
16997 }
16998 end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16999 start_offset += VME_OFFSET(entry);
17000 end_offset += VME_OFFSET(entry);
17001
17002 object = VME_OBJECT(entry);
17003 if (object == VM_OBJECT_NULL) {
17004 continue;
17005 }
17006
17007 if ((entry->protection & VM_PROT_EXECUTE) ||
17008 entry->vme_xnu_user_debug) {
17009 /*
17010 * Executable or user debug pages might be write-protected by
17011 * hardware, so do not attempt to write to these pages.
17012 */
17013 kill_no_write = TRUE;
17014 }
17015
17016 vm_object_lock(object);
17017 if (((os_ref_get_count_raw(&object->ref_count) == 1) ||
17018 (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
17019 object->vo_copy == VM_OBJECT_NULL)) &&
17020 object->shadow == VM_OBJECT_NULL &&
17021 /*
17022 * "iokit_acct" entries are billed for their virtual size
17023 * (rather than for their resident pages only), so they
17024 * wouldn't benefit from making pages reusable, and it
17025 * would be hard to keep track of pages that are both
17026 * "iokit_acct" and "reusable" in the pmap stats and
17027 * ledgers.
17028 */
17029 !(entry->iokit_acct ||
17030 (!entry->is_sub_map && !entry->use_pmap))) {
17031 if (os_ref_get_count_raw(&object->ref_count) != 1) {
17032 vm_page_stats_reusable.reusable_shared++;
17033 }
17034 kill_pages = 1;
17035 } else {
17036 kill_pages = -1;
17037 }
17038 if (kill_pages != -1) {
17039 vm_object_deactivate_pages(object,
17040 start_offset,
17041 end_offset - start_offset,
17042 kill_pages,
17043 TRUE /*reusable_pages*/,
17044 kill_no_write,
17045 map->pmap,
17046 pmap_offset);
17047 } else {
17048 vm_page_stats_reusable.reusable_pages_shared++;
17049 DTRACE_VM4(vm_map_reusable_pages_shared,
17050 unsigned int, VME_ALIAS(entry),
17051 vm_map_t, map,
17052 vm_map_entry_t, entry,
17053 vm_object_t, object);
17054 }
17055 vm_object_unlock(object);
17056
17057 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
17058 VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
17059 /*
17060 * XXX
17061 * We do not hold the VM map exclusively here.
17062 * The "alias" field is not that critical, so it's
17063 * safe to update it here, as long as it is the only
17064 * one that can be modified while holding the VM map
17065 * "shared".
17066 */
17067 VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
17068 }
17069 }
17070
17071 vm_map_unlock_read(map);
17072 vm_page_stats_reusable.reusable_pages_success++;
17073 return KERN_SUCCESS;
17074 }
17075
17076
17077 static kern_return_t
17078 vm_map_can_reuse(
17079 vm_map_t map,
17080 vm_map_offset_t start,
17081 vm_map_offset_t end)
17082 {
17083 vm_map_entry_t entry;
17084
17085 /*
17086 * The MADV_CAN_REUSE operation doesn't require any changes to the
17087 * vm_map_entry_t's, so the read lock is sufficient.
17088 */
17089
17090 vm_map_lock_read(map);
17091 assert(map->pmap != kernel_pmap); /* protect alias access */
17092
17093 /*
17094 * The madvise semantics require that the address range be fully
17095 * allocated with no holes. Otherwise, we're required to return
17096 * an error.
17097 */
17098
17099 if (!vm_map_range_check(map, start, end, &entry)) {
17100 vm_map_unlock_read(map);
17101 vm_page_stats_reusable.can_reuse_failure++;
17102 return KERN_INVALID_ADDRESS;
17103 }
17104
17105 /*
17106 * Examine each vm_map_entry_t in the range.
17107 */
17108 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17109 entry = entry->vme_next) {
17110 /*
17111 * Sanity check on the VM map entry.
17112 */
17113 if (!vm_map_entry_is_reusable(entry)) {
17114 vm_map_unlock_read(map);
17115 vm_page_stats_reusable.can_reuse_failure++;
17116 return KERN_INVALID_ADDRESS;
17117 }
17118 }
17119
17120 vm_map_unlock_read(map);
17121 vm_page_stats_reusable.can_reuse_success++;
17122 return KERN_SUCCESS;
17123 }
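/*
 * Note: unlike REUSABLE/REUSE above, CAN_REUSE only validates that
 * the whole range consists of reusable-compatible entries; it does
 * not change any page or entry state.
 */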
17124
17125
17126 #if MACH_ASSERT
17127 static kern_return_t
17128 vm_map_pageout(
17129 vm_map_t map,
17130 vm_map_offset_t start,
17131 vm_map_offset_t end)
17132 {
17133 vm_map_entry_t entry;
17134
17135 /*
17136 * The MADV_PAGEOUT operation doesn't require any changes to the
17137 * vm_map_entry_t's, so the read lock is sufficient.
17138 */
17139
17140 vm_map_lock_read(map);
17141
17142 /*
17143 * The madvise semantics require that the address range be fully
17144 * allocated with no holes. Otherwise, we're required to return
17145 * an error.
17146 */
17147
17148 if (!vm_map_range_check(map, start, end, &entry)) {
17149 vm_map_unlock_read(map);
17150 return KERN_INVALID_ADDRESS;
17151 }
17152
17153 /*
17154 * Examine each vm_map_entry_t in the range.
17155 */
17156 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17157 entry = entry->vme_next) {
17158 vm_object_t object;
17159
17160 /*
17161 * Sanity check on the VM map entry.
17162 */
17163 if (entry->is_sub_map) {
17164 vm_map_t submap;
17165 vm_map_offset_t submap_start;
17166 vm_map_offset_t submap_end;
17167 vm_map_entry_t submap_entry;
17168
17169 submap = VME_SUBMAP(entry);
17170 submap_start = VME_OFFSET(entry);
17171 submap_end = submap_start + (entry->vme_end -
17172 entry->vme_start);
17173
17174 vm_map_lock_read(submap);
17175
17176 if (!vm_map_range_check(submap,
17177 submap_start,
17178 submap_end,
17179 &submap_entry)) {
17180 vm_map_unlock_read(submap);
17181 vm_map_unlock_read(map);
17182 return KERN_INVALID_ADDRESS;
17183 }
17184
17185 if (submap_entry->is_sub_map) {
17186 vm_map_unlock_read(submap);
17187 continue;
17188 }
17189
17190 object = VME_OBJECT(submap_entry);
17191 if (object == VM_OBJECT_NULL || !object->internal) {
17192 vm_map_unlock_read(submap);
17193 continue;
17194 }
17195
17196 vm_object_pageout(object);
17197
17198 vm_map_unlock_read(submap);
17199 submap = VM_MAP_NULL;
17200 submap_entry = VM_MAP_ENTRY_NULL;
17201 continue;
17202 }
17203
17204 object = VME_OBJECT(entry);
17205 if (object == VM_OBJECT_NULL || !object->internal) {
17206 continue;
17207 }
17208
17209 vm_object_pageout(object);
17210 }
17211
17212 vm_map_unlock_read(map);
17213 return KERN_SUCCESS;
17214 }
17215 #endif /* MACH_ASSERT */
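/*
 * Note: vm_map_pageout() (VM_BEHAVIOR_PAGEOUT) is only compiled on
 * MACH_ASSERT builds, matching the guard around its caller in
 * vm_map_behavior_set().
 */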
17216
17217 /*
17218 * This function determines if the zero operation can be run on the
17219 * respective entry. Additional checks on the object are in
17220 * vm_object_zero_preflight.
17221 */
17222 static kern_return_t
17223 vm_map_zero_entry_preflight(vm_map_entry_t entry)
17224 {
17225 /*
17226 * Zeroing is restricted to writable non-executable entries and non-JIT
17227 * regions.
17228 */
17229 if (!(entry->protection & VM_PROT_WRITE) ||
17230 (entry->protection & VM_PROT_EXECUTE) ||
17231 entry->used_for_jit ||
17232 entry->vme_xnu_user_debug) {
17233 return KERN_PROTECTION_FAILURE;
17234 }
17235
17236 /*
17237 * Zeroing for copy on write isn't yet supported. Zeroing is also not
17238 * allowed for submaps.
17239 */
17240 if (entry->needs_copy || entry->is_sub_map) {
17241 return KERN_NO_ACCESS;
17242 }
17243
17244 return KERN_SUCCESS;
17245 }
17246
17247 /*
17248 * This function translates entry's start and end to offsets in the object
17249 */
17250 static void
17251 vm_map_get_bounds_in_object(
17252 vm_map_entry_t entry,
17253 vm_map_offset_t start,
17254 vm_map_offset_t end,
17255 vm_map_offset_t *start_offset,
17256 vm_map_offset_t *end_offset)
17257 {
17258 if (entry->vme_start < start) {
17259 *start_offset = start - entry->vme_start;
17260 } else {
17261 *start_offset = 0;
17262 }
17263 *end_offset = MIN(end, entry->vme_end) - entry->vme_start;
17264 *start_offset += VME_OFFSET(entry);
17265 *end_offset += VME_OFFSET(entry);
17266 }
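/*
 * Worked example (hypothetical numbers): for an entry covering
 * [0x10000, 0x20000) with VME_OFFSET == 0x5000, a request for
 * [0x14000, 0x30000) yields *start_offset == 0x9000 and
 * *end_offset == 0x15000, i.e. the portion of the object backing
 * the clipped range.
 */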
17267
17268 /*
17269 * This function iterates through the entries in the requested range
17270 * and zeroes any resident pages in the corresponding objects. Compressed
17271 * pages are dropped instead of being faulted in and zeroed.
17272 */
17273 static kern_return_t
17274 vm_map_zero(
17275 vm_map_t map,
17276 vm_map_offset_t start,
17277 vm_map_offset_t end)
17278 {
17279 vm_map_entry_t entry;
17280 vm_map_offset_t cur = start;
17281 kern_return_t ret;
17282
17283 /*
17284 * This operation isn't supported where the map page size is less than
17285 * the hardware page size. Caller will need to handle error and
17286 * explicitly zero memory if needed.
17287 */
17288 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17289 return KERN_NO_ACCESS;
17290 }
17291
17292 /*
17293 * The MADV_ZERO operation doesn't require any changes to the
17294 * vm_map_entry_t's, so the read lock is sufficient.
17295 */
17296 vm_map_lock_read(map);
17297 assert(map->pmap != kernel_pmap); /* protect alias access */
17298
17299 /*
17300 * The madvise semantics require that the address range be fully
17301 * allocated with no holes. Otherwise, we're required to return
17302 * an error. This check needs to be redone if the map has changed.
17303 */
17304 if (!vm_map_range_check(map, cur, end, &entry)) {
17305 vm_map_unlock_read(map);
17306 return KERN_INVALID_ADDRESS;
17307 }
17308
17309 /*
17310 * Examine each vm_map_entry_t in the range.
17311 */
17312 while (entry != vm_map_to_entry(map) && entry->vme_start < end) {
17313 vm_map_offset_t cur_offset;
17314 vm_map_offset_t end_offset;
17315 unsigned int last_timestamp = map->timestamp;
17316 vm_object_t object = VME_OBJECT(entry);
17317
17318 ret = vm_map_zero_entry_preflight(entry);
17319 if (ret != KERN_SUCCESS) {
17320 vm_map_unlock_read(map);
17321 return ret;
17322 }
17323
17324 if (object == VM_OBJECT_NULL) {
17325 entry = entry->vme_next;
17326 continue;
17327 }
17328
17329 vm_map_get_bounds_in_object(entry, cur, end, &cur_offset, &end_offset);
17330 vm_object_lock(object);
17331 /*
17332 * Take a reference on the object as vm_object_zero will drop the object
17333 * lock when it encounters a busy page.
17334 */
17335 vm_object_reference_locked(object);
17336 vm_map_unlock_read(map);
17337
17338 ret = vm_object_zero(object, cur_offset, end_offset);
17339 vm_object_unlock(object);
17340 vm_object_deallocate(object);
17341 if (ret != KERN_SUCCESS) {
17342 return ret;
17343 }
17344 /*
17345 * Update cur as vm_object_zero has succeeded.
17346 */
17347 cur += (end_offset - cur_offset);
17348 if (cur == end) {
17349 return KERN_SUCCESS;
17350 }
17351
17352 /*
17353 * If the map timestamp has changed, restart by relooking up cur in the
17354 * map
17355 */
17356 vm_map_lock_read(map);
17357 if (last_timestamp != map->timestamp) {
17358 /*
17359 * Relookup cur in the map
17360 */
17361 if (!vm_map_range_check(map, cur, end, &entry)) {
17362 vm_map_unlock_read(map);
17363 return KERN_INVALID_ADDRESS;
17364 }
17365 continue;
17366 }
17367 /*
17368 * If the map hasn't changed proceed with the next entry
17369 */
17370 entry = entry->vme_next;
17371 }
17372
17373 vm_map_unlock_read(map);
17374 return KERN_SUCCESS;
17375 }
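/*
 * Note: this is reached via VM_BEHAVIOR_ZERO in vm_map_behavior_set()
 * above. The map read lock is dropped around each vm_object_zero()
 * call, so the range is re-validated whenever the map timestamp has
 * changed in the meantime.
 */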
17376
17377
17378 /*
17379 * Routine: vm_map_entry_insert
17380 *
17381 * Description: This routine inserts a new vm_entry in a locked map.
17382 */
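/*
 * The map must be locked exclusively by the caller. The new entry is
 * linked after "insp_entry" and the map size and hints are updated
 * accordingly.
 */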
17383 static vm_map_entry_t
17384 vm_map_entry_insert(
17385 vm_map_t map,
17386 vm_map_entry_t insp_entry,
17387 vm_map_offset_t start,
17388 vm_map_offset_t end,
17389 vm_object_t object,
17390 vm_object_offset_t offset,
17391 vm_map_kernel_flags_t vmk_flags,
17392 boolean_t needs_copy,
17393 vm_prot_t cur_protection,
17394 vm_prot_t max_protection,
17395 vm_inherit_t inheritance,
17396 boolean_t clear_map_aligned)
17397 {
17398 vm_map_entry_t new_entry;
17399 boolean_t map_aligned = FALSE;
17400
17401 assert(insp_entry != (vm_map_entry_t)0);
17402 vm_map_lock_assert_exclusive(map);
17403
17404 __assert_only vm_object_offset_t end_offset = 0;
17405 assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
17406
17407 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
17408 map_aligned = TRUE;
17409 }
17410 if (clear_map_aligned &&
17411 (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
17412 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
17413 map_aligned = FALSE;
17414 }
17415 if (map_aligned) {
17416 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
17417 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
17418 } else {
17419 assert(page_aligned(start));
17420 assert(page_aligned(end));
17421 }
17422 assert(start < end);
17423
17424 new_entry = vm_map_entry_create(map);
17425
17426 new_entry->vme_start = start;
17427 new_entry->vme_end = end;
17428
17429 if (vmk_flags.vmkf_submap) {
17430 new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
17431 VME_SUBMAP_SET(new_entry, (vm_map_t)object);
17432 } else {
17433 VME_OBJECT_SET(new_entry, object, false, 0);
17434 }
17435 VME_OFFSET_SET(new_entry, offset);
17436 VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);
17437
17438 new_entry->map_aligned = map_aligned;
17439 new_entry->needs_copy = needs_copy;
17440 new_entry->inheritance = inheritance;
17441 new_entry->protection = cur_protection;
17442 new_entry->max_protection = max_protection;
17443 /*
17444 * submap: "use_pmap" means "nested".
17445 * default: false.
17446 *
17447 * object: "use_pmap" means "use pmap accounting" for footprint.
17448 * default: true.
17449 */
17450 new_entry->use_pmap = !vmk_flags.vmkf_submap;
17451 new_entry->no_cache = vmk_flags.vmf_no_cache;
17452 new_entry->vme_permanent = vmk_flags.vmf_permanent;
17453 new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
17454 new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
17455 new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);
17456
17457 if (vmk_flags.vmkf_map_jit) {
17458 if (!(map->jit_entry_exists) ||
17459 VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
17460 new_entry->used_for_jit = TRUE;
17461 map->jit_entry_exists = TRUE;
17462 }
17463 }
17464
17465 /*
17466 * Insert the new entry into the list.
17467 */
17468
17469 vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
17470 map->size += end - start;
17471
17472 /*
17473 * Update the free space hint and the lookup hint.
17474 */
17475
17476 SAVE_HINT_MAP_WRITE(map, new_entry);
17477 return new_entry;
17478 }
17479
17480 /*
17481 * Routine: vm_map_remap_extract
17482 *
17483 * Description: This routine returns a vm_entry list from a map.
17484 */
17485 static kern_return_t
17486 vm_map_remap_extract(
17487 vm_map_t map,
17488 vm_map_offset_t addr,
17489 vm_map_size_t size,
17490 boolean_t copy,
17491 vm_map_copy_t map_copy,
17492 vm_prot_t *cur_protection, /* IN/OUT */
17493 vm_prot_t *max_protection, /* IN/OUT */
17494 /* What, no behavior? */
17495 vm_inherit_t inheritance,
17496 vm_map_kernel_flags_t vmk_flags)
17497 {
17498 struct vm_map_header *map_header = &map_copy->cpy_hdr;
17499 kern_return_t result;
17500 vm_map_size_t mapped_size;
17501 vm_map_size_t tmp_size;
17502 vm_map_entry_t src_entry; /* result of last map lookup */
17503 vm_map_entry_t new_entry;
17504 vm_object_offset_t offset;
17505 vm_map_offset_t map_address;
17506 vm_map_offset_t src_start; /* start of entry to map */
17507 vm_map_offset_t src_end; /* end of region to be mapped */
17508 vm_object_t object;
17509 vm_map_version_t version;
17510 boolean_t src_needs_copy;
17511 boolean_t new_entry_needs_copy;
17512 vm_map_entry_t saved_src_entry;
17513 boolean_t src_entry_was_wired;
17514 vm_prot_t max_prot_for_prot_copy;
17515 vm_map_offset_t effective_page_mask;
17516 bool pageable, same_map;
17517 boolean_t vm_remap_legacy;
17518 vm_prot_t required_cur_prot, required_max_prot;
17519 vm_object_t new_copy_object; /* vm_object_copy_* result */
17520 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
17521
17522 pageable = vmk_flags.vmkf_copy_pageable;
17523 same_map = vmk_flags.vmkf_copy_same_map;
17524
17525 effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
17526
17527 assert(map != VM_MAP_NULL);
17528 assert(size != 0);
17529 assert(size == vm_map_round_page(size, effective_page_mask));
17530 assert(inheritance == VM_INHERIT_NONE ||
17531 inheritance == VM_INHERIT_COPY ||
17532 inheritance == VM_INHERIT_SHARE);
17533 assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17534 assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17535 assert((*cur_protection & *max_protection) == *cur_protection);
17536
17537 /*
17538 * Compute start and end of region.
17539 */
17540 src_start = vm_map_trunc_page(addr, effective_page_mask);
17541 src_end = vm_map_round_page(src_start + size, effective_page_mask);
17542
17543 /*
17544 * Initialize map_header.
17545 */
17546 map_header->nentries = 0;
17547 map_header->entries_pageable = pageable;
17548 // map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
17549 map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
17550 map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
17551 vm_map_store_init(map_header);
17552
17553 if (copy && vmk_flags.vmkf_remap_prot_copy) {
17554 /*
17555 * Special case for vm_map_protect(VM_PROT_COPY):
17556 * we want to set the new mappings' max protection to the
17557 * specified *max_protection...
17558 */
17559 max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
17560 /* ... but we want to use the vm_remap() legacy mode */
17561 vmk_flags.vmkf_remap_legacy_mode = true;
17562 *max_protection = VM_PROT_NONE;
17563 *cur_protection = VM_PROT_NONE;
17564 } else {
17565 max_prot_for_prot_copy = VM_PROT_NONE;
17566 }
17567
17568 if (vmk_flags.vmkf_remap_legacy_mode) {
17569 /*
17570 * vm_remap() legacy mode:
17571 * Extract all memory regions in the specified range and
17572 * collect the strictest set of protections allowed on the
17573 * entire range, so the caller knows what they can do with
17574 * the remapped range.
17575 * We start with VM_PROT_ALL and we'll remove the protections
17576 * missing from each memory region.
17577 */
17578 vm_remap_legacy = TRUE;
17579 *cur_protection = VM_PROT_ALL;
17580 *max_protection = VM_PROT_ALL;
17581 required_cur_prot = VM_PROT_NONE;
17582 required_max_prot = VM_PROT_NONE;
17583 } else {
17584 /*
17585 * vm_remap_new() mode:
17586 * Extract all memory regions in the specified range and
17587 * ensure that they have at least the protections specified
17588 * by the caller via *cur_protection and *max_protection.
17589 * The resulting mapping should have these protections.
17590 */
17591 vm_remap_legacy = FALSE;
17592 if (copy) {
17593 required_cur_prot = VM_PROT_NONE;
17594 required_max_prot = VM_PROT_READ;
17595 } else {
17596 required_cur_prot = *cur_protection;
17597 required_max_prot = *max_protection;
17598 }
17599 }
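/*
 * Hypothetical example of the two modes: extracting a range made of
 * one rw-/rwx region and one r--/r-x region yields cur=r--, max=r-x
 * in legacy mode (the intersection of what each region allows),
 * whereas in vm_remap_new() mode a caller asking for cur=rw-/max=rw-
 * up front would get KERN_PROTECTION_FAILURE on the r--/r-x region
 * since it does not grant write access.
 */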
17600
17601 map_address = 0;
17602 mapped_size = 0;
17603 result = KERN_SUCCESS;
17604
17605 /*
17606 * The specified source virtual space might correspond to
17607 * multiple map entries, so we need to loop over them.
17608 */
17609 vm_map_lock(map);
17610
17611 if (map->pmap == kernel_pmap) {
17612 map_copy->is_kernel_range = true;
17613 map_copy->orig_range = kmem_addr_get_range(addr, size);
17614 #if CONFIG_MAP_RANGES
17615 } else if (map->uses_user_ranges) {
17616 map_copy->is_user_range = true;
17617 map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
17618 #endif /* CONFIG_MAP_RANGES */
17619 }
17620
17621 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17622 /*
17623 * This address space uses sub-pages so the range might
17624 * not be re-mappable in an address space with larger
17625 * pages. Re-assemble any broken-up VM map entries to
17626 * improve our chances of making it work.
17627 */
17628 vm_map_simplify_range(map, src_start, src_end);
17629 }
17630 while (mapped_size != size) {
17631 vm_map_size_t entry_size;
17632
17633 /*
17634 * Find the beginning of the region.
17635 */
17636 if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
17637 result = KERN_INVALID_ADDRESS;
17638 break;
17639 }
17640
17641 if (src_start < src_entry->vme_start ||
17642 (mapped_size && src_start != src_entry->vme_start)) {
17643 result = KERN_INVALID_ADDRESS;
17644 break;
17645 }
17646
17647 tmp_size = size - mapped_size;
17648 if (src_end > src_entry->vme_end) {
17649 tmp_size -= (src_end - src_entry->vme_end);
17650 }
17651
17652 entry_size = (vm_map_size_t)(src_entry->vme_end -
17653 src_entry->vme_start);
17654
17655 if (src_entry->is_sub_map &&
17656 vmk_flags.vmkf_copy_single_object) {
17657 vm_map_t submap;
17658 vm_map_offset_t submap_start;
17659 vm_map_size_t submap_size;
17660 boolean_t submap_needs_copy;
17661
17662 /*
17663 * No check for "required protection" on "src_entry"
17664 * because the protections that matter are the ones
17665 * on the submap's VM map entry, which will be checked
17666 * during the call to vm_map_remap_extract() below.
17667 */
17668 object = VM_OBJECT_NULL;
17669
17670 submap_size = src_entry->vme_end - src_start;
17671 if (submap_size > size) {
17672 submap_size = size;
17673 }
17674 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17675 submap = VME_SUBMAP(src_entry);
17676 if (copy) {
17677 /*
17678 * The caller wants a copy-on-write re-mapping,
17679 * so let's extract from the submap accordingly.
17680 */
17681 submap_needs_copy = TRUE;
17682 } else if (src_entry->needs_copy) {
17683 /*
17684 * The caller wants a shared re-mapping but the
17685 * submap is mapped with "needs_copy", so its
17686 * contents can't be shared as is. Extract the
17687 * contents of the submap as "copy-on-write".
17688 * The re-mapping won't be shared with the
17689 * original mapping but this is equivalent to
17690 * what happened with the original "remap from
17691 * submap" code.
17692 * The shared region is mapped "needs_copy", for
17693 * example.
17694 */
17695 submap_needs_copy = TRUE;
17696 } else {
17697 /*
17698 * The caller wants a shared re-mapping and
17699 * this mapping can be shared (no "needs_copy"),
17700 * so let's extract from the submap accordingly.
17701 * Kernel submaps are mapped without
17702 * "needs_copy", for example.
17703 */
17704 submap_needs_copy = FALSE;
17705 }
17706 vm_map_reference(submap);
17707 vm_map_unlock(map);
17708 src_entry = NULL;
17709 if (vm_remap_legacy) {
17710 *cur_protection = VM_PROT_NONE;
17711 *max_protection = VM_PROT_NONE;
17712 }
17713
17714 DTRACE_VM7(remap_submap_recurse,
17715 vm_map_t, map,
17716 vm_map_offset_t, addr,
17717 vm_map_size_t, size,
17718 boolean_t, copy,
17719 vm_map_offset_t, submap_start,
17720 vm_map_size_t, submap_size,
17721 boolean_t, submap_needs_copy);
17722
17723 result = vm_map_remap_extract(submap,
17724 submap_start,
17725 submap_size,
17726 submap_needs_copy,
17727 map_copy,
17728 cur_protection,
17729 max_protection,
17730 inheritance,
17731 vmk_flags);
17732 vm_map_deallocate(submap);
17733
17734 if (result == KERN_SUCCESS &&
17735 submap_needs_copy &&
17736 !copy) {
17737 /*
17738 * We were asked for a "shared"
17739 * re-mapping but had to ask for a
17740 * "copy-on-write" remapping of the
17741 * submap's mapping to honor the
17742 * submap's "needs_copy".
17743 * We now need to resolve that
17744 * pending "copy-on-write" to
17745 * get something we can share.
17746 */
17747 vm_map_entry_t copy_entry;
17748 vm_object_offset_t copy_offset;
17749 vm_map_size_t copy_size;
17750 vm_object_t copy_object;
17751 copy_entry = vm_map_copy_first_entry(map_copy);
17752 copy_size = copy_entry->vme_end - copy_entry->vme_start;
17753 copy_object = VME_OBJECT(copy_entry);
17754 copy_offset = VME_OFFSET(copy_entry);
17755 if (copy_object == VM_OBJECT_NULL) {
17756 assert(copy_offset == 0);
17757 assert(!copy_entry->needs_copy);
17758 if (copy_entry->max_protection == VM_PROT_NONE) {
17759 assert(copy_entry->protection == VM_PROT_NONE);
17760 /* nothing to share */
17761 } else {
17762 assert(copy_offset == 0);
17763 copy_object = vm_object_allocate(copy_size);
17764 VME_OFFSET_SET(copy_entry, 0);
17765 VME_OBJECT_SET(copy_entry, copy_object, false, 0);
17766 assert(copy_entry->use_pmap);
17767 }
17768 } else if (copy_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17769 /* already shareable */
17770 assert(!copy_entry->needs_copy);
17771 } else if (copy_entry->needs_copy ||
17772 copy_object->shadowed ||
17773 (copy_object->internal &&
17774 !copy_object->true_share &&
17775 !copy_entry->is_shared &&
17776 copy_object->vo_size > copy_size)) {
17777 VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
17778 assert(copy_entry->use_pmap);
17779 if (copy_entry->needs_copy) {
17780 /* already write-protected */
17781 } else {
17782 vm_prot_t prot;
17783 prot = copy_entry->protection & ~VM_PROT_WRITE;
17784 vm_object_pmap_protect(copy_object,
17785 copy_offset,
17786 copy_size,
17787 PMAP_NULL,
17788 PAGE_SIZE,
17789 0,
17790 prot);
17791 }
17792 copy_entry->needs_copy = FALSE;
17793 }
17794 copy_object = VME_OBJECT(copy_entry);
17795 copy_offset = VME_OFFSET(copy_entry);
17796 if (copy_object &&
17797 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
17798 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
17799 copy_object->true_share = TRUE;
17800 }
17801 }
17802
17803 return result;
17804 }
17805
17806 if (src_entry->is_sub_map) {
17807 /* protections for submap mapping are irrelevant here */
17808 } else if (((src_entry->protection & required_cur_prot) !=
17809 required_cur_prot) ||
17810 ((src_entry->max_protection & required_max_prot) !=
17811 required_max_prot)) {
17812 if (vmk_flags.vmkf_copy_single_object &&
17813 mapped_size != 0) {
17814 /*
17815 * Single object extraction.
17816 * We can't extract more with the required
17817 * protection but we've extracted some, so
17818 * stop there and declare success.
17819 * The caller should check the size of
17820 * the copy entry we've extracted.
17821 */
17822 result = KERN_SUCCESS;
17823 } else {
17824 /*
17825 * VM range extraction.
17826 * Required protection is not available
17827 * for this part of the range: fail.
17828 */
17829 result = KERN_PROTECTION_FAILURE;
17830 }
17831 break;
17832 }
17833
17834 if (src_entry->is_sub_map) {
17835 vm_map_t submap;
17836 vm_map_offset_t submap_start;
17837 vm_map_size_t submap_size;
17838 vm_map_copy_t submap_copy;
17839 vm_prot_t submap_curprot, submap_maxprot;
17840 boolean_t submap_needs_copy;
17841
17842 /*
17843 * No check for "required protection" on "src_entry"
17844 * because the protections that matter are the ones
17845 * on the submap's VM map entry, which will be checked
17846 * during the call to vm_map_copy_extract() below.
17847 */
17848 object = VM_OBJECT_NULL;
17849 submap_copy = VM_MAP_COPY_NULL;
17850
17851 /* find equivalent range in the submap */
17852 submap = VME_SUBMAP(src_entry);
17853 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17854 submap_size = tmp_size;
17855 if (copy) {
17856 /*
17857 * The caller wants a copy-on-write re-mapping,
17858 * so let's extract from the submap accordingly.
17859 */
17860 submap_needs_copy = TRUE;
17861 } else if (src_entry->needs_copy) {
17862 /*
17863 * The caller wants a shared re-mapping but the
17864 * submap is mapped with "needs_copy", so its
17865 * contents can't be shared as is. Extract the
17866 * contents of the submap as "copy-on-write".
17867 * The re-mapping won't be shared with the
17868 * original mapping but this is equivalent to
17869 * what happened with the original "remap from
17870 * submap" code.
17871 * The shared region is mapped "needs_copy", for
17872 * example.
17873 */
17874 submap_needs_copy = TRUE;
17875 } else {
17876 /*
17877 * The caller wants a shared re-mapping and
17878 * this mapping can be shared (no "needs_copy"),
17879 * so let's extract from the submap accordingly.
17880 * Kernel submaps are mapped without
17881 * "needs_copy", for example.
17882 */
17883 submap_needs_copy = FALSE;
17884 }
17885 /* extra ref to keep submap alive */
17886 vm_map_reference(submap);
17887
17888 DTRACE_VM7(remap_submap_recurse,
17889 vm_map_t, map,
17890 vm_map_offset_t, addr,
17891 vm_map_size_t, size,
17892 boolean_t, copy,
17893 vm_map_offset_t, submap_start,
17894 vm_map_size_t, submap_size,
17895 boolean_t, submap_needs_copy);
17896
17897 /*
17898 * The map can be safely unlocked since we
17899 * already hold a reference on the submap.
17900 *
17901 * No timestamp since we don't care if the map
17902 * gets modified while we're down in the submap.
17903 * We'll resume the extraction at src_start + tmp_size
17904 * anyway.
17905 */
17906 vm_map_unlock(map);
17907 src_entry = NULL; /* not valid once map is unlocked */
17908
17909 if (vm_remap_legacy) {
17910 submap_curprot = VM_PROT_NONE;
17911 submap_maxprot = VM_PROT_NONE;
17912 if (max_prot_for_prot_copy) {
17913 submap_maxprot = max_prot_for_prot_copy;
17914 }
17915 } else {
17916 assert(!max_prot_for_prot_copy);
17917 submap_curprot = *cur_protection;
17918 submap_maxprot = *max_protection;
17919 }
17920 result = vm_map_copy_extract(submap,
17921 submap_start,
17922 submap_size,
17923 submap_needs_copy,
17924 &submap_copy,
17925 &submap_curprot,
17926 &submap_maxprot,
17927 inheritance,
17928 vmk_flags);
17929
17930 /* release extra ref on submap */
17931 vm_map_deallocate(submap);
17932 submap = VM_MAP_NULL;
17933
17934 if (result != KERN_SUCCESS) {
17935 vm_map_lock(map);
17936 break;
17937 }
17938
17939 /* transfer submap_copy entries to map_header */
17940 while (vm_map_copy_first_entry(submap_copy) !=
17941 vm_map_copy_to_entry(submap_copy)) {
17942 vm_map_entry_t copy_entry;
17943 vm_map_size_t copy_entry_size;
17944
17945 copy_entry = vm_map_copy_first_entry(submap_copy);
17946
17947 /*
17948 * Prevent kernel_object from being exposed to
17949 * user space.
17950 */
17951 if (__improbable(copy_entry->vme_kernel_object)) {
17952 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17953 proc_selfpid(),
17954 (get_bsdtask_info(current_task())
17955 ? proc_name_address(get_bsdtask_info(current_task()))
17956 : "?"));
17957 DTRACE_VM(extract_kernel_only);
17958 result = KERN_INVALID_RIGHT;
17959 vm_map_copy_discard(submap_copy);
17960 submap_copy = VM_MAP_COPY_NULL;
17961 vm_map_lock(map);
17962 break;
17963 }
17964
17965 vm_map_copy_entry_unlink(submap_copy, copy_entry);
17966 copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
17967 copy_entry->vme_start = map_address;
17968 copy_entry->vme_end = map_address + copy_entry_size;
17969 map_address += copy_entry_size;
17970 mapped_size += copy_entry_size;
17971 src_start += copy_entry_size;
17972 assert(src_start <= src_end);
17973 _vm_map_store_entry_link(map_header,
17974 map_header->links.prev,
17975 copy_entry);
17976 }
17977 /* done with submap_copy */
17978 vm_map_copy_discard(submap_copy);
17979
17980 if (vm_remap_legacy) {
17981 *cur_protection &= submap_curprot;
17982 *max_protection &= submap_maxprot;
17983 }
17984
17985 /* re-acquire the map lock and continue to next entry */
17986 vm_map_lock(map);
17987 continue;
17988 } else {
17989 object = VME_OBJECT(src_entry);
17990
17991 /*
17992 * Prevent kernel_object from being exposed to
17993 * user space.
17994 */
17995 if (__improbable(is_kernel_object(object))) {
17996 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17997 proc_selfpid(),
17998 (get_bsdtask_info(current_task())
17999 ? proc_name_address(get_bsdtask_info(current_task()))
18000 : "?"));
18001 DTRACE_VM(extract_kernel_only);
18002 result = KERN_INVALID_RIGHT;
18003 break;
18004 }
18005
18006 if (src_entry->iokit_acct) {
18007 /*
18008 * This entry uses "IOKit accounting".
18009 */
18010 } else if (object != VM_OBJECT_NULL &&
18011 object->internal &&
18012 (object->purgable != VM_PURGABLE_DENY ||
18013 object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
18014 /*
18015 * Purgeable objects have their own accounting:
18016 * no pmap accounting for them.
18017 */
18018 assertf(!src_entry->use_pmap,
18019 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
18020 map,
18021 src_entry,
18022 (uint64_t)src_entry->vme_start,
18023 (uint64_t)src_entry->vme_end,
18024 src_entry->protection,
18025 src_entry->max_protection,
18026 VME_ALIAS(src_entry));
18027 } else {
18028 /*
18029 * Not IOKit or purgeable:
18030 * must be accounted by pmap stats.
18031 */
18032 assertf(src_entry->use_pmap,
18033 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
18034 map,
18035 src_entry,
18036 (uint64_t)src_entry->vme_start,
18037 (uint64_t)src_entry->vme_end,
18038 src_entry->protection,
18039 src_entry->max_protection,
18040 VME_ALIAS(src_entry));
18041 }
18042
18043 if (object == VM_OBJECT_NULL) {
18044 assert(!src_entry->needs_copy);
18045 if (src_entry->max_protection == VM_PROT_NONE) {
18046 assert(src_entry->protection == VM_PROT_NONE);
18047 /*
18048 * No VM object and no permissions:
18049 * this must be a reserved range with
18050 * nothing to share or copy.
18051 * There could also be all sorts of
18052 * pmap shenanigans within that reserved
18053 * range, so let's just copy the map
18054 * entry as is to remap a similar
18055 * reserved range.
18056 */
18057 offset = 0; /* no object => no offset */
18058 goto copy_src_entry;
18059 }
18060 object = vm_object_allocate(entry_size);
18061 VME_OFFSET_SET(src_entry, 0);
18062 VME_OBJECT_SET(src_entry, object, false, 0);
18063 assert(src_entry->use_pmap);
18064 assert(!map->mapped_in_other_pmaps);
18065 } else if (src_entry->wired_count ||
18066 object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
18067 /*
18068 * A wired memory region should not have
18069 * any pending copy-on-write and needs to
18070 * keep pointing at the VM object that
18071 * contains the wired pages.
18072 * If we're sharing this memory (copy=false),
18073 * we'll share this VM object.
18074 * If we're copying this memory (copy=true),
18075 * we'll call vm_object_copy_slowly() below
18076 * and use the new VM object for the remapping.
18077 *
18078 * Or, we are already using an asymmetric
18079 * copy, and therefore we already have
18080 * the right object.
18081 */
18082 assert(!src_entry->needs_copy);
18083 } else if (src_entry->needs_copy || object->shadowed ||
18084 (object->internal && !object->true_share &&
18085 !src_entry->is_shared &&
18086 object->vo_size > entry_size)) {
18087 bool is_writable;
18088
18089 VME_OBJECT_SHADOW(src_entry, entry_size,
18090 vm_map_always_shadow(map));
18091 assert(src_entry->use_pmap);
18092
18093 is_writable = false;
18094 if (src_entry->protection & VM_PROT_WRITE) {
18095 is_writable = true;
18096 #if __arm64e__
18097 } else if (src_entry->used_for_tpro) {
18098 is_writable = true;
18099 #endif /* __arm64e__ */
18100 }
18101 if (!src_entry->needs_copy && is_writable) {
18102 vm_prot_t prot;
18103
18104 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18105 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18106 __FUNCTION__,
18107 map, map->pmap,
18108 src_entry,
18109 (uint64_t)src_entry->vme_start,
18110 (uint64_t)src_entry->vme_end,
18111 src_entry->protection);
18112 }
18113
18114 prot = src_entry->protection & ~VM_PROT_WRITE;
18115
18116 if (override_nx(map,
18117 VME_ALIAS(src_entry))
18118 && prot) {
18119 prot |= VM_PROT_EXECUTE;
18120 }
18121
18122 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18123 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18124 __FUNCTION__,
18125 map, map->pmap,
18126 src_entry,
18127 (uint64_t)src_entry->vme_start,
18128 (uint64_t)src_entry->vme_end,
18129 prot);
18130 }
18131
18132 if (map->mapped_in_other_pmaps) {
18133 vm_object_pmap_protect(
18134 VME_OBJECT(src_entry),
18135 VME_OFFSET(src_entry),
18136 entry_size,
18137 PMAP_NULL,
18138 PAGE_SIZE,
18139 src_entry->vme_start,
18140 prot);
18141 #if MACH_ASSERT
18142 } else if (__improbable(map->pmap == PMAP_NULL)) {
18143 /*
18144 * Some VM tests (in vm_tests.c)
18145 * sometimes want to use a VM
18146 * map without a pmap.
18147 * Otherwise, this should never
18148 * happen.
18149 */
18150 if (!thread_get_test_option(test_option_vm_map_allow_null_pmap)) {
18151 panic("null pmap");
18152 }
18153 #endif /* MACH_ASSERT */
18154 } else {
18155 pmap_protect(vm_map_pmap(map),
18156 src_entry->vme_start,
18157 src_entry->vme_end,
18158 prot);
18159 }
18160 }
18161
18162 object = VME_OBJECT(src_entry);
18163 src_entry->needs_copy = FALSE;
18164 }
18165
18166
18167 vm_object_lock(object);
18168 vm_object_reference_locked(object); /* object ref. for new entry */
18169 assert(!src_entry->needs_copy);
18170 if (object->copy_strategy ==
18171 MEMORY_OBJECT_COPY_SYMMETRIC) {
18172 /*
18173 * If we want to share this object (copy==0),
18174 * it needs to be COPY_DELAY.
18175 * If we want to copy this object (copy==1),
18176 * we can't just set "needs_copy" on our side
18177 * and expect the other side to do the same
18178 * (symmetrically), so we can't let the object
18179 * stay COPY_SYMMETRIC.
18180 * So we always switch from COPY_SYMMETRIC to
18181 * COPY_DELAY.
18182 */
18183 object->copy_strategy =
18184 MEMORY_OBJECT_COPY_DELAY;
18185 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
18186 }
18187 vm_object_unlock(object);
18188 }
18189
18190 offset = (VME_OFFSET(src_entry) +
18191 (src_start - src_entry->vme_start));
18192
18193 copy_src_entry:
18194
18195
18196 new_entry = _vm_map_entry_create(map_header);
18197 vm_map_entry_copy(map, new_entry, src_entry);
18198 if (new_entry->is_sub_map) {
18199 /* clr address space specifics */
18200 new_entry->use_pmap = FALSE;
18201 } else if (copy) {
18202 /*
18203 * We're dealing with a copy-on-write operation,
18204 * so the resulting mapping should not inherit the
18205 * original mapping's accounting settings.
18206 * "use_pmap" should be reset to its default (TRUE)
18207 * so that the new mapping gets accounted for in
18208 * the task's memory footprint.
18209 */
18210 new_entry->use_pmap = TRUE;
18211 }
18212 /* "iokit_acct" was cleared in vm_map_entry_copy() */
18213 assert(!new_entry->iokit_acct);
18214
18215 new_entry->map_aligned = FALSE;
18216
18217 new_entry->vme_start = map_address;
18218 new_entry->vme_end = map_address + tmp_size;
18219 assert(new_entry->vme_start < new_entry->vme_end);
18220 if (copy && vmk_flags.vmkf_remap_prot_copy) {
18221 /* security: keep "permanent" and "csm_associated" */
18222 new_entry->vme_permanent = src_entry->vme_permanent;
18223 new_entry->csm_associated = src_entry->csm_associated;
18224 /*
18225 * Remapping for vm_map_protect(VM_PROT_COPY)
18226 * to convert a read-only mapping into a
18227 * copy-on-write version of itself but
18228 * with write access:
18229 * keep the original inheritance but let's not
18230 * add VM_PROT_WRITE to the max protection yet
18231 * since we want to do more security checks against
18232 * the target map.
18233 */
18234 new_entry->inheritance = src_entry->inheritance;
18235 new_entry->protection &= max_prot_for_prot_copy;
18236
18237 #ifdef __arm64e__
18238 /*
18239 * Remapping for vm_map_protect(VM_PROT_COPY) to remap a TPRO
18240 * region to be explicitly writable without TPRO is only permitted
18241 * if TPRO enforcement has been overridden.
18242 *
18243 * In this case we ensure any entries reset the TPRO state
18244 * and we permit the region to be downgraded from permanent.
18245 */
18246 if (new_entry->used_for_tpro) {
18247 if (vmk_flags.vmkf_tpro_enforcement_override) {
18248 new_entry->used_for_tpro = FALSE;
18249 new_entry->vme_permanent = FALSE;
18250 } else {
18251 result = KERN_PROTECTION_FAILURE;
18252 vm_object_deallocate(object);
18253 vm_map_entry_dispose(new_entry);
18254 new_entry = VM_MAP_ENTRY_NULL;
18255 break;
18256 }
18257 }
18258 #endif
18259 } else {
18260 new_entry->inheritance = inheritance;
18261 if (!vm_remap_legacy) {
18262 new_entry->protection = *cur_protection;
18263 new_entry->max_protection = *max_protection;
18264 }
18265 }
18266
18267 VME_OFFSET_SET(new_entry, offset);
18268
18269 /*
18270 * The new region has to be copied now if required.
18271 */
18272 RestartCopy:
18273 if (!copy) {
18274 if (src_entry->used_for_jit == TRUE) {
18275 if (same_map) {
18276 } else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
18277 /*
18278 * Cannot allow an entry describing a JIT
18279 * region to be shared across address spaces.
18280 */
18281 result = KERN_INVALID_ARGUMENT;
18282 vm_object_deallocate(object);
18283 vm_map_entry_dispose(new_entry);
18284 new_entry = VM_MAP_ENTRY_NULL;
18285 break;
18286 }
18287 }
18288
18289 if (!src_entry->is_sub_map &&
18290 VME_OBJECT(src_entry) == VM_OBJECT_NULL) {
18291 /* no accessible memory; nothing to share */
18292 assert(src_entry->protection == VM_PROT_NONE);
18293 assert(src_entry->max_protection == VM_PROT_NONE);
18294 src_entry->is_shared = FALSE;
18295 } else {
18296 src_entry->is_shared = TRUE;
18297 }
18298 if (!new_entry->is_sub_map &&
18299 VME_OBJECT(new_entry) == VM_OBJECT_NULL) {
18300 /* no accessible memory; nothing to share */
18301 assert(new_entry->protection == VM_PROT_NONE);
18302 assert(new_entry->max_protection == VM_PROT_NONE);
18303 new_entry->is_shared = FALSE;
18304 } else {
18305 new_entry->is_shared = TRUE;
18306 }
18307 if (!(new_entry->is_sub_map)) {
18308 new_entry->needs_copy = FALSE;
18309 }
18310 } else if (src_entry->is_sub_map) {
18311 /* make this a COW sub_map if not already */
18312 assert(new_entry->wired_count == 0);
18313 new_entry->needs_copy = TRUE;
18314 object = VM_OBJECT_NULL;
18315 } else if (src_entry->wired_count == 0 &&
18316 !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
18317 vm_object_copy_quickly(VME_OBJECT(new_entry),
18318 VME_OFFSET(new_entry),
18319 (new_entry->vme_end -
18320 new_entry->vme_start),
18321 &src_needs_copy,
18322 &new_entry_needs_copy)) {
18323 new_entry->needs_copy = new_entry_needs_copy;
18324 new_entry->is_shared = FALSE;
18325 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18326
18327 /*
18328 * Handle copy_on_write semantics.
18329 */
18330 if (src_needs_copy && !src_entry->needs_copy) {
18331 vm_prot_t prot;
18332
18333 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18334 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18335 __FUNCTION__,
18336 map, map->pmap, src_entry,
18337 (uint64_t)src_entry->vme_start,
18338 (uint64_t)src_entry->vme_end,
18339 src_entry->protection);
18340 }
18341
18342 prot = src_entry->protection & ~VM_PROT_WRITE;
18343
18344 if (override_nx(map,
18345 VME_ALIAS(src_entry))
18346 && prot) {
18347 prot |= VM_PROT_EXECUTE;
18348 }
18349
18350 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18351 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18352 __FUNCTION__,
18353 map, map->pmap, src_entry,
18354 (uint64_t)src_entry->vme_start,
18355 (uint64_t)src_entry->vme_end,
18356 prot);
18357 }
18358
18359 vm_object_pmap_protect(object,
18360 offset,
18361 entry_size,
18362 ((src_entry->is_shared
18363 || map->mapped_in_other_pmaps) ?
18364 PMAP_NULL : map->pmap),
18365 VM_MAP_PAGE_SIZE(map),
18366 src_entry->vme_start,
18367 prot);
18368
18369 assert(src_entry->wired_count == 0);
18370 src_entry->needs_copy = TRUE;
18371 }
18372 /*
18373 * Throw away the old object reference of the new entry.
18374 */
18375 vm_object_deallocate(object);
18376 } else {
18377 new_entry->is_shared = FALSE;
18378 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18379
18380 src_entry_was_wired = (src_entry->wired_count > 0);
18381 saved_src_entry = src_entry;
18382 src_entry = VM_MAP_ENTRY_NULL;
18383
18384 /*
18385 * The map can be safely unlocked since we
18386 * already hold a reference on the object.
18387 *
18388 * Record the timestamp of the map for later
18389 * verification, and unlock the map.
18390 */
18391 version.main_timestamp = map->timestamp;
18392 vm_map_unlock(map); /* Increments timestamp once! */
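/*
 * Illustrative sketch of the optimistic revalidation pattern used below
 * (it relies on vm_map_unlock() bumping map->timestamp exactly once):
 *
 *	saved = map->timestamp;
 *	vm_map_unlock(map);		// timestamp becomes saved + 1
 *	... perform the copy without holding the map lock ...
 *	vm_map_lock(map);
 *	if (saved + 1 != map->timestamp)
 *		... somebody else changed the map: redo the lookup ...
 */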
18393
18394 /*
18395 * Perform the copy.
18396 */
18397 if (src_entry_was_wired > 0 ||
18398 (debug4k_no_cow_copyin &&
18399 VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
18400 vm_object_lock(object);
18401 result = vm_object_copy_slowly(
18402 object,
18403 offset,
18404 (new_entry->vme_end -
18405 new_entry->vme_start),
18406 THREAD_UNINT,
18407 &new_copy_object);
18408 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18409 saved_used_for_jit = new_entry->used_for_jit;
18410 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18411 new_entry->used_for_jit = saved_used_for_jit;
18412 VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
18413 new_entry->needs_copy = FALSE;
18414 } else {
18415 vm_object_offset_t new_offset;
18416
18417 new_offset = VME_OFFSET(new_entry);
18418 result = vm_object_copy_strategically(
18419 object,
18420 offset,
18421 (new_entry->vme_end -
18422 new_entry->vme_start),
18423 false, /* forking */
18424 &new_copy_object,
18425 &new_offset,
18426 &new_entry_needs_copy);
18427 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18428 saved_used_for_jit = new_entry->used_for_jit;
18429 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18430 new_entry->used_for_jit = saved_used_for_jit;
18431 if (new_offset != VME_OFFSET(new_entry)) {
18432 VME_OFFSET_SET(new_entry, new_offset);
18433 }
18434
18435 new_entry->needs_copy = new_entry_needs_copy;
18436 }
18437
18438 /*
18439 * Throw away the old object reference of the new entry.
18440 */
18441 vm_object_deallocate(object);
18442
18443 if (result != KERN_SUCCESS &&
18444 result != KERN_MEMORY_RESTART_COPY) {
18445 vm_map_entry_dispose(new_entry);
18446 vm_map_lock(map);
18447 break;
18448 }
18449
18450 /*
18451 * Verify that the map has not substantially
18452 * changed while the copy was being made.
18453 */
18454
18455 vm_map_lock(map);
18456 if (version.main_timestamp + 1 != map->timestamp) {
18457 /*
18458 * Simple version comparison failed.
18459 *
18460 * Retry the lookup and verify that the
18461 * same object/offset are still present.
18462 */
18463 saved_src_entry = VM_MAP_ENTRY_NULL;
18464 vm_object_deallocate(VME_OBJECT(new_entry));
18465 vm_map_entry_dispose(new_entry);
18466 if (result == KERN_MEMORY_RESTART_COPY) {
18467 result = KERN_SUCCESS;
18468 }
18469 continue;
18470 }
18471 /* map hasn't changed: src_entry is still valid */
18472 src_entry = saved_src_entry;
18473 saved_src_entry = VM_MAP_ENTRY_NULL;
18474
18475 if (result == KERN_MEMORY_RESTART_COPY) {
18476 vm_object_reference(object);
18477 goto RestartCopy;
18478 }
18479 }
18480
18481 _vm_map_store_entry_link(map_header,
18482 map_header->links.prev, new_entry);
18483
18484 /* protections for submap mapping are irrelevant here */
18485 if (vm_remap_legacy && !src_entry->is_sub_map) {
18486 *cur_protection &= src_entry->protection;
18487 *max_protection &= src_entry->max_protection;
18488 }
18489
18490 map_address += tmp_size;
18491 mapped_size += tmp_size;
18492 src_start += tmp_size;
18493
18494 if (vmk_flags.vmkf_copy_single_object) {
18495 if (mapped_size != size) {
18496 DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
18497 map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
18498 if (src_entry->vme_next != vm_map_to_entry(map) &&
18499 src_entry->vme_next->vme_object_value ==
18500 src_entry->vme_object_value) {
18501 /* XXX TODO4K */
18502 DEBUG4K_ERROR("could have extended copy to next entry...\n");
18503 }
18504 }
18505 break;
18506 }
18507 } /* end while */
18508
18509 vm_map_unlock(map);
18510 if (result != KERN_SUCCESS) {
18511 /*
18512 * Free all allocated elements.
18513 */
18514 for (src_entry = map_header->links.next;
18515 src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
18516 src_entry = new_entry) {
18517 new_entry = src_entry->vme_next;
18518 _vm_map_store_entry_unlink(map_header, src_entry, false);
18519 if (src_entry->is_sub_map) {
18520 vm_map_deallocate(VME_SUBMAP(src_entry));
18521 } else {
18522 vm_object_deallocate(VME_OBJECT(src_entry));
18523 }
18524 vm_map_entry_dispose(src_entry);
18525 }
18526 }
18527 return result;
18528 }
18529
18530 bool
18531 vm_map_is_exotic(
18532 vm_map_t map)
18533 {
18534 return VM_MAP_IS_EXOTIC(map);
18535 }
18536
18537 bool
18538 vm_map_is_alien(
18539 vm_map_t map)
18540 {
18541 return VM_MAP_IS_ALIEN(map);
18542 }
18543
18544 #if XNU_TARGET_OS_OSX
18545 void
18546 vm_map_mark_alien(
18547 vm_map_t map)
18548 {
18549 vm_map_lock(map);
18550 map->is_alien = true;
18551 vm_map_unlock(map);
18552 }
18553
18554 void
18555 vm_map_single_jit(
18556 vm_map_t map)
18557 {
18558 vm_map_lock(map);
18559 map->single_jit = true;
18560 vm_map_unlock(map);
18561 }
18562 #endif /* XNU_TARGET_OS_OSX */
18563
18564
18565 /*
18566 * Callers of this function must call vm_map_copy_require on
18567 * previously created vm_map_copy_t or pass a newly created
18568 * one to ensure that it hasn't been forged.
18569 */
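/*
 * For example (a sketch, not taken from any particular caller): code that
 * received the copy object from elsewhere would typically do
 *
 *	vm_map_copy_require(copy_map);
 *	kr = vm_map_copy_to_physcopy(copy_map, target_map);
 */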
18570 static kern_return_t
18571 vm_map_copy_to_physcopy(
18572 vm_map_copy_t copy_map,
18573 vm_map_t target_map)
18574 {
18575 vm_map_size_t size;
18576 vm_map_entry_t entry;
18577 vm_map_entry_t new_entry;
18578 vm_object_t new_object;
18579 unsigned int pmap_flags;
18580 pmap_t new_pmap;
18581 vm_map_t new_map;
18582 vm_map_address_t src_start, src_end, src_cur;
18583 vm_map_address_t dst_start, dst_end, dst_cur;
18584 kern_return_t kr;
18585 void *kbuf;
18586
18587 /*
18588 * Perform the equivalent of vm_allocate() and memcpy().
18589 * Replace the mappings in "copy_map" with the newly allocated mapping.
18590 */
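/*
 * Rough outline of the steps below (a descriptive sketch of this function's
 * flow, not additional behavior):
 *
 *	1. create a temporary pmap and pageable map at copy_map's page size
 *	2. vm_map_copyout_internal() the original copy_map into it (source range)
 *	3. vm_map_enter() a freshly allocated VM object into it (destination range)
 *	4. copy page by page through a kernel buffer with copyinmap()/copyoutmap()
 *	5. destroy the temporary map, discard copy_map's old entries and install
 *	   a single entry covering the new object, sized and aligned for
 *	   target_map's page size
 */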
18591 DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18592
18593 assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
18594
18595 /* create a new pmap to map "copy_map" */
18596 pmap_flags = 0;
18597 assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
18598 #if PMAP_CREATE_FORCE_4K_PAGES
18599 pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
18600 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
18601 pmap_flags |= PMAP_CREATE_64BIT;
18602 new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
18603 if (new_pmap == NULL) {
18604 return KERN_RESOURCE_SHORTAGE;
18605 }
18606
18607 /* allocate new VM object */
18608 size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
18609 new_object = vm_object_allocate(size);
18610 assert(new_object);
18611
18612 /* allocate new VM map entry */
18613 new_entry = vm_map_copy_entry_create(copy_map);
18614 assert(new_entry);
18615
18616 /* finish initializing new VM map entry */
18617 new_entry->protection = VM_PROT_DEFAULT;
18618 new_entry->max_protection = VM_PROT_DEFAULT;
18619 new_entry->use_pmap = TRUE;
18620
18621 /* make new VM map entry point to new VM object */
18622 new_entry->vme_start = 0;
18623 new_entry->vme_end = size;
18624 VME_OBJECT_SET(new_entry, new_object, false, 0);
18625 VME_OFFSET_SET(new_entry, 0);
18626
18627 /* create a new pageable VM map to map "copy_map" */
18628 new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
18629 VM_MAP_CREATE_PAGEABLE);
18630 assert(new_map);
18631 vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
18632
18633 /* map "copy_map" in the new VM map */
18634 src_start = 0;
18635 kr = vm_map_copyout_internal(
18636 new_map,
18637 &src_start,
18638 copy_map,
18639 copy_map->size,
18640 FALSE, /* consume_on_success */
18641 VM_PROT_DEFAULT,
18642 VM_PROT_DEFAULT,
18643 VM_INHERIT_DEFAULT);
18644 assert(kr == KERN_SUCCESS);
18645 src_end = src_start + copy_map->size;
18646
18647 /* map "new_object" in the new VM map */
18648 vm_object_reference(new_object);
18649 dst_start = 0;
18650 kr = vm_map_enter(new_map,
18651 &dst_start,
18652 size,
18653 0, /* mask */
18654 VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
18655 new_object,
18656 0, /* offset */
18657 FALSE, /* needs copy */
18658 VM_PROT_DEFAULT,
18659 VM_PROT_DEFAULT,
18660 VM_INHERIT_DEFAULT);
18661 assert(kr == KERN_SUCCESS);
18662 dst_end = dst_start + size;
18663
18664 /* get a kernel buffer */
18665 kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
18666
18667 /* physically copy "copy_map" mappings to new VM object */
18668 for (src_cur = src_start, dst_cur = dst_start;
18669 src_cur < src_end;
18670 src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
18671 vm_size_t bytes;
18672
18673 bytes = PAGE_SIZE;
18674 if (src_cur + PAGE_SIZE > src_end) {
18675 /* partial copy for last page */
18676 bytes = src_end - src_cur;
18677 assert(bytes > 0 && bytes < PAGE_SIZE);
18678 /* rest of dst page should be zero-filled */
18679 }
18680 /* get bytes from src mapping */
18681 kr = copyinmap(new_map, src_cur, kbuf, bytes);
18682 if (kr != KERN_SUCCESS) {
18683 DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
18684 }
18685 /* put bytes in dst mapping */
18686 assert(dst_cur < dst_end);
18687 assert(dst_cur + bytes <= dst_end);
18688 kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
18689 if (kr != KERN_SUCCESS) {
18690 DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
18691 }
18692 }
18693
18694 /* free kernel buffer */
18695 kfree_data(kbuf, PAGE_SIZE);
18696
18697 /* destroy new map */
18698 vm_map_destroy(new_map);
18699 new_map = VM_MAP_NULL;
18700
18701 /* dispose of the old map entries in "copy_map" */
18702 while (vm_map_copy_first_entry(copy_map) !=
18703 vm_map_copy_to_entry(copy_map)) {
18704 entry = vm_map_copy_first_entry(copy_map);
18705 vm_map_copy_entry_unlink(copy_map, entry);
18706 if (entry->is_sub_map) {
18707 vm_map_deallocate(VME_SUBMAP(entry));
18708 } else {
18709 vm_object_deallocate(VME_OBJECT(entry));
18710 }
18711 vm_map_copy_entry_dispose(entry);
18712 }
18713
18714 /* change "copy_map"'s page_size to match "target_map" */
18715 copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18716 copy_map->offset = 0;
18717 copy_map->size = size;
18718
18719 /* insert new map entry in "copy_map" */
18720 assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
18721 vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
18722
18723 DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18724 return KERN_SUCCESS;
18725 }
18726
18727 void
18728 vm_map_copy_adjust_get_target_copy_map(
18729 vm_map_copy_t copy_map,
18730 vm_map_copy_t *target_copy_map_p);
18731 void
18732 vm_map_copy_adjust_get_target_copy_map(
18733 vm_map_copy_t copy_map,
18734 vm_map_copy_t *target_copy_map_p)
18735 {
18736 vm_map_copy_t target_copy_map;
18737 vm_map_entry_t entry, target_entry;
18738
18739 if (*target_copy_map_p != VM_MAP_COPY_NULL) {
18740 /* the caller already has a "target_copy_map": use it */
18741 return;
18742 }
18743
18744 /* the caller wants us to create a new copy of "copy_map" */
18745 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18746 target_copy_map = vm_map_copy_allocate(copy_map->type);
18747 target_copy_map->offset = copy_map->offset;
18748 target_copy_map->size = copy_map->size;
18749 target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
18750 for (entry = vm_map_copy_first_entry(copy_map);
18751 entry != vm_map_copy_to_entry(copy_map);
18752 entry = entry->vme_next) {
18753 target_entry = vm_map_copy_entry_create(target_copy_map);
18754 vm_map_entry_copy_full(target_entry, entry);
18755 if (target_entry->is_sub_map) {
18756 vm_map_reference(VME_SUBMAP(target_entry));
18757 } else {
18758 vm_object_reference(VME_OBJECT(target_entry));
18759 }
18760 vm_map_copy_entry_link(
18761 target_copy_map,
18762 vm_map_copy_last_entry(target_copy_map),
18763 target_entry);
18764 }
18765 entry = VM_MAP_ENTRY_NULL;
18766 *target_copy_map_p = target_copy_map;
18767 }
18768
18769 /*
18770 * Callers of this function must call vm_map_copy_require on
18771 * previously created vm_map_copy_t or pass a newly created
18772 * one to ensure that it hasn't been forged.
18773 */
18774 static void
18775 vm_map_copy_trim(
18776 vm_map_copy_t copy_map,
18777 uint16_t new_page_shift,
18778 vm_map_offset_t trim_start,
18779 vm_map_offset_t trim_end)
18780 {
18781 uint16_t copy_page_shift;
18782 vm_map_entry_t entry, next_entry;
18783
18784 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18785 assert(copy_map->cpy_hdr.nentries > 0);
18786
18787 trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
18788 trim_end += vm_map_copy_first_entry(copy_map)->vme_start;
18789
18790 /* use the new page_shift to do the clipping */
18791 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18792 copy_map->cpy_hdr.page_shift = new_page_shift;
18793
18794 for (entry = vm_map_copy_first_entry(copy_map);
18795 entry != vm_map_copy_to_entry(copy_map);
18796 entry = next_entry) {
18797 next_entry = entry->vme_next;
18798 if (entry->vme_end <= trim_start) {
18799 /* entry fully before trim range: skip */
18800 continue;
18801 }
18802 if (entry->vme_start >= trim_end) {
18803 /* entry fully after trim range: done */
18804 break;
18805 }
18806 /* clip entry if needed */
18807 vm_map_copy_clip_start(copy_map, entry, trim_start);
18808 vm_map_copy_clip_end(copy_map, entry, trim_end);
18809 /* dispose of entry */
18810 copy_map->size -= entry->vme_end - entry->vme_start;
18811 vm_map_copy_entry_unlink(copy_map, entry);
18812 if (entry->is_sub_map) {
18813 vm_map_deallocate(VME_SUBMAP(entry));
18814 } else {
18815 vm_object_deallocate(VME_OBJECT(entry));
18816 }
18817 vm_map_copy_entry_dispose(entry);
18818 entry = VM_MAP_ENTRY_NULL;
18819 }
18820
18821 /* restore copy_map's original page_shift */
18822 copy_map->cpy_hdr.page_shift = copy_page_shift;
18823 }
18824
18825 /*
18826 * Make any necessary adjustments to "copy_map" to allow it to be
18827 * mapped into "target_map".
18828 * If no changes were necessary, "target_copy_map" points to the
18829 * untouched "copy_map".
18830 * If changes are necessary, changes will be made to "target_copy_map".
18831 * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
18832 * copy the original "copy_map" to it before applying the changes.
18833 * The caller should discard "target_copy_map" if it's not the same as
18834 * the original "copy_map".
18835 */
18836 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
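/*
 * Ownership contract in sketch form. A caller that passes
 * *target_copy_map_p == VM_MAP_COPY_NULL gets back either the untouched
 * "copy_map" or a freshly allocated, adjusted copy that it must discard:
 *
 *	target_copy_map = VM_MAP_COPY_NULL;
 *	kr = vm_map_copy_adjust_to_target(copy_map, offset, size, target_map,
 *	    copy, &target_copy_map, &overmap_start, &overmap_end, &trimmed_start);
 *	... use target_copy_map ...
 *	if (target_copy_map != copy_map) {
 *		vm_map_copy_discard(target_copy_map);
 *	}
 *
 * The callers below (vm_map_range_physical_size(), vm_map_remap()) instead
 * pre-set target_copy_map = copy_map, allowing "copy_map" itself to be
 * adjusted in place.
 */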
18837 kern_return_t
18838 vm_map_copy_adjust_to_target(
18839 vm_map_copy_t src_copy_map,
18840 vm_map_offset_ut offset_u,
18841 vm_map_size_ut size_u,
18842 vm_map_t target_map,
18843 boolean_t copy,
18844 vm_map_copy_t *target_copy_map_p,
18845 vm_map_offset_t *overmap_start_p,
18846 vm_map_offset_t *overmap_end_p,
18847 vm_map_offset_t *trimmed_start_p)
18848 {
18849 vm_map_copy_t copy_map, target_copy_map;
18850 vm_map_size_t target_size;
18851 vm_map_size_t src_copy_map_size;
18852 vm_map_size_t overmap_start, overmap_end;
18853 int misalignments;
18854 vm_map_entry_t entry, target_entry;
18855 vm_map_offset_t addr_adjustment;
18856 vm_map_offset_t new_start, new_end;
18857 int copy_page_mask, target_page_mask;
18858 uint16_t copy_page_shift, target_page_shift;
18859 vm_map_offset_t trimmed_end;
18860 vm_map_size_t map_size;
18861 kern_return_t kr;
18862
18863 /*
18864 * Sanitize any input parameters that are addr/size/prot/inherit
18865 */
18866 kr = vm_map_copy_addr_size_sanitize(
18867 target_map,
18868 offset_u,
18869 size_u,
18870 VM_SANITIZE_CALLER_MACH_MEMORY_ENTRY_MAP_SIZE,
18871 &new_start,
18872 &new_end,
18873 &map_size);
18874 if (__improbable(kr != KERN_SUCCESS)) {
18875 return vm_sanitize_get_kr(kr);
18876 }
18877
18878 /*
18879 * Assert that the vm_map_copy is coming from the right
18880 * zone and hasn't been forged
18881 */
18882 vm_map_copy_require(src_copy_map);
18883 assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18884
18885 /*
18886 * Start working with "src_copy_map" but we'll switch
18887 * to "target_copy_map" as soon as we start making adjustments.
18888 */
18889 copy_map = src_copy_map;
18890 src_copy_map_size = src_copy_map->size;
18891
18892 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18893 copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
18894 target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18895 target_page_mask = VM_MAP_PAGE_MASK(target_map);
18896
18897 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), *target_copy_map_p);
18898
18899 target_copy_map = *target_copy_map_p;
18900 if (target_copy_map != VM_MAP_COPY_NULL) {
18901 vm_map_copy_require(target_copy_map);
18902 }
18903
18904 if (new_end > copy_map->size) {
18905 DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u));
18906 return KERN_INVALID_ARGUMENT;
18907 }
18908
18909 /* trim the end */
18910 trimmed_end = 0;
18911 new_end = VM_MAP_ROUND_PAGE(new_end, target_page_mask);
18912 if (new_end < copy_map->size) {
18913 trimmed_end = src_copy_map_size - new_end;
18914 DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
18915 /* get "target_copy_map" if needed and adjust it */
18916 vm_map_copy_adjust_get_target_copy_map(copy_map,
18917 &target_copy_map);
18918 copy_map = target_copy_map;
18919 vm_map_copy_trim(target_copy_map, target_page_shift,
18920 new_end, copy_map->size);
18921 }
18922
18923 /* trim the start */
18924 new_start = VM_MAP_TRUNC_PAGE(new_start, target_page_mask);
18925 if (new_start != 0) {
18926 DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)0, (uint64_t)new_start);
18927 /* get "target_copy_map" if needed and adjust it */
18928 vm_map_copy_adjust_get_target_copy_map(copy_map,
18929 &target_copy_map);
18930 copy_map = target_copy_map;
18931 vm_map_copy_trim(target_copy_map, target_page_shift,
18932 0, new_start);
18933 }
18934 *trimmed_start_p = new_start;
18935
18936 /* target_size starts with what's left after trimming */
18937 target_size = copy_map->size;
18938 assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
18939 "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
18940 (uint64_t)target_size, (uint64_t)src_copy_map_size,
18941 (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);
18942
18943 /* check for misalignments but don't adjust yet */
18944 misalignments = 0;
18945 overmap_start = 0;
18946 overmap_end = 0;
18947 if (copy_page_shift < target_page_shift) {
18948 /*
18949 * Remapping from 4K to 16K: check the VM object alignments
18950 * throughout the range.
18951 * If the start and end of the range are mis-aligned, we can
18952 * over-map to re-align, and adjust the "overmap" start/end
18953 * and "target_size" of the range accordingly.
18954 * If there is any mis-alignment within the range:
18955 * if "copy":
18956 * we can do immediate-copy instead of copy-on-write,
18957 * else:
18958 * no way to remap and share; fail.
18959 */
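/*
 * Numeric example (sketch): a 4K copy_map whose single entry covers
 * object offsets [0x1000, 0x7000), being mapped into a 16K target_map
 * (target_page_mask == 0x3fff), without "copy":
 *
 *	0x1000 & 0x3fff != 0  -> first entry: overmap_start
 *	                         (map from object offset 0 instead)
 *	0x7000 & 0x3fff != 0  -> last entry: overmap_end
 *	                         (map up to object offset 0x8000 instead)
 *
 * A misaligned boundary between two entries cannot be fixed by over-mapping:
 * it either forces a physical copy (if "copy") or makes the remap fail.
 */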
18960 for (entry = vm_map_copy_first_entry(copy_map);
18961 entry != vm_map_copy_to_entry(copy_map);
18962 entry = entry->vme_next) {
18963 vm_object_offset_t object_offset_start, object_offset_end;
18964
18965 object_offset_start = VME_OFFSET(entry);
18966 object_offset_end = object_offset_start;
18967 object_offset_end += entry->vme_end - entry->vme_start;
18968 if (object_offset_start & target_page_mask) {
18969 if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
18970 overmap_start++;
18971 } else {
18972 misalignments++;
18973 }
18974 }
18975 if (object_offset_end & target_page_mask) {
18976 if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
18977 overmap_end++;
18978 } else {
18979 misalignments++;
18980 }
18981 }
18982 }
18983 }
18984 entry = VM_MAP_ENTRY_NULL;
18985
18986 /* decide how to deal with misalignments */
18987 assert(overmap_start <= 1);
18988 assert(overmap_end <= 1);
18989 if (!overmap_start && !overmap_end && !misalignments) {
18990 /* copy_map is properly aligned for target_map ... */
18991 if (*trimmed_start_p) {
18992 /* ... but we trimmed it, so still need to adjust */
18993 } else {
18994 /* ... and we didn't trim anything: we're done */
18995 if (target_copy_map == VM_MAP_COPY_NULL) {
18996 target_copy_map = copy_map;
18997 }
18998 *target_copy_map_p = target_copy_map;
18999 *overmap_start_p = 0;
19000 *overmap_end_p = 0;
19001 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19002 return KERN_SUCCESS;
19003 }
19004 } else if (misalignments && !copy) {
19005 /* can't "share" if misaligned */
19006 DEBUG4K_ADJUST("unsupported sharing\n");
19007 #if MACH_ASSERT
19008 if (debug4k_panic_on_misaligned_sharing) {
19009 panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
19010 }
19011 #endif /* MACH_ASSERT */
19012 DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
19013 return KERN_NOT_SUPPORTED;
19014 } else {
19015 /* can't virtual-copy if misaligned (but can physical-copy) */
19016 DEBUG4K_ADJUST("mis-aligned copying\n");
19017 }
19018
19019 /* get a "target_copy_map" if needed and switch to it */
19020 vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
19021 copy_map = target_copy_map;
19022
19023 if (misalignments && copy) {
19024 vm_map_size_t target_copy_map_size;
19025
19026 /*
19027 * Can't do copy-on-write with misaligned mappings.
19028 * Replace the mappings with a physical copy of the original
19029 * mappings' contents.
19030 */
19031 target_copy_map_size = target_copy_map->size;
19032 kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
19033 if (kr != KERN_SUCCESS) {
19034 return kr;
19035 }
19036 *target_copy_map_p = target_copy_map;
19037 *overmap_start_p = 0;
19038 *overmap_end_p = target_copy_map->size - target_copy_map_size;
19039 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19040 return KERN_SUCCESS;
19041 }
19042
19043 /* apply the adjustments */
19044 misalignments = 0;
19045 overmap_start = 0;
19046 overmap_end = 0;
19047 /* remove copy_map->offset, so that everything starts at offset 0 */
19048 addr_adjustment = copy_map->offset;
19049 /* also remove whatever we trimmed from the start */
19050 addr_adjustment += *trimmed_start_p;
19051 for (target_entry = vm_map_copy_first_entry(target_copy_map);
19052 target_entry != vm_map_copy_to_entry(target_copy_map);
19053 target_entry = target_entry->vme_next) {
19054 vm_object_offset_t object_offset_start, object_offset_end;
19055
19056 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19057 object_offset_start = VME_OFFSET(target_entry);
19058 if (object_offset_start & target_page_mask) {
19059 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19060 if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
19061 /*
19062 * start of 1st entry is mis-aligned:
19063 * re-adjust by over-mapping.
19064 */
19065 overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
19066 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
19067 VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
19068 } else {
19069 misalignments++;
19070 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
19071 assert(copy);
19072 }
19073 }
19074
19075 if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
19076 target_size += overmap_start;
19077 } else {
19078 target_entry->vme_start += overmap_start;
19079 }
19080 target_entry->vme_end += overmap_start;
19081
19082 object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
19083 if (object_offset_end & target_page_mask) {
19084 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19085 if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
19086 /*
19087 * end of last entry is mis-aligned: re-adjust by over-mapping.
19088 */
19089 overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
19090 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
19091 target_entry->vme_end += overmap_end;
19092 target_size += overmap_end;
19093 } else {
19094 misalignments++;
19095 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
19096 assert(copy);
19097 }
19098 }
19099 target_entry->vme_start -= addr_adjustment;
19100 target_entry->vme_end -= addr_adjustment;
19101 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19102 }
19103
19104 target_copy_map->size = target_size;
19105 target_copy_map->offset += overmap_start;
19106 target_copy_map->offset -= addr_adjustment;
19107 target_copy_map->cpy_hdr.page_shift = target_page_shift;
19108
19109 // assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
19110 // assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
19111 assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
19112 assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));
19113
19114 *target_copy_map_p = target_copy_map;
19115 *overmap_start_p = overmap_start;
19116 *overmap_end_p = overmap_end;
19117
19118 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19119 return KERN_SUCCESS;
19120 }
19121
19122 kern_return_t
19123 vm_map_range_physical_size(
19124 vm_map_t map,
19125 vm_map_address_t start,
19126 mach_vm_size_t size,
19127 mach_vm_size_t * phys_size)
19128 {
19129 kern_return_t kr;
19130 vm_map_copy_t copy_map, target_copy_map;
19131 vm_map_offset_t adjusted_start, adjusted_end;
19132 vm_map_size_t adjusted_size;
19133 vm_prot_t cur_prot, max_prot;
19134 vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
19135 vm_map_kernel_flags_t vmk_flags;
19136
19137 if (size == 0) {
19138 DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
19139 *phys_size = 0;
19140 return KERN_SUCCESS;
19141 }
19142
19143 adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
19144 adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
19145 if (__improbable(os_add_overflow(start, size, &end) ||
19146 adjusted_end <= adjusted_start)) {
19147 /* wraparound */
19148 printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
19149 *phys_size = 0;
19150 return KERN_INVALID_ARGUMENT;
19151 }
19152 if (__improbable(vm_map_range_overflows(map, start, size))) {
19153 *phys_size = 0;
19154 return KERN_INVALID_ADDRESS;
19155 }
19156 assert(adjusted_end > adjusted_start);
19157 adjusted_size = adjusted_end - adjusted_start;
19158 *phys_size = adjusted_size;
19159 if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
19160 return KERN_SUCCESS;
19161 }
19162 if (start == 0) {
19163 adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
19164 adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
19165 if (__improbable(adjusted_end <= adjusted_start)) {
19166 /* wraparound */
19167 printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
19168 *phys_size = 0;
19169 return KERN_INVALID_ARGUMENT;
19170 }
19171 assert(adjusted_end > adjusted_start);
19172 adjusted_size = adjusted_end - adjusted_start;
19173 *phys_size = adjusted_size;
19174 return KERN_SUCCESS;
19175 }
19176
19177 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
19178 vmk_flags.vmkf_copy_pageable = TRUE;
19179 vmk_flags.vmkf_copy_same_map = TRUE;
19180 assert(adjusted_size != 0);
19181 cur_prot = VM_PROT_NONE; /* legacy mode */
19182 max_prot = VM_PROT_NONE; /* legacy mode */
19183 vmk_flags.vmkf_remap_legacy_mode = true;
19184 kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
19185 FALSE /* copy */,
19186 &copy_map,
19187 &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
19188 vmk_flags);
19189 if (kr != KERN_SUCCESS) {
19190 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19191 //assert(0);
19192 *phys_size = 0;
19193 return kr;
19194 }
19195 assert(copy_map != VM_MAP_COPY_NULL);
19196 target_copy_map = copy_map;
19197 DEBUG4K_ADJUST("adjusting...\n");
19198 kr = vm_map_copy_adjust_to_target(
19199 copy_map,
19200 start - adjusted_start, /* offset */
19201 size, /* size */
19202 kernel_map,
19203 FALSE, /* copy */
19204 &target_copy_map,
19205 &overmap_start,
19206 &overmap_end,
19207 &trimmed_start);
19208 if (kr == KERN_SUCCESS) {
19209 if (target_copy_map->size != *phys_size) {
19210 DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
19211 }
19212 *phys_size = target_copy_map->size;
19213 } else {
19214 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19215 //assert(0);
19216 *phys_size = 0;
19217 }
19218 vm_map_copy_discard(copy_map);
19219 copy_map = VM_MAP_COPY_NULL;
19220
19221 return kr;
19222 }
19223
19224 static __attribute__((always_inline, warn_unused_result))
19225 kern_return_t
19226 vm_map_remap_sanitize(
19227 vm_map_t src_map,
19228 vm_map_t target_map,
19229 vm_map_address_ut address_u,
19230 vm_map_size_ut size_u,
19231 vm_map_offset_ut mask_u,
19232 vm_map_offset_ut memory_address_u,
19233 vm_prot_ut cur_protection_u,
19234 vm_prot_ut max_protection_u,
19235 vm_inherit_ut inheritance_u,
19236 vm_map_kernel_flags_t vmk_flags,
19237 vm_map_address_t *target_addr,
19238 vm_map_address_t *mask,
19239 vm_map_offset_t *memory_address,
19240 vm_map_offset_t *memory_end,
19241 vm_map_size_t *memory_size,
19242 vm_prot_t *cur_protection,
19243 vm_prot_t *max_protection,
19244 vm_inherit_t *inheritance)
19245 {
19246 kern_return_t result;
19247 vm_sanitize_flags_t vm_sanitize_flags;
19248
19249 result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_VM_MAP_REMAP,
19250 inheritance);
19251 if (__improbable(result != KERN_SUCCESS)) {
19252 return result;
19253 }
19254
19255 result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
19256 VM_SANITIZE_CALLER_VM_MAP_REMAP, target_map,
19257 cur_protection, max_protection);
19258 if (__improbable(result != KERN_SUCCESS)) {
19259 return result;
19260 }
19261
19262 result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_VM_MAP_REMAP, mask);
19263 if (__improbable(result != KERN_SUCCESS)) {
19264 return result;
19265 }
19266
19267 /*
19268 * If the user is requesting that we return the address of the
19269 * first byte of the data (rather than the base of the page),
19270 * then we use different rounding semantics: specifically,
19271 * we assume that (memory_address, size) describes a region
19272 * all of whose pages we must cover, rather than a base to be truncated
19273 * down and a size to be added to that base. So we figure out
19274 * the highest page that the requested region includes and make
19275 * sure that the size will cover it.
19276 *
19277 * The key example we're worried about is of the form:
19278 *
19279 * memory_address = 0x1ff0, size = 0x20
19280 *
19281 * With the old semantics, we round down the memory_address to 0x1000
19282 * and round up the size to 0x1000, resulting in our covering *only*
19283 * page 0x1000. With the new semantics, we'd realize that the region covers
19284 * 0x1ff0-0x2010, and compute a size of 0x2000. Thus, we cover both page
19285 * 0x1000 and page 0x2000 in the region we remap.
19286 *
19287 * VM_SANITIZE_FLAGS_REALIGN_START asks for the old (broken) semantics.
19288 */
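/*
 * The same example in numbers (4 KB pages, a sketch):
 *
 *	memory_address = 0x1ff0, size = 0x20	// data spans [0x1ff0, 0x2010)
 *	realigned/legacy: start truncates to 0x1000, size rounds to 0x1000
 *		-> covers [0x1000, 0x2000) only; page 0x2000 is missed
 *	data-address semantics: end rounds up to 0x3000, size = 0x2000
 *		-> covers [0x1000, 0x3000); both pages included
 */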
19289 vm_sanitize_flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS;
19290 if (!vmk_flags.vmf_return_data_addr) {
19291 vm_sanitize_flags |= VM_SANITIZE_FLAGS_REALIGN_START;
19292 }
19293
19294 result = vm_sanitize_addr_size(memory_address_u, size_u,
19295 VM_SANITIZE_CALLER_VM_MAP_REMAP, src_map,
19296 vm_sanitize_flags, memory_address, memory_end,
19297 memory_size);
19298 if (__improbable(result != KERN_SUCCESS)) {
19299 return result;
19300 }
19301
19302 *target_addr = vm_sanitize_addr(target_map, address_u);
19303 return KERN_SUCCESS;
19304 }
19305
19306 /*
19307 * Routine: vm_remap
19308 *
19309 * Map portion of a task's address space.
19310 * Mapped region must not overlap more than
19311 * one vm memory object. Protections and
19312 * inheritance attributes remain the same
19313 * as in the original task and are out parameters.
19314 * Source and target tasks can be identical.
19315 * Other attributes are the same as for vm_map().
19316 */
19317 kern_return_t
19318 vm_map_remap(
19319 vm_map_t target_map,
19320 vm_map_address_ut *address_u,
19321 vm_map_size_ut size_u,
19322 vm_map_offset_ut mask_u,
19323 vm_map_kernel_flags_t vmk_flags,
19324 vm_map_t src_map,
19325 vm_map_offset_ut memory_address_u,
19326 boolean_t copy,
19327 vm_prot_ut *cur_protection_u, /* IN/OUT */
19328 vm_prot_ut *max_protection_u, /* IN/OUT */
19329 vm_inherit_ut inheritance_u)
19330 {
19331 vm_map_address_t target_addr, mask;
19332 vm_map_size_t target_size;
19333 vm_map_offset_t memory_address, memory_end;
19334 vm_map_size_t memory_size;
19335 vm_prot_t cur_protection, max_protection;
19336 vm_inherit_t inheritance;
19337 kern_return_t result;
19338 vm_map_entry_t insp_entry = VM_MAP_ENTRY_NULL;
19339 vm_map_copy_t copy_map;
19340 vm_map_offset_t offset_in_mapping;
19341 vm_map_size_t src_page_mask, target_page_mask;
19342 vm_map_size_t initial_size;
19343 VM_MAP_ZAP_DECLARE(zap_list);
19344
19345 if (target_map == VM_MAP_NULL || src_map == VM_MAP_NULL) {
19346 return KERN_INVALID_ARGUMENT;
19347 }
19348 src_page_mask = VM_MAP_PAGE_MASK(src_map);
19349 target_page_mask = VM_MAP_PAGE_MASK(target_map);
19350
19351 if (src_page_mask != target_page_mask) {
19352 if (copy) {
19353 DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19354 } else {
19355 DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19356 }
19357 }
19358
19359 /*
19360 * Sanitize any input parameters that are addr/size/prot/inherit
19361 */
19362 result = vm_map_remap_sanitize(src_map,
19363 target_map,
19364 *address_u,
19365 size_u,
19366 mask_u,
19367 memory_address_u,
19368 *cur_protection_u,
19369 *max_protection_u,
19370 inheritance_u,
19371 vmk_flags,
19372 &target_addr,
19373 &mask,
19374 &memory_address,
19375 &memory_end,
19376 &memory_size,
19377 &cur_protection,
19378 &max_protection,
19379 &inheritance);
19380 if (__improbable(result != KERN_SUCCESS)) {
19381 return vm_sanitize_get_kr(result);
19382 }
19383
19384 if (vmk_flags.vmf_return_data_addr) {
19385 /*
19386 * This is safe to unwrap now that the quantities
19387 * have been validated and rounded up normally.
19388 */
19389 offset_in_mapping = vm_sanitize_offset_in_page(src_map,
19390 memory_address_u);
19391 initial_size = VM_SANITIZE_UNSAFE_UNWRAP(size_u);
19392 } else {
19393 /*
19394 * IMPORTANT:
19395 * This legacy code path is broken: for the range mentioned
19396 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
19397 * two 4k pages, it yields [ memory_address = 0x1000,
19398 * size = 0x1000 ], which covers only the first 4k page.
19399 * BUT some code unfortunately depends on this bug, so we
19400 * can't fix it without breaking something.
19401 * New code should get automatically opted into the new
19402 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flag.
19403 */
19404 offset_in_mapping = 0;
19405 initial_size = memory_size;
19406 }
19407
19408 if (vmk_flags.vmf_resilient_media) {
19409 /* must be copy-on-write to be "media resilient" */
19410 if (!copy) {
19411 return KERN_INVALID_ARGUMENT;
19412 }
19413 }
19414
19415 vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
19416 vmk_flags.vmkf_copy_same_map = (src_map == target_map);
19417
19418 assert(memory_size != 0);
19419 result = vm_map_copy_extract(src_map,
19420 memory_address,
19421 memory_size,
19422 copy, &copy_map,
19423 &cur_protection, /* IN/OUT */
19424 &max_protection, /* IN/OUT */
19425 inheritance,
19426 vmk_flags);
19427 if (result != KERN_SUCCESS) {
19428 return result;
19429 }
19430 assert(copy_map != VM_MAP_COPY_NULL);
19431
19432 /*
19433 * Handle the policy for vm map ranges
19434 *
19435 * If the maps differ, the target_map policy applies like for vm_map()
19436 * For same mapping remaps, we preserve the range.
19437 */
19438 if (vmk_flags.vmkf_copy_same_map) {
19439 vmk_flags.vmkf_range_id = copy_map->orig_range;
19440 } else {
19441 vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, memory_size);
19442 }
19443
19444 target_size = memory_size;
19445 if (src_page_mask != target_page_mask) {
19446 vm_map_copy_t target_copy_map;
19447 vm_map_offset_t overmap_start = 0;
19448 vm_map_offset_t overmap_end = 0;
19449 vm_map_offset_t trimmed_start = 0;
19450
19451 target_copy_map = copy_map; /* can modify "copy_map" itself */
19452 DEBUG4K_ADJUST("adjusting...\n");
19453 result = vm_map_copy_adjust_to_target(
19454 copy_map,
19455 offset_in_mapping, /* offset */
19456 initial_size,
19457 target_map,
19458 copy,
19459 &target_copy_map,
19460 &overmap_start,
19461 &overmap_end,
19462 &trimmed_start);
19463 if (result != KERN_SUCCESS) {
19464 DEBUG4K_COPY("failed to adjust 0x%x\n", result);
19465 vm_map_copy_discard(copy_map);
19466 return result;
19467 }
19468 if (trimmed_start == 0) {
19469 /* nothing trimmed: no adjustment needed */
19470 } else if (trimmed_start >= offset_in_mapping) {
19471 /* trimmed more than offset_in_mapping: nothing left */
19472 assert(overmap_start == 0);
19473 assert(overmap_end == 0);
19474 offset_in_mapping = 0;
19475 } else {
19476 /* trimmed some of offset_in_mapping: adjust */
19477 assert(overmap_start == 0);
19478 assert(overmap_end == 0);
19479 offset_in_mapping -= trimmed_start;
19480 }
19481 offset_in_mapping += overmap_start;
19482 target_size = target_copy_map->size;
19483 }
19484
19485 /*
19486 * Allocate/check a range of free virtual address
19487 * space for the target
19488 */
19489 target_size = vm_map_round_page(target_size, target_page_mask);
19490
19491 if (target_size == 0) {
19492 vm_map_copy_discard(copy_map);
19493 return KERN_INVALID_ARGUMENT;
19494 }
19495
19496 if (__improbable(!vm_map_is_map_size_valid(
19497 target_map, target_size, vmk_flags.vmkf_no_soft_limit))) {
19498 vm_map_copy_discard(copy_map);
19499 return KERN_NO_SPACE;
19500 }
19501
19502 vm_map_lock(target_map);
19503
19504 if (!vmk_flags.vmf_fixed) {
19505 result = vm_map_locate_space_anywhere(target_map, target_size,
19506 mask, vmk_flags, &target_addr, &insp_entry);
19507 } else {
19508 /*
19509 * vm_map_locate_space_fixed will reject overflowing
19510 * target_addr + target_size values
19511 */
19512 result = vm_map_locate_space_fixed(target_map, target_addr,
19513 target_size, mask, vmk_flags, &insp_entry, &zap_list);
19514
19515 if (result == KERN_MEMORY_PRESENT) {
19516 assert(!vmk_flags.vmkf_already);
19517 insp_entry = VM_MAP_ENTRY_NULL;
19518 result = KERN_NO_SPACE;
19519 }
19520 }
19521
19522 if (result == KERN_SUCCESS) {
19523 while (vm_map_copy_first_entry(copy_map) !=
19524 vm_map_copy_to_entry(copy_map)) {
19525 vm_map_entry_t entry = vm_map_copy_first_entry(copy_map);
19526
19527 vm_map_copy_entry_unlink(copy_map, entry);
19528
19529 if (vmk_flags.vmkf_remap_prot_copy) {
19530 /*
19531 * This vm_map_remap() is for a
19532 * vm_protect(VM_PROT_COPY), so the caller
19533 * expects to be allowed to add write access
19534 * to this new mapping. This is done by
19535 * adding VM_PROT_WRITE to each entry's
19536 * max_protection... unless some security
19537 * settings disallow it.
19538 */
19539 bool allow_write = false;
19540 if (entry->vme_permanent) {
19541 /* immutable mapping... */
19542 if ((entry->max_protection & VM_PROT_EXECUTE) &&
19543 developer_mode_state()) {
19544 /*
19545 * ... but executable and
19546 * possibly being debugged,
19547 * so let's allow it to become
19548 * writable, for breakpoints
19549 * and dtrace probes, for
19550 * example.
19551 */
19552 allow_write = true;
19553 } else {
19554 printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
19555 proc_selfpid(),
19556 (get_bsdtask_info(current_task())
19557 ? proc_name_address(get_bsdtask_info(current_task()))
19558 : "?"),
19559 (uint64_t)memory_address,
19560 (uint64_t)memory_size,
19561 entry->protection,
19562 entry->max_protection,
19563 developer_mode_state());
19564 DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
19565 vm_map_entry_t, entry,
19566 vm_map_offset_t, entry->vme_start,
19567 vm_map_offset_t, entry->vme_end,
19568 vm_prot_t, entry->protection,
19569 vm_prot_t, entry->max_protection,
19570 int, VME_ALIAS(entry));
19571 }
19572 } else {
19573 allow_write = true;
19574 }
19575
19576 /*
19577 * VM_PROT_COPY: allow this mapping to become
19578 * writable, unless it was "permanent".
19579 */
19580 if (allow_write) {
19581 entry->max_protection |= VM_PROT_WRITE;
19582 }
19583 }
19584 if (vmk_flags.vmf_resilient_codesign) {
19585 /* no codesigning -> read-only access */
19586 entry->max_protection = VM_PROT_READ;
19587 entry->protection = VM_PROT_READ;
19588 entry->vme_resilient_codesign = TRUE;
19589 }
19590 entry->vme_start += target_addr;
19591 entry->vme_end += target_addr;
19592 assert(!entry->map_aligned);
19593 if (vmk_flags.vmf_resilient_media &&
19594 !entry->is_sub_map &&
19595 (VME_OBJECT(entry) == VM_OBJECT_NULL ||
19596 VME_OBJECT(entry)->internal)) {
19597 entry->vme_resilient_media = TRUE;
19598 }
19599 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
19600 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
19601 assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
19602 vm_map_store_entry_link(target_map, insp_entry, entry,
19603 vmk_flags);
19604 insp_entry = entry;
19605 }
19606 }
19607
19608 if (vmk_flags.vmf_resilient_codesign) {
19609 cur_protection = VM_PROT_READ;
19610 max_protection = VM_PROT_READ;
19611 }
19612
19613 if (result == KERN_SUCCESS) {
19614 target_map->size += target_size;
19615 SAVE_HINT_MAP_WRITE(target_map, insp_entry);
19616 }
19617 vm_map_unlock(target_map);
19618
19619 vm_map_zap_dispose(&zap_list);
19620
19621 if (result == KERN_SUCCESS && target_map->wiring_required) {
19622 result = vm_map_wire_nested(target_map, target_addr,
19623 target_addr + target_size, cur_protection, VM_KERN_MEMORY_MLOCK,
19624 TRUE, PMAP_NULL, 0, NULL);
19625 }
19626
19627 if (result == KERN_SUCCESS) {
19628 #if KASAN
19629 if (target_map->pmap == kernel_pmap) {
19630 kasan_notify_address(target_addr, target_size);
19631 }
19632 #endif
19633 /*
19634 * If requested, return the address of the data pointed to by the
19635 * request, rather than the base of the resulting page.
19636 */
19637 if (vmk_flags.vmf_return_data_addr) {
19638 target_addr += offset_in_mapping;
19639 }
19640
19641 /*
19642 * Update OUT parameters.
19643 */
19644 *address_u = vm_sanitize_wrap_addr(target_addr);
19645
19646 *cur_protection_u = vm_sanitize_wrap_prot(cur_protection);
19647 *max_protection_u = vm_sanitize_wrap_prot(max_protection);
19648 }
19649
19650 if (src_page_mask != target_page_mask) {
19651 DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)target_size, copy, target_map, (uint64_t)target_addr, (uint64_t)offset_in_mapping, result);
19652 }
19653 vm_map_copy_discard(copy_map);
19654 copy_map = VM_MAP_COPY_NULL;
19655
19656 return result;
19657 }
19658
19659 /*
19660 * vm_map_switch_to:
19661 *
19662 * Set the address map for the current thread to the specified map.
19663 * Returns a struct containing info about the previous map, which should be
19664 * restored with `vm_map_switch_back`
19665 */
19666
19667 vm_map_switch_context_t
19668 vm_map_switch_to(vm_map_t map)
19669 {
19670 thread_t thread = current_thread();
19671 vm_map_t oldmap = thread->map;
19672
19673 /*
19674 * Deactivate the current map and activate the requested map
19675 */
19676 mp_disable_preemption();
19677 PMAP_SWITCH_USER(thread, map, cpu_number());
19678 mp_enable_preemption();
19679
19680 vm_map_lock(map);
19681 task_t task = map->owning_task;
19682 if (task) {
19683 task_reference(task);
19684 }
19685 vm_map_unlock(map);
19686
19687 return (vm_map_switch_context_t) { oldmap, task };
19688 }
19689
19690 void
19691 vm_map_switch_back(vm_map_switch_context_t ctx)
19692 {
19693 thread_t thread = current_thread();
19694 task_t task = ctx.task;
19695 vm_map_t map = ctx.map;
19696
19697 if (task) {
19698 task_deallocate(task);
19699 } else {
19700 /*
19701 * We want to make sure that vm_map_setup was not called while the
19702 * map was switched. This allows us to guarantee the property that
19703 * we always have a reference on current_map()->owning_task if it is
19704 * not NULL.
19705 */
19706 assert(!thread->map->owning_task);
19707 }
19708
19709 /*
19710 * Restore the original map from prior to vm_map_switch_to
19711 */
19712 mp_disable_preemption();
19713 PMAP_SWITCH_USER(thread, map, cpu_number());
19714 mp_enable_preemption();
19715 }
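
/*
 * Typical pairing of the two routines above (a sketch; vm_map_write_user()
 * and vm_map_read_user() below follow this pattern):
 *
 *	vm_map_reference(other_map);
 *	vm_map_switch_context_t ctx = vm_map_switch_to(other_map);
 *	... copyin() / copyout() now operate against other_map ...
 *	vm_map_switch_back(ctx);
 *	vm_map_deallocate(other_map);
 */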
19716
19717 static __attribute__((always_inline, warn_unused_result))
19718 kern_return_t
19719 vm_map_rw_user_sanitize(
19720 vm_map_t map,
19721 vm_map_address_ut addr_u,
19722 vm_size_ut size_u,
19723 vm_sanitize_caller_t vm_sanitize_caller,
19724 vm_map_address_t *addr,
19725 vm_map_address_t *end,
19726 vm_map_size_t *size)
19727 {
19728 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
19729 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES |
19730 VM_SANITIZE_FLAGS_CHECK_ADDR_RANGE;
19731
19732 return vm_sanitize_addr_size(addr_u, size_u,
19733 vm_sanitize_caller, map,
19734 flags,
19735 addr, end, size);
19736 }
19737
19738 /*
19739 * Routine: vm_map_write_user
19740 *
19741 * Description:
19742 * Copy out data from a kernel space into space in the
19743 * destination map. The space must already exist in the
19744 * destination map.
19745 * NOTE: This routine should only be called by threads
19746 * which can block on a page fault, i.e. kernel-mode user
19747 * threads.
19748 *
19749 */
19750 kern_return_t
19751 vm_map_write_user(
19752 vm_map_t map,
19753 void *src_p,
19754 vm_map_address_ut dst_addr_u,
19755 vm_size_ut size_u)
19756 {
19757 kern_return_t kr;
19758 vm_map_address_t dst_addr, dst_end;
19759 vm_map_size_t size;
19760
19761 /*
19762 * src_p isn't validated: [src_p, src_p + size_u)
19763 * is trusted kernel input.
19764 *
19765 * dst_addr_u and size_u are untrusted and need to be sanitized.
19766 */
19767 kr = vm_map_rw_user_sanitize(map,
19768 dst_addr_u,
19769 size_u,
19770 VM_SANITIZE_CALLER_VM_MAP_WRITE_USER,
19771 &dst_addr,
19772 &dst_end,
19773 &size);
19774 if (__improbable(kr != KERN_SUCCESS)) {
19775 return vm_sanitize_get_kr(kr);
19776 }
19777
19778 if (current_map() == map) {
19779 if (copyout(src_p, dst_addr, size)) {
19780 kr = KERN_INVALID_ADDRESS;
19781 }
19782 } else {
19783 vm_map_switch_context_t switch_ctx;
19784
19785 /* take on the identity of the target map while doing */
19786 /* the transfer */
19787
19788 vm_map_reference(map);
19789 switch_ctx = vm_map_switch_to(map);
19790 if (copyout(src_p, dst_addr, size)) {
19791 kr = KERN_INVALID_ADDRESS;
19792 }
19793 vm_map_switch_back(switch_ctx);
19794 vm_map_deallocate(map);
19795 }
19796 return kr;
19797 }
19798
19799 /*
19800 * Routine: vm_map_read_user
19801 *
19802 * Description:
19803 * Copy in data from a user space source map into the
19804 * kernel map. The space must already exist in the
19805 * kernel map.
19806 * NOTE: This routine should only be called by threads
19807 * which can block on a page fault, i.e. kernel-mode user
19808 * threads.
19809 *
19810 */
19811 kern_return_t
19812 vm_map_read_user(
19813 vm_map_t map,
19814 vm_map_address_ut src_addr_u,
19815 void *dst_p,
19816 vm_size_ut size_u)
19817 {
19818 kern_return_t kr;
19819 vm_map_address_t src_addr, src_end;
19820 vm_map_size_t size;
19821
19822 /*
19823 * dst_p isn't validated: [dst_p, dst_p + size_u)
19824 * is trusted kernel input.
19825 *
19826 * src_addr_u and size_u are untrusted and need to be sanitized.
19827 */
19828 kr = vm_map_rw_user_sanitize(map,
19829 src_addr_u,
19830 size_u,
19831 VM_SANITIZE_CALLER_VM_MAP_READ_USER,
19832 &src_addr,
19833 &src_end,
19834 &size);
19835 if (__improbable(kr != KERN_SUCCESS)) {
19836 return vm_sanitize_get_kr(kr);
19837 }
19838
19839 if (current_map() == map) {
19840 if (copyin(src_addr, dst_p, size)) {
19841 kr = KERN_INVALID_ADDRESS;
19842 }
19843 } else {
19844 vm_map_switch_context_t switch_ctx;
19845
19846 /* take on the identity of the target map while doing */
19847 /* the transfer */
19848
19849 vm_map_reference(map);
19850 switch_ctx = vm_map_switch_to(map);
19851 if (copyin(src_addr, dst_p, size)) {
19852 kr = KERN_INVALID_ADDRESS;
19853 }
19854 vm_map_switch_back(switch_ctx);
19855 vm_map_deallocate(map);
19856 }
19857 return kr;
19858 }
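/*
 * Editorial sketch (not part of the original source): the mirror-image use
 * of vm_map_read_user() — pulling data out of a target task's map into a
 * caller-supplied kernel buffer. Names are hypothetical; the source address
 * and length are assumed to arrive as untrusted values.
 */
#if 0 /* illustrative only */
static kern_return_t
example_fetch_from_user(
	vm_map_t                target_map,
	vm_map_address_ut       src_addr_u,     /* untrusted user address */
	vm_size_ut              len_u,          /* untrusted length */
	void                    *kernel_dst)    /* trusted kernel buffer */
{
	/*
	 * As with vm_map_write_user(), sanitization and the temporary map
	 * switch happen inside the routine; the caller only has to ensure
	 * kernel_dst is large enough for the requested length.
	 */
	return vm_map_read_user(target_map, src_addr_u, kernel_dst, len_u);
}
#endif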
19859
19860
19861 static __attribute__((always_inline, warn_unused_result))
19862 kern_return_t
19863 vm_map_check_protection_sanitize(
19864 vm_map_t map,
19865 vm_map_offset_ut start_u,
19866 vm_map_offset_ut end_u,
19867 vm_prot_ut protection_u,
19868 vm_sanitize_caller_t vm_sanitize_caller,
19869 vm_map_offset_t *start,
19870 vm_map_offset_t *end,
19871 vm_prot_t *protection)
19872 {
19873 kern_return_t kr;
19874 vm_map_size_t size;
19875
19876 kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
19877 VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start, end,
19878 &size);
19879 if (__improbable(kr != KERN_SUCCESS)) {
19880 return kr;
19881 }
19882
19883 /*
19884 * Since the protection is only used for comparisons below,
19885 * no sanitization is applied to it.
19886 */
19887 *protection = VM_SANITIZE_UNSAFE_UNWRAP(protection_u);
19888
19889 return KERN_SUCCESS;
19890 }
19891
19892 /*
19893 * vm_map_check_protection:
19894 *
19895 * Assert that the target map allows the specified
19896 * privilege on the entire address region given.
19897 * The entire region must be allocated.
19898 */
19899 boolean_t
19900 vm_map_check_protection(
19901 vm_map_t map,
19902 vm_map_offset_ut start_u,
19903 vm_map_offset_ut end_u,
19904 vm_prot_ut protection_u,
19905 vm_sanitize_caller_t vm_sanitize_caller)
19906 {
19907 vm_map_entry_t entry;
19908 vm_map_entry_t tmp_entry;
19909 vm_map_offset_t start;
19910 vm_map_offset_t end;
19911 vm_prot_t protection;
19912 kern_return_t kr;
19913
19914 kr = vm_map_check_protection_sanitize(map,
19915 start_u,
19916 end_u,
19917 protection_u,
19918 vm_sanitize_caller,
19919 &start,
19920 &end,
19921 &protection);
19922 if (__improbable(kr != KERN_SUCCESS)) {
19923 kr = vm_sanitize_get_kr(kr);
19924 if (kr == KERN_SUCCESS) {
19925 return true;
19926 }
19927 return false;
19928 }
19929
19930 vm_map_lock(map);
19931
19932 if (start < vm_map_min(map) || end > vm_map_max(map)) {
19933 vm_map_unlock(map);
19934 return false;
19935 }
19936
19937 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19938 vm_map_unlock(map);
19939 return false;
19940 }
19941
19942 entry = tmp_entry;
19943
19944 while (start < end) {
19945 if (entry == vm_map_to_entry(map)) {
19946 vm_map_unlock(map);
19947 return false;
19948 }
19949
19950 /*
19951 * No holes allowed!
19952 */
19953
19954 if (start < entry->vme_start) {
19955 vm_map_unlock(map);
19956 return false;
19957 }
19958
19959 /*
19960 * Check protection associated with entry.
19961 */
19962
19963 if ((entry->protection & protection) != protection) {
19964 vm_map_unlock(map);
19965 return false;
19966 }
19967
19968 /* go to next entry */
19969
19970 start = entry->vme_end;
19971 entry = entry->vme_next;
19972 }
19973 vm_map_unlock(map);
19974 return true;
19975 }
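/*
 * Editorial sketch (not part of the original source): vm_map_check_protection()
 * answers "does every byte of [start, end) currently allow at least these
 * protections?". A hypothetical caller validating a user-supplied range
 * before acting on it might simply forward its own untrusted parameters and
 * its own sanitize caller token, as below.
 */
#if 0 /* illustrative only */
static boolean_t
example_range_allows(
	vm_map_t                map,
	vm_map_offset_ut        start_u,        /* untrusted range start */
	vm_map_offset_ut        end_u,          /* untrusted range end */
	vm_prot_ut              prot_u,         /* required protections, wrapped */
	vm_sanitize_caller_t    caller)         /* the caller's sanitize token */
{
	/*
	 * Returns false on any hole in the range, on any entry missing the
	 * requested protection, or if the range falls outside the map bounds.
	 */
	return vm_map_check_protection(map, start_u, end_u, prot_u, caller);
}
#endif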
19976
19977 kern_return_t
19978 vm_map_purgable_control(
19979 vm_map_t map,
19980 vm_map_offset_ut address_u,
19981 vm_purgable_t control,
19982 int *state)
19983 {
19984 vm_map_offset_t address;
19985 vm_map_entry_t entry;
19986 vm_object_t object;
19987 kern_return_t kr;
19988 boolean_t was_nonvolatile;
19989
19990 /*
19991 * Vet all the input parameters and current type and state of the
19992 * underlying object. Return with an error if anything is amiss.
19993 */
19994 if (map == VM_MAP_NULL) {
19995 return KERN_INVALID_ARGUMENT;
19996 }
19997
19998 if (control != VM_PURGABLE_SET_STATE &&
19999 control != VM_PURGABLE_GET_STATE &&
20000 control != VM_PURGABLE_PURGE_ALL &&
20001 control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
20002 return KERN_INVALID_ARGUMENT;
20003 }
20004
20005 if (control == VM_PURGABLE_PURGE_ALL) {
20006 vm_purgeable_object_purge_all();
20007 return KERN_SUCCESS;
20008 }
20009
20010 if ((control == VM_PURGABLE_SET_STATE ||
20011 control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
20012 (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
20013 ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
20014 return KERN_INVALID_ARGUMENT;
20015 }
20016
20017 address = vm_sanitize_addr(map, address_u);
20018
20019 vm_map_lock_read(map);
20020
20021 if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
20022 /*
20023 * Must pass a valid non-submap address.
20024 */
20025 vm_map_unlock_read(map);
20026 return KERN_INVALID_ADDRESS;
20027 }
20028
20029 if ((entry->protection & VM_PROT_WRITE) == 0 &&
20030 control != VM_PURGABLE_GET_STATE) {
20031 /*
20032 * Can't apply purgable controls to something you can't write.
20033 */
20034 vm_map_unlock_read(map);
20035 return KERN_PROTECTION_FAILURE;
20036 }
20037
20038 object = VME_OBJECT(entry);
20039 if (object == VM_OBJECT_NULL ||
20040 object->purgable == VM_PURGABLE_DENY) {
20041 /*
20042 * Object must already be present and be purgeable.
20043 */
20044 vm_map_unlock_read(map);
20045 return KERN_INVALID_ARGUMENT;
20046 }
20047
20048 vm_object_lock(object);
20049
20050 #if 00
20051 if (VME_OFFSET(entry) != 0 ||
20052 entry->vme_end - entry->vme_start != object->vo_size) {
20053 /*
20054 * Can only apply purgable controls to the whole (existing)
20055 * object at once.
20056 */
20057 vm_map_unlock_read(map);
20058 vm_object_unlock(object);
20059 return KERN_INVALID_ARGUMENT;
20060 }
20061 #endif
20062
20063 assert(!entry->is_sub_map);
20064 assert(!entry->use_pmap); /* purgeable has its own accounting */
20065
20066 vm_map_unlock_read(map);
20067
20068 was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);
20069
20070 kr = vm_object_purgable_control(object, control, state);
20071
20072 if (was_nonvolatile &&
20073 object->purgable != VM_PURGABLE_NONVOLATILE &&
20074 map->pmap == kernel_pmap) {
20075 #if DEBUG
20076 object->vo_purgeable_volatilizer = kernel_task;
20077 #endif /* DEBUG */
20078 }
20079
20080 vm_object_unlock(object);
20081
20082 return kr;
20083 }
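/*
 * Editorial sketch (not part of the original source): querying the purgeable
 * state of whatever object backs a given (untrusted) address.
 * VM_PURGABLE_GET_STATE does not require write permission on the mapping,
 * unlike the SET_STATE variants. The helper name is hypothetical.
 */
#if 0 /* illustrative only */
static kern_return_t
example_query_purgeable_state(
	vm_map_t                map,
	vm_map_offset_ut        addr_u,         /* untrusted address inside the region */
	int                     *state_out)
{
	/* On success, *state_out holds one of the VM_PURGABLE_* state values. */
	return vm_map_purgable_control(map, addr_u, VM_PURGABLE_GET_STATE, state_out);
}
#endif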
20084
20085 void
20086 vm_map_footprint_query_page_info(
20087 vm_map_t map,
20088 vm_map_entry_t map_entry,
20089 vm_map_offset_t curr_s_offset,
20090 int *disposition_p)
20091 {
20092 int pmap_disp;
20093 vm_object_t object = VM_OBJECT_NULL;
20094 int disposition;
20095 int effective_page_size;
20096
20097 vm_map_lock_assert_held(map);
20098 assert(!map->has_corpse_footprint);
20099 assert(curr_s_offset >= map_entry->vme_start);
20100 assert(curr_s_offset < map_entry->vme_end);
20101
20102 if (map_entry->is_sub_map) {
20103 if (!map_entry->use_pmap) {
20104 /* nested pmap: no footprint */
20105 *disposition_p = 0;
20106 return;
20107 }
20108 } else {
20109 object = VME_OBJECT(map_entry);
20110 if (object == VM_OBJECT_NULL) {
20111 /* nothing mapped here: no need to ask */
20112 *disposition_p = 0;
20113 return;
20114 }
20115 }
20116
20117 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
20118
20119 pmap_disp = 0;
20120
20121 /*
20122 * Query the pmap.
20123 */
20124 pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);
20125
20126 /*
20127 * Compute this page's disposition.
20128 */
20129 disposition = 0;
20130
20131 /* deal with "alternate accounting" first */
20132 if (!map_entry->is_sub_map &&
20133 object->vo_no_footprint) {
20134 /* does not count in footprint */
20135 // assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20136 } else if (!map_entry->is_sub_map &&
20137 !object->internal &&
20138 object->vo_ledger_tag &&
20139 VM_OBJECT_OWNER(object) != NULL &&
20140 VM_OBJECT_OWNER(object)->map == map) {
20141 /* owned external object: wired pages count in footprint */
20142 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20143 if ((((curr_s_offset
20144 - map_entry->vme_start
20145 + VME_OFFSET(map_entry))
20146 / effective_page_size) <
20147 object->wired_page_count)) {
20148 /*
20149 * External object owned by this task: report the first
20150 * "#wired" pages as "resident" (to show that they
20151 * contribute to the footprint) but not "dirty"
20152 * (to avoid double-counting with the fake "owned"
20153 * region we'll report at the end of the address space
20154 * to account for all (mapped or not) memory
20155 * owned by this task).
20156 */
20157 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20158 }
20159 } else if (!map_entry->is_sub_map &&
20160 object->internal &&
20161 (object->purgable == VM_PURGABLE_NONVOLATILE ||
20162 (object->purgable == VM_PURGABLE_DENY &&
20163 object->vo_ledger_tag)) &&
20164 VM_OBJECT_OWNER(object) != NULL &&
20165 VM_OBJECT_OWNER(object)->map == map) {
20166 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20167 if ((((curr_s_offset
20168 - map_entry->vme_start
20169 + VME_OFFSET(map_entry))
20170 / effective_page_size) <
20171 (object->resident_page_count +
20172 vm_compressor_pager_get_count(object->pager)))) {
20173 /*
20174 * Non-volatile purgeable object owned
20175 * by this task: report the first
20176 * "#resident + #compressed" pages as
20177 * "resident" (to show that they
20178 * contribute to the footprint) but not
20179 * "dirty" (to avoid double-counting
20180 * with the fake "non-volatile" region
20181 * we'll report at the end of the
20182 * address space to account for all
20183 * (mapped or not) non-volatile memory
20184 * owned by this task).
20185 */
20186 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20187 }
20188 } else if (!map_entry->is_sub_map &&
20189 object->internal &&
20190 (object->purgable == VM_PURGABLE_VOLATILE ||
20191 object->purgable == VM_PURGABLE_EMPTY) &&
20192 VM_OBJECT_OWNER(object) != NULL &&
20193 VM_OBJECT_OWNER(object)->map == map) {
20194 if (object->internal) {
20195 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20196 }
20197 if ((((curr_s_offset
20198 - map_entry->vme_start
20199 + VME_OFFSET(map_entry))
20200 / effective_page_size) <
20201 object->wired_page_count)) {
20202 /*
20203 * Volatile|empty purgeable object owned
20204 * by this task: report the first
20205 * "#wired" pages as "resident" (to
20206 * show that they contribute to the
20207 * footprint) but not "dirty" (to avoid
20208 * double-counting with the fake
20209 * "non-volatile" region we'll report
20210 * at the end of the address space to
20211 * account for all (mapped or not)
20212 * non-volatile memory owned by this
20213 * task.
20214 */
20215 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20216 }
20217 } else if (!map_entry->is_sub_map &&
20218 map_entry->iokit_acct &&
20219 object->internal &&
20220 object->purgable == VM_PURGABLE_DENY) {
20221 /*
20222 * Non-purgeable IOKit memory: phys_footprint
20223 * includes the entire virtual mapping.
20224 */
20225 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20226 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20227 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20228 } else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
20229 PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
20230 /* alternate accounting */
20231 #if __arm64__ && (DEVELOPMENT || DEBUG)
20232 if (map->pmap->footprint_was_suspended) {
20233 /*
20234 * The assertion below can fail if dyld
20235 * suspended footprint accounting
20236 * while doing some adjustments to
20237 * this page; the mapping would say
20238 * "use pmap accounting" but the page
20239 * would be marked "alternate
20240 * accounting".
20241 */
20242 } else
20243 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
20244 {
20245 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20246 }
20247 disposition = 0;
20248 } else {
20249 if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
20250 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20251 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20252 disposition |= VM_PAGE_QUERY_PAGE_REF;
20253 if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
20254 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20255 } else {
20256 disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20257 }
20258 if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
20259 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20260 }
20261 } else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
20262 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20263 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20264 }
20265 }
20266
20267 *disposition_p = disposition;
20268 }
20269
20270 kern_return_t
20271 vm_map_page_info(
20272 vm_map_t map,
20273 vm_map_offset_ut offset_u,
20274 vm_page_info_flavor_t flavor,
20275 vm_page_info_t info,
20276 mach_msg_type_number_t *count)
20277 {
20278 return vm_map_page_range_info_internal(map,
20279 offset_u, /* start of range */
20280 vm_sanitize_compute_ut_end(offset_u, 1), /* this will get rounded in the call to the page boundary */
20281 (int)-1, /* effective_page_shift: unspecified */
20282 flavor,
20283 info,
20284 count);
20285 }
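/*
 * Editorial sketch (not part of the original source): asking for the basic
 * disposition of the single page containing an (untrusted) address. The
 * count must be VM_PAGE_INFO_BASIC_COUNT (or one less, for the legacy
 * un-padded layout) and tells the routine how much room `info` provides.
 */
#if 0 /* illustrative only */
static bool
example_page_is_resident(
	vm_map_t                map,
	vm_map_offset_ut        addr_u)         /* untrusted address */
{
	struct vm_page_info_basic       basic = { 0 };
	mach_msg_type_number_t          count = VM_PAGE_INFO_BASIC_COUNT;

	if (vm_map_page_info(map, addr_u, VM_PAGE_INFO_BASIC,
	    (vm_page_info_t)&basic, &count) != KERN_SUCCESS) {
		return false;
	}
	return (basic.disposition & VM_PAGE_QUERY_PAGE_PRESENT) != 0;
}
#endif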
20286
20287 static __attribute__((always_inline, warn_unused_result))
20288 kern_return_t
20289 vm_map_page_range_info_sanitize(
20290 vm_map_t map,
20291 vm_map_offset_ut start_offset_u,
20292 vm_map_offset_ut end_offset_u,
20293 vm_map_offset_t effective_page_mask,
20294 vm_map_offset_t *start,
20295 vm_map_offset_t *end,
20296 vm_map_offset_t *offset_in_page)
20297 {
20298 kern_return_t retval;
20299 vm_map_size_t size;
20300
20301 /*
20302 * Perform validation against the map's mask but don't align start/end,
20303 * as those need to be aligned with respect to effective_page_mask.
20304 */
20305 retval = vm_sanitize_addr_end(start_offset_u, end_offset_u,
20306 VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, map,
20307 VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
20308 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES, start,
20309 end, &size);
20310 if (retval != KERN_SUCCESS) {
20311 return retval;
20312 }
20313
20314 retval = vm_sanitize_addr_end(start_offset_u, end_offset_u,
20315 VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, effective_page_mask,
20316 VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start,
20317 end, &size);
20318 if (retval != KERN_SUCCESS) {
20319 return retval;
20320 }
20321
20322 *offset_in_page = vm_sanitize_offset_in_page(effective_page_mask,
20323 start_offset_u);
20324
20325 return KERN_SUCCESS;
20326 }
20327
20328 kern_return_t
20329 vm_map_page_range_info_internal(
20330 vm_map_t map,
20331 vm_map_offset_ut start_offset_u,
20332 vm_map_offset_ut end_offset_u,
20333 int effective_page_shift,
20334 vm_page_info_flavor_t flavor,
20335 vm_page_info_t info,
20336 mach_msg_type_number_t *count)
20337 {
20338 vm_map_entry_t map_entry = VM_MAP_ENTRY_NULL;
20339 vm_object_t object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
20340 vm_page_t m = VM_PAGE_NULL;
20341 kern_return_t retval = KERN_SUCCESS;
20342 int disposition = 0;
20343 int ref_count = 0;
20344 int depth = 0, info_idx = 0;
20345 vm_page_info_basic_t basic_info = 0;
20346 vm_map_offset_t offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
20347 vm_map_offset_t start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
20348 boolean_t do_region_footprint;
20349 ledger_amount_t ledger_resident, ledger_compressed;
20350 int effective_page_size;
20351 vm_map_offset_t effective_page_mask;
20352
20353 switch (flavor) {
20354 case VM_PAGE_INFO_BASIC:
20355 if (*count != VM_PAGE_INFO_BASIC_COUNT) {
20356 /*
20357 * The "vm_page_info_basic_data" structure was not
20358 * properly padded, so allow the size to be off by
20359 * one to maintain backwards binary compatibility...
20360 */
20361 if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
20362 return KERN_INVALID_ARGUMENT;
20363 }
20364 }
20365 break;
20366 default:
20367 return KERN_INVALID_ARGUMENT;
20368 }
20369
20370 if (effective_page_shift == -1) {
20371 effective_page_shift = vm_self_region_page_shift_safely(map);
20372 if (effective_page_shift == -1) {
20373 return KERN_INVALID_ARGUMENT;
20374 }
20375 }
20376 effective_page_size = (1 << effective_page_shift);
20377 effective_page_mask = effective_page_size - 1;
20378
20379
20380 retval = vm_map_page_range_info_sanitize(map,
20381 start_offset_u,
20382 end_offset_u,
20383 effective_page_mask,
20384 &start,
20385 &end,
20386 &offset_in_page);
20387 if (retval != KERN_SUCCESS) {
20388 return vm_sanitize_get_kr(retval);
20389 }
20390
20391 assert((end - start) <= MAX_PAGE_RANGE_QUERY);
20392
20393 do_region_footprint = task_self_region_footprint();
20394 disposition = 0;
20395 ref_count = 0;
20396 depth = 0;
20397 info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
20398
20399 vm_map_lock_read(map);
20400
20401
20402 task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
20403
20404 for (curr_s_offset = start; curr_s_offset < end;) {
20405 /*
20406 * New lookup needs reset of these variables.
20407 */
20408 curr_object = object = VM_OBJECT_NULL;
20409 offset_in_object = 0;
20410 ref_count = 0;
20411 depth = 0;
20412
20413 if (do_region_footprint &&
20414 curr_s_offset >= vm_map_last_entry(map)->vme_end) {
20415 /*
20416 * Request for "footprint" info about a page beyond
20417 * the end of address space: this must be for
20418 * the fake region vm_map_region_recurse_64()
20419 * reported to account for non-volatile purgeable
20420 * memory owned by this task.
20421 */
20422 disposition = 0;
20423
20424 if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
20425 (unsigned) ledger_compressed) {
20426 /*
20427 * We haven't reported all the "non-volatile
20428 * compressed" pages yet, so report this fake
20429 * page as "compressed".
20430 */
20431 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20432 } else {
20433 /*
20434 * We've reported all the non-volatile
20435 * compressed pages but not all the non-volatile
20436 * pages, so report this fake page as
20437 * "resident dirty".
20438 */
20439 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20440 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20441 disposition |= VM_PAGE_QUERY_PAGE_REF;
20442 }
20443 switch (flavor) {
20444 case VM_PAGE_INFO_BASIC:
20445 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20446 basic_info->disposition = disposition;
20447 basic_info->ref_count = 1;
20448 basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20449 basic_info->offset = 0;
20450 basic_info->depth = 0;
20451
20452 info_idx++;
20453 break;
20454 }
20455 curr_s_offset += effective_page_size;
20456 continue;
20457 }
20458
20459 /*
20460 * First, find the map entry covering "curr_s_offset", going down
20461 * submaps if necessary.
20462 */
20463 if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
20464 /* no entry -> no object -> no page */
20465
20466 if (curr_s_offset < vm_map_min(map)) {
20467 /*
20468 * Illegal address that falls below map min.
20469 */
20470 curr_e_offset = MIN(end, vm_map_min(map));
20471 } else if (curr_s_offset >= vm_map_max(map)) {
20472 /*
20473 * Illegal address that falls on/after map max.
20474 */
20475 curr_e_offset = end;
20476 } else if (map_entry == vm_map_to_entry(map)) {
20477 /*
20478 * Hit a hole.
20479 */
20480 if (map_entry->vme_next == vm_map_to_entry(map)) {
20481 /*
20482 * Empty map.
20483 */
20484 curr_e_offset = MIN(map->max_offset, end);
20485 } else {
20486 /*
20487 * Hole at start of the map.
20488 */
20489 curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20490 }
20491 } else {
20492 if (map_entry->vme_next == vm_map_to_entry(map)) {
20493 /*
20494 * Hole at the end of the map.
20495 */
20496 curr_e_offset = MIN(map->max_offset, end);
20497 } else {
20498 curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20499 }
20500 }
20501
20502 assert(curr_e_offset >= curr_s_offset);
20503
20504 uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20505
20506 void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20507
20508 bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20509
20510 curr_s_offset = curr_e_offset;
20511
20512 info_idx += num_pages;
20513
20514 continue;
20515 }
20516
20517 /* compute offset from this map entry's start */
20518 offset_in_object = curr_s_offset - map_entry->vme_start;
20519
20520 /* compute offset into this map entry's object (or submap) */
20521 offset_in_object += VME_OFFSET(map_entry);
20522
20523 if (map_entry->is_sub_map) {
20524 vm_map_t sub_map = VM_MAP_NULL;
20525 vm_page_info_t submap_info = 0;
20526 vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
20527
20528 range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
20529
20530 submap_s_offset = offset_in_object;
20531 submap_e_offset = submap_s_offset + range_len;
20532
20533 sub_map = VME_SUBMAP(map_entry);
20534
20535 vm_map_reference(sub_map);
20536 vm_map_unlock_read(map);
20537
20538 submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20539
20540 assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
20541 "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
20542
20543 retval = vm_map_page_range_info_internal(sub_map,
20544 submap_s_offset,
20545 submap_e_offset,
20546 effective_page_shift,
20547 VM_PAGE_INFO_BASIC,
20548 (vm_page_info_t) submap_info,
20549 count);
20550
20551 assert(retval == KERN_SUCCESS);
20552
20553 vm_map_deallocate(sub_map);
20554 sub_map = VM_MAP_NULL;
20555 vm_map_lock_read(map);
20556
20557 /* Move the "info" index by the number of pages we inspected.*/
20558 info_idx += range_len >> effective_page_shift;
20559
20560 /* Move our current offset by the size of the range we inspected.*/
20561 curr_s_offset += range_len;
20562
20563 continue;
20564 }
20565
20566 object = VME_OBJECT(map_entry);
20567
20568 if (object == VM_OBJECT_NULL) {
20569 /*
20570 * We don't have an object here and, hence,
20571 * no pages to inspect. We'll fill up the
20572 * info structure appropriately.
20573 */
20574
20575 curr_e_offset = MIN(map_entry->vme_end, end);
20576
20577 uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20578
20579 void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20580
20581 bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20582
20583 curr_s_offset = curr_e_offset;
20584
20585 info_idx += num_pages;
20586
20587 continue;
20588 }
20589
20590 if (do_region_footprint) {
20591 disposition = 0;
20592 if (map->has_corpse_footprint) {
20593 /*
20594 * Query the page info data we saved
20595 * while forking the corpse.
20596 */
20597 vm_map_corpse_footprint_query_page_info(
20598 map,
20599 curr_s_offset,
20600 &disposition);
20601 } else {
20602 /*
20603 * Query the live pmap for footprint info
20604 * about this page.
20605 */
20606 vm_map_footprint_query_page_info(
20607 map,
20608 map_entry,
20609 curr_s_offset,
20610 &disposition);
20611 }
20612 switch (flavor) {
20613 case VM_PAGE_INFO_BASIC:
20614 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20615 basic_info->disposition = disposition;
20616 basic_info->ref_count = 1;
20617 basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20618 basic_info->offset = 0;
20619 basic_info->depth = 0;
20620
20621 info_idx++;
20622 break;
20623 }
20624 curr_s_offset += effective_page_size;
20625 continue;
20626 }
20627
20628 vm_object_reference(object);
20629 /*
20630 * Shared mode -- so we can allow other readers
20631 * to grab the lock too.
20632 */
20633 vm_object_lock_shared(object);
20634
20635 curr_e_offset = MIN(map_entry->vme_end, end);
20636
20637 vm_map_unlock_read(map);
20638
20639 map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
20640
20641 curr_object = object;
20642
20643 for (; curr_s_offset < curr_e_offset;) {
20644 if (object == curr_object) {
20645 /* account for our object reference above. */
20646 ref_count = os_ref_get_count_raw(&curr_object->ref_count) - 1;
20647 } else {
20648 ref_count = os_ref_get_count_raw(&curr_object->ref_count);
20649 }
20650
20651 curr_offset_in_object = offset_in_object;
20652
20653 for (;;) {
20654 m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
20655
20656 if (m != VM_PAGE_NULL) {
20657 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20658 break;
20659 } else {
20660 if (curr_object->internal &&
20661 curr_object->alive &&
20662 !curr_object->terminating &&
20663 curr_object->pager_ready) {
20664 if (vm_object_compressor_pager_state_get(curr_object, vm_object_trunc_page(curr_offset_in_object))
20665 == VM_EXTERNAL_STATE_EXISTS) {
20666 /* the pager has that page */
20667 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20668 break;
20669 }
20670 }
20671
20672 /*
20673 * Go down the VM object shadow chain until we find the page
20674 * we're looking for.
20675 */
20676
20677 if (curr_object->shadow != VM_OBJECT_NULL) {
20678 vm_object_t shadow = VM_OBJECT_NULL;
20679
20680 curr_offset_in_object += curr_object->vo_shadow_offset;
20681 shadow = curr_object->shadow;
20682
20683 vm_object_lock_shared(shadow);
20684 vm_object_unlock(curr_object);
20685
20686 curr_object = shadow;
20687 depth++;
20688 continue;
20689 } else {
20690 break;
20691 }
20692 }
20693 }
20694
20695 /* The ref_count is not strictly accurate; it measures the number */
20696 /* of entities holding a ref on the object. They may not be mapping */
20697 /* the object or may not be mapping the section holding the */
20698 /* target page, but it's still a ballpark number and, though an over- */
20699 /* count, it picks up the copy-on-write cases */
20700
20701 /* We could also get a picture of page sharing from pmap_attributes */
20702 /* but this would under count as only faulted-in mappings would */
20703 /* show up. */
20704
20705 if ((curr_object == object) && curr_object->shadow) {
20706 disposition |= VM_PAGE_QUERY_PAGE_COPIED;
20707 }
20708
20709 if (!curr_object->internal) {
20710 disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20711 }
20712
20713 if (m != VM_PAGE_NULL) {
20714 if (vm_page_is_fictitious(m)) {
20715 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
20716 } else {
20717 if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
20718 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20719 }
20720
20721 if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
20722 disposition |= VM_PAGE_QUERY_PAGE_REF;
20723 }
20724
20725 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
20726 disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
20727 }
20728
20729 /*
20730 * XXX TODO4K:
20731 * when this routine deals with 4k
20732 * pages, check the appropriate CS bit
20733 * here.
20734 */
20735 if (m->vmp_cs_validated) {
20736 disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
20737 }
20738 if (m->vmp_cs_tainted) {
20739 disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
20740 }
20741 if (m->vmp_cs_nx) {
20742 disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
20743 }
20744 if (m->vmp_reusable || curr_object->all_reusable) {
20745 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20746 }
20747 }
20748 }
20749
20750 switch (flavor) {
20751 case VM_PAGE_INFO_BASIC:
20752 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20753 basic_info->disposition = disposition;
20754 basic_info->ref_count = ref_count;
20755 basic_info->object_id = (vm_object_id_t) (uintptr_t)
20756 VM_KERNEL_ADDRHASH(curr_object);
20757 basic_info->offset =
20758 (memory_object_offset_t) curr_offset_in_object + offset_in_page;
20759 basic_info->depth = depth;
20760
20761 info_idx++;
20762 break;
20763 }
20764
20765 disposition = 0;
20766 offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
20767
20768 /*
20769 * Move to next offset in the range and in our object.
20770 */
20771 curr_s_offset += effective_page_size;
20772 offset_in_object += effective_page_size;
20773 curr_offset_in_object = offset_in_object;
20774
20775 if (curr_object != object) {
20776 vm_object_unlock(curr_object);
20777
20778 curr_object = object;
20779
20780 vm_object_lock_shared(curr_object);
20781 } else {
20782 vm_object_lock_yield_shared(curr_object);
20783 }
20784 }
20785
20786 vm_object_unlock(curr_object);
20787 vm_object_deallocate(curr_object);
20788
20789 vm_map_lock_read(map);
20790 }
20791
20792 vm_map_unlock_read(map);
20793 return retval;
20794 }
20795
20796 static __attribute__((always_inline, warn_unused_result))
20797 kern_return_t
20798 vm_map_msync_sanitize(
20799 vm_map_t map,
20800 vm_map_address_ut address_u,
20801 vm_map_size_ut size_u,
20802 vm_object_offset_t *address,
20803 vm_map_size_t *size)
20804 {
20805 vm_object_offset_t end;
20806
20807 return vm_sanitize_addr_size(address_u, size_u,
20808 VM_SANITIZE_CALLER_VM_MAP_MSYNC,
20809 map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS,
20810 address, &end, size);
20811 }
20812
20813 /*
20814 * vm_map_msync
20815 *
20816 * Synchronises the memory range specified with its backing store
20817 * image by either flushing or cleaning the contents to the appropriate
20818 * memory manager engaging in a memory object synchronize dialog with
20819 * the manager. The client doesn't return until the manager issues
20820 * an m_o_s_completed message. MIG magically converts the user task
20821 * parameter to the task's address map.
20822 *
20823 * interpretation of sync_flags
20824 * VM_SYNC_INVALIDATE - discard pages, only return precious
20825 * pages to manager.
20826 *
20827 * VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
20828 * - discard pages, write dirty or precious
20829 * pages back to memory manager.
20830 *
20831 * VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
20832 * - write dirty or precious pages back to
20833 * the memory manager.
20834 *
20835 * VM_SYNC_CONTIGUOUS - does everything normally, but if there
20836 * is a hole in the region, and we would
20837 * have returned KERN_SUCCESS, return
20838 * KERN_INVALID_ADDRESS instead.
20839 *
20840 * NOTE
20841 * The memory object attributes have not yet been implemented, this
20842 * function will have to deal with the invalidate attribute
20843 *
20844 * RETURNS
20845 * KERN_INVALID_TASK Bad task parameter
20846 * KERN_INVALID_ARGUMENT both sync and async were specified.
20847 * KERN_SUCCESS The usual.
20848 * KERN_INVALID_ADDRESS There was a hole in the region.
20849 */
20850
20851 kern_return_t
20852 vm_map_msync(
20853 vm_map_t map,
20854 vm_map_address_ut address_u,
20855 vm_map_size_ut size_u,
20856 vm_sync_t sync_flags)
20857 {
20858 vm_map_entry_t entry;
20859 vm_map_size_t size, amount_left;
20860 vm_object_offset_t address, offset;
20861 vm_object_offset_t start_offset, end_offset;
20862 boolean_t do_sync_req;
20863 boolean_t had_hole = FALSE;
20864 vm_map_offset_t pmap_offset;
20865 kern_return_t kr;
20866
20867 if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
20868 (sync_flags & VM_SYNC_SYNCHRONOUS)) {
20869 return KERN_INVALID_ARGUMENT;
20870 }
20871
20872 if (map == VM_MAP_NULL) {
20873 return KERN_INVALID_TASK;
20874 }
20875
20876 kr = vm_map_msync_sanitize(map,
20877 address_u,
20878 size_u,
20879 &address,
20880 &size);
20881 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20882 DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
20883 }
20884 if (__improbable(kr != KERN_SUCCESS)) {
20885 return vm_sanitize_get_kr(kr);
20886 }
20887
20888 amount_left = size;
20889
20890 while (amount_left > 0) {
20891 vm_object_size_t flush_size;
20892 vm_object_t object;
20893
20894 vm_map_lock(map);
20895 if (!vm_map_lookup_entry(map,
20896 address,
20897 &entry)) {
20898 vm_map_size_t skip;
20899
20900 /*
20901 * hole in the address map.
20902 */
20903 had_hole = TRUE;
20904
20905 if (sync_flags & VM_SYNC_KILLPAGES) {
20906 /*
20907 * For VM_SYNC_KILLPAGES, there should be
20908 * no holes in the range, since we couldn't
20909 * prevent someone else from allocating in
20910 * that hole and we wouldn't want to "kill"
20911 * their pages.
20912 */
20913 vm_map_unlock(map);
20914 break;
20915 }
20916
20917 /*
20918 * Check for empty map.
20919 */
20920 if (entry == vm_map_to_entry(map) &&
20921 entry->vme_next == entry) {
20922 vm_map_unlock(map);
20923 break;
20924 }
20925 /*
20926 * Check that we don't wrap and that
20927 * we have at least one real map entry.
20928 */
20929 if ((map->hdr.nentries == 0) ||
20930 (entry->vme_next->vme_start < address)) {
20931 vm_map_unlock(map);
20932 break;
20933 }
20934 /*
20935 * Move up to the next entry if needed
20936 */
20937 skip = (entry->vme_next->vme_start - address);
20938 if (skip >= amount_left) {
20939 amount_left = 0;
20940 } else {
20941 amount_left -= skip;
20942 }
20943 address = entry->vme_next->vme_start;
20944 vm_map_unlock(map);
20945 continue;
20946 }
20947
20948 offset = address - entry->vme_start;
20949 pmap_offset = address;
20950
20951 /*
20952 * do we have more to flush than is contained in this
20953 * entry ?
20954 */
20955 if (amount_left + entry->vme_start + offset > entry->vme_end) {
20956 flush_size = entry->vme_end -
20957 (entry->vme_start + offset);
20958 } else {
20959 flush_size = amount_left;
20960 }
20961 amount_left -= flush_size;
20962 address += flush_size;
20963
20964 if (entry->is_sub_map == TRUE) {
20965 vm_map_t local_map;
20966 vm_map_offset_t local_offset;
20967
20968 local_map = VME_SUBMAP(entry);
20969 local_offset = VME_OFFSET(entry);
20970 vm_map_reference(local_map);
20971 vm_map_unlock(map);
20972 if (vm_map_msync(
20973 local_map,
20974 local_offset,
20975 flush_size,
20976 sync_flags) == KERN_INVALID_ADDRESS) {
20977 had_hole = TRUE;
20978 }
20979 vm_map_deallocate(local_map);
20980 local_map = VM_MAP_NULL;
20981 continue;
20982 }
20983 object = VME_OBJECT(entry);
20984
20985 /*
20986 * We can't sync this object if the object has not been
20987 * created yet
20988 */
20989 if (object == VM_OBJECT_NULL) {
20990 vm_map_unlock(map);
20991 continue;
20992 }
20993 offset += VME_OFFSET(entry);
20994
20995 vm_object_lock(object);
20996
20997 if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
20998 int kill_pages = 0;
20999
21000 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
21001 /*
21002 * This is a destructive operation and so we
21003 * err on the side of limiting the range of
21004 * the operation.
21005 */
21006 start_offset = vm_object_round_page(offset);
21007 end_offset = vm_object_trunc_page(offset + flush_size);
21008
21009 if (end_offset <= start_offset) {
21010 vm_object_unlock(object);
21011 vm_map_unlock(map);
21012 continue;
21013 }
21014
21015 pmap_offset += start_offset - offset;
21016 } else {
21017 start_offset = offset;
21018 end_offset = offset + flush_size;
21019 }
21020
21021 if (sync_flags & VM_SYNC_KILLPAGES) {
21022 if (((os_ref_get_count_raw(&object->ref_count) == 1) ||
21023 ((object->copy_strategy !=
21024 MEMORY_OBJECT_COPY_SYMMETRIC) &&
21025 (object->vo_copy == VM_OBJECT_NULL))) &&
21026 (object->shadow == VM_OBJECT_NULL)) {
21027 if (os_ref_get_count_raw(&object->ref_count) != 1) {
21028 vm_page_stats_reusable.free_shared++;
21029 }
21030 kill_pages = 1;
21031 } else {
21032 kill_pages = -1;
21033 }
21034 }
21035 if (kill_pages != -1) {
21036 boolean_t kill_no_write = FALSE;
21037
21038 if ((entry->protection & VM_PROT_EXECUTE) ||
21039 entry->vme_xnu_user_debug) {
21040 /*
21041 * Executable or user debug pages might be write-protected by
21042 * hardware, so do not attempt to write to these pages.
21043 */
21044 kill_no_write = TRUE;
21045 }
21046 vm_object_deactivate_pages(
21047 object,
21048 start_offset,
21049 (vm_object_size_t) (end_offset - start_offset),
21050 kill_pages,
21051 FALSE, /* reusable_pages */
21052 kill_no_write,
21053 map->pmap,
21054 pmap_offset);
21055 }
21056 vm_object_unlock(object);
21057 vm_map_unlock(map);
21058 continue;
21059 }
21060 /*
21061 * We can't sync this object if there isn't a pager.
21062 * Don't bother to sync internal objects, since there can't
21063 * be any "permanent" storage for these objects anyway.
21064 */
21065 if ((object->pager == MEMORY_OBJECT_NULL) ||
21066 (object->internal) || (object->private)) {
21067 vm_object_unlock(object);
21068 vm_map_unlock(map);
21069 continue;
21070 }
21071 /*
21072 * keep reference on the object until syncing is done
21073 */
21074 vm_object_reference_locked(object);
21075 vm_object_unlock(object);
21076
21077 vm_map_unlock(map);
21078
21079 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
21080 start_offset = vm_object_trunc_page(offset);
21081 end_offset = vm_object_round_page(offset + flush_size);
21082 } else {
21083 start_offset = offset;
21084 end_offset = offset + flush_size;
21085 }
21086
21087 do_sync_req = vm_object_sync(object,
21088 start_offset,
21089 (end_offset - start_offset),
21090 sync_flags & VM_SYNC_INVALIDATE,
21091 ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
21092 (sync_flags & VM_SYNC_ASYNCHRONOUS)),
21093 sync_flags & VM_SYNC_SYNCHRONOUS);
21094
21095 if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
21096 /*
21097 * clear out the clustering and read-ahead hints
21098 */
21099 vm_object_lock(object);
21100
21101 object->pages_created = 0;
21102 object->pages_used = 0;
21103 object->sequential = 0;
21104 object->last_alloc = 0;
21105
21106 vm_object_unlock(object);
21107 }
21108 vm_object_deallocate(object);
21109 } /* while */
21110
21111 /* for proper msync() behaviour */
21112 if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
21113 return KERN_INVALID_ADDRESS;
21114 }
21115
21116 return KERN_SUCCESS;
21117 }/* vm_msync */
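/*
 * Editorial sketch (not part of the original source): a synchronous
 * write-back of an (untrusted) range. Adding VM_SYNC_CONTIGUOUS means a
 * hole anywhere in the range is reported as KERN_INVALID_ADDRESS rather
 * than silently skipped, per the flag description above. The helper name
 * is hypothetical.
 */
#if 0 /* illustrative only */
static kern_return_t
example_flush_range(
	vm_map_t                map,
	vm_map_address_ut       addr_u,         /* untrusted address */
	vm_map_size_ut          size_u)         /* untrusted size */
{
	return vm_map_msync(map, addr_u, size_u,
	    VM_SYNC_SYNCHRONOUS | VM_SYNC_CONTIGUOUS);
}
#endif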
21118
21119 void
21120 vm_named_entry_associate_vm_object(
21121 vm_named_entry_t named_entry,
21122 vm_object_t object,
21123 vm_object_offset_t offset,
21124 vm_object_size_t size,
21125 vm_prot_t prot)
21126 {
21127 vm_map_copy_t copy;
21128 vm_map_entry_t copy_entry;
21129
21130 assert(!named_entry->is_sub_map);
21131 assert(!named_entry->is_copy);
21132 assert(!named_entry->is_object);
21133 assert(!named_entry->internal);
21134 assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
21135
21136 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
21137 copy->offset = offset;
21138 copy->size = size;
21139 copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
21140
21141 copy_entry = vm_map_copy_entry_create(copy);
21142 copy_entry->protection = prot;
21143 copy_entry->max_protection = prot;
21144 copy_entry->use_pmap = TRUE;
21145 copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
21146 copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
21147 VME_OBJECT_SET(copy_entry, object, false, 0);
21148 VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
21149 vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
21150
21151 named_entry->backing.copy = copy;
21152 named_entry->is_object = TRUE;
21153 if (object->internal) {
21154 named_entry->internal = TRUE;
21155 }
21156
21157 DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
21158 named_entry, copy, object, offset, size, prot);
21159 }
21160
21161 vm_object_t
21162 vm_named_entry_to_vm_object(
21163 vm_named_entry_t named_entry)
21164 {
21165 vm_map_copy_t copy;
21166 vm_map_entry_t copy_entry;
21167 vm_object_t object;
21168
21169 assert(!named_entry->is_sub_map);
21170 assert(!named_entry->is_copy);
21171 assert(named_entry->is_object);
21172 copy = named_entry->backing.copy;
21173 assert(copy != VM_MAP_COPY_NULL);
21174 /*
21175 * Assert that the vm_map_copy is coming from the right
21176 * zone and hasn't been forged
21177 */
21178 vm_map_copy_require(copy);
21179 assert(copy->cpy_hdr.nentries == 1);
21180 copy_entry = vm_map_copy_first_entry(copy);
21181 object = VME_OBJECT(copy_entry);
21182
21183 DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
21184
21185 return object;
21186 }
21187
21188 /*
21189 * Routine: convert_port_entry_to_map
21190 * Purpose:
21191 * Convert from a port specifying an entry or a task
21192 * to a map. Doesn't consume the port ref; produces a map ref,
21193 * which may be null. Unlike convert_port_to_map, the
21194 * port may be task or a named entry backed.
21195 * Conditions:
21196 * Nothing locked.
21197 */
21198
21199 vm_map_t
21200 convert_port_entry_to_map(
21201 ipc_port_t port)
21202 {
21203 vm_map_t map = VM_MAP_NULL;
21204 vm_named_entry_t named_entry;
21205
21206 if (!IP_VALID(port)) {
21207 return VM_MAP_NULL;
21208 }
21209
21210 if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
21211 return convert_port_to_map(port);
21212 }
21213
21214 named_entry = mach_memory_entry_from_port(port);
21215
21216 if ((named_entry->is_sub_map) &&
21217 (named_entry->protection & VM_PROT_WRITE)) {
21218 map = named_entry->backing.map;
21219 if (map->pmap != PMAP_NULL) {
21220 if (map->pmap == kernel_pmap) {
21221 panic("userspace has access "
21222 "to a kernel map %p", map);
21223 }
21224 pmap_require(map->pmap);
21225 }
21226 vm_map_reference(map);
21227 }
21228
21229 return map;
21230 }
21231
21232 /*
21233 * Export routines to other components for the things we access locally through
21234 * macros.
21235 */
21236 #undef current_map
21237 vm_map_t
21238 current_map(void)
21239 {
21240 return current_map_fast();
21241 }
21242
21243 /*
21244 * vm_map_reference:
21245 *
21246 * Takes a reference on the specified map.
21247 */
21248 void
21249 vm_map_reference(
21250 vm_map_t map)
21251 {
21252 if (__probable(map != VM_MAP_NULL)) {
21253 vm_map_require(map);
21254 os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
21255 }
21256 }
21257
21258 /*
21259 * vm_map_deallocate:
21260 *
21261 * Removes a reference from the specified map,
21262 * destroying it if no references remain.
21263 * The map should not be locked.
21264 */
21265 void
21266 vm_map_deallocate(
21267 vm_map_t map)
21268 {
21269 if (__probable(map != VM_MAP_NULL)) {
21270 vm_map_require(map);
21271 if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
21272 vm_map_destroy(map);
21273 }
21274 }
21275 }
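/*
 * Editorial sketch (not part of the original source): the reference
 * discipline these two routines imply. A holder that needs a map to stay
 * alive across a blocking operation takes a reference first and drops it
 * when done, as the copyin/copyout paths earlier in this file already
 * demonstrate.
 */
#if 0 /* illustrative only */
static void
example_with_map_reference(vm_map_t map)
{
	vm_map_reference(map);          /* pin the map (no-op on VM_MAP_NULL) */

	/* ... potentially blocking work against `map` ... */

	vm_map_deallocate(map);         /* drop the ref; destroys the map if it was the last one */
}
#endif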
21276
21277 void
21278 vm_map_inspect_deallocate(
21279 vm_map_inspect_t map)
21280 {
21281 vm_map_deallocate((vm_map_t)map);
21282 }
21283
21284 void
21285 vm_map_read_deallocate(
21286 vm_map_read_t map)
21287 {
21288 vm_map_deallocate((vm_map_t)map);
21289 }
21290
21291
21292 void
21293 vm_map_disable_NX(vm_map_t map)
21294 {
21295 if (map == NULL) {
21296 return;
21297 }
21298 if (map->pmap == NULL) {
21299 return;
21300 }
21301
21302 pmap_disable_NX(map->pmap);
21303 }
21304
21305 void
21306 vm_map_disallow_data_exec(vm_map_t map)
21307 {
21308 if (map == NULL) {
21309 return;
21310 }
21311
21312 map->map_disallow_data_exec = TRUE;
21313 }
21314
21315 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
21316 * more descriptive.
21317 */
21318 void
21319 vm_map_set_32bit(vm_map_t map)
21320 {
21321 #if defined(__arm64__)
21322 map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
21323 #else
21324 map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
21325 #endif
21326 }
21327
21328
21329 void
21330 vm_map_set_64bit(vm_map_t map)
21331 {
21332 #if defined(__arm64__)
21333 map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
21334 #else
21335 map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
21336 #endif
21337 }
21338
21339 /*
21340 * Expand the maximum size of an existing map to 64GB.
21341 */
21342 void
21343 vm_map_set_jumbo(vm_map_t map)
21344 {
21345 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
21346 vm_map_set_max_addr(map, ~0, false);
21347 #else /* arm64 */
21348 (void) map;
21349 #endif
21350 }
21351
21352 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
21353 /*
21354 * Expand the maximum size of an existing map to the maximum supported.
21355 */
21356 void
21357 vm_map_set_extra_jumbo(vm_map_t map)
21358 {
21359 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
21360 vm_map_set_max_addr(map, ~0, true);
21361 #else /* arm64 */
21362 (void) map;
21363 #endif
21364 }
21365 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
21366
21367 /*
21368 * This map has a JIT entitlement
21369 */
21370 void
21371 vm_map_set_jit_entitled(vm_map_t map)
21372 {
21373 #if defined (__arm64__)
21374 pmap_set_jit_entitled(map->pmap);
21375 #else /* arm64 */
21376 (void) map;
21377 #endif
21378 }
21379
21380 /*
21381 * Get the status of this map's TPRO flag
21382 */
21383 boolean_t
21384 vm_map_tpro(vm_map_t map)
21385 {
21386 #if defined (__arm64e__)
21387 return pmap_get_tpro(map->pmap);
21388 #else /* arm64e */
21389 (void) map;
21390 return FALSE;
21391 #endif
21392 }
21393
21394 /*
21395 * This map has TPRO enabled
21396 */
21397 void
21398 vm_map_set_tpro(vm_map_t map)
21399 {
21400 #if defined (__arm64e__)
21401 pmap_set_tpro(map->pmap);
21402 #else /* arm64e */
21403 (void) map;
21404 #endif
21405 }
21406
21407
21408 /*
21409 * Does this map have TPRO enforcement enabled?
21410 */
21411 boolean_t
21412 vm_map_tpro_enforcement(vm_map_t map)
21413 {
21414 return map->tpro_enforcement;
21415 }
21416
21417 /*
21418 * Set TPRO enforcement for this map
21419 */
21420 void
21421 vm_map_set_tpro_enforcement(vm_map_t map)
21422 {
21423 if (vm_map_tpro(map)) {
21424 vm_map_lock(map);
21425 map->tpro_enforcement = TRUE;
21426 vm_map_unlock(map);
21427 }
21428 }
21429
21430 /*
21431 * Enable TPRO on the requested region
21432 *
21433 * Note:
21434 * This routine is primarily intended to be called during/soon after map
21435 * creation before the associated task has been released to run. It is only
21436 * currently safe when we have no resident pages.
21437 */
21438 boolean_t
21439 vm_map_set_tpro_range(
21440 __unused vm_map_t map,
21441 __unused vm_map_address_t start,
21442 __unused vm_map_address_t end)
21443 {
21444 return TRUE;
21445 }
21446
21447 /*
21448 * Expand the maximum size of an existing map.
21449 */
21450 void
21451 vm_map_set_max_addr(
21452 vm_map_t map,
21453 vm_map_offset_t new_max_offset,
21454 __unused bool extra_jumbo)
21455 {
21456 #if defined(__arm64__)
21457 vm_map_offset_t max_supported_offset;
21458 vm_map_offset_t old_max_offset;
21459 unsigned int option = ARM_PMAP_MAX_OFFSET_JUMBO;
21460
21461 vm_map_lock(map);
21462
21463 old_max_offset = map->max_offset;
21464 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
21465 if (extra_jumbo) {
21466 option = ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO;
21467 }
21468 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
21469 max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), option);
21470
21471 new_max_offset = trunc_page(new_max_offset);
21472
21473 /* The address space cannot be shrunk using this routine. */
21474 if (old_max_offset >= new_max_offset) {
21475 vm_map_unlock(map);
21476 return;
21477 }
21478
21479 if (max_supported_offset < new_max_offset) {
21480 new_max_offset = max_supported_offset;
21481 }
21482
21483 map->max_offset = new_max_offset;
21484
21485 /*
21486 * Disable the following chunk of code that extends the "holes" list
21487 * to accommodate a larger VM map.
21488 * In `vm_map_create_options()`, we now set the end of the "holes" list to
21489 * max(map->max_offset, MACH_VM_MAX_ADDRESS) for all platforms.
21490 * MACH_VM_MAX_ADDRESS is the largest virtual address a userspace process
21491 * can map, so any `new_max_offset` value will be <= MACH_VM_MAX_ADDRESS.
21492 * The "holes" list does not need to be adjusted.
21493 */
21494 #if 0
21495 if (map->holelistenabled) {
21496 if (map->holes_list->prev->vme_end == old_max_offset) {
21497 /*
21498 * There is already a hole at the end of the map; simply make it bigger.
21499 */
21500 map->holes_list->prev->vme_end = map->max_offset;
21501 } else {
21502 /*
21503 * There is no hole at the end, so we need to create a new hole
21504 * for the new empty space we're creating.
21505 */
21506 struct vm_map_links *new_hole;
21507
21508 new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
21509 new_hole->start = old_max_offset;
21510 new_hole->end = map->max_offset;
21511 new_hole->prev = map->holes_list->prev;
21512 new_hole->next = (struct vm_map_entry *)map->holes_list;
21513 map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
21514 map->holes_list->prev = (struct vm_map_entry *)new_hole;
21515 }
21516 }
21517 #endif
21518
21519 vm_map_unlock(map);
21520 #else
21521 (void)map;
21522 (void)new_max_offset;
21523 #endif
21524 }
21525
21526 vm_map_offset_t
21527 vm_compute_max_offset(boolean_t is64)
21528 {
21529 #if defined(__arm64__)
21530 return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
21531 #else
21532 return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
21533 #endif
21534 }
21535
21536 void
21537 vm_map_get_max_aslr_slide_section(
21538 vm_map_t map __unused,
21539 int64_t *max_sections,
21540 int64_t *section_size)
21541 {
21542 #if defined(__arm64__)
21543 *max_sections = 3;
21544 *section_size = ARM_TT_TWIG_SIZE;
21545 #else
21546 *max_sections = 1;
21547 *section_size = 0;
21548 #endif
21549 }
21550
21551 uint64_t
21552 vm_map_get_max_aslr_slide_pages(vm_map_t map)
21553 {
21554 #if defined(__arm64__)
21555 /* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
21556 * limited embedded address space; this is also meant to minimize pmap
21557 * memory usage on 16KB page systems.
21558 */
21559 return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
21560 #else
21561 return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21562 #endif
21563 }
21564
21565 uint64_t
21566 vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
21567 {
21568 #if defined(__arm64__)
21569 /* We limit the loader slide to 4MB, in order to ensure at least 8 bits
21570 * of independent entropy on 16KB page systems.
21571 */
21572 return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
21573 #else
21574 return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21575 #endif
21576 }
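/*
 * Editorial note (worked example, not part of the original source): on a
 * 16KB-page arm64 map, VM_MAP_PAGE_SHIFT(map) == 14, so the two routines
 * above return 1 << (24 - 14) = 1024 pages and 1 << (22 - 14) = 256 pages
 * respectively, i.e. 1024 * 16KB = 16MB of main-slide range and
 * 256 * 16KB = 4MB of loader-slide range (256 positions = 8 bits of
 * entropy). On a 4KB-page map the shift is 12, so the page counts grow
 * (4096 and 1024) while the byte ranges stay at 16MB and 4MB.
 */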
21577
21578 boolean_t
21579 vm_map_is_64bit(
21580 vm_map_t map)
21581 {
21582 return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
21583 }
21584
21585 boolean_t
21586 vm_map_has_hard_pagezero(
21587 vm_map_t map,
21588 vm_map_offset_t pagezero_size)
21589 {
21590 /*
21591 * XXX FBDP
21592 * We should lock the VM map (for read) here but we can get away
21593 * with it for now because there can't really be any race condition:
21594 * the VM map's min_offset is changed only when the VM map is created
21595 * and when the zero page is established (when the binary gets loaded),
21596 * and this routine gets called only when the task terminates and the
21597 * VM map is being torn down, and when a new map is created via
21598 * load_machfile()/execve().
21599 */
21600 return map->min_offset >= pagezero_size;
21601 }
21602
21603 /*
21604 * Raise a VM map's maximum offset.
21605 */
21606 kern_return_t
21607 vm_map_raise_max_offset(
21608 vm_map_t map,
21609 vm_map_offset_t new_max_offset)
21610 {
21611 kern_return_t ret;
21612
21613 vm_map_lock(map);
21614 ret = KERN_INVALID_ADDRESS;
21615
21616 if (new_max_offset >= map->max_offset) {
21617 if (!vm_map_is_64bit(map)) {
21618 if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
21619 map->max_offset = new_max_offset;
21620 ret = KERN_SUCCESS;
21621 }
21622 } else {
21623 if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
21624 map->max_offset = new_max_offset;
21625 ret = KERN_SUCCESS;
21626 }
21627 }
21628 }
21629
21630 vm_map_unlock(map);
21631 return ret;
21632 }
21633
21634
21635 /*
21636 * Raise a VM map's minimum offset.
21637 * To strictly enforce "page zero" reservation.
21638 */
21639 kern_return_t
21640 vm_map_raise_min_offset(
21641 vm_map_t map,
21642 vm_map_offset_t new_min_offset)
21643 {
21644 vm_map_entry_t first_entry;
21645
21646 new_min_offset = vm_map_round_page(new_min_offset,
21647 VM_MAP_PAGE_MASK(map));
21648
21649 vm_map_lock(map);
21650
21651 if (new_min_offset < map->min_offset) {
21652 /*
21653 * Can't move min_offset backwards, as that would expose
21654 * a part of the address space that was previously, and for
21655 * possibly good reasons, inaccessible.
21656 */
21657 vm_map_unlock(map);
21658 return KERN_INVALID_ADDRESS;
21659 }
21660 if (new_min_offset >= map->max_offset) {
21661 /* can't go beyond the end of the address space */
21662 vm_map_unlock(map);
21663 return KERN_INVALID_ADDRESS;
21664 }
21665
21666 first_entry = vm_map_first_entry(map);
21667 if (first_entry != vm_map_to_entry(map) &&
21668 first_entry->vme_start < new_min_offset) {
21669 /*
21670 * Some memory was already allocated below the new
21671 * minimum offset. It's too late to change it now...
21672 */
21673 vm_map_unlock(map);
21674 return KERN_NO_SPACE;
21675 }
21676
21677 map->min_offset = new_min_offset;
21678
21679 if (map->holelistenabled) {
21680 assert(map->holes_list);
21681 map->holes_list->start = new_min_offset;
21682 assert(new_min_offset < map->holes_list->end);
21683 }
21684
21685 vm_map_unlock(map);
21686
21687 return KERN_SUCCESS;
21688 }
21689
21690 /*
21691 * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
21692 * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit values maintained by the BSD
21693 * side of the kernel. The limits are checked on the Mach VM side, so we keep a copy so we don't
21694 * have to reach over to the BSD data structures.
21695 */
21696
21697 uint64_t vm_map_set_size_limit_count = 0;
21698 kern_return_t
21699 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
21700 {
21701 kern_return_t kr;
21702
21703 vm_map_lock(map);
21704 if (new_size_limit < map->size) {
21705 /* new limit should not be lower than its current size */
21706 DTRACE_VM2(vm_map_set_size_limit_fail,
21707 vm_map_size_t, map->size,
21708 uint64_t, new_size_limit);
21709 kr = KERN_FAILURE;
21710 } else if (new_size_limit == map->size_limit) {
21711 /* no change */
21712 kr = KERN_SUCCESS;
21713 } else {
21714 /* set new limit */
21715 DTRACE_VM2(vm_map_set_size_limit,
21716 vm_map_size_t, map->size,
21717 uint64_t, new_size_limit);
21718 if (new_size_limit != RLIM_INFINITY) {
21719 vm_map_set_size_limit_count++;
21720 }
21721 map->size_limit = new_size_limit;
21722 kr = KERN_SUCCESS;
21723 }
21724 vm_map_unlock(map);
21725 return kr;
21726 }
21727
21728 uint64_t vm_map_set_data_limit_count = 0;
21729 kern_return_t
21730 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
21731 {
21732 kern_return_t kr;
21733
21734 vm_map_lock(map);
21735 if (new_data_limit < map->size) {
21736 /* new limit should not be lower than its current size */
21737 DTRACE_VM2(vm_map_set_data_limit_fail,
21738 vm_map_size_t, map->size,
21739 uint64_t, new_data_limit);
21740 kr = KERN_FAILURE;
21741 } else if (new_data_limit == map->data_limit) {
21742 /* no change */
21743 kr = KERN_SUCCESS;
21744 } else {
21745 /* set new limit */
21746 DTRACE_VM2(vm_map_set_data_limit,
21747 vm_map_size_t, map->size,
21748 uint64_t, new_data_limit);
21749 if (new_data_limit != RLIM_INFINITY) {
21750 vm_map_set_data_limit_count++;
21751 }
21752 map->data_limit = new_data_limit;
21753 kr = KERN_SUCCESS;
21754 }
21755 vm_map_unlock(map);
21756 return kr;
21757 }
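/*
 * Illustrative sketch (editorial addition, not part of xnu): a hypothetical
 * caller mirroring BSD rlimits into the Mach VM map, per the comment above
 * vm_map_set_size_limit(). The helper name and the idea that the data limit
 * tracks RLIMIT_DATA are assumptions for illustration only.
 */
#if 0
static void
example_mirror_rlimits(vm_map_t map, uint64_t as_cur, uint64_t data_cur)
{
	(void)vm_map_set_size_limit(map, as_cur);    /* RLIMIT_AS analogue */
	(void)vm_map_set_data_limit(map, data_cur);  /* presumed RLIMIT_DATA analogue */
}
#endif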
21758
21759 void
21760 vm_map_set_user_wire_limit(vm_map_t map,
21761 vm_size_t limit)
21762 {
21763 vm_map_lock(map);
21764 map->user_wire_limit = limit;
21765 vm_map_unlock(map);
21766 }
21767
21768
21769 void
21770 vm_map_switch_protect(vm_map_t map,
21771 boolean_t val)
21772 {
21773 vm_map_lock(map);
21774 map->switch_protect = val;
21775 vm_map_unlock(map);
21776 }
21777
21778 extern int cs_process_enforcement_enable;
21779 boolean_t
21780 vm_map_cs_enforcement(
21781 vm_map_t map)
21782 {
21783 if (cs_process_enforcement_enable) {
21784 return TRUE;
21785 }
21786 return map->cs_enforcement;
21787 }
21788
21789 kern_return_t
21790 vm_map_cs_wx_enable(
21791 __unused vm_map_t map)
21792 {
21793 #if CODE_SIGNING_MONITOR
21794 kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
21795 if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
21796 return KERN_SUCCESS;
21797 }
21798 return ret;
21799 #else
21800 /* The VM manages WX memory entirely on its own */
21801 return KERN_SUCCESS;
21802 #endif
21803 }
21804
21805 kern_return_t
21806 vm_map_csm_allow_jit(
21807 __unused vm_map_t map)
21808 {
21809 #if CODE_SIGNING_MONITOR
21810 return csm_allow_jit_region(vm_map_pmap(map));
21811 #else
21812 /* No code signing monitor to enforce JIT policy */
21813 return KERN_SUCCESS;
21814 #endif
21815 }
21816
21817 void
21818 vm_map_cs_debugged_set(
21819 vm_map_t map,
21820 boolean_t val)
21821 {
21822 vm_map_lock(map);
21823 map->cs_debugged = val;
21824 vm_map_unlock(map);
21825 }
21826
21827 void
21828 vm_map_cs_enforcement_set(
21829 vm_map_t map,
21830 boolean_t val)
21831 {
21832 vm_map_lock(map);
21833 map->cs_enforcement = val;
21834 pmap_set_vm_map_cs_enforced(map->pmap, val);
21835 vm_map_unlock(map);
21836 }
21837
21838 /*
21839 * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
21840 * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
21841 * bump both counters.
21842 */
21843 void
21844 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
21845 {
21846 pmap_t pmap = vm_map_pmap(map);
21847
21848 ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21849 ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21850 }
21851
21852 void
21853 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
21854 {
21855 pmap_t pmap = vm_map_pmap(map);
21856
21857 ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21858 ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21859 }
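/*
 * Editorial note (assumption for illustration, not from the original source):
 * these two calls are expected to be balanced -- a region credited at IOKit
 * map time should be debited with the same byte count at unmap time, e.g.:
 *
 *	vm_map_iokit_mapped_region(map, bytes);     at IOKit map time
 *	...
 *	vm_map_iokit_unmapped_region(map, bytes);   at IOKit unmap time
 *
 * so that the iokit_mapped and phys_footprint ledgers return to their
 * previous balances.
 */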
21860
21861 /* Add (generate) code signature for memory range */
21862 #if CONFIG_DYNAMIC_CODE_SIGNING
21863 kern_return_t
21864 vm_map_sign(vm_map_t map,
21865 vm_map_offset_t start,
21866 vm_map_offset_t end)
21867 {
21868 vm_map_entry_t entry;
21869 vm_map_offset_t entry_start;
21870 vm_object_offset_t entry_offset;
21871 vm_page_t m;
21872 vm_object_t object;
21873
21874 /*
21875 * Vet all the input parameters and current type and state of the
21876 * underlying object. Return with an error if anything is amiss.
21877 */
21878 if (map == VM_MAP_NULL) {
21879 return KERN_INVALID_ARGUMENT;
21880 }
21881
21882 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
21883 return KERN_INVALID_ADDRESS;
21884 }
21885
21886 vm_map_lock_read(map);
21887
21888 if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
21889 /*
21890 * Must pass a valid non-submap address.
21891 */
21892 vm_map_unlock_read(map);
21893 return KERN_INVALID_ADDRESS;
21894 }
21895
21896 if ((entry->vme_start > start) || (entry->vme_end < end)) {
21897 /*
21898 * Map entry doesn't cover the requested range. Not handling
21899 * this situation currently.
21900 */
21901 vm_map_unlock_read(map);
21902 return KERN_INVALID_ARGUMENT;
21903 }
21904
21905 object = VME_OBJECT(entry);
21906 if (object == VM_OBJECT_NULL) {
21907 /*
21908 * Object must already be present or we can't sign.
21909 */
21910 vm_map_unlock_read(map);
21911 return KERN_INVALID_ARGUMENT;
21912 }
21913
21914 vm_object_lock(object);
21915
21916 entry_start = entry->vme_start;
21917 entry_offset = VME_OFFSET(entry);
21918 vm_map_unlock_read(map);
21919 entry = VM_MAP_ENTRY_NULL; /* no longer valid after unlocking map */
21920
21921 while (start < end) {
21922 uint32_t refmod;
21923
21924 m = vm_page_lookup(object,
21925 start - entry_start + entry_offset);
21926 if (m == VM_PAGE_NULL) {
21927 /* should we try to fault a page here? we can probably
21928 * demand it exists and is locked for this request */
21929 vm_object_unlock(object);
21930 return KERN_FAILURE;
21931 }
21932 /* deal with special page status */
21933 if (m->vmp_busy ||
21934 (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart ||
21935 vm_page_is_private(m) || m->vmp_absent))) {
21936 vm_object_unlock(object);
21937 return KERN_FAILURE;
21938 }
21939
21940 /* Page is OK... now "validate" it */
21941 /* This is the place where we'll call out to create a code
21942 * directory, later */
21943 /* XXX TODO4K: deal with 4k subpages individually? */
21944 m->vmp_cs_validated = VMP_CS_ALL_TRUE;
21945
21946 /* The page is now "clean" for codesigning purposes. That means
21947 * we don't consider it as modified (wpmapped) anymore. But
21948 * we'll disconnect the page so we note any future modification
21949 * attempts. */
21950 m->vmp_wpmapped = FALSE;
21951 refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
21952
21953 /* Pull the dirty status from the pmap, since we cleared the
21954 * wpmapped bit */
21955 if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
21956 SET_PAGE_DIRTY(m, FALSE);
21957 }
21958
21959 /* On to the next page */
21960 start += PAGE_SIZE;
21961 }
21962 vm_object_unlock(object);
21963
21964 return KERN_SUCCESS;
21965 }
21966 #endif
21967
21968 kern_return_t
21969 vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
21970 {
21971 vm_map_entry_t entry = VM_MAP_ENTRY_NULL;
21972 vm_map_entry_t next_entry;
21973 kern_return_t kr = KERN_SUCCESS;
21974 VM_MAP_ZAP_DECLARE(zap_list);
21975
21976 vm_map_lock(map);
21977
21978 for (entry = vm_map_first_entry(map);
21979 entry != vm_map_to_entry(map);
21980 entry = next_entry) {
21981 next_entry = entry->vme_next;
21982
21983 if (!entry->is_sub_map &&
21984 VME_OBJECT(entry) &&
21985 (VME_OBJECT(entry)->internal == TRUE) &&
21986 (os_ref_get_count_raw(&VME_OBJECT(entry)->ref_count) == 1)) {
21987 *reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
21988 *reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);
21989
21990 (void)vm_map_delete(map, entry->vme_start,
21991 entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
21992 KMEM_GUARD_NONE, &zap_list);
21993 }
21994 }
21995
21996 vm_map_unlock(map);
21997
21998 vm_map_zap_dispose(&zap_list);
21999
22000 return kr;
22001 }
22002
22003
22004 #if DEVELOPMENT || DEBUG
22005
22006 int
22007 vm_map_disconnect_page_mappings(
22008 vm_map_t map,
22009 boolean_t do_unnest)
22010 {
22011 vm_map_entry_t entry;
22012 ledger_amount_t byte_count = 0;
22013
22014 if (do_unnest == TRUE) {
22015 #ifndef NO_NESTED_PMAP
22016 vm_map_lock(map);
22017
22018 for (entry = vm_map_first_entry(map);
22019 entry != vm_map_to_entry(map);
22020 entry = entry->vme_next) {
22021 if (entry->is_sub_map && entry->use_pmap) {
22022 /*
22023 * Make sure the range between the start of this entry and
22024 * the end of this entry is no longer nested, so that
22025 * we will only remove mappings from the pmap in use by
22026 * this task.
22027 */
22028 vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
22029 }
22030 }
22031 vm_map_unlock(map);
22032 #endif
22033 }
22034 vm_map_lock_read(map);
22035
22036 ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);
22037
22038 for (entry = vm_map_first_entry(map);
22039 entry != vm_map_to_entry(map);
22040 entry = entry->vme_next) {
22041 if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
22042 (VME_OBJECT(entry)->phys_contiguous))) {
22043 continue;
22044 }
22045 if (entry->is_sub_map) {
22046 assert(!entry->use_pmap);
22047 }
22048
22049 pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
22050 }
22051 vm_map_unlock_read(map);
22052
22053 return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
22054 }
22055
22056 kern_return_t
22057 vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
22058 {
22059 vm_object_t object = NULL;
22060 vm_object_offset_t offset;
22061 vm_prot_t prot;
22062 boolean_t wired;
22063 vm_map_version_t version;
22064 vm_map_t real_map;
22065 int result = KERN_FAILURE;
22066
22067 vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
22068 vm_map_lock(map);
22069
22070 result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
22071 OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
22072 NULL, &real_map, NULL);
22073 if (object == NULL) {
22074 result = KERN_MEMORY_ERROR;
22075 } else if (object->pager) {
22076 result = vm_compressor_pager_inject_error(object->pager,
22077 offset);
22078 } else {
22079 result = KERN_MEMORY_PRESENT;
22080 }
22081
22082 if (object != NULL) {
22083 vm_object_unlock(object);
22084 }
22085
22086 if (real_map != map) {
22087 vm_map_unlock(real_map);
22088 }
22089 vm_map_unlock(map);
22090
22091 return result;
22092 }
22093
22094 /* Iterate over map entries. Call the first argument block with the number of entries and the second block for every entry.
22095 * returns: KERN_SUCCESS if iteration completed ok,
22096 * error code if a callback returned an error,
22097 * KERN_FAILURE if there was a race adding/removing entries during the iteration and the number of entries
22098 * iterated differs from the number reported to the first call
22099 */
22100 static kern_return_t
22101 vm_map_entries_foreach_locked(vm_map_t map, kern_return_t (^count_handler)(int nentries),
22102 kern_return_t (^entry_handler)(void* entry))
22103 {
22104 vm_map_lock_assert_held(map);
22105 int nentries = map->hdr.nentries;
22106 kern_return_t error = count_handler(nentries);
22107 if (error) {
22108 return error;
22109 }
22110
22111 /* iterate until we loop back to the map, see get_vmmap_entries() */
22112 vm_map_entry_t entry = vm_map_first_entry(map);
22113 int count = 0;
22114 while (entry != vm_map_to_entry(map)) {
22115 error = entry_handler(entry);
22116 if (error != KERN_SUCCESS) {
22117 return error;
22118 }
22119 entry = entry->vme_next;
22120 ++count;
22121 if (count > nentries) {
22122 /* nentries and entries iteration don't agree on how many entries there are, shouldn't really happen */
22123 return KERN_FAILURE;
22124 }
22125 }
22126 if (count < nentries) {
22127 return KERN_FAILURE;
22128 }
22129 return KERN_SUCCESS;
22130 }
22131
22132 kern_return_t
22133 vm_map_entries_foreach(vm_map_t map, kern_return_t (^count_handler)(int nentries),
22134 kern_return_t (^entry_handler)(void* entry))
22135 {
22136 vm_map_lock_read(map);
22137 kern_return_t error = vm_map_entries_foreach_locked(map, count_handler, entry_handler);
22138 vm_map_unlock_read(map);
22139 return error;
22140 }
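/*
 * Illustrative sketch (editorial addition, not part of xnu): assumed usage of
 * vm_map_entries_foreach(), counting the sub-map entries of a map. The first
 * block receives the entry count (useful for pre-sizing an output buffer);
 * the second is invoked for every entry, per the contract documented above.
 * "example_count_submaps" is a hypothetical helper.
 */
#if 0
static int
example_count_submaps(vm_map_t map)
{
	__block int submaps = 0;

	(void)vm_map_entries_foreach(map,
	    ^(int nentries) {
		(void)nentries;                 /* nothing to pre-allocate here */
		return KERN_SUCCESS;
	},
	    ^(void *ventry) {
		vm_map_entry_t entry = (vm_map_entry_t)ventry;

		if (entry->is_sub_map) {
			submaps++;
		}
		return KERN_SUCCESS;
	});

	return submaps;
}
#endif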
22141
22142 /*
22143 * Dump info about the entry into the given buffer.
22144 * Returns KERN_SUCCESS on success, or an error (e.g. KERN_NO_SPACE) if there was not enough space in the given buffer.
22145 * argument size in: bytes free in the given buffer, out: bytes written
22146 */
22147 kern_return_t
22148 vm_map_dump_entry_and_compressor_pager(void* pentry, char *buf, size_t *size)
22149 {
22150 size_t insize = *size;
22151 kern_return_t kr;
22152 size_t offset = 0;
22153
22154 *size = 0;
22155 if (sizeof(struct vm_map_entry_info) > insize) {
22156 return KERN_NO_SPACE;
22157 }
22158
22159 vm_map_entry_t entry = (vm_map_entry_t)pentry;
22160 struct vm_map_entry_info *out_entry = (struct vm_map_entry_info*)buf;
22161 out_entry->vmei_start = entry->vme_start;
22162 out_entry->vmei_end = entry->vme_end;
22163 out_entry->vmei_alias = VME_ALIAS(entry);
22164 out_entry->vmei_offset = VME_OFFSET(entry);
22165 out_entry->vmei_is_sub_map = entry->is_sub_map;
22166 out_entry->vmei_protection = entry->protection;
22167 offset += sizeof(struct vm_map_entry_info);
22168
22169 out_entry->vmei_slot_mapping_count = 0;
22170 out_entry->vmei_is_compressor_pager = false;
22171 *size = offset;
22172 if (out_entry->vmei_is_sub_map) {
22173 return KERN_SUCCESS; // TODO: sub_map interrogation not supported yet
22174 }
22175 /* have a vm_object? */
22176 vm_object_t object = VME_OBJECT(entry);
22177 if (object == VM_OBJECT_NULL || !object->internal) {
22178 return KERN_SUCCESS;
22179 }
22180 /* does the object have a pager? */
22181 memory_object_t pager = object->pager;
22182 if (pager == MEMORY_OBJECT_NULL) {
22183 return KERN_SUCCESS;
22184 }
22185 bool is_compressor = false;
22186 unsigned int slot_mapping_count = 0;
22187 size_t pager_info_size = insize - offset;
22188 kr = vm_compressor_pager_dump(pager, buf + offset, &pager_info_size, &is_compressor, &slot_mapping_count);
22189 if (kr != KERN_SUCCESS) {
22190 /* didn't have enough space for everything we want to write, caller needs to retry */
22191 return kr;
22192 }
22193 offset += pager_info_size;
22194 /* if we got here, is_compressor should be true due to the object->internal check above, so this assignment
22195 * is just for sanity's sake */
22196 out_entry->vmei_is_compressor_pager = is_compressor;
22197 out_entry->vmei_slot_mapping_count = slot_mapping_count;
22198 *size = offset;
22199 return KERN_SUCCESS;
22200 }
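/*
 * Illustrative sketch (editorial addition, not part of xnu): assumed usage of
 * vm_map_dump_entry_and_compressor_pager(). "size" is an in/out parameter
 * (in: free bytes, out: bytes written); on a space error the caller is
 * expected to retry with a larger buffer. Names are illustrative.
 */
#if 0
static kern_return_t
example_dump_entry(void *pentry, char *buf, size_t buf_size, size_t *written)
{
	size_t size = buf_size;
	kern_return_t kr;

	kr = vm_map_dump_entry_and_compressor_pager(pentry, buf, &size);
	if (kr == KERN_SUCCESS) {
		*written = size;
	}
	return kr;   /* e.g. KERN_NO_SPACE -> retry with a bigger buffer */
}
#endif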
22201
22202
22203 #endif
22204
22205
22206 #if CONFIG_FREEZE
22207
22208
22209 extern struct freezer_context freezer_context_global;
22210 AbsoluteTime c_freezer_last_yield_ts = 0;
22211
22212 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
22213 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
22214
22215 kern_return_t
22216 vm_map_freeze(
22217 task_t task,
22218 unsigned int *purgeable_count,
22219 unsigned int *wired_count,
22220 unsigned int *clean_count,
22221 unsigned int *dirty_count,
22222 unsigned int dirty_budget,
22223 unsigned int *shared_count,
22224 int *freezer_error_code,
22225 boolean_t eval_only)
22226 {
22227 vm_map_entry_t entry2 = VM_MAP_ENTRY_NULL;
22228 kern_return_t kr = KERN_SUCCESS;
22229 boolean_t evaluation_phase = TRUE;
22230 vm_object_t cur_shared_object = NULL;
22231 int cur_shared_obj_ref_cnt = 0;
22232 unsigned int dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;
22233
22234 *purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;
22235
22236 /*
22237 * We need the exclusive lock here so that we can
22238 * block any page faults or lookups while we are
22239 * in the middle of freezing this vm map.
22240 */
22241 vm_map_t map = task->map;
22242
22243 vm_map_lock(map);
22244
22245 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
22246
22247 if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
22248 if (vm_compressor_low_on_space()) {
22249 *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
22250 }
22251
22252 if (vm_swap_low_on_space()) {
22253 *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
22254 }
22255
22256 kr = KERN_NO_SPACE;
22257 goto done;
22258 }
22259
22260 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
22261 /*
22262 * In-memory compressor backing the freezer. No disk.
22263 * So no need to do the evaluation phase.
22264 */
22265 evaluation_phase = FALSE;
22266
22267 if (eval_only == TRUE) {
22268 /*
22269 * We don't support 'eval_only' mode
22270 * in this non-swap config.
22271 */
22272 *freezer_error_code = FREEZER_ERROR_GENERIC;
22273 kr = KERN_INVALID_ARGUMENT;
22274 goto done;
22275 }
22276
22277 freezer_context_global.freezer_ctx_uncompressed_pages = 0;
22278 clock_get_uptime(&c_freezer_last_yield_ts);
22279 }
22280 again:
22281
22282 for (entry2 = vm_map_first_entry(map);
22283 entry2 != vm_map_to_entry(map);
22284 entry2 = entry2->vme_next) {
22285 vm_object_t src_object;
22286
22287 if (entry2->is_sub_map) {
22288 continue;
22289 }
22290
22291 src_object = VME_OBJECT(entry2);
22292 if (!src_object ||
22293 src_object->phys_contiguous ||
22294 !src_object->internal) {
22295 continue;
22296 }
22297
22298 /* If eligible, scan the entry, moving eligible pages over to our parent object */
22299
22300 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
22301 /*
22302 * We skip purgeable objects during evaluation phase only.
22303 * If we decide to freeze this process, we'll explicitly
22304 * purge these objects before we go around again with
22305 * 'evaluation_phase' set to FALSE.
22306 */
22307
22308 if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
22309 /*
22310 * We want to purge objects that may not belong to this task but are mapped
22311 * in this task alone. Since we already purged this task's purgeable memory
22312 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
22313 * on this task's purgeable objects. Hence the check for only volatile objects.
22314 */
22315 if (evaluation_phase ||
22316 src_object->purgable != VM_PURGABLE_VOLATILE ||
22317 os_ref_get_count_raw(&src_object->ref_count) != 1) {
22318 continue;
22319 }
22320 vm_object_lock(src_object);
22321 if (src_object->purgable == VM_PURGABLE_VOLATILE &&
22322 os_ref_get_count_raw(&src_object->ref_count) == 1) {
22323 purgeable_q_t old_queue;
22324
22325 /* object should be on a purgeable queue */
22326 assert(src_object->objq.next != NULL &&
22327 src_object->objq.prev != NULL);
22328 /* move object from its volatile queue to the nonvolatile queue */
22329 old_queue = vm_purgeable_object_remove(src_object);
22330 assert(old_queue);
22331 if (src_object->purgeable_when_ripe) {
22332 /* remove a token from that volatile queue */
22333 vm_page_lock_queues();
22334 vm_purgeable_token_delete_first(old_queue);
22335 vm_page_unlock_queues();
22336 }
22337 /* purge the object */
22338 vm_object_purge(src_object, 0);
22339 }
22340 vm_object_unlock(src_object);
22341 continue;
22342 }
22343
22344 /*
22345 * Pages belonging to this object could be swapped to disk.
22346 * Make sure it's not a shared object because we could end
22347 * up just bringing it back in again.
22348 *
22349 * We try to optimize somewhat by checking for objects that are mapped
22350 * more than once within our own map. But we don't do full searches,
22351 * we just look at the entries following our current entry.
22352 */
22353
22354 if (os_ref_get_count_raw(&src_object->ref_count) > 1) {
22355 if (src_object != cur_shared_object) {
22356 obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
22357 dirty_shared_count += obj_pages_snapshot;
22358
22359 cur_shared_object = src_object;
22360 cur_shared_obj_ref_cnt = 1;
22361 continue;
22362 } else {
22363 cur_shared_obj_ref_cnt++;
22364 if (os_ref_get_count_raw(&src_object->ref_count) == cur_shared_obj_ref_cnt) {
22365 /*
22366 * Fall through to below and treat this object as private.
22367 * So deduct its pages from our shared total and add it to the
22368 * private total.
22369 */
22370
22371 dirty_shared_count -= obj_pages_snapshot;
22372 dirty_private_count += obj_pages_snapshot;
22373 } else {
22374 continue;
22375 }
22376 }
22377 }
22378
22379
22380 if (os_ref_get_count_raw(&src_object->ref_count) == 1) {
22381 dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
22382 }
22383
22384 if (evaluation_phase == TRUE) {
22385 continue;
22386 }
22387 }
22388
22389 uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
22390 *wired_count += src_object->wired_page_count;
22391
22392 if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
22393 if (vm_compressor_low_on_space()) {
22394 *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
22395 }
22396
22397 if (vm_swap_low_on_space()) {
22398 *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
22399 }
22400
22401 kr = KERN_NO_SPACE;
22402 break;
22403 }
22404 if (paged_out_count >= dirty_budget) {
22405 break;
22406 }
22407 dirty_budget -= paged_out_count;
22408 }
22409
22410 *shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
22411 if (evaluation_phase) {
22412 unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;
22413
22414 if (dirty_shared_count > shared_pages_threshold) {
22415 *freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
22416 kr = KERN_FAILURE;
22417 goto done;
22418 }
22419
22420 if (dirty_shared_count &&
22421 ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
22422 *freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
22423 kr = KERN_FAILURE;
22424 goto done;
22425 }
22426
22427 evaluation_phase = FALSE;
22428 dirty_shared_count = dirty_private_count = 0;
22429
22430 freezer_context_global.freezer_ctx_uncompressed_pages = 0;
22431 clock_get_uptime(&c_freezer_last_yield_ts);
22432
22433 if (eval_only) {
22434 kr = KERN_SUCCESS;
22435 goto done;
22436 }
22437
22438 vm_purgeable_purge_task_owned(task);
22439
22440 goto again;
22441 } else {
22442 kr = KERN_SUCCESS;
22443 }
22444
22445 done:
22446 vm_map_unlock(map);
22447
22448 if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
22449 vm_object_compressed_freezer_done();
22450 }
22451 return kr;
22452 }
22453
22454 #endif
22455
22456 /*
22457 * vm_map_entry_should_cow_for_true_share:
22458 *
22459 * Determines if the map entry should be clipped and setup for copy-on-write
22460 * to avoid applying "true_share" to a large VM object when only a subset is
22461 * targeted.
22462 *
22463 * For now, we target only the map entries created for the Objective C
22464 * Garbage Collector, which initially have the following properties:
22465 * - alias == VM_MEMORY_MALLOC
22466 * - wired_count == 0
22467 * - !needs_copy
22468 * and a VM object with:
22469 * - internal
22470 * - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
22471 * - !true_share
22472 * - vo_size == ANON_CHUNK_SIZE
22473 *
22474 * Only non-kernel map entries.
22475 */
22476 boolean_t
22477 vm_map_entry_should_cow_for_true_share(
22478 vm_map_entry_t entry)
22479 {
22480 vm_object_t object;
22481
22482 if (entry->is_sub_map) {
22483 /* entry does not point at a VM object */
22484 return FALSE;
22485 }
22486
22487 if (entry->needs_copy) {
22488 /* already set for copy_on_write: done! */
22489 return FALSE;
22490 }
22491
22492 if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
22493 VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
22494 /* not a malloc heap or Obj-C Garbage Collector heap */
22495 return FALSE;
22496 }
22497
22498 if (entry->wired_count) {
22499 /* wired: can't change the map entry... */
22500 vm_counters.should_cow_but_wired++;
22501 return FALSE;
22502 }
22503
22504 object = VME_OBJECT(entry);
22505
22506 if (object == VM_OBJECT_NULL) {
22507 /* no object yet... */
22508 return FALSE;
22509 }
22510
22511 if (!object->internal) {
22512 /* not an internal object */
22513 return FALSE;
22514 }
22515
22516 if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
22517 /* not the default copy strategy */
22518 return FALSE;
22519 }
22520
22521 if (object->true_share) {
22522 /* already true_share: too late to avoid it */
22523 return FALSE;
22524 }
22525
22526 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
22527 object->vo_size != ANON_CHUNK_SIZE) {
22528 /* ... not an object created for the ObjC Garbage Collector */
22529 return FALSE;
22530 }
22531
22532 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
22533 object->vo_size != 2048 * 4096) {
22534 /* ... not a "MALLOC_SMALL" heap */
22535 return FALSE;
22536 }
22537
22538 /*
22539 * All the criteria match: we have a large object being targeted for "true_share".
22540 * To limit the adverse side-effects linked with "true_share", tell the caller to
22541 * try and avoid setting up the entire object for "true_share" by clipping the
22542 * targeted range and setting it up for copy-on-write.
22543 */
22544 return TRUE;
22545 }
22546
22547 uint64_t vm_map_range_overflows_count = 0;
22548 TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE);
22549 bool
22550 vm_map_range_overflows(
22551 vm_map_t map,
22552 vm_map_offset_t addr,
22553 vm_map_size_t size)
22554 {
22555 vm_map_offset_t start, end, sum;
22556 vm_map_offset_t pgmask;
22557
22558 if (size == 0) {
22559 /* empty range -> no overflow */
22560 return false;
22561 }
22562 pgmask = vm_map_page_mask(map);
22563 start = vm_map_trunc_page_mask(addr, pgmask);
22564 end = vm_map_round_page_mask(addr + size, pgmask);
22565 if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) {
22566 vm_map_range_overflows_count++;
22567 if (vm_map_range_overflows_log) {
22568 printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n",
22569 proc_selfpid(),
22570 proc_best_name(current_proc()),
22571 (uint64_t)addr,
22572 (uint64_t)size,
22573 (uint64_t)pgmask);
22574 }
22575 DTRACE_VM4(vm_map_range_overflows,
22576 vm_map_t, map,
22577 uint32_t, pgmask,
22578 uint64_t, (uint64_t)addr,
22579 uint64_t, (uint64_t)size);
22580 return true;
22581 }
22582 return false;
22583 }
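/*
 * Editorial example (hedged, not from the original source): with 16KB map
 * pages (pgmask 0x3fff), a request such as addr == 0xfffffffffffff000 and
 * size == 0x2000 makes "addr + size" wrap past zero, so os_add_overflow()
 * fires and the range is rejected. A size of 0 is never treated as an
 * overflow, per the early return above.
 */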
22584
22585 vm_map_offset_t
22586 vm_map_round_page_mask(
22587 vm_map_offset_t offset,
22588 vm_map_offset_t mask)
22589 {
22590 return VM_MAP_ROUND_PAGE(offset, mask);
22591 }
22592
22593 vm_map_offset_t
22594 vm_map_trunc_page_mask(
22595 vm_map_offset_t offset,
22596 vm_map_offset_t mask)
22597 {
22598 return VM_MAP_TRUNC_PAGE(offset, mask);
22599 }
22600
22601 boolean_t
22602 vm_map_page_aligned(
22603 vm_map_offset_t offset,
22604 vm_map_offset_t mask)
22605 {
22606 return ((offset) & mask) == 0;
22607 }
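/*
 * Editorial worked example (not part of the original source): with a 16KB
 * page mask (0x3fff),
 *   vm_map_trunc_page_mask(0x5003, 0x3fff) == 0x4000
 *   vm_map_round_page_mask(0x5003, 0x3fff) == 0x8000
 *   vm_map_page_aligned(0x8000, 0x3fff)    == TRUE
 */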
22608
22609 int
22610 vm_map_page_shift(
22611 vm_map_t map)
22612 {
22613 return VM_MAP_PAGE_SHIFT(map);
22614 }
22615
22616 int
22617 vm_map_page_size(
22618 vm_map_t map)
22619 {
22620 return VM_MAP_PAGE_SIZE(map);
22621 }
22622
22623 vm_map_offset_t
22624 vm_map_page_mask(
22625 vm_map_t map)
22626 {
22627 return VM_MAP_PAGE_MASK(map);
22628 }
22629
22630 kern_return_t
22631 vm_map_set_page_shift(
22632 vm_map_t map,
22633 int pageshift)
22634 {
22635 if (map->hdr.nentries != 0) {
22636 /* too late to change page size */
22637 return KERN_FAILURE;
22638 }
22639
22640 map->hdr.page_shift = (uint16_t)pageshift;
22641
22642 return KERN_SUCCESS;
22643 }
22644
22645 kern_return_t
22646 vm_map_query_volatile(
22647 vm_map_t map,
22648 mach_vm_size_t *volatile_virtual_size_p,
22649 mach_vm_size_t *volatile_resident_size_p,
22650 mach_vm_size_t *volatile_compressed_size_p,
22651 mach_vm_size_t *volatile_pmap_size_p,
22652 mach_vm_size_t *volatile_compressed_pmap_size_p)
22653 {
22654 mach_vm_size_t volatile_virtual_size;
22655 mach_vm_size_t volatile_resident_count;
22656 mach_vm_size_t volatile_compressed_count;
22657 mach_vm_size_t volatile_pmap_count;
22658 mach_vm_size_t volatile_compressed_pmap_count;
22659 mach_vm_size_t resident_count;
22660 vm_map_entry_t entry;
22661 vm_object_t object;
22662
22663 /* map should be locked by caller */
22664
22665 volatile_virtual_size = 0;
22666 volatile_resident_count = 0;
22667 volatile_compressed_count = 0;
22668 volatile_pmap_count = 0;
22669 volatile_compressed_pmap_count = 0;
22670
22671 for (entry = vm_map_first_entry(map);
22672 entry != vm_map_to_entry(map);
22673 entry = entry->vme_next) {
22674 mach_vm_size_t pmap_resident_bytes, pmap_compressed_bytes;
22675
22676 if (entry->is_sub_map) {
22677 continue;
22678 }
22679 if (!(entry->protection & VM_PROT_WRITE)) {
22680 continue;
22681 }
22682 object = VME_OBJECT(entry);
22683 if (object == VM_OBJECT_NULL) {
22684 continue;
22685 }
22686 if (object->purgable != VM_PURGABLE_VOLATILE &&
22687 object->purgable != VM_PURGABLE_EMPTY) {
22688 continue;
22689 }
22690 if (VME_OFFSET(entry)) {
22691 /*
22692 * If the map entry has been split and the object now
22693 * appears several times in the VM map, we don't want
22694 * to count the object's resident_page_count more than
22695 * once. We count it only for the first one, starting
22696 * at offset 0 and ignore the other VM map entries.
22697 */
22698 continue;
22699 }
22700 resident_count = object->resident_page_count;
22701 if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
22702 resident_count = 0;
22703 } else {
22704 resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
22705 }
22706
22707 volatile_virtual_size += entry->vme_end - entry->vme_start;
22708 volatile_resident_count += resident_count;
22709 if (object->pager) {
22710 volatile_compressed_count +=
22711 vm_compressor_pager_get_count(object->pager);
22712 }
22713 pmap_compressed_bytes = 0;
22714 pmap_resident_bytes =
22715 pmap_query_resident(map->pmap,
22716 entry->vme_start,
22717 entry->vme_end,
22718 &pmap_compressed_bytes);
22719 volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
22720 volatile_compressed_pmap_count += (pmap_compressed_bytes
22721 / PAGE_SIZE);
22722 }
22723
22724 /* map is still locked on return */
22725
22726 *volatile_virtual_size_p = volatile_virtual_size;
22727 *volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
22728 *volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
22729 *volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
22730 *volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;
22731
22732 return KERN_SUCCESS;
22733 }
22734
22735 void
22736 vm_map_sizes(vm_map_t map,
22737 vm_map_size_t * psize,
22738 vm_map_size_t * pfree,
22739 vm_map_size_t * plargest_free)
22740 {
22741 vm_map_entry_t entry;
22742 vm_map_offset_t prev;
22743 vm_map_size_t free, total_free, largest_free;
22744 boolean_t end;
22745
22746 if (!map) {
22747 *psize = *pfree = *plargest_free = 0;
22748 return;
22749 }
22750 total_free = largest_free = 0;
22751
22752 vm_map_lock_read(map);
22753 if (psize) {
22754 *psize = map->max_offset - map->min_offset;
22755 }
22756
22757 prev = map->min_offset;
22758 for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
22759 end = (entry == vm_map_to_entry(map));
22760
22761 if (end) {
22762 free = entry->vme_end - prev;
22763 } else {
22764 free = entry->vme_start - prev;
22765 }
22766
22767 total_free += free;
22768 if (free > largest_free) {
22769 largest_free = free;
22770 }
22771
22772 if (end) {
22773 break;
22774 }
22775 prev = entry->vme_end;
22776 }
22777 vm_map_unlock_read(map);
22778 if (pfree) {
22779 *pfree = total_free;
22780 }
22781 if (plargest_free) {
22782 *plargest_free = largest_free;
22783 }
22784 }
22785
22786 #if VM_SCAN_FOR_SHADOW_CHAIN
22787 int
22788 vm_map_shadow_max(
22789 vm_map_t map)
22790 {
22791 int shadows, shadows_max;
22792 vm_map_entry_t entry;
22793 vm_object_t object, next_object;
22794
22795 if (map == NULL) {
22796 return 0;
22797 }
22798
22799 shadows_max = 0;
22800
22801 vm_map_lock_read(map);
22802
22803 for (entry = vm_map_first_entry(map);
22804 entry != vm_map_to_entry(map);
22805 entry = entry->vme_next) {
22806 if (entry->is_sub_map) {
22807 continue;
22808 }
22809 object = VME_OBJECT(entry);
22810 if (object == NULL) {
22811 continue;
22812 }
22813 vm_object_lock_shared(object);
22814 for (shadows = 0;
22815 object->shadow != NULL;
22816 shadows++, object = next_object) {
22817 next_object = object->shadow;
22818 vm_object_lock_shared(next_object);
22819 vm_object_unlock(object);
22820 }
22821 vm_object_unlock(object);
22822 if (shadows > shadows_max) {
22823 shadows_max = shadows;
22824 }
22825 }
22826
22827 vm_map_unlock_read(map);
22828
22829 return shadows_max;
22830 }
22831 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
22832
22833 void
22834 vm_commit_pagezero_status(vm_map_t lmap)
22835 {
22836 pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
22837 }
22838
22839 #if __x86_64__
22840 void
22841 vm_map_set_high_start(
22842 vm_map_t map,
22843 vm_map_offset_t high_start)
22844 {
22845 map->vmmap_high_start = high_start;
22846 }
22847 #endif /* __x86_64__ */
22848
22849 #if CODE_SIGNING_MONITOR
22850
22851 kern_return_t
22852 vm_map_entry_cs_associate(
22853 vm_map_t map,
22854 vm_map_entry_t entry,
22855 vm_map_kernel_flags_t vmk_flags)
22856 {
22857 vm_object_t cs_object, cs_shadow, backing_object;
22858 vm_object_offset_t cs_offset, backing_offset;
22859 void *cs_blobs;
22860 struct vnode *cs_vnode;
22861 kern_return_t cs_ret;
22862
22863 if (map->pmap == NULL ||
22864 entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
22865 (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
22866 VME_OBJECT(entry) == VM_OBJECT_NULL) {
22867 return KERN_SUCCESS;
22868 }
22869
22870 if (!(entry->protection & VM_PROT_EXECUTE)) {
22871 /*
22872 * This memory region is not executable, so the code-signing
22873 * monitor would usually not care about it...
22874 */
22875 if (vmk_flags.vmkf_remap_prot_copy &&
22876 (entry->max_protection & VM_PROT_EXECUTE)) {
22877 /*
22878 * ... except if the memory region is being remapped
22879 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
22880 * which is what a debugger or dtrace would be doing
22881 * to prepare to modify an executable page to insert
22882 * a breakpoint or activate a probe.
22883 * In that case, fall through so that we can mark
22884 * this region as being "debugged" and no longer
22885 * strictly code-signed.
22886 */
22887 } else {
22888 /*
22889 * Really not executable, so no need to tell the
22890 * code-signing monitor.
22891 */
22892 return KERN_SUCCESS;
22893 }
22894 }
22895
22896 vm_map_lock_assert_exclusive(map);
22897
22898 /*
22899 * Check for a debug association mapping before we check for used_for_jit. This
22900 * allows non-RWX JIT on macOS systems to masquerade their mappings as USER_DEBUG
22901 * pages instead of USER_JIT. These non-RWX JIT pages cannot be marked as USER_JIT
22902 * since they are mapped with RW or RX permissions, which the page table monitor
22903 * denies on USER_JIT pages. Given that, if they're not mapped as USER_DEBUG,
22904 * they will be mapped as USER_EXEC, and that will cause another page table monitor
22905 * violation when those USER_EXEC pages are mapped as RW.
22906 *
22907 * Since these pages switch between RW and RX through mprotect, they mimic what
22908 * we expect a debugger to do. As the code signing monitor does not enforce mappings
22909 * on macOS systems, this works in our favor here and allows us to continue to
22910 * support these legacy-programmed applications without sacrificing security on
22911 * the page table or the code signing monitor. We don't need to explicitly check
22912 * for entry_for_jit here and the mapping permissions. If the initial mapping is
22913 * created with RX, then the application must map it as RW in order to first write
22914 * to the page (MAP_JIT mappings must be private and anonymous). The switch to
22915 * RX will cause vm_map_protect to mark the entry as vmkf_remap_prot_copy.
22916 * Similarly, if the mapping was created as RW, and then switched to RX,
22917 * vm_map_protect will again mark the entry as a copy, and both these cases
22918 * lead to this if-statement being entered.
22919 *
22920 * For more information: rdar://115313336.
22921 */
22922 if (vmk_flags.vmkf_remap_prot_copy) {
22923 cs_ret = csm_associate_debug_region(
22924 map->pmap,
22925 entry->vme_start,
22926 entry->vme_end - entry->vme_start);
22927
22928 /*
22929 * csm_associate_debug_region returns not supported when the code signing
22930 * monitor is disabled. This is intentional, since cs_ret is checked towards
22931 * the end of the function, and if it is not supported, then we still want the
22932 * VM to perform code-signing enforcement on this entry. That said, if we don't
22933 * mark this as a xnu_user_debug page when the code-signing monitor is disabled,
22934 * then it never gets retyped to XNU_USER_DEBUG frame type, which then causes
22935 * an issue with debugging (since it'll be mapped in as XNU_USER_EXEC in some
22936 * cases, which will cause a violation when attempted to be mapped as writable).
22937 */
22938 if ((cs_ret == KERN_SUCCESS) || (cs_ret == KERN_NOT_SUPPORTED)) {
22939 entry->vme_xnu_user_debug = TRUE;
22940 }
22941 #if DEVELOPMENT || DEBUG
22942 if (vm_log_xnu_user_debug) {
22943 printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug=%d cs_ret %d\n",
22944 proc_selfpid(),
22945 (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
22946 __FUNCTION__, __LINE__,
22947 map, entry,
22948 (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
22949 entry->vme_xnu_user_debug,
22950 cs_ret);
22951 }
22952 #endif /* DEVELOPMENT || DEBUG */
22953 goto done;
22954 }
22955
22956 if (entry->used_for_jit) {
22957 cs_ret = csm_associate_jit_region(
22958 map->pmap,
22959 entry->vme_start,
22960 entry->vme_end - entry->vme_start);
22961 goto done;
22962 }
22963
22964 cs_object = VME_OBJECT(entry);
22965 vm_object_lock_shared(cs_object);
22966 cs_offset = VME_OFFSET(entry);
22967
22968 /* find the VM object backed by the code-signed vnode */
22969 for (;;) {
22970 /* go to the bottom of cs_object's shadow chain */
22971 for (;
22972 cs_object->shadow != VM_OBJECT_NULL;
22973 cs_object = cs_shadow) {
22974 cs_shadow = cs_object->shadow;
22975 cs_offset += cs_object->vo_shadow_offset;
22976 vm_object_lock_shared(cs_shadow);
22977 vm_object_unlock(cs_object);
22978 }
22979 if (cs_object->internal ||
22980 cs_object->pager == MEMORY_OBJECT_NULL) {
22981 vm_object_unlock(cs_object);
22982 return KERN_SUCCESS;
22983 }
22984
22985 cs_offset += cs_object->paging_offset;
22986
22987 /*
22988 * cs_object could be backed by a:
22989 * vnode_pager
22990 * apple_protect_pager
22991 * shared_region_pager
22992 * fourk_pager (multiple backing objects -> fail?)
22993 * ask the pager if it has a backing VM object
22994 */
22995 if (!memory_object_backing_object(cs_object->pager,
22996 cs_offset,
22997 &backing_object,
22998 &backing_offset)) {
22999 /* no backing object: cs_object is it */
23000 break;
23001 }
23002
23003 /* look down the backing object's shadow chain */
23004 vm_object_lock_shared(backing_object);
23005 vm_object_unlock(cs_object);
23006 cs_object = backing_object;
23007 cs_offset = backing_offset;
23008 }
23009
23010 cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
23011 if (cs_vnode == NULL) {
23012 /* no vnode, no code signatures to associate */
23013 cs_ret = KERN_SUCCESS;
23014 } else {
23015 cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
23016 &cs_blobs);
23017 assert(cs_ret == KERN_SUCCESS);
23018 cs_ret = cs_associate_blob_with_mapping(map->pmap,
23019 entry->vme_start,
23020 (entry->vme_end - entry->vme_start),
23021 cs_offset,
23022 cs_blobs);
23023 }
23024 vm_object_unlock(cs_object);
23025 cs_object = VM_OBJECT_NULL;
23026
23027 done:
23028 if (cs_ret == KERN_SUCCESS) {
23029 DTRACE_VM2(vm_map_entry_cs_associate_success,
23030 vm_map_offset_t, entry->vme_start,
23031 vm_map_offset_t, entry->vme_end);
23032 if (vm_map_executable_immutable) {
23033 /*
23034 * Prevent this executable
23035 * mapping from being unmapped
23036 * or modified.
23037 */
23038 entry->vme_permanent = TRUE;
23039 }
23040 /*
23041 * pmap says it will validate the
23042 * code-signing validity of pages
23043 * faulted in via this mapping, so
23044 * this map entry should be marked so
23045 * that vm_fault() bypasses code-signing
23046 * validation for faults coming through
23047 * this mapping.
23048 */
23049 entry->csm_associated = TRUE;
23050 } else if (cs_ret == KERN_NOT_SUPPORTED) {
23051 /*
23052 * pmap won't check the code-signing
23053 * validity of pages faulted in via
23054 * this mapping, so VM should keep
23055 * doing it.
23056 */
23057 DTRACE_VM3(vm_map_entry_cs_associate_off,
23058 vm_map_offset_t, entry->vme_start,
23059 vm_map_offset_t, entry->vme_end,
23060 int, cs_ret);
23061 } else {
23062 /*
23063 * A real error: do not allow
23064 * execution in this mapping.
23065 */
23066 DTRACE_VM3(vm_map_entry_cs_associate_failure,
23067 vm_map_offset_t, entry->vme_start,
23068 vm_map_offset_t, entry->vme_end,
23069 int, cs_ret);
23070 if (vmk_flags.vmkf_overwrite_immutable) {
23071 /*
23072 * We can get here when we remap an apple_protect pager
23073 * on top of an already cs_associated executable mapping
23074 * with the same code signatures, so we don't want to
23075 * lose VM_PROT_EXECUTE in that case...
23076 */
23077 } else {
23078 entry->protection &= ~VM_PROT_ALLEXEC;
23079 entry->max_protection &= ~VM_PROT_ALLEXEC;
23080 }
23081 }
23082
23083 return cs_ret;
23084 }
23085
23086 #endif /* CODE_SIGNING_MONITOR */
23087
23088 inline bool
23089 vm_map_is_corpse_source(vm_map_t map)
23090 {
23091 bool status = false;
23092 if (map) {
23093 vm_map_lock_read(map);
23094 status = map->corpse_source;
23095 vm_map_unlock_read(map);
23096 }
23097 return status;
23098 }
23099
23100 inline void
23101 vm_map_set_corpse_source(vm_map_t map)
23102 {
23103 if (map) {
23104 vm_map_lock(map);
23105 map->corpse_source = true;
23106 vm_map_unlock(map);
23107 }
23108 }
23109
23110 inline void
23111 vm_map_unset_corpse_source(vm_map_t map)
23112 {
23113 if (map) {
23114 vm_map_lock(map);
23115 map->corpse_source = false;
23116 vm_map_unlock(map);
23117 }
23118 }
23119 /*
23120 * FORKED CORPSE FOOTPRINT
23121 *
23122 * A forked corpse gets a copy of the original VM map but its pmap is mostly
23123 * empty since it never ran and never got to fault in any pages.
23124 * Collecting footprint info (via "sysctl vm.self_region_footprint") for
23125 * a forked corpse would therefore return very little information.
23126 *
23127 * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
23128 * to vm_map_fork() to collect footprint information from the original VM map
23129 * and its pmap, and store it in the forked corpse's VM map. That information
23130 * is stored in place of the VM map's "hole list" since we'll never need to
23131 * lookup for holes in the corpse's map.
23132 *
23133 * The corpse's footprint info looks like this:
23134 *
23135 * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
23136 * as follows:
23137 * +---------------------------------------+
23138 * header-> | cf_size |
23139 * +-------------------+-------------------+
23140 * | cf_last_region | cf_last_zeroes |
23141 * +-------------------+-------------------+
23142 * region1-> | cfr_vaddr |
23143 * +-------------------+-------------------+
23144 * | cfr_num_pages | d0 | d1 | d2 | d3 |
23145 * +---------------------------------------+
23146 * | d4 | d5 | ... |
23147 * +---------------------------------------+
23148 * | ... |
23149 * +-------------------+-------------------+
23150 * | dy | dz | na | na | cfr_vaddr... | <-region2
23151 * +-------------------+-------------------+
23152 * | cfr_vaddr (ctd) | cfr_num_pages |
23153 * +---------------------------------------+
23154 * | d0 | d1 ... |
23155 * +---------------------------------------+
23156 * ...
23157 * +---------------------------------------+
23158 * last region-> | cfr_vaddr |
23159 * +---------------------------------------+
23160 * + cfr_num_pages | d0 | d1 | d2 | d3 |
23161 * +---------------------------------------+
23162 * ...
23163 * +---------------------------------------+
23164 * | dx | dy | dz | na | na | na | na | na |
23165 * +---------------------------------------+
23166 *
23167 * where:
23168 * cf_size: total size of the buffer (rounded to page size)
23169 * cf_last_region: offset in the buffer of the last "region" sub-header
23170 * cf_last_zeroes: number of trailing "zero" dispositions at the end
23171 * of last region
23172 * cfr_vaddr: virtual address of the start of the covered "region"
23173 * cfr_num_pages: number of pages in the covered "region"
23174 * d*: disposition of the page at that virtual address
23175 * Regions in the buffer are word-aligned.
23176 *
23177 * We estimate the size of the buffer based on the number of memory regions
23178 * and the virtual size of the address space. While copying each memory region
23179 * during vm_map_fork(), we also collect the footprint info for that region
23180 * and store it in the buffer, packing it as much as possible (coalescing
23181 * contiguous memory regions to avoid having too many region headers and
23182 * avoiding long streaks of "zero" page dispositions by splitting footprint
23183 * "regions", so the number of regions in the footprint buffer might not match
23184 * the number of memory regions in the address space.
23185 *
23186 * We also have to copy the original task's "nonvolatile" ledgers since that's
23187 * part of the footprint and will need to be reported to any tool asking for
23188 * the footprint information of the forked corpse.
23189 */
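/*
 * Editorial sketch (hedged, not from the original source): once the buffer is
 * built, the disposition of a page at address "va" covered by a region header
 * "cfr" can be found with simple index arithmetic:
 *
 *   idx  = (va - cfr->cfr_vaddr) / effective_page_size;
 *   disp = cfr->cfr_disposition[idx];      valid when idx < cfr->cfr_num_pages
 *
 * where effective_page_size is the map's effective page size; this is the
 * layout the collection code below packs into the buffer.
 */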
23190
23191 uint64_t vm_map_corpse_footprint_count = 0;
23192 uint64_t vm_map_corpse_footprint_size_avg = 0;
23193 uint64_t vm_map_corpse_footprint_size_max = 0;
23194 uint64_t vm_map_corpse_footprint_full = 0;
23195 uint64_t vm_map_corpse_footprint_no_buf = 0;
23196
23197 struct vm_map_corpse_footprint_header {
23198 vm_size_t cf_size; /* allocated buffer size */
23199 uint32_t cf_last_region; /* offset of last region in buffer */
23200 union {
23201 uint32_t cfu_last_zeroes; /* during creation:
23202 * number of "zero" dispositions at
23203 * end of last region */
23204 uint32_t cfu_hint_region; /* during lookup:
23205 * offset of last looked up region */
23206 #define cf_last_zeroes cfu.cfu_last_zeroes
23207 #define cf_hint_region cfu.cfu_hint_region
23208 } cfu;
23209 };
23210 typedef uint8_t cf_disp_t;
23211 struct vm_map_corpse_footprint_region {
23212 vm_map_offset_t cfr_vaddr; /* region start virtual address */
23213 uint32_t cfr_num_pages; /* number of pages in this "region" */
23214 cf_disp_t cfr_disposition[0]; /* disposition of each page */
23215 } __attribute__((packed));
23216
23217 static cf_disp_t
23218 vm_page_disposition_to_cf_disp(
23219 int disposition)
23220 {
23221 assert(sizeof(cf_disp_t) == 1);
23222 /* relocate bits that don't fit in a "uint8_t" */
23223 if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
23224 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
23225 }
23226 /* cast gets rid of extra bits */
23227 return (cf_disp_t) disposition;
23228 }
23229
23230 static int
23231 vm_page_cf_disp_to_disposition(
23232 cf_disp_t cf_disp)
23233 {
23234 int disposition;
23235
23236 assert(sizeof(cf_disp_t) == 1);
23237 disposition = (int) cf_disp;
23238 /* move relocated bits back in place */
23239 if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
23240 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
23241 disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
23242 }
23243 return disposition;
23244 }
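/*
 * Editorial note (hedged): VM_PAGE_QUERY_PAGE_REUSABLE does not fit in the
 * 8-bit cf_disp_t, so it is parked in the VM_PAGE_QUERY_PAGE_FICTITIOUS bit,
 * presumably because fictitious pages are not expected to appear in corpse
 * footprints. The two helpers are meant to round-trip, e.g.:
 *
 *   vm_page_cf_disp_to_disposition(
 *       vm_page_disposition_to_cf_disp(VM_PAGE_QUERY_PAGE_REUSABLE))
 *           == VM_PAGE_QUERY_PAGE_REUSABLE
 */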
23245
23246 static kmem_guard_t
23247 vm_map_corpse_footprint_guard(vm_map_t map)
23248 {
23249 return (kmem_guard_t){
23250 .kmg_atomic = true,
23251 .kmg_tag = VM_KERN_MEMORY_DIAG,
23252 .kmg_context = os_hash_kernel_pointer(&map->vmmap_corpse_footprint),
23253 };
23254 }
23255
23256 /*
23257 * vm_map_corpse_footprint_new_region:
23258 * closes the current footprint "region" and creates a new one
23259 *
23260 * Returns NULL if there's not enough space in the buffer for a new region.
23261 */
23262 static struct vm_map_corpse_footprint_region *
23263 vm_map_corpse_footprint_new_region(
23264 struct vm_map_corpse_footprint_header *footprint_header)
23265 {
23266 uintptr_t footprint_edge;
23267 uint32_t new_region_offset;
23268 struct vm_map_corpse_footprint_region *footprint_region;
23269 struct vm_map_corpse_footprint_region *new_footprint_region;
23270
23271 footprint_edge = ((uintptr_t)footprint_header +
23272 footprint_header->cf_size);
23273 footprint_region = ((struct vm_map_corpse_footprint_region *)
23274 ((char *)footprint_header +
23275 footprint_header->cf_last_region));
23276 assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
23277 footprint_edge);
23278
23279 /* get rid of trailing zeroes in the last region */
23280 assert(footprint_region->cfr_num_pages >=
23281 footprint_header->cf_last_zeroes);
23282 footprint_region->cfr_num_pages -=
23283 footprint_header->cf_last_zeroes;
23284 footprint_header->cf_last_zeroes = 0;
23285
23286 /* reuse this region if it's now empty */
23287 if (footprint_region->cfr_num_pages == 0) {
23288 return footprint_region;
23289 }
23290
23291 /* compute offset of new region */
23292 new_region_offset = footprint_header->cf_last_region;
23293 new_region_offset += sizeof(*footprint_region);
23294 new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
23295 new_region_offset = roundup(new_region_offset, sizeof(int));
23296
23297 /* check if we're going over the edge */
23298 if (((uintptr_t)footprint_header +
23299 new_region_offset +
23300 sizeof(*footprint_region)) >=
23301 footprint_edge) {
23302 /* over the edge: no new region */
23303 return NULL;
23304 }
23305
23306 /* adjust offset of last region in header */
23307 footprint_header->cf_last_region = new_region_offset;
23308
23309 new_footprint_region = (struct vm_map_corpse_footprint_region *)
23310 ((char *)footprint_header +
23311 footprint_header->cf_last_region);
23312 new_footprint_region->cfr_vaddr = 0;
23313 new_footprint_region->cfr_num_pages = 0;
23314 /* caller needs to initialize new region */
23315
23316 return new_footprint_region;
23317 }
23318
23319 /*
23320 * vm_map_corpse_footprint_collect:
23321 * collect footprint information for "old_entry" in "old_map" and
23322 * stores it in "new_map"'s vmmap_footprint_info.
23323 */
23324 kern_return_t
23325 vm_map_corpse_footprint_collect(
23326 vm_map_t old_map,
23327 vm_map_entry_t old_entry,
23328 vm_map_t new_map)
23329 {
23330 vm_map_offset_t va;
23331 kmem_return_t kmr;
23332 struct vm_map_corpse_footprint_header *footprint_header;
23333 struct vm_map_corpse_footprint_region *footprint_region;
23334 struct vm_map_corpse_footprint_region *new_footprint_region;
23335 cf_disp_t *next_disp_p;
23336 uintptr_t footprint_edge;
23337 uint32_t num_pages_tmp;
23338 int effective_page_size;
23339
23340 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
23341
23342 va = old_entry->vme_start;
23343
23344 vm_map_lock_assert_exclusive(old_map);
23345 vm_map_lock_assert_exclusive(new_map);
23346
23347 assert(new_map->has_corpse_footprint);
23348 assert(!old_map->has_corpse_footprint);
23349 if (!new_map->has_corpse_footprint ||
23350 old_map->has_corpse_footprint) {
23351 /*
23352 * This can only transfer footprint info from a
23353 * map with a live pmap to a map with a corpse footprint.
23354 */
23355 return KERN_NOT_SUPPORTED;
23356 }
23357
23358 if (new_map->vmmap_corpse_footprint == NULL) {
23359 vm_size_t buf_size;
23360
23361 buf_size = (sizeof(*footprint_header) +
23362 (old_map->hdr.nentries
23363 *
23364 (sizeof(*footprint_region) +
23365 +3)) /* potential alignment for each region */
23366 +
23367 ((old_map->size / effective_page_size)
23368 *
23369 sizeof(cf_disp_t))); /* disposition for each page */
23370 // printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
23371 buf_size = round_page(buf_size);
23372
23373 /* limit buffer to 1 page to validate overflow detection */
23374 // buf_size = PAGE_SIZE;
23375
23376 /* limit size to a somewhat sane amount */
23377 #if XNU_TARGET_OS_OSX
23378 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (8*1024*1024) /* 8MB */
23379 #else /* XNU_TARGET_OS_OSX */
23380 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (256*1024) /* 256KB */
23381 #endif /* XNU_TARGET_OS_OSX */
23382 if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
23383 buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
23384 }
23385 kmem_guard_t guard = vm_map_corpse_footprint_guard(new_map);
23386 kmr = kmem_alloc_guard(kernel_map, buf_size + PAGE_SIZE, 0,
23387 KMA_DATA | KMA_GUARD_LAST | KMA_KOBJECT | KMA_ZERO,
23388 guard);
23389 if (kmr.kmr_return != KERN_SUCCESS) {
23390 vm_map_corpse_footprint_no_buf++;
23391 return kmr.kmr_return;
23392 }
23393
23394 /* initialize header and 1st region */
23395 footprint_header = (struct vm_map_corpse_footprint_header *)kmr.kmr_ptr;
23396 assert3p(footprint_header, !=, NULL);
23397 new_map->vmmap_corpse_footprint = footprint_header;
23398
23399 footprint_header->cf_size = buf_size;
23400 footprint_header->cf_last_region =
23401 sizeof(*footprint_header);
23402 footprint_header->cf_last_zeroes = 0;
23403
23404 footprint_region = (struct vm_map_corpse_footprint_region *)
23405 ((char *)footprint_header +
23406 footprint_header->cf_last_region);
23407 footprint_region->cfr_vaddr = 0;
23408 footprint_region->cfr_num_pages = 0;
23409 } else {
23410 /* retrieve header and last region */
23411 footprint_header = (struct vm_map_corpse_footprint_header *)
23412 new_map->vmmap_corpse_footprint;
23413 footprint_region = (struct vm_map_corpse_footprint_region *)
23414 ((char *)footprint_header +
23415 footprint_header->cf_last_region);
23416 }
23417 footprint_edge = ((uintptr_t)footprint_header +
23418 footprint_header->cf_size);
23419
23420 if ((footprint_region->cfr_vaddr +
23421 (((vm_map_offset_t)footprint_region->cfr_num_pages) *
23422 effective_page_size))
23423 != old_entry->vme_start) {
23424 uint64_t num_pages_delta, num_pages_delta_size;
23425 uint32_t region_offset_delta_size;
23426
23427 /*
23428 * Not the next contiguous virtual address:
23429 * start a new region or store "zero" dispositions for
23430 * the missing pages?
23431 */
23432 /* size of gap in actual page dispositions */
23433 num_pages_delta = ((old_entry->vme_start -
23434 footprint_region->cfr_vaddr) / effective_page_size)
23435 - footprint_region->cfr_num_pages;
23436 num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
23437 /* size of gap as a new footprint region header */
23438 region_offset_delta_size =
23439 (sizeof(*footprint_region) +
23440 roundup(((footprint_region->cfr_num_pages -
23441 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
23442 sizeof(int)) -
23443 ((footprint_region->cfr_num_pages -
23444 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
23445 // printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
23446 if (region_offset_delta_size < num_pages_delta_size ||
23447 os_add3_overflow(footprint_region->cfr_num_pages,
23448 (uint32_t) num_pages_delta,
23449 1,
23450 &num_pages_tmp)) {
23451 /*
23452 * Storing data for this gap would take more space
23453 * than inserting a new footprint region header:
23454 * let's start a new region and save space. If it's a
23455 * tie, let's avoid using a new region, since that
23456 * would require more region hops to find the right
23457 * range during lookups.
23458 *
23459 * If the current region's cfr_num_pages would overflow
23460 * if we added "zero" page dispositions for the gap,
23461 * no choice but to start a new region.
23462 */
23463 // printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
23464 new_footprint_region =
23465 vm_map_corpse_footprint_new_region(footprint_header);
23466 /* check that we're not going over the edge */
23467 if (new_footprint_region == NULL) {
23468 goto over_the_edge;
23469 }
23470 footprint_region = new_footprint_region;
23471 /* initialize new region as empty */
23472 footprint_region->cfr_vaddr = old_entry->vme_start;
23473 footprint_region->cfr_num_pages = 0;
23474 } else {
23475 /*
23476 * Store "zero" page dispositions for the missing
23477 * pages.
23478 */
23479 // printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
23480 for (; num_pages_delta > 0; num_pages_delta--) {
23481 next_disp_p = (cf_disp_t *)
23482 ((uintptr_t) footprint_region +
23483 sizeof(*footprint_region));
23484 next_disp_p += footprint_region->cfr_num_pages;
23485 /* check that we're not going over the edge */
23486 if ((uintptr_t)next_disp_p >= footprint_edge) {
23487 goto over_the_edge;
23488 }
23489 /* store "zero" disposition for this gap page */
23490 footprint_region->cfr_num_pages++;
23491 *next_disp_p = (cf_disp_t) 0;
23492 footprint_header->cf_last_zeroes++;
23493 }
23494 }
23495 }
23496
23497 for (va = old_entry->vme_start;
23498 va < old_entry->vme_end;
23499 va += effective_page_size) {
23500 int disposition;
23501 cf_disp_t cf_disp;
23502
23503 vm_map_footprint_query_page_info(old_map,
23504 old_entry,
23505 va,
23506 &disposition);
23507 cf_disp = vm_page_disposition_to_cf_disp(disposition);
23508
23509 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
23510
23511 if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
23512 /*
23513 * Ignore "zero" dispositions at start of
23514 * region: just move start of region.
23515 */
23516 footprint_region->cfr_vaddr += effective_page_size;
23517 continue;
23518 }
23519
23520 /* would region's cfr_num_pages overflow? */
23521 if (os_add_overflow(footprint_region->cfr_num_pages, 1,
23522 &num_pages_tmp)) {
23523 /* overflow: create a new region */
23524 new_footprint_region =
23525 vm_map_corpse_footprint_new_region(
23526 footprint_header);
23527 if (new_footprint_region == NULL) {
23528 goto over_the_edge;
23529 }
23530 footprint_region = new_footprint_region;
23531 footprint_region->cfr_vaddr = va;
23532 footprint_region->cfr_num_pages = 0;
23533 }
23534
23535 next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
23536 sizeof(*footprint_region));
23537 next_disp_p += footprint_region->cfr_num_pages;
23538 /* check that we're not going over the edge */
23539 if ((uintptr_t)next_disp_p >= footprint_edge) {
23540 goto over_the_edge;
23541 }
23542 /* store this disposition */
23543 *next_disp_p = cf_disp;
23544 footprint_region->cfr_num_pages++;
23545
23546 if (cf_disp != 0) {
23547 /* non-zero disp: break the current zero streak */
23548 footprint_header->cf_last_zeroes = 0;
23549 /* done */
23550 continue;
23551 }
23552
23553 /* zero disp: add to the current streak of zeroes */
23554 footprint_header->cf_last_zeroes++;
23555 if ((footprint_header->cf_last_zeroes +
23556 roundup(((footprint_region->cfr_num_pages -
23557 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
23558 (sizeof(int) - 1),
23559 sizeof(int))) <
23560 (sizeof(*footprint_header))) {
23561 /*
23562 * There are not enough trailing "zero" dispositions
23563 * (+ the extra padding we would need for the previous
23564 * region); creating a new region would not save space
23565 * at this point, so let's keep this "zero" disposition
23566 * in this region and reconsider later.
23567 */
23568 continue;
23569 }
23570 /*
23571 * Create a new region to avoid having too many consecutive
23572 * "zero" dispositions.
23573 */
23574 new_footprint_region =
23575 vm_map_corpse_footprint_new_region(footprint_header);
23576 if (new_footprint_region == NULL) {
23577 goto over_the_edge;
23578 }
23579 footprint_region = new_footprint_region;
23580 /* initialize the new region as empty ... */
23581 footprint_region->cfr_num_pages = 0;
23582 /* ... and skip this "zero" disp */
23583 footprint_region->cfr_vaddr = va + effective_page_size;
23584 }
23585
23586 return KERN_SUCCESS;
23587
23588 over_the_edge:
23589 // printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
23590 vm_map_corpse_footprint_full++;
23591 return KERN_RESOURCE_SHORTAGE;
23592 }
23593
23594 /*
23595 * vm_map_corpse_footprint_collect_done:
23596 * completes the footprint collection by getting rid of any remaining
23597 * trailing "zero" dispositions and trimming the unused part of the
23598 * kernel buffer
23599 */
23600 void
23601 vm_map_corpse_footprint_collect_done(
23602 vm_map_t new_map)
23603 {
23604 struct vm_map_corpse_footprint_header *footprint_header;
23605 struct vm_map_corpse_footprint_region *footprint_region;
23606 vm_size_t buf_size, actual_size;
23607
23608 assert(new_map->has_corpse_footprint);
23609 if (!new_map->has_corpse_footprint ||
23610 new_map->vmmap_corpse_footprint == NULL) {
23611 return;
23612 }
23613
23614 footprint_header = (struct vm_map_corpse_footprint_header *)
23615 new_map->vmmap_corpse_footprint;
23616 buf_size = footprint_header->cf_size;
23617
23618 footprint_region = (struct vm_map_corpse_footprint_region *)
23619 ((char *)footprint_header +
23620 footprint_header->cf_last_region);
23621
23622 /* get rid of trailing zeroes in last region */
23623 assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
23624 footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
23625 footprint_header->cf_last_zeroes = 0;
23626
23627 actual_size = (vm_size_t)(footprint_header->cf_last_region +
23628 sizeof(*footprint_region) +
23629 (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));
23630
23631 // printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
23632 vm_map_corpse_footprint_size_avg =
23633 (((vm_map_corpse_footprint_size_avg *
23634 vm_map_corpse_footprint_count) +
23635 actual_size) /
23636 (vm_map_corpse_footprint_count + 1));
23637 vm_map_corpse_footprint_count++;
23638 if (actual_size > vm_map_corpse_footprint_size_max) {
23639 vm_map_corpse_footprint_size_max = actual_size;
23640 }
23641
23642 actual_size = round_page(actual_size);
23643 assert3u(buf_size, >=, actual_size);
23644 if (buf_size > actual_size) {
23645 /*
23646 * Free unused space at the end of the buffer
23647 */
23648 kmem_guard_t guard = vm_map_corpse_footprint_guard(new_map);
23649 kmem_return_t kmr = kmem_realloc_guard(kernel_map,
23650 (vm_offset_t)footprint_header,
23651 /* Account for guard page */
23652 buf_size + PAGE_SIZE,
23653 actual_size + PAGE_SIZE,
23654 KMR_DATA | KMR_GUARD_LAST | KMR_FREEOLD | KMR_KOBJECT,
23655 guard);
23656 assertf(kmr.kmr_return == KERN_SUCCESS,
23657 "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
23658 footprint_header,
23659 (uint64_t) buf_size,
23660 (uint64_t) actual_size,
23661 kmr.kmr_return);
23662 footprint_header = (struct vm_map_corpse_footprint_header *)kmr.kmr_ptr;
23663 assert3p(footprint_header, !=, NULL);
23664 new_map->vmmap_corpse_footprint = footprint_header;
23665 footprint_region = NULL;
23666 }
23667
23668 footprint_header->cf_size = actual_size;
23669 }
23670
23671 /*
23672 * vm_map_corpse_footprint_query_page_info:
23673 * retrieves the disposition of the page at virtual address "vaddr"
23674 * in the forked corpse's VM map
23675 *
23676 * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
23677 */
23678 kern_return_t
23679 vm_map_corpse_footprint_query_page_info(
23680 vm_map_t map,
23681 vm_map_offset_t va,
23682 int *disposition_p)
23683 {
23684 struct vm_map_corpse_footprint_header *footprint_header;
23685 struct vm_map_corpse_footprint_region *footprint_region;
23686 uint32_t footprint_region_offset;
23687 vm_map_offset_t region_start, region_end;
23688 int disp_idx;
23689 kern_return_t kr;
23690 int effective_page_size;
23691 cf_disp_t cf_disp;
23692
23693 if (!map->has_corpse_footprint) {
23694 *disposition_p = 0;
23695 kr = KERN_INVALID_ARGUMENT;
23696 goto done;
23697 }
23698
23699 footprint_header = map->vmmap_corpse_footprint;
23700 if (footprint_header == NULL) {
23701 *disposition_p = 0;
23702 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23703 kr = KERN_INVALID_ARGUMENT;
23704 goto done;
23705 }
23706
23707 /* start looking at the hint ("cf_hint_region") */
23708 footprint_region_offset = footprint_header->cf_hint_region;
23709
23710 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
23711
23712 lookup_again:
23713 if (footprint_region_offset < sizeof(*footprint_header)) {
23714 /* hint too low: start from 1st region */
23715 footprint_region_offset = sizeof(*footprint_header);
23716 }
23717 if (footprint_region_offset > footprint_header->cf_last_region) {
23718 /* hint too high: re-start from 1st region */
23719 footprint_region_offset = sizeof(*footprint_header);
23720 }
23721 footprint_region = (struct vm_map_corpse_footprint_region *)
23722 ((char *)footprint_header + footprint_region_offset);
23723 region_start = footprint_region->cfr_vaddr;
23724 region_end = (region_start +
23725 ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23726 effective_page_size));
23727 if (va < region_start &&
23728 footprint_region_offset != sizeof(*footprint_header)) {
23729 /* our range starts before the hint region */
23730
23731 /* reset the hint (in a racy way...) */
23732 footprint_header->cf_hint_region = sizeof(*footprint_header);
23733 /* lookup "va" again from 1st region */
23734 footprint_region_offset = sizeof(*footprint_header);
23735 goto lookup_again;
23736 }
23737
23738 while (va >= region_end) {
23739 if (footprint_region_offset >= footprint_header->cf_last_region) {
23740 break;
23741 }
23742 /* skip the region's header */
23743 footprint_region_offset += sizeof(*footprint_region);
23744 /* skip the region's page dispositions */
23745 footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
23746 /* align to next word boundary */
23747 footprint_region_offset =
23748 roundup(footprint_region_offset,
23749 sizeof(int));
23750 footprint_region = (struct vm_map_corpse_footprint_region *)
23751 ((char *)footprint_header + footprint_region_offset);
23752 region_start = footprint_region->cfr_vaddr;
23753 region_end = (region_start +
23754 ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23755 effective_page_size));
23756 }
23757 if (va < region_start || va >= region_end) {
23758 /* page not found */
23759 *disposition_p = 0;
23760 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23761 kr = KERN_SUCCESS;
23762 goto done;
23763 }
23764
23765 /* "va" found: set the lookup hint for next lookup (in a racy way...) */
23766 footprint_header->cf_hint_region = footprint_region_offset;
23767
23768 /* get page disposition for "va" in this region */
23769 disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
23770 cf_disp = footprint_region->cfr_disposition[disp_idx];
23771 *disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
23772 kr = KERN_SUCCESS;
23773 done:
23774 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23775 /* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
23776 DTRACE_VM4(footprint_query_page_info,
23777 vm_map_t, map,
23778 vm_map_offset_t, va,
23779 int, *disposition_p,
23780 kern_return_t, kr);
23781
23782 return kr;
23783 }
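
/*
 * Minimal caller sketch (illustrative; "map" is a corpse map and "va" a
 * page-aligned user address):
 *
 *	int disp = 0;
 *	if (vm_map_corpse_footprint_query_page_info(map, va, &disp) ==
 *	    KERN_SUCCESS) {
 *		-- "disp" now holds the page disposition bits for "va"
 *	}
 */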
23784
23785 void
23786 vm_map_corpse_footprint_destroy(
23787 vm_map_t map)
23788 {
23789 if (map->has_corpse_footprint &&
23790 map->vmmap_corpse_footprint != NULL) {
23791 struct vm_map_corpse_footprint_header *footprint_header;
23792 vm_size_t buf_size;
23793
23794 footprint_header = map->vmmap_corpse_footprint;
23795 buf_size = footprint_header->cf_size;
23796 kmem_guard_t guard = vm_map_corpse_footprint_guard(map);
23797 kmem_free_guard(kernel_map, (vm_offset_t)footprint_header,
23798 buf_size + PAGE_SIZE,
23799 KMF_GUARD_LAST, guard);
23800 map->vmmap_corpse_footprint = NULL;
23801 map->has_corpse_footprint = FALSE;
23802 }
23803 }
23804
23805 /*
23806 * vm_map_copy_footprint_ledgers:
23807 * copies any ledger that's relevant to the memory footprint of "old_task"
23808 * into the forked corpse's task ("new_task")
23809 */
23810 void
23811 vm_map_copy_footprint_ledgers(
23812 task_t old_task,
23813 task_t new_task)
23814 {
23815 vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
23816 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
23817 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
23818 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
23819 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
23820 vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
23821 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
23822 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
23823 vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
23824 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
23825 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
23826 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
23827 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
23828 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
23829 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
23830 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
23831 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
23832 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
23833 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
23834 vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
23835 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_nofootprint_total);
23836 }
23837
23838 /*
23839 * vm_map_copy_ledger:
23840 * copy a single ledger from "old_task" to "new_task"
23841 */
23842 void
23843 vm_map_copy_ledger(
23844 task_t old_task,
23845 task_t new_task,
23846 int ledger_entry)
23847 {
23848 ledger_amount_t old_balance, new_balance, delta;
23849
23850 assert(new_task->map->has_corpse_footprint);
23851 if (!new_task->map->has_corpse_footprint) {
23852 return;
23853 }
23854
23855 /* turn off sanity checks for the ledger we're about to mess with */
23856 ledger_disable_panic_on_negative(new_task->ledger,
23857 ledger_entry);
23858
23859 /* adjust "new_task" to match "old_task" */
23860 ledger_get_balance(old_task->ledger,
23861 ledger_entry,
23862 &old_balance);
23863 ledger_get_balance(new_task->ledger,
23864 ledger_entry,
23865 &new_balance);
23866 if (new_balance == old_balance) {
23867 /* new == old: done */
23868 } else if (new_balance > old_balance) {
23869 /* new > old ==> new -= new - old */
23870 delta = new_balance - old_balance;
23871 ledger_debit(new_task->ledger,
23872 ledger_entry,
23873 delta);
23874 } else {
23875 /* new < old ==> new += old - new */
23876 delta = old_balance - new_balance;
23877 ledger_credit(new_task->ledger,
23878 ledger_entry,
23879 delta);
23880 }
23881 }
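
/*
 * Worked example (hypothetical balances): if "old_task" reports a
 * phys_footprint balance of 300 units and the corpse currently reports
 * 120, new < old and we credit the corpse's ledger by 180; had the
 * corpse reported 400 instead, new > old and we would debit it by 100.
 * Either way the corpse ends up reporting the same balance as the
 * original task at the time of the fork.
 */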
23882
23883 /*
23884 * vm_map_get_pmap:
23885 * returns the pmap associated with the vm_map
23886 */
23887 pmap_t
23888 vm_map_get_pmap(vm_map_t map)
23889 {
23890 return vm_map_pmap(map);
23891 }
23892
23893 ppnum_t
23894 vm_map_get_phys_page(
23895 vm_map_t map,
23896 vm_offset_t addr)
23897 {
23898 vm_object_offset_t offset;
23899 vm_object_t object;
23900 vm_map_offset_t map_offset;
23901 vm_map_entry_t entry;
23902 ppnum_t phys_page = 0;
23903
23904 map_offset = vm_map_trunc_page(addr, PAGE_MASK);
23905
23906 vm_map_lock(map);
23907 while (vm_map_lookup_entry(map, map_offset, &entry)) {
23908 if (entry->is_sub_map) {
23909 vm_map_t old_map;
23910 vm_map_lock(VME_SUBMAP(entry));
23911 old_map = map;
23912 map = VME_SUBMAP(entry);
23913 map_offset = (VME_OFFSET(entry) +
23914 (map_offset - entry->vme_start));
23915 vm_map_unlock(old_map);
23916 continue;
23917 }
23918 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
23919 vm_map_unlock(map);
23920 return (ppnum_t) 0;
23921 }
23922 if (VME_OBJECT(entry)->phys_contiguous) {
23923 /* These are not standard pageable memory mappings */
23924 /* If they are not present in the object they will */
23925 /* have to be picked up from the pager through the */
23926 /* fault mechanism. */
23927 if (VME_OBJECT(entry)->vo_shadow_offset == 0) {
23928 /* need to call vm_fault */
23929 vm_map_unlock(map);
23930 vm_fault(map, map_offset, VM_PROT_NONE,
23931 FALSE /* change_wiring */, VM_KERN_MEMORY_NONE,
23932 THREAD_UNINT, NULL, 0);
23933 vm_map_lock(map);
23934 continue;
23935 }
23936 offset = (VME_OFFSET(entry) +
23937 (map_offset - entry->vme_start));
23938 phys_page = (ppnum_t)
23939 ((VME_OBJECT(entry)->vo_shadow_offset
23940 + offset) >> PAGE_SHIFT);
23941 break;
23942 }
23943 offset = (VME_OFFSET(entry) + (map_offset - entry->vme_start));
23944 object = VME_OBJECT(entry);
23945 vm_object_lock(object);
23946 while (TRUE) {
23947 vm_page_t dst_page = vm_page_lookup(object, offset);
23948 if (dst_page == VM_PAGE_NULL) {
23949 if (object->shadow) {
23950 vm_object_t old_object;
23951 vm_object_lock(object->shadow);
23952 old_object = object;
23953 offset = offset + object->vo_shadow_offset;
23954 object = object->shadow;
23955 vm_object_unlock(old_object);
23956 } else {
23957 vm_object_unlock(object);
23958 break;
23959 }
23960 } else {
23961 phys_page = (ppnum_t)(VM_PAGE_GET_PHYS_PAGE(dst_page));
23962 vm_object_unlock(object);
23963 break;
23964 }
23965 }
23966 break;
23967 }
23968
23969 vm_map_unlock(map);
23970 return phys_page;
23971 }
23972
23973 #if CONFIG_MAP_RANGES
23974 static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];
23975 static bitmap_t vm_map_user_range_large_file_map[BITMAP_LEN(VM_MEMORY_COUNT)];
23976
23977 static_assert((int)UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
23978 static_assert((int)UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
23979
23980 /*
23981 * vm_map_range_map_init:
23982 * initializes the VM range ID map to enable index lookup
23983 * of user VM ranges based on VM tag from userspace.
23984 */
23985 static void
23986 vm_map_range_map_init(void)
23987 {
23988 /*
23989 * VM_MEMORY_MALLOC{,_NANO} are skipped on purpose:
23990 * - the former is malloc metadata which should be kept separate
23991 * - the latter has its own ranges
23992 */
23993 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
23994 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
23995 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
23996 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
23997 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
23998 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
23999 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
24000 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_TCMALLOC);
24001 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LIBNETWORK);
24002 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOACCELERATOR);
24003 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOSURFACE);
24004 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IMAGEIO);
24005 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREGRAPHICS);
24006 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_CORESERVICES);
24007 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREDATA);
24008 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LAYERKIT);
24009 bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOACCELERATOR);
24010 bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOSURFACE);
24011 }
24012
24013 static struct mach_vm_range
24014 vm_map_range_random_uniform(
24015 vm_map_size_t req_size,
24016 vm_map_offset_t min_addr,
24017 vm_map_offset_t max_addr,
24018 vm_map_offset_t offmask)
24019 {
24020 vm_map_offset_t random_addr;
24021 struct mach_vm_range alloc;
24022
24023 req_size = (req_size + offmask) & ~offmask;
24024 min_addr = (min_addr + offmask) & ~offmask;
24025 max_addr = max_addr & ~offmask;
24026
24027 read_random(&random_addr, sizeof(random_addr));
24028 random_addr %= (max_addr - req_size - min_addr);
24029 random_addr &= ~offmask;
24030
24031 alloc.min_address = min_addr + random_addr;
24032 alloc.max_address = min_addr + random_addr + req_size;
24033 return alloc;
24034 }
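
/*
 * Worked example (using the macOS arm64 values passed below, and
 * assuming a 16k-page kernel where vm_map_range_offmask() yields
 * 64G - 1): with req_size = 1T, min_addr = 0x61T and max_addr = 0x7fT,
 * the random offset is drawn modulo 0x7fT - 1T - 0x61T = 29T and then
 * truncated down to a 64G boundary, so the function returns a
 * 64G-aligned, 1T-sized window somewhere inside [0x61T, 0x7fT).
 */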
24035
24036 static vm_map_offset_t
24037 vm_map_range_offmask(void)
24038 {
24039 uint32_t pte_depth;
24040
24041 /*
24042 * PTE optimizations
24043 *
24044 *
24045 * 16k pages systems
24046 * ~~~~~~~~~~~~~~~~~
24047 *
24048 * A single L1 (sub-)page covers the address space.
24049 * - L2 pages cover 64G,
24050 * - L3 pages cover 32M.
24051 *
24052 * On embedded, the dynamic VA range is 64G and uses a single L2 page.
24053 * As a result, we really only need to align the ranges to 32M to avoid
24054 * partial L3 pages.
24055 *
24056 * On macOS, the usage of L2 pages will increase, so we will want to
24057 * align ranges to 64G in order to utilize them fully.
24058 *
24059 *
24060 * 4k pages systems
24061 * ~~~~~~~~~~~~~~~~
24062 *
24063 * A single L0 (sub-)page covers the address space.
24064 * - L1 pages cover 512G,
24065 * - L2 pages cover 1G,
24066 * - L3 pages cover 2M.
24067 *
24068 * The long tail of processes on a system will tend to have a VA usage
24069 * (ignoring the shared regions) in the 100s of MB order of magnitude.
24070 * This is achievable with a single L1 and a few L2s without
24071 * randomization.
24072 *
24073 * However once randomization is introduced, the system will immediately
24074 * need several L1s and many more L2s. As a result:
24075 *
24076 * - on embedded devices, the cost of these extra pages isn't
24077 * sustainable, and we just disable the feature entirely,
24078 *
24079 * - on macOS we align ranges to a 512G boundary so that the extra L1
24080 * pages can be used to their full potential.
24081 */
24082
24083 /*
24084 * note, this function assumes _non exotic mappings_
24085 * which is why it uses the native kernel's PAGE_SHIFT.
24086 */
24087 #if XNU_PLATFORM_MacOSX
24088 pte_depth = PAGE_SHIFT > 12 ? 2 : 3;
24089 #else /* !XNU_PLATFORM_MacOSX */
24090 pte_depth = PAGE_SHIFT > 12 ? 1 : 0;
24091 #endif /* !XNU_PLATFORM_MacOSX */
24092
24093 if (pte_depth == 0) {
24094 return 0;
24095 }
24096
24097 return (1ull << ((PAGE_SHIFT - 3) * pte_depth + PAGE_SHIFT)) - 1;
24098 }
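
/*
 * Worked examples of the return value above (assuming the usual page
 * shifts: 14 for 16k pages, 12 for 4k pages):
 * - embedded, 16k pages: pte_depth = 1
 *	(1ull << ((14 - 3) * 1 + 14)) - 1 == 32M - 1	(one L3 page)
 * - macOS, 16k pages: pte_depth = 2
 *	(1ull << ((14 - 3) * 2 + 14)) - 1 == 64G - 1	(one L2 page)
 * - macOS, 4k pages: pte_depth = 3
 *	(1ull << ((12 - 3) * 3 + 12)) - 1 == 512G - 1	(one L1 page)
 * - embedded, 4k pages: pte_depth = 0, ranges are disabled.
 */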
24099
24100 /*
24101 * vm_map_range_configure:
24102 * configures the user vm_map ranges by increasing the maximum VA range of
24103 * the map and carving out a range at the end of VA space (searching backwards
24104 * in the newly expanded map).
24105 */
24106 kern_return_t
24107 vm_map_range_configure(vm_map_t map, __unused bool needs_extra_jumbo_va)
24108 {
24109 const vm_map_offset_t offmask = vm_map_range_offmask();
24110 struct mach_vm_range data_range;
24111 vm_map_offset_t default_end;
24112 kern_return_t kr;
24113
24114 if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) {
24115 /*
24116 * No point doing vm ranges in a 32bit address space.
24117 */
24118 return KERN_NOT_SUPPORTED;
24119 }
24120
24121 /* Should not be applying ranges to kernel map or kernel map submaps */
24122 assert(vm_map_pmap(map) != kernel_pmap);
24123
24124 #if XNU_PLATFORM_MacOSX
24125
24126 /*
24127 * on macOS, the address space is a massive 47 bits (128T),
24128 * with several carve outs that processes can't use:
24129 * - the shared region
24130 * - the commpage region
24131 * - the GPU carve out (if applicable)
24132 *
24133 * and when nano-malloc is in use it desires memory at the 96T mark.
24134 *
24135 * However, their location is architecture dependent:
24136 * - On intel, the shared region and commpage are
24137 * at the very end of the usable address space (above +127T),
24138 * and there is no GPU carve out, and pthread wants to place
24139 * threads at the 112T mark (0x70T).
24140 *
24141 * - On arm64, these are in the same spot as on embedded devices:
24142 * o shared region: [ 6G, 10G) [ will likely grow over time ]
24143 * o commpage region: [63G, 64G)
24144 * o GPU carve out: [64G, 448G)
24145 *
24146 * This is convenient because the mappings at the end of the address
24147 * space (when they exist) are made by the kernel.
24148 *
24149 * The policy is to allocate a random 1T for the data heap
24150 * in the end of the address-space in the:
24151 * - [0x71, 0x7f) range on Intel (to leave space for pthread stacks)
24152 * - [0x61, 0x7f) range on arm64 (to leave space for Nano malloc).
24153 */
24154
24155 /* see NANOZONE_SIGNATURE in libmalloc */
24156 #if __x86_64__
24157 default_end = 0x71ull << 40;
24158 #else
24159 default_end = 0x61ull << 40;
24160 #endif
24161 data_range = vm_map_range_random_uniform(1ull << 40,
24162 default_end, 0x7full << 40, offmask);
24163
24164 #else /* !XNU_PLATFORM_MacOSX */
24165
24166 /*
24167 * Embedded devices:
24168 *
24169 * The default VA Size scales with the device physical memory.
24170 *
24171 * Out of that:
24172 * - the "zero" page typically uses 4G + some slide
24173 * - the shared region uses SHARED_REGION_SIZE bytes (4G)
24174 *
24175 * Without the use of jumbo or any adjustment to the address space,
24176 * a default VM map typically looks like this:
24177 *
24178 * 0G -->╒════════════╕
24179 * │ pagezero │
24180 * │ + slide │
24181 * ~4G -->╞════════════╡<-- vm_map_min(map)
24182 * │ │
24183 * 6G -->├────────────┤
24184 * │ shared │
24185 * │ region │
24186 * 10G -->├────────────┤
24187 * │ │
24188 * max_va -->├────────────┤<-- vm_map_max(map)
24189 * │ │
24190 * ╎ jumbo ╎
24191 * ╎ ╎
24192 * │ │
24193 * 63G -->╞════════════╡<-- MACH_VM_MAX_ADDRESS
24194 * │ commpage │
24195 * 64G -->├────────────┤<-- MACH_VM_MIN_GPU_CARVEOUT_ADDRESS
24196 * │ │
24197 * ╎ GPU ╎
24198 * ╎ carveout ╎
24199 * │ │
24200 * 448G -->├────────────┤<-- MACH_VM_MAX_GPU_CARVEOUT_ADDRESS
24201 * │ │
24202 * ╎ ╎
24203 * ╎ ╎
24204 * │ │
24205 * 512G -->╘════════════╛<-- (1ull << ARM_16K_TT_L1_SHIFT)
24206 *
24207 * When this drawing was made, "max_va" was smaller than
24208 * ARM64_MAX_OFFSET_DEVICE_LARGE (~15.5G), leaving shy of
24209 * 12G of address space for the zero-page, slide, files,
24210 * binaries, heap ...
24211 *
24212 * We will want to make a "heap/data" carve out inside
24213 * the jumbo range of half of that usable space, assuming
24214 * that this is less than a fourth of the jumbo range.
24215 *
24216 * The assert below intends to catch when max_va grows
24217 * too large for this heuristic.
24218 */
24219
24220 vm_map_lock_read(map);
24221 default_end = vm_map_max(map);
24222 vm_map_unlock_read(map);
24223
24224 /*
24225 * Check that we're not already jumbo'd,
24226 * or our address space was somehow modified.
24227 *
24228 * If so we cannot guarantee that we can set up the ranges
24229 * safely without interfering with the existing map.
24230 */
24231 if (default_end > vm_compute_max_offset(true)) {
24232 return KERN_NO_SPACE;
24233 }
24234
24235 if (pmap_max_offset(true, ARM_PMAP_MAX_OFFSET_DEFAULT)) {
24236 /*
24237 * an override boot-arg was set, disable user-ranges
24238 *
24239 * XXX: this is problematic because it means these boot-args
24240 * no longer test the behavior changing the value
24241 * of ARM64_MAX_OFFSET_DEVICE_* would have.
24242 */
24243 return KERN_NOT_SUPPORTED;
24244 }
24245
24246 /* expand the default VM space to 64GB */
24247 vm_map_set_jumbo(map);
24248
24249 assert3u(7 * GiB(10) / 2, <=, vm_map_max(map) - default_end);
24250 data_range = vm_map_range_random_uniform(GiB(10),
24251 default_end + PAGE_SIZE, vm_map_max(map), offmask);
24252
24253 #endif /* !XNU_PLATFORM_MacOSX */
24254
24255 /*
24256 * Poke holes so that ASAN or people listing regions
24257 * do not think this space is free.
24258 */
24259
24260 if (default_end != data_range.min_address) {
24261 kr = vm_map_enter(map, &default_end,
24262 data_range.min_address - default_end,
24263 0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
24264 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
24265 assert(kr == KERN_SUCCESS);
24266 }
24267
24268 if (data_range.max_address != vm_map_max(map)) {
24269 vm_map_entry_t entry;
24270 vm_size_t size;
24271
24272 /*
24273 * Extend the end of the hole to the next VM entry or the end of the map,
24274 * whichever comes first.
24275 */
24276 vm_map_lock_read(map);
24277 vm_map_lookup_entry_or_next(map, data_range.max_address, &entry);
24278 if (entry == vm_map_to_entry(map) || entry->vme_start > vm_map_max(map)) {
24279 size = vm_map_max(map) - data_range.max_address;
24280 } else {
24281 size = entry->vme_start - data_range.max_address;
24282 }
24283 vm_map_unlock_read(map);
24284
24285 kr = vm_map_enter(map, &data_range.max_address, size,
24286 0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
24287 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
24288 assert(kr == KERN_SUCCESS);
24289 }
24290
24291 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
24292 if (needs_extra_jumbo_va) {
24293 /* This will grow the address space to MACH_VM_MAX_ADDRESS */
24294 vm_map_set_extra_jumbo(map);
24295 }
24296 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
24297
24298 vm_map_lock(map);
24299 map->default_range.min_address = vm_map_min(map);
24300 map->default_range.max_address = default_end;
24301 map->data_range = data_range;
24302 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
24303 /* If process has "extra jumbo" entitlement, enable large file range */
24304 if (needs_extra_jumbo_va) {
24305 map->large_file_range = vm_map_range_random_uniform(TiB(1),
24306 MACH_VM_JUMBO_ADDRESS, MACH_VM_MAX_ADDRESS, offmask);
24307 }
24308 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
24309 map->uses_user_ranges = true;
24310 vm_map_unlock(map);
24311
24312 return KERN_SUCCESS;
24313 }
24314
24315 /*
24316 * vm_map_range_fork:
24317 * clones the array of ranges from old_map to new_map in support
24318 * of a VM map fork.
24319 */
24320 void
24321 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
24322 {
24323 if (!old_map->uses_user_ranges) {
24324 /* nothing to do */
24325 return;
24326 }
24327
24328 new_map->default_range = old_map->default_range;
24329 new_map->data_range = old_map->data_range;
24330
24331 if (old_map->extra_ranges_count) {
24332 vm_map_user_range_t otable, ntable;
24333 uint16_t count;
24334
24335 otable = old_map->extra_ranges;
24336 count = old_map->extra_ranges_count;
24337 ntable = kalloc_data(count * sizeof(struct vm_map_user_range),
24338 Z_WAITOK | Z_ZERO | Z_NOFAIL);
24339 memcpy(ntable, otable,
24340 count * sizeof(struct vm_map_user_range));
24341
24342 new_map->extra_ranges_count = count;
24343 new_map->extra_ranges = ntable;
24344 }
24345
24346 new_map->uses_user_ranges = true;
24347 }
24348
24349 /*
24350 * vm_map_get_user_range:
24351 * copy the VM user range for the given VM map and range ID.
24352 */
24353 kern_return_t
24354 vm_map_get_user_range(
24355 vm_map_t map,
24356 vm_map_range_id_t range_id,
24357 mach_vm_range_t range)
24358 {
24359 if (map == NULL || !map->uses_user_ranges || range == NULL) {
24360 return KERN_INVALID_ARGUMENT;
24361 }
24362
24363 switch (range_id) {
24364 case UMEM_RANGE_ID_DEFAULT:
24365 *range = map->default_range;
24366 return KERN_SUCCESS;
24367
24368 case UMEM_RANGE_ID_HEAP:
24369 *range = map->data_range;
24370 return KERN_SUCCESS;
24371
24372 case UMEM_RANGE_ID_LARGE_FILE:
24373 /*
24374 * Because this function tells a user-space process about the user
24375 * ranges in its VM map, this case communicates whether the large file
24376 * range is in use. Note that this is different from how the large file
24377 * range ID is handled in `vm_map_get_range()`: there, we "resolve" the
24378 * VA policy and return either the large file range or data range,
24379 * depending on whether the large file range is enabled.
24380 */
24381 if (map->large_file_range.min_address != map->large_file_range.max_address) {
24382 /* large file range is configured and should be used */
24383 *range = map->large_file_range;
24384 } else {
24385 return KERN_INVALID_ARGUMENT;
24386 }
24387 return KERN_SUCCESS;
24388
24389 default:
24390 return KERN_INVALID_ARGUMENT;
24391 }
24392 }
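
/*
 * Minimal caller sketch (illustrative):
 *
 *	struct mach_vm_range heap_range;
 *	if (vm_map_get_user_range(map, UMEM_RANGE_ID_HEAP, &heap_range) ==
 *	    KERN_SUCCESS) {
 *		-- heap allocations for this task land inside
 *		-- [heap_range.min_address, heap_range.max_address)
 *	}
 */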
24393
24394 static vm_map_range_id_t
24395 vm_map_user_range_resolve(
24396 vm_map_t map,
24397 mach_vm_address_t addr,
24398 mach_vm_size_t size,
24399 mach_vm_range_t range)
24400 {
24401 struct mach_vm_range tmp;
24402
24403 vm_map_lock_assert_held(map);
24404
24405 static_assert((int)UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
24406 static_assert((int)UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
24407
24408 if (mach_vm_range_contains(&map->default_range, addr, size)) {
24409 if (range) {
24410 *range = map->default_range;
24411 }
24412 return UMEM_RANGE_ID_DEFAULT;
24413 }
24414
24415 if (mach_vm_range_contains(&map->data_range, addr, size)) {
24416 if (range) {
24417 *range = map->data_range;
24418 }
24419 return UMEM_RANGE_ID_HEAP;
24420 }
24421
24422 if (mach_vm_range_contains(&map->large_file_range, addr, size)) {
24423 if (range) {
24424 *range = map->large_file_range;
24425 }
24426 return UMEM_RANGE_ID_LARGE_FILE;
24427 }
24428
24429 for (size_t i = 0; i < map->extra_ranges_count; i++) {
24430 vm_map_user_range_t r = &map->extra_ranges[i];
24431
24432 tmp.min_address = r->vmur_min_address;
24433 tmp.max_address = r->vmur_max_address;
24434
24435 if (mach_vm_range_contains(&tmp, addr, size)) {
24436 if (range) {
24437 *range = tmp;
24438 }
24439 return r->vmur_range_id;
24440 }
24441 }
24442
24443 if (range) {
24444 range->min_address = range->max_address = 0;
24445 }
24446 return UMEM_RANGE_ID_DEFAULT;
24447 }
24448 #endif /* CONFIG_MAP_RANGES */
24449
24450 void
24451 vm_map_kernel_flags_update_range_id(
24452 vm_map_kernel_flags_t *vmkf,
24453 vm_map_t map,
24454 __unused vm_map_size_t size)
24455 {
24456 if (map == kernel_map) {
24457 if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
24458 vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
24459 }
24460 #if CONFIG_MAP_RANGES
24461 } else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
24462 vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT) {
24463 if (bitmap_test(vm_map_user_range_large_file_map, vmkf->vm_tag)
24464 || size >= VM_LARGE_FILE_THRESHOLD) {
24465 /*
24466 * if the map doesn't have the large file range configured,
24467 * the range will get resolved to the heap range in `vm_map_get_range`
24468 */
24469 vmkf->vmkf_range_id = UMEM_RANGE_ID_LARGE_FILE;
24470 } else if (bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
24471 vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
24472 }
24473 #endif /* CONFIG_MAP_RANGES */
24474 }
24475 }
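
/*
 * Example of the tag-driven selection above: a user mapping tagged
 * VM_MEMORY_MALLOC_SMALL resolves to UMEM_RANGE_ID_HEAP, while one
 * tagged VM_MEMORY_IOSURFACE (present in the large-file bitmap set up
 * in vm_map_range_map_init()) resolves to UMEM_RANGE_ID_LARGE_FILE,
 * falling back to the heap range when the map has no large-file range
 * configured.
 */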
24476
24477 /*
24478 * vm_map_entry_has_device_pager:
24479 * Check if the vm map entry specified by the virtual address has a device pager.
24480 * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
24481 */
24482 boolean_t
24483 vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
24484 {
24485 vm_map_entry_t entry;
24486 vm_object_t object;
24487 boolean_t result;
24488
24489 if (map == NULL) {
24490 return FALSE;
24491 }
24492
24493 vm_map_lock(map);
24494 while (TRUE) {
24495 if (!vm_map_lookup_entry(map, vaddr, &entry)) {
24496 result = FALSE;
24497 break;
24498 }
24499 if (entry->is_sub_map) {
24500 // Check the submap
24501 vm_map_t submap = VME_SUBMAP(entry);
24502 assert(submap != NULL);
24503 vm_map_lock(submap);
24504 vm_map_unlock(map);
24505 map = submap;
24506 continue;
24507 }
24508 object = VME_OBJECT(entry);
24509 if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
24510 result = TRUE;
24511 break;
24512 }
24513 result = FALSE;
24514 break;
24515 }
24516
24517 vm_map_unlock(map);
24518 return result;
24519 }
24520
24521
24522 #if MACH_ASSERT
24523
24524 extern int pmap_ledgers_panic;
24525 extern int pmap_ledgers_panic_leeway;
24526
24527 #define LEDGER_DRIFT(__LEDGER) \
24528 int __LEDGER##_over; \
24529 ledger_amount_t __LEDGER##_over_total; \
24530 ledger_amount_t __LEDGER##_over_max; \
24531 int __LEDGER##_under; \
24532 ledger_amount_t __LEDGER##_under_total; \
24533 ledger_amount_t __LEDGER##_under_max
24534
24535 struct {
24536 uint64_t num_pmaps_checked;
24537
24538 LEDGER_DRIFT(phys_footprint);
24539 LEDGER_DRIFT(internal);
24540 LEDGER_DRIFT(internal_compressed);
24541 LEDGER_DRIFT(external);
24542 LEDGER_DRIFT(reusable);
24543 LEDGER_DRIFT(iokit_mapped);
24544 LEDGER_DRIFT(alternate_accounting);
24545 LEDGER_DRIFT(alternate_accounting_compressed);
24546 LEDGER_DRIFT(page_table);
24547 LEDGER_DRIFT(purgeable_volatile);
24548 LEDGER_DRIFT(purgeable_nonvolatile);
24549 LEDGER_DRIFT(purgeable_volatile_compressed);
24550 LEDGER_DRIFT(purgeable_nonvolatile_compressed);
24551 LEDGER_DRIFT(tagged_nofootprint);
24552 LEDGER_DRIFT(tagged_footprint);
24553 LEDGER_DRIFT(tagged_nofootprint_compressed);
24554 LEDGER_DRIFT(tagged_footprint_compressed);
24555 LEDGER_DRIFT(network_volatile);
24556 LEDGER_DRIFT(network_nonvolatile);
24557 LEDGER_DRIFT(network_volatile_compressed);
24558 LEDGER_DRIFT(network_nonvolatile_compressed);
24559 LEDGER_DRIFT(media_nofootprint);
24560 LEDGER_DRIFT(media_footprint);
24561 LEDGER_DRIFT(media_nofootprint_compressed);
24562 LEDGER_DRIFT(media_footprint_compressed);
24563 LEDGER_DRIFT(graphics_nofootprint);
24564 LEDGER_DRIFT(graphics_footprint);
24565 LEDGER_DRIFT(graphics_nofootprint_compressed);
24566 LEDGER_DRIFT(graphics_footprint_compressed);
24567 LEDGER_DRIFT(neural_nofootprint);
24568 LEDGER_DRIFT(neural_footprint);
24569 LEDGER_DRIFT(neural_nofootprint_compressed);
24570 LEDGER_DRIFT(neural_footprint_compressed);
24571 LEDGER_DRIFT(neural_nofootprint_total);
24572 } pmap_ledgers_drift;
24573
24574 void
24575 vm_map_pmap_check_ledgers(
24576 pmap_t pmap,
24577 ledger_t ledger,
24578 int pid,
24579 char *procname)
24580 {
24581 ledger_amount_t bal;
24582 boolean_t do_panic;
24583
24584 do_panic = FALSE;
24585
24586 pmap_ledgers_drift.num_pmaps_checked++;
24587
24588 #define LEDGER_CHECK_BALANCE(__LEDGER) \
24589 MACRO_BEGIN \
24590 int panic_on_negative = TRUE; \
24591 ledger_get_balance(ledger, \
24592 task_ledgers.__LEDGER, \
24593 &bal); \
24594 ledger_get_panic_on_negative(ledger, \
24595 task_ledgers.__LEDGER, \
24596 &panic_on_negative); \
24597 if (bal != 0) { \
24598 if (panic_on_negative || \
24599 (pmap_ledgers_panic && \
24600 pmap_ledgers_panic_leeway > 0 && \
24601 (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) || \
24602 bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
24603 do_panic = TRUE; \
24604 } \
24605 printf("LEDGER BALANCE proc %d (%s) " \
24606 "\"%s\" = %lld\n", \
24607 pid, procname, #__LEDGER, bal); \
24608 if (bal > 0) { \
24609 pmap_ledgers_drift.__LEDGER##_over++; \
24610 pmap_ledgers_drift.__LEDGER##_over_total += bal; \
24611 if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
24612 pmap_ledgers_drift.__LEDGER##_over_max = bal; \
24613 } \
24614 } else if (bal < 0) { \
24615 pmap_ledgers_drift.__LEDGER##_under++; \
24616 pmap_ledgers_drift.__LEDGER##_under_total += bal; \
24617 if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
24618 pmap_ledgers_drift.__LEDGER##_under_max = bal; \
24619 } \
24620 } \
24621 } \
24622 MACRO_END
24623
24624 LEDGER_CHECK_BALANCE(phys_footprint);
24625 LEDGER_CHECK_BALANCE(internal);
24626 LEDGER_CHECK_BALANCE(internal_compressed);
24627 LEDGER_CHECK_BALANCE(external);
24628 LEDGER_CHECK_BALANCE(reusable);
24629 LEDGER_CHECK_BALANCE(iokit_mapped);
24630 LEDGER_CHECK_BALANCE(alternate_accounting);
24631 LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
24632 LEDGER_CHECK_BALANCE(page_table);
24633 LEDGER_CHECK_BALANCE(purgeable_volatile);
24634 LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
24635 LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
24636 LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
24637 LEDGER_CHECK_BALANCE(tagged_nofootprint);
24638 LEDGER_CHECK_BALANCE(tagged_footprint);
24639 LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
24640 LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
24641 LEDGER_CHECK_BALANCE(network_volatile);
24642 LEDGER_CHECK_BALANCE(network_nonvolatile);
24643 LEDGER_CHECK_BALANCE(network_volatile_compressed);
24644 LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
24645 LEDGER_CHECK_BALANCE(media_nofootprint);
24646 LEDGER_CHECK_BALANCE(media_footprint);
24647 LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
24648 LEDGER_CHECK_BALANCE(media_footprint_compressed);
24649 LEDGER_CHECK_BALANCE(graphics_nofootprint);
24650 LEDGER_CHECK_BALANCE(graphics_footprint);
24651 LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
24652 LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
24653 LEDGER_CHECK_BALANCE(neural_nofootprint);
24654 LEDGER_CHECK_BALANCE(neural_footprint);
24655 LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
24656 LEDGER_CHECK_BALANCE(neural_footprint_compressed);
24657 LEDGER_CHECK_BALANCE(neural_nofootprint_total);
24658
24659 if (do_panic) {
24660 if (pmap_ledgers_panic) {
24661 panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
24662 pmap, pid, procname);
24663 } else {
24664 printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
24665 pmap, pid, procname);
24666 }
24667 }
24668 }
24669
24670 void
24671 vm_map_pmap_set_process(
24672 vm_map_t map,
24673 int pid,
24674 char *procname)
24675 {
24676 pmap_set_process(vm_map_pmap(map), pid, procname);
24677 }
24678
24679 #endif /* MACH_ASSERT */
24680
24681 /**
24682 * Check if a given map operation size is valid for the given map, taking
24683 * into account whether or not the map operation has overridden the soft limit.
24684 *
24685 * This function is meant to be inlined wherever possible because it can, in
24686 * some modes, generate telemetry events which capture shallow backtraces. To
24687 * maximize the usefulness of this backtrace, we want to minimize the depth at
24688 * which the backtrace is taken.
24689 */
24690 __attribute__((always_inline))
24691 bool
24692 vm_map_is_map_size_valid(
24693 vm_map_t target_map,
24694 vm_size_t size,
24695 bool no_soft_limit)
24696 {
24697 #ifdef __x86_64__
24698 // Do not enforce any additional limits on x64
24699 (void)target_map;
24700 (void)size;
24701 (void)no_soft_limit;
24702 return true;
24703 #else
24704 if (__probable(target_map->pmap != kernel_pmap ||
24705 size < VM_KERNEL_SIMPLE_MAX_SIZE || no_soft_limit)) {
24706 // Allocation size matches policy
24707 return true;
24708 }
24709
24710 switch (vm_map_kernel_alloc_limit_mode) {
24711 default:
24712 case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_BYPASS:
24713 return true;
24714 case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_TRAP:
24715 trap_telemetry_report_kernel_soft_error(
24716 TRAP_TELEMETRY_KERNEL_SOFT_ERROR_VM_KERNEL_MAX_ALLOC_SIZE,
24717 /* report_once_per_site */ false);
24718 return true;
24719 case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_REJECT:
24720 return false;
24721 case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_PANIC:
24722 panic("1,000,000K ought to be enough for anybody "
24723 "(requested %lu bytes)", size);
24724 }
24725 #endif /* __x86_64__ */
24726 }
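
/*
 * Minimal caller sketch (illustrative): a kernel-map allocation path that
 * honors the configured limit mode might do:
 *
 *	if (!vm_map_is_map_size_valid(target_map, size, false)) {
 *		return KERN_NO_SPACE;
 *	}
 */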
24727